From 9079c36425a93f27ecc97d79ef6c46d2cf8022de Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Mon, 13 Feb 2023 17:49:00 -0500 Subject: [PATCH 1/8] Status update, details adjust --- src/codeflare_sdk/cluster/cluster.py | 183 ++++++++++++++++-------- src/codeflare_sdk/cluster/model.py | 7 +- src/codeflare_sdk/utils/pretty_print.py | 53 ++++++- 3 files changed, 179 insertions(+), 64 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d279b874d..730f2f719 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -19,6 +19,7 @@ """ from os import stat +from time import sleep from typing import List, Optional, Tuple import openshift as oc @@ -97,8 +98,15 @@ def up(self): """ self.config.auth.login() namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + try: + with oc.project(namespace): + oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + except oc.OpenShiftPythonException as osp: + error_msg = osp.result.err() + if "Unauthorized" in error_msg: + raise PermissionError( + "Action not permitted, have you put in the correct auth credentials?" + ) def down(self): """ @@ -106,54 +114,29 @@ def down(self): associated with the cluster. """ namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - self.config.auth.logout() - - def status(self, print_to_console: bool = True): # pragma: no cover - """ - TO BE UPDATED: Will soon return (and print by default) the cluster's - status, from AppWrapper submission to setup completion. All resource - details will be moved to cluster.details(). - """ - cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: - # overriding the number of gpus with requested - cluster.worker_gpu = self.config.gpu - if print_to_console: - pretty_print.print_clusters([cluster]) - return cluster.status - else: - if print_to_console: - pretty_print.print_no_resources_found() - return None - - def cluster_uri(self) -> str: - """ - Returns a string containing the cluster's URI. - """ - return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" - - def cluster_dashboard_uri(self, namespace: str = "default") -> str: - """ - Returns a string containing the cluster's dashboard URI. - """ try: with oc.project(namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) + except oc.OpenShiftPythonException as osp: + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "AppWrapper"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + ): + raise PermissionError( + "Action not permitted, have you run cluster.up() yet?" ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" - except: - return "Dashboard route not available yet. Did you run cluster.up()?" + else: + raise osp + self.config.auth.logout() - # checks whether the ray cluster is ready - def is_ready(self, print_to_console: bool = True): # pragma: no cover + def status( + self, print_to_console: bool = True + ) -> Tuple[CodeFlareClusterStatus, bool]: """ - TO BE DEPRECATED: functionality will be added into cluster.status(). 
+ Returns the requested cluster's status, as well as whether or not + it is ready for use. """ ready = False status = CodeFlareClusterStatus.UNKNOWN @@ -166,7 +149,7 @@ def is_ready(self, print_to_console: bool = True): # pragma: no cover AppWrapperStatus.RUNNING_HOLD_COMPLETION, ]: ready = False - status = CodeFlareClusterStatus.QUEUED + status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, AppWrapperStatus.DELETED, @@ -200,9 +183,65 @@ def is_ready(self, print_to_console: bool = True): # pragma: no cover if print_to_console: # overriding the number of gpus with requested cluster.worker_gpu = self.config.gpu - pretty_print.print_clusters([cluster]) + pretty_print.print_cluster_status(cluster) + elif print_to_console: + if status == CodeFlareClusterStatus.UNKNOWN: + pretty_print.print_no_resources_found() + else: + pretty_print.print_app_wrappers_status([appwrapper]) + return status, ready + def wait_ready(self, timeout: Optional[int] = None): + """ + Waits for requested cluster to be ready, up to an optional timeout (s). + Checks every five seconds. + """ + # FIXME - BREAKING EARLY + print("Waiting for requested resources to be set up...") + ready = False + status = None + time = 0 + while not ready: + status, ready = self.status(print_to_console=False) + if status == CodeFlareClusterStatus.UNKNOWN: + print( + "WARNING: Current cluster status is unknown, have you run cluster.up yet?" + ) + if not ready: + if timeout and time >= timeout: + raise TimeoutError(f"wait() timed out after waiting {timeout}s") + sleep(5) + time += 5 + print("Requested cluster up and running!") + + def details(self, print_to_console: bool = True): + # FIXME - Add a return as well? + # FIXME - When not up? + pretty_print.print_cluster_status(self) + + def cluster_uri(self) -> str: + """ + Returns a string containing the cluster's URI. + """ + return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" + + def cluster_dashboard_uri(self) -> str: + """ + Returns a string containing the cluster's dashboard URI. + """ + try: + with oc.project(self.config.namespace): + route = oc.invoke( + "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + ) + route = route.out().split(" ") + route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] + route = route[0].strip().strip("'") + return f"http://{route}" + except: + return "Dashboard route not available yet, have you run cluster.up()?" + def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. @@ -232,7 +271,16 @@ def get_current_namespace() -> str: # pragma: no cover """ Returns the user's current working namespace. """ - namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + try: + namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + except oc.OpenShiftPythonException as osp: + error_msg = osp.result.err() + if "do not have rights" in error_msg: + raise PermissionError( + "Action not permitted, have you run auth.login() or cluster.up()?" + ) + else: + raise osp return namespace @@ -242,6 +290,7 @@ def list_all_clusters( """ Returns (and prints by default) a list of all clusters in a given namespace. 
""" + # FIXME - NOT RUNNING BREAK clusters = _get_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) @@ -253,6 +302,7 @@ def list_all_queued(namespace: str, print_to_console: bool = True): # pragma: n Returns (and prints by default) a list of all currently queued-up AppWrappers in a given namespace. """ + # FIXME - FORMATTING ISSUES app_wrappers = _get_app_wrappers( namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] ) @@ -267,25 +317,44 @@ def list_all_queued(namespace: str, print_to_console: bool = True): # pragma: n def _app_wrapper_status( name, namespace="default" ) -> Optional[AppWrapper]: # pragma: no cover - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"appwrapper/{name}").object() + cluster = None + try: + with oc.project(namespace), oc.timeout(10 * 60): + cluster = oc.selector(f"appwrapper/{name}").object() + except oc.OpenShiftPythonException as osp: + error_msg = osp.result.err() + if not ( + 'the server doesn\'t have a resource type "appwrapper"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + ): + raise osp + if cluster: return _map_to_app_wrapper(cluster) + return cluster + def _ray_cluster_status( name, namespace="default" ) -> Optional[RayCluster]: # pragma: no cover - # FIXME should we check the appwrapper first cluster = None try: with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"rayclusters/{name}").object() + except oc.OpenShiftPythonException as osp: + error_msg = osp.result.err() + if not ( + 'the server doesn\'t have a resource type "rayclusters"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + ): + raise osp + + if cluster: + return _map_to_ray_cluster(cluster) - if cluster: - return _map_to_ray_cluster(cluster) - except: - pass return cluster @@ -317,8 +386,10 @@ def _get_app_wrappers( return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> RayCluster: # pragma: no cover +def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: # pragma: no cover cluster_model = cluster.model + if type(cluster_model.status.state) == oc.model.MissingModel: + return None with oc.project(cluster.namespace()), oc.timeout(10 * 60): route = ( diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index ec788ad6d..9f034da9d 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -53,9 +53,10 @@ class CodeFlareClusterStatus(Enum): """ READY = 1 - QUEUED = 2 - FAILED = 3 - UNKNOWN = 4 + STARTING = 2 + QUEUED = 3 + FAILED = 4 + UNKNOWN = 5 @dataclass diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 6083a9fbd..3e978dcdd 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -29,7 +29,7 @@ def print_no_resources_found(): # pragma: no cover console = Console() - console.print(Panel("[red]No resources found")) + console.print(Panel("[red]No resources found, have you run cluster.up() yet?")) def print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cover @@ -44,7 +44,7 @@ def print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cov table = Table( box=box.ASCII_DOUBLE_HEAD, - title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:", + title="[bold] :rocket: Cluster Queue Status :rocket:", ) table.add_column("Name", style="cyan", no_wrap=True) table.add_column("Status", style="magenta") @@ -52,8 +52,51 @@ def 
print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cov
 table.add_row("") # empty row for spacing
 console.print(Panel.fit(table))
 
+def print_cluster_status(cluster: RayCluster):
+ "Pretty prints the status of a passed-in cluster"
+ if not cluster:
+ print_no_resources_found()
+ return
+
+ console = Console()
+ status = (
+ "Active :white_heavy_check_mark:"
+ if cluster.status == RayClusterStatus.READY
+ else "InActive :x:"
+ )
+ name = cluster.name
+ dashboard = cluster.dashboard
+ # owned = bool(cluster["userOwned"])
+ owned = True
+
+ #'table0' to display the cluster name, status, url, and dashboard link
+ table0 = Table(box=None, show_header=False)
+ if owned:
+ table0.add_row("[white on green][bold]Name")
+ else:
+ table0.add_row("")
+ table0.add_row("[bold underline]" + name, status)
+ table0.add_row()
+ # fixme hardcoded to default for now
+ table0.add_row(
+ f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
+ ) # format that is used to generate the name of the service
+ table0.add_row()
+ table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
+ table0.add_row("") # empty row for spacing
+
+ # table4 to display table0 and table3, one below the other
+ table4 = Table(box=None, show_header=False)
+ table4.add_row(table0)
+
+ # Encompass all details of the cluster in a single panel
+ table5 = Table(
+ box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:"
+ )
+ table5.add_row(Panel.fit(table4))
+ console.print(table5)
+
-def print_clusters(clusters: List[RayCluster], verbose=True): # pragma: no cover
+def print_clusters(clusters: List[RayCluster]): # pragma: no cover
 if not clusters:
 print_no_resources_found()
 return # shortcircuit
@@ -80,7 +123,7 @@ def print_clusters(clusters: List[RayCluster], verbose=True): # pragma: no cove
 #'table0' to display the cluster name, status, url, and dashboard link
 table0 = Table(box=None, show_header=False)
 if owned:
- table0.add_row("[white on green][bold]Owner")
+ table0.add_row("[white on green][bold]Name")
 else:
 table0.add_row("")
 table0.add_row("[bold underline]" + name, status)
@@ -131,7 +174,7 @@ def print_clusters(clusters: List[RayCluster], verbose=True): # pragma: no cove
 # than being center aligned on the console/terminal if we simply use console.print(title)
 
 table5 = Table(
- box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
+ box=None, title="[bold] :rocket: CodeFlare Cluster Details :rocket:"
 )
 table5.add_row(Panel.fit(table4))
 console.print(table5)

From e5e8ab333e6307889da5029c084a954109a8f698 Mon Sep 17 00:00:00 2001
From: maxusmusti
Date: Tue, 14 Feb 2023 10:31:48 -0500
Subject: [PATCH 2/8] Format fix

---
 src/codeflare_sdk/utils/pretty_print.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py
index 3e978dcdd..b92908a09 100644
--- a/src/codeflare_sdk/utils/pretty_print.py
+++ b/src/codeflare_sdk/utils/pretty_print.py
@@ -52,6 +52,7 @@ def print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cov
 table.add_row("") # empty row for spacing
 console.print(Panel.fit(table))
 
+
 def print_cluster_status(cluster: RayCluster):
 "Pretty prints the status of a passed-in cluster"
 if not cluster:
@@ -60,9 +61,9 @@ def print_cluster_status(cluster: RayCluster):
 console = Console()
 status = (
- "Active :white_heavy_check_mark:" 
+ if cluster.status == RayClusterStatus.READY + else "InActive :x:" ) name = cluster.name dashboard = cluster.dashboard @@ -90,12 +91,11 @@ def print_cluster_status(cluster: RayCluster): table4.add_row(table0) # Encompass all details of the cluster in a single panel - table5 = Table( - box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:" - ) + table5 = Table(box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:") table5.add_row(Panel.fit(table4)) console.print(table5) + def print_clusters(clusters: List[RayCluster]): # pragma: no cover if not clusters: print_no_resources_found() From fb08580b093dc1dfdd181df45aa7feb01bc5b084 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 14 Feb 2023 10:48:10 -0500 Subject: [PATCH 3/8] Test fixes --- src/codeflare_sdk/cluster/cluster.py | 6 +++--- tests/unit_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 730f2f719..91e14ce38 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -246,7 +246,7 @@ def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.list_jobs() @@ -254,7 +254,7 @@ def job_status(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_status(job_id) @@ -262,7 +262,7 @@ def job_logs(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id) diff --git a/tests/unit_test.py b/tests/unit_test.py index a7ee0514e..6e6a02760 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -255,7 +255,7 @@ def test_cluster_uris(mocker): cluster.config.name = "fake" assert ( cluster.cluster_dashboard_uri() - == "Dashboard route not available yet. Did you run cluster.up()?" + == "Dashboard route not available yet, have you run cluster.up()?" ) From 3a5e7baaea7e0484c90b6ce9cef80f6de2395c10 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Mon, 20 Feb 2023 11:35:31 -0500 Subject: [PATCH 4/8] Functioning details and minor improvements --- src/codeflare_sdk/cluster/auth.py | 8 ++++---- src/codeflare_sdk/cluster/cluster.py | 26 ++++++++++++++++++++++--- src/codeflare_sdk/utils/pretty_print.py | 4 ++-- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/codeflare_sdk/cluster/auth.py b/src/codeflare_sdk/cluster/auth.py index 9519d6631..41c5d7e3c 100644 --- a/src/codeflare_sdk/cluster/auth.py +++ b/src/codeflare_sdk/cluster/auth.py @@ -59,7 +59,7 @@ def __init__(self, token: str = None, server: str = None, skip_tls: bool = False self.server = server self.skip_tls = skip_tls - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's API token and API server address. 
Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls` @@ -78,7 +78,7 @@ def login(self): return error_msg return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ @@ -104,14 +104,14 @@ def __init__( self.username = username self.password = password - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's `username` and `password`. """ response = oc.login(self.username, self.password) return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 91e14ce38..727e7e358 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -96,7 +96,9 @@ def up(self): Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ - self.config.auth.login() + resp = self.config.auth.login() + if "invalid" in resp: + raise PermissionError(resp) namespace = self.config.namespace try: with oc.project(namespace): @@ -105,8 +107,9 @@ def up(self): error_msg = osp.result.err() if "Unauthorized" in error_msg: raise PermissionError( - "Action not permitted, have you put in the correct auth credentials?" + "Action not permitted, have you put in correct/up-to-date auth credentials?" ) + raise osp def down(self): """ @@ -218,7 +221,8 @@ def wait_ready(self, timeout: Optional[int] = None): def details(self, print_to_console: bool = True): # FIXME - Add a return as well? # FIXME - When not up? - pretty_print.print_cluster_status(self) + cluster = _copy_to_ray(self) + pretty_print.print_clusters([cluster]) def cluster_uri(self) -> str: """ @@ -427,3 +431,19 @@ def _map_to_app_wrapper(cluster) -> AppWrapper: # pragma: no cover can_run=cluster_model.status.canrun, job_state=cluster_model.status.queuejobstate, ) + + +def _copy_to_ray(cluster: Cluster) -> RayCluster: + ray = RayCluster( + name=cluster.config.name, + status=cluster.status(print_to_console=False)[0], + min_workers=cluster.config.min_worker, + max_workers=cluster.config.max_worker, + worker_mem_min=cluster.config.min_memory, + worker_mem_max=cluster.config.max_memory, + worker_cpu=cluster.config.min_cpus, + worker_gpu=cluster.config.gpu, + namespace=cluster.config.namespace, + dashboard=cluster.cluster_dashboard_uri(), + ) + return ray diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index b92908a09..140e00354 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -108,13 +108,13 @@ def print_clusters(clusters: List[RayCluster]): # pragma: no cover status = ( "Active :white_heavy_check_mark:" if cluster.status == RayClusterStatus.READY - else "InActive :x:" + else "Inactive :x:" ) name = cluster.name dashboard = cluster.dashboard mincount = str(cluster.min_workers) maxcount = str(cluster.max_workers) - memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max + memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max) cpu = str(cluster.worker_cpu) gpu = str(cluster.worker_gpu) # owned = bool(cluster["userOwned"]) From 3f0b5507f0440838fedc945a839e2d304623d41c Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 21 Feb 2023 02:44:07 -0500 Subject: [PATCH 5/8] Fixed/completed functionality, more bug fixes --- 
src/codeflare_sdk/cluster/cluster.py | 23 ++++++++++++----------- src/codeflare_sdk/utils/pretty_print.py | 17 +++++++++-------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 727e7e358..d9160ca00 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -166,13 +166,13 @@ def status( if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( - ready, status, + ready, ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: + if cluster and not cluster.status == RayClusterStatus.UNKNOWN: if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -200,7 +200,6 @@ def wait_ready(self, timeout: Optional[int] = None): Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. """ - # FIXME - BREAKING EARLY print("Waiting for requested resources to be set up...") ready = False status = None @@ -218,11 +217,11 @@ def wait_ready(self, timeout: Optional[int] = None): time += 5 print("Requested cluster up and running!") - def details(self, print_to_console: bool = True): - # FIXME - Add a return as well? - # FIXME - When not up? + def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) - pretty_print.print_clusters([cluster]) + if print_to_console: + pretty_print.print_clusters([cluster]) + return cluster def cluster_uri(self) -> str: """ @@ -294,7 +293,6 @@ def list_all_clusters( """ Returns (and prints by default) a list of all clusters in a given namespace. """ - # FIXME - NOT RUNNING BREAK clusters = _get_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) @@ -306,7 +304,6 @@ def list_all_queued(namespace: str, print_to_console: bool = True): # pragma: n Returns (and prints by default) a list of all currently queued-up AppWrappers in a given namespace. 
""" - # FIXME - FORMATTING ISSUES app_wrappers = _get_app_wrappers( namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] ) @@ -393,7 +390,9 @@ def _get_app_wrappers( def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: # pragma: no cover cluster_model = cluster.model if type(cluster_model.status.state) == oc.model.MissingModel: - return None + status = RayClusterStatus.UNKNOWN + else: + status = RayClusterStatus(cluster_model.status.state.lower()) with oc.project(cluster.namespace()), oc.timeout(10 * 60): route = ( @@ -404,7 +403,7 @@ def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: # pragma: no cover return RayCluster( name=cluster.name(), - status=RayClusterStatus(cluster_model.status.state.lower()), + status=status, # for now we are not using autoscaling so same replicas is fine min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, @@ -446,4 +445,6 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), ) + if ray.status == CodeFlareClusterStatus.READY: + ray.status = RayClusterStatus.READY return ray diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 140e00354..9e0830899 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -38,19 +38,20 @@ def print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cov return # shortcircuit console = Console() + table = Table( + box=box.ASCII_DOUBLE_HEAD, + title="[bold] :rocket: Cluster Queue Status :rocket:", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Status", style="magenta") + for app_wrapper in app_wrappers: name = app_wrapper.name status = app_wrapper.status.value - - table = Table( - box=box.ASCII_DOUBLE_HEAD, - title="[bold] :rocket: Cluster Queue Status :rocket:", - ) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("Status", style="magenta") table.add_row(name, status) table.add_row("") # empty row for spacing - console.print(Panel.fit(table)) + + console.print(Panel.fit(table)) def print_cluster_status(cluster: RayCluster): From 8ec0f7941beb28a3d7f90718bb008b0230bc1e46 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 21 Feb 2023 23:44:57 -0500 Subject: [PATCH 6/8] tests pt1 --- src/codeflare_sdk/cluster/cluster.py | 22 +- src/codeflare_sdk/utils/pretty_print.py | 8 +- tests/unit_test.py | 294 +++++++++++++++++++++++- 3 files changed, 301 insertions(+), 23 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d9160ca00..eac8fe7fd 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -159,7 +159,7 @@ def status( ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate - return ready, status # exit early, no need to check ray status + return status, ready # exit early, no need to check ray status elif appwrapper.status in [AppWrapperStatus.PENDING]: ready = False status = CodeFlareClusterStatus.QUEUED @@ -270,7 +270,7 @@ def job_logs(self, job_id: str) -> str: return client.get_job_logs(job_id) -def get_current_namespace() -> str: # pragma: no cover +def get_current_namespace() -> str: """ Returns the user's current working namespace. 
""" @@ -287,9 +287,7 @@ def get_current_namespace() -> str: # pragma: no cover return namespace -def list_all_clusters( - namespace: str, print_to_console: bool = True -): # pragma: no cover +def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. """ @@ -299,7 +297,7 @@ def list_all_clusters( return clusters -def list_all_queued(namespace: str, print_to_console: bool = True): # pragma: no cover +def list_all_queued(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all currently queued-up AppWrappers in a given namespace. @@ -337,9 +335,7 @@ def _app_wrapper_status( return cluster -def _ray_cluster_status( - name, namespace="default" -) -> Optional[RayCluster]: # pragma: no cover +def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: cluster = None try: with oc.project(namespace), oc.timeout(10 * 60): @@ -359,7 +355,7 @@ def _ray_cluster_status( return cluster -def _get_ray_clusters(namespace="default") -> List[RayCluster]: # pragma: no cover +def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] with oc.project(namespace), oc.timeout(10 * 60): @@ -372,7 +368,7 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: # pragma: no co def _get_app_wrappers( namespace="default", filter=List[AppWrapperStatus] -) -> List[AppWrapper]: # pragma: no cover +) -> List[AppWrapper]: list_of_app_wrappers = [] with oc.project(namespace), oc.timeout(10 * 60): @@ -387,7 +383,7 @@ def _get_app_wrappers( return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: # pragma: no cover +def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: cluster_model = cluster.model if type(cluster_model.status.state) == oc.model.MissingModel: status = RayClusterStatus.UNKNOWN @@ -422,7 +418,7 @@ def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: # pragma: no cover ) -def _map_to_app_wrapper(cluster) -> AppWrapper: # pragma: no cover +def _map_to_app_wrapper(cluster) -> AppWrapper: cluster_model = cluster.model return AppWrapper( name=cluster.name(), diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 9e0830899..1591658a6 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -27,12 +27,12 @@ from ..cluster.model import RayCluster, AppWrapper, RayClusterStatus -def print_no_resources_found(): # pragma: no cover +def print_no_resources_found(): console = Console() console.print(Panel("[red]No resources found, have you run cluster.up() yet?")) -def print_app_wrappers_status(app_wrappers: List[AppWrapper]): # pragma: no cover +def print_app_wrappers_status(app_wrappers: List[AppWrapper]): if not app_wrappers: print_no_resources_found() return # shortcircuit @@ -64,7 +64,7 @@ def print_cluster_status(cluster: RayCluster): status = ( "Active :white_heavy_check_mark:" if cluster.status == RayClusterStatus.READY - else "InActive :x:" + else "Inactive :x:" ) name = cluster.name dashboard = cluster.dashboard @@ -97,7 +97,7 @@ def print_cluster_status(cluster: RayCluster): console.print(table5) -def print_clusters(clusters: List[RayCluster]): # pragma: no cover +def print_clusters(clusters: List[RayCluster]): if not clusters: print_no_resources_found() return # shortcircuit diff --git a/tests/unit_test.py b/tests/unit_test.py index 6e6a02760..270da2d74 100644 --- a/tests/unit_test.py +++ 
b/tests/unit_test.py @@ -26,13 +26,26 @@ get_current_namespace, list_all_clusters, list_all_queued, + _copy_to_ray, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, PasswordUserAuthentication, Authentication, ) -from codeflare_sdk.utils.generate_yaml import main +from codeflare_sdk.utils.pretty_print import ( + print_no_resources_found, + print_app_wrappers_status, + print_cluster_status, + print_clusters, +) +from codeflare_sdk.cluster.model import ( + AppWrapper, + RayCluster, + AppWrapperStatus, + RayClusterStatus, + CodeFlareClusterStatus, +) import openshift from openshift import OpenShiftPythonException import ray @@ -283,16 +296,285 @@ def test_ray_job_wrapping(mocker): # cluster.job_logs() +def test_print_no_resources(capsys): + try: + print_no_resources_found() + except: + assert 1 == 0 + captured = capsys.readouterr() + assert captured.out == ( + "╭──────────────────────────────────────────────────────────────────────────────╮\n" + "│ No resources found, have you run cluster.up() yet? │\n" + "╰──────────────────────────────────────────────────────────────────────────────╯\n" + ) + + +def test_print_appwrappers(capsys): + aw1 = AppWrapper( + name="awtest1", + status=AppWrapperStatus.PENDING, + can_run=False, + job_state="queue-state", + ) + aw2 = AppWrapper( + name="awtest2", + status=AppWrapperStatus.RUNNING, + can_run=False, + job_state="queue-state", + ) + try: + print_app_wrappers_status([aw1, aw2]) + except: + assert 1 == 0 + captured = capsys.readouterr() + assert captured.out == ( + "╭───────────────────────╮\n" + "│ 🚀 Cluster Queue │\n" + "│ Status 🚀 │\n" + "│ +---------+---------+ │\n" + "│ | Name | Status | │\n" + "│ +=========+=========+ │\n" + "│ | awtest1 | pending | │\n" + "│ | | | │\n" + "│ | awtest2 | running | │\n" + "│ | | | │\n" + "│ +---------+---------+ │\n" + "╰───────────────────────╯\n" + ) + + +def test_ray_details(capsys): + ray1 = RayCluster( + name="raytest1", + status=RayClusterStatus.READY, + min_workers=1, + max_workers=1, + worker_mem_min=2, + worker_mem_max=2, + worker_cpu=1, + worker_gpu=0, + namespace="ns", + dashboard="fake-uri", + ) + cf = Cluster(ClusterConfiguration(name="raytest2", namespace="ns")) + captured = capsys.readouterr() + ray2 = _copy_to_ray(cf) + details = cf.details() + assert details == ray2 + assert ray2.name == "raytest2" + assert ray1.namespace == ray2.namespace + assert ray1.min_workers == ray2.min_workers + assert ray1.max_workers == ray2.max_workers + assert ray1.worker_mem_min == ray2.worker_mem_min + assert ray1.worker_mem_max == ray2.worker_mem_max + assert ray1.worker_cpu == ray2.worker_cpu + assert ray1.worker_gpu == ray2.worker_gpu + try: + print_clusters([ray1, ray2]) + print_cluster_status(ray1) + print_cluster_status(ray2) + except: + assert 0 == 1 + captured = capsys.readouterr() + assert captured.out == ( + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ raytest2 Inactive ❌ │ \n" + " │ │ \n" + " │ URI: ray://raytest2-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰──────────────────────────────────────────────────────────────╯ \n" + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " 
╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ raytest1 Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://raytest1-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2~2 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰──────────────────────────────────────────────────────────────╯ \n" + "╭──────────────────────────────────────────────────────────────╮\n" + "│ Name │\n" + "│ raytest2 Inactive ❌ │\n" + "│ │\n" + "│ URI: ray://raytest2-head-svc.ns.svc:10001 │\n" + "│ │\n" + "│ Dashboard🔗 │\n" + "│ │\n" + "│ Cluster Resources │\n" + "│ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n" + "│ │ Min Max │ │ Memory CPU GPU │ │\n" + "│ │ │ │ │ │\n" + "│ │ 1 1 │ │ 2~2 1 0 │ │\n" + "│ │ │ │ │ │\n" + "│ ╰────────────╯ ╰──────────────────────────────────────╯ │\n" + "╰──────────────────────────────────────────────────────────────╯\n" + " 🚀 CodeFlare Cluster Status 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ raytest1 Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://raytest1-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " ╰──────────────────────────────────────────────────────────╯ \n" + " 🚀 CodeFlare Cluster Status 🚀 \n" + " \n" + " ╭────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ raytest2 Inactive ❌ │ \n" + " │ │ \n" + " │ URI: ray://raytest2-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " ╰────────────────────────────────────────────────────────────╯ \n" + ) + + +def act_side_effect_list(self): + print([self]) + self.out = str(self.high_level_operation) + return [self] + + def test_get_namespace(mocker): - pass + mocker.patch("openshift.invoke", side_effect=arg_side_effect) + mock_res = mocker.patch.object(openshift.Result, "actions") + mock_res.side_effect = lambda: act_side_effect_list(fake_res) + vars = get_current_namespace() + assert vars == "('project', ['-q'])" + +# def test_list_clusters(mocker): +# list_all_clusters("ns") -def test_list_clusters(mocker): - pass +# def test_list_queue(mocker): +# list_all_queued("ns") -def test_list_queue(mocker): - pass + +def test_cluster_status(mocker): + fake_aw = AppWrapper( + "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused" + ) + fake_ray = RayCluster( + name="test", + status=RayClusterStatus.UNKNOWN, + min_workers=1, + max_workers=1, + worker_mem_min=2, + worker_mem_max=2, + worker_cpu=1, + worker_gpu=0, + namespace="ns", + dashboard="fake-uri", + ) + cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) + status, ready = cf.status() + assert status == CodeFlareClusterStatus.UNKNOWN + assert ready == False + + mocker.patch( + "codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=fake_aw + ) + status, ready = cf.status() + assert status == CodeFlareClusterStatus.FAILED + assert ready == False + + fake_aw.status = AppWrapperStatus.DELETED + status, ready = cf.status() + assert status == CodeFlareClusterStatus.FAILED + assert ready == False + + fake_aw.status = AppWrapperStatus.PENDING + status, ready = cf.status() + assert status == CodeFlareClusterStatus.QUEUED + assert ready == False + + fake_aw.status = AppWrapperStatus.COMPLETED + status, ready = 
cf.status()
+ assert status == CodeFlareClusterStatus.STARTING
+ assert ready == False
+
+ fake_aw.status = AppWrapperStatus.RUNNING_HOLD_COMPLETION
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.STARTING
+ assert ready == False
+
+ fake_aw.status = AppWrapperStatus.RUNNING
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.STARTING
+ assert ready == False
+
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray
+ )
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.STARTING
+ assert ready == False
+
+ fake_ray.status = RayClusterStatus.FAILED
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.FAILED
+ assert ready == False
+
+ fake_ray.status = RayClusterStatus.UNHEALTHY
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.FAILED
+ assert ready == False
+
+ fake_ray.status = RayClusterStatus.READY
+ status, ready = cf.status()
+ assert status == CodeFlareClusterStatus.READY
+ assert ready == True
+
+
+def test_wait_ready(mocker, capsys):
+ cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
+ try:
+ cf.wait_ready(timeout=5)
+ assert 1 == 0
+ except Exception as e:
+ assert type(e) == TimeoutError
+
+ captured = capsys.readouterr()
+ assert (
+ "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
+ in captured.out
+ )
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.status",
+ return_value=(CodeFlareClusterStatus.READY, True),
+ )
+ cf.wait_ready()
+ captured = capsys.readouterr()
+ assert (
+ captured.out
+ == "Waiting for requested resources to be set up...\nRequested cluster up and running!\n"
+ )
 
 def test_cmd_line_generation():

From e62eaf10729f51d3f74d9891fec31d2810e0e2a2 Mon Sep 17 00:00:00 2001
From: maxusmusti
Date: Wed, 22 Feb 2023 14:14:36 -0500
Subject: [PATCH 7/8] Tests pt2

---
 src/codeflare_sdk/cluster/cluster.py | 12 +
 tests/unit_test.py | 988 ++++++++++++++++++++++++++-
 2 files changed, 996 insertions(+), 4 deletions(-)

diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index eac8fe7fd..988d0b881 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -126,10 +126,13 @@ def down(self):
 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
 or "forbidden" in error_msg
 or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
 ):
 raise PermissionError(
 "Action not permitted, have you run cluster.up() yet?" 
) + elif "not found" in error_msg: + print("Cluster not found, have you run cluster.up() yet?") else: raise osp self.config.auth.logout() @@ -321,11 +324,15 @@ def _app_wrapper_status( with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"appwrapper/{name}").object() except oc.OpenShiftPythonException as osp: + msg = osp.msg + if "Expected a single object, but selected 0" in msg: + return cluster error_msg = osp.result.err() if not ( 'the server doesn\'t have a resource type "appwrapper"' in error_msg or "forbidden" in error_msg or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg ): raise osp @@ -341,11 +348,15 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"rayclusters/{name}").object() except oc.OpenShiftPythonException as osp: + msg = osp.msg + if "Expected a single object, but selected 0" in msg: + return cluster error_msg = osp.result.err() if not ( 'the server doesn\'t have a resource type "rayclusters"' in error_msg or "forbidden" in error_msg or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg ): raise osp @@ -379,6 +390,7 @@ def _get_app_wrappers( if filter and app_wrapper.status in filter: list_of_app_wrappers.append(app_wrapper) else: + # Unsure what the purpose of the filter is list_of_app_wrappers.append(app_wrapper) return list_of_app_wrappers diff --git a/tests/unit_test.py b/tests/unit_test.py index 270da2d74..b8c971a01 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -48,6 +48,7 @@ ) import openshift from openshift import OpenShiftPythonException +from openshift.selector import Selector import ray import pytest @@ -468,12 +469,991 @@ def test_get_namespace(mocker): assert vars == "('project', ['-q'])" -# def test_list_clusters(mocker): -# list_all_clusters("ns") +def get_selector(*args): + selector = Selector({"operation": "selector", "status": 0, "actions": []}) + return selector + + +def get_obj_none(): + return [] + + +def get_ray_obj(cls=None): + api_obj = openshift.apiobject.APIObject( + { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 1, + "labels": { + "appwrapper.mcad.ibm.com": "quicktest", + "controller-tools.k8s.io": "1.0", + "resourceName": "quicktest", + }, + "managedFields": [ + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:labels": { + ".": {}, + "f:appwrapper.mcad.ibm.com": {}, + "f:controller-tools.k8s.io": {}, + "f:resourceName": {}, + }, + "f:ownerReferences": { + ".": {}, + 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, + }, + }, + "f:spec": { + ".": {}, + "f:autoscalerOptions": { + ".": {}, + "f:idleTimeoutSeconds": {}, + "f:imagePullPolicy": {}, + "f:resources": { + ".": {}, + "f:limits": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + "f:requests": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + }, + "f:upscalingMode": {}, + }, + "f:enableInTreeAutoscaling": {}, + "f:headGroupSpec": { + ".": {}, + "f:rayStartParams": { + ".": {}, + "f:block": {}, + "f:dashboard-host": {}, + "f:num-gpus": {}, + }, + "f:serviceType": {}, + "f:template": { + ".": {}, + "f:spec": {".": {}, "f:containers": {}}, + }, + }, + "f:rayVersion": {}, + "f:workerGroupSpecs": {}, + }, + }, + "manager": "mcad-controller", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": 
"ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:status": { + ".": {}, + "f:availableWorkerReplicas": {}, + "f:desiredWorkerReplicas": {}, + "f:endpoints": { + ".": {}, + "f:client": {}, + "f:dashboard": {}, + "f:gcs": {}, + }, + "f:lastUpdateTime": {}, + "f:maxWorkerReplicas": {}, + "f:minWorkerReplicas": {}, + "f:state": {}, + } + }, + "manager": "manager", + "operation": "Update", + "subresource": "status", + "time": "2023-02-22T16:26:16Z", + }, + ], + "name": "quicktest", + "namespace": "ns", + "ownerReferences": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "blockOwnerDeletion": True, + "controller": True, + "kind": "AppWrapper", + "name": "quicktest", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + } + ], + "resourceVersion": "9482407", + "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": {"cpu": "500m", "memory": "512Mi"}, + "requests": {"cpu": "500m", "memory": "512Mi"}, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": ["/bin/sh", "-c", "ray stop"] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + "protocol": "TCP", + }, + { + "containerPort": 8265, + "name": "dashboard", + "protocol": "TCP", + }, + { + "containerPort": 10001, + "name": "client", + "protocol": "TCP", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } + }, + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": {"block": "true", "num-gpus": "0"}, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, + "status": { + "availableWorkerReplicas": 2, + "desiredWorkerReplicas": 1, + "endpoints": {"client": "10001", "dashboard": "8265", "gcs": "6379"}, + "lastUpdateTime": "2023-02-22T16:26:16Z", + "maxWorkerReplicas": 1, + "minWorkerReplicas": 1, + "state": "ready", + }, + } + ) + return [api_obj] + + +def get_aw_obj(): + api_obj1 = 
openshift.apiobject.APIObject( + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' + }, + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, + }, + "f:status": { + ".": {}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, + }, + }, + "manager": "Go-http-client", + "operation": 
"Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } + }, + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, + }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest1", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + }, + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", + }, + }, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest1", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest1", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } + }, + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 
2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, + }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, + }, + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", + }, + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, + }, + }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + }, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", + }, + ], + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Running", + "systempriority": 9, + }, + } + ) + api_obj2 = openshift.apiobject.APIObject( + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' + }, + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, + }, + "f:status": { + ".": {}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, + }, + }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } + }, + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, + }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest2", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + }, + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", + }, + }, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest2", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest2", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": 
"Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } + }, + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, + }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, + }, + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", + }, + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, + }, + }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + }, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", + }, + ], + "controllerfirsttimestamp": 
"2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Pending", + "systempriority": 9, + }, + } + ) + return [api_obj1, api_obj2] + + +def test_list_clusters(mocker, capsys): + mocker.patch("openshift.selector", side_effect=get_selector) + mock_res = mocker.patch.object(Selector, "objects") + mock_res.side_effect = get_obj_none + list_all_clusters("ns") + captured = capsys.readouterr() + assert captured.out == ( + "╭──────────────────────────────────────────────────────────────────────────────╮\n" + "│ No resources found, have you run cluster.up() yet? │\n" + "╰──────────────────────────────────────────────────────────────────────────────╯\n" + ) + mock_res.side_effect = get_ray_obj + list_all_clusters("ns") + captured = capsys.readouterr() + assert captured.out == ( + " 🚀 CodeFlare Cluster Details 🚀 \n" + " \n" + " ╭──────────────────────────────────────────────────────────────╮ \n" + " │ Name │ \n" + " │ quicktest Active ✅ │ \n" + " │ │ \n" + " │ URI: ray://quicktest-head-svc.ns.svc:10001 │ \n" + " │ │ \n" + " │ Dashboard🔗 │ \n" + " │ │ \n" + " │ Cluster Resources │ \n" + " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" + " │ │ Min Max │ │ Memory CPU GPU │ │ \n" + " │ │ │ │ │ │ \n" + " │ │ 1 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ │ │ │ │ \n" + " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n" + " ╰──────────────────────────────────────────────────────────────╯ \n" + ) -# def test_list_queue(mocker): -# list_all_queued("ns") +def test_list_queue(mocker, capsys): + mocker.patch("openshift.selector", side_effect=get_selector) + mock_res = mocker.patch.object(Selector, "objects") + mock_res.side_effect = get_obj_none + list_all_queued("ns") + captured = capsys.readouterr() + assert captured.out == ( + "╭──────────────────────────────────────────────────────────────────────────────╮\n" + "│ No resources found, have you run cluster.up() yet? 
│\n" + "╰──────────────────────────────────────────────────────────────────────────────╯\n" + ) + mock_res.side_effect = get_aw_obj + list_all_queued("ns") + captured = capsys.readouterr() + assert captured.out == ( + "╭──────────────────────────╮\n" + "│ 🚀 Cluster Queue Status │\n" + "│ 🚀 │\n" + "│ +------------+---------+ │\n" + "│ | Name | Status | │\n" + "│ +============+=========+ │\n" + "│ | quicktest1 | running | │\n" + "│ | | | │\n" + "│ | quicktest2 | pending | │\n" + "│ | | | │\n" + "│ +------------+---------+ │\n" + "╰──────────────────────────╯\n" + ) def test_cluster_status(mocker): From 1039d1e68345ceb6c21f75bd6572b5930eef9144 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Wed, 22 Feb 2023 17:48:36 -0500 Subject: [PATCH 8/8] tests pt3 and final bugfixes --- src/codeflare_sdk/cluster/cluster.py | 56 +++++++++++++++++++++------- tests/unit_test.py | 17 +++++++-- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 988d0b881..30ad8b4df 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -103,7 +103,7 @@ def up(self): try: with oc.project(namespace): oc.invoke("apply", ["-f", self.app_wrapper_yaml]) - except oc.OpenShiftPythonException as osp: + except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if "Unauthorized" in error_msg: raise PermissionError( @@ -120,7 +120,7 @@ def down(self): try: with oc.project(namespace): oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - except oc.OpenShiftPythonException as osp: + except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if ( 'the server doesn\'t have a resource type "AppWrapper"' in error_msg @@ -279,9 +279,12 @@ def get_current_namespace() -> str: """ try: namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() - except oc.OpenShiftPythonException as osp: + except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() - if "do not have rights" in error_msg: + if ( + "do not have rights" in error_msg + or "Missing or incomplete configuration" in error_msg + ): raise PermissionError( "Action not permitted, have you run auth.login() or cluster.up()?" 
) @@ -316,14 +319,12 @@ def list_all_queued(namespace: str, print_to_console: bool = True): # private methods -def _app_wrapper_status( - name, namespace="default" -) -> Optional[AppWrapper]: # pragma: no cover +def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: cluster = None try: with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"appwrapper/{name}").object() - except oc.OpenShiftPythonException as osp: + except oc.OpenShiftPythonException as osp: # pragma: no cover msg = osp.msg if "Expected a single object, but selected 0" in msg: return cluster @@ -347,7 +348,7 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: try: with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"rayclusters/{name}").object() - except oc.OpenShiftPythonException as osp: + except oc.OpenShiftPythonException as osp: # pragma: no cover msg = osp.msg if "Expected a single object, but selected 0" in msg: return cluster @@ -368,9 +369,22 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] - - with oc.project(namespace), oc.timeout(10 * 60): - ray_clusters = oc.selector("rayclusters").objects() + try: + with oc.project(namespace), oc.timeout(10 * 60): + ray_clusters = oc.selector("rayclusters").objects() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "rayclusters"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + else: + raise osp for cluster in ray_clusters: list_of_clusters.append(_map_to_ray_cluster(cluster)) @@ -382,8 +396,22 @@ def _get_app_wrappers( ) -> List[AppWrapper]: list_of_app_wrappers = [] - with oc.project(namespace), oc.timeout(10 * 60): - app_wrappers = oc.selector("appwrappers").objects() + try: + with oc.project(namespace), oc.timeout(10 * 60): + app_wrappers = oc.selector("appwrappers").objects() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "appwrappers"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" 
+ ) + else: + raise osp for item in app_wrappers: app_wrapper = _map_to_app_wrapper(item) diff --git a/tests/unit_test.py b/tests/unit_test.py index b8c971a01..f7548e19d 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -273,7 +273,7 @@ def test_cluster_uris(mocker): ) -def ray_addr(self): +def ray_addr(self, *args): return self._address @@ -287,14 +287,23 @@ def test_ray_job_wrapping(mocker): "ray.job_submission.JobSubmissionClient._check_connection_and_version_with_url", return_value="None", ) - # mocker.patch("ray.job_submission.JobSubmissionClient.list_jobs", side_effect=ray_addr) mock_res = mocker.patch.object( ray.job_submission.JobSubmissionClient, "list_jobs", autospec=True ) mock_res.side_effect = ray_addr assert cluster.list_jobs() == cluster.cluster_dashboard_uri() - # cluster.job_status() - # cluster.job_logs() + + mock_res = mocker.patch.object( + ray.job_submission.JobSubmissionClient, "get_job_status", autospec=True + ) + mock_res.side_effect = ray_addr + assert cluster.job_status("fake_id") == cluster.cluster_dashboard_uri() + + mock_res = mocker.patch.object( + ray.job_submission.JobSubmissionClient, "get_job_logs", autospec=True + ) + mock_res.side_effect = ray_addr + assert cluster.job_logs("fake_id") == cluster.cluster_dashboard_uri() def test_print_no_resources(capsys):
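
Note on the exception-wrapping pattern above: every `oc` call in cluster.py is now guarded the same way, catching `oc.OpenShiftPythonException`, inspecting `osp.result.err()`, and re-raising a `PermissionError` with an actionable hint when the message points at missing credentials or a missing CRD. A minimal standalone sketch of that pattern, assuming only the `openshift` client used throughout the patch (the helper name `guarded_invoke` is illustrative, not part of the patch):

    import openshift as oc

    def guarded_invoke(verb, args, namespace):
        # Hypothetical helper mirroring the error handling added in cluster.py:
        # convert auth/CRD failures into a friendlier PermissionError.
        try:
            with oc.project(namespace):
                return oc.invoke(verb, args)
        except oc.OpenShiftPythonException as osp:
            error_msg = osp.result.err()
            if (
                "forbidden" in error_msg
                or "Unauthorized" in error_msg
                or "Missing or incomplete configuration" in error_msg
            ):
                raise PermissionError(
                    "Action not permitted, have you put in correct/up-to-date auth credentials?"
                )
            raise osp  # anything unexpected should surface unchanged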
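The re-enabled `test_list_queue` (and `test_list_clusters` before it) assert the exact terminal rendering captured by pytest's built-in `capsys` fixture. A small sketch of the same technique against a stand-in printer (`banner` is hypothetical, not an SDK function):

    def banner(msg):
        # Stand-in for the SDK's pretty_print helpers.
        print(f"| {msg} |")

    def test_banner_output(capsys):
        banner("No resources found")
        captured = capsys.readouterr()
        assert captured.out == "| No resources found |\n"

Exact-string assertions like these are brittle against layout changes in the rendering library, but they pin down the user-facing output precisely.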
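Finally, the `test_ray_job_wrapping` changes lean on `autospec=True`: an autospecced method patched on the class receives the real instance as its first positional argument, which is why the shared `ray_addr(self, *args)` side effect can return `self._address` for `list_jobs`, `get_job_status`, and `get_job_logs` alike. A hedged sketch of the same idea against a toy class (`Client` is a stand-in for the real `JobSubmissionClient`, and `fake_addr` mirrors `ray_addr`):

    class Client:
        def __init__(self, address):
            self._address = address

        def get_job_status(self, job_id):
            raise RuntimeError("would hit the network")

    def fake_addr(self, *args):
        # autospec hands the instance through, so the fake can read its state.
        return self._address

    def test_status_is_mocked(mocker):
        mock_res = mocker.patch.object(Client, "get_job_status", autospec=True)
        mock_res.side_effect = fake_addr
        assert Client("ray://head:10001").get_job_status("fake_id") == "ray://head:10001"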