From e2b277e06c2649d3b2a3d3415a6ee688a4d39f00 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Fri, 2 Jun 2023 17:29:03 -0400 Subject: [PATCH 1/9] First pass on just the basics --- src/codeflare_sdk/cluster/cluster.py | 148 ++++++++++++++++----------- src/codeflare_sdk/cluster/config.py | 2 - 2 files changed, 91 insertions(+), 59 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 0e0e73c06..f537d8243 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -36,6 +36,10 @@ RayClusterStatus, ) +from kubernetes import client, config + +import yaml + class Cluster: """ @@ -110,8 +114,17 @@ def up(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + config.load_kube_config() + api_instance = client.CustomObjectsApi() + with open(self.app_wrapper_yaml) as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + body=aw, + ) except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if "Unauthorized" in error_msg: @@ -127,8 +140,15 @@ def down(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + name=self.app_wrapper_name, + ) except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if ( @@ -322,14 +342,16 @@ def list_all_queued(namespace: str, print_to_console: bool = True): def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"appwrapper/{name}").object() + config.load_kube_config() + api_instance = client.CustomObjectsApi() + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster error_msg = osp.result.err() if not ( 'the server doesn\'t have a resource type "appwrapper"' in error_msg @@ -339,21 +361,23 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: ): raise osp - if cluster: - return _map_to_app_wrapper(cluster) - - return cluster + for aw in aws["items"]: + if aw["metadata"]["name"] == name: + return _map_to_app_wrapper(aw) + return None def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"rayclusters/{name}").object() + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster error_msg = osp.result.err() if not ( 'the server doesn\'t have a resource type "rayclusters"' in error_msg @@ -363,17 +387,23 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: ): raise osp - if cluster: - return _map_to_ray_cluster(cluster) - - return cluster + for rc in rcs["items"]: + if rc["metadata"]["name"] == name: + return _map_to_ray_cluster(rc) + return None def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - ray_clusters = oc.selector("rayclusters").objects() + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if ( @@ -388,8 +418,8 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: else: raise osp - for cluster in ray_clusters: - list_of_clusters.append(_map_to_ray_cluster(cluster)) + for rc in rcs["items"]: + list_of_clusters.append(_map_to_ray_cluster(rc)) return list_of_clusters @@ -399,8 +429,14 @@ def _get_app_wrappers( list_of_app_wrappers = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - app_wrappers = oc.selector("appwrappers").objects() + config.load_kube_config() + api_instance = client.CustomObjectsApi() + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) except oc.OpenShiftPythonException as osp: # pragma: no cover error_msg = osp.result.err() if ( @@ -415,7 +451,7 @@ def _get_app_wrappers( else: raise osp - for item in app_wrappers: + for item in aws["items"]: app_wrapper = _map_to_app_wrapper(item) if filter and app_wrapper.status in filter: list_of_app_wrappers.append(app_wrapper) @@ -425,48 +461,46 @@ def _get_app_wrappers( return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: - cluster_model = cluster.model - if type(cluster_model.status.state) == oc.model.MissingModel: - status = RayClusterStatus.UNKNOWN +def _map_to_ray_cluster(rc) -> Optional[RayCluster]: + if "state" in rc["status"]: + status = RayClusterStatus(rc["status"]["state"].lower()) else: - status = RayClusterStatus(cluster_model.status.state.lower()) + status = RayClusterStatus.UNKNOWN - with oc.project(cluster.namespace()), oc.timeout(10 * 60): + with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60): route = ( - oc.selector(f"route/ray-dashboard-{cluster.name()}") + oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}") .object() .model.spec.host ) return RayCluster( - name=cluster.name(), + name=rc["metadata"]["name"], status=status, # for now we are not using autoscaling so same replicas is fine - min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - worker_mem_max=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.memory, - worker_mem_min=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.requests.memory, - worker_cpu=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.cpu, + min_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + max_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], + worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for - namespace=cluster.namespace(), + namespace=rc["metadata"]["namespace"], dashboard=route, ) -def _map_to_app_wrapper(cluster) -> AppWrapper: - cluster_model = cluster.model +def _map_to_app_wrapper(aw) -> AppWrapper: return AppWrapper( - name=cluster.name(), - status=AppWrapperStatus(cluster_model.status.state.lower()), - can_run=cluster_model.status.canrun, - job_state=cluster_model.status.queuejobstate, + name=aw["metadata"]["name"], + status=AppWrapperStatus(aw["status"]["state"].lower()), + can_run=aw["status"]["canrun"], + job_state=aw["status"]["queuejobstate"], ) diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 25392db75..692cb352f 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -19,9 +19,7 @@ """ from dataclasses import dataclass, field -from .auth import Authentication import pathlib -import openshift dir = pathlib.Path(__file__).parent.parent.resolve() From 000b277888068b4a02f31fc6a3bfedb57e1970f7 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Mon, 5 Jun 2023 16:39:02 -0400 Subject: [PATCH 2/9] Added namespace retrieval and dashboard route access via kubernetes --- src/codeflare_sdk/cluster/cluster.py | 65 ++++++++++++++++++++-------- src/codeflare_sdk/job/jobs.py | 4 +- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f537d8243..3b89a68a6 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -69,7 +69,7 @@ def create_app_wrapper(self): """ if self.config.namespace is None: - self.config.namespace = oc.get_project_name() + self.config.namespace = get_current_namespace() if type(self.config.namespace) is not str: raise TypeError( f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." @@ -265,16 +265,21 @@ def cluster_dashboard_uri(self) -> str: Returns a string containing the cluster's dashboard URI. """ try: - with oc.project(self.config.namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] - ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" + config.load_kube_config() + api_instance = client.CustomObjectsApi() + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=self.config.namespace, + plural="routes", + ) except: - return "Dashboard route not available yet, have you run cluster.up()?" + pass + + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}": + return f"http://{route['spec']['host']}" + return "Dashboard route not available yet, have you run cluster.up()?" def list_jobs(self) -> List: """ @@ -338,6 +343,19 @@ def list_all_queued(namespace: str, print_to_console: bool = True): return app_wrappers +def get_current_namespace(): + try: + _, active_context = config.list_kube_config_contexts() + except config.ConfigException: + raise PermissionError( + "Retrieving current namespace not permitted, have you put in correct/up-to-date auth credentials?" + ) + try: + return active_context["context"]["namespace"] + except KeyError: + return "default" + + # private methods @@ -467,12 +485,25 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: else: status = RayClusterStatus.UNKNOWN - with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60): - route = ( - oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}") - .object() - .model.spec.host - ) + config.load_kube_config() + api_instance = client.CustomObjectsApi() + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=rc["metadata"]["namespace"], + plural="routes", + ) + ray_route = None + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}": + ray_route = route["spec"]["host"] + + # with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60): + # route = ( + # oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}") + # .object() + # .model.spec.host + # ) return RayCluster( name=rc["metadata"]["name"], @@ -491,7 +522,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: ]["resources"]["limits"]["cpu"], worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for namespace=rc["metadata"]["namespace"], - dashboard=route, + dashboard=ray_route, ) diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py index 6b5ce0a53..b1db70d54 100644 --- a/src/codeflare_sdk/job/jobs.py +++ b/src/codeflare_sdk/job/jobs.py @@ -17,13 +17,13 @@ from typing import TYPE_CHECKING, Optional, Dict, List from pathlib import Path -import openshift as oc from torchx.components.dist import ddp from torchx.runner import get_runner from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo if TYPE_CHECKING: from ..cluster.cluster import Cluster +from ..cluster.cluster import get_current_namespace all_jobs: List["Job"] = [] torchx_runner = get_runner() @@ -124,7 +124,7 @@ def _missing_spec(self, spec: str): def _dry_run_no_cluster(self): if self.scheduler_args is not None: if self.scheduler_args.get("namespace") is None: - self.scheduler_args["namespace"] = oc.get_project_name() + self.scheduler_args["namespace"] = get_current_namespace() return torchx_runner.dryrun( app=ddp( *self.script_args, From d9f67aae67ef04e93dfdb74779dbd165f92b8038 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 6 Jun 2023 17:48:35 -0400 Subject: [PATCH 3/9] Added exception handling --- src/codeflare_sdk/cluster/cluster.py | 119 +++++++++------------------ 1 file changed, 39 insertions(+), 80 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 3b89a68a6..2e34bc986 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -22,7 +22,6 @@ from time import sleep from typing import List, Optional, Tuple, Dict -import openshift as oc from ray.job_submission import JobSubmissionClient from ..utils import pretty_print @@ -39,6 +38,7 @@ from kubernetes import client, config import yaml +import executing class Cluster: @@ -125,13 +125,8 @@ def up(self): plural="appwrappers", body=aw, ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - raise osp + except Exception as e: + return _kube_api_error_handling(e) def down(self): """ @@ -149,21 +144,10 @@ def down(self): plural="appwrappers", name=self.app_wrapper_name, ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you run auth.login()/cluster.up() yet?" - ) - elif "not found" in error_msg: - print("Cluster not found, have you run cluster.up() yet?") - else: - raise osp + except Exception as e: + return _kube_api_error_handling(e) + # elif "not found" in error_msg: + # print("Cluster not found, have you run cluster.up() yet?") def status( self, print_to_console: bool = True @@ -273,8 +257,8 @@ def cluster_dashboard_uri(self) -> str: namespace=self.config.namespace, plural="routes", ) - except: - pass + except Exception as e: + return _kube_api_error_handling(e) for route in routes["items"]: if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}": @@ -345,11 +329,10 @@ def list_all_queued(namespace: str, print_to_console: bool = True): def get_current_namespace(): try: + config.load_kube_config() _, active_context = config.list_kube_config_contexts() - except config.ConfigException: - raise PermissionError( - "Retrieving current namespace not permitted, have you put in correct/up-to-date auth credentials?" - ) + except Exception as e: + return _kube_api_error_handling(e) try: return active_context["context"]["namespace"] except KeyError: @@ -359,6 +342,25 @@ def get_current_namespace(): # private methods +def _kube_api_error_handling(e: Exception): + perm_msg = ( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + nf_msg = "No instances found, nothing to be done." + if type(e) == config.ConfigException: + raise PermissionError(perm_msg) + if type(e) == executing.executing.NotOneValueFound: + print(nf_msg) + return + if type(e) == client.ApiException: + if e.reason == "Not Found": + print(nf_msg) + return + elif e.reason == "Unauthorized" or e.reason == "Forbidden": + raise PermissionError(perm_msg) + raise e + + def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: try: config.load_kube_config() @@ -369,15 +371,8 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: namespace=namespace, plural="appwrappers", ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "appwrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp + except Exception as e: + return _kube_api_error_handling(e) for aw in aws["items"]: if aw["metadata"]["name"] == name: @@ -395,15 +390,8 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: namespace=namespace, plural="rayclusters", ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp + except Exception as e: + return _kube_api_error_handling(e) for rc in rcs["items"]: if rc["metadata"]["name"] == name: @@ -422,19 +410,8 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: namespace=namespace, plural="rayclusters", ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + except Exception as e: + return _kube_api_error_handling(e) for rc in rcs["items"]: list_of_clusters.append(_map_to_ray_cluster(rc)) @@ -455,19 +432,8 @@ def _get_app_wrappers( namespace=namespace, plural="appwrappers", ) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "appwrappers"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + except Exception as e: + return _kube_api_error_handling(e) for item in aws["items"]: app_wrapper = _map_to_app_wrapper(item) @@ -498,13 +464,6 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}": ray_route = route["spec"]["host"] - # with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60): - # route = ( - # oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}") - # .object() - # .model.spec.host - # ) - return RayCluster( name=rc["metadata"]["name"], status=status, From 8d40bf8f23406a937aa3c8644a72c4961e997318 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Thu, 8 Jun 2023 15:13:33 -0400 Subject: [PATCH 4/9] Remove unnecessary comment --- src/codeflare_sdk/cluster/cluster.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 2e34bc986..bffca0d6a 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -146,8 +146,6 @@ def down(self): ) except Exception as e: return _kube_api_error_handling(e) - # elif "not found" in error_msg: - # print("Cluster not found, have you run cluster.up() yet?") def status( self, print_to_console: bool = True From 9b399104e2ec3de12309c111d398e27ee3cd1628 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Thu, 8 Jun 2023 17:51:25 -0400 Subject: [PATCH 5/9] Change AW fs loading to k8s and begin converting unit tests --- src/codeflare_sdk/cluster/awload.py | 65 +++++++++++++--------------- src/codeflare_sdk/cluster/cluster.py | 7 ++- tests/unit_test.py | 34 ++++++++++----- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py index 5621d6734..25f614232 100644 --- a/src/codeflare_sdk/cluster/awload.py +++ b/src/codeflare_sdk/cluster/awload.py @@ -23,6 +23,9 @@ import openshift as oc import yaml +from kubernetes import client, config +from .cluster import _kube_api_error_handling + class AWManager: """ @@ -40,10 +43,10 @@ def __init__(self, filename: str) -> None: self.filename = filename try: with open(self.filename) as f: - awyaml = yaml.load(f, Loader=yaml.FullLoader) - assert awyaml["kind"] == "AppWrapper" - self.name = awyaml["metadata"]["name"] - self.namespace = awyaml["metadata"]["namespace"] + self.awyaml = yaml.load(f, Loader=yaml.FullLoader) + assert self.awyaml["kind"] == "AppWrapper" + self.name = self.awyaml["metadata"]["name"] + self.namespace = self.awyaml["metadata"]["namespace"] except: raise ValueError( f"{filename } is not a correctly formatted AppWrapper yaml" @@ -55,19 +58,17 @@ def submit(self) -> None: Attempts to create the AppWrapper custom resource using the yaml file """ try: - with oc.project(self.namespace): - oc.invoke("create", ["-f", self.filename]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg or "Forbidden" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "AlreadyExists" in error_msg: - raise FileExistsError( - f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}" - ) - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + body=self.awyaml, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = True print(f"AppWrapper {self.filename} submitted!") @@ -82,25 +83,17 @@ def remove(self) -> None: return try: - with oc.project(self.namespace): - oc.invoke("delete", ["AppWrapper", self.name]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "not found" in error_msg: - self.submitted = False - print("AppWrapper not found, was deleted in another manner") - return - else: - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + name=self.name, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = False print(f"AppWrapper {self.name} removed!") diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index bffca0d6a..448dc9684 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -325,7 +325,7 @@ def list_all_queued(namespace: str, print_to_console: bool = True): return app_wrappers -def get_current_namespace(): +def get_current_namespace(): # pragma: no cover try: config.load_kube_config() _, active_context = config.list_kube_config_contexts() @@ -340,11 +340,12 @@ def get_current_namespace(): # private methods -def _kube_api_error_handling(e: Exception): +def _kube_api_error_handling(e: Exception): # pragma: no cover perm_msg = ( "Action not permitted, have you put in correct/up-to-date auth credentials?" ) nf_msg = "No instances found, nothing to be done." + exists_msg = "Resource with this name already exists." if type(e) == config.ConfigException: raise PermissionError(perm_msg) if type(e) == executing.executing.NotOneValueFound: @@ -356,6 +357,8 @@ def _kube_api_error_handling(e: Exception): return elif e.reason == "Unauthorized" or e.reason == "Forbidden": raise PermissionError(perm_msg) + elif e.reason == "Conflict": + raise FileExistsError(exists_msg) raise e diff --git a/tests/unit_test.py b/tests/unit_test.py index ead3521c8..29ec91e8e 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -70,6 +70,7 @@ from torchx.schedulers.ray_scheduler import RayJob from torchx.schedulers.kubernetes_mcad_scheduler import KubernetesMCADJob import pytest +import yaml # For mocking openshift client results @@ -249,7 +250,7 @@ def test_cluster_creation(): def test_default_cluster_creation(mocker): mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.cluster.cluster.get_current_namespace", return_value="opendatahub", ) default_config = ClusterConfiguration( @@ -264,27 +265,38 @@ def test_default_cluster_creation(mocker): return cluster -def arg_check_apply_effect(*args): - assert args[0] == "apply" - assert args[1] == ["-f", "unit-test-cluster.yaml"] +def arg_check_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("unit-test-cluster.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_del_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "unit-test-cluster"] +def arg_check_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "unit-test-cluster" + assert args == tuple() def test_cluster_up_down(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.login", return_value="ignore" + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_apply_effect, ) mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.logout", return_value="ignore" + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_del_effect, ) - mocker.patch("openshift.invoke", side_effect=arg_check_apply_effect) cluster = test_cluster_creation() cluster.up() - mocker.patch("openshift.invoke", side_effect=arg_check_del_effect) cluster.down() From ca006baaf302ec3d801ae9006b7c8dd5db8df725 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Fri, 9 Jun 2023 14:12:26 -0400 Subject: [PATCH 6/9] Finished unit test update --- src/codeflare_sdk/cluster/cluster.py | 14 +- tests/unit_test.py | 1733 ++++++++++++++------------ 2 files changed, 937 insertions(+), 810 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 448dc9684..26f7ed62a 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -125,7 +125,7 @@ def up(self): plural="appwrappers", body=aw, ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def down(self): @@ -144,7 +144,7 @@ def down(self): plural="appwrappers", name=self.app_wrapper_name, ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def status( @@ -255,7 +255,7 @@ def cluster_dashboard_uri(self) -> str: namespace=self.config.namespace, plural="routes", ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for route in routes["items"]: @@ -372,7 +372,7 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: namespace=namespace, plural="appwrappers", ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for aw in aws["items"]: @@ -391,7 +391,7 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: namespace=namespace, plural="rayclusters", ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for rc in rcs["items"]: @@ -411,7 +411,7 @@ def _get_ray_clusters(namespace="default") -> List[RayCluster]: namespace=namespace, plural="rayclusters", ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for rc in rcs["items"]: @@ -433,7 +433,7 @@ def _get_app_wrappers( namespace=namespace, plural="appwrappers", ) - except Exception as e: + except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for item in aws["items"]: diff --git a/tests/unit_test.py b/tests/unit_test.py index 29ec91e8e..7225b6725 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -29,6 +29,8 @@ list_all_clusters, list_all_queued, _copy_to_ray, + _app_wrapper_status, + _ray_cluster_status, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, @@ -300,14 +302,68 @@ def test_cluster_up_down(mocker): cluster.down() -def out_route(self): - return "ray-dashboard-raycluster-autoscaler-ns.apps.cluster.awsroute.org ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" +def aw_status_fields(group, version, namespace, plural, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "test-ns" + assert plural == "appwrappers" + assert args == tuple() + return {"items": []} + + +def test_aw_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=aw_status_fields, + ) + aw = _app_wrapper_status("test-aw", "test-ns") + assert aw == None + + +def rc_status_fields(group, version, namespace, plural, *args): + assert group == "ray.io" + assert version == "v1alpha1" + assert namespace == "test-ns" + assert plural == "rayclusters" + assert args == tuple() + return {"items": []} + + +def test_rc_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=rc_status_fields, + ) + rc = _ray_cluster_status("test-rc", "test-ns") + assert rc == None + + +def uri_retreival(group, version, namespace, plural, *args): + assert group == "route.openshift.io" + assert version == "v1" + assert namespace == "ns" + assert plural == "routes" + assert args == tuple() + return { + "items": [ + { + "metadata": {"name": "ray-dashboard-unit-test-cluster"}, + "spec": { + "host": "ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" + }, + } + ] + } def test_cluster_uris(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() assert cluster.cluster_uri() == "ray://unit-test-cluster-head-svc.ns.svc:10001" @@ -327,9 +383,11 @@ def ray_addr(self, *args): def test_ray_job_wrapping(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() mocker.patch( @@ -415,7 +473,7 @@ def test_print_appwrappers(capsys): ) -def test_ray_details(capsys): +def test_ray_details(mocker, capsys): ray1 = RayCluster( name="raytest1", status=RayClusterStatus.READY, @@ -428,6 +486,14 @@ def test_ray_details(capsys): namespace="ns", dashboard="fake-uri", ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cf = Cluster(ClusterConfiguration(name="raytest2", namespace="ns")) captured = capsys.readouterr() ray2 = _copy_to_ray(cf) @@ -532,223 +598,150 @@ def act_side_effect_list(self): return [self] -def get_selector(*args): - selector = Selector({"operation": "selector", "status": 0, "actions": []}) - return selector +def get_obj_none(group, version, namespace, plural): + return {"items": []} -def get_obj_none(): - return [] - - -def get_ray_obj(cls=None): - api_obj = openshift.apiobject.APIObject( - { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 1, - "labels": { - "appwrapper.mcad.ibm.com": "quicktest", - "controller-tools.k8s.io": "1.0", - "resourceName": "quicktest", - }, - "managedFields": [ - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:labels": { - ".": {}, - "f:appwrapper.mcad.ibm.com": {}, - "f:controller-tools.k8s.io": {}, - "f:resourceName": {}, - }, - "f:ownerReferences": { - ".": {}, - 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, - }, - }, - "f:spec": { - ".": {}, - "f:autoscalerOptions": { - ".": {}, - "f:idleTimeoutSeconds": {}, - "f:imagePullPolicy": {}, - "f:resources": { +def get_ray_obj(group, version, namespace, plural, cls=None): + api_obj = { + "items": [ + { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 1, + "labels": { + "appwrapper.mcad.ibm.com": "quicktest", + "controller-tools.k8s.io": "1.0", + "resourceName": "quicktest", + }, + "managedFields": [ + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:labels": { ".": {}, - "f:limits": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - "f:requests": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, + "f:appwrapper.mcad.ibm.com": {}, + "f:controller-tools.k8s.io": {}, + "f:resourceName": {}, + }, + "f:ownerReferences": { + ".": {}, + 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, }, - "f:upscalingMode": {}, }, - "f:enableInTreeAutoscaling": {}, - "f:headGroupSpec": { + "f:spec": { ".": {}, - "f:rayStartParams": { + "f:autoscalerOptions": { ".": {}, - "f:block": {}, - "f:dashboard-host": {}, - "f:num-gpus": {}, + "f:idleTimeoutSeconds": {}, + "f:imagePullPolicy": {}, + "f:resources": { + ".": {}, + "f:limits": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + "f:requests": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + }, + "f:upscalingMode": {}, }, - "f:serviceType": {}, - "f:template": { + "f:enableInTreeAutoscaling": {}, + "f:headGroupSpec": { ".": {}, - "f:spec": {".": {}, "f:containers": {}}, + "f:rayStartParams": { + ".": {}, + "f:block": {}, + "f:dashboard-host": {}, + "f:num-gpus": {}, + }, + "f:serviceType": {}, + "f:template": { + ".": {}, + "f:spec": {".": {}, "f:containers": {}}, + }, }, + "f:rayVersion": {}, + "f:workerGroupSpecs": {}, }, - "f:rayVersion": {}, - "f:workerGroupSpecs": {}, }, + "manager": "mcad-controller", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "mcad-controller", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:status": { - ".": {}, - "f:availableWorkerReplicas": {}, - "f:desiredWorkerReplicas": {}, - "f:endpoints": { + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:status": { ".": {}, - "f:client": {}, - "f:dashboard": {}, - "f:gcs": {}, - }, - "f:lastUpdateTime": {}, - "f:maxWorkerReplicas": {}, - "f:minWorkerReplicas": {}, - "f:state": {}, - } - }, - "manager": "manager", - "operation": "Update", - "subresource": "status", - "time": "2023-02-22T16:26:16Z", - }, - ], - "name": "quicktest", - "namespace": "ns", - "ownerReferences": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "blockOwnerDeletion": True, - "controller": True, - "kind": "AppWrapper", - "name": "quicktest", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - } - ], - "resourceVersion": "9482407", - "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", - }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": {"cpu": "500m", "memory": "512Mi"}, - "requests": {"cpu": "500m", "memory": "512Mi"}, - }, - "upscalingMode": "Default", - }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", - }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": ["/bin/sh", "-c", "ray stop"] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - "protocol": "TCP", - }, - { - "containerPort": 8265, - "name": "dashboard", - "protocol": "TCP", - }, - { - "containerPort": 10001, - "name": "client", - "protocol": "TCP", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, + "f:availableWorkerReplicas": {}, + "f:desiredWorkerReplicas": {}, + "f:endpoints": { + ".": {}, + "f:client": {}, + "f:dashboard": {}, + "f:gcs": {}, }, + "f:lastUpdateTime": {}, + "f:maxWorkerReplicas": {}, + "f:minWorkerReplicas": {}, + "f:state": {}, } - ] + }, + "manager": "manager", + "operation": "Update", + "subresource": "status", + "time": "2023-02-22T16:26:16Z", + }, + ], + "name": "quicktest", + "namespace": "ns", + "ownerReferences": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "blockOwnerDeletion": True, + "controller": True, + "kind": "AppWrapper", + "name": "quicktest", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", } - }, + ], + "resourceVersion": "9482407", + "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, - "replicas": 1, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": {"cpu": "500m", "memory": "512Mi"}, + "requests": {"cpu": "500m", "memory": "512Mi"}, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -760,262 +753,271 @@ def get_ray_obj(cls=None): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + "protocol": "TCP", + }, + { + "containerPort": 8265, + "name": "dashboard", + "protocol": "TCP", + }, + { + "containerPort": 10001, + "name": "client", + "protocol": "TCP", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, - }, - } - ], - }, - "status": { - "availableWorkerReplicas": 2, - "desiredWorkerReplicas": 1, - "endpoints": {"client": "10001", "dashboard": "8265", "gcs": "6379"}, - "lastUpdateTime": "2023-02-22T16:26:16Z", - "maxWorkerReplicas": 1, - "minWorkerReplicas": 1, - "state": "ready", - }, - } - ) - return [api_obj] - - -def get_aw_obj(): - api_obj1 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, + ] + } }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": {"block": "true", "num-gpus": "0"}, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, }, - }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + } + ], + }, + "status": { + "availableWorkerReplicas": 2, + "desiredWorkerReplicas": 1, + "endpoints": { + "client": "10001", + "dashboard": "8265", + "gcs": "6379", }, - ], - "name": "quicktest1", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ + "lastUpdateTime": "2023-02-22T16:26:16Z", + "maxWorkerReplicas": 1, + "minWorkerReplicas": 1, + "state": "ready", + }, + } + ] + } + return api_obj + + +def get_aw_obj(group, version, namespace, plural): + api_obj1 = { + "items": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' + }, + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "f:status": { + ".": {}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest1", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest1", - "namespace": "ns", + }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, + }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest1", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest1", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest1", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1027,317 +1029,318 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Running", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Running", - "systempriority": 9, }, - } - ) - api_obj2 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, - }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, + }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, + }, + "f:status": { ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, + }, }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - ], - "name": "quicktest2", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest2", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest2", - "namespace": "ns", + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest2", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest2", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest2", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1349,114 +1352,190 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Pending", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Pending", - "systempriority": 9, }, - } - ) - return [api_obj1, api_obj2] + ] + } + return api_obj1 def test_list_clusters(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1464,7 +1543,10 @@ def test_list_clusters(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_ray_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1490,9 +1572,11 @@ def test_list_clusters(mocker, capsys): def test_list_queue(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1500,7 +1584,10 @@ def test_list_queue(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_aw_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_aw_obj, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1520,6 +1607,7 @@ def test_list_queue(mocker, capsys): def test_cluster_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") fake_aw = AppWrapper( "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused" ) @@ -1536,6 +1624,8 @@ def test_cluster_status(mocker): dashboard="fake-uri", ) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) status, ready = cf.status() assert status == CodeFlareClusterStatus.UNKNOWN assert ready == False @@ -1597,6 +1687,9 @@ def test_cluster_status(mocker): def test_wait_ready(mocker, capsys): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) try: cf.wait_ready(timeout=5) @@ -1668,12 +1761,17 @@ def test_DDPJobDefinition_creation(): return ddp -def test_DDPJobDefinition_dry_run(): +def test_DDPJobDefinition_dry_run(mocker): """ Test that the dry run method returns the correct type: AppDryRunInfo, that the attributes of the returned object are of the correct type, and that the values from cluster and job definition are correctly passed. """ + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) ddp = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) ddp_job = ddp._dry_run(cluster) @@ -1706,7 +1804,7 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1738,11 +1836,15 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): assert ddp_job._scheduler == "kubernetes_mcad" -def test_DDPJobDefinition_dry_run_no_resource_args(): +def test_DDPJobDefinition_dry_run_no_resource_args(mocker): """ Test that the dry run correctly gets resources from the cluster object when the job definition does not specify resources. """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cluster = Cluster(test_config_creation()) ddp = DDPJobDefinition( script="test.py", @@ -1775,7 +1877,7 @@ def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1827,11 +1929,15 @@ def test_DDPJobDefinition_submit(mocker): Tests that the submit method returns the correct type: DDPJob And that the attributes of the returned object are of the correct type """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1854,6 +1960,10 @@ def test_DDPJobDefinition_submit(mocker): def test_DDPJob_creation(mocker): + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( @@ -1880,8 +1990,8 @@ def test_DDPJob_creation_no_cluster(mocker): ddp_def = test_DDPJobDefinition_creation() ddp_def.image = "fake-image" mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1972,14 +2082,24 @@ def test_AWManager_creation(): ) -def arg_check_aw_create_effect(*args): - assert args[0] == "create" - assert args[1] == ["-f", "test.yaml"] +def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("test.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_aw_delete_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "test"] +def arg_check_aw_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "test" + assert args == tuple() def test_AWManager_submit_remove(mocker, capsys): @@ -1991,10 +2111,17 @@ def test_AWManager_submit_remove(mocker, capsys): == "AppWrapper not submitted by this manager yet, nothing to remove\n" ) assert testaw.submitted == False - mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_aw_apply_effect, + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_aw_del_effect, + ) testaw.submit() assert testaw.submitted == True - mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect) testaw.remove() assert testaw.submitted == False From 18ae25e418921eb404865e221b00b41de5f8ce96 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Fri, 9 Jun 2023 14:15:34 -0400 Subject: [PATCH 7/9] Update requirements --- pyproject.toml | 1 + requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 79b90cb37..a7bac3c01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ ray = {version = "2.1.0", extras = ["default"]} kubernetes = ">= 25.3.0, < 27" codeflare-torchx = "0.6.0.dev0" cryptography = "40.0.2" +executing = "1.2.0" [tool.poetry.group.docs] optional = true diff --git a/requirements.txt b/requirements.txt index e529bc39f..b0d3b410c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ rich==12.5.1 ray[default]==2.1.0 kubernetes>=25.3.0,<27 codeflare-torchx==0.6.0.dev0 +cryptography==40.0.2 +executing==1.2.0 From 687e31d5f32a2b5f92c6e5e5ff9f6fd4f9720587 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:00:07 -0700 Subject: [PATCH 8/9] Add: get_cluster function to get cluster with specified name and namespace --- src/codeflare_sdk/cluster/cluster.py | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 26f7ed62a..a06696978 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -335,6 +335,54 @@ def get_current_namespace(): # pragma: no cover return active_context["context"]["namespace"] except KeyError: return "default" + + +def get_cluster(cluster_name: str, namespace: str = "default"): + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + machine_types = rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] + local_interactive = "volumeMounts" in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], + min_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"][:-1]), + max_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"][:-1]), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + raise FileNotFoundError(f'Cluster {cluster_name} is not found in {namespace} namespace') + # private methods From a23081492b172ba8a580166d42286eeeae467fd5 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:06:55 -0700 Subject: [PATCH 9/9] Test: make unit tests for get_cluster function --- src/codeflare_sdk/cluster/cluster.py | 1 - tests/unit_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index a06696978..ab833e119 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -384,7 +384,6 @@ def get_cluster(cluster_name: str, namespace: str = "default"): raise FileNotFoundError(f'Cluster {cluster_name} is not found in {namespace} namespace') - # private methods diff --git a/tests/unit_test.py b/tests/unit_test.py index 7225b6725..83d7729ab 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -29,6 +29,7 @@ list_all_clusters, list_all_queued, _copy_to_ray, + get_cluster, _app_wrapper_status, _ray_cluster_status, ) @@ -615,6 +616,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest", + "orderedinstance": "m4.xlarge_g4dn.xlarge", }, "managedFields": [ { @@ -1530,6 +1532,23 @@ def get_aw_obj(group, version, namespace, plural): return api_obj1 +def test_get_cluster(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert "m4.xlarge" in cluster_config.machine_types and "g4dn.xlarge" in cluster_config.machine_types + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + + def test_list_clusters(mocker, capsys): mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch(