diff --git a/pyproject.toml b/pyproject.toml index 79b90cb37..a7bac3c01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ ray = {version = "2.1.0", extras = ["default"]} kubernetes = ">= 25.3.0, < 27" codeflare-torchx = "0.6.0.dev0" cryptography = "40.0.2" +executing = "1.2.0" [tool.poetry.group.docs] optional = true diff --git a/requirements.txt b/requirements.txt index e529bc39f..b0d3b410c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ rich==12.5.1 ray[default]==2.1.0 kubernetes>=25.3.0,<27 codeflare-torchx==0.6.0.dev0 +cryptography==40.0.2 +executing==1.2.0 diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py index 5621d6734..25f614232 100644 --- a/src/codeflare_sdk/cluster/awload.py +++ b/src/codeflare_sdk/cluster/awload.py @@ -23,6 +23,9 @@ import openshift as oc import yaml +from kubernetes import client, config +from .cluster import _kube_api_error_handling + class AWManager: """ @@ -40,10 +43,10 @@ def __init__(self, filename: str) -> None: self.filename = filename try: with open(self.filename) as f: - awyaml = yaml.load(f, Loader=yaml.FullLoader) - assert awyaml["kind"] == "AppWrapper" - self.name = awyaml["metadata"]["name"] - self.namespace = awyaml["metadata"]["namespace"] + self.awyaml = yaml.load(f, Loader=yaml.FullLoader) + assert self.awyaml["kind"] == "AppWrapper" + self.name = self.awyaml["metadata"]["name"] + self.namespace = self.awyaml["metadata"]["namespace"] except: raise ValueError( f"{filename } is not a correctly formatted AppWrapper yaml" @@ -55,19 +58,17 @@ def submit(self) -> None: Attempts to create the AppWrapper custom resource using the yaml file """ try: - with oc.project(self.namespace): - oc.invoke("create", ["-f", self.filename]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg or "Forbidden" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "AlreadyExists" in error_msg: - raise FileExistsError( - f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}" - ) - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + body=self.awyaml, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = True print(f"AppWrapper {self.filename} submitted!") @@ -82,25 +83,17 @@ def remove(self) -> None: return try: - with oc.project(self.namespace): - oc.invoke("delete", ["AppWrapper", self.name]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "not found" in error_msg: - self.submitted = False - print("AppWrapper not found, was deleted in another manner") - return - else: - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + name=self.name, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = False print(f"AppWrapper {self.name} removed!") diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c09e981b3..73d7521e0 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -22,7 +22,6 @@ from time import sleep from typing import List, Optional, Tuple, Dict -import openshift as oc from ray.job_submission import JobSubmissionClient from ..utils import pretty_print @@ -36,6 +35,11 @@ RayClusterStatus, ) +from kubernetes import client, config + +import yaml +import executing + class Cluster: """ @@ -65,7 +69,7 @@ def create_app_wrapper(self): """ if self.config.namespace is None: - self.config.namespace = oc.get_project_name() + self.config.namespace = get_current_namespace() if type(self.config.namespace) is not str: raise TypeError( f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." @@ -112,15 +116,19 @@ def up(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + with open(self.app_wrapper_yaml) as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + body=aw, + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) def down(self): """ @@ -129,23 +137,17 @@ def down(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you run auth.login()/cluster.up() yet?" - ) - elif "not found" in error_msg: - print("Cluster not found, have you run cluster.up() yet?") - else: - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + name=self.app_wrapper_name, + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) def status( self, print_to_console: bool = True @@ -247,16 +249,21 @@ def cluster_dashboard_uri(self) -> str: Returns a string containing the cluster's dashboard URI. """ try: - with oc.project(self.config.namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] - ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" - except: - return "Dashboard route not available yet, have you run cluster.up()?" + config.load_kube_config() + api_instance = client.CustomObjectsApi() + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=self.config.namespace, + plural="routes", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}": + return f"http://{route['spec']['host']}" + return "Dashboard route not available yet, have you run cluster.up()?" def list_jobs(self) -> List: """ @@ -320,78 +327,144 @@ def list_all_queued(namespace: str, print_to_console: bool = True): return app_wrappers +def get_current_namespace(): # pragma: no cover + try: + config.load_kube_config() + _, active_context = config.list_kube_config_contexts() + except Exception as e: + return _kube_api_error_handling(e) + try: + return active_context["context"]["namespace"] + except KeyError: + return "default" + + +def get_cluster(cluster_name: str, namespace: str = "default"): + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + machine_types = rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] + local_interactive = "volumeMounts" in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], + min_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"][:-1]), + max_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"][:-1]), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + raise FileNotFoundError(f'Cluster {cluster_name} is not found in {namespace} namespace') + + # private methods +def _kube_api_error_handling(e: Exception): # pragma: no cover + perm_msg = ( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + nf_msg = "No instances found, nothing to be done." + exists_msg = "Resource with this name already exists." + if type(e) == config.ConfigException: + raise PermissionError(perm_msg) + if type(e) == executing.executing.NotOneValueFound: + print(nf_msg) + return + if type(e) == client.ApiException: + if e.reason == "Not Found": + print(nf_msg) + return + elif e.reason == "Unauthorized" or e.reason == "Forbidden": + raise PermissionError(perm_msg) + elif e.reason == "Conflict": + raise FileExistsError(exists_msg) + raise e + + def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"appwrapper/{name}").object() - except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "appwrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp - - if cluster: - return _map_to_app_wrapper(cluster) - - return cluster + config.load_kube_config() + api_instance = client.CustomObjectsApi() + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for aw in aws["items"]: + if aw["metadata"]["name"] == name: + return _map_to_app_wrapper(aw) + return None def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"rayclusters/{name}").object() - except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp - - if cluster: - return _map_to_ray_cluster(cluster) - - return cluster + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for rc in rcs["items"]: + if rc["metadata"]["name"] == name: + return _map_to_ray_cluster(rc) + return None def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - ray_clusters = oc.selector("rayclusters").objects() - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) - for cluster in ray_clusters: - list_of_clusters.append(_map_to_ray_cluster(cluster)) + for rc in rcs["items"]: + list_of_clusters.append(_map_to_ray_cluster(rc)) return list_of_clusters @@ -401,23 +474,18 @@ def _get_app_wrappers( list_of_app_wrappers = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - app_wrappers = oc.selector("appwrappers").objects() - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "appwrappers"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + config.load_kube_config() + api_instance = client.CustomObjectsApi() + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) - for item in app_wrappers: + for item in aws["items"]: app_wrapper = _map_to_app_wrapper(item) if filter and app_wrapper.status in filter: list_of_app_wrappers.append(app_wrapper) @@ -427,48 +495,52 @@ def _get_app_wrappers( return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: - cluster_model = cluster.model - if type(cluster_model.status.state) == oc.model.MissingModel: - status = RayClusterStatus.UNKNOWN +def _map_to_ray_cluster(rc) -> Optional[RayCluster]: + if "state" in rc["status"]: + status = RayClusterStatus(rc["status"]["state"].lower()) else: - status = RayClusterStatus(cluster_model.status.state.lower()) + status = RayClusterStatus.UNKNOWN - with oc.project(cluster.namespace()), oc.timeout(10 * 60): - route = ( - oc.selector(f"route/ray-dashboard-{cluster.name()}") - .object() - .model.spec.host - ) + config.load_kube_config() + api_instance = client.CustomObjectsApi() + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=rc["metadata"]["namespace"], + plural="routes", + ) + ray_route = None + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}": + ray_route = route["spec"]["host"] return RayCluster( - name=cluster.name(), + name=rc["metadata"]["name"], status=status, # for now we are not using autoscaling so same replicas is fine - min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - worker_mem_max=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.memory, - worker_mem_min=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.requests.memory, - worker_cpu=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.cpu, + min_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + max_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], + worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for - namespace=cluster.namespace(), - dashboard=route, + namespace=rc["metadata"]["namespace"], + dashboard=ray_route, ) -def _map_to_app_wrapper(cluster) -> AppWrapper: - cluster_model = cluster.model +def _map_to_app_wrapper(aw) -> AppWrapper: return AppWrapper( - name=cluster.name(), - status=AppWrapperStatus(cluster_model.status.state.lower()), - can_run=cluster_model.status.canrun, - job_state=cluster_model.status.queuejobstate, + name=aw["metadata"]["name"], + status=AppWrapperStatus(aw["status"]["state"].lower()), + can_run=aw["status"]["canrun"], + job_state=aw["status"]["queuejobstate"], ) diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index a00d128a2..46aa69b89 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -19,9 +19,7 @@ """ from dataclasses import dataclass, field -from .auth import Authentication import pathlib -import openshift dir = pathlib.Path(__file__).parent.parent.resolve() diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py index 6b5ce0a53..b1db70d54 100644 --- a/src/codeflare_sdk/job/jobs.py +++ b/src/codeflare_sdk/job/jobs.py @@ -17,13 +17,13 @@ from typing import TYPE_CHECKING, Optional, Dict, List from pathlib import Path -import openshift as oc from torchx.components.dist import ddp from torchx.runner import get_runner from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo if TYPE_CHECKING: from ..cluster.cluster import Cluster +from ..cluster.cluster import get_current_namespace all_jobs: List["Job"] = [] torchx_runner = get_runner() @@ -124,7 +124,7 @@ def _missing_spec(self, spec: str): def _dry_run_no_cluster(self): if self.scheduler_args is not None: if self.scheduler_args.get("namespace") is None: - self.scheduler_args["namespace"] = oc.get_project_name() + self.scheduler_args["namespace"] = get_current_namespace() return torchx_runner.dryrun( app=ddp( *self.script_args, diff --git a/tests/unit_test.py b/tests/unit_test.py index d6e6c6ef6..f0934e68f 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -29,6 +29,9 @@ list_all_clusters, list_all_queued, _copy_to_ray, + get_cluster, + _app_wrapper_status, + _ray_cluster_status, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, @@ -70,6 +73,7 @@ from torchx.schedulers.ray_scheduler import RayJob from torchx.schedulers.kubernetes_mcad_scheduler import KubernetesMCADJob import pytest +import yaml # For mocking openshift client results @@ -251,7 +255,7 @@ def test_cluster_creation(): def test_default_cluster_creation(mocker): mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.cluster.cluster.get_current_namespace", return_value="opendatahub", ) default_config = ClusterConfiguration( @@ -266,38 +270,103 @@ def test_default_cluster_creation(mocker): return cluster -def arg_check_apply_effect(*args): - assert args[0] == "apply" - assert args[1] == ["-f", "unit-test-cluster.yaml"] +def arg_check_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("unit-test-cluster.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_del_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "unit-test-cluster"] +def arg_check_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "unit-test-cluster" + assert args == tuple() def test_cluster_up_down(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.login", return_value="ignore" + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_apply_effect, ) mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.logout", return_value="ignore" + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_del_effect, ) - mocker.patch("openshift.invoke", side_effect=arg_check_apply_effect) cluster = test_cluster_creation() cluster.up() - mocker.patch("openshift.invoke", side_effect=arg_check_del_effect) cluster.down() -def out_route(self): - return "ray-dashboard-raycluster-autoscaler-ns.apps.cluster.awsroute.org ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" +def aw_status_fields(group, version, namespace, plural, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "test-ns" + assert plural == "appwrappers" + assert args == tuple() + return {"items": []} + + +def test_aw_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=aw_status_fields, + ) + aw = _app_wrapper_status("test-aw", "test-ns") + assert aw == None + + +def rc_status_fields(group, version, namespace, plural, *args): + assert group == "ray.io" + assert version == "v1alpha1" + assert namespace == "test-ns" + assert plural == "rayclusters" + assert args == tuple() + return {"items": []} + + +def test_rc_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=rc_status_fields, + ) + rc = _ray_cluster_status("test-rc", "test-ns") + assert rc == None + + +def uri_retreival(group, version, namespace, plural, *args): + assert group == "route.openshift.io" + assert version == "v1" + assert namespace == "ns" + assert plural == "routes" + assert args == tuple() + return { + "items": [ + { + "metadata": {"name": "ray-dashboard-unit-test-cluster"}, + "spec": { + "host": "ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" + }, + } + ] + } def test_cluster_uris(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() assert cluster.cluster_uri() == "ray://unit-test-cluster-head-svc.ns.svc:10001" @@ -317,9 +386,11 @@ def ray_addr(self, *args): def test_ray_job_wrapping(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() mocker.patch( @@ -405,7 +476,7 @@ def test_print_appwrappers(capsys): ) -def test_ray_details(capsys): +def test_ray_details(mocker, capsys): ray1 = RayCluster( name="raytest1", status=RayClusterStatus.READY, @@ -418,6 +489,14 @@ def test_ray_details(capsys): namespace="ns", dashboard="fake-uri", ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cf = Cluster(ClusterConfiguration(name="raytest2", namespace="ns")) captured = capsys.readouterr() ray2 = _copy_to_ray(cf) @@ -522,223 +601,151 @@ def act_side_effect_list(self): return [self] -def get_selector(*args): - selector = Selector({"operation": "selector", "status": 0, "actions": []}) - return selector - - -def get_obj_none(): - return [] - - -def get_ray_obj(cls=None): - api_obj = openshift.apiobject.APIObject( - { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 1, - "labels": { - "appwrapper.mcad.ibm.com": "quicktest", - "controller-tools.k8s.io": "1.0", - "resourceName": "quicktest", - }, - "managedFields": [ - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:labels": { - ".": {}, - "f:appwrapper.mcad.ibm.com": {}, - "f:controller-tools.k8s.io": {}, - "f:resourceName": {}, - }, - "f:ownerReferences": { - ".": {}, - 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, - }, - }, - "f:spec": { - ".": {}, - "f:autoscalerOptions": { - ".": {}, - "f:idleTimeoutSeconds": {}, - "f:imagePullPolicy": {}, - "f:resources": { +def get_obj_none(group, version, namespace, plural): + return {"items": []} + + +def get_ray_obj(group, version, namespace, plural, cls=None): + api_obj = { + "items": [ + { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 1, + "labels": { + "appwrapper.mcad.ibm.com": "quicktest", + "controller-tools.k8s.io": "1.0", + "resourceName": "quicktest", + "orderedinstance": "m4.xlarge_g4dn.xlarge", + }, + "managedFields": [ + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:labels": { ".": {}, - "f:limits": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - "f:requests": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, + "f:appwrapper.mcad.ibm.com": {}, + "f:controller-tools.k8s.io": {}, + "f:resourceName": {}, + }, + "f:ownerReferences": { + ".": {}, + 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, }, - "f:upscalingMode": {}, }, - "f:enableInTreeAutoscaling": {}, - "f:headGroupSpec": { + "f:spec": { ".": {}, - "f:rayStartParams": { + "f:autoscalerOptions": { ".": {}, - "f:block": {}, - "f:dashboard-host": {}, - "f:num-gpus": {}, + "f:idleTimeoutSeconds": {}, + "f:imagePullPolicy": {}, + "f:resources": { + ".": {}, + "f:limits": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + "f:requests": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + }, + "f:upscalingMode": {}, }, - "f:serviceType": {}, - "f:template": { + "f:enableInTreeAutoscaling": {}, + "f:headGroupSpec": { ".": {}, - "f:spec": {".": {}, "f:containers": {}}, + "f:rayStartParams": { + ".": {}, + "f:block": {}, + "f:dashboard-host": {}, + "f:num-gpus": {}, + }, + "f:serviceType": {}, + "f:template": { + ".": {}, + "f:spec": {".": {}, "f:containers": {}}, + }, }, + "f:rayVersion": {}, + "f:workerGroupSpecs": {}, }, - "f:rayVersion": {}, - "f:workerGroupSpecs": {}, }, + "manager": "mcad-controller", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "mcad-controller", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:status": { - ".": {}, - "f:availableWorkerReplicas": {}, - "f:desiredWorkerReplicas": {}, - "f:endpoints": { + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:status": { ".": {}, - "f:client": {}, - "f:dashboard": {}, - "f:gcs": {}, - }, - "f:lastUpdateTime": {}, - "f:maxWorkerReplicas": {}, - "f:minWorkerReplicas": {}, - "f:state": {}, - } - }, - "manager": "manager", - "operation": "Update", - "subresource": "status", - "time": "2023-02-22T16:26:16Z", - }, - ], - "name": "quicktest", - "namespace": "ns", - "ownerReferences": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "blockOwnerDeletion": True, - "controller": True, - "kind": "AppWrapper", - "name": "quicktest", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - } - ], - "resourceVersion": "9482407", - "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", - }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": {"cpu": "500m", "memory": "512Mi"}, - "requests": {"cpu": "500m", "memory": "512Mi"}, - }, - "upscalingMode": "Default", - }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", - }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": ["/bin/sh", "-c", "ray stop"] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - "protocol": "TCP", - }, - { - "containerPort": 8265, - "name": "dashboard", - "protocol": "TCP", - }, - { - "containerPort": 10001, - "name": "client", - "protocol": "TCP", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, + "f:availableWorkerReplicas": {}, + "f:desiredWorkerReplicas": {}, + "f:endpoints": { + ".": {}, + "f:client": {}, + "f:dashboard": {}, + "f:gcs": {}, }, + "f:lastUpdateTime": {}, + "f:maxWorkerReplicas": {}, + "f:minWorkerReplicas": {}, + "f:state": {}, } - ] + }, + "manager": "manager", + "operation": "Update", + "subresource": "status", + "time": "2023-02-22T16:26:16Z", + }, + ], + "name": "quicktest", + "namespace": "ns", + "ownerReferences": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "blockOwnerDeletion": True, + "controller": True, + "kind": "AppWrapper", + "name": "quicktest", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", } - }, + ], + "resourceVersion": "9482407", + "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, - "replicas": 1, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": {"cpu": "500m", "memory": "512Mi"}, + "requests": {"cpu": "500m", "memory": "512Mi"}, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -750,262 +757,271 @@ def get_ray_obj(cls=None): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + "protocol": "TCP", + }, + { + "containerPort": 8265, + "name": "dashboard", + "protocol": "TCP", + }, + { + "containerPort": 10001, + "name": "client", + "protocol": "TCP", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, - }, - } - ], - }, - "status": { - "availableWorkerReplicas": 2, - "desiredWorkerReplicas": 1, - "endpoints": {"client": "10001", "dashboard": "8265", "gcs": "6379"}, - "lastUpdateTime": "2023-02-22T16:26:16Z", - "maxWorkerReplicas": 1, - "minWorkerReplicas": 1, - "state": "ready", - }, - } - ) - return [api_obj] - - -def get_aw_obj(): - api_obj1 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, + ] + } }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": {"block": "true", "num-gpus": "0"}, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, }, - }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + } + ], + }, + "status": { + "availableWorkerReplicas": 2, + "desiredWorkerReplicas": 1, + "endpoints": { + "client": "10001", + "dashboard": "8265", + "gcs": "6379", }, - ], - "name": "quicktest1", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ + "lastUpdateTime": "2023-02-22T16:26:16Z", + "maxWorkerReplicas": 1, + "minWorkerReplicas": 1, + "state": "ready", + }, + } + ] + } + return api_obj + + +def get_aw_obj(group, version, namespace, plural): + api_obj1 = { + "items": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' + }, + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "f:status": { + ".": {}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest1", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest1", - "namespace": "ns", + }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, + }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest1", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest1", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest1", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1017,317 +1033,318 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Running", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Running", - "systempriority": 9, }, - } - ) - api_obj2 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, - }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, + }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, + }, + "f:status": { ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, + }, }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - ], - "name": "quicktest2", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest2", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest2", - "namespace": "ns", + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest2", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest2", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest2", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1339,114 +1356,207 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Pending", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Pending", - "systempriority": 9, - }, - } - ) - return [api_obj1, api_obj2] + ] + } + return api_obj1 + + +def test_get_cluster(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert "m4.xlarge" in cluster_config.machine_types and "g4dn.xlarge" in cluster_config.machine_types + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" def test_list_clusters(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1454,7 +1564,10 @@ def test_list_clusters(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_ray_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1480,9 +1593,11 @@ def test_list_clusters(mocker, capsys): def test_list_queue(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1490,7 +1605,10 @@ def test_list_queue(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_aw_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_aw_obj, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1510,6 +1628,7 @@ def test_list_queue(mocker, capsys): def test_cluster_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") fake_aw = AppWrapper( "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused" ) @@ -1526,6 +1645,8 @@ def test_cluster_status(mocker): dashboard="fake-uri", ) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) status, ready = cf.status() assert status == CodeFlareClusterStatus.UNKNOWN assert ready == False @@ -1587,6 +1708,9 @@ def test_cluster_status(mocker): def test_wait_ready(mocker, capsys): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) try: cf.wait_ready(timeout=5) @@ -1658,12 +1782,17 @@ def test_DDPJobDefinition_creation(): return ddp -def test_DDPJobDefinition_dry_run(): +def test_DDPJobDefinition_dry_run(mocker): """ Test that the dry run method returns the correct type: AppDryRunInfo, that the attributes of the returned object are of the correct type, and that the values from cluster and job definition are correctly passed. """ + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) ddp = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) ddp_job = ddp._dry_run(cluster) @@ -1696,7 +1825,7 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1728,11 +1857,15 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): assert ddp_job._scheduler == "kubernetes_mcad" -def test_DDPJobDefinition_dry_run_no_resource_args(): +def test_DDPJobDefinition_dry_run_no_resource_args(mocker): """ Test that the dry run correctly gets resources from the cluster object when the job definition does not specify resources. """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cluster = Cluster(test_config_creation()) ddp = DDPJobDefinition( script="test.py", @@ -1765,7 +1898,7 @@ def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1817,11 +1950,15 @@ def test_DDPJobDefinition_submit(mocker): Tests that the submit method returns the correct type: DDPJob And that the attributes of the returned object are of the correct type """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1844,6 +1981,10 @@ def test_DDPJobDefinition_submit(mocker): def test_DDPJob_creation(mocker): + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( @@ -1870,8 +2011,8 @@ def test_DDPJob_creation_no_cluster(mocker): ddp_def = test_DDPJobDefinition_creation() ddp_def.image = "fake-image" mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1962,14 +2103,24 @@ def test_AWManager_creation(): ) -def arg_check_aw_create_effect(*args): - assert args[0] == "create" - assert args[1] == ["-f", "test.yaml"] +def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("test.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_aw_delete_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "test"] +def arg_check_aw_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "test" + assert args == tuple() def test_AWManager_submit_remove(mocker, capsys): @@ -1981,10 +2132,17 @@ def test_AWManager_submit_remove(mocker, capsys): == "AppWrapper not submitted by this manager yet, nothing to remove\n" ) assert testaw.submitted == False - mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_aw_apply_effect, + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_aw_del_effect, + ) testaw.submit() assert testaw.submitted == True - mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect) testaw.remove() assert testaw.submitted == False