diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 26f7ed62a..f2148b9e1 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -301,6 +301,49 @@ def torchx_config( to_return["requirements"] = requirements return to_return + def from_k8_cluster_object(rc): + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + local_interactive = ( + "volumeMounts" + in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_worker=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], + max_worker=rc["spec"]["workerGroupSpecs"][0]["maxReplicas"], + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + min_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["memory"][:-1] + ), + max_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["memory"][:-1] + ), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -337,6 +380,27 @@ def get_current_namespace(): # pragma: no cover return "default" +def get_cluster(cluster_name: str, namespace: str = "default"): + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + return Cluster.from_k8_cluster_object(rc) + raise FileNotFoundError( + f"Cluster {cluster_name} is not found in {namespace} namespace" + ) + + # private methods diff --git a/tests/unit_test.py b/tests/unit_test.py index 7225b6725..8174f0b54 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -29,6 +29,7 @@ list_all_clusters, list_all_queued, _copy_to_ray, + get_cluster, _app_wrapper_status, _ray_cluster_status, ) @@ -615,6 +616,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest", + "orderedinstance": "m4.xlarge_g4dn.xlarge", }, "managedFields": [ { @@ -792,10 +794,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "workerGroupSpecs": [ { "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + "maxReplicas": 2, + "minReplicas": 2, "rayStartParams": {"block": "true", "num-gpus": "0"}, - "replicas": 1, + "replicas": 2, "template": { "metadata": { "annotations": {"key": "value"}, @@ -1530,6 +1532,30 @@ def get_aw_obj(group, version, namespace, plural): return api_obj1 +def test_get_cluster(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert ( + "m4.xlarge" in cluster_config.machine_types + and "g4dn.xlarge" in cluster_config.machine_types + ) + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert ( + cluster_config.image + == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + ) + assert cluster_config.min_worker == 2 and cluster_config.max_worker == 2 + + def test_list_clusters(mocker, capsys): mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch(