From 687e31d5f32a2b5f92c6e5e5ff9f6fd4f9720587 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:00:07 -0700 Subject: [PATCH 1/6] Add: get_cluster function to get cluster with specified name and namespace --- src/codeflare_sdk/cluster/cluster.py | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 26f7ed62a..a06696978 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -335,6 +335,54 @@ def get_current_namespace(): # pragma: no cover return active_context["context"]["namespace"] except KeyError: return "default" + + +def get_cluster(cluster_name: str, namespace: str = "default"): + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + machine_types = rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] + local_interactive = "volumeMounts" in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], + min_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"][:-1]), + max_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"][:-1]), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + raise FileNotFoundError(f'Cluster {cluster_name} is not found in {namespace} namespace') + # private methods From a23081492b172ba8a580166d42286eeeae467fd5 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:06:55 -0700 Subject: [PATCH 2/6] Test: make unit tests for get_cluster function --- src/codeflare_sdk/cluster/cluster.py | 1 - tests/unit_test.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index a06696978..ab833e119 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -384,7 +384,6 @@ def get_cluster(cluster_name: str, namespace: str = "default"): raise FileNotFoundError(f'Cluster {cluster_name} is not found in {namespace} namespace') - # private methods diff --git a/tests/unit_test.py b/tests/unit_test.py index 7225b6725..83d7729ab 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -29,6 +29,7 @@ list_all_clusters, list_all_queued, _copy_to_ray, + get_cluster, _app_wrapper_status, _ray_cluster_status, ) @@ -615,6 +616,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): 
"appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest", + "orderedinstance": "m4.xlarge_g4dn.xlarge", }, "managedFields": [ { @@ -1530,6 +1532,23 @@ def get_aw_obj(group, version, namespace, plural): return api_obj1 +def test_get_cluster(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert "m4.xlarge" in cluster_config.machine_types and "g4dn.xlarge" in cluster_config.machine_types + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + + def test_list_clusters(mocker, capsys): mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( From 7bc6fb36cafbed310407556afaf7b7522e8b8d9c Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:28:19 -0700 Subject: [PATCH 3/6] refactor --- src/codeflare_sdk/cluster/cluster.py | 104 +++++++++++++++------------ 1 file changed, 59 insertions(+), 45 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index ab833e119..f0964cdde 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -335,53 +335,67 @@ def get_current_namespace(): # pragma: no cover return active_context["context"]["namespace"] except KeyError: return "default" - + def get_cluster(cluster_name: str, namespace: str = "default"): - try: - config.load_kube_config() - api_instance = client.CustomObjectsApi() - rcs = api_instance.list_namespaced_custom_object( - group="ray.io", - version="v1alpha1", - namespace=namespace, - plural="rayclusters", - ) - except Exception as e: - return _kube_api_error_handling(e) - - - for rc in rcs["items"]: - if rc["metadata"]["name"] == cluster_name: - machine_types = rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] - local_interactive = "volumeMounts" in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] - cluster_config = ClusterConfiguration( - name=rc["metadata"]["name"], - namespace=rc["metadata"]["namespace"], - machine_types=machine_types, - min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["requests"]["cpu"], - max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["cpu"], - min_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["requests"]["memory"][:-1]), - max_memory=int(rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["memory"][:-1]), - gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["nvidia.com/gpu"], - instascale=True if machine_types else False, - image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["image"], - local_interactive=local_interactive, - ) - return Cluster(cluster_config) - raise FileNotFoundError(f'Cluster 
{cluster_name} is not found in {namespace} namespace') + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + local_interactive = ( + "volumeMounts" + in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + min_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"][:-1] + ), + max_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"][:-1] + ), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + raise FileNotFoundError( + f"Cluster {cluster_name} is not found in {namespace} namespace" + ) # private methods From 40dba538a655406a75aa0671cccc70d95df2c43f Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 30 Jun 2023 14:30:11 -0700 Subject: [PATCH 4/6] refactor tests --- tests/unit_test.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 83d7729ab..a03527279 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -1533,20 +1533,26 @@ def get_aw_obj(group, version, namespace, plural): def test_get_cluster(mocker): - mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") - mocker.patch( - "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", - side_effect=get_ray_obj, - ) - cluster = get_cluster("quicktest") - cluster_config = cluster.config - assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" - assert "m4.xlarge" in cluster_config.machine_types and "g4dn.xlarge" in cluster_config.machine_types - assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 - assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 - assert cluster_config.gpu == 0 - assert cluster_config.instascale - assert cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert ( + "m4.xlarge" in cluster_config.machine_types + and "g4dn.xlarge" in 
cluster_config.machine_types + ) + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert ( + cluster_config.image + == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + ) def test_list_clusters(mocker, capsys): From 583f65f574a4bb7417d2478a4dbd9e415f66dd10 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 3 Jul 2023 15:05:03 -0700 Subject: [PATCH 5/6] Refactor: add from_k8_cluster_object(ray_cluster) to simplify get_cluster function --- src/codeflare_sdk/cluster/cluster.py | 83 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f0964cdde..f53a79b3d 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -301,6 +301,47 @@ def torchx_config( to_return["requirements"] = requirements return to_return + def from_k8_cluster_object(rc): + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + local_interactive = ( + "volumeMounts" + in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + min_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["memory"][:-1] + ), + max_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["memory"][:-1] + ), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -352,47 +393,7 @@ def get_cluster(cluster_name: str, namespace: str = "default"): for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: - machine_types = ( - rc["metadata"]["labels"]["orderedinstance"].split("_") - if "orderedinstance" in rc["metadata"]["labels"] - else [] - ) - local_interactive = ( - "volumeMounts" - in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ] - ) - cluster_config = ClusterConfiguration( - name=rc["metadata"]["name"], - namespace=rc["metadata"]["namespace"], - machine_types=machine_types, - min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["resources"]["requests"]["cpu"], - max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["resources"]["limits"]["cpu"], - min_memory=int( - rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["requests"]["memory"][:-1] - ), - max_memory=int( - rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["memory"][:-1] - ), - 
gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["nvidia.com/gpu"], - instascale=True if machine_types else False, - image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["image"], - local_interactive=local_interactive, - ) - return Cluster(cluster_config) + return Cluster.from_k8_cluster_object(rc) raise FileNotFoundError( f"Cluster {cluster_name} is not found in {namespace} namespace" ) From a929226b51439112d3d89e61a31f0b7b4294b0c6 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 5 Jul 2023 15:07:54 -0700 Subject: [PATCH 6/6] Fix: min_worker and max_worker were using default value --- src/codeflare_sdk/cluster/cluster.py | 2 ++ tests/unit_test.py | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f53a79b3d..f2148b9e1 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -315,6 +315,8 @@ def from_k8_cluster_object(rc): name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, + min_worker=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], + max_worker=rc["spec"]["workerGroupSpecs"][0]["maxReplicas"], min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["cpu"], diff --git a/tests/unit_test.py b/tests/unit_test.py index a03527279..8174f0b54 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -794,10 +794,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "workerGroupSpecs": [ { "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + "maxReplicas": 2, + "minReplicas": 2, "rayStartParams": {"block": "true", "num-gpus": "0"}, - "replicas": 1, + "replicas": 2, "template": { "metadata": { "annotations": {"key": "value"}, @@ -1553,6 +1553,7 @@ def test_get_cluster(mocker): cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" ) + assert cluster_config.min_worker == 2 and cluster_config.max_worker == 2 def test_list_clusters(mocker, capsys):
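
For reviewers, a minimal usage sketch of the new get_cluster API introduced by this series, assuming the function is exported from codeflare_sdk.cluster.cluster (the module these patches modify) and using the "quicktest" cluster from the unit-test fixtures; the names and values below are illustrative, not part of the patches:

    from codeflare_sdk.cluster.cluster import get_cluster

    # Look up an existing RayCluster by name; namespace defaults to "default".
    try:
        cluster = get_cluster("quicktest", namespace="ns")
    except FileNotFoundError:
        # Raised when no RayCluster with that name exists in the given namespace.
        raise

    # The returned Cluster wraps a ClusterConfiguration rebuilt from the live object
    # via Cluster.from_k8_cluster_object (added in PATCH 5/6).
    config = cluster.config
    print(config.min_worker, config.max_worker)   # replica bounds from workerGroupSpecs (PATCH 6/6)
    print(config.min_cpus, config.max_cpus)       # CPU request/limit of the worker container
    print(config.min_memory, config.max_memory)   # memory request/limit, parsed to integers
    print(config.image, config.instascale)        # container image; instascale is True when the
                                                  # "orderedinstance" label supplies machine types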