Skip to content

Commit 89ac249

Browse files
carsonmh authored and Maxusmusti committed
Get cluster (#189)
* Add: get_cluster function to get cluster with specified name and namespace * Test: make unit tests for get_cluster function
1 parent 9647455 commit 89ac249

File tree

2 files changed

+93
-3
lines changed

2 files changed

+93
-3
lines changed

Diff for: src/codeflare_sdk/cluster/cluster.py

+64
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,49 @@ def torchx_config(
303303
to_return["requirements"] = requirements
304304
return to_return
305305

306+
def from_k8_cluster_object(rc):
    """
    Build a ``Cluster`` from a RayCluster custom-object dictionary.

    Only the first entry of ``spec.workerGroupSpecs`` and the first
    container of that group's pod template are inspected.

    Args:
        rc: Dictionary form of a RayCluster custom object, e.g. one element
            of the ``"items"`` list returned by
            ``CustomObjectsApi.list_namespaced_custom_object``.

    Returns:
        A ``Cluster`` wrapping a ``ClusterConfiguration`` populated from
        ``rc``.

    Raises:
        KeyError: If a required field (name, namespace, replicas, cpu or
            memory requests/limits, image) is absent from ``rc``.
        ValueError: If a memory quantity is not an integer followed by
            exactly one trailing character.
    """
    # NOTE(review): get_cluster invokes this as
    # ``Cluster.from_k8_cluster_object(rc)``; if this definition lives in the
    # class body, consider marking it ``@staticmethod`` -- confirm.
    metadata = rc["metadata"]

    # Labels are optional on Kubernetes objects; treat a missing or null
    # label map as empty instead of raising KeyError.
    labels = metadata.get("labels") or {}

    # The "orderedinstance" label packs the machine types into a single
    # "_"-separated value; without it there are no machine types.
    machine_types = (
        labels["orderedinstance"].split("_") if "orderedinstance" in labels else []
    )

    worker_group = rc["spec"]["workerGroupSpecs"][0]
    container = worker_group["template"]["spec"]["containers"][0]
    requests = container["resources"]["requests"]
    limits = container["resources"]["limits"]

    cluster_config = ClusterConfiguration(
        name=metadata["name"],
        namespace=metadata["namespace"],
        machine_types=machine_types,
        min_worker=worker_group["minReplicas"],
        max_worker=worker_group["maxReplicas"],
        min_cpus=requests["cpu"],
        max_cpus=limits["cpu"],
        # NOTE(review): drops exactly one trailing character before int(),
        # so "2G" -> 2 but a two-letter suffix such as "2Gi" raises
        # ValueError -- confirm which quantity formats must be supported.
        min_memory=int(requests["memory"][:-1]),
        max_memory=int(limits["memory"][:-1]),
        # A container without a GPU limit requests no GPUs.
        gpu=limits.get("nvidia.com/gpu", 0),
        # Instascale is considered enabled whenever machine types are listed.
        instascale=bool(machine_types),
        image=container["image"],
        # Presence of volume mounts on the worker container is used as the
        # marker for a local-interactive cluster.
        local_interactive="volumeMounts" in container,
    )
    return Cluster(cluster_config)
348+
306349

307350
def list_all_clusters(namespace: str, print_to_console: bool = True):
308351
"""
@@ -339,6 +382,27 @@ def get_current_namespace(): # pragma: no cover
339382
return "default"
340383

341384

385+
def get_cluster(cluster_name: str, namespace: str = "default"):
    """
    Fetch the RayCluster called ``cluster_name`` from ``namespace``.

    Loads the kube config, lists the ``rayclusters`` custom objects
    (group ``ray.io``, version ``v1alpha1``) in the namespace and converts
    the first one whose metadata name matches into a ``Cluster``.

    Args:
        cluster_name: Name of the RayCluster to look for.
        namespace: Namespace to search; defaults to ``"default"``.

    Returns:
        The result of ``Cluster.from_k8_cluster_object`` for the matching
        object. If loading the config or listing the objects raises, the
        return value of ``_kube_api_error_handling`` is returned instead.

    Raises:
        FileNotFoundError: If no RayCluster in the namespace has that name.
    """
    try:
        config.load_kube_config()
        custom_objects_api = client.CustomObjectsApi()
        ray_clusters = custom_objects_api.list_namespaced_custom_object(
            group="ray.io",
            version="v1alpha1",
            namespace=namespace,
            plural="rayclusters",
        )
    except Exception as err:
        return _kube_api_error_handling(err)

    # First object whose name matches, or None when nothing matches.
    match = next(
        (
            candidate
            for candidate in ray_clusters["items"]
            if candidate["metadata"]["name"] == cluster_name
        ),
        None,
    )
    if match is None:
        raise FileNotFoundError(
            f"Cluster {cluster_name} is not found in {namespace} namespace"
        )
    return Cluster.from_k8_cluster_object(match)
404+
405+
342406
# private methods
343407

344408

Diff for: tests/unit_test.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
list_all_clusters,
3030
list_all_queued,
3131
_copy_to_ray,
32+
get_cluster,
3233
_app_wrapper_status,
3334
_ray_cluster_status,
3435
)
@@ -614,6 +615,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
614615
"appwrapper.mcad.ibm.com": "quicktest",
615616
"controller-tools.k8s.io": "1.0",
616617
"resourceName": "quicktest",
618+
"orderedinstance": "m4.xlarge_g4dn.xlarge",
617619
},
618620
"managedFields": [
619621
{
@@ -791,10 +793,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
791793
"workerGroupSpecs": [
792794
{
793795
"groupName": "small-group-quicktest",
794-
"maxReplicas": 1,
795-
"minReplicas": 1,
796+
"maxReplicas": 2,
797+
"minReplicas": 2,
796798
"rayStartParams": {"block": "true", "num-gpus": "0"},
797-
"replicas": 1,
799+
"replicas": 2,
798800
"template": {
799801
"metadata": {
800802
"annotations": {"key": "value"},
@@ -1529,6 +1531,30 @@ def get_aw_obj(group, version, namespace, plural):
15291531
return api_obj1
15301532

15311533

1534+
def test_get_cluster(mocker):
    """
    get_cluster should rebuild the configuration of the "quicktest" cluster
    from the RayCluster object served by the ``get_ray_obj`` fixture.
    """
    # Keep the test offline: stub config loading and the custom-object listing.
    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        side_effect=get_ray_obj,
    )

    cfg = get_cluster("quicktest").config

    # Identity
    assert cfg.name == "quicktest"
    assert cfg.namespace == "ns"

    # Machine types come from the "orderedinstance" label.
    for expected_type in ("m4.xlarge", "g4dn.xlarge"):
        assert expected_type in cfg.machine_types
    assert cfg.instascale

    # Per-worker resources
    assert cfg.min_cpus == 1
    assert cfg.max_cpus == 1
    assert cfg.min_memory == 2
    assert cfg.max_memory == 2
    assert cfg.gpu == 0

    expected_image = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
    assert cfg.image == expected_image

    # Worker replica bounds
    assert cfg.min_worker == 2
    assert cfg.max_worker == 2
1556+
1557+
15321558
def test_list_clusters(mocker, capsys):
15331559
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
15341560
mocker.patch(

0 commit comments

Comments
 (0)