Skip to content

Commit a02fdf9

Browse files
committed
Finished up/down and added unit tests
1 parent 6e5d9ea commit a02fdf9

File tree

3 files changed

+293
-18
lines changed

3 files changed

+293
-18
lines changed

Diff for: src/codeflare_sdk/cluster/cluster.py

+36-8
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,20 @@ def up(self):
187187
plural="rayclusters",
188188
body=resource,
189189
)
190-
else:
191-
print(resource["kind"])
190+
elif resource["kind"] == "Route":
191+
api_instance.create_namespaced_custom_object(
192+
group="route.openshift.io",
193+
version="v1",
194+
namespace=namespace,
195+
plural="routes",
196+
body=resource,
197+
)
198+
elif resource["kind"] == "Secret":
199+
secret_instance = client.CoreV1Api(api_config_handler())
200+
secret_instance.create_namespaced_secret(
201+
namespace=namespace,
202+
body=resource,
203+
)
192204
except Exception as e: # pragma: no cover
193205
return _kube_api_error_handling(e)
194206

@@ -221,8 +233,22 @@ def down(self):
221233
plural="rayclusters",
222234
name=self.app_wrapper_name,
223235
)
224-
else:
225-
print(resource["kind"])
236+
elif resource["kind"] == "Route":
237+
name = resource["metadata"]["name"]
238+
api_instance.delete_namespaced_custom_object(
239+
group="route.openshift.io",
240+
version="v1",
241+
namespace=namespace,
242+
plural="routes",
243+
name=name,
244+
)
245+
elif resource["kind"] == "Secret":
246+
name = resource["metadata"]["name"]
247+
secret_instance = client.CoreV1Api(api_config_handler())
248+
secret_instance.delete_namespaced_secret(
249+
namespace=namespace,
250+
name=name,
251+
)
226252
except Exception as e: # pragma: no cover
227253
return _kube_api_error_handling(e)
228254

@@ -313,7 +339,7 @@ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True
313339
time = 0
314340
while not ready:
315341
status, ready = self.status(print_to_console=False)
316-
if status == CodeFlareClusterStatus.UNKNOWN:
342+
if self.config.mcad and status == CodeFlareClusterStatus.UNKNOWN:
317343
print(
318344
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
319345
)
@@ -410,7 +436,7 @@ def torchx_config(
410436
to_return["requirements"] = requirements
411437
return to_return
412438

413-
def from_k8_cluster_object(rc):
439+
def from_k8_cluster_object(rc, mcad=True):
414440
machine_types = (
415441
rc["metadata"]["labels"]["orderedinstance"].split("_")
416442
if "orderedinstance" in rc["metadata"]["labels"]
@@ -449,6 +475,7 @@ def from_k8_cluster_object(rc):
449475
0
450476
]["image"],
451477
local_interactive=local_interactive,
478+
mcad=mcad,
452479
)
453480
return Cluster(cluster_config)
454481

@@ -509,7 +536,7 @@ def get_current_namespace(): # pragma: no cover
509536
return None
510537

511538

512-
def get_cluster(cluster_name: str, namespace: str = "default"):
539+
def get_cluster(cluster_name: str, namespace: str = "default", mcad=True):
513540
try:
514541
config_check()
515542
api_instance = client.CustomObjectsApi(api_config_handler())
@@ -524,7 +551,7 @@ def get_cluster(cluster_name: str, namespace: str = "default"):
524551

525552
for rc in rcs["items"]:
526553
if rc["metadata"]["name"] == cluster_name:
527-
return Cluster.from_k8_cluster_object(rc)
554+
return Cluster.from_k8_cluster_object(rc, mcad=mcad)
528555
raise FileNotFoundError(
529556
f"Cluster {cluster_name} is not found in {namespace} namespace"
530557
)
@@ -635,6 +662,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
635662

636663
config_check()
637664
api_instance = client.CustomObjectsApi(api_config_handler())
665+
# UPDATE THIS
638666
routes = api_instance.list_namespaced_custom_object(
639667
group="route.openshift.io",
640668
version="v1",

Diff for: tests/test-case-no-mcad.yamls

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
---
2+
apiVersion: ray.io/v1alpha1
3+
kind: RayCluster
4+
metadata:
5+
labels:
6+
appwrapper.mcad.ibm.com: unit-test-cluster-ray
7+
controller-tools.k8s.io: '1.0'
8+
name: unit-test-cluster-ray
9+
namespace: ns
10+
spec:
11+
autoscalerOptions:
12+
idleTimeoutSeconds: 60
13+
imagePullPolicy: Always
14+
resources:
15+
limits:
16+
cpu: 500m
17+
memory: 512Mi
18+
requests:
19+
cpu: 500m
20+
memory: 512Mi
21+
upscalingMode: Default
22+
enableInTreeAutoscaling: false
23+
headGroupSpec:
24+
rayStartParams:
25+
block: 'true'
26+
dashboard-host: 0.0.0.0
27+
num-gpus: '0'
28+
serviceType: ClusterIP
29+
template:
30+
spec:
31+
affinity:
32+
nodeAffinity:
33+
requiredDuringSchedulingIgnoredDuringExecution:
34+
nodeSelectorTerms:
35+
- matchExpressions:
36+
- key: unit-test-cluster-ray
37+
operator: In
38+
values:
39+
- unit-test-cluster-ray
40+
containers:
41+
- env:
42+
- name: MY_POD_IP
43+
valueFrom:
44+
fieldRef:
45+
fieldPath: status.podIP
46+
- name: RAY_USE_TLS
47+
value: '0'
48+
- name: RAY_TLS_SERVER_CERT
49+
value: /home/ray/workspace/tls/server.crt
50+
- name: RAY_TLS_SERVER_KEY
51+
value: /home/ray/workspace/tls/server.key
52+
- name: RAY_TLS_CA_CERT
53+
value: /home/ray/workspace/tls/ca.crt
54+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
55+
imagePullPolicy: Always
56+
lifecycle:
57+
preStop:
58+
exec:
59+
command:
60+
- /bin/sh
61+
- -c
62+
- ray stop
63+
name: ray-head
64+
ports:
65+
- containerPort: 6379
66+
name: gcs
67+
- containerPort: 8265
68+
name: dashboard
69+
- containerPort: 10001
70+
name: client
71+
resources:
72+
limits:
73+
cpu: 2
74+
memory: 8G
75+
nvidia.com/gpu: 0
76+
requests:
77+
cpu: 2
78+
memory: 8G
79+
nvidia.com/gpu: 0
80+
imagePullSecrets:
81+
- name: unit-test-pull-secret
82+
rayVersion: 2.5.0
83+
workerGroupSpecs:
84+
- groupName: small-group-unit-test-cluster-ray
85+
maxReplicas: 2
86+
minReplicas: 2
87+
rayStartParams:
88+
block: 'true'
89+
num-gpus: '7'
90+
replicas: 2
91+
template:
92+
metadata:
93+
annotations:
94+
key: value
95+
labels:
96+
key: value
97+
spec:
98+
affinity:
99+
nodeAffinity:
100+
requiredDuringSchedulingIgnoredDuringExecution:
101+
nodeSelectorTerms:
102+
- matchExpressions:
103+
- key: unit-test-cluster-ray
104+
operator: In
105+
values:
106+
- unit-test-cluster-ray
107+
containers:
108+
- env:
109+
- name: MY_POD_IP
110+
valueFrom:
111+
fieldRef:
112+
fieldPath: status.podIP
113+
- name: RAY_USE_TLS
114+
value: '0'
115+
- name: RAY_TLS_SERVER_CERT
116+
value: /home/ray/workspace/tls/server.crt
117+
- name: RAY_TLS_SERVER_KEY
118+
value: /home/ray/workspace/tls/server.key
119+
- name: RAY_TLS_CA_CERT
120+
value: /home/ray/workspace/tls/ca.crt
121+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
122+
lifecycle:
123+
preStop:
124+
exec:
125+
command:
126+
- /bin/sh
127+
- -c
128+
- ray stop
129+
name: machine-learning
130+
resources:
131+
limits:
132+
cpu: 4
133+
memory: 6G
134+
nvidia.com/gpu: 7
135+
requests:
136+
cpu: 3
137+
memory: 5G
138+
nvidia.com/gpu: 7
139+
imagePullSecrets:
140+
- name: unit-test-pull-secret
141+
initContainers:
142+
- command:
143+
- sh
144+
- -c
145+
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
146+
do echo waiting for myservice; sleep 2; done
147+
image: busybox:1.28
148+
name: init-myservice
149+
---
150+
apiVersion: route.openshift.io/v1
151+
kind: Route
152+
metadata:
153+
labels:
154+
odh-ray-cluster-service: unit-test-cluster-ray-head-svc
155+
name: ray-dashboard-unit-test-cluster-ray
156+
namespace: ns
157+
spec:
158+
port:
159+
targetPort: dashboard
160+
to:
161+
kind: Service
162+
name: unit-test-cluster-ray-head-svc

0 commit comments

Comments
 (0)