Skip to content

Commit df48547

Browse files
Allow interaction with ray client service via Route from outside of OCP cluster (#100)
* Allow interaction with rayclient via route from outside of OCP cluster * update rayclient route generation * update test-case yamls for client route * add unit tests for generate_cert.py * Update local interactive notebook * Add more cert test cases * Replace openshift client with k8s python client * Update initcontainer to work with KubeRay v0.5.0 * Allow rayCluster to spin up in other ns * Add auth cell to the notebook * remove non-functional method --------- Co-authored-by: MichaelClifford <[email protected]>
1 parent 8506985 commit df48547

File tree

10 files changed

+847
-0
lines changed

10 files changed

+847
-0
lines changed

Diff for: demo-notebooks/interactive/local_interactive.ipynb

+358
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,358 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "9a44568b-61ef-41c7-8ad1-9a3b128f03a7",
7+
"metadata": {
8+
"tags": []
9+
},
10+
"outputs": [],
11+
"source": [
12+
"# Import pieces from codeflare-sdk\n",
13+
"from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
14+
"from codeflare_sdk.cluster.auth import TokenAuthentication"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"id": "2cc66278",
21+
"metadata": {},
22+
"outputs": [],
23+
"source": [
24+
"# Create authentication object and log in to desired user account (if not already authenticated)\n",
25+
"auth = TokenAuthentication(\n",
26+
" token = \"XXXX\",\n",
27+
" server = \"XXXX\",\n",
28+
" skip_tls = False\n",
29+
")\n",
30+
"auth.login()"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 2,
36+
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
37+
"metadata": {
38+
"tags": []
39+
},
40+
"outputs": [
41+
{
42+
"name": "stdout",
43+
"output_type": "stream",
44+
"text": [
45+
"Written to: hfgputest-1.yaml\n"
46+
]
47+
}
48+
],
49+
"source": [
50+
"# Create our cluster and submit appwrapper\n",
51+
"namespace = \"default\"\n",
52+
"cluster_name = \"hfgputest-1\"\n",
53+
"local_interactive = True\n",
54+
"\n",
55+
"cluster = Cluster(ClusterConfiguration(local_interactive=local_interactive, namespace=namespace, name=cluster_name, min_worker=1, max_worker=1, min_cpus=1, max_cpus=1, min_memory=4, max_memory=4, gpu=0, instascale=False, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": 3,
61+
"id": "69968140-15e6-482f-9529-82b0cd19524b",
62+
"metadata": {
63+
"tags": []
64+
},
65+
"outputs": [],
66+
"source": [
67+
"cluster.up()"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 4,
73+
"id": "e20f9982-f671-460b-8c22-3d62e101fed9",
74+
"metadata": {
75+
"tags": []
76+
},
77+
"outputs": [
78+
{
79+
"name": "stdout",
80+
"output_type": "stream",
81+
"text": [
82+
"Waiting for requested resources to be set up...\n",
83+
"Requested cluster up and running!\n"
84+
]
85+
}
86+
],
87+
"source": [
88+
"cluster.wait_ready()"
89+
]
90+
},
91+
{
92+
"attachments": {},
93+
"cell_type": "markdown",
94+
"id": "12eef53c",
95+
"metadata": {},
96+
"source": [
97+
"### Connect via the rayclient route"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 11,
103+
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
104+
"metadata": {},
105+
"outputs": [
106+
{
107+
"name": "stdout",
108+
"output_type": "stream",
109+
"text": [
110+
"rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
111+
]
112+
}
113+
],
114+
"source": [
115+
"import openshift as oc\n",
116+
"from codeflare_sdk.utils import generate_cert\n",
117+
"\n",
118+
"if local_interactive:\n",
119+
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
120+
" generate_cert.export_env(cluster_name, namespace)\n",
121+
"\n",
122+
"with oc.project(namespace):\n",
123+
" routes=oc.selector(\"route\").objects()\n",
124+
" rayclient_url=\"\"\n",
125+
" for r in routes:\n",
126+
" if \"rayclient\" in r.name():\n",
127+
" rayclient_url=r.model.spec.host\n",
128+
"print(rayclient_url)"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 12,
134+
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
135+
"metadata": {
136+
"scrolled": true,
137+
"tags": []
138+
},
139+
"outputs": [
140+
{
141+
"name": "stderr",
142+
"output_type": "stream",
143+
"text": [
144+
"2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
145+
"2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
146+
"2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
147+
"2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
148+
"2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
149+
"2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
150+
"2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
151+
"2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
152+
"2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
153+
]
154+
},
155+
{
156+
"data": {
157+
"text/html": [
158+
"<div>\n",
159+
" <div style=\"margin-left: 50px;display: flex;flex-direction: row;align-items: center\">\n",
160+
" <h3 style=\"color: var(--jp-ui-font-color0)\">Ray</h3>\n",
161+
" <svg version=\"1.1\" id=\"ray\" width=\"3em\" viewBox=\"0 0 144.5 144.6\" style=\"margin-left: 3em;margin-right: 3em\">\n",
162+
" <g id=\"layer-1\">\n",
163+
" <path fill=\"#00a2e9\" class=\"st0\" d=\"M97.3,77.2c-3.8-1.1-6.2,0.9-8.3,5.1c-3.5,6.8-9.9,9.9-17.4,9.6S58,88.1,54.8,81.2c-1.4-3-3-4-6.3-4.1\n",
164+
" c-5.6-0.1-9.9,0.1-13.1,6.4c-3.8,7.6-13.6,10.2-21.8,7.6C5.2,88.4-0.4,80.5,0,71.7c0.1-8.4,5.7-15.8,13.8-18.2\n",
165+
" c8.4-2.6,17.5,0.7,22.3,8c1.3,1.9,1.3,5.2,3.6,5.6c3.9,0.6,8,0.2,12,0.2c1.8,0,1.9-1.6,2.4-2.8c3.5-7.8,9.7-11.8,18-11.9\n",
166+
" c8.2-0.1,14.4,3.9,17.8,11.4c1.3,2.8,2.9,3.6,5.7,3.3c1-0.1,2,0.1,3,0c2.8-0.5,6.4,1.7,8.1-2.7s-2.3-5.5-4.1-7.5\n",
167+
" c-5.1-5.7-10.9-10.8-16.1-16.3C84,38,81.9,37.1,78,38.3C66.7,42,56.2,35.7,53,24.1C50.3,14,57.3,2.8,67.7,0.5\n",
168+
" C78.4-2,89,4.7,91.5,15.3c0.1,0.3,0.1,0.5,0.2,0.8c0.7,3.4,0.7,6.9-0.8,9.8c-1.7,3.2-0.8,5,1.5,7.2c6.7,6.5,13.3,13,19.8,19.7\n",
169+
" c1.8,1.8,3,2.1,5.5,1.2c9.1-3.4,17.9-0.6,23.4,7c4.8,6.9,4.6,16.1-0.4,22.9c-5.4,7.2-14.2,9.9-23.1,6.5c-2.3-0.9-3.5-0.6-5.1,1.1\n",
170+
" c-6.7,6.9-13.6,13.7-20.5,20.4c-1.8,1.8-2.5,3.2-1.4,5.9c3.5,8.7,0.3,18.6-7.7,23.6c-7.9,5-18.2,3.8-24.8-2.9\n",
171+
" c-6.4-6.4-7.4-16.2-2.5-24.3c4.9-7.8,14.5-11,23.1-7.8c3,1.1,4.7,0.5,6.9-1.7C91.7,98.4,98,92.3,104.2,86c1.6-1.6,4.1-2.7,2.6-6.2\n",
172+
" c-1.4-3.3-3.8-2.5-6.2-2.6C99.8,77.2,98.9,77.2,97.3,77.2z M72.1,29.7c5.5,0.1,9.9-4.3,10-9.8c0-0.1,0-0.2,0-0.3\n",
173+
" C81.8,14,77,9.8,71.5,10.2c-5,0.3-9,4.2-9.3,9.2c-0.2,5.5,4,10.1,9.5,10.3C71.8,29.7,72,29.7,72.1,29.7z M72.3,62.3\n",
174+
" c-5.4-0.1-9.9,4.2-10.1,9.7c0,0.2,0,0.3,0,0.5c0.2,5.4,4.5,9.7,9.9,10c5.1,0.1,9.9-4.7,10.1-9.8c0.2-5.5-4-10-9.5-10.3\n",
175+
" C72.6,62.3,72.4,62.3,72.3,62.3z M115,72.5c0.1,5.4,4.5,9.7,9.8,9.9c5.6-0.2,10-4.8,10-10.4c-0.2-5.4-4.6-9.7-10-9.7\n",
176+
" c-5.3-0.1-9.8,4.2-9.9,9.5C115,72.1,115,72.3,115,72.5z M19.5,62.3c-5.4,0.1-9.8,4.4-10,9.8c-0.1,5.1,5.2,10.4,10.2,10.3\n",
177+
" c5.6-0.2,10-4.9,9.8-10.5c-0.1-5.4-4.5-9.7-9.9-9.6C19.6,62.3,19.5,62.3,19.5,62.3z M71.8,134.6c5.9,0.2,10.3-3.9,10.4-9.6\n",
178+
" c0.5-5.5-3.6-10.4-9.1-10.8c-5.5-0.5-10.4,3.6-10.8,9.1c0,0.5,0,0.9,0,1.4c-0.2,5.3,4,9.8,9.3,10\n",
179+
" C71.6,134.6,71.7,134.6,71.8,134.6z\"/>\n",
180+
" </g>\n",
181+
" </svg>\n",
182+
" <table>\n",
183+
" <tr>\n",
184+
" <td style=\"text-align: left\"><b>Python version:</b></td>\n",
185+
" <td style=\"text-align: left\"><b>3.8.13</b></td>\n",
186+
" </tr>\n",
187+
" <tr>\n",
188+
" <td style=\"text-align: left\"><b>Ray version:</b></td>\n",
189+
" <td style=\"text-align: left\"><b> 2.1.0</b></td>\n",
190+
" </tr>\n",
191+
" <tr>\n",
192+
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
193+
" <td style=\"text-align: left\"><b><a href=\"http://10.254.12.141:8265\" target=\"_blank\">http://10.254.12.141:8265</a></b></td>\n",
194+
"</tr>\n",
195+
"\n",
196+
" </table>\n",
197+
" </div>\n",
198+
"</div>\n"
199+
],
200+
"text/plain": [
201+
"ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10e5d2bb0>)"
202+
]
203+
},
204+
"execution_count": 12,
205+
"metadata": {},
206+
"output_type": "execute_result"
207+
}
208+
],
209+
"source": [
210+
"import ray\n",
211+
"\n",
212+
"ray.shutdown()\n",
213+
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
214+
]
215+
},
216+
{
217+
"cell_type": "code",
218+
"execution_count": 13,
219+
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
220+
"metadata": {},
221+
"outputs": [],
222+
"source": [
223+
"import math\n",
224+
"import ray\n",
225+
"\n",
226+
"@ray.remote\n",
227+
"def heavy_calculation_part(num_iterations):\n",
228+
" result = 0.0\n",
229+
" for i in range(num_iterations):\n",
230+
" for j in range(num_iterations):\n",
231+
" for k in range(num_iterations):\n",
232+
" result += math.sin(i) * math.cos(j) * math.tan(k)\n",
233+
" return result\n",
234+
"@ray.remote\n",
235+
"def heavy_calculation(num_iterations):\n",
236+
" results = ray.get([heavy_calculation_part.remote(num_iterations//30) for _ in range(30)])\n",
237+
" return sum(results)\n"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 14,
243+
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
244+
"metadata": {
245+
"scrolled": true,
246+
"tags": []
247+
},
248+
"outputs": [
249+
{
250+
"name": "stderr",
251+
"output_type": "stream",
252+
"text": [
253+
"2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
254+
"2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
255+
]
256+
}
257+
],
258+
"source": [
259+
"ref = heavy_calculation.remote(3000)"
260+
]
261+
},
262+
{
263+
"cell_type": "code",
264+
"execution_count": 15,
265+
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
266+
"metadata": {},
267+
"outputs": [
268+
{
269+
"name": "stderr",
270+
"output_type": "stream",
271+
"text": [
272+
"2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
273+
"2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
274+
]
275+
},
276+
{
277+
"data": {
278+
"text/plain": [
279+
"1789.4644387076714"
280+
]
281+
},
282+
"execution_count": 15,
283+
"metadata": {},
284+
"output_type": "execute_result"
285+
}
286+
],
287+
"source": [
288+
"ray.get(ref)"
289+
]
290+
},
291+
{
292+
"cell_type": "code",
293+
"execution_count": 16,
294+
"id": "9e79b547-a457-4232-b77d-19147067b972",
295+
"metadata": {},
296+
"outputs": [
297+
{
298+
"name": "stderr",
299+
"output_type": "stream",
300+
"text": [
301+
"2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
302+
"}\n",
303+
"\n",
304+
"2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
305+
]
306+
}
307+
],
308+
"source": [
309+
"ray.cancel(ref)\n",
310+
"ray.shutdown()"
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": 17,
316+
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
317+
"metadata": {},
318+
"outputs": [],
319+
"source": [
320+
"cluster.down()"
321+
]
322+
},
323+
{
324+
"cell_type": "code",
325+
"execution_count": null,
326+
"id": "6879e471-a69f-4c74-9cec-a195cdead47c",
327+
"metadata": {},
328+
"outputs": [],
329+
"source": []
330+
}
331+
],
332+
"metadata": {
333+
"kernelspec": {
334+
"display_name": "Python 3 (ipykernel)",
335+
"language": "python",
336+
"name": "python3"
337+
},
338+
"language_info": {
339+
"codemirror_mode": {
340+
"name": "ipython",
341+
"version": 3
342+
},
343+
"file_extension": ".py",
344+
"mimetype": "text/x-python",
345+
"name": "python",
346+
"nbconvert_exporter": "python",
347+
"pygments_lexer": "ipython3",
348+
"version": "3.8.13"
349+
},
350+
"vscode": {
351+
"interpreter": {
352+
"hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
353+
}
354+
}
355+
},
356+
"nbformat": 4,
357+
"nbformat_minor": 5
358+
}

Diff for: pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ rich = "^12.5"
2626
ray = {version = "2.1.0", extras = ["default"]}
2727
kubernetes = ">= 25.3.0, < 27"
2828
codeflare-torchx = "0.6.0.dev0"
29+
cryptography = "40.0.2"
2930

3031
[tool.poetry.group.docs]
3132
optional = true

Diff for: src/codeflare_sdk/cluster/cluster.py

+2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def create_app_wrapper(self):
8484
instascale = self.config.instascale
8585
instance_types = self.config.machine_types
8686
env = self.config.envs
87+
local_interactive = self.config.local_interactive
8788
return generate_appwrapper(
8889
name=name,
8990
namespace=namespace,
@@ -98,6 +99,7 @@ def create_app_wrapper(self):
9899
instascale=instascale,
99100
instance_types=instance_types,
100101
env=env,
102+
local_interactive=local_interactive,
101103
)
102104

103105
# creates a new cluster with the provided or default spec

Diff for: src/codeflare_sdk/cluster/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,4 @@ class ClusterConfiguration:
4848
instascale: bool = False
4949
envs: dict = field(default_factory=dict)
5050
image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
51+
local_interactive: bool = False

0 commit comments

Comments
 (0)