Skip to content

Commit d6175c4

Browse files
committed
Allow interaction with rayclient via route from outside of OCP cluster
1 parent d1faddb commit d6175c4

File tree

7 files changed

+709
-0
lines changed

7 files changed

+709
-0
lines changed
+342
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "9a44568b-61ef-41c7-8ad1-9a3b128f03a7",
7+
"metadata": {
8+
"tags": []
9+
},
10+
"outputs": [],
11+
"source": [
12+
"# Import pieces from codeflare-sdk\n",
13+
"from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
14+
"from codeflare_sdk.cluster.auth import TokenAuthentication"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 8,
20+
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
21+
"metadata": {
22+
"tags": []
23+
},
24+
"outputs": [
25+
{
26+
"name": "stdout",
27+
"output_type": "stream",
28+
"text": [
29+
"Written to: hfgputest-1.yaml\n"
30+
]
31+
}
32+
],
33+
"source": [
34+
"# Create our cluster and submit appwrapper\n",
35+
"namespace = \"default\"\n",
36+
"cluster_name = \"hfgputest-1\"\n",
37+
"local_interactive = True\n",
38+
"\n",
39+
"cluster = Cluster(ClusterConfiguration(local_interactive=local_interactive, namespace=namespace, name=cluster_name, min_worker=1, max_worker=1, min_cpus=1, max_cpus=1, min_memory=4, max_memory=4, gpu=0, instascale=False, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": 5,
45+
"id": "69968140-15e6-482f-9529-82b0cd19524b",
46+
"metadata": {
47+
"tags": []
48+
},
49+
"outputs": [],
50+
"source": [
51+
"cluster.up()"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 6,
57+
"id": "e20f9982-f671-460b-8c22-3d62e101fed9",
58+
"metadata": {
59+
"tags": []
60+
},
61+
"outputs": [
62+
{
63+
"name": "stdout",
64+
"output_type": "stream",
65+
"text": [
66+
"Waiting for requested resources to be set up...\n",
67+
"Requested cluster up and running!\n"
68+
]
69+
}
70+
],
71+
"source": [
72+
"cluster.wait_ready()"
73+
]
74+
},
75+
{
76+
"cell_type": "markdown",
77+
"id": "12eef53c",
78+
"metadata": {},
79+
"source": [
80+
"### Connect via the rayclient route"
81+
]
82+
},
83+
{
84+
"cell_type": "code",
85+
"execution_count": 9,
86+
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"name": "stdout",
91+
"output_type": "stream",
92+
"text": [
93+
"rayclient-hfgputest-1-default.apps.ted412.cp.fyre.ibm.com\n"
94+
]
95+
}
96+
],
97+
"source": [
98+
"import openshift as oc\n",
99+
"from codeflare_sdk.utils import generate_cert\n",
100+
"\n",
101+
"if local_interactive:\n",
102+
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
103+
" generate_cert.export_env(cluster_name, namespace)\n",
104+
"\n",
105+
"routes=oc.selector(\"route\").objects()\n",
106+
"rayclient_url=\"\"\n",
107+
"for r in routes:\n",
108+
" if \"rayclient\" in r.name():\n",
109+
" rayclient_url=r.model.spec.host\n",
110+
"print(rayclient_url)"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": 10,
116+
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
117+
"metadata": {
118+
"scrolled": true,
119+
"tags": []
120+
},
121+
"outputs": [
122+
{
123+
"name": "stderr",
124+
"output_type": "stream",
125+
"text": [
126+
"2023-05-12 17:21:56,769\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
127+
"2023-05-12 17:21:56,782\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
128+
"2023-05-12 17:21:56,989\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
129+
"2023-05-12 17:22:01,786\tDEBUG worker.py:226 -- Couldn't connect channel in 5 seconds, retrying\n",
130+
"2023-05-12 17:22:01,787\tDEBUG worker.py:237 -- Waiting for Ray to become ready on the server, retry in 5s...\n",
131+
"2023-05-12 17:22:04,463\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
132+
"2023-05-12 17:22:04,465\tDEBUG worker.py:807 -- Pinging server.\n",
133+
"2023-05-12 17:22:10,488\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
134+
"2023-05-12 17:22:10,490\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
135+
"2023-05-12 17:22:11,249\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
136+
"2023-05-12 17:22:12,142\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
137+
]
138+
},
139+
{
140+
"data": {
141+
"text/html": [
142+
"<div>\n",
143+
" <div style=\"margin-left: 50px;display: flex;flex-direction: row;align-items: center\">\n",
144+
" <h3 style=\"color: var(--jp-ui-font-color0)\">Ray</h3>\n",
145+
" <svg version=\"1.1\" id=\"ray\" width=\"3em\" viewBox=\"0 0 144.5 144.6\" style=\"margin-left: 3em;margin-right: 3em\">\n",
146+
" <g id=\"layer-1\">\n",
147+
" <path fill=\"#00a2e9\" class=\"st0\" d=\"M97.3,77.2c-3.8-1.1-6.2,0.9-8.3,5.1c-3.5,6.8-9.9,9.9-17.4,9.6S58,88.1,54.8,81.2c-1.4-3-3-4-6.3-4.1\n",
148+
" c-5.6-0.1-9.9,0.1-13.1,6.4c-3.8,7.6-13.6,10.2-21.8,7.6C5.2,88.4-0.4,80.5,0,71.7c0.1-8.4,5.7-15.8,13.8-18.2\n",
149+
" c8.4-2.6,17.5,0.7,22.3,8c1.3,1.9,1.3,5.2,3.6,5.6c3.9,0.6,8,0.2,12,0.2c1.8,0,1.9-1.6,2.4-2.8c3.5-7.8,9.7-11.8,18-11.9\n",
150+
" c8.2-0.1,14.4,3.9,17.8,11.4c1.3,2.8,2.9,3.6,5.7,3.3c1-0.1,2,0.1,3,0c2.8-0.5,6.4,1.7,8.1-2.7s-2.3-5.5-4.1-7.5\n",
151+
" c-5.1-5.7-10.9-10.8-16.1-16.3C84,38,81.9,37.1,78,38.3C66.7,42,56.2,35.7,53,24.1C50.3,14,57.3,2.8,67.7,0.5\n",
152+
" C78.4-2,89,4.7,91.5,15.3c0.1,0.3,0.1,0.5,0.2,0.8c0.7,3.4,0.7,6.9-0.8,9.8c-1.7,3.2-0.8,5,1.5,7.2c6.7,6.5,13.3,13,19.8,19.7\n",
153+
" c1.8,1.8,3,2.1,5.5,1.2c9.1-3.4,17.9-0.6,23.4,7c4.8,6.9,4.6,16.1-0.4,22.9c-5.4,7.2-14.2,9.9-23.1,6.5c-2.3-0.9-3.5-0.6-5.1,1.1\n",
154+
" c-6.7,6.9-13.6,13.7-20.5,20.4c-1.8,1.8-2.5,3.2-1.4,5.9c3.5,8.7,0.3,18.6-7.7,23.6c-7.9,5-18.2,3.8-24.8-2.9\n",
155+
" c-6.4-6.4-7.4-16.2-2.5-24.3c4.9-7.8,14.5-11,23.1-7.8c3,1.1,4.7,0.5,6.9-1.7C91.7,98.4,98,92.3,104.2,86c1.6-1.6,4.1-2.7,2.6-6.2\n",
156+
" c-1.4-3.3-3.8-2.5-6.2-2.6C99.8,77.2,98.9,77.2,97.3,77.2z M72.1,29.7c5.5,0.1,9.9-4.3,10-9.8c0-0.1,0-0.2,0-0.3\n",
157+
" C81.8,14,77,9.8,71.5,10.2c-5,0.3-9,4.2-9.3,9.2c-0.2,5.5,4,10.1,9.5,10.3C71.8,29.7,72,29.7,72.1,29.7z M72.3,62.3\n",
158+
" c-5.4-0.1-9.9,4.2-10.1,9.7c0,0.2,0,0.3,0,0.5c0.2,5.4,4.5,9.7,9.9,10c5.1,0.1,9.9-4.7,10.1-9.8c0.2-5.5-4-10-9.5-10.3\n",
159+
" C72.6,62.3,72.4,62.3,72.3,62.3z M115,72.5c0.1,5.4,4.5,9.7,9.8,9.9c5.6-0.2,10-4.8,10-10.4c-0.2-5.4-4.6-9.7-10-9.7\n",
160+
" c-5.3-0.1-9.8,4.2-9.9,9.5C115,72.1,115,72.3,115,72.5z M19.5,62.3c-5.4,0.1-9.8,4.4-10,9.8c-0.1,5.1,5.2,10.4,10.2,10.3\n",
161+
" c5.6-0.2,10-4.9,9.8-10.5c-0.1-5.4-4.5-9.7-9.9-9.6C19.6,62.3,19.5,62.3,19.5,62.3z M71.8,134.6c5.9,0.2,10.3-3.9,10.4-9.6\n",
162+
" c0.5-5.5-3.6-10.4-9.1-10.8c-5.5-0.5-10.4,3.6-10.8,9.1c0,0.5,0,0.9,0,1.4c-0.2,5.3,4,9.8,9.3,10\n",
163+
" C71.6,134.6,71.7,134.6,71.8,134.6z\"/>\n",
164+
" </g>\n",
165+
" </svg>\n",
166+
" <table>\n",
167+
" <tr>\n",
168+
" <td style=\"text-align: left\"><b>Python version:</b></td>\n",
169+
" <td style=\"text-align: left\"><b>3.8.13</b></td>\n",
170+
" </tr>\n",
171+
" <tr>\n",
172+
" <td style=\"text-align: left\"><b>Ray version:</b></td>\n",
173+
" <td style=\"text-align: left\"><b> 2.1.0</b></td>\n",
174+
" </tr>\n",
175+
" <tr>\n",
176+
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
177+
" <td style=\"text-align: left\"><b><a href=\"http://10.254.20.24:8265\" target=\"_blank\">http://10.254.20.24:8265</a></b></td>\n",
178+
"</tr>\n",
179+
"\n",
180+
" </table>\n",
181+
" </div>\n",
182+
"</div>\n"
183+
],
184+
"text/plain": [
185+
"ClientContext(dashboard_url='10.254.20.24:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10c263eb0>)"
186+
]
187+
},
188+
"execution_count": 10,
189+
"metadata": {},
190+
"output_type": "execute_result"
191+
}
192+
],
193+
"source": [
194+
"import ray\n",
195+
"\n",
196+
"ray.shutdown()\n",
197+
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": 11,
203+
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
204+
"metadata": {},
205+
"outputs": [],
206+
"source": [
207+
"import math\n",
208+
"import ray\n",
209+
"\n",
210+
"@ray.remote\n",
211+
"def heavy_calculation_part(num_iterations):\n",
212+
" result = 0.0\n",
213+
" for i in range(num_iterations):\n",
214+
" for j in range(num_iterations):\n",
215+
" for k in range(num_iterations):\n",
216+
" result += math.sin(i) * math.cos(j) * math.tan(k)\n",
217+
" return result\n",
218+
"@ray.remote\n",
219+
"def heavy_calculation(num_iterations):\n",
220+
" results = ray.get([heavy_calculation_part.remote(num_iterations//30) for _ in range(30)])\n",
221+
" return sum(results)\n"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": 12,
227+
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
228+
"metadata": {
229+
"scrolled": true,
230+
"tags": []
231+
},
232+
"outputs": [
233+
{
234+
"name": "stderr",
235+
"output_type": "stream",
236+
"text": [
237+
"2023-05-12 17:22:13,923\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
238+
"2023-05-12 17:22:13,924\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
239+
]
240+
}
241+
],
242+
"source": [
243+
"ref = heavy_calculation.remote(3000)"
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": 13,
249+
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
250+
"metadata": {},
251+
"outputs": [
252+
{
253+
"name": "stderr",
254+
"output_type": "stream",
255+
"text": [
256+
"2023-05-12 17:22:14,712\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
257+
"2023-05-12 17:22:17,522\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
258+
]
259+
},
260+
{
261+
"data": {
262+
"text/plain": [
263+
"1789.4644387076714"
264+
]
265+
},
266+
"execution_count": 13,
267+
"metadata": {},
268+
"output_type": "execute_result"
269+
}
270+
],
271+
"source": [
272+
"ray.get(ref)"
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 14,
278+
"id": "9e79b547-a457-4232-b77d-19147067b972",
279+
"metadata": {},
280+
"outputs": [
281+
{
282+
"name": "stderr",
283+
"output_type": "stream",
284+
"text": [
285+
"2023-05-12 17:22:22,542\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
286+
"}\n",
287+
"\n",
288+
"2023-05-12 17:22:23,465\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
289+
]
290+
}
291+
],
292+
"source": [
293+
"ray.cancel(ref)\n",
294+
"ray.shutdown()"
295+
]
296+
},
297+
{
298+
"cell_type": "code",
299+
"execution_count": 15,
300+
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
301+
"metadata": {},
302+
"outputs": [],
303+
"source": [
304+
"cluster.down()"
305+
]
306+
},
307+
{
308+
"cell_type": "code",
309+
"execution_count": null,
310+
"id": "6879e471-a69f-4c74-9cec-a195cdead47c",
311+
"metadata": {},
312+
"outputs": [],
313+
"source": []
314+
}
315+
],
316+
"metadata": {
317+
"kernelspec": {
318+
"display_name": "Python 3 (ipykernel)",
319+
"language": "python",
320+
"name": "python3"
321+
},
322+
"language_info": {
323+
"codemirror_mode": {
324+
"name": "ipython",
325+
"version": 3
326+
},
327+
"file_extension": ".py",
328+
"mimetype": "text/x-python",
329+
"name": "python",
330+
"nbconvert_exporter": "python",
331+
"pygments_lexer": "ipython3",
332+
"version": "3.8.13"
333+
},
334+
"vscode": {
335+
"interpreter": {
336+
"hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
337+
}
338+
}
339+
},
340+
"nbformat": 4,
341+
"nbformat_minor": 5
342+
}

Diff for: pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ rich = "^12.5"
2626
ray = {version = "2.1.0", extras = ["default"]}
2727
kubernetes = "26.1.0"
2828
codeflare-torchx = "0.6.0.dev0"
29+
cryptography = "40.0.2"

Diff for: src/codeflare_sdk/cluster/cluster.py

+2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def create_app_wrapper(self):
8484
instascale = self.config.instascale
8585
instance_types = self.config.machine_types
8686
env = self.config.envs
87+
local_interactive = self.config.local_interactive
8788
return generate_appwrapper(
8889
name=name,
8990
namespace=namespace,
@@ -98,6 +99,7 @@ def create_app_wrapper(self):
9899
instascale=instascale,
99100
instance_types=instance_types,
100101
env=env,
102+
local_interactive=local_interactive,
101103
)
102104

103105
# creates a new cluster with the provided or default spec

Diff for: src/codeflare_sdk/cluster/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,4 @@ class ClusterConfiguration:
4848
instascale: bool = False
4949
envs: dict = field(default_factory=dict)
5050
image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
51+
local_interactive: bool = False

0 commit comments

Comments
 (0)