Skip to content

K8s Support for SDK #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jul 20, 2023
Merged
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ Pipfile.lock
poetry.lock
.venv*
build/
tls-cluster-namespace
quicktest.yaml
88 changes: 31 additions & 57 deletions demo-notebooks/interactive/local_interactive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Written to: hfgputest-1.yaml\n"
]
}
],
"outputs": [],
"source": [
"# Create our cluster and submit appwrapper\n",
"namespace = \"default\"\n",
Expand Down Expand Up @@ -89,7 +81,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "12eef53c",
"metadata": {},
Expand All @@ -99,38 +90,21 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
]
}
],
"outputs": [],
"source": [
"import openshift as oc\n",
"from codeflare_sdk.utils import generate_cert\n",
"\n",
"if local_interactive:\n",
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
" generate_cert.export_env(cluster_name, namespace)\n",
"\n",
"with oc.project(namespace):\n",
" routes=oc.selector(\"route\").objects()\n",
" rayclient_url=\"\"\n",
" for r in routes:\n",
" if \"rayclient\" in r.name():\n",
" rayclient_url=r.model.spec.host\n",
"print(rayclient_url)"
" generate_cert.export_env(cluster_name, namespace)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
"metadata": {
"scrolled": true,
Expand All @@ -141,15 +115,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
"2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
]
},
{
Expand Down Expand Up @@ -190,18 +164,18 @@
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.12.141:8265\" target=\"_blank\">http://10.254.12.141:8265</a></b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.20.41:8265\" target=\"_blank\">http://10.254.20.41:8265</a></b></td>\n",
"</tr>\n",
"\n",
" </table>\n",
" </div>\n",
"</div>\n"
],
"text/plain": [
"ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10e5d2bb0>)"
"ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x108ca2730>)"
]
},
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -210,12 +184,12 @@
"import ray\n",
"\n",
"ray.shutdown()\n",
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
"ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
"metadata": {},
"outputs": [],
Expand All @@ -239,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
"metadata": {
"scrolled": true,
Expand All @@ -250,8 +224,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
"2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
]
}
],
Expand All @@ -261,16 +235,16 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 9,
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
"2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
]
},
{
Expand All @@ -279,7 +253,7 @@
"1789.4644387076714"
]
},
"execution_count": 15,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -290,18 +264,18 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "9e79b547-a457-4232-b77d-19147067b972",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"}\n",
"\n",
"2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
"2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
]
}
],
Expand All @@ -312,7 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
"metadata": {},
"outputs": [],
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ python = "^3.7"
openshift-client = "1.0.18"
rich = "^12.5"
ray = {version = "2.5.0", extras = ["default"]}
kubernetes = "25.3.0"
kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"
executing = "1.2.0"

[tool.poetry.group.docs]
optional = true
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ ray[default]==2.5.0
kubernetes>=25.3.0,<27
codeflare-torchx==0.6.0.dev0
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
cryptography==40.0.2
executing==1.2.0
Loading