Skip to content

Remove oc client and add helper functions #187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 31 additions & 57 deletions demo-notebooks/interactive/local_interactive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Written to: hfgputest-1.yaml\n"
]
}
],
"outputs": [],
"source": [
"# Create our cluster and submit appwrapper\n",
"namespace = \"default\"\n",
Expand Down Expand Up @@ -89,7 +81,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "12eef53c",
"metadata": {},
Expand All @@ -99,38 +90,21 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
]
}
],
"outputs": [],
"source": [
"import openshift as oc\n",
"from codeflare_sdk.utils import generate_cert\n",
"\n",
"if local_interactive:\n",
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
" generate_cert.export_env(cluster_name, namespace)\n",
"\n",
"with oc.project(namespace):\n",
" routes=oc.selector(\"route\").objects()\n",
" rayclient_url=\"\"\n",
" for r in routes:\n",
" if \"rayclient\" in r.name():\n",
" rayclient_url=r.model.spec.host\n",
"print(rayclient_url)"
" generate_cert.export_env(cluster_name, namespace)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
"metadata": {
"scrolled": true,
Expand All @@ -141,15 +115,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
"2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
]
},
{
Expand Down Expand Up @@ -190,18 +164,18 @@
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.12.141:8265\" target=\"_blank\">http://10.254.12.141:8265</a></b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.20.41:8265\" target=\"_blank\">http://10.254.20.41:8265</a></b></td>\n",
"</tr>\n",
"\n",
" </table>\n",
" </div>\n",
"</div>\n"
],
"text/plain": [
"ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10e5d2bb0>)"
"ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x108ca2730>)"
]
},
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -210,12 +184,12 @@
"import ray\n",
"\n",
"ray.shutdown()\n",
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
"ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
"metadata": {},
"outputs": [],
Expand All @@ -239,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
"metadata": {
"scrolled": true,
Expand All @@ -250,8 +224,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
"2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
]
}
],
Expand All @@ -261,16 +235,16 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 9,
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
"2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
]
},
{
Expand All @@ -279,7 +253,7 @@
"1789.4644387076714"
]
},
"execution_count": 15,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -290,18 +264,18 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "9e79b547-a457-4232-b77d-19147067b972",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"}\n",
"\n",
"2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
"2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
]
}
],
Expand All @@ -312,7 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
"metadata": {},
"outputs": [],
Expand Down
3 changes: 1 addition & 2 deletions src/codeflare_sdk/cluster/awload.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
from os.path import isfile
import errno
import os
import openshift as oc
import yaml

from kubernetes import client, config
from .cluster import _kube_api_error_handling
from ..utils.kube_api_helpers import _kube_api_error_handling


class AWManager:
Expand Down
42 changes: 18 additions & 24 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
cluster setup queue, a list of all existing clusters, and the user's working namespace.
"""

from os import stat
from time import sleep
from typing import List, Optional, Tuple, Dict

from ray.job_submission import JobSubmissionClient

from ..utils import pretty_print
from ..utils.generate_yaml import generate_appwrapper
from ..utils.kube_api_helpers import _kube_api_error_handling
from .config import ClusterConfiguration
from .model import (
AppWrapper,
Expand All @@ -34,7 +34,6 @@
RayCluster,
RayClusterStatus,
)

from kubernetes import client, config

import yaml
Expand Down Expand Up @@ -344,6 +343,13 @@ def from_k8_cluster_object(rc):
)
return Cluster(cluster_config)

def local_client_url(self):
    """
    Return the Ray client URL for connecting to this cluster from a local machine.

    When ``self.config.local_interactive`` is enabled, the URL has the form
    ``ray://rayclient-<name>-<namespace>.<ingress_domain>``, where the ingress
    domain comes from ``_get_ingress_domain()``.

    Returns:
        str: The ``ray://`` URL, or the literal string ``"None"`` (not the
        ``None`` object) when local interactive mode is off or the ingress
        domain could not be determined.
    """
    if not self.config.local_interactive:
        return "None"

    ingress_domain = _get_ingress_domain()
    if not ingress_domain:
        # The domain lookup can come back empty when its error handler
        # swallows a "not found" style failure; avoid emitting a bogus
        # "ray://rayclient-<name>-<namespace>.None" address in that case.
        return "None"

    return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}"


def list_all_clusters(namespace: str, print_to_console: bool = True):
"""
Expand Down Expand Up @@ -402,28 +408,16 @@ def get_cluster(cluster_name: str, namespace: str = "default"):


# private methods


def _kube_api_error_handling(e: Exception):  # pragma: no cover
    """
    Translate an exception raised by a Kubernetes API call into a friendlier outcome.

    Args:
        e: The exception caught by the caller.

    Returns:
        None, after printing a notice, when the error means nothing was found.

    Raises:
        PermissionError: For a kube config error, or an API error whose reason
            is "Unauthorized" or "Forbidden".
        FileExistsError: For an API error whose reason is "Conflict".
        Exception: Any other exception is re-raised unchanged.
    """
    perm_msg = (
        "Action not permitted, have you put in correct/up-to-date auth credentials?"
    )
    nf_msg = "No instances found, nothing to be done."
    exists_msg = "Resource with this name already exists."

    # isinstance (rather than an exact type comparison) so that subclasses of
    # these exception types are handled the same way as the base types.
    if isinstance(e, config.ConfigException):
        raise PermissionError(perm_msg)
    # NOTE(review): `executing` is not imported in the visible part of this
    # file - confirm it is in scope wherever this function lives.
    if isinstance(e, executing.executing.NotOneValueFound):
        print(nf_msg)
        return
    if isinstance(e, client.ApiException):
        if e.reason == "Not Found":
            print(nf_msg)
            return
        elif e.reason in ("Unauthorized", "Forbidden"):
            raise PermissionError(perm_msg)
        elif e.reason == "Conflict":
            raise FileExistsError(exists_msg)
    # Unrecognised errors propagate to the caller untouched.
    raise e
def _get_ingress_domain():
    """
    Look up the ingress domain configured for the cluster.

    Loads the local kube config, fetches the cluster-scoped custom object
    named ``cluster`` from the ``ingresses`` resource of
    ``config.openshift.io/v1``, and returns its ``spec.domain`` value.

    If the config load or the API call raises, the exception is handed to
    ``_kube_api_error_handling`` and whatever that handler returns (or
    raises) is the outcome of this function.
    """
    try:
        config.load_kube_config()
        cluster_ingress = client.CustomObjectsApi().get_cluster_custom_object(
            "config.openshift.io", "v1", "ingresses", "cluster"
        )
    except Exception as err:  # pragma: no cover
        return _kube_api_error_handling(err)
    else:
        return cluster_ingress["spec"]["domain"]


def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
Expand Down
Loading