Skip to content

Commit 000b277

Browse files
committed
Added namespace retrieval and dashboard route access via Kubernetes
1 parent e2b277e commit 000b277

File tree

2 files changed

+50
-19
lines changed

2 files changed

+50
-19
lines changed

src/codeflare_sdk/cluster/cluster.py

+48-17
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def create_app_wrapper(self):
6969
"""
7070

7171
if self.config.namespace is None:
72-
self.config.namespace = oc.get_project_name()
72+
self.config.namespace = get_current_namespace()
7373
if type(self.config.namespace) is not str:
7474
raise TypeError(
7575
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
@@ -265,16 +265,21 @@ def cluster_dashboard_uri(self) -> str:
265265
Returns a string containing the cluster's dashboard URI.
266266
"""
267267
try:
268-
with oc.project(self.config.namespace):
269-
route = oc.invoke(
270-
"get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
271-
)
272-
route = route.out().split(" ")
273-
route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
274-
route = route[0].strip().strip("'")
275-
return f"http://{route}"
268+
config.load_kube_config()
269+
api_instance = client.CustomObjectsApi()
270+
routes = api_instance.list_namespaced_custom_object(
271+
group="route.openshift.io",
272+
version="v1",
273+
namespace=self.config.namespace,
274+
plural="routes",
275+
)
276276
except:
277-
return "Dashboard route not available yet, have you run cluster.up()?"
277+
pass
278+
279+
for route in routes["items"]:
280+
if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}":
281+
return f"http://{route['spec']['host']}"
282+
return "Dashboard route not available yet, have you run cluster.up()?"
278283

279284
def list_jobs(self) -> List:
280285
"""
@@ -338,6 +343,19 @@ def list_all_queued(namespace: str, print_to_console: bool = True):
338343
return app_wrappers
339344

340345

346+
def get_current_namespace() -> str:
    """Return the namespace of the active kubeconfig context.

    Reads the active context with ``config.list_kube_config_contexts()`` and
    returns its ``namespace`` entry. Falls back to ``"default"`` when the
    lookup raises ``KeyError``.

    Returns:
        The namespace name recorded in the active context, or ``"default"``.

    Raises:
        PermissionError: If listing the kubeconfig contexts raises
            ``config.ConfigException``.
    """
    try:
        _, active_context = config.list_kube_config_contexts()
    except config.ConfigException as err:
        # Chain the original error so the underlying kubeconfig problem
        # remains visible in the traceback as the explicit cause.
        raise PermissionError(
            "Retrieving current namespace not permitted, have you put in correct/up-to-date auth credentials?"
        ) from err
    try:
        return active_context["context"]["namespace"]
    except KeyError:
        # The "context" or "namespace" key is missing from the active context.
        return "default"
357+
358+
341359
# private methods
342360

343361

@@ -467,12 +485,25 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
467485
else:
468486
status = RayClusterStatus.UNKNOWN
469487

470-
with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60):
471-
route = (
472-
oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}")
473-
.object()
474-
.model.spec.host
475-
)
488+
config.load_kube_config()
489+
api_instance = client.CustomObjectsApi()
490+
routes = api_instance.list_namespaced_custom_object(
491+
group="route.openshift.io",
492+
version="v1",
493+
namespace=rc["metadata"]["namespace"],
494+
plural="routes",
495+
)
496+
ray_route = None
497+
for route in routes["items"]:
498+
if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}":
499+
ray_route = route["spec"]["host"]
500+
501+
# with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60):
502+
# route = (
503+
# oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}")
504+
# .object()
505+
# .model.spec.host
506+
# )
476507

477508
return RayCluster(
478509
name=rc["metadata"]["name"],
@@ -491,7 +522,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
491522
]["resources"]["limits"]["cpu"],
492523
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
493524
namespace=rc["metadata"]["namespace"],
494-
dashboard=route,
525+
dashboard=ray_route,
495526
)
496527

497528

src/codeflare_sdk/job/jobs.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
from typing import TYPE_CHECKING, Optional, Dict, List
1818
from pathlib import Path
1919

20-
import openshift as oc
2120
from torchx.components.dist import ddp
2221
from torchx.runner import get_runner
2322
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
2423

2524
if TYPE_CHECKING:
2625
from ..cluster.cluster import Cluster
26+
from ..cluster.cluster import get_current_namespace
2727

2828
all_jobs: List["Job"] = []
2929
torchx_runner = get_runner()
@@ -124,7 +124,7 @@ def _missing_spec(self, spec: str):
124124
def _dry_run_no_cluster(self):
125125
if self.scheduler_args is not None:
126126
if self.scheduler_args.get("namespace") is None:
127-
self.scheduler_args["namespace"] = oc.get_project_name()
127+
self.scheduler_args["namespace"] = get_current_namespace()
128128
return torchx_runner.dryrun(
129129
app=ddp(
130130
*self.script_args,

0 commit comments

Comments
 (0)