Commit b3251fc

checkpoint: RayJob works when not using our modh runtime images
1 parent 5d1714d · commit b3251fc

File tree

11 files changed: +1194, -730 lines

poetry.lock

Lines changed: 701 additions & 661 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 8 additions & 0 deletions
@@ -29,6 +29,14 @@ cryptography = "43.0.3"
 executing = "1.2.0"
 pydantic = "< 2"
 ipywidgets = "8.1.2"
+odh-kuberay-client = {version = "0.0.0.dev40", source = "testpypi"}
+
+[[tool.poetry.source]]
+name = "pypi"
+
+[[tool.poetry.source]]
+name = "testpypi"
+url = "https://test.pypi.org/simple/"
 
 [tool.poetry.group.docs]
 optional = true

src/codeflare_sdk/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
     AWManager,
     AppWrapperStatus,
     RayJobClient,
+    RayJob,
 )
 
 from .common.widgets import view_clusters
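
With this re-export, RayJob becomes importable from the package root alongside the existing RayJobClient; the submodule path codeflare_sdk.ray.rayjobs added below works as well. A one-line import sketch (the class's constructor arguments are not shown in this diff):

    from codeflare_sdk import RayJob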

src/codeflare_sdk/ray/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,10 @@
     RayJobClient,
 )
 
+from .rayjobs import (
+    RayJob,
+)
+
 from .cluster import (
     Cluster,
     ClusterConfiguration,

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

Lines changed: 2 additions & 0 deletions
@@ -136,6 +136,7 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
                 "enableIngress": False,
                 "rayStartParams": {
                     "dashboard-host": "0.0.0.0",
+                    "dashboard-port": "8265",
                     "block": "true",
                     "num-gpus": str(head_gpu_count),
                     "resources": head_resources,
@@ -245,6 +246,7 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"):
     """
     labels = {
         "controller-tools.k8s.io": "1.0",
+        "ray.io/cluster": cluster.config.name,  # Enforced label always present
     }
     if cluster.config.labels != {}:
         labels.update(cluster.config.labels)
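
The first hunk pins the head group's dashboard to port 8265 (Ray's default dashboard port), presumably so the dashboard address is predictable. The second hunk makes get_labels always stamp the generated RayCluster with a ray.io/cluster label equal to the cluster name, with user labels merged on top. A small sketch of the resulting merge, using a hypothetical cluster named "demo" with one hypothetical user-supplied label:

    # Hypothetical values; mirrors the get_labels logic above
    labels = {
        "controller-tools.k8s.io": "1.0",
        "ray.io/cluster": "demo",  # enforced label, always present
    }
    labels.update({"team": "ml"})  # user labels merged afterwards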

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 103 additions & 41 deletions
@@ -663,26 +663,31 @@ def run_job_with_managed_cluster(
             raise ValueError("job_config.entrypoint must be specified.")
 
         # Warn if Pydantic V1/V2 specific fields in RayJobSpec are set, as they are not used for RayJob CR.
-        if job_config.entrypoint_num_cpus is not None or \
-            job_config.entrypoint_num_gpus is not None or \
-            job_config.entrypoint_memory is not None:
+        if (
+            job_config.entrypoint_num_cpus is not None
+            or job_config.entrypoint_num_gpus is not None
+            or job_config.entrypoint_memory is not None
+        ):
             warnings.warn(
                 "RayJobSpec fields 'entrypoint_num_cpus', 'entrypoint_num_gpus', 'entrypoint_memory' "
                 "are not directly used when creating a RayJob CR. They are primarily for the Ray Job Submission Client. "
                 "Resource requests for the job driver pod should be configured in the RayCluster head node spec via ClusterConfiguration.",
-                UserWarning
+                UserWarning,
             )
 
         # Generate rayClusterSpec from ClusterConfiguration
         temp_config_for_spec = copy.deepcopy(cluster_config)
         temp_config_for_spec.appwrapper = False
-
+
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
             dummy_cluster_for_spec = Cluster(temp_config_for_spec)
 
         ray_cluster_cr_dict = dummy_cluster_for_spec.resource_yaml
-        if not isinstance(ray_cluster_cr_dict, dict) or "spec" not in ray_cluster_cr_dict:
+        if (
+            not isinstance(ray_cluster_cr_dict, dict)
+            or "spec" not in ray_cluster_cr_dict
+        ):
             raise ValueError(
                 "Failed to generate RayCluster CR dictionary from ClusterConfiguration. "
                 f"Got: {type(ray_cluster_cr_dict)}"
@@ -691,13 +696,15 @@ def run_job_with_managed_cluster(
 
         # Prepare RayJob CR
         actual_job_cr_name = job_cr_name or f"rayjob-{uuid.uuid4().hex[:10]}"
-
+
         runtime_env_yaml_str = ""
         if job_config.runtime_env:
             try:
                 runtime_env_yaml_str = yaml.dump(job_config.runtime_env)
             except yaml.YAMLError as e:
-                raise ValueError(f"Invalid job_config.runtime_env, failed to dump to YAML: {e}")
+                raise ValueError(
+                    f"Invalid job_config.runtime_env, failed to dump to YAML: {e}"
+                )
 
         ray_job_cr_spec = {
             "entrypoint": job_config.entrypoint,
@@ -735,7 +742,9 @@ def run_job_with_managed_cluster(
         ray_cluster_name_actual = None
 
         try:
-            print(f"Submitting RayJob '{actual_job_cr_name}' to namespace '{namespace}'...")
+            print(
+                f"Submitting RayJob '{actual_job_cr_name}' to namespace '{namespace}'..."
+            )
             k8s_co_api.create_namespaced_custom_object(
                 group="ray.io",
                 version="v1",
@@ -750,27 +759,37 @@ def run_job_with_managed_cluster(
             start_time = time.time()
             while True:
                 try:
-                    ray_job_status_cr = k8s_co_api.get_namespaced_custom_object_status(
-                        group="ray.io",
-                        version="v1",
-                        namespace=namespace,
-                        plural="rayjobs",
-                        name=actual_job_cr_name,
+                    ray_job_status_cr = (
+                        k8s_co_api.get_namespaced_custom_object_status(
+                            group="ray.io",
+                            version="v1",
+                            namespace=namespace,
+                            plural="rayjobs",
+                            name=actual_job_cr_name,
+                        )
                     )
                 except ApiException as e:
                     if e.status == 404:
-                        print(f"RayJob '{actual_job_cr_name}' status not found yet, retrying...")
+                        print(
+                            f"RayJob '{actual_job_cr_name}' status not found yet, retrying..."
+                        )
                         time.sleep(job_polling_interval_seconds)
                         continue
                     raise
 
                 status_field = ray_job_status_cr.get("status", {})
-                job_deployment_status = status_field.get("jobDeploymentStatus", "UNKNOWN")
+                job_deployment_status = status_field.get(
+                    "jobDeploymentStatus", "UNKNOWN"
+                )
                 current_job_status = status_field.get("jobStatus", "PENDING")
-
+
                 dashboard_url = status_field.get("dashboardURL", dashboard_url)
-                ray_cluster_name_actual = status_field.get("rayClusterName", ray_cluster_name_actual)
-                returned_job_submission_id = status_field.get("jobId", job_config.submission_id)
+                ray_cluster_name_actual = status_field.get(
+                    "rayClusterName", ray_cluster_name_actual
+                )
+                returned_job_submission_id = status_field.get(
+                    "jobId", job_config.submission_id
+                )
 
                 final_job_status = current_job_status
                 print(
@@ -779,41 +798,72 @@ def run_job_with_managed_cluster(
 
                 if current_job_status in ["SUCCEEDED", "FAILED", "STOPPED"]:
                     break
-
-                if job_timeout_seconds and (time.time() - start_time) > job_timeout_seconds:
+
+                if (
+                    job_timeout_seconds
+                    and (time.time() - start_time) > job_timeout_seconds
+                ):
                     try:
-                        ray_job_status_cr_final = k8s_co_api.get_namespaced_custom_object_status(
-                            group="ray.io", version="v1", namespace=namespace, plural="rayjobs", name=actual_job_cr_name
+                        ray_job_status_cr_final = (
+                            k8s_co_api.get_namespaced_custom_object_status(
+                                group="ray.io",
+                                version="v1",
+                                namespace=namespace,
+                                plural="rayjobs",
+                                name=actual_job_cr_name,
+                            )
+                        )
+                        status_field_final = ray_job_status_cr_final.get(
+                            "status", {}
+                        )
+                        final_job_status = status_field_final.get(
+                            "jobStatus", final_job_status
+                        )
+                        returned_job_submission_id = status_field_final.get(
+                            "jobId", returned_job_submission_id
+                        )
+                        dashboard_url = status_field_final.get(
+                            "dashboardURL", dashboard_url
+                        )
+                        ray_cluster_name_actual = status_field_final.get(
+                            "rayClusterName", ray_cluster_name_actual
                         )
-                        status_field_final = ray_job_status_cr_final.get("status", {})
-                        final_job_status = status_field_final.get("jobStatus", final_job_status)
-                        returned_job_submission_id = status_field_final.get("jobId", returned_job_submission_id)
-                        dashboard_url = status_field_final.get("dashboardURL", dashboard_url)
-                        ray_cluster_name_actual = status_field_final.get("rayClusterName", ray_cluster_name_actual)
                     except Exception:
                         pass
                     raise TimeoutError(
                         f"RayJob '{actual_job_cr_name}' timed out after {job_timeout_seconds} seconds. Last status: {final_job_status}"
                     )
 
                 time.sleep(job_polling_interval_seconds)
-
-            print(f"RayJob '{actual_job_cr_name}' finished with status: {final_job_status}")
+
+            print(
+                f"RayJob '{actual_job_cr_name}' finished with status: {final_job_status}"
+            )
         else:
             try:
                 ray_job_status_cr = k8s_co_api.get_namespaced_custom_object_status(
-                    group="ray.io", version="v1", namespace=namespace, plural="rayjobs", name=actual_job_cr_name
+                    group="ray.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="rayjobs",
+                    name=actual_job_cr_name,
                 )
                 status_field = ray_job_status_cr.get("status", {})
                 final_job_status = status_field.get("jobStatus", "SUBMITTED")
-                returned_job_submission_id = status_field.get("jobId", job_config.submission_id)
+                returned_job_submission_id = status_field.get(
+                    "jobId", job_config.submission_id
+                )
                 dashboard_url = status_field.get("dashboardURL", dashboard_url)
-                ray_cluster_name_actual = status_field.get("rayClusterName", ray_cluster_name_actual)
+                ray_cluster_name_actual = status_field.get(
+                    "rayClusterName", ray_cluster_name_actual
+                )
             except ApiException as e:
                if e.status == 404:
                     final_job_status = "SUBMITTED_NOT_FOUND"
                 else:
-                    print(f"Warning: Could not fetch initial status for RayJob '{actual_job_cr_name}': {e}")
+                    print(
+                        f"Warning: Could not fetch initial status for RayJob '{actual_job_cr_name}': {e}"
+                    )
                     final_job_status = "UNKNOWN_API_ERROR"
 
         return {
@@ -825,20 +875,30 @@ def run_job_with_managed_cluster(
         }
 
     except ApiException as e:
-        print(f"Kubernetes API error during RayJob '{actual_job_cr_name}' management: {e.reason} (status: {e.status})")
+        print(
+            f"Kubernetes API error during RayJob '{actual_job_cr_name}' management: {e.reason} (status: {e.status})"
+        )
         final_status_on_error = "ERROR_BEFORE_SUBMISSION"
         if actual_job_cr_name:
             try:
                 ray_job_status_cr = k8s_co_api.get_namespaced_custom_object_status(
-                    group="ray.io", version="v1", namespace=namespace, plural="rayjobs", name=actual_job_cr_name
+                    group="ray.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="rayjobs",
+                    name=actual_job_cr_name,
                 )
                 status_field = ray_job_status_cr.get("status", {})
-                final_status_on_error = status_field.get("jobStatus", "UNKNOWN_AFTER_K8S_ERROR")
+                final_status_on_error = status_field.get(
+                    "jobStatus", "UNKNOWN_AFTER_K8S_ERROR"
+                )
             except Exception:
                 final_status_on_error = "UNKNOWN_FINAL_STATUS_FETCH_FAILED"
         raise
     except Exception as e:
-        print(f"An unexpected error occurred during managed RayJob execution for '{actual_job_cr_name}': {e}")
+        print(
+            f"An unexpected error occurred during managed RayJob execution for '{actual_job_cr_name}': {e}"
+        )
         raise
 
 
@@ -999,8 +1059,10 @@ def get_cluster(
     )
     # 1. Prepare RayClusterSpec from ClusterConfiguration
     # Create a temporary config with appwrapper=False to ensure build_ray_cluster returns RayCluster YAML
-    temp_cluster_config_dict = cluster_config.dict(exclude_none=True) # Assuming Pydantic V1 or similar .dict() method
-    temp_cluster_config_dict['appwrapper'] = False
+    temp_cluster_config_dict = cluster_config.dict(
+        exclude_none=True
+    )  # Assuming Pydantic V1 or similar .dict() method
+    temp_cluster_config_dict["appwrapper"] = False
     temp_cluster_config_for_spec = ClusterConfiguration(**temp_cluster_config_dict)
     # Ignore the warning here for the lack of a ClusterConfiguration
     with warnings.catch_warnings():
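
The heart of run_job_with_managed_cluster is polling the RayJob custom resource's status subresource until jobStatus reaches a terminal value. A minimal, standalone sketch of that loop with the kubernetes Python client (the job name, namespace, and 5-second interval are placeholders, not values from this commit):

    import time
    from kubernetes import client, config
    from kubernetes.client.rest import ApiException

    config.load_kube_config()  # or config.load_incluster_config() inside a pod
    k8s_co_api = client.CustomObjectsApi()

    name, namespace = "rayjob-example", "default"  # placeholders
    while True:
        try:
            # Read the RayJob CR's status subresource (group ray.io/v1, plural rayjobs)
            cr = k8s_co_api.get_namespaced_custom_object_status(
                group="ray.io",
                version="v1",
                namespace=namespace,
                plural="rayjobs",
                name=name,
            )
        except ApiException as e:
            if e.status == 404:  # status not populated yet; keep waiting
                time.sleep(5)
                continue
            raise
        status = cr.get("status", {})
        job_status = status.get("jobStatus", "PENDING")
        print(job_status, status.get("jobDeploymentStatus", "UNKNOWN"))
        if job_status in ["SUCCEEDED", "FAILED", "STOPPED"]:
            break
        time.sleep(5)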
