Skip to content

Commit bafab36

Browse files
committed
docs: enhance ray module code documentation
1 parent eb5ce8d commit bafab36

File tree

3 files changed

+239
-75
lines changed

3 files changed

+239
-75
lines changed

src/codeflare_sdk/ray/client/ray_jobs.py

+115-30
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,28 @@
2424

2525
class RayJobClient:
2626
"""
27-
A class that functions as a wrapper for the Ray Job Submission Client.
28-
29-
parameters:
30-
address -- Either (1) the address of the Ray cluster, or (2) the HTTP address of the dashboard server on the head node, e.g. “http://<head-node-ip>:8265”. In case (1) it must be specified as an address that can be passed to ray.init(),
31-
e.g. a Ray Client address (ray://<head_node_host>:10001), or “auto”, or “localhost:<port>”. If unspecified, will try to connect to a running local Ray cluster. This argument is always overridden by the RAY_ADDRESS environment variable.
32-
create_cluster_if_needed -- Indicates whether the cluster at the specified address needs to already be running. Ray doesn't start a cluster before interacting with jobs, but third-party job managers may do so.
33-
cookies -- Cookies to use when sending requests to the HTTP job server.
34-
metadata -- Arbitrary metadata to store along with all jobs. New metadata specified per job will be merged with the global metadata provided here via a simple dict update.
35-
headers -- Headers to use when sending requests to the HTTP job server, used for cases like authentication to a remote cluster.
36-
verify -- Boolean indication to verify the server's TLS certificate or a path to a file or directory of trusted certificates. Default: True.
27+
A wrapper class for the Ray Job Submission Client, used for interacting with Ray clusters to manage job
28+
submissions, deletions, and other job-related information.
29+
30+
Args:
31+
address (Optional[str]):
32+
The Ray cluster's address, which may be the Ray Client address, the HTTP address
32+
of the dashboard server on the head node, or "auto" / "localhost:<port>" for a local cluster.
33+
If unspecified, a connection to a running local Ray cluster is attempted. This is always overridden by the RAY_ADDRESS environment variable if set.
35+
create_cluster_if_needed (bool):
36+
Indicates whether the cluster at the specified address needs to already be
37+
running. Ray itself does not start a cluster before interacting with jobs, but third-party job managers may do so.
38+
cookies (Optional[Dict[str, Any]]):
39+
HTTP cookies to send with requests to the job server.
40+
metadata (Optional[Dict[str, Any]]):
41+
Global metadata to store with all jobs, merged with job-specific
42+
metadata during job submission.
43+
headers (Optional[Dict[str, Any]]):
44+
HTTP headers to send with requests to the job server; these can be used for
45+
authentication to a remote cluster.
46+
verify (Optional[Union[str, bool]]):
47+
If True, verifies the server's TLS certificate. Can also be a path
48+
to trusted certificates. Default is True.
3749
"""
3850

3951
def __init__(
@@ -67,18 +79,35 @@ def submit_job(
6779
entrypoint_resources: Optional[Dict[str, float]] = None,
6880
) -> str:
6981
"""
70-
Method for submitting jobs to a Ray Cluster and returning the job id with entrypoint being a mandatory field.
71-
72-
Parameters:
73-
entrypoint -- The shell command to run for this job.
74-
submission_id -- A unique ID for this job.
75-
runtime_env -- The runtime environment to install and run this job in.
76-
metadata -- Arbitrary data to store along with this job.
77-
job_id -- DEPRECATED. This has been renamed to submission_id
78-
entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
79-
entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
80-
entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
81-
entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it.
82+
Submits a job to the Ray cluster with specified resources and returns the job ID.
83+
84+
Args:
85+
entrypoint (str):
86+
The command to execute for this job.
87+
job_id (Optional[str]):
88+
Deprecated, use `submission_id`. A unique job identifier.
89+
runtime_env (Optional[Dict[str, Any]]):
90+
The runtime environment for this job.
91+
metadata (Optional[Dict[str, str]]):
92+
Metadata associated with the job, merged with global metadata.
93+
submission_id (Optional[str]):
94+
Unique ID for the job submission.
95+
entrypoint_num_cpus (Optional[Union[int, float]]):
96+
The quantity of CPU cores to reserve for the execution of the entrypoint command,
97+
separately from any tasks or actors launched by it. Defaults to 0.
98+
entrypoint_num_gpus (Optional[Union[int, float]]):
99+
The quantity of GPUs to reserve for the execution of the entrypoint command,
100+
separately from any tasks or actors launched by it. Defaults to 0.
101+
entrypoint_memory (Optional[int]):
102+
The quantity of memory to reserve for the execution of the entrypoint command,
103+
separately from any tasks or actors launched by it. Defaults to 0.
104+
entrypoint_resources (Optional[Dict[str, float]]):
105+
The quantity of custom resources to reserve for the execution of the entrypoint command,
106+
separately from any tasks or actors launched by it.
107+
108+
Returns:
109+
str:
110+
The unique identifier for the submitted job.
82111
"""
83112
return self.rayJobClient.submit_job(
84113
entrypoint=entrypoint,
@@ -94,7 +123,15 @@ def submit_job(
94123

95124
def delete_job(self, job_id: str) -> (bool, str):
96125
"""
97-
Method for deleting jobs with the job id being a mandatory field.
126+
Deletes a job by job ID.
127+
128+
Args:
129+
job_id (str):
130+
The unique identifier of the job to delete.
131+
132+
Returns:
133+
tuple(bool, str):
134+
A tuple with deletion status and a message.
98135
"""
99136
deletion_status = self.rayJobClient.delete_job(job_id=job_id)
100137

@@ -107,37 +144,77 @@ def delete_job(self, job_id: str) -> (bool, str):
107144

108145
def get_address(self) -> str:
109146
"""
110-
Method for getting the address from the RayJobClient
147+
Retrieves the address of the connected Ray cluster.
148+
149+
Returns:
150+
str:
151+
The Ray cluster's address.
111152
"""
112153
return self.rayJobClient.get_address()
113154

114155
def get_job_info(self, job_id: str):
115156
"""
116-
Method for getting the job info with the job id being a mandatory field.
157+
Fetches information about a job by job ID.
158+
159+
Args:
160+
job_id (str):
161+
The unique identifier of the job.
162+
163+
Returns:
164+
JobInfo:
165+
Information about the job's status, progress, and other details.
117166
"""
118167
return self.rayJobClient.get_job_info(job_id=job_id)
119168

120169
def get_job_logs(self, job_id: str) -> str:
121170
"""
122-
Method for getting the job logs with the job id being a mandatory field.
171+
Retrieves the logs for a specific job by job ID.
172+
173+
Args:
174+
job_id (str):
175+
The unique identifier of the job.
176+
177+
Returns:
178+
str:
179+
Logs output from the job.
123180
"""
124181
return self.rayJobClient.get_job_logs(job_id=job_id)
125182

126183
def get_job_status(self, job_id: str) -> str:
127184
"""
128-
Method for getting the job's status with the job id being a mandatory field.
185+
Fetches the current status of a job by job ID.
186+
187+
Args:
188+
job_id (str):
189+
The unique identifier of the job.
190+
191+
Returns:
192+
str:
193+
The job's status.
129194
"""
130195
return self.rayJobClient.get_job_status(job_id=job_id)
131196

132197
def list_jobs(self) -> List[JobDetails]:
133198
"""
134-
Method for getting a list of current jobs in the Ray Cluster.
199+
Lists all current jobs in the Ray cluster.
200+
201+
Returns:
202+
List[JobDetails]:
203+
A list of job details for each current job in the cluster.
135204
"""
136205
return self.rayJobClient.list_jobs()
137206

138207
def stop_job(self, job_id: str) -> (bool, str):
139208
"""
140-
Method for stopping a job with the job id being a mandatory field.
209+
Stops a running job by job ID.
210+
211+
Args:
212+
job_id (str):
213+
The unique identifier of the job to stop.
214+
215+
Returns:
216+
tuple(bool, str):
217+
A tuple with the stop status and a message.
141218
"""
142219
stop_job_status = self.rayJobClient.stop_job(job_id=job_id)
143220
if stop_job_status:
@@ -148,6 +225,14 @@ def stop_job(self, job_id: str) -> (bool, str):
148225

149226
def tail_job_logs(self, job_id: str) -> Iterator[str]:
150227
"""
151-
Method for getting an iterator that follows the logs of a job with the job id being a mandatory field.
228+
Continuously streams the logs of a job.
229+
230+
Args:
231+
job_id (str):
232+
The unique identifier of the job.
233+
234+
Returns:
235+
Iterator[str]:
236+
An iterator that yields log entries in real-time.
152237
"""
153238
return self.rayJobClient.tail_job_logs(job_id=job_id)

src/codeflare_sdk/ray/cluster/cluster.py

+79-22
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,17 @@ def status(
296296
return status, ready
297297

298298
def is_dashboard_ready(self) -> bool:
299+
"""
300+
Checks if the cluster's dashboard is ready and accessible.
301+
302+
This method attempts to send a GET request to the cluster dashboard URI.
303+
If the request is successful (HTTP status code 200), it returns True.
304+
If an SSL error occurs, it returns False, indicating the dashboard is not ready.
305+
306+
Returns:
307+
bool:
308+
True if the dashboard is ready, False otherwise.
309+
"""
299310
try:
300311
response = requests.get(
301312
self.cluster_dashboard_uri(),
@@ -313,8 +324,22 @@ def is_dashboard_ready(self) -> bool:
313324

314325
def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True):
315326
"""
316-
Waits for requested cluster to be ready, up to an optional timeout (s).
317-
Checks every five seconds.
327+
Waits for the requested cluster to be ready, up to an optional timeout.
328+
329+
This method checks the status of the cluster every five seconds until it is
330+
ready or the timeout is reached. If dashboard_check is enabled, it will also
331+
check for the readiness of the dashboard.
332+
333+
Args:
334+
timeout (Optional[int]):
335+
The maximum time, in seconds, to wait for the cluster to be ready. If None, waits indefinitely.
336+
dashboard_check (bool):
337+
Flag to determine if the dashboard readiness should
338+
be checked. Defaults to True.
339+
340+
Raises:
341+
TimeoutError:
342+
If the timeout is reached before the cluster or dashboard is ready.
318343
"""
319344
print("Waiting for requested resources to be set up...")
320345
time = 0
@@ -346,6 +371,21 @@ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True
346371
time += 5
347372

348373
def details(self, print_to_console: bool = True) -> RayCluster:
374+
"""
375+
Retrieves details about the Ray Cluster.
376+
377+
This method returns a copy of the Ray Cluster information and optionally prints
378+
the details to the console.
379+
380+
Args:
381+
print_to_console (bool):
382+
Flag to determine if the cluster details should be
383+
printed to the console. Defaults to True.
384+
385+
Returns:
386+
RayCluster:
387+
A copy of the Ray Cluster details.
388+
"""
349389
cluster = _copy_to_ray(self)
350390
if print_to_console:
351391
pretty_print.print_clusters([cluster])
@@ -447,6 +487,13 @@ def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
447487
return head_extended_resources, worker_extended_resources
448488

449489
def local_client_url(self):
490+
"""
491+
Constructs the URL for the local Ray client.
492+
493+
Returns:
494+
str:
495+
The Ray client URL based on the ingress domain.
496+
"""
450497
ingress_domain = _get_ingress_domain(self)
451498
return f"ray://{ingress_domain}"
452499

@@ -504,6 +551,13 @@ def list_all_queued(
504551

505552

506553
def get_current_namespace(): # pragma: no cover
554+
"""
555+
Retrieves the current Kubernetes namespace.
556+
557+
Returns:
558+
Optional[str]:
559+
The current namespace or None if not found.
560+
"""
507561
if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
508562
try:
509563
file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r")
@@ -528,26 +582,29 @@ def get_cluster(
528582
verify_tls: bool = True,
529583
write_to_file: bool = False,
530584
):
531-
"""Returns the given Ray Cluster/AppWrapper as a Cluster Object
532-
533-
The get_cluster() method is used for retrieving a Ray Cluster that already exists in your K8s Cluster.
534-
Returned is a basic Cluster object which includes the exact yaml for your Ray Cluster under Cluster.resource_yaml.
535-
536-
Parameters
537-
----------
538-
cluster_name : str
539-
The name of the Ray Cluster/AppWrapper
540-
namespace : str
541-
The namespace of the Ray Cluster/AppWrapper
542-
verify_tls : bool
543-
A boolean indicating whether to verify TLS when connecting to the cluster
544-
write_to_file : bool
545-
A boolean indicating whether or not to write the resource to a Yaml file
546-
547-
Raises
548-
------
549-
Exception
550-
If the Ray Cluster/AppWrapper cannot be found/does not exist
585+
"""
586+
Retrieves an existing Ray Cluster or AppWrapper as a Cluster object.
587+
588+
This function fetches an existing Ray Cluster or AppWrapper from the Kubernetes cluster and returns
589+
it as a `Cluster` object, including its YAML configuration under `Cluster.resource_yaml`.
590+
591+
Args:
592+
cluster_name (str):
593+
The name of the Ray Cluster or AppWrapper.
594+
namespace (str, optional):
595+
The Kubernetes namespace where the Ray Cluster or AppWrapper is located. Default is "default".
596+
verify_tls (bool, optional):
597+
Whether to verify TLS when connecting to the cluster. Default is True.
598+
write_to_file (bool, optional):
599+
If True, writes the resource configuration to a YAML file. Default is False.
600+
601+
Returns:
602+
Cluster:
603+
A Cluster object representing the retrieved Ray Cluster or AppWrapper.
604+
605+
Raises:
606+
Exception:
607+
If the Ray Cluster or AppWrapper cannot be found or does not exist.
551608
"""
552609
config_check()
553610
api_instance = client.CustomObjectsApi(get_api_client())

0 commit comments

Comments
 (0)