From e4c489336fdfd9cdbef031f00feb7164fc901301 Mon Sep 17 00:00:00 2001
From: Ignas Baranauskas
Date: Tue, 29 Oct 2024 11:58:01 +0000
Subject: [PATCH 1/2] docs: enhance ray module code documentation

--- src/codeflare_sdk/ray/client/ray_jobs.py | 145 ++++++++++++++++++----- src/codeflare_sdk/ray/cluster/cluster.py | 101 ++++++++++++---- src/codeflare_sdk/ray/cluster/config.py | 68 +++++++---- 3 files changed, 239 insertions(+), 75 deletions(-)

diff --git a/src/codeflare_sdk/ray/client/ray_jobs.py b/src/codeflare_sdk/ray/client/ray_jobs.py index 9f0022951..2c0ceee00 100644 --- a/src/codeflare_sdk/ray/client/ray_jobs.py +++ b/src/codeflare_sdk/ray/client/ray_jobs.py @@ -24,16 +24,28 @@ class RayJobClient: """ - A class that functions as a wrapper for the Ray Job Submission Client. - - parameters: - address -- Either (1) the address of the Ray cluster, or (2) the HTTP address of the dashboard server on the head node, e.g. “http://<head-node-ip>:8265”. In case (1) it must be specified as an address that can be passed to ray.init(), - e.g. a Ray Client address (ray://<head_node_host>:10001), or “auto”, or “localhost:<port>”. If unspecified, will try to connect to a running local Ray cluster. This argument is always overridden by the RAY_ADDRESS environment variable. - create_cluster_if_needed -- Indicates whether the cluster at the specified address needs to already be running. Ray doesn't start a cluster before interacting with jobs, but third-party job managers may do so. - cookies -- Cookies to use when sending requests to the HTTP job server. - metadata -- Arbitrary metadata to store along with all jobs. New metadata specified per job will be merged with the global metadata provided here via a simple dict update. - headers -- Headers to use when sending requests to the HTTP job server, used for cases like authentication to a remote cluster. - verify -- Boolean indication to verify the server's TLS certificate or a path to a file or directory of trusted certificates. Default: True. + A wrapper class for the Ray Job Submission Client, used for interacting with Ray clusters to submit, + inspect, stop, and delete jobs. + + Args: + address (Optional[str]): + The Ray cluster's address, which may be either the Ray Client address, the HTTP address + of the dashboard server on the head node, or "auto" / "localhost:<port>" for a local cluster. + This is overridden by the RAY_ADDRESS environment variable if set. + create_cluster_if_needed (bool): + If True, the cluster at the specified address does not need to already be running. + Ray itself never starts a cluster before interacting with jobs, but third-party + job managers may do so. + cookies (Optional[Dict[str, Any]]): + HTTP cookies to send with requests to the job server. + metadata (Optional[Dict[str, Any]]): + Global metadata to store with all jobs, merged with job-specific + metadata during job submission. + headers (Optional[Dict[str, Any]]): + HTTP headers to send with requests to the job server, can be used for + authentication. + verify (Optional[Union[str, bool]]): + If True, verifies the server's TLS certificate. Can also be a path + to trusted certificates. Default is True. """ def __init__( @@ -67,18 +79,35 @@ def submit_job( entrypoint_resources: Optional[Dict[str, float]] = None, ) -> str: """ - Method for submitting jobs to a Ray Cluster and returning the job id with entrypoint being a mandatory field. - - Parameters: - entrypoint -- The shell command to run for this job. - submission_id -- A unique ID for this job. - runtime_env -- The runtime environment to install and run this job in.
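# --- Editor's usage sketch (illustrative, not part of this patch) ---
# Constructing a RayJobClient against a remote cluster dashboard, assuming the
# SDK's top-level re-export of RayJobClient. The dashboard URL and bearer
# token below are hypothetical placeholders.
from codeflare_sdk import RayJobClient

auth_token = "XXXX"  # hypothetical: obtained from your cluster's auth provider
client = RayJobClient(
    address="https://ray-dashboard-raytest.apps.example.com",
    headers={"Authorization": f"Bearer {auth_token}"},
    verify=True,  # or a path to a trusted CA bundle for self-signed certs
)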
- metadata -- Arbitrary data to store along with this job. - job_id -- DEPRECATED. This has been renamed to submission_id - entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. - entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. - entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. - entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. + Submits a job to the Ray cluster with specified resources and returns the job ID. + + Args: + entrypoint (str): + The command to execute for this job. + job_id (Optional[str]): + Deprecated, use `submission_id`. A unique job identifier. + runtime_env (Optional[Dict[str, Any]]): + The runtime environment for this job. + metadata (Optional[Dict[str, str]]): + Metadata associated with the job, merged with global metadata. + submission_id (Optional[str]): + Unique ID for the job submission. + entrypoint_num_cpus (Optional[Union[int, float]]): + The quantity of CPU cores to reserve for the execution of the entrypoint command, + separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_num_gpus (Optional[Union[int, float]]): + The quantity of GPUs to reserve for the execution of the entrypoint command, + separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_memory (Optional[int]): + The quantity of memory to reserve for the execution of the entrypoint command, + separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_resources (Optional[Dict[str, float]]): + The quantity of custom resources to reserve for the execution of the entrypoint command, + separately from any tasks or actors launched by it. + + Returns: + str: + The unique identifier for the submitted job. """ return self.rayJobClient.submit_job( entrypoint=entrypoint, @@ -94,7 +123,15 @@ def submit_job( def delete_job(self, job_id: str) -> (bool, str): """ - Method for deleting jobs with the job id being a mandatory field. + Deletes a job by job ID. + + Args: + job_id (str): + The unique identifier of the job to delete. + + Returns: + tuple(bool, str): + A tuple with deletion status and a message. """ deletion_status = self.rayJobClient.delete_job(job_id=job_id) @@ -107,37 +144,77 @@ def delete_job(self, job_id: str) -> (bool, str): def get_address(self) -> str: """ - Method for getting the address from the RayJobClient + Retrieves the address of the connected Ray cluster. + + Returns: + str: + The Ray cluster's address. """ return self.rayJobClient.get_address() def get_job_info(self, job_id: str): """ - Method for getting the job info with the job id being a mandatory field. + Fetches information about a job by job ID. + + Args: + job_id (str): + The unique identifier of the job. + + Returns: + JobInfo: + Information about the job's status, progress, and other details. """ return self.rayJobClient.get_job_info(job_id=job_id) def get_job_logs(self, job_id: str) -> str: """ - Method for getting the job logs with the job id being a mandatory field. + Retrieves the logs for a specific job by job ID. + + Args: + job_id (str): + The unique identifier of the job. 
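# --- Editor's usage sketch (illustrative, not part of this patch) ---
# Submitting a job with the parameters documented above, reusing `client` from
# the previous sketch. The entrypoint script and pip dependency are placeholders.
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={"working_dir": "./", "pip": ["torch"]},
    entrypoint_num_cpus=1,  # reserved for the entrypoint command itself
)
print(client.get_job_info(submission_id))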
+ + Returns: + str: + Logs output from the job. """ return self.rayJobClient.get_job_logs(job_id=job_id) def get_job_status(self, job_id: str) -> str: """ - Method for getting the job's status with the job id being a mandatory field. + Fetches the current status of a job by job ID. + + Args: + job_id (str): + The unique identifier of the job. + + Returns: + str: + The job's status. """ return self.rayJobClient.get_job_status(job_id=job_id) def list_jobs(self) -> List[JobDetails]: """ - Method for getting a list of current jobs in the Ray Cluster. + Lists all current jobs in the Ray cluster. + + Returns: + List[JobDetails]: + A list of job details for each current job in the cluster. """ return self.rayJobClient.list_jobs() def stop_job(self, job_id: str) -> (bool, str): """ - Method for stopping a job with the job id being a mandatory field. + Stops a running job by job ID. + + Args: + job_id (str): + The unique identifier of the job to stop. + + Returns: + tuple(bool, str): + A tuple with the stop status and a message. """ stop_job_status = self.rayJobClient.stop_job(job_id=job_id) if stop_job_status: @@ -148,6 +225,14 @@ def stop_job(self, job_id: str) -> (bool, str): def tail_job_logs(self, job_id: str) -> Iterator[str]: """ - Method for getting an iterator that follows the logs of a job with the job id being a mandatory field. + Continuously streams the logs of a job. + + Args: + job_id (str): + The unique identifier of the job. + + Returns: + Iterator[str]: + An iterator that yields log entries in real-time. """ return self.rayJobClient.tail_job_logs(job_id=job_id) diff --git a/src/codeflare_sdk/ray/cluster/cluster.py b/src/codeflare_sdk/ray/cluster/cluster.py index fe29eaa97..fd0592771 100644 --- a/src/codeflare_sdk/ray/cluster/cluster.py +++ b/src/codeflare_sdk/ray/cluster/cluster.py @@ -296,6 +296,17 @@ def status( return status, ready def is_dashboard_ready(self) -> bool: + """ + Checks if the cluster's dashboard is ready and accessible. + + This method attempts to send a GET request to the cluster dashboard URI. + If the request is successful (HTTP status code 200), it returns True. + If an SSL error occurs, it returns False, indicating the dashboard is not ready. + + Returns: + bool: + True if the dashboard is ready, False otherwise. + """ try: response = requests.get( self.cluster_dashboard_uri(), @@ -313,8 +324,22 @@ def is_dashboard_ready(self) -> bool: def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ - Waits for requested cluster to be ready, up to an optional timeout (s). - Checks every five seconds. + Waits for the requested cluster to be ready, up to an optional timeout. + + This method checks the status of the cluster every five seconds until it is + ready or the timeout is reached. If dashboard_check is enabled, it will also + check for the readiness of the dashboard. + + Args: + timeout (Optional[int]): + The maximum time to wait for the cluster to be ready in seconds. If None, waits indefinitely. + dashboard_check (bool): + Flag to determine if the dashboard readiness should + be checked. Defaults to True. + + Raises: + TimeoutError: + If the timeout is reached before the cluster or dashboard is ready. """ print("Waiting for requested resources to be set up...") time = 0 @@ -346,6 +371,21 @@ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True time += 5 def details(self, print_to_console: bool = True) -> RayCluster: + """ + Retrieves details about the Ray Cluster. 
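# --- Editor's usage sketch (illustrative, not part of this patch) ---
# Following a job's output with tail_job_logs, then tidying up; assumes
# `client` and `submission_id` from the sketches above. The iterator ends
# once the job reaches a terminal state.
for line in client.tail_job_logs(submission_id):
    print(line, end="")

print(client.get_job_status(submission_id))
client.delete_job(submission_id)  # remove the finished job from the cluster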
+ + This method returns a copy of the Ray Cluster information and optionally prints + the details to the console. + + Args: + print_to_console (bool): + Flag to determine if the cluster details should be + printed to the console. Defaults to True. + + Returns: + RayCluster: + A copy of the Ray Cluster details. + """ cluster = _copy_to_ray(self) if print_to_console: pretty_print.print_clusters([cluster]) @@ -447,6 +487,13 @@ def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]: return head_extended_resources, worker_extended_resources def local_client_url(self): + """ + Constructs the URL for the local Ray client. + + Returns: + str: + The Ray client URL based on the ingress domain. + """ ingress_domain = _get_ingress_domain(self) return f"ray://{ingress_domain}" @@ -504,6 +551,13 @@ def list_all_queued( def get_current_namespace(): # pragma: no cover + """ + Retrieves the current Kubernetes namespace. + + Returns: + str: + The current namespace or None if not found. + """ if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): try: file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") @@ -528,26 +582,29 @@ def get_cluster( verify_tls: bool = True, write_to_file: bool = False, ): - """Returns the given Ray Cluster/AppWrapper as a Cluster Object - - The get_cluster() method is used for retrieving a Ray Cluster that already exists in your K8s Cluster. - Returned is a basic Cluster object which includes the exact yaml for your Ray Cluster under Cluster.resource_yaml. - - Parameters - ---------- - cluster_name : str - The name of the Ray Cluster/AppWrapper - namespace : str - The namespace of the Ray Cluster/AppWrapper - verify_tls : bool - A boolean indicating whether to verify TLS when connecting to the cluster - write_to_file : bool - A boolean indicating whether or not to write the resource to a Yaml file - - Raises - ------ - Exception - If the Ray Cluster/AppWrapper cannot be found/does not exist + """ + Retrieves an existing Ray Cluster or AppWrapper as a Cluster object. + + This function fetches an existing Ray Cluster or AppWrapper from the Kubernetes cluster and returns + it as a `Cluster` object, including its YAML configuration under `Cluster.resource_yaml`. + + Args: + cluster_name (str): + The name of the Ray Cluster or AppWrapper. + namespace (str, optional): + The Kubernetes namespace where the Ray Cluster or AppWrapper is located. Default is "default". + verify_tls (bool, optional): + Whether to verify TLS when connecting to the cluster. Default is True. + write_to_file (bool, optional): + If True, writes the resource configuration to a YAML file. Default is False. + + Returns: + Cluster: + A Cluster object representing the retrieved Ray Cluster or AppWrapper. + + Raises: + Exception: + If the Ray Cluster or AppWrapper cannot be found or does not exist. """ config_check() api_instance = client.CustomObjectsApi(get_api_client()) diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index b8b1652e3..f321c278a 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -44,29 +44,51 @@ class ClusterConfiguration: This dataclass is used to specify resource requirements and other details, and is passed in as an argument when creating a Cluster object. - Attributes: - - name: The name of the cluster. - - namespace: The namespace in which the cluster should be created. - - head_cpus: The number of CPUs to allocate to the head node. 
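# --- Editor's usage sketch (illustrative, not part of this patch) ---
# Retrieving an existing Ray Cluster with get_cluster() and connecting a Ray
# client to it. The cluster name and namespace are placeholders, and
# local_client_url() assumes Ray client ingress is configured for the cluster.
import ray
from codeflare_sdk import get_cluster

cluster = get_cluster(cluster_name="raytest", namespace="default")
cluster.wait_ready(timeout=300)  # raises TimeoutError after 5 minutes
cluster.details()                # pretty-prints the cluster summary
ray.init(address=cluster.local_client_url())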
- - head_memory: The amount of memory to allocate to the head node. - - head_gpus: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) - - head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} - - min_cpus: The minimum number of CPUs to allocate to each worker. - - max_cpus: The maximum number of CPUs to allocate to each worker. - - num_workers: The number of workers to create. - - min_memory: The minimum amount of memory to allocate to each worker. - - max_memory: The maximum amount of memory to allocate to each worker. - - num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) - - appwrapper: A boolean indicating whether to use an AppWrapper. - - envs: A dictionary of environment variables to set for the cluster. - - image: The image to use for the cluster. - - image_pull_secrets: A list of image pull secrets to use for the cluster. - - write_to_file: A boolean indicating whether to write the cluster configuration to a file. - - verify_tls: A boolean indicating whether to verify TLS when connecting to the cluster. - - labels: A dictionary of labels to apply to the cluster. - - worker_extended_resource_requests: A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1} - - extended_resource_mapping: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names - - overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. + Args: + name: + The name of the cluster. + namespace: + The namespace in which the cluster should be created. + head_cpus: + The number of CPUs to allocate to the head node. + head_memory: + The amount of memory to allocate to the head node. + head_gpus: + The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) + head_extended_resource_requests: + A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + min_cpus: + The minimum number of CPUs to allocate to each worker. + max_cpus: + The maximum number of CPUs to allocate to each worker. + num_workers: + The number of workers to create. + min_memory: + The minimum amount of memory to allocate to each worker. + max_memory: + The maximum amount of memory to allocate to each worker. + num_gpus: + The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + appwrapper: + A boolean indicating whether to use an AppWrapper. + envs: + A dictionary of environment variables to set for the cluster. + image: + The image to use for the cluster. + image_pull_secrets: + A list of image pull secrets to use for the cluster. + write_to_file: + A boolean indicating whether to write the cluster configuration to a file. + verify_tls: + A boolean indicating whether to verify TLS when connecting to the cluster. + labels: + A dictionary of labels to apply to the cluster. + worker_extended_resource_requests: + A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1} + extended_resource_mapping: + A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names + overwrite_default_resource_mapping: + A boolean indicating whether to overwrite the default resource mapping. 
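# --- Editor's usage sketch (illustrative, not part of this patch) ---
# A ClusterConfiguration exercising the fields documented above; values mirror
# the documentation example later in this patch and are illustrative only.
from codeflare_sdk import Cluster, ClusterConfiguration

cluster = Cluster(ClusterConfiguration(
    name="raytest",
    namespace="default",
    num_workers=2,
    worker_memory_requests=2,
    worker_memory_limits=2,
    worker_extended_resource_requests={"nvidia.com/gpu": 1},
    labels={"exampleLabel": "example"},
))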
""" name: str From fdfabfb81fb7692f250a6a6b7f716710578d8466 Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Tue, 12 Nov 2024 10:03:24 +0000 Subject: [PATCH 2/2] removing params that do not exist --- docs/sphinx/user-docs/cluster-configuration.rst | 1 - src/codeflare_sdk/common/utils/unit_test_support.py | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index 238ad51c9..dc3f2cf42 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -25,7 +25,6 @@ requirements for creating the Ray Cluster. worker_memory_requests=2, # Default 2 worker_memory_limits=2, # Default 2 # image="", # Optional Field - machine_types=["m5.xlarge", "g4dn.xlarge"], labels={"exampleLabel": "example", "secondLabel": "example"}, )) diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 88b65aa4f..9345fbc37 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -62,7 +62,6 @@ def createClusterWrongType(): worker_memory_limits=6, worker_extended_resource_requests={"nvidia.com/gpu": 7}, appwrapper=True, - machine_types=[True, False], image_pull_secrets=["unit-test-pull-secret"], image="quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06", write_to_file=True,