diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py index 4623a9e53..b94fe2526 100644 --- a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py @@ -2,7 +2,7 @@ from time import sleep -from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication +from codeflare_sdk import Cluster, ClusterConfiguration from codeflare_sdk.ray.client import RayJobClient import pytest @@ -68,6 +68,9 @@ def run_mnist_raycluster_sdk_kind( cluster.details() self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) + assert_get_cluster_and_jobsubmit( + self, "mnist", accelerator="gpu", number_of_gpus=1 + ) # Assertions @@ -106,8 +109,6 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpu client.delete_job(submission_id) - cluster.down() - def assert_job_completion(self, status): if status == "SUCCEEDED": print(f"Job has completed: '{status}'") diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py index 6bfb19af0..42d0c46b2 100644 --- a/tests/e2e/mnist_raycluster_sdk_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py @@ -2,7 +2,7 @@ from time import sleep -from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication +from codeflare_sdk import Cluster, ClusterConfiguration from codeflare_sdk.ray.client import RayJobClient import pytest @@ -44,8 +44,6 @@ def run_mnist_raycluster_sdk_kind( num_workers=1, head_cpu_requests="500m", head_cpu_limits="500m", - head_memory_requests=2, - head_memory_limits=2, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, @@ -68,6 +66,10 @@ def run_mnist_raycluster_sdk_kind( self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) + assert_get_cluster_and_jobsubmit( + self, "mnist", accelerator="gpu", number_of_gpus=1 + ) + # Assertions def 
assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): @@ -105,8 +107,6 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpu client.delete_job(submission_id) - cluster.down() - def assert_job_completion(self, status): if status == "SUCCEEDED": print(f"Job has completed: '{status}'") diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py index d3e698682..212c9784b 100644 --- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py +++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py @@ -2,7 +2,11 @@ from time import sleep -from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication +from codeflare_sdk import ( + Cluster, + ClusterConfiguration, + TokenAuthentication, +) from codeflare_sdk.ray.client import RayJobClient import pytest @@ -44,8 +48,6 @@ def run_mnist_raycluster_sdk_oauth(self): num_workers=1, head_cpu_requests="500m", head_cpu_limits="500m", - head_memory_requests=4, - head_memory_limits=4, worker_cpu_requests=1, worker_cpu_limits=1, worker_memory_requests=1, @@ -68,6 +70,7 @@ def run_mnist_raycluster_sdk_oauth(self): self.assert_jobsubmit_withoutLogin(cluster) self.assert_jobsubmit_withlogin(cluster) + assert_get_cluster_and_jobsubmit(self, "mnist") # Assertions @@ -132,8 +135,6 @@ def assert_jobsubmit_withlogin(self, cluster): client.delete_job(submission_id) - cluster.down() - def assert_job_completion(self, status): if status == "SUCCEEDED": print(f"Job has completed: '{status}'") diff --git a/tests/e2e/support.py b/tests/e2e/support.py index d76b460c8..2ff33e911 100644 --- a/tests/e2e/support.py +++ b/tests/e2e/support.py @@ -3,6 +3,7 @@ import random import string import subprocess +from codeflare_sdk import get_cluster from kubernetes import client, config import kubernetes.client from codeflare_sdk.common.kubernetes_cluster.kube_api_helpers import ( @@ -348,3 +349,45 @@ def get_nodes_by_label(self, node_labels): label_selector = 
# NOTE: in the collapsed diff this line opened with the unchanged context tail
# of get_nodes_by_label (no edit intended there):
#     ",".join(f"{k}={v}" for k, v in node_labels.items())
#     nodes = self.api_instance.list_node(label_selector=label_selector)
#     return [node.metadata.name for node in nodes.items]


def assert_get_cluster_and_jobsubmit(
    self, cluster_name, accelerator=None, number_of_gpus=None
):
    """Fetch an existing cluster by name, submit the MNIST job, then tear it down.

    Looks the cluster up with ``get_cluster``, submits ``python mnist.py``
    through the cluster's job client, checks that the job list contains exactly
    that one submission, and finally calls ``cluster.down()``.

    Args:
        self: The calling test instance; only ``self.namespace`` is read.
        cluster_name: Name of the cluster to retrieve.
        accelerator: Optional value forwarded as ``ACCELERATOR`` to
            ``get_setup_env_variables``; when falsy that helper is called with
            no arguments.
        number_of_gpus: Passed as ``entrypoint_num_gpus``. When ``None`` the
            entrypoint requests one CPU instead.

    Raises:
        AssertionError: If the job list does not consist of exactly the job
            submitted here.
    """
    # NOTE(review): the third positional argument is passed as False; confirm
    # which get_cluster flag that maps to in the pinned codeflare_sdk version.
    cluster = get_cluster(cluster_name, self.namespace, False)

    # Everything after a successful lookup runs under try/finally so the
    # cluster is brought down even when submission or an assertion fails.
    try:
        cluster.details()

        # Initialize the job client
        client = cluster.job_client

        # Submit a job and get the submission ID
        env_vars = (
            get_setup_env_variables(ACCELERATOR=accelerator)
            if accelerator
            else get_setup_env_variables()
        )
        submission_id = client.submit_job(
            entrypoint="python mnist.py",
            runtime_env={
                "working_dir": "./tests/e2e/",
                "pip": "./tests/e2e/mnist_pip_requirements.txt",
                "env_vars": env_vars,
            },
            # Request a CPU for the entrypoint only when no GPUs were asked for.
            entrypoint_num_cpus=1 if number_of_gpus is None else None,
            entrypoint_num_gpus=number_of_gpus,
        )
        print(f"Submitted job with ID: {submission_id}")

        # Fetch the list of jobs and validate
        job_list = client.list_jobs()
        print(f"List of Jobs: {job_list}")

        # Validate the number of jobs in the list
        assert len(job_list) == 1, f"Expected exactly 1 job, found {len(job_list)}"

        # Validate the submission ID matches
        assert job_list[0].submission_id == submission_id, (
            f"Listed job '{job_list[0].submission_id}' does not match "
            f"submitted job '{submission_id}'"
        )
    finally:
        cluster.down()