diff --git a/.gitignore b/.gitignore
index eef1052fe..fbb31b2b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ Pipfile.lock
poetry.lock
.venv*
build/
+tls-cluster-namespace
+quicktest.yaml
diff --git a/demo-notebooks/interactive/local_interactive.ipynb b/demo-notebooks/interactive/local_interactive.ipynb
index 88a6ccd58..d70c00df7 100644
--- a/demo-notebooks/interactive/local_interactive.ipynb
+++ b/demo-notebooks/interactive/local_interactive.ipynb
@@ -32,20 +32,12 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
"metadata": {
"tags": []
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Written to: hfgputest-1.yaml\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Create our cluster and submit appwrapper\n",
"namespace = \"default\"\n",
@@ -89,7 +81,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "12eef53c",
"metadata": {},
@@ -99,38 +90,21 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "import openshift as oc\n",
"from codeflare_sdk.utils import generate_cert\n",
"\n",
"if local_interactive:\n",
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
- " generate_cert.export_env(cluster_name, namespace)\n",
- "\n",
- "with oc.project(namespace):\n",
- " routes=oc.selector(\"route\").objects()\n",
- " rayclient_url=\"\"\n",
- " for r in routes:\n",
- " if \"rayclient\" in r.name():\n",
- " rayclient_url=r.model.spec.host\n",
- "print(rayclient_url)"
+ " generate_cert.export_env(cluster_name, namespace)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 6,
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
"metadata": {
"scrolled": true,
@@ -141,15 +115,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
- "2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
- "2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
- "2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
- "2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
- "2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
- "2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
- "2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
- "2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
+ "2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
+ "2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
+ "2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
+ "2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
+ "2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n",
+ "2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
+ "2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
+ "2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
+ "2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
]
},
{
@@ -190,7 +164,7 @@
" \n",
"
\n",
" Dashboard: | \n",
- " http://10.254.12.141:8265 | \n",
+ " http://10.254.20.41:8265 | \n",
"
\n",
"\n",
" \n",
@@ -198,10 +172,10 @@
"\n"
],
"text/plain": [
- "ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)"
+ "ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)"
]
},
- "execution_count": 12,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -210,12 +184,12 @@
"import ray\n",
"\n",
"ray.shutdown()\n",
- "ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
+ "ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 7,
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
"metadata": {},
"outputs": [],
@@ -239,7 +213,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
"metadata": {
"scrolled": true,
@@ -250,8 +224,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
- "2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
+ "2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
+ "2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
]
}
],
@@ -261,7 +235,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 9,
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
"metadata": {},
"outputs": [
@@ -269,8 +243,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
- "2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
+ "2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
+ "2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
]
},
{
@@ -279,7 +253,7 @@
"1789.4644387076714"
]
},
- "execution_count": 15,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -290,7 +264,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 10,
"id": "9e79b547-a457-4232-b77d-19147067b972",
"metadata": {},
"outputs": [
@@ -298,10 +272,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
+ "2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"}\n",
"\n",
- "2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
+ "2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
]
}
],
@@ -312,7 +286,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 11,
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
"metadata": {},
"outputs": [],
diff --git a/pyproject.toml b/pyproject.toml
index ffbce20f1..e4224fd99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,9 +24,10 @@ python = "^3.7"
openshift-client = "1.0.18"
rich = "^12.5"
ray = {version = "2.5.0", extras = ["default"]}
-kubernetes = "25.3.0"
+kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"
+executing = "1.2.0"
[tool.poetry.group.docs]
optional = true
diff --git a/requirements.txt b/requirements.txt
index c654bf782..2a48812aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ ray[default]==2.5.0
kubernetes>=25.3.0,<27
codeflare-torchx==0.6.0.dev0
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
+cryptography==40.0.2
+executing==1.2.0
diff --git a/src/codeflare_sdk/cluster/auth.py b/src/codeflare_sdk/cluster/auth.py
index 33ad8cf7d..85db3d61d 100644
--- a/src/codeflare_sdk/cluster/auth.py
+++ b/src/codeflare_sdk/cluster/auth.py
@@ -20,8 +20,12 @@
"""
import abc
-import openshift as oc
-from openshift import OpenShiftPythonException
+from kubernetes import client, config
+
+global api_client
+api_client = None
+global config_path
+config_path = None
class Authentication(metaclass=abc.ABCMeta):
@@ -43,80 +47,131 @@ def logout(self):
pass
+class KubeConfiguration(metaclass=abc.ABCMeta):
+    """
+    Base class for objects that point the SDK at a user-defined Kubernetes config file
+    through `load_kube_config()`.
+
+    No method is marked abstract, so both methods below are plain no-op placeholders
+    that subclasses may override.
+    """
+
+    def load_kube_config(self):
+        """
+        Set the Kubernetes configuration to a certain file. Does nothing in this base class.
+        """
+        return None
+
+    def logout(self):
+        """
+        Log out of the remote cluster. Does nothing in this base class.
+        """
+        return None
+
+
class TokenAuthentication(Authentication):
"""
- `TokenAuthentication` is a subclass of `Authentication`. It can be used to authenticate to an OpenShift
+ `TokenAuthentication` is a subclass of `Authentication`. It can be used to authenticate to a Kubernetes
cluster when the user has an API token and the API server address.
"""
- def __init__(self, token: str = None, server: str = None, skip_tls: bool = False):
+ def __init__(
+ self,
+ token: str,
+ server: str,
+ skip_tls: bool = False,
+ ca_cert_path: str = None,
+ ):
"""
Initialize a TokenAuthentication object that requires a value for `token`, the API Token
- and `server`, the API server address for authenticating to an OpenShift cluster.
+ and `server`, the API server address for authenticating to a Kubernetes cluster.
"""
self.token = token
self.server = server
self.skip_tls = skip_tls
+ self.ca_cert_path = ca_cert_path
def login(self) -> str:
"""
- This function is used to login to an OpenShift cluster using the user's API token and API server address.
- Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls`
- to `True`.
+ This function is used to log in to a Kubernetes cluster using the user's API token and API server address.
+        Depending on the cluster, a user can choose to log in with `--insecure-skip-tls-verify` by setting `skip_tls`
+        to `True` or `--certificate-authority` by setting `skip_tls` to `False` and providing a path to a ca bundle with `ca_cert_path`.
"""
- args = [f"--token={self.token}", f"--server={self.server}"]
- if self.skip_tls:
- args.append("--insecure-skip-tls-verify")
+ global config_path
+ global api_client
try:
- response = oc.invoke("login", args)
- except OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if "The server uses a certificate signed by unknown authority" in error_msg:
- return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication"
- elif "invalid" in error_msg:
- raise PermissionError(error_msg)
+ configuration = client.Configuration()
+ configuration.api_key_prefix["authorization"] = "Bearer"
+ configuration.host = self.server
+ configuration.api_key["authorization"] = self.token
+ if self.skip_tls == False and self.ca_cert_path == None:
+ configuration.verify_ssl = True
+ elif self.skip_tls == False:
+ configuration.ssl_ca_cert = self.ca_cert_path
else:
- return error_msg
- return response.out()
+ configuration.verify_ssl = False
+ api_client = client.ApiClient(configuration)
+ client.AuthenticationApi(api_client).get_api_group()
+ config_path = None
+ return "Logged into %s" % self.server
+ except client.ApiException: # pragma: no cover
+ api_client = None
+ print("Authentication Error please provide the correct token + server")
def logout(self) -> str:
"""
- This function is used to logout of an OpenShift cluster.
+        This function is used to log out of a Kubernetes cluster.
"""
- args = [f"--token={self.token}", f"--server={self.server}"]
- response = oc.invoke("logout", args)
- return response.out()
+ global config_path
+ config_path = None
+ global api_client
+ api_client = None
+ return "Successfully logged out of %s" % self.server
-class PasswordUserAuthentication(Authentication):
+class KubeConfigFileAuthentication(KubeConfiguration):
"""
- `PasswordUserAuthentication` is a subclass of `Authentication`. It can be used to authenticate to an OpenShift
- cluster when the user has a username and password.
+    A class that defines the necessary methods for passing a user's own Kubernetes config file.
+    Specifically this class defines the `load_kube_config()` method; the fallback to the default config file lives in the module-level `config_check()` function.
"""
- def __init__(
- self,
- username: str = None,
- password: str = None,
- ):
- """
- Initialize a PasswordUserAuthentication object that requires a value for `username`
- and `password` for authenticating to an OpenShift cluster.
- """
- self.username = username
- self.password = password
+ def __init__(self, kube_config_path: str = None):
+ self.kube_config_path = kube_config_path
- def login(self) -> str:
+ def load_kube_config(self):
"""
- This function is used to login to an OpenShift cluster using the user's `username` and `password`.
+ Function for loading a user's own predefined Kubernetes config file.
"""
- response = oc.login(self.username, self.password)
- return response.out()
+ global config_path
+ global api_client
+ try:
+ if self.kube_config_path == None:
+ return "Please specify a config file path"
+ config_path = self.kube_config_path
+ api_client = None
+ config.load_kube_config(config_path)
+ response = "Loaded user config file at path %s" % self.kube_config_path
+ except config.ConfigException: # pragma: no cover
+ config_path = None
+ raise Exception("Please specify a config file path")
+ return response
+
+
+def config_check() -> str:
+    """
+    Make sure a Kubernetes configuration is available before an API call is made.
+
+    If the user has neither specified their own config file nor logged in with a token
+    and server, the config file at the default location (~/.kube/config) is loaded.
+
+    Returns the user-specified config file path when one is registered and no
+    token-based API client is active; returns `None` in every other case.
+    """
+    # The module globals are only read here, so no `global` statement is required.
+    if api_client is not None:
+        # A token login is active; callers get the client from api_config_handler().
+        return None
+    if config_path is None:
+        config.load_kube_config()
+        return None
+    return config_path
- def logout(self) -> str:
- """
- This function is used to logout of an OpenShift cluster.
- """
- response = oc.invoke("logout")
- return response.out()
+
+def api_config_handler() -> "client.ApiClient | None":
+    """
+    Return the API client created by a token login, if one is active.
+
+    The client is handed out only while the module-level `api_client` is set and no
+    custom config file path is registered; otherwise `None` is returned. Callers pass
+    the result directly to the `kubernetes.client` API class constructors.
+    """
+    if api_client is not None and config_path is None:
+        return api_client
+    return None
diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py
index 5621d6734..12544ebac 100644
--- a/src/codeflare_sdk/cluster/awload.py
+++ b/src/codeflare_sdk/cluster/awload.py
@@ -20,9 +20,12 @@
from os.path import isfile
import errno
import os
-import openshift as oc
import yaml
+from kubernetes import client, config
+from ..utils.kube_api_helpers import _kube_api_error_handling
+from .auth import config_check, api_config_handler
+
class AWManager:
"""
@@ -40,10 +43,10 @@ def __init__(self, filename: str) -> None:
self.filename = filename
try:
with open(self.filename) as f:
- awyaml = yaml.load(f, Loader=yaml.FullLoader)
- assert awyaml["kind"] == "AppWrapper"
- self.name = awyaml["metadata"]["name"]
- self.namespace = awyaml["metadata"]["namespace"]
+ self.awyaml = yaml.load(f, Loader=yaml.FullLoader)
+ assert self.awyaml["kind"] == "AppWrapper"
+ self.name = self.awyaml["metadata"]["name"]
+ self.namespace = self.awyaml["metadata"]["namespace"]
except:
raise ValueError(
f"{filename } is not a correctly formatted AppWrapper yaml"
@@ -55,19 +58,17 @@ def submit(self) -> None:
Attempts to create the AppWrapper custom resource using the yaml file
"""
try:
- with oc.project(self.namespace):
- oc.invoke("create", ["-f", self.filename])
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if "Unauthorized" in error_msg or "Forbidden" in error_msg:
- raise PermissionError(
- "Action not permitted, have you put in correct/up-to-date auth credentials?"
- )
- elif "AlreadyExists" in error_msg:
- raise FileExistsError(
- f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}"
- )
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ api_instance.create_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=self.namespace,
+ plural="appwrappers",
+ body=self.awyaml,
+ )
+ except Exception as e:
+ return _kube_api_error_handling(e)
self.submitted = True
print(f"AppWrapper {self.filename} submitted!")
@@ -82,25 +83,17 @@ def remove(self) -> None:
return
try:
- with oc.project(self.namespace):
- oc.invoke("delete", ["AppWrapper", self.name])
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if (
- 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise PermissionError(
- "Action not permitted, have you put in correct/up-to-date auth credentials?"
- )
- elif "not found" in error_msg:
- self.submitted = False
- print("AppWrapper not found, was deleted in another manner")
- return
- else:
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ api_instance.delete_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=self.namespace,
+ plural="appwrappers",
+ name=self.name,
+ )
+ except Exception as e:
+ return _kube_api_error_handling(e)
self.submitted = False
print(f"AppWrapper {self.name} removed!")
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index c09e981b3..ff92bfcf0 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -18,15 +18,15 @@
cluster setup queue, a list of all existing clusters, and the user's working namespace.
"""
-from os import stat
from time import sleep
from typing import List, Optional, Tuple, Dict
-import openshift as oc
from ray.job_submission import JobSubmissionClient
+from .auth import config_check, api_config_handler
from ..utils import pretty_print
from ..utils.generate_yaml import generate_appwrapper
+from ..utils.kube_api_helpers import _kube_api_error_handling
from .config import ClusterConfiguration
from .model import (
AppWrapper,
@@ -35,6 +35,9 @@
RayCluster,
RayClusterStatus,
)
+from kubernetes import client, config
+import yaml
+import os
class Cluster:
@@ -65,8 +68,10 @@ def create_app_wrapper(self):
"""
if self.config.namespace is None:
- self.config.namespace = oc.get_project_name()
- if type(self.config.namespace) is not str:
+ self.config.namespace = get_current_namespace()
+ if self.config.namespace is None:
+ print("Please specify with namespace=")
+ elif type(self.config.namespace) is not str:
raise TypeError(
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
)
@@ -112,15 +117,19 @@ def up(self):
"""
namespace = self.config.namespace
try:
- with oc.project(namespace):
- oc.invoke("apply", ["-f", self.app_wrapper_yaml])
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if "Unauthorized" in error_msg:
- raise PermissionError(
- "Action not permitted, have you put in correct/up-to-date auth credentials?"
- )
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ with open(self.app_wrapper_yaml) as f:
+ aw = yaml.load(f, Loader=yaml.FullLoader)
+ api_instance.create_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=namespace,
+ plural="appwrappers",
+ body=aw,
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
def down(self):
"""
@@ -129,23 +138,17 @@ def down(self):
"""
namespace = self.config.namespace
try:
- with oc.project(namespace):
- oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if (
- 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise PermissionError(
- "Action not permitted, have you run auth.login()/cluster.up() yet?"
- )
- elif "not found" in error_msg:
- print("Cluster not found, have you run cluster.up() yet?")
- else:
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ api_instance.delete_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=namespace,
+ plural="appwrappers",
+ name=self.app_wrapper_name,
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
def status(
self, print_to_console: bool = True
@@ -247,16 +250,21 @@ def cluster_dashboard_uri(self) -> str:
Returns a string containing the cluster's dashboard URI.
"""
try:
- with oc.project(self.config.namespace):
- route = oc.invoke(
- "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
- )
- route = route.out().split(" ")
- route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
- route = route[0].strip().strip("'")
- return f"http://{route}"
- except:
- return "Dashboard route not available yet, have you run cluster.up()?"
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ routes = api_instance.list_namespaced_custom_object(
+ group="route.openshift.io",
+ version="v1",
+ namespace=self.config.namespace,
+ plural="routes",
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
+
+ for route in routes["items"]:
+ if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}":
+ return f"http://{route['spec']['host']}"
+ return "Dashboard route not available yet, have you run cluster.up()?"
def list_jobs(self) -> List:
"""
@@ -296,6 +304,56 @@ def torchx_config(
to_return["requirements"] = requirements
return to_return
+    @staticmethod
+    def from_k8_cluster_object(rc):
+        """
+        Build a `Cluster` from a RayCluster custom resource given as a dictionary.
+
+        Only the first worker group and the first container of its pod template are
+        inspected. Declared as a static method because it takes no `self`; calling it
+        as `Cluster.from_k8_cluster_object(rc)` keeps working.
+
+        Raises `KeyError` / `IndexError` if `rc` lacks any of the fields read below.
+        """
+        labels = rc["metadata"]["labels"]
+        machine_types = (
+            labels["orderedinstance"].split("_") if "orderedinstance" in labels else []
+        )
+        # Resolve the deeply nested objects once instead of re-walking `rc` per field.
+        worker_group = rc["spec"]["workerGroupSpecs"][0]
+        container = worker_group["template"]["spec"]["containers"][0]
+        resources = container["resources"]
+        cluster_config = ClusterConfiguration(
+            name=rc["metadata"]["name"],
+            namespace=rc["metadata"]["namespace"],
+            machine_types=machine_types,
+            min_worker=worker_group["minReplicas"],
+            max_worker=worker_group["maxReplicas"],
+            min_cpus=resources["requests"]["cpu"],
+            max_cpus=resources["limits"]["cpu"],
+            # The last character of the memory quantity is dropped before int();
+            # presumably a one-letter unit suffix such as "G" - TODO confirm.
+            min_memory=int(resources["requests"]["memory"][:-1]),
+            max_memory=int(resources["limits"]["memory"][:-1]),
+            # NOTE(review): raises KeyError when the limits carry no "nvidia.com/gpu" entry.
+            gpu=resources["limits"]["nvidia.com/gpu"],
+            instascale=bool(machine_types),
+            image=container["image"],
+            # A worker container with volume mounts is treated as local-interactive.
+            local_interactive="volumeMounts" in container,
+        )
+        return Cluster(cluster_config)
+
+    def local_client_url(self):
+        """
+        Return the Ray client URL used to connect to this cluster from a local machine.
+
+        For a cluster configured with `local_interactive` the result is
+        `ray://rayclient-<name>-<namespace>.<ingress domain>`. Otherwise the literal
+        string "None" is returned; it stays a string so existing callers that compare
+        against it keep working.
+        """
+        if not self.config.local_interactive:
+            return "None"
+        ingress_domain = _get_ingress_domain()
+        return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}"
+
def list_all_clusters(namespace: str, print_to_console: bool = True):
"""
@@ -320,78 +378,120 @@ def list_all_queued(namespace: str, print_to_console: bool = True):
return app_wrappers
+def get_current_namespace():  # pragma: no cover
+    """
+    Work out the namespace the user is currently working in.
+
+    When a token-based API client is active, the namespace is read from the service
+    account file `/var/run/secrets/kubernetes.io/serviceaccount/namespace`; if that
+    file is missing or unreadable a message is printed and `None` is returned.
+    Otherwise the namespace of the active kube config context is returned, or `None`
+    when that context does not name one. Errors while listing the contexts are
+    delegated to `_kube_api_error_handling`.
+    """
+    if api_config_handler() is not None:
+        namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
+        if not os.path.isfile(namespace_file):
+            print("Unable to find current namespace")
+            return None
+        try:
+            # The context manager closes the handle even when reading fails.
+            with open(namespace_file, "r") as file:
+                return file.readline().strip("\n")
+        except Exception:
+            print("Unable to find current namespace")
+            return None
+
+    try:
+        _, active_context = config.list_kube_config_contexts(config_check())
+    except Exception as e:
+        return _kube_api_error_handling(e)
+    try:
+        return active_context["context"]["namespace"]
+    except KeyError:
+        return None
+
+
+def get_cluster(cluster_name: str, namespace: str = "default"):
+    """
+    Look up an existing RayCluster by name and return it as a `Cluster` object.
+
+    `cluster_name` is the RayCluster resource name and `namespace` the namespace that
+    is searched. API errors are delegated to `_kube_api_error_handling`.
+
+    Raises `FileNotFoundError` when no RayCluster with that name exists in `namespace`.
+    """
+    try:
+        # Use the shared auth helpers (as the other API calls in this module do) so a
+        # token login or a user-supplied kube config is honoured, not just ~/.kube/config.
+        config_check()
+        api_instance = client.CustomObjectsApi(api_config_handler())
+        rcs = api_instance.list_namespaced_custom_object(
+            group="ray.io",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="rayclusters",
+        )
+    except Exception as e:
+        return _kube_api_error_handling(e)
+
+    for rc in rcs["items"]:
+        if rc["metadata"]["name"] == cluster_name:
+            return Cluster.from_k8_cluster_object(rc)
+    raise FileNotFoundError(
+        f"Cluster {cluster_name} is not found in {namespace} namespace"
+    )
+
+
# private methods
+def _get_ingress_domain():
+    """
+    Return the `spec.domain` value of the cluster-scoped `ingresses/cluster` object
+    in the `config.openshift.io/v1` API group.
+
+    API errors are delegated to `_kube_api_error_handling`.
+    """
+    try:
+        # Use the shared auth helpers so a token login or a user-supplied kube config
+        # is honoured, consistent with the other API calls in this module.
+        config_check()
+        api_instance = client.CustomObjectsApi(api_config_handler())
+        ingress = api_instance.get_cluster_custom_object(
+            "config.openshift.io", "v1", "ingresses", "cluster"
+        )
+    except Exception as e:  # pragma: no cover
+        return _kube_api_error_handling(e)
+    return ingress["spec"]["domain"]
def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
- cluster = None
try:
- with oc.project(namespace), oc.timeout(10 * 60):
- cluster = oc.selector(f"appwrapper/{name}").object()
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- msg = osp.msg
- if "Expected a single object, but selected 0" in msg:
- return cluster
- error_msg = osp.result.err()
- if not (
- 'the server doesn\'t have a resource type "appwrapper"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise osp
-
- if cluster:
- return _map_to_app_wrapper(cluster)
-
- return cluster
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ aws = api_instance.list_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=namespace,
+ plural="appwrappers",
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
+
+ for aw in aws["items"]:
+ if aw["metadata"]["name"] == name:
+ return _map_to_app_wrapper(aw)
+ return None
def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]:
- cluster = None
try:
- with oc.project(namespace), oc.timeout(10 * 60):
- cluster = oc.selector(f"rayclusters/{name}").object()
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- msg = osp.msg
- if "Expected a single object, but selected 0" in msg:
- return cluster
- error_msg = osp.result.err()
- if not (
- 'the server doesn\'t have a resource type "rayclusters"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise osp
-
- if cluster:
- return _map_to_ray_cluster(cluster)
-
- return cluster
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ rcs = api_instance.list_namespaced_custom_object(
+ group="ray.io",
+ version="v1alpha1",
+ namespace=namespace,
+ plural="rayclusters",
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
+
+ for rc in rcs["items"]:
+ if rc["metadata"]["name"] == name:
+ return _map_to_ray_cluster(rc)
+ return None
def _get_ray_clusters(namespace="default") -> List[RayCluster]:
list_of_clusters = []
try:
- with oc.project(namespace), oc.timeout(10 * 60):
- ray_clusters = oc.selector("rayclusters").objects()
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if (
- 'the server doesn\'t have a resource type "rayclusters"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise PermissionError(
- "Action not permitted, have you put in correct/up-to-date auth credentials?"
- )
- else:
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ rcs = api_instance.list_namespaced_custom_object(
+ group="ray.io",
+ version="v1alpha1",
+ namespace=namespace,
+ plural="rayclusters",
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
- for cluster in ray_clusters:
- list_of_clusters.append(_map_to_ray_cluster(cluster))
+ for rc in rcs["items"]:
+ list_of_clusters.append(_map_to_ray_cluster(rc))
return list_of_clusters
@@ -401,23 +501,18 @@ def _get_app_wrappers(
list_of_app_wrappers = []
try:
- with oc.project(namespace), oc.timeout(10 * 60):
- app_wrappers = oc.selector("appwrappers").objects()
- except oc.OpenShiftPythonException as osp: # pragma: no cover
- error_msg = osp.result.err()
- if (
- 'the server doesn\'t have a resource type "appwrappers"' in error_msg
- or "forbidden" in error_msg
- or "Unauthorized" in error_msg
- or "Missing or incomplete configuration" in error_msg
- ):
- raise PermissionError(
- "Action not permitted, have you put in correct/up-to-date auth credentials?"
- )
- else:
- raise osp
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ aws = api_instance.list_namespaced_custom_object(
+ group="mcad.ibm.com",
+ version="v1beta1",
+ namespace=namespace,
+ plural="appwrappers",
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
- for item in app_wrappers:
+ for item in aws["items"]:
app_wrapper = _map_to_app_wrapper(item)
if filter and app_wrapper.status in filter:
list_of_app_wrappers.append(app_wrapper)
@@ -427,48 +522,52 @@ def _get_app_wrappers(
return list_of_app_wrappers
-def _map_to_ray_cluster(cluster) -> Optional[RayCluster]:
- cluster_model = cluster.model
- if type(cluster_model.status.state) == oc.model.MissingModel:
- status = RayClusterStatus.UNKNOWN
+def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
+ if "state" in rc["status"]:
+ status = RayClusterStatus(rc["status"]["state"].lower())
else:
- status = RayClusterStatus(cluster_model.status.state.lower())
+ status = RayClusterStatus.UNKNOWN
- with oc.project(cluster.namespace()), oc.timeout(10 * 60):
- route = (
- oc.selector(f"route/ray-dashboard-{cluster.name()}")
- .object()
- .model.spec.host
- )
+ config_check()
+ api_instance = client.CustomObjectsApi(api_config_handler())
+ routes = api_instance.list_namespaced_custom_object(
+ group="route.openshift.io",
+ version="v1",
+ namespace=rc["metadata"]["namespace"],
+ plural="routes",
+ )
+ ray_route = None
+ for route in routes["items"]:
+ if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}":
+ ray_route = route["spec"]["host"]
return RayCluster(
- name=cluster.name(),
+ name=rc["metadata"]["name"],
status=status,
# for now we are not using autoscaling so same replicas is fine
- min_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
- max_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
- worker_mem_max=cluster_model.spec.workerGroupSpecs[0]
- .template.spec.containers[0]
- .resources.limits.memory,
- worker_mem_min=cluster_model.spec.workerGroupSpecs[0]
- .template.spec.containers[0]
- .resources.requests.memory,
- worker_cpu=cluster_model.spec.workerGroupSpecs[0]
- .template.spec.containers[0]
- .resources.limits.cpu,
+ min_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
+ max_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
+ worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+ "containers"
+ ][0]["resources"]["limits"]["memory"],
+ worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+ "containers"
+ ][0]["resources"]["requests"]["memory"],
+ worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
+ 0
+ ]["resources"]["limits"]["cpu"],
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
- namespace=cluster.namespace(),
- dashboard=route,
+ namespace=rc["metadata"]["namespace"],
+ dashboard=ray_route,
)
-def _map_to_app_wrapper(cluster) -> AppWrapper:
- cluster_model = cluster.model
+def _map_to_app_wrapper(aw) -> AppWrapper:
return AppWrapper(
- name=cluster.name(),
- status=AppWrapperStatus(cluster_model.status.state.lower()),
- can_run=cluster_model.status.canrun,
- job_state=cluster_model.status.queuejobstate,
+ name=aw["metadata"]["name"],
+ status=AppWrapperStatus(aw["status"]["state"].lower()),
+ can_run=aw["status"]["canrun"],
+ job_state=aw["status"]["queuejobstate"],
)
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index f24425635..31f70d6b9 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -19,9 +19,7 @@
"""
from dataclasses import dataclass, field
-from .auth import Authentication
import pathlib
-import openshift
dir = pathlib.Path(__file__).parent.parent.resolve()
diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py
index 6b5ce0a53..b1db70d54 100644
--- a/src/codeflare_sdk/job/jobs.py
+++ b/src/codeflare_sdk/job/jobs.py
@@ -17,13 +17,13 @@
from typing import TYPE_CHECKING, Optional, Dict, List
from pathlib import Path
-import openshift as oc
from torchx.components.dist import ddp
from torchx.runner import get_runner
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
if TYPE_CHECKING:
from ..cluster.cluster import Cluster
+from ..cluster.cluster import get_current_namespace
all_jobs: List["Job"] = []
torchx_runner = get_runner()
@@ -124,7 +124,7 @@ def _missing_spec(self, spec: str):
def _dry_run_no_cluster(self):
if self.scheduler_args is not None:
if self.scheduler_args.get("namespace") is None:
- self.scheduler_args["namespace"] = oc.get_project_name()
+ self.scheduler_args["namespace"] = get_current_namespace()
return torchx_runner.dryrun(
app=ddp(
*self.script_args,
diff --git a/src/codeflare_sdk/utils/generate_cert.py b/src/codeflare_sdk/utils/generate_cert.py
index 2d73621b8..04b04d3e0 100644
--- a/src/codeflare_sdk/utils/generate_cert.py
+++ b/src/codeflare_sdk/utils/generate_cert.py
@@ -19,6 +19,7 @@
from cryptography import x509
from cryptography.x509.oid import NameOID
import datetime
+from ..cluster.auth import config_check, api_config_handler
from kubernetes import client, config
@@ -82,8 +83,8 @@ def generate_tls_cert(cluster_name, namespace, days=30):
# Similar to:
# oc get secret ca-secret- -o template='{{index .data "ca.key"}}'
# oc get secret ca-secret- -o template='{{index .data "ca.crt"}}'|base64 -d > ${TLSDIR}/ca.crt
- config.load_kube_config()
- v1 = client.CoreV1Api()
+ config_check()
+ v1 = client.CoreV1Api(api_config_handler())
secret = v1.read_namespaced_secret(f"ca-secret-{cluster_name}", namespace).data
ca_cert = secret.get("ca.crt")
ca_key = secret.get("ca.key")
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 9538a1e8e..c4361abae 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -21,7 +21,8 @@
import sys
import argparse
import uuid
-import openshift as oc
+from kubernetes import client, config
+from .kube_api_helpers import _kube_api_error_handling
def read_template(template):
@@ -248,12 +249,16 @@ def enable_local_interactive(resources, cluster_name, namespace):
][0].get("command")[2]
command = command.replace("deployment-name", cluster_name)
-
- server_name = (
- oc.whoami("--show-server").split(":")[1].split("//")[1].replace("api", "apps")
- )
-
- command = command.replace("server-name", server_name)
+ try:
+ config.load_kube_config()
+ api_client = client.CustomObjectsApi()
+ ingress = api_client.get_cluster_custom_object(
+ "config.openshift.io", "v1", "ingresses", "cluster"
+ )
+ except Exception as e: # pragma: no cover
+ return _kube_api_error_handling(e)
+ domain = ingress["spec"]["domain"]
+ command = command.replace("server-name", domain)
item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][
"initContainers"
@@ -338,131 +343,3 @@ def generate_appwrapper(
outfile = appwrapper_name + ".yaml"
write_user_appwrapper(user_yaml, outfile)
return outfile
-
-
-def main(): # pragma: no cover
- parser = argparse.ArgumentParser(description="Generate user AppWrapper")
- parser.add_argument(
- "--name",
- required=False,
- default="",
- help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)",
- )
- parser.add_argument(
- "--min-cpu",
- type=int,
- required=True,
- help="min number of CPU(s) in a worker required for running job",
- )
- parser.add_argument(
- "--max-cpu",
- type=int,
- required=True,
- help="max number of CPU(s) in a worker required for running job",
- )
- parser.add_argument(
- "--min-memory",
- type=int,
- required=True,
- help="min RAM required in a worker for running job, in GB",
- )
- parser.add_argument(
- "--max-memory",
- type=int,
- required=True,
- help="max RAM required in a worker for running job, in GB",
- )
- parser.add_argument(
- "--gpu",
- type=int,
- required=True,
- help="GPU(s) required in a worker for running job",
- )
- parser.add_argument(
- "--workers",
- type=int,
- required=True,
- help="How many workers are required in the cluster",
- )
- parser.add_argument(
- "--template", required=True, help="Template AppWrapper yaml file"
- )
- parser.add_argument(
- "--image",
- required=False,
- default="rayproject/ray:latest",
- help="Ray image to be used (defaults to rayproject/ray:latest)",
- )
- parser.add_argument(
- "--instascale",
- default=False,
- required=False,
- action="store_true",
- help="Indicates that instascale is installed on the cluster",
- )
- parser.add_argument(
- "--instance-types",
- type=str,
- nargs="+",
- default=[],
- required=False,
- help="Head,worker instance types (space separated)",
- )
- parser.add_argument(
- "--namespace",
- required=False,
- default="default",
- help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
- )
- parser.add_argument(
- "--local-interactive",
- required=False,
- default=False,
- help="Enable local interactive mode",
- )
- parser.add_argument(
- "--image-pull-secrets",
- required=False,
- default=[],
- help="Set image pull secrets for private registries",
- )
-
- args = parser.parse_args()
- name = args.name
- min_cpu = args.min_cpu
- max_cpu = args.max_cpu
- min_memory = args.min_memory
- max_memory = args.max_memory
- gpu = args.gpu
- workers = args.workers
- template = args.template
- image = args.image
- instascale = args.instascale
- instance_types = args.instance_types
- namespace = args.namespace
- local_interactive = args.local_interactive
- env = {}
- image_pull_secrets = args.image_pull_secrets
-
- outfile = generate_appwrapper(
- name,
- namespace,
- min_cpu,
- max_cpu,
- min_memory,
- max_memory,
- gpu,
- workers,
- template,
- image,
- instascale,
- instance_types,
- local_interactive,
- env,
- image_pull_secrets,
- )
- return outfile
-
-
-if __name__ == "__main__": # pragma: no cover
- main()
diff --git a/src/codeflare_sdk/utils/kube_api_helpers.py b/src/codeflare_sdk/utils/kube_api_helpers.py
new file mode 100644
index 000000000..58358a053
--- /dev/null
+++ b/src/codeflare_sdk/utils/kube_api_helpers.py
@@ -0,0 +1,44 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This sub-module exists primarily to be used internally for any Kubernetes
+API error handling or wrapping.
+"""
+
+import executing
+from kubernetes import client, config
+
+
+# private methods
+def _kube_api_error_handling(e: Exception): # pragma: no cover
+ perm_msg = (
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ nf_msg = "No instances found, nothing to be done."
+ exists_msg = "Resource with this name already exists."
+ if type(e) == config.ConfigException:
+ raise PermissionError(perm_msg)
+ if type(e) == executing.executing.NotOneValueFound:
+ print(nf_msg)
+ return
+ if type(e) == client.ApiException:
+ if e.reason == "Not Found":
+ print(nf_msg)
+ return
+ elif e.reason == "Unauthorized" or e.reason == "Forbidden":
+ raise PermissionError(perm_msg)
+ elif e.reason == "Conflict":
+ raise FileExistsError(exists_msg)
+ raise e
diff --git a/tests/test-case-cmd.yaml b/tests/test-case-cmd.yaml
deleted file mode 100644
index ea235ec9a..000000000
--- a/tests/test-case-cmd.yaml
+++ /dev/null
@@ -1,173 +0,0 @@
-apiVersion: mcad.ibm.com/v1beta1
-kind: AppWrapper
-metadata:
- name: unit-cmd-cluster
- namespace: default
-spec:
- priority: 9
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 1
- replicas: 2
- requests:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 1
- generictemplate:
- apiVersion: ray.io/v1alpha1
- kind: RayCluster
- metadata:
- labels:
- appwrapper.mcad.ibm.com: unit-cmd-cluster
- controller-tools.k8s.io: '1.0'
- name: unit-cmd-cluster
- namespace: default
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- - name: RAY_USE_TLS
- value: '0'
- - name: RAY_TLS_SERVER_CERT
- value: /home/ray/workspace/tls/server.crt
- - name: RAY_TLS_SERVER_KEY
- value: /home/ray/workspace/tls/server.key
- - name: RAY_TLS_CA_CERT
- value: /home/ray/workspace/tls/ca.crt
- image: rayproject/ray:latest
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- imagePullSecrets: []
- rayVersion: 2.1.0
- workerGroupSpecs:
- - groupName: small-group-unit-cmd-cluster
- maxReplicas: 2
- minReplicas: 2
- rayStartParams:
- block: 'true'
- num-gpus: '1'
- replicas: 2
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- - name: RAY_USE_TLS
- value: '0'
- - name: RAY_TLS_SERVER_CERT
- value: /home/ray/workspace/tls/server.crt
- - name: RAY_TLS_SERVER_KEY
- value: /home/ray/workspace/tls/server.key
- - name: RAY_TLS_CA_CERT
- value: /home/ray/workspace/tls/ca.crt
- image: rayproject/ray:latest
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 1
- requests:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 1
- imagePullSecrets: []
- initContainers:
- - command:
- - sh
- - -c
- - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
- do echo waiting for myservice; sleep 2; done
- image: busybox:1.28
- name: init-myservice
- replicas: 1
- - generictemplate:
- apiVersion: route.openshift.io/v1
- kind: Route
- metadata:
- labels:
- odh-ray-cluster-service: unit-cmd-cluster-head-svc
- name: ray-dashboard-unit-cmd-cluster
- namespace: default
- spec:
- port:
- targetPort: dashboard
- to:
- kind: Service
- name: unit-cmd-cluster-head-svc
- replica: 1
- Items: []
diff --git a/tests/unit_test.py b/tests/unit_test.py
index d1ea3e75e..21c1adf24 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -21,7 +21,7 @@
parent = Path(__file__).resolve().parents[1]
sys.path.append(str(parent) + "/src")
-from kubernetes import client
+from kubernetes import client, config
from codeflare_sdk.cluster.awload import AWManager
from codeflare_sdk.cluster.cluster import (
Cluster,
@@ -29,11 +29,14 @@
list_all_clusters,
list_all_queued,
_copy_to_ray,
+ get_cluster,
+ _app_wrapper_status,
+ _ray_cluster_status,
)
from codeflare_sdk.cluster.auth import (
TokenAuthentication,
- PasswordUserAuthentication,
Authentication,
+ KubeConfigFileAuthentication,
)
from codeflare_sdk.utils.pretty_print import (
print_no_resources_found,
@@ -62,7 +65,6 @@
)
import openshift
-from openshift import OpenShiftPythonException
from openshift.selector import Selector
import ray
from torchx.specs import AppDryRunInfo, AppDef
@@ -70,6 +72,7 @@
from torchx.schedulers.ray_scheduler import RayJob
from torchx.schedulers.kubernetes_mcad_scheduler import KubernetesMCADJob
import pytest
+import yaml
# For mocking openshift client results
@@ -85,120 +88,79 @@ def att_side_effect(self):
return self.high_level_operation
-def att_side_effect_tls(self):
- if "--insecure-skip-tls-verify" in self.high_level_operation[1]:
- return self.high_level_operation
- else:
- raise OpenShiftPythonException(
- "The server uses a certificate signed by unknown authority"
- )
-
-
def test_token_auth_creation():
try:
- token_auth = TokenAuthentication()
- assert token_auth.token == None
- assert token_auth.server == None
- assert token_auth.skip_tls == False
-
- token_auth = TokenAuthentication("token")
- assert token_auth.token == "token"
- assert token_auth.server == None
- assert token_auth.skip_tls == False
-
- token_auth = TokenAuthentication("token", "server")
+ token_auth = TokenAuthentication(token="token", server="server")
assert token_auth.token == "token"
assert token_auth.server == "server"
assert token_auth.skip_tls == False
+ assert token_auth.ca_cert_path == None
- token_auth = TokenAuthentication("token", server="server")
+ token_auth = TokenAuthentication(token="token", server="server", skip_tls=True)
assert token_auth.token == "token"
assert token_auth.server == "server"
- assert token_auth.skip_tls == False
+ assert token_auth.skip_tls == True
+ assert token_auth.ca_cert_path == None
- token_auth = TokenAuthentication(token="token", server="server")
+ token_auth = TokenAuthentication(token="token", server="server", skip_tls=False)
assert token_auth.token == "token"
assert token_auth.server == "server"
assert token_auth.skip_tls == False
+ assert token_auth.ca_cert_path == None
- token_auth = TokenAuthentication(token="token", server="server", skip_tls=True)
+ token_auth = TokenAuthentication(
+ token="token", server="server", skip_tls=False, ca_cert_path="path/to/cert"
+ )
assert token_auth.token == "token"
assert token_auth.server == "server"
- assert token_auth.skip_tls == True
+ assert token_auth.skip_tls == False
+ assert token_auth.ca_cert_path == "path/to/cert"
except Exception:
assert 0 == 1
def test_token_auth_login_logout(mocker):
- mocker.patch("openshift.invoke", side_effect=arg_side_effect)
- mock_res = mocker.patch.object(openshift.Result, "out")
- mock_res.side_effect = lambda: att_side_effect(fake_res)
+ mocker.patch.object(client, "ApiClient")
- token_auth = TokenAuthentication(token="testtoken", server="testserver:6443")
- assert token_auth.login() == (
- "login",
- ["--token=testtoken", "--server=testserver:6443"],
- )
- assert token_auth.logout() == (
- "logout",
- ["--token=testtoken", "--server=testserver:6443"],
+ token_auth = TokenAuthentication(
+ token="testtoken", server="testserver:6443", skip_tls=False, ca_cert_path=None
)
+ assert token_auth.login() == ("Logged into testserver:6443")
+ assert token_auth.logout() == ("Successfully logged out of testserver:6443")
def test_token_auth_login_tls(mocker):
- mocker.patch("openshift.invoke", side_effect=arg_side_effect)
- mock_res = mocker.patch.object(openshift.Result, "out")
- mock_res.side_effect = lambda: att_side_effect_tls(fake_res)
-
- # FIXME - Pytest mocker not allowing caught exception
- # token_auth = TokenAuthentication(token="testtoken", server="testserver")
- # assert token_auth.login() == "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication"
+ mocker.patch.object(client, "ApiClient")
token_auth = TokenAuthentication(
- token="testtoken", server="testserver:6443", skip_tls=True
+ token="testtoken", server="testserver:6443", skip_tls=True, ca_cert_path=None
)
- assert token_auth.login() == (
- "login",
- ["--token=testtoken", "--server=testserver:6443", "--insecure-skip-tls-verify"],
+ assert token_auth.login() == ("Logged into testserver:6443")
+ token_auth = TokenAuthentication(
+ token="testtoken", server="testserver:6443", skip_tls=False, ca_cert_path=None
)
+ assert token_auth.login() == ("Logged into testserver:6443")
+ token_auth = TokenAuthentication(
+ token="testtoken",
+ server="testserver:6443",
+ skip_tls=False,
+ ca_cert_path="path/to/cert",
+ )
+ assert token_auth.login() == ("Logged into testserver:6443")
-def test_passwd_auth_creation():
- try:
- passwd_auth = PasswordUserAuthentication()
- assert passwd_auth.username == None
- assert passwd_auth.password == None
-
- passwd_auth = PasswordUserAuthentication("user")
- assert passwd_auth.username == "user"
- assert passwd_auth.password == None
-
- passwd_auth = PasswordUserAuthentication("user", "passwd")
- assert passwd_auth.username == "user"
- assert passwd_auth.password == "passwd"
-
- passwd_auth = PasswordUserAuthentication("user", password="passwd")
- assert passwd_auth.username == "user"
- assert passwd_auth.password == "passwd"
-
- passwd_auth = PasswordUserAuthentication(username="user", password="passwd")
- assert passwd_auth.username == "user"
- assert passwd_auth.password == "passwd"
-
- except Exception:
- assert 0 == 1
-
-
-def test_passwd_auth_login_logout(mocker):
- mocker.patch("openshift.invoke", side_effect=arg_side_effect)
- mocker.patch("openshift.login", side_effect=arg_side_effect)
- mock_res = mocker.patch.object(openshift.Result, "out")
- mock_res.side_effect = lambda: att_side_effect(fake_res)
+def test_load_kube_config(mocker):
+ mocker.patch.object(config, "load_kube_config")
+ kube_config_auth = KubeConfigFileAuthentication(
+ kube_config_path="/path/to/your/config"
+ )
+ response = kube_config_auth.load_kube_config()
- token_auth = PasswordUserAuthentication(username="user", password="passwd")
- assert token_auth.login() == ("user", "passwd")
- assert token_auth.logout() == ("logout",)
+ assert (
+ response
+ == "Loaded user config file at path %s" % kube_config_auth.kube_config_path
+ )
def test_auth_coverage():
@@ -248,7 +210,7 @@ def test_cluster_creation():
def test_default_cluster_creation(mocker):
mocker.patch(
- "openshift.get_project_name",
+ "codeflare_sdk.cluster.cluster.get_current_namespace",
return_value="opendatahub",
)
default_config = ClusterConfiguration(
@@ -263,38 +225,103 @@ def test_default_cluster_creation(mocker):
return cluster
-def arg_check_apply_effect(*args):
- assert args[0] == "apply"
- assert args[1] == ["-f", "unit-test-cluster.yaml"]
+def arg_check_apply_effect(group, version, namespace, plural, body, *args):
+ assert group == "mcad.ibm.com"
+ assert version == "v1beta1"
+ assert namespace == "ns"
+ assert plural == "appwrappers"
+ with open("unit-test-cluster.yaml") as f:
+ aw = yaml.load(f, Loader=yaml.FullLoader)
+ assert body == aw
+ assert args == tuple()
-def arg_check_del_effect(*args):
- assert args[0] == "delete"
- assert args[1] == ["AppWrapper", "unit-test-cluster"]
+def arg_check_del_effect(group, version, namespace, plural, name, *args):
+ assert group == "mcad.ibm.com"
+ assert version == "v1beta1"
+ assert namespace == "ns"
+ assert plural == "appwrappers"
+ assert name == "unit-test-cluster"
+ assert args == tuple()
def test_cluster_up_down(mocker):
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
mocker.patch(
- "codeflare_sdk.cluster.auth.TokenAuthentication.login", return_value="ignore"
+ "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object",
+ side_effect=arg_check_apply_effect,
)
mocker.patch(
- "codeflare_sdk.cluster.auth.TokenAuthentication.logout", return_value="ignore"
+ "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
+ side_effect=arg_check_del_effect,
)
- mocker.patch("openshift.invoke", side_effect=arg_check_apply_effect)
cluster = test_cluster_creation()
cluster.up()
- mocker.patch("openshift.invoke", side_effect=arg_check_del_effect)
cluster.down()
-def out_route(self):
- return "ray-dashboard-raycluster-autoscaler-ns.apps.cluster.awsroute.org ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org"
+def aw_status_fields(group, version, namespace, plural, *args):
+ assert group == "mcad.ibm.com"
+ assert version == "v1beta1"
+ assert namespace == "test-ns"
+ assert plural == "appwrappers"
+ assert args == tuple()
+ return {"items": []}
+
+
+def test_aw_status(mocker):
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=aw_status_fields,
+ )
+ aw = _app_wrapper_status("test-aw", "test-ns")
+ assert aw == None
+
+
+def rc_status_fields(group, version, namespace, plural, *args):
+ assert group == "ray.io"
+ assert version == "v1alpha1"
+ assert namespace == "test-ns"
+ assert plural == "rayclusters"
+ assert args == tuple()
+ return {"items": []}
+
+
+def test_rc_status(mocker):
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=rc_status_fields,
+ )
+ rc = _ray_cluster_status("test-rc", "test-ns")
+ assert rc == None
+
+
+def uri_retreival(group, version, namespace, plural, *args):
+ assert group == "route.openshift.io"
+ assert version == "v1"
+ assert namespace == "ns"
+ assert plural == "routes"
+ assert args == tuple()
+ return {
+ "items": [
+ {
+ "metadata": {"name": "ray-dashboard-unit-test-cluster"},
+ "spec": {
+ "host": "ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org"
+ },
+ }
+ ]
+ }
def test_cluster_uris(mocker):
- mocker.patch("openshift.invoke", return_value=fake_res)
- mock_res = mocker.patch.object(openshift.Result, "out")
- mock_res.side_effect = lambda: out_route(fake_res)
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=uri_retreival,
+ )
cluster = test_cluster_creation()
assert cluster.cluster_uri() == "ray://unit-test-cluster-head-svc.ns.svc:10001"
@@ -309,14 +336,40 @@ def test_cluster_uris(mocker):
)
+def test_local_client_url(mocker):
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.get_cluster_custom_object",
+ return_value={"spec": {"domain": ""}},
+ )
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster._get_ingress_domain",
+ return_value="apps.cluster.awsroute.org",
+ )
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.create_app_wrapper",
+ return_value="unit-test-cluster-localinter.yaml",
+ )
+
+ cluster_config = ClusterConfiguration(
+ name="unit-test-cluster-localinter", namespace="ns", local_interactive=True
+ )
+ cluster = Cluster(cluster_config)
+ assert (
+ cluster.local_client_url()
+ == "ray://rayclient-unit-test-cluster-localinter-ns.apps.cluster.awsroute.org"
+ )
+
+
def ray_addr(self, *args):
return self._address
def test_ray_job_wrapping(mocker):
- mocker.patch("openshift.invoke", return_value=fake_res)
- mock_res = mocker.patch.object(openshift.Result, "out")
- mock_res.side_effect = lambda: out_route(fake_res)
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=uri_retreival,
+ )
cluster = test_cluster_creation()
mocker.patch(
@@ -402,7 +455,7 @@ def test_print_appwrappers(capsys):
)
-def test_ray_details(capsys):
+def test_ray_details(mocker, capsys):
ray1 = RayCluster(
name="raytest1",
status=RayClusterStatus.READY,
@@ -415,6 +468,14 @@ def test_ray_details(capsys):
namespace="ns",
dashboard="fake-uri",
)
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.status",
+ return_value=(False, CodeFlareClusterStatus.UNKNOWN),
+ )
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
+ return_value="",
+ )
cf = Cluster(ClusterConfiguration(name="raytest2", namespace="ns"))
captured = capsys.readouterr()
ray2 = _copy_to_ray(cf)
@@ -519,223 +580,151 @@ def act_side_effect_list(self):
return [self]
-def get_selector(*args):
- selector = Selector({"operation": "selector", "status": 0, "actions": []})
- return selector
-
-
-def get_obj_none():
- return []
-
-
-def get_ray_obj(cls=None):
- api_obj = openshift.apiobject.APIObject(
- {
- "apiVersion": "ray.io/v1alpha1",
- "kind": "RayCluster",
- "metadata": {
- "creationTimestamp": "2023-02-22T16:26:07Z",
- "generation": 1,
- "labels": {
- "appwrapper.mcad.ibm.com": "quicktest",
- "controller-tools.k8s.io": "1.0",
- "resourceName": "quicktest",
- },
- "managedFields": [
- {
- "apiVersion": "ray.io/v1alpha1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:labels": {
- ".": {},
- "f:appwrapper.mcad.ibm.com": {},
- "f:controller-tools.k8s.io": {},
- "f:resourceName": {},
- },
- "f:ownerReferences": {
- ".": {},
- 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {},
- },
- },
- "f:spec": {
- ".": {},
- "f:autoscalerOptions": {
- ".": {},
- "f:idleTimeoutSeconds": {},
- "f:imagePullPolicy": {},
- "f:resources": {
+def get_obj_none(group, version, namespace, plural):
+ return {"items": []}
+
+
+def get_ray_obj(group, version, namespace, plural, cls=None):
+ api_obj = {
+ "items": [
+ {
+ "apiVersion": "ray.io/v1alpha1",
+ "kind": "RayCluster",
+ "metadata": {
+ "creationTimestamp": "2023-02-22T16:26:07Z",
+ "generation": 1,
+ "labels": {
+ "appwrapper.mcad.ibm.com": "quicktest",
+ "controller-tools.k8s.io": "1.0",
+ "resourceName": "quicktest",
+ "orderedinstance": "m4.xlarge_g4dn.xlarge",
+ },
+ "managedFields": [
+ {
+ "apiVersion": "ray.io/v1alpha1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:metadata": {
+ "f:labels": {
".": {},
- "f:limits": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
- "f:requests": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
+ "f:appwrapper.mcad.ibm.com": {},
+ "f:controller-tools.k8s.io": {},
+ "f:resourceName": {},
+ },
+ "f:ownerReferences": {
+ ".": {},
+ 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {},
},
- "f:upscalingMode": {},
},
- "f:enableInTreeAutoscaling": {},
- "f:headGroupSpec": {
+ "f:spec": {
".": {},
- "f:rayStartParams": {
+ "f:autoscalerOptions": {
".": {},
- "f:block": {},
- "f:dashboard-host": {},
- "f:num-gpus": {},
+ "f:idleTimeoutSeconds": {},
+ "f:imagePullPolicy": {},
+ "f:resources": {
+ ".": {},
+ "f:limits": {
+ ".": {},
+ "f:cpu": {},
+ "f:memory": {},
+ },
+ "f:requests": {
+ ".": {},
+ "f:cpu": {},
+ "f:memory": {},
+ },
+ },
+ "f:upscalingMode": {},
},
- "f:serviceType": {},
- "f:template": {
+ "f:enableInTreeAutoscaling": {},
+ "f:headGroupSpec": {
".": {},
- "f:spec": {".": {}, "f:containers": {}},
+ "f:rayStartParams": {
+ ".": {},
+ "f:block": {},
+ "f:dashboard-host": {},
+ "f:num-gpus": {},
+ },
+ "f:serviceType": {},
+ "f:template": {
+ ".": {},
+ "f:spec": {".": {}, "f:containers": {}},
+ },
},
+ "f:rayVersion": {},
+ "f:workerGroupSpecs": {},
},
- "f:rayVersion": {},
- "f:workerGroupSpecs": {},
},
+ "manager": "mcad-controller",
+ "operation": "Update",
+ "time": "2023-02-22T16:26:07Z",
},
- "manager": "mcad-controller",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- {
- "apiVersion": "ray.io/v1alpha1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:status": {
- ".": {},
- "f:availableWorkerReplicas": {},
- "f:desiredWorkerReplicas": {},
- "f:endpoints": {
+ {
+ "apiVersion": "ray.io/v1alpha1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:status": {
".": {},
- "f:client": {},
- "f:dashboard": {},
- "f:gcs": {},
- },
- "f:lastUpdateTime": {},
- "f:maxWorkerReplicas": {},
- "f:minWorkerReplicas": {},
- "f:state": {},
- }
- },
- "manager": "manager",
- "operation": "Update",
- "subresource": "status",
- "time": "2023-02-22T16:26:16Z",
- },
- ],
- "name": "quicktest",
- "namespace": "ns",
- "ownerReferences": [
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "blockOwnerDeletion": True,
- "controller": True,
- "kind": "AppWrapper",
- "name": "quicktest",
- "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
- }
- ],
- "resourceVersion": "9482407",
- "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285",
- },
- "spec": {
- "autoscalerOptions": {
- "idleTimeoutSeconds": 60,
- "imagePullPolicy": "Always",
- "resources": {
- "limits": {"cpu": "500m", "memory": "512Mi"},
- "requests": {"cpu": "500m", "memory": "512Mi"},
- },
- "upscalingMode": "Default",
- },
- "enableInTreeAutoscaling": False,
- "headGroupSpec": {
- "rayStartParams": {
- "block": "true",
- "dashboard-host": "0.0.0.0",
- "num-gpus": "0",
- },
- "serviceType": "ClusterIP",
- "template": {
- "spec": {
- "containers": [
- {
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
- "imagePullPolicy": "Always",
- "lifecycle": {
- "preStop": {
- "exec": {
- "command": ["/bin/sh", "-c", "ray stop"]
- }
- }
- },
- "name": "ray-head",
- "ports": [
- {
- "containerPort": 6379,
- "name": "gcs",
- "protocol": "TCP",
- },
- {
- "containerPort": 8265,
- "name": "dashboard",
- "protocol": "TCP",
- },
- {
- "containerPort": 10001,
- "name": "client",
- "protocol": "TCP",
- },
- ],
- "resources": {
- "limits": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
- "requests": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
+ "f:availableWorkerReplicas": {},
+ "f:desiredWorkerReplicas": {},
+ "f:endpoints": {
+ ".": {},
+ "f:client": {},
+ "f:dashboard": {},
+ "f:gcs": {},
},
+ "f:lastUpdateTime": {},
+ "f:maxWorkerReplicas": {},
+ "f:minWorkerReplicas": {},
+ "f:state": {},
}
- ]
+ },
+ "manager": "manager",
+ "operation": "Update",
+ "subresource": "status",
+ "time": "2023-02-22T16:26:16Z",
+ },
+ ],
+ "name": "quicktest",
+ "namespace": "ns",
+ "ownerReferences": [
+ {
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "blockOwnerDeletion": True,
+ "controller": True,
+ "kind": "AppWrapper",
+ "name": "quicktest",
+ "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
}
- },
+ ],
+ "resourceVersion": "9482407",
+ "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285",
},
- "rayVersion": "1.12.0",
- "workerGroupSpecs": [
- {
- "groupName": "small-group-quicktest",
- "maxReplicas": 1,
- "minReplicas": 1,
- "rayStartParams": {"block": "true", "num-gpus": "0"},
- "replicas": 1,
+ "spec": {
+ "autoscalerOptions": {
+ "idleTimeoutSeconds": 60,
+ "imagePullPolicy": "Always",
+ "resources": {
+ "limits": {"cpu": "500m", "memory": "512Mi"},
+ "requests": {"cpu": "500m", "memory": "512Mi"},
+ },
+ "upscalingMode": "Default",
+ },
+ "enableInTreeAutoscaling": False,
+ "headGroupSpec": {
+ "rayStartParams": {
+ "block": "true",
+ "dashboard-host": "0.0.0.0",
+ "num-gpus": "0",
+ },
+ "serviceType": "ClusterIP",
"template": {
- "metadata": {
- "annotations": {"key": "value"},
- "labels": {"key": "value"},
- },
"spec": {
"containers": [
{
- "env": [
- {
- "name": "MY_POD_IP",
- "valueFrom": {
- "fieldRef": {
- "fieldPath": "status.podIP"
- }
- },
- }
- ],
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "imagePullPolicy": "Always",
"lifecycle": {
"preStop": {
"exec": {
@@ -747,262 +736,271 @@ def get_ray_obj(cls=None):
}
}
},
- "name": "machine-learning",
+ "name": "ray-head",
+ "ports": [
+ {
+ "containerPort": 6379,
+ "name": "gcs",
+ "protocol": "TCP",
+ },
+ {
+ "containerPort": 8265,
+ "name": "dashboard",
+ "protocol": "TCP",
+ },
+ {
+ "containerPort": 10001,
+ "name": "client",
+ "protocol": "TCP",
+ },
+ ],
"resources": {
"limits": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
"requests": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
},
}
- ],
- "initContainers": [
- {
- "command": [
- "sh",
- "-c",
- "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
- ],
- "image": "busybox:1.28",
- "name": "init-myservice",
- }
- ],
- },
- },
- }
- ],
- },
- "status": {
- "availableWorkerReplicas": 2,
- "desiredWorkerReplicas": 1,
- "endpoints": {"client": "10001", "dashboard": "8265", "gcs": "6379"},
- "lastUpdateTime": "2023-02-22T16:26:16Z",
- "maxWorkerReplicas": 1,
- "minWorkerReplicas": 1,
- "state": "ready",
- },
- }
- )
- return [api_obj]
-
-
-def get_aw_obj():
- api_obj1 = openshift.apiobject.APIObject(
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "kind": "AppWrapper",
- "metadata": {
- "annotations": {
- "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
- },
- "creationTimestamp": "2023-02-22T16:26:07Z",
- "generation": 4,
- "managedFields": [
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:spec": {
- "f:resources": {"f:GenericItems": {}, "f:metadata": {}},
- "f:schedulingSpec": {},
- "f:service": {".": {}, "f:spec": {}},
- },
- "f:status": {
- ".": {},
- "f:canrun": {},
- "f:conditions": {},
- "f:controllerfirsttimestamp": {},
- "f:filterignore": {},
- "f:queuejobstate": {},
- "f:sender": {},
- "f:state": {},
- "f:systempriority": {},
- },
+ ]
+ }
},
- "manager": "Go-http-client",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
},
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:annotations": {
- ".": {},
- "f:kubectl.kubernetes.io/last-applied-configuration": {},
- }
- },
- "f:spec": {
- ".": {},
- "f:priority": {},
- "f:resources": {".": {}, "f:Items": {}},
+ "rayVersion": "1.12.0",
+ "workerGroupSpecs": [
+ {
+ "groupName": "small-group-quicktest",
+ "maxReplicas": 1,
+ "minReplicas": 1,
+ "rayStartParams": {"block": "true", "num-gpus": "0"},
+ "replicas": 1,
+ "template": {
+ "metadata": {
+ "annotations": {"key": "value"},
+ "labels": {"key": "value"},
+ },
+ "spec": {
+ "containers": [
+ {
+ "env": [
+ {
+ "name": "MY_POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ },
+ }
+ ],
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "lifecycle": {
+ "preStop": {
+ "exec": {
+ "command": [
+ "/bin/sh",
+ "-c",
+ "ray stop",
+ ]
+ }
+ }
+ },
+ "name": "machine-learning",
+ "resources": {
+ "limits": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ "requests": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ },
+ }
+ ],
+ "initContainers": [
+ {
+ "command": [
+ "sh",
+ "-c",
+ "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
+ ],
+ "image": "busybox:1.28",
+ "name": "init-myservice",
+ }
+ ],
+ },
},
- },
- "manager": "kubectl-client-side-apply",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
+ }
+ ],
+ },
+ "status": {
+ "availableWorkerReplicas": 2,
+ "desiredWorkerReplicas": 1,
+ "endpoints": {
+ "client": "10001",
+ "dashboard": "8265",
+ "gcs": "6379",
},
- ],
- "name": "quicktest1",
- "namespace": "ns",
- "resourceVersion": "9482384",
- "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
- },
- "spec": {
- "priority": 9,
- "resources": {
- "GenericItems": [
+ "lastUpdateTime": "2023-02-22T16:26:16Z",
+ "maxWorkerReplicas": 1,
+ "minWorkerReplicas": 1,
+ "state": "ready",
+ },
+ }
+ ]
+ }
+ return api_obj
+
+
+def get_aw_obj(group, version, namespace, plural):
+ api_obj1 = {
+ "items": [
+ {
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "kind": "AppWrapper",
+ "metadata": {
+ "annotations": {
+ "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
+ },
+ "creationTimestamp": "2023-02-22T16:26:07Z",
+ "generation": 4,
+ "managedFields": [
{
- "allocated": 0,
- "custompodresources": [
- {
- "limits": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:spec": {
+ "f:resources": {
+ "f:GenericItems": {},
+ "f:metadata": {},
},
+ "f:schedulingSpec": {},
+ "f:service": {".": {}, "f:spec": {}},
},
- {
- "limits": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
+ "f:status": {
+ ".": {},
+ "f:canrun": {},
+ "f:conditions": {},
+ "f:controllerfirsttimestamp": {},
+ "f:filterignore": {},
+ "f:queuejobstate": {},
+ "f:sender": {},
+ "f:state": {},
+ "f:systempriority": {},
},
- ],
- "generictemplate": {
- "apiVersion": "ray.io/v1alpha1",
- "kind": "RayCluster",
- "metadata": {
- "labels": {
- "appwrapper.mcad.ibm.com": "quicktest1",
- "controller-tools.k8s.io": "1.0",
- },
- "name": "quicktest1",
- "namespace": "ns",
+ },
+ "manager": "Go-http-client",
+ "operation": "Update",
+ "time": "2023-02-22T16:26:07Z",
+ },
+ {
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:metadata": {
+ "f:annotations": {
+ ".": {},
+ "f:kubectl.kubernetes.io/last-applied-configuration": {},
+ }
},
- "spec": {
- "autoscalerOptions": {
- "idleTimeoutSeconds": 60,
- "imagePullPolicy": "Always",
- "resources": {
- "limits": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- "requests": {
- "cpu": "500m",
- "memory": "512Mi",
- },
+ "f:spec": {
+ ".": {},
+ "f:priority": {},
+ "f:resources": {".": {}, "f:Items": {}},
+ },
+ },
+ "manager": "kubectl-client-side-apply",
+ "operation": "Update",
+ "time": "2023-02-22T16:26:07Z",
+ },
+ ],
+ "name": "quicktest1",
+ "namespace": "ns",
+ "resourceVersion": "9482384",
+ "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
+ },
+ "spec": {
+ "priority": 9,
+ "resources": {
+ "GenericItems": [
+ {
+ "allocated": 0,
+ "custompodresources": [
+ {
+ "limits": {
+ "cpu": "2",
+ "memory": "8G",
+ "nvidia.com/gpu": "0",
+ },
+ "replicas": 1,
+ "requests": {
+ "cpu": "2",
+ "memory": "8G",
+ "nvidia.com/gpu": "0",
},
- "upscalingMode": "Default",
},
- "enableInTreeAutoscaling": False,
- "headGroupSpec": {
- "rayStartParams": {
- "block": "true",
- "dashboard-host": "0.0.0.0",
- "num-gpus": "0",
+ {
+ "limits": {
+ "cpu": "1",
+ "memory": "2G",
+ "nvidia.com/gpu": "0",
},
- "serviceType": "ClusterIP",
- "template": {
- "spec": {
- "containers": [
- {
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
- "imagePullPolicy": "Always",
- "lifecycle": {
- "preStop": {
- "exec": {
- "command": [
- "/bin/sh",
- "-c",
- "ray stop",
- ]
- }
- }
- },
- "name": "ray-head",
- "ports": [
- {
- "containerPort": 6379,
- "name": "gcs",
- },
- {
- "containerPort": 8265,
- "name": "dashboard",
- },
- {
- "containerPort": 10001,
- "name": "client",
- },
- ],
- "resources": {
- "limits": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
- "requests": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
- },
- }
- ]
- }
+ "replicas": 1,
+ "requests": {
+ "cpu": "1",
+ "memory": "2G",
+ "nvidia.com/gpu": "0",
},
},
- "rayVersion": "1.12.0",
- "workerGroupSpecs": [
- {
- "groupName": "small-group-quicktest",
- "maxReplicas": 1,
- "minReplicas": 1,
+ ],
+ "generictemplate": {
+ "apiVersion": "ray.io/v1alpha1",
+ "kind": "RayCluster",
+ "metadata": {
+ "labels": {
+ "appwrapper.mcad.ibm.com": "quicktest1",
+ "controller-tools.k8s.io": "1.0",
+ },
+ "name": "quicktest1",
+ "namespace": "ns",
+ },
+ "spec": {
+ "autoscalerOptions": {
+ "idleTimeoutSeconds": 60,
+ "imagePullPolicy": "Always",
+ "resources": {
+ "limits": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ "requests": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ },
+ "upscalingMode": "Default",
+ },
+ "enableInTreeAutoscaling": False,
+ "headGroupSpec": {
"rayStartParams": {
"block": "true",
+ "dashboard-host": "0.0.0.0",
"num-gpus": "0",
},
- "replicas": 1,
+ "serviceType": "ClusterIP",
"template": {
- "metadata": {
- "annotations": {"key": "value"},
- "labels": {"key": "value"},
- },
"spec": {
"containers": [
{
- "env": [
- {
- "name": "MY_POD_IP",
- "valueFrom": {
- "fieldRef": {
- "fieldPath": "status.podIP"
- }
- },
- }
- ],
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "imagePullPolicy": "Always",
"lifecycle": {
"preStop": {
"exec": {
@@ -1014,317 +1012,318 @@ def get_aw_obj():
}
}
},
- "name": "machine-learning",
+ "name": "ray-head",
+ "ports": [
+ {
+ "containerPort": 6379,
+ "name": "gcs",
+ },
+ {
+ "containerPort": 8265,
+ "name": "dashboard",
+ },
+ {
+ "containerPort": 10001,
+ "name": "client",
+ },
+ ],
"resources": {
"limits": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
"requests": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
},
}
- ],
- "initContainers": [
- {
- "command": [
- "sh",
- "-c",
- "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
- ],
- "image": "busybox:1.28",
- "name": "init-myservice",
- }
- ],
- },
+ ]
+ }
},
- }
- ],
+ },
+ "rayVersion": "1.12.0",
+ "workerGroupSpecs": [
+ {
+ "groupName": "small-group-quicktest",
+ "maxReplicas": 1,
+ "minReplicas": 1,
+ "rayStartParams": {
+ "block": "true",
+ "num-gpus": "0",
+ },
+ "replicas": 1,
+ "template": {
+ "metadata": {
+ "annotations": {"key": "value"},
+ "labels": {"key": "value"},
+ },
+ "spec": {
+ "containers": [
+ {
+ "env": [
+ {
+ "name": "MY_POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ },
+ }
+ ],
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "lifecycle": {
+ "preStop": {
+ "exec": {
+ "command": [
+ "/bin/sh",
+ "-c",
+ "ray stop",
+ ]
+ }
+ }
+ },
+ "name": "machine-learning",
+ "resources": {
+ "limits": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ "requests": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ },
+ }
+ ],
+ "initContainers": [
+ {
+ "command": [
+ "sh",
+ "-c",
+ "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
+ ],
+ "image": "busybox:1.28",
+ "name": "init-myservice",
+ }
+ ],
+ },
+ },
+ }
+ ],
+ },
},
+ "metadata": {},
+ "priority": 0,
+ "priorityslope": 0,
+ "replicas": 1,
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
- "replicas": 1,
- },
- {
- "allocated": 0,
- "generictemplate": {
- "apiVersion": "route.openshift.io/v1",
- "kind": "Route",
- "metadata": {
- "labels": {
- "odh-ray-cluster-service": "quicktest-head-svc"
+ {
+ "allocated": 0,
+ "generictemplate": {
+ "apiVersion": "route.openshift.io/v1",
+ "kind": "Route",
+ "metadata": {
+ "labels": {
+ "odh-ray-cluster-service": "quicktest-head-svc"
+ },
+ "name": "ray-dashboard-quicktest",
+ "namespace": "default",
},
- "name": "ray-dashboard-quicktest",
- "namespace": "default",
- },
- "spec": {
- "port": {"targetPort": "dashboard"},
- "to": {
- "kind": "Service",
- "name": "quicktest-head-svc",
+ "spec": {
+ "port": {"targetPort": "dashboard"},
+ "to": {
+ "kind": "Service",
+ "name": "quicktest-head-svc",
+ },
},
},
+ "metadata": {},
+ "priority": 0,
+ "priorityslope": 0,
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
+ ],
+ "Items": [],
+ "metadata": {},
+ },
+ "schedulingSpec": {},
+ "service": {"spec": {}},
+ },
+ "status": {
+ "canrun": True,
+ "conditions": [
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
+ "status": "True",
+ "type": "Init",
+ },
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
+ "reason": "AwaitingHeadOfLine",
+ "status": "True",
+ "type": "Queueing",
+ },
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
+ "reason": "AppWrapperRunnable",
+ "status": "True",
+ "type": "Dispatched",
},
],
- "Items": [],
- "metadata": {},
+ "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
+ "filterignore": True,
+ "queuejobstate": "Dispatched",
+ "sender": "before manageQueueJob - afterEtcdDispatching",
+ "state": "Running",
+ "systempriority": 9,
},
- "schedulingSpec": {},
- "service": {"spec": {}},
- },
- "status": {
- "canrun": True,
- "conditions": [
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
- "status": "True",
- "type": "Init",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
- "reason": "AwaitingHeadOfLine",
- "status": "True",
- "type": "Queueing",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
- "reason": "AppWrapperRunnable",
- "status": "True",
- "type": "Dispatched",
- },
- ],
- "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
- "filterignore": True,
- "queuejobstate": "Dispatched",
- "sender": "before manageQueueJob - afterEtcdDispatching",
- "state": "Running",
- "systempriority": 9,
},
- }
- )
- api_obj2 = openshift.apiobject.APIObject(
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "kind": "AppWrapper",
- "metadata": {
- "annotations": {
- "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
- },
- "creationTimestamp": "2023-02-22T16:26:07Z",
- "generation": 4,
- "managedFields": [
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:spec": {
- "f:resources": {"f:GenericItems": {}, "f:metadata": {}},
- "f:schedulingSpec": {},
- "f:service": {".": {}, "f:spec": {}},
- },
- "f:status": {
- ".": {},
- "f:canrun": {},
- "f:conditions": {},
- "f:controllerfirsttimestamp": {},
- "f:filterignore": {},
- "f:queuejobstate": {},
- "f:sender": {},
- "f:state": {},
- "f:systempriority": {},
- },
- },
- "manager": "Go-http-client",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
+ {
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "kind": "AppWrapper",
+ "metadata": {
+ "annotations": {
+ "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
},
- {
- "apiVersion": "mcad.ibm.com/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:annotations": {
+ "creationTimestamp": "2023-02-22T16:26:07Z",
+ "generation": 4,
+ "managedFields": [
+ {
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:spec": {
+ "f:resources": {
+ "f:GenericItems": {},
+ "f:metadata": {},
+ },
+ "f:schedulingSpec": {},
+ "f:service": {".": {}, "f:spec": {}},
+ },
+ "f:status": {
".": {},
- "f:kubectl.kubernetes.io/last-applied-configuration": {},
- }
- },
- "f:spec": {
- ".": {},
- "f:priority": {},
- "f:resources": {".": {}, "f:Items": {}},
+ "f:canrun": {},
+ "f:conditions": {},
+ "f:controllerfirsttimestamp": {},
+ "f:filterignore": {},
+ "f:queuejobstate": {},
+ "f:sender": {},
+ "f:state": {},
+ "f:systempriority": {},
+ },
},
+ "manager": "Go-http-client",
+ "operation": "Update",
+ "time": "2023-02-22T16:26:07Z",
},
- "manager": "kubectl-client-side-apply",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- ],
- "name": "quicktest2",
- "namespace": "ns",
- "resourceVersion": "9482384",
- "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
- },
- "spec": {
- "priority": 9,
- "resources": {
- "GenericItems": [
{
- "allocated": 0,
- "custompodresources": [
- {
- "limits": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
- },
- {
- "limits": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
+ "apiVersion": "mcad.ibm.com/v1beta1",
+ "fieldsType": "FieldsV1",
+ "fieldsV1": {
+ "f:metadata": {
+ "f:annotations": {
+ ".": {},
+ "f:kubectl.kubernetes.io/last-applied-configuration": {},
+ }
},
- ],
- "generictemplate": {
- "apiVersion": "ray.io/v1alpha1",
- "kind": "RayCluster",
- "metadata": {
- "labels": {
- "appwrapper.mcad.ibm.com": "quicktest2",
- "controller-tools.k8s.io": "1.0",
- },
- "name": "quicktest2",
- "namespace": "ns",
+ "f:spec": {
+ ".": {},
+ "f:priority": {},
+ "f:resources": {".": {}, "f:Items": {}},
},
- "spec": {
- "autoscalerOptions": {
- "idleTimeoutSeconds": 60,
- "imagePullPolicy": "Always",
- "resources": {
- "limits": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- "requests": {
- "cpu": "500m",
- "memory": "512Mi",
- },
+ },
+ "manager": "kubectl-client-side-apply",
+ "operation": "Update",
+ "time": "2023-02-22T16:26:07Z",
+ },
+ ],
+ "name": "quicktest2",
+ "namespace": "ns",
+ "resourceVersion": "9482384",
+ "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
+ },
+ "spec": {
+ "priority": 9,
+ "resources": {
+ "GenericItems": [
+ {
+ "allocated": 0,
+ "custompodresources": [
+ {
+ "limits": {
+ "cpu": "2",
+ "memory": "8G",
+ "nvidia.com/gpu": "0",
+ },
+ "replicas": 1,
+ "requests": {
+ "cpu": "2",
+ "memory": "8G",
+ "nvidia.com/gpu": "0",
},
- "upscalingMode": "Default",
},
- "enableInTreeAutoscaling": False,
- "headGroupSpec": {
- "rayStartParams": {
- "block": "true",
- "dashboard-host": "0.0.0.0",
- "num-gpus": "0",
+ {
+ "limits": {
+ "cpu": "1",
+ "memory": "2G",
+ "nvidia.com/gpu": "0",
},
- "serviceType": "ClusterIP",
- "template": {
- "spec": {
- "containers": [
- {
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
- "imagePullPolicy": "Always",
- "lifecycle": {
- "preStop": {
- "exec": {
- "command": [
- "/bin/sh",
- "-c",
- "ray stop",
- ]
- }
- }
- },
- "name": "ray-head",
- "ports": [
- {
- "containerPort": 6379,
- "name": "gcs",
- },
- {
- "containerPort": 8265,
- "name": "dashboard",
- },
- {
- "containerPort": 10001,
- "name": "client",
- },
- ],
- "resources": {
- "limits": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
- "requests": {
- "cpu": 2,
- "memory": "8G",
- "nvidia.com/gpu": 0,
- },
- },
- }
- ]
- }
+ "replicas": 1,
+ "requests": {
+ "cpu": "1",
+ "memory": "2G",
+ "nvidia.com/gpu": "0",
},
},
- "rayVersion": "1.12.0",
- "workerGroupSpecs": [
- {
- "groupName": "small-group-quicktest",
- "maxReplicas": 1,
- "minReplicas": 1,
+ ],
+ "generictemplate": {
+ "apiVersion": "ray.io/v1alpha1",
+ "kind": "RayCluster",
+ "metadata": {
+ "labels": {
+ "appwrapper.mcad.ibm.com": "quicktest2",
+ "controller-tools.k8s.io": "1.0",
+ },
+ "name": "quicktest2",
+ "namespace": "ns",
+ },
+ "spec": {
+ "autoscalerOptions": {
+ "idleTimeoutSeconds": 60,
+ "imagePullPolicy": "Always",
+ "resources": {
+ "limits": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ "requests": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ },
+ "upscalingMode": "Default",
+ },
+ "enableInTreeAutoscaling": False,
+ "headGroupSpec": {
"rayStartParams": {
"block": "true",
+ "dashboard-host": "0.0.0.0",
"num-gpus": "0",
},
- "replicas": 1,
+ "serviceType": "ClusterIP",
"template": {
- "metadata": {
- "annotations": {"key": "value"},
- "labels": {"key": "value"},
- },
"spec": {
"containers": [
{
- "env": [
- {
- "name": "MY_POD_IP",
- "valueFrom": {
- "fieldRef": {
- "fieldPath": "status.podIP"
- }
- },
- }
- ],
- "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116",
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "imagePullPolicy": "Always",
"lifecycle": {
"preStop": {
"exec": {
@@ -1336,114 +1335,214 @@ def get_aw_obj():
}
}
},
- "name": "machine-learning",
+ "name": "ray-head",
+ "ports": [
+ {
+ "containerPort": 6379,
+ "name": "gcs",
+ },
+ {
+ "containerPort": 8265,
+ "name": "dashboard",
+ },
+ {
+ "containerPort": 10001,
+ "name": "client",
+ },
+ ],
"resources": {
"limits": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
"requests": {
- "cpu": 1,
- "memory": "2G",
+ "cpu": 2,
+ "memory": "8G",
"nvidia.com/gpu": 0,
},
},
}
- ],
- "initContainers": [
- {
- "command": [
- "sh",
- "-c",
- "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
- ],
- "image": "busybox:1.28",
- "name": "init-myservice",
- }
- ],
- },
+ ]
+ }
},
- }
- ],
+ },
+ "rayVersion": "1.12.0",
+ "workerGroupSpecs": [
+ {
+ "groupName": "small-group-quicktest",
+ "maxReplicas": 1,
+ "minReplicas": 1,
+ "rayStartParams": {
+ "block": "true",
+ "num-gpus": "0",
+ },
+ "replicas": 1,
+ "template": {
+ "metadata": {
+ "annotations": {"key": "value"},
+ "labels": {"key": "value"},
+ },
+ "spec": {
+ "containers": [
+ {
+ "env": [
+ {
+ "name": "MY_POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ },
+ }
+ ],
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "lifecycle": {
+ "preStop": {
+ "exec": {
+ "command": [
+ "/bin/sh",
+ "-c",
+ "ray stop",
+ ]
+ }
+ }
+ },
+ "name": "machine-learning",
+ "resources": {
+ "limits": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ "requests": {
+ "cpu": 1,
+ "memory": "2G",
+ "nvidia.com/gpu": 0,
+ },
+ },
+ }
+ ],
+ "initContainers": [
+ {
+ "command": [
+ "sh",
+ "-c",
+ "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done",
+ ],
+ "image": "busybox:1.28",
+ "name": "init-myservice",
+ }
+ ],
+ },
+ },
+ }
+ ],
+ },
},
+ "metadata": {},
+ "priority": 0,
+ "priorityslope": 0,
+ "replicas": 1,
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
- "replicas": 1,
- },
- {
- "allocated": 0,
- "generictemplate": {
- "apiVersion": "route.openshift.io/v1",
- "kind": "Route",
- "metadata": {
- "labels": {
- "odh-ray-cluster-service": "quicktest-head-svc"
+ {
+ "allocated": 0,
+ "generictemplate": {
+ "apiVersion": "route.openshift.io/v1",
+ "kind": "Route",
+ "metadata": {
+ "labels": {
+ "odh-ray-cluster-service": "quicktest-head-svc"
+ },
+ "name": "ray-dashboard-quicktest",
+ "namespace": "default",
},
- "name": "ray-dashboard-quicktest",
- "namespace": "default",
- },
- "spec": {
- "port": {"targetPort": "dashboard"},
- "to": {
- "kind": "Service",
- "name": "quicktest-head-svc",
+ "spec": {
+ "port": {"targetPort": "dashboard"},
+ "to": {
+ "kind": "Service",
+ "name": "quicktest-head-svc",
+ },
},
},
+ "metadata": {},
+ "priority": 0,
+ "priorityslope": 0,
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
+ ],
+ "Items": [],
+ "metadata": {},
+ },
+ "schedulingSpec": {},
+ "service": {"spec": {}},
+ },
+ "status": {
+ "canrun": True,
+ "conditions": [
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
+ "status": "True",
+ "type": "Init",
+ },
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
+ "reason": "AwaitingHeadOfLine",
+ "status": "True",
+ "type": "Queueing",
+ },
+ {
+ "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
+ "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
+ "reason": "AppWrapperRunnable",
+ "status": "True",
+ "type": "Dispatched",
},
],
- "Items": [],
- "metadata": {},
+ "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
+ "filterignore": True,
+ "queuejobstate": "Dispatched",
+ "sender": "before manageQueueJob - afterEtcdDispatching",
+ "state": "Pending",
+ "systempriority": 9,
},
- "schedulingSpec": {},
- "service": {"spec": {}},
- },
- "status": {
- "canrun": True,
- "conditions": [
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
- "status": "True",
- "type": "Init",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
- "reason": "AwaitingHeadOfLine",
- "status": "True",
- "type": "Queueing",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
- "reason": "AppWrapperRunnable",
- "status": "True",
- "type": "Dispatched",
- },
- ],
- "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
- "filterignore": True,
- "queuejobstate": "Dispatched",
- "sender": "before manageQueueJob - afterEtcdDispatching",
- "state": "Pending",
- "systempriority": 9,
},
- }
+ ]
+ }
+ return api_obj1
+
+
+def test_get_cluster(mocker):
+    """Check the configuration of the cluster returned by get_cluster("quicktest").
+
+    kubernetes.config.load_kube_config is stubbed out and
+    CustomObjectsApi.list_namespaced_custom_object is served by get_ray_obj.
+    """
+    mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+    mocker.patch(
+        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+        side_effect=get_ray_obj,
     )
-    return [api_obj1, api_obj2]
+    config = get_cluster("quicktest").config
+    # One condition per assert so a failure points at the exact field.
+    assert config.name == "quicktest"
+    assert config.namespace == "ns"
+    for machine_type in ("m4.xlarge", "g4dn.xlarge"):
+        assert machine_type in config.machine_types
+    assert config.min_cpus == 1
+    assert config.max_cpus == 1
+    assert config.min_memory == 2
+    assert config.max_memory == 2
+    assert config.gpu == 0
+    assert config.instascale
+    expected_image = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
+    assert config.image == expected_image
+    assert config.min_worker == 1
+    assert config.max_worker == 1
def test_list_clusters(mocker, capsys):
- mocker.patch("openshift.selector", side_effect=get_selector)
- mock_res = mocker.patch.object(Selector, "objects")
- mock_res.side_effect = get_obj_none
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=get_obj_none,
+ )
list_all_clusters("ns")
captured = capsys.readouterr()
assert captured.out == (
@@ -1451,7 +1550,10 @@ def test_list_clusters(mocker, capsys):
"│ No resources found, have you run cluster.up() yet? │\n"
"╰──────────────────────────────────────────────────────────────────────────────╯\n"
)
- mock_res.side_effect = get_ray_obj
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=get_ray_obj,
+ )
list_all_clusters("ns")
captured = capsys.readouterr()
assert captured.out == (
@@ -1477,9 +1579,11 @@ def test_list_clusters(mocker, capsys):
def test_list_queue(mocker, capsys):
- mocker.patch("openshift.selector", side_effect=get_selector)
- mock_res = mocker.patch.object(Selector, "objects")
- mock_res.side_effect = get_obj_none
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=get_obj_none,
+ )
list_all_queued("ns")
captured = capsys.readouterr()
assert captured.out == (
@@ -1487,7 +1591,10 @@ def test_list_queue(mocker, capsys):
"│ No resources found, have you run cluster.up() yet? │\n"
"╰──────────────────────────────────────────────────────────────────────────────╯\n"
)
- mock_res.side_effect = get_aw_obj
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ side_effect=get_aw_obj,
+ )
list_all_queued("ns")
captured = capsys.readouterr()
assert captured.out == (
@@ -1507,6 +1614,7 @@ def test_list_queue(mocker, capsys):
def test_cluster_status(mocker):
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
fake_aw = AppWrapper(
"test", AppWrapperStatus.FAILED, can_run=True, job_state="unused"
)
@@ -1523,6 +1631,8 @@ def test_cluster_status(mocker):
dashboard="fake-uri",
)
cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
+ mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
+ mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None)
status, ready = cf.status()
assert status == CodeFlareClusterStatus.UNKNOWN
assert ready == False
@@ -1584,6 +1694,9 @@ def test_cluster_status(mocker):
def test_wait_ready(mocker, capsys):
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
+ mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None)
cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
try:
cf.wait_ready(timeout=5)
@@ -1655,12 +1768,17 @@ def test_DDPJobDefinition_creation():
return ddp
-def test_DDPJobDefinition_dry_run():
+def test_DDPJobDefinition_dry_run(mocker):
"""
Test that the dry run method returns the correct type: AppDryRunInfo,
that the attributes of the returned object are of the correct type,
and that the values from cluster and job definition are correctly passed.
"""
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
+ return_value="",
+ )
ddp = test_DDPJobDefinition_creation()
cluster = Cluster(test_config_creation())
ddp_job = ddp._dry_run(cluster)
@@ -1693,7 +1811,7 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker):
"""
mocker.patch(
- "openshift.get_project_name",
+ "codeflare_sdk.job.jobs.get_current_namespace",
return_value="opendatahub",
)
@@ -1725,11 +1843,15 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker):
assert ddp_job._scheduler == "kubernetes_mcad"
-def test_DDPJobDefinition_dry_run_no_resource_args():
+def test_DDPJobDefinition_dry_run_no_resource_args(mocker):
"""
Test that the dry run correctly gets resources from the cluster object
when the job definition does not specify resources.
"""
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
+ return_value="",
+ )
cluster = Cluster(test_config_creation())
ddp = DDPJobDefinition(
script="test.py",
@@ -1762,7 +1884,7 @@ def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args(mocker):
"""
mocker.patch(
- "openshift.get_project_name",
+ "codeflare_sdk.job.jobs.get_current_namespace",
return_value="opendatahub",
)
@@ -1814,11 +1936,15 @@ def test_DDPJobDefinition_submit(mocker):
Tests that the submit method returns the correct type: DDPJob
And that the attributes of the returned object are of the correct type
"""
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
+ return_value="fake-dashboard-uri",
+ )
ddp_def = test_DDPJobDefinition_creation()
cluster = Cluster(test_config_creation())
mocker.patch(
- "openshift.get_project_name",
- return_value="opendatahub",
+ "codeflare_sdk.job.jobs.get_current_namespace",
+ side_effect="opendatahub",
)
mocker.patch(
"codeflare_sdk.job.jobs.torchx_runner.schedule",
@@ -1841,6 +1967,10 @@ def test_DDPJobDefinition_submit(mocker):
def test_DDPJob_creation(mocker):
+ mocker.patch(
+ "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
+ return_value="fake-dashboard-uri",
+ )
ddp_def = test_DDPJobDefinition_creation()
cluster = Cluster(test_config_creation())
mocker.patch(
@@ -1867,8 +1997,8 @@ def test_DDPJob_creation_no_cluster(mocker):
ddp_def = test_DDPJobDefinition_creation()
ddp_def.image = "fake-image"
mocker.patch(
- "openshift.get_project_name",
- return_value="opendatahub",
+ "codeflare_sdk.job.jobs.get_current_namespace",
+ side_effect="opendatahub",
)
mocker.patch(
"codeflare_sdk.job.jobs.torchx_runner.schedule",
@@ -1959,14 +2089,24 @@ def test_AWManager_creation():
)
-def arg_check_aw_create_effect(*args):
- assert args[0] == "create"
- assert args[1] == ["-f", "test.yaml"]
+def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args):
+    """Mock side effect asserting the arguments of an AppWrapper create call.
+
+    The routing arguments must equal the literals below, ``body`` must equal
+    the parsed content of test.yaml, and no extra positional arguments may be
+    passed.
+    """
+    routing_checks = (
+        (group, "mcad.ibm.com"),
+        (version, "v1beta1"),
+        (namespace, "ns"),
+        (plural, "appwrappers"),
+    )
+    for actual, expected in routing_checks:
+        assert actual == expected
+    with open("test.yaml") as stream:
+        expected_body = yaml.full_load(stream)
+    assert body == expected_body
+    assert args == ()
-def arg_check_aw_delete_effect(*args):
- assert args[0] == "delete"
- assert args[1] == ["AppWrapper", "test"]
+def arg_check_aw_del_effect(group, version, namespace, plural, name, *args):
+    """Mock side effect asserting the arguments of an AppWrapper delete call.
+
+    Every named argument must equal the literal paired with it below, and no
+    extra positional arguments may be passed.
+    """
+    checks = (
+        (group, "mcad.ibm.com"),
+        (version, "v1beta1"),
+        (namespace, "ns"),
+        (plural, "appwrappers"),
+        (name, "test"),
+        (args, ()),
+    )
+    for actual, expected in checks:
+        assert actual == expected
def test_AWManager_submit_remove(mocker, capsys):
@@ -1978,10 +2118,17 @@ def test_AWManager_submit_remove(mocker, capsys):
== "AppWrapper not submitted by this manager yet, nothing to remove\n"
)
assert testaw.submitted == False
- mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect)
+ mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object",
+ side_effect=arg_check_aw_apply_effect,
+ )
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
+ side_effect=arg_check_aw_del_effect,
+ )
testaw.submit()
assert testaw.submitted == True
- mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect)
testaw.remove()
assert testaw.submitted == False
@@ -2068,20 +2215,9 @@ def test_export_env():
)
-# Make sure to keep this function and the following function at the end of the file
-def test_cmd_line_generation():
- os.system(
- f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/base-template.yaml"
- )
- assert filecmp.cmp(
- "unit-cmd-cluster.yaml", f"{parent}/tests/test-case-cmd.yaml", shallow=True
- )
- os.remove("unit-test-cluster.yaml")
- os.remove("unit-test-default-cluster.yaml")
- os.remove("unit-cmd-cluster.yaml")
-
-
# Make sure to always keep this function last
def test_cleanup():
+ os.remove("unit-test-cluster.yaml")
+ os.remove("unit-test-default-cluster.yaml")
os.remove("test.yaml")
os.remove("raytest2.yaml")