From 942459dca0805d450d99f5729eec74c98761103b Mon Sep 17 00:00:00 2001 From: ted chang Date: Tue, 27 Jun 2023 19:43:56 -0700 Subject: [PATCH 1/5] Remove oc client and add helper functions --- .../interactive/local_interactive.ipynb | 88 +++++++------------ src/codeflare_sdk/cluster/cluster.py | 15 +++- src/codeflare_sdk/utils/generate_yaml.py | 13 +-- tests/unit_test.py | 19 ++++ 4 files changed, 71 insertions(+), 64 deletions(-) diff --git a/demo-notebooks/interactive/local_interactive.ipynb b/demo-notebooks/interactive/local_interactive.ipynb index 88a6ccd58..d70c00df7 100644 --- a/demo-notebooks/interactive/local_interactive.ipynb +++ b/demo-notebooks/interactive/local_interactive.ipynb @@ -32,20 +32,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "4364ac2e-dd10-4d30-ba66-12708daefb3f", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: hfgputest-1.yaml\n" - ] - } - ], + "outputs": [], "source": [ "# Create our cluster and submit appwrapper\n", "namespace = \"default\"\n", @@ -89,7 +81,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "12eef53c", "metadata": {}, @@ -99,38 +90,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "cf1b749e-2335-42c2-b673-26768ec9895d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n" - ] - } - ], + "outputs": [], "source": [ - "import openshift as oc\n", "from codeflare_sdk.utils import generate_cert\n", "\n", "if local_interactive:\n", " generate_cert.generate_tls_cert(cluster_name, namespace)\n", - " generate_cert.export_env(cluster_name, namespace)\n", - "\n", - "with oc.project(namespace):\n", - " routes=oc.selector(\"route\").objects()\n", - " rayclient_url=\"\"\n", - " for r in routes:\n", - " if \"rayclient\" in r.name():\n", - " rayclient_url=r.model.spec.host\n", - "print(rayclient_url)" + " generate_cert.export_env(cluster_name, namespace)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7", "metadata": { "scrolled": true, @@ -141,15 +115,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", - "2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", - "2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", - "2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", - "2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n", - "2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", - "2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" + "2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", + "2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", + "2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", + "2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", + "2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n", + "2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", + "2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" ] }, { @@ -190,7 +164,7 @@ " \n", " \n", " Dashboard:\n", - " http://10.254.12.141:8265\n", + " http://10.254.20.41:8265\n", "\n", "\n", " \n", @@ -198,10 +172,10 @@ "\n" ], "text/plain": [ - "ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" + "ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" ] }, - "execution_count": 12, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -210,12 +184,12 @@ "import ray\n", "\n", "ray.shutdown()\n", - "ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")" + "ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "id": "3436eb4a-217c-4109-a3c3-309fda7e2442", "metadata": {}, "outputs": [], @@ -239,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "5cca1874-2be3-4631-ae48-9adfa45e3af3", "metadata": { "scrolled": true, @@ -250,8 +224,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", - "2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" + "2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", + "2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" ] } ], @@ -261,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "id": "01172c29-e8bf-41ef-8db5-eccb07906111", "metadata": {}, "outputs": [ @@ -269,8 +243,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" + "2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" ] }, { @@ -279,7 +253,7 @@ "1789.4644387076714" ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "9e79b547-a457-4232-b77d-19147067b972", "metadata": {}, "outputs": [ @@ -298,10 +272,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", + "2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", "}\n", "\n", - "2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" + "2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" ] } ], @@ -312,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2", "metadata": {}, "outputs": [], diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f2148b9e1..f89ecac71 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -34,7 +34,6 @@ RayCluster, RayClusterStatus, ) - from kubernetes import client, config import yaml @@ -344,6 +343,13 @@ def from_k8_cluster_object(rc): ) return Cluster(cluster_config) + def local_client_url(self): + if self.config.local_interactive == True: + ingress_domain = _get_ingress_domain() + return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}" + else: + return "None" + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -402,6 +408,13 @@ def get_cluster(cluster_name: str, namespace: str = "default"): # private methods +def _get_ingress_domain(): + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + return ingress["spec"]["domain"] def _kube_api_error_handling(e: Exception): # pragma: no cover diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index fffc4fa9a..426203b4e 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -21,7 +21,7 @@ import sys import argparse import uuid -import openshift as oc +from kubernetes import client, config def read_template(template): @@ -239,12 +239,13 @@ def enable_local_interactive(resources, cluster_name, namespace): ][0].get("command")[2] command = command.replace("deployment-name", cluster_name) - - server_name = ( - oc.whoami("--show-server").split(":")[1].split("//")[1].replace("api", "apps") + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" ) - - command = command.replace("server-name", server_name) + domain = ingress["spec"]["domain"] + command = command.replace("server-name", domain) item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers" diff --git a/tests/unit_test.py b/tests/unit_test.py index 3c69c4b6b..6ca8fd330 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -379,6 +379,25 @@ def test_cluster_uris(mocker): ) +def test_local_client_url(mocker): + mocker.patch( + "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", + return_value={"spec": {"domain": ""}}, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster._get_ingress_domain", + return_value="apps.cluster.awsroute.org", + ) + default_config = ClusterConfiguration( + name="unit-test-cluster", namespace="ns", local_interactive=True + ) + cluster = Cluster(default_config) + assert ( + cluster.local_client_url() + == "ray://rayclient-unit-test-cluster-ns.apps.cluster.awsroute.org" + ) + + def ray_addr(self, *args): return self._address From 39b8fab6737171ab6ad6b56659d0b6599d806245 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 12 Jul 2023 16:18:15 -0400 Subject: [PATCH 2/5] Updates for error checking --- src/codeflare_sdk/cluster/awload.py | 3 +- src/codeflare_sdk/cluster/cluster.py | 37 +++++------------- src/codeflare_sdk/utils/generate_yaml.py | 14 ++++--- src/codeflare_sdk/utils/kube_api_helpers.py | 43 +++++++++++++++++++++ 4 files changed, 62 insertions(+), 35 deletions(-) create mode 100644 src/codeflare_sdk/utils/kube_api_helpers.py diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py index 25f614232..ecf432133 100644 --- a/src/codeflare_sdk/cluster/awload.py +++ b/src/codeflare_sdk/cluster/awload.py @@ -20,11 +20,10 @@ from os.path import isfile import errno import os -import openshift as oc import yaml from kubernetes import client, config -from .cluster import _kube_api_error_handling +from ..utils.kube_api_helpers import _kube_api_error_handling class AWManager: diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f89ecac71..8381ba2c3 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -18,7 +18,6 @@ cluster setup queue, a list of all existing clusters, and the user's working namespace. """ -from os import stat from time import sleep from typing import List, Optional, Tuple, Dict @@ -26,6 +25,7 @@ from ..utils import pretty_print from ..utils.generate_yaml import generate_appwrapper +from ..utils.kube_api_helpers import _kube_api_error_handling from .config import ClusterConfiguration from .model import ( AppWrapper, @@ -409,36 +409,17 @@ def get_cluster(cluster_name: str, namespace: str = "default"): # private methods def _get_ingress_domain(): - config.load_kube_config() - api_client = client.CustomObjectsApi() - ingress = api_client.get_cluster_custom_object( - "config.openshift.io", "v1", "ingresses", "cluster" - ) + try: + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) return ingress["spec"]["domain"] -def _kube_api_error_handling(e: Exception): # pragma: no cover - perm_msg = ( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - nf_msg = "No instances found, nothing to be done." - exists_msg = "Resource with this name already exists." - if type(e) == config.ConfigException: - raise PermissionError(perm_msg) - if type(e) == executing.executing.NotOneValueFound: - print(nf_msg) - return - if type(e) == client.ApiException: - if e.reason == "Not Found": - print(nf_msg) - return - elif e.reason == "Unauthorized" or e.reason == "Forbidden": - raise PermissionError(perm_msg) - elif e.reason == "Conflict": - raise FileExistsError(exists_msg) - raise e - - def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: try: config.load_kube_config() diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 426203b4e..f71603d3a 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -22,6 +22,7 @@ import argparse import uuid from kubernetes import client, config +from .kube_api_helpers import _kube_api_error_handling def read_template(template): @@ -239,11 +240,14 @@ def enable_local_interactive(resources, cluster_name, namespace): ][0].get("command")[2] command = command.replace("deployment-name", cluster_name) - config.load_kube_config() - api_client = client.CustomObjectsApi() - ingress = api_client.get_cluster_custom_object( - "config.openshift.io", "v1", "ingresses", "cluster" - ) + try: + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) domain = ingress["spec"]["domain"] command = command.replace("server-name", domain) diff --git a/src/codeflare_sdk/utils/kube_api_helpers.py b/src/codeflare_sdk/utils/kube_api_helpers.py new file mode 100644 index 000000000..492fc0c52 --- /dev/null +++ b/src/codeflare_sdk/utils/kube_api_helpers.py @@ -0,0 +1,43 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This sub-module exists primarily to be used internally for any Kubernetes +API error handling or wrapping. +""" + +import executing +from kubernetes import client, config + +# private methods +def _kube_api_error_handling(e: Exception): # pragma: no cover + perm_msg = ( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + nf_msg = "No instances found, nothing to be done." + exists_msg = "Resource with this name already exists." + if type(e) == config.ConfigException: + raise PermissionError(perm_msg) + if type(e) == executing.executing.NotOneValueFound: + print(nf_msg) + return + if type(e) == client.ApiException: + if e.reason == "Not Found": + print(nf_msg) + return + elif e.reason == "Unauthorized" or e.reason == "Forbidden": + raise PermissionError(perm_msg) + elif e.reason == "Conflict": + raise FileExistsError(exists_msg) + raise e From 350757a0bbe998421ed38281b14401dd4cbb0377 Mon Sep 17 00:00:00 2001 From: ted chang Date: Wed, 12 Jul 2023 18:39:05 -0700 Subject: [PATCH 3/5] make sure tests run without oc login --- src/codeflare_sdk/utils/generate_yaml.py | 4 ++-- src/codeflare_sdk/utils/kube_api_helpers.py | 1 + tests/unit_test.py | 13 +++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index f71603d3a..b8ad74e2b 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -22,7 +22,7 @@ import argparse import uuid from kubernetes import client, config -from .kube_api_helpers import _kube_api_error_handling +from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling def read_template(template): @@ -247,7 +247,7 @@ def enable_local_interactive(resources, cluster_name, namespace): "config.openshift.io", "v1", "ingresses", "cluster" ) except Exception as e: # pragma: no cover - return _kube_api_error_handling(e) + return _kube_api_error_handling(e) domain = ingress["spec"]["domain"] command = command.replace("server-name", domain) diff --git a/src/codeflare_sdk/utils/kube_api_helpers.py b/src/codeflare_sdk/utils/kube_api_helpers.py index 492fc0c52..58358a053 100644 --- a/src/codeflare_sdk/utils/kube_api_helpers.py +++ b/src/codeflare_sdk/utils/kube_api_helpers.py @@ -20,6 +20,7 @@ import executing from kubernetes import client, config + # private methods def _kube_api_error_handling(e: Exception): # pragma: no cover perm_msg = ( diff --git a/tests/unit_test.py b/tests/unit_test.py index 6ca8fd330..61836c7a6 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -388,13 +388,18 @@ def test_local_client_url(mocker): "codeflare_sdk.cluster.cluster._get_ingress_domain", return_value="apps.cluster.awsroute.org", ) - default_config = ClusterConfiguration( - name="unit-test-cluster", namespace="ns", local_interactive=True + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.create_app_wrapper", + return_value="unit-test-cluster-localinter.yaml", ) - cluster = Cluster(default_config) + + cluster_config = ClusterConfiguration( + name="unit-test-cluster-localinter", namespace="ns", local_interactive=True + ) + cluster = Cluster(cluster_config) assert ( cluster.local_client_url() - == "ray://rayclient-unit-test-cluster-ns.apps.cluster.awsroute.org" + == "ray://rayclient-unit-test-cluster-localinter-ns.apps.cluster.awsroute.org" ) From 2aca8a8d6a52a2d17de22218ab2f23f7ef2dd6c7 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 13 Jul 2023 10:25:42 -0400 Subject: [PATCH 4/5] Removing CLI appwrapper generation --- src/codeflare_sdk/utils/generate_yaml.py | 120 ---------------- tests/test-case-cmd.yaml | 171 ----------------------- tests/unit_test.py | 15 +- 3 files changed, 2 insertions(+), 304 deletions(-) delete mode 100644 tests/test-case-cmd.yaml diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index b8ad74e2b..82af39191 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -332,123 +332,3 @@ def generate_appwrapper( outfile = appwrapper_name + ".yaml" write_user_appwrapper(user_yaml, outfile) return outfile - - -def main(): # pragma: no cover - parser = argparse.ArgumentParser(description="Generate user AppWrapper") - parser.add_argument( - "--name", - required=False, - default="", - help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)", - ) - parser.add_argument( - "--min-cpu", - type=int, - required=True, - help="min number of CPU(s) in a worker required for running job", - ) - parser.add_argument( - "--max-cpu", - type=int, - required=True, - help="max number of CPU(s) in a worker required for running job", - ) - parser.add_argument( - "--min-memory", - type=int, - required=True, - help="min RAM required in a worker for running job, in GB", - ) - parser.add_argument( - "--max-memory", - type=int, - required=True, - help="max RAM required in a worker for running job, in GB", - ) - parser.add_argument( - "--gpu", - type=int, - required=True, - help="GPU(s) required in a worker for running job", - ) - parser.add_argument( - "--workers", - type=int, - required=True, - help="How many workers are required in the cluster", - ) - parser.add_argument( - "--template", required=True, help="Template AppWrapper yaml file" - ) - parser.add_argument( - "--image", - required=False, - default="rayproject/ray:latest", - help="Ray image to be used (defaults to rayproject/ray:latest)", - ) - parser.add_argument( - "--instascale", - default=False, - required=False, - action="store_true", - help="Indicates that instascale is installed on the cluster", - ) - parser.add_argument( - "--instance-types", - type=str, - nargs="+", - default=[], - required=False, - help="Head,worker instance types (space separated)", - ) - parser.add_argument( - "--namespace", - required=False, - default="default", - help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace", - ) - parser.add_argument( - "--local-interactive", - required=False, - default=False, - help="Enable local interactive mode", - ) - - args = parser.parse_args() - name = args.name - min_cpu = args.min_cpu - max_cpu = args.max_cpu - min_memory = args.min_memory - max_memory = args.max_memory - gpu = args.gpu - workers = args.workers - template = args.template - image = args.image - instascale = args.instascale - instance_types = args.instance_types - namespace = args.namespace - local_interactive = args.local_interactive - env = {} - - outfile = generate_appwrapper( - name, - namespace, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - template, - image, - instascale, - instance_types, - local_interactive, - env, - ) - return outfile - - -if __name__ == "__main__": # pragma: no cover - main() diff --git a/tests/test-case-cmd.yaml b/tests/test-case-cmd.yaml deleted file mode 100644 index d82096e8f..000000000 --- a/tests/test-case-cmd.yaml +++ /dev/null @@ -1,171 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: unit-cmd-cluster - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - replicas: 2 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: unit-cmd-cluster - controller-tools.k8s.io: '1.0' - name: unit-cmd-cluster - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: rayproject/ray:latest - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-unit-cmd-cluster - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '1' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: rayproject/ray:latest - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: unit-cmd-cluster-head-svc - name: ray-dashboard-unit-cmd-cluster - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: unit-cmd-cluster-head-svc - replica: 1 - Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index 61836c7a6..20dff976b 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2258,20 +2258,9 @@ def test_export_env(): ) -# Make sure to keep this function and the following function at the end of the file -def test_cmd_line_generation(): - os.system( - f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/base-template.yaml" - ) - assert filecmp.cmp( - "unit-cmd-cluster.yaml", f"{parent}/tests/test-case-cmd.yaml", shallow=True - ) - os.remove("unit-test-cluster.yaml") - os.remove("unit-test-default-cluster.yaml") - os.remove("unit-cmd-cluster.yaml") - - # Make sure to always keep this function last def test_cleanup(): + os.remove("unit-test-cluster.yaml") + os.remove("unit-test-default-cluster.yaml") os.remove("test.yaml") os.remove("raytest2.yaml") From 19ce165a0190d3dd762b57aab23cb556ac628652 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 13 Jul 2023 10:29:14 -0400 Subject: [PATCH 5/5] Updated import --- src/codeflare_sdk/utils/generate_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 82af39191..193b95819 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -22,7 +22,7 @@ import argparse import uuid from kubernetes import client, config -from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling +from .kube_api_helpers import _kube_api_error_handling def read_template(template):