From d6bb15889963b31b3752ed07dfca98c313d37c37 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 11:52:05 -0700 Subject: [PATCH 1/4] add: plural alias to list raycluster --- src/codeflare_sdk/cli/cli_utils.py | 16 ++++++++++++++++ src/codeflare_sdk/cli/commands/list.py | 7 +++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/cli/cli_utils.py b/src/codeflare_sdk/cli/cli_utils.py index 0c557a8ea..c9d2c87a6 100644 --- a/src/codeflare_sdk/cli/cli_utils.py +++ b/src/codeflare_sdk/cli/cli_utils.py @@ -57,3 +57,19 @@ def load_auth(): click.echo("No authentication found, trying default kubeconfig") except client.ApiException: click.echo("Invalid authentication, trying default kubeconfig") + + +class PluralAlias(click.Group): + def get_command(self, ctx, cmd_name): + rv = click.Group.get_command(self, ctx, cmd_name) + if rv is not None: + return rv + for x in self.list_commands(ctx): + if x + "s" == cmd_name: + return click.Group.get_command(self, ctx, x) + return None + + def resolve_command(self, ctx, args): + # always return the full command name + _, cmd, args = super().resolve_command(ctx, args) + return cmd.name, cmd, args diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index dd3ad4e22..753982579 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -4,12 +4,11 @@ from codeflare_sdk.cluster.cluster import ( list_clusters_all_namespaces, list_all_clusters, - get_current_namespace, ) -from codeflare_sdk.cli.cli_utils import load_auth +from codeflare_sdk.cli.cli_utils import PluralAlias -@click.group() +@click.group(cls=PluralAlias) def cli(): """List a specified resource""" pass @@ -19,7 +18,7 @@ def cli(): @click.option("--namespace", type=str) @click.option("--all", is_flag=True) @click.pass_context -def rayclusters(ctx, namespace, all): +def raycluster(ctx, namespace, all): """List all rayclusters in a specified namespace""" if all and namespace: click.echo("--all and --namespace are mutually exclusive") From 883066eda8e92761713f36ccaf5c9a7f11067a29 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:38:47 -0700 Subject: [PATCH 2/4] change: use current namespace when not specified --- carson.yaml | 173 ++++++++++++++++++++++ src/codeflare_sdk/cli/codeflare_cli.py | 2 + src/codeflare_sdk/cli/commands/define.py | 7 +- src/codeflare_sdk/cli/commands/delete.py | 6 +- src/codeflare_sdk/cli/commands/details.py | 3 +- src/codeflare_sdk/cli/commands/list.py | 4 +- src/codeflare_sdk/cli/commands/status.py | 3 +- test-job.yaml | 173 ++++++++++++++++++++++ tests/unit_test.py | 9 +- 9 files changed, 369 insertions(+), 11 deletions(-) create mode 100644 carson.yaml create mode 100644 test-job.yaml diff --git a/carson.yaml b/carson.yaml new file mode 100644 index 000000000..79ff972cb --- /dev/null +++ b/carson.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: carson + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: carson + controller-tools.k8s.io: '1.0' + name: carson + namespace: default + spec: + 
autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-carson + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: carson-head-svc + name: ray-dashboard-carson + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: carson-head-svc + replica: 1 + Items: [] diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 78354695f..2731ac0b7 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -2,6 +2,7 @@ import os from codeflare_sdk.cli.cli_utils import load_auth +from codeflare_sdk.cluster.cluster import get_current_namespace cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) @@ -9,6 +10,7 @@ class CodeflareContext: def __init__(self): self.codeflare_path = _initialize_codeflare_folder() + self.current_namespace = get_current_namespace() def _initialize_codeflare_folder(): diff --git a/src/codeflare_sdk/cli/commands/define.py b/src/codeflare_sdk/cli/commands/define.py index 09cfd1f0e..4db177f3b 100644 --- a/src/codeflare_sdk/cli/commands/define.py +++ b/src/codeflare_sdk/cli/commands/define.py @@ -12,8 +12,9 @@ def cli(): @cli.command() +@click.pass_context @click.option("--name", 
type=str, required=True) -@click.option("--namespace", "-n", type=str, required=True) +@click.option("--namespace", "-n", type=str) @click.option("--head_info", cls=PythonLiteralOption, type=list) @click.option("--machine_types", cls=PythonLiteralOption, type=list) @click.option("--min_cpus", type=int) @@ -29,8 +30,10 @@ def cli(): @click.option("--image", type=str) @click.option("--local_interactive", type=bool) @click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list) -def raycluster(**kwargs): +def raycluster(ctx, **kwargs): """Define a RayCluster with parameter specifications""" filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + if "namespace" not in filtered_kwargs.keys(): + filtered_kwargs["namespace"] = ctx.obj.current_namespace clusterConfig = ClusterConfiguration(**filtered_kwargs) Cluster(clusterConfig) # Creates yaml file diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 7ce9744bd..c225d428a 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -12,12 +12,14 @@ def cli(): @cli.command() +@click.pass_context @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) -def raycluster(name, namespace): +@click.option("--namespace", type=str) +def raycluster(ctx, name, namespace): """ Delete a specified RayCluster from the Kubernetes cluster """ + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/src/codeflare_sdk/cli/commands/details.py b/src/codeflare_sdk/cli/commands/details.py index b865caa47..f6890e7d6 100644 --- a/src/codeflare_sdk/cli/commands/details.py +++ b/src/codeflare_sdk/cli/commands/details.py @@ -11,10 +11,11 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str) @click.pass_context def raycluster(ctx, name, namespace): """Get the details of a specified RayCluster""" + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/src/codeflare_sdk/cli/commands/list.py b/src/codeflare_sdk/cli/commands/list.py index 753982579..533aaeda1 100644 --- a/src/codeflare_sdk/cli/commands/list.py +++ b/src/codeflare_sdk/cli/commands/list.py @@ -23,9 +23,7 @@ def raycluster(ctx, namespace, all): if all and namespace: click.echo("--all and --namespace are mutually exclusive") return - if not all and not namespace: - click.echo("You must specify either --namespace or --all") - return + namespace = namespace or ctx.obj.current_namespace if not all: list_all_clusters(namespace) return diff --git a/src/codeflare_sdk/cli/commands/status.py b/src/codeflare_sdk/cli/commands/status.py index fc76ffc1d..dbd92a555 100644 --- a/src/codeflare_sdk/cli/commands/status.py +++ b/src/codeflare_sdk/cli/commands/status.py @@ -11,10 +11,11 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str) @click.pass_context def raycluster(ctx, name, namespace): """Get the status of a specified RayCluster""" + namespace = namespace or ctx.obj.current_namespace try: cluster = get_cluster(name, namespace) except FileNotFoundError: diff --git a/test-job.yaml b/test-job.yaml new file mode 100644 index 000000000..3a0827080 --- /dev/null +++ b/test-job.yaml @@ -0,0 +1,173 @@ +apiVersion: 
mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: test-job + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: test-job + controller-tools.k8s.io: '1.0' + name: test-job + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-test-job + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: test-job-head-svc + name: ray-dashboard-test-job + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: test-job-head-svc + replica: 1 + Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index 45b70382d..a9258017f 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -160,7 +160,8 @@ def test_login_tls_cli(mocker): tls_result = runner.invoke(cli, k8s_tls_login_command) skip_tls_result = runner.invoke(cli, k8s_skip_tls_login_command) assert ( - tls_result.output 
== skip_tls_result.output == "Logged into 'testserver:6443'\n" + "Logged into 'testserver:6443'\n" in tls_result.output + and "Logged into 'testserver:6443'\n" in skip_tls_result.output ) @@ -169,7 +170,7 @@ def test_logout_cli(mocker): mocker.patch.object(client, "ApiClient") k8s_logout_command = "logout" logout_result = runner.invoke(cli, k8s_logout_command) - assert logout_result.output == "Successfully logged out of 'testserver:6443'\n" + assert "Successfully logged out of 'testserver:6443'\n" in logout_result.output assert not os.path.exists(os.path.expanduser("~/.codeflare/auth")) @@ -198,6 +199,10 @@ def test_cluster_deletion_cli(mocker): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_ray_obj, ) + mocker.patch( + "codeflare_sdk.cluster.cluster.get_current_namespace", + return_value="ns", + ) runner = CliRunner() delete_cluster_command = """ delete raycluster From 7e287f5d6f9dd501b561edc2455a4a38b9487a9c Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:44:37 -0700 Subject: [PATCH 3/4] refactor: make _get_all_rayclusters which handles namespaced and all raycluster listing --- src/codeflare_sdk/cluster/cluster.py | 40 +++++++++++----------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index a25dd1b9d..b98255e68 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -412,7 +412,7 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. """ - clusters = _get_ray_clusters_in_namespace(namespace) + clusters = _get_all_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) return clusters @@ -539,17 +539,24 @@ def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: return None -def _get_ray_clusters_in_namespace(namespace="default") -> List[RayCluster]: +def _get_all_ray_clusters(namespace: str = None) -> List[RayCluster]: list_of_clusters = [] try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) - rcs = api_instance.list_namespaced_custom_object( - group="ray.io", - version="v1alpha1", - namespace=namespace, - plural="rayclusters", - ) + if namespace: + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + else: + rcs = api_instance.list_cluster_custom_object( + group="ray.io", + version="v1alpha1", + plural="rayclusters", + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -558,23 +565,6 @@ def _get_ray_clusters_in_namespace(namespace="default") -> List[RayCluster]: return list_of_clusters -def _get_all_ray_clusters() -> List[RayCluster]: - list_of_clusters = [] - try: - config_check() - api_instance = client.CustomObjectsApi(api_config_handler()) - rcs = api_instance.list_cluster_custom_object( - group="ray.io", - version="v1alpha1", - plural="rayclusters", - ) - except Exception as e: - return _kube_api_error_handling(e) - for rc in rcs["items"]: - list_of_clusters.append(_map_to_ray_cluster(rc)) - return list_of_clusters - - def _get_app_wrappers( namespace="default", filter=List[AppWrapperStatus] ) -> List[AppWrapper]: From 8b52a5548459873c58e65b2cd1e43931d03fde13 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 3 Aug 2023 12:48:59 -0700 Subject: [PATCH 4/4] cleanup --- carson.yaml | 173 
-------------------------------------------------- test-job.yaml | 173 -------------------------------------------------- 2 files changed, 346 deletions(-) delete mode 100644 carson.yaml delete mode 100644 test-job.yaml diff --git a/carson.yaml b/carson.yaml deleted file mode 100644 index 79ff972cb..000000000 --- a/carson.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: carson - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: carson - controller-tools.k8s.io: '1.0' - name: carson - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-carson - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: carson-head-svc - name: ray-dashboard-carson - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: carson-head-svc - 
replica: 1 - Items: [] diff --git a/test-job.yaml b/test-job.yaml deleted file mode 100644 index 3a0827080..000000000 --- a/test-job.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: test-job - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: test-job - controller-tools.k8s.io: '1.0' - name: test-job - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-test-job - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: test-job-head-svc - name: ray-dashboard-test-job - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: test-job-head-svc - replica: 1 - Items: []
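
---
Reviewer note (not part of the patch): below is a minimal, self-contained sketch of the aliasing behavior that PATCH 1/4 introduces with `PluralAlias`, so it can be tried outside the SDK. It assumes click >= 8; the `raycluster` command body and its echoed message here are hypothetical stand-ins for the real `list` subcommand, not SDK output.

    import click
    from click.testing import CliRunner


    class PluralAlias(click.Group):
        def get_command(self, ctx, cmd_name):
            # Exact match first.
            rv = click.Group.get_command(self, ctx, cmd_name)
            if rv is not None:
                return rv
            # Fall back to the singular form: "rayclusters" -> "raycluster".
            for x in self.list_commands(ctx):
                if x + "s" == cmd_name:
                    return click.Group.get_command(self, ctx, x)
            return None

        def resolve_command(self, ctx, args):
            # Always report the canonical (singular) command name.
            _, cmd, args = super().resolve_command(ctx, args)
            return cmd.name, cmd, args


    @click.group(cls=PluralAlias)
    def cli():
        pass


    @cli.command()
    def raycluster():
        click.echo("listing rayclusters")


    if __name__ == "__main__":
        runner = CliRunner()
        # Both the singular and the plural spelling resolve to the same command.
        assert runner.invoke(cli, ["raycluster"]).output == "listing rayclusters\n"
        assert runner.invoke(cli, ["rayclusters"]).output == "listing rayclusters\n"

Because `resolve_command` reports the canonical name, usage and error text refer to `raycluster` even when the user typed the plural alias.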