We finally made it work, but I do not think this approach is sustainable for a wider audience. Here is what we had to do:
1. Because the CodeFlare SDK relies on the oc CLI, we had to create a new image for KFP execution:
FROM python:3.8.16-slim-bullseye
RUN apt update && apt install -y wget
# install oc
RUN mkdir /opt/oc
RUN wget -O /opt/oc/release.tar.gz https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp/stable-4.11/openshift-client-linux-4.11.40.tar.gz
RUN tar -xzvf /opt/oc/release.tar.gz -C /opt/oc/ && \
mv /opt/oc/oc /usr/bin/ && \
rm -rf /opt/oc
# install libraries
RUN pip install --upgrade pip && pip install codeflare-sdk
RUN pip install "ray[default]"==2.1.0
# Allow writes for yaml files
RUN chmod -R 777 /tmp
Note that we also had to make /tmp writable, so that the SDK can save its intermediate YAML files there.
2. Because the CodeFlare SDK directly manipulates MCAD, RayCluster, and OpenShift Route resources, we had to add additional permissions to the pipeline-runner service account, which should eventually go into KFDef. Here are the files:
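(The actual RBAC files are attached separately and not reproduced here. Purely as a rough, hypothetical illustration of what these permissions amount to -- the API groups, resource names, role name, and namespace below are assumptions, not the contents of the real files -- the extra grants for the pipeline-runner service account look roughly like this:)

# Hypothetical illustration only -- NOT the actual RBAC files from this issue.
# API groups, resource names, role name, and namespace are assumptions; the
# intent is to show the kind of extra permissions pipeline-runner needs so the
# CodeFlare SDK can manage MCAD AppWrappers, RayClusters, and Routes.
from kubernetes import client, config

config.load_incluster_config()  # or config.load_kube_config() outside the cluster
rbac = client.RbacAuthorizationV1Api()
namespace = "my-kfp-namespace"  # hypothetical namespace

role = {
    "apiVersion": "rbac.authorization.k8s.io/v1",
    "kind": "Role",
    "metadata": {"name": "codeflare-pipeline-runner", "namespace": namespace},
    "rules": [
        {"apiGroups": ["mcad.ibm.com"], "resources": ["appwrappers"],
         "verbs": ["get", "list", "watch", "create", "delete", "patch"]},
        {"apiGroups": ["ray.io"], "resources": ["rayclusters"],
         "verbs": ["get", "list", "watch", "create", "delete", "patch"]},
        {"apiGroups": ["route.openshift.io"], "resources": ["routes"],
         "verbs": ["get", "list", "watch", "create", "delete", "patch"]},
    ],
}
binding = {
    "apiVersion": "rbac.authorization.k8s.io/v1",
    "kind": "RoleBinding",
    "metadata": {"name": "codeflare-pipeline-runner", "namespace": namespace},
    "roleRef": {"apiGroup": "rbac.authorization.k8s.io", "kind": "Role",
                "name": "codeflare-pipeline-runner"},
    "subjects": [{"kind": "ServiceAccount", "name": "pipeline-runner",
                  "namespace": namespace}],
}
rbac.create_namespaced_role(namespace, role)
rbac.create_namespaced_role_binding(namespace, binding)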
3. To make sure that we have permission to write files, we had to redirect the working directory. Here is the complete code:
import kfp.components as comp
from kfp_tekton.compiler import TektonCompiler
import kfp.dsl as dsl
from kubernetes import client as k8s_client


# execute ray pipeline
def execute_ray_pipeline(token: str,       # token to authenticate to cluster
                         name: str,        # name of Ray cluster
                         min_worker: str,  # min number of workers
                         max_worker: str,  # max number of workers
                         min_cpus: str,    # min cpus per worker
                         max_cpus: str,    # max cpus per worker
                         min_memory: str,  # min memory per worker
                         max_memory: str,  # max memory per worker
                         image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
                         ):
    # Ray code - basically hello world
    import ray

    @ray.remote
    class Counter:
        def __init__(self):
            self.counter = 0

        def inc(self):
            self.counter += 1

        def get_counter(self):
            return self.counter

    # Import pieces from codeflare-sdk
    from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration, list_all_clusters, list_all_queued
    from codeflare_sdk.cluster.auth import TokenAuthentication
    import os

    # get current namespace
    ns = os.getenv('NAMESPACE', 'default')
    # change the current directory to ensure that we can write
    os.chdir("/tmp")
    print(f"Executing in namespace {ns}, current working directory is {os.getcwd()}")

    # Create authentication object for oc user permissions
    auth = TokenAuthentication(
        token=token,
        server="https://kubernetes.default:443",
        skip_tls=True
    )
    try:
        auth.login()
    except Exception as e:
        print(f"Failed to log into OpenShift cluster, error {e}. Please check token/server values provided")
        os.abort()
    print("successfully logged in")

    # Create and configure our cluster object (and appwrapper)
    cluster = Cluster(ClusterConfiguration(
        name=name,
        namespace=ns,
        min_worker=int(min_worker),
        max_worker=int(max_worker),
        min_cpus=int(min_cpus),
        max_cpus=int(max_cpus),
        min_memory=int(min_memory),
        max_memory=int(max_memory),
        gpu=0,
        image=image,
        instascale=False
    ))
    print(f"Configuration for Ray cluster {name} in namespace {ns} is created")

    # bring up the cluster
    cluster.up()
    print(f"Creating Ray cluster {name} in namespace {ns}...")
    # and wait for it to come up
    cluster.wait_ready()
    rc = cluster.details(print_to_console=False)
    print("Ray cluster is ready")
    print(rc)

    # Get cluster connection points
    ray_cluster_uri = cluster.cluster_uri()
    print(f"Ray cluster is at {ray_cluster_uri}")

    # Connect to the cluster
    try:
        ray.init(address=f'{ray_cluster_uri}', ignore_reinit_error=True)
    except Exception as e:
        print(f"Failed to connect to Ray cluster, error {e}")
        os.abort()
    print("connected to Ray cluster")

    # execute Ray function
    print("Running Hello world")
    counter = Counter.remote()
    for _ in range(5):
        ray.get(counter.inc.remote())
    print(ray.get(counter.get_counter.remote()))

    # delete cluster
    print("All done. Cleaning up")
    cluster.down()


# components
ray_pipeline_op = comp.func_to_container_op(
    func=execute_ray_pipeline,
    base_image="blublinsky1/kfp-oc:0.0.2"
)


# Pipeline to invoke execution on remote resource
@dsl.pipeline(
    name='simple-ray-pipeline',
    description='Pipeline to show how to use codeflare sdk to create Ray cluster and run jobs'
)
def simple_ray_pipeline(token: str,              # token to authenticate to cluster
                        name: str = "kfp-ray",   # name of Ray cluster
                        min_worker: str = "2",   # min number of workers
                        max_worker: str = "2",   # max number of workers
                        min_cpus: str = "2",     # min cpus per worker
                        max_cpus: str = "2",     # max cpus per worker
                        min_memory: str = "4",   # min memory per worker
                        max_memory: str = "4",   # max memory per worker
                        image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
                        ):
    # invoke pipeline
    pipeline = ray_pipeline_op(token, name, min_worker, max_worker, min_cpus, max_cpus, min_memory,
                               max_memory, image)
    pipeline.execution_options.caching_strategy.max_cache_staleness = "P0D"
    pipeline.add_env_variable(k8s_client.V1EnvVar(
        name='NAMESPACE',
        value_from=k8s_client.V1EnvVarSource(
            field_ref=k8s_client.V1ObjectFieldSelector(field_path="metadata.namespace"))))


if __name__ == '__main__':
    # Compiling the pipeline
    TektonCompiler().compile(simple_ray_pipeline, __file__.replace('.py', '.yaml'))
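Running this script compiles the pipeline to a Tekton YAML file next to the script, which can then be uploaded through the KFP UI. As a minimal sketch only (the endpoint URL, token, and arguments below are placeholders, not our actual setup), the pipeline could also be launched programmatically with the kfp-tekton client:

# Minimal sketch, not part of the original workflow: launch the pipeline
# programmatically instead of uploading the compiled YAML by hand.
# The endpoint and token are placeholders/assumptions.
from kfp_tekton import TektonClient

client = TektonClient(host="https://<kfp-endpoint>/pipeline")  # hypothetical endpoint
client.create_run_from_pipeline_func(
    simple_ray_pipeline,
    arguments={"token": "<openshift-token>", "name": "kfp-ray"},
)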
After all this, the workflow works correctly.
We also need to add an implementation based on the Ray job submission REST API: https://docs.ray.io/en/latest/cluster/running-applications/job-submission/rest.html
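For reference, here is a minimal sketch of what that could look like, following the linked documentation (the dashboard URL and entrypoint below are placeholders, and the exact response fields can differ between Ray versions):

# Minimal sketch (not an existing implementation): submit and monitor a job
# through the Ray job submission REST API instead of connecting via ray.init().
# The dashboard URL is a placeholder for however the Ray dashboard is exposed.
import time
import requests

dashboard_url = "http://<ray-dashboard-route>:8265"  # hypothetical address

# Submit the job: POST /api/jobs/ with the entrypoint command to run
resp = requests.post(
    f"{dashboard_url}/api/jobs/",
    json={"entrypoint": "python -c \"print('hello from a Ray job')\""},
)
resp.raise_for_status()
body = resp.json()
# Depending on the Ray version, the identifier is returned as submission_id or job_id
job_id = body.get("submission_id") or body.get("job_id")
print(f"submitted job {job_id}")

# Poll GET /api/jobs/<id> until the job reaches a terminal state
while True:
    status = requests.get(f"{dashboard_url}/api/jobs/{job_id}").json()["status"]
    print(f"job status: {status}")
    if status in ("SUCCEEDED", "FAILED", "STOPPED"):
        break
    time.sleep(5)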