Commit 2c37b26 (Initial version)
1 parent: c6b3ceb
7 files changed: +5014 -1 lines changed

Diff for: Dockerfile

+26

@@ -0,0 +1,26 @@
# docker build -f Dockerfile -t glue-runtime:ray-1.9.0 ./

FROM rayproject/ray:1.9.0-gpu

# make and cmake are required for installation
RUN sudo apt-get update && sudo apt-get install -y \
    build-essential \
    && sudo rm -rf /var/lib/apt/lists/* \
    && sudo apt-get clean

RUN mkdir /home/ray/glue
WORKDIR /home/ray/glue

# install requirements.txt
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt

# install torch
RUN pip install --no-cache-dir torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html

# install boto3
RUN pip install --no-cache-dir boto3

# change group permissions for running in OCP
RUN sudo chgrp 0 /home/ray/glue
RUN sudo chmod g+w /home/ray/glue
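The Dockerfile's header comment gives the local build command; pushing the result to the registry that glue-cluster.yaml pulls from is not covered by this commit. A minimal sketch, assuming you are already logged in to that Artifactory registry and want to publish under the tag the cluster manifest references:
```
$ docker build -f Dockerfile -t glue-runtime:ray-1.9.0 ./
$ docker tag glue-runtime:ray-1.9.0 res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
$ docker push res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
```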

Diff for: README.md

+59 -1

@@ -1 +1,59 @@
# codeflare-transfer-learn

# Running glue_benchmark on OpenShift (OCP)
Assumes:
* A copy of this repository is installed and you have a command line open in this directory
* You have the OpenShift CLI installed (instructions are available in the IBM Cloud and OpenShift web consoles if not)
* You have the S3 credentials needed to access the glue datasets and the model to evaluate

1. Log into OCP using the oc login command from the OCP web console
   (go to the menu under IAM#<your username/email>, then "Copy Login Command").

2. Use `oc project` to confirm your namespace is as desired. If not:
```
$ oc project {your-namespace}
```

3. Starting from template-s3-creds.yaml, create a personal yaml secrets file with your namespace and S3 credentials (a sketch is shown after this step). Then register the secrets:
```
$ oc create -f {your-handle}-s3-creds.yaml
```
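template-s3-creds.yaml is not included in this diff, so the exact format of the secrets file is an assumption. Based on the secret name and keys that glue-cluster.yaml reads (glue-s3-creds with AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and ENDPOINT_URL), a minimal sketch might look like:
```
# Hypothetical {your-handle}-s3-creds.yaml; key names mirror the secretKeyRef
# entries in glue-cluster.yaml, and the values are placeholders.
apiVersion: v1
kind: Secret
metadata:
  name: glue-s3-creds
  namespace: {your-namespace}
type: Opaque
stringData:
  AWS_ACCESS_KEY_ID: <access-key-id>
  AWS_SECRET_ACCESS_KEY: <secret-access-key>
  ENDPOINT_URL: <s3-endpoint-url>
```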

4. [Required only once] Check whether the Ray CRD is installed; install it if not:
```
$ oc get crd | grep ray
```
If not there:
```
$ oc apply -f cluster_crd.yaml
```

5. Create a ray operator in your namespace:
```
$ oc apply -f glue-operator.yaml
```

6. Create a ray cluster in your namespace. Change the min and max number of workers as needed (around line 100; the relevant fields are excerpted after this step):
```
$ oc apply -f glue-cluster.yaml
```
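For reference, the worker count is controlled by the worker-node pod type in glue-cluster.yaml; as committed, the fields to edit are:
```
  - name: worker-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 8
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 8
```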

7. When the ray cluster head and worker pods are in the ready state, copy the application driver to the head node:
```
$ oc get po --watch
$ oc cp glue_benchmark.py glue-cluster-head-XXXXX:/home/ray/glue
```

8. Exec into the head node and run the application. For example:
```
$ oc exec -it glue-cluster-head-cjgzk -- /bin/bash
(base) 1000650000@glue-cluster-head-cjgzk:~/glue$ nohup ./glue_benchmark -b {bucket-name} -m roberta-base -t WNLI 2>&1 &
```
This will evaluate the roberta-base model against the WNLI task with 10 different seeds.

9. Monitor the progress using nohup.out. The evaluation results will be in /tmp/summary. (Optional monitoring commands are sketched below.)
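These monitoring commands are not part of the original steps; they are a sketch that assumes nohup.out is written to the working directory (/home/ray/glue) and uses the dashboard port (8265) exposed in glue-cluster.yaml:
```
$ oc exec glue-cluster-head-XXXXX -- tail -f /home/ray/glue/nohup.out
$ oc port-forward glue-cluster-head-XXXXX 8265:8265   # then open http://localhost:8265
```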

10. When finished, clean up the active resources in your project:
```
$ oc delete -f glue-cluster.yaml
$ oc delete -f glue-operator.yaml
```

Diff for: cluster_crd.yaml

+4,321
Large diffs are not rendered by default.

Diff for: glue-cluster.yaml

+169

@@ -0,0 +1,169 @@
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: glue-cluster
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: 10
  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes then autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: 0.0
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 99999
  # Specify the pod type for the ray head node (as configured below).
  headPodType: head-node
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  - name: head-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 0
    rayResources: {"GPU": 0}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: head-
      spec:
        restartPolicy: Never
        imagePullSecrets:
        - name: artifactory-codeflare-cred

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which can cause slowdowns if it is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          image: res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
          env:
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_ACCESS_KEY_ID
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_SECRET_ACCESS_KEY
          - name: ENDPOINT_URL
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: ENDPOINT_URL
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
          ports:
          - containerPort: 6379  # Redis port
          - containerPort: 10001 # Used by Ray Client
          - containerPort: 8265  # Used by Ray Dashboard

          # This volume mount provides the shared memory that Ray uses for its
          # plasma object store. If you do not provide this, Ray will fall back
          # to /tmp, which can cause slowdowns if it is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: "2"
              memory: "32G"
              ephemeral-storage: "60G"
              nvidia.com/gpu: "0"
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: "2"
              memory: "32G"
              ephemeral-storage: "60G"
              nvidia.com/gpu: "0"
  - name: worker-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 8
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 8
    # User-specified custom resources for use by Ray.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    rayResources: {"foo": 1, "bar": 0}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: worker-
      spec:
        restartPolicy: Never
        imagePullSecrets:
        - name: artifactory-codeflare-cred
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
          env:
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_ACCESS_KEY_ID
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_SECRET_ACCESS_KEY
          - name: ENDPOINT_URL
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: ENDPOINT_URL
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; sleep infinity & wait;"]
          # This volume mount provides the shared memory that Ray uses for its
          # plasma object store. If you do not provide this, Ray will fall back
          # to /tmp, which can cause slowdowns if it is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: "8"
              memory: "16G"
              nvidia.com/gpu: "1"
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: "8"
              memory: "16G"
              nvidia.com/gpu: "1"
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
  headStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
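Not part of the commit itself: after applying the manifest above (README step 6), a quick sanity check that the cluster resource exists and its head and worker pods are coming up might be:
```
$ oc get rayclusters
$ oc get pods | grep glue-cluster
```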

Diff for: glue-operator.yaml

+49

@@ -0,0 +1,49 @@
operator_role:
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ray-operator-serviceaccount
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: ray-operator-role
rules:
- apiGroups: ["", "cluster.ray.io"]
  resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"]
  verbs: ["get", "watch", "list", "create", "delete", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ray-operator-rolebinding
subjects:
- kind: ServiceAccount
  name: ray-operator-serviceaccount
roleRef:
  kind: Role
  name: ray-operator-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: Pod
metadata:
  name: ray-operator-pod
spec:
  serviceAccountName: ray-operator-serviceaccount
  containers:
  - name: ray
    imagePullPolicy: Always
    image: rayproject/ray:1.9.0-py37
    command: ["ray-operator"]
    env:
    - name: RAY_OPERATOR_POD_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
    resources:
      requests:
        cpu: 1
        memory: 1Gi
      limits:
        memory: 2Gi
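Also not in the original instructions: a sketch of how one might confirm the operator pod defined above is running after README step 5 (the pod name comes from the manifest):
```
$ oc get pod ray-operator-pod
$ oc logs ray-operator-pod --tail=20
```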
