Commit 2c37b26 (Initial version)
1 parent: c6b3ceb
7 files changed: +5014 -1 lines changed

Diff for: Dockerfile

+26

@@ -0,0 +1,26 @@
# docker build -f Dockerfile -t glue-runtime:ray-1.9.0 ./

FROM rayproject/ray:1.9.0-gpu

# make and cmake are required for installation
RUN sudo apt-get update && sudo apt-get install -y \
    build-essential \
    && sudo rm -rf /var/lib/apt/lists/* \
    && sudo apt-get clean

RUN mkdir /home/ray/glue
WORKDIR /home/ray/glue

# install requirements.txt
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt

# install torch
RUN pip install --no-cache-dir torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html

# install boto3
RUN pip install --no-cache-dir boto3

# change group permissions for running in OCP
RUN sudo chgrp 0 /home/ray/glue
RUN sudo chmod g+w /home/ray/glue
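The Dockerfile's header comment gives the local build command; pushing the result to the registry that glue-cluster.yaml pulls from is not covered by this commit. A minimal sketch, assuming you are already logged in to that Artifactory registry and want to publish under the tag the cluster manifest references:
```
$ docker build -f Dockerfile -t glue-runtime:ray-1.9.0 ./
$ docker tag glue-runtime:ray-1.9.0 res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
$ docker push res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
```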

Diff for: README.md

+59 -1

@@ -1 +1,59 @@
# codeflare-transfer-learn

# Running glue_benchmark on OpenShift (OCP)
Assumes:
* A copy of this repository is installed and you have a command line open in this directory
* You have the OpenShift CLI installed (instructions are available in the IBM Cloud and OpenShift web consoles if not)
* You have the S3 credentials needed to access the glue datasets and the model to evaluate

1. Log into OCP using the oc login command from the OCP web console
   (go to the menu under IAM#<your username/email>, then "Copy Login Command").

2. Use `oc project` to confirm your namespace is as desired. If not:
```
$ oc project {your-namespace}
```

3. Starting from template-s3-creds.yaml, create a personal yaml secrets file with your namespace and S3 credentials (a sketch is shown after this step). Then register the secrets:
```
$ oc create -f {your-handle}-s3-creds.yaml
```
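template-s3-creds.yaml is not included in this diff, so the exact format of the secrets file is an assumption. Based on the secret name and keys that glue-cluster.yaml reads (glue-s3-creds with AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and ENDPOINT_URL), a minimal sketch might look like:
```
# Hypothetical {your-handle}-s3-creds.yaml; key names mirror the secretKeyRef
# entries in glue-cluster.yaml, and the values are placeholders.
apiVersion: v1
kind: Secret
metadata:
  name: glue-s3-creds
  namespace: {your-namespace}
type: Opaque
stringData:
  AWS_ACCESS_KEY_ID: <access-key-id>
  AWS_SECRET_ACCESS_KEY: <secret-access-key>
  ENDPOINT_URL: <s3-endpoint-url>
```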

4. [Required only once] Check whether the Ray CRD is installed; install it if not:
```
$ oc get crd | grep ray
```
If not there:
```
$ oc apply -f cluster_crd.yaml
```

5. Create a ray operator in your namespace:
```
$ oc apply -f glue-operator.yaml
```

6. Create a ray cluster in your namespace. Change the min and max number of workers as needed (around line 100; the relevant fields are excerpted after this step):
```
$ oc apply -f glue-cluster.yaml
```
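For reference, the worker count is controlled by the worker-node pod type in glue-cluster.yaml; as committed, the fields to edit are:
```
  - name: worker-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 8
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 8
```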

7. When the ray cluster head and worker pods are in the ready state, copy the application driver to the head node:
```
$ oc get po --watch
$ oc cp glue_benchmark.py glue-cluster-head-XXXXX:/home/ray/glue
```

8. Exec into the head node and run the application. For example:
```
$ oc exec -it glue-cluster-head-cjgzk -- /bin/bash
(base) 1000650000@glue-cluster-head-cjgzk:~/glue$ nohup ./glue_benchmark -b {bucket-name} -m roberta-base -t WNLI 2>&1 &
```
This will evaluate the roberta-base model against the WNLI task with 10 different seeds.

9. Monitor the progress using nohup.out. The evaluation results will be in /tmp/summary. (Optional monitoring commands are sketched below.)
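These monitoring commands are not part of the original steps; they are a sketch that assumes nohup.out is written to the working directory (/home/ray/glue) and uses the dashboard port (8265) exposed in glue-cluster.yaml:
```
$ oc exec glue-cluster-head-XXXXX -- tail -f /home/ray/glue/nohup.out
$ oc port-forward glue-cluster-head-XXXXX 8265:8265   # then open http://localhost:8265
```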

10. When finished, clean up the active resources in your project:
```
$ oc delete -f glue-cluster.yaml
$ oc delete -f glue-operator.yaml
```

Diff for: cluster_crd.yaml

+4,321
Large diffs are not rendered by default.

Diff for: glue-cluster.yaml

+169

@@ -0,0 +1,169 @@
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: glue-cluster
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: 10
  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes then autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: 0.0
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 99999
  # Specify the pod type for the ray head node (as configured below).
  headPodType: head-node
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  - name: head-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 0
    rayResources: {"GPU": 0}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: head-
      spec:
        restartPolicy: Never
        imagePullSecrets:
        - name: artifactory-codeflare-cred

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which can cause slowdowns if it is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          image: res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
          env:
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_ACCESS_KEY_ID
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_SECRET_ACCESS_KEY
          - name: ENDPOINT_URL
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: ENDPOINT_URL
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
          ports:
          - containerPort: 6379  # Redis port
          - containerPort: 10001 # Used by Ray Client
          - containerPort: 8265  # Used by Ray Dashboard

          # This volume mount provides the shared memory that Ray uses for its
          # plasma object store. If you do not provide this, Ray will fall back
          # to /tmp, which can cause slowdowns if it is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: "2"
              memory: "32G"
              ephemeral-storage: "60G"
              nvidia.com/gpu: "0"
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: "2"
              memory: "32G"
              ephemeral-storage: "60G"
              nvidia.com/gpu: "0"
  - name: worker-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 8
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 8
    # User-specified custom resources for use by Ray.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    rayResources: {"foo": 1, "bar": 0}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: worker-
      spec:
        restartPolicy: Never
        imagePullSecrets:
        - name: artifactory-codeflare-cred
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: res-wsched-team-ray-project-docker-local.artifactory.swg-devops.com/codeflare:glue_benchmark-ray1.9-s3
          env:
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_ACCESS_KEY_ID
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: AWS_SECRET_ACCESS_KEY
          - name: ENDPOINT_URL
            valueFrom:
              secretKeyRef:
                name: glue-s3-creds
                key: ENDPOINT_URL
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; sleep infinity & wait;"]
          # This volume mount provides the shared memory that Ray uses for its
          # plasma object store. If you do not provide this, Ray will fall back
          # to /tmp, which can cause slowdowns if it is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: "8"
              memory: "16G"
              nvidia.com/gpu: "1"
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: "8"
              memory: "16G"
              nvidia.com/gpu: "1"
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
  headStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
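Not part of the commit itself: after applying the manifest above (README step 6), a quick sanity check that the cluster resource exists and its head and worker pods are coming up might be:
```
$ oc get rayclusters
$ oc get pods | grep glue-cluster
```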

Diff for: glue-operator.yaml

+49

@@ -0,0 +1,49 @@
operator_role:
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ray-operator-serviceaccount
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: ray-operator-role
rules:
- apiGroups: ["", "cluster.ray.io"]
  resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"]
  verbs: ["get", "watch", "list", "create", "delete", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ray-operator-rolebinding
subjects:
- kind: ServiceAccount
  name: ray-operator-serviceaccount
roleRef:
  kind: Role
  name: ray-operator-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: Pod
metadata:
  name: ray-operator-pod
spec:
  serviceAccountName: ray-operator-serviceaccount
  containers:
  - name: ray
    imagePullPolicy: Always
    image: rayproject/ray:1.9.0-py37
    command: ["ray-operator"]
    env:
    - name: RAY_OPERATOR_POD_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
    resources:
      requests:
        cpu: 1
        memory: 1Gi
      limits:
        memory: 2Gi
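Also not in the original instructions: a sketch of how one might confirm the operator pod defined above is running after README step 5 (the pod name comes from the manifest):
```
$ oc get pod ray-operator-pod
$ oc logs ray-operator-pod --tail=20
```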
