# volcano-gpu-pytorch-multi-worker.yaml
# Volcano example for distributed (multi-worker) PyTorch training
# Based on:
# https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html
# https://github.com/yangkky/distributed_tutorial/blob/master/src/mnist-distributed.py
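#
# The worker command below only exports MASTER_ADDR/MASTER_PORT; the actual
# rendezvous happens inside mnist-distributed.py. As a rough sketch (an
# assumption about the tutorial script, not a copy of it), each worker is
# expected to initialize torch.distributed along these lines:
#
#   import torch.distributed as dist
#   rank = node_rank * gpus_per_node + local_rank   # global rank of this process
#   dist.init_process_group(
#       backend="nccl",        # NCCL backend for NVIDIA GPUs
#       init_method="env://",  # reads MASTER_ADDR and MASTER_PORT from the env
#       world_size=3,          # must match replicas and the -n 3 flag below
#       rank=rank,
#   )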
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: pytorch-multi-worker
  namespace: dmrub
spec:
  # Gang scheduling: all 3 workers must be schedulable before any pod starts.
  minAvailable: 3
  schedulerName: volcano
  queue: default
  plugins:
    env: []  # injects VK_TASK_INDEX into each pod
    svc: []  # writes /etc/volcano/<task>.host with the hostnames of all replicas
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 3
      name: worker
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: pytorch
              image: nvcr.io/nvidia/pytorch:21.12-py3
              command:
                - sh
                - -c
                - |
                  # The first host in worker.host (worker-0) acts as the rendezvous master.
                  export MASTER_ADDR=$(head -n1 /etc/volcano/worker.host)
                  export MASTER_PORT=8888
                  echo "Starting ..."
                  python3 mnist-distributed.py -n 3 -g 1 -nr ${VK_TASK_INDEX} --epochs 10
                  ERR=$?
                  echo "Exit code: $ERR"
                  # 134 = 128 + SIGABRT: keep the pod alive so it can be inspected.
                  if [ $ERR -eq 134 ]; then
                    ls -lah
                    echo "Process PID: $$"
                    while true; do sleep 10; done
                  fi
                  exit $ERR
              ports:
                - containerPort: 8888
                  name: pytorch-port
              resources:
                limits:
                  nvidia.com/gpu: "1"
              workingDir: /data/volcano-k8s-examples/pytorch/src
              volumeMounts:
                - mountPath: /data
                  name: data-volume
          volumes:
            - name: data-volume
              persistentVolumeClaim:
                claimName: volcano-workspace
          restartPolicy: OnFailure
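#
# Usage sketch (assumes the Volcano scheduler is installed and the
# "volcano-workspace" PVC already exists in the "dmrub" namespace):
#
#   kubectl apply -f volcano-gpu-pytorch-multi-worker.yaml
#   kubectl -n dmrub get vcjob pytorch-multi-worker      # job status
#   kubectl -n dmrub get pods                            # one pod per replica
#   kubectl -n dmrub logs pytorch-multi-worker-worker-0 -f
#
# Pod names follow Volcano's <job>-<task>-<index> convention, so worker-0 is
# the pod whose hostname appears first in /etc/volcano/worker.host and thus
# serves as MASTER_ADDR for the other workers.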