Description
I'm running multi-node training of ResNet50 with TorchX, the codeflare-sdk, and MCAD on OCP 4.12.
I have a 3-node OCP 4.12 cluster; each node has one NVIDIA GPU.
Each of the 3 worker nodes has one local 2.9 TB NVMe drive with an associated PV and PVC.
[root@e23-h21-740xd ResNet]# oc get pv | grep 2980Gi
local-pv-289604ff   2980Gi   RWO   Delete   Bound   default/dianes-amazing-pvc2   local-sc   18h
local-pv-8006a340   2980Gi   RWO   Delete   Bound   default/dianes-amazing-pvc1   local-sc   4d1h
local-pv-86bac87f   2980Gi   RWO   Delete   Bound   default/dianes-amazing-pvc0   local-sc   21h

[root@e23-h21-740xd ResNet]# oc get pvc
NAME                  STATUS   VOLUME              CAPACITY   ACCESS MODES   STORAGECLASS   AGE
dianes-amazing-pvc0   Bound    local-pv-86bac87f   2980Gi     RWO            local-sc       20h
dianes-amazing-pvc1   Bound    local-pv-8006a340   2980Gi     RWO            local-sc       20h
dianes-amazing-pvc2   Bound    local-pv-289604ff   2980Gi     RWO            local-sc       18h
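Each of these local PVs is pinned, via node affinity, to the node that physically owns the NVMe drive. For reference, here is a quick way to confirm which node each local PV is bound to. This is a minimal sketch using the Python kubernetes client, assuming a working kubeconfig and that the local PVs carry a kubernetes.io/hostname node-affinity term (as the local storage operator normally sets):

# Minimal sketch (assumptions: kubernetes Python client installed, kubeconfig
# pointing at the cluster, and local PVs using a kubernetes.io/hostname
# node-affinity term). Prints PV name, bound PVC, and the pinned node(s).
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

for pv in v1.list_persistent_volume().items:
    if pv.spec.storage_class_name != "local-sc" or not pv.spec.node_affinity:
        continue
    nodes = [
        value
        for term in pv.spec.node_affinity.required.node_selector_terms
        for expr in (term.match_expressions or [])
        if expr.key == "kubernetes.io/hostname"
        for value in expr.values
    ]
    print(pv.metadata.name, pv.spec.claim_ref.name if pv.spec.claim_ref else "-", nodes)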
When I run the following Python script ("python3 python-multi-node-pvc.py"):
[ ResNet]# cat python-multi-node-pvc.py
# Import pieces from codeflare-sdk
from codeflare_sdk.job.jobs import DDPJobDefinition
arg_list = [
    "--train-dir=/init/tiny-imagenet-200/train",
    "--val-dir=/init/tiny-imagenet-200/val",
    "--log-dir=/init/tiny-imagenet-200",
    "--checkpoint-format=/init/checkpoint-{epoch}.pth.tar"
]
jobdef = DDPJobDefinition(
    name="resnet50",
    script="pytorch/pytorch_imagenet_resnet50.py",
    script_args=arg_list,
    scheduler_args={"namespace": "default"},
    j="3x1",
    gpu=1,
    cpu=4,
    memMB=24000,
    image="quay.io/dfeddema/horovod",
    mounts=[
        ['type=volume', 'src=dianes-amazing-pvc0', 'dst="/init"'],
        ['type=volume', 'src=dianes-amazing-pvc1', 'dst="/init"'],
        ['type=volume', 'src=dianes-amazing-pvc2', 'dst="/init"']
    ]
)
job = jobdef.submit()
I get the following error:
AttributeError: 'list' object has no attribute 'partition'
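My best guess at the cause (a hedged sketch, not a confirmed root cause): the SDK appears to hand `mounts` through to TorchX, whose `parse_mounts()` expects a flat list of "key=value" strings and calls `str.partition("=")` on each element, so a list of lists ends up calling `.partition` on a list. A flat form like the one below parses, although it would then attach all three PVCs to every replica, which doesn't solve my problem:

# Hedged sketch -- assumes DDPJobDefinition forwards `mounts` to TorchX's
# torchx.specs.parse_mounts(), which expects a flat list of "key=value"
# strings and calls str.partition("=") on each one. A nested list of lists
# then raises: AttributeError: 'list' object has no attribute 'partition'
from torchx.specs import parse_mounts

flat_mounts = [
    "type=volume", "src=dianes-amazing-pvc0", "dst=/init",
    "type=volume", "src=dianes-amazing-pvc1", "dst=/init",
    "type=volume", "src=dianes-amazing-pvc2", "dst=/init",
]
print(parse_mounts(flat_mounts))  # -> three VolumeMount entries

# The nested form from the script above fails because each element is a list,
# not a string:
# parse_mounts([["type=volume", "src=dianes-amazing-pvc0", 'dst="/init"']])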
If I specify only one of the local NVMe drives, as shown below:
[ ResNet]# cat test_resnet3_single_PVC.py
# Import pieces from codeflare-sdk
from codeflare_sdk.job.jobs import DDPJobDefinition
arg_list = [
    "--train-dir=/init/tiny-imagenet-200/train",
    "--val-dir=/init/tiny-imagenet-200/val",
    "--log-dir=/init/tiny-imagenet-200",
    "--checkpoint-format=/init/checkpoint-{epoch}.pth.tar"
]
jobdef = DDPJobDefinition(
    name="resnet50",
    script="pytorch/pytorch_imagenet_resnet50.py",
    script_args=arg_list,
    scheduler_args={"namespace": "default"},
    j="3x1",
    gpu=1,
    cpu=4,
    memMB=24000,
    image="quay.io/dfeddema/horovod",
    mounts=['type=volume', 'src=dianes-amazing-pvc0', 'dst="/init"']
)
job = jobdef.submit()
Then one pod starts up successfully and is assigned its local volume. The other two pods are not scheduled because
"0/3 nodes are available: 1 Insufficient nvidia.com/gpu, 2 nodes(s) had volume node affinity conflict. preemption 0/3 are available... etc etc"
So you can see that as expected the 2nd and 3rd pods could not be scheduled because "2 nodes(s) had volume node affinity conflict".
I need a way to specify the local NVMe drive (and its associated PVC) for each node in my cluster; the training data resides on these local NVMe drives.
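For illustration, the closest I can get today (an untested sketch that reuses only the parameters from the scripts above; the job names are made up) is to submit one single-replica job per PVC so each pod lands on the node owning its local volume, but that gives up multi-node DDP entirely:

# Untested sketch, not a real fix: one single-replica DDPJobDefinition per
# PVC, so each pod is scheduled on the node that owns its local volume.
# This loses multi-node DDP across the three GPUs, which is the whole point.
from codeflare_sdk.job.jobs import DDPJobDefinition

arg_list = [
    "--train-dir=/init/tiny-imagenet-200/train",
    "--val-dir=/init/tiny-imagenet-200/val",
    "--log-dir=/init/tiny-imagenet-200",
    "--checkpoint-format=/init/checkpoint-{epoch}.pth.tar",
]

for i, pvc in enumerate(["dianes-amazing-pvc0", "dianes-amazing-pvc1", "dianes-amazing-pvc2"]):
    DDPJobDefinition(
        name=f"resnet50-node{i}",       # hypothetical per-node job name
        script="pytorch/pytorch_imagenet_resnet50.py",
        script_args=arg_list,
        scheduler_args={"namespace": "default"},
        j="1x1",                        # one replica per job
        gpu=1,
        cpu=4,
        memMB=24000,
        image="quay.io/dfeddema/horovod",
        mounts=["type=volume", f"src={pvc}", "dst=/init"],
    ).submit()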