-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathrun_distributed.sh
executable file
·38 lines (33 loc) · 1.71 KB
/
run_distributed.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
# Launch distributed training: one PyTorch process per GPU, plus TensorBoard.
# Configure data location and hyperparameters via the exports below, then run
# this script directly on the node that owns the GPUs.
#
# NOTE: -u is intentionally NOT set — the optional config variables below
# (CODEC, CRF, KEYINT, IS_CROPPED, TIMING, FP16, BENCHMARK) rely on being
# unset/empty to drop out of the path and command line.
set -eo pipefail

###################
# TRAINING CONFIG #
###################
export DATA_DIR=/path/to/data/dir
export RESOLUTION=540p # options: 540p, 720p, 1080p, 4K
export LOADER="NVVL" # options: "NVVL" or "pytorch"
export DATA_TYPE=scenes # options: "scenes" or "frames"
#export CODEC="h264" #
#export CRF="18" # set these three only if used during preprocessing
#export KEYINT="4" #
# ${CRF+crf$CRF} expands to "crf$CRF" only when CRF is set; unset options
# simply collapse out of the directory path (extra '/' are harmless).
export ROOT=$DATA_DIR/$RESOLUTION/$DATA_TYPE/$CODEC/${CRF+crf$CRF}/${KEYINT+keyint$KEYINT}/
#export IS_CROPPED="--is_cropped" # Uncomment to crop input images
export CROP_SIZE="-1 -1" # two words (height width) — word-split intentionally
#export CROP_SIZE="540 960" # Only applicable if --is_cropped uncommented
#export TIMING="--timing" # Uncomment to time data loading and computation - slower
#export FP16="--fp16" # Uncomment to load data and train model in fp16
export MINLR=0.0001
export MAXLR=0.001
export BATCHSIZE=2
export FRAMES=3
export MAX_ITER=1000000
export WS=8 # Number of GPUs available
export BASE_RANK=0 # Device ID of first GPU (assumes GPUs numbered sequentially)
export IP=localhost

# TensorBoard runs for the lifetime of the training session; errors (e.g. not
# installed) are deliberately suppressed — training does not depend on it.
tensorboard --logdir runs 2> /dev/null &
echo "Tensorboard launched"

# Launch one PyTorch distributed process per GPU: ranks WS-1 .. 1 in the
# background, rank 0 in the foreground.
# $BENCHMARK, $IS_CROPPED, $FP16, $TIMING and $CROP_SIZE are intentionally
# unquoted: empty values must vanish and multi-word values must split into
# separate arguments.
# NOTE(review): $BENCHMARK is referenced but never defined in this script —
# presumably an optional "--benchmark" flag exported by the caller; confirm.
pids=()
for (( i = 0; i <= WS - 2; i++ )); do
    export RANK=$(( WS - i - 1 ))
    python main.py --loader "$LOADER" --rank $(( BASE_RANK + RANK )) \
        --batchsize $BATCHSIZE --frames $FRAMES --root "$ROOT" \
        --world_size $WS --ip "$IP" $BENCHMARK $IS_CROPPED $FP16 \
        --max_iter $MAX_ITER --min_lr $MINLR --max_lr $MAXLR \
        $TIMING --crop_size $CROP_SIZE &
    pids+=($!)
done
python main.py --loader "$LOADER" --rank $(( BASE_RANK )) \
    --batchsize $BATCHSIZE --frames $FRAMES --root "$ROOT" \
    --world_size $WS --ip "$IP" $BENCHMARK $IS_CROPPED $FP16 \
    --max_iter $MAX_ITER --min_lr $MINLR --max_lr $MAXLR \
    $TIMING --crop_size $CROP_SIZE

# Reap only the background training ranks (a bare `wait` would block forever
# on TensorBoard); propagates a non-zero status if any rank failed.
wait "${pids[@]}"