-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.sbatch
43 lines (43 loc) · 1.25 KB
/
train.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash -x
# Slurm batch template: launches src/training/main.py through srun with four
# tasks and four GPUs per node. The -x on the shebang makes bash trace every
# command into the job output.
#
# Fields written in curly braces are placeholders, presumably substituted by an
# external templating step before the file is submitted (TODO confirm). Avoid
# adding any other curly braces to this file for that reason.
#SBATCH --account=laionize
#SBATCH --nodes={nodes}
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=24
#SBATCH --time=06:00:00
#SBATCH --partition=booster
#SBATCH --output={output_file}

echo "Job Id:$SLURM_JOB_ID"

# Reset loaded modules before activating the Python environment.
ml purge

# Settings for the Hugging Face transformers library: a cache directory named
# "cache" (relative to the working directory) and offline mode.
export TRANSFORMERS_CACHE=cache
export TRANSFORMERS_OFFLINE=1

source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly

# One device index per GPU requested via --gres=gpu:4 above.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Rendezvous endpoint: the first host of the allocation plus a fixed port.
export MASTER_PORT=12802
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# The trailing "i" is appended to the host name on purpose.
# NOTE(review): presumably this selects the host alias of the high-speed
# interconnect interface; confirm against the site documentation.
export MASTER_ADDR="$master_addr"i
echo "MASTER_ADDR=$MASTER_ADDR"

# Make the packages under ./src importable by the training script.
export PYTHONPATH="$PYTHONPATH:$PWD/src"

# Every continued line ends with a space followed by a backslash. Bash removes
# backslash-newline entirely, so without that space (and with unindented
# continuation lines) two neighbouring arguments are fused into one word.
# NOTE(review): current srun documentation spells the first option --cpu-bind;
# the underscore form is kept here unchanged.
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
  --save-frequency 1 \
  --zeroshot-frequency 1 \
  --train-data="{train_data}" \
  --dataset-type webdataset \
  --train-num-samples={train_num_samples} \
  --warmup 2000 \
  --batch-size={batch_size} \
  --report-to=tensorboard \
  --epochs={epochs} \
  --workers=8 \
  --model {model} \
  --name {name} \
  --logs {logs} \
  --seed 0 \
  --ddp-static-graph \
  --local-loss \
  --gather-with-grad \
  --lr {lr} \
  --save-most-recent \
  --precision amp_bfloat16 \
  --grad-checkpoint \
  --grad-clip-norm 1 \
  --resume latest