-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
60 lines (60 loc) · 2.3 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Regular expression with a single capture group: the digits that follow "Job Id:".
# NOTE(review): presumably applied to the output of `cmd` to extract the scheduler job id - confirm.
job_id_regexp: "Job Id:(\\d+)"
# Submission command template; {sbatch_script} is a placeholder
# (a `sbatch_script` value is defined for each entry under `mode`).
cmd: "sbatch {sbatch_script}"
# NOTE(review): presumably the interval, in seconds, between job status checks (600 s = 10 min) - confirm.
check_interval_secs: 600
# Experiment grid: each entry pairs a group of models with the training
# budgets to run for that group.
# Numeric notes:
#   - lr values such as 5e-4 have no decimal point; some YAML parsers load
#     them as floats, others as strings.
#   - 128_000 / 1_280_000 use underscore digit separators, which YAML 1.1
#     loaders read as integers.
experiments:
  - small:
      model_scale:
        model: [ViT-S-32, ViT-M-32, ViT-B-32]
      samples_seen_scale:
        # Label matches train_num_samples x epochs (128_000 x 10).
        - 1.28M:
            lr: [5e-4, 1e-3]
            batch_size: 1024
            nodes: 16
            train_num_samples: 128_000
            epochs: 10
        # Label matches train_num_samples x epochs (1_280_000 x 10).
        - 12.8M:
            lr: [5e-4, 1e-3]
            batch_size: 1024
            nodes: 16
            train_num_samples: 1_280_000
            epochs: 10
  - medium:
      model_scale:
        model: [ViT-L-14]
      samples_seen_scale:
        # Same budgets as the `small` group, with batch_size 512 instead of 1024.
        - 1.28M:
            lr: [5e-4, 1e-3]
            batch_size: 512
            nodes: 16
            train_num_samples: 128_000
            epochs: 10
        - 12.8M:
            lr: [5e-4, 1e-3]
            batch_size: 512
            nodes: 16
            train_num_samples: 1_280_000
            epochs: 10
# Job modes: each entry names an sbatch template, the generated script path,
# the log file, and the shell commands that gate start and termination.
mode:
  - train:
      template: train.sbatch
      sbatch_script: "sbatch_scripts/{name}.sbatch"
      output_file: "{logs}/{name}/slurm.out"
      # NOTE(review): every `experiments` entry also sets `nodes: 16`;
      # confirm which value applies to training jobs.
      nodes: 24
      # Terminate training once the last epoch is detected as finished,
      # e.g. if the number of epochs is 100 and the log contains
      # "Train Epoch: 99 ... 100%", the command prints 1, thus terminating the job.
      termination_cmd: 'let last={epochs}-1;ne=`grep "Train Epoch: $last.*100%" {output_file}|wc -l`;echo $(( (ne) >= 1 ))'
  - eval:
      template: eval.sbatch
      sbatch_script: "sbatch_scripts/{name}_eval.sbatch"
      output_file: "{logs}/{name}/slurm_eval.out"
      nodes: 1
      # Evals have a start condition: they are only launched when the number of
      # checkpoints (*.pt) is greater than the number of evaluations
      # (imagenet1k*.json result files).
      start_condition_cmd: "nc=`ls {logs}/{name}/checkpoints/*.pt|wc -l`;ne=`ls {logs}/{name}/checkpoints/imagenet1k*.json|wc -l`;echo $(( (nc-ne) > 0 ))"
      # Evals terminate only when the number of result files equals {epochs}+1.
      # NOTE(review): that is one more than the number of epochs - confirm the
      # extra result file is expected.
      termination_cmd: "ne=`ls {logs}/{name}/checkpoints/imagenet1k*.json|wc -l`;echo $(( (ne) == {epochs}+1 ))"
# Datasets available to the experiments; each entry carries its own settings.
dataset:
  - datacomp:
      # Tar shard pattern written with a brace range ({0000000..0139827}).
      train_data: "/path/{0000000..0139827}.tar"
# NOTE(review): presumably the value substituted for the {logs} placeholder
# used in the `output_file` and checkpoint paths under `mode` - confirm.
logs: "logs"
# Template for the {name} placeholder, combining the dataset, model,
# samples_seen_scale, lr and batch_size placeholders.
name: "{dataset}_{model}_{samples_seen_scale}_lr{lr}_bs{batch_size}"