forked from soda-inria/tabicl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_stage2.sh
More file actions
124 lines (118 loc) · 4.03 KB
/
train_stage2.sh
File metadata and controls
124 lines (118 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Train TabICL for the second stage of the curriculum learning.
#
# Stage 2 resumes from the Stage 1 checkpoint (--checkpoint_path) and trains
# with longer sequences (up to 40k rows) at a reduced learning rate.
#
# NOTE(review): this file shows two workflows — (1) generating prior datasets
# on the fly during training, and (2) pre-generating priors to disk and
# loading them. As written they all run sequentially; they look like
# alternatives, so comment out the one you do not want — confirm intent.
#
# Replace the /path/to/... and /my/... placeholders before running.
set -euo pipefail

# ----------------------------------
# Generate prior datasets on the fly
# ----------------------------------
torchrun --standalone --nproc_per_node=1 /path/to/tabicl/train/run.py \
  --wandb_log True \
  --wandb_project TabICL \
  --wandb_name Stage2 \
  --wandb_dir /my/wandb/dir \
  --wandb_mode online \
  --device cuda \
  --dtype float32 \
  --np_seed 42 \
  --torch_seed 42 \
  --max_steps 2000 \
  --batch_size 512 \
  --micro_batch_size 1 \
  --lr 2e-5 \
  --scheduler polynomial_decay_warmup \
  --warmup_proportion 0 \
  --poly_decay_lr_end 5e-6 \
  --poly_decay_power 2.0 \
  --gradient_clipping 1.0 \
  --prior_type mix_scm \
  --prior_device cpu \
  --batch_size_per_gp 2 \
  --min_features 2 \
  --max_features 100 \
  --max_classes 10 \
  --min_seq_len 1000 \
  --max_seq_len 40000 \
  --log_seq_len True \
  --seq_len_per_gp True \
  --min_train_size 0.5 \
  --max_train_size 0.9 \
  --embed_dim 128 \
  --col_num_blocks 3 \
  --col_nhead 4 \
  --col_num_inds 128 \
  --row_num_blocks 3 \
  --row_nhead 8 \
  --row_num_cls 4 \
  --row_rope_base 100000 \
  --icl_num_blocks 12 \
  --icl_nhead 4 \
  --ff_factor 2 \
  --norm_first True \
  --checkpoint_dir /my/stage2/checkpoint/dir \
  --checkpoint_path /my/stage1/checkpoint/dir/step-{latest}.ckpt \
  --save_temp_every 5 \
  --save_perm_every 100 \
  --only_load_model True
# ------------------------------------------------------
# Save prior datasets to disk and load them for training
# ------------------------------------------------------
# Step 1: pre-generate 2000 batches of mix_scm prior data on CPU and write
# them under /my/stage2/prior/dir. Arguments are collected in an array so
# the invocation stays on one readable line.
genload_args=(
  --save_dir /my/stage2/prior/dir
  --np_seed 42
  --torch_seed 42
  --num_batches 2000
  --resume_from 0
  --batch_size 512
  --batch_size_per_gp 2
  --prior_type mix_scm
  --min_features 2
  --max_features 100
  --max_classes 10
  --min_seq_len 1000
  --max_seq_len 40000
  --log_seq_len True
  --seq_len_per_gp True
  --min_train_size 0.5
  --max_train_size 0.9
  --n_jobs -1
  --num_threads_per_generate 1
  --device cpu
)
python /path/to/tabicl/prior/genload.py "${genload_args[@]}"
# Loading from disk and training
# Step 2: resume from the Stage 1 checkpoint and train against the priors
# previously saved to /my/stage2/prior/dir. Options are grouped in an array
# by concern (logging, optimization, prior loading, architecture,
# checkpointing) for readability; the resulting argv is unchanged.
train_args=(
  # Weights & Biases logging
  --wandb_log True
  --wandb_project TabICL
  --wandb_name Stage2
  --wandb_dir /my/wandb/dir
  --wandb_mode online
  # Runtime and seeds
  --device cuda
  --dtype float32
  --np_seed 42
  --torch_seed 42
  # Optimization schedule
  --max_steps 2000
  --batch_size 512
  --micro_batch_size 1
  --lr 2e-5
  --scheduler polynomial_decay_warmup
  --warmup_proportion 0
  --poly_decay_lr_end 5e-6
  --poly_decay_power 2.0
  --gradient_clipping 1.0
  # Prior data: read pre-generated batches from disk
  --prior_dir /my/stage2/prior/dir
  --load_prior_start 0
  --delete_after_load False
  --prior_device cpu
  # Model architecture
  --embed_dim 128
  --col_num_blocks 3
  --col_nhead 4
  --col_num_inds 128
  --row_num_blocks 3
  --row_nhead 8
  --row_num_cls 4
  --row_rope_base 100000
  --icl_num_blocks 12
  --icl_nhead 4
  --ff_factor 2
  --norm_first True
  # Checkpointing: warm-start model weights from Stage 1
  --checkpoint_dir /my/stage2/checkpoint/dir
  --checkpoint_path "/my/stage1/checkpoint/dir/step-{latest}.ckpt"
  --save_temp_every 5
  --save_perm_every 100
  --only_load_model True
)
torchrun --standalone --nproc_per_node=1 /path/to/tabicl/train/run.py "${train_args[@]}"