#!/bin/bash
#SBATCH -N 1
#SBATCH --job-name=trec
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=15G
#SBATCH -o logs/print_generated_train_v2_with_gold_mlm_bm25_hn
#SBATCH -e logs/error_generated_train_v2_with_gold_mlm_bm25_hn
#SBATCH --partition=gpu
#SBATCH --gres=gpu:tesla-smx2:1
#SBATCH --cpus-per-task=10
module load anaconda/3.6
module load cuda/10.0.130
module load gnu/5.4.0
module load mvapich2
source activate trec_ct
export WANDB_PROJECT=TREC_CT_2023
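# Optional sanity check: confirm the allocated GPU is visible inside the job
# (assumes nvidia-smi is available on the node).
# nvidia-smi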
# train with BM25 hard negatives (train stage 1)
RUN=generated_train_v2_with_gold_mlm_bm25_hn
model=/scratch/itee/uqszhuan/TREC_CT_2023/models_pretrain/PubMedBERT_CT_MLM
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
  --output_dir models/${RUN} \
  --model_name_or_path ${model} \
  --save_steps 1000 \
  --train_dir data/tokenized/${tokenizer}/generated_train_v2_with_gold_bm25_hn \
  --fp16 \
  --per_device_train_batch_size 8 \
  --train_n_passages 4 \
  --learning_rate 5e-6 \
  --q_max_len 256 \
  --p_max_len 512 \
  --num_train_epochs 4 \
  --overwrite_output_dir \
  --cache_dir cache \
  --run_name ${RUN} \
  --report_to wandb \
  --logging_steps 100
# asynchronously evaluate saved checkpoints on the TREC CT 2022 topics
python -m asyncval \
  --query_file data/tokenized/${tokenizer}/queries/ct_2022_queries.json \
  --candidate_dir data/tokenized/${tokenizer}/corpus/corpus_t_e_s_d \
  --ckpts_dir models/${RUN} \
  --tokenizer_name_or_path ${tokenizer} \
  --qrel_file data/qrels/ct_2022_qrels.txt \
  --metrics 'RR(rel=2)' 'nDCG@10' 'P(rel=2)@10' 'Rprec(rel=2)' 'R(rel=2)@1000' \
  --output_dir asyncval_runs/${RUN} \
  --report_to wandb \
  --depth 1000 \
  --per_device_eval_batch_size 128 \
  --q_max_len 256 \
  --p_max_len 512 \
  --write_run trec \
  --write_embeddings True \
  --fp16 \
  --cache_dir cache
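# asyncval logs the metrics above for every saved checkpoint; set best_ckpt
# below to the step with the strongest nDCG@10. The per-checkpoint embeddings
# it writes are reused as the faiss index for hard-negative mining, e.g.:
# ls asyncval_runs/${RUN}/ | grep embeddings_checkpoint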
# hard negative mining: retrieve over the training queries with the stage-1 model
best_ckpt=3000  # best-performing step according to the asyncval evaluation above
python -m pyserini.search.faiss \
  --index asyncval_runs/${RUN}/embeddings_checkpoint-${best_ckpt} \
  --topics data/queries/generated_train_v2_with_gold_queries.tsv \
  --encoder models/${RUN}/checkpoint-${best_ckpt} \
  --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext \
  --encoder-class 'auto' \
  --output runs/generated_train_v2_with_gold.DR_hn.txt \
  --output-format msmarco \
  --device cuda:0 \
  --batch-size 128 --threads 12
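# Optional sanity check: the msmarco-format run is one "qid<TAB>docid<TAB>rank"
# triple per line; spot-check it and the per-query depth before building the
# stage-2 training data.
# head -n 3 runs/generated_train_v2_with_gold.DR_hn.txt
# cut -f1 runs/generated_train_v2_with_gold.DR_hn.txt | sort | uniq -c | sort -n | head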
# build stage-2 training data from the mined dense-retrieval hard negatives
cd data
train_data=generated_train_v2_with_gold
python3 build_train_hn.py \
  --tokenizer_name ${tokenizer} \
  --hn_file ../runs/generated_train_v2_with_gold.DR_hn.txt \
  --qrels qrels/${train_data}_qrels.txt \
  --queries queries/${train_data}_queries.tsv \
  --collection corpus_t_e_s_d/ct_corpus.jsonl \
  --save_to tokenized/${tokenizer}/${train_data}_DR_hn \
  --truncate 512 \
  --mp_chunk_size 10
cd ..
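# Optional sanity check: confirm the tokenized stage-2 shards were written
# before launching the second training run.
# ls data/tokenized/${tokenizer}/${train_data}_DR_hn | head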
# train with DR hard negatives (train stage 2)
RUN=generated_train_v2_with_gold_mlm_DR_hn
model=/scratch/itee/uqszhuan/TREC_CT_2023/models_pretrain/PubMedBERT_CT_MLM
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
  --output_dir models/${RUN} \
  --model_name_or_path ${model} \
  --save_steps 1000 \
  --train_dir data/tokenized/${tokenizer}/generated_train_v2_with_gold_DR_hn \
  --fp16 \
  --per_device_train_batch_size 8 \
  --train_n_passages 4 \
  --learning_rate 5e-6 \
  --q_max_len 256 \
  --p_max_len 512 \
  --num_train_epochs 4 \
  --overwrite_output_dir \
  --cache_dir cache \
  --run_name ${RUN} \
  --report_to wandb \
  --logging_steps 100
# asynchronously evaluate stage-2 checkpoints on the TREC CT 2022 topics
python -m asyncval \
  --query_file data/tokenized/${tokenizer}/queries/ct_2022_queries.json \
  --candidate_dir data/tokenized/${tokenizer}/corpus/corpus_t_e_s_d \
  --ckpts_dir models/${RUN} \
  --tokenizer_name_or_path ${tokenizer} \
  --qrel_file data/qrels/ct_2022_qrels.txt \
  --metrics 'RR(rel=2)' 'nDCG@10' 'P(rel=2)@10' 'Rprec(rel=2)' 'R(rel=2)@1000' \
  --output_dir asyncval_runs/${RUN} \
  --report_to wandb \
  --depth 1000 \
  --per_device_eval_batch_size 128 \
  --q_max_len 256 \
  --p_max_len 512 \
  --write_run trec \
  --write_embeddings True \
  --fp16 \
  --cache_dir cache
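# Possible final step (a sketch, not part of the original pipeline): asyncval
# writes trec-format runs per checkpoint (--write_run trec), so the run of the
# best stage-2 checkpoint can be re-scored offline with trec_eval, e.g. via
# pyserini's wrapper. <checkpoint_run> is a placeholder for the file asyncval
# actually produced.
# python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m recip_rank \
#   data/qrels/ct_2022_qrels.txt asyncval_runs/${RUN}/<checkpoint_run>.txt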