#!/bin/bash
#SBATCH -N 1
#SBATCH --job-name=trec
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=15G
#SBATCH -o logs/print_generated_train_reranker
#SBATCH -e logs/error_generated_train_reranker
#SBATCH --partition=gpu
#SBATCH --gres=gpu:tesla-smx2:1
#SBATCH --cpus-per-task=10
module load anaconda/3.6
module load cuda/10.0.130
module load gnu/5.4.0
module load mvapich2
source activate trec_ct
export WANDB_PROJECT=TREC_CT_2023
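# Optional sanity check (not part of the original pipeline): confirm the
# environment sees the GPU before launching the long-running steps below.
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"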
# Hard-negative mining: retrieve over the training queries with the trained
# DR and SPLADE checkpoints, fuse the two runs, and use the fused ranking as
# hard negatives for reranker training.
RUN=generated_train_v2_with_gold_mlm_DR_hn
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
best_ckpt=3000
# Dense retrieval (DR) run over the FAISS index with the best DR checkpoint.
python -m pyserini.search.faiss \
--index asyncval_runs/${RUN}/embeddings_checkpoint-${best_ckpt} \
--topics data/queries/generated_train_v2_with_gold_queries.tsv \
--encoder models/${RUN}/checkpoint-${best_ckpt} \
--tokenizer ${tokenizer} \
--encoder-class 'auto' \
--output runs/generated_train_v2_with_gold.DR_reranker_hn.txt \
--output-format msmarco \
--device cuda:0 \
--batch-size 128 --threads 12
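# Optional check (not in the original script): msmarco-format output is TSV,
# one "qid<TAB>docid<TAB>rank" line per hit; peek at the first few lines.
head -n 3 runs/generated_train_v2_with_gold.DR_reranker_hn.txt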
# Sparse retrieval (SPLADE) run over the impact index.
RUN=generated_train_v2_with_gold_splade_hn_splade
best_ckpt=16000
python -m pyserini.search.lucene \
--index data/indexes/${RUN}_ckpt${best_ckpt} \
--topics data/queries/generated_train_v2_with_gold_queries.tsv \
--encoder models/${RUN}/checkpoint-${best_ckpt} \
--output runs/generated_train_v2_with_gold.splade_reranker_hn.txt \
--output-format msmarco \
--batch-size 36 --threads 12 \
--hits 1000 \
--impact
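# Optional check (not in the original script): confirm SPLADE retrieved up to
# --hits 1000 results per query by counting run lines per query id.
awk '{print $1}' runs/generated_train_v2_with_gold.splade_reranker_hn.txt | sort | uniq -c | head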
# Convert both msmarco-format runs to TREC format for fusion.
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/generated_train_v2_with_gold.DR_reranker_hn.txt \
--output runs/generated_train_v2_with_gold.DR_reranker_hn.trec
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/generated_train_v2_with_gold.splade_reranker_hn.txt \
--output runs/generated_train_v2_with_gold.splade_reranker_hn.trec
dr_run=runs/generated_train_v2_with_gold.DR_reranker_hn.trec
splade_run=runs/generated_train_v2_with_gold.splade_reranker_hn.trec
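# Optional check (not in the original script): TREC runs carry six columns
# (qid Q0 docid rank score tag); verify the conversion before fusing.
head -n 3 ${dr_run}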
# Fuse the DR and SPLADE runs with equal weights into a hybrid run.
python3 fuse.py \
--run1 ${dr_run} \
--run2 ${splade_run} \
--weight1 0.5 --weight2 0.5 \
--output runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt \
--retrieval_fusion
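# Optional evaluation sketch (assumptions: the fused run is in TREC format and
# data/qrels/generated_train_v2_with_gold_qrels.txt covers these queries);
# trec_eval is the standard TREC evaluation tool.
trec_eval -m recall.1000 -m ndcg_cut.10 \
data/qrels/generated_train_v2_with_gold_qrels.txt \
runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt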
# Convert the fused TREC run back to msmarco format for hard-negative building.
python tevatron/src/tevatron/utils/format/convert_result_to_marco.py \
--input runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt \
--output runs/hybrid/generated_train_v2_with_gold_reranker_hn.tsv
tokenizer=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
# Build tokenized reranker training data from the mined hard negatives;
# the relative paths below (and the cd .. afterwards) assume the data/ directory.
cd data
train_data=generated_train_v2_with_gold
python3 build_train_hn.py \
--tokenizer_name ${tokenizer} \
--hn_file ../runs/hybrid/generated_train_v2_with_gold_reranker_hn.tsv \
--qrels qrels/${train_data}_qrels.txt \
--queries queries/${train_data}_queries.tsv \
--collection corpus_t_e_s_d/ct_corpus.jsonl \
--save_to tokenized/${tokenizer}/${train_data}_reranker_hn \
--n_sample 20 \
--mp_chunk_size 10 \
--truncate 512
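# Optional check (not in the original script): list the tokenized shards that
# build_train_hn.py wrote out before leaving the data/ directory.
ls tokenized/${tokenizer}/${train_data}_reranker_hn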
cd ..
# Train the cross-encoder reranker on the hard-negative training data.
RUN=generated_train_v2_with_gold_reranker
model=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
tokenizer=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
CUDA_VISIBLE_DEVICES=0 python tevatron/examples/reranker/reranker_train.py \
--output_dir models/${RUN} \
--model_name_or_path ${model} \
--save_steps 1000 \
--train_dir data/tokenized/${tokenizer}/${train_data}_reranker_hn \
--fp16 \
--per_device_train_batch_size 1 \
--train_n_passages 8 \
--gradient_accumulation_steps 4 \
--learning_rate 1e-6 \
--q_max_len 182 \
--p_max_len 330 \
--num_train_epochs 2 \
--logging_steps 100 \
--overwrite_output_dir \
--cache_dir cache
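# Optional check (not in the original script): with --save_steps 1000 over
# 2 epochs, checkpoints should appear under models/${RUN}/checkpoint-*.
ls -d models/${RUN}/checkpoint-*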
# Build the reranker inference input from the first-stage hybrid run
# on the TREC CT 2022 topics.
python rerank/prepare_rerank_file.py \
--query_file data/queries/ct_2022_queries.tsv \
--corpus_file data/corpus_t_e_s_d/ct_corpus.jsonl \
--retrieval /scratch/itee/uqszhuan/TREC_CT_2023/runs/hybrid/DR_SPLADE_alpha=0.5.txt \
--output_path data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl
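# Optional check (not in the original script): the rerank input is JSONL; peek
# at the first record to confirm query and passage fields were populated.
head -n 1 data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl | cut -c1-300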
# Rerank the hybrid run with every saved checkpoint (1000..12000).
for reranker_ckpt in 1 2 3 4 5 6 7 8 9 10 11 12;
do
CUDA_VISIBLE_DEVICES=0 python rerank/reranker_inference.py \
--output_dir=temp \
--model_name_or_path models/generated_train_v2_with_gold_reranker/checkpoint-${reranker_ckpt}000 \
--tokenizer_name ${tokenizer} \
--encode_in_path /scratch/itee/uqszhuan/TREC_CT_2023/data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl \
--fp16 \
--per_device_eval_batch_size 64 \
--q_max_len 182 \
--p_max_len 330 \
--dataset_name rerank/data_script.py \
--encoded_save_path runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.txt \
--cache_dir cache
done
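# Optional follow-up sketch (not in the original script): convert each reranked
# run to TREC with the repo's converter and score it with trec_eval; the qrels
# path data/qrels/ct_2022_qrels.txt is a hypothetical placeholder.
for reranker_ckpt in 1 2 3 4 5 6 7 8 9 10 11 12;
do
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.txt \
--output runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.trec
trec_eval -m ndcg_cut.10 data/qrels/ct_2022_qrels.txt \
runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.trec
done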