-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_submission.sh
158 lines (129 loc) · 5.66 KB
/
make_submission.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Stage the selected checkpoints under models_final/ and define the settings
# used by the later pipeline stages.

# Copy checkpoint directory $1 into directory $2 (created if missing), keeping
# the checkpoint's own directory name.
# The copy targets the parent directory instead of an explicit destination
# path: with an explicit path, a re-run would find the destination already
# present and nest a second copy inside it (checkpoint-N/checkpoint-N).
stage_checkpoint() {
  local src=${1%/} dest_dir=${2%/}
  mkdir -p -- "$dest_dir" && cp -r -- "$src" "$dest_dir/"
}

stage_checkpoint models/generated_train_v2_with_gold_mlm_DR_hn/checkpoint-3000 models_final/DR
stage_checkpoint models/generated_train_v2_with_gold_splade_hn_splade/checkpoint-16000 models_final/SPLADE
stage_checkpoint models/generated_train_v2_with_gold_reranker/checkpoint-9000 models_final/reranker

# DR_CKPT is the directory that holds the checkpoint (it is passed to asyncval
# as --ckpts_dir); the other two name the checkpoint directories themselves.
DR_CKPT=models_final/DR/
SPLADE_CKPT=models_final/SPLADE/checkpoint-16000
RERANKER_CKPT=models_final/reranker/checkpoint-9000
# Tokenizer used for query/corpus tokenization and the first-stage retrievers.
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
# Build the 2023 query file, then tokenize the queries and the corpus.
# The steps run inside data/ in a subshell, so the working directory of the
# rest of the script is never changed. Every step is guarded: a failed
# `cd data` would otherwise run the steps in the wrong directory (and a
# following `cd ..` would leave the project root), and the later stages read
# the files produced here.
# The guards are explicit because `set -e` is ignored inside a subshell that is
# itself the left-hand side of `||`.
(
  cd data || exit 1
  python process_ct_2023_queries.py || exit 1
  python3 tokenize_queries.py \
    --tokenizer_name "${tokenizer}" \
    --truncate 256 \
    --query_file queries/ct_2023_queries_generated_nq.tsv \
    --save_to "tokenized/${tokenizer}/queries/ct_2023_queries_generated_nq.json" || exit 1
  python3 tokenize_passages.py \
    --tokenizer_name "${tokenizer}" \
    --truncate 512 \
    --file corpus_2023_t_e_s_d/ct_corpus.jsonl \
    --n_splits 1 \
    --save_to "tokenized/${tokenizer}/corpus/corpus_2023_t_e_s_d" || exit 1
) || exit 1
# Dense-retrieval (DR) run.
# asyncval is pointed at the checkpoint directory ${DR_CKPT} and at the
# tokenized 2023 queries/corpus; the run it leaves in runs_final/DR is then
# copied to its submission file name.
# NOTE(review): the queries are the 2023 topics but --qrel_file is the 2022
# qrels, so the reported metrics are presumably not meaningful here — confirm.
# -p keeps a re-run from failing when runs_final/ already exists.
mkdir -p runs_final
python -m asyncval \
  --query_file "data/tokenized/${tokenizer}/queries/ct_2023_queries_generated_nq.json" \
  --candidate_dir "data/tokenized/${tokenizer}/corpus/corpus_2023_t_e_s_d" \
  --ckpts_dir "${DR_CKPT}" \
  --tokenizer_name_or_path "${tokenizer}" \
  --qrel_file data/qrels/ct_2022_qrels.txt \
  --metrics 'RR(rel=2)' 'nDCG@10' 'P(rel=2)@10' 'Rprec(rel=2)' 'R(rel=2)@1000' \
  --output_dir runs_final/DR \
  --report_to wandb \
  --depth 1000 \
  --per_device_eval_batch_size 128 \
  --q_max_len 256 \
  --p_max_len 512 \
  --write_run trec \
  --write_embeddings True \
  --fp16 \
  --max_num_valid 1 \
  --cache_dir cache
# The source file name embeds the step of the staged DR checkpoint (3000).
cp runs_final/DR/set_0_checkpoint-3000.tsv runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM_DR.txt
# SPLADE encoding: one pass over the single corpus split, one over the queries.
# Both passes use the checkpoint in ${SPLADE_CKPT} and write below
# encoding_splade/.
splade_root=encoding_splade
splade_corpus_dir=${splade_root}/corpus_2023/SPLADE_FINAL
splade_query_dir=${splade_root}/query_2023/SPLADE_FINAL
tokenized_root=data/tokenized/${tokenizer}

mkdir -p "${splade_corpus_dir}" "${splade_query_dir}"

# Corpus side (passages truncated to 512 tokens).
python tevatron/examples/splade/encode_splade.py \
  --output_dir "${splade_root}" \
  --model_name_or_path "${SPLADE_CKPT}" \
  --tokenizer_name "${tokenizer}" \
  --fp16 \
  --per_device_eval_batch_size 64 \
  --p_max_len 512 \
  --encode_in_path "${tokenized_root}/corpus/corpus_2023_t_e_s_d/split00.json" \
  --encoded_save_path "${splade_corpus_dir}/split00.jsonl" \
  --cache_dir cache

# Query side (--encode_is_qry, queries truncated to 256 tokens).
python tevatron/examples/splade/encode_splade.py \
  --output_dir "${splade_root}" \
  --model_name_or_path "${SPLADE_CKPT}" \
  --tokenizer_name "${tokenizer}" \
  --fp16 \
  --q_max_len 256 \
  --encode_is_qry \
  --per_device_eval_batch_size 128 \
  --encode_in_path "${tokenized_root}/queries/ct_2023_queries_generated_nq.json" \
  --encoded_save_path "${splade_query_dir}/ct_2023_queries_generated_nq.tsv" \
  --cache_dir cache
# SPLADE retrieval with Pyserini: build an impact index from the encoded
# corpus, then search it with the encoded queries and write a TREC-format run.
splade_index_dir=data/indexes/corpus_2023/SPLADE_FINAL
splade_encoded_corpus=encoding_splade/corpus_2023/SPLADE_FINAL
splade_encoded_queries=encoding_splade/query_2023/SPLADE_FINAL/ct_2023_queries_generated_nq.tsv

# Indexing.
python -m pyserini.index.lucene \
  --collection JsonVectorCollection \
  --input "${splade_encoded_corpus}" \
  --index "${splade_index_dir}" \
  --generator DefaultLuceneDocumentGenerator \
  --threads 16 \
  --impact \
  --pretokenized

# Search, keeping 1000 hits per query.
python -m pyserini.search.lucene \
  --index "${splade_index_dir}" \
  --topics "${splade_encoded_queries}" \
  --output runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM_SPLADE.txt \
  --output-format trec \
  --batch 50 \
  --threads 32 \
  --hits 1000 \
  --impact
# First-stage fusion: merge the dense (DR) and SPLADE runs with equal weights
# into the DR_SPLADE_Hybrid run.
first_stage_stem=runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM
python3 fuse.py \
  --run1 "${first_stage_stem}_DR.txt" \
  --run2 "${first_stage_stem}_SPLADE.txt" \
  --weight1 0.5 \
  --weight2 0.5 \
  --output "${first_stage_stem}_DR_SPLADE_Hybrid.txt" \
  --retrieval_fusion
# Reranking of the first-stage hybrid run.
hybrid_run_name=TREC_CT_2023_CSIRO_OSCAR_TEAM_DR_SPLADE_Hybrid
rerank_input_name=rerank_input_file.${hybrid_run_name}.jsonl

# Step 1: join queries, corpus text and the hybrid run into the reranker's
# input file, written under rerank/.
python rerank/prepare_rerank_file.py \
  --query_file data/queries/ct_2023_queries_generated_nq.tsv \
  --corpus_file data/corpus_2023_t_e_s_d/ct_corpus.jsonl \
  --retrieval "runs_final/${hybrid_run_name}.txt" \
  --output_path "rerank/${rerank_input_name}"

# From here on `tokenizer` names the large PubMedBERT model instead of the
# base one used for the first-stage retrievers.
tokenizer=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract

# Step 2: score the candidates on GPU 0.
# NOTE(review): --encode_in_path is the bare file name although step 1 wrote
# the file under rerank/. Presumably the loader resolves it relative to
# rerank/data_script.py — confirm before changing either path.
CUDA_VISIBLE_DEVICES=0 python rerank/reranker_inference.py \
  --output_dir=temp \
  --model_name_or_path "${RERANKER_CKPT}" \
  --tokenizer_name "${tokenizer}" \
  --encode_in_path "${rerank_input_name}" \
  --fp16 \
  --per_device_eval_batch_size 64 \
  --q_max_len 182 \
  --p_max_len 330 \
  --dataset_name rerank/data_script.py \
  --encoded_save_path "runs_final/${hybrid_run_name}_rerank.txt" \
  --cache_dir cache
# Second-stage fusion: blend the first-stage hybrid run (weight 0.1) with the
# reranker run (weight 0.9).
hybrid_stem=runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM_DR_SPLADE_Hybrid
python3 fuse.py \
  --run1 "${hybrid_stem}.txt" \
  --run2 "${hybrid_stem}_rerank.txt" \
  --weight1 0.1 \
  --weight2 0.9 \
  --output "${hybrid_stem}_rerank_Hybrid.txt" \
  --retrieval_fusion
# GPT-4 judger
# Runs run_trecct_gpt_judger.py on the final fused run (k=20), then merges the
# files it leaves in the save directory into one run file and one qrels file.

# Merge the *.txt and *.qrel files found in directory $1 into <name>.txt and
# <name>.qrels inside that directory (<name> is the directory's base name) and
# copy <name>.txt one level up.
# Runs in a subshell so the caller's working directory is unchanged, and stops
# if the directory cannot be entered, so the globs and the `cp ... ../` are
# never applied to the wrong directory.
collect_judger_outputs() {
  local dir=${1%/} name
  name=${dir##*/}
  (
    cd "$dir" || exit 1
    # Drop merged files from an earlier run first: <name>.txt matches *.txt
    # and would otherwise be both an input and the output of the first cat.
    rm -f -- "$name.txt" "$name.qrels"
    cat -- *.txt > "$name.txt"
    cat -- *.qrel > "$name.qrels"
    cp -- "$name.txt" ../
  )
}

judger_dir='runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM_DR_SPLADE_Hybrid_rerank_Hybrid_gpt-4_judger_k=20'
# -p keeps a re-run from failing when the directory already exists.
mkdir -p "$judger_dir"
# The API key is read from OPENAI_API_KEY; the placeholder 'xxx' is kept as the
# fallback when the variable is unset or empty.
# NOTE: the key is still passed on the command line, so it is visible in the
# process list while the judger runs.
python3 run_trecct_gpt_judger.py \
  run --run_path runs_final/TREC_CT_2023_CSIRO_OSCAR_TEAM_DR_SPLADE_Hybrid_rerank_Hybrid.txt \
  --save_dir "$judger_dir" \
  --model_name_or_path gpt-4 \
  --ir_dataset_name clinicaltrials/2023 \
  --query_path data/queries/ct_2023_queries_generated_nq.tsv \
  --openai_key "${OPENAI_API_KEY:-xxx}" \
  --k 20 \
  --query_length 512 \
  --passage_length 1024 \
  --cache_dir ./cache
collect_judger_outputs "$judger_dir"