-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhybrid.sh
110 lines (94 loc) · 4.35 KB
/
hybrid.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Export both cache variables so every child process started below inherits
# them. Each is set to the relative path "cache".
# NOTE(review): presumably read by Hugging Face Transformers and Pyserini
# respectively to locate their download caches -- confirm against their docs.
export TRANSFORMERS_CACHE=cache PYSERINI_CACHE=cache
# Hybrid retrieval sweep on the ct_2021 topics: the ANCE dense index
# (embeddings/ct-corpus-ance) fused with the BM25 Lucene index, once per
# fusion alpha in 1.0, 2.0, ..., 10.0. Each pass writes its own TREC-format
# run file under runs/.
for ((step = 1; step <= 10; step++)); do
  weight="${step}.0"

  # Arguments are grouped under the dense / sparse / fusion / run sub-commands.
  hybrid_cmd=(
    python -m pyserini.search.hybrid
    dense
      --index embeddings/ct-corpus-ance
      --encoder castorini/ance-msmarco-passage
    sparse
      --index indexes/trec_clinicaltrials_lucene
      --bm25 --k1 0.82 --b 0.68
    fusion
      --alpha "${weight}" --normalization
    run
      --topics ct_2021_queries.tsv
      --output "runs/hybrid.ance.bm25.a=${weight}.txt"
      --batch-size 75 --threads 32
      --output-format trec
  )
  "${hybrid_cmd[@]}"
done

# Only the alpha=5.0 run is scored here (nDCG@10, P@10, R-prec, MRR).
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank \
  ct_2021_qrels.txt 'runs/hybrid.ance.bm25.a=5.0.txt'
# One hybrid run on the ct_2022 topics: the fine-tuned dense retriever
# (checkpoint-2000, PubMedBERT tokenizer) fused with the BM25 Lucene index at
# a fixed fusion alpha, followed by scoring against the 2022 qrels.
alpha=0.8
dense_exp='generated_train_v2_with_gold_pubmedbert_bm25_hn_hn'
run_file="runs/hybrid.bm25.pubmedbertDR_bm25_hn_hn.a=${alpha}.txt"

# NOTE(review): --hits is passed under both `fusion` and `run`; presumably
# these are separate per-sub-command options in pyserini's hybrid CLI -- confirm.
hybrid_cmd=(
  python -m pyserini.search.hybrid
  dense
    --index "asyncval_runs/${dense_exp}/embeddings_checkpoint-2000"
    --encoder "models/${dense_exp}/checkpoint-2000"
    --encoder-class auto
    --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
  sparse
    --index data/indexes/trec_clinicaltrials_t_e_s_d_lucene
    --bm25 --k1 0.82 --b 0.68
  fusion
    --alpha "${alpha}"
    --normalization
    --hits 1000
  run
    --topics data/queries/ct_2022_queries.tsv
    --output "${run_file}"
    --hits 1000
    --batch-size 50 --threads 32
    --output-format trec
)
"${hybrid_cmd[@]}"

# Score the run just written: nDCG@10, P@10, R-prec, MRR and recall@1000.
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank -m recall.1000 \
  data/qrels/ct_2022_qrels.txt "${run_file}"
# SPLADE + dense sweep on the ct_2022 topics: the fine-tuned dense retriever
# (checkpoint-2000) is fused with the SPLADE impact index (checkpoint-10000)
# once per fusion alpha, and every run is scored as soon as it is written.
#
# Scoring lives inside the loop on purpose: placed after `done`, ${alpha}
# would only hold the final value (1.5) and the other eleven runs would never
# be evaluated.
for alpha in 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1 1.2 1.3 1.4 1.5; do
  run_file="runs/hybrid.splade.pubmedbertDR_bm25_hn_hn.a=${alpha}.txt"

  # NOTE(review): --hits is passed under both `fusion` and `run`; presumably
  # these are separate per-sub-command options in pyserini's hybrid CLI -- confirm.
  hybrid_cmd=(
    python -m pyserini.search.hybrid
    dense
      --index 'asyncval_runs/generated_train_v2_with_gold_pubmedbert_bm25_hn_hn/embeddings_checkpoint-2000'
      --encoder 'models/generated_train_v2_with_gold_pubmedbert_bm25_hn_hn/checkpoint-2000'
      --encoder-class 'auto'
      --tokenizer 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
    sparse
      --index 'data/indexes/generated_train_v2_with_gold_bm25_hn_pubmedbert_splade_ckpt10000'
      --encoder 'models/generated_train_v2_with_gold_bm25_hn_pubmedbert_splade/checkpoint-10000'
      --impact
    fusion
      --alpha "${alpha}"
      --normalization
      --hits 1000
    run
      --topics 'data/queries/ct_2022_queries.tsv'
      --output "${run_file}"
      --hits 1000
      --batch-size 50 --threads 32
      --output-format trec
  )

  # Do not score a missing or stale run file when retrieval itself failed.
  if ! "${hybrid_cmd[@]}"; then
    printf 'hybrid search failed for alpha=%s; skipping evaluation\n' "${alpha}" >&2
    continue
  fi

  # trec_eval output does not name the run, so label each result block.
  printf '== alpha=%s ==\n' "${alpha}"
  trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank -m recall.1000 \
    data/qrels/ct_2022_qrels.txt "${run_file}"
done
# Sparse-only retrieval over a pre-built impact index, writing a TREC-format
# run to runs/${RUN}_ckpt${ckpt}.txt.
#
# RUN and ckpt are not assigned anywhere in this script, so they must come
# from the caller's environment. When either is missing, the paths would
# collapse to e.g. "data/indexes/_ckpt"; warn and skip instead of searching a
# bogus index. Later steps still run.
if [[ -z "${RUN:-}" || -z "${ckpt:-}" ]]; then
  printf 'skipping pyserini.search.lucene: RUN and ckpt must both be set\n' >&2
else
  # --batch-size is spelled out in full (the short form "--batch" only works
  # through argparse prefix matching) to match the hybrid invocations.
  python -m pyserini.search.lucene \
    --index "data/indexes/${RUN}_ckpt${ckpt}" \
    --topics "encoding_splade/query/${RUN}_ckpt${ckpt}/ct_2022.tsv" \
    --output "runs/${RUN}_ckpt${ckpt}.txt" \
    --output-format trec \
    --batch-size 50 --threads 32 \
    --hits 1000 \
    --impact
fi
# One hybrid run on the ct_2022 topics with the fusion alpha fixed at 0.0:
# the dense retriever (checkpoint-3000) fused with the SPLADE impact index
# (checkpoint-16000). All four resources live under one absolute project root.
alpha=0.0
project_root=/scratch/itee/uqszhuan/TREC_CT_2023
DR_index="${project_root}/asyncval_runs/generated_train_v2_with_gold_mlm_DR_hn/embeddings_checkpoint-3000"
DR_model="${project_root}/models/generated_train_v2_with_gold_mlm_DR_hn/checkpoint-3000"
splade_index="${project_root}/data/indexes/generated_train_v2_with_gold_splade_hn_splade_ckpt16000"
splade_model="${project_root}/models/generated_train_v2_with_gold_splade_hn_splade/checkpoint-16000"

# NOTE(review): --hits is passed under both `fusion` and `run`; presumably
# these are separate per-sub-command options in pyserini's hybrid CLI -- confirm.
hybrid_cmd=(
  python -m pyserini.search.hybrid
  dense
    --index "${DR_index}"
    --encoder "${DR_model}"
    --encoder-class auto
    --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
  sparse
    --index "${splade_index}"
    --encoder "${splade_model}"
    --impact
  fusion
    --alpha "${alpha}"
    --normalization
    --hits 1000
  run
    --topics data/queries/ct_2022_queries.tsv
    --output "runs/hybrid/hybrid.splade.DR.a=${alpha}.txt"
    --hits 1000
    --batch-size 50 --threads 32
    --output-format trec
)
"${hybrid_cmd[@]}"
# Weighted fusion of two existing run files using the project-local fuse.py:
# run1 (dr_run) gets weight alpha and run2 (splade_run) gets 1 - alpha, for
# alpha = 0.0, 0.1, ..., 1.0. One fused run file is written per alpha.
dr_run=asyncval_runs/generated_train_v2_with_gold_mlm_DR_hn/set_0_checkpoint-3000.tsv
splade_run=runs/generated_train_v2_with_gold_splade_hn_splade_ckpt16000.txt
fused_dir=runs/hybrid

# fuse.py is not visible from this script, so do not rely on it (or on an
# earlier step) to have created the output directory.
mkdir -p -- "${fused_dir}"

for alpha in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do
  # Complementary weight for the second run. If bc is missing or fails, an
  # empty value would otherwise vanish from the command line and --weight2
  # would swallow the next flag, so bail out of this iteration instead.
  if ! num=$(bc <<< "1 - ${alpha}") || [[ -z "${num}" ]]; then
    printf 'cannot compute 1 - %s (is bc installed?); skipping\n' "${alpha}" >&2
    continue
  fi

  python3 fuse.py \
    --run1 "${dr_run}" \
    --run2 "${splade_run}" \
    --weight1 "${alpha}" --weight2 "${num}" \
    --output "${fused_dir}/DR_SPLADE_alpha=${alpha}.txt" \
    --retrieval_fusion
done