-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBM25_baseline.sh
34 lines (29 loc) · 1.07 KB
/
BM25_baseline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Fail fast: stop at the first failing command or unset variable, so that the
# retrieval steps further down never run against a missing or half-built index.
set -eu

# Point Pyserini's cache at ./cache (relative to the current working directory).
export PYSERINI_CACHE=cache

# Build the Lucene index from the JSON corpus in data/corpus_t_e_s_d.
# Positions, document vectors and raw documents are stored in the index.
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input data/corpus_t_e_s_d \
  --index data/indexes/trec_clinicaltrials_t_e_s_d_lucene \
  --generator DefaultLuceneDocumentGenerator \
  --threads 36 \
  --storePositions --storeDocvectors --storeRaw
# BM25 retrieval for the CT 2022 queries, followed by evaluation.
# k1, b and the run path are defined once so the file name, the search flags
# and the trec_eval input cannot drift apart.
bm25_k1=0.82
bm25_b=0.68
ct2022_run="runs/ct2022.bm25.k1=${bm25_k1}.b=${bm25_b}.txt"

# Stop here if retrieval fails: otherwise trec_eval would score a stale run
# file left over from an earlier invocation (or fail on a missing one).
if ! python -m pyserini.search.lucene \
  --index data/indexes/trec_clinicaltrials_t_e_s_d_lucene \
  --topics data/queries/ct_2022_queries.tsv \
  --output "${ct2022_run}" \
  --output-format trec \
  --hits 1000 \
  --threads 32 \
  --batch-size 50 \
  --bm25 --k1 "${bm25_k1}" --b "${bm25_b}"; then
  echo "BM25 search failed; not evaluating ${ct2022_run}" >&2
  exit 1
fi

# Report nDCG@10, P@10, R-precision and reciprocal rank against the 2022 qrels.
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank \
  data/qrels/ct_2022_qrels.txt "${ct2022_run}"
# BM25 hard negative ranking
# Ranks the top 1000 documents for every query in the generated training set
# and writes them in MS MARCO format to runs/<train_data>.bm25.k1=<k1>.b=<b>.tsv.
train_data=generated_train_v2_llama

# BM25 parameters, defined once so the output file name always matches the
# values actually passed to the searcher.
bm25_k1=0.82
bm25_b=0.68

# Expansions are quoted so a dataset name containing spaces or glob characters
# is passed through as a single argument.
python -m pyserini.search.lucene \
  --index data/indexes/trec_clinicaltrials_t_e_s_d_lucene \
  --topics "data/queries/${train_data}_queries.tsv" \
  --output "runs/${train_data}.bm25.k1=${bm25_k1}.b=${bm25_b}.tsv" \
  --output-format msmarco \
  --hits 1000 \
  --threads 32 \
  --batch-size 128 \
  --bm25 --k1 "${bm25_k1}" --b "${bm25_b}"