# DR.sh
# Pyserini commands for this project: encode the corpus into a FAISS index,
# search FAISS indexes with the ANCE and fine-tuned PubMedBERT encoders, and
# score the resulting ct2022 runs with trec_eval.
# Point both cache variables at the relative directory `cache`
# (presumably read by transformers / pyserini when downloading models).
export TRANSFORMERS_CACHE=cache PYSERINI_CACHE=cache

# Encode the JSONL corpus (title + text fields) with the ANCE passage encoder
# on GPU 0 and write the embeddings in FAISS format.
corpus_jsonl=data/corpus_t_e_s_d/ct_corpus.jsonl
ance_index_dir=embeddings/corpus_t_e_s_d-ance
ance_model=castorini/ance-msmarco-passage

CUDA_VISIBLE_DEVICES=0 python -m pyserini.encode \
  input \
    --corpus "$corpus_jsonl" \
    --fields title text \
    --delimiter "\n" \
    --shard-id 0 \
    --shard-num 1 \
  output \
    --embeddings "$ance_index_dir" \
    --to-faiss \
  encoder \
    --encoder "$ance_model" \
    --fields title text \
    --max-length 256 \
    --batch 64 \
    --fp16
# Search the ANCE FAISS index with the 2022 queries and write a TREC-format run.
python -m pyserini.search.faiss \
  --index embeddings/corpus_t_e_s_d-ance \
  --topics data/queries/ct_2022_queries.tsv \
  --encoder castorini/ance-msmarco-passage \
  --output runs/ct2022.ance.txt \
  --output-format trec \
  --batch-size 36 --threads 12

# Score the run (nDCG@10, P@10, R-precision, reciprocal rank).
# The run comes from the 2022 queries, so it must be judged with the 2022
# qrels. trec_eval matches on topic id only, so qrels from another year would
# still produce numbers, just meaningless ones.
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank \
  data/qrels/ct_2022_qrels.txt runs/ct2022.ance.txt
# train stage 1
# Run the training queries against the checkpoint-6000 embeddings using the
# matching fine-tuned encoder checkpoint; results are written in MS MARCO
# format. (Presumably the input for the next round of hard-negative mining --
# confirm against the training pipeline.)
stage1_run=generated_train_v2_with_gold_bm25_hn_pubmedbert
pubmedbert_tok=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext

python -m pyserini.search.faiss \
  --index "asyncval_runs/${stage1_run}/embeddings_checkpoint-6000" \
  --topics data/queries/generated_train_v2_with_gold_queries.tsv \
  --encoder "models/${stage1_run}/checkpoint-6000" \
  --tokenizer "$pubmedbert_tok" \
  --encoder-class auto \
  --output runs/generated_train_v2_with_gold.pubmedbert_bm25_hn.txt \
  --output-format msmarco \
  --device cuda:0 \
  --batch-size 128 --threads 12
# train stage 2
# Same search as stage 1, but against the stage-2 model (checkpoint-2000) and
# its embeddings; results are written in MS MARCO format.
# NOTE(review): the output name ends in `_hn_hn_hn` while the model directory
# ends in `_hn_hn` -- confirm the extra suffix is intentional.
stage2_run=generated_train_v2_with_gold_pubmedbert_bm25_hn_hn
pubmedbert_tok=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext

python -m pyserini.search.faiss \
  --index "asyncval_runs/${stage2_run}/embeddings_checkpoint-2000" \
  --topics data/queries/generated_train_v2_with_gold_queries.tsv \
  --encoder "models/${stage2_run}/checkpoint-2000" \
  --tokenizer "$pubmedbert_tok" \
  --encoder-class auto \
  --output runs/generated_train_v2_with_gold.pubmedbert_bm25_hn_hn_hn.txt \
  --output-format msmarco \
  --device cuda:0 \
  --batch-size 128 --threads 12
# dev
# Search the 2022 queries with the stage-1 checkpoint-6000 encoder and its
# embeddings, write a TREC-format run, then score it against the 2022 qrels.
dev_model=generated_train_v2_with_gold_bm25_hn_pubmedbert
dev_run=runs/ct2022.pubmedbert_bm25_hn.txt

python -m pyserini.search.faiss \
  --index "asyncval_runs/${dev_model}/embeddings_checkpoint-6000" \
  --topics data/queries/ct_2022_queries.tsv \
  --encoder "models/${dev_model}/checkpoint-6000" \
  --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext \
  --encoder-class auto \
  --output "$dev_run" \
  --output-format trec \
  --device cuda:0 \
  --batch-size 128 --threads 12

# Metrics: nDCG@10, P@10, R-precision, reciprocal rank.
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank \
  data/qrels/ct_2022_qrels.txt "$dev_run"
# dev prf
# Same dev search as the plain "dev" run, with pseudo-relevance feedback
# switched on (--prf-depth 3, --prf-method avg); the run goes to a separate
# *_prf.txt file.
prf_model=generated_train_v2_with_gold_bm25_hn_pubmedbert
prf_run=runs/ct2022.pubmedbert_bm25_hn_prf.txt

python -m pyserini.search.faiss \
  --index "asyncval_runs/${prf_model}/embeddings_checkpoint-6000" \
  --topics data/queries/ct_2022_queries.tsv \
  --encoder "models/${prf_model}/checkpoint-6000" \
  --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext \
  --encoder-class auto \
  --prf-depth 3 \
  --prf-method avg \
  --output "$prf_run" \
  --output-format trec \
  --device cuda:0 \
  --batch-size 128 --threads 12

# Metrics: nDCG@10, P@10, R-precision, reciprocal rank, recall@1000.
# -l2 passes trec_eval a minimum relevance level of 2.
trec_eval -m ndcg_cut.10 -m P.10 -m Rprec -m recip_rank -m recall.1000 -l2 \
  data/qrels/ct_2022_qrels.txt "$prf_run"