#!/bin/bash
#SBATCH -N 1
#SBATCH --job-name=trec
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=15G
#SBATCH -o logs/print_generated_train_reranker
#SBATCH -e logs/error_generated_train_reranker
#SBATCH --partition=gpu
#SBATCH --gres=gpu:tesla-smx2:1
#SBATCH --cpus-per-task=10
module load anaconda/3.6
module load cuda/10.0.130
module load gnu/5.4.0
module load mvapich2
source activate trec_ct
export WANDB_PROJECT=TREC_CT_2023
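# Optional sanity check (not part of the original pipeline): confirm the
# environment sees the GPU before launching the long-running steps below.
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"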
# Hard-negative mining: retrieve over the training queries with the trained
# DR and SPLADE checkpoints, fuse the two runs, and use the fused ranking as
# hard negatives for reranker training.
RUN=generated_train_v2_with_gold_mlm_DR_hn
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
best_ckpt=3000
# Dense retrieval (DR) run over the FAISS index with the best DR checkpoint.
python -m pyserini.search.faiss \
--index asyncval_runs/${RUN}/embeddings_checkpoint-${best_ckpt} \
--topics data/queries/generated_train_v2_with_gold_queries.tsv \
--encoder models/${RUN}/checkpoint-${best_ckpt} \
--tokenizer ${tokenizer} \
--encoder-class 'auto' \
--output runs/generated_train_v2_with_gold.DR_reranker_hn.txt \
--output-format msmarco \
--device cuda:0 \
--batch-size 128 --threads 12
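# Optional check (not in the original script): msmarco-format output is TSV,
# one "qid<TAB>docid<TAB>rank" line per hit; peek at the first few lines.
head -n 3 runs/generated_train_v2_with_gold.DR_reranker_hn.txt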
# Sparse retrieval (SPLADE) run over the impact index.
RUN=generated_train_v2_with_gold_splade_hn_splade
best_ckpt=16000
python -m pyserini.search.lucene \
--index data/indexes/${RUN}_ckpt${best_ckpt} \
--topics data/queries/generated_train_v2_with_gold_queries.tsv \
--encoder models/${RUN}/checkpoint-${best_ckpt} \
--output runs/generated_train_v2_with_gold.splade_reranker_hn.txt \
--output-format msmarco \
--batch-size 36 --threads 12 \
--hits 1000 \
--impact
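# Optional check (not in the original script): confirm SPLADE retrieved up to
# --hits 1000 results per query by counting run lines per query id.
awk '{print $1}' runs/generated_train_v2_with_gold.splade_reranker_hn.txt | sort | uniq -c | head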
# Convert both msmarco-format runs to TREC format for fusion.
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/generated_train_v2_with_gold.DR_reranker_hn.txt \
--output runs/generated_train_v2_with_gold.DR_reranker_hn.trec
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/generated_train_v2_with_gold.splade_reranker_hn.txt \
--output runs/generated_train_v2_with_gold.splade_reranker_hn.trec
dr_run=runs/generated_train_v2_with_gold.DR_reranker_hn.trec
splade_run=runs/generated_train_v2_with_gold.splade_reranker_hn.trec
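# Optional check (not in the original script): TREC runs carry six columns
# (qid Q0 docid rank score tag); verify the conversion before fusing.
head -n 3 ${dr_run}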
# Fuse the DR and SPLADE runs with equal weights into a hybrid run.
python3 fuse.py \
--run1 ${dr_run} \
--run2 ${splade_run} \
--weight1 0.5 --weight2 0.5 \
--output runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt \
--retrieval_fusion
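# Optional evaluation sketch (assumptions: the fused run is in TREC format and
# data/qrels/generated_train_v2_with_gold_qrels.txt covers these queries);
# trec_eval is the standard TREC evaluation tool.
trec_eval -m recall.1000 -m ndcg_cut.10 \
data/qrels/generated_train_v2_with_gold_qrels.txt \
runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt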
# Convert the fused TREC run back to msmarco format for hard-negative building.
python tevatron/src/tevatron/utils/format/convert_result_to_marco.py \
--input runs/hybrid/generated_train_v2_with_gold_reranker_hn.txt \
--output runs/hybrid/generated_train_v2_with_gold_reranker_hn.tsv
tokenizer=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
# Build tokenized reranker training data from the mined hard negatives;
# the relative paths below (and the cd .. afterwards) assume the data/ directory.
cd data
train_data=generated_train_v2_with_gold
python3 build_train_hn.py \
--tokenizer_name ${tokenizer} \
--hn_file ../runs/hybrid/generated_train_v2_with_gold_reranker_hn.tsv \
--qrels qrels/${train_data}_qrels.txt \
--queries queries/${train_data}_queries.tsv \
--collection corpus_t_e_s_d/ct_corpus.jsonl \
--save_to tokenized/${tokenizer}/${train_data}_reranker_hn \
--n_sample 20 \
--mp_chunk_size 10 \
--truncate 512
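# Optional check (not in the original script): list the tokenized shards that
# build_train_hn.py wrote out before leaving the data/ directory.
ls tokenized/${tokenizer}/${train_data}_reranker_hn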
cd ..
# Train the cross-encoder reranker on the hard-negative training data.
RUN=generated_train_v2_with_gold_reranker
model=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
tokenizer=microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract
CUDA_VISIBLE_DEVICES=0 python tevatron/examples/reranker/reranker_train.py \
--output_dir models/${RUN} \
--model_name_or_path ${model} \
--save_steps 1000 \
--train_dir data/tokenized/${tokenizer}/${train_data}_reranker_hn \
--fp16 \
--per_device_train_batch_size 1 \
--train_n_passages 8 \
--gradient_accumulation_steps 4 \
--learning_rate 1e-6 \
--q_max_len 182 \
--p_max_len 330 \
--num_train_epochs 2 \
--logging_steps 100 \
--overwrite_output_dir \
--cache_dir cache
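# Optional check (not in the original script): with --save_steps 1000 over
# 2 epochs, checkpoints should appear under models/${RUN}/checkpoint-*.
ls -d models/${RUN}/checkpoint-*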
# Build the reranker inference input from the first-stage hybrid run
# on the TREC CT 2022 topics.
python rerank/prepare_rerank_file.py \
--query_file data/queries/ct_2022_queries.tsv \
--corpus_file data/corpus_t_e_s_d/ct_corpus.jsonl \
--retrieval /scratch/itee/uqszhuan/TREC_CT_2023/runs/hybrid/DR_SPLADE_alpha=0.5.txt \
--output_path data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl
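# Optional check (not in the original script): the rerank input is JSONL; peek
# at the first record to confirm query and passage fields were populated.
head -n 1 data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl | cut -c1-300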
# Rerank the hybrid run with every saved checkpoint (1000..12000).
for reranker_ckpt in 1 2 3 4 5 6 7 8 9 10 11 12;
do
CUDA_VISIBLE_DEVICES=0 python rerank/reranker_inference.py \
--output_dir=temp \
--model_name_or_path models/generated_train_v2_with_gold_reranker/checkpoint-${reranker_ckpt}000 \
--tokenizer_name ${tokenizer} \
--encode_in_path /scratch/itee/uqszhuan/TREC_CT_2023/data/reranker_input_files/rerank_input_file.DR_SPLADE_alpha=0.5.jsonl \
--fp16 \
--per_device_eval_batch_size 64 \
--q_max_len 182 \
--p_max_len 330 \
--dataset_name rerank/data_script.py \
--encoded_save_path runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.txt \
--cache_dir cache
done
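# Optional follow-up sketch (not in the original script): convert each reranked
# run to TREC with the repo's converter and score it with trec_eval; the qrels
# path data/qrels/ct_2022_qrels.txt is a hypothetical placeholder.
for reranker_ckpt in 1 2 3 4 5 6 7 8 9 10 11 12;
do
python tevatron/src/tevatron/utils/format/convert_result_to_trec.py \
--input runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.txt \
--output runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.trec
trec_eval -m ndcg_cut.10 data/qrels/ct_2022_qrels.txt \
runs/reranker/DR_SPLADE_alpha=0.5.reranker_ckpt${reranker_ckpt}000.trec
done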