#!/bin/bash
#SBATCH -N 1
#SBATCH --job-name=trec
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=15G
#SBATCH -o logs/print_generated_train_v2_with_gold_mlm_bm25_hn
#SBATCH -e logs/error_generated_train_v2_with_gold_mlm_bm25_hn
#SBATCH --partition=gpu
#SBATCH --gres=gpu:tesla-smx2:1
#SBATCH --cpus-per-task=10
module load anaconda/3.6
module load cuda/10.0.130
module load gnu/5.4.0
module load mvapich2
source activate trec_ct
export WANDB_PROJECT=TREC_CT_2023
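# Optional sanity check: confirm the allocated GPU is visible inside the job
# (assumes nvidia-smi is available on the node).
# nvidia-smi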
# train with BM25 hard negatives (train stage 1)
RUN=generated_train_v2_with_gold_mlm_bm25_hn
model=/scratch/itee/uqszhuan/TREC_CT_2023/models_pretrain/PubMedBERT_CT_MLM
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
  --output_dir models/${RUN} \
  --model_name_or_path ${model} \
  --save_steps 1000 \
  --train_dir data/tokenized/${tokenizer}/generated_train_v2_with_gold_bm25_hn \
  --fp16 \
  --per_device_train_batch_size 8 \
  --train_n_passages 4 \
  --learning_rate 5e-6 \
  --q_max_len 256 \
  --p_max_len 512 \
  --num_train_epochs 4 \
  --overwrite_output_dir \
  --cache_dir cache \
  --run_name ${RUN} \
  --report_to wandb \
  --logging_steps 100
# asynchronously evaluate saved checkpoints on the TREC CT 2022 topics
python -m asyncval \
  --query_file data/tokenized/${tokenizer}/queries/ct_2022_queries.json \
  --candidate_dir data/tokenized/${tokenizer}/corpus/corpus_t_e_s_d \
  --ckpts_dir models/${RUN} \
  --tokenizer_name_or_path ${tokenizer} \
  --qrel_file data/qrels/ct_2022_qrels.txt \
  --metrics 'RR(rel=2)' 'nDCG@10' 'P(rel=2)@10' 'Rprec(rel=2)' 'R(rel=2)@1000' \
  --output_dir asyncval_runs/${RUN} \
  --report_to wandb \
  --depth 1000 \
  --per_device_eval_batch_size 128 \
  --q_max_len 256 \
  --p_max_len 512 \
  --write_run trec \
  --write_embeddings True \
  --fp16 \
  --cache_dir cache
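# asyncval logs the metrics above for every saved checkpoint; set best_ckpt
# below to the step with the strongest nDCG@10. The per-checkpoint embeddings
# it writes are reused as the faiss index for hard-negative mining, e.g.:
# ls asyncval_runs/${RUN}/ | grep embeddings_checkpoint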
# hard negative mining: retrieve over the training queries with the stage-1 model
best_ckpt=3000  # best-performing step according to the asyncval evaluation above
python -m pyserini.search.faiss \
  --index asyncval_runs/${RUN}/embeddings_checkpoint-${best_ckpt} \
  --topics data/queries/generated_train_v2_with_gold_queries.tsv \
  --encoder models/${RUN}/checkpoint-${best_ckpt} \
  --tokenizer microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext \
  --encoder-class 'auto' \
  --output runs/generated_train_v2_with_gold.DR_hn.txt \
  --output-format msmarco \
  --device cuda:0 \
  --batch-size 128 --threads 12
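# Optional sanity check: the msmarco-format run is one "qid<TAB>docid<TAB>rank"
# triple per line; spot-check it and the per-query depth before building the
# stage-2 training data.
# head -n 3 runs/generated_train_v2_with_gold.DR_hn.txt
# cut -f1 runs/generated_train_v2_with_gold.DR_hn.txt | sort | uniq -c | sort -n | head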
# build stage-2 training data from the mined dense-retrieval hard negatives
cd data
train_data=generated_train_v2_with_gold
python3 build_train_hn.py \
  --tokenizer_name ${tokenizer} \
  --hn_file ../runs/generated_train_v2_with_gold.DR_hn.txt \
  --qrels qrels/${train_data}_qrels.txt \
  --queries queries/${train_data}_queries.tsv \
  --collection corpus_t_e_s_d/ct_corpus.jsonl \
  --save_to tokenized/${tokenizer}/${train_data}_DR_hn \
  --truncate 512 \
  --mp_chunk_size 10
cd ..
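# Optional sanity check: confirm the tokenized stage-2 shards were written
# before launching the second training run.
# ls data/tokenized/${tokenizer}/${train_data}_DR_hn | head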
# train with DR hard negatives (train stage 2)
RUN=generated_train_v2_with_gold_mlm_DR_hn
model=/scratch/itee/uqszhuan/TREC_CT_2023/models_pretrain/PubMedBERT_CT_MLM
tokenizer=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
  --output_dir models/${RUN} \
  --model_name_or_path ${model} \
  --save_steps 1000 \
  --train_dir data/tokenized/${tokenizer}/generated_train_v2_with_gold_DR_hn \
  --fp16 \
  --per_device_train_batch_size 8 \
  --train_n_passages 4 \
  --learning_rate 5e-6 \
  --q_max_len 256 \
  --p_max_len 512 \
  --num_train_epochs 4 \
  --overwrite_output_dir \
  --cache_dir cache \
  --run_name ${RUN} \
  --report_to wandb \
  --logging_steps 100
# asynchronously evaluate stage-2 checkpoints on the TREC CT 2022 topics
python -m asyncval \
  --query_file data/tokenized/${tokenizer}/queries/ct_2022_queries.json \
  --candidate_dir data/tokenized/${tokenizer}/corpus/corpus_t_e_s_d \
  --ckpts_dir models/${RUN} \
  --tokenizer_name_or_path ${tokenizer} \
  --qrel_file data/qrels/ct_2022_qrels.txt \
  --metrics 'RR(rel=2)' 'nDCG@10' 'P(rel=2)@10' 'Rprec(rel=2)' 'R(rel=2)@1000' \
  --output_dir asyncval_runs/${RUN} \
  --report_to wandb \
  --depth 1000 \
  --per_device_eval_batch_size 128 \
  --q_max_len 256 \
  --p_max_len 512 \
  --write_run trec \
  --write_embeddings True \
  --fp16 \
  --cache_dir cache
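# Possible final step (a sketch, not part of the original pipeline): asyncval
# writes trec-format runs per checkpoint (--write_run trec), so the run of the
# best stage-2 checkpoint can be re-scored offline with trec_eval, e.g. via
# pyserini's wrapper. <checkpoint_run> is a placeholder for the file asyncval
# actually produced.
# python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m recip_rank \
#   data/qrels/ct_2022_qrels.txt asyncval_runs/${RUN}/<checkpoint_run>.txt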