Skip to content

Commit

Permalink
Add exact duplicates checking scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ZJaume committed Nov 30, 2023
1 parent 2251780 commit 908fa4c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 0 deletions.
10 changes: 10 additions & 0 deletions scripts/check-duplicates.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Submit a Slurm array job that checks every shard of one language for exact
# duplicate documents (one array task per shard).
#
# Usage: scripts/check-duplicates.sh LANG DIR
#   LANG  language code; shards are read from
#         $WORKSPACE/DIR/LANG/LANG_<n>.jsonl.zst
#   DIR   directory under $WORKSPACE holding the per-language folders
#
# WORKSPACE is not set here; presumably it comes from .env (verify).
set -euo pipefail
source .env
source .checks

if [[ $# -lt 2 ]]; then
    echo "Usage: $0 LANG DIR" >&2
    exit 1
fi

L=$1
dir=$2

# Find the highest shard number with a glob instead of parsing `ls` output:
# this is safe for paths containing spaces and does not assume the language
# code is exactly 2-3 characters long.
shopt -s nullglob
max_index=0
for shard in "$WORKSPACE/$dir/$L/${L}_"*.jsonl.zst; do
    index=${shard##*_}          # drop everything up to the last underscore
    index=${index%.jsonl.zst}   # drop the extension, leaving the shard number
    if [[ ! $index =~ ^[0-9]+$ ]]; then
        continue                # not a numbered shard, ignore it
    fi
    # 10# forces base 10 so a zero-padded number is not read as octal.
    if (( 10#$index > max_index )); then
        max_index=$((10#$index))
    fi
done
shopt -u nullglob

if (( max_index == 0 )); then
    echo "No shards found matching $WORKSPACE/$dir/$L/${L}_<n>.jsonl.zst" >&2
    exit 1
fi

# The array always starts at 1, so shards are assumed to be numbered
# contiguously from 1 up to the highest index found.
sbatch -J "$L-findups-$dir" --array="1-$max_index" \
    scripts/check-duplicates.slurm "$L" "$dir"
19 changes: 19 additions & 0 deletions scripts/check-duplicates.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Slurm array task that checks a single shard for exact duplicate documents.
#
# Arguments: LANG DIR
#   The shard read is $WORKSPACE/DIR/LANG/LANG_<SLURM_ARRAY_TASK_ID>.jsonl.zst
#
# The "text" field of every JSON line is extracted with jq and the resulting
# lines are piped to scripts/exactdups.py, which reports the verdict on stderr.
#
# The job name below is only a default; scripts/check-duplicates.sh passes its
# own name with -J on the sbatch command line.
#SBATCH --job-name=findups-fix
#SBATCH --partition="small"
#SBATCH --time=72:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=1750
# %a (array task id) keeps one log per shard; with %x alone every task of the
# array would write to the same file and the per-shard verdicts would be lost.
#SBATCH --output=logs/%x-%a.out
module load cray-python/3.9.12.1
source .env
# Strict mode is enabled only after the environment setup above, as in the
# original ordering.
set -euo pipefail
module load parallel

L=${1:?usage: check-duplicates.slurm LANG DIR}
dir=${2:?usage: check-duplicates.slurm LANG DIR}

# parallel --pipe splits stdin into chunks of whole lines and runs up to four
# jq processes on them. Output order is not preserved, which does not matter
# for duplicate detection.
zstdcat "$WORKSPACE/$dir/$L/${L}_${SLURM_ARRAY_TASK_ID}.jsonl.zst" \
    | parallel --pipe -j4 jq ".text" \
    | python scripts/exactdups.py
14 changes: 14 additions & 0 deletions scripts/exactdups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from xxhash import xxh64_intdigest
import sys

def main():
    """Scan stdin for exact duplicate lines and report the result on stderr.

    Each line is stripped of trailing whitespace and reduced to a 64-bit
    xxHash digest; only the digests are kept in memory, not the lines. The
    scan stops at the first repeated digest.

    Returns:
        1 if a duplicate digest was seen, 0 if the whole input was unique.
    """
    seen = set()

    # Read raw bytes: the lines are only hashed, so decoding them to str
    # would be wasted work and would tie the script to the locale encoding.
    # NOTE(review): bytes.rstrip() strips ASCII whitespace only; this is
    # equivalent for jq-quoted input, which always ends in '"' plus newline.
    for line in sys.stdin.buffer:
        digest = xxh64_intdigest(line.rstrip())

        if digest in seen:
            sys.stderr.write("Found duplicate!\n")
            return 1

        seen.add(digest)

    sys.stderr.write("No duplicates found\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())

0 comments on commit 908fa4c

Please sign in to comment.