3 files changed: 43 additions, 0 deletions.
#!/bin/bash
#
# Submit a Slurm array job that runs scripts/check-duplicates.slurm once per
# shard of a single language.
#
# Usage: <this script> LANG DIR
#   LANG  language code; shards are looked up as
#         $WORKSPACE/DIR/LANG/LANG_<n>.jsonl.zst
#   DIR   directory below $WORKSPACE that holds the per-language folders
#
# The array covers 1..<highest shard number found>, i.e. shards are assumed
# to be numbered contiguously starting at 1.
# WORKSPACE is presumably provided by .env (not visible here) - TODO confirm.
set -euo pipefail
source .env
source .checks

if (( $# < 2 )); then
  printf 'Usage: %s LANG DIR\n' "${0##*/}" >&2
  exit 2
fi

L=$1
dir=$2

shard_dir="$WORKSPACE/$dir/$L"

# Find the highest shard number by globbing instead of parsing `ls` output,
# so odd characters in paths and language codes of any length are handled.
last=0
for shard in "$shard_dir/${L}_"*.jsonl.zst; do
  # An unmatched glob stays literal; skip it.
  [[ -e "$shard" ]] || continue
  n=${shard##*_}
  n=${n%.jsonl.zst}
  # Ignore files whose suffix is not a plain number.
  [[ "$n" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so zero-padded numbers are not read as octal.
  if (( 10#$n > last )); then
    last=$((10#$n))
  fi
done

if (( last == 0 )); then
  printf 'No shards matching %s found\n' "$shard_dir/${L}_<n>.jsonl.zst" >&2
  exit 1
fi

sbatch -J "${L}-findups-${dir}" --array="1-${last}" \
  scripts/check-duplicates.slurm "$L" "$dir"
Original file line number Diff line number Diff line change
#!/bin/bash
#SBATCH --job-name=findups-fix
#SBATCH --partition="small"
#SBATCH --time=72:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=1750
#SBATCH --output=logs/%x.out
#
# Slurm array task: check one shard for exact duplicate documents.
#
# Arguments:
#   $1  language code (also the shard file-name prefix)
#   $2  directory below $WORKSPACE that holds the per-language folders
# Environment:
#   SLURM_ARRAY_TASK_ID  selects the shard ${1}_<id>.jsonl.zst
#   WORKSPACE            presumably provided by .env - TODO confirm
#
# The job fails (non-zero exit) if any pipeline stage fails, which includes
# scripts/exactdups.py exiting with status 1.
#
# NOTE(review): the directives must be spelled "#SBATCH" with no space after
# "#", otherwise Slurm treats them as ordinary comments.
# NOTE(review): every array task writes to the same logs/%x.out; consider
# adding %a to the pattern so results can be attributed to a shard.
module load cray-python/3.9.12.1
source .env
# Strict mode is switched on only after the first module load and .env,
# keeping the original statement order.
set -euo pipefail
module load parallel

L=$1
dir=$2

shard="$WORKSPACE/$dir/$L/${L}_${SLURM_ARRAY_TASK_ID}.jsonl.zst"

# jq prints each document's .text as a JSON string on a single line, so the
# duplicate checker sees one line per document. Output order across the four
# parallel jq workers is not preserved, which does not matter for a pure
# "is there any duplicate" check.
zstdcat "$shard" \
  | parallel --pipe -j4 jq .text \
  | python scripts/exactdups.py
Original file line number Diff line number Diff line change
"""Report whether standard input contains two identical lines.

Each line is stripped of trailing whitespace and reduced to a 64-bit xxHash
digest; only the digests are kept in memory. Because lines are compared by
digest rather than by content, a hash collision would be reported as a
duplicate.

Exit status: 1 if a duplicate is found, 0 otherwise. The verdict is written
to standard error.
"""
import sys


def contains_duplicate(lines, digest):
    """Return True as soon as two lines share the same digest.

    Args:
        lines: iterable of strings; trailing whitespace (including the
            newline) is ignored when comparing.
        digest: callable mapping a string to a hashable value.

    Returns:
        True if some line's digest was already produced by an earlier line,
        False if the iterable is exhausted without a repeat.
    """
    seen = set()
    for line in lines:
        key = digest(line.rstrip())
        if key in seen:
            return True
        seen.add(key)
    return False


def main():
    """Scan stdin, report the verdict on stderr and exit 1 on a duplicate."""
    # Imported here so the detection logic above can be imported and tested
    # without the third-party xxhash package being installed.
    from xxhash import xxh64_intdigest

    if contains_duplicate(sys.stdin, xxh64_intdigest):
        sys.stderr.write("Found duplicate!\n")
        sys.exit(1)

    sys.stderr.write("No duplicates found\n")


if __name__ == "__main__":
    main()
You can’t perform that action at this time.
0 commit comments