Skip to content

Commit 2251780

Browse files
committed
clean: add option to discard documents instead of tagging them
Refactor the CLI for the clean jobs: use getopts to add command-line options, and pass parameters to the job scripts through the environment.
1 parent 03b9c1d commit 2251780

File tree

4 files changed: 77 additions and 27 deletions

30.clean

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,9 @@ set -euo pipefail
1717
L=$1
1818
input_dir=dedup
1919
output_dir=clean
20-
external=false
21-
if [ $# -ge 2 ] && [ "$2" == "external" ]; then
20+
if [ "$EXTERNAL" = true ]; then
2221
input_dir=external
2322
output_dir=external_clean
24-
external=true
2523
fi
2624
INPUT=$WORKSPACE/$input_dir/$L/${L}_$SLURM_ARRAY_TASK_ID.jsonl.zst
2725
OUTPUT=$WORKSPACE/$output_dir/$L/${L}_$SLURM_ARRAY_TASK_ID.jsonl.zst
@@ -37,9 +35,12 @@ case "$L" in
3735
*)
3836
FILTER_PARAMS="-a";;
3937
esac
40-
if [ "$external" = true ]; then
38+
if [ "$EXTERNAL" = true ]; then
4139
FILTER_PARAMS="-w -m";
4240
fi
41+
if [ "$DISCARD" = true ]; then
42+
FILTER_PARAMS="$FILTER_PARAMS -f";
43+
fi
4344

4445
# Do not use -k, we don't care if documents appear in
4546
# different order

30.clean.sh

Lines changed: 60 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,48 +5,91 @@
55
source .env
66
source .checks
77
set -euo pipefail
8+
9+
usage () {
10+
echo "Usage: `basename $0` [options] <lang>"
11+
echo "Options:"
12+
echo " -d Discard documents instead of tagging"
13+
echo " -e Process external contributions"
14+
echo " -f FAILED_JOBID Re-run failed jobs of previous job array"
15+
echo " -i INDEX Job array index instead of run 'all'"
16+
echo " -h Shows this message"
17+
}
18+
19+
820
MAX_JOBS=120
921
NUM_JOBS=60
1022
TIME_RETRY=10m
1123

12-
L=$1
13-
external=""
1424
input_dir="dedup"
25+
DISCARD=false
26+
EXTERNAL=false
27+
FAILED=""
28+
INDEX=""
29+
while getopts "df:i:eh" options
30+
do
31+
case "${options}" in
32+
d) DISCARD=true;;
33+
f) FAILED=$OPTARG;;
34+
i) INDEX=$OPTARG;;
35+
e) EXTERNAL=true;;
36+
h) usage
37+
exit 0;;
38+
\?) usage >&2
39+
exit 1;;
40+
esac
41+
done
42+
43+
L=${@:$OPTIND:1}
1544

16-
if [ $# -lt 2 ] || [ "$2" == "all" ]; then
45+
if [ "$INDEX" = "" ]; then
1746
# List all the batches that need to be processed (size of the job array)
18-
if [ $# -ge 3 ] && [ "$3" == "external" ]; then
47+
if [ "$EXTERNAL" = true ]; then
1948
external="external"
2049
input_dir=external
2150
fi
2251
INDEX=1-$(ls -1 $WORKSPACE/$input_dir/$L/${L}_*.zst | sed -E 's#.*/\w{2,3}_([0-9]+)\.jsonl\.zst#\1#' | sort -n | tail -1)
23-
elif [ $# -gt 2 ] && [ "$2" == "failed" ]; then
24-
# Select only failed jobs (timeout, oom and failed status)
25-
# Create a list of batch id's separated by comma
26-
JOB=$3
52+
else
53+
INDEX=$2
54+
fi
55+
56+
if ! [ "$FAILED" = "" ]; then
57+
# Select only failed jobs (timeout, oom and failed status)
58+
# Create a comma-separated list of batch IDs
2759
INDEX=$(\
28-
sacct -j $JOB --parsable -s oom,f,to -n \
60+
sacct -j $FAILED --parsable -s oom,f,to -n \
2961
| grep -v '.batch' \
3062
| sed -E 's/[0-9]+_([0-9]+)\|.*/\1/g' \
3163
| paste -sd','
3264
)
33-
else
34-
INDEX=$2
3565
fi
3666

67+
if [ "$DISCARD" = true ]; then
68+
echo "Discarding documents instead of tagging"
69+
fi
3770
echo "Job array of index $INDEX for $L"
3871
read -p "Confirm? [y/n] " -n 1 -r
3972
if [[ ! $REPLY =~ [Yy] ]]; then echo; exit 1; fi
4073
echo
4174

75+
# Send parameters to the job scripts through the env
76+
export EXTERNAL
77+
export DISCARD
78+
JOBNAME="$L"
79+
if [ "$EXTERNAL" = true ]; then
80+
JOBNAME="external-$L"
81+
fi
82+
4283
JOB_ID=$(\
4384
SBATCH_OUTPUT="$SLURM_LOGS_DIR/%x-%A_%a.out" \
4485
sbatch --array=$INDEX \
45-
-J $external$L-clean --parsable \
46-
30.clean $L $external)
86+
-J $JOBNAME-clean --parsable \
87+
30.clean $L)
4788
echo Submitted batch job $JOB_ID
4889

49-
SBATCH_OUTPUT="$SLURM_LOGS_DIR/%x.out" \
50-
sbatch -J $external$L-stats \
51-
-d afterok:$JOB_ID \
52-
30.stats $L $external
90+
if [ "$DISCARD" = false ]; then
91+
SBATCH_OUTPUT="$SLURM_LOGS_DIR/%x.out" \
92+
sbatch -J $JOBNAME-stats \
93+
-d afterok:$JOB_ID \
94+
30.stats $L
95+
fi

30.stats

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,12 @@ set -euo pipefail
1212

1313
L=$1
1414
input_dir=clean
15-
if [ $# -ge 2 ] && [ "$2" == "external" ]; then
15+
if [ $EXTERNAL = true ]; then
1616
input_dir=external_clean
1717
fi
1818
INPUT=$WORKSPACE/$input_dir/$L/
1919
OUTPUT=$WORKSPACE/$input_dir/$L/${L}_stats
2020

21-
# Do not use -k, we don't care if documents appear in
22-
# different order
2321
zstdcat $INPUT/${L}_*.jsonl.zst \
2422
| python scripts/filter-stats.py \
2523
>$OUTPUT.tmp

filter-docs.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
parser.add_argument('-m','--minimum', action='store_true', help="Remove docs that do not meet the minimum size")
2121
parser.add_argument('-l','--language', action='store_true', help="Remove docs that do not meet the minimum correct language pct")
2222
parser.add_argument('-z','--cjk', action='store_true', help="Process CJK language")
23-
parser.add_argument('-s','--stats', action='store_true', help="Do not filter just print stats+docs for debugging")
23+
parser.add_argument('-f','--filter', action='store_true', help="Discard documents instead of adding filter metadata.")
24+
2425
args = parser.parse_args()
2526
if args.all:
2627
args.explicit = True
@@ -101,5 +102,12 @@ def filter_doc(args, doc):
101102
for line in sys.stdin:
102103
doc = orjson.loads(line)
103104
reason = filter_doc(args, doc)
104-
doc["filter"] = reason
105-
print(orjson.dumps(doc).decode('utf-8'))
105+
106+
# When not discarding, tag every document with its filter reason and print it.
107+
# When discarding (-f), print only documents whose reason is "keep", unchanged;
108+
# documents with any other filter reason are dropped silently.
109+
if not args.filter:
110+
doc["filter"] = reason
111+
print(orjson.dumps(doc).decode('utf-8'))
112+
elif reason == "keep":
113+
print(line, end='')

0 commit comments

Comments
 (0)