Skip to content

Commit 759e600

Browse files
committed
minor fixes
1 parent 22d6d78 commit 759e600

25 files changed

+170
-894
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,8 @@ repos:
2121
rev: v0.1.4
2222
hooks:
2323
- id: ruff-format
24+
- repo: https://github.com/PyCQA/bandit
25+
rev: 1.7.8
26+
hooks:
27+
- id: bandit
28+
args: [--skip, B101]

regular.Dockerfile renamed to Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
FROM python:3.10-slim
22

3-
RUN apt-get update && apt-get install -y git gcc curl
3+
RUN apt-get update && apt-get install -y git gcc curl openjdk-17-jdk openjdk-17-jre-headless && apt-get clean && rm -rf /var/lib/apt/lists/*
44
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
5-
RUN pip install wheel build poetry && poetry config virtualenvs.create false
5+
RUN pip install poetry==1.8.2 pyspark==3.5.1 wheel==0.42.0 build==1.1.1 && poetry config virtualenvs.create false
66

77
WORKDIR /app
88
RUN git clone https://github.com/google-research/deduplicate-text-datasets.git
@@ -14,5 +14,6 @@ WORKDIR /app
1414
COPY text_dedup /app/text_dedup
1515
COPY pyproject.toml /app
1616
COPY poetry.lock /app
17+
COPY log4j.properties /app
1718
COPY README.md /app/README.md
1819
RUN poetry install

Makefile

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,30 @@ SPHINXOPTS ?=
22
SPHINXBUILD ?= sphinx-build
33
SOURCEDIR = docs/source
44
BUILDDIR = docs/build
5-
ENV = regular
65

76
build:
8-
docker compose build $(ENV)
7+
docker compose build
98

109
up:
11-
docker compose up $(ENV) --detach
10+
docker compose up --detach
1211

1312
down:
14-
docker compose down $(ENV)
13+
docker compose down
1514

1615
build-doc: up
17-
docker compose run $(ENV) $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
16+
# `docker compose run` takes the service name before the command; run sphinx-build inside the `local` service used by the other targets.
docker compose run local $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
1817

1918
serve: up build-doc
2019
cd "$(BUILDDIR)" && python3 -m http.server
2120

2221
test: up
23-
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference --ignore tests/test_minhash_spark.py --ignore tests/test_benchmark.py
24-
docker compose exec regular poetry run coverage xml -o cobertura.xml
25-
docker compose exec regular poetry run coverage report -m
26-
docker compose cp regular:/app/cobertura.xml cobertura.xml
22+
docker compose exec local poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore tests/test_benchmark.py
23+
docker compose exec local poetry run coverage xml -o cobertura.xml
24+
docker compose exec local poetry run coverage report -m
25+
docker compose cp local:/app/cobertura.xml cobertura.xml
2726

2827
spark_test: up
29-
docker compose exec spark poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py
28+
docker compose exec local poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py
3029

3130
clean:
3231
docker system prune -a

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ DEBUG __main__ - ---------------------------------------------------------------
9191
9292
```
9393

94-
Or take a look at `reference/bigcode-v2/run.sh` on how to run the job with GCP DataProc.
94+
Or take a look at [bigcode-v2/run.sh](https://github.com/bigcode-project/bigcode-dataset/blob/main/near_deduplication/bigcode-v2/run.sh) to see how to run the job with GCP Dataproc.
9595

9696
### Suffix Array Substring Exact Deduplication
9797

@@ -224,7 +224,7 @@ A benchmark of different methods here can be found in `benchmarks/wiki40.ipynb`.
224224

225225
For quick reference, here are the results:
226226

227-
| Method | Precision | Recall | F1 | Time |
227+
| Method | Precision | Recall | F1** | Time |
228228
| ---------------------------------------------------------------------------------- | ---------- | ---------- | ---------- | ------ |
229229
| MinHash (Spark) | **0.9570** | **0.9445** | **0.9507** | 18.62s |
230230
| MinHash | **0.9594** | **0.945** | **0.9519** | 18s |
@@ -235,6 +235,8 @@ For quick reference, here are the results:
235235

236236
\*Best SimHash result from `benchmarks/hyperparameter.ipynb`.
237237

238+
\*\* F1 on duplicates as positives
239+
238240
<!-- ## FAQ
239241
240242
### Why use scripts instead of OOD classes and functions?

cobertura.xml

Lines changed: 98 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<?xml version="1.0" ?>
2-
<coverage version="7.4.3" timestamp="1710641041526" lines-valid="934" lines-covered="630" line-rate="0.6745" branches-valid="402" branches-covered="150" branch-rate="0.3731" complexity="0">
2+
<coverage version="7.4.3" timestamp="1710675264933" lines-valid="934" lines-covered="630" line-rate="0.6745" branches-valid="402" branches-covered="150" branch-rate="0.3731" complexity="0">
33
<!-- Generated by coverage.py: https://coverage.readthedocs.io/en/7.4.3 -->
44
<!-- Based on https://raw.githubusercontent.com/cobertura/web/master/htdocs/xml/coverage-04.dtd -->
55
<sources>
@@ -362,143 +362,143 @@
362362
<line number="36" hits="1"/>
363363
<line number="37" hits="1"/>
364364
<line number="38" hits="1"/>
365-
<line number="40" hits="1"/>
366-
<line number="63" hits="1"/>
367-
<line number="66" hits="1"/>
365+
<line number="41" hits="1"/>
366+
<line number="64" hits="1"/>
368367
<line number="67" hits="1"/>
369-
<line number="84" hits="1"/>
368+
<line number="68" hits="1"/>
370369
<line number="85" hits="1"/>
371370
<line number="86" hits="1"/>
372-
<line number="88" hits="1"/>
371+
<line number="87" hits="1"/>
373372
<line number="89" hits="1"/>
374373
<line number="90" hits="1"/>
375374
<line number="91" hits="1"/>
376375
<line number="92" hits="1"/>
377-
<line number="93" hits="1" branch="true" condition-coverage="100% (2/2)"/>
378-
<line number="94" hits="1"/>
376+
<line number="93" hits="1"/>
377+
<line number="94" hits="1" branch="true" condition-coverage="100% (2/2)"/>
379378
<line number="95" hits="1"/>
380379
<line number="96" hits="1"/>
381380
<line number="97" hits="1"/>
382-
<line number="98" hits="1" branch="true" condition-coverage="100% (2/2)"/>
383-
<line number="99" hits="1"/>
384-
<line number="101" hits="1"/>
385-
<line number="103" hits="1"/>
386-
<line number="105" hits="1"/>
387-
<line number="107" hits="1"/>
381+
<line number="98" hits="1"/>
382+
<line number="99" hits="1" branch="true" condition-coverage="100% (2/2)"/>
383+
<line number="100" hits="1"/>
384+
<line number="102" hits="1"/>
385+
<line number="104" hits="1"/>
386+
<line number="106" hits="1"/>
388387
<line number="108" hits="1"/>
389388
<line number="109" hits="1"/>
390389
<line number="110" hits="1"/>
391390
<line number="111" hits="1"/>
392-
<line number="113" hits="1"/>
393-
<line number="127" hits="1"/>
391+
<line number="112" hits="1"/>
392+
<line number="114" hits="1"/>
394393
<line number="128" hits="1"/>
395-
<line number="130" hits="1" branch="true" condition-coverage="100% (2/2)"/>
396-
<line number="131" hits="1"/>
397-
<line number="132" hits="1" branch="true" condition-coverage="100% (2/2)"/>
398-
<line number="133" hits="1"/>
399-
<line number="135" hits="1"/>
400-
<line number="137" hits="1"/>
401-
<line number="139" hits="1"/>
402-
<line number="153" hits="1"/>
394+
<line number="129" hits="1"/>
395+
<line number="131" hits="1" branch="true" condition-coverage="100% (2/2)"/>
396+
<line number="132" hits="1"/>
397+
<line number="133" hits="1" branch="true" condition-coverage="100% (2/2)"/>
398+
<line number="134" hits="1"/>
399+
<line number="136" hits="1"/>
400+
<line number="138" hits="1"/>
401+
<line number="140" hits="1"/>
403402
<line number="154" hits="1"/>
404-
<line number="155" hits="1" branch="true" condition-coverage="100% (2/2)"/>
403+
<line number="155" hits="1"/>
405404
<line number="156" hits="1" branch="true" condition-coverage="100% (2/2)"/>
406-
<line number="157" hits="1"/>
407-
<line number="159" hits="1"/>
405+
<line number="157" hits="1" branch="true" condition-coverage="100% (2/2)"/>
406+
<line number="158" hits="1"/>
408407
<line number="160" hits="1"/>
409-
<line number="163" hits="1"/>
410-
<line number="191" hits="1"/>
408+
<line number="161" hits="1"/>
409+
<line number="164" hits="1"/>
411410
<line number="192" hits="1"/>
412-
<line number="194" hits="1" branch="true" condition-coverage="100% (2/2)"/>
413-
<line number="195" hits="1"/>
411+
<line number="193" hits="1"/>
412+
<line number="195" hits="1" branch="true" condition-coverage="100% (2/2)"/>
414413
<line number="196" hits="1"/>
415414
<line number="197" hits="1"/>
416415
<line number="198" hits="1"/>
417416
<line number="199" hits="1"/>
418-
<line number="208" hits="1"/>
419-
<line number="210" hits="1"/>
420-
<line number="211" hits="1" branch="true" condition-coverage="100% (2/2)"/>
421-
<line number="212" hits="1"/>
422-
<line number="213" hits="1" branch="true" condition-coverage="100% (2/2)"/>
423-
<line number="214" hits="1"/>
424-
<line number="216" hits="1"/>
425-
<line number="219" hits="1"/>
426-
<line number="244" hits="1"/>
417+
<line number="200" hits="1"/>
418+
<line number="209" hits="1"/>
419+
<line number="211" hits="1"/>
420+
<line number="212" hits="1" branch="true" condition-coverage="100% (2/2)"/>
421+
<line number="213" hits="1"/>
422+
<line number="214" hits="1" branch="true" condition-coverage="100% (2/2)"/>
423+
<line number="215" hits="1"/>
424+
<line number="217" hits="1"/>
425+
<line number="220" hits="1"/>
427426
<line number="245" hits="1"/>
428427
<line number="246" hits="1"/>
429-
<line number="249" hits="1"/>
430-
<line number="273" hits="1" branch="true" condition-coverage="100% (2/2)"/>
431-
<line number="274" hits="1"/>
428+
<line number="247" hits="1"/>
429+
<line number="250" hits="1"/>
430+
<line number="274" hits="1" branch="true" condition-coverage="100% (2/2)"/>
432431
<line number="275" hits="1"/>
433432
<line number="276" hits="1"/>
434433
<line number="277" hits="1"/>
435-
<line number="280" hits="1"/>
436-
<line number="315" hits="1" branch="true" condition-coverage="100% (2/2)"/>
434+
<line number="278" hits="1"/>
435+
<line number="281" hits="1"/>
437436
<line number="316" hits="1" branch="true" condition-coverage="100% (2/2)"/>
438-
<line number="317" hits="1"/>
439-
<line number="318" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="319"/>
440-
<line number="319" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="320,326"/>
441-
<line number="320" hits="0"/>
442-
<line number="326" hits="1"/>
443-
<line number="329" hits="1"/>
437+
<line number="317" hits="1" branch="true" condition-coverage="100% (2/2)"/>
438+
<line number="318" hits="1"/>
439+
<line number="319" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="320"/>
440+
<line number="320" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="321,327"/>
441+
<line number="321" hits="0"/>
442+
<line number="327" hits="1"/>
444443
<line number="330" hits="1"/>
445444
<line number="331" hits="1"/>
446445
<line number="332" hits="1"/>
447446
<line number="333" hits="1"/>
448-
<line number="339" hits="0"/>
447+
<line number="334" hits="1"/>
449448
<line number="340" hits="0"/>
450449
<line number="341" hits="0"/>
451-
<line number="344" hits="0"/>
452-
<line number="346" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="347,453"/>
453-
<line number="347" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="348,363"/>
454-
<line number="348" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="349,351"/>
455-
<line number="349" hits="0"/>
456-
<line number="351" hits="0"/>
457-
<line number="363" hits="0"/>
458-
<line number="365" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="366,381"/>
459-
<line number="366" hits="0"/>
460-
<line number="381" hits="0"/>
450+
<line number="342" hits="0"/>
451+
<line number="345" hits="0"/>
452+
<line number="347" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="348,454"/>
453+
<line number="348" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="349,364"/>
454+
<line number="349" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="350,352"/>
455+
<line number="350" hits="0"/>
456+
<line number="352" hits="0"/>
457+
<line number="364" hits="0"/>
458+
<line number="366" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="367,382"/>
459+
<line number="367" hits="0"/>
461460
<line number="382" hits="0"/>
462-
<line number="383" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,419"/>
463-
<line number="384" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="383,390"/>
464-
<line number="390" hits="0"/>
465-
<line number="393" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,401"/>
466-
<line number="401" hits="0"/>
467-
<line number="403" hits="0"/>
468-
<line number="405" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="406,416"/>
469-
<line number="406" hits="0"/>
470-
<line number="409" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="410,414"/>
471-
<line number="410" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="411,412"/>
472-
<line number="411" hits="0"/>
473-
<line number="412" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="409,413"/>
474-
<line number="413" hits="0"/>
461+
<line number="383" hits="0"/>
462+
<line number="384" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="385,420"/>
463+
<line number="385" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,391"/>
464+
<line number="391" hits="0"/>
465+
<line number="394" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="385,402"/>
466+
<line number="402" hits="0"/>
467+
<line number="404" hits="0"/>
468+
<line number="406" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="407,417"/>
469+
<line number="407" hits="0"/>
470+
<line number="410" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="411,415"/>
471+
<line number="411" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="412,413"/>
472+
<line number="412" hits="0"/>
473+
<line number="413" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="410,414"/>
475474
<line number="414" hits="0"/>
476-
<line number="416" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="393,417"/>
477-
<line number="417" hits="0"/>
478-
<line number="419" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="420,441"/>
479-
<line number="420" hits="0"/>
475+
<line number="415" hits="0"/>
476+
<line number="417" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="394,418"/>
477+
<line number="418" hits="0"/>
478+
<line number="420" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="421,442"/>
480479
<line number="421" hits="0"/>
481-
<line number="422" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,429"/>
482-
<line number="429" hits="0"/>
480+
<line number="422" hits="0"/>
481+
<line number="423" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,430"/>
483482
<line number="430" hits="0"/>
484-
<line number="434" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,419"/>
485-
<line number="441" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,448"/>
486-
<line number="442" hits="0"/>
483+
<line number="431" hits="0"/>
484+
<line number="435" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,420"/>
485+
<line number="442" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="443,449"/>
487486
<line number="443" hits="0"/>
488-
<line number="444" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="441,445"/>
489-
<line number="445" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="441,446"/>
490-
<line number="446" hits="0"/>
491-
<line number="448" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="346,449"/>
492-
<line number="449" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="448,450"/>
493-
<line number="450" hits="0"/>
487+
<line number="444" hits="0"/>
488+
<line number="445" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,446"/>
489+
<line number="446" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,447"/>
490+
<line number="447" hits="0"/>
491+
<line number="449" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="347,450"/>
492+
<line number="450" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="449,451"/>
494493
<line number="451" hits="0"/>
495-
<line number="453" hits="0"/>
496-
<line number="454" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="455,457"/>
497-
<line number="455" hits="0"/>
498-
<line number="457" hits="0"/>
494+
<line number="452" hits="0"/>
495+
<line number="454" hits="0"/>
496+
<line number="455" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="456,458"/>
497+
<line number="456" hits="0"/>
499498
<line number="458" hits="0"/>
500-
<line number="461" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="462"/>
501-
<line number="462" hits="0"/>
499+
<line number="459" hits="0"/>
500+
<line number="462" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="464"/>
501+
<line number="464" hits="0"/>
502502
</lines>
503503
</class>
504504
<class name="suffix_array.py" filename="text_dedup/suffix_array.py" complexity="0" line-rate="0.5597" branch-rate="0.4255">
@@ -661,8 +661,8 @@
661661
<line number="403" hits="0"/>
662662
<line number="405" hits="0"/>
663663
<line number="406" hits="0"/>
664-
<line number="409" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="410"/>
665-
<line number="410" hits="0"/>
664+
<line number="409" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="411"/>
665+
<line number="411" hits="0"/>
666666
</lines>
667667
</class>
668668
</classes>

compose.yaml

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,11 @@
11
services:
2-
regular:
3-
image: regular
2+
local:
3+
image: local
44
build:
55
context: .
6-
dockerfile: regular.Dockerfile
6+
dockerfile: Dockerfile
77
tty: true
88
volumes:
99
- ./docs:/app/docs
10-
- ./reference:/app/reference
11-
- ./tests:/app/tests
12-
- ./text_dedup:/app/text_dedup
13-
14-
spark:
15-
image: spark
16-
build:
17-
context: .
18-
dockerfile: spark.Dockerfile
19-
tty: true
20-
volumes:
2110
- ./tests:/app/tests
2211
- ./text_dedup:/app/text_dedup

0 commit comments

Comments
 (0)