Skip to content

Commit

Permalink
minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ChenghaoMou committed Mar 17, 2024
1 parent 22d6d78 commit 759e600
Show file tree
Hide file tree
Showing 25 changed files with 170 additions and 894 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,8 @@ repos:
rev: v0.1.4
hooks:
- id: ruff-format
- repo: https://github.com/PyCQA/bandit
rev: 1.7.8
hooks:
- id: bandit
args: [--skip, B101]
5 changes: 3 additions & 2 deletions regular.Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
FROM python:3.10-slim

RUN apt-get update && apt-get install -y git gcc curl
RUN apt-get update && apt-get install -y git gcc curl openjdk-17-jdk openjdk-17-jre-headless && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
RUN pip install wheel build poetry && poetry config virtualenvs.create false
RUN pip install poetry==1.8.2 pyspark==3.5.1 wheel==0.42.0 build==1.1.1 && poetry config virtualenvs.create false

WORKDIR /app
RUN git clone https://github.com/google-research/deduplicate-text-datasets.git
Expand All @@ -14,5 +14,6 @@ WORKDIR /app
COPY text_dedup /app/text_dedup
COPY pyproject.toml /app
COPY poetry.lock /app
COPY log4j.properties /app
COPY README.md /app/README.md
RUN poetry install
19 changes: 9 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,30 @@ SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = docs/source
BUILDDIR = docs/build
ENV = regular

build:
docker compose build $(ENV)
docker compose build

up:
docker compose up $(ENV) --detach
docker compose up --detach

down:
docker compose down $(ENV)
docker compose down

build-doc: up
docker compose run $(ENV) $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
# `docker compose run` takes the service name before the command; `local` is the service defined in compose.yaml and used by the test targets.
docker compose run local $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)

serve: up build-doc
cd "$(BUILDDIR)" && python3 -m http.server

test: up
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference --ignore tests/test_minhash_spark.py --ignore tests/test_benchmark.py
docker compose exec regular poetry run coverage xml -o cobertura.xml
docker compose exec regular poetry run coverage report -m
docker compose cp regular:/app/cobertura.xml cobertura.xml
docker compose exec local poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore tests/test_benchmark.py
docker compose exec local poetry run coverage xml -o cobertura.xml
docker compose exec local poetry run coverage report -m
docker compose cp local:/app/cobertura.xml cobertura.xml

spark_test: up
docker compose exec spark poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py
docker compose exec local poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py

clean:
docker system prune -a
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ DEBUG __main__ - ---------------------------------------------------------------
```

Or take a look at `reference/bigcode-v2/run.sh` on how to run the job with GCP DataProc.
Or take a look at [bigcode-v2/run.sh](https://github.com/bigcode-project/bigcode-dataset/blob/main/near_deduplication/bigcode-v2/run.sh) to see how to run the job with GCP Dataproc.

### Suffix Array Substring Exact Deduplication

Expand Down Expand Up @@ -224,7 +224,7 @@ A benchmark of different methods here can be found in `benchmarks/wiki40.ipynb`.

For quick reference, here are the results:

| Method | Precision | Recall | F1 | Time |
| Method | Precision | Recall | F1** | Time |
| ---------------------------------------------------------------------------------- | ---------- | ---------- | ---------- | ------ |
| MinHash (Spark) | **0.9570** | **0.9445** | **0.9507** | 18.62s |
| MinHash | **0.9594** | **0.945** | **0.9519** | 18s |
Expand All @@ -235,6 +235,8 @@ For quick reference, here are the results:

\*Best SimHash result from `benchmarks/hyperparameter.ipynb`.

\*\* F1 is computed with duplicates treated as the positive class.

<!-- ## FAQ
### Why use scripts instead of OOD classes and functions?
Expand Down
196 changes: 98 additions & 98 deletions cobertura.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" ?>
<coverage version="7.4.3" timestamp="1710641041526" lines-valid="934" lines-covered="630" line-rate="0.6745" branches-valid="402" branches-covered="150" branch-rate="0.3731" complexity="0">
<coverage version="7.4.3" timestamp="1710675264933" lines-valid="934" lines-covered="630" line-rate="0.6745" branches-valid="402" branches-covered="150" branch-rate="0.3731" complexity="0">
<!-- Generated by coverage.py: https://coverage.readthedocs.io/en/7.4.3 -->
<!-- Based on https://raw.githubusercontent.com/cobertura/web/master/htdocs/xml/coverage-04.dtd -->
<sources>
Expand Down Expand Up @@ -362,143 +362,143 @@
<line number="36" hits="1"/>
<line number="37" hits="1"/>
<line number="38" hits="1"/>
<line number="40" hits="1"/>
<line number="63" hits="1"/>
<line number="66" hits="1"/>
<line number="41" hits="1"/>
<line number="64" hits="1"/>
<line number="67" hits="1"/>
<line number="84" hits="1"/>
<line number="68" hits="1"/>
<line number="85" hits="1"/>
<line number="86" hits="1"/>
<line number="88" hits="1"/>
<line number="87" hits="1"/>
<line number="89" hits="1"/>
<line number="90" hits="1"/>
<line number="91" hits="1"/>
<line number="92" hits="1"/>
<line number="93" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="94" hits="1"/>
<line number="93" hits="1"/>
<line number="94" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="95" hits="1"/>
<line number="96" hits="1"/>
<line number="97" hits="1"/>
<line number="98" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="99" hits="1"/>
<line number="101" hits="1"/>
<line number="103" hits="1"/>
<line number="105" hits="1"/>
<line number="107" hits="1"/>
<line number="98" hits="1"/>
<line number="99" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="100" hits="1"/>
<line number="102" hits="1"/>
<line number="104" hits="1"/>
<line number="106" hits="1"/>
<line number="108" hits="1"/>
<line number="109" hits="1"/>
<line number="110" hits="1"/>
<line number="111" hits="1"/>
<line number="113" hits="1"/>
<line number="127" hits="1"/>
<line number="112" hits="1"/>
<line number="114" hits="1"/>
<line number="128" hits="1"/>
<line number="130" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="131" hits="1"/>
<line number="132" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="133" hits="1"/>
<line number="135" hits="1"/>
<line number="137" hits="1"/>
<line number="139" hits="1"/>
<line number="153" hits="1"/>
<line number="129" hits="1"/>
<line number="131" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="132" hits="1"/>
<line number="133" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="134" hits="1"/>
<line number="136" hits="1"/>
<line number="138" hits="1"/>
<line number="140" hits="1"/>
<line number="154" hits="1"/>
<line number="155" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="155" hits="1"/>
<line number="156" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="157" hits="1"/>
<line number="159" hits="1"/>
<line number="157" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="158" hits="1"/>
<line number="160" hits="1"/>
<line number="163" hits="1"/>
<line number="191" hits="1"/>
<line number="161" hits="1"/>
<line number="164" hits="1"/>
<line number="192" hits="1"/>
<line number="194" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="195" hits="1"/>
<line number="193" hits="1"/>
<line number="195" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="196" hits="1"/>
<line number="197" hits="1"/>
<line number="198" hits="1"/>
<line number="199" hits="1"/>
<line number="208" hits="1"/>
<line number="210" hits="1"/>
<line number="211" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="212" hits="1"/>
<line number="213" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="214" hits="1"/>
<line number="216" hits="1"/>
<line number="219" hits="1"/>
<line number="244" hits="1"/>
<line number="200" hits="1"/>
<line number="209" hits="1"/>
<line number="211" hits="1"/>
<line number="212" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="213" hits="1"/>
<line number="214" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="215" hits="1"/>
<line number="217" hits="1"/>
<line number="220" hits="1"/>
<line number="245" hits="1"/>
<line number="246" hits="1"/>
<line number="249" hits="1"/>
<line number="273" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="274" hits="1"/>
<line number="247" hits="1"/>
<line number="250" hits="1"/>
<line number="274" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="275" hits="1"/>
<line number="276" hits="1"/>
<line number="277" hits="1"/>
<line number="280" hits="1"/>
<line number="315" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="278" hits="1"/>
<line number="281" hits="1"/>
<line number="316" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="317" hits="1"/>
<line number="318" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="319"/>
<line number="319" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="320,326"/>
<line number="320" hits="0"/>
<line number="326" hits="1"/>
<line number="329" hits="1"/>
<line number="317" hits="1" branch="true" condition-coverage="100% (2/2)"/>
<line number="318" hits="1"/>
<line number="319" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="320"/>
<line number="320" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="321,327"/>
<line number="321" hits="0"/>
<line number="327" hits="1"/>
<line number="330" hits="1"/>
<line number="331" hits="1"/>
<line number="332" hits="1"/>
<line number="333" hits="1"/>
<line number="339" hits="0"/>
<line number="334" hits="1"/>
<line number="340" hits="0"/>
<line number="341" hits="0"/>
<line number="344" hits="0"/>
<line number="346" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="347,453"/>
<line number="347" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="348,363"/>
<line number="348" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="349,351"/>
<line number="349" hits="0"/>
<line number="351" hits="0"/>
<line number="363" hits="0"/>
<line number="365" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="366,381"/>
<line number="366" hits="0"/>
<line number="381" hits="0"/>
<line number="342" hits="0"/>
<line number="345" hits="0"/>
<line number="347" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="348,454"/>
<line number="348" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="349,364"/>
<line number="349" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="350,352"/>
<line number="350" hits="0"/>
<line number="352" hits="0"/>
<line number="364" hits="0"/>
<line number="366" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="367,382"/>
<line number="367" hits="0"/>
<line number="382" hits="0"/>
<line number="383" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,419"/>
<line number="384" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="383,390"/>
<line number="390" hits="0"/>
<line number="393" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,401"/>
<line number="401" hits="0"/>
<line number="403" hits="0"/>
<line number="405" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="406,416"/>
<line number="406" hits="0"/>
<line number="409" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="410,414"/>
<line number="410" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="411,412"/>
<line number="411" hits="0"/>
<line number="412" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="409,413"/>
<line number="413" hits="0"/>
<line number="383" hits="0"/>
<line number="384" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="385,420"/>
<line number="385" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="384,391"/>
<line number="391" hits="0"/>
<line number="394" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="385,402"/>
<line number="402" hits="0"/>
<line number="404" hits="0"/>
<line number="406" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="407,417"/>
<line number="407" hits="0"/>
<line number="410" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="411,415"/>
<line number="411" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="412,413"/>
<line number="412" hits="0"/>
<line number="413" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="410,414"/>
<line number="414" hits="0"/>
<line number="416" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="393,417"/>
<line number="417" hits="0"/>
<line number="419" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="420,441"/>
<line number="420" hits="0"/>
<line number="415" hits="0"/>
<line number="417" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="394,418"/>
<line number="418" hits="0"/>
<line number="420" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="421,442"/>
<line number="421" hits="0"/>
<line number="422" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,429"/>
<line number="429" hits="0"/>
<line number="422" hits="0"/>
<line number="423" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,430"/>
<line number="430" hits="0"/>
<line number="434" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,419"/>
<line number="441" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,448"/>
<line number="442" hits="0"/>
<line number="431" hits="0"/>
<line number="435" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="exit,420"/>
<line number="442" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="443,449"/>
<line number="443" hits="0"/>
<line number="444" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="441,445"/>
<line number="445" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="441,446"/>
<line number="446" hits="0"/>
<line number="448" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="346,449"/>
<line number="449" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="448,450"/>
<line number="450" hits="0"/>
<line number="444" hits="0"/>
<line number="445" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,446"/>
<line number="446" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="442,447"/>
<line number="447" hits="0"/>
<line number="449" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="347,450"/>
<line number="450" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="449,451"/>
<line number="451" hits="0"/>
<line number="453" hits="0"/>
<line number="454" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="455,457"/>
<line number="455" hits="0"/>
<line number="457" hits="0"/>
<line number="452" hits="0"/>
<line number="454" hits="0"/>
<line number="455" hits="0" branch="true" condition-coverage="0% (0/2)" missing-branches="456,458"/>
<line number="456" hits="0"/>
<line number="458" hits="0"/>
<line number="461" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="462"/>
<line number="462" hits="0"/>
<line number="459" hits="0"/>
<line number="462" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="464"/>
<line number="464" hits="0"/>
</lines>
</class>
<class name="suffix_array.py" filename="text_dedup/suffix_array.py" complexity="0" line-rate="0.5597" branch-rate="0.4255">
Expand Down Expand Up @@ -661,8 +661,8 @@
<line number="403" hits="0"/>
<line number="405" hits="0"/>
<line number="406" hits="0"/>
<line number="409" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="410"/>
<line number="410" hits="0"/>
<line number="409" hits="1" branch="true" condition-coverage="50% (1/2)" missing-branches="411"/>
<line number="411" hits="0"/>
</lines>
</class>
</classes>
Expand Down
17 changes: 3 additions & 14 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
services:
regular:
image: regular
local:
image: local
build:
context: .
dockerfile: regular.Dockerfile
dockerfile: Dockerfile
tty: true
volumes:
- ./docs:/app/docs
- ./reference:/app/reference
- ./tests:/app/tests
- ./text_dedup:/app/text_dedup

spark:
image: spark
build:
context: .
dockerfile: spark.Dockerfile
tty: true
volumes:
- ./tests:/app/tests
- ./text_dedup:/app/text_dedup
Loading

0 comments on commit 759e600

Please sign in to comment.