From 5cf6c2e770f6059cbfa45c7967fdaa1c1207dad3 Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Sun, 17 Mar 2024 22:07:23 +0000 Subject: [PATCH] add news copy benchmark --- Makefile | 3 +- README.md | 68 ++++++++--- compose.yaml | 1 + poetry.lock | 66 ++++++++++- pyproject.toml | 1 + ...st_benchmark.py => test_benchmark_core.py} | 0 tests/test_benchmark_news.py | 111 ++++++++++++++++++ text_dedup/simhash.py | 7 +- text_dedup/utils/preprocessing.py | 6 + 9 files changed, 243 insertions(+), 20 deletions(-) rename tests/{test_benchmark.py => test_benchmark_core.py} (100%) create mode 100644 tests/test_benchmark_news.py create mode 100644 text_dedup/utils/preprocessing.py diff --git a/Makefile b/Makefile index 5977388..da3527d 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,8 @@ test: up docker compose cp local:/app/cobertura.xml cobertura.xml benchmark: up - docker compose exec local poetry run python tests/test_benchmark.py + docker compose exec local poetry run python tests/test_benchmark_core.py + docker compose exec local poetry run python tests/test_benchmark_news.py spark_test: up docker compose exec local poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py diff --git a/README.md b/README.md index acd4fee..380e84c 100644 --- a/README.md +++ b/README.md @@ -220,25 +220,59 @@ INFO After : 47045 ## Benchmarks -A script is provided to benchmark some of the algorithms on `pinecone/core-2020-05-10-deduplication` can be found in `tests/test_benchmark.py`: - -| Algorithm | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy | Time | -| :--------------------------- | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ | -| MinHash Spark | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s | -| MinHash | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s | 
-| SimHash | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s | -| Exact Title | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - | -| Exact Title Matching * | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - | -| Simhash Matching * | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - | -| Document Vector Similarity * | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - | -| Hybrid Method * | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - | - -\* [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) - -\*\* Best SimHash result from `benchmarks/hyperparameter.ipynb` +### pinecone/core-2020-05-10-deduplication + +See `tests/test_benchmark_core.py` for reproduction. + +| Algorithm | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy | Time | +| :------------------------------ | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ | +| MinHash Spark | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s | +| MinHash | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s | +| SimHash** | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s | +| Exact Title | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - | +| Exact Title Matching [^1] | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - | +| Simhash Matching [^1] | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - | +| Document Vector Similarity [^1] | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - | +| Hybrid Method [^1] | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - | +| LaBSE[^2] | 0.937 | 0.923 | 0.930 | 0.943 | 0.933 | 0.919 | - | +| Multilingual USE[^2] | 0.917 | 0.907 | 0.918 | 0.927 | 0.917 | 0.909 | - | +| Multilingual E5-Base[^2] | 0.931 | 0.908 | 0.919 | 0.939 | 0.924 | 0.920 | - | +| MinHash + LSH[^2] | 0.929 | 0.902 | 0.915 | 0.938 | 0.921 | 0.918 | - | +| 
RETSim Partial-Dup[^2] | 0.945 | 0.941 | 0.945 | 0.949 | 0.945 | **0.928** | - |
+| RETSim Near-Dup[^2] | 0.928 | 0.937 | 0.942 | 0.934 | 0.935 | **0.926** | - |
+
+
+### NEWS-COPY
+
+See `tests/test_benchmark_news.py` for reproduction.
+
+Adjusted Rand Index (ARI) on NEWS-COPY dataset:
+
+| Model/Algorithm          | ARI       |
+| :----------------------- | :-------- |
+| n-gram [^3]              | 0.440     |
+| SimHash                  | 0.612     |
+| SimHash[^2]              | 0.695     |
+| MinHash                  | 0.742     |
+| MinHash[^3]              | 0.737     |
+| MinHash[^2]              | 0.783     |
+| Multilingual USE[^2]     | 0.730     |
+| Multilingual E5-Base[^2] | 0.742     |
+| S-BERT[^3]               | 0.700     |
+| RETSim Partial-Dup[^2]   | 0.831     |
+| RETSim Near-Dup[^2]      | 0.704     |
+| Re-ranking [^3]          | **0.937** |
+| Bi-encoder [^3]          | 0.915     |
+
+
+[^1]: [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)
+[^2]: [RETSim: Resilient and Efficient Text Similarity](https://arxiv.org/abs/2311.17264)
+[^3]: [Noise-Robust De-Duplication at Scale](https://www.semanticscholar.org/paper/Noise-Robust-De-Duplication-at-Scale-Silcock-D'Amico-Wong/7ca41cc5fc364b713aba5b573ae4ada801fd788a)
 
 > [!note]
-> Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
+> 1. ** marks the best SimHash result from `benchmarks/hyperparameter.ipynb`
+> 2. Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
+