Commit 5cf6c2e

add news copy benchmark

1 parent c6e5787

File tree: 9 files changed (+243, -20 lines)

Makefile

Lines changed: 2 additions & 1 deletion

```diff
@@ -25,7 +25,8 @@ test: up
 	docker compose cp local:/app/cobertura.xml cobertura.xml
 
 benchmark: up
-	docker compose exec local poetry run python tests/test_benchmark.py
+	docker compose exec local poetry run python tests/test_benchmark_core.py
+	docker compose exec local poetry run python tests/test_benchmark_news.py
 
 spark_test: up
 	docker compose exec local poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py
```

README.md

Lines changed: 51 additions & 17 deletions

```diff
@@ -220,25 +220,59 @@ INFO After : 47045
 
 ## Benchmarks
 
-A script is provided to benchmark some of the algorithms on `pinecone/core-2020-05-10-deduplication` can be found in `tests/test_benchmark.py`:
-
-| Algorithm                    | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy   | Time    |
-| :--------------------------- | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ |
-| MinHash Spark                | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s |
-| MinHash                      | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s |
-| SimHash                      | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s |
-| Exact Title                  | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - |
-| Exact Title Matching *       | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - |
-| Simhash Matching *           | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - |
-| Document Vector Similarity * | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - |
-| Hybrid Method *              | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - |
-
-\* [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)
-
-\*\* Best SimHash result from `benchmarks/hyperparameter.ipynb`
+### pinecone/core-2020-05-10-deduplication
+
+See `tests/test_benchmark_core.py` for reproduction.
+
+| Algorithm                       | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy   | Time    |
+| :------------------------------ | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ |
+| MinHash Spark                   | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s |
+| MinHash                         | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s |
+| SimHash**                       | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s |
+| Exact Title                     | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - |
+| Exact Title Matching [^1]       | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - |
+| Simhash Matching [^1]           | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - |
+| Document Vector Similarity [^1] | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - |
+| Hybrid Method [^1]              | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - |
+| LaBSE [^2]                      | 0.937 | 0.923 | 0.930 | 0.943 | 0.933 | 0.919 | - |
+| Multilingual USE [^2]           | 0.917 | 0.907 | 0.918 | 0.927 | 0.917 | 0.909 | - |
+| Multilingual E5-Base [^2]       | 0.931 | 0.908 | 0.919 | 0.939 | 0.924 | 0.920 | - |
+| MinHash + LSH [^2]              | 0.929 | 0.902 | 0.915 | 0.938 | 0.921 | 0.918 | - |
+| RETSim Partial-Dup [^2]         | 0.945 | 0.941 | 0.945 | 0.949 | 0.945 | **0.928** | - |
+| RETSim Near-Dup [^2]            | 0.928 | 0.937 | 0.942 | 0.934 | 0.935 | **0.926** | - |
+
+### NEWS-COPY
+
+See `tests/test_benchmark_news.py` for reproduction.
+
+Adjusted Rand Index (ARI) on the NEWS-COPY dataset:
+
+| Model/Algorithm           | ARI       |
+| :------------------------ | :-------- |
+| n-gram [^3]               | 0.440     |
+| SimHash                   | 0.612     |
+| SimHash [^2]              | 0.695     |
+| MinHash                   | 0.742     |
+| MinHash [^3]              | 0.737     |
+| MinHash [^2]              | 0.783     |
+| Multilingual USE [^2]     | 0.730     |
+| Multilingual E5-Base [^2] | 0.742     |
+| S-BERT [^3]               | 0.700     |
+| RETSim Partial-Dup [^2]   | 0.831     |
+| RETSim Near-Dup [^2]      | 0.704     |
+| Re-ranking [^3]           | **0.937** |
+| Bi-encoder [^3]           | 0.915     |
+
+[^1]: [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)
+[^2]: [RETSim: Resilient and Efficient Text Similarity](https://arxiv.org/abs/2311.17264)
+[^3]: [Noise-Robust De-Duplication at Scale](https://www.semanticscholar.org/paper/Noise-Robust-De-Duplication-at-Scale-Silcock-D'Amico-Wong/7ca41cc5fc364b713aba5b573ae4ada801fd788a)
 
 > [!note]
-> Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
+> 1. Best SimHash result from `benchmarks/hyperparameter.ipynb`
+> 2. The Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
 
 <!-- ## FAQ
```
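For context on the metric used in the new table: ARI scores a predicted clustering against ground-truth cluster assignments and is invariant to how clusters are labeled; 1.0 means identical clusterings and values near 0 mean chance-level agreement. A minimal sketch with toy data, using the same `sklearn.metrics.adjusted_rand_score` call the news benchmark script relies on:

```python
from sklearn.metrics import adjusted_rand_score

# Toy data: label values are arbitrary ids; only the grouping matters.
ground_truth = [0, 0, 1, 1, 2]  # articles 0-1 and 2-3 are duplicates; 4 is unique
predicted = [5, 5, 7, 7, 7]     # article 4 was wrongly merged into cluster 7

print(adjusted_rand_score(ground_truth, predicted))  # ~0.55; 1.0 = perfect agreement
```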

compose.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -9,3 +9,4 @@ services:
       - ./docs:/app/docs
       - ./tests:/app/tests
       - ./text_dedup:/app/text_dedup
+      - ./data:/app/data
```

poetry.lock

Lines changed: 65 additions & 1 deletion

Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions

```diff
@@ -35,6 +35,7 @@ pytest = "^8.0.2"
 coverage = "^7.4.3"
 ruff = "^0.3.2"
 tabulate = "^0.9.0"
+scikit-learn = "^1.4.1.post1"
 
 [build-system]
 requires = ["poetry-core"]
```
tests/test_benchmark.py → tests/test_benchmark_core.py

File renamed without changes.

tests/test_benchmark_news.py

Lines changed: 111 additions & 0 deletions

```python
import os
import pickle  # nosec

import click
import datasets
import pandas as pd
from sklearn.metrics import adjusted_rand_score

from text_dedup.minhash import main as minhash_main
from text_dedup.simhash import main as simhash_main
from text_dedup.utils import IOArgs
from text_dedup.utils import MetaArgs
from text_dedup.utils import MinHashArgs
from text_dedup.utils import SimHashArgs
from text_dedup.utils.preprocessing import news_copy_preprocessing
from text_dedup.utils.timer import Timer
from text_dedup.utils.union_find import UnionFind

NUM_PROC = os.cpu_count()


def prepare_data(data_path, label_path, output_path_ds, output_path_spark):
    """Preprocess the NEWS-COPY articles, save them as a HF dataset and as
    parquet (for Spark), and return ground-truth cluster ids built from the
    labeled duplicate pairs via union-find."""
    df = pd.read_json(data_path).T.reset_index()
    labels = pd.read_json(label_path)
    id2data = []
    filename2id = {}
    uf = UnionFind()

    for i, row in df.iterrows():
        id2data.append(
            {
                "filename": str(row["id"]),
                "headline": news_copy_preprocessing(str(row["headline"])),
                "text": news_copy_preprocessing(str(row["headline"] + " " + row["article"])),
                "article": news_copy_preprocessing(str(row["article"])),
                "id": int(i),
            }
        )
        filename2id[id2data[i]["filename"]] = i

    # Each labeled row is a duplicate pair; union them into clusters.
    for i, row in labels.iterrows():
        uf.union(filename2id[row[0]], filename2id[row[1]])

    clusters = [None for _ in range(len(df))]
    for i in range(len(df)):
        clusters[i] = uf.find(filename2id[id2data[i]["filename"]])

    ds = datasets.Dataset.from_pandas(pd.DataFrame(id2data))
    ds.save_to_disk(output_path_ds)

    os.makedirs(output_path_spark, exist_ok=True)
    pd.DataFrame(id2data).to_parquet(output_path_spark + "/data.parquet")

    return clusters


def uf2results(labels, output_path):
    """Load a pickled UnionFind produced by a dedup run and score its
    clustering against the ground-truth labels with ARI."""
    with open(output_path, "rb") as f:
        uf = pickle.load(f)  # nosec

    predictions = [uf.find(i) for i in range(len(labels))]
    return adjusted_rand_score(labels, predictions)


if __name__ == "__main__":
    t = Timer()

    output_path_ds = "news_input_ds"
    output_path_spark = "news_input_spark"

    test_data = ("./data/test_inf_data.json", "./data/full_test_gt.json")
    val_data = ("./data/1955_inf_data.json", "./data/1955_gt.json")  # validation split, unused here
    labels = prepare_data(*test_data, output_path_ds, output_path_spark)

    io_args = IOArgs(
        path=output_path_ds,
        local=True,
        num_proc=NUM_PROC,
        cache_dir=".cache",
        output="./news_output_minhash",
        debug=True,
        clean_cache=True,
    )
    meta_args = MetaArgs(column="article", batch_size=10000)

    # TODO: hyperparameter tuning
    with t("MinHash"):
        ctx = click.Context(minhash_main)
        minhash_args = MinHashArgs(num_perm=256, ngram=2, min_length=0, threshold=0.45)
        io_args.output = minhash_output = "./news_output_minhash"
        ctx.invoke(
            minhash_main,
            io_args=io_args,
            meta_args=meta_args,
            minhash_args=minhash_args,
        )

    # TODO: hyperparameter tuning
    with t("SimHash"):
        ctx = click.Context(simhash_main)
        simhash_args = SimHashArgs(bit_diff=12, num_bucket=13, ngram=5)
        io_args.output = simhash_output = "./temp_output_simhash"
        ctx.invoke(
            simhash_main,
            io_args=io_args,
            meta_args=meta_args,
            simhash_args=simhash_args,
        )

    print(f"MinHash ARI: {uf2results(labels, f'{minhash_output}/uf.pkl')}")
    print(f"SimHash ARI: {uf2results(labels, f'{simhash_output}/uf.pkl')}")
```
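A note on the invocation style above: instead of shelling out to the CLI, the script calls each deduplication entry point through `click.Context.invoke`, which runs a command's callback directly with keyword arguments. A minimal, self-contained sketch of that pattern (toy command and names, not part of the repo):

```python
import click


@click.command()
@click.option("--name", default="world")
def greet(name: str):
    """Toy command standing in for minhash_main/simhash_main."""
    click.echo(f"hello {name}")


# Build a Context for the command, then invoke its callback directly;
# argv parsing is bypassed, as in the benchmark script.
ctx = click.Context(greet)
ctx.invoke(greet, name="news-copy")  # prints "hello news-copy"
```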

text_dedup/simhash.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -103,7 +103,9 @@ def __init__(self, f: int, k: int, b: int, masks: list[tuple[bitarray, int, int,
 
             self.masks.append(mask)
 
-        assert sum(self.widths) == f, "The sum of block widths must be equal to the fingerprint size"
+        assert (
+            sum(self.widths) == f
+        ), f"The sum of block widths {sum(self.widths)} must be equal to the fingerprint size {f}"
 
         prefix_width = sum(self.widths[: b - k])
         self.search_mask: bitarray = bitarray(f)
@@ -191,9 +193,12 @@ def _create_permutations(f: int, k: int, b: int) -> list[Permutation]:
     """
     block_size: int = math.ceil(f / b)
     masks: list[tuple[bitarray, int, int, int]] = []
+    b = min(b, math.ceil(f / block_size))
 
     for i in range(b):
        start, end = i * block_size, min((i + 1) * block_size, f)
+        if start >= end:
+            break
         mask: bitarray = bitarray(f)
         mask.setall(0)
         mask[start:end] = 1
```
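Why the `_create_permutations` guard is needed: `block_size = math.ceil(f / b)` can cover all `f` fingerprint bits in fewer than `b` blocks, leaving trailing zero-width blocks (e.g. `f=64, b=9` gives `block_size=8`, so the ninth block would start at bit 64). The added clamp and early `break` drop those empty blocks. A standalone sketch of the partitioning logic under that fix (illustrative re-implementation, not the library code):

```python
import math


def block_bounds(f: int, b: int) -> list[tuple[int, int]]:
    """Split f fingerprint bits into at most b non-empty contiguous blocks."""
    block_size = math.ceil(f / b)
    b = min(b, math.ceil(f / block_size))  # drop blocks that would start past bit f
    bounds = []
    for i in range(b):
        start, end = i * block_size, min((i + 1) * block_size, f)
        if start >= end:  # defensive: skip zero-width blocks
            break
        bounds.append((start, end))
    return bounds


# f=64, b=9: block_size=8, so 8 blocks already cover all 64 bits; the 9th is dropped.
print(block_bounds(64, 9))  # [(0, 8), (8, 16), ..., (56, 64)] -> widths sum to 64
```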

text_dedup/utils/preprocessing.py

Lines changed: 6 additions & 0 deletions

```python
def news_copy_preprocessing(text: str) -> str:
    # Normalize OCR'd news text: join hyphenated line breaks, flatten
    # newlines, strip punctuation, and drop non-ASCII characters.
    chars_to_remove = r'"#$%&\()*+/:;<=>@[\\]^_`{|}~.?,!\''
    text = text.replace("-\n", "").replace("\n", " ")
    text = text.translate(str.maketrans("", "", chars_to_remove))
    text = text.encode("ascii", "ignore").decode()
    return text
```
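A quick illustration of what this helper does to OCR-style input (toy example; the expected output is shown in the comment):

```python
from text_dedup.utils.preprocessing import news_copy_preprocessing

text = 'The Presi-\ndent said: "Qué sorpresa!" (reported yesterday)'
print(news_copy_preprocessing(text))
# -> The President said Qu sorpresa reported yesterday
```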
