From 5cf6c2e770f6059cbfa45c7967fdaa1c1207dad3 Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Sun, 17 Mar 2024 22:07:23 +0000 Subject: [PATCH] add news copy benchmark --- Makefile | 3 +- README.md | 68 ++++++++--- compose.yaml | 1 + poetry.lock | 66 ++++++++++- pyproject.toml | 1 + ...st_benchmark.py => test_benchmark_core.py} | 0 tests/test_benchmark_news.py | 111 ++++++++++++++++++ text_dedup/simhash.py | 7 +- text_dedup/utils/preprocessing.py | 6 + 9 files changed, 243 insertions(+), 20 deletions(-) rename tests/{test_benchmark.py => test_benchmark_core.py} (100%) create mode 100644 tests/test_benchmark_news.py create mode 100644 text_dedup/utils/preprocessing.py diff --git a/Makefile b/Makefile index 5977388..da3527d 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,8 @@ test: up docker compose cp local:/app/cobertura.xml cobertura.xml benchmark: up - docker compose exec local poetry run python tests/test_benchmark.py + docker compose exec local poetry run python tests/test_benchmark_core.py + docker compose exec local poetry run python tests/test_benchmark_news.py spark_test: up docker compose exec local poetry run pytest -vvv -s --doctest-modules tests/test_minhash_spark.py diff --git a/README.md b/README.md index acd4fee..380e84c 100644 --- a/README.md +++ b/README.md @@ -220,25 +220,59 @@ INFO After : 47045 ## Benchmarks -A script is provided to benchmark some of the algorithms on `pinecone/core-2020-05-10-deduplication` can be found in `tests/test_benchmark.py`: - -| Algorithm | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy | Time | -| :--------------------------- | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ | -| MinHash Spark | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s | -| MinHash | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s | 
-| SimHash | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s | -| Exact Title | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - | -| Exact Title Matching * | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - | -| Simhash Matching * | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - | -| Document Vector Similarity * | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - | -| Hybrid Method * | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - | - -\* [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) - -\*\* Best SimHash result from `benchmarks/hyperparameter.ipynb` +### pinecone/core-2020-05-10-deduplication + +See `tests/test_benchmark_core.py` for reproduction. + +| Algorithm | Precision (Duplicates) | Recall (Duplicates) | Precision (Non Duplicates) | Recall (Non Duplicates) | Macro F1 score | Accuracy | Time | +| :------------------------------ | ---------------------: | ------------------: | -------------------------: | ----------------------: | -------------: | ---------: | :------ | +| MinHash Spark | 0.957 | 0.9445 | 0.9471 | 0.959 | **0.952** | **0.9202** | 698.76s | +| MinHash | 0.9594 | 0.9445 | 0.9474 | 0.9616 | **0.9534** | **0.924** | 18.80s | +| SimHash** | 0.9007 | 0.6786 | 0.7681 | 0.9343 | 0.8344 | 0.8137 | 253.94s | +| Exact Title | 0.8302 | 0.5521 | 0.7098 | 0.9065 | 0.77 | 0.7456 | - | +| Exact Title Matching [^1] | 0.830 | 0.50 | 0.709 | 0.992 | 0.757 | 0.746 | - | +| Simhash Matching [^1] | 0.697 | 0.247 | 0.598 | 0.985 | 0.631 | 0.616 | - | +| Document Vector Similarity [^1] | 0.912 | 0.779 | 0.861 | 0.986 | 0.885 | 0.883 | - | +| Hybrid Method [^1] | 0.908 | 0.828 | 0.899 | 0.979 | 0.904 | 0.903 | - | +| LaBSE[^2] | 0.937 | 0.923 | 0.930 | 0.943 | 0.933 | 0.919 | - | +| Multilingual USE[^2] | 0.917 | 0.907 | 0.918 | 0.927 | 0.917 | 0.909 | - | +| Multilingual E5-Base[^2] | 0.931 | 0.908 | 0.919 | 0.939 | 0.924 | 0.920 | - | +| MinHash + LSH[^2] | 0.929 | 0.902 | 0.915 | 0.938 | 0.921 | 0.918 | - | +| 
RETSim Partial-Dup[^2] | 0.945 | 0.941 | 0.945 | 0.949 | 0.945 | **0.928** | - |
+| RETSim Near-Dup[^2] | 0.928 | 0.937 | 0.942 | 0.934 | 0.935 | **0.926** | - |
+
+
+### NEWS-COPY
+
+See `tests/test_benchmark_news.py` for reproduction.
+
+Adjusted Rand Index (ARI) on NEWS-COPY dataset:
+
+| Model/Algorithm          | ARI       |
+| :----------------------- | :-------- |
+| n-gram [^3]              | 0.440     |
+| SimHash                  | 0.612     |
+| SimHash[^2]              | 0.695     |
+| MinHash                  | 0.742     |
+| MinHash[^3]              | 0.737     |
+| MinHash[^2]              | 0.783     |
+| Multilingual USE[^2]     | 0.730     |
+| Multilingual E5-Base[^2] | 0.742     |
+| S-BERT[^3]               | 0.700     |
+| RETSim Partial-Dup[^2]   | 0.831     |
+| RETSim Near-Dup[^2]      | 0.704     |
+| Re-ranking [^3]          | **0.937** |
+| Bi-encoder [^3]          | 0.915     |
+
+
+[^1]: [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)
+[^2]: [RETSim: Resilient and Efficient Text Similarity](https://arxiv.org/abs/2311.17264)
+[^3]: [Noise-Robust De-Duplication at Scale](https://www.semanticscholar.org/paper/Noise-Robust-De-Duplication-at-Scale-Silcock-D'Amico-Wong/7ca41cc5fc364b713aba5b573ae4ada801fd788a)
 
 > [!note]
-> Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
+> 1. ** marks the best SimHash result from `benchmarks/hyperparameter.ipynb`
+> 2. Spark implementation has some overhead for small datasets, so I recommend using the script only when you have a large dataset and enough compute resources.
+