Skip to content

Commit

Permalink
update benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
ChenghaoMou committed Mar 17, 2024
1 parent 556b82b commit 22d6d78
Show file tree
Hide file tree
Showing 10 changed files with 605 additions and 303 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ serve: up build-doc
cd "$(BUILDDIR)" && python3 -m http.server

test: up
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference --ignore tests/test_minhash_spark.py --ignore tests/test_benchmark.py
docker compose exec regular poetry run coverage xml -o cobertura.xml
docker compose exec regular poetry run coverage report -m
docker compose cp regular:/app/cobertura.xml cobertura.xml
Expand Down
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,14 @@ A benchmark of different methods here can be found in `benchmarks/wiki40.ipynb`.

For quick reference, here are the results:

| Method | Precision | Recall | F1 | Time |
| ------------------------------------------------------------------------------- | ---------------- | ---------------- | ---------------- | ---- |
| MinHash | **0.9464** | **0.9446** | **0.9455** | 24s |
| SimHash\* | 0.9011 | 0.6959 | 0.7853 | 210s |
| SimHash[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.697 | 0.247 | 0.3647 | - |
| Exact Title (my implementation) | 0.8302 | 0.5521 | 0.6632 | - |
| Exact Title[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830 | 0.50 | 0.624 | - |
| Method | Precision | Recall | F1 | Time |
| ---------------------------------------------------------------------------------- | ---------- | ---------- | ---------- | ------ |
| MinHash (Spark) | **0.9570** | **0.9445** | **0.9507** | 18.62s |
| MinHash | **0.9594** | **0.945** | **0.9519** | 18s |
| SimHash\* | 0.9007 | 0.6786 | 0.7740 | 210s |
| SimHash [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.697 | 0.247 | 0.3647 | - |
| Exact Title (my implementation) | 0.8302 | 0.5521 | 0.6632 | - |
| Exact Title [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830 | 0.50 | 0.624 | - |

\*Best SimHash result from `benchmarks/hyperparameter.ipynb`.

Expand Down
92 changes: 42 additions & 50 deletions benchmarks/pinecone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,16 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import pickle\n",
Expand Down Expand Up @@ -41,18 +50,15 @@
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6f318857d2ea408586ac4d8769ba0e14",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/100000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data: 100%|██████████| 204M/204M [01:11<00:00, 2.84MB/s] \n",
"Setting num_proc from 10 back to 1 for the train split to disable multiprocessing as it only contains one shard.\n",
"Generating train split: 100000 examples [00:00, 490824.22 examples/s]\n",
"Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 128388.64 examples/s]\n",
"Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 823395.78 examples/s]\n"
]
}
],
"source": [
Expand Down Expand Up @@ -88,18 +94,11 @@
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "efe2081669ca43e893e31d4e7636156f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=10): 0%| | 0/100000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
"name": "stderr",
"output_type": "stream",
"text": [
"Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 189508.39 examples/s]\n"
]
}
],
"source": [
Expand Down Expand Up @@ -161,11 +160,11 @@
{
"data": {
"text/plain": [
"{'Correct': 92401,\n",
" 'Incorrect': 7599,\n",
"{'Correct': 92396,\n",
" 'Incorrect': 7604,\n",
" 'Accuracy': 0.924,\n",
" 'Recall': 0.9646,\n",
" 'Precision': 0.4434}"
" 'Recall': 0.9676,\n",
" 'Precision': 0.4433}"
]
},
"execution_count": 6,
Expand Down Expand Up @@ -242,10 +241,10 @@
{
"data": {
"text/plain": [
"{'Correct': 81331,\n",
" 'Incorrect': 18669,\n",
" 'Accuracy': 0.8133,\n",
" 'Recall': 0.8328,\n",
"{'Correct': 81371,\n",
" 'Incorrect': 18629,\n",
" 'Accuracy': 0.8137,\n",
" 'Recall': 0.8332,\n",
" 'Precision': 0.347}"
]
},
Expand Down Expand Up @@ -311,9 +310,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Precision: 0.8994, Recall: 0.6783, F1: 0.7734\n",
"Precision: 0.7681, Recall: 0.9335, F1: 0.8428\n",
"Macro Average F1: 0.8081, Accuracy: 0.8133\n"
"Precision: 0.9007, Recall: 0.6786, F1: 0.7740\n",
"Precision: 0.7681, Recall: 0.9343, F1: 0.8431\n",
"Macro Average F1: 0.8086, Accuracy: 0.8137\n"
]
}
],
Expand Down Expand Up @@ -376,18 +375,11 @@
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "43bbea2f588c4c4280765103d415f1f6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/100000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
"name": "stderr",
"output_type": "stream",
"text": [
"Map: 100%|██████████| 100000/100000 [00:05<00:00, 19096.76 examples/s]\n"
]
},
{
"name": "stdout",
Expand Down Expand Up @@ -441,7 +433,7 @@
{
"data": {
"text/plain": [
"71"
"11"
]
},
"execution_count": 11,
Expand Down Expand Up @@ -481,7 +473,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.13"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
Loading

0 comments on commit 22d6d78

Please sign in to comment.