Skip to content

Commit 22d6d78

Browse files
committed
update benchmarks
1 parent 556b82b commit 22d6d78

File tree

10 files changed

+605
-303
lines changed

10 files changed

+605
-303
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ serve: up build-doc
2020
cd "$(BUILDDIR)" && python3 -m http.server
2121

2222
test: up
23-
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference
23+
docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference --ignore tests/test_minhash_spark.py --ignore tests/test_benchmark.py
2424
docker compose exec regular poetry run coverage xml -o cobertura.xml
2525
docker compose exec regular poetry run coverage report -m
2626
docker compose cp regular:/app/cobertura.xml cobertura.xml

README.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,14 @@ A benchmark of different methods here can be found in `benchmarks/wiki40.ipynb`.
224224

225225
For quick reference, here are the results:
226226

227-
| Method | Precision | Recall | F1 | Time |
228-
| ------------------------------------------------------------------------------- | ---------------- | ---------------- | ---------------- | ---- |
229-
| MinHash | **0.9464** | **0.9446** | **0.9455** | 24s |
230-
| SimHash\* | 0.9011 | 0.6959 | 0.7853 | 210s |
231-
| SimHash[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.697 | 0.247 | 0.3647 | - |
232-
| Exact Title (my implementation) | 0.8302 | 0.5521 | 0.6632 | - |
233-
| Exact Title[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830 | 0.50 | 0.624 | - |
227+
| Method | Precision | Recall | F1 | Time |
228+
| ---------------------------------------------------------------------------------- | ---------- | ---------- | ---------- | ------ |
229+
| MinHash (Spark) | **0.9570** | **0.9445** | **0.9507** | 18.62s |
230+
| MinHash | **0.9594** | **0.945** | **0.9519** | 18s |
231+
| SimHash\* | 0.9007 | 0.6786 | 0.7740 | 210s |
232+
| SimHash [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.697 | 0.247 | 0.3647 | - |
233+
| Exact Title (my implementation) | 0.8302 | 0.5521 | 0.6632 | - |
234+
| Exact Title [(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830 | 0.50 | 0.624 | - |
234235

235236
\*Best SimHash result from `benchmarks/hyperparameter.ipynb`.
236237

benchmarks/pinecone.ipynb

Lines changed: 42 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,16 @@
44
"cell_type": "code",
55
"execution_count": 1,
66
"metadata": {},
7-
"outputs": [],
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13+
" from .autonotebook import tqdm as notebook_tqdm\n"
14+
]
15+
}
16+
],
817
"source": [
918
"import os\n",
1019
"import pickle\n",
@@ -41,18 +50,15 @@
4150
"metadata": {},
4251
"outputs": [
4352
{
44-
"data": {
45-
"application/vnd.jupyter.widget-view+json": {
46-
"model_id": "6f318857d2ea408586ac4d8769ba0e14",
47-
"version_major": 2,
48-
"version_minor": 0
49-
},
50-
"text/plain": [
51-
"Saving the dataset (0/1 shards): 0%| | 0/100000 [00:00<?, ? examples/s]"
52-
]
53-
},
54-
"metadata": {},
55-
"output_type": "display_data"
53+
"name": "stderr",
54+
"output_type": "stream",
55+
"text": [
56+
"Downloading data: 100%|██████████| 204M/204M [01:11<00:00, 2.84MB/s] \n",
57+
"Setting num_proc from 10 back to 1 for the train split to disable multiprocessing as it only contains one shard.\n",
58+
"Generating train split: 100000 examples [00:00, 490824.22 examples/s]\n",
59+
"Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 128388.64 examples/s]\n",
60+
"Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 823395.78 examples/s]\n"
61+
]
5662
}
5763
],
5864
"source": [
@@ -88,18 +94,11 @@
8894
"metadata": {},
8995
"outputs": [
9096
{
91-
"data": {
92-
"application/vnd.jupyter.widget-view+json": {
93-
"model_id": "efe2081669ca43e893e31d4e7636156f",
94-
"version_major": 2,
95-
"version_minor": 0
96-
},
97-
"text/plain": [
98-
"Map (num_proc=10): 0%| | 0/100000 [00:00<?, ? examples/s]"
99-
]
100-
},
101-
"metadata": {},
102-
"output_type": "display_data"
97+
"name": "stderr",
98+
"output_type": "stream",
99+
"text": [
100+
"Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 189508.39 examples/s]\n"
101+
]
103102
}
104103
],
105104
"source": [
@@ -161,11 +160,11 @@
161160
{
162161
"data": {
163162
"text/plain": [
164-
"{'Correct': 92401,\n",
165-
" 'Incorrect': 7599,\n",
163+
"{'Correct': 92396,\n",
164+
" 'Incorrect': 7604,\n",
166165
" 'Accuracy': 0.924,\n",
167-
" 'Recall': 0.9646,\n",
168-
" 'Precision': 0.4434}"
166+
" 'Recall': 0.9676,\n",
167+
" 'Precision': 0.4433}"
169168
]
170169
},
171170
"execution_count": 6,
@@ -242,10 +241,10 @@
242241
{
243242
"data": {
244243
"text/plain": [
245-
"{'Correct': 81331,\n",
246-
" 'Incorrect': 18669,\n",
247-
" 'Accuracy': 0.8133,\n",
248-
" 'Recall': 0.8328,\n",
244+
"{'Correct': 81371,\n",
245+
" 'Incorrect': 18629,\n",
246+
" 'Accuracy': 0.8137,\n",
247+
" 'Recall': 0.8332,\n",
249248
" 'Precision': 0.347}"
250249
]
251250
},
@@ -311,9 +310,9 @@
311310
"name": "stdout",
312311
"output_type": "stream",
313312
"text": [
314-
"Precision: 0.8994, Recall: 0.6783, F1: 0.7734\n",
315-
"Precision: 0.7681, Recall: 0.9335, F1: 0.8428\n",
316-
"Macro Average F1: 0.8081, Accuracy: 0.8133\n"
313+
"Precision: 0.9007, Recall: 0.6786, F1: 0.7740\n",
314+
"Precision: 0.7681, Recall: 0.9343, F1: 0.8431\n",
315+
"Macro Average F1: 0.8086, Accuracy: 0.8137\n"
317316
]
318317
}
319318
],
@@ -376,18 +375,11 @@
376375
"metadata": {},
377376
"outputs": [
378377
{
379-
"data": {
380-
"application/vnd.jupyter.widget-view+json": {
381-
"model_id": "43bbea2f588c4c4280765103d415f1f6",
382-
"version_major": 2,
383-
"version_minor": 0
384-
},
385-
"text/plain": [
386-
"Map: 0%| | 0/100000 [00:00<?, ? examples/s]"
387-
]
388-
},
389-
"metadata": {},
390-
"output_type": "display_data"
378+
"name": "stderr",
379+
"output_type": "stream",
380+
"text": [
381+
"Map: 100%|██████████| 100000/100000 [00:05<00:00, 19096.76 examples/s]\n"
382+
]
391383
},
392384
{
393385
"name": "stdout",
@@ -441,7 +433,7 @@
441433
{
442434
"data": {
443435
"text/plain": [
444-
"71"
436+
"11"
445437
]
446438
},
447439
"execution_count": 11,
@@ -481,7 +473,7 @@
481473
"name": "python",
482474
"nbconvert_exporter": "python",
483475
"pygments_lexer": "ipython3",
484-
"version": "3.10.6"
476+
"version": "3.10.13"
485477
},
486478
"orig_nbformat": 4,
487479
"vscode": {

0 commit comments

Comments
 (0)