ChenghaoMou
diff --git a/Makefile b/Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 8 additions & 7 deletions
diff --git a/benchmarks/pinecone.ipynb b/benchmarks/pinecone.ipynb
Lines changed: 42 additions & 50 deletions
@@ -20,7 +20,7 @@ serve: up build-doc
 	cd "$(BUILDDIR)" && python3 -m http.server
 
 test: up
-	docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference
+	docker compose exec regular poetry run coverage run -m pytest -vvv -s --doctest-modules . --ignore deduplicate-text-datasets --ignore docs --ignore text_dedup/minhash_spark.py --ignore reference --ignore tests/test_minhash_spark.py --ignore tests/test_benchmark.py
 	docker compose exec regular poetry run coverage xml -o cobertura.xml
 	docker compose exec regular poetry run coverage report -m
 	docker compose cp regular:/app/cobertura.xml cobertura.xml
 
@@ -224,13 +224,14 @@ A benchmark of different methods here can be found in `benchmarks/wiki40.ipynb`.
 
 For quick reference, here are the results:
 
-| Method                                                                          | Precision        | Recall           | F1               | Time |
-| ------------------------------------------------------------------------------- | ---------------- | ---------------- | ---------------- | ---- |
-| MinHash                                                                         | **0.9464** | **0.9446** | **0.9455** | 24s  |
-| SimHash\*                                                                       | 0.9011           | 0.6959           | 0.7853           | 210s |
-| SimHash[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)     | 0.697            | 0.247            | 0.3647           | -    |
-| Exact Title (my implementation)                                                 | 0.8302           | 0.5521           | 0.6632           | -    |
-| Exact Title[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830            | 0.50             | 0.624            | -    |
+| Method                                                                             | Precision  | Recall     | F1         | Time   |
+| ---------------------------------------------------------------------------------- | ---------- | ---------- | ---------- | ------ |
+| MinHash (Spark)                                                                    | **0.9570** | **0.9445** | **0.9507** | 18.62s |
+| MinHash                                                                            | **0.9594** | **0.945**  | **0.9519** | 18s    |
+| SimHash\*                                                                          | 0.9007     | 0.6786     | 0.7740     | 210s   |
+| SimHash[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113)     | 0.697      | 0.247      | 0.3647     | -      |
+| Exact Title (my implementation)                                                    | 0.8302     | 0.5521     | 0.6632     | -      |
+| Exact Title[(Gyawali et al., LREC 2020)](https://aclanthology.org/2020.lrec-1.113) | 0.830      | 0.50       | 0.624      | -      |
 
 \*Best SimHash result from `benchmarks/hyperparameter.ipynb`.
 
 
@@ -4,7 +4,16 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "import pickle\n",
@@ -41,18 +50,15 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6f318857d2ea408586ac4d8769ba0e14",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Downloading data: 100%|██████████| 204M/204M [01:11<00:00, 2.84MB/s] \n",
+      "Setting num_proc from 10 back to 1 for the train split to disable multiprocessing as it only contains one shard.\n",
+      "Generating train split: 100000 examples [00:00, 490824.22 examples/s]\n",
+      "Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 128388.64 examples/s]\n",
+      "Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 823395.78 examples/s]\n"
+     ]
     }
    ],
    "source": [
@@ -88,18 +94,11 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "efe2081669ca43e893e31d4e7636156f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Map (num_proc=10):   0%|          | 0/100000 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 189508.39 examples/s]\n"
+     ]
     }
    ],
    "source": [
@@ -161,11 +160,11 @@
     {
      "data": {
       "text/plain": [
-       "{'Correct': 92401,\n",
-       " 'Incorrect': 7599,\n",
+       "{'Correct': 92396,\n",
+       " 'Incorrect': 7604,\n",
        " 'Accuracy': 0.924,\n",
-       " 'Recall': 0.9646,\n",
-       " 'Precision': 0.4434}"
+       " 'Recall': 0.9676,\n",
+       " 'Precision': 0.4433}"
       ]
      },
      "execution_count": 6,
@@ -242,10 +241,10 @@
     {
      "data": {
       "text/plain": [
-       "{'Correct': 81331,\n",
-       " 'Incorrect': 18669,\n",
-       " 'Accuracy': 0.8133,\n",
-       " 'Recall': 0.8328,\n",
+       "{'Correct': 81371,\n",
+       " 'Incorrect': 18629,\n",
+       " 'Accuracy': 0.8137,\n",
+       " 'Recall': 0.8332,\n",
        " 'Precision': 0.347}"
       ]
      },
@@ -311,9 +310,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Precision: 0.8994, Recall: 0.6783, F1: 0.7734\n",
-      "Precision: 0.7681, Recall: 0.9335, F1: 0.8428\n",
-      "Macro Average F1: 0.8081, Accuracy: 0.8133\n"
+      "Precision: 0.9007, Recall: 0.6786, F1: 0.7740\n",
+      "Precision: 0.7681, Recall: 0.9343, F1: 0.8431\n",
+      "Macro Average F1: 0.8086, Accuracy: 0.8137\n"
      ]
     }
    ],
@@ -376,18 +375,11 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "43bbea2f588c4c4280765103d415f1f6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Map:   0%|          | 0/100000 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map: 100%|██████████| 100000/100000 [00:05<00:00, 19096.76 examples/s]\n"
+     ]
     },
     {
      "name": "stdout",
@@ -441,7 +433,7 @@
     {
      "data": {
       "text/plain": [
-       "71"
+       "11"
       ]
      },
      "execution_count": 11,
@@ -481,7 +473,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.13"
   },
   "orig_nbformat": 4,
   "vscode": {