|
4 | 4 | "cell_type": "code",
|
5 | 5 | "execution_count": 1,
|
6 | 6 | "metadata": {},
|
7 |
| - "outputs": [], |
| 7 | + "outputs": [ |
| 8 | + { |
| 9 | + "name": "stderr", |
| 10 | + "output_type": "stream", |
| 11 | + "text": [ |
| 12 | + "/Users/chenghao/miniforge3/envs/dedup/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
| 13 | + " from .autonotebook import tqdm as notebook_tqdm\n" |
| 14 | + ] |
| 15 | + } |
| 16 | + ], |
8 | 17 | "source": [
|
9 | 18 | "import os\n",
|
10 | 19 | "import pickle\n",
|
|
41 | 50 | "metadata": {},
|
42 | 51 | "outputs": [
|
43 | 52 | {
|
44 |
| - "data": { |
45 |
| - "application/vnd.jupyter.widget-view+json": { |
46 |
| - "model_id": "6f318857d2ea408586ac4d8769ba0e14", |
47 |
| - "version_major": 2, |
48 |
| - "version_minor": 0 |
49 |
| - }, |
50 |
| - "text/plain": [ |
51 |
| - "Saving the dataset (0/1 shards): 0%| | 0/100000 [00:00<?, ? examples/s]" |
52 |
| - ] |
53 |
| - }, |
54 |
| - "metadata": {}, |
55 |
| - "output_type": "display_data" |
| 53 | + "name": "stderr", |
| 54 | + "output_type": "stream", |
| 55 | + "text": [ |
| 56 | + "Downloading data: 100%|██████████| 204M/204M [01:11<00:00, 2.84MB/s] \n", |
| 57 | + "Setting num_proc from 10 back to 1 for the train split to disable multiprocessing as it only contains one shard.\n", |
| 58 | + "Generating train split: 100000 examples [00:00, 490824.22 examples/s]\n", |
| 59 | + "Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 128388.64 examples/s]\n", |
| 60 | + "Saving the dataset (1/1 shards): 100%|██████████| 100000/100000 [00:00<00:00, 823395.78 examples/s]\n" |
| 61 | + ] |
56 | 62 | }
|
57 | 63 | ],
|
58 | 64 | "source": [
|
|
88 | 94 | "metadata": {},
|
89 | 95 | "outputs": [
|
90 | 96 | {
|
91 |
| - "data": { |
92 |
| - "application/vnd.jupyter.widget-view+json": { |
93 |
| - "model_id": "efe2081669ca43e893e31d4e7636156f", |
94 |
| - "version_major": 2, |
95 |
| - "version_minor": 0 |
96 |
| - }, |
97 |
| - "text/plain": [ |
98 |
| - "Map (num_proc=10): 0%| | 0/100000 [00:00<?, ? examples/s]" |
99 |
| - ] |
100 |
| - }, |
101 |
| - "metadata": {}, |
102 |
| - "output_type": "display_data" |
| 97 | + "name": "stderr", |
| 98 | + "output_type": "stream", |
| 99 | + "text": [ |
| 100 | + "Map (num_proc=10): 100%|██████████| 100000/100000 [00:00<00:00, 189508.39 examples/s]\n" |
| 101 | + ] |
103 | 102 | }
|
104 | 103 | ],
|
105 | 104 | "source": [
|
|
161 | 160 | {
|
162 | 161 | "data": {
|
163 | 162 | "text/plain": [
|
164 |
| - "{'Correct': 92401,\n", |
165 |
| - " 'Incorrect': 7599,\n", |
| 163 | + "{'Correct': 92396,\n", |
| 164 | + " 'Incorrect': 7604,\n", |
166 | 165 | " 'Accuracy': 0.924,\n",
|
167 |
| - " 'Recall': 0.9646,\n", |
168 |
| - " 'Precision': 0.4434}" |
| 166 | + " 'Recall': 0.9676,\n", |
| 167 | + " 'Precision': 0.4433}" |
169 | 168 | ]
|
170 | 169 | },
|
171 | 170 | "execution_count": 6,
|
|
242 | 241 | {
|
243 | 242 | "data": {
|
244 | 243 | "text/plain": [
|
245 |
| - "{'Correct': 81331,\n", |
246 |
| - " 'Incorrect': 18669,\n", |
247 |
| - " 'Accuracy': 0.8133,\n", |
248 |
| - " 'Recall': 0.8328,\n", |
| 244 | + "{'Correct': 81371,\n", |
| 245 | + " 'Incorrect': 18629,\n", |
| 246 | + " 'Accuracy': 0.8137,\n", |
| 247 | + " 'Recall': 0.8332,\n", |
249 | 248 | " 'Precision': 0.347}"
|
250 | 249 | ]
|
251 | 250 | },
|
|
311 | 310 | "name": "stdout",
|
312 | 311 | "output_type": "stream",
|
313 | 312 | "text": [
|
314 |
| - "Precision: 0.8994, Recall: 0.6783, F1: 0.7734\n", |
315 |
| - "Precision: 0.7681, Recall: 0.9335, F1: 0.8428\n", |
316 |
| - "Macro Average F1: 0.8081, Accuracy: 0.8133\n" |
| 313 | + "Precision: 0.9007, Recall: 0.6786, F1: 0.7740\n", |
| 314 | + "Precision: 0.7681, Recall: 0.9343, F1: 0.8431\n", |
| 315 | + "Macro Average F1: 0.8086, Accuracy: 0.8137\n" |
317 | 316 | ]
|
318 | 317 | }
|
319 | 318 | ],
|
|
376 | 375 | "metadata": {},
|
377 | 376 | "outputs": [
|
378 | 377 | {
|
379 |
| - "data": { |
380 |
| - "application/vnd.jupyter.widget-view+json": { |
381 |
| - "model_id": "43bbea2f588c4c4280765103d415f1f6", |
382 |
| - "version_major": 2, |
383 |
| - "version_minor": 0 |
384 |
| - }, |
385 |
| - "text/plain": [ |
386 |
| - "Map: 0%| | 0/100000 [00:00<?, ? examples/s]" |
387 |
| - ] |
388 |
| - }, |
389 |
| - "metadata": {}, |
390 |
| - "output_type": "display_data" |
| 378 | + "name": "stderr", |
| 379 | + "output_type": "stream", |
| 380 | + "text": [ |
| 381 | + "Map: 100%|██████████| 100000/100000 [00:05<00:00, 19096.76 examples/s]\n" |
| 382 | + ] |
391 | 383 | },
|
392 | 384 | {
|
393 | 385 | "name": "stdout",
|
|
441 | 433 | {
|
442 | 434 | "data": {
|
443 | 435 | "text/plain": [
|
444 |
| - "71" |
| 436 | + "11" |
445 | 437 | ]
|
446 | 438 | },
|
447 | 439 | "execution_count": 11,
|
|
481 | 473 | "name": "python",
|
482 | 474 | "nbconvert_exporter": "python",
|
483 | 475 | "pygments_lexer": "ipython3",
|
484 |
| - "version": "3.10.6" |
| 476 | + "version": "3.10.13" |
485 | 477 | },
|
486 | 478 | "orig_nbformat": 4,
|
487 | 479 | "vscode": {
|
|
0 commit comments