cleanlab
diff --git a/‎token-classification-benchmark.ipynb
Lines changed: 42 additions & 102 deletions b/‎token-classification-benchmark.ipynb
Lines changed: 42 additions & 102 deletions
@@ -1,5 +1,13 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ef6184e4",
+   "metadata": {},
+   "source": [
+    "# Token Classification Benchmark "
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "fc2bb2f0",
@@ -18,25 +26,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2022-10-08 00:52:36.013979: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
-      "2022-10-08 00:52:36.014003: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
+      "2022-10-09 00:39:31.824063: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n"
      ]
     }
    ],
    "source": [
     "import numpy as np\n",
-    "import string\n",
     "import os \n",
     "from itertools import repeat \n",
     "from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline\n",
     "from cleanlab.rank import get_label_quality_scores as main_get_label_quality_scores\n",
     "from cleanlab.filter import find_label_issues as main_find_label_issues \n",
+    "from utils import readfile, get_probs, get_pred_probs \n",
     "\n",
     "from cleanlab.internal.token_classification_utils import get_sentence, filter_sentence, mapping, merge_probs\n",
     "import matplotlib.pyplot as plt \n",
     "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
-    "from token_classification_utils import get_pred_probs\n",
-    "import sklearn.metrics as metrics "
+    "from sklearn import metrics "
    ]
   },
   {
@@ -50,50 +56,11 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "2443bf37",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def readfile(filepath, sep=' '): \n",
-    "    \"\"\" \n",
-    "    Reads file in CoNLL format (IOB2) \n",
-    "    \"\"\"\n",
-    "    lines = open(filepath)\n",
-    "    \n",
-    "    data, sentence, label = [], [], []\n",
-    "    for line in lines:\n",
-    "        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == '\\n':\n",
-    "            if len(sentence) > 0:\n",
-    "                data.append((sentence, label))\n",
-    "                sentence, label = [], []\n",
-    "            continue\n",
-    "        splits = line.split(sep) \n",
-    "        word = splits[0]\n",
-    "        if len(word) > 0 and word[0].isalpha() and word.isupper():\n",
-    "            word = word[0] + word[1:].lower()\n",
-    "        sentence.append(word)\n",
-    "        label.append(entity_map[splits[-1][:-1]])\n",
-    "\n",
-    "    if len(sentence) > 0:\n",
-    "        data.append((sentence, label))\n",
-    "        \n",
-    "    given_words = [d[0] for d in data] \n",
-    "    given_labels = [d[1] for d in data] \n",
-    "    \n",
-    "    return given_words, given_labels "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
    "id": "68106a8d",
    "metadata": {},
    "outputs": [],
    "source": [
     "filepath = 'data/conll.txt'\n",
-    "entities = ['O', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']\n",
-    "entity_map = {entity: i for i, entity in enumerate(entities)} \n",
-    "\n",
     "given_words, given_labels_unmerged = readfile(filepath) \n",
     "sentences = list(map(get_sentence, given_words)) \n",
     "\n",
@@ -104,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "90ca23e4",
    "metadata": {},
    "outputs": [],
@@ -138,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "0e0100aa",
    "metadata": {},
    "outputs": [],
@@ -161,7 +128,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "b9ba86c2",
    "metadata": {},
    "outputs": [],
@@ -280,34 +247,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "edc20cee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_probs(sentence): \n",
-    "    ''' \n",
-    "    @parameter sentence: string \n",
-    "    \n",
-    "    @return probs: np.array of shape (n, m) \n",
-    "        where n is the number of tokens in the sentence and m is the number of classes. \n",
-    "        probs[i][j] is the probability that the i'th sentence belongs to entity j. The \n",
-    "        first and last probs are excluded because the first and last tokens are always \n",
-    "        [CLS] and [SEP], to represent the start and end of the sentence, respectively. \n",
-    "    '''\n",
-    "    def softmax(logit): \n",
-    "        return np.exp(logit) / np.sum(np.exp(logit)) \n",
-    "    \n",
-    "    forward = pipe.forward(pipe.preprocess(sentence)) \n",
-    "    logits = forward['logits'][0].numpy() \n",
-    "    probs = np.array([softmax(logit) for logit in logits]) \n",
-    "    probs = probs[1:-1] \n",
-    "    return probs "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "id": "c09d4762",
    "metadata": {},
    "outputs": [],
@@ -326,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "id": "5740ff1b",
    "metadata": {},
    "outputs": [],
@@ -371,7 +311,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "id": "7328bae5",
    "metadata": {},
    "outputs": [],
@@ -406,7 +346,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "id": "a991528f",
    "metadata": {},
    "outputs": [],
@@ -473,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "id": "7be9c3b9",
    "metadata": {},
    "outputs": [],
@@ -485,7 +425,7 @@
     "\n",
     "sentence_tokens = [[tokenizer.decode(token) for token in tokenizer(sentence)['input_ids']] for sentence in sentences] \n",
     "sentence_tokens = [[token.replace('#', '') for token in sentence_token][1:-1] for sentence_token in sentence_tokens] \n",
-    "sentence_probs = list(map(get_probs, sentences)) \n",
+    "sentence_probs = list(map(get_probs, repeat(pipe), sentences)) \n",
     "\n",
     "model_maps = given_maps \n",
     "sentence_probs = list(map(merge_probs, sentence_probs, repeat(model_maps)))\n",
@@ -502,7 +442,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "id": "7a7d1982",
    "metadata": {},
    "outputs": [
@@ -525,7 +465,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 12,
    "id": "296eae74",
    "metadata": {},
    "outputs": [
@@ -548,7 +488,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "id": "f76c0131",
    "metadata": {},
    "outputs": [
@@ -571,7 +511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 14,
    "id": "a52b93ba",
    "metadata": {},
    "outputs": [
@@ -594,7 +534,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "id": "faab2641",
    "metadata": {},
    "outputs": [
@@ -617,7 +557,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "ad742241",
    "metadata": {},
    "outputs": [
@@ -648,7 +588,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "2a873a0d",
    "metadata": {},
    "outputs": [],
@@ -660,7 +600,7 @@
     "\n",
     "sentence_tokens = [[tokenizer.decode(token) for token in tokenizer(sentence)['input_ids']] for sentence in sentences] \n",
     "sentence_tokens = [[token.replace('#', '') for token in sentence_token][1:-1] for sentence_token in sentence_tokens] \n",
-    "sentence_probs = list(map(get_probs, sentences)) \n",
+    "sentence_probs = list(map(get_probs, repeat(pipe), sentences)) \n",
     "\n",
     "model_maps = [4, 1, 3, 4, 1, 3, 2, 0] \n",
     "sentence_probs = list(map(merge_probs, sentence_probs, repeat(model_maps)))\n",
@@ -677,7 +617,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 18,
    "id": "568de001",
    "metadata": {},
    "outputs": [
@@ -700,7 +640,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 19,
    "id": "a9a43fed",
    "metadata": {},
    "outputs": [
@@ -723,7 +663,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 20,
    "id": "268ffb85",
    "metadata": {},
    "outputs": [
@@ -746,7 +686,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 21,
    "id": "1265eb42",
    "metadata": {},
    "outputs": [
@@ -769,7 +709,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 22,
    "id": "d89485fa",
    "metadata": {},
    "outputs": [
@@ -792,7 +732,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 23,
    "id": "d4e24ff3",
    "metadata": {},
    "outputs": [
@@ -823,7 +763,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 24,
    "id": "ae613cfe",
    "metadata": {},
    "outputs": [],
@@ -835,7 +775,7 @@
     "\n",
     "sentence_tokens = [[tokenizer.decode(token) for token in tokenizer(sentence)['input_ids']] for sentence in sentences] \n",
     "sentence_tokens = [[token.replace('#', '') for token in sentence_token][1:-1] for sentence_token in sentence_tokens] \n",
-    "sentence_probs = list(map(get_probs, sentences)) \n",
+    "sentence_probs = list(map(get_probs, repeat(pipe), sentences)) \n",
     "pred_probs = list(map(get_pred_probs, sentence_probs, sentence_tokens, given_words)) \n",
     "\n",
     "statistics = {(method, cleanlab_method): evaluate(method, cleanlab_method, pred_probs, error_unmerged, unmerged=True) \n",
@@ -849,7 +789,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 25,
    "id": "60066cc8",
    "metadata": {},
    "outputs": [
@@ -872,7 +812,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 26,
    "id": "c258b3e8",
    "metadata": {},
    "outputs": [
@@ -895,7 +835,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 27,
    "id": "19dcc23c",
    "metadata": {},
    "outputs": [
@@ -918,7 +858,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 28,
    "id": "d576c6bf",
    "metadata": {},
    "outputs": [
@@ -941,7 +881,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 29,
    "id": "ec4ba4de",
    "metadata": {},
    "outputs": [
@@ -964,7 +904,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 30,
    "id": "a052f38b",
    "metadata": {},
    "outputs": [