
Added notebook to ingest the multi_nli dataset #22

Open · wants to merge 1 commit into base: main
293 changes: 293 additions & 0 deletions Multi-NLI/mnli.ipynb
@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "9b221552",
"metadata": {},
"outputs": [],
"source": [
"# This notebook demonstrates creating a deeplake datset from Multi NLI dataset from hugging face"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "30aec245",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset glue (C:/Users/USER/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9286ae02cebb44de805c42ae5a96a140",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# First we will import the dataset from huggingface's website using the load_dataset function\n",
"from datasets import load_dataset\n",
"dataset = load_dataset(\"glue\",\"mnli\")"
]
},
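{
"cell_type": "code",
"execution_count": null,
"id": "inspect-raw-example",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (an illustrative sketch, not an output of the original run): indexing a\n",
"# split of a Hugging Face dataset returns a plain dict of its features, so a single raw MNLI\n",
"# example can be inspected like this.\n",
"example = dataset['train'][0]\n",
"print(example['premise'])\n",
"print(example['hypothesis'])\n",
"print(example['label'])"
]
},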
{
"cell_type": "code",
"execution_count": 8,
"id": "3f37271d",
"metadata": {},
"outputs": [],
"source": [
"# We will then use deep lake to initialize an empty deeplake dataset locally\n",
"import deeplake\n",
"import numpy as np\n",
"ds = deeplake.empty('./mnli_deeplake',overwrite=True) "
]
},
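{
"cell_type": "code",
"execution_count": null,
"id": "reload-dataset-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Reference sketch (assumes the same Deep Lake API used above): once the dataset has been\n",
"# written to './mnli_deeplake', it can be reopened in a later session with deeplake.load.\n",
"# An Activeloop-hosted path of the form 'hub://<org>/<name>' could be used instead of a local folder.\n",
"if deeplake.exists('./mnli_deeplake'):\n",
"    ds_reloaded = deeplake.load('./mnli_deeplake')"
]
},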
{
"cell_type": "code",
"execution_count": 9,
"id": "99ef103a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 392702\n",
" })\n",
" validation_matched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9815\n",
" })\n",
" validation_mismatched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9832\n",
" })\n",
" test_matched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9796\n",
" })\n",
" test_mismatched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9847\n",
" })\n",
"})"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let us observe the mnli dataset.\n",
"# It is comprised of seperate dictionaries for training, testing and validating our model\n",
"# Let us observe the train part\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3e75cd23",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 392702\n",
"})"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['train']\n",
"# It contains some attributes like premise, hypothesis, label and idx\n",
"# We will extract premise and hypothesis data so that a model can be trained on those two attributes"
]
},
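{
"cell_type": "code",
"execution_count": null,
"id": "label-names-check",
"metadata": {},
"outputs": [],
"source": [
"# For reference (illustrative sketch): the integer labels map to class names through the\n",
"# split's ClassLabel feature, which can be read straight from the Hugging Face dataset.\n",
"label_names = dataset['train'].features['label'].names\n",
"print(label_names)  # expected to be ['entailment', 'neutral', 'contradiction'] for GLUE/MNLI"
]
},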
{
"cell_type": "code",
"execution_count": 11,
"id": "24ee6732",
"metadata": {},
"outputs": [],
"source": [
"# Lets create the tensors and name them premise and hypothesis\n",
"with ds:\n",
" ds.create_tensor(\"premise\", htype=\"text\")\n",
" ds.create_tensor(\"hypothesis\",htype='text')"
]
},
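{
"cell_type": "code",
"execution_count": null,
"id": "label-tensor-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not part of the run above (which keeps only premise and hypothesis):\n",
"# the MNLI labels could be stored as well via a class-label tensor. The class names are the\n",
"# standard GLUE/MNLI ones and are assumed here; if this tensor is created, labels would also\n",
"# need to be appended in the ingestion loop, e.g. ds.label.append(dataset['train'][i]['label']).\n",
"with ds:\n",
"    ds.create_tensor('label', htype='class_label',\n",
"                     class_names=['entailment', 'neutral', 'contradiction'])"
]
},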
{
"cell_type": "code",
"execution_count": 12,
"id": "03153f9b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 392701/392701 [05:14<00:00, 1246.77it/s]\n"
]
}
],
"source": [
"from tqdm import tqdm # using tqdm gives us a loading bar that helps us keep track of iterations\n",
"# We will iterate through the list and append to Deep Lake dataset. This may take a while.\n",
"with ds:\n",
" for i in tqdm(range(1,len(dataset['train']))):\n",
" # Append the data into respective tensors\n",
" ds.premise.append(dataset['train'][i]['premise'])\n",
" ds.hypothesis.append(dataset['train'][i]['hypothesis'])"
]
},
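{
"cell_type": "code",
"execution_count": null,
"id": "dict-append-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Alternative ingestion sketch, shown for comparison and not executed here (running it after\n",
"# the loop above would append the data a second time). It assumes Deep Lake's dict-style\n",
"# Dataset.append, which writes one row across all named tensors at a time.\n",
"with ds:\n",
"    for row in tqdm(dataset['train']):\n",
"        ds.append({'premise': row['premise'], 'hypothesis': row['hypothesis']})"
]
},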
{
"cell_type": "code",
"execution_count": 13,
"id": "3e3855bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset(path='./mnli_deeplake', tensors=['premise', 'hypothesis'])\n",
"\n",
" tensor htype shape dtype compression\n",
" ------- ------- ------- ------- ------- \n",
" premise text (392701, 1) str None \n",
" hypothesis text (392701, 1) str None \n"
]
}
],
"source": [
"# Print the summary of dataset\n",
"ds.summary()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8da80c88",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"90%\"\n",
" height=\"800\"\n",
" src=\"https://app.activeloop.ai/visualizer/hub?url=http://localhost:58815/6a94edb4-cd0b-45e5-b119-ee3ae552d1df/&link_creds_url=http://localhost:58815/creds/dbb3f162-b867-4e8f-bc1a-49e96c529fe8/\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x148cb542d00>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# This would help us visualize our dataset\n",
"ds.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "18626edf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array(['you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him'],\n",
" dtype='<U1815'),)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inorder to use the data, we will convert tensors into numpy array\n",
"# Lets look at the first text in premise and hypothesis tensors\n",
"ds.premise.numpy()[0],"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b8c8686d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['You lose the things to the following level if the people recall.'],\n",
" dtype='<U393')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.hypothesis.numpy()[0]"
]
}
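,
{
"cell_type": "code",
"execution_count": null,
"id": "pytorch-loader-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Follow-up sketch (assumes PyTorch is installed and relies on Deep Lake's .pytorch() helper):\n",
"# the ingested tensors can be streamed into a training loop as a PyTorch DataLoader. The\n",
"# batch_size and shuffle values below are illustrative, not settings from the original notebook.\n",
"dataloader = ds.pytorch(batch_size=32, shuffle=True)\n",
"first_batch = next(iter(dataloader))\n",
"print(type(first_batch))  # inspect how Deep Lake collates the text tensors before wiring up a model"
]
}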
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}