
Added notebook to ingest the multi_nli dataset #22

Open · wants to merge 1 commit into base: main
293 changes: 293 additions & 0 deletions Multi-NLI/mnli.ipynb
@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "9b221552",
"metadata": {},
"outputs": [],
"source": [
"# This notebook demonstrates creating a deeplake datset from Multi NLI dataset from hugging face"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "30aec245",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset glue (C:/Users/USER/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9286ae02cebb44de805c42ae5a96a140",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# First we will import the dataset from huggingface's website using the load_dataset function\n",
"from datasets import load_dataset\n",
"dataset = load_dataset(\"glue\",\"mnli\")"
]
},
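{
"cell_type": "code",
"execution_count": null,
"id": "inspect-raw-example",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (an illustrative sketch, not an output of the original run): indexing a\n",
"# split of a Hugging Face dataset returns a plain dict of its features, so a single raw MNLI\n",
"# example can be inspected like this.\n",
"example = dataset['train'][0]\n",
"print(example['premise'])\n",
"print(example['hypothesis'])\n",
"print(example['label'])"
]
},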
{
"cell_type": "code",
"execution_count": 8,
"id": "3f37271d",
"metadata": {},
"outputs": [],
"source": [
"# We will then use deep lake to initialize an empty deeplake dataset locally\n",
"import deeplake\n",
"import numpy as np\n",
"ds = deeplake.empty('./mnli_deeplake',overwrite=True) "
]
},
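{
"cell_type": "code",
"execution_count": null,
"id": "reload-dataset-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Reference sketch (assumes the same Deep Lake API used above): once the dataset has been\n",
"# written to './mnli_deeplake', it can be reopened in a later session with deeplake.load.\n",
"# An Activeloop-hosted path of the form 'hub://<org>/<name>' could be used instead of a local folder.\n",
"if deeplake.exists('./mnli_deeplake'):\n",
"    ds_reloaded = deeplake.load('./mnli_deeplake')"
]
},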
{
"cell_type": "code",
"execution_count": 9,
"id": "99ef103a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 392702\n",
" })\n",
" validation_matched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9815\n",
" })\n",
" validation_mismatched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9832\n",
" })\n",
" test_matched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9796\n",
" })\n",
" test_mismatched: Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 9847\n",
" })\n",
"})"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let us observe the mnli dataset.\n",
"# It is comprised of seperate dictionaries for training, testing and validating our model\n",
"# Let us observe the train part\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3e75cd23",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['premise', 'hypothesis', 'label', 'idx'],\n",
" num_rows: 392702\n",
"})"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['train']\n",
"# It contains some attributes like premise, hypothesis, label and idx\n",
"# We will extract premise and hypothesis data so that a model can be trained on those two attributes"
]
},
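{
"cell_type": "code",
"execution_count": null,
"id": "label-names-check",
"metadata": {},
"outputs": [],
"source": [
"# For reference (illustrative sketch): the integer labels map to class names through the\n",
"# split's ClassLabel feature, which can be read straight from the Hugging Face dataset.\n",
"label_names = dataset['train'].features['label'].names\n",
"print(label_names)  # expected to be ['entailment', 'neutral', 'contradiction'] for GLUE/MNLI"
]
},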
{
"cell_type": "code",
"execution_count": 11,
"id": "24ee6732",
"metadata": {},
"outputs": [],
"source": [
"# Lets create the tensors and name them premise and hypothesis\n",
"with ds:\n",
" ds.create_tensor(\"premise\", htype=\"text\")\n",
" ds.create_tensor(\"hypothesis\",htype='text')"
]
},
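{
"cell_type": "code",
"execution_count": null,
"id": "label-tensor-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not part of the run above (which keeps only premise and hypothesis):\n",
"# the MNLI labels could be stored as well via a class-label tensor. The class names are the\n",
"# standard GLUE/MNLI ones and are assumed here; if this tensor is created, labels would also\n",
"# need to be appended in the ingestion loop, e.g. ds.label.append(dataset['train'][i]['label']).\n",
"with ds:\n",
"    ds.create_tensor('label', htype='class_label',\n",
"                     class_names=['entailment', 'neutral', 'contradiction'])"
]
},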
{
"cell_type": "code",
"execution_count": 12,
"id": "03153f9b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 392701/392701 [05:14<00:00, 1246.77it/s]\n"
]
}
],
"source": [
"from tqdm import tqdm # using tqdm gives us a loading bar that helps us keep track of iterations\n",
"# We will iterate through the list and append to Deep Lake dataset. This may take a while.\n",
"with ds:\n",
" for i in tqdm(range(1,len(dataset['train']))):\n",
" # Append the data into respective tensors\n",
" ds.premise.append(dataset['train'][i]['premise'])\n",
" ds.hypothesis.append(dataset['train'][i]['hypothesis'])"
]
},
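{
"cell_type": "code",
"execution_count": null,
"id": "dict-append-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Alternative ingestion sketch, shown for comparison and not executed here (running it after\n",
"# the loop above would append the data a second time). It assumes Deep Lake's dict-style\n",
"# Dataset.append, which writes one row across all named tensors at a time.\n",
"with ds:\n",
"    for row in tqdm(dataset['train']):\n",
"        ds.append({'premise': row['premise'], 'hypothesis': row['hypothesis']})"
]
},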
{
"cell_type": "code",
"execution_count": 13,
"id": "3e3855bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset(path='./mnli_deeplake', tensors=['premise', 'hypothesis'])\n",
"\n",
" tensor htype shape dtype compression\n",
" ------- ------- ------- ------- ------- \n",
" premise text (392701, 1) str None \n",
" hypothesis text (392701, 1) str None \n"
]
}
],
"source": [
"# Print the summary of dataset\n",
"ds.summary()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8da80c88",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"90%\"\n",
" height=\"800\"\n",
" src=\"https://app.activeloop.ai/visualizer/hub?url=http://localhost:58815/6a94edb4-cd0b-45e5-b119-ee3ae552d1df/&link_creds_url=http://localhost:58815/creds/dbb3f162-b867-4e8f-bc1a-49e96c529fe8/\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x148cb542d00>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# This would help us visualize our dataset\n",
"ds.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "18626edf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array(['you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him'],\n",
" dtype='<U1815'),)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inorder to use the data, we will convert tensors into numpy array\n",
"# Lets look at the first text in premise and hypothesis tensors\n",
"ds.premise.numpy()[0],"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b8c8686d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['You lose the things to the following level if the people recall.'],\n",
" dtype='<U393')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.hypothesis.numpy()[0]"
]
}
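,
{
"cell_type": "code",
"execution_count": null,
"id": "pytorch-loader-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Follow-up sketch (assumes PyTorch is installed and relies on Deep Lake's .pytorch() helper):\n",
"# the ingested tensors can be streamed into a training loop as a PyTorch DataLoader. The\n",
"# batch_size and shuffle values below are illustrative, not settings from the original notebook.\n",
"dataloader = ds.pytorch(batch_size=32, shuffle=True)\n",
"first_batch = next(iter(dataloader))\n",
"print(type(first_batch))  # inspect how Deep Lake collates the text tensors before wiring up a model"
]
}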
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}