Adds KMNIST dataset example

x249wang · x249wang · commit d25274cc0d08 · 2022-04-05T23:03:03.000-04:00
diff --git a/kmnist/upload_kmnist.ipynb b/kmnist/upload_kmnist.ipynb
@@ -0,0 +1,272 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b59304be",
+   "metadata": {},
+   "source": [
+    "# Installing Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77dcbaf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip3 install hub numpy pandas --quiet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5ac6c37",
+   "metadata": {},
+   "source": [
+    "# Loading Packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97dec19b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hub\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40100d39",
+   "metadata": {},
+   "source": [
+    "# Downloading Raw Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f50cde54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "source_url = \"http://codh.rois.ac.jp/kmnist/dataset/kmnist/\"\n",
+    "\n",
+    "train_images_filepath = \"kmnist-train-imgs.npz\"\n",
+    "train_labels_filepath = \"kmnist-train-labels.npz\"\n",
+    "\n",
+    "test_images_filepath = \"kmnist-test-imgs.npz\"\n",
+    "test_labels_filepath = \"kmnist-test-labels.npz\"\n",
+    "\n",
+    "class_map_filepath = \"kmnist_classmap.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5bc309d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!curl -O {source_url}/{train_images_filepath} # Can also use `wget` if available\n",
+    "!curl -O {source_url}/{train_labels_filepath}\n",
+    "\n",
+    "!curl -O {source_url}/{test_images_filepath}\n",
+    "!curl -O {source_url}/{test_labels_filepath}\n",
+    "\n",
+    "!curl -O {source_url}/{class_map_filepath}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d57879f2",
+   "metadata": {},
+   "source": [
+    "# Loading Class Labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a7dce68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class_map_table = pd.read_csv(\n",
+    "    class_map_filepath, \n",
+    "    encoding='utf-8', \n",
+    "    index_col=0\n",
+    ")\n",
+    "\n",
+    "class_names = class_map_table.codepoint.tolist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5257846",
+   "metadata": {},
+   "source": [
+    "# Creating Dataset and Uploading to `hub`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7485fcd7",
+   "metadata": {},
+   "source": [
+    "## Login\n",
+    "\n",
+    "This is needed if using Activeloop storage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "641c1fc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "username = \"<USERNAME>\"\n",
+    "password = \"<PASSWORD>\"\n",
+    "\n",
+    "!activeloop login -u '{username}' -p '{password}'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f1f6482",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workspace_path = f\"hub://{username}\" # Or `\".\"` if local"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e136b1e3",
+   "metadata": {},
+   "source": [
+    "## Train Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce50ee1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_name = \"kmnist-train\"\n",
+    "dataset_path = f\"{workspace_path}/{dataset_name}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "919f3198",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = hub.empty(dataset_path, overwrite=True) # Set `overwrite=True` to overwrite any existing data under the same path\n",
+    "\n",
+    "with ds:\n",
+    "    ds.create_tensor('images', htype = 'image', sample_compression = \"jpg\")\n",
+    "    ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n",
+    "\n",
+    "    ds.info.update(\n",
+    "        description = \"Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).\", \n",
+    "        citation=\"@online{clanuwat2018deep,  author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha},  title={Deep Learning for Classical Japanese Literature},  date={2018-12-03},  year={2018},  eprintclass={cs.CV},  eprinttype={arXiv},  eprint={cs.CV/1812.01718}}\"\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "with ds:\n",
+    "    for image, label in zip(np.load(train_images_filepath)['arr_0'], np.load(train_labels_filepath)['arr_0']):\n",
+    "        ds.append({'images': image, 'labels': np.uint32(label)})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "683fa2bd",
+   "metadata": {},
+   "source": [
+    "# Test Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d0bafdc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_name = \"kmnist-test\"\n",
+    "dataset_path = f\"{workspace_path}/{dataset_name}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2eac537",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = hub.empty(dataset_path, overwrite=True)\n",
+    "\n",
+    "with ds:\n",
+    "    ds.create_tensor('images', htype = 'image', sample_compression = \"jpg\")\n",
+    "    ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n",
+    "\n",
+    "    ds.info.update(\n",
+    "        description = \"Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).\", \n",
+    "        citation=\"@online{clanuwat2018deep,  author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha},  title={Deep Learning for Classical Japanese Literature},  date={2018-12-03},  year={2018},  eprintclass={cs.CV},  eprinttype={arXiv},  eprint={cs.CV/1812.01718}}\"\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "with ds:\n",
+    "    for image, label in zip(np.load(test_images_filepath)['arr_0'], np.load(test_labels_filepath)['arr_0']):\n",
+    "        ds.append({'images': image, 'labels': np.uint32(label)})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "82339916",
+   "metadata": {},
+   "source": [
+    "Dataset documentation: https://docs.activeloop.ai/datasets/kmnist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d4013aa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}