Skip to content

Commit d25274c

Browse files
committed
Adds KMNIST dataset example
1 parent 0ac544f commit d25274c

File tree

1 file changed

+272
-0
lines changed

1 file changed

+272
-0
lines changed

kmnist/upload_kmnist.ipynb

+272
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "b59304be",
6+
"metadata": {},
7+
"source": [
8+
"# Installing Dependencies"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": null,
14+
"id": "77dcbaf5",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"!pip3 install hub numpy pandas --quiet"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"id": "b5ac6c37",
24+
"metadata": {},
25+
"source": [
26+
"# Loading Packages"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": null,
32+
"id": "97dec19b",
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"import hub\n",
37+
"import numpy as np\n",
38+
"import pandas as pd"
39+
]
40+
},
41+
{
42+
"cell_type": "markdown",
43+
"id": "40100d39",
44+
"metadata": {},
45+
"source": [
46+
"# Downloading Raw Data"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": null,
52+
"id": "f50cde54",
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"source_url = \"http://codh.rois.ac.jp/kmnist/dataset/kmnist/\"\n",
57+
"\n",
58+
"train_images_filepath = \"kmnist-train-imgs.npz\"\n",
59+
"train_labels_filepath = \"kmnist-train-labels.npz\"\n",
60+
"\n",
61+
"test_images_filepath = \"kmnist-test-imgs.npz\"\n",
62+
"test_labels_filepath = \"kmnist-test-labels.npz\"\n",
63+
"\n",
64+
"class_map_filepath = \"kmnist_classmap.csv\""
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"id": "e5bc309d",
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"!curl -O {source_url}/{train_images_filepath} # Can also use `wget` if available\n",
75+
"!curl -O {source_url}/{train_labels_filepath}\n",
76+
"\n",
77+
"!curl -O {source_url}/{test_images_filepath}\n",
78+
"!curl -O {source_url}/{test_labels_filepath}\n",
79+
"\n",
80+
"!curl -O {source_url}/{class_map_filepath}"
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"id": "d57879f2",
86+
"metadata": {},
87+
"source": [
88+
"# Loading Class Labels"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": null,
94+
"id": "4a7dce68",
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"class_map_table = pd.read_csv(\n",
99+
" class_map_filepath, \n",
100+
" encoding='utf-8', \n",
101+
" index_col=0\n",
102+
")\n",
103+
"\n",
104+
"class_names = class_map_table.codepoint.tolist()"
105+
]
106+
},
107+
{
108+
"cell_type": "markdown",
109+
"id": "a5257846",
110+
"metadata": {},
111+
"source": [
112+
"# Creating Dataset and Uploading to `hub`"
113+
]
114+
},
115+
{
116+
"cell_type": "markdown",
117+
"id": "7485fcd7",
118+
"metadata": {},
119+
"source": [
120+
"## Login\n",
121+
"\n",
122+
"This step is only needed when uploading to Activeloop storage (`hub://` paths); skip it when writing the dataset to a local path."
123+
]
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": null,
128+
"id": "641c1fc7",
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"username = \"<USERNAME>\"\n",
133+
"password = \"<PASSWORD>\"\n",
134+
"\n",
135+
"!activeloop login -u '{username}' -p '{password}'"
136+
]
137+
},
138+
{
139+
"cell_type": "code",
140+
"execution_count": null,
141+
"id": "9f1f6482",
142+
"metadata": {},
143+
"outputs": [],
144+
"source": [
145+
"workspace_path = f\"hub://{username}\" # Or `\".\"` if local"
146+
]
147+
},
148+
{
149+
"cell_type": "markdown",
150+
"id": "e136b1e3",
151+
"metadata": {},
152+
"source": [
153+
"## Train Set"
154+
]
155+
},
156+
{
157+
"cell_type": "code",
158+
"execution_count": null,
159+
"id": "ce50ee1f",
160+
"metadata": {},
161+
"outputs": [],
162+
"source": [
163+
"dataset_name = \"kmnist-train\"\n",
164+
"dataset_path = f\"{workspace_path}/{dataset_name}\""
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": null,
170+
"id": "919f3198",
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"ds = hub.empty(dataset_path, overwrite=True) # `overwrite=True` replaces any existing dataset at this path\n",
175+
"\n",
176+
"with ds:\n",
177+
" ds.create_tensor('images', htype = 'image', sample_compression = \"jpg\")\n",
178+
" ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n",
179+
"\n",
180+
" ds.info.update(\n",
181+
" description = \"Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).\", \n",
182+
" citation=\"@online{clanuwat2018deep, author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha}, title={Deep Learning for Classical Japanese Literature}, date={2018-12-03}, year={2018}, eprintclass={cs.CV}, eprinttype={arXiv}, eprint={cs.CV/1812.01718}}\"\n",
183+
" )\n",
184+
"\n",
185+
"\n",
186+
"with ds:\n",
187+
" for image, label in zip(np.load(train_images_filepath)['arr_0'], np.load(train_labels_filepath)['arr_0']):\n",
188+
" ds.append({'images': image, 'labels': np.uint32(label)})"
189+
]
190+
},
191+
{
192+
"cell_type": "markdown",
193+
"id": "683fa2bd",
194+
"metadata": {},
195+
"source": [
196+
"## Test Set"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": null,
202+
"id": "6d0bafdc",
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
"dataset_name = \"kmnist-test\"\n",
207+
"dataset_path = f\"{workspace_path}/{dataset_name}\""
208+
]
209+
},
210+
{
211+
"cell_type": "code",
212+
"execution_count": null,
213+
"id": "f2eac537",
214+
"metadata": {},
215+
"outputs": [],
216+
"source": [
217+
"ds = hub.empty(dataset_path, overwrite=True)\n",
218+
"\n",
219+
"with ds:\n",
220+
" ds.create_tensor('images', htype = 'image', sample_compression = \"jpg\")\n",
221+
" ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n",
222+
"\n",
223+
" ds.info.update(\n",
224+
" description = \"Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset. It contains 70,000 28x28 grayscale images spanning 10 classes (one from each column of hiragana), and is perfectly balanced like the original MNIST dataset (6k/1k train/test for each class).\", \n",
225+
" citation=\"@online{clanuwat2018deep, author={Tarin Clanuwat and Mikel Bober-Irizar and Asanobu Kitamoto and Alex Lamb and Kazuaki Yamamoto and David Ha}, title={Deep Learning for Classical Japanese Literature}, date={2018-12-03}, year={2018}, eprintclass={cs.CV}, eprinttype={arXiv}, eprint={cs.CV/1812.01718}}\"\n",
226+
" )\n",
227+
"\n",
228+
"\n",
229+
"with ds:\n",
230+
" for image, label in zip(np.load(test_images_filepath)['arr_0'], np.load(test_labels_filepath)['arr_0']):\n",
231+
" ds.append({'images': image, 'labels': np.uint32(label)})"
232+
]
233+
},
234+
{
235+
"cell_type": "markdown",
236+
"id": "82339916",
237+
"metadata": {},
238+
"source": [
239+
"Dataset documentation: https://docs.activeloop.ai/datasets/kmnist"
240+
]
241+
},
242+
{
243+
"cell_type": "code",
244+
"execution_count": null,
245+
"id": "0d4013aa",
246+
"metadata": {},
247+
"outputs": [],
248+
"source": []
249+
}
250+
],
251+
"metadata": {
252+
"kernelspec": {
253+
"display_name": "Python 3 (ipykernel)",
254+
"language": "python",
255+
"name": "python3"
256+
},
257+
"language_info": {
258+
"codemirror_mode": {
259+
"name": "ipython",
260+
"version": 3
261+
},
262+
"file_extension": ".py",
263+
"mimetype": "text/x-python",
264+
"name": "python",
265+
"nbconvert_exporter": "python",
266+
"pygments_lexer": "ipython3",
267+
"version": "3.9.5"
268+
}
269+
},
270+
"nbformat": 4,
271+
"nbformat_minor": 5
272+
}

0 commit comments

Comments
 (0)