Skip to content

Commit 6fc4ef9

Browse files
committed
LibriSpeech audio dataset added
1 parent 9ac5e79 commit 6fc4ef9

File tree

1 file changed

+316
-0
lines changed

1 file changed

+316
-0
lines changed

Libri Speech/LibriSpeech.ipynb

+316
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "6216e113",
6+
"metadata": {},
7+
"source": [
8+
"# Installing Deeplake\n"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": null,
14+
"id": "95dc340a",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"pip install deeplake"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"id": "e676adb4",
24+
"metadata": {},
25+
"source": [
26+
"# Importing Deeplake and the os library"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": null,
32+
"id": "ad5eb08b",
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"import deeplake\n",
37+
"import os"
38+
]
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"id": "ae1b872c",
43+
"metadata": {},
44+
"source": [
45+
"# Uploading Dataset to S3 "
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": null,
51+
"id": "8fc3d1ad",
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"ds = deeplake.empty('s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360',\n",
56+
" creds={\n",
57+
" 'aws_access_key_id': 'YOUR ACCESS KEY',\n",
58+
" 'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'\n",
59+
" })"
60+
]
61+
},
62+
{
63+
"cell_type": "code",
64+
"execution_count": null,
65+
"id": "20a05e54",
66+
"metadata": {},
67+
"outputs": [],
68+
"source": [
69+
"with ds:\n",
70+
" ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
71+
" ds.create_tensor(\"transcripts\",htype=\"text\")"
72+
]
73+
},
74+
{
75+
"cell_type": "code",
76+
"execution_count": null,
77+
"id": "1eb6aff4",
78+
"metadata": {},
79+
"outputs": [],
80+
"source": [
81+
"with ds:\n",
82+
" for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
83+
" for filename in filenames:\n",
84+
" fname = os.path.join(dirpath,filename)\n",
85+
" if fname.endswith('.txt'):\n",
86+
" with open(fname, \"r\") as f:\n",
87+
" for line in f:\n",
88+
" line = line.split(maxsplit=1)\n",
89+
" audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
90+
" print(audio_name)\n",
91+
" transcript_text = line[1]\n",
92+
" ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
93+
]
94+
},
95+
{
96+
"cell_type": "markdown",
97+
"id": "927c30bb",
98+
"metadata": {},
99+
"source": [
100+
"# Uploading Dataset to Deeplake"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": null,
106+
"id": "a79f0238",
107+
"metadata": {},
108+
"outputs": [],
109+
"source": [
110+
"ds = deeplake.empty('hub://hussain0520/LibriSpeech-train-clean-360',\n",
111+
" token=\"YOUR TOKEN\")"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": null,
117+
"id": "17b03c98",
118+
"metadata": {},
119+
"outputs": [],
120+
"source": [
121+
"with ds:\n",
122+
" ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
123+
" ds.create_tensor(\"transcripts\",htype=\"text\")"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": null,
129+
"id": "0133d49c",
130+
"metadata": {},
131+
"outputs": [],
132+
"source": [
133+
"with ds:\n",
134+
" for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
135+
" for filename in filenames:\n",
136+
" fname = os.path.join(dirpath,filename)\n",
137+
" if fname.endswith('.txt'):\n",
138+
" with open(fname, \"r\") as f:\n",
139+
" for line in f:\n",
140+
" line = line.split(maxsplit=1)\n",
141+
" audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
142+
" print(audio_name)\n",
143+
" transcript_text = line[1]\n",
144+
" ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
145+
]
146+
},
147+
{
148+
"cell_type": "markdown",
149+
"id": "dabc62fa",
150+
"metadata": {},
151+
"source": [
152+
"# Uploading Dataset to Activeloop Workspace"
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": null,
158+
"id": "0d5a5f6c",
159+
"metadata": {},
160+
"outputs": [],
161+
"source": [
162+
"ds = deeplake.empty('hub://activeloop/LibriSpeech-train-clean-360',\n",
163+
" token=\"YOUR TOKEN\")"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": null,
169+
"id": "ead516bf",
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"with ds:\n",
174+
" ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
175+
" ds.create_tensor(\"transcripts\",htype=\"text\")"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": null,
181+
"id": "f09090d4",
182+
"metadata": {},
183+
"outputs": [],
184+
"source": [
185+
"with ds:\n",
186+
" for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
187+
" for filename in filenames:\n",
188+
" fname = os.path.join(dirpath,filename)\n",
189+
" if fname.endswith('.txt'):\n",
190+
" with open(fname, \"r\") as f:\n",
191+
" for line in f:\n",
192+
" line = line.split(maxsplit=1)\n",
193+
" audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
194+
" print(audio_name)\n",
195+
" transcript_text = line[1]\n",
196+
" ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
197+
]
198+
},
199+
{
200+
"cell_type": "markdown",
201+
"id": "a8f529db",
202+
"metadata": {},
203+
"source": [
204+
"# Deepcopying Dataset from S3 to Deeplake storage"
205+
]
206+
},
207+
{
208+
"cell_type": "code",
209+
"execution_count": null,
210+
"id": "aca1c419",
211+
"metadata": {},
212+
"outputs": [],
213+
"source": [
214+
"deeplake.deepcopy(\"s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360\",\n",
215+
" \"hub://hussain0520/LibriSpeech-train-clean-360\", \n",
216+
" src_creds={'aws_access_key_id': 'YOUR ACCESS KEY',\n",
217+
" 'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'}, dest_token=\"YOUR DESTINATION TOKEN\")"
218+
]
219+
},
220+
{
221+
"cell_type": "markdown",
222+
"id": "794d0644",
223+
"metadata": {},
224+
"source": [
225+
"# Deepcopying Dataset from Deeplake storage to Activeloop Workspace"
226+
]
227+
},
228+
{
229+
"cell_type": "code",
230+
"execution_count": null,
231+
"id": "22380f7b",
232+
"metadata": {},
233+
"outputs": [],
234+
"source": [
235+
"deeplake.deepcopy(\"hub://hussain0520/LibriSpeech-train-clean-360\",\"hub://activeloop/LibriSpeech-train-clean-360\", \n",
236+
" dest_token=\"YOUR TOKEN\")"
237+
]
238+
},
239+
{
240+
"cell_type": "markdown",
241+
"id": "d1e213bb",
242+
"metadata": {},
243+
"source": [
244+
"# Testing and Loading Dataset from S3 storage"
245+
]
246+
},
247+
{
248+
"cell_type": "code",
249+
"execution_count": null,
250+
"id": "85097bd6",
251+
"metadata": {},
252+
"outputs": [],
253+
"source": [
254+
"deeplake.load(\"s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360\",creds={\n",
255+
" 'aws_access_key_id': 'YOUR ACCESS KEY',\n",
256+
" 'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'\n",
257+
" })"
258+
]
259+
},
260+
{
261+
"cell_type": "markdown",
262+
"id": "c729dfe4",
263+
"metadata": {},
264+
"source": [
265+
"# Testing and Loading Dataset from Activeloop Workspace"
266+
]
267+
},
268+
{
269+
"cell_type": "code",
270+
"execution_count": null,
271+
"id": "7b6eab92",
272+
"metadata": {},
273+
"outputs": [],
274+
"source": [
275+
"ds = deeplake.load('hub://activeloop/LibriSpeech-train-clean-360')"
276+
]
277+
},
278+
{
279+
"cell_type": "markdown",
280+
"id": "1d6d7117",
281+
"metadata": {},
282+
"source": [
283+
"# Similarly uploaded the remaining 5 datasets, i.e. LibriSpeech-dev-clean, LibriSpeech-dev-other, LibriSpeech-test-clean, LibriSpeech-test-other and LibriSpeech-train-clean-100"
284+
]
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": null,
289+
"id": "17348d87",
290+
"metadata": {},
291+
"outputs": [],
292+
"source": []
293+
}
294+
],
295+
"metadata": {
296+
"kernelspec": {
297+
"display_name": "Python 3 (ipykernel)",
298+
"language": "python",
299+
"name": "python3"
300+
},
301+
"language_info": {
302+
"codemirror_mode": {
303+
"name": "ipython",
304+
"version": 3
305+
},
306+
"file_extension": ".py",
307+
"mimetype": "text/x-python",
308+
"name": "python",
309+
"nbconvert_exporter": "python",
310+
"pygments_lexer": "ipython3",
311+
"version": "3.9.12"
312+
}
313+
},
314+
"nbformat": 4,
315+
"nbformat_minor": 5
316+
}

0 commit comments

Comments
 (0)