Libri Speech audio dataset added

Hussain0520 · Hussain0520 · commit 6fc4ef9d6c2d · 2022-12-02T20:45:50.000+05:30
diff --git a/Libri Speech/LibriSpeech.ipynb b/Libri Speech/LibriSpeech.ipynb
@@ -0,0 +1,316 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6216e113",
+   "metadata": {},
+   "source": [
+    "# Installing Deeplake\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95dc340a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pip install deeplake"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e676adb4",
+   "metadata": {},
+   "source": [
+    "# Importing Deeplake and OS library"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad5eb08b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import deeplake\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae1b872c",
+   "metadata": {},
+   "source": [
+    "# Uploading Dataset to S3 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8fc3d1ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = deeplake.empty('s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360',\n",
+    "    creds={\n",
+    "      'aws_access_key_id': 'YOUR ACCESS KEY',\n",
+    "      'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'\n",
+    "    })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20a05e54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
+    "    ds.create_tensor(\"transcripts\",htype=\"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1eb6aff4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
+    "        for filename in filenames:\n",
+    "            fname = os.path.join(dirpath,filename)\n",
+    "            if fname.endswith('.txt'):\n",
+    "                with open(fname, \"r\") as f:\n",
+    "                    for line in f:\n",
+    "                        line = line.split(maxsplit=1)\n",
+    "                        audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
+    "                        print(audio_name)\n",
+    "                        transcript_text = line[1]\n",
+    "                        ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "927c30bb",
+   "metadata": {},
+   "source": [
+    "# Uploading Dataset to Deeplake"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a79f0238",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = deeplake.empty('hub://hussain0520/LibriSpeech-train-clean-360',\n",
+    "                    token=\"YOUR TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17b03c98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
+    "    ds.create_tensor(\"transcripts\",htype=\"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0133d49c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
+    "        for filename in filenames:\n",
+    "            fname = os.path.join(dirpath,filename)\n",
+    "            if fname.endswith('.txt'):\n",
+    "                with open(fname, \"r\") as f:\n",
+    "                    for line in f:\n",
+    "                        line = line.split(maxsplit=1)\n",
+    "                        audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
+    "                        print(audio_name)\n",
+    "                        transcript_text = line[1]\n",
+    "                        ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dabc62fa",
+   "metadata": {},
+   "source": [
+    "# Uploading Dataset to Activeloop Workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d5a5f6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = deeplake.empty('hub://activeloop/LibriSpeech-train-clean-360',\n",
+    "                    token=\"YOUR TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ead516bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    ds.create_tensor(\"audios\", htype=\"audio\", sample_compression=\"flac\")\n",
+    "    ds.create_tensor(\"transcripts\",htype=\"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f09090d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with ds:\n",
+    "    for dirpath, dirnames, filenames in os.walk('./train-clean-360'):\n",
+    "        for filename in filenames:\n",
+    "            fname = os.path.join(dirpath,filename)\n",
+    "            if fname.endswith('.txt'):\n",
+    "                with open(fname, \"r\") as f:\n",
+    "                    for line in f:\n",
+    "                        line = line.split(maxsplit=1)\n",
+    "                        audio_name = os.path.join(dirpath,line[0] + '.flac')# audio file\n",
+    "                        print(audio_name)\n",
+    "                        transcript_text = line[1]\n",
+    "                        ds.append({'audios': deeplake.read(audio_name), 'transcripts': transcript_text})\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8f529db",
+   "metadata": {},
+   "source": [
+    "# Deepcopying Dataset from S3 to Deeplake storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aca1c419",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "deeplake.deepcopy(\"s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360\",\n",
+    "                  \"hub://hussain0520/LibriSpeech-train-clean-360\", \n",
+    "                  src_creds={'aws_access_key_id': 'YOUR ACCESS KEY',\n",
+    "                          'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'},          dest_token=\"YOUR DESTINATION TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "794d0644",
+   "metadata": {},
+   "source": [
+    "# Deepcopying Dataset from Deeplake storage to Activeloop Workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22380f7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "deeplake.deepcopy(\"hub://hussain0520/LibriSpeech-train-clean-360\",\"hub://activeloop/LibriSpeech-train-clean-360\", \n",
+    "                    dest_token=\"YOUR TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d1e213bb",
+   "metadata": {},
+   "source": [
+    "# Testing and Loading Dataset from S3 storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85097bd6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "deeplake.load(\"s3://activeloop-sandbox-datasets/Saherwala/LibriSpeech-train-clean-360\",creds={\n",
+    "      'aws_access_key_id': 'YOUR ACCESS KEY',\n",
+    "      'aws_secret_access_key': 'YOUR SECRET ACCESS KEY'\n",
+    "    })"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c729dfe4",
+   "metadata": {},
+   "source": [
+    "# Testing and Loading Dataset from Activeloop Workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b6eab92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = deeplake.load('hub://activeloop/LibriSpeech-train-clean-360')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1d6d7117",
+   "metadata": {},
+   "source": [
+    "# Similarly uploaded the rest 5 datasets i.e LibriSpeech-dev-clean, LibriSpeech-dev-other, LibriSpeech-test-clean, LibriSpeech-test-other and LibriSpeech-train-clean-100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17348d87",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}