forked from oneapi-src/oneAPI-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_dataset.py
35 lines (27 loc) · 1.29 KB
/
get_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import shutil
import argparse
from datasets import load_dataset
from tqdm import tqdm
# Languages to fetch. Each key is the language name used as a folder name in
# the output tree; each value is the configuration name handed to
# ``load_dataset`` (and also used as a sub-folder name).
language_to_code = {
    "japanese": "ja",
    "swedish": "sv-SE",
}
def download_dataset(output_dir):
    """Copy the training audio clips of every configured language to disk.

    For each entry in ``language_to_code`` the ``train`` split of
    ``mozilla-foundation/common_voice_11_0`` is loaded, and the file referenced
    by every sample's ``sample['audio']['path']`` is copied into
    ``<output_dir>/<language>/<language code>/clips``.

    Args:
        output_dir: Base directory under which the per-language folders are
            created (missing directories are created as needed).
    """
    for language, code in language_to_code.items():
        print(f"Processing dataset for language: {code}")

        # Fetch the training split for this language configuration.
        clips = load_dataset(
            "mozilla-foundation/common_voice_11_0",
            code,
            split="train",
            trust_remote_code=True,
        )

        # Destination layout: <output_dir>/<language>/<code>/clips
        destination = os.path.join(output_dir, language, code, "clips")
        os.makedirs(destination, exist_ok=True)

        # Walk the split with a progress bar, copying each referenced file.
        progress = tqdm(clips, desc=f"Extracting and copying MP3 files for {language}")
        for record in progress:
            shutil.copy(record['audio']['path'], destination)

        print("Extraction and copy complete.")
if __name__ == "__main__":
    # Script entry point: read the base output directory from the command
    # line (falling back to the default) and run the download/copy step.
    cli = argparse.ArgumentParser(
        description="Extract and copy audio files from a dataset to a specified directory."
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="/data/commonVoice",
        help="Base output directory for saving the files. Default is /data/commonVoice",
    )
    download_dataset(cli.parse_args().output_dir)