librispeechmix_prepare.py
"""LibriSpeechMix data preparation.
Authors
* Luca Della Libera 2023
"""

import copy
import json
import logging
import os
from collections import defaultdict
from typing import List, Optional, Sequence, Union


__all__ = ["prepare_librispeechmix"]

# Logging configuration
logging.basicConfig(
    level=logging.WARNING,
    # format="%(asctime)s [%(levelname)s] %(funcName)s - %(message)s",
)
_LOGGER = logging.getLogger(__name__)

_DEFAULT_SPLITS = (
    "dev-clean-1mix",
    "dev-clean-2mix",
    "dev-clean-3mix",
    "test-clean-1mix",
    "test-clean-2mix",
    "test-clean-3mix",
)


def prepare_librispeechmix(
    data_folder: "str",
    save_folder: "Optional[str]" = None,
    splits: "Sequence[str]" = _DEFAULT_SPLITS,
    num_targets: "Optional[Union[int, List[int], str]]" = None,
    num_enrolls: "Optional[int]" = None,
    trim_nontarget: "Optional[float]" = None,
    suppress_delay: "Optional[bool]" = None,
    overlap_ratio: "Optional[float]" = None,
) -> "None":
"""Prepare data manifest JSON files for LibriSpeechMix dataset
(see https://github.com/NaoyukiKanda/LibriSpeechMix).
Arguments
---------
data_folder:
The path to the dataset folder (i.e. a folder containing the standard
LibriSpeech folders + the LibriSpeechMix JSONL annotation files).
save_folder:
The path to the folder where the data manifest JSON files will be stored.
Default to `data_folder`.
splits:
The dataset splits to load.
Splits with the same prefix are merged into a single JSON file
(e.g. "dev-clean-1mix" and "dev-clean-2mix").
Default to all the available splits.
num_targets:
The maximum number of target utterances to extract from each mixture.
If a list, only the utterances at the given indices are extracted.
If `min`, use only the shortest utterance as a target.
If `max`, use only the shortest utterance as a target.
Default to all the available utterances (`n` for the `n`-mix case).
num_enrolls:
The maximum number of enrollment utterances per target speaker.
Default to all the available enrollment utterances.
trim_nontarget:
The maximum number of seconds before and after the target utterance.
Set to 0 to trim the mixture at the edges of the target utterance.
Default to infinity.
suppress_delay:
True to set all delays to 0 (i.e. maximize the overlap).
Must be None if `overlap_ratio` is set.
Default to False.
overlap_ratio:
The overlap ratio with respect to the target utterance.
The target utterance delay is always set to 0, which implies
that a new mixture is created for each target utterance.
Must be None if `suppress_delay` is set.
Default the values specifies in the annotation file for each mixture.
Raises
------
ValueError
If an invalid argument value is given.
RuntimeError
If the data folder's structure does not match the expected one.
Examples
--------
>>> prepare_librispeechmix(
... "LibriSpeechMix",
... splits=["dev-clean-2mix", "test-clean-2mix"],
... )
"""
    if not save_folder:
        save_folder = data_folder
    if not splits:
        raise ValueError(f"`splits` ({splits}) must be non-empty")
    if suppress_delay is not None and overlap_ratio is not None:
        raise ValueError(
            "`suppress_delay` and `overlap_ratio` cannot both be set"
        )
    if overlap_ratio is not None:
        if overlap_ratio < 0.0 or overlap_ratio > 1.0:
            raise ValueError(
                f"`overlap_ratio` ({overlap_ratio}) must be in the interval [0, 1]"
            )

    # Grouping
    groups = defaultdict(list)
    for split in splits:
        if split.startswith("train"):
            groups["train"].append(split)
        elif split.startswith("dev"):
            groups["dev"].append(split)
        elif split.startswith("test"):
            groups["test"].append(split)
        else:
            raise ValueError(
                f'`split` ({split}) must start with either "train", "dev" or "test"'
            )

    # Write output JSON for each group
    for group_name, group in groups.items():
        _LOGGER.info(
            "----------------------------------------------------------------------",
        )
        output_entries = {}
        for split in group:
            _LOGGER.info(f"Split: {split}")

            # Read input JSONL
            input_jsonl = os.path.join(data_folder, f"{split}.jsonl")
            if not os.path.exists(input_jsonl):
                raise RuntimeError(f'"{input_jsonl}" not found')
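            # Each line of the JSONL file describes one mixture: the component wavs,
            # their delays and durations, the transcriptions, the speakers/genders,
            # and a speaker profile with the available enrollment utterances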
            with open(input_jsonl, "r", encoding="utf-8") as fr:
                for input_line in fr:
                    input_entry = json.loads(input_line)
                    ID = input_entry["id"]
                    wavs = input_entry["wavs"]
                    durations = copy.deepcopy(input_entry["durations"])
                    speaker_profile = input_entry["speaker_profile"]
                    texts = input_entry["texts"]
                    speaker_profile_index = input_entry["speaker_profile_index"]
                    speakers = input_entry["speakers"]
                    genders = input_entry["genders"]
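                    # Choose which utterances in the mixture act as targets: an int
                    # keeps the first `num_targets` utterances, a list keeps the given
                    # indices, "min"/"max" keep only the shortest/longest utterance,
                    # and None keeps them all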
                    if isinstance(num_targets, (int, float)):
                        target_speaker_idxes = list(range(int(num_targets)))
                    elif isinstance(num_targets, list):
                        target_speaker_idxes = num_targets
                    elif num_targets == "min":
                        min_duration = min(durations)
                        min_idx = durations.index(min_duration)
                        target_speaker_idxes = [min_idx]
                    elif num_targets == "max":
                        max_duration = max(durations)
                        max_idx = durations.index(max_duration)
                        target_speaker_idxes = [max_idx]
                    elif num_targets is None:
                        target_speaker_idxes = list(range(len(texts)))
                    else:
                        raise NotImplementedError
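                    # Prefix each wav path with the "{DATA_ROOT}" placeholder so the
                    # manifest stays portable (the placeholder is expected to be
                    # replaced with the actual dataset root when the manifest is loaded)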
                    wavs = [os.path.join("{DATA_ROOT}", wav) for wav in wavs]
                    for target_speaker_idx in target_speaker_idxes:
                        text = texts[target_speaker_idx]
                        idx = speaker_profile_index[target_speaker_idx]
                        ID_text = f"{ID}_text-{target_speaker_idx}"

                        # Read here to not overwrite
                        delays = copy.deepcopy(input_entry["delays"])
                        if suppress_delay:
                            delays = [0.0 for _ in delays]
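                        # With a fixed overlap ratio, the target utterance starts at 0
                        # and every other utterance starts after (1 - overlap_ratio) of
                        # the target duration, i.e. it overlaps the target by `overlap_ratio`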
                        if overlap_ratio is not None:
                            target_duration = durations[target_speaker_idx]
                            overlap_start = (1 - overlap_ratio) * target_duration
                            delays = [overlap_start] * len(wavs)
                            delays[target_speaker_idx] = 0
                        start = 0.0
                        duration = max([d + x for d, x in zip(delays, durations)])
                        max_duration = copy.deepcopy(duration)
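                        # Keep at most `trim_nontarget` seconds of audio before and
                        # after the target utterance within the mixture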
                        if trim_nontarget is not None:
                            start = delays[target_speaker_idx]
                            duration = durations[target_speaker_idx]
                            new_start = max(0.0, start - trim_nontarget)
                            duration += start - new_start
                            duration = min(
                                duration + trim_nontarget, max_duration - new_start
                            )
                            start = new_start
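                        # Emit one manifest entry per (mixture, target, enrollment) combination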
                        enroll_wavs = speaker_profile[idx]
                        for enroll_wav in enroll_wavs[:num_enrolls]:
                            ID_enroll = f"{ID_text}_{enroll_wav}"
                            enroll_wav = os.path.join("{DATA_ROOT}", enroll_wav)
                            output_entry = {
                                "wavs": wavs,
                                "enroll_wav": enroll_wav,
                                "delays": delays,
                                "start": start,
                                "duration": duration,
                                "durations": durations,
                                "target_speaker_idx": target_speaker_idx,
                                "wrd": text,
                                "speakers": speakers,
                                "genders": genders,
                            }
                            output_entries[ID_enroll] = output_entry

        # Write output JSON
        output_json = os.path.join(save_folder, f"{group_name}.json")
        _LOGGER.info(f"Writing {output_json}...")
        with open(output_json, "w", encoding="utf-8") as fw:
            json.dump(output_entries, fw, ensure_ascii=False, indent=4)
    _LOGGER.info(
        "----------------------------------------------------------------------",
    )
    _LOGGER.info("Done!")