-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphonemes.py
149 lines (120 loc) · 4.86 KB
/
phonemes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pathlib
from typing import List
import argparse
import os
import warnings
import librosa
import soundfile as sf
from tqdm import tqdm
import pyfoal
import numpy as np
from src.utils.tsv import TSVEntry, read_tsv
from src.utils.mp_util import round_robin_map
from src.utils.sound_util import seconds_to_index
warnings.filterwarnings('ignore', 'PySoundFile failed. Trying audioread instead.')
class Phoneme:
def __init__(self, name: str, start: float, stop: float, sample_rate: int, file_length: int):
self.name = name
self.start = start
self.stop = stop
self.sample_rate = sample_rate
self.file_length = file_length
@property
def start_index(self) -> int:
i = seconds_to_index(self.start, self.sample_rate)
if i >= self.file_length:
i -= 1
return i
@property
def stop_index(self) -> int:
i = seconds_to_index(self.stop, self.sample_rate)
if i >= self.file_length:
i -= 1
return i
class RecordingSample:
def __init__(self, e: TSVEntry, loc: pathlib.Path):
self.entry = e
self.directory = loc
self.base_filename = e['path'].split('.')[0]
self.location = loc / e['path']
self.output_location = loc / (self.base_filename + '.wav')
self.alignment_location = loc / (self.base_filename + '.json')
def rec_trim_empty_space(v: RecordingSample):
w, sr = librosa.load(v.location if not v.output_location.exists() else v.output_location, mono=True)
wi = 0
ringbuffer = []
for wi, wt in enumerate(w):
if abs(wt) > 0.01:
if len(ringbuffer) >= 100:
if np.mean(ringbuffer) > -0.01:
wi -= len(ringbuffer)
break
if len(ringbuffer) < 500:
ringbuffer.append(wt)
else:
ringbuffer.pop(0)
ringbuffer.append(wt)
ringbuffer = []
we = -1
for we, wt in enumerate(reversed(w)):
if abs(wt) > 0.01:
if len(ringbuffer) >= 100:
if np.mean(ringbuffer) > -0.01:
we -= len(ringbuffer)
break
if len(ringbuffer) < 500:
ringbuffer.append(wt)
else:
ringbuffer.pop(0)
ringbuffer.append(wt)
if we <= 0:
w = w[wi:]
else:
w = w[wi:-we]
sf.write(v.output_location, w, sr)
return True
def rec_create_forced_alignment(v: RecordingSample):
if not v.alignment_location.exists():
w, sr = librosa.load(v.output_location, mono=True)
try:
alignment = pyfoal.align(v.entry['sentence'], w, sr)
alignment.save_json(v.alignment_location)
phonemes = [Phoneme(p.phoneme, p.start(), p.end(), sr, len(w)) for p in alignment.phonemes()]
pdir = v.directory / 'phonemes'
if not pdir.exists():
os.mkdir(pdir)
for pi, p in enumerate(phonemes):
psubdir = pdir / p.name
if not psubdir.exists():
os.mkdir(psubdir)
sf.write(psubdir / (v.base_filename + '_{}.wav'.format(pi)), w[p.start_index:p.stop_index], sr)
except RuntimeError as e:
print(e)
except IndexError as e:
print(e)
except ValueError as e:
print(e)
return True
def trim_empty_space(values: List[TSVEntry], recording_locations: pathlib.Path):
round_robin_map(list(map(lambda x: RecordingSample(x, recording_locations),
tqdm(values, desc='Creating output file locations'))), rec_trim_empty_space,
1024, 'Trimming empty space')
def create_phoneme_alignment(values: List[TSVEntry], recording_location: pathlib.Path):
round_robin_map(list(map(lambda x: RecordingSample(x, recording_location),
tqdm(values, desc='Creating output file locations'))), rec_create_forced_alignment,
1024, 'Creating Forced Alignments')
def create_text_files(values: List[TSVEntry], recording_location: pathlib.Path):
for v in tqdm(values, desc='Generating text files for pyfoal'):
fname = v['path'].split('.')[0]
with open(recording_location / (fname + '_text.txt'), 'w+') as fp:
fp.write(v['sentence'])
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Processes the phonemes for the audio files')
parser.add_argument('tsv_file', type=pathlib.Path, help='The file containing the list of sample file locations')
parser.add_argument('clips_dir', type=pathlib.Path, help='The directory containing the sample files')
parser.add_argument('--trim', action='store_true')
args = parser.parse_args()
records = read_tsv(args.tsv_file)
if args.trim:
trim_empty_space(records, args.clips_dir)
create_phoneme_alignment(records, args.clips_dir)