-
Notifications
You must be signed in to change notification settings - Fork 2k
/
Copy pathtts_transformers.py
67 lines (61 loc) · 2.88 KB
/
tts_transformers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
device = "cuda" if torch.cuda.is_available() else "cpu"
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# load the vocoder, that is the voice encoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker ids from the embeddings dataset
speakers = {
'awb': 0, # Scottish male
'bdl': 1138, # US male
'clb': 2271, # US female
'jmk': 3403, # Canadian male
'ksp': 4535, # Indian male
'rms': 5667, # US male
'slt': 6799 # US female
}
def save_text_to_speech(text, speaker=None):
# preprocess text
inputs = processor(text=text, return_tensors="pt").to(device)
if speaker is not None:
# load xvector containing speaker's voice characteristics from a dataset
speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
else:
# random vector, meaning a random voice
speaker_embeddings = torch.randn((1, 512)).to(device)
# generate speech with the models
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
if speaker is not None:
# if we have a speaker, we use the speaker's ID in the filename
output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
else:
# if we don't have a speaker, we use a random string in the filename
random_str = ''.join(random.sample(string.ascii_letters+string.digits, k=5))
output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
# save the generated speech to a file with 16KHz sampling rate
sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
# return the filename for reference
return output_filename
# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")
# a challenging text with all speakers
text = """In his miracle year, he published four groundbreaking papers.
These outlined the theory of the photoelectric effect, explained Brownian motion,
introduced special relativity, and demonstrated mass-energy equivalence."""
for speaker_name, speaker in speakers.items():
output_filename = save_text_to_speech(text, speaker)
print(f"Saved {output_filename}")
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")