-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathazure_speech.py
158 lines (127 loc) · 6.72 KB
/
azure_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from dotenv import load_dotenv
import time
import azure.cognitiveservices.speech as speechsdk
import keyboard
import os
# Just in case this file is loaded alone
load_dotenv(dotenv_path=".env.local")
class AzureSpeechAIManager:
    """Thin wrapper around the Azure Cognitive Services Speech SDK.

    Provides speech-to-text from the default microphone (one-shot and
    continuous) and text-to-speech through SSML, using credentials read
    from the AZURE_TTS_KEY / AZURE_TTS_REGION environment variables.
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    azure_speechconfig = None
    azure_audioconfig = None
    azure_speechrecognizer = None

    def __init__(self):
        try:
            self.azure_speechconfig = speechsdk.SpeechConfig(
                subscription=os.getenv("AZURE_TTS_KEY"),
                region=os.getenv("AZURE_TTS_REGION"),
            )
        except TypeError:
            # SpeechConfig raises TypeError when subscription/region are None
            # (os.getenv returns None for unset variables).
            raise SystemExit(
                "[red]Err: You forgot to set AZURE_TTS_KEY or AZURE_TTS_REGION in your environment."
            )
        self.azure_speechconfig.speech_recognition_language = "en-US"
        self.azure_speechconfig.speech_synthesis_voice_name = (
            "en-US-AvaMultilingualNeural"
        )
        self.azure_speechsynthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.azure_speechconfig
        )
        self.azure_audioconfig = speechsdk.audio.AudioConfig(
            use_default_microphone=True
        )
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(
            speech_config=self.azure_speechconfig, audio_config=self.azure_audioconfig
        )

    def stt_from_mic(self):
        """Recognize a single utterance from the default microphone.

        Blocks until the SDK decides the utterance is complete, prints a
        diagnostic for each outcome, and returns the recognized text
        (empty string when nothing was recognized or the request was
        canceled).
        """
        print("Speak into your microphone.")
        speech_recognition_result = (
            self.azure_speechrecognizer.recognize_once_async().get()
        )
        text_result = speech_recognition_result.text
        if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized: {}".format(speech_recognition_result.text))
        elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
            print(
                "No speech could be recognized: {}".format(
                    speech_recognition_result.no_match_details
                )
            )
        elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_recognition_result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")
        return text_result

    def stt_from_mic_continuous(self, stop_key="p"):
        """Run continuous recognition until *stop_key* is pressed.

        Listens on the default microphone, accumulates the text of every
        completed "recognized" event, and returns it all as one
        space-joined string. The session also ends if the service fires
        session_stopped or canceled.

        :param stop_key: keyboard key that ends recognition (default "p").
        :return: the concatenated recognized text, stripped of
            leading/trailing whitespace.
        """
        # Recreate the recognizer without an explicit AudioConfig; the SDK
        # then captures from the default microphone for this session.
        self.azure_speechrecognizer = speechsdk.SpeechRecognizer(
            speech_config=self.azure_speechconfig
        )

        done = False

        # Accumulates the text of each completed recognition chunk.
        all_results = []

        def recognized_cb(evt: speechsdk.SpeechRecognitionEventArgs):
            # Fires whenever a chunk of speech has been fully recognized.
            print("RECOGNIZED: {}".format(evt))
            all_results.append(evt.result.text)

        self.azure_speechrecognizer.recognized.connect(recognized_cb)

        def stop_cb(evt: speechsdk.SessionEventArgs):
            # Fires on session_stopped or canceled; ends the wait loop below.
            print("CLOSING speech recognition on {}".format(evt))
            nonlocal done
            done = True

        self.azure_speechrecognizer.session_stopped.connect(stop_cb)
        self.azure_speechrecognizer.canceled.connect(stop_cb)

        # start_continuous_recognition_async() returns a future; .get()
        # blocks until engine initialization is done.
        self.azure_speechrecognizer.start_continuous_recognition_async().get()
        print("Continuous Speech Recognition is now running, say something.")

        while not done:
            # keyboard.read_key() blocks until any key press.
            if keyboard.read_key() == stop_key:
                print("\nEnding azure speech recognition\n")
                # BUGFIX: wait on the stop future so in-flight "recognized"
                # events are flushed before we assemble the final result
                # (the original fired the stop request and returned
                # immediately, which could drop the last chunk).
                self.azure_speechrecognizer.stop_continuous_recognition_async().get()
                break

        final_result = " ".join(all_results).strip()
        print(f"\n\nHeres the result we got!\n\n{final_result}\n\n")
        return final_result

    def tts(self, text):
        """Speak *text* aloud through the default output device.

        Wraps the text in SSML (see :meth:`ssml`) and blocks until
        synthesis completes; prints diagnostics on cancellation.
        """
        result = self.azure_speechsynthesizer.speak_ssml_async(self.ssml(text)).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return
        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))

    def ssml(self, text, voice_style="affectionate", role="Girl"):
        """Return *text* wrapped in an SSML document for synthesis.

        :param text: the text to speak.
        :param voice_style: mstts express-as style (default "affectionate").
        :param role: mstts express-as role (default "Girl").
        """
        return f"""
        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xmlns:emo='http://www.w3.org/2009/10/emotionml' xml:lang='en-US'>
            <voice name="en-US-AvaMultilingualNeural">
                <mstts:express-as style='{voice_style}' role='{role}'>
                    <prosody rate="fast">
                        {text}
                    </prosody>
                </mstts:express-as>
            </voice>
        </speak>
        """
# Manual smoke test: synthesize a short sentence out loud.
if __name__ == "__main__":
    manager = AzureSpeechAIManager()
    # STT smoke tests, disabled by default (they need a microphone):
    # manager.stt_from_mic()
    # print(f"[green]HERE IS THE RESULT:\n{manager.stt_from_mic_continuous()}")
    manager.tts("what are you doing now?")