1
1
import os
2
2
import datetime
3
3
from openai import OpenAI
4
- client = OpenAI ()
4
+ from elevenlabs .client import ElevenLabs , Voice
5
+ from elevenlabs import stream
5
6
import argparse
6
7
from dataclasses import asdict
7
8
from models import Message
12
13
import dotenv
13
14
# Load environment variables (API keys, voice id, ...) from the local .env file.
dotenv.load_dotenv('.env')


# API clients. Both are constructed with no explicit arguments, so they pick up
# their credentials from the environment populated above — presumably
# OPENAI_API_KEY and ELEVENLABS_API_KEY; confirm against the SDK defaults.
oai_client = OpenAI()
elevenlabs_client = ElevenLabs()

# Model configuration used throughout this module.
CHAT_MODEL = "gpt-4o"        # OpenAI chat-completions model
TTS_MODEL = "tts-1"          # OpenAI text-to-speech model
MODEL_TEMPERATURE = 0.5      # sampling temperature for chat responses
AUDIO_MODEL = "whisper-1"    # OpenAI speech-to-text model
# NOTE(review): no fallback here — VOICE_ID is None when ELEVENLABS_VOICE_ID is
# unset, which would only surface later inside elevenlabs_text_to_speech.
VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID")
20
24
21
25
def ask_gpt_chat(prompt: str, messages: list[Message]):
    """Return the chat model's reply to the conversation so far.

    The system prompt is placed first, followed by the running message
    history converted to plain dicts, and the combined payload is sent to
    the OpenAI chat-completions endpoint.
    """
    payload = [{"role": "system", "content": prompt}]
    payload.extend(asdict(m) for m in messages)
    completion = oai_client.chat.completions.create(
        model=CHAT_MODEL,
        messages=payload,
        temperature=MODEL_TEMPERATURE,
    )
    return completion.choices[0].message.content
30
36
31
37
def setup_prompt (prompt_file : str = 'prompts/vet_prompt.md' ) -> str :
@@ -37,7 +43,7 @@ def setup_prompt(prompt_file: str = 'prompts/vet_prompt.md') -> str:
37
43
38
44
def get_transcription (file_path : str ):
39
45
audio_file = open (file_path , "rb" )
40
- transcription = client .audio .transcriptions .create (
46
+ transcription = oai_client .audio .transcriptions .create (
41
47
model = AUDIO_MODEL ,
42
48
file = audio_file
43
49
)
@@ -64,17 +70,27 @@ def record():
64
70
f .write (transcript )
65
71
return transcript
66
72
67
def oai_text_to_speech(text: str, voice: str = "nova"):
    """Synthesize *text* to an mp3 file via the OpenAI TTS API.

    Args:
        text: The text to speak.
        voice: OpenAI voice name. Defaults to "nova", preserving the
            previous hard-coded behavior while letting callers choose.

    Returns:
        Path to the generated mp3 under outputs/, named with the current
        Unix timestamp so successive calls never collide.
    """
    timestamp = datetime.datetime.now().timestamp()
    speech_file_path = Path(__file__).parent / f"outputs/{timestamp}.mp3"
    response = oai_client.audio.speech.create(
        model=TTS_MODEL,
        voice=voice,
        input=text
    )
    response.write_to_file(speech_file_path)
    return speech_file_path
77
83
84
def elevenlabs_text_to_speech(text: str):
    """Speak *text* aloud through the ElevenLabs streaming API.

    Audio is requested as a stream and played back chunk-by-chunk as it
    arrives, so nothing is written to disk.
    """
    # NOTE(review): assumes ELEVENLABS_VOICE_ID was set; VOICE_ID is None
    # otherwise — confirm before relying on this path.
    selected_voice = Voice(voice_id=VOICE_ID)
    audio_chunks = elevenlabs_client.generate(
        text=text,
        voice=selected_voice,
        stream=True
    )
    stream(audio_chunks)
93
+
78
94
def clean_up ():
79
95
logging .info ('Exiting...' )
80
96
# Delete all the recordings and transcripts
@@ -93,8 +109,10 @@ def clean_up():
93
109
if __name__ == "__main__" :
94
110
parser = argparse .ArgumentParser ()
95
111
parser .add_argument ("-pf" , "--prompt_file" , help = "Specify the prompt file to use." , type = str )
112
+ parser .add_argument ("-tts" , "--tts_type" , help = "Specify the TTS type to use." , type = str , default = "openai" , choices = ["openai" , "elevenlabs" ])
96
113
args = parser .parse_args ()
97
114
prompt_file = args .prompt_file
115
+ tts_type = args .tts_type or "openai"
98
116
99
117
prompt = setup_prompt (prompt_file )
100
118
conversation_messages = []
@@ -106,9 +124,12 @@ def clean_up():
106
124
answer = ask_gpt_chat (prompt , conversation_messages )
107
125
logging .info (f'Caller: { answer } ' )
108
126
logging .info ('Playing audio...' )
109
- audio_file = text_to_speech (answer )
110
- # Play the audio file
111
- os .system (f"afplay { audio_file } " )
127
+ if tts_type == "elevenlabs" :
128
+ elevenlabs_text_to_speech (answer )
129
+ else :
130
+ audio_file = oai_text_to_speech (answer )
131
+ # Play the audio file
132
+ os .system (f"afplay { audio_file } " )
112
133
conversation_messages .append (Message (role = "assistant" , content = answer ))
113
134
if 'bye' in user_input .lower ():
114
135
clean_up ()
0 commit comments