From 5b54e275248ed54179af0e454a4c04613cac2234 Mon Sep 17 00:00:00 2001
From: Fedir Zadniprovskyi
Date: Tue, 18 Feb 2025 08:01:52 -0800
Subject: [PATCH] docs: address feedback (#300)

---
 docs/installation.md                          |   6 +-
 ...ergration.md => open-webui-integration.md} |   0
 docs/usage/text-to-speech.md                  |   2 +-
 mkdocs.yml                                    |   4 +-
 tests/realtime_vad_test.py                    | 108 ++++++++++++++++++
 5 files changed, 116 insertions(+), 4 deletions(-)
 rename docs/usage/{open-webui-intergration.md => open-webui-integration.md} (100%)
 create mode 100644 tests/realtime_vad_test.py

diff --git a/docs/installation.md b/docs/installation.md
index 413cf854..3738e353 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,8 +1,12 @@
+!!! warning
+
+    Additional steps are required to use the text-to-speech feature. Please see the [Text-to-Speech](/docs/usage/text-to-speech.md#prerequisite) page.
+
 ## Docker Compose (Recommended)
 
 !!! note
 
-    I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
+    I'm using newer Docker Compose features. If you are using an older version of Docker Compose, you may need to update.
 
 Download the necessary Docker Compose files
 
diff --git a/docs/usage/open-webui-intergration.md b/docs/usage/open-webui-integration.md
similarity index 100%
rename from docs/usage/open-webui-intergration.md
rename to docs/usage/open-webui-integration.md
diff --git a/docs/usage/text-to-speech.md b/docs/usage/text-to-speech.md
index 07470149..b30b4df8 100644
--- a/docs/usage/text-to-speech.md
+++ b/docs/usage/text-to-speech.md
@@ -120,4 +120,4 @@ curl http://localhost:8000/v1/audio/speech --header "Content-Type: application/j
 ## Limitations
 
 - `response_format`: `opus` and `aac` are not supported
-- Maximuam audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
+- Maximum audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
diff --git a/mkdocs.yml b/mkdocs.yml
index 528d3141..08b23609 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -42,9 +42,9 @@ plugins:
 nav:
   - Introduction: index.md
   - Usage / Capabilities:
-      - Voice Chat: usage/voice-chat.md
-      - Speech-to-Text: usage/speech-to-text.md
       - Text-to-Speech: usage/text-to-speech.md
+      - Speech-to-Text: usage/speech-to-text.md
+      - Voice Chat: usage/voice-chat.md
       - Open WebUI Integration: usage/open-webui-integration.md
   - Installation: installation.md
   - Configuration: configuration.md
diff --git a/tests/realtime_vad_test.py b/tests/realtime_vad_test.py
new file mode 100644
index 00000000..a22da78d
--- /dev/null
+++ b/tests/realtime_vad_test.py
@@ -0,0 +1,108 @@
+import asyncio
+import base64
+import logging
+from pathlib import Path
+
+import numpy as np
+from openai import AsyncOpenAI
+from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection
+from openai.types.beta.realtime.conversation_item_content_param import ConversationItemContentParam
+from openai.types.beta.realtime.conversation_item_param import ConversationItemParam
+from openai.types.beta.realtime.session_update_event_param import Session, SessionTurnDetection
+import pytest
+import soundfile as sf
+import websockets
+
+from speaches.audio import resample_audio
+
+logger = logging.getLogger(__name__)
+
+SAMPLE_RATE = 24000
+SAMPLE_WIDTH = 2
+BYTERATE = SAMPLE_RATE * SAMPLE_WIDTH  # like "bitrate" but in bytes
+
+WS_BASE_URL = "ws://localhost:8000/v1"
+MODEL = "gpt-4o-mini"
+
+RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=True))
+NO_RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=False))
+
+
+async def audio_sender(
+    conn: AsyncRealtimeConnection, audio_bytes: bytes, chunks_per_second: int = 10, speed: int = 1
+) -> None:
+    chunk_size = BYTERATE // chunks_per_second
+    try:
+        async with asyncio.TaskGroup() as tg:
+            for i in range(0, len(audio_bytes), chunk_size):
+                logger.info(f"Sending audio chunk from {i} to {i + chunk_size} of {len(audio_bytes)}")
+                audio_chunk = audio_bytes[i : i + chunk_size]
+                tg.create_task(conn.input_audio_buffer.append(audio=base64.b64encode(audio_chunk).decode("utf-8")))
+                await asyncio.sleep(1 / chunks_per_second / speed)
+    except* websockets.exceptions.ConnectionClosedError:
+        logger.info("Connection closed")
+
+
+async def print_events(conn: AsyncRealtimeConnection, final_event: str | None = None) -> None:
+    try:
+        async for event in conn:
+            if event.type == "response.audio.delta":
+                size = len(base64.b64decode(event.delta))
+                event.delta = f"base64 encoded audio of size {size} bytes"
+            print(event.model_dump_json())
+            if final_event is not None and event.type == final_event:
+                break
+    except websockets.exceptions.ConnectionClosedError:
+        logger.info("Connection closed")
+
+
+data, samplerate = sf.read(Path("1_2_3_4_5_6_7_8.wav"), dtype="int16")
+pcm_audio_bytes = data.tobytes()
+audio_bytes = resample_audio(pcm_audio_bytes, samplerate, SAMPLE_RATE)
+quiet_audio = np.zeros(SAMPLE_RATE * 3, dtype=np.int16).tobytes()  # 3 seconds of silence
+audio_bytes = audio_bytes + quiet_audio  # pad the sample so server-side VAD can detect the end of speech
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_vad_openai() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(
+            print_events(conn, final_event="conversation.item.input_audio_transcription.completed")
+        )
+        await conn.session.update(session=NO_RESPONSE_SESSION)
+        audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
+        await audio_sender_task
+        await print_events_task
+        await conn.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_response() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(print_events(conn, final_event=None))
+        await conn.session.update(session=RESPONSE_SESSION)
+        audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
+        await audio_sender_task
+        await print_events_task
+        await conn.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_create_conversation_item() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(print_events(conn, final_event="response.done"))
+        await conn.session.update(session=NO_RESPONSE_SESSION)
+        await conn.conversation.item.create(
+            item=ConversationItemParam(
+                role="user", type="message", content=[ConversationItemContentParam(type="input_text", text="Hello")]
+            )
+        )
+        await conn.response.create()
+        await print_events_task
+        await conn.close()
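
As a usage reference, the client flow the new tests exercise can be reproduced outside pytest. The sketch below is not part of the patch; it relies only on the SDK surface the tests already use (AsyncOpenAI with websocket_base_url, session.update, input_audio_buffer.append, async iteration over the connection) and assumes a speaches server on localhost:8000 that accepts a placeholder API key. The "session.updated" event name is likewise an assumption based on the standard realtime event set.

import asyncio
import base64

import numpy as np
from openai import AsyncOpenAI

SAMPLE_RATE = 24000  # the tests stream 16-bit mono PCM at 24 kHz


async def main() -> None:
    # Assumption: a local speaches server that does not validate the API key.
    client = AsyncOpenAI(websocket_base_url="ws://localhost:8000/v1", api_key="sk-placeholder")
    async with client.beta.realtime.connect(model="gpt-4o-mini") as conn:
        # Disable automatic response creation, as NO_RESPONSE_SESSION does in the tests.
        await conn.session.update(session={"turn_detection": {"create_response": False}})
        # Stream one second of silence into the input audio buffer, base64-encoded
        # as the realtime API expects.
        silence = np.zeros(SAMPLE_RATE, dtype=np.int16).tobytes()
        await conn.input_audio_buffer.append(audio=base64.b64encode(silence).decode("utf-8"))
        # Print server events until the session update is acknowledged
        # ("session.updated" is an assumed event type).
        async for event in conn:
            print(event.type)
            if event.type == "session.updated":
                break


if __name__ == "__main__":
    asyncio.run(main())

Assuming the requires_openai marker is registered in the project's pytest configuration (not shown in this patch), the tests themselves can be run against the same local server with: pytest tests/realtime_vad_test.py -m requires_openai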