From 5b54e275248ed54179af0e454a4c04613cac2234 Mon Sep 17 00:00:00 2001
From: Fedir Zadniprovskyi
Date: Tue, 18 Feb 2025 08:01:52 -0800
Subject: [PATCH] docs: address feedback (#300)

---
 docs/installation.md                          |   6 +-
 ...ergration.md => open-webui-integration.md} |   0
 docs/usage/text-to-speech.md                  |   2 +-
 mkdocs.yml                                    |   4 +-
 tests/realtime_vad_test.py                    | 108 ++++++++++++++++++
 5 files changed, 116 insertions(+), 4 deletions(-)
 rename docs/usage/{open-webui-intergration.md => open-webui-integration.md} (100%)
 create mode 100644 tests/realtime_vad_test.py

diff --git a/docs/installation.md b/docs/installation.md
index 413cf854..3738e353 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,8 +1,12 @@
+!!! warning
+
+    Additional steps are required to use the text-to-speech feature. Please see the [Text-to-Speech](/docs/usage/text-to-speech.md#prerequisite) page.
+
 ## Docker Compose (Recommended)
 
 !!! note
 
-    I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
+    I'm using newer Docker Compose features. If you are using an older version of Docker Compose, you may need to update.
 
 Download the necessary Docker Compose files
 
diff --git a/docs/usage/open-webui-intergration.md b/docs/usage/open-webui-integration.md
similarity index 100%
rename from docs/usage/open-webui-intergration.md
rename to docs/usage/open-webui-integration.md
diff --git a/docs/usage/text-to-speech.md b/docs/usage/text-to-speech.md
index 07470149..b30b4df8 100644
--- a/docs/usage/text-to-speech.md
+++ b/docs/usage/text-to-speech.md
@@ -120,4 +120,4 @@ curl http://localhost:8000/v1/audio/speech --header "Content-Type: application/j
 ## Limitations
 
 - `response_format`: `opus` and `aac` are not supported
-- Maximuam audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
+- Maximum audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
diff --git a/mkdocs.yml b/mkdocs.yml
index 528d3141..08b23609 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -42,9 +42,9 @@ plugins:
 nav:
   - Introduction: index.md
   - Usage / Capabilities:
-      - Voice Chat: usage/voice-chat.md
-      - Speech-to-Text: usage/speech-to-text.md
       - Text-to-Speech: usage/text-to-speech.md
+      - Speech-to-Text: usage/speech-to-text.md
+      - Voice Chat: usage/voice-chat.md
       - Open WebUI Integration: usage/open-webui-integration.md
   - Installation: installation.md
   - Configuration: configuration.md
diff --git a/tests/realtime_vad_test.py b/tests/realtime_vad_test.py
new file mode 100644
index 00000000..a22da78d
--- /dev/null
+++ b/tests/realtime_vad_test.py
@@ -0,0 +1,108 @@
+import asyncio
+import base64
+import logging
+from pathlib import Path
+
+import numpy as np
+from openai import AsyncOpenAI
+from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection
+from openai.types.beta.realtime.conversation_item_content_param import ConversationItemContentParam
+from openai.types.beta.realtime.conversation_item_param import ConversationItemParam
+from openai.types.beta.realtime.session_update_event_param import Session, SessionTurnDetection
+import pytest
+import soundfile as sf
+import websockets
+
+from speaches.audio import resample_audio
+
+logger = logging.getLogger(__name__)
+
+SAMPLE_RATE = 24000
+SAMPLE_WIDTH = 2
+BYTERATE = SAMPLE_RATE * SAMPLE_WIDTH  # like "bitrate" but in bytes
+
+WS_BASE_URL = "ws://localhost:8000/v1"
+MODEL = "gpt-4o-mini"
+
+RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=True))
+NO_RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=False))
+
+
+async def audio_sender(
+    conn: AsyncRealtimeConnection, audio_bytes: bytes, chunks_per_second: int = 10, speed: int = 1
+) -> None:
+    chunk_size = BYTERATE // chunks_per_second
+    try:
+        async with asyncio.TaskGroup() as tg:
+            for i in range(0, len(audio_bytes), chunk_size):
+                logger.info(f"Sending audio chunk from {i} to {i + chunk_size} of {len(audio_bytes)}")
+                audio_chunk = audio_bytes[i : i + chunk_size]
+                tg.create_task(conn.input_audio_buffer.append(audio=base64.b64encode(audio_chunk).decode("utf-8")))
+                await asyncio.sleep(1 / chunks_per_second / speed)
+    except* websockets.exceptions.ConnectionClosedError:
+        logger.info("Connection closed")
+
+
+async def print_events(conn: AsyncRealtimeConnection, final_event: str | None = None) -> None:
+    try:
+        async for event in conn:
+            if event.type == "response.audio.delta":
+                size = len(base64.b64decode(event.delta))
+                event.delta = f"base64 encoded audio of size {size} bytes"
+            print(event.model_dump_json())
+            if final_event is not None and event.type == final_event:
+                break
+    except websockets.exceptions.ConnectionClosedError:
+        logger.info("Connection closed")
+
+
+data, samplerate = sf.read(Path("1_2_3_4_5_6_7_8.wav"), dtype="int16")
+pcm_audio_bytes = data.tobytes()
+audio_bytes = resample_audio(pcm_audio_bytes, samplerate, SAMPLE_RATE)
+quiet_audio = np.zeros(SAMPLE_RATE * 3, dtype=np.int16).tobytes()  # 3 seconds of silence
+audio_bytes = audio_bytes + quiet_audio  # pad the sample so server-side VAD can detect the end of speech
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_vad_openai() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(
+            print_events(conn, final_event="conversation.item.input_audio_transcription.completed")
+        )
+        await conn.session.update(session=NO_RESPONSE_SESSION)
+        audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
+        await audio_sender_task
+        await print_events_task
+        await conn.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_response() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(print_events(conn, final_event=None))
+        await conn.session.update(session=RESPONSE_SESSION)
+        audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
+        await audio_sender_task
+        await print_events_task
+        await conn.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.requires_openai
+async def test_realtime_create_conversation_item() -> None:
+    realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
+    async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
+        print_events_task = tg.create_task(print_events(conn, final_event="response.done"))
+        await conn.session.update(session=NO_RESPONSE_SESSION)
+        await conn.conversation.item.create(
+            item=ConversationItemParam(
+                role="user", type="message", content=[ConversationItemContentParam(type="input_text", text="Hello")]
+            )
+        )
+        await conn.response.create()
+        await print_events_task
+        await conn.close()
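
As a usage reference, the client flow the new tests exercise can be reproduced outside pytest. The sketch below is not part of the patch; it relies only on the SDK surface the tests already use (AsyncOpenAI with websocket_base_url, session.update, input_audio_buffer.append, async iteration over the connection) and assumes a speaches server on localhost:8000 that accepts a placeholder API key. The "session.updated" event name is likewise an assumption based on the standard realtime event set.

import asyncio
import base64

import numpy as np
from openai import AsyncOpenAI

SAMPLE_RATE = 24000  # the tests stream 16-bit mono PCM at 24 kHz


async def main() -> None:
    # Assumption: a local speaches server that does not validate the API key.
    client = AsyncOpenAI(websocket_base_url="ws://localhost:8000/v1", api_key="sk-placeholder")
    async with client.beta.realtime.connect(model="gpt-4o-mini") as conn:
        # Disable automatic response creation, as NO_RESPONSE_SESSION does in the tests.
        await conn.session.update(session={"turn_detection": {"create_response": False}})
        # Stream one second of silence into the input audio buffer, base64-encoded
        # as the realtime API expects.
        silence = np.zeros(SAMPLE_RATE, dtype=np.int16).tobytes()
        await conn.input_audio_buffer.append(audio=base64.b64encode(silence).decode("utf-8"))
        # Print server events until the session update is acknowledged
        # ("session.updated" is an assumed event type).
        async for event in conn:
            print(event.type)
            if event.type == "session.updated":
                break


if __name__ == "__main__":
    asyncio.run(main())

Assuming the requires_openai marker is registered in the project's pytest configuration (not shown in this patch), the tests themselves can be run against the same local server with: pytest tests/realtime_vad_test.py -m requires_openai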