Skip to content

Commit 98d6f0d

Browse files
Fedir Zadniprovskyi (fedirz)
authored and committed
fix: timestamp_granularities[] handling (#28, #58, #81)
1 parent 5cbc876 commit 98d6f0d

File tree

3 files changed

+69
-3
lines changed

3 files changed

+69
-3
lines changed

src/faster_whisper_server/routers/stt.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
APIRouter,
1010
Form,
1111
Query,
12+
Request,
1213
Response,
1314
UploadFile,
1415
WebSocket,
@@ -30,6 +31,8 @@
3031
from faster_whisper_server.core import Segment, segments_to_srt, segments_to_text, segments_to_vtt
3132
from faster_whisper_server.dependencies import ConfigDependency, ModelManagerDependency, get_config
3233
from faster_whisper_server.server_models import (
34+
DEFAULT_TIMESTAMP_GRANULARITIES,
35+
TIMESTAMP_GRANULARITIES_COMBINATIONS,
3336
TimestampGranularities,
3437
TranscriptionJsonResponse,
3538
TranscriptionVerboseJsonResponse,
@@ -150,6 +153,18 @@ def translate_file(
150153
return segments_to_response(segments, transcription_info, response_format)
151154

152155

156+
# HACK: Since Form() doesn't support `alias`, we need to use a workaround.
157+
async def get_timestamp_granularities(request: Request) -> TimestampGranularities:
158+
form = await request.form()
159+
if form.get("timestamp_granularities[]") is None:
160+
return DEFAULT_TIMESTAMP_GRANULARITIES
161+
timestamp_granularities = form.getlist("timestamp_granularities[]")
162+
assert (
163+
timestamp_granularities in TIMESTAMP_GRANULARITIES_COMBINATIONS
164+
), f"{timestamp_granularities} is not a valid value for `timestamp_granularities[]`."
165+
return timestamp_granularities
166+
167+
153168
# https://platform.openai.com/docs/api-reference/audio/createTranscription
154169
# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8915
155170
@router.post(
@@ -159,6 +174,7 @@ def translate_file(
159174
def transcribe_file(
160175
config: ConfigDependency,
161176
model_manager: ModelManagerDependency,
177+
request: Request,
162178
file: Annotated[UploadFile, Form()],
163179
model: Annotated[ModelName | None, Form()] = None,
164180
language: Annotated[Language | None, Form()] = None,
@@ -167,6 +183,7 @@ def transcribe_file(
167183
temperature: Annotated[float, Form()] = 0.0,
168184
timestamp_granularities: Annotated[
169185
TimestampGranularities,
186+
# WARN: `alias` doesn't actually work.
170187
Form(alias="timestamp_granularities[]"),
171188
] = ["segment"],
172189
stream: Annotated[bool, Form()] = False,
@@ -178,6 +195,11 @@ def transcribe_file(
178195
language = config.default_language
179196
if response_format is None:
180197
response_format = config.default_response_format
198+
timestamp_granularities = asyncio.run(get_timestamp_granularities(request))
199+
if timestamp_granularities != DEFAULT_TIMESTAMP_GRANULARITIES and response_format != ResponseFormat.VERBOSE_JSON:
200+
logger.warning(
201+
"It only makes sense to provide `timestamp_granularities[]` when `response_format` is set to `verbose_json`. See https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities." # noqa: E501
202+
)
181203
whisper = model_manager.load_model(model)
182204
segments, transcription_info = whisper.transcribe(
183205
file.file,

src/faster_whisper_server/server_models.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class TranscriptionVerboseJsonResponse(BaseModel):
2929
language: str
3030
duration: float
3131
text: str
32-
words: list[Word]
32+
words: list[Word] | None
3333
segments: list[Segment]
3434

3535
@classmethod
@@ -38,7 +38,7 @@ def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -
3838
language=transcription_info.language,
3939
duration=segment.end - segment.start,
4040
text=segment.text,
41-
words=(segment.words if isinstance(segment.words, list) else []),
41+
words=segment.words if transcription_info.transcription_options.word_timestamps else None,
4242
segments=[segment],
4343
)
4444

@@ -51,7 +51,7 @@ def from_segments(
5151
duration=transcription_info.duration,
5252
text=segments_to_text(segments),
5353
segments=segments,
54-
words=Word.from_segments(segments),
54+
words=Word.from_segments(segments) if transcription_info.transcription_options.word_timestamps else None,
5555
)
5656

5757
@classmethod
@@ -112,6 +112,7 @@ class ModelObject(BaseModel):
112112
TimestampGranularities = list[Literal["segment", "word"]]
113113

114114

115+
DEFAULT_TIMESTAMP_GRANULARITIES: TimestampGranularities = ["segment"]
115116
TIMESTAMP_GRANULARITIES_COMBINATIONS: list[TimestampGranularities] = [
116117
[], # should be treated as ["segment"]. https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities
117118
["segment"],
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""See `tests/openai_timestamp_granularities_test.py` to understand how OpenAI handles `response_type` and `timestamp_granularities`.""" # noqa: E501
2+
3+
from faster_whisper_server.server_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
4+
from openai import AsyncOpenAI
5+
import pytest
6+
7+
8+
@pytest.mark.asyncio()
9+
@pytest.mark.parametrize("timestamp_granularities", TIMESTAMP_GRANULARITIES_COMBINATIONS)
10+
async def test_api_json_response_format_and_timestamp_granularities_combinations(
11+
openai_client: AsyncOpenAI,
12+
timestamp_granularities: TimestampGranularities,
13+
) -> None:
14+
audio_file = open("audio.wav", "rb") # noqa: SIM115, ASYNC230
15+
16+
await openai_client.audio.transcriptions.create(
17+
file=audio_file, model="whisper-1", response_format="json", timestamp_granularities=timestamp_granularities
18+
)
19+
20+
21+
@pytest.mark.asyncio()
22+
@pytest.mark.parametrize("timestamp_granularities", TIMESTAMP_GRANULARITIES_COMBINATIONS)
23+
async def test_api_verbose_json_response_format_and_timestamp_granularities_combinations(
24+
openai_client: AsyncOpenAI,
25+
timestamp_granularities: TimestampGranularities,
26+
) -> None:
27+
audio_file = open("audio.wav", "rb") # noqa: SIM115, ASYNC230
28+
29+
transcription = await openai_client.audio.transcriptions.create(
30+
file=audio_file,
31+
model="whisper-1",
32+
response_format="verbose_json",
33+
timestamp_granularities=timestamp_granularities,
34+
)
35+
36+
assert transcription.__pydantic_extra__
37+
if "word" in timestamp_granularities:
38+
assert transcription.__pydantic_extra__.get("segments") is not None
39+
assert transcription.__pydantic_extra__.get("words") is not None
40+
else:
41+
# Unless explicitly requested, words are not present
42+
assert transcription.__pydantic_extra__.get("segments") is not None
43+
assert transcription.__pydantic_extra__.get("words") is None

0 commit comments

Comments (0)