diff --git a/CHANGELOG.md b/CHANGELOG.md
index d444df27..be762442 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## Unreleased
+
+### Added
+
+- **Audio**: add `timestampGranularities`
+
 ## 3.7.0
 
 ### Added
diff --git a/openai-client/src/commonMain/kotlin/com.aallam.openai.client/internal/api/AudioApi.kt b/openai-client/src/commonMain/kotlin/com.aallam.openai.client/internal/api/AudioApi.kt
index 6006880e..81c6e3b6 100644
--- a/openai-client/src/commonMain/kotlin/com.aallam.openai.client/internal/api/AudioApi.kt
+++ b/openai-client/src/commonMain/kotlin/com.aallam.openai.client/internal/api/AudioApi.kt
@@ -61,6 +61,11 @@ internal class AudioApi(val requester: HttpRequester) : Audio {
         request.responseFormat?.let { append(key = "response_format", value = it.value) }
         request.temperature?.let { append(key = "temperature", value = it) }
         request.language?.let { append(key = "language", value = it) }
+        if (request.responseFormat == AudioResponseFormat.VerboseJson) {
+            for (timestampGranularity in request.timestampGranularities) {
+                append(key = "timestamp_granularities[]", value = timestampGranularity.value)
+            }
+        }
     }
 
     @BetaOpenAI
diff --git a/openai-client/src/commonTest/kotlin/com/aallam/openai/client/TestAudio.kt b/openai-client/src/commonTest/kotlin/com/aallam/openai/client/TestAudio.kt
index 0696a267..fcc8fc75 100644
--- a/openai-client/src/commonTest/kotlin/com/aallam/openai/client/TestAudio.kt
+++ b/openai-client/src/commonTest/kotlin/com/aallam/openai/client/TestAudio.kt
@@ -55,6 +55,22 @@ class TestAudio : TestOpenAI() {
         assertTrue { transcription.segments?.isNotEmpty() ?: false }
     }
 
+    @Test
+    fun transcriptionWithWordTimestamps() = test {
+        val request = transcriptionRequest {
+            audio = FileSource(path = testFilePath("audio/micro-machines.wav"), fileSystem = TestFileSystem)
+            model = ModelId("whisper-1")
+            responseFormat = AudioResponseFormat.VerboseJson
+            timestampGranularities = listOf(TimestampGranularity.Word)
+        }
+        val transcription = openAI.transcription(request)
+        assertTrue { transcription.text.isNotEmpty() }
+        assertEquals("english", transcription.language)
+        assertEquals(29.88, transcription.duration!!, absoluteTolerance = 0.1)
+        assertEquals(null, transcription.segments)
+        assertTrue { transcription.words?.isNotEmpty() ?: false }
+    }
+
     @Test
     fun translation() = test {
         val request = translationRequest {
diff --git a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TimestampGranularity.kt b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TimestampGranularity.kt
new file mode 100644
index 00000000..dc4b5a20
--- /dev/null
+++ b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TimestampGranularity.kt
@@ -0,0 +1,21 @@
+package com.aallam.openai.api.audio
+
+import kotlinx.serialization.Serializable
+import kotlin.jvm.JvmInline
+
+/**
+ * Granularity of the timestamps requested for a transcription.
+ *
+ * @property value the identifier sent in the `timestamp_granularities[]` form field.
+ */
+@Serializable
+@JvmInline
+public value class TimestampGranularity(public val value: String) {
+    public companion object {
+        /** Word-level timestamps (`"word"`). */
+        public val Word: TimestampGranularity = TimestampGranularity("word")
+
+        /** Segment-level timestamps (`"segment"`). */
+        public val Segment: TimestampGranularity = TimestampGranularity("segment")
+    }
+}
diff --git a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Transcription.kt b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Transcription.kt
index 35a6bc80..9ac88a20 100644
--- a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Transcription.kt
+++ b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Transcription.kt
@@ -20,4 +20,5 @@ public data class Transcription(
     @SerialName("language") val language: String? = null,
     @SerialName("duration") val duration: Double? = null,
     @SerialName("segments") val segments: List<Segment>? = null,
+    @SerialName("words") val words: List<Word>? = null,
 )
diff --git a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TranscriptionRequest.kt b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TranscriptionRequest.kt
index 9288583a..1967cac1 100644
--- a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TranscriptionRequest.kt
+++ b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/TranscriptionRequest.kt
@@ -43,6 +43,14 @@ public class TranscriptionRequest(
      * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
      */
     public val language: String? = null,
+
+    /**
+     * The timestamp granularities to populate for this transcription.
+     * [responseFormat] must be set to `verbose_json` to use timestamp granularities.
+     * Either or both of these options are supported: `word` and `segment`.
+     * Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+     */
+    public val timestampGranularities: List<TimestampGranularity> = emptyList(),
 )
 
 /**
@@ -90,6 +98,14 @@ public class TranscriptionRequestBuilder {
      */
     public var language: String? = null
 
+    /**
+     * The timestamp granularities to populate for this transcription.
+     * [responseFormat] must be set to `verbose_json` to use timestamp granularities.
+     * Either or both of these options are supported: `word` and `segment`.
+     * Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+     */
+    public var timestampGranularities: List<TimestampGranularity> = emptyList()
+
     /**
      * Builder of [TranscriptionRequest] instances.
      */
@@ -100,5 +116,6 @@ public class TranscriptionRequestBuilder {
         responseFormat = responseFormat,
         temperature = temperature,
         language = language,
+        timestampGranularities = timestampGranularities,
     )
 }
diff --git a/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Word.kt b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Word.kt
new file mode 100644
index 00000000..94287a6b
--- /dev/null
+++ b/openai-core/src/commonMain/kotlin/com.aallam.openai.api/audio/Word.kt
@@ -0,0 +1,18 @@
+package com.aallam.openai.api.audio
+
+import kotlinx.serialization.SerialName
+import kotlinx.serialization.Serializable
+
+/**
+ * A single entry of the `words` list of a [Transcription].
+ *
+ * @property word the `word` field of the entry.
+ * @property start the `start` field of the entry (presumably the start time in seconds; confirm against the API reference).
+ * @property end the `end` field of the entry (presumably the end time in seconds; confirm against the API reference).
+ */
+@Serializable
+public data class Word(
+    @SerialName("word") val word: String,
+    @SerialName("start") val start: Double,
+    @SerialName("end") val end: Double,
+)