From 8c4f9f1d8fa1a843f2115026a13b52d36dc50d83 Mon Sep 17 00:00:00 2001 From: sachaarbonel Date: Wed, 13 Mar 2024 14:38:39 +0530 Subject: [PATCH 1/5] update docs upon feedback --- .../docs/api/recording/recording_calls.mdx | 2 +- .../api/transcription/transcribing_calls.mdx | 31 ++++++++++++++----- .../docusaurus/docs/api/webhooks/events.mdx | 4 +++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/docusaurus/video/docusaurus/docs/api/recording/recording_calls.mdx b/docusaurus/video/docusaurus/docs/api/recording/recording_calls.mdx index e2154def..79ddfeba 100644 --- a/docusaurus/video/docusaurus/docs/api/recording/recording_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/recording/recording_calls.mdx @@ -10,7 +10,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; Calls can be recorded for later use. Calls recording can be started/stopped via API calls or configured to start automatically when the first user joins the call. -Call recording is done by Stream server-side and later stored on AWS S3. You can also configure your Stream application to have files stored on your own S3 bucket (in that case, storage costs will not apply). +Call recording is done by Stream server-side and later stored on AWS S3. There is no charge for storage of recordings. You can also configure your Stream application to have files stored on your own S3 bucket. By default, calls will be recorded as mp4 video files. You can configure recording to only capture the audio. 
diff --git a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx index 640bcb56..e620c856 100644 --- a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx @@ -8,11 +8,13 @@ title: Transcribing calls import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -Transcribing calls allows for the conversion of spoken words into written text. Transcription can be started/stopped via API calls or configured to start automatically when the first user joins the call. Call transcription is done by the Stream server-side and later stored on AWS S3. You can also configure your Stream application to have files stored on your own S3 bucket (in that case, storage costs will not apply). +Transcribing calls allows for the conversion of spoken words into written text. Transcription can be started/stopped via API calls or configured to start automatically when the first user joins the call. Call transcription is done by the Stream server-side and later stored on AWS S3. There is no charge for storage of transcriptions. You can also configure your Stream application to have files stored on your own S3 bucket. By default, transcriptions will be provided in a jsonl file. -Note: Transcriptions will capture all speakers in a single file. +> **Note:** Transcriptions will capture all speakers in a single file. + +> **Note:** It's important to note that transcriptions should not be used as a replacement for closed captioning (CC). We have it planned on our [roadmap](https://github.com/GetStream/protocol/discussions/127) to support CC in the future. 
## Start and stop call transcription @@ -31,10 +33,10 @@ call.stopTranscription(); ```py -// starts transcribing +# starts transcribing call.start_transcription() -// stops the transcription for the call +# stops the transcription for the call call.stop_transcription() ``` @@ -82,10 +84,23 @@ curl -X GET "https://video.stream-io-api.com/video/call/default/${CALL_ID}/trans These events are sent to users connected to the call and your webhook/SQS: -- `call.transcription_started` when the call transcription has started -- `call.transcription_stopped` when the call transcription has stopped -- `call.transcription_ready` when the transcription is available for download -- `call.transcription_failed` when transcribing fails for any reason +- `call.transcription_started` sent when the transcription of the call has started +- `call.transcription_stopped` this event is sent only when the transcription is explicitly stopped through an API call, not in cases where the transcription process encounters an error. +- `call.transcription_ready` dispatched when the transcription is completed and available for download. An example payload of this event is detailed below. +- `call.transcription_failed` sent if the transcription process encounters any issues. 
+ + +## Transcription JSONL file format + + ```jsonl + {"type":"speech", "start_time": "2024-02-28T08:18:18.061031795Z", "stop_time":"2024-02-28T08:18:22.401031795Z", "speaker_id": "Sacha_Arbonel", "text": "hello"} + {"type":"speech", "start_time": "2024-02-28T08:18:22.401031795Z", "stop_time":"2024-02-28T08:18:26.741031795Z", "speaker_id": "Sacha_Arbonel", "text": "how are you"} + {"type":"speech", "start_time": "2024-02-28T08:18:26.741031795Z", "stop_time":"2024-02-28T08:18:31.081031795Z", "speaker_id": "Tommaso_Barbugli", "text": "I'm good"} + {"type":"speech", "start_time": "2024-02-28T08:18:31.081031795Z", "stop_time":"2024-02-28T08:18:35.421031795Z", "speaker_id": "Tommaso_Barbugli", "text": "how about you"} + {"type":"speech", "start_time": "2024-02-28T08:18:35.421031795Z", "stop_time":"2024-02-28T08:18:39.761031795Z", "speaker_id": "Sacha_Arbonel", "text": "I'm good too"} + {"type":"speech", "start_time": "2024-02-28T08:18:39.761031795Z", "stop_time":"2024-02-28T08:18:44.101031795Z", "speaker_id": "Tommaso_Barbugli", "text": "that's great"} + {"type":"speech", "start_time": "2024-02-28T08:18:44.101031795Z", "stop_time":"2024-02-28T08:18:48.441031795Z", "speaker_id": "Tommaso_Barbugli", "text": "I'm glad to hear that"} + ``` ## User Permissions diff --git a/docusaurus/video/docusaurus/docs/api/webhooks/events.mdx b/docusaurus/video/docusaurus/docs/api/webhooks/events.mdx index 5ac85e79..fcbae0ec 100644 --- a/docusaurus/video/docusaurus/docs/api/webhooks/events.mdx +++ b/docusaurus/video/docusaurus/docs/api/webhooks/events.mdx @@ -39,6 +39,10 @@ Here you can find the list of events are sent to Webhook and SQS. 
| call.recording_stopped | Sent when call recording has stopped | | call.recording_ready | Sent when the recording is available for download | | call.recording_failed | Sent when recording fails for any reason | +| call.transcription_started | Sent when the transcription has started | +| call.transcription_stopped | Sent when the transcription is stopped | +| call.transcription_ready | Sent when the transcription is ready | +| call.transcription_failed | Sent when the transcription fails | You can find the definition of each events in the OpenAPI spec available [here](https://github.com/GetStream/protocol/blob/main/openapi/video-openapi.yaml) From 29810baa30ad2caa1b782c91273b78f94353fd06 Mon Sep 17 00:00:00 2001 From: sachaarbonel Date: Mon, 18 Mar 2024 13:56:00 +0530 Subject: [PATCH 2/5] transcription ready file format --- .../docs/api/transcription/transcribing_calls.mdx | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx index e620c856..1b567632 100644 --- a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx @@ -89,7 +89,22 @@ These events are sent to users connected to the call and your webhook/SQS: - `call.transcription_ready` dispatched when the transcription is completed and available for download. An example payload of this event is detailed below. - `call.transcription_failed` sent if the transcription process encounters any issues. 
+## Transcriptio nready format +``` +{ + "type": "call.transcription_ready", + "created_at": "2024-03-18T08:24:14.769328551Z", + "call_cid": "default:mkzN17EUrgvn", + "call_transcription": { + "filename": "transcript_default_mkzN17EUrgvn_1710750207642.jsonl", + "url": "https://frankfurt.stream-io-cdn.com/1129528/video/transcriptions/default_mkzN17EUrgvn/transcript_default_mkzN17EUrgvn_1710750207642.jsonl?Expires=1710751154&Signature=OhdoTClQm5MT8ITPLAEJcKNflsJ7B2G3j7kx~kQyPrAETftrM2rzZy4IIT1XIC~8MrbPduWcj1tILXoSg3ldfZEHWRPqeMFr0caljPAVAL~mybUb4Kct2JoPjfsYfmj4FzSQbT7Iib38qPr7uiP0axTFm0VKRenkNwwCoS0F858u9Mdr8r6fTzILhiOZ1hOjw3V-TT1YbR20Yn4abKi6i50GAs5fqUDtSlo9DmEJgcS79Y0wUD1g18cGZvg3NiH3ogHQnmvoNrf28Cxc0JhBCe4wFErCMJ3pinewEOwDEEOMdHcRtcfWy72w6MTEwi0yomHYIU5flaYgUXCkkOJODw__&Key-Pair-Id=APKAIHG36VEWPDULE23Q", + "start_time": "2024-03-18T08:23:27.642688204Z", + "end_time": "2024-03-18T08:24:14.754731786Z" + }, + "received_at": "2024-03-18T08:24:14.790Z" +} +``` ## Transcription JSONL file format ```jsonl From 39e7270a4e4e282faf4078c0e1d17c5d642ed495 Mon Sep 17 00:00:00 2001 From: sachaarbonel Date: Mon, 18 Mar 2024 13:57:50 +0530 Subject: [PATCH 3/5] typo --- .../docusaurus/docs/api/transcription/transcribing_calls.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx index 1b567632..4010f296 100644 --- a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx @@ -89,7 +89,7 @@ These events are sent to users connected to the call and your webhook/SQS: - `call.transcription_ready` dispatched when the transcription is completed and available for download. An example payload of this event is detailed below. - `call.transcription_failed` sent if the transcription process encounters any issues. 
-## Transcriptio nready format +## Transcription ready format ``` { "type": "call.transcription_ready", From 13fb5f9d573df0d49338fb92652e6bef79fd9e44 Mon Sep 17 00:00:00 2001 From: Sacha Arbonel Date: Thu, 21 Mar 2024 20:33:54 +0530 Subject: [PATCH 4/5] Update transcribing_calls.mdx --- .../docusaurus/docs/api/transcription/transcribing_calls.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx index 4010f296..122cb575 100644 --- a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx @@ -2,7 +2,7 @@ id: transcription_calls sidebar_position: 1 slug: /transcribing/calls -title: Transcribing calls +title: Call Transcriptions --- import Tabs from '@theme/Tabs'; @@ -214,4 +214,4 @@ client.update( ``` - \ No newline at end of file + From 7b048eda50f00c83e4f31737b9ff9d3590e8a8e8 Mon Sep 17 00:00:00 2001 From: sachaarbonel Date: Wed, 10 Apr 2024 14:57:03 +0200 Subject: [PATCH 5/5] update docs --- .../api/transcription/transcribing_calls.mdx | 69 ++++++++++++++++--- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx index 122cb575..88e3f3f2 100644 --- a/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx +++ b/docusaurus/video/docusaurus/docs/api/transcription/transcribing_calls.mdx @@ -2,21 +2,21 @@ id: transcription_calls sidebar_position: 1 slug: /transcribing/calls -title: Call Transcriptions +title: Call Transcription and Closed Captions --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -Transcribing calls allows for the conversion of spoken words into written text. 
Transcription can be started/stopped via API calls or configured to start automatically when the first user joins the call. Call transcription is done by the Stream server-side and later stored on AWS S3. There is no charge for storage of transcriptions. You can also configure your Stream application to have files stored on your own S3 bucket.
+You can transcribe calls to text using API calls or configure your call types to be automatically transcribed. When transcription is enabled automatically, the transcription process will start when the first user joins the call, and then stop when all participants have left the call.
 
-By default, transcriptions will be provided in a jsonl file.
+Transcriptions are structured as plain-text JSONL files and automatically uploaded to Stream managed storage or to your own configurable storage. Websocket and webhook events are also sent when transcription starts, stops and completes.
 
-> **Note:** Transcriptions will capture all speakers in a single file.
+Stream supports transcribing calls in multiple languages as well as transcriptions for closed captions. You can find more information about both later in this document.
 
-> **Note:** It's important to note that transcriptions should not be used as a replacement for closed captioning (CC). We have it planned on our [roadmap](https://github.com/GetStream/protocol/discussions/127) to support CC in the future.
+> **Note:** we transcribe 1 dominant speaker and 2 other participants at a time.
 
-## Start and stop call transcription
+## Quick Start
 
@@ -49,12 +49,14 @@
curl -X POST "https://video.stream-io-api.com/video/call/default/${CALL_ID}/start_transcription?api_key=${API_KEY}" -H "Authorization: ${JWT_TOKEN}" -H "stream-auth-type: jwt"
 
curl -X POST "https://video.stream-io-api.com/video/call/default/${CALL_ID}/stop_transcription?api_key=${API_KEY}" -H "Authorization: ${JWT_TOKEN}" -H "stream-auth-type: jwt"
```
+By default the transcriptions are stored on Stream’s S3 bucket and retained for 2-weeks.
You can also configure your application to have transcriptions stored on your own external storage, see the storage section of this document for more detail.
+
 ## List call transcriptions
 
-This endpoint returns the list of transcriptionss for a call. When using Stream S3 as storage (default) all links are signed and expire after 2-weeks.
+> **Note:** transcriptions stored on Stream’s S3 bucket (the default) will be returned with a signed URL.
 
@@ -89,8 +91,8 @@ These events are sent to users connected to the call and your webhook/SQS:
- `call.transcription_ready` dispatched when the transcription is completed and available for download. An example payload of this event is detailed below.
- `call.transcription_failed` sent if the transcription process encounters any issues.
 
-## Transcription ready format
-```
+## `transcription.ready` event example
+```json
{
  "type": "call.transcription_ready",
  "created_at": "2024-03-18T08:24:14.769328551Z",
  "call_cid": "default:mkzN17EUrgvn",
  "call_transcription": {
    "filename": "transcript_default_mkzN17EUrgvn_1710750207642.jsonl",
    "url": "https://frankfurt.stream-io-cdn.com/1129528/video/transcriptions/default_mkzN17EUrgvn/transcript_default_mkzN17EUrgvn_1710750207642.jsonl?Expires=1710751154&Signature=OhdoTClQm5MT8ITPLAEJcKNflsJ7B2G3j7kx~kQyPrAETftrM2rzZy4IIT1XIC~8MrbPduWcj1tILXoSg3ldfZEHWRPqeMFr0caljPAVAL~mybUb4Kct2JoPjfsYfmj4FzSQbT7Iib38qPr7uiP0axTFm0VKRenkNwwCoS0F858u9Mdr8r6fTzILhiOZ1hOjw3V-TT1YbR20Yn4abKi6i50GAs5fqUDtSlo9DmEJgcS79Y0wUD1g18cGZvg3NiH3ogHQnmvoNrf28Cxc0JhBCe4wFErCMJ3pinewEOwDEEOMdHcRtcfWy72w6MTEwi0yomHYIU5flaYgUXCkkOJODw__&Key-Pair-Id=APKAIHG36VEWPDULE23Q",
    "start_time": "2024-03-18T08:23:27.642688204Z",
    "end_time": "2024-03-18T08:24:14.754731786Z"
  },
  "received_at": "2024-03-18T08:24:14.790Z"
}
```
## Transcription JSONL file format
- ```jsonl
+ ```json
 {"type":"speech", "start_time": "2024-02-28T08:18:18.061031795Z", "stop_time":"2024-02-28T08:18:22.401031795Z", "speaker_id": "Sacha_Arbonel", "text": "hello"}
 {"type":"speech", "start_time": "2024-02-28T08:18:22.401031795Z", "stop_time":"2024-02-28T08:18:26.741031795Z", "speaker_id": "Sacha_Arbonel", "text": "how are you"}
 {"type":"speech", "start_time": "2024-02-28T08:18:26.741031795Z", "stop_time":"2024-02-28T08:18:31.081031795Z", "speaker_id": "Tommaso_Barbugli", "text": "I'm good"}
 {"type":"speech", "start_time": "2024-02-28T08:18:31.081031795Z", "stop_time":"2024-02-28T08:18:35.421031795Z", "speaker_id": "Tommaso_Barbugli", "text": "how about you"}
 {"type":"speech", "start_time": "2024-02-28T08:18:35.421031795Z", "stop_time":"2024-02-28T08:18:39.761031795Z", "speaker_id": "Sacha_Arbonel", "text": "I'm good too"}
 {"type":"speech", "start_time": "2024-02-28T08:18:39.761031795Z", "stop_time":"2024-02-28T08:18:44.101031795Z", "speaker_id": "Tommaso_Barbugli", "text": "that's great"}
 {"type":"speech", "start_time": "2024-02-28T08:18:44.101031795Z", "stop_time":"2024-02-28T08:18:48.441031795Z", "speaker_id": "Tommaso_Barbugli", "text": "I'm glad to hear that"}
 ```
## User Permissions
 
-The following permissions are checked when users interact with the call transcription API.
+The following permissions are available to grant/restrict access to this functionality when used client-side.
- `StartTranscription` required to start the transcription
- `StopTranscription` required to stop the transcription
@@ -215,3 +217,48 @@ client.update(
```
+
+
+## Multi-language support
+
+When used out of the box, transcriptions are optimized for calls with English speakers. You can configure call transcription to optimize for a different language than English. You can also specify a secondary language if you expect to have two languages used simultaneously in the same call.
+
+Please note: the call transcription feature does not perform any language translation. When you select a different language, the transcription process will simply improve the speech-to-text detection for that language.
+
+You can set the transcription languages in two ways: either as a call setting or you can provide them to the `StartTranscription` API call. Languages are specified using their international language code (ISO 639).
+Please note: we currently don’t support changing language settings during the call.
+
+## Supported languages
+
+- English (en) - default
+- French (fr)
+- Spanish (es)
+- German (de)
+- Italian (it)
+- Dutch (nl)
+- Portuguese (pt)
+- Polish (pl)
+- Catalan (ca)
+- Czech (cs)
+- Danish (da)
+- Greek (el)
+- Finnish (fi)
+- Indonesian (id)
+- Japanese (ja)
+- Russian (ru)
+- Swedish (sv)
+- Tamil (ta)
+- Thai (th)
+- Turkish (tr)
+- Hungarian (hu)
+- Romanian (ro)
+- Chinese (zh)
+- Arabic (ar)
+- Tagalog (tl)
+- Hebrew (he)
+- Hindi (hi)
+- Croatian (hr)
+- Korean (ko)
+- Malay (ms)
+- Norwegian (no)
+- Ukrainian (uk)
\ No newline at end of file