software-mansion
diff --git a/‎docs/docs/natural-language-processing/useSpeechToText.md
Lines changed: 84 additions & 9 deletions b/‎docs/docs/natural-language-processing/useSpeechToText.md
Lines changed: 84 additions & 9 deletions
diff --git a/‎docs/docs/typescript-api/SpeechToTextModule.md
Lines changed: 14 additions & 7 deletions b/‎docs/docs/typescript-api/SpeechToTextModule.md
Lines changed: 14 additions & 7 deletions
diff --git a/‎examples/llm/ios/Podfile.lock
Lines changed: 8 additions & 14 deletions b/‎examples/llm/ios/Podfile.lock
Lines changed: 8 additions & 14 deletions
@@ -37,20 +37,27 @@ const transcribedText = await SpeechToTextModule.transcribe(waveform);
 
 ### Methods
 
-| Method               | Type                                                                                                                                                                                                                                                                                                     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `load`               | <code>(modelName: 'whisper' &#124 'moonshine' &#124 'whisperMultilingual', transcribeCallback?: (sequence: string) => void, modelDownloadProgressCallback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource)</code> | Loads the model specified with `modelName`, where `encoderSource`, `decoderSource`, `tokenizerSource` are strings specifying the location of the binaries for the models. `modelDownloadProgressCallback` allows you to monitor the current progress of the model download, while `transcribeCallback` is invoked with each generated token                                                                                                                                                                                                                                                                                            |
-| `transcribe`         | `(waveform: number[], audioLanguage?: SpeechToTextLanguage): Promise<string>`                                                                                                                                                                                                                            | Starts a transcription process for a given input array, which should be a waveform at 16kHz. Resolves a promise with the output transcription when the model is finished. For multilingual models, you have to specify the audioLanguage flag, which is the language of the spoken language in the audio.                                                                                                                                                                                                                                                                                                                              |
-| `encode`             | `(waveform: number[]) => Promise<number[]>`                                                                                                                                                                                                                                                              | Runs the encoding part of the model. Returns a float array representing the output of the encoder.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| `decode`             | `(tokens: number[], encodings?: number[]) => Promise<number[]>`                                                                                                                                                                                                                                          | Runs the decoder of the model. Returns a single token representing a next token in the output sequence. If `encodings` are provided then they are used for decoding process, if not then the cached encodings from most recent `encode` call are used. The cached option is much faster due to very large overhead for communication between native and react layers.                                                                                                                                                                                                                                                                  |
-| `configureStreaming` | <code>(overlapSeconds?: number, windowSize?: number, streamingConfig?: 'fast' &#124; 'balanced' &#124; 'quality') => void</code>                                                                                                                                                                         | Configures options for the streaming algorithm: <ul><li>`overlapSeconds` determines how much adjacent audio chunks overlap (increasing it slows down transcription, decreases probability of weird wording at the chunks intersection, setting it larger than 3 seconds generally is discouraged), </li><li>`windowSize` describes size of the audio chunks (increasing it speeds up the end to end transcription time, but increases latency for the first token to be returned),</li><li> `streamingConfig` predefined configs for `windowSize` and `overlapSeconds` values.</li></ul> Keep `windowSize + 2 * overlapSeconds <= 30`. |
+| Method                | Type                                                                                                                                                                                                                                                                                                     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `load`                | <code>(modelName: 'whisper' &#124 'moonshine' &#124 'whisperMultilingual', transcribeCallback?: (sequence: string) => void, modelDownloadProgressCallback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource)</code> | Loads the model specified with `modelName`, where `encoderSource`, `decoderSource`, `tokenizerSource` are strings specifying the location of the binaries for the models. `modelDownloadProgressCallback` allows you to monitor the current progress of the model download, while `transcribeCallback` is invoked with each generated token                                                                                                                                                                                                                                                                                                                                  |
+| `transcribe`          | `(waveform: number[], audioLanguage?: SpeechToTextLanguage): Promise<string>`                                                                                                                                                                                                                            | Starts a transcription process for a given input array, which should be a waveform at 16kHz. Resolves a promise with the output transcription when the model is finished. For multilingual models, you have to specify the audioLanguage flag, which is the language of the spoken language in the audio.                                                                                                                                                                                                                                                                                                                                                                    |
+| `streamingTranscribe` | `(streamingAction: STREAMING_ACTION, waveform?: number[], audioLanguage?: SpeechToTextLanguage) => Promise<string>`                                                                                                                                                                                      | Runs the transcription process online, i.e. when the whole audio is not known beforehand, for example when transcribing from a live microphone feed. `streamingAction` defines the type of package sent to the model: <ul><li>`START` - initializes the process, allows for optional `waveform` data</li><li>`DATA` - this package should contain consecutive audio data chunks sampled at 16 kHz</li><li>`STOP` - the last data chunk for this transcription, ends the transcription process and flushes internal buffers</li></ul> Each call returns the most recent transcription. Returns an error when called while the module is in use (i.e. processing a `transcribe` call). |
+| `encode`              | `(waveform: number[]) => Promise<number[]>`                                                                                                                                                                                                                                                              | Runs the encoding part of the model. Returns a float array representing the output of the encoder.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| `decode`              | `(tokens: number[], encodings?: number[]) => Promise<number[]>`                                                                                                                                                                                                                                          | Runs the decoder of the model. Returns a single token representing a next token in the output sequence. If `encodings` are provided then they are used for decoding process, if not then the cached encodings from most recent `encode` call are used. The cached option is much faster due to very large overhead for communication between native and react layers.                                                                                                                                                                                                                                                                                                        |
+| `configureStreaming`  | <code>(overlapSeconds?: number, windowSize?: number, streamingConfig?: 'fast' &#124; 'balanced' &#124; 'quality') => void</code>                                                                                                                                                                         | Configures options for the streaming algorithm: <ul><li>`overlapSeconds` determines how much adjacent audio chunks overlap (increasing it slows down transcription, decreases probability of weird wording at the chunks intersection, setting it larger than 3 seconds generally is discouraged), </li><li>`windowSize` describes size of the audio chunks (increasing it speeds up the end to end transcription time, but increases latency for the first token to be returned),</li><li> `streamingConfig` predefined configs for `windowSize` and `overlapSeconds` values.</li></ul> Keep `windowSize + 2 * overlapSeconds <= 30`.                                       |
 
 <details>
 <summary>Type definitions</summary>
 
 ```typescript
 type ResourceSource = string | number | object;
 
+enum STREAMING_ACTION {
+  START,
+  DATA,
+  STOP,
+}
+
 enum SpeechToTextLanguage {
   Afrikaans = 'af',
   Albanian = 'sq',
 
@@ -3,7 +3,7 @@ PODS:
   - DoubleConversion (1.1.6)
   - EXConstants (17.1.6):
     - ExpoModulesCore
-  - Expo (53.0.8):
+  - Expo (53.0.9):
     - DoubleConversion
     - ExpoModulesCore
     - glog
@@ -36,13 +36,13 @@ PODS:
     - ExpoModulesCore
   - ExpoCalendar (14.1.4):
     - ExpoModulesCore
-  - ExpoFileSystem (18.1.9):
+  - ExpoFileSystem (18.1.10):
     - ExpoModulesCore
   - ExpoFont (13.3.1):
     - ExpoModulesCore
   - ExpoKeepAwake (14.1.4):
     - ExpoModulesCore
-  - ExpoModulesCore (2.3.12):
+  - ExpoModulesCore (2.3.13):
     - DoubleConversion
     - glog
     - hermes-engine
@@ -67,8 +67,6 @@ PODS:
     - ReactCommon/turbomodule/bridging
     - ReactCommon/turbomodule/core
     - Yoga
-  - ExpoSpeech (13.1.4):
-    - ExpoModulesCore
   - fast_float (6.1.4)
   - FBLazyVector (0.79.2)
   - fmt (11.0.2)
@@ -1401,7 +1399,7 @@ PODS:
     - React-jsiexecutor
     - React-RCTFBReactNativeSpec
     - ReactCommon/turbomodule/core
-  - react-native-executorch (0.3.1-stt-12):
+  - react-native-executorch (0.3.3):
     - DoubleConversion
     - glog
     - hermes-engine
@@ -2087,7 +2085,6 @@ DEPENDENCIES:
   - ExpoFont (from `../node_modules/expo-font/ios`)
   - ExpoKeepAwake (from `../node_modules/expo-keep-awake/ios`)
   - ExpoModulesCore (from `../node_modules/expo-modules-core`)
-  - ExpoSpeech (from `../node_modules/expo-speech/ios`)
   - fast_float (from `../node_modules/react-native/third-party-podspecs/fast_float.podspec`)
   - FBLazyVector (from `../node_modules/react-native/Libraries/FBLazyVector`)
   - fmt (from `../node_modules/react-native/third-party-podspecs/fmt.podspec`)
@@ -2193,8 +2190,6 @@ EXTERNAL SOURCES:
     :path: "../node_modules/expo-keep-awake/ios"
   ExpoModulesCore:
     :path: "../node_modules/expo-modules-core"
-  ExpoSpeech:
-    :path: "../node_modules/expo-speech/ios"
   fast_float:
     :podspec: "../node_modules/react-native/third-party-podspecs/fast_float.podspec"
   FBLazyVector:
@@ -2349,15 +2344,14 @@ SPEC CHECKSUMS:
   boost: 7e761d76ca2ce687f7cc98e698152abd03a18f90
   DoubleConversion: cb417026b2400c8f53ae97020b2be961b59470cb
   EXConstants: 9f310f44bfedba09087042756802040e464323c0
-  Expo: 769ab5c190382eedebc733af6708bbc9ca5f643b
+  Expo: a9fc723f6c8f673f0e7e036c9021772d3a1a0707
   ExpoAsset: 3bc9adb7dbbf27ae82c18ca97eb988a3ae7e73b1
   ExpoBrightness: c335c6ccc082d5249a4b38dba5cd9a08aa0bf62b
   ExpoCalendar: f5f94ea8dcd957b1434beb4e1c0da1af063322e6
-  ExpoFileSystem: 0f3f466ecd3560f55768cd3f94ac3a17f093b8e6
+  ExpoFileSystem: c36eb8155eb2381c83dda7dc210e3eec332368b6
   ExpoFont: abbb91a911eb961652c2b0a22eef801860425ed6
   ExpoKeepAwake: bf0811570c8da182bfb879169437d4de298376e7
-  ExpoModulesCore: 3ac17421302df62928fc99c133cf25bdbcf0b004
-  ExpoSpeech: 4db7ef7888b9edc39ca9afee54e9c4b3df269ccb
+  ExpoModulesCore: 5d37821c36f3781dcd0ea9a393800c90eaa6259d
   fast_float: 06eeec4fe712a76acc9376682e4808b05ce978b6
   FBLazyVector: 84b955f7b4da8b895faf5946f73748267347c975
   fmt: a40bb5bd0294ea969aaaba240a927bd33d878cdd
@@ -2395,7 +2389,7 @@ SPEC CHECKSUMS:
   React-logger: 8edfcedc100544791cd82692ca5a574240a16219
   React-Mapbuffer: c3f4b608e4a59dd2f6a416ef4d47a14400194468
   React-microtasksnativemodule: 054f34e9b82f02bd40f09cebd4083828b5b2beb6
-  react-native-executorch: 8835fcfdfc71b1d42d30525ee047b2811c359cb8
+  react-native-executorch: d0c3dffa0a4a4111ea9c7b97f3fbf088a48d3b2a
   react-native-safe-area-context: 562163222d999b79a51577eda2ea8ad2c32b4d06
   React-NativeModulesApple: 2c4377e139522c3d73f5df582e4f051a838ff25e
   React-oscompat: ef5df1c734f19b8003e149317d041b8ce1f7d29c