@@ -31,8 +31,6 @@ It is recommended to use models provided by us, which are available at our [Hugg
## Reference
- ### File transcription
-
You can obtain the waveform from audio in whatever way suits you best; in the snippet below we use the `react-native-audio-api` library to process an MP3 file.
```typescript
@@ -65,72 +63,6 @@ if (error) {
}
```
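
For reference, here is one way that waveform might be produced. This is a minimal sketch, assuming `react-native-audio-api`'s Web Audio-style `AudioContext` and its `decodeAudioDataSource` method; `loadWaveform` and `filePath` are illustrative names rather than part of either library.

```typescript
// Minimal sketch: decode an audio file into a 16 kHz mono waveform.
// Assumes react-native-audio-api's Web Audio-style API; verify the exact
// decoding calls against the library's documentation.
import { AudioContext } from 'react-native-audio-api';

const loadWaveform = async (filePath: string): Promise<number[]> => {
  // The STT models in this library operate on 16 kHz audio.
  const audioContext = new AudioContext({ sampleRate: 16000 });
  const decoded = await audioContext.decodeAudioDataSource(filePath);
  // Use the first channel; the models expect mono input.
  return Array.from(decoded.getChannelData(0));
};
```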
- ### Live data (microphone) transcription
-
- ```typescript
- import { STREAMING_ACTION, useSpeechToText } from 'react-native-executorch';
- import LiveAudioStream from 'react-native-live-audio-stream';
- import { useState } from 'react';
- import { Buffer } from 'buffer';
-
- const audioStreamOptions = {
-   sampleRate: 16000,
-   channels: 1,
-   bitsPerSample: 16,
-   audioSource: 1,
-   bufferSize: 16000,
- };
-
- const startStreamingAudio = (options: any, onChunk: (data: string) => void) => {
-   LiveAudioStream.init(options);
-   LiveAudioStream.on('data', onChunk);
-   LiveAudioStream.start();
- };
-
- const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer: string) => {
-   const b64DecodedChunk = Buffer.from(b64EncodedBuffer, 'base64');
-   const int16Array = new Int16Array(b64DecodedChunk.buffer);
-
-   const float32Array = new Float32Array(int16Array.length);
-   for (let i = 0; i < int16Array.length; i++) {
-     float32Array[i] = Math.max(
-       -1,
-       Math.min(1, (int16Array[i] / audioStreamOptions.bufferSize) * 8)
-     );
-   }
-   return float32Array;
- };
-
- const [isRecording, setIsRecording] = useState(false);
-
- const speechToText = useSpeechToText({
-   modelName: 'moonshine',
-   windowSize: 3,
-   overlapSeconds: 1.2,
- });
-
- const onChunk = (data: string) => {
-   const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
-   speechToText.streamingTranscribe(
-     STREAMING_ACTION.DATA,
-     Array.from(float32Chunk)
-   );
- };
-
- const handleRecordPress = async () => {
-   if (isRecording) {
-     setIsRecording(false);
-     LiveAudioStream.stop();
-     messageRecorded.current = true;
-     await speechToText.streamingTranscribe(STREAMING_ACTION.STOP);
-   } else {
-     setIsRecording(true);
-     startStreamingAudio(audioStreamOptions, onChunk);
-     await speechToText.streamingTranscribe(STREAMING_ACTION.START);
-   }
- };
- ```
-
### Streaming
Since STT models can process audio no longer than 30 seconds, longer inputs have to be chunked. Naive chunking can cut speech mid-sentence, which the model may find hard to transcribe. To handle this, we employ an algorithm (adapted for mobile devices from [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf)) that uses overlapping audio chunks. The overlap introduces some overhead, but allows for processing audio inputs of arbitrary length. A sketch of the idea is shown below.
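
The windowing can be sketched as follows; `overlappingChunks` and `SAMPLE_RATE` are illustrative names, and this is not the library's actual implementation:

```typescript
// Illustrative sketch of overlapping chunking (not the library's code).
const SAMPLE_RATE = 16000;

function* overlappingChunks(
  waveform: Float32Array,
  windowSeconds: number,
  overlapSeconds: number
): Generator<Float32Array> {
  const windowSamples = Math.round(windowSeconds * SAMPLE_RATE);
  const stepSamples = Math.round((windowSeconds - overlapSeconds) * SAMPLE_RATE);
  for (let start = 0; start < waveform.length; start += stepSamples) {
    // Each window repeats `overlapSeconds` of audio from the previous one,
    // so speech cut at a boundary appears whole in the next window.
    yield waveform.subarray(start, Math.min(start + windowSamples, waveform.length));
  }
}
```

With `windowSize: 3` and `overlapSeconds: 1.2` (the values used in the live transcription example below), consecutive windows share 1.2 seconds of audio, at the cost of re-processing that overlap.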
@@ -302,11 +234,97 @@ function App() {
title = " Transcribe"
303
235
/>
304
236
<Text>{error ? error : sequence }</Text>
305
- </View>****
237
+ </View>
306
238
);
307
239
}
308
240
` ` `
309
241
242
+ ### Live data (microphone) transcription
+
+ ```typescript
+ import { STREAMING_ACTION, useSpeechToText } from 'react-native-executorch';
+ import LiveAudioStream from 'react-native-live-audio-stream';
+ import { useState } from 'react';
+ import { Buffer } from 'buffer';
+ import { StyleSheet, Text, TouchableOpacity, View } from 'react-native';
+
+ // Raw microphone stream: 16 kHz, mono, 16-bit PCM.
+ const audioStreamOptions = {
+   sampleRate: 16000,
+   channels: 1,
+   bitsPerSample: 16,
+   audioSource: 1,
+   bufferSize: 16000,
+ };
+
+ const startStreamingAudio = (options: any, onChunk: (data: string) => void) => {
+   LiveAudioStream.init(options);
+   LiveAudioStream.on('data', onChunk);
+   LiveAudioStream.start();
+ };
+
+ // Convert a base64-encoded chunk of 16-bit PCM into floats clipped to [-1, 1].
+ const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer: string) => {
+   const b64DecodedChunk = Buffer.from(b64EncodedBuffer, 'base64');
+   const int16Array = new Int16Array(b64DecodedChunk.buffer);
+
+   const float32Array = new Float32Array(int16Array.length);
+   for (let i = 0; i < int16Array.length; i++) {
+     float32Array[i] = Math.max(
+       -1,
+       Math.min(1, (int16Array[i] / audioStreamOptions.bufferSize) * 8)
+     );
+   }
+   return float32Array;
+ };
+
+ function App() {
+   const [isRecording, setIsRecording] = useState(false);
+   const speechToText = useSpeechToText({
+     modelName: 'moonshine',
+     windowSize: 3,
+     overlapSeconds: 1.2,
+   });
+
+   const onChunk = (data: string) => {
+     const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
+     speechToText.streamingTranscribe(
+       STREAMING_ACTION.DATA,
+       Array.from(float32Chunk)
+     );
+   };
+
+   const handleRecordPress = async () => {
+     if (isRecording) {
+       setIsRecording(false);
+       LiveAudioStream.stop();
+       await speechToText.streamingTranscribe(STREAMING_ACTION.STOP);
+     } else {
+       setIsRecording(true);
+       startStreamingAudio(audioStreamOptions, onChunk);
+       await speechToText.streamingTranscribe(STREAMING_ACTION.START);
+     }
+   };
+
+   return (
+     <View>
+       <Text>{speechToText.sequence}</Text>
+       <TouchableOpacity
+         style={!isRecording ? styles.recordTouchable : styles.recordingInfo}
+         onPress={handleRecordPress}
+       >
+         {isRecording ? <Text>Stop</Text> : <Text>Record</Text>}
+       </TouchableOpacity>
+     </View>
+   );
+ }
+
+ // Placeholder styles so the snippet compiles; style to taste.
+ const styles = StyleSheet.create({
+   recordTouchable: { padding: 16 },
+   recordingInfo: { padding: 16, opacity: 0.5 },
+ });
+ ```
+
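+ Note that with `bitsPerSample: 16` each chunk arrives as base64-encoded 16-bit PCM, which is why `float32ArrayFromPCMBinaryBuffer` decodes and rescales the samples to the `[-1, 1]` float range before they are handed to `streamingTranscribe`.
+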
## Supported models
| Model | Language |