Skip to content

Commit ab92c64

Browse files
author
Mateusz Kopciński
committed
final changes
1 parent cb1503a commit ab92c64

File tree

2 files changed

+89
-70
lines changed

2 files changed

+89
-70
lines changed

docs/docs/natural-language-processing/useSpeechToText.md

Lines changed: 87 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ It is recommended to use models provided by us, which are available at our [Hugg
3131

3232
## Reference
3333

34-
### File transcription
35-
3634
You can obtain a waveform from audio in any way most suitable to you; however, in the snippet below we utilize the `react-native-audio-api` library to process an mp3 file.
3735

3836
```typescript
@@ -65,72 +63,6 @@ if (error) {
6563
}
6664
```
6765

68-
### Live data (microphone) transcription
69-
70-
```typescript
71-
import { STREAMING_ACTION, useSpeechToText } from 'react-native-executorch';
72-
import LiveAudioStream from 'react-native-live-audio-stream';
73-
import { useState } from 'react';
74-
import { Buffer } from 'buffer';
75-
76-
const audioStreamOptions = {
77-
sampleRate: 16000,
78-
channels: 1,
79-
bitsPerSample: 16,
80-
audioSource: 1,
81-
bufferSize: 16000,
82-
};
83-
84-
const startStreamingAudio = (options: any, onChunk: (data: string) => void) => {
85-
LiveAudioStream.init(options);
86-
LiveAudioStream.on('data', onChunk);
87-
LiveAudioStream.start();
88-
};
89-
90-
const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer: string) => {
91-
const b64DecodedChunk = Buffer.from(b64EncodedBuffer, 'base64');
92-
const int16Array = new Int16Array(b64DecodedChunk.buffer);
93-
94-
const float32Array = new Float32Array(int16Array.length);
95-
for (let i = 0; i < int16Array.length; i++) {
96-
float32Array[i] = Math.max(
97-
-1,
98-
Math.min(1, (int16Array[i] / audioStreamOptions.bufferSize) * 8)
99-
);
100-
}
101-
return float32Array;
102-
};
103-
104-
const [isRecording, setIsRecording] = useState(false);
105-
106-
const speechToText = useSpeechToText({
107-
modelName: 'moonshine',
108-
windowSize: 3,
109-
overlapSeconds: 1.2,
110-
});
111-
112-
const onChunk = (data: string) => {
113-
const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
114-
speechToText.streamingTranscribe(
115-
STREAMING_ACTION.DATA,
116-
Array.from(float32Chunk)
117-
);
118-
};
119-
120-
const handleRecordPress = async () => {
121-
if (isRecording) {
122-
setIsRecording(false);
123-
LiveAudioStream.stop();
124-
messageRecorded.current = true;
125-
await speechToText.streamingTranscribe(STREAMING_ACTION.STOP);
126-
} else {
127-
setIsRecording(true);
128-
startStreamingAudio(audioStreamOptions, onChunk);
129-
await speechToText.streamingTranscribe(STREAMING_ACTION.START);
130-
}
131-
};
132-
```
133-
13466
### Streaming
13567

13668
Given that STT models can process audio no longer than 30 seconds, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm (adapted for mobile devices from [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf)) that uses overlapping audio chunks. This might introduce some overhead, but allows for processing audio inputs of arbitrary length.
@@ -302,11 +234,97 @@ function App() {
302234
title="Transcribe"
303235
/>
304236
<Text>{error ? error : sequence}</Text>
305-
</View>****
237+
</View>
306238
);
307239
}
308240
```
309241
242+
### Live data (microphone) transcription
243+
244+
```typescript
245+
import { STREAMING_ACTION, useSpeechToText } from 'react-native-executorch';
246+
import LiveAudioStream from 'react-native-live-audio-stream';
247+
import { useState } from 'react';
248+
import { Buffer } from 'buffer';
249+
250+
const audioStreamOptions = {
251+
sampleRate: 16000,
252+
channels: 1,
253+
bitsPerSample: 16,
254+
audioSource: 1,
255+
bufferSize: 16000,
256+
};
257+
258+
const startStreamingAudio = (options: any, onChunk: (data: string) => void) => {
259+
LiveAudioStream.init(options);
260+
LiveAudioStream.on('data', onChunk);
261+
LiveAudioStream.start();
262+
};
263+
264+
const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer: string) => {
265+
const b64DecodedChunk = Buffer.from(b64EncodedBuffer, 'base64');
266+
const int16Array = new Int16Array(b64DecodedChunk.buffer);
267+
268+
const float32Array = new Float32Array(int16Array.length);
269+
for (let i = 0; i < int16Array.length; i++) {
270+
float32Array[i] = Math.max(
271+
-1,
272+
Math.min(1, (int16Array[i] / audioStreamOptions.bufferSize) * 8)
273+
);
274+
}
275+
return float32Array;
276+
};
277+
278+
function App() {
279+
const [isRecording, setIsRecording] = useState(false);
280+
const speechToText = useSpeechToText({
281+
modelName: 'moonshine',
282+
windowSize: 3,
283+
overlapSeconds: 1.2,
284+
});
285+
286+
const onChunk = (data: string) => {
287+
const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
288+
speechToText.streamingTranscribe(
289+
STREAMING_ACTION.DATA,
290+
Array.from(float32Chunk)
291+
);
292+
};
293+
294+
const handleRecordPress = async () => {
295+
if (isRecording) {
296+
setIsRecording(false);
297+
LiveAudioStream.stop();
298+
messageRecorded.current = true;
299+
await speechToText.streamingTranscribe(STREAMING_ACTION.STOP);
300+
} else {
301+
setIsRecording(true);
302+
startStreamingAudio(audioStreamOptions, onChunk);
303+
await speechToText.streamingTranscribe(STREAMING_ACTION.START);
304+
}
305+
};
306+
307+
return
308+
<View>
309+
<Text>
310+
{speechToText.sequence}
311+
</Text>
312+
<TouchableOpacity
313+
style={
314+
!isRecording ? styles.recordTouchable : styles.recordingInfo
315+
}
316+
onPress={handleRecordPress}
317+
>
318+
{isRecording ? (
319+
<Text>Stop</Text>
320+
) : (
321+
<Text>Record</Text>
322+
)}
323+
</TouchableOpacity>
324+
</View>
325+
}
326+
```
327+
310328
## Supported models
311329
312330
| Model | Language |

src/controllers/SpeechToTextController.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ export class SpeechToTextController {
7373
};
7474
this.onErrorCallback = (error) => {
7575
if (onErrorCallback) {
76-
return onErrorCallback(error ? new Error(getError(error)) : undefined);
76+
onErrorCallback(error ? new Error(getError(error)) : undefined);
77+
return;
7778
} else {
7879
throw new Error(getError(error));
7980
}

0 commit comments

Comments
 (0)