8
8
#include < string>
9
9
#include < whisper.h>
10
10
#include < sstream>
11
-
11
+ # include < speex/speex_preprocess.h >
12
12
using namespace stream_components ;
13
13
14
- bool processAudio (WhisperService service, std::vector<float > pcm32, const whisper_local_stream_params ¶ms);
15
-
16
14
int main (int argc, char **argv) {
17
15
// Read parameters...
18
16
whisper_local_stream_params params;
@@ -39,6 +37,8 @@ int main(int argc, char **argv) {
39
37
stream_components::WhisperService whisperService (params.service , params.audio , cparams);
40
38
41
39
const int port = 8090 ;
40
+ std::mutex whisper_mutex;
41
+
42
42
43
43
// started handler
44
44
auto started_handler = [](auto *token) {
@@ -65,6 +65,7 @@ int main(int argc, char **argv) {
65
65
thread_local wav_writer wavWriter;
66
66
thread_local std::string filename;
67
67
68
+
68
69
nlohmann::json response;
69
70
if (opCode == uWS::OpCode::TEXT) {
70
71
// printf("%s: Received message on /streaming/save: %s\n", get_current_time().c_str(),std::string(message).c_str());
@@ -93,7 +94,7 @@ int main(int argc, char **argv) {
93
94
// process binary message(PCM16 data)
94
95
auto size = message.size ();
95
96
std::basic_string_view<char , std::char_traits<char >>::const_pointer data = message.data ();
96
- // printf("%s: Received message size on /streaming/save: %zu\n", get_current_time().c_str(), size);
97
+ printf (" %s: Received message size on /streaming/save: %zu\n " , get_current_time ().c_str (), size);
97
98
// add received PCM16 to audio cache
98
99
std::vector<int16_t > pcm16 (size / 2 );
99
100
std::memcpy (pcm16.data (), data, size);
@@ -104,14 +105,17 @@ int main(int argc, char **argv) {
104
105
};
105
106
106
107
// WebSocket /paddlespeech/asr/streaming handler
107
- auto ws_streaming_handler = [&whisperService, ¶ms](auto *ws, std::string_view message, uWS::OpCode opCode) {
108
+ auto ws_streaming_handler = [&whisperService, ¶ms, &whisper_mutex ](auto *ws, std::string_view message, uWS::OpCode opCode) {
108
109
thread_local std::vector<float > audioBuffer; // thread-localized variable
109
110
thread_local wav_writer wavWriter;
110
111
thread_local std::string filename;
111
- thread_local bool is_last_active = false ;
112
+ thread_local bool last_is_speech = false ;
113
+ thread_local int chunk_size = 160 ; // 适用于 16 kHz 采样率的 100 毫秒帧
114
+ thread_local SpeexPreprocessState *st;
115
+
112
116
// std::unique_ptr<nlohmann::json> results(new nlohmann::json(nlohmann::json::array()));
113
117
thread_local nlohmann::json final_results;
114
- auto thread_id = std::this_thread::get_id ();
118
+ // auto thread_id = std::this_thread::get_id();
115
119
// std::cout << get_current_time().c_str() << ": Handling a message in thread: " << thread_id << std::endl;
116
120
nlohmann::json response;
117
121
if (opCode == uWS::OpCode::TEXT) {
@@ -122,45 +126,50 @@ int main(int argc, char **argv) {
122
126
auto jsonMsg = nlohmann::json::parse (message);
123
127
std::string signal = jsonMsg[" signal" ];
124
128
if (signal == " start" ) {
129
+ printf (" %s start\n " ,get_current_time ().c_str ());
130
+
125
131
if (jsonMsg[" name" ].is_string ()) {
126
132
filename = jsonMsg[" name" ];
127
133
} else {
128
134
filename = std::to_string (get_current_time_millis ()) + " .wav" ;
129
135
}
130
- final_results = nlohmann::json (nlohmann::json::array ());
131
136
// 发送服务器准备好的消息
132
137
response = {{" status" , " ok" },
133
138
{" signal" , " server_ready" }};
134
139
ws->send (response.dump (), uWS::OpCode::TEXT);
135
140
wavWriter.open (filename, WHISPER_SAMPLE_RATE, 16 , 1 );
141
+ st = speex_preprocess_state_init (chunk_size, WHISPER_SAMPLE_RATE);
142
+ int vad = 1 ;
143
+ speex_preprocess_ctl (st, SPEEX_PREPROCESS_SET_VAD, &vad);
144
+
136
145
}
137
146
if (signal == " end" ) {
138
- printf (" %s end\n " );
139
- wavWriter.close ();
147
+ printf (" %s end\n " ,get_current_time ().c_str ());
140
148
// nlohmann::json response = {{"name",filename},{"signal", signal}};
141
149
response = {{" name" , filename},
142
150
{" signal" , signal }};
143
- printf (" %s:buffer size:%d \n " ,get_current_time ().c_str (),audioBuffer.size ());
151
+ printf (" %s:buffer size:%lu \n " ,get_current_time ().c_str (),audioBuffer.size ());
144
152
bool isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
145
153
if (isOk) {
146
154
final_results = get_result (whisperService.ctx );
147
155
response[" result" ] = final_results;
148
156
}
149
157
ws->send (response.dump (), uWS::OpCode::TEXT);
158
+ wavWriter.close ();
159
+ speex_preprocess_state_destroy (st);
150
160
}
151
161
// other process logic...
152
162
} catch (const std::exception &e) {
153
163
std::cerr << " JSON parse error: " << e.what () << std::endl;
154
164
auto size = message.size ();
155
165
}
156
166
} else if (opCode == uWS::OpCode::BINARY) {
157
- int size = message.size ();
158
167
// process binary message(PCM16 data)
168
+ auto size = message.size ();
159
169
std::basic_string_view<char , std::char_traits<char >>::const_pointer data = message.data ();
160
170
printf (" %s: Received message size on /paddlespeech/asr/streaming: %zu\n " , get_current_time ().c_str (), size);
161
171
// add received PCM16 to audio cache
162
172
std::vector<int16_t > pcm16 (size / 2 );
163
-
164
173
std::memcpy (pcm16.data (), data, size);
165
174
// write to file
166
175
wavWriter.write (pcm16.data (), size / 2 );
@@ -172,28 +181,41 @@ int main(int argc, char **argv) {
172
181
// insert to audio_buffer
173
182
audioBuffer.insert (audioBuffer.end (), temp.begin (), temp.end ());
174
183
175
- printf (" %s:buffer size:%d \n " ,get_current_time ().c_str (),audioBuffer.size ());
184
+ // printf("%s:buffer size:% ld \n",get_current_time().c_str(),audioBuffer.size());
176
185
// 如果开启了VAD
177
186
bool isOk;
178
187
// printf("%s: use_vad: %d\n", get_current_time().c_str(), params.audio.use_vad);
179
188
if (params.audio .use_vad ) {
189
+ whisper_mutex.lock ();
190
+ for (size_t i = 0 ; i < pcm16.size (); i += chunk_size) {
191
+ spx_int16_t frame[chunk_size];
192
+ for (int j = 0 ; j < chunk_size; ++j) {
193
+ if (i + j < pcm16.size ()) {
194
+ frame[j] = (spx_int16_t )(pcm16[i + j]);
195
+ } else {
196
+ frame[j] = 0 ; // 对于超出范围的部分填充 0
197
+ }
198
+ }
199
+ int is_speech = speex_preprocess_run (st, frame);
200
+
201
+ // printf("%s: is_active: %d,is_last_active %d\n", get_current_time().c_str(), is_speech, last_is_speech);
202
+ if (!is_speech && last_is_speech) {
203
+ isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
204
+ audioBuffer.clear ();
205
+ break ;
206
+ }
207
+ last_is_speech = is_speech != 0 ;
180
208
181
- bool is_active = ::vad_simple (audioBuffer, WHISPER_SAMPLE_RATE, 1000 , params.audio .vad_thold ,
182
- params.audio .freq_thold , false );
183
- printf (" %s: is_active: %d,is_last_active %d\n " , get_current_time ().c_str (), is_active, is_last_active);
184
- if (!is_active && is_last_active) {
185
- is_last_active = false ;
186
- isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
187
- audioBuffer.clear ();
188
- } else {
189
- is_last_active = is_active;
190
209
}
210
+ whisper_mutex.unlock ();
191
211
} else {
192
212
// asr
213
+ whisper_mutex.lock ();
193
214
isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
194
215
audioBuffer.clear ();
216
+ whisper_mutex.unlock ();
195
217
}
196
- printf (" %s: is_ok: %d \n " , get_current_time ().c_str (), isOk);
218
+ // printf("%s: is_ok: %d \n", get_current_time().c_str(), isOk);
197
219
if (isOk) {
198
220
final_results = get_result (whisperService.ctx );
199
221
response[" result" ] = final_results;
@@ -221,18 +243,6 @@ int main(int argc, char **argv) {
221
243
.listen (port, started_handler).run ();
222
244
}
223
245
224
- bool processAudio (WhisperService whisperService, std::vector<float > pcm32, const whisper_local_stream_params ¶ms) {
225
- if (params.audio .use_vad ) {
226
- // printf("%s: vad: %d \n", get_current_time().c_str(), params.audio.use_vad);
227
- // TODO: 实现VAD处理,
228
- // bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
229
- return whisperService.process (pcm32.data (), pcm32.size ());
230
- } else {
231
- // asr
232
- return whisperService.process (pcm32.data (), pcm32.size ());
233
- }
234
- }
235
-
236
246
237
247
238
248
0 commit comments