@@ -166,6 +166,54 @@ public class WhisperCppEngine implements SpeechToTextEngine {
166166 /** Currently used whispercpp no fallback */
167167 private Option <Boolean > whispercppNoFallback ;
168168
169+ /** Config key for setting whispercpp Voice Activity Detection (VAD) */
170+ private static final String WHISPERCPP_VAD_CONFIG_KEY = "whispercpp.vad" ;
171+
172+ /** Currently used whispercpp Voice Activity Detection (VAD) setting */
173+ private Option <Boolean > whispercppVad ;
174+
175+ /** Config key for setting whispercpp VAD model */
176+ private static final String WHISPERCPP_VAD_MODEL_CONFIG_KEY = "whispercpp.vad-model" ;
177+
178+ /** Currently used whispercpp VAD model */
179+ private Option <String > whispercppVadModel ;
180+
181+ /** Config key for setting whispercpp VAD threshold */
182+ private static final String WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY = "whispercpp.vad-thold" ;
183+
184+ /** Currently used whispercpp VAD threshold */
185+ private Option <Double > whispercppVadThreshold ;
186+
187+ /** Config key for setting whispercpp VAD min speech duration */
188+ private static final String WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY = "whispercpp.vad-min-speech-dur" ;
189+
190+ /** Currently used whispercpp VAD min speech duration */
191+ private Option <Integer > whispercppVadMinSpeech ;
192+
193+ /** Config key for setting whispercpp VAD min silence duration */
194+ private static final String WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY = "whispercpp.vad-min-silence-dur" ;
195+
196+ /** Currently used whispercpp VAD min silence duration */
197+ private Option <Integer > whispercppVadMinSilence ;
198+
199+ /** Config key for setting whispercpp VAD max speech duration */
200+ private static final String WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY = "whispercpp.vad-max-speech-dur" ;
201+
202+ /** Currently used whispercpp VAD max speech duration */
203+ private Option <Double > whispercppVadMaxSpeech ;
204+
205+ /** Config key for setting whispercpp VAD speech padding */
206+ private static final String WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY = "whispercpp.vad-speech-pad" ;
207+
208+ /** Currently used whispercpp VAD speech padding */
209+ private Option <Integer > whispercppVadSpeechPadding ;
210+
211+ /** Config key for setting whispercpp VAD samples overlap */
212+ private static final String WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY = "whispercpp.vad-samples-overlap" ;
213+
214+ /** Currently used whispercpp VAD samples overlap */
215+ private Option <Double > whispercppVadSamplesOverlap ;
216+
169217 /** Config key for automatic audio encoding */
170218 private static final String AUTO_ENCODING_CONFIG_KEY = "whispercpp.auto-encode" ;
171219
@@ -269,6 +317,49 @@ public void activate(ComponentContext cc) {
269317 logger .debug ("WhisperC++ no fallback set to {}" , whispercppNoFallback );
270318 }
271319
320+ whispercppVad = OsgiUtil .getOptCfgAsBoolean (cc .getProperties (), WHISPERCPP_VAD_CONFIG_KEY );
321+ if (whispercppVad .isSome ()) {
322+ logger .debug ("WhisperC++ VAD set to {}" , whispercppVad );
323+ }
324+
325+ whispercppVadModel = OsgiUtil .getOptCfg (cc .getProperties (), WHISPERCPP_VAD_MODEL_CONFIG_KEY );
326+ if (whispercppVadModel .isSome ()) {
327+ logger .debug ("WhisperC++ VAD model set to {}" , whispercppVadModel );
328+ }
329+
330+ whispercppVadThreshold = OsgiUtil .getOptCfg (cc .getProperties (), WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY ).bind (
331+ Strings .toDouble );
332+ if (whispercppVadThreshold .isSome ()) {
333+ logger .debug ("WhisperC++ VAD threshold set to {}" , whispercppVadThreshold );
334+ }
335+
336+ whispercppVadMinSpeech = OsgiUtil .getOptCfgAsInt (cc .getProperties (), WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY );
337+ if (whispercppVadMinSpeech .isSome ()) {
338+ logger .debug ("WhisperC++ VAD min speech set to {}" , whispercppVadMinSpeech );
339+ }
340+
341+ whispercppVadMinSilence = OsgiUtil .getOptCfgAsInt (cc .getProperties (), WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY );
342+ if (whispercppVadMinSilence .isSome ()) {
343+ logger .debug ("WhisperC++ VAD min silence set to {}" , whispercppVadMinSilence );
344+ }
345+
346+ whispercppVadMaxSpeech = OsgiUtil .getOptCfg (cc .getProperties (), WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY ).bind (
347+ Strings .toDouble );
348+ if (whispercppVadMaxSpeech .isSome ()) {
349+ logger .debug ("WhisperC++ VAD max speech set to {}" , whispercppVadMaxSpeech );
350+ }
351+
352+ whispercppVadSpeechPadding = OsgiUtil .getOptCfgAsInt (cc .getProperties (), WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY );
353+ if (whispercppVadSpeechPadding .isSome ()) {
354+ logger .debug ("WhisperC++ VAD speech padding set to {}" , whispercppVadSpeechPadding );
355+ }
356+
357+ whispercppVadSamplesOverlap = OsgiUtil .getOptCfg (cc .getProperties (), WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY )
358+ .bind (Strings .toDouble );
359+ if (whispercppVadSamplesOverlap .isSome ()) {
360+ logger .debug ("WhisperC++ VAD samples overlap set to {}" , whispercppVadSamplesOverlap );
361+ }
362+
272363 autoEncode = BooleanUtils .toBoolean (Objects .toString (
273364 cc .getProperties ().get (AUTO_ENCODING_CONFIG_KEY ),
274365 AUTO_ENCODING_DEFAULT .toString ()));
@@ -370,6 +461,39 @@ public Result generateSubtitlesFile(File mediaFile, File workingDirectory, Strin
370461 command .add ("-nf" );
371462 }
372463
464+ // Optional VAD parameters
465+ if (whispercppVad .isSome () && whispercppVad .get ()) {
466+ command .add ("--vad" );
467+ }
468+ if (whispercppVadModel .isSome ()) {
469+ command .add ("-vm" );
470+ command .add (whispercppVadModel .get ());
471+ }
472+ if (whispercppVadThreshold .isSome ()) {
473+ command .add ("-vt" );
474+ command .add (String .format (Locale .US , "%f" , whispercppVadThreshold .get ()));
475+ }
476+ if (whispercppVadMinSpeech .isSome ()) {
477+ command .add ("-vspd" );
478+ command .add (Integer .toString (whispercppVadMinSpeech .get ()));
479+ }
480+ if (whispercppVadMinSilence .isSome ()) {
481+ command .add ("-vsd" );
482+ command .add (Integer .toString (whispercppVadMinSilence .get ()));
483+ }
484+ if (whispercppVadMaxSpeech .isSome ()) {
485+ command .add ("-vmsd" );
486+ command .add (String .format (Locale .US , "%f" , whispercppVadMaxSpeech .get ()));
487+ }
488+ if (whispercppVadSpeechPadding .isSome ()) {
489+ command .add ("-vp" );
490+ command .add (Integer .toString (whispercppVadSpeechPadding .get ()));
491+ }
492+ if (whispercppVadSamplesOverlap .isSome ()) {
493+ command .add ("-vo" );
494+ command .add (String .format (Locale .US , "%f" , whispercppVadSamplesOverlap .get ()));
495+ }
496+
373497 String subtitleLanguage ;
374498
375499 // set language of the source audio if known
0 commit comments