Skip to content

Commit 4529dab

Browse files
authored
Add Voice Activity Detection (VAD) configs to whisperCpp (opencast#6838)
This PR adds configurations for the Voice Activity Detection (VAD) of WhisperC++, see https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad. ### How to test this patch 1. Install WhisperC++ and configure Opencast according to https://docs.opencast.org/r/17.x/admin/#configuration/transcription.configuration/whispercpp/#enable-whisperc-engine. 2. Configure VAD in `org.opencastproject.speechtotext.impl.engine.WhisperCppEngine.cfg` 3. Upload a video and see the generated subtitles in the player. ### Your pull request should… * [ ] have a concise title * [ ] [close an accompanying issue](https://docs.opencast.org/develop/developer/#participate/development-process/#automatically-closing-issues-when-a-pr-is-merged) if one exists * [ ] [be against the correct branch](https://docs.opencast.org/develop/developer/development-process#acceptance-criteria-for-patches-in-different-versions) * [ ] include migration scripts and documentation, if appropriate * [ ] pass automated tests * [ ] have a clean commit history * [ ] [have proper commit messages (title and body) for all commits](https://medium.com/@steveamaza/e028865e5791) * [ ] explain why it needs to be merged into the legacy branch, if it is targeting the legacy branch
2 parents 629d8d0 + 0554526 commit 4529dab

File tree

2 files changed

+153
-0
lines changed

2 files changed

+153
-0
lines changed

etc/org.opencastproject.speechtotext.impl.engine.WhisperCppEngine.cfg

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,32 @@
5353
# This is done by Opencast itself before handing over the audio track.
5454
# Default: true
5555
#whispercpp.auto-encode=true
56+
57+
58+
## Optional settings for Voice Activity Detection (VAD)
59+
60+
# Enable Voice Activity Detection (VAD)
61+
#whispercpp.vad=false
62+
63+
# VAD model path
64+
#whispercpp.vad-model=/usr/share/whisper.cpp/models/ggml-silero-v5.1.2.bin
65+
66+
# VAD threshold for speech recognition (speech probability, 0.0-1.0)
67+
#whispercpp.vad-thold=0.50
68+
69+
# VAD min speech duration in ms (integer value)
70+
#whispercpp.vad-min-speech-dur=250
71+
72+
# VAD min silence duration (to split segments) in ms
73+
#whispercpp.vad-min-silence-dur=100
74+
75+
# VAD max speech duration (auto-split longer) in s
76+
# Needs to be a valid floating-point number.
77+
# Leave empty for FLT_MAX, i.e., the maximum representable finite floating-point number.
78+
#whispercpp.vad-max-speech-dur=31536000
79+
80+
# VAD speech padding (extend segments) in ms
81+
#whispercpp.vad-speech-pad=30
82+
83+
# VAD samples overlap (seconds between segments)
84+
#whispercpp.vad-samples-overlap=0.10

modules/speech-to-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/WhisperCppEngine.java

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,54 @@ public class WhisperCppEngine implements SpeechToTextEngine {
166166
/** Currently used whispercpp no fallback */
167167
private Option<Boolean> whispercppNoFallback;
168168

169+
/** Config key for setting whispercpp Voice Activity Detection (VAD) */
170+
private static final String WHISPERCPP_VAD_CONFIG_KEY = "whispercpp.vad";
171+
172+
/** Currently used whispercpp Voice Activity Detection (VAD) */
173+
private Option<Boolean> whispercppVad;
174+
175+
/** Config key for setting whispercpp VAD model */
176+
private static final String WHISPERCPP_VAD_MODEL_CONFIG_KEY = "whispercpp.vad-model";
177+
178+
/** Currently used whispercpp VAD model */
179+
private Option<String> whispercppVadModel;
180+
181+
/** Config key for setting whispercpp VAD threshold */
182+
private static final String WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY = "whispercpp.vad-thold";
183+
184+
/** Currently used whispercpp VAD threshold */
185+
private Option<Double> whispercppVadThreshold;
186+
187+
/** Config key for setting whispercpp VAD min speech duration */
188+
private static final String WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY = "whispercpp.vad-min-speech-dur";
189+
190+
/** Currently used whispercpp VAD min speech duration */
191+
private Option<Integer> whispercppVadMinSpeech;
192+
193+
/** Config key for setting whispercpp VAD min silence duration */
194+
private static final String WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY = "whispercpp.vad-min-silence-dur";
195+
196+
/** Currently used whispercpp VAD min silence duration */
197+
private Option<Integer> whispercppVadMinSilence;
198+
199+
/** Config key for setting whispercpp VAD max speech duration */
200+
private static final String WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY = "whispercpp.vad-max-speech-dur";
201+
202+
/** Currently used whispercpp VAD max speech duration */
203+
private Option<Double> whispercppVadMaxSpeech;
204+
205+
/** Config key for setting whispercpp VAD speech padding */
206+
private static final String WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY = "whispercpp.vad-speech-pad";
207+
208+
/** Currently used whispercpp VAD speech padding */
209+
private Option<Integer> whispercppVadSpeechPadding;
210+
211+
/** Config key for setting whispercpp VAD samples overlap */
212+
private static final String WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY = "whispercpp.vad-samples-overlap";
213+
214+
/** Currently used whispercpp VAD samples overlap */
215+
private Option<Double> whispercppVadSamplesOverlap;
216+
169217
/** Config key for automatic audio encoding */
170218
private static final String AUTO_ENCODING_CONFIG_KEY = "whispercpp.auto-encode";
171219

@@ -269,6 +317,49 @@ public void activate(ComponentContext cc) {
269317
logger.debug("WhisperC++ no fallback set to {}", whispercppNoFallback);
270318
}
271319

320+
whispercppVad = OsgiUtil.getOptCfgAsBoolean(cc.getProperties(), WHISPERCPP_VAD_CONFIG_KEY);
321+
if (whispercppVad.isSome()) {
322+
logger.debug("WhisperC++ VAD set to {}", whispercppVad);
323+
}
324+
325+
whispercppVadModel = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_MODEL_CONFIG_KEY);
326+
if (whispercppVadModel.isSome()) {
327+
logger.debug("WhisperC++ VAD model set to {}", whispercppVadModel);
328+
}
329+
330+
whispercppVadThreshold = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY).bind(
331+
Strings.toDouble);
332+
if (whispercppVadThreshold.isSome()) {
333+
logger.debug("WhisperC++ VAD threshold set to {}", whispercppVadThreshold);
334+
}
335+
336+
whispercppVadMinSpeech = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY);
337+
if (whispercppVadMinSpeech.isSome()) {
338+
logger.debug("WhisperC++ VAD min speech set to {}", whispercppVadMinSpeech);
339+
}
340+
341+
whispercppVadMinSilence = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY);
342+
if (whispercppVadMinSilence.isSome()) {
343+
logger.debug("WhisperC++ VAD min silence set to {}", whispercppVadMinSilence);
344+
}
345+
346+
whispercppVadMaxSpeech = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY).bind(
347+
Strings.toDouble);
348+
if (whispercppVadMaxSpeech.isSome()) {
349+
logger.debug("WhisperC++ VAD max speech set to {}", whispercppVadMaxSpeech);
350+
}
351+
352+
whispercppVadSpeechPadding = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY);
353+
if (whispercppVadSpeechPadding.isSome()) {
354+
logger.debug("WhisperC++ VAD speech padding set to {}", whispercppVadSpeechPadding);
355+
}
356+
357+
whispercppVadSamplesOverlap = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY)
358+
.bind(Strings.toDouble);
359+
if (whispercppVadSamplesOverlap.isSome()) {
360+
logger.debug("WhisperC++ VAD samples overlap set to {}", whispercppVadSamplesOverlap);
361+
}
362+
272363
autoEncode = BooleanUtils.toBoolean(Objects.toString(
273364
cc.getProperties().get(AUTO_ENCODING_CONFIG_KEY),
274365
AUTO_ENCODING_DEFAULT.toString()));
@@ -370,6 +461,39 @@ public Result generateSubtitlesFile(File mediaFile, File workingDirectory, Strin
370461
command.add("-nf");
371462
}
372463

464+
// Optional VAD parameters
465+
if (whispercppVad.isSome() && whispercppVad.get()) {
466+
command.add("--vad");
467+
}
468+
if (whispercppVadModel.isSome()) {
469+
command.add("-vm");
470+
command.add(whispercppVadModel.get());
471+
}
472+
if (whispercppVadThreshold.isSome()) {
473+
command.add("-vt");
474+
command.add(String.format(Locale.US, "%f", whispercppVadThreshold.get()));
475+
}
476+
if (whispercppVadMinSpeech.isSome()) {
477+
command.add("-vspd");
478+
command.add(Integer.toString(whispercppVadMinSpeech.get()));
479+
}
480+
if (whispercppVadMinSilence.isSome()) {
481+
command.add("-vsd");
482+
command.add(Integer.toString(whispercppVadMinSilence.get()));
483+
}
484+
if (whispercppVadMaxSpeech.isSome()) {
485+
command.add("-vmsd");
486+
command.add(String.format(Locale.US, "%f", whispercppVadMaxSpeech.get()));
487+
}
488+
if (whispercppVadSpeechPadding.isSome()) {
489+
command.add("-vp");
490+
command.add(Integer.toString(whispercppVadSpeechPadding.get()));
491+
}
492+
if (whispercppVadSamplesOverlap.isSome()) {
493+
command.add("-vo");
494+
command.add(String.format(Locale.US, "%f", whispercppVadSamplesOverlap.get()));
495+
}
496+
373497
String subtitleLanguage;
374498

375499
// set language of the source audio if known

0 commit comments

Comments
 (0)