Add Voice Activity Detection (VAD) configs to whisperCpp (opencast#6838)

mtneug · web-flow · commit 4529dab6ae53 · 2025-07-22T07:34:59.000+02:00
This PR adds configurations for the Voice Activity Detection (VAD) of WhisperC++, see https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad. ### How to test this patch 1. Install WhisperC++ and configure Opencast according to https://docs.opencast.org/r/17.x/admin/#configuration/transcription.configuration/whispercpp/#enable-whisperc-engine. 2. Configure VAD in `org.opencastproject.speechtotext.impl.engine.WhisperCppEngine.cfg` 3. Upload a video and see the generated subtitles in the player. ### Your pull request should… * [ ] have a concise title * [ ] [close an accompanying issue](https://docs.opencast.org/develop/developer/#participate/development-process/#automatically-closing-issues-when-a-pr-is-merged) if one exists * [ ] [be against the correct branch](https://docs.opencast.org/develop/developer/development-process#acceptance-criteria-for-patches-in-different-versions) * [ ] include migration scripts and documentation, if appropriate * [ ] pass automated tests * [ ] have a clean commit history * [ ] [have proper commit messages (title and body) for all commits](https://medium.com/@steveamaza/e028865e5791) * [ ] explain why it needs to be merged into the legacy branch, if it is targeting the legacy branch
diff --git a/etc/org.opencastproject.speechtotext.impl.engine.WhisperCppEngine.cfg b/etc/org.opencastproject.speechtotext.impl.engine.WhisperCppEngine.cfg
@@ -53,3 +53,32 @@
 # This is done by Opencast itself before handing over the audio track.
 # Default: true
 #whispercpp.auto-encode=true
+
+
+## Optional settings for Voice Activity Detection (VAD)
+
+# Enable Voice Activity Detection (VAD)
+#whispercpp.vad=false
+
+# VAD model path
+#whispercpp.vad-model=/usr/share/whisper.cpp/models/ggml-silero-v5.1.2.bin
+
+# VAD threshold for speech recognition (0.0-1.0)
+#whispercpp.vad-thold=0.50
+
+# VAD min speech duration in ms
+#whispercpp.vad-min-speech-dur=250
+
+# VAD min silence duration (to split segments) in ms
+#whispercpp.vad-min-silence-dur=100
+
+# VAD max speech duration (auto-split longer) in s
+# Needs to be a valid floating-point number.
+# Leave empty for FLT_MAX, i.e., the maximum representable finite floating-point number.
+#whispercpp.vad-max-speech-dur=31536000
+
+# VAD speech padding (extend segments) in ms
+#whispercpp.vad-speech-pad=30
+
+# VAD samples overlap (seconds between segments)
+#whispercpp.vad-samples-overlap=0.10
diff --git a/modules/speech-to-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/WhisperCppEngine.java b/modules/speech-to-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/WhisperCppEngine.java
@@ -166,6 +166,54 @@ public class WhisperCppEngine implements SpeechToTextEngine {
   /** Currently used whispercpp no fallback */
   private Option<Boolean> whispercppNoFallback;
 
+  /** Config key for setting whispercpp Voice Activity Detection (VAD) */
+  private static final String WHISPERCPP_VAD_CONFIG_KEY = "whispercpp.vad";
+
+  /** Currently used whispercpp Voice Activity Detection (VAD) */
+  private Option<Boolean> whispercppVad;
+
+  /** Config key for setting whispercpp VAD model */
+  private static final String WHISPERCPP_VAD_MODEL_CONFIG_KEY = "whispercpp.vad-model";
+
+  /** Currently used whispercpp VAD model */
+  private Option<String> whispercppVadModel;
+
+  /** Config key for setting whispercpp VAD threshold */
+  private static final String WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY = "whispercpp.vad-thold";
+
+  /** Currently used whispercpp VAD threshold */
+  private Option<Double> whispercppVadThreshold;
+
+  /** Config key for setting whispercpp VAD min speech duration */
+  private static final String WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY = "whispercpp.vad-min-speech-dur";
+
+  /** Currently used whispercpp VAD min speech duration */
+  private Option<Integer> whispercppVadMinSpeech;
+
+  /** Config key for setting whispercpp VAD min silence duration */
+  private static final String WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY = "whispercpp.vad-min-silence-dur";
+
+  /** Currently used whispercpp VAD min silence duration */
+  private Option<Integer> whispercppVadMinSilence;
+
+  /** Config key for setting whispercpp VAD max speech duration */
+  private static final String WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY = "whispercpp.vad-max-speech-dur";
+
+  /** Currently used whispercpp VAD max speech duration */
+  private Option<Double> whispercppVadMaxSpeech;
+
+  /** Config key for setting whispercpp VAD speech padding */
+  private static final String WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY = "whispercpp.vad-speech-pad";
+
+  /** Currently used whispercpp VAD speech padding */
+  private Option<Integer> whispercppVadSpeechPadding;
+
+  /** Config key for setting whispercpp VAD samples overlap */
+  private static final String WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY = "whispercpp.vad-samples-overlap";
+
+  /** Currently used whispercpp VAD samples overlap */
+  private Option<Double> whispercppVadSamplesOverlap;
+
   /** Config key for automatic audio encoding */
   private static final String AUTO_ENCODING_CONFIG_KEY = "whispercpp.auto-encode";
 
@@ -269,6 +317,49 @@ public void activate(ComponentContext cc) {
       logger.debug("WhisperC++ no fallback set to {}", whispercppNoFallback);
     }
 
+    whispercppVad = OsgiUtil.getOptCfgAsBoolean(cc.getProperties(), WHISPERCPP_VAD_CONFIG_KEY);
+    if (whispercppVad.isSome()) {
+      logger.debug("WhisperC++ VAD set to {}", whispercppVad);
+    }
+
+    whispercppVadModel = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_MODEL_CONFIG_KEY);
+    if (whispercppVadModel.isSome()) {
+      logger.debug("WhisperC++ VAD model set to {}", whispercppVadModel);
+    }
+
+    whispercppVadThreshold = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_THRESHOLD_CONFIG_KEY).bind(
+        Strings.toDouble);
+    if (whispercppVadThreshold.isSome()) {
+      logger.debug("WhisperC++ VAD threshold set to {}", whispercppVadThreshold);
+    }
+
+    whispercppVadMinSpeech = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_MIN_SPEECH_CONFIG_KEY);
+    if (whispercppVadMinSpeech.isSome()) {
+      logger.debug("WhisperC++ VAD min speech set to {}", whispercppVadMinSpeech);
+    }
+
+    whispercppVadMinSilence = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_MIN_SILENCE_CONFIG_KEY);
+    if (whispercppVadMinSilence.isSome()) {
+      logger.debug("WhisperC++ VAD min silence set to {}", whispercppVadMinSilence);
+    }
+
+    whispercppVadMaxSpeech = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_MAX_SPEECH_CONFIG_KEY).bind(
+        Strings.toDouble);
+    if (whispercppVadMaxSpeech.isSome()) {
+      logger.debug("WhisperC++ VAD max speech set to {}", whispercppVadMaxSpeech);
+    }
+
+    whispercppVadSpeechPadding = OsgiUtil.getOptCfgAsInt(cc.getProperties(), WHISPERCPP_VAD_SPEECH_PADDING_CONFIG_KEY);
+    if (whispercppVadSpeechPadding.isSome()) {
+      logger.debug("WhisperC++ VAD speech padding set to {}", whispercppVadSpeechPadding);
+    }
+
+    whispercppVadSamplesOverlap = OsgiUtil.getOptCfg(cc.getProperties(), WHISPERCPP_VAD_SAMPLES_OVERLAP_CONFIG_KEY)
+        .bind(Strings.toDouble);
+    if (whispercppVadSamplesOverlap.isSome()) {
+      logger.debug("WhisperC++ VAD samples overlap set to {}", whispercppVadSamplesOverlap);
+    }
+
     autoEncode = BooleanUtils.toBoolean(Objects.toString(
         cc.getProperties().get(AUTO_ENCODING_CONFIG_KEY),
         AUTO_ENCODING_DEFAULT.toString()));
@@ -370,6 +461,39 @@ public Result generateSubtitlesFile(File mediaFile, File workingDirectory, Strin
       command.add("-nf");
     }
 
+    // Optional VAD parameters
+    if (whispercppVad.isSome() && whispercppVad.get()) {
+      command.add("--vad");
+    }
+    if (whispercppVadModel.isSome()) {
+      command.add("-vm");
+      command.add(whispercppVadModel.get());
+    }
+    if (whispercppVadThreshold.isSome()) {
+      command.add("-vt");
+      command.add(String.format(Locale.US, "%f", whispercppVadThreshold.get()));
+    }
+    if (whispercppVadMinSpeech.isSome()) {
+      command.add("-vspd");
+      command.add(Integer.toString(whispercppVadMinSpeech.get()));
+    }
+    if (whispercppVadMinSilence.isSome()) {
+      command.add("-vsd");
+      command.add(Integer.toString(whispercppVadMinSilence.get()));
+    }
+    if (whispercppVadMaxSpeech.isSome()) {
+      command.add("-vmsd");
+      command.add(String.format(Locale.US, "%f", whispercppVadMaxSpeech.get()));
+    }
+    if (whispercppVadSpeechPadding.isSome()) {
+      command.add("-vp");
+      command.add(Integer.toString(whispercppVadSpeechPadding.get()));
+    }
+    if (whispercppVadSamplesOverlap.isSome()) {
+      command.add("-vo");
+      command.add(String.format(Locale.US, "%f", whispercppVadSamplesOverlap.get()));
+    }
+
     String subtitleLanguage;
 
     // set language of the source audio if known