Voice Activity Detection #5


Open

wants to merge 5 commits into master
35 changes: 35 additions & 0 deletions VoiceActivityDetection.py
@@ -0,0 +1,35 @@
__author__ = 'Varun Nayyar'
__doc__ = \
"""
This file is meant to be modified by users to provide their own Voice Activity Detection (VAD) functions.
Not all frames contain speech, and in many situations it is desirable to discard the silent frames
before computing features.

These functions can be passed to most of the base feature functions as VAD=myVADfunction, where
myVADfunction follows the template provided below.
"""
import numpy as np

def templateVAD(frames, sig):
"""
:param frames: numpy array of [NumFrames][SamplesPerFrame] of all the speech frames
:param sig: The entire signal [signLen]
:return: the subset of frames where there is voiced activity detected
"""
raise NotImplementedError


def simpleVAD(frames, sig, threshold=0.01):
"""
:param frames: numpy array of [NumFrames][SamplesPerFrame] of all the speech frames
:param sig: The entire signal [signLen]
:param threshold: above what level of average power must the frame be to be considered to have activity
:return: the subset of frames where there is voiced activity detected

Note that the variance of frame/signal represents the average power of the frame/signal
so this is a power threshold activity detector applied along the frames
"""


frameVars = np.var(frames, 1)
reducedFrames = frames[np.where(frameVars > sig.var() * threshold)]
return reducedFrames
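
For reference, a custom VAD following the template above might look like the sketch below. It is only an illustration of the expected signature, not part of this PR; the energyVAD name and the 30 dB threshold are arbitrary choices.

import numpy as np

def energyVAD(frames, sig, threshold_db=30.0):
    """
    :param frames: numpy array of size [NumFrames][SamplesPerFrame] containing all the speech frames
    :param sig: the entire signal (unused here, kept only to match the template)
    :param threshold_db: frames more than this many dB below the loudest frame are dropped
    :return: the subset of frames in which voice activity is detected
    """
    frameEnergy = np.sum(frames ** 2, axis=1) + 1e-10  # small offset avoids log(0) on silent frames
    energyDb = 10.0 * np.log10(frameEnergy)
    return frames[energyDb > energyDb.max() - threshold_db]

It would be passed in exactly the same way as simpleVAD, e.g. mfcc(sig, rate, VAD=energyVAD).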
11 changes: 10 additions & 1 deletion example.py
@@ -6,4 +6,13 @@
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig,rate)

print fbank_feat[1:3,:]
# print fbank_feat[1:3,:]
print mfcc_feat[1:3,:]

# Voice Activity Detection example
from VoiceActivityDetection import simpleVAD

print mfcc_feat.shape
mfcc_feat = mfcc(sig,rate, VAD=simpleVAD)
print mfcc_feat.shape

26 changes: 15 additions & 11 deletions features/base.py
@@ -5,7 +5,7 @@
from scipy.fftpack import dct

def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, VAD=None):
"""Compute MFCC features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -20,17 +20,18 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat

def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute Mel-filterbank energy features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -41,13 +42,14 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD)
pspec = sigproc.powspec(frames,nfft)
energy = numpy.sum(pspec,1) # this stores the total energy in each frame

@@ -56,7 +58,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
return feat,energy

def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute log Mel-filterbank energy features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -67,14 +69,15 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD)
return numpy.log(feat)

def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute Spectral Subband Centroid features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -85,12 +88,13 @@ def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD)
pspec = sigproc.powspec(frames,nfft)

fb = get_filterbanks(nfilt,nfft,samplerate)
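
With the changes above, all four front-end functions (mfcc, fbank, logfbank, ssc) take an optional VAD callable and pass it through to sigproc.framesig. A minimal usage sketch, assuming the audio file is read the same way example.py does (the english.wav filename is just a placeholder):

import scipy.io.wavfile as wav
from features import mfcc, logfbank
from VoiceActivityDetection import simpleVAD

(rate, sig) = wav.read("english.wav")

# Low-power frames are discarded before the filterbank and cepstral features are computed,
# so the returned feature matrices may have fewer rows than without VAD.
mfcc_feat = mfcc(sig, rate, VAD=simpleVAD)
fbank_feat = logfbank(sig, rate, VAD=simpleVAD)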
15 changes: 11 additions & 4 deletions features/sigproc.py
@@ -4,13 +4,14 @@
import numpy
import math

def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x)), VAD=None):
"""Frame a signal into overlapping frames.

:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
@@ -29,12 +30,18 @@ def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T
indices = numpy.array(indices,dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(winfunc(frame_len),(numframes,1))

if VAD is not None:
frames = VAD(frames, sig)

win = numpy.tile(winfunc(frame_len), (frames.shape[0], 1))

return frames*win


def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
"""Does overlap-add procedure to undo the action of framesig.
"""Does overlap-add procedure to undo the action of framesig.
Not applicable if Voice Activity Detection has been used in framesig

:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
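
At the sigproc level the VAD hook runs on the raw frames before the analysis window is applied, which is why the window is now tiled to frames.shape[0] rather than the original numframes, and why deframesig can no longer undo the framing once frames have been dropped. A small sketch of direct use (the stand-in signal and the 16 kHz / 25 ms / 10 ms framing values are illustrative only):

import numpy as np
from features import sigproc
from VoiceActivityDetection import simpleVAD

rate = 16000
sig = np.random.randn(rate)  # one second of stand-in audio

frames_all = sigproc.framesig(sig, int(0.025 * rate), int(0.01 * rate))
frames_vad = sigproc.framesig(sig, int(0.025 * rate), int(0.01 * rate), VAD=simpleVAD)

# frames_vad has at most as many rows as frames_all; the dropped frames cannot be
# recovered, so deframesig is not applicable after VAD has been used.
print(frames_all.shape, frames_vad.shape)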