Voice Activity Detection #5


Open

wants to merge 5 commits into master
35 changes: 35 additions & 0 deletions VoiceActivityDetection.py
@@ -0,0 +1,35 @@
__author__ = 'Varun Nayyar'
__doc__ = \
"""
This file is meant to be modified by users to provide their own Voice Activity Detection (VAD) functions.
Not all frames contain speech, and in many situations it is desirable to discard the silent frames
before computing features.

These functions can be passed to most of the base feature functions as VAD=myVADfunction, where
myVADfunction follows the template provided below.
"""
import numpy as np

def templateVAD(frames, sig):
"""
:param frames: numpy array of [NumFrames][SamplesPerFrame] of all the speech frames
:param sig: The entire signal [signLen]
:return: the subset of frames where there is voiced activity detected
"""
raise NotImplementedError


def simpleVAD(frames, sig, threshold=0.01):
"""
:param frames: numpy array of [NumFrames][SamplesPerFrame] of all the speech frames
:param sig: The entire signal [signLen]
:param threshold: above what level of average power must the frame be to be considered to have activity
:return: the subset of frames where there is voiced activity detected

Note that the variance of frame/signal represents the average power of the frame/signal
so this is a power threshold activity detector applied along the frames
"""


frameVars = np.var(frames, 1)
reducedFrames = frames[np.where(frameVars > sig.var() * threshold)]
return reducedFrames
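
For reference, a custom VAD following the template above might look like the sketch below. It is only an illustration of the expected signature, not part of this PR; the energyVAD name and the 30 dB threshold are arbitrary choices.

import numpy as np

def energyVAD(frames, sig, threshold_db=30.0):
    """
    :param frames: numpy array of size [NumFrames][SamplesPerFrame] containing all the speech frames
    :param sig: the entire signal (unused here, kept only to match the template)
    :param threshold_db: frames more than this many dB below the loudest frame are dropped
    :return: the subset of frames in which voice activity is detected
    """
    frameEnergy = np.sum(frames ** 2, axis=1) + 1e-10  # small offset avoids log(0) on silent frames
    energyDb = 10.0 * np.log10(frameEnergy)
    return frames[energyDb > energyDb.max() - threshold_db]

It would be passed in exactly the same way as simpleVAD, e.g. mfcc(sig, rate, VAD=energyVAD).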
11 changes: 10 additions & 1 deletion example.py
@@ -6,4 +6,13 @@
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig,rate)

print fbank_feat[1:3,:]
# print fbank_feat[1:3,:]
print mfcc_feat[1:3,:]

# Voice Activity Detection example
from VoiceActivityDetection import simpleVAD

print mfcc_feat.shape
mfcc_feat = mfcc(sig,rate, VAD=simpleVAD)
print mfcc_feat.shape

26 changes: 15 additions & 11 deletions features/base.py
@@ -5,7 +5,7 @@
from scipy.fftpack import dct

def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, VAD=None):
"""Compute MFCC features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -20,17 +20,18 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat

def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute Mel-filterbank energy features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -41,13 +42,14 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD)
pspec = sigproc.powspec(frames,nfft)
energy = numpy.sum(pspec,1) # this stores the total energy in each frame

@@ -56,7 +58,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
return feat,energy

def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute log Mel-filterbank energy features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -67,14 +69,15 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph, VAD)
return numpy.log(feat)

def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, VAD = None):
"""Compute Spectral Subband Centroid features from an audio signal.

:param signal: the audio signal from which to compute features. Should be an N*1 array
@@ -85,12 +88,13 @@ def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, VAD=VAD)
pspec = sigproc.powspec(frames,nfft)

fb = get_filterbanks(nfilt,nfft,samplerate)
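
With the changes above, all four front-end functions (mfcc, fbank, logfbank, ssc) take an optional VAD callable and pass it through to sigproc.framesig. A minimal usage sketch, assuming the audio file is read the same way example.py does (the english.wav filename is just a placeholder):

import scipy.io.wavfile as wav
from features import mfcc, logfbank
from VoiceActivityDetection import simpleVAD

(rate, sig) = wav.read("english.wav")

# Low-power frames are discarded before the filterbank and cepstral features are computed,
# so the returned feature matrices may have fewer rows than without VAD.
mfcc_feat = mfcc(sig, rate, VAD=simpleVAD)
fbank_feat = logfbank(sig, rate, VAD=simpleVAD)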
15 changes: 11 additions & 4 deletions features/sigproc.py
@@ -4,13 +4,14 @@
import numpy
import math

def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x)), VAD=None):
"""Frame a signal into overlapping frames.

:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param VAD: Voice Activity Detection function, see VoiceActivityDetection.py
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
@@ -29,12 +30,18 @@ def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T
indices = numpy.array(indices,dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(winfunc(frame_len),(numframes,1))

if VAD is not None:
frames = VAD(frames, sig)

win = numpy.tile(winfunc(frame_len), (frames.shape[0], 1))

return frames*win


def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
"""Does overlap-add procedure to undo the action of framesig.
"""Does overlap-add procedure to undo the action of framesig.
Not applicable if Voice Activity Detection has been used in framesig

:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
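
At the sigproc level the VAD hook runs on the raw frames before the analysis window is applied, which is why the window is now tiled to frames.shape[0] rather than the original numframes, and why deframesig can no longer undo the framing once frames have been dropped. A small sketch of direct use (the stand-in signal and the 16 kHz / 25 ms / 10 ms framing values are illustrative only):

import numpy as np
from features import sigproc
from VoiceActivityDetection import simpleVAD

rate = 16000
sig = np.random.randn(rate)  # one second of stand-in audio

frames_all = sigproc.framesig(sig, int(0.025 * rate), int(0.01 * rate))
frames_vad = sigproc.framesig(sig, int(0.025 * rate), int(0.01 * rate), VAD=simpleVAD)

# frames_vad has at most as many rows as frames_all; the dropped frames cannot be
# recovered, so deframesig is not applicable after VAD has been used.
print(frames_all.shape, frames_vad.shape)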