From 232df9f2942a802edda9a894b11cfa015f6788eb Mon Sep 17 00:00:00 2001 From: Pegita Date: Wed, 12 Dec 2018 01:07:17 -0500 Subject: [PATCH 1/9] added modified MFCC features based on DNN-c and fDNN-c features; it is activated using --modified option. --- src/feat/mel-computations.cc | 248 +++++++++++++++++++++-------------- src/feat/mel-computations.h | 98 +++++++++----- 2 files changed, 220 insertions(+), 126 deletions(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 810b6247e93..bd40a527b29 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -34,78 +34,57 @@ MelBanks::MelBanks(const MelBanksOptions &opts, const FrameExtractionOptions &frame_opts, BaseFloat vtln_warp_factor): htk_mode_(opts.htk_mode) { + SetConfigs(opts, frame_opts, vtln_warp_factor); + int32 num_bins = opts.num_bins; if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins"; - BaseFloat sample_freq = frame_opts.samp_freq; - int32 window_length_padded = frame_opts.PaddedWindowSize(); - KALDI_ASSERT(window_length_padded % 2 == 0); - int32 num_fft_bins = window_length_padded / 2; - BaseFloat nyquist = 0.5 * sample_freq; - BaseFloat low_freq = opts.low_freq, high_freq; - if (opts.high_freq > 0.0) - high_freq = opts.high_freq; - else - high_freq = nyquist + opts.high_freq; - if (low_freq < 0.0 || low_freq >= nyquist - || high_freq <= 0.0 || high_freq > nyquist - || high_freq <= low_freq) - KALDI_ERR << "Bad values in options: low-freq " << low_freq - << " and high-freq " << high_freq << " vs. 
nyquist " - << nyquist; - - BaseFloat fft_bin_width = sample_freq / window_length_padded; - // fft-bin width [think of it as Nyquist-freq / half-window-length] + BaseFloat mel_low_freq = MelScale(low_freq_); + BaseFloat mel_high_freq = MelScale(high_freq_); - BaseFloat mel_low_freq = MelScale(low_freq); - BaseFloat mel_high_freq = MelScale(high_freq); - debug_ = opts.debug_mel; - // divide by num_bins+1 in next line because of end-effects where the bins - // spread out to the sides. - BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1); + bins_.resize(num_bins); + center_freqs_.Resize(num_bins); - BaseFloat vtln_low = opts.vtln_low, - vtln_high = opts.vtln_high; - if (vtln_high < 0.0) { - vtln_high += nyquist; + for (int32 bin = 0; bin < num_bins; bin++) { + BaseFloat mel = mel_low_freq + + (bin + 1) * (mel_high_freq - mel_low_freq) / (num_bins + 1); + if (vtln_warp_factor != 1.0) + mel = VtlnWarpMelFreq(vtln_warp_factor, mel); + center_freqs_(bin) = InverseMelScale(mel); } - if (vtln_warp_factor != 1.0 && - (vtln_low < 0.0 || vtln_low <= low_freq - || vtln_low >= high_freq - || vtln_high <= 0.0 || vtln_high >= high_freq - || vtln_high <= vtln_low)) - KALDI_ERR << "Bad values in options: vtln-low " << vtln_low - << " and vtln-high " << vtln_high << ", versus " - << "low-freq " << low_freq << " and high-freq " - << high_freq; + if (!opts.modified) + ComputeBins(opts.htk_mode); + else + ComputeModifiedBins(); - bins_.resize(num_bins); - center_freqs_.Resize(num_bins); + if (debug_) { + for (size_t i = 0; i < bins_.size(); i++) { + KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first + << ", vec = " << bins_[i].second; + } + } +} +void MelBanks::ComputeBins(bool htk_mode) { + int32 num_bins = center_freqs_.Dim(); for (int32 bin = 0; bin < num_bins; bin++) { - BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta, - center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, - right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - - if 
(vtln_warp_factor != 1.0) { - left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, left_mel); - center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, center_mel); - right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, right_mel); - } - center_freqs_(bin) = InverseMelScale(center_mel); + // center_mel is the center frequency (in mel) of this bin, and left_mel and + // right_mel are those of the bins immediately to the left and right. + BaseFloat center_mel = MelScale(center_freqs_(bin)), + left_mel = MelScale(bin == 0 ? + low_freq_ : center_freqs_(bin - 1)), + right_mel = MelScale(bin == num_bins - 1 ? + high_freq_ : center_freqs_(bin + 1)); // this_bin will be a vector of coefficients that is only // nonzero where this mel bin is active. - Vector this_bin(num_fft_bins); + Vector this_bin(num_fft_bins_); int32 first_index = -1, last_index = -1; - for (int32 i = 0; i < num_fft_bins; i++) { - BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft + for (int32 i = 0; i < num_fft_bins_; i++) { + BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft // bin. BaseFloat mel = MelScale(freq); if (mel > left_mel && mel < right_mel) { @@ -113,7 +92,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, if (mel <= center_mel) weight = (mel - left_mel) / (center_mel - left_mel); else - weight = (right_mel-mel) / (right_mel-center_mel); + weight = (right_mel-mel) / (right_mel - center_mel); this_bin(i) = weight; if (first_index == -1) first_index = i; @@ -129,29 +108,73 @@ MelBanks::MelBanks(const MelBanksOptions &opts, bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); // Replicate a bug in HTK, for testing purposes. 
- if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) + if (htk_mode && bin == 0 && low_freq_ != 0.0) bins_[bin].second(0) = 0.0; - } - if (debug_) { - for (size_t i = 0; i < bins_.size(); i++) { - KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first - << ", vec = " << bins_[i].second; +} + +/* + Notes on the shape of the modified bins. + + They are shaped like a cosine function from -pi/2 to pi/2 (unlike the standard + triangular bins). We define their diameter as the distance between the + first and last nonzero value (pi for the canonical function). If there are + a lot of bins, their diamter is defined by a formula and it's a function of + the center frequency f of the bin: + diameter = 30 + 60 f / (f + 500). + so it increases from 30Hz to 90Hz with a knee around 500Hz. + However (and this matters if the number of bins is relatively small), we never + let the diameter fall below the point where the crossing-point of this and + the next bin would be less than 0.1. By this I mean is the y-value where the + raised-cosines cross. This value ensures that there won't be too a 'dip' + in the middle of the two bins. + */ +void MelBanks::ComputeModifiedBins() { + int32 num_bins = center_freqs_.Dim(); + for (int32 bin = 0; bin < num_bins; bin++) { + BaseFloat center_freq = center_freqs_(bin), + next_center = (bin == num_bins - 1 ? + high_freq_ : center_freqs_(bin + 1)); + + // note: breakpoint_ is 500 (Hz). + BaseFloat diameter_floor = (next_center - center_freq) * 1.1, + diameter = 30.0 + 60.0 * (center_freq / (center_freq + breakpoint_)); + + diameter = pow(diameter * diameter + diameter_floor * diameter_floor, 0.5); + + // 'freq_scale' is the scaling factor on the frequencies that will ensure + // that the diameter becomes equal to pi, like the canonical bin function + // (the cosine from -pi/2 to pi/2). + BaseFloat freq_scale = M_PI / diameter; + + // this_bin will be a vector of coefficients that is only + // nonzero where this mel bin is active. 
+ Vector this_bin(num_fft_bins_); + int32 first_index = -1, last_index = -1; + + for (int32 i = 0; i < num_fft_bins_; i++) { + BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft + // bin. + BaseFloat normalized_freq = freq_scale * (freq - center_freq); + if (normalized_freq > -M_PI_2 && normalized_freq < M_PI_2) { + BaseFloat weight = cos(normalized_freq); + this_bin(i) = weight; + if (first_index == -1) + first_index = i; + last_index = i; + } } + KALDI_ASSERT(first_index != -1 && last_index >= first_index + && "You may have set --num-mel-bins too large."); + + bins_[bin].first = first_index; + int32 size = last_index + 1 - first_index; + bins_[bin].second.Resize(size); + bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size)); } } -MelBanks::MelBanks(const MelBanks &other): - center_freqs_(other.center_freqs_), - bins_(other.bins_), - debug_(other.debug_), - htk_mode_(other.htk_mode_) { } - -BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. - BaseFloat vtln_high_cutoff, - BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation - BaseFloat high_freq, - BaseFloat vtln_warp_factor, +BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_warp_factor, BaseFloat freq) { /// This computes a VTLN warping function that is not the same as HTK's one, /// but has similar inputs (this function has the advantage of never producing @@ -180,45 +203,34 @@ BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower freq /// = vtln_low_cutoff * max(1, vtln_warp_factor) - if (freq < low_freq || freq > high_freq) return freq; // in case this gets called + if (freq < low_freq_ || freq > high_freq_) return freq; // in case this gets called // for out-of-range frequencies, just return the freq. 
- KALDI_ASSERT(vtln_low_cutoff > low_freq && - "be sure to set the --vtln-low option higher than --low-freq"); - KALDI_ASSERT(vtln_high_cutoff < high_freq && - "be sure to set the --vtln-high option lower than --high-freq [or negative]"); - BaseFloat one = 1.0; - BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor); - BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor); + BaseFloat l = vtln_low_ * std::max(BaseFloat(1.0), vtln_warp_factor); + BaseFloat h = vtln_high_ * std::min(BaseFloat(1.0), vtln_warp_factor); BaseFloat scale = 1.0 / vtln_warp_factor; BaseFloat Fl = scale * l; // F(l); BaseFloat Fh = scale * h; // F(h); - KALDI_ASSERT(l > low_freq && h < high_freq); + KALDI_ASSERT(l > low_freq_ && h < high_freq_); // slope of left part of the 3-piece linear function - BaseFloat scale_left = (Fl - low_freq) / (l - low_freq); + BaseFloat scale_left = (Fl - low_freq_) / (l - low_freq_); // [slope of center part is just "scale"] // slope of right part of the 3-piece linear function - BaseFloat scale_right = (high_freq - Fh) / (high_freq - h); + BaseFloat scale_right = (high_freq_ - Fh) / (high_freq_ - h); if (freq < l) { - return low_freq + scale_left * (freq - low_freq); + return low_freq_ + scale_left * (freq - low_freq_); } else if (freq < h) { return scale * freq; } else { // freq >= h - return high_freq + scale_right * (freq - high_freq); + return high_freq_ + scale_right * (freq - high_freq_); } } -BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. 
- BaseFloat vtln_high_cutoff, - BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation - BaseFloat high_freq, - BaseFloat vtln_warp_factor, +BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_warp_factor, BaseFloat mel_freq) { - return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, - vtln_warp_factor, InverseMelScale(mel_freq))); + return MelScale(VtlnWarpFreq(vtln_warp_factor, InverseMelScale(mel_freq))); } @@ -250,6 +262,52 @@ void MelBanks::Compute(const VectorBase &power_spectrum, } } +void MelBanks::SetConfigs(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + BaseFloat vtln_warp_factor) { + BaseFloat sample_freq = frame_opts.samp_freq, + nyquist = 0.5 * sample_freq; + int32 window_length_padded = frame_opts.PaddedWindowSize(); + KALDI_ASSERT(window_length_padded % 2 == 0); + num_fft_bins_ = window_length_padded / 2; + // fft-bin width [think of it as Nyquist-freq / half-window-length] + fft_bin_width_ = sample_freq / window_length_padded; + + debug_ = opts.debug_mel; + + + low_freq_ = opts.low_freq; + if (opts.high_freq > 0.0) + high_freq_ = opts.high_freq; + else + high_freq_ = nyquist + opts.high_freq; + + if (low_freq_ < 0.0 || low_freq_ >= nyquist + || high_freq_ <= 0.0 || high_freq_ > nyquist + || high_freq_ <= low_freq_) + KALDI_ERR << "Bad values in options: low-freq " << low_freq_ + << " and high-freq " << high_freq_ << " vs. nyquist " + << nyquist; + + breakpoint_ = (opts.modified ? 
500.0 : 700.0); + vtln_low_ = opts.vtln_low; + if (opts.vtln_high > 0.0) + vtln_high_ = opts.vtln_high; + else + vtln_high_ = opts.vtln_high + nyquist; + + if (vtln_warp_factor != 1.0 && + (vtln_low_ < 0.0 || vtln_low_ <= low_freq_ + || vtln_low_ >= high_freq_ + || vtln_high_ <= 0.0 || vtln_high_ >= high_freq_ + || vtln_high_ <= vtln_low_)) + KALDI_ERR << "Bad values in options: vtln-low " << vtln_low_ + << " and vtln-high " << vtln_high_ << ", versus " + << "low-freq " << low_freq_ << " and high-freq " + << high_freq_; +} + + void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs) { // Compute liftering coefficients (scaling on cepstral coeffs) // coeffs are numbered slightly differently from HTK: the zeroth diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 5df36c8cb90..12c4d056608 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -48,6 +48,8 @@ struct MelBanksOptions { BaseFloat vtln_low; // vtln lower cutoff of warping function. BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added // to the Nyquist frequency to get the cutoff. + bool modified; // If true, use 'modified' MFCC, which uses a breakpoint of + // 900 instead of 700. bool debug_mel; // htk_mode is a "hidden" config, it does not show up on command line. // Enables more exact compatibibility with HTK, for testing purposes. 
Affects @@ -55,7 +57,7 @@ struct MelBanksOptions { bool htk_mode; explicit MelBanksOptions(int num_bins = 25) : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), - vtln_high(-500), debug_mel(false), htk_mode(false) {} + vtln_high(-500), modified(false), debug_mel(false), htk_mode(false) {} void Register(OptionsItf *opts) { opts->Register("num-mel-bins", &num_bins, @@ -69,6 +71,8 @@ struct MelBanksOptions { opts->Register("vtln-high", &vtln_high, "High inflection point in piecewise linear VTLN warping function" " (if negative, offset from high-mel-freq"); + opts->Register("modified", &modified, + "Modified MFCCs, based on paper XXXX. TODO: document this."); opts->Register("debug-mel", &debug_mel, "Print out debugging information for mel bin computation"); } @@ -78,30 +82,6 @@ struct MelBanksOptions { class MelBanks { public: - static inline BaseFloat InverseMelScale(BaseFloat mel_freq) { - return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f); - } - - static inline BaseFloat MelScale(BaseFloat freq) { - return 1127.0f * logf (1.0f + freq / 700.0f); - } - - static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff, - BaseFloat vtln_high_cutoff, // discontinuities in warp func - BaseFloat low_freq, - BaseFloat high_freq, // upper+lower frequency cutoffs in - // the mel computation - BaseFloat vtln_warp_factor, - BaseFloat freq); - - static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, - BaseFloat vtln_high_cutoff, - BaseFloat low_freq, - BaseFloat high_freq, - BaseFloat vtln_warp_factor, - BaseFloat mel_freq); - - MelBanks(const MelBanksOptions &opts, const FrameExtractionOptions &frame_opts, BaseFloat vtln_warp_factor); @@ -116,18 +96,74 @@ class MelBanks { // returns vector of central freq of each bin; needed by plp code. 
const Vector &GetCenterFreqs() const { return center_freqs_; } - // Copy constructor - MelBanks(const MelBanks &other); + // Use the default copy constructor private: + + // This function checks that the provided options make sense, and also sets + // configuration variables like breakpoint_ in this class. + void SetConfigs(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + BaseFloat vtln_warp_factor); + + // We use simplified formulas for the mel and inverse mel scale, since for + // this application, the multiplicative factor doesn't matter. Note: + // breakpoint_ is 700 for normal mel, or 900 for modified. + inline BaseFloat InverseMelScale(BaseFloat mel_freq) { + return 3500.0 * (expf((mel_freq - breakpoint_) / 3500.0) - 1.0); + } + + inline BaseFloat MelScale(BaseFloat freq) { + return log (breakpoint_ + 3500.0 * log (1.0 + freq / 3500.0)); + } + + BaseFloat VtlnWarpFreq(BaseFloat vtln_warp_factor, BaseFloat freq); + + + BaseFloat VtlnWarpMelFreq(BaseFloat vtln_warp_factor, BaseFloat mel_freq); + + // This sets up the 'bins_' member, for the regular (not modified) + // computation. It assumes center_freqs_ is already set up. + // 'htk_mode' is expected to be a copy of opts.htk_mode as given to the + // constructor. + void ComputeBins(bool htk_mode); + + // This sets up the 'bins_' member, for the modified computaion + // with cosine-shaped bins that are more tightly + // computation. It assumes center_freqs_ is already set up. + // 'htk_mode' is expected to be a copy of opts.htk_mode as given to the + // constructor. + void ComputeModifiedBins(); + // Disallow assignment MelBanks &operator = (const MelBanks &other); - // center frequencies of bins, numbered from 0 ... num_bins-1. - // Needed by GetCenterFreqs(). + + // The following few variables are derived from the configuration + // options passed in; they are used in converting to and from Mel frequencies, + // and for other purposes. 
+ BaseFloat breakpoint_; // The breakpoint in the mel scale: 700 normally; + // 900 if opts.modified is true. + BaseFloat low_freq_; // opts.low_freq + BaseFloat high_freq_; // The same as opts.high_freq if it's >= 0, or + // otherwise the Nyquist plus opts.high_freq. + BaseFloat vtln_low_; // opts.vtln_low; the lower cutoff for VTLN. + BaseFloat vtln_high_; // opts.vtln_high; the upper cutoff for VTLN. + + int32 num_fft_bins_; // The number of FFT frequency bins (actually, excluding + // the one at the Nyquist). Equal to half the padded + // window length. + BaseFloat fft_bin_width_; // The frequency separation between successive + // FFT bins: equal nyquist / num_fft_bins_. + + + // center frequencies of bins (in Hz), numbered from 0 ... num_bins-1. Needed + // by GetCenterFreqs(). Vector center_freqs_; - // the "bins_" vector is a vector, one for each bin, of a pair: - // (the first nonzero fft-bin), (the vector of weights). + // the "bins_" vector is a vector, one for each mel bin, of a pair: (the + // first nonzero fft-bin), (the vector of weights). The pair of (int32, + // Vector) is provided for efficiency, to avoid having a larger vector with + // many zero entries. std::vector > > bins_; bool debug_; From 4eb4862ee321ec04a9f7916a5ad3d04b6d10d652 Mon Sep 17 00:00:00 2001 From: Pegita Date: Sun, 16 Dec 2018 15:09:24 -0500 Subject: [PATCH 2/9] pushed to trigger the build (travis issue) --- src/feat/mel-computations.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 12c4d056608..0a2e2bc8482 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -142,7 +142,7 @@ class MelBanks { // options passed in; they are used in converting to and from Mel frequencies, // and for other purposes. BaseFloat breakpoint_; // The breakpoint in the mel scale: 700 normally; - // 900 if opts.modified is true. + // 500 if opts.modified is true. 
BaseFloat low_freq_; // opts.low_freq BaseFloat high_freq_; // The same as opts.high_freq if it's >= 0, or // otherwise the Nyquist plus opts.high_freq. From 126c89a1307a9ff6ca57d776f9c238cc357fe150 Mon Sep 17 00:00:00 2001 From: Pegita Date: Fri, 21 Dec 2018 11:41:37 -0500 Subject: [PATCH 3/9] modified test set w.r.t new VtlnWarpMelFreq function. --- src/feat/feature-mfcc-test.cc | 41 ++++++++++++----------------------- src/feat/mel-computations.cc | 3 ++- src/feat/mel-computations.h | 34 ++++++++++++++++++++++------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/feat/feature-mfcc-test.cc b/src/feat/feature-mfcc-test.cc index c4367139707..e81458741ef 100644 --- a/src/feat/feature-mfcc-test.cc +++ b/src/feat/feature-mfcc-test.cc @@ -95,8 +95,8 @@ static void UnitTestSimple() { op.frame_opts.round_to_power_of_two = true; op.mel_opts.low_freq = 0.0; op.mel_opts.htk_mode = true; + op.mel_opts.modified = (Rand() % 2 == 0 ? true : false); op.htk_compat = true; - Mfcc mfcc(op); // use default parameters @@ -613,42 +613,29 @@ static void UnitTestHTKCompare6() { } std::cout << "Test passed :)\n\n"; - + unlink("tmp.test.wav.fea_kaldi.6"); } void UnitTestVtln() { // Test the function VtlnWarpFreq. 
- BaseFloat low_freq = 10, high_freq = 7800, - vtln_low_cutoff = 20, vtln_high_cutoff = 7400; - + BaseFloat low_freq = 10, high_freq = 7800; + MelBanksOptions mel_opts; + mel_opts.low_freq = low_freq, mel_opts.high_freq = high_freq; + FrameExtractionOptions frame_opts; + MelBanks melfbank(mel_opts, frame_opts, 0.9); for (size_t i = 0; i < 100; i++) { BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2; - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, warp_factor, - freq), - freq / warp_factor); - - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, warp_factor, - low_freq), - low_freq); - AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, warp_factor, - high_freq), - high_freq); + AssertEqual(melfbank.VtlnWarpFreq(warp_factor, freq), freq / warp_factor); + + AssertEqual(melfbank.VtlnWarpFreq(warp_factor, low_freq), low_freq); + AssertEqual(melfbank.VtlnWarpFreq(warp_factor, high_freq), high_freq); BaseFloat freq2 = low_freq + (high_freq-low_freq) * RandUniform(), freq3 = freq2 + (high_freq-freq2) * RandUniform(); // freq3>=freq2 - BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, warp_factor, - freq2); - BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, warp_factor, - freq3); + BaseFloat w2 = melfbank.VtlnWarpFreq(warp_factor, freq2); + BaseFloat w3 = melfbank.VtlnWarpFreq(warp_factor, freq3); KALDI_ASSERT(w3 >= w2); // increasing function. 
- BaseFloat w3dash = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, - low_freq, high_freq, 1.0, - freq3); + BaseFloat w3dash = melfbank.VtlnWarpFreq(1.0, freq3); AssertEqual(w3dash, freq3); } } diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index bd40a527b29..e0b8d8ca268 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -92,7 +92,7 @@ void MelBanks::ComputeBins(bool htk_mode) { if (mel <= center_mel) weight = (mel - left_mel) / (center_mel - left_mel); else - weight = (right_mel-mel) / (right_mel - center_mel); + weight = (right_mel - mel) / (right_mel - center_mel); this_bin(i) = weight; if (first_index == -1) first_index = i; @@ -290,6 +290,7 @@ void MelBanks::SetConfigs(const MelBanksOptions &opts, << nyquist; breakpoint_ = (opts.modified ? 500.0 : 700.0); + sec_breakpoint_ = (opts.modified ? 3500 : -1); vtln_low_ = opts.vtln_low; if (opts.vtln_high > 0.0) vtln_high_ = opts.vtln_high; diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 0a2e2bc8482..b1aa5f591f5 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -72,7 +72,12 @@ struct MelBanksOptions { "High inflection point in piecewise linear VTLN warping function" " (if negative, offset from high-mel-freq"); opts->Register("modified", &modified, - "Modified MFCCs, based on paper XXXX. TODO: document this."); + "Modified MFCCs, based on paper 'An alternative to MFCCs for ASR' " + "(in progess for publication). This uses a cosine-type " + "filters with a modified mel scale for ceneter frequency " + "with more resolution around 1st and 2nd formant frequencies." + "The new bandwidth is computed as a combination of linear bandwidth " + "and the bandwidth computed based on filter overlap."); opts->Register("debug-mel", &debug_mel, "Print out debugging information for mel bin computation"); } @@ -96,6 +101,11 @@ class MelBanks { // returns vector of central freq of each bin; needed by plp code. 
const Vector &GetCenterFreqs() const { return center_freqs_; } + BaseFloat VtlnWarpFreq(BaseFloat vtln_warp_factor, BaseFloat freq); + + + BaseFloat VtlnWarpMelFreq(BaseFloat vtln_warp_factor, BaseFloat mel_freq); + // Use the default copy constructor private: @@ -109,18 +119,19 @@ class MelBanks { // this application, the multiplicative factor doesn't matter. Note: // breakpoint_ is 700 for normal mel, or 900 for modified. inline BaseFloat InverseMelScale(BaseFloat mel_freq) { - return 3500.0 * (expf((mel_freq - breakpoint_) / 3500.0) - 1.0); + if (sec_breakpoint_ > 0.0) + return 3500.0 * (expf((expf(mel_freq) - breakpoint_) / 3500.0) - 1.0); + else + return breakpoint_ * (expf(mel_freq) - 1.0); } inline BaseFloat MelScale(BaseFloat freq) { - return log (breakpoint_ + 3500.0 * log (1.0 + freq / 3500.0)); + if (sec_breakpoint_ > 0.0) + return log (breakpoint_ + 3500.0 * log (1.0 + freq / 3500.0)); + else + return log(1.0 + freq / breakpoint_); } - BaseFloat VtlnWarpFreq(BaseFloat vtln_warp_factor, BaseFloat freq); - - - BaseFloat VtlnWarpMelFreq(BaseFloat vtln_warp_factor, BaseFloat mel_freq); - // This sets up the 'bins_' member, for the regular (not modified) // computation. It assumes center_freqs_ is already set up. // 'htk_mode' is expected to be a copy of opts.htk_mode as given to the @@ -143,6 +154,13 @@ class MelBanks { // and for other purposes. BaseFloat breakpoint_; // The breakpoint in the mel scale: 700 normally; // 500 if opts.modified is true. + BaseFloat sec_breakpoint_; // The second breakpoint used in the modified + // mel scale; + // The range is [1500,3500]Hz and it corresponds to + // second breakpoint in the mel scale mainly and + // results in higher center frequency concentration + // around this frequency + // (e.g. avg. freq for second formant) BaseFloat low_freq_; // opts.low_freq BaseFloat high_freq_; // The same as opts.high_freq if it's >= 0, or // otherwise the Nyquist plus opts.high_freq. 
From e272089fd6c3f74767b646e82c9bef57d22665c9 Mon Sep 17 00:00:00 2001 From: Pegita Date: Sat, 23 Mar 2019 20:16:48 -0400 Subject: [PATCH 4/9] fixed typos. --- src/feat/mel-computations.cc | 16 ++++++++-------- src/feat/mel-computations.h | 21 +++++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index e0b8d8ca268..228c54672e9 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -121,11 +121,11 @@ void MelBanks::ComputeBins(bool htk_mode) { first and last nonzero value (pi for the canonical function). If there are a lot of bins, their diamter is defined by a formula and it's a function of the center frequency f of the bin: - diameter = 30 + 60 f / (f + 500). - so it increases from 30Hz to 90Hz with a knee around 500Hz. + diameter = alpha1 + alpha2 * f / (f + breakpoint_). + So, it increases from alpha1 Hz to (alpha1 + alpha2) Hz with a knee around breakpoint_ (Hz). However (and this matters if the number of bins is relatively small), we never let the diameter fall below the point where the crossing-point of this and - the next bin would be less than 0.1. By this I mean is the y-value where the + the next bin would be less than 0.2. By this I mean is the y-value where the raised-cosines cross. This value ensures that there won't be too a 'dip' in the middle of the two bins. */ @@ -136,11 +136,11 @@ void MelBanks::ComputeModifiedBins() { next_center = (bin == num_bins - 1 ? high_freq_ : center_freqs_(bin + 1)); - // note: breakpoint_ is 500 (Hz). - BaseFloat diameter_floor = (next_center - center_freq) * 1.1, - diameter = 30.0 + 60.0 * (center_freq / (center_freq + breakpoint_)); + // note: breakpoint_ is 900 (Hz). 
+ BaseFloat diameter_floor = (next_center - center_freq) * 1.2, + diameter = 80.0 + 100.0 * (center_freq / (center_freq + breakpoint_)); - diameter = pow(diameter * diameter + diameter_floor * diameter_floor, 0.5); + diameter = sqrt(diameter * diameter + diameter_floor * diameter_floor); // 'freq_scale' is the scaling factor on the frequencies that will ensure // that the diameter becomes equal to pi, like the canonical bin function @@ -290,7 +290,7 @@ void MelBanks::SetConfigs(const MelBanksOptions &opts, << nyquist; breakpoint_ = (opts.modified ? 500.0 : 700.0); - sec_breakpoint_ = (opts.modified ? 3500 : -1); + second_breakpoint_ = (opts.modified ? 3500 : -1); vtln_low_ = opts.vtln_low; if (opts.vtln_high > 0.0) vtln_high_ = opts.vtln_high; diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index b1aa5f591f5..2603a321ce4 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -48,8 +48,7 @@ struct MelBanksOptions { BaseFloat vtln_low; // vtln lower cutoff of warping function. BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added // to the Nyquist frequency to get the cutoff. - bool modified; // If true, use 'modified' MFCC, which uses a breakpoint of - // 900 instead of 700. + bool modified; // If true, use 'modified' MFCC. bool debug_mel; // htk_mode is a "hidden" config, it does not show up on command line. // Enables more exact compatibibility with HTK, for testing purposes. Affects @@ -74,10 +73,12 @@ struct MelBanksOptions { opts->Register("modified", &modified, "Modified MFCCs, based on paper 'An alternative to MFCCs for ASR' " "(in progess for publication). This uses a cosine-type " - "filters with a modified mel scale for ceneter frequency " - "with more resolution around 1st and 2nd formant frequencies." 
- "The new bandwidth is computed as a combination of linear bandwidth " - "and the bandwidth computed based on filter overlap."); + "filters with a modified mel scale with two breakpoints " + ", which control filters's resolution in the frequency region." + "Also, a filter bandwidth is computed as a norm of two " + "bandwidths 1) a bandwidth value computed using linear " + "equation, and 2) a bandwidth value computed based on filter " + "overlap."); opts->Register("debug-mel", &debug_mel, "Print out debugging information for mel bin computation"); } @@ -119,14 +120,14 @@ class MelBanks { // this application, the multiplicative factor doesn't matter. Note: // breakpoint_ is 700 for normal mel, or 900 for modified. inline BaseFloat InverseMelScale(BaseFloat mel_freq) { - if (sec_breakpoint_ > 0.0) - return 3500.0 * (expf((expf(mel_freq) - breakpoint_) / 3500.0) - 1.0); + if (second_breakpoint_ > 0.0) + return second_breakpoint_ * (expf((expf(mel_freq) - breakpoint_) / second_breakpoint_) - 1.0); else return breakpoint_ * (expf(mel_freq) - 1.0); } inline BaseFloat MelScale(BaseFloat freq) { - if (sec_breakpoint_ > 0.0) + if (second_breakpoint_ > 0.0) return log (breakpoint_ + 3500.0 * log (1.0 + freq / 3500.0)); else return log(1.0 + freq / breakpoint_); @@ -154,7 +155,7 @@ class MelBanks { // and for other purposes. BaseFloat breakpoint_; // The breakpoint in the mel scale: 700 normally; // 500 if opts.modified is true. 
- BaseFloat sec_breakpoint_; // The second breakpoint used in the modified + BaseFloat second_breakpoint_; // The second breakpoint used in the modified // mel scale; // The range is [1500,3500]Hz and it corresponds to // second breakpoint in the mel scale mainly and From 0f01df645254ee5f4613b35c6fc107de16d284ef Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 24 Mar 2019 16:41:09 -0400 Subject: [PATCH 5/9] [src] Clean up modified mel code --- src/feat/mel-computations.cc | 52 +++++++++++++++++++++++++----------- src/feat/mel-computations.h | 21 ++++++++------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 228c54672e9..b3c69db7805 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -118,16 +118,33 @@ void MelBanks::ComputeBins(bool htk_mode) { They are shaped like a cosine function from -pi/2 to pi/2 (unlike the standard triangular bins). We define their diameter as the distance between the - first and last nonzero value (pi for the canonical function). If there are - a lot of bins, their diamter is defined by a formula and it's a function of - the center frequency f of the bin: - diameter = alpha1 + alpha2 * f / (f + breakpoint_). - So, it increases from alpha1 Hz to (alpha1 + alpha2) Hz with a knee around breakpoint_ (Hz). - However (and this matters if the number of bins is relatively small), we never - let the diameter fall below the point where the crossing-point of this and - the next bin would be less than 0.2. By this I mean is the y-value where the - raised-cosines cross. This value ensures that there won't be too a 'dip' - in the middle of the two bins. + first and last nonzero value (pi for the canonical function). We choose + the diameter as: + d = sqrt(d1^2 + d2^2) + (this function may be viewed as a kind of soft-max), where d1 and d2 are + two different formulas for the diameter that we describe below. 
+ + d1 is a formula that ensures the bins overlap by at least a minimal amount. + + Let bin_diff be the difference in Hz between this bin's center-frequency + and the next bin's center-frequency, or (if this is the last bin), + the user-specified `high-freq` which is the top of the range of frequencies + we cover. Then: + + d1 = 1.1 * bin_diff + + The formula for d2 is designed to provide a reasonable floor so the bandwidths + don't get ridiculously narrow as we add more bins, and to approximate what we + observed the filter diameters to look like when learning filterbanks via DNNs. + The formula is: + + d2 = 50 + 50 * f / (f + 700) + + which roughly means: start with a diameter of 50Hz, increasing gradually to + 100Hz for bins with center frequency more than about 700Hz. There is no + rocket science behind this formula; it was obtained through a combination of + trying to match the DNN-learned filterbank bandwidths (cite: Pegah's thesis), + and manual tuning. */ void MelBanks::ComputeModifiedBins() { int32 num_bins = center_freqs_.Dim(); @@ -136,11 +153,14 @@ void MelBanks::ComputeModifiedBins() { next_center = (bin == num_bins - 1 ? high_freq_ : center_freqs_(bin + 1)); - // note: breakpoint_ is 900 (Hz). - BaseFloat diameter_floor = (next_center - center_freq) * 1.2, - diameter = 80.0 + 100.0 * (center_freq / (center_freq + breakpoint_)); + BaseFloat d1 = (next_center - center_freq) * 1.1, + d2 = 50.0 + 50.0 * (center_freq / (center_freq + 700.0)); - diameter = sqrt(diameter * diameter + diameter_floor * diameter_floor); + // 'diameter' is in Hz; it represents the distance on the frequency axis + // between the first and last nonzero points of the raised-cosine window + // function. This formula applies our heuristic, described above, to choose + // it.
+ BaseFloat diameter = sqrt(d1 * d1 + d2 * d2); // 'freq_scale' is the scaling factor on the frequencies that will ensure // that the diameter becomes equal to pi, like the canonical bin function @@ -289,8 +309,8 @@ void MelBanks::SetConfigs(const MelBanksOptions &opts, << " and high-freq " << high_freq_ << " vs. nyquist " << nyquist; - breakpoint_ = (opts.modified ? 500.0 : 700.0); - second_breakpoint_ = (opts.modified ? 3500 : -1); + breakpoint_ = (opts.modified ? 300.0 : 700.0); + second_breakpoint_ = (opts.modified ? 2000.0 : -1); vtln_low_ = opts.vtln_low; if (opts.vtln_high > 0.0) vtln_high_ = opts.vtln_high; diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index b0b96d7af88..f7f163aaaf8 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -116,21 +116,24 @@ class MelBanks { const FrameExtractionOptions &frame_opts, BaseFloat vtln_warp_factor); - // We use simplified formulas for the mel and inverse mel scale, since for - // this application, the multiplicative factor doesn't matter. Note: - // breakpoint_ is 700 for normal mel, or 900 for modified. inline BaseFloat InverseMelScale(BaseFloat mel_freq) { - if (second_breakpoint_ > 0.0) - return second_breakpoint_ * (expf((expf(mel_freq) - breakpoint_) / second_breakpoint_) - 1.0); + BaseFloat b1 = breakpoint_, b2 = second_breakpoint_; + if (b2 > 0.0) + return b2 * (expf((expf(mel_freq) - b1) / b2) - 1.0); else - return breakpoint_ * (expf(mel_freq) - 1.0); + return b1 * (expf(mel_freq) - 1.0); } inline BaseFloat MelScale(BaseFloat freq) { - if (second_breakpoint_ > 0.0) - return log (breakpoint_ + 3500.0 * log (1.0 + freq / 3500.0)); - else + BaseFloat b1 = breakpoint_, b2 = second_breakpoint_; + if (b2 > 0.0) { + // Modified Mel: linear, till ~b1, then log till ~b2, then log(log) + return log (b1 + b2 * log(1.0 + freq / b2)); + } else { + // Mel: linear till ~b1 = 700, then logarithmic. 
We ignore the scaling + // factor as it makes no difference to our application. return log(1.0 + freq / breakpoint_); + } } // This sets up the 'bins_' member, for the regular (not modified) From 5e0d7140bba79e801ee3939e51e34f8a3eb03678 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 24 Mar 2019 16:45:19 -0400 Subject: [PATCH 6/9] [src] Remove debug code --- src/feat/mel-computations.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index b3c69db7805..80428ba7d65 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -158,8 +158,8 @@ void MelBanks::ComputeModifiedBins() { // 'diameter' is in Hz; it represents the distance on the frequency axis // between the first and last nonzero points of the raised-cosine window - // function. This formula applies our heuristic, described above, to choose - // it. + // function. This formula applies our heuristic, described above, + // to choose the diameter. BaseFloat diameter = sqrt(d1 * d1 + d2 * d2); // 'freq_scale' is the scaling factor on the frequencies that will ensure @@ -273,13 +273,6 @@ void MelBanks::Compute(const VectorBase &power_spectrum, // it early. 
KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i))); } - - if (debug_) { - fprintf(stderr, "MEL BANKS:\n"); - for (int32 i = 0; i < num_bins; i++) - fprintf(stderr, " %f", (*mel_energies_out)(i)); - fprintf(stderr, "\n"); - } } void MelBanks::SetConfigs(const MelBanksOptions &opts, From ef9450f2a33bad33fd1dca3183c2443a7287a6b2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Mar 2019 21:41:18 -0400 Subject: [PATCH 7/9] [src] Cleanups to mel-computations.h --- src/feat/mel-computations.h | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index f7f163aaaf8..3231671fa8b 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -71,14 +71,9 @@ struct MelBanksOptions { "High inflection point in piecewise linear VTLN warping function" " (if negative, offset from high-mel-freq"); opts->Register("modified", &modified, - "Modified MFCCs, based on paper 'An alternative to MFCCs for ASR' " - "(in progess for publication). This uses a cosine-type " - "filters with a modified mel scale with two breakpoints " - ", which control filters's resolution in the frequency region." 
- "Also, a filter bandwidth is computed as a norm of two " - "bandwidths 1) a bandwidth value computed using linear " - "equation, and 2) a bandwidth value computed based on filter " - "overlap."); + "If true, use a modified form of the Mel scale that gives " + "more emphasis to lower frequencies, and use differently " + "tuned bin shapes and widths than normal."); opts->Register("debug-mel", &debug_mel, "Print out debugging information for mel bin computation"); } @@ -118,7 +113,7 @@ class MelBanks { inline BaseFloat InverseMelScale(BaseFloat mel_freq) { BaseFloat b1 = breakpoint_, b2 = second_breakpoint_; - if (b2 > 0.0) + if (b2 > 0.0) // modified Mel scale return b2 * (expf((expf(mel_freq) - b1) / b2) - 1.0); else return b1 * (expf(mel_freq) - 1.0); @@ -132,7 +127,7 @@ class MelBanks { } else { // Mel: linear till ~b1 = 700, then logarithmic. We ignore the scaling // factor as it makes no difference to our application. - return log(1.0 + freq / breakpoint_); + return log(1.0 + freq / b1); } } @@ -153,18 +148,18 @@ class MelBanks { MelBanks &operator = (const MelBanks &other); + // The following few variables are derived from the configuration // options passed in; they are used in converting to and from Mel frequencies, // and for other purposes. - BaseFloat breakpoint_; // The breakpoint in the mel scale: 700 normally; - // 500 if opts.modified is true. + BaseFloat breakpoint_; // The breakpoint of the Mel scale (700) if we + // are using mel scale; otherwise the first + // breakpoint in the modified-mel scale, + // e.g. 300. Only relevant if --modified=true BaseFloat second_breakpoint_; // The second breakpoint used in the modified - // mel scale; - // The range is [1500,3500]Hz and it corresponds to - // second breakpoint in the mel scale mainly and - // results in higher center frequency concentration - // around this frequency - // (e.g. avg. freq for second formant) + // mel scale, e.g. 2000. 
+ // Only relevant if --modified=true + BaseFloat low_freq_; // opts.low_freq BaseFloat high_freq_; // The same as opts.high_freq if it's >= 0, or // otherwise the Nyquist plus opts.high_freq. From d5ae5826577976251b22fb07695040810486532b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 22 Apr 2019 22:44:23 -0400 Subject: [PATCH 8/9] [src] Configuration change in modified mel --- src/feat/mel-computations.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 80428ba7d65..838caf94ffa 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -154,7 +154,7 @@ void MelBanks::ComputeModifiedBins() { high_freq_ : center_freqs_(bin + 1)); BaseFloat d1 = (next_center - center_freq) * 1.1, - d2 = 50.0 + 50.0 * (center_freq / (center_freq + 700.0)); + d2 = 60.0 + 50.0 * (center_freq / (center_freq + 700.0)); // 'diameter' is in Hz; it represents the distance on the frequency axis // between the first and last nonzero points of the raised-cosine window From 390ef59b1f6971f42d790a6fd0eb75d702d30e84 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Apr 2019 14:15:16 -0400 Subject: [PATCH 9/9] [src] Fix to breakpoint in bandwidth computation --- src/feat/mel-computations.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 838caf94ffa..bf1563434eb 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -154,7 +154,7 @@ void MelBanks::ComputeModifiedBins() { high_freq_ : center_freqs_(bin + 1)); BaseFloat d1 = (next_center - center_freq) * 1.1, - d2 = 60.0 + 50.0 * (center_freq / (center_freq + 700.0)); + d2 = 60.0 + 50.0 * (center_freq / (center_freq + breakpoint_)); // 'diameter' is in Hz; it represents the distance on the frequency axis // between the first and last nonzero points of the raised-cosine window