├── test ├── conf │ └── gtf.conf └── run_vad.bash ├── featbin ├── apply-nccf-to-pov.cc ├── extract-dims.cc ├── apply-arma.cc ├── apply-vad.cc ├── apply-ltsv.cc ├── compute-dctf-feats.cc ├── apply-vad-merged.cc ├── compute-gabor-feats.cc └── compute-gtf-feats.cc ├── feat ├── feature-dctf.cc ├── feature-dctf.h ├── feature-gtf.h ├── feature-gabor.h ├── feature-gtf.cc └── feature-gabor.cc ├── transform ├── featxtra-functions.h └── featxtra-functions.cc └── README.md /test/conf/gtf.conf: -------------------------------------------------------------------------------- 1 | --num-bins=64 # number of Gammatone filters 2 | --apply-dct=false # true: make Gammatone Frequency Cepstral Coefficients. 3 | --num-ceps=24 # GFCC dimension 4 | --use-energy=false 5 | -------------------------------------------------------------------------------- /test/run_vad.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | threads=8 4 | train_cmd="run.pl" 5 | decode_cmd="run.pl" 6 | source path.sh 7 | export LC_ALL=C 8 | 9 | wavin=in/wav 10 | 11 | compute-kaldi-pitch-feats --sample-frequency=16000 scp:$wavin.scp ark:- | \ 12 | apply-nccf-to-pov ark:- ark:- | \ 13 | extract-dims --start=1 --end=1 ark:- ark:out/vprob.ark 14 | compute-gtf-feats --verbose=2 --config=conf/gtf.conf scp:$wavin.scp ark:- | \ 15 | apply-arma --ar_order=5 ark:- ark:- | \ 16 | apply-ltsv ark:- ark:out/ltsv.ark 17 | paste-feats --length-tolerance=1 ark:out/ltsv.ark ark:out/vprob.ark ark:- | \ 18 | apply-vad --ctx-win=40 --vad-thr=0.2 ark:- ark:out/vad.ark 19 | -------------------------------------------------------------------------------- /featbin/apply-nccf-to-pov.cc: -------------------------------------------------------------------------------- 1 | // featbin/apply-nccf-to-pov.cc 2 | 3 | // Copyright 2009-2011 Microsoft Corporation 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with 
the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | #include "base/kaldi-common.h" 19 | #include "util/common-utils.h" 20 | #include "matrix/kaldi-matrix.h" 21 | #include "feat/pitch-functions.h" 22 | #include "transform/featxtra-functions.h" 23 | #include 24 | using namespace std; 25 | // Reads (NCCF, pitch) feature matrices from argument 1, converts them with ApplyNccfToPov() and writes them to argument 2. Returns 0 if at least one utterance was written, 1 if none was, -1 on an exception. 26 | int main(int argc, char *argv[]) { 27 | try { 28 | using namespace kaldi; 29 | 30 | const char *usage = 31 | "Transform raw NCCF to accurate Probability of Voicing (POV).\n" 32 | "Kaldi pitch extractor (compute-kaldi-pitch) outputs (NCCF, pitch).\n" 33 | "This program transforms (NCCF, pitch) -> (POV, pitch).\n" "Usage: apply-nccf-to-pov [options] feats-rspecifier feats-wspecifier\n"; 34 | 35 | ParseOptions po(usage); 36 | 37 | po.Read(argc, argv); 38 | 39 | if (po.NumArgs() != 2) { 40 | po.PrintUsage(); 41 | exit(1); 42 | } 43 | // Number of utterances written; it decides the exit status. 44 | kaldi::int32 num_done = 0; 45 | 46 | std::string feat_rspecifier = po.GetArg(1); 47 | std::string feat_wspecifier = po.GetArg(2); 48 | 49 | SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); 50 | BaseFloatMatrixWriter feat_writer(feat_wspecifier); 51 | // The conversion works in place, so each matrix is copied out of the reader first. 52 | for (; !feat_reader.Done(); feat_reader.Next()) { 53 | const std::string utt = feat_reader.Key(); 54 | Matrix<BaseFloat> feat(feat_reader.Value()); 55 | ApplyNccfToPov(&feat); 56 | feat_writer.Write(utt, feat); 57 | num_done++; 58 | } 59 | return (num_done != 0 ? 
0 : 1); 60 | } catch(const std::exception &e) { 61 | std::cerr << e.what(); 62 | return -1; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /featbin/extract-dims.cc: -------------------------------------------------------------------------------- 1 | // featbin/extract-dims.cc 2 | 3 | // Copyright 2009-2011 Microsoft Corporation 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 17 | // See the Apache 2 License for the specific language governing permissions and 18 | // limitations under the License. 
19 | 20 | #include "base/kaldi-common.h" 21 | #include "util/common-utils.h" 22 | #include "matrix/kaldi-matrix.h" 23 | 24 | // Copies the 1-based, inclusive column range [--start, --end] of every input feature matrix to the output table. Returns 0 on success, -1 on an exception (including an invalid range). 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | 29 | const char *usage = 30 | "Extract a dimension range from features \n" 31 | "Usage: extract-dims [options] in-rspecifier out-wspecifier\n"; 32 | 33 | ParseOptions po(usage); 34 | 35 | int32 start = 0; 36 | int32 end = 0; 37 | po.Register("start", &start, "First dimension to extract (1-based, required)."); 38 | po.Register("end", &end, "Last dimension to extract (1-based, inclusive, required)."); 39 | 40 | po.Read(argc, argv); 41 | 42 | if (po.NumArgs() != 2) { 43 | po.PrintUsage(); 44 | exit(1); 45 | } 46 | 47 | std::string rspecifier = po.GetArg(1); 48 | std::string wspecifier = po.GetArg(2); 49 | // The range is 1-based, so the defaults (0, 0) must be rejected here; otherwise ColRange() below is called with offset -1. 50 | if (start < 1 || end < start) KALDI_ERR << "Invalid range: need 1 <= --start <= --end, got --start=" << start << " --end=" << end; 51 | 52 | BaseFloatMatrixWriter kaldi_writer(wspecifier); 53 | SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); 54 | 55 | for (; !kaldi_reader.Done(); kaldi_reader.Next()) { 56 | const std::string utt = kaldi_reader.Key(); 57 | const Matrix<BaseFloat> &feats = kaldi_reader.Value(); 58 | // start <= end was checked above, so only the upper bound can still be violated. 59 | if (end > feats.NumCols()) KALDI_ERR << "Utterance " << utt << " has " << feats.NumCols() << " dimensions, fewer than --end=" << end; 60 | 61 | Matrix<BaseFloat> to_write(feats.ColRange(start-1, (end-start)+1)); 62 | kaldi_writer.Write(utt, to_write); 63 | } 64 | return 0; 65 | } catch(const std::exception &e) { 66 | std::cerr << e.what(); 67 | return -1; 68 | } 69 | } 70 | 71 | 72 | -------------------------------------------------------------------------------- /feat/feature-dctf.cc: -------------------------------------------------------------------------------- 1 | // feat/feature-dctf.cc 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | 19 | #include "feat/feature-dctf.h" 20 | #include "feat/feature-window.h" 21 | 22 | namespace kaldi { 23 | // Builds the num_ceps x num_bins DCT matrix that Compute() applies to every window. 24 | Dctf::Dctf(const DctfOptions &opts) 25 | : opts_(opts), feature_window_function_(opts.frame_opts) { 26 | int32 num_bins = opts.num_bins; 27 | int32 num_ceps = opts.num_ceps; 28 | // The row range taken below only exists when 0 < num_ceps <= num_bins. 29 | KALDI_ASSERT(num_ceps > 0 && num_ceps <= num_bins); 30 | Matrix<BaseFloat> dct_matrix(num_bins, num_bins); 31 | ComputeDctMatrix(&dct_matrix); 32 | // Keep the first num_ceps rows of the full DCT matrix (row zero included). 33 | SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, num_ceps, 0, num_bins); 34 | dct_matrix_.Resize(num_ceps, num_bins); 35 | dct_matrix_.CopyFromMat(dct_rows); // subset of rows. 36 | 37 | } 38 | 39 | Dctf::~Dctf() { 40 | } 41 | // Frames `wave` according to opts_.frame_opts and writes one row of num_ceps DCT coefficients per frame into *output. Raises KALDI_ERR when no frame fits. A non-NULL wave_remainder is filled by ExtractWaveformRemainder(). 42 | void Dctf::Compute(const VectorBase<BaseFloat> &wave, 43 | Matrix<BaseFloat> *output, 44 | Vector<BaseFloat> *wave_remainder) { 45 | KALDI_ASSERT(output != NULL); 46 | int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); 47 | int32 cols_out = opts_.num_ceps; 48 | if (rows_out == 0) 49 | KALDI_ERR << "Dctf::Compute, no frames fit in file (#samples is " << wave.Dim() << ")"; 50 | output->Resize(rows_out, cols_out); 51 | if (wave_remainder != NULL) 52 | ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); 53 | Vector<BaseFloat> window; // windowed waveform. 54 | for (int32 r = 0; r < rows_out; r++) { // r is frame index.. 
55 | ExtractWindow(0, wave, r, opts_.frame_opts, feature_window_function_, &window, NULL); 56 | // dct_matrix_ has num_bins columns, so the product below needs a window of exactly num_bins samples. 57 | SubVector<BaseFloat> this_dctf(output->Row(r)); 58 | 59 | // DCTF 60 | this_dctf.AddMatVec(1.0, dct_matrix_, kNoTrans, window, 0.0); 61 | } 62 | } 63 | } // namespace 64 | -------------------------------------------------------------------------------- /featbin/apply-arma.cc: -------------------------------------------------------------------------------- 1 | // featbin/apply-arma.cc 2 | 3 | // Copyright 2009-2011 Microsoft Corporation 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | #include "base/kaldi-common.h" 19 | #include "util/common-utils.h" 20 | #include "matrix/kaldi-matrix.h" 21 | #include "transform/featxtra-functions.h" 22 | #include 23 | using namespace std; 24 | 25 | // Applies ARMA normalization of order --ar-order to every feature matrix read from argument 1 and writes the result to argument 2. Returns 0 if at least one utterance was written, 1 if none was, -1 on an exception. 26 | int main(int argc, char *argv[]) { 27 | try { 28 | using namespace kaldi; 29 | 30 | const char *usage = 31 | "Apply ARMA (AutoRegressive-moving-average) normalization to a matrix of features using M-tap FIR\n" 32 | "Each utterance is processed independently\n" 33 | "Usage: apply-arma [options] feats-rspecifier feats-wspecifier\n"; 34 | 35 | ParseOptions po(usage); 36 | 37 | int ar_order = 2; 38 | po.Register("ar-order", &ar_order, "Order of the autoregressive model [default: 2]"); 39 | 40 | po.Read(argc, argv); 41 | 42 | if (po.NumArgs() != 2) { 43 | po.PrintUsage(); 44 | exit(1); 45 | } 46 | 47 | kaldi::int32 num_done = 0; 48 | 49 | std::string feat_rspecifier = po.GetArg(1); 50 | std::string feat_wspecifier = po.GetArg(2); 51 | 52 | SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); 53 | BaseFloatMatrixWriter feat_writer(feat_wspecifier); 54 | 55 | 56 | 57 | // ApplyArma() filters in place, so each matrix is copied out of the reader, 58 | // filtered, and written back under the same utterance key. 59 | for (; !feat_reader.Done(); feat_reader.Next()) { 60 | const std::string utt = feat_reader.Key(); 61 | Matrix<BaseFloat> feat(feat_reader.Value()); 62 | ApplyArma(ar_order, &feat); 63 | feat_writer.Write(utt, feat); 64 | num_done++; 65 | } 66 | return (num_done != 0 ? 
0 : 1); 67 | } catch(const std::exception &e) { 68 | std::cerr << e.what(); 69 | return -1; 70 | } 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /feat/feature-dctf.h: -------------------------------------------------------------------------------- 1 | // feat/feature-dctf.h 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | #ifndef KALDI_FEAT_FEATURE_DCTF_H_ 19 | #define KALDI_FEAT_FEATURE_DCTF_H_ 20 | 21 | #include "feat/feature-functions.h" 22 | #include "feat/feature-window.h" 23 | 24 | namespace kaldi { 25 | /// @addtogroup feat FeatureExtraction 26 | /// @{ 27 | 28 | 29 | /// DctfOptions contains basic options for computing DCTF features 30 | /// It only includes things that can be done in a "stateless" way, i.e. 31 | /// it does not include energy max-normalization. 32 | /// It does not include delta computation. Only frame_opts is exposed on the command line; num_bins and num_ceps are set by the calling program. 33 | struct DctfOptions { 34 | FrameExtractionOptions frame_opts; 35 | int32 num_bins; // e.g. 64: size of the DCT, i.e. the number of samples per window. 36 | int32 num_ceps; // e.g. 23: number of DCT coefficients kept, counting zero; must not exceed num_bins. 37 | DctfOptions(): num_bins(64), num_ceps(23) { } // defaults so a default-constructed object never holds indeterminate sizes 38 | void Register(ParseOptions *po) { 39 | frame_opts.Register(po); 40 | } 41 | }; 42 | 43 | /// Class for computing DCTF features; see \ref feat_dctf for more information. 
44 | class Dctf { 45 | public: 46 | explicit Dctf(const DctfOptions &opts); 47 | ~Dctf(); 48 | /// Number of coefficients per output frame (opts_.num_ceps). 49 | int32 Dim() const { return opts_.num_ceps; } 50 | 51 | /// Will throw exception on failure (e.g. if file too short for even one 52 | /// frame). The output "wave_remainder" is the last frame or two of the 53 | /// waveform that it would be necessary to include in the next call to Compute 54 | /// for the same utterance. It is not exactly the un-processed part (it may 55 | /// have been partly processed), it's the start of the next window that we 56 | /// have not already processed. On success "output" has one row per frame 57 | /// and Dim() columns. 58 | void Compute(const VectorBase<BaseFloat> &wave, 59 | Matrix<BaseFloat> *output, 60 | Vector<BaseFloat> *wave_remainder = NULL); 61 | 62 | private: 63 | DctfOptions opts_; 64 | Matrix<BaseFloat> dct_matrix_; // matrix we left-multiply by to perform DCT. 65 | FeatureWindowFunction feature_window_function_; 66 | KALDI_DISALLOW_COPY_AND_ASSIGN(Dctf); 67 | }; 68 | 69 | 70 | /// @} End of "addtogroup feat" 71 | } // namespace kaldi 72 | 73 | 74 | #endif // KALDI_FEAT_FEATURE_DCTF_H_ 75 | -------------------------------------------------------------------------------- /featbin/apply-vad.cc: -------------------------------------------------------------------------------- 1 | // featbin/apply-vad.cc 2 | 3 | // Copyright 2009-2011 Microsoft Corporation 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 
17 | // See the Apache 2 License for the specific language governing permissions and 18 | // limitations under the License. 19 | 20 | #include "base/kaldi-common.h" 21 | #include "util/common-utils.h" 22 | #include "matrix/kaldi-matrix.h" 23 | #include "transform/featxtra-functions.h" 24 | // Turns a matrix of per-frame probability streams into a one-column 0/1 decision: mean of the streams, median filter over --ctx-win frames, then threshold at --vad-thr. Returns 0 on success, -1 on an exception. 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | 29 | const char *usage = 30 | "Voice activity detection: average the input probability streams, median-filter the result and threshold it to 0/1\n" 31 | "Usage: apply-vad [options] in-rspecifier out-wspecifier\n"; 32 | 33 | ParseOptions po(usage); 34 | 35 | int32 ctx_win = 20; 36 | BaseFloat vad_thr = 0.2; 37 | po.Register("ctx-win", &ctx_win, "Define number of frames of median filtering."); 38 | po.Register("vad-thr", &vad_thr, "Define VAD vad-threshold."); 39 | 40 | po.Read(argc, argv); 41 | 42 | if (po.NumArgs() != 2) { 43 | po.PrintUsage(); 44 | exit(1); 45 | } 46 | 47 | std::string rspecifier = po.GetArg(1); 48 | std::string wspecifier = po.GetArg(2); 49 | 50 | KALDI_ASSERT(vad_thr > 0 && ctx_win > 0); 51 | 52 | BaseFloatMatrixWriter kaldi_writer(wspecifier); 53 | SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); 54 | 55 | for (; !kaldi_reader.Done(); kaldi_reader.Next()) { 56 | const std::string utt = kaldi_reader.Key(); 57 | Matrix<BaseFloat> feats(kaldi_reader.Value()); 58 | // The utterance must be longer than the median-filter window. 59 | KALDI_ASSERT(ctx_win < feats.NumRows()); 60 | Vector<BaseFloat> vad_out; 61 | 62 | // Mean of the probability streams 63 | ApplyColMean(feats, &vad_out); 64 | // Median filtering of the resulting VAD probability stream 65 | ApplyMedianfiltering(ctx_win, &vad_out); 66 | 67 | Matrix<BaseFloat> to_write(feats.NumRows(), 1); 68 | to_write.CopyColFromVec(vad_out, 0); 69 | 70 | // Apply thresholding: subtract the threshold, then map the sign to 0/1 71 | to_write.Add(-vad_thr); 72 | to_write.ApplyHeaviside(); 73 | 74 | kaldi_writer.Write(utt, to_write); 75 | } 76 | return 0; 77 | } catch(const std::exception &e) { 78 | std::cerr << e.what(); 79 | return -1; 80 | } 81 | } 82 | 83 | 84 | 
-------------------------------------------------------------------------------- /transform/featxtra-functions.h: -------------------------------------------------------------------------------- 1 | // transform/featxtra-functions.h 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | // NOTE(review): every Matrix/MatrixBase/Vector/VectorBase below appears without its template argument in this copy; the featbin callers pass BaseFloat containers - confirm against featxtra-functions.cc. 19 | #ifndef KALDI_TRANSFORM_FEATXTRA_FUNCTIONS_H_ 20 | #define KALDI_TRANSFORM_FEATXTRA_FUNCTIONS_H_ 21 | 22 | #include "base/kaldi-common.h" 23 | #include "matrix/matrix-lib.h" 24 | #include "feat/feature-functions.h" 25 | 26 | namespace kaldi { 27 | 28 | // Apply AutoRegressive Moving Average normalization of order ar_order to a matrix; the featbin programs write feats back out afterwards, i.e. it is modified in place 29 | void ApplyArma(int ar_order, 30 | MatrixBase *feats); 31 | // Apply Sigmoid scaling (threshold sig_thr, slope sig_slope) to a matrix 32 | void ApplySigmoidScale(BaseFloat sig_thr, 33 | BaseFloat sig_slope, 34 | MatrixBase *feats); 35 | // Compute the Long-Term Spectral Variability of feats over a context of ctx_win and store it in ltsv; note the argument order: sigmoid slope first, then threshold 36 | void ApplyLtsv(int ctx_win, 37 | BaseFloat ltsv_sigmoidSlope, 38 | BaseFloat ltsv_sigmoidThr, 39 | const MatrixBase *feats, 40 | Matrix *ltsv); 41 | // Compute sum over matrix columns 42 | void ApplyColSum(const Matrix &data, 43 | Vector *colsum); 44 | // Compute mean over matrix columns (apply-vad copies the result into a NumRows x 1 matrix, so it presumably holds one value per row - TODO confirm) 45 | void ApplyColMean(const Matrix &data, 46 | Vector *colmean); 47 | // Sort a vector 48 | void ApplySort(VectorBase *s); 49 | // 
Apply median filtering to a time domain signal (vector) 50 | void ApplyMedianfiltering(int ctx_win, 51 | VectorBase *data); 52 | // Apply 2-dimensional FFT to a matrix of real and imaginary numbers 53 | // function is implemented as fft(fft(A).').' 54 | void ComputeComplexFft(Matrix *real_data, 55 | Matrix *imag_data, 56 | int32 dim0, 57 | int32 dim1, 58 | bool forward_fft); 59 | // Apply 2-dimensional FFT to a matrix of real and imaginary numbers 60 | // use this if matrix dimensions are a power of 2 61 | void ComputeComplexFftPow2(Matrix *real_data, 62 | Matrix *imag_data, 63 | int32 dim0, 64 | int32 dim1, 65 | bool forward_fft); 66 | 67 | // Convert NCCF to POV on kaldi pitch feats (the matrix passed in is what the caller writes out afterwards) 68 | // Kaldi pitch feats are 2-dim features (NCCF, pitch in Hz) 69 | void ApplyNccfToPov(Matrix* kaldi_pitch_feats); 70 | 71 | } // namespace kaldi 72 | 73 | #endif // KALDI_TRANSFORM_FEATXTRA_FUNCTIONS_H_ 74 | -------------------------------------------------------------------------------- /featbin/apply-ltsv.cc: -------------------------------------------------------------------------------- 1 | // featbin/apply-ltsv.cc 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | #include "base/kaldi-common.h" 19 | #include "util/common-utils.h" 20 | #include "matrix/kaldi-matrix.h" 21 | #include "transform/featxtra-functions.h" 22 | using namespace std; 23 | // ARMA-filters every feature matrix read from argument 1, computes its LTSV stream and writes that stream to argument 2. Returns 0 if at least one utterance was written, 1 if none was, -1 on an exception. 24 | int main(int argc, char *argv[]) { 25 | try { 26 | using namespace kaldi; 27 | 28 | const char *usage = 29 | "Apply LTSV (Long Term Spectral Variability) measure on a " 30 | "matrix of spectral features\n" 31 | "Each utterance is processed independently\n" 32 | "Usage: apply-ltsv [options] feats-rspecifier feats-wspecifier\n"; 33 | 34 | ParseOptions po(usage); 35 | 36 | int32 ar_order = 10; // ARMA filter tap order 37 | int32 ctx_win = 50; // context window parameter 38 | BaseFloat ltsv_slope = 0.2; // sigmoid slope parameter 39 | BaseFloat ltsv_thr = 0.5; // sigmoid threshold parameter 40 | po.Register("ar-order", &ar_order, "Order of the ARMA filtering [default: 10]"); 41 | po.Register("ctx-win", &ctx_win, "Context window frame size [default: 50]"); 42 | po.Register("ltsv-slope", &ltsv_slope, "Sigmoid slope parameter [default: 0.2]"); 43 | po.Register("ltsv-thr", &ltsv_thr, "Sigmoid threshold parameter [default: 0.5]"); 44 | 45 | po.Read(argc, argv); 46 | 47 | if (po.NumArgs() != 2) { 48 | po.PrintUsage(); 49 | exit(1); 50 | } 51 | 52 | kaldi::int32 num_done = 0; 53 | 54 | std::string feat_rspecifier = po.GetArg(1); 55 | std::string feat_wspecifier = po.GetArg(2); 56 | 57 | SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); 58 | BaseFloatMatrixWriter feat_writer(feat_wspecifier); 59 | 60 | 61 | // The input is always ARMA-filtered here (order --ar-order) before LTSV is computed, 62 | // whether or not it was already filtered upstream in the pipeline. 63 | 64 | for (; !feat_reader.Done(); feat_reader.Next()) { 65 | const std::string utt = feat_reader.Key(); 66 | // ApplyArma() filters in place, so work on a copy of the reader's matrix. 67 | Matrix<BaseFloat> feat(feat_reader.Value()); 68 | Matrix<BaseFloat> ltsv; 69 | ApplyArma(ar_order, &feat); 70 | ApplyLtsv(ctx_win, ltsv_slope, ltsv_thr, 
&feat, &ltsv); 71 | feat_writer.Write(utt, ltsv); 72 | num_done++; 73 | } 74 | return (num_done != 0 ? 0 : 1); 75 | } catch(const std::exception &e) { 76 | std::cerr << e.what(); 77 | return -1; 78 | } 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Featxtra 2 | The featxtra toolbox provides a set of front-end tools and signal processing functions for [Kaldi](http://kaldi.sourceforge.net). 3 | 4 | The toolbox includes: 5 | * feature extraction for speech signals 6 | - Gammatone Frequency Representation, Gammatone Frequency Cepstral Coefficients 7 | - Gabor Features 8 | - DCT features (apply on ltsv, voicing stream) 9 | * Voice Activity Detection 10 | * additional signal filter operations 11 | 12 | ## List of Functions 13 | 14 | ### featbin 15 | * featbin/compute-gtf-feats 16 | - extract Gammatone Frequency Representation (GTF) 17 | - extract Gammatone Frequency Cepstral Coefficients (GFCC) 18 | * featbin/compute-gabor-feats 19 | - extract Gabor Features (GBF) 20 | * featbin/compute-dctf-feats 21 | - extract DCT Features from time domain signal 22 | * featbin/apply-arma 23 | - apply Auto-Regressive Moving Average (ARMA) filtering on a spectral representation (e.g. GTF) 24 | * featbin/apply-ltsv 25 | - compute the Long-Term Spectral Variability (LTSV) stream from a spectral representation (e.g. GTF) 26 | * featbin/apply-vad 27 | - apply Voice Activity Detection (VAD) using voicing and LTSV probability 28 | * featbin/extract-dims 29 | - extract specified dimension range out of a feature matrix 30 | 31 | ### feat 32 | * feat/feature-gtf 33 | - GTF and GFCC feature implementation code 34 | 35 | ### transform 36 | * transform/featxtra-functions 37 | - additional signal processing functions 38 | _____ 39 | 40 | ## Instructions to run 41 | 1. 
Modify the `Makefile` in directories `feat`, `featbin`, and `transform` as follows: 42 | - feat/Makefile should include `feature-dctf.o feature-gtf.o feature-gabor.o` under the `OBJFILES` variable. 43 | - featbin/Makefile should include `compute-dctf-feats compute-gtf-feats compute-gabor-feats apply-arma apply-ltsv apply-nccf-to-pov apply-vad-merged apply-vad extract-dims` under the `BINFILES` variable. 44 | - transform/Makefile should include `featxtra-functions.o` under the `OBJFILES` variable. 45 | 2. Add the following code to the `src/feat/feature-window.cc` file: 46 | ```c 47 | void ExtractWaveformRemainder(const VectorBase<BaseFloat> &wave, 48 | const FrameExtractionOptions &opts, 49 | Vector<BaseFloat> *wave_remainder) { 50 | int32 frame_shift = opts.WindowShift(); 51 | int32 num_frames = NumFrames(wave.Dim(), opts); 52 | // offset is the amount at the start that has been extracted. 53 | int32 offset = num_frames * frame_shift; 54 | KALDI_ASSERT(wave_remainder != NULL); 55 | int32 remaining_len = wave.Dim() - offset; 56 | wave_remainder->Resize(remaining_len); 57 | KALDI_ASSERT(remaining_len >= 0); 58 | if (remaining_len > 0) 59 | wave_remainder->CopyFromVec(SubVector<BaseFloat>(wave, offset, remaining_len)); 60 | } 61 | ``` 62 | 3. Add the following declaration to the `src/feat/feature-window.h` file: 63 | ```c 64 | // ExtractWaveformRemainder is useful if the waveform is coming in segments. 65 | // It extracts the bit of the waveform at the end of this block that you 66 | // would have to append the next bit of waveform to, if you wanted to have 67 | // the same effect as everything being in one big block. 68 | void ExtractWaveformRemainder(const VectorBase<BaseFloat> &wave, 69 | const FrameExtractionOptions &opts, 70 | Vector<BaseFloat> *wave_remainder); 71 | ``` 72 | 73 | Steps 2 and 3 are needed because of [this commit](https://github.com/kaldi-asr/kaldi/commit/1180e467c8ca273c7704199bd27cb734509e931e) in the kaldi-asr project. 
74 | 75 | After these steps just run the `make` command again in the src directory to finally integrate these in your kaldi project. 76 | -------------------------------------------------------------------------------- /featbin/compute-dctf-feats.cc: -------------------------------------------------------------------------------- 1 | // featbin/compute-dctf-feats.cc 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | #include "base/kaldi-common.h" 19 | #include "util/common-utils.h" 20 | #include "matrix/kaldi-matrix.h" 21 | #include "feat/feature-dctf.h" 22 | #include "transform/featxtra-functions.h" 23 | using namespace std; 24 | // For every column of every input matrix: slide a rectangular window of --ctx-win frames (shift 1) over the column, keep the first --cep-order DCT coefficients of each window, and repeat the last frame so the output has as many rows as the input. Output column block i holds the coefficients of input column i. 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | 29 | const char *usage = 30 | "Compute DCT transform by windowing a one dimensional time domain signal \n" 31 | "Each column of the input is transformed independently, per utterance\n" 32 | "Usage: compute-dctf-feats [options] feats-rspecifier feats-wspecifier\n"; 33 | 34 | DctfOptions dctf_opts; 35 | ParseOptions po(usage); 36 | 37 | int32 cep_order = 5; // number of DCT coefficients kept per window 38 | int32 ctx_win = 10; // context window parameter 39 | 40 | po.Register("cep-order", &cep_order, "Order of the Cepstral filtering [default: 5]"); 41 | po.Register("ctx-win", &ctx_win, "Context window frame size [default: 10]"); 42 | 43 | po.Read(argc, argv); 44 | 45 | if (po.NumArgs() != 2) { 46 | po.PrintUsage(); 47 | exit(1); 48 | } 49 | 50 | kaldi::int32 num_done = 0; 51 | 52 | std::string feat_rspecifier = po.GetArg(1); 53 | std::string feat_wspecifier = po.GetArg(2); 54 | 55 | SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); 56 | BaseFloatMatrixWriter feat_writer(feat_wspecifier); 57 | 58 | if (cep_order < 1 || cep_order > ctx_win) 59 | KALDI_ERR << "--cep-order must be between 1 and --ctx-win, got --cep-order=" 60 | << cep_order << " --ctx-win=" << ctx_win; 61 | 62 | // DCT: a sampling rate of 1 Hz with millisecond lengths makes one "sample" equal one input frame. 63 | dctf_opts.num_ceps= cep_order; 64 | dctf_opts.num_bins= ctx_win; 65 | dctf_opts.frame_opts.samp_freq=1; 66 | dctf_opts.frame_opts.frame_length_ms=ctx_win*1000; 67 | dctf_opts.frame_opts.frame_shift_ms=1000; 68 | dctf_opts.frame_opts.round_to_power_of_two=false; 69 | dctf_opts.frame_opts.window_type="rectangular"; 70 | dctf_opts.frame_opts.remove_dc_offset=false; 71 | dctf_opts.frame_opts.dither=0.0; 72 | dctf_opts.frame_opts.preemph_coeff=0.0; 73 | Dctf dctf(dctf_opts); 74 | 75 | for
(;!feat_reader.Done(); feat_reader.Next()) { 76 | const std::string utt = feat_reader.Key(); 77 | const Matrix<BaseFloat> &feats = feat_reader.Value(); 78 | if (feats.NumRows() > ctx_win ) { 79 | Matrix<BaseFloat> to_write(feats.NumRows(), feats.NumCols()*cep_order); 80 | for (int32 i = 0; i < feats.NumCols(); i++) { 81 | Vector<BaseFloat> featvec(feats.NumRows()); 82 | featvec.CopyColFromMat(feats, i); 83 | Matrix<BaseFloat> features; 84 | try { 85 | dctf.Compute(featvec, &features, NULL); 86 | } catch (...) { 87 | KALDI_WARN << "Failed to compute features for utterance " 88 | << utt; 89 | continue; 90 | } 91 | to_write.Range(0, features.NumRows(), i*cep_order, cep_order).CopyFromMat(features); // block i starts at column i*cep_order so blocks never overlap 92 | // Repeat last frame to make output of same length as input 93 | for (int32 j = 0; j < ctx_win-1; j++) { 94 | to_write.Row(features.NumRows() + j).Range(i*cep_order, cep_order).CopyFromVec(features.Row(features.NumRows()-1)); 95 | } 96 | } 97 | feat_writer.Write(utt, to_write); 98 | num_done++; 99 | } else { KALDI_WARN << "Skipping utterance " << utt << ": " << feats.NumRows() << " frames is not more than --ctx-win=" << ctx_win; } 100 | } 101 | return (num_done != 0 ? 0 : 1); 102 | } catch(const std::exception &e) { 103 | std::cerr << e.what(); 104 | return -1; 105 | } 106 | } 107 | 108 | 109 | -------------------------------------------------------------------------------- /feat/feature-gtf.h: -------------------------------------------------------------------------------- 1 | // feat/feature-gtf.h 2 | 3 | // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 
15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | #ifndef KALDI_FEAT_FEATURE_MFCC_LOGMEL_H_ 19 | #define KALDI_FEAT_FEATURE_MFCC_LOGMEL_H_ 20 | 21 | #include 22 | 23 | #include "feat/feature-functions.h" 24 | #include "feat/feature-window.h" 25 | #include 26 | using namespace std; 27 | 28 | namespace kaldi { 29 | /// @addtogroup feat FeatureExtraction 30 | /// @{ 31 | 32 | 33 | /// GtfOptions contains basic options for computing MFCC features 34 | /// It only includes things that can be done in a "stateless" way, i.e. 35 | /// it does not include energy max-normalization. 36 | /// It does not include delta computation. 37 | struct GtfOptions { 38 | FrameExtractionOptions frame_opts; 39 | bool apply_dct; // make GFCC, else GF filtered spectra 40 | int32 num_bins; // e.g. 13: num cepstral coeffs, counting zero. 41 | int32 num_ceps; // e.g. 23: num cepstral coeffs, counting zero. 42 | bool use_c0; // use c0; else removed from feature vector 43 | bool use_energy; // use energy; else C0 44 | BaseFloat energy_floor; 45 | bool raw_energy; // compute energy before preemphasis and hamming window (else after) 46 | // if 0.0, no liftering is done. 47 | bool htk_compat; // if true, put energy/C0 last and introduce a factor of sqrt(2) 48 | // on C0 to be the same as HTK. 49 | 50 | GtfOptions(): apply_dct(false), 51 | num_bins(64), 52 | num_ceps(23), 53 | use_c0(true), 54 | use_energy(true), 55 | energy_floor(0.0), // not in log scale: a small value e.g. 
1.0e-10 56 | raw_energy(true), 57 | htk_compat(false) { } 58 | void Register(ParseOptions *po) { 59 | frame_opts.Register(po); 60 | // gtf_opts.Register(po); 61 | po->Register("apply_dct", &apply_dct, "Apply DCT transform implies computing GFCC, else compute GammaTone filtered Spectra (GTF)"); 62 | po->Register("num-bins", &num_bins, "Number of Gammatone filterbanks"); 63 | po->Register("num-ceps", &num_ceps, "Number of cepstra in GFCC computation (including C0)"); 64 | po->Register("use-c0", &use_c0, "Use c0 (C0) in GFCC computation"); 65 | po->Register("use-energy", &use_energy, "Use energy (not C0) in GFCC computation"); 66 | po->Register("energy-floor", &energy_floor, "Floor on energy (absolute, not relative) in GFCC computation"); 67 | po->Register("raw-energy", &raw_energy, "If true, compute energy (if using energy) before Hamming window and preemphasis"); 68 | po->Register("htk-compat", &htk_compat, "If true, put energy or C0 last and put factor of sqrt(2) on C0. Warning: not sufficient to get HTK compatible features (need to change other parameters)."); 69 | } 70 | 71 | }; 72 | 73 | class MelBanks; 74 | 75 | 76 | /// Class for computing MFCC features; see \ref feat_mfcc for more information. 77 | class Gtf { 78 | public: 79 | Gtf(const GtfOptions &opts); 80 | ~Gtf(); 81 | 82 | int32 Dim() { return opts_.num_ceps; } 83 | 84 | /// Will throw exception on failure (e.g. if file too short for even one 85 | /// frame). The output "wave_remainder" is the last frame or two of the 86 | /// waveform that it would be necessary to include in the next call to Compute 87 | /// for the same utterance. It is not exactly the un-processed part (it may 88 | /// have been partly processed), it's the start of the next window that we 89 | /// have not already processed. Will throw exception on failure (e.g. if file 90 | /// too short for even one frame). 
91 | void Compute(const VectorBase &wave, 92 | BaseFloat vtln_warp, 93 | Matrix *output, 94 | Vector *wave_remainder = NULL); 95 | 96 | private: 97 | void ComputeGammatoneMatrix(Matrix *gammatone_matrix_); 98 | Vector GetCosine(Vector vector); 99 | const MelBanks *GetMelBanks(BaseFloat vtln_warp); 100 | GtfOptions opts_; 101 | Vector lifter_coeffs_; 102 | Matrix dct_matrix_; // matrix we left-multiply by to perform DCT. 103 | Matrix gammatone_matrix_; // gammatone matrix 104 | BaseFloat log_energy_floor_; 105 | std::map mel_banks_; // BaseFloat is VTLN coefficient. 106 | FeatureWindowFunction feature_window_function_; 107 | SplitRadixRealFft *srfft_; 108 | KALDI_DISALLOW_COPY_AND_ASSIGN(Gtf); 109 | }; 110 | 111 | 112 | /// @} End of "addtogroup feat" 113 | }// namespace kaldi 114 | 115 | 116 | #endif 117 | -------------------------------------------------------------------------------- /feat/feature-gabor.h: -------------------------------------------------------------------------------- 1 | // feat/feature-gabor.h 2 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 10 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 11 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 12 | // MERCHANTABLITY OR NON-INFRINGEMENT. 13 | // See the Apache 2 License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #ifndef KALDI_FEAT_FEATURE_GABOR_H_ 17 | #define KALDI_FEAT_FEATURE_GABOR_H_ 18 | 19 | #include 20 | 21 | #include "feat/feature-functions.h" 22 | #include "transform/featxtra-functions.h" 23 | #include "feat/feature-window.h" 24 | #include "feat/mel-computations.h" 25 | 26 | #include 27 | using namespace std; 28 | 29 | namespace kaldi { 30 | /// @addtogroup feat FeatureExtraction 31 | /// @{ 32 | 33 | struct GaborOptions { 34 | FrameExtractionOptions frame_opts; 35 | MelBanksOptions mel_opts; 36 | BaseFloat energy_floor; 37 | bool use_cubed_root; 38 | int32 padding_time; 39 | int32 padding_freq; 40 | bool use_reflective_padding; 41 | int32 nb_mod_freq; 42 | bool use_real; 43 | 44 | GaborOptions(): mel_opts(24), // e.g. 24: num spectrogram frequency bins. 45 | energy_floor(0.0), // not in log scale: a small value e.g. 1.0e-10 46 | use_cubed_root(false), 47 | padding_time(50), 48 | padding_freq(0), 49 | use_reflective_padding(true), // otherwise use zero padding 50 | nb_mod_freq(2), 51 | use_real(true) {} 52 | 53 | void Register(ParseOptions *po) { 54 | frame_opts.Register(po); 55 | mel_opts.Register(po); 56 | po->Register("energy-floor", &energy_floor, 57 | "Floor on energy (absolute, not relative) in FBANK computation"); 58 | po->Register("use-cubed-root", &use_cubed_root, 59 | "If true, produce cube-root-filterbank (else produce log)."); 60 | po->Register("padding-time", &padding_time, 61 | "Number of frames for padding of spectrogram."); 62 | po->Register("padding-freq", &padding_freq, 63 | "Number of frequency bins for padding of spectrogram."); 64 | po->Register("nb-mod-freq", &nb_mod_freq, 65 | "Number of modulation frequencies for Gabor filter-bank."); 66 | po->Register("use-reflective-padding", &use_reflective_padding, 67 | "Use reflective padding of spectrogram (else use zero padding)."); 68 | po->Register("use-real", &use_real, 69 | "Use real output of GFB filtered spectrogram (else use imaginary)."); 70 | } 71 | }; 72 | 73 | 74 | class 
MelBanks; 75 | 76 | 77 | class Gabor { 78 | public: 79 | Gabor(const GaborOptions &opts); 80 | ~Gabor(); 81 | 82 | //int32 Dim() { return opts_.mel_opts; } 83 | 84 | 85 | void Compute(const VectorBase &wave, 86 | BaseFloat vtln_warp, 87 | Matrix *output, 88 | Vector *wave_remainder = NULL); 89 | 90 | void ApplyPadding(Matrix *spectrogram, 91 | int32 ro, 92 | int32 co, 93 | Matrix *padded_spec); 94 | 95 | void RemovePadding(Matrix input, 96 | int32 ro, 97 | int32 co, 98 | Matrix *output); 99 | 100 | void GFBCalcAxis(Vector omega_max, 101 | Vector size_max, 102 | Vector nu, 103 | Vector distance, 104 | Vector *omega_n, 105 | Vector *omega_k); 106 | 107 | void ComputeGaborFilter(BaseFloat omega_k, 108 | BaseFloat omega_n, 109 | Vector nu, 110 | Vector size_max, 111 | Matrix *gfilter_real, 112 | Matrix *gfilter_imag); 113 | 114 | void ComputeHannWindow(BaseFloat width, 115 | Vector *window); 116 | 117 | void ComputeMagnitude(Matrix real, 118 | Matrix imag, 119 | Matrix *mag); 120 | 121 | void ApplyGaborFilter(Matrix gfilter_real, 122 | Matrix gfilter_imag, 123 | Matrix spectrogram, 124 | Matrix *gfilter_spec_real, 125 | Matrix *gfilter_spec_imag); 126 | 127 | void FftConv2(Matrix in1_real, 128 | Matrix in1_imag, 129 | Matrix in2_real, 130 | Matrix in2_imag, 131 | Matrix *out_real, 132 | Matrix *out_imag); 133 | 134 | void GFBSelectRep(Matrix gfilter_real, 135 | Matrix gfilter_imag, 136 | Matrix *gfiltered_spec_real, 137 | Matrix *gfiltered_spec_imag); 138 | 139 | 140 | private: 141 | const MelBanks *GetMelBanks(BaseFloat vtln_warp); 142 | GaborOptions opts_; 143 | std::map mel_banks_; // BaseFloat is VTLN coefficient. 
144 | FeatureWindowFunction feature_window_function_; 145 | SplitRadixRealFft *srfft_; 146 | KALDI_DISALLOW_COPY_AND_ASSIGN(Gabor); 147 | }; 148 | 149 | } 150 | 151 | #endif // KALDI_FEAT_FEATURE_GABOR_H_ 152 | -------------------------------------------------------------------------------- /featbin/apply-vad-merged.cc: -------------------------------------------------------------------------------- 1 | // featbin/apply-vad.cc 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 17 | // See the Apache 2 License for the specific language governing permissions and 18 | // limitations under the License. 
19 | 20 | #include "base/kaldi-common.h" 21 | #include "util/common-utils.h" 22 | #include "matrix/kaldi-matrix.h" 23 | #include "transform/featxtra-functions.h" 24 | 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | 29 | const char *usage = 30 | "Apply voice activity detection processing on a frame sequence of speech/non-speech probabilities \n" 31 | "Usage: apply-vad-merged [options] in-rspecifier out-wspecifier\n"; 32 | 33 | ParseOptions po(usage); 34 | std::string spk2utt_rspecifier; 35 | int32 prb_pow = 2; 36 | int32 ctx_win = 40; 37 | BaseFloat vad_thr = 0.4; 38 | bool prb_str = false; 39 | 40 | po.Register("prb-pow", &prb_pow, "Power of probability stream prior " 41 | "to median filtering."); 42 | po.Register("ctx-win", &ctx_win, "Number of frames of median filtering."); 43 | po.Register("vad-thr", &vad_thr, "VAD threshold."); 44 | po.Register("prb-str", &prb_str, "Median filtered probability stream."); 45 | po.Register("spk2utt", &spk2utt_rspecifier, "rspecifier for speaker to " 46 | "utterance-list map"); 47 | po.Read(argc, argv); 48 | 49 | if (po.NumArgs() != 2) { 50 | po.PrintUsage(); 51 | exit(1); 52 | } 53 | 54 | int32 num_done = 0, num_err = 0; 55 | std::string rspecifier = po.GetArg(1); 56 | std::string wspecifier = po.GetArg(2); 57 | 58 | KALDI_ASSERT(vad_thr > 0 || ctx_win > 0); 59 | 60 | BaseFloatMatrixWriter kaldi_writer(wspecifier); 61 | SequentialBaseFloatMatrixReader kaldi_reader(rspecifier); 62 | 63 | if (spk2utt_rspecifier != "") { 64 | SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); 65 | RandomAccessBaseFloatMatrixReader kaldi_reader(rspecifier); 66 | 67 | for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { 68 | std::string spk = spk2utt_reader.Key(); 69 | const std::vector &uttlist = spk2utt_reader.Value(); 70 | Matrix spkfeats; 71 | for (size_t i = 0; i < uttlist.size(); i++) { 72 | std::string utt = uttlist[i]; 73 | if (!kaldi_reader.HasKey(utt)) { 74 | KALDI_WARN << "Did not find 
features for utterance " << utt; 75 | num_err++; 76 | continue; 77 | } 78 | const Matrix &uttfeats = kaldi_reader.Value(utt); 79 | spkfeats.Resize(spkfeats.NumRows() + uttfeats.NumRows(), 80 | 1, kCopyData); 81 | if (utt.find("_NT") == std::string::npos) { 82 | spkfeats.Range(spkfeats.NumRows() - uttfeats.NumRows(), 83 | uttfeats.NumRows(), 0, 1).CopyFromMat(uttfeats); 84 | } 85 | num_done++; 86 | } 87 | 88 | if (spkfeats.NumRows() == 0) { 89 | KALDI_WARN << "No stats accumulated for speaker " << spk; 90 | } else { 91 | 92 | int32 nb_frames=spkfeats.NumRows(); 93 | KALDI_ASSERT(ctx_win < nb_frames); 94 | Vector vad_out; 95 | // Mean of the probability streams 96 | ApplyColMean(spkfeats, &vad_out); 97 | // Take square of the probability streams 98 | vad_out.ApplyPow(prb_pow); 99 | // Median filtering of the resulting VAD probability stream 100 | ApplyMedianfiltering(ctx_win, &vad_out); 101 | 102 | Matrix to_write; 103 | if (!prb_str) { 104 | to_write.Resize(nb_frames, 1); 105 | to_write.CopyColFromVec(vad_out, 0); 106 | // Apply thresholding 107 | to_write.Add(-vad_thr); 108 | to_write.ApplyHeaviside(); 109 | } else { 110 | to_write.Resize(nb_frames, 2); 111 | to_write.CopyColFromVec(vad_out, 0); 112 | Vector prob_stream = vad_out; 113 | // Apply thresholding 114 | to_write.Add(-vad_thr); 115 | to_write.ApplyHeaviside(); 116 | // concatenation of the probability stream 117 | to_write.Resize(nb_frames, 2, kCopyData); 118 | to_write.Range(0, nb_frames, 1, 1).CopyColFromVec(prob_stream, 0); 119 | } 120 | 121 | // Write 122 | kaldi_writer.Write(spk, to_write); 123 | 124 | KALDI_LOG << "Done accumulating vad labels for speaker " << spk 125 | << " for " << num_done << " segments; " 126 | << num_err << " had errors; " 127 | << nb_frames << " frames."; 128 | } 129 | } 130 | } 131 | //int32 k = 0; 132 | //for (; !kaldi_reader.Done() ; kaldi_reader.Next(), k++) { 133 | // std::string utt = kaldi_reader.Key(); 134 | // const Matrix &feats = kaldi_reader.Value(); 135 | 136 | // 
KALDI_ASSERT(ctx_win < feats.NumRows()); 137 | // Vector vad_out; 138 | 139 | // // Mean of the probability streams 140 | // ApplyColMean(feats, &vad_out); 141 | // // Median filtering of the resulting VAD probability stream 142 | // ApplyMedianfiltering(ctx_win, &vad_out); 143 | 144 | // Matrix to_write(feats.NumRows(), 1); 145 | // to_write.CopyColFromVec(vad_out, 0); 146 | 147 | // // Apply thresholding 148 | // to_write.Add(-vad_thr); 149 | // to_write.ApplyHeaviside(); 150 | 151 | // kaldi_writer.Write(utt, to_write); 152 | //} 153 | //return 0; 154 | } catch(const std::exception &e) { 155 | std::cerr << e.what(); 156 | return -1; 157 | } 158 | } 159 | 160 | 161 | -------------------------------------------------------------------------------- /featbin/compute-gabor-feats.cc: -------------------------------------------------------------------------------- 1 | // featbin/compute-gabor-feats.cc 2 | 3 | // Copyright 2014 SAIL (authors: JG & DB) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | #include "base/kaldi-common.h" 19 | #include "util/common-utils.h" 20 | #include "feat/feature-gabor.h" 21 | #include "feat/wave-reader.h" 22 | #include 23 | using namespace std; 24 | 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | const char *usage = 29 | "Create Gabor feature files.\n" 30 | "Usage: compute-gabor-feats [options...]

\n"; 31 | 32 | // construct all the global objects 33 | ParseOptions po(usage); 34 | GaborOptions gabor_opts; 35 | bool subtract_mean = false; 36 | BaseFloat vtln_warp = 1.0; 37 | std::string vtln_map_rspecifier; 38 | std::string utt2spk_rspecifier; 39 | int32 channel = -1; 40 | BaseFloat min_duration = 0.0; 41 | // Define defaults for gobal options 42 | std::string output_format = "kaldi"; 43 | 44 | // Register the Gabor option struct 45 | gabor_opts.Register(&po); 46 | 47 | // Register the options 48 | po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]"); 49 | po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. "); 50 | po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable if vtln-map not specified)"); 51 | po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or speaker-id to vtln warp factor (rspecifier)"); 52 | po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map (if doing VTLN and you have warps per speaker)"); 53 | po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); 54 | po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds)."); 55 | 56 | // OPTION PARSING .......................................................... 57 | // 58 | 59 | // parse options (+filling the registered variables) 60 | po.Read(argc, argv); 61 | 62 | if (po.NumArgs() != 2) { 63 | po.PrintUsage(); 64 | exit(1); 65 | } 66 | 67 | std::string wav_rspecifier = po.GetArg(1); 68 | 69 | std::string output_wspecifier = po.GetArg(2); 70 | 71 | Gabor gabor(gabor_opts); 72 | 73 | SequentialTableReader reader(wav_rspecifier); 74 | BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
75 | TableWriter htk_writer; 76 | 77 | if (utt2spk_rspecifier != "") 78 | KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " 79 | "needed if the vtln-map option is used."); 80 | RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, 81 | utt2spk_rspecifier); 82 | 83 | if (output_format == "kaldi") { 84 | if (!kaldi_writer.Open(output_wspecifier)) 85 | KALDI_ERR << "Could not initialize output with wspecifier " 86 | << output_wspecifier; 87 | } else if (output_format == "htk") { 88 | if (!htk_writer.Open(output_wspecifier)) 89 | KALDI_ERR << "Could not initialize output with wspecifier " 90 | << output_wspecifier; 91 | } else { 92 | KALDI_ERR << "Invalid output_format string " << output_format; 93 | } 94 | 95 | int32 num_utts = 0, num_success = 0; 96 | for (; !reader.Done(); reader.Next()) { 97 | num_utts++; 98 | std::string utt = reader.Key(); 99 | const WaveData &wave_data = reader.Value(); 100 | if (wave_data.Duration() < min_duration) { 101 | KALDI_WARN << "File: " << utt << " is too short (" 102 | << wave_data.Duration() << " sec): producing no output."; 103 | continue; 104 | } 105 | int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; 106 | { // This block works out the channel (0=left, 1=right...) 107 | KALDI_ASSERT(num_chan > 0); // should have been caught in 108 | // reading code if no channels. 109 | if (channel == -1) { 110 | this_chan = 0; 111 | if (num_chan != 1) 112 | KALDI_WARN << "Channel not specified but you have data with " 113 | << num_chan << " channels; defaulting to zero"; 114 | } else { 115 | if (this_chan >= num_chan) { 116 | KALDI_WARN << "File with id " << utt << " has " 117 | << num_chan << " channels but you specified channel " 118 | << channel << ", producing no output."; 119 | continue; 120 | } 121 | } 122 | } 123 | BaseFloat vtln_warp_local; // Work out VTLN warp factor. 
124 | if (vtln_map_rspecifier != "") { 125 | if (!vtln_map_reader.HasKey(utt)) { 126 | KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) " 127 | << utt; 128 | continue; 129 | } 130 | vtln_warp_local = vtln_map_reader.Value(utt); 131 | } else { 132 | vtln_warp_local = vtln_warp; 133 | } 134 | if (gabor_opts.frame_opts.samp_freq != wave_data.SampFreq()) 135 | KALDI_ERR << "Sample frequency mismatch: you specified " 136 | << gabor_opts.frame_opts.samp_freq << " but data has " 137 | << wave_data.SampFreq() << " (use --sample-frequency option)"; 138 | 139 | SubVector waveform(wave_data.Data(), this_chan); 140 | Matrix features; 141 | try { 142 | gabor.Compute(waveform, vtln_warp_local, &features, NULL); 143 | } catch (...) { 144 | KALDI_WARN << "Failed to compute features for utterance " 145 | << utt; 146 | continue; 147 | } 148 | if (subtract_mean) { 149 | Vector mean(features.NumCols()); 150 | mean.AddRowSumMat(1.0, features); 151 | mean.Scale(1.0 / features.NumRows()); 152 | for (int32 i = 0; i < features.NumRows(); i++) 153 | features.Row(i).AddVec(-1.0, mean); 154 | } 155 | if (output_format == "kaldi") { 156 | kaldi_writer.Write(utt, features); 157 | } else { 158 | std::pair, HtkHeader> p; 159 | p.first.Resize(features.NumRows(), features.NumCols()); 160 | p.first.CopyFromMat(features); 161 | HtkHeader header = { 162 | features.NumRows(), 163 | 100000, // 10ms shift 164 | static_cast(sizeof(float)*(features.NumCols())), 165 | static_cast( 006 | // Gabor 166 | 020000 ) 167 | }; 168 | p.second = header; 169 | htk_writer.Write(utt, p); 170 | } 171 | if (num_utts % 10 == 0) 172 | KALDI_LOG << "Processed " << num_utts << " utterances"; 173 | KALDI_VLOG(2) << "Processed features for key " << utt; 174 | num_success++; 175 | } 176 | KALDI_LOG << " Done " << num_success << " out of " << num_utts 177 | << " utterances."; 178 | return (num_success != 0 ? 
0 : 1); 179 | } catch(const std::exception &e) { 180 | std::cerr << e.what(); 181 | return -1; 182 | } 183 | } 184 | 185 | -------------------------------------------------------------------------------- /featbin/compute-gtf-feats.cc: -------------------------------------------------------------------------------- 1 | // featbin/compute-gtf-feats.cc 2 | 3 | // Copyright 2009-2012 Microsoft Corporation 4 | // Johns Hopkins University (author: Daniel Povey) 5 | 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | // MERCHANTABLITY OR NON-INFRINGEMENT. 16 | // See the Apache 2 License for the specific language governing permissions and 17 | // limitations under the License. 18 | 19 | #include "base/kaldi-common.h" 20 | #include "util/common-utils.h" 21 | #include "feat/feature-gtf.h" 22 | #include "feat/wave-reader.h" 23 | #include 24 | using namespace std; 25 | 26 | int main(int argc, char *argv[]) { 27 | try { 28 | using namespace kaldi; 29 | const char *usage = 30 | "Create GT feature files.\n" 31 | "Usage: compute-gtf-feats [options...]

\n"; 32 | 33 | // construct all the global objects 34 | ParseOptions po(usage); 35 | GtfOptions gtf_opts; 36 | bool subtract_mean = false; 37 | BaseFloat vtln_warp = 1.0; 38 | std::string vtln_map_rspecifier; 39 | std::string utt2spk_rspecifier; 40 | int32 channel = -1; 41 | BaseFloat min_duration = 0.0; 42 | // Define defaults for gobal options 43 | std::string output_format = "kaldi"; 44 | 45 | // Register the GTF option struct 46 | gtf_opts.Register(&po); 47 | 48 | // Register the options 49 | po.Register("output-format", &output_format, "Format of the output files [kaldi, htk]"); 50 | po.Register("subtract-mean", &subtract_mean, "Subtract mean of each feature file [CMS]; not recommended to do it this way. "); 51 | po.Register("vtln-warp", &vtln_warp, "Vtln warp factor (only applicable if vtln-map not specified)"); 52 | po.Register("vtln-map", &vtln_map_rspecifier, "Map from utterance or speaker-id to vtln warp factor (rspecifier)"); 53 | po.Register("utt2spk", &utt2spk_rspecifier, "Utterance to speaker-id map (if doing VTLN and you have warps per speaker)"); 54 | po.Register("channel", &channel, "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)"); 55 | po.Register("min-duration", &min_duration, "Minimum duration of segments to process (in seconds)."); 56 | 57 | // OPTION PARSING .......................................................... 58 | // 59 | 60 | // parse options (+filling the registered variables) 61 | po.Read(argc, argv); 62 | 63 | if (po.NumArgs() != 2) { 64 | po.PrintUsage(); 65 | exit(1); 66 | } 67 | 68 | std::string wav_rspecifier = po.GetArg(1); 69 | 70 | std::string output_wspecifier = po.GetArg(2); 71 | 72 | Gtf gtf(gtf_opts); 73 | 74 | SequentialTableReader reader(wav_rspecifier); 75 | BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter. 
76 | TableWriter htk_writer; 77 | 78 | if (utt2spk_rspecifier != "") 79 | KALDI_ASSERT(vtln_map_rspecifier != "" && "the utt2spk option is only " 80 | "needed if the vtln-map option is used."); 81 | RandomAccessBaseFloatReaderMapped vtln_map_reader(vtln_map_rspecifier, 82 | utt2spk_rspecifier); 83 | 84 | if (output_format == "kaldi") { 85 | if (!kaldi_writer.Open(output_wspecifier)) 86 | KALDI_ERR << "Could not initialize output with wspecifier " 87 | << output_wspecifier; 88 | } else if (output_format == "htk") { 89 | if (!htk_writer.Open(output_wspecifier)) 90 | KALDI_ERR << "Could not initialize output with wspecifier " 91 | << output_wspecifier; 92 | } else { 93 | KALDI_ERR << "Invalid output_format string " << output_format; 94 | } 95 | 96 | int32 num_utts = 0, num_success = 0; 97 | for (; !reader.Done(); reader.Next()) { 98 | num_utts++; 99 | std::string utt = reader.Key(); 100 | const WaveData &wave_data = reader.Value(); 101 | if (wave_data.Duration() < min_duration) { 102 | KALDI_WARN << "File: " << utt << " is too short (" 103 | << wave_data.Duration() << " sec): producing no output."; 104 | continue; 105 | } 106 | int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; 107 | { // This block works out the channel (0=left, 1=right...) 108 | KALDI_ASSERT(num_chan > 0); // should have been caught in 109 | // reading code if no channels. 110 | if (channel == -1) { 111 | this_chan = 0; 112 | if (num_chan != 1) 113 | KALDI_WARN << "Channel not specified but you have data with " 114 | << num_chan << " channels; defaulting to zero"; 115 | } else { 116 | if (this_chan >= num_chan) { 117 | KALDI_WARN << "File with id " << utt << " has " 118 | << num_chan << " channels but you specified channel " 119 | << channel << ", producing no output."; 120 | continue; 121 | } 122 | } 123 | } 124 | BaseFloat vtln_warp_local; // Work out VTLN warp factor. 
125 | if (vtln_map_rspecifier != "") { 126 | if (!vtln_map_reader.HasKey(utt)) { 127 | KALDI_WARN << "No vtln-map entry for utterance-id (or speaker-id) " 128 | << utt; 129 | continue; 130 | } 131 | vtln_warp_local = vtln_map_reader.Value(utt); 132 | } else { 133 | vtln_warp_local = vtln_warp; 134 | } 135 | if (gtf_opts.frame_opts.samp_freq != wave_data.SampFreq()) 136 | KALDI_ERR << "Sample frequency mismatch: you specified " 137 | << gtf_opts.frame_opts.samp_freq << " but data has " 138 | << wave_data.SampFreq() << " (use --sample-frequency option)"; 139 | 140 | SubVector waveform(wave_data.Data(), this_chan); 141 | Matrix features; 142 | try { 143 | gtf.Compute(waveform, vtln_warp_local, &features, NULL); 144 | } catch (...) { 145 | KALDI_WARN << "Failed to compute features for utterance " 146 | << utt; 147 | continue; 148 | } 149 | if (subtract_mean) { 150 | Vector mean(features.NumCols()); 151 | mean.AddRowSumMat(1.0, features); 152 | mean.Scale(1.0 / features.NumRows()); 153 | for (int32 i = 0; i < features.NumRows(); i++) 154 | features.Row(i).AddVec(-1.0, mean); 155 | } 156 | if (output_format == "kaldi") { 157 | kaldi_writer.Write(utt, features); 158 | } else { 159 | std::pair, HtkHeader> p; 160 | p.first.Resize(features.NumRows(), features.NumCols()); 161 | p.first.CopyFromMat(features); 162 | HtkHeader header = { 163 | features.NumRows(), 164 | 100000, // 10ms shift 165 | sizeof(float)*features.NumCols(), 166 | 006 | // GTF 167 | (gtf_opts.use_energy ? 0100 : 020000) // energy; otherwise c0 168 | }; 169 | p.second = header; 170 | htk_writer.Write(utt, p); 171 | } 172 | if (num_utts % 10 == 0) 173 | KALDI_LOG << "Processed " << num_utts << " utterances"; 174 | KALDI_VLOG(2) << "Processed features for key " << utt; 175 | num_success++; 176 | } 177 | KALDI_LOG << " Done " << num_success << " out of " << num_utts 178 | << " utterances."; 179 | return (num_success != 0 ? 
0 : 1); 180 | } catch(const std::exception &e) { 181 | std::cerr << e.what(); 182 | return -1; 183 | } 184 | } 185 | 186 | -------------------------------------------------------------------------------- /feat/feature-gtf.cc: -------------------------------------------------------------------------------- 1 | // feat/feature-gtf.cc 2 | 3 | // Copyright 2009-2011 Karel Vesely; Petr Motlicek 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 17 | 18 | 19 | #include "feat/feature-gtf.h" 20 | #include "feat/feature-window.h" 21 | 22 | namespace kaldi { 23 | 24 | Gtf::Gtf(const GtfOptions &opts): 25 | opts_(opts), 26 | feature_window_function_(opts.frame_opts), 27 | srfft_(NULL) { 28 | int num_bins = opts.num_bins; 29 | int num_ceps = opts.num_ceps; 30 | Matrix dct_matrix(num_bins, num_bins); 31 | ComputeDctMatrix(&dct_matrix); 32 | // Note that we include zeroth dct in either case. If using the 33 | // energy we replace this with the energy. This means a different 34 | // ordering of features than HTK. 35 | SubMatrix dct_rows(dct_matrix, 0, num_ceps, 0, num_bins); 36 | dct_matrix_.Resize(num_ceps, num_bins); 37 | dct_matrix_.CopyFromMat(dct_rows); // subset of rows. 
38 | 39 | ComputeGammatoneMatrix(&gammatone_matrix_); 40 | //KALDI_WARN << gammatone_matrix_ ; 41 | 42 | if (opts.energy_floor != 0.0) 43 | log_energy_floor_ = log(opts.energy_floor); 44 | 45 | int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); 46 | if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... 47 | srfft_ = new SplitRadixRealFft(padded_window_size); 48 | } 49 | 50 | Gtf::~Gtf() { 51 | for (std::map::iterator iter = mel_banks_.begin(); 52 | iter != mel_banks_.end(); 53 | ++iter) 54 | delete iter->second; 55 | if (srfft_) 56 | delete srfft_; 57 | } 58 | 59 | Vector Gtf::GetCosine(Vector vector) { 60 | Vector vector_out(vector); 61 | for (MatrixIndexT i = 0; i < vector.Dim(); i++) vector_out(i) = cos(vector(i)); 62 | return vector_out; 63 | } 64 | 65 | void Gtf::ComputeGammatoneMatrix(Matrix *gammatone_matrix_) { 66 | 67 | // define variables 68 | int nfilts = opts_.num_bins; 69 | int nfft = opts_.frame_opts.PaddedWindowSize(); 70 | int sample_freq = opts_.frame_opts.samp_freq; 71 | BaseFloat width = 0.5; 72 | BaseFloat maxfreq = sample_freq/2; 73 | int minfreq = 50; 74 | int maxlen = nfft/2; 75 | 76 | gammatone_matrix_->Resize(nfilts, maxlen); 77 | 78 | // fixed constants 79 | BaseFloat EarQ = 9.26449; 80 | BaseFloat minBW = 24.7; 81 | int order = 1; 82 | Vector ucirc_real(maxlen); 83 | Vector ucirc_imag(maxlen); 84 | for (MatrixIndexT i = 0; i < maxlen; i++) { 85 | ucirc_real(i) = cos(M_2PI*i/nfft); 86 | ucirc_imag(i) = sin(M_2PI*i/nfft); 87 | } 88 | 89 | BaseFloat ERB, B, r, cf, theta, pole_real, pole_imag, T, A11, A12, A13, A14; 90 | BaseFloat p0r, p0i, p1r, p1i, p2, p3, p4; 91 | BaseFloat g0r, g0i, g1r, g1i, g2r, g2i, g3r, g3i, g4r, g4i, g5r, g5i; 92 | BaseFloat gain; 93 | for (int32 i = 1; i <= nfilts; i++) { 94 | 95 | cf = -(EarQ*minBW) + exp((nfilts+1-i)*(-log(maxfreq + EarQ*minBW) + log(minfreq + EarQ*minBW))/nfilts) * (maxfreq + EarQ*minBW); 96 | ERB = width*pow(pow(cf/EarQ,order) + pow(minBW,order),1/order); 97 
| B = 1.019*M_2PI*ERB; 98 | r = exp(-B/sample_freq); 99 | theta = M_2PI*cf/sample_freq; 100 | pole_real = r*cos(theta); 101 | pole_imag = r*sin(theta); 102 | 103 | T = 1.0/sample_freq; 104 | A11 = (2*T*cos(M_2PI*cf*T)/exp(B*T) + 2*sqrt(3+pow(2,1.5))*T*sin(M_2PI*cf*T)/exp(B*T))/(2*T); 105 | A12 = (2*T*cos(M_2PI*cf*T)/exp(B*T) - 2*sqrt(3+pow(2,1.5))*T*sin(M_2PI*cf*T)/exp(B*T))/(2*T); 106 | A13 = (2*T*cos(M_2PI*cf*T)/exp(B*T) + 2*sqrt(3-pow(2,1.5))*T*sin(M_2PI*cf*T)/exp(B*T))/(2*T); 107 | A14 = (2*T*cos(M_2PI*cf*T)/exp(B*T) - 2*sqrt(3-pow(2,1.5))*T*sin(M_2PI*cf*T)/exp(B*T))/(2*T); 108 | 109 | ComplexImExp(static_cast(2*cf*M_2PI*T), &p0r, &p0i ); 110 | ComplexImExp(static_cast(cf*M_2PI*T), &p1r, &p1i ); 111 | p1r*=2*exp(-B*T)*T; p1i*=2*exp(-B*T)*T; 112 | p2=cos(cf*M_2PI*T); 113 | p3=sqrt(3 - pow(2,1.5))* sin(cf*M_2PI*T); 114 | p4=sqrt(3 + pow(2,1.5))* sin(cf*M_2PI*T); 115 | 116 | g0r = -2*T*p0r+p1r*(p2-p3); g0i = -2*T*p0i+p1i*(p2-p3); 117 | g1r = -2*T*p0r+p1r*(p2+p3); g1i = -2*T*p0i+p1i*(p2+p3); 118 | g2r = -2*T*p0r+p1r*(p2-p4); g2i = -2*T*p0i+p1i*(p2-p4); 119 | g3r = -2*T*p0r+p1r*(p2+p4); g3i = -2*T*p0i+p1i*(p2+p4); 120 | g4r = -2*pow(exp(2*B*T),-1) - 2*p0r + 2*exp(-B*T) + 2*exp(-B*T)*p0r; 121 | g4i = -2*p0i + 2*exp(-B*T)*p0i; 122 | ComplexMul(g4r,g4i,&g4r,&g4i); 123 | ComplexMul(g4r,g4i,&g4r,&g4i); 124 | g5r = g4r/(pow(g4r,2)+pow(g4i,2)); 125 | g5i = -g4i/(pow(g4r,2)+pow(g4i,2)); 126 | 127 | ComplexMul(g1r,g1i,&g0r,&g0i); 128 | ComplexMul(g2r,g2i,&g0r,&g0i); 129 | ComplexMul(g3r,g3i,&g0r,&g0i); 130 | ComplexMul(g5r,g5i,&g0r,&g0i); 131 | 132 | gain = sqrt(pow(g0r,2) + pow(g0i,2)); 133 | 134 | Vector gtcol_(maxlen); 135 | BaseFloat g6r, g6i, g6ic, g7; 136 | for (MatrixIndexT j = 0; j < maxlen; j++) { 137 | g6r = pole_real - ucirc_real(j); 138 | g6i = pole_imag - ucirc_imag(j); 139 | g6ic = -pole_imag - ucirc_imag(j); 140 | ComplexMul(g6r,g6ic,&g6r,&g6i); 141 | g7 = pow(sqrt(pow(g6r,2) + pow(g6i,2)),-4); 142 | gtcol_(j) = sqrt(pow(ucirc_real(j)-A11,2) + 
pow(ucirc_imag(j),2)) * sqrt(pow(ucirc_real(j)-A12,2) + pow(ucirc_imag(j),2)) * 143 | sqrt(pow(ucirc_real(j)-A13,2) + pow(ucirc_imag(j),2)) * sqrt(pow(ucirc_real(j)-A14,2) + pow(ucirc_imag(j),2)) ; 144 | gtcol_(j) *= (pow(T,4)/gain) ; 145 | gtcol_(j) *= g7 ; 146 | } 147 | gammatone_matrix_->Row(i-1).CopyFromVec(gtcol_); 148 | } 149 | } 150 | 151 | void Gtf::Compute(const VectorBase &wave, 152 | BaseFloat vtln_warp, 153 | Matrix *output, 154 | Vector *wave_remainder) { 155 | assert(output != NULL); 156 | int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); 157 | int32 cols_out = (opts_.apply_dct)? (opts_.use_c0)? opts_.num_ceps : opts_.num_ceps-1 : opts_.num_bins; 158 | if (rows_out == 0) 159 | KALDI_ERR << "Gtf::Compute, no frames fit in file (#samples is " << wave.Dim() << ")"; 160 | output->Resize(rows_out, cols_out); 161 | if (wave_remainder != NULL) 162 | ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); 163 | Vector window; // windowed waveform. 164 | Vector mel_energies; 165 | for (int32 r = 0; r < rows_out; r++) { // r is frame index.. 166 | BaseFloat log_energy; 167 | ExtractWindow(0, wave, r, opts_.frame_opts, feature_window_function_, &window, 168 | (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); 169 | 170 | if (opts_.use_energy && !opts_.raw_energy) 171 | log_energy = VecVec(window, window); 172 | 173 | if (srfft_) srfft_->Compute(window.Data(), true); // Compute FFT using 174 | // split-radix algorithm. 175 | else RealFft(&window, true); // An alternative algorithm that 176 | // works for non-powers-of-two. 177 | 178 | // Convert the FFT into a power spectrum. 
179 | ComputePowerSpectrum(&window); 180 | SubVector power_spectrum(window, 0, window.Dim()/2); 181 | power_spectrum.ApplyPow(0.5); 182 | 183 | SubVector this_gtf(output->Row(r)); 184 | 185 | // GTF 186 | Vector gtf(opts_.num_bins); 187 | gtf.AddMatVec(1.0, gammatone_matrix_, kNoTrans, power_spectrum, 0.0); 188 | gtf.ApplyPow(1.0/3); 189 | if (opts_.apply_dct) { 190 | if (opts_.use_c0) { 191 | this_gtf.AddMatVec(1.0, dct_matrix_, kNoTrans, gtf, 0.0); 192 | } else { 193 | this_gtf.AddMatVec(1.0, dct_matrix_.RowRange(1, dct_matrix_.NumRows()-1), kNoTrans, gtf, 0.0); 194 | } 195 | } else { 196 | this_gtf.CopyFromVec(gtf); 197 | } 198 | 199 | } 200 | } 201 | 202 | 203 | 204 | 205 | 206 | 207 | } // namespace 208 | -------------------------------------------------------------------------------- /transform/featxtra-functions.cc: -------------------------------------------------------------------------------- 1 | // transform/featxtra-functions.cc 2 | 3 | // Copyright 2014 University of Southern California (author: Maarten Van Segbroeck) 4 | 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | // MERCHANTABLITY OR NON-INFRINGEMENT. 15 | // See the Apache 2 License for the specific language governing permissions and 16 | // limitations under the License. 
17 | 18 | #include "transform/featxtra-functions.h" 19 | #include 20 | using std::vector; 21 | 22 | namespace kaldi { 23 | 24 | void ApplyArma(int ar_order, 25 | MatrixBase *feats) { 26 | KALDI_ASSERT(feats != NULL); 27 | 28 | MatrixIndexT dim = feats->NumCols(); 29 | MatrixIndexT num_frames = feats->NumRows(); 30 | Matrix featsmvn(*feats); 31 | 32 | // Apply the normalization. 33 | BaseFloat tmp1, tmp2; 34 | for (int32 d = 0; d < dim; d++) { 35 | tmp1 = 0; 36 | tmp2 = 0; 37 | for (int32 i = 0; i < num_frames-ar_order; i++) { 38 | if (i < ar_order) { 39 | (*feats)(i, d) = 0.01*featsmvn(i, d); // suppress values 40 | } else if (i == ar_order) { 41 | for (int32 k = 0; k < ar_order; k++) { 42 | tmp1 += (*feats)(i-1-k, d); 43 | tmp2 += featsmvn(i+k, d); 44 | } 45 | tmp2 += featsmvn(i+ar_order, d); 46 | (*feats)(i, d) = ( tmp1 + tmp2 ) / ( 2*ar_order + 1 ); 47 | } else { 48 | tmp1 += (*feats)(i-1, d) - (*feats)(i-1-ar_order, d); 49 | tmp2 += featsmvn(i+ar_order, d) - featsmvn(i-1, d); 50 | (*feats)(i, d) = ( tmp1 + tmp2 ) / ( 2*ar_order + 1 ); 51 | } 52 | } 53 | } 54 | } 55 | 56 | void ApplySigmoidScale(BaseFloat sigmoidThr, 57 | BaseFloat sigmoidSlope, 58 | MatrixBase *feats) { 59 | MatrixIndexT num_rows = feats->NumRows(); 60 | MatrixIndexT num_cols = feats->NumCols(); 61 | for (MatrixIndexT r = 0; r < num_rows; r++) { 62 | for (MatrixIndexT c = 0; c < num_cols; c++) { 63 | (*feats)(r, c) = static_cast(1 / (Exp(-1 / sigmoidSlope * 64 | (2 * (*feats)(r, c) - sigmoidThr)) + 1)); 65 | } 66 | } 67 | } 68 | 69 | void ApplyLtsv(int ctx_win, 70 | BaseFloat ltsv_sigmoidSlope, 71 | BaseFloat ltsv_sigmoidThr, 72 | const MatrixBase *feats, 73 | Matrix *ltsv) { 74 | KALDI_ASSERT(feats != NULL); 75 | MatrixIndexT dim = feats->NumCols(); 76 | MatrixIndexT num_frames = feats->NumRows(); 77 | // Resize ctx_win if larger than number of frames 78 | if (num_frames < ctx_win+1) 79 | ctx_win = num_frames-1; 80 | Matrix featsin(num_frames+ctx_win, dim); 81 | SubMatrix 
featsappend(feats->Range(num_frames-ctx_win-1, 82 | ctx_win, 0, dim)); 83 | featsin.Range(0, num_frames, 0, dim).CopyFromMat(*feats); 84 | featsin.Range(num_frames, ctx_win, 0, dim).CopyFromMat(featsappend); 85 | (*ltsv).Resize(num_frames, 1); 86 | 87 | Vector moving_context(dim), ltsv_bins(dim), ltsv_bins_log(dim); 88 | moving_context.CopyFromVec(featsin.Row(0)); 89 | moving_context.Scale(round(ctx_win/2)); 90 | for (int32 k = 0; k < round(ctx_win/2); k++) 91 | moving_context.AddVec(1.0, featsin.Row(k)); 92 | 93 | BaseFloat ltsv_val = 0.0; 94 | for (int32 k = 0; k < num_frames; k++) { 95 | if (k < round(ctx_win/2)) { 96 | moving_context.AddVec(-1.0, featsin.Row(0)); 97 | } else { 98 | moving_context.AddVec(-1.0, featsin.Row(k-round(ctx_win/2))); 99 | } 100 | moving_context.AddVec(1.0, featsin.Row(k+round(ctx_win/2))); 101 | 102 | ltsv_bins.CopyFromVec(featsin.Row(k)); 103 | ltsv_bins.DivElements(moving_context); 104 | ltsv_bins.Scale(100); 105 | ltsv_bins_log.CopyFromVec(ltsv_bins); 106 | ltsv_bins_log.ApplyLog(); 107 | 108 | // entropy 109 | ltsv_bins.MulElements(ltsv_bins_log); 110 | ltsv_bins.Scale(-1); 111 | 112 | // variance 113 | ltsv_bins.Add(-ltsv_bins.Sum()/dim); 114 | ltsv_bins.ApplyPow(2.0); 115 | 116 | // ltsv 117 | if (k < num_frames - round(ctx_win/2)) 118 | ltsv_val = ltsv_bins.Sum()/dim; 119 | (*ltsv)(k, 0) = ltsv_val; 120 | } 121 | // sigmoid 122 | ApplySigmoidScale(ltsv_sigmoidThr, ltsv_sigmoidSlope, ltsv); 123 | } 124 | 125 | void ApplyColSum(const Matrix &data, 126 | Vector *colsum ) { 127 | MatrixIndexT num_cols = data.NumCols(); 128 | MatrixIndexT num_rows = data.NumRows(); 129 | colsum->Resize(num_rows); 130 | for (MatrixIndexT r = 0; r < num_rows; r++) { 131 | (*colsum)(r) = data.Range(r, 1, 0, num_cols).Sum(); 132 | } 133 | } 134 | 135 | void ApplyColMean(const Matrix &data, 136 | Vector *colmean ) { 137 | MatrixIndexT num_cols = data.NumCols(); 138 | ApplyColSum(data, colmean); 139 | colmean->Scale(1.0/num_cols); 140 | } 141 | 142 | void 
ApplySort(VectorBase *s ) { 143 | std::sort(s->Data(), s->Data()+s->Dim()); 144 | } 145 | 146 | void ApplyMedianfiltering(int ctx_win, 147 | VectorBase *data ) { 148 | MatrixIndexT num_singval = data->Dim(); 149 | Vector moving_context; 150 | Vector data_copy(*data); 151 | int ctx_win_half = ctx_win / 2; // integer division 152 | int is_odd_ctx_win = ctx_win % 2; 153 | int data_tail_range_start = num_singval-ctx_win_half+(1-is_odd_ctx_win); 154 | for (int32 k = 0; k < num_singval; k++) { 155 | moving_context.Resize(ctx_win); // reset to zero values 156 | if (k < ctx_win_half) { 157 | moving_context.Range(0, 158 | ctx_win_half+k).CopyFromVec(data_copy.Range(0, 159 | ctx_win_half+k)); // zero padding 160 | } 161 | else if (k >= data_tail_range_start) { 162 | moving_context.Range(0, 163 | ctx_win_half+num_singval-k).CopyFromVec(data_copy.Range(k-ctx_win_half, 164 | ctx_win_half+num_singval-k)); // zero padding 165 | } else { 166 | moving_context.CopyFromVec(data_copy.Range(k-ctx_win_half, ctx_win)); 167 | } 168 | ApplySort(&moving_context); 169 | (*data)(k) = (is_odd_ctx_win == 0 ? 
(moving_context(ctx_win_half) + 170 | moving_context(ctx_win_half-1)) / 2 : moving_context(ctx_win_half)); 171 | } 172 | } 173 | 174 | void ComputeComplexFft(Matrix *real_data, 175 | Matrix *imag_data, 176 | int32 dim0, 177 | int32 dim1, 178 | bool forward_fft) { 179 | // Copy input matrices into matrices of desired dimensionality 180 | real_data->Resize(dim0, dim1, kCopyData); 181 | imag_data->Resize(dim0, dim1, kCopyData); 182 | 183 | // Apply first FFT to the matrix rows 184 | Matrix gfilter_fft(2*dim0, dim1); 185 | for (MatrixIndexT i = 0 ; i < dim0; i++) { 186 | gfilter_fft.Row(i*2).CopyFromVec(real_data->Row(i)); 187 | gfilter_fft.Row(i*2 + 1).CopyFromVec(imag_data->Row(i)); 188 | } 189 | gfilter_fft.Transpose(); 190 | Vector tmp_fft1(2*dim0); 191 | for (MatrixIndexT i = 0 ; i < dim1; i++) { 192 | tmp_fft1.CopyFromVec(gfilter_fft.Row(i)); 193 | ComplexFft(&tmp_fft1, forward_fft); 194 | gfilter_fft.Row(i).CopyFromVec(tmp_fft1); 195 | } 196 | 197 | // Transpose : fft(A).' 198 | gfilter_fft.Transpose(); 199 | Matrix gfilter_fft_imag(dim0, dim1); 200 | Matrix gfilter_fft_real(dim0, dim1); 201 | for (MatrixIndexT i = 0 ; i < dim0; i++) { 202 | gfilter_fft_real.Row(i).CopyFromVec(gfilter_fft.Row(i*2)); 203 | gfilter_fft_imag.Row(i).CopyFromVec(gfilter_fft.Row(i*2 + 1)); 204 | } 205 | gfilter_fft_imag.Transpose(); 206 | gfilter_fft_real.Transpose(); 207 | 208 | // Apply second FFT to the matrix rows : fft(fft(A).') 209 | gfilter_fft.Resize(2*dim1, dim0); 210 | for (MatrixIndexT i = 0 ; i < dim1; i++) { 211 | gfilter_fft.Row(i*2).CopyFromVec(gfilter_fft_real.Row(i)); 212 | gfilter_fft.Row(i*2 + 1).CopyFromVec(gfilter_fft_imag.Row(i)); 213 | } 214 | gfilter_fft.Transpose(); 215 | Vector tmp_fft2(2*dim1); 216 | for (MatrixIndexT i = 0 ; i < dim0; i++) { 217 | tmp_fft2.CopyFromVec(gfilter_fft.Row(i)); 218 | ComplexFft(&tmp_fft2, forward_fft); 219 | gfilter_fft.Row(i).CopyFromVec(tmp_fft2); 220 | } 221 | 222 | // Transpose : fft(fft(A).').' 
223 | gfilter_fft.Transpose(); 224 | for (MatrixIndexT i = 0 ; i < dim1; i++) { 225 | gfilter_fft_real.Row(i).CopyFromVec(gfilter_fft.Row(i*2)); 226 | gfilter_fft_imag.Row(i).CopyFromVec(gfilter_fft.Row(i*2 + 1)); 227 | } 228 | gfilter_fft_imag.Transpose(); 229 | gfilter_fft_real.Transpose(); 230 | 231 | real_data->CopyFromMat(gfilter_fft_real); 232 | imag_data->CopyFromMat(gfilter_fft_imag); 233 | } 234 | 235 | void ComputeComplexFftPow2(Matrix *real_data, 236 | Matrix *imag_data, 237 | int32 dim0, 238 | int32 dim1, 239 | bool forward_fft) { 240 | 241 | if ( (dim0 & (dim0-1)) != 0 || dim0 <= 1) 242 | KALDI_ERR << "ComputeComplexFftPow2 called with invalid number of points " 243 | << dim0; 244 | if ( (dim1 & (dim1-1)) != 0 || dim1 <= 1) 245 | KALDI_ERR << "ComputeComplexFftPow2 called with invalid number of points " 246 | << dim1; 247 | 248 | // Copy input matrices into matrices of desired dimensionality 249 | real_data->Resize(dim0, dim1, kCopyData); 250 | imag_data->Resize(dim0, dim1, kCopyData); 251 | 252 | Matrix *gfilter_fft_imag=real_data; 253 | Matrix *gfilter_fft_real=imag_data; 254 | 255 | // Apply first FFT to the matrix rows 256 | gfilter_fft_real->Transpose(); 257 | gfilter_fft_imag->Transpose(); 258 | SplitRadixComplexFft srfft1(dim0); 259 | Vector tmp_fft1_real(dim0); 260 | Vector tmp_fft1_imag(dim0); 261 | for (MatrixIndexT i = 0 ; i < dim1; i++) { 262 | tmp_fft1_real.CopyFromVec(gfilter_fft_real->Row(i)); 263 | tmp_fft1_imag.CopyFromVec(gfilter_fft_imag->Row(i)); 264 | srfft1.Compute(tmp_fft1_real.Data(), tmp_fft1_imag.Data(), forward_fft); 265 | gfilter_fft_real->Row(i).CopyFromVec(tmp_fft1_real); 266 | gfilter_fft_imag->Row(i).CopyFromVec(tmp_fft1_imag); 267 | } 268 | 269 | // Transpose : fft(A).' 
270 | gfilter_fft_imag->Transpose(); 271 | gfilter_fft_real->Transpose(); 272 | 273 | // Apply second FFT to the matrix rows : fft(fft(A).') 274 | SplitRadixComplexFft srfft2(dim1); 275 | Vector tmp_fft2_real(dim1); 276 | Vector tmp_fft2_imag(dim1); 277 | for (MatrixIndexT i = 0 ; i < dim0; i++) { 278 | tmp_fft2_real.CopyFromVec(gfilter_fft_real->Row(i)); 279 | tmp_fft2_imag.CopyFromVec(gfilter_fft_imag->Row(i)); 280 | srfft2.Compute(tmp_fft2_real.Data(), tmp_fft2_imag.Data(), forward_fft); 281 | gfilter_fft_real->Row(i).CopyFromVec(tmp_fft2_real); 282 | gfilter_fft_imag->Row(i).CopyFromVec(tmp_fft2_imag); 283 | } 284 | 285 | } 286 | 287 | 288 | // This function is copied from KALDI (feat/pitch-functions.cc) 289 | inline BaseFloat NccfToPov(BaseFloat n) { 290 | BaseFloat ndash = fabs(n); 291 | if (ndash > 1.0) ndash = 1.0; // just in case it was slightly outside [-1, 1] 292 | BaseFloat r = -5.2 + 5.4 * exp(7.5 * (ndash - 1.0)) + 4.8 * ndash - 293 | 2.0 * exp(-10.0 * ndash) + 4.2 * exp(20.0 * (ndash - 1.0)); 294 | // r is the approximate log-prob-ratio of voicing, log(p/(1-p)). 295 | BaseFloat p = 1.0 / (1 + exp(-1.0 * r)); 296 | KALDI_ASSERT(p - p == 0); // Check for NaN/inf 297 | return p; 298 | } 299 | 300 | void ApplyNccfToPov(Matrix* kaldi_pitch_feats) { 301 | MatrixIndexT num_frames = kaldi_pitch_feats->NumRows(); 302 | for (MatrixIndexT frame = 0; frame < num_frames; ++frame) { 303 | (*kaldi_pitch_feats)(frame, 0) = NccfToPov((*kaldi_pitch_feats)(frame, 0)); 304 | } 305 | } 306 | 307 | } // namespace kaldi 308 | -------------------------------------------------------------------------------- /feat/feature-gabor.cc: -------------------------------------------------------------------------------- 1 | // feat/feature-gabor.cc 2 | 3 | // Copyright 2014 Jimmy & Danny 4 | // July 2014: modified by Maarten Van Segbroeck 5 | 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 
8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | // MERCHANTABLITY OR NON-INFRINGEMENT. 16 | // See the Apache 2 License for the specific language governing permissions and 17 | // limitations under the License. 18 | 19 | #include "feat/feature-gabor.h" 20 | #include "time.h" 21 | 22 | 23 | namespace kaldi { 24 | 25 | Gabor::Gabor(const GaborOptions &opts): 26 | opts_(opts), 27 | feature_window_function_(opts.frame_opts), 28 | srfft_(NULL) { 29 | int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); 30 | if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two 31 | srfft_ = new SplitRadixRealFft(padded_window_size); 32 | } 33 | 34 | Gabor::~Gabor() { 35 | for (std::map::iterator iter = mel_banks_.begin(); 36 | iter != mel_banks_.end(); 37 | ++iter) 38 | delete iter->second; 39 | if (srfft_ != NULL) 40 | delete srfft_; 41 | } 42 | 43 | 44 | const MelBanks *Gabor::GetMelBanks(BaseFloat vtln_warp) { 45 | MelBanks *this_mel_banks = NULL; 46 | std::map::iterator iter = mel_banks_.find(vtln_warp); 47 | if (iter == mel_banks_.end()) { 48 | this_mel_banks = new MelBanks(opts_.mel_opts, 49 | opts_.frame_opts, 50 | vtln_warp); 51 | mel_banks_[vtln_warp] = this_mel_banks; 52 | } else { 53 | this_mel_banks = iter->second; 54 | } 55 | return this_mel_banks; 56 | } 57 | 58 | 59 | 60 | void Gabor::ApplyPadding(Matrix *spectrogram, 61 | int32 ro, int32 co, Matrix *padded_spec) { 62 | 63 | // ro: row offset for padding 64 | // co: col offset for padding 65 | 66 | int32 rows_out = spectrogram->NumRows(); 67 | int32 cols_out = spectrogram->NumCols(); 68 | 69 | // append padding 70 | SubMatrix spec(padded_spec[0], ro, rows_out, co, cols_out); 71 | 
spec.CopyFromMat(*spectrogram); 72 | 73 | if (opts_.use_reflective_padding) { 74 | 75 | if (ro>0) { 76 | // top side 77 | SubMatrix top_pad((*padded_spec), 0, ro, co, cols_out); 78 | SubMatrix top_spec(*spectrogram, 0, ro, 0, cols_out); 79 | 80 | Matrix reversed_top_spec(ro, cols_out); 81 | 82 | for (int32 i = 0; i < ro; i++) { 83 | SubVector this_row1(top_spec.Row(i)); 84 | SubVector this_row2(reversed_top_spec.Row(ro-(i+1))); 85 | this_row2.CopyFromVec(this_row1); 86 | } 87 | top_pad.CopyFromMat(reversed_top_spec); 88 | 89 | 90 | // bottom side 91 | SubMatrix bot_pad((*padded_spec), rows_out+ro, ro, co, cols_out); 92 | SubMatrix bot_spec(*spectrogram, rows_out-ro, ro, 0, cols_out); 93 | 94 | Matrix reversed_bot_spec(ro, cols_out); 95 | 96 | for (int32 i = 0; i < ro; i++) { 97 | SubVector this_row1(bot_spec.Row(i)); 98 | SubVector this_row2(reversed_bot_spec.Row(ro-(i+1))); 99 | this_row2.CopyFromVec(this_row1); 100 | } 101 | bot_pad.CopyFromMat(reversed_bot_spec); 102 | 103 | } 104 | 105 | 106 | if (co>0) { 107 | // left side 108 | SubMatrix left_pad((*padded_spec), 0, rows_out+2*ro, 0, co); 109 | SubMatrix left_spec((*padded_spec), 0, rows_out+2*ro, co, co); 110 | 111 | Matrix left_specT(rows_out+2*ro, co); 112 | left_specT.CopyFromMat(left_spec); 113 | left_specT.Transpose(); 114 | 115 | Matrix reversed_left_spec(co, rows_out+2*ro); 116 | 117 | for (int32 i = 0; i < co; i++) { 118 | SubVector this_row1(left_specT.Row(i)); 119 | SubVector this_row2(reversed_left_spec.Row(co-(i+1))); 120 | this_row2.CopyFromVec(this_row1); 121 | } 122 | reversed_left_spec.Transpose(); 123 | left_pad.CopyFromMat(reversed_left_spec); 124 | 125 | // right side 126 | SubMatrix right_pad((*padded_spec), 0, rows_out+2*ro, cols_out+co, co); 127 | SubMatrix right_spec((*padded_spec), 0, rows_out+2*ro, cols_out, co); 128 | 129 | Matrix right_specT(rows_out+2*ro, co); 130 | right_specT.CopyFromMat(right_spec); 131 | right_specT.Transpose(); 132 | 133 | Matrix reversed_right_spec(co, 
rows_out+2*ro); 134 | 135 | for (int32 i = 0; i < co; i++) { 136 | SubVector this_row1(right_specT.Row(i)); 137 | SubVector this_row2(reversed_right_spec.Row(co-(i+1))); 138 | this_row2.CopyFromVec(this_row1); 139 | } 140 | reversed_right_spec.Transpose(); 141 | right_pad.CopyFromMat(reversed_right_spec); 142 | 143 | } 144 | 145 | } 146 | } 147 | 148 | 149 | void Gabor::RemovePadding(Matrix input, 150 | int32 ro, 151 | int32 co, 152 | Matrix *output) { 153 | 154 | int32 inRows = input.NumRows(); 155 | int32 inCols = input.NumCols(); 156 | int32 outRows = inRows-2*ro; 157 | int32 outCols = inCols-2*co; 158 | 159 | output->Resize(outRows, outCols); 160 | 161 | SubMatrix out(input, ro, outRows, co, outCols); 162 | output->CopyFromMat(out); 163 | 164 | } 165 | 166 | 167 | 168 | void Gabor::GFBCalcAxis(Vector omega_max, 169 | Vector size_max, 170 | Vector nu, 171 | Vector distance, 172 | Vector *omega_n, 173 | Vector *omega_k) { 174 | // % Calculates the modulation center frequencies iteratively. 175 | // Initialize Vectors 176 | Vector omega_min; 177 | omega_min.Resize(size_max.Dim()); 178 | Vector c(distance.Dim()); 179 | // c.Resize(distance.Dim()); 180 | omega_n->Resize(1); 181 | omega_k->Resize(1); 182 | 183 | // % Termination condition for iteration is reaching omega_min, which is 184 | // % derived from size_max. 185 | omega_min.CopyFromVec(nu); 186 | omega_min.DivElements(size_max); 187 | omega_min.Scale(M_PI); 188 | 189 | // % Eq. (2b) 190 | c.CopyFromVec(distance); 191 | c.DivElements(nu); 192 | c.Scale(8.0); 193 | 194 | // % Second factor of Eq. 
(2a) 195 | BaseFloat space_n = (1.0 + c(1) / 2) / (1.0 - c(1) / 2); 196 | int32 count_n = 1; 197 | (*omega_n)(0) = omega_max(1); 198 | 199 | // % Iterate starting with omega_max in spectral dimension 200 | while ( (*omega_n)(count_n-1) /space_n > omega_min(1) ) { 201 | omega_n->Resize(omega_n->Dim()+1, kCopyData); 202 | (*omega_n)(count_n) = omega_max(1) / pow(space_n,count_n); 203 | count_n++; 204 | } 205 | 206 | // % Add DC: the slot appended by the Resize below is index Dim()-1 (index Dim() is one past the end) 207 | omega_n->Resize(omega_n->Dim()+1, kCopyData); 208 | (*omega_n)(omega_n->Dim()-1) = 0.0; 209 | 210 | Vector omega_n_tmp(omega_n->Dim()); 211 | omega_n_tmp.CopyFromVec((*omega_n)); 212 | for ( int32 i = 0; i<omega_n->Dim(); i++ ) { 213 | (*omega_n)(i) = omega_n_tmp(omega_n->Dim()-(i+1)); 214 | } 215 | 216 | // % Second factor of Eq. (2a) 217 | BaseFloat space_k = (1 + c(0) / 2) / (1 - c(0) / 2); 218 | int32 count_k = 1; 219 | (*omega_k)(0) = omega_max(0); 220 | 221 | // % Iterate starting with omega_max in temporal dimension 222 | while ( (*omega_k)(count_k-1) / space_k > omega_min(0) ) { 223 | omega_k->Resize(omega_k->Dim()+1, kCopyData); 224 | (*omega_k)(count_k) = omega_max(0) / pow(space_k,count_k); 225 | count_k++; 226 | } 227 | 228 | // % Add DC and negative MFs for spectro-temporal opposite 229 | // % filters (upward/downward) 230 | Vector omega_k_tmp(omega_k->Dim()); 231 | omega_k_tmp.CopyFromVec((*omega_k)); 232 | omega_k->Resize(2*(omega_k->Dim())+1); 233 | int32 j = 0; 234 | while ( j < omega_k_tmp.Dim() ) { 235 | (*omega_k)(j) = - omega_k_tmp(j); 236 | j++; 237 | } 238 | (*omega_k)(j) = 0; j++; 239 | while ( j < 2*(omega_k_tmp.Dim())+1 ) { 240 | (*omega_k)(j) = omega_k_tmp(2*(omega_k_tmp.Dim())-j); 241 | j++; 242 | } 243 | 244 | } 245 | 246 | void Gabor::ComputeHannWindow(BaseFloat width, Vector *window) { 247 | 248 | int32 width_i = ceil(width); 249 | 250 | BaseFloat x_center = 0.5; 251 | Vector x_values(width_i+1); 252 | 253 | x_values(width_i/2-1) = (x_center-1.0/(width+1)); 254 | for( int32 i=0; iResize(width_i+1); 261 | for ( int32 
i=0; i real, 267 | Matrix imag, 268 | Matrix *mag) { 269 | 270 | real.ApplyPow(2); 271 | imag.ApplyPow(2); 272 | 273 | real.AddMat(1.0, imag); 274 | 275 | real.ApplyPow(0.5); 276 | 277 | mag->CopyFromMat(real); 278 | 279 | } 280 | 281 | 282 | void Gabor::ComputeGaborFilter(BaseFloat omega_k, BaseFloat omega_n, 283 | Vector nu, Vector size_max, 284 | Matrix *gfilter_real, Matrix *gfilter_imag) { 285 | // % Generates a gabor filter function with: 286 | // % omega_k spectral mod. freq. in rad 287 | // % omega_n temporal mod. freq. in rad 288 | // % nu_k number of half waves unter the envelope in spectral dim. 289 | // % nu_n number of half waves unter the envelope in temporal dim. 290 | // % size_max_k max. allowed extension in spectral dimension 291 | // % size_max_n max. allowed extension in temporal dimension 292 | 293 | // % Calculate windows width. 294 | BaseFloat w_n = 2*M_PI / abs(omega_n) * nu(0) / 2; 295 | BaseFloat w_k = 2*M_PI / abs(omega_k) * nu(1) / 2; 296 | 297 | // % If the size exceeds the max. allowed extension in a dimension set the 298 | // % corresponding mod. freq. to zero. 299 | if( w_n > size_max(1) ) { 300 | w_n = size_max(1); 301 | omega_n = 0.0; 302 | } 303 | if( w_k > size_max(0) ) { 304 | w_k = size_max(0); 305 | omega_k = 0.0; 306 | } 307 | 308 | // % Separable hanning envelope, cf. Eq. (1c). 309 | Vector env_n; 310 | Vector env_k; 311 | int32 win_size_k = ceil(w_k); 312 | int32 win_size_n = ceil(w_n); 313 | 314 | ComputeHannWindow(w_n-1, &env_n); 315 | ComputeHannWindow(w_k-1, &env_k); 316 | 317 | Matrix envelope(win_size_k, win_size_n, kSetZero); 318 | 319 | envelope.AddVecVec(1.0, env_k, env_n); 320 | 321 | // % Sinusoid carrier, cf. Eq. (1c). 
322 | int32 n_0 = (win_size_n+1) / 2; 323 | int32 k_0 = (win_size_k+1) / 2; 324 | 325 | BaseFloat sinusoid_r; 326 | BaseFloat sinusoid_i; 327 | 328 | gfilter_real->Resize(win_size_k, win_size_n, kSetZero); 329 | gfilter_imag->Resize(win_size_k, win_size_n, kSetZero); 330 | 331 | gfilter_real->CopyFromMat(envelope); 332 | 333 | // % Eq. 1c 334 | for( int32 n=0; nSum() / win_size_k / win_size_n; 346 | BaseFloat gfilter_imag_mean = gfilter_imag->Sum() / win_size_k / win_size_n; 347 | 348 | Matrix comp_r(win_size_k, win_size_n, kSetZero); 349 | comp_r.CopyFromMat(envelope); 350 | comp_r.Scale(-gfilter_real_mean/envelope_mean); 351 | Matrix comp_i(win_size_k, win_size_n, kSetZero); 352 | comp_i.CopyFromMat(envelope); 353 | comp_i.Scale(-gfilter_imag_mean/envelope_mean); 354 | 355 | if( (omega_n != 0) || (omega_k !=0) ) { 356 | 357 | gfilter_real->AddMat(1.0, comp_r); 358 | gfilter_imag->AddMat(1.0, comp_i); 359 | 360 | } 361 | else { 362 | 363 | // Add an imaginary part to DC filter for a fair real/imag comparison. 364 | gfilter_imag->CopyFromMat((*gfilter_real)); 365 | 366 | } 367 | 368 | // 2D FFT 369 | Matrix gfilter_fft_real(win_size_k, win_size_n); 370 | Matrix gfilter_fft_imag(win_size_k, win_size_n); 371 | gfilter_fft_real.CopyFromMat((*gfilter_real)); 372 | gfilter_fft_imag.CopyFromMat((*gfilter_imag)); 373 | ComputeComplexFft(&gfilter_fft_real, &gfilter_fft_imag, win_size_k, win_size_n, true); 374 | 375 | Matrix gfilter_fft_mag(win_size_k, win_size_n); 376 | ComputeMagnitude(gfilter_fft_real, gfilter_fft_imag, &gfilter_fft_mag); 377 | 378 | // % Normalize filter to have gains <= 1. 
379 | BaseFloat maxFftMag; 380 | maxFftMag = gfilter_fft_mag.Max(); 381 | 382 | gfilter_real->Scale(1.0/maxFftMag); 383 | gfilter_imag->Scale(1.0/maxFftMag); 384 | 385 | } 386 | 387 | 388 | void Gabor::FftConv2(Matrix in1_real, 389 | Matrix in1_imag, 390 | Matrix in2_real, 391 | Matrix in2_imag, 392 | Matrix *out_real, 393 | Matrix *out_imag) { 394 | 395 | int32 size_y = in1_real.NumRows() + in2_real.NumRows() - 1; 396 | int32 size_x = in1_real.NumCols() + in2_real.NumCols() - 1; 397 | int32 fft_size_y = pow(2, ceil(log2(size_y))); 398 | int32 fft_size_x = pow(2, ceil(log2(size_x))); 399 | int32 outRows = in1_real.NumRows(); 400 | int32 outCols = in1_real.NumCols(); 401 | int32 y_offset = in2_real.NumRows()/2; 402 | int32 x_offset = in2_real.NumCols()/2; 403 | 404 | size_y=fft_size_y; 405 | size_x=fft_size_x; 406 | 407 | ComputeComplexFftPow2(&in1_real, &in1_imag, size_y, size_x, true); 408 | ComputeComplexFftPow2(&in2_real, &in2_imag, size_y, size_x, true); 409 | 410 | 411 | for (int32 i=0; i out_pad_real(size_y, size_x); 418 | Matrix out_pad_imag(size_y, size_x); 419 | 420 | out_pad_real.CopyFromMat(in2_real); 421 | out_pad_imag.CopyFromMat(in2_imag); 422 | 423 | ComputeComplexFftPow2(&out_pad_real, &out_pad_imag, size_y, size_x, false); 424 | out_pad_real.Scale(1.0/(size_y*size_x)); 425 | out_pad_imag.Scale(1.0/(size_y*size_x)); 426 | // Crop the outRows x outCols output region; the real and imaginary parts each come from their own padded buffer 427 | SubMatrix this_out_real(out_pad_real, y_offset, outRows, x_offset, outCols); 428 | SubMatrix this_out_imag(out_pad_imag, y_offset, outRows, x_offset, outCols); 429 | 430 | out_real->Resize(outRows, outCols); 431 | out_imag->Resize(outRows, outCols); 432 | 433 | out_real->CopyFromMat(this_out_real); 434 | out_imag->CopyFromMat(this_out_imag); 435 | 436 | } 437 | 438 | void Gabor::ApplyGaborFilter(Matrix gfilter_real, 439 | Matrix gfilter_imag, 440 | Matrix spectrogram, 441 | Matrix *gfiltered_spec_real, 442 | Matrix *gfiltered_spec_imag) { 443 | // % Applies the filtering with a 2D Gabor filter to log_mel_spec 444 | // % This 
includes the special treatment of filters that do not lie fully 445 | // % inside the spectrogram 446 | 447 | BaseFloat gfilter_min; 448 | gfilter_min = gfilter_real.Min(); 449 | 450 | Matrix dc_map_real(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 451 | Matrix dc_map_imag(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 452 | 453 | // Create zeros matrix for imaginary part of spectrogram 454 | Matrix spec_imag(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 455 | 456 | if (gfilter_min < 0){ 457 | // % Compare this code to the compensation for the DC part in the 458 | // % 'gfilter_gen' function. This is an online version of it removing the 459 | // % DC part of the filters by subtracting an appropriate part of the 460 | // % filters' envelope. 461 | 462 | Matrix gfilter_mag(gfilter_real.NumRows(), gfilter_real.NumCols()); 463 | ComputeMagnitude(gfilter_real, gfilter_imag, &gfilter_mag); 464 | 465 | BaseFloat gfilter_mag_sum = gfilter_mag.Sum(); 466 | gfilter_mag.Scale(1.0/gfilter_mag_sum); 467 | 468 | Matrix gfilter_mag_imag(gfilter_real.NumRows(), gfilter_real.NumCols(), kSetZero); 469 | 470 | Matrix gfilter_dc_map_real(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 471 | gfilter_dc_map_real.Add(1.0); 472 | Matrix gfilter_dc_map_imag(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 473 | 474 | FftConv2(gfilter_dc_map_real, gfilter_dc_map_imag, gfilter_real, gfilter_imag, &gfilter_dc_map_real, &gfilter_dc_map_imag); 475 | 476 | 477 | Matrix env_dc_map_real(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 478 | env_dc_map_real.Add(1.0); 479 | Matrix env_dc_map_imag(spectrogram.NumRows(), spectrogram.NumCols(), kSetZero); 480 | 481 | FftConv2(env_dc_map_real, env_dc_map_imag, gfilter_mag, gfilter_mag_imag, &env_dc_map_real, &env_dc_map_imag); 482 | 483 | FftConv2(spectrogram, spec_imag, gfilter_mag, gfilter_mag_imag, &dc_map_real, &dc_map_imag); 484 | 485 | dc_map_real.DivElements(env_dc_map_real); 486 | 
dc_map_imag.DivElements(env_dc_map_real); 487 | 488 | for (int32 i=0; iAddMat(-1.0, dc_map_real); 499 | gfiltered_spec_imag->AddMat(-1.0, dc_map_imag); 500 | 501 | 502 | } 503 | 504 | 505 | void Gabor::GFBSelectRep(Matrix gfilter_real, 506 | Matrix gfilter_imag, 507 | Matrix *gfiltered_spec_real, 508 | Matrix *gfiltered_spec_imag) { 509 | // % Selects the center channel by choosing k_offset and those with k_factor 510 | // % channels distance to it in spectral dimension where k_factor is approx. 511 | // % 1/4 of the filters extension in the spectral dimension. 512 | 513 | int32 k_factor; 514 | int32 k_offset; 515 | int32 k_chans; 516 | 517 | k_factor = ( (gfilter_real.NumRows()/4) > 1 ? (gfilter_real.NumRows()/4) : 1); 518 | k_offset = (gfiltered_spec_real->NumRows()/2) % k_factor; 519 | k_chans = (gfiltered_spec_real->NumRows()) / k_factor; 520 | 521 | Matrix gfiltered_spec_rep_real(k_chans, gfiltered_spec_real->NumCols()); 522 | Matrix gfiltered_spec_rep_imag(k_chans, gfiltered_spec_real->NumCols()); 523 | 524 | for (int32 k=0; k this_gsrr(gfiltered_spec_rep_real.Row(k)); 527 | SubVector this_gsr(gfiltered_spec_real->Row(k*k_factor+k_offset)); 528 | SubVector this_gsri(gfiltered_spec_rep_imag.Row(k)); 529 | SubVector this_gsi(gfiltered_spec_imag->Row(k*k_factor+k_offset)); 530 | 531 | this_gsrr.CopyFromVec(this_gsr); 532 | this_gsri.CopyFromVec(this_gsi); 533 | } 534 | 535 | gfiltered_spec_real->Resize(k_chans, gfiltered_spec_real->NumCols()); 536 | gfiltered_spec_imag->Resize(k_chans, gfiltered_spec_imag->NumCols()); 537 | 538 | gfiltered_spec_real->CopyFromMat(gfiltered_spec_rep_real); 539 | gfiltered_spec_imag->CopyFromMat(gfiltered_spec_rep_imag); 540 | 541 | } 542 | 543 | 544 | 545 | void Gabor::Compute(const VectorBase &wave, 546 | BaseFloat vtln_warp, 547 | Matrix *output, 548 | Vector *wave_remainder) { 549 | 550 | assert(output != NULL); 551 | int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); 552 | int32 cols_out = opts_.mel_opts.num_bins; 553 | 
// Gabor::Compute, stage 1: build the padded spectrogram.
//   - ro / co are the padding sizes taken from opts_.padding_time / opts_.padding_freq.
//   - Raises KALDI_ERR when rows_out == 0; fills *wave_remainder only when it is non-NULL.
//   - Per frame r: ExtractWindow -> FFT (srfft_ when set, otherwise RealFft) ->
//     ComputePowerSpectrum -> ApplyPow(0.5) on the first window.Dim()/2 entries, so the mel
//     banks are applied to the square root of the power spectrum -> cubed root or log
//     (opts_.use_cubed_root) -> copied into row r of `spectrogram`.
//   - Short input: ro_add = max(ro - rows_out, 0) extra rows are added to `spectrogram` through
//     Resize(..., kCopyData) (presumably zero-filled -- TODO confirm) and removed again at the end.
//   - padded_spec has (rows_out + 2*ro) x (cols_out + 2*co) entries and is filled by ApplyPadding.
int32 ro = opts_.padding_time; // row offset for padding 554 | int32 co = opts_.padding_freq; // col offset for padding 555 | Matrix spectrogram; 556 | 557 | if (rows_out == 0) 558 | KALDI_ERR << "Gabor::Compute, no frames fit in file (#samples is " << wave.Dim() << ")"; 559 | spectrogram.Resize(rows_out, cols_out); 560 | if (wave_remainder != NULL) 561 | ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); 562 | 563 | Vector window; // windowed waveform 564 | Vector mel_energies; 565 | 566 | for (int32 r = 0; r < rows_out; r++) { // r is frame index. 567 | 568 | ExtractWindow(0, wave, r, opts_.frame_opts, feature_window_function_, &window, NULL); 569 | 570 | if (srfft_) srfft_->Compute(window.Data(), true); // Compute FFT using 571 | // split-radix algorithm. 572 | else RealFft(&window, true); // An alternative algorithm that 573 | // works for non-powers-of-two 574 | 575 | // Convert the FFT into a power spectrum 576 | ComputePowerSpectrum(&window); 577 | SubVector power_spectrum(window, 0, window.Dim()/2); 578 | power_spectrum.ApplyPow(0.5); 579 | 580 | // Integrate with MelFiterbank over power spectrum 581 | const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); 582 | this_mel_banks->Compute(power_spectrum, &mel_energies); 583 | 584 | if (opts_.use_cubed_root) 585 | mel_energies.ApplyPow(1.0/3); // apply cubed root 586 | else 587 | mel_energies.ApplyLog(); // take the log 588 | 589 | // Copy to spectrogram 590 | SubVector this_spec(spectrogram.Row(r)); 591 | this_spec.CopyFromVec(mel_energies); 592 | 593 | } 594 | 595 | // additional padding for very short utterances 596 | int32 ro_add = max(ro - rows_out, 0); 597 | rows_out = rows_out + ro_add; 598 | spectrogram.Resize(rows_out, cols_out, kCopyData); 599 | 600 | // Apply reflective padding to spectrogram 601 | Matrix padded_spec(rows_out+2*ro, cols_out+2*co, kSetZero); 602 | 603 | ApplyPadding(&spectrogram, ro, co, &padded_spec); 604 | 605 | // Transpose to match gfilters axes 606 | 
// Gabor::Compute, stage 2: filter-bank parameters and the loop over modulation frequencies.
//   - omega_max, size_max, nu and distance are two-element vectors ordered [spectral, temporal]
//     and are hard-coded below; GFBCalcAxis fills omega_n and omega_k from them.
//   - omega_n is then resized to opts_.nb_mod_freq entries with kCopyData.
//     NOTE(review): if nb_mod_freq exceeds the computed length the extra entries are whatever
//     Resize provides -- confirm this is intended.
//   - The for-loop headers over n and k (dump lines 630-634) are damaged in this copy. The three
//     closing braces after the loop body suggest two nested loops plus one enclosing condition
//     -- TODO confirm against the original file.
padded_spec.Transpose(); 607 | 608 | 609 | // Gabor filter stuffs 610 | Vector omega_max(2); 611 | Vector size_max(2); 612 | Vector nu(2); 613 | Vector distance(2); 614 | Vector omega_n; 615 | Vector omega_k; 616 | 617 | //% Filter bank settings [spectral temporal] 618 | omega_max(0) = M_PI/2; omega_max(1) = M_PI/2; //% radians 619 | size_max(0) = 3*23.0; size_max(1) = 40.0; //% bands, frames 620 | nu(0) = 3.5; nu(1) = 3.5; //% half-waves under envelope 621 | distance(0) = 0.3; distance(1) = 0.2; //% controls the spacing of filters 622 | 623 | // % Calculate center modulation frequencies. 624 | GFBCalcAxis(omega_max, size_max, nu, distance, &omega_n, &omega_k); 625 | 626 | // % selection of first number of temporal modulation frequencies 627 | omega_n.Resize(opts_.nb_mod_freq, kCopyData); 628 | 629 | int32 currentRowsOut = 0; 630 | for (int32 n=0; n gfilter_real; 635 | Matrix gfilter_imag; 636 | ComputeGaborFilter(omega_k(k), omega_n(n), nu, size_max, &gfilter_real, &gfilter_imag); 637 | // %% Filter mel spectrogram with filter bank filters and select representative channels. 
// Gabor::Compute, stage 3: apply each filter and assemble *output.
//   - For each (k, n) filter: ApplyGaborFilter on padded_spec, then GFBSelectRep reduces the rows.
//   - *output is resized with kCopyData on every filter so the new rows are appended below the
//     rows already written; currentRowsOut is the running row count.
//     NOTE(review): this reallocates per filter; sizing once up front may be cheaper.
//   - opts_.use_real selects whether the real or the imaginary part is copied into *output.
//   - After the loops: *output is transposed, the ro_add extra rows are dropped, and
//     RemovePadding is called with *output as both source and destination.
//     NOTE(review): confirm RemovePadding tolerates that aliasing.
//   - The last closing brace on this line appears to close an enclosing scope opened above this
//     chunk (presumably the namespace).
638 | Matrix gfiltered_spec_real; 639 | Matrix gfiltered_spec_imag; 640 | ApplyGaborFilter(gfilter_real, gfilter_imag, padded_spec, &gfiltered_spec_real, &gfiltered_spec_imag); 641 | GFBSelectRep(gfilter_real, gfilter_imag, &gfiltered_spec_real, &gfiltered_spec_imag); 642 | output->Resize(currentRowsOut + gfiltered_spec_real.NumRows(), gfiltered_spec_real.NumCols(), kCopyData); 643 | // Copy to output 644 | if (opts_.use_real) { 645 | SubMatrix this_gfiltered_spec((*output), currentRowsOut, gfiltered_spec_real.NumRows(), 0, gfiltered_spec_real.NumCols()); 646 | this_gfiltered_spec.CopyFromMat(gfiltered_spec_real); 647 | 648 | } else { 649 | SubMatrix this_gfiltered_spec((*output), currentRowsOut, gfiltered_spec_imag.NumRows(), 0, gfiltered_spec_imag.NumCols()); 650 | this_gfiltered_spec.CopyFromMat(gfiltered_spec_imag); 651 | } 652 | currentRowsOut = currentRowsOut + gfiltered_spec_real.NumRows(); 653 | } 654 | } 655 | } 656 | output->Transpose(); 657 | // remove additional padding for very short utterances 658 | output->Resize(output->NumRows()-ro_add, output->NumCols(), kCopyData); 659 | RemovePadding(*output, ro, co, output); 660 | 661 | } 662 | } 663 | 664 | 665 | --------------------------------------------------------------------------------