├── .gitignore
├── LICENSE
├── README.md
├── html
├── build_doc.py
├── msf_filterbank.html
├── msf_framesig.html
├── msf_lar.html
├── msf_logfb.html
├── msf_lpc.html
├── msf_lpcc.html
├── msf_lsf.html
├── msf_mfcc.html
├── msf_powspec.html
├── msf_rc.html
├── msf_ssc.html
└── publishcode.m
├── msf_filterbank.m
├── msf_framesig.m
├── msf_lar.m
├── msf_logfb.m
├── msf_lpc.m
├── msf_lpcc.m
├── msf_lsf.m
├── msf_mfcc.m
├── msf_powspec.m
├── msf_rc.m
└── msf_ssc.m
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 James Lyons
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | matlab_speech_features
2 | ======================
3 |
4 | A set of speech feature extraction functions for ASR and speaker identification written in matlab.
5 | Documentation is available at http://www.practicalcryptography.com/miscellaneous/machine-learning/matlab_speech_features-documentation/
6 |
7 | Why do all the functions start with "msf_"? Mainly because matlab already has some functions e.g. "lpc" and the other names are short, so I prepended "msf" to the function names so that it was clear which functions are provided by this package and which are not.
8 |
9 | currently implemented:
10 |
11 | Feature | Description
12 | --------| -----------
13 | MFCCs | Mel frequency cepstral coefficients
14 | LFBE | Log filterbank energies
15 | SSC | Spectral Subband Centroids
16 | LPCs | Linear prediction coefficients
17 | LPCCs | Linear prediction cepstral coefficients
18 | LSF | line spectral frequencies
19 | LAR | log area ratios
20 | RC | reflection coefficients
21 |
22 | To be implemented:
23 |
24 | - PLP
25 |
26 | There are other things I'd like to clean up, e.g. reflection coefficients and log area ratios use MATLAB's poly2rc and rc2lar functions,
27 | which I would like to implement myself. Also, the function naming is a bit weird; I'll have to fix it up at some point.
28 |
--------------------------------------------------------------------------------
/html/build_doc.py:
--------------------------------------------------------------------------------
1 | # this script uses the comments in each matlab function along with matlabs 'publish' function
2 | # to generate documentation for this library.
3 | # - matlab puts output in different html files for every function, this script combines them into one big one.
4 | # - this script assumes it lives one directory down from all the '.m' files
5 |
6 | import os
7 | from bs4 import BeautifulSoup
8 |
# Step 1: write a MATLAB driver script that calls 'publish' on every '.m'
# file in the parent directory, then run it. MATLAB writes one HTML file per
# function; 'fs' and 'speech' are defined so the example code can evaluate.
publish_lines = [
    "opts.codeToEvaluate = 'fs=16000;speech=randn(1,100);';",
    "opts.showCode = false;",
    "addpath ..",
]
# os.listdir() makes no ordering guarantee; sorting keeps the generated
# script (and therefore the combined documentation) stable between runs.
for fname in sorted(os.listdir('..')):
    if os.path.splitext(fname.strip())[1] != '.m':
        continue
    publish_lines.append("publish('" + fname + "',opts);")

with open('publishcode.m', 'w') as f:
    f.write("\n".join(publish_lines) + "\n")

os.system("matlab -nodesktop -nosplash < publishcode.m")

# Step 2: take all the MATLAB output and print it as one stream.
# - note that currently we are outputting a html fragment, copy and paste it
#   into an actual document
for fname in sorted(os.listdir('.')):
    if os.path.splitext(fname.strip())[1] != '.html':
        continue
    with open(fname, 'r') as f:
        html_doc = f.read()
    # Name the parser explicitly so the output does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Demote the per-function titles so they nest under the host page's title.
    for tag in soup.find_all('h1'):
        tag.name = 'h2'
    # Drop the footer paragraph; .get() returns None for tags that have no
    # class attribute, so no exception handling is needed.
    for tag in soup.find_all('p'):
        if tag.get('class') == ['footer']:
            tag.replace_with('')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(soup.div)
40 |
41 |
42 |
--------------------------------------------------------------------------------
/html/msf_filterbank.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_filterbank - return a mel-spaced filterbank msf_filterbank - return a mel-spaced filterbank function fbank = msf_filterbank(nfilt,fs,lowfreq,highfreq,nfft) returns a mel-spaced filterbank for use with filterbank energies, mfccs, sscs etc.
nfilt - the number filterbanks to use.fs - the sample rate of 'speech', integerlowfreq - the lowest filterbank edge. In Hz.highfreq - the highest filterbank edge. In Hz.nfft - the FFT size to use.Example usage:
lpcs = msf_filterbank(26,16000,0,16000,512);
--------------------------------------------------------------------------------
/html/msf_framesig.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_framesig - break a signal into frames msf_framesig - break a signal into frames function win_frames = msf_framesig(signal, frame_len, frame_step, winfunc) Takes a 1 by N signal, and breaks it up into frames. Each frame starts frame_step samples after the start of the previous frame. Each frame is windowed by wintype.
- to specify window, use e.g. @hamming, @(x)chebwin(x,30), @(x)ones(x,1), etc.
signal - the input signal, vector of audio samples. frame_len - length of window in samples. frame_step - step between successive windows in samples. winfunc - A function to be applied to each window. Example usage with hamming window:
frames = msf_framesig(speech, winlen*fs, winstep*fs, @(x)hamming(x));
--------------------------------------------------------------------------------
/html/msf_lar.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_lar - Log Area Ratios msf_lar - Log Area Ratios function feat = msf_lar(speech,fs,varargin) given a speech signal, splits it into frames and computes Log Area Ratios for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'order' - the number of coefficients to return. Default: 12Example usage:
lars = msf_lar(signal,16000,'order',10);
--------------------------------------------------------------------------------
/html/msf_logfb.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_logfb - Log Filterbank Energies msf_logfb - Log Filterbank Energies function feat = msf_logfb(speech,fs,varargin) given a speech signal, splits it into frames and computes log filterbank energies for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'nfilt' - the number filterbanks to use. Default: 26'lowfreq' - the lowest filterbank edge. In Hz. Default: 0'highfreq' - the highest filterbank edge. In Hz. Default: fs/2'nfft' - the FFT size to use. Default: 512Example usage:
logfbs = msf_logfb(signal,16000,'nfilt',40);
--------------------------------------------------------------------------------
/html/msf_lpc.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_lpc - Linear Prediction Coefficients msf_lpc - Linear Prediction Coefficients function feat = msf_lpc(speech,fs,varargin) given a speech signal, splits it into frames and computes Linear Prediction Coefficients for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'order' - the number of coefficients to return. Default: 12Example usage:
lpcs = msf_lpc(signal,16000,'order',10);
--------------------------------------------------------------------------------
/html/msf_lpcc.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_lpcc - Linear Prediction Cepstral Coefficients msf_lpcc - Linear Prediction Cepstral Coefficients function feat = msf_lpcc(speech,fs,varargin) given a speech signal, splits it into frames and computes Linear Prediction Cepstral Coefficients for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'order' - the number of coefficients to return. Default: 12Example usage:
lpccs = msf_lpcc(signal,16000,'order',10);
--------------------------------------------------------------------------------
/html/msf_lsf.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_lsf - Line Spectral Frequencies msf_lsf - Line Spectral Frequencies function feat = msf_lsf(speech,fs,varargin) given a speech signal, splits it into frames and computes Line Spectral Frequencies for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'order' - the number of coefficients to return. Default: 12Example usage:
lsfs = msf_lsf(signal,16000,'order',10);
--------------------------------------------------------------------------------
/html/msf_mfcc.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_mfcc - Mel Frequency Cepstral Coefficients msf_mfcc - Mel Frequency Cepstral Coefficients function feat = msf_mfcc(speech,fs,varargin) given a speech signal, splits it into frames and computes Mel frequency cepstral coefficients for each frame. For a tutorial on MFCCs, see MFCC tutorial .
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'nfilt' - the number filterbanks to use. Default: 26'lowfreq' - the lowest filterbank edge. In Hz. Default: 0'highfreq' - the highest filterbank edge. In Hz. Default: fs/2'nfft' - the FFT size to use. Default: 512'ncep' - the number of cepstral coeffients to use. Default: 13'liftercoeff' - liftering coefficient, 0 is no lifter. Default: 22'appendenergy' - if true, replaces 0th cep coeff with log of total frame energy. Default: trueExample usage:
mfccs = msf_mfcc(signal,16000,'nfilt',40,'ncep',12);
--------------------------------------------------------------------------------
/html/msf_powspec.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_powspec - Compute power spectrum of audio frames msf_powspec - Compute power spectrum of audio frames function pspec = msf_powspec(speech,fs,varargin) given a speech signal, splits it into frames and computes the power spectrum for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'nfft' - the FFT size to use. Default: 512Example usage:
lpcs = msf_powspec(signal,16000,'winlen',0.5);
--------------------------------------------------------------------------------
/html/msf_rc.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_rc - Reflection Coefficients msf_rc - Reflection Coefficients function feat = msf_rc(speech,fs,varargin) given a speech signal, splits it into frames and computes Reflection Coefficients for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'order' - the number of coefficients to return. Default: 12Example usage:
rcs = msf_rc(signal,16000,'order',10);
--------------------------------------------------------------------------------
/html/msf_ssc.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 | msf_ssc - Spectral Subband Centroids msf_ssc - Spectral Subband Centroids function feat = msf_ssc(speech,fs,varargin) given a speech signal, splits it into frames and computes Spectral Subband Centroids for each frame.
speech - the input speech signal, vector of speech samplesfs - the sample rate of 'speech', integeroptional arguments supported include the following 'name', value pairs from the 3rd argument on:
'winlen' - length of window in seconds. Default: 0.025 (25 milliseconds)'winstep' - step between successive windows in seconds. Default: 0.01 (10 milliseconds)'nfilt' - the number filterbanks to use. Default: 26'lowfreq' - the lowest filterbank edge. In Hz. Default: 0'highfreq' - the highest filterbank edge. In Hz. Default: fs/2'nfft' - the FFT size to use. Default: 512Example usage:
sscs = msf_ssc(signal,16000,'nfilt',40);
--------------------------------------------------------------------------------
/html/publishcode.m:
--------------------------------------------------------------------------------
% Driver script written by build_doc.py: publishes the help text of every
% '.m' function in the parent directory to an HTML file in this directory.
% 'fs' and 'speech' are defined so that example code can be evaluated.
% The stale 'inrange.m' entry was removed: no such file exists in the
% parent directory.
opts.codeToEvaluate = 'fs=16000;speech=randn(1,100);';
opts.showCode = false;
addpath ..
publish('msf_filterbank.m',opts);
publish('msf_framesig.m',opts);
publish('msf_lar.m',opts);
publish('msf_logfb.m',opts);
publish('msf_lpc.m',opts);
publish('msf_lpcc.m',opts);
publish('msf_lsf.m',opts);
publish('msf_mfcc.m',opts);
publish('msf_powspec.m',opts);
publish('msf_rc.m',opts);
publish('msf_ssc.m',opts);
16 |
--------------------------------------------------------------------------------
/msf_filterbank.m:
--------------------------------------------------------------------------------
%% msf_filterbank - return a mel-spaced filterbank
%
%  function fbank = msf_filterbank(nfilt,fs,lowfreq,highfreq,nfft)
%
% returns a mel-spaced filterbank for use with filterbank energies, mfccs, sscs etc.
% The result is an |nfilt| by |nfft/2| matrix; each row holds one triangular filter.
%
% * |nfilt| - the number of filters to use.
% * |fs| - the sample rate of 'speech', integer
% * |lowfreq| - the lowest filterbank edge. In Hz.
% * |highfreq| - the highest filterbank edge. In Hz. Should not exceed fs/2, otherwise filter bins fall outside the |nfft/2| columns.
% * |nfft| - the FFT size to use.
%
% Example usage:
%
%  fbank = msf_filterbank(26,16000,0,8000,512);
%
function fbank = msf_filterbank(nfilt,fs,lowfreq,highfreq,nfft)
    % compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq);
    highmel = hz2mel(highfreq);
    melpoints = linspace(lowmel,highmel,nfilt+2);
    % our points are in Hz, but we use fft bins, so we have to convert from Hz to fft bin number
    bin = 1+floor((nfft-1)*mel2hz(melpoints)/fs);

    fbank = zeros(nfilt,nfft/2);
    for j = 1:nfilt
        left = bin(j);
        centre = bin(j+1);
        right = bin(j+2);
        % rising edge, excluding the centre bin
        for i = left:centre-1
            fbank(j,i) = (i - left)/(centre - left);
        end
        % the peak is set explicitly: when neighbouring edges land on the same
        % bin the slope formulas evaluate 0/0 and would store NaN here.
        fbank(j,centre) = 1;
        % falling edge, excluding the centre bin
        for i = centre+1:right
            fbank(j,i) = (right - i)/(right - centre);
        end
    end
end
35 |
function hz = mel2hz(mel)
    % convert mel values (scalar or array) to Hz; inverse of hz2mel
    exponent = mel ./ 2595;
    hz = 700 .* (power(10, exponent) - 1);
end
39 |
function mel = hz2mel(hz)
    % convert frequencies in Hz (scalar or array) to mels
    ratio = hz ./ 700;
    mel = 2595 .* log10(1 + ratio);
end
43 |
--------------------------------------------------------------------------------
/msf_framesig.m:
--------------------------------------------------------------------------------
%% msf_framesig - break a signal into frames
%
%  function win_frames = msf_framesig(signal, frame_len, frame_step, winfunc)
%
% Takes a 1 by N signal, and breaks it up into frames. Each frame starts
% _frame_step_ samples after the start of the previous frame. Each frame is
% windowed by _winfunc_. The signal is zero padded so that the last frame is
% complete. Returns one frame per row.
%
% - to specify window, use e.g. @hamming, @(x)chebwin(x,30), @(x)ones(x,1), etc.
%
% * |signal| - the input signal, vector of audio samples
% * |frame_len| - length of window in samples. Rounded to the nearest whole sample.
% * |frame_step| - step between successive windows in samples. Rounded to the nearest whole sample.
% * |winfunc| - A function that takes the window length and returns the window as a column vector.
%
% Example usage with hamming window:
%
%  frames = msf_framesig(speech, winlen*fs, winstep*fs, @(x)hamming(x));
%
function win_frames = msf_framesig(signal, frame_len, frame_step, winfunc)
    % callers pass winlen*fs and winstep*fs, which need not be whole numbers
    % (e.g. 0.025*44100 = 1102.5); the sizes and indices below must be
    % positive integers.
    frame_len = max(1, round(frame_len));
    frame_step = max(1, round(frame_step));

    if size(signal,1) ~= 1
        signal = signal';
    end

    signal_len = length(signal);
    if signal_len <= frame_len % if very short frame, pad it to frame_len
        num_frames = 1;
    else
        num_frames = 1 + ceil((signal_len - frame_len)/frame_step);
    end
    padded_len = (num_frames-1)*frame_step + frame_len;
    % make sure signal is exactly divisible into N frames
    pad_signal = [signal, zeros(1,padded_len - signal_len)];

    % build array of indices: row k holds the sample positions of frame k
    offsets = (0:num_frames-1)' * frame_step;
    indices = repmat(1:frame_len, num_frames, 1) + ...
        repmat(offsets, 1, frame_len);
    frames = pad_signal(indices);

    win = repmat(winfunc(frame_len)', size(frames, 1), 1);
    % apply window
    win_frames = frames .* win;
end
44 |
--------------------------------------------------------------------------------
/msf_lar.m:
--------------------------------------------------------------------------------
%% msf_lar - Log Area Ratios
%
%  function feat = msf_lar(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% Log Area Ratios for each frame. Returns one row per frame with |order| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'order'| - the number of coefficients to return. Default: 12
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  lars = msf_lar(signal,16000,'order',10);
%
function feat = msf_lar(speech,fs,varargin)
    parser = inputParser;
    addOptional(parser,'winlen',  0.025, @(x)gt(x,0));
    addOptional(parser,'winstep', 0.01,  @(x)gt(x,0));
    addOptional(parser,'order',   12,    @(x)ge(x,1));
    addOptional(parser,'preemph', 0,     @(x)ge(x,0));
    parse(parser,varargin{:});
    opts = parser.Results;

    % window and step are given in seconds; msf_framesig wants samples
    frame_len = opts.winlen*fs;
    frame_step = opts.winstep*fs;
    frames = msf_framesig(speech,frame_len,frame_step,@(x)hamming(x));

    % lpc works column-wise, so frames are transposed; one row of LPCs per frame
    lpcs = lpc(frames',opts.order);
    num_frames = size(lpcs,1);
    feat = zeros(num_frames,opts.order);
    for n = 1:num_frames
        % LPC polynomial -> reflection coefficients -> log area ratios
        refl = poly2rc(lpcs(n,:));
        feat(n,:) = rc2lar(refl)';
    end
end
39 |
--------------------------------------------------------------------------------
/msf_logfb.m:
--------------------------------------------------------------------------------
%% msf_logfb - Log Filterbank Energies
%
%  function feat = msf_logfb(speech,fs,varargin)
%
% given a speech signal, splits it into frames and computes log filterbank energies for each frame.
% Returns one row per frame with |nfilt| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'nfilt'| - the number filterbanks to use. Default: 26
% * |'lowfreq'| - the lowest filterbank edge. In Hz. Default: 0
% * |'highfreq'| - the highest filterbank edge. In Hz. Default: fs/2
% * |'nfft'| - the FFT size to use. Default: 512
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  logfbs = msf_logfb(signal,16000,'nfilt',40);
%
function feat = msf_logfb(speech,fs,varargin)
    p = inputParser;
    addOptional(p,'winlen',   0.025, @(x)gt(x,0));
    addOptional(p,'winstep',  0.01,  @(x)gt(x,0));
    addOptional(p,'nfilt',    26,    @(x)ge(x,1));
    addOptional(p,'lowfreq',  0,     @(x)ge(x,0));
    addOptional(p,'highfreq', fs/2,  @(x)ge(x,0));
    addOptional(p,'nfft',     512,   @(x)gt(x,0));
    addOptional(p,'preemph',  0,     @(x)ge(x,0));
    parse(p,varargin{:});
    in = p.Results;

    H = msf_filterbank(in.nfilt,fs,in.lowfreq,in.highfreq,in.nfft);
    pspec = msf_powspec(speech,fs,'winlen',in.winlen,'winstep',in.winstep,'nfft',in.nfft);

    % energy in each filter for each frame
    energies = pspec*H';
    % frames of digital silence have exactly zero energy; floor them at eps
    % so that the log returns a finite value instead of -Inf.
    energies(energies == 0) = eps;
    feat = log(energies);
end
39 |
--------------------------------------------------------------------------------
/msf_lpc.m:
--------------------------------------------------------------------------------
%% msf_lpc - Linear Prediction Coefficients
%
%  function feat = msf_lpc(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% Linear Prediction Coefficients for each frame. Returns one row per frame
% with |order| columns (the leading 1 of each LPC polynomial is dropped).
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'order'| - the number of coefficients to return. Default: 12
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  lpcs = msf_lpc(signal,16000,'order',10);
%
function feat = msf_lpc(speech,fs,varargin)
    parser = inputParser;
    addOptional(parser,'winlen',  0.025, @(x)gt(x,0));
    addOptional(parser,'winstep', 0.01,  @(x)gt(x,0));
    addOptional(parser,'order',   12,    @(x)ge(x,1));
    addOptional(parser,'preemph', 0,     @(x)ge(x,0));
    parse(parser,varargin{:});
    opts = parser.Results;

    % window and step are given in seconds; msf_framesig wants samples
    frame_len = opts.winlen*fs;
    frame_step = opts.winstep*fs;
    frames = msf_framesig(speech,frame_len,frame_step,@(x)hamming(x));

    % lpc works column-wise, so frames are transposed; one row of LPCs per frame
    coeffs = lpc(frames',opts.order);
    % the first column is the leading 1 of every polynomial; skip it
    feat = coeffs(:,2:size(coeffs,2));
end
35 |
--------------------------------------------------------------------------------
/msf_lpcc.m:
--------------------------------------------------------------------------------
%% msf_lpcc - Linear Prediction Cepstral Coefficients
%
%  function feat = msf_lpcc(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% Linear Prediction Cepstral Coefficients for each frame. Returns one row per
% frame with |order| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'order'| - the number of coefficients to return. Default: 12
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  lpccs = msf_lpcc(signal,16000,'order',10);
%
function feat = msf_lpcc(speech,fs,varargin)
    parser = inputParser;
    addOptional(parser,'winlen',  0.025, @(x)gt(x,0));
    addOptional(parser,'winstep', 0.01,  @(x)gt(x,0));
    addOptional(parser,'order',   12,    @(x)ge(x,1));
    addOptional(parser,'preemph', 0,     @(x)ge(x,0));
    parse(parser,varargin{:});
    opts = parser.Results;

    % window and step are given in seconds; msf_framesig wants samples
    frame_len = opts.winlen*fs;
    frame_step = opts.winstep*fs;
    frames = msf_framesig(speech,frame_len,frame_step,@(x)hamming(x));

    % lpc works column-wise, so frames are transposed; one row of LPCs per frame
    coeffs = lpc(frames',opts.order);
    % drop the leading ones, then convert the LPCs to cepstral coefficients
    feat = cepst(coeffs(:,2:end));
end
36 |
function ccs = cepst(apks)
    % ccs = cepst(apks)
    %  - converts lpc values into cepstral coefficients using the recursion
    %      c(m) = -a(m) - (1/m) * sum_{k=1}^{m-1} (m-k) * c(m-k) * a(k)
    %  - apks holds the lpc values without the leading 1, one lpc vector per
    %    row (N by D for N vectors of D lpcs)
    %  - ccs has the same size as apks: as many cepstral coefficients as lpcs
    [num_vecs, num_coefs] = size(apks);
    ccs = zeros(num_vecs, num_coefs);

    % rows are independent of each other, so each step of the recursion is
    % applied to all rows at once (whole columns at a time)
    for m = 1:num_coefs
        acc = zeros(num_vecs, 1);
        for k = 1:(m-1)
            acc = acc - (m - k) .* ccs(:, m - k) .* apks(:, k);
        end
        ccs(:, m) = -apks(:, m) + (1/m) .* acc;
    end
end
58 |
--------------------------------------------------------------------------------
/msf_lsf.m:
--------------------------------------------------------------------------------
%% msf_lsf - Line Spectral Frequencies
%
%  function feat = msf_lsf(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% Line Spectral Frequencies for each frame. Returns one row per frame with
% |order| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'order'| - the number of coefficients to return. Default: 12
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  lsfs = msf_lsf(signal,16000,'order',10);
%
function feat = msf_lsf(speech,fs,varargin)
    parser = inputParser;
    addOptional(parser,'winlen',  0.025, @(x)gt(x,0));
    addOptional(parser,'winstep', 0.01,  @(x)gt(x,0));
    addOptional(parser,'order',   12,    @(x)ge(x,1));
    addOptional(parser,'preemph', 0,     @(x)ge(x,0));
    parse(parser,varargin{:});
    opts = parser.Results;

    % window and step are given in seconds; msf_framesig wants samples
    frame_len = opts.winlen*fs;
    frame_step = opts.winstep*fs;
    frames = msf_framesig(speech,frame_len,frame_step,@(x)hamming(x));

    % lpc works column-wise, so frames are transposed; one row of LPCs per frame
    lpcs = lpc(frames',opts.order);
    num_frames = size(lpcs,1);
    feat = zeros(num_frames,opts.order);
    for n = 1:num_frames
        % poly2lsf returns a column; store it as row n
        feat(n,:) = poly2lsf(lpcs(n,:))';
    end
end
38 |
--------------------------------------------------------------------------------
/msf_mfcc.m:
--------------------------------------------------------------------------------
%% msf_mfcc - Mel Frequency Cepstral Coefficients
%
%  function mfccs = msf_mfcc(speech,fs,varargin)
%
% given a speech signal, splits it into frames and computes Mel frequency cepstral coefficients for each frame.
% Returns one row per frame with |ncep| columns.
% For a tutorial on MFCCs, see the MFCC tutorial on practicalcryptography.com.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'nfilt'| - the number filterbanks to use. Default: 26
% * |'lowfreq'| - the lowest filterbank edge. In Hz. Default: 0
% * |'highfreq'| - the highest filterbank edge. In Hz. Default: fs/2
% * |'nfft'| - the FFT size to use. Default: 512
% * |'ncep'| - the number of cepstral coefficients to use. Must not exceed |nfilt|. Default: 13
% * |'liftercoeff'| - liftering coefficient, 0 is no lifter. Default: 22
% * |'appendenergy'| - if true, replaces 0th cep coeff with log10 of total frame energy. Default: true
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  mfccs = msf_mfcc(signal,16000,'nfilt',40,'ncep',12);
%
function mfccs = msf_mfcc(speech,fs,varargin)
    p = inputParser;
    addOptional(p,'winlen',      0.025, @(x)gt(x,0));
    addOptional(p,'winstep',     0.01,  @(x)gt(x,0));
    addOptional(p,'nfilt',       26,    @(x)ge(x,1));
    addOptional(p,'lowfreq',     0,     @(x)ge(x,0));
    addOptional(p,'highfreq',    fs/2,  @(x)ge(x,0));
    addOptional(p,'nfft',        512,   @(x)gt(x,0));
    addOptional(p,'ncep',        13,    @(x)ge(x,1));
    addOptional(p,'liftercoeff', 22,    @(x)ge(x,0));
    addOptional(p,'appendenergy',true,  @(x)ismember(x,[true,false]));
    addOptional(p,'preemph',     0,     @(x)ge(x,0));
    parse(p,varargin{:});
    in = p.Results;

    H = msf_filterbank(in.nfilt, fs, in.lowfreq, in.highfreq, in.nfft);
    pspec = msf_powspec(speech, fs, 'winlen', in.winlen, 'winstep', in.winstep, 'nfft', in.nfft);
    en = sum(pspec,2); % energy in each frame
    fbenergy = H*pspec'; % energy in each filter, one column per frame

    % frames of digital silence have exactly zero energy; log(0) = -Inf would
    % produce -Inf/NaN features, so zero energies are floored at eps.
    en(en == 0) = eps;
    fbenergy(fbenergy == 0) = eps;

    % the DCT of the log filterbank energies gives the cepstrum; keep the
    % first ncep coefficients of every frame
    feat = dct(log(fbenergy))';
    mfccs = lifter(feat(:,1:in.ncep), in.liftercoeff);
    if in.appendenergy
        % NOTE(review): log10 here while the cepstrum uses the natural log -
        % confirm this is intended.
        mfccs(:,1) = log10(en);
    end
end
52 |
function lcep = lifter(cep,L)
    % lcep = lifter(cep,L)
    %  - scales the columns of cep by the sinusoidal lifter
    %      1 + (L/2)*sin(pi*n/L),  n = 0..D-1
    %  - cep is an N by D matrix, one cepstral vector per row
    %  - L is the liftering coefficient; L = 0 returns cep unchanged
    if L > 0
        [N,D] = size(cep);
        n = 0:D-1;
        lift = 1 + (L/2)*sin(pi*n/L);
        lcep = cep .* repmat(lift,N,1);
    else
        % L = 0 means "no lifter"; evaluating the formula would compute
        % sin(pi*n/0) and turn every coefficient into NaN.
        lcep = cep;
    end
end
59 |
60 |
--------------------------------------------------------------------------------
/msf_powspec.m:
--------------------------------------------------------------------------------
%% msf_powspec - Compute power spectrum of audio frames
%
%  function pspec = msf_powspec(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% the power spectrum for each frame. Returns one row per frame with |nfft/2| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'nfft'| - the FFT size to use. Frames longer than |nfft| samples are truncated (a warning is issued). Default: 512
%
% Example usage:
%
%  pspec = msf_powspec(signal,16000,'winlen',0.03);
%
function pspec = msf_powspec(speech,fs,varargin)
    p = inputParser;
    addOptional(p,'winlen',0.025,@isnumeric);
    addOptional(p,'winstep',0.01,@isnumeric);
    addOptional(p,'nfft',512,@isnumeric);
    parse(p,varargin{:});
    in = p.Results;

    frame_len = in.winlen*fs;
    % fft(x,n,dim) silently drops samples beyond n, so say so when it happens
    if frame_len > in.nfft
        warning('msf_powspec:frameTruncated', ...
            'frame length (%g samples) is greater than FFT size (%g), frames will be truncated. Increase nfft to avoid.', ...
            frame_len, in.nfft);
    end

    frames = msf_framesig(speech,frame_len,in.winstep*fs,@(x)hamming(x));
    % squared magnitude of the FFT along each row, scaled by 1/frame length
    pspec = 1/frame_len*abs(fft(frames,in.nfft,2)).^2;
    % keep the first nfft/2 bins of each frame
    pspec = pspec(:,1:in.nfft/2);
end
33 |
--------------------------------------------------------------------------------
/msf_rc.m:
--------------------------------------------------------------------------------
%% msf_rc - Reflection Coefficients
%
%  function feat = msf_rc(speech,fs,varargin)
%
% given a speech signal, splits it into hamming-windowed frames and computes
% Reflection Coefficients for each frame. Returns one row per frame with
% |order| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'order'| - the number of coefficients to return. Default: 12
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  rcs = msf_rc(signal,16000,'order',10);
%
function feat = msf_rc(speech,fs,varargin)
    parser = inputParser;
    addOptional(parser,'winlen',  0.025, @(x)gt(x,0));
    addOptional(parser,'winstep', 0.01,  @(x)gt(x,0));
    addOptional(parser,'order',   12,    @(x)ge(x,1));
    addOptional(parser,'preemph', 0,     @(x)ge(x,0));
    parse(parser,varargin{:});
    opts = parser.Results;

    % window and step are given in seconds; msf_framesig wants samples
    frame_len = opts.winlen*fs;
    frame_step = opts.winstep*fs;
    frames = msf_framesig(speech,frame_len,frame_step,@(x)hamming(x));

    % lpc works column-wise, so frames are transposed; one row of LPCs per frame
    lpcs = lpc(frames',opts.order);
    num_frames = size(lpcs,1);
    feat = zeros(num_frames,opts.order);
    for n = 1:num_frames
        % poly2rc returns a column; store it as row n
        feat(n,:) = poly2rc(lpcs(n,:))';
    end
end
38 |
--------------------------------------------------------------------------------
/msf_ssc.m:
--------------------------------------------------------------------------------
%% msf_ssc - Spectral Subband Centroids
%
%  function feat = msf_ssc(speech,fs,varargin)
%
% given a speech signal, splits it into frames and computes Spectral Subband Centroids for each frame.
% Returns one row per frame with |nfilt| columns.
%
% * |speech| - the input speech signal, vector of speech samples
% * |fs| - the sample rate of 'speech', integer
%
% optional arguments supported include the following 'name', value pairs
% from the 3rd argument on:
%
% * |'winlen'| - length of window in seconds. Default: 0.025 (25 milliseconds)
% * |'winstep'| - step between successive windows in seconds. Default: 0.01 (10 milliseconds)
% * |'nfilt'| - the number filterbanks to use. Default: 26
% * |'lowfreq'| - the lowest filterbank edge. In Hz. Default: 0
% * |'highfreq'| - the highest filterbank edge. In Hz. Default: fs/2
% * |'nfft'| - the FFT size to use. Default: 512
%
% A |'preemph'| argument is also accepted, but it is not used by this function.
%
% Example usage:
%
%  sscs = msf_ssc(signal,16000,'nfilt',40);
%
function feat = msf_ssc(speech,fs,varargin)
    p = inputParser;
    addOptional(p,'winlen',   0.025, @(x)gt(x,0));
    addOptional(p,'winstep',  0.01,  @(x)gt(x,0));
    addOptional(p,'nfilt',    26,    @(x)ge(x,1));
    addOptional(p,'lowfreq',  0,     @(x)ge(x,0));
    addOptional(p,'highfreq', fs/2,  @(x)ge(x,0));
    addOptional(p,'nfft',     512,   @(x)gt(x,0));
    addOptional(p,'preemph',  0,     @(x)ge(x,0));
    parse(p,varargin{:});
    in = p.Results;

    H = msf_filterbank(in.nfilt,fs,in.lowfreq,in.highfreq,in.nfft);
    pspec = msf_powspec(speech,fs,'winlen',in.winlen,'winstep',in.winstep,'nfft',in.nfft);
    % frames of digital silence have exactly zero power in every bin, which
    % makes the centroid 0/0 = NaN; flooring zeros at eps keeps it finite.
    pspec(pspec == 0) = eps;

    % frequency assigned to each power spectrum bin, repeated for every frame
    R = repmat(linspace(0,fs/2,in.nfft/2),size(pspec,1),1);
    % centroid = frequency-weighted filter energy divided by filter energy
    feat = ((R.*pspec)*H') ./ (pspec*H');
end
40 |
--------------------------------------------------------------------------------