├── README.md
├── feature_extractor.py
├── gfcc_extractor.py
├── mfcc_extractor.py
└── scikits.zip

/README.md:
--------------------------------------------------------------------------------
# Speech_MFCC_GFCC_Python
Computes the MFCC and GFCC features of speech for use in speech-signal feature extraction. After downloading, unzip scikits.zip into the current directory and the scripts are ready to use.
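feature_extractor.py imports its LPC routine from the bundled scikits.talkbox package, so the unzip step matters. A quick sanity check (not part of the original repo, just a suggested one-liner) that the package is visible from the current directory:

~~~python
# should import without error once scikits.zip has been unzipped into this directory
from scikits.talkbox.linpred.levinson_lpc import lpc
print(lpc)
~~~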
# Extract MFCC features -- run mfcc_extractor.py

~~~python
sr, wav_data = wavfile.read(u"clean.wav")
# arguments: waveform, sample rate, frame length, frame shift, mel channels, DCT channels, window type, include deltas
# at a 16 kHz sample rate, sr//1000*20 = 320 samples (20 ms frames) and sr//1000*10 = 160 samples (10 ms shift)
mfcc, spect = mfcc_extractor(wav_data[:32000,], sr, sr//1000*20, sr//1000*10, 52, 26, 'hanning', True)
pyplot.imshow(mfcc)
pyplot.show()
~~~

# Extract GFCC features -- run gfcc_extractor.py

~~~python
# read the speech file
sr, wav_data = wavfile.read(u"./data/clean.wav")
# arguments: xx, sr, win_len, shift_len, channel_number, win_type
# the numbers are the frame length (1024), the frame shift (512) and the number of gammatone channels (32)
cochlea = cochleagram_extractor(wav_data, sr, 1024, 512, 32, 'hanning')
# cochlea is the cochleagram, i.e. the representation before the DCT
plt.matshow(cochlea)
plt.show()
gfcc = gfcc_extractor(cochlea, 32, 16)
~~~
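# Other extractors in feature_extractor.py

Besides MFCC and GFCC, feature_extractor.py also defines log-power-spectrum, cochleagram, AMS and RASTA-PLP extractors. A minimal sketch of two of them (the file name and frame sizes are illustrative; any 16 kHz mono wav works):

~~~python
import numpy as np
from scipy.io import wavfile
from feature_extractor import log_power_spectrum_extractor, rasta_plp_extractor

sr, wav_data = wavfile.read(u"clean.wav")
x = wav_data.astype(np.float64)
win_len, shift_len = sr // 1000 * 20, sr // 1000 * 10   # 20 ms frames, 10 ms shift
spect = log_power_spectrum_extractor(x, win_len, shift_len, 'hanning', is_log=False)
plp = rasta_plp_extractor(x, sr, win_len, shift_len, plp_order=12, do_rasta=True)
print(spect.shape, plp.shape)
~~~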
--------------------------------------------------------------------------------
/feature_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from scipy.signal import lfilter, lfilter_zi, lfiltic
from scikits.talkbox.linpred.levinson_lpc import lpc


def hz2mel(f):
    return 2595. * np.log10(1. + f / 700.)


def mel2hz(z):
    return 700. * (np.power(10., z / 2595.) - 1.)


def get_window(win_len, win_type):
    if win_type == 'hanning':
        win_len += 2
        window = np.hanning(win_len)
        window = window[1: -1]
    elif win_type == 'hamming':
        win_len += 2
        window = np.hamming(win_len)
        window = window[1: -1]
    elif win_type == 'triangle':
        window = 1. - (np.abs(win_len + 1. - 2.*np.arange(0., win_len+2., 1.)) / (win_len+1.))
        window = window[1: -1]
    else:
        window = np.ones(win_len)
    return window


def get_fft_mel_mat(nfft, sr=8000, nfilts=None, width=1.0, minfrq=20, maxfrq=None, constamp=0):
    if nfilts is None:
        nfilts = nfft
    if maxfrq is None:
        maxfrq = sr // 2
    wts = np.zeros((nfilts, nfft//2+1))
    fftfrqs = np.arange(0, nfft//2+1) / (1. * nfft) * (sr)
    minmel = hz2mel(minfrq)
    maxmel = hz2mel(maxfrq)
    binfrqs = mel2hz(minmel + np.arange(0, nfilts+2) / (nfilts+1.) * (maxmel - minmel))
    # binbin = np.round(binfrqs / maxfrq * nfft)
    for i in range(nfilts):
        fs = binfrqs[[i+0, i+1, i+2]]
        fs = fs[1] + width * (fs - fs[1])
        loslope = (fftfrqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fftfrqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def mfcc_extractor(xx, sr, win_len, shift_len, mel_channel, dct_channel, win_type, include_delta):

    my_melbank = get_fft_mel_mat(win_len, sr, mel_channel)

    pre_emphasis_weight = 0.9375

    # x = xx * (1-pre_emphasis_weight)
    x = np.append(xx[0], xx[1:] - pre_emphasis_weight * xx[:-1])
    dctcoef = np.zeros((dct_channel, mel_channel), dtype=np.float32)
    for i in range(dct_channel):
        n = np.linspace(0, mel_channel-1, mel_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * mel_channel))

    # cepstral liftering weights (sinusoidal lifter, normalized to a maximum of 1)
    w = 1 + 6 * np.sin(np.pi * np.linspace(0, dct_channel-1, dct_channel) / (dct_channel-1))
    w /= w.max()
    w = np.reshape(w, newshape=(dct_channel, 1))

    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spectrum = np.zeros((win_len // 2 + 1, frames), dtype=np.float32)

    mfcc = np.zeros((dct_channel, frames), dtype=np.float32)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i * shift_len: i * shift_len + win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        spectrum[:, i] = np.power(np.abs(stft[0:win_len // 2 + 1, i]), 2)

    c1 = np.matmul(my_melbank, spectrum)
    c1 = np.where(c1 == 0.0, np.finfo(float).eps, c1)
    mfcc[:dct_channel, :] = np.multiply(np.matmul(dctcoef, np.log(c1)), np.repeat(w, frames, 1))

    if include_delta:
        dtm = np.zeros((dct_channel, frames), dtype=np.float32)
        ddtm = np.zeros((dct_channel, frames), dtype=np.float32)
        for i in range(2, frames-2):
            dtm[:, i] = 2 * mfcc[:, i+2] + mfcc[:, i+1] - mfcc[:, i-1] - 2 * mfcc[:, i-2]
        dtm /= 3.0
        for i in range(2, frames-2):
            ddtm[:, i] = 2 * dtm[:, i+2] + dtm[:, i+1] - dtm[:, i-1] - 2 * dtm[:, i-2]
        ddtm /= 3.0
        mfcc = np.row_stack((mfcc[:, 4:frames-4], dtm[:, 4:frames-4], ddtm[:, 4:frames-4]))

    return mfcc


def log_power_spectrum_extractor(x, win_len, shift_len, win_type, is_log=False):
    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spect = np.zeros((win_len // 2 + 1, frames), dtype=np.float64)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        if is_log:
            spect[:, i] = np.log(np.power(np.abs(stft[0: win_len//2+1, i]), 2.))
        else:
            spect[:, i] = np.power(np.abs(stft[0: win_len//2+1, i]), 2.)

    return spect


def stft_extractor(x, win_len, shift_len, win_type, n_fft=None):
    if n_fft is None:
        n_fft = win_len
    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((n_fft, frames), dtype=np.complex64)
    spect = np.zeros((n_fft // 2 + 1, frames), dtype=np.complex64)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, n_fft)
        spect[:, i] = stft[: n_fft//2+1, i]

    return spect


def erb_space(low_freq=50, high_freq=8000, n=64):
    ear_q = 9.26449
    min_bw = 24.7

    cf_array = -(ear_q * min_bw) + np.exp(np.linspace(1, n, n) * (-np.log(high_freq + ear_q * min_bw) + np.log(low_freq + ear_q * min_bw)) / n) \
        * (high_freq + ear_q * min_bw)
    return cf_array


def make_erb_filters(sr, num_channels, low_freq):
    t = 1. / sr
    cf = erb_space(low_freq, sr // 2, num_channels)

    ear_q = 9.26449
    min_bw = 24.7
    order = 4

    erb = np.power(np.power(cf/ear_q, order) + (min_bw ** order), 1. / order)
    b = 1.019 * 2 * np.pi * erb

    a0 = t
    a2 = 0
    b0 = 1
    b1 = -2 * np.cos(2 * cf * np.pi * t) / np.exp(b*t)
    b2 = np.exp(-2 * b * t)

    a11 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) + 2 * np.sqrt(3+2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a12 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) - 2 * np.sqrt(3+2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a13 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) + 2 * np.sqrt(3-2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a14 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) - 2 * np.sqrt(3-2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2

    p1 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) - np.sqrt(3 - 2**1.5) * np.sin(2*cf*np.pi*t)))
    p2 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) + np.sqrt(3 - 2**1.5) * np.sin(2*cf*np.pi*t)))
    p3 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) - np.sqrt(3 + 2**1.5) * np.sin(2*cf*np.pi*t)))
    p4 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) + np.sqrt(3 + 2**1.5) * np.sin(2*cf*np.pi*t)))
    p5 = np.power(-2 / np.exp(2*b*t) - 2 * np.exp(4j*cf*np.pi*t) + 2 * (1 + np.exp(4j*cf*np.pi*t)) / np.exp(b*t), 4)
    gain = np.abs(p1 * p2 * p3 * p4 / p5)

    allfilts = np.ones((np.size(cf, 0), 1), dtype=np.float32)
    fcoefs = np.column_stack((a0*allfilts, a11, a12, a13, a14, a2*allfilts, b0*allfilts, b1, b2, gain))
    return fcoefs, cf


def erb_filter_bank(x, fcoefs):
    a0 = fcoefs[:, 0]
    a11 = fcoefs[:, 1]
    a12 = fcoefs[:, 2]
    a13 = fcoefs[:, 3]
    a14 = fcoefs[:, 4]
    a2 = fcoefs[:, 5]
    b0 = fcoefs[:, 6]
    b1 = fcoefs[:, 7]
    b2 = fcoefs[:, 8]
    gain = fcoefs[:, 9]

    output = np.zeros((np.size(gain, 0), np.size(x, 0)))

    for chan in range(np.size(gain, 0)):
        y1 = lfilter(np.array([a0[chan] / gain[chan], a11[chan] / gain[chan], a2[chan] / gain[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), x)
        y2 = lfilter(np.array([a0[chan], a12[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y1)
        y3 = lfilter(np.array([a0[chan], a13[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y2)
        y4 = lfilter(np.array([a0[chan], a14[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y3)

        output[chan, :] = y4
    return output


def cochleagram_extractor_wdl(xx, sr, win_len, shift_len, channel_number, win_type):
    fcoefs, f = make_erb_filters(sr, channel_number, 50)
    fcoefs = np.flipud(fcoefs)
    xf = erb_filter_bank(xx, fcoefs)

    window = get_window(win_len, win_type)
    window = window.reshape((1, win_len))

    xe = np.power(xf, 2.0)
    frames = 1 + ((np.size(xe, 1)-win_len) // shift_len)
    cochleagram = np.zeros((channel_number, frames))
    for i in range(frames):
        one_frame = np.multiply(xe[:, i*shift_len:i*shift_len+win_len], np.repeat(window, channel_number, 0))
        cochleagram[:, i] = np.sum(one_frame, 1)
    return cochleagram


def cochleagram_extractor(xx, sr, win_len, shift_len, channel_number, win_type):
    fcoefs, f = make_erb_filters(sr, channel_number, 50)
    fcoefs = np.flipud(fcoefs)
    xf = erb_filter_bank(xx, fcoefs)

    window = get_window(win_len, win_type)
    window = window.reshape((1, win_len))

    xe = np.power(xf, 2.0)
    frames = 1 + ((np.size(xe, 1)-win_len) // shift_len)
    cochleagram = np.zeros((channel_number, frames))
    for i in range(frames):
        one_frame = np.multiply(xe[:, i*shift_len:i*shift_len+win_len], np.repeat(window, channel_number, 0))
        cochleagram[:, i] = np.sqrt(np.mean(one_frame, 1))

    cochleagram = np.where(cochleagram == 0.0, np.finfo(float).eps, cochleagram)
    cochleagram = np.power(cochleagram, 1./3)
    return cochleagram


def fft_to_cochleagram(sr, min_freq, max_freq, win_len, channel_number):
    max_len = win_len
    nfilts = channel_number
    nfft = win_len

    wts = np.zeros((nfilts, nfft // 2 + 1))
    ear_q = 9.26449
    min_bw = 24.7
    order = 1.
    cfreqs = -(ear_q * min_bw) + np.exp(np.arange(1, nfilts+1, 1) * (-np.log(max_freq+ear_q*min_bw) + np.log(min_freq + ear_q*min_bw)) / nfilts) * (max_freq + ear_q*min_bw)
    cfreqs = np.flipud(cfreqs)
    GTord = 4.
    ucirc = np.exp(2j * np.pi * np.arange(0, nfft//2+1, 1)/nfft)

    for i in range(nfilts):
        cf = cfreqs[i]
        erb = 1.0 * np.power((np.power(cf/ear_q, order) + min_bw ** order), 1.0/order)
        b = 1.019 * 2 * np.pi * erb
        r = np.exp(-b / sr)
        theta = 2 * np.pi * cf / sr
        pole = r * np.exp(1j * theta)

        t = 1. / sr

        a11 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) + 2 * np.sqrt(3 + 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a12 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) - 2 * np.sqrt(3 + 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a13 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) + 2 * np.sqrt(3 - 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a14 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) - 2 * np.sqrt(3 - 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2

        zros = -1 * np.column_stack((a11, a12, a13, a14)) / t
        p1 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) - np.sqrt(3 - 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p2 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) + np.sqrt(3 - 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p3 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) - np.sqrt(3 + 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p4 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) + np.sqrt(3 + 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p5 = np.power(
            -2 / np.exp(2 * b * t) - 2 * np.exp(4j * cf * np.pi * t) + 2 * (1 + np.exp(4j * cf * np.pi * t)) / np.exp(
                b * t), 4)
        gain = np.abs(p1 * p2 * p3 * p4 / p5)

        wts[i, :] = ((t ** 4) / gain) * np.abs(ucirc - zros[:, 0]) * np.abs(ucirc - zros[:, 1]) * \
                    np.abs(ucirc - zros[:, 2]) * np.abs(ucirc - zros[:, 3]) * \
                    np.power(np.abs((pole - ucirc) * (np.conj(pole) - ucirc)), -1*GTord)

    return wts


def freq2bark(f):
    return 7.*np.log(f/650.+np.sqrt(np.power(1.+(f/650.), 2.)))


def bark2freq(b):
    return 650.*np.sinh(b/7.)


def get_fft_bark_mat(sr, fft_len, barks, min_frq=20, max_frq=None):
    if max_frq is None:
        max_frq = sr // 2
    fft_frqs = np.arange(0, fft_len//2+1) / (1.*fft_len) * sr
    min_bark = freq2bark(min_frq)
    max_bark = freq2bark(max_frq)
    bark_bins = bark2freq(min_bark + np.arange(0, barks+2) / (barks + 1.) * (max_bark - min_bark))
    wts = np.zeros((barks, fft_len//2+1))
    for i in range(barks):
        fs = bark_bins[[i+0, i+1, i+2]]
        loslope = (fft_frqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fft_frqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def cal_triangle_window(min_freq, max_freq, nfft, window_number, low_freq, high_freq):
    fft_freq_bins = np.linspace(min_freq, max_freq, nfft)
    center_freq = np.linspace(low_freq, high_freq, window_number+2)
    wts = np.zeros(shape=(window_number, nfft))
    for i in range(window_number):
        fs = center_freq[[i+0, i+1, i+2]]
        fs = fs[1] + 1.0 * (fs - fs[1])
        loslope = (fft_freq_bins - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fft_freq_bins) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def calc_normalized_autocorrelation(x, win_len, shift_len, Tn):
    from numpy.linalg import norm
    frame_number = 1 + (len(x) - win_len) // shift_len
    A = np.zeros(shape=(win_len // Tn, frame_number))
    for i in range(frame_number):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        for t in range(1, win_len // Tn-1):
            n = np.arange(t*Tn, win_len, Tn)
            A[t, i] = np.sum(one_frame[n]*one_frame[n - t*Tn]) / (norm(one_frame[n]) * norm(one_frame[n - t*Tn]))
    return A


def calc_average_instantaneous_frequency(ac_matrix, win_duration_ms):
    frames = np.size(ac_matrix, 1)
    average_if = np.zeros(frames)
    for i in range(frames):
        zero_cross_times = np.sum(np.less(ac_matrix[:-2, i] * ac_matrix[1:-1, i], 0))
        average_if[i] = 1. / (win_duration_ms / zero_cross_times)
    return average_if


def ams_extractor(x, sr, win_len, shift_len, order=1, decimate_coef=1./4.):
    from scipy.signal import hilbert
    envelope = np.abs(hilbert(x))
    for i in range(order-1):
        envelope = np.abs(hilbert(envelope))
    envelope = envelope * decimate_coef
    frames = 1 + (len(envelope) - win_len) // shift_len
    hanning_window = np.hanning(win_len)
    ams_feature = np.zeros(shape=(15, frames))
    wts = cal_triangle_window(0, sr//2, win_len//2+1, 15, 15.6, 401)
    for i in range(frames):
        one_frame = envelope[i*shift_len:i*shift_len+win_len]
        one_frame = one_frame * hanning_window
        frame_fft = np.abs(np.fft.fft(one_frame, win_len))
        frame_fft = frame_fft[:win_len//2+1]
        ams_feature[:, i] = np.matmul(wts, frame_fft)
    return ams_feature


def unknown_feature_extractor(x, sr, win_len, shift_len, barks, inner_win, inner_shift, win_type, method_version):
    x_spectrum = stft_extractor(x, win_len, shift_len, win_type)
    coef = get_fft_bark_mat(sr, win_len, barks, 20, sr//2)
    bark_spect = np.matmul(coef, x_spectrum)
    ams = np.zeros((barks, inner_win//2+1, (bark_spect.shape[1] - inner_win)//inner_shift))
    for i in range(barks):
        channel_stft = stft_extractor(bark_spect[i, :], inner_win, inner_shift, 'hanning')
        if method_version == 'v1':
            ams[i, :, :] = 20 * np.log(np.abs(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift]))
        elif method_version == 'v2':
            channel_amplitude = np.abs(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift])
            channel_angle = np.angle(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift])
            channel_angle = channel_angle - (np.floor(channel_angle / (2.*np.pi)) * (2.*np.pi))
            ams[i, :, :] = np.power(channel_amplitude, 1./3.) * channel_angle
        else:
            ams[i, :, :] = np.abs(channel_stft)
    return ams


def rasta_filt(x):
    number = np.arange(-2., 3., 1.)
    number = -1. * number / np.sum(number*number)
    denom = np.array([1., -0.94])
    zi = lfilter_zi(number, 1)
    zi = zi.reshape(1, len(zi))
    zi = np.repeat(zi, np.size(x, 0), 0)
    # warm up the filter state on the first few frames, then RASTA-filter the full trajectories
    y, zf = lfilter(number, 1, x[:, 0:4], axis=1, zi=zi)
    y, zf = lfilter(number, denom, x, axis=1, zi=zf)
    return y


def get_equal_loudness(nfpts, fmax, fbtype=None):
    if fbtype is None:
        fbtype = 'bark'
    if fbtype == 'bark':
        bancfhz = bark2freq(np.linspace(0, freq2bark(fmax), nfpts))
    fsq = bancfhz * bancfhz
    ftmp = fsq + 1.6e5
    eql = ((fsq/ftmp)**2) * ((fsq + 1.44e6)/(fsq + 9.61e6))
    eql = eql.reshape(np.size(eql), 1)
    return eql


def postaud(x, fmax, fbtype=None):
    if fbtype is None:
        fbtype = 'bark'
    nbands = x.shape[0]
    nframes = x.shape[1]
    nfpts = nbands
    if fbtype == 'bark':
        bancfhz = bark2freq(np.linspace(0, freq2bark(fmax), nfpts))
    fsq = bancfhz * bancfhz
    ftmp = fsq + 1.6e5
    eql = ((fsq/ftmp)**2) * ((fsq + 1.44e6)/(fsq + 9.61e6))
    eql = eql.reshape(np.size(eql), 1)
    z = np.repeat(eql, nframes, axis=1) * x
    z = z ** (1./3.)
    y = np.vstack((z[1, :], z[1:nbands-1, :], z[nbands-2, :]))
    return y


def do_lpc(spec, order, axis=0, error_normal=False):
    coeff, error, k = lpc(spec, order, axis=axis)
    if error_normal:
        error = np.reshape(error, (1, len(error)))
        error = np.repeat(error, order+1, axis=axis)
        return coeff / error
    else:
        return coeff[1:, :]


def get_dct_coeff(in_channel, out_channel):
    dct_coef = np.zeros((out_channel, in_channel), dtype=np.float32)
    for i in range(out_channel):
        n = np.linspace(0, in_channel - 1, in_channel)
        dct_coef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * in_channel))
    return dct_coef


# I cannot understand it, maybe it works...
def lpc2cep(a, nout=None):
    nin = np.size(a, 0)
    ncol = np.size(a, 1)
    order = nin - 1
    if nout is None:
        nout = order + 1
    c = np.zeros((nout, ncol))
    c[0, :] = -1. * np.log(a[0, :])
    renormal_coef = np.reshape(a[0, :], (1, ncol))
    renormal_coef = np.repeat(renormal_coef, nin, axis=0)
    a = a / renormal_coef
    for n in range(1, nout):
        sumn = np.zeros(ncol)
        for m in range(1, n+1):
            sumn = sumn + (n-m) * a[m, :] * c[n-m, :]
        c[n, :] = -1. * (a[n, :] + 1. / n * sumn)
    return c


def rasta_plp_extractor(x, sr, win_len, shift_len, plp_order=0, do_rasta=True):
    spec = log_power_spectrum_extractor(x, win_len, shift_len, 'hanning', False)
    bark_filters = int(np.ceil(freq2bark(sr//2)))
    wts = get_fft_bark_mat(sr, win_len, bark_filters)
    bark_spec = np.matmul(wts, spec)
    if do_rasta:
        bark_spec = np.where(bark_spec == 0.0, np.finfo(float).eps, bark_spec)
        log_bark_spec = np.log(bark_spec)
        rasta_log_bark_spec = rasta_filt(log_bark_spec)
        bark_spec = np.exp(rasta_log_bark_spec)
    post_spec = postaud(bark_spec, sr/2.)
    # post_spec = bark_spec
    if plp_order > 0:
        lpcas = do_lpc(post_spec, plp_order)
    else:
        lpcas = post_spec
    return lpcas


def enframe_extractor(x, win_len, shift_len, win_type, delta_size=0):
    frame_num = 1 + (len(x) - win_len) // shift_len
    frames = np.zeros([win_len, frame_num], dtype=np.float32)
    window = get_window(win_len, win_type)
    for i in range(frame_num):
        frames[:, i] = x[i*shift_len: i*shift_len+win_len] * window
    if delta_size > 0:
        frames = frames[:, delta_size: -delta_size]
    return frames


def MfccGFAmsPlp_feature_extractor(xx, sr, win_len, win_shift, win_type, include_delta, arma_m=0):
    mfcc = mfcc_extractor(xx, sr, win_len, win_shift, 64, 31, win_type, False)
    cochleagram = cochleagram_extractor_wdl(xx, sr, win_len, win_shift, 64, win_type)
    cochleagram = np.power(cochleagram, 1./15.)
    ams = ams_extractor(xx, sr, win_len, win_shift)
    rasta_plp = rasta_plp_extractor(xx, sr, win_len, win_shift, plp_order=12, do_rasta=True)
    features = np.concatenate([mfcc, cochleagram, ams, rasta_plp], axis=0)
    if include_delta:
        delta_features = 2 * features[:, 4:] + features[:, 3:-1] - features[:, 1:-3] - 2 * features[:, 0:-4]
        delta_features = 1. / 3. * delta_features
        features = np.concatenate((features[:, 2:-2], delta_features), axis=0)
    if arma_m > 0:
        arma_feature = np.zeros_like(features)
        arma_feature[:, :arma_m] = features[:, :arma_m]
        for i in range(arma_m, features.shape[1]-arma_m):
            arma_feature[:, i] = features[:, i]
            for j in range(1, arma_m+1):
                arma_feature[:, i] += (arma_feature[:, i-j] + features[:, i+j])
            arma_feature[:, i] /= (2. * arma_m + 1)
        features = arma_feature[:, arma_m: -arma_m]
    return features
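
if __name__ == '__main__':
    # Minimal usage sketch of the combined extractor (added for illustration, not part of the
    # original script). It assumes a 16 kHz mono clean.wav next to this file and the bundled
    # scikits.talkbox package on the import path.
    from scipy.io import wavfile
    sr, wav_data = wavfile.read(u"clean.wav")
    feats = MfccGFAmsPlp_feature_extractor(wav_data, sr, sr // 1000 * 20, sr // 1000 * 10,
                                           'hanning', include_delta=True, arma_m=2)
    # rows: 31 MFCC + 64 cochleagram + 15 AMS + 12 RASTA-PLP coefficients plus their deltas; columns: frames
    print(feats.shape)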
--------------------------------------------------------------------------------
/gfcc_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from scipy.io import wavfile
from feature_extractor import cochleagram_extractor
from matplotlib import pyplot as plt


def gfcc_extractor(cochleagram, gf_channel, cc_channels):
    dctcoef = np.zeros((cc_channels, gf_channel))
    for i in range(cc_channels):
        n = np.linspace(0, gf_channel-1, gf_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * gf_channel))
    # debug plot of the DCT basis; every call opens a figure window
    plt.figure()
    plt.imshow(dctcoef)
    plt.show()
    return np.matmul(dctcoef, cochleagram)


if __name__ == '__main__':
    # wav_data, wav_header = read_sphere_wav(u"clean.wav")
    sr, wav_data = wavfile.read(u"clean.wav")
    sr = 16000  # clean.wav is assumed to be 16 kHz; the sample rate read from the file is overridden here
    cochlea = cochleagram_extractor(wav_data, sr, 320, 160, 64, 'hanning')
    gfcc = gfcc_extractor(cochlea, 64, 31)
    plt.figure(figsize=(10, 8))
    plt.subplot(211)
    plt.imshow(np.flipud(cochlea))
    plt.subplot(212)
    plt.imshow(np.flipud(gfcc))
    plt.show()

    plt.figure(figsize=(10, 8))
    plt.plot(gfcc[0, :])
    plt.show()
--------------------------------------------------------------------------------
/mfcc_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from scipy.io import wavfile


def hz2mel(f):
    return 2595. * np.log10(1. + f / 700.)


def mel2hz(z):
    return 700. * (np.power(10., z / 2595.) - 1.)


def get_dct_coeff(in_channel, out_channel):
    dct_coef = np.zeros((out_channel, in_channel), dtype=np.float32)
    for i in range(out_channel):
        n = np.linspace(0, in_channel - 1, in_channel)
        dct_coef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * in_channel))
    return dct_coef


def get_fft_mel_mat(nfft, sr=8000, nfilts=None, width=1.0, minfrq=20, maxfrq=None, constamp=0):
    if nfilts is None:
        nfilts = nfft
    if maxfrq is None:
        maxfrq = sr // 2

    # a1=nfft/2+1
    wts = np.zeros((nfilts, (nfft//2+1)))
    fftfrqs = np.arange(0, (nfft//2+1)) / (1. * nfft) * (sr)
    minmel = hz2mel(minfrq)
    maxmel = hz2mel(maxfrq)
    binfrqs = mel2hz(minmel + np.arange(0, nfilts+2) / (nfilts+1.) * (maxmel - minmel))
    # binbin = np.round(binfrqs / maxfrq * nfft)
    for i in range(nfilts):
        fs = binfrqs[[i+0, i+1, i+2]]
        fs = fs[1] + width * (fs - fs[1])
        loslope = (fftfrqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fftfrqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def mfcc_extractor(xx, sr, win_len, shift_len, mel_channel, dct_channel, win_type, include_delta):

    my_melbank = get_fft_mel_mat(win_len, sr, mel_channel)

    pre_emphasis_weight = 0.9375

    # x = xx * (1-pre_emphasis_weight)
    x = np.append(xx[0], xx[1:] - pre_emphasis_weight * xx[:-1])
    dctcoef = np.zeros((dct_channel, mel_channel), dtype=np.float32)
    for i in range(dct_channel):
        n = np.linspace(0, mel_channel-1, mel_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * mel_channel))

    w = 1 + 6 * np.sin(np.pi * np.linspace(0, dct_channel-1, dct_channel) / (dct_channel-1))
    w /= w.max()
    w = np.reshape(w, newshape=(dct_channel, 1))

    samples = x.shape[0]
    # frame count kept consistent with feature_extractor.py (the original dropped the leading "1 +")
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spectrum = np.zeros((win_len // 2 + 1, frames), dtype=np.float32)

    mfcc = np.zeros((dct_channel, frames), dtype=np.float32)

    if win_type == 'hanning':
        window = np.hanning(win_len)
    elif win_type == 'hamming':
        window = np.hamming(win_len)
    elif win_type == 'triangle':
        window = (1 - (np.abs(win_len - 1 - 2*np.arange(1, win_len+1, 1)) / (win_len+1)))
    else:
        window = np.ones(win_len)

    for i in range(frames):
        one_frame = x[i * shift_len: i * shift_len + win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        spectrum[:, i] = np.power(np.abs(stft[0:win_len // 2 + 1, i]), 2)

    c1 = np.matmul(my_melbank, spectrum)
    c1 = np.where(c1 == 0.0, np.finfo(float).eps, c1)
    mfcc[:dct_channel, :] = np.multiply(np.matmul(dctcoef, np.log(c1)), np.repeat(w, frames, 1))

    if include_delta:
        dtm = np.zeros((dct_channel, frames), dtype=np.float32)
        ddtm = np.zeros((dct_channel, frames), dtype=np.float32)
        for i in range(2, frames-2):
            dtm[:, i] = 2 * mfcc[:, i+2] + mfcc[:, i+1] - mfcc[:, i-1] - 2 * mfcc[:, i-2]
        dtm /= 3.0
        for i in range(2, frames-2):
            ddtm[:, i] = 2 * dtm[:, i+2] + dtm[:, i+1] - dtm[:, i-1] - 2 * dtm[:, i-2]
        ddtm /= 3.0
        mfcc = np.row_stack((mfcc[:, 4:frames-4], dtm[:, 4:frames-4], ddtm[:, 4:frames-4]))

    return mfcc, spectrum


if __name__ == '__main__':
    sr, wav_data = wavfile.read(u"clean.wav")
    # arguments: waveform, sample rate, frame length, frame shift, mel channels, DCT channels, window type, include deltas
    mfcc, spect = mfcc_extractor(wav_data[:32000, ], sr, sr//1000*20, sr//1000*10, 52, 26, 'hanning', True)
    # pyplot.subplot(211)
    # pyplot.imshow(np.log(spect))
    # pyplot.subplot(212)
    pyplot.imshow(mfcc)
    pyplot.show()
    # pyplot.subplot(311)
    # fft2mel = get_fft_mel_mat(320, 16000, 64)
    # pyplot.imshow(fft2mel)
    # plt.subplot(312)
    # plt.hold(True)
    # for i in range(24):
    #     plt.plot(fft2mel[40 + i, :])
    # pyplot.subplot(313)
    # dct_coeff = get_dct_coeff(64, 24)
    # pyplot.imshow(dct_coeff)
    # pyplot.show()
--------------------------------------------------------------------------------
/scikits.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kingback2019/Speech_MFCC_GFCC_Python/80f885b042a206a897c2734ad800dac185e95b96/scikits.zip
--------------------------------------------------------------------------------