├── README.md
├── feature_extractor.py
├── gfcc_extractor.py
├── mfcc_extractor.py
└── scikits.zip

/README.md:
--------------------------------------------------------------------------------
# Speech_MFCC_GFCC_Python
Computes the MFCC and GFCC features of speech for use in speech-signal feature extraction. After downloading, unzip scikits.zip into the current directory and the scripts are ready to use.
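feature_extractor.py imports its LPC routine from the bundled scikits.talkbox package, so the unzip step matters. A quick sanity check (not part of the original repo, just a suggested one-liner) that the package is visible from the current directory:

~~~python
# should import without error once scikits.zip has been unzipped into this directory
from scikits.talkbox.linpred.levinson_lpc import lpc
print(lpc)
~~~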
# Extract MFCC features -- run mfcc_extractor.py

~~~python
sr, wav_data = wavfile.read(u"clean.wav")
# arguments: waveform, sample rate, frame length, frame shift, mel channels, DCT channels, window type, include deltas
# at a 16 kHz sample rate, sr//1000*20 = 320 samples (20 ms frames) and sr//1000*10 = 160 samples (10 ms shift)
mfcc, spect = mfcc_extractor(wav_data[:32000,], sr, sr//1000*20, sr//1000*10, 52, 26, 'hanning', True)
pyplot.imshow(mfcc)
pyplot.show()
~~~

# Extract GFCC features -- run gfcc_extractor.py

~~~python
# read the speech file
sr, wav_data = wavfile.read(u"./data/clean.wav")
# arguments: xx, sr, win_len, shift_len, channel_number, win_type
# the numbers are the frame length (1024), the frame shift (512) and the number of gammatone channels (32)
cochlea = cochleagram_extractor(wav_data, sr, 1024, 512, 32, 'hanning')
# cochlea is the cochleagram, i.e. the representation before the DCT
plt.matshow(cochlea)
plt.show()
gfcc = gfcc_extractor(cochlea, 32, 16)
~~~
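# Other extractors in feature_extractor.py

Besides MFCC and GFCC, feature_extractor.py also defines log-power-spectrum, cochleagram, AMS and RASTA-PLP extractors. A minimal sketch of two of them (the file name and frame sizes are illustrative; any 16 kHz mono wav works):

~~~python
import numpy as np
from scipy.io import wavfile
from feature_extractor import log_power_spectrum_extractor, rasta_plp_extractor

sr, wav_data = wavfile.read(u"clean.wav")
x = wav_data.astype(np.float64)
win_len, shift_len = sr // 1000 * 20, sr // 1000 * 10   # 20 ms frames, 10 ms shift
spect = log_power_spectrum_extractor(x, win_len, shift_len, 'hanning', is_log=False)
plp = rasta_plp_extractor(x, sr, win_len, shift_len, plp_order=12, do_rasta=True)
print(spect.shape, plp.shape)
~~~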
--------------------------------------------------------------------------------
/feature_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from scipy.signal import lfilter, lfilter_zi, lfiltic
from scikits.talkbox.linpred.levinson_lpc import lpc


def hz2mel(f):
    return 2595. * np.log10(1. + f / 700.)


def mel2hz(z):
    return 700. * (np.power(10., z / 2595.) - 1.)


def get_window(win_len, win_type):
    if win_type == 'hanning':
        win_len += 2
        window = np.hanning(win_len)
        window = window[1: -1]
    elif win_type == 'hamming':
        win_len += 2
        window = np.hamming(win_len)
        window = window[1: -1]
    elif win_type == 'triangle':
        window = 1. - (np.abs(win_len + 1. - 2.*np.arange(0., win_len+2., 1.)) / (win_len+1.))
        window = window[1: -1]
    else:
        window = np.ones(win_len)
    return window


def get_fft_mel_mat(nfft, sr=8000, nfilts=None, width=1.0, minfrq=20, maxfrq=None, constamp=0):
    if nfilts is None:
        nfilts = nfft
    if maxfrq is None:
        maxfrq = sr // 2
    wts = np.zeros((nfilts, nfft//2+1))
    fftfrqs = np.arange(0, nfft//2+1) / (1. * nfft) * (sr)
    minmel = hz2mel(minfrq)
    maxmel = hz2mel(maxfrq)
    binfrqs = mel2hz(minmel + np.arange(0, nfilts+2) / (nfilts+1.) * (maxmel - minmel))
    # binbin = np.round(binfrqs / maxfrq * nfft)
    for i in range(nfilts):
        fs = binfrqs[[i+0, i+1, i+2]]
        fs = fs[1] + width * (fs - fs[1])
        loslope = (fftfrqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fftfrqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def mfcc_extractor(xx, sr, win_len, shift_len, mel_channel, dct_channel, win_type, include_delta):

    my_melbank = get_fft_mel_mat(win_len, sr, mel_channel)

    pre_emphasis_weight = 0.9375

    # x = xx * (1-pre_emphasis_weight)
    x = np.append(xx[0], xx[1:] - pre_emphasis_weight * xx[:-1])
    dctcoef = np.zeros((dct_channel, mel_channel), dtype=np.float32)
    for i in range(dct_channel):
        n = np.linspace(0, mel_channel-1, mel_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * mel_channel))

    # cepstral liftering weights (sinusoidal lifter, normalized to a maximum of 1)
    w = 1 + 6 * np.sin(np.pi * np.linspace(0, dct_channel-1, dct_channel) / (dct_channel-1))
    w /= w.max()
    w = np.reshape(w, newshape=(dct_channel, 1))

    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spectrum = np.zeros((win_len // 2 + 1, frames), dtype=np.float32)

    mfcc = np.zeros((dct_channel, frames), dtype=np.float32)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i * shift_len: i * shift_len + win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        spectrum[:, i] = np.power(np.abs(stft[0:win_len // 2 + 1, i]), 2)

    c1 = np.matmul(my_melbank, spectrum)
    c1 = np.where(c1 == 0.0, np.finfo(float).eps, c1)
    mfcc[:dct_channel, :] = np.multiply(np.matmul(dctcoef, np.log(c1)), np.repeat(w, frames, 1))

    if include_delta:
        dtm = np.zeros((dct_channel, frames), dtype=np.float32)
        ddtm = np.zeros((dct_channel, frames), dtype=np.float32)
        for i in range(2, frames-2):
            dtm[:, i] = 2 * mfcc[:, i+2] + mfcc[:, i+1] - mfcc[:, i-1] - 2 * mfcc[:, i-2]
        dtm /= 3.0
        for i in range(2, frames-2):
            ddtm[:, i] = 2 * dtm[:, i+2] + dtm[:, i+1] - dtm[:, i-1] - 2 * dtm[:, i-2]
        ddtm /= 3.0
        mfcc = np.row_stack((mfcc[:, 4:frames-4], dtm[:, 4:frames-4], ddtm[:, 4:frames-4]))

    return mfcc


def log_power_spectrum_extractor(x, win_len, shift_len, win_type, is_log=False):
    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spect = np.zeros((win_len // 2 + 1, frames), dtype=np.float64)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        if is_log:
            spect[:, i] = np.log(np.power(np.abs(stft[0: win_len//2+1, i]), 2.))
        else:
            spect[:, i] = np.power(np.abs(stft[0: win_len//2+1, i]), 2.)

    return spect


def stft_extractor(x, win_len, shift_len, win_type, n_fft=None):
    if n_fft is None:
        n_fft = win_len
    samples = x.shape[0]
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((n_fft, frames), dtype=np.complex64)
    spect = np.zeros((n_fft // 2 + 1, frames), dtype=np.complex64)

    window = get_window(win_len, win_type)

    for i in range(frames):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, n_fft)
        spect[:, i] = stft[: n_fft//2+1, i]

    return spect


def erb_space(low_freq=50, high_freq=8000, n=64):
    ear_q = 9.26449
    min_bw = 24.7

    cf_array = -(ear_q * min_bw) + np.exp(np.linspace(1, n, n) * (-np.log(high_freq + ear_q * min_bw) + np.log(low_freq + ear_q * min_bw)) / n) \
        * (high_freq + ear_q * min_bw)
    return cf_array


def make_erb_filters(sr, num_channels, low_freq):
    t = 1. / sr
    cf = erb_space(low_freq, sr // 2, num_channels)

    ear_q = 9.26449
    min_bw = 24.7
    order = 4

    erb = np.power(np.power(cf/ear_q, order) + (min_bw ** order), 1. / order)
    b = 1.019 * 2 * np.pi * erb

    a0 = t
    a2 = 0
    b0 = 1
    b1 = -2 * np.cos(2 * cf * np.pi * t) / np.exp(b*t)
    b2 = np.exp(-2 * b * t)

    a11 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) + 2 * np.sqrt(3+2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a12 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) - 2 * np.sqrt(3+2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a13 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) + 2 * np.sqrt(3-2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2
    a14 = -(2 * t * np.cos(2*cf*np.pi*t) / np.exp(b*t) - 2 * np.sqrt(3-2**1.5) * t * np.sin(2*cf*np.pi*t) / np.exp(b*t))/2

    p1 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) - np.sqrt(3 - 2**1.5) * np.sin(2*cf*np.pi*t)))
    p2 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) + np.sqrt(3 - 2**1.5) * np.sin(2*cf*np.pi*t)))
    p3 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) - np.sqrt(3 + 2**1.5) * np.sin(2*cf*np.pi*t)))
    p4 = (-2*np.exp(4j*cf*np.pi*t)*t + 2*np.exp(-(b*t) + 2j*cf*np.pi*t) * t *
          (np.cos(2*cf*np.pi*t) + np.sqrt(3 + 2**1.5) * np.sin(2*cf*np.pi*t)))
    p5 = np.power(-2 / np.exp(2*b*t) - 2 * np.exp(4j*cf*np.pi*t) + 2 * (1 + np.exp(4j*cf*np.pi*t)) / np.exp(b*t), 4)
    gain = np.abs(p1 * p2 * p3 * p4 / p5)

    allfilts = np.ones((np.size(cf, 0), 1), dtype=np.float32)
    fcoefs = np.column_stack((a0*allfilts, a11, a12, a13, a14, a2*allfilts, b0*allfilts, b1, b2, gain))
    return fcoefs, cf


def erb_filter_bank(x, fcoefs):
    a0 = fcoefs[:, 0]
    a11 = fcoefs[:, 1]
    a12 = fcoefs[:, 2]
    a13 = fcoefs[:, 3]
    a14 = fcoefs[:, 4]
    a2 = fcoefs[:, 5]
    b0 = fcoefs[:, 6]
    b1 = fcoefs[:, 7]
    b2 = fcoefs[:, 8]
    gain = fcoefs[:, 9]

    output = np.zeros((np.size(gain, 0), np.size(x, 0)))

    for chan in range(np.size(gain, 0)):
        y1 = lfilter(np.array([a0[chan] / gain[chan], a11[chan] / gain[chan], a2[chan] / gain[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), x)
        y2 = lfilter(np.array([a0[chan], a12[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y1)
        y3 = lfilter(np.array([a0[chan], a13[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y2)
        y4 = lfilter(np.array([a0[chan], a14[chan], a2[chan]]),
                     np.array([b0[chan], b1[chan], b2[chan]]), y3)

        output[chan, :] = y4
    return output


def cochleagram_extractor_wdl(xx, sr, win_len, shift_len, channel_number, win_type):
    fcoefs, f = make_erb_filters(sr, channel_number, 50)
    fcoefs = np.flipud(fcoefs)
    xf = erb_filter_bank(xx, fcoefs)

    window = get_window(win_len, win_type)
    window = window.reshape((1, win_len))

    xe = np.power(xf, 2.0)
    frames = 1 + ((np.size(xe, 1)-win_len) // shift_len)
    cochleagram = np.zeros((channel_number, frames))
    for i in range(frames):
        one_frame = np.multiply(xe[:, i*shift_len:i*shift_len+win_len], np.repeat(window, channel_number, 0))
        cochleagram[:, i] = np.sum(one_frame, 1)
    return cochleagram


def cochleagram_extractor(xx, sr, win_len, shift_len, channel_number, win_type):
    fcoefs, f = make_erb_filters(sr, channel_number, 50)
    fcoefs = np.flipud(fcoefs)
    xf = erb_filter_bank(xx, fcoefs)

    window = get_window(win_len, win_type)
    window = window.reshape((1, win_len))

    xe = np.power(xf, 2.0)
    frames = 1 + ((np.size(xe, 1)-win_len) // shift_len)
    cochleagram = np.zeros((channel_number, frames))
    for i in range(frames):
        one_frame = np.multiply(xe[:, i*shift_len:i*shift_len+win_len], np.repeat(window, channel_number, 0))
        cochleagram[:, i] = np.sqrt(np.mean(one_frame, 1))

    cochleagram = np.where(cochleagram == 0.0, np.finfo(float).eps, cochleagram)
    cochleagram = np.power(cochleagram, 1./3)
    return cochleagram


def fft_to_cochleagram(sr, min_freq, max_freq, win_len, channel_number):
    max_len = win_len
    nfilts = channel_number
    nfft = win_len

    wts = np.zeros((nfilts, nfft // 2 + 1))
    ear_q = 9.26449
    min_bw = 24.7
    order = 1.
    cfreqs = -(ear_q * min_bw) + np.exp(np.arange(1, nfilts+1, 1) * (-np.log(max_freq+ear_q*min_bw) + np.log(min_freq + ear_q*min_bw)) / nfilts) * (max_freq + ear_q*min_bw)
    cfreqs = np.flipud(cfreqs)
    GTord = 4.
    ucirc = np.exp(2j * np.pi * np.arange(0, nfft//2+1, 1)/nfft)

    for i in range(nfilts):
        cf = cfreqs[i]
        erb = 1.0 * np.power((np.power(cf/ear_q, order) + min_bw ** order), 1.0/order)
        b = 1.019 * 2 * np.pi * erb
        r = np.exp(-b / sr)
        theta = 2 * np.pi * cf / sr
        pole = r * np.exp(1j * theta)

        t = 1. / sr

        a11 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) + 2 * np.sqrt(3 + 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a12 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) - 2 * np.sqrt(3 + 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a13 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) + 2 * np.sqrt(3 - 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2
        a14 = -(2 * t * np.cos(2 * cf * np.pi * t) / np.exp(b * t) - 2 * np.sqrt(3 - 2 ** 1.5) * t * np.sin(
            2 * cf * np.pi * t) / np.exp(b * t)) / 2

        zros = -1 * np.column_stack((a11, a12, a13, a14)) / t
        p1 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) - np.sqrt(3 - 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p2 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) + np.sqrt(3 - 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p3 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) - np.sqrt(3 + 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p4 = (-2 * np.exp(4j * cf * np.pi * t) * t + 2 * np.exp(-(b * t) + 2j * cf * np.pi * t) * t *
              (np.cos(2 * cf * np.pi * t) + np.sqrt(3 + 2 ** 1.5) * np.sin(2 * cf * np.pi * t)))
        p5 = np.power(
            -2 / np.exp(2 * b * t) - 2 * np.exp(4j * cf * np.pi * t) + 2 * (1 + np.exp(4j * cf * np.pi * t)) / np.exp(
                b * t), 4)
        gain = np.abs(p1 * p2 * p3 * p4 / p5)

        wts[i, :] = ((t ** 4) / gain) * np.abs(ucirc - zros[:, 0]) * np.abs(ucirc - zros[:, 1]) * \
                    np.abs(ucirc - zros[:, 2]) * np.abs(ucirc - zros[:, 3]) * \
                    np.power(np.abs((pole - ucirc) * (np.conj(pole) - ucirc)), -1*GTord)

    return wts


def freq2bark(f):
    return 7.*np.log(f/650.+np.sqrt(np.power(1.+(f/650.), 2.)))


def bark2freq(b):
    return 650.*np.sinh(b/7.)


def get_fft_bark_mat(sr, fft_len, barks, min_frq=20, max_frq=None):
    if max_frq is None:
        max_frq = sr // 2
    fft_frqs = np.arange(0, fft_len//2+1) / (1.*fft_len) * sr
    min_bark = freq2bark(min_frq)
    max_bark = freq2bark(max_frq)
    bark_bins = bark2freq(min_bark + np.arange(0, barks+2) / (barks + 1.) * (max_bark - min_bark))
    wts = np.zeros((barks, fft_len//2+1))
    for i in range(barks):
        fs = bark_bins[[i+0, i+1, i+2]]
        loslope = (fft_frqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fft_frqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def cal_triangle_window(min_freq, max_freq, nfft, window_number, low_freq, high_freq):
    fft_freq_bins = np.linspace(min_freq, max_freq, nfft)
    center_freq = np.linspace(low_freq, high_freq, window_number+2)
    wts = np.zeros(shape=(window_number, nfft))
    for i in range(window_number):
        fs = center_freq[[i+0, i+1, i+2]]
        fs = fs[1] + 1.0 * (fs - fs[1])
        loslope = (fft_freq_bins - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fft_freq_bins) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def calc_normalized_autocorrelation(x, win_len, shift_len, Tn):
    from numpy.linalg import norm
    frame_number = 1 + (len(x) - win_len) // shift_len
    A = np.zeros(shape=(win_len // Tn, frame_number))
    for i in range(frame_number):
        one_frame = x[i*shift_len: i*shift_len+win_len]
        for t in range(1, win_len // Tn-1):
            n = np.arange(t*Tn, win_len, Tn)
            A[t, i] = np.sum(one_frame[n]*one_frame[n - t*Tn]) / (norm(one_frame[n]) * norm(one_frame[n - t*Tn]))
    return A


def calc_average_instantaneous_frequency(ac_matrix, win_duration_ms):
    frames = np.size(ac_matrix, 1)
    average_if = np.zeros(frames)
    for i in range(frames):
        zero_cross_times = np.sum(np.less(ac_matrix[:-2, i] * ac_matrix[1:-1, i], 0))
        average_if[i] = 1. / (win_duration_ms / zero_cross_times)
    return average_if


def ams_extractor(x, sr, win_len, shift_len, order=1, decimate_coef=1./4.):
    from scipy.signal import hilbert
    envelope = np.abs(hilbert(x))
    for i in range(order-1):
        envelope = np.abs(hilbert(envelope))
    envelope = envelope * decimate_coef
    frames = 1 + (len(envelope) - win_len) // shift_len
    hanning_window = np.hanning(win_len)
    ams_feature = np.zeros(shape=(15, frames))
    wts = cal_triangle_window(0, sr//2, win_len//2+1, 15, 15.6, 401)
    for i in range(frames):
        one_frame = envelope[i*shift_len:i*shift_len+win_len]
        one_frame = one_frame * hanning_window
        frame_fft = np.abs(np.fft.fft(one_frame, win_len))
        frame_fft = frame_fft[:win_len//2+1]
        ams_feature[:, i] = np.matmul(wts, frame_fft)
    return ams_feature


def unknown_feature_extractor(x, sr, win_len, shift_len, barks, inner_win, inner_shift, win_type, method_version):
    x_spectrum = stft_extractor(x, win_len, shift_len, win_type)
    coef = get_fft_bark_mat(sr, win_len, barks, 20, sr//2)
    bark_spect = np.matmul(coef, x_spectrum)
    ams = np.zeros((barks, inner_win//2+1, (bark_spect.shape[1] - inner_win)//inner_shift))
    for i in range(barks):
        channel_stft = stft_extractor(bark_spect[i, :], inner_win, inner_shift, 'hanning')
        if method_version == 'v1':
            ams[i, :, :] = 20 * np.log(np.abs(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift]))
        elif method_version == 'v2':
            channel_amplitude = np.abs(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift])
            channel_angle = np.angle(channel_stft[:inner_win//2+1, :(bark_spect.shape[1] - inner_win)//inner_shift])
            channel_angle = channel_angle - (np.floor(channel_angle / (2.*np.pi)) * (2.*np.pi))
            ams[i, :, :] = np.power(channel_amplitude, 1./3.) * channel_angle
        else:
            ams[i, :, :] = np.abs(channel_stft)
    return ams


def rasta_filt(x):
    number = np.arange(-2., 3., 1.)
    number = -1. * number / np.sum(number*number)
    denom = np.array([1., -0.94])
    zi = lfilter_zi(number, 1)
    zi = zi.reshape(1, len(zi))
    zi = np.repeat(zi, np.size(x, 0), 0)
    # warm up the filter state on the first few frames, then RASTA-filter the full trajectories
    y, zf = lfilter(number, 1, x[:, 0:4], axis=1, zi=zi)
    y, zf = lfilter(number, denom, x, axis=1, zi=zf)
    return y


def get_equal_loudness(nfpts, fmax, fbtype=None):
    if fbtype is None:
        fbtype = 'bark'
    if fbtype == 'bark':
        bancfhz = bark2freq(np.linspace(0, freq2bark(fmax), nfpts))
    fsq = bancfhz * bancfhz
    ftmp = fsq + 1.6e5
    eql = ((fsq/ftmp)**2) * ((fsq + 1.44e6)/(fsq + 9.61e6))
    eql = eql.reshape(np.size(eql), 1)
    return eql


def postaud(x, fmax, fbtype=None):
    if fbtype is None:
        fbtype = 'bark'
    nbands = x.shape[0]
    nframes = x.shape[1]
    nfpts = nbands
    if fbtype == 'bark':
        bancfhz = bark2freq(np.linspace(0, freq2bark(fmax), nfpts))
    fsq = bancfhz * bancfhz
    ftmp = fsq + 1.6e5
    eql = ((fsq/ftmp)**2) * ((fsq + 1.44e6)/(fsq + 9.61e6))
    eql = eql.reshape(np.size(eql), 1)
    z = np.repeat(eql, nframes, axis=1) * x
    z = z ** (1./3.)
    y = np.vstack((z[1, :], z[1:nbands-1, :], z[nbands-2, :]))
    return y


def do_lpc(spec, order, axis=0, error_normal=False):
    coeff, error, k = lpc(spec, order, axis=axis)
    if error_normal:
        error = np.reshape(error, (1, len(error)))
        error = np.repeat(error, order+1, axis=axis)
        return coeff / error
    else:
        return coeff[1:, :]


def get_dct_coeff(in_channel, out_channel):
    dct_coef = np.zeros((out_channel, in_channel), dtype=np.float32)
    for i in range(out_channel):
        n = np.linspace(0, in_channel - 1, in_channel)
        dct_coef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * in_channel))
    return dct_coef


# I cannot understand it, maybe it works...
def lpc2cep(a, nout=None):
    nin = np.size(a, 0)
    ncol = np.size(a, 1)
    order = nin - 1
    if nout is None:
        nout = order + 1
    c = np.zeros((nout, ncol))
    c[0, :] = -1. * np.log(a[0, :])
    renormal_coef = np.reshape(a[0, :], (1, ncol))
    renormal_coef = np.repeat(renormal_coef, nin, axis=0)
    a = a / renormal_coef
    for n in range(1, nout):
        sumn = np.zeros(ncol)
        for m in range(1, n+1):
            sumn = sumn + (n-m) * a[m, :] * c[n-m, :]
        c[n, :] = -1. * (a[n, :] + 1. / n * sumn)
    return c


def rasta_plp_extractor(x, sr, win_len, shift_len, plp_order=0, do_rasta=True):
    spec = log_power_spectrum_extractor(x, win_len, shift_len, 'hanning', False)
    bark_filters = int(np.ceil(freq2bark(sr//2)))
    wts = get_fft_bark_mat(sr, win_len, bark_filters)
    bark_spec = np.matmul(wts, spec)
    if do_rasta:
        bark_spec = np.where(bark_spec == 0.0, np.finfo(float).eps, bark_spec)
        log_bark_spec = np.log(bark_spec)
        rasta_log_bark_spec = rasta_filt(log_bark_spec)
        bark_spec = np.exp(rasta_log_bark_spec)
    post_spec = postaud(bark_spec, sr/2.)
    # post_spec = bark_spec
    if plp_order > 0:
        lpcas = do_lpc(post_spec, plp_order)
    else:
        lpcas = post_spec
    return lpcas


def enframe_extractor(x, win_len, shift_len, win_type, delta_size=0):
    frame_num = 1 + (len(x) - win_len) // shift_len
    frames = np.zeros([win_len, frame_num], dtype=np.float32)
    window = get_window(win_len, win_type)
    for i in range(frame_num):
        frames[:, i] = x[i*shift_len: i*shift_len+win_len] * window
    if delta_size > 0:
        frames = frames[:, delta_size: -delta_size]
    return frames


def MfccGFAmsPlp_feature_extractor(xx, sr, win_len, win_shift, win_type, include_delta, arma_m=0):
    mfcc = mfcc_extractor(xx, sr, win_len, win_shift, 64, 31, win_type, False)
    cochleagram = cochleagram_extractor_wdl(xx, sr, win_len, win_shift, 64, win_type)
    cochleagram = np.power(cochleagram, 1./15.)
    ams = ams_extractor(xx, sr, win_len, win_shift)
    rasta_plp = rasta_plp_extractor(xx, sr, win_len, win_shift, plp_order=12, do_rasta=True)
    features = np.concatenate([mfcc, cochleagram, ams, rasta_plp], axis=0)
    if include_delta:
        delta_features = 2 * features[:, 4:] + features[:, 3:-1] - features[:, 1:-3] - 2 * features[:, 0:-4]
        delta_features = 1. / 3. * delta_features
        features = np.concatenate((features[:, 2:-2], delta_features), axis=0)
    if arma_m > 0:
        arma_feature = np.zeros_like(features)
        arma_feature[:, :arma_m] = features[:, :arma_m]
        for i in range(arma_m, features.shape[1]-arma_m):
            arma_feature[:, i] = features[:, i]
            for j in range(1, arma_m+1):
                arma_feature[:, i] += (arma_feature[:, i-j] + features[:, i+j])
            arma_feature[:, i] /= (2. * arma_m + 1)
        features = arma_feature[:, arma_m: -arma_m]
    return features
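
if __name__ == '__main__':
    # Minimal usage sketch of the combined extractor (added for illustration, not part of the
    # original script). It assumes a 16 kHz mono clean.wav next to this file and the bundled
    # scikits.talkbox package on the import path.
    from scipy.io import wavfile
    sr, wav_data = wavfile.read(u"clean.wav")
    feats = MfccGFAmsPlp_feature_extractor(wav_data, sr, sr // 1000 * 20, sr // 1000 * 10,
                                           'hanning', include_delta=True, arma_m=2)
    # rows: 31 MFCC + 64 cochleagram + 15 AMS + 12 RASTA-PLP coefficients plus their deltas; columns: frames
    print(feats.shape)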
--------------------------------------------------------------------------------
/gfcc_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from scipy.io import wavfile
from feature_extractor import cochleagram_extractor
from matplotlib import pyplot as plt


def gfcc_extractor(cochleagram, gf_channel, cc_channels):
    dctcoef = np.zeros((cc_channels, gf_channel))
    for i in range(cc_channels):
        n = np.linspace(0, gf_channel-1, gf_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * gf_channel))
    # debug plot of the DCT basis; every call opens a figure window
    plt.figure()
    plt.imshow(dctcoef)
    plt.show()
    return np.matmul(dctcoef, cochleagram)


if __name__ == '__main__':
    # wav_data, wav_header = read_sphere_wav(u"clean.wav")
    sr, wav_data = wavfile.read(u"clean.wav")
    sr = 16000  # clean.wav is assumed to be 16 kHz; the sample rate read from the file is overridden here
    cochlea = cochleagram_extractor(wav_data, sr, 320, 160, 64, 'hanning')
    gfcc = gfcc_extractor(cochlea, 64, 31)
    plt.figure(figsize=(10, 8))
    plt.subplot(211)
    plt.imshow(np.flipud(cochlea))
    plt.subplot(212)
    plt.imshow(np.flipud(gfcc))
    plt.show()

    plt.figure(figsize=(10, 8))
    plt.plot(gfcc[0, :])
    plt.show()
--------------------------------------------------------------------------------
/mfcc_extractor.py:
--------------------------------------------------------------------------------
# coding: utf-8
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from scipy.io import wavfile


def hz2mel(f):
    return 2595. * np.log10(1. + f / 700.)


def mel2hz(z):
    return 700. * (np.power(10., z / 2595.) - 1.)


def get_dct_coeff(in_channel, out_channel):
    dct_coef = np.zeros((out_channel, in_channel), dtype=np.float32)
    for i in range(out_channel):
        n = np.linspace(0, in_channel - 1, in_channel)
        dct_coef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * in_channel))
    return dct_coef


def get_fft_mel_mat(nfft, sr=8000, nfilts=None, width=1.0, minfrq=20, maxfrq=None, constamp=0):
    if nfilts is None:
        nfilts = nfft
    if maxfrq is None:
        maxfrq = sr // 2

    # a1=nfft/2+1
    wts = np.zeros((nfilts, (nfft//2+1)))
    fftfrqs = np.arange(0, (nfft//2+1)) / (1. * nfft) * (sr)
    minmel = hz2mel(minfrq)
    maxmel = hz2mel(maxfrq)
    binfrqs = mel2hz(minmel + np.arange(0, nfilts+2) / (nfilts+1.) * (maxmel - minmel))
    # binbin = np.round(binfrqs / maxfrq * nfft)
    for i in range(nfilts):
        fs = binfrqs[[i+0, i+1, i+2]]
        fs = fs[1] + width * (fs - fs[1])
        loslope = (fftfrqs - fs[0]) / (fs[1] - fs[0])
        hislope = (fs[2] - fftfrqs) / (fs[2] - fs[1])
        wts[i, :] = np.maximum(0, np.minimum(loslope, hislope))
    return wts


def mfcc_extractor(xx, sr, win_len, shift_len, mel_channel, dct_channel, win_type, include_delta):

    my_melbank = get_fft_mel_mat(win_len, sr, mel_channel)

    pre_emphasis_weight = 0.9375

    # x = xx * (1-pre_emphasis_weight)
    x = np.append(xx[0], xx[1:] - pre_emphasis_weight * xx[:-1])
    dctcoef = np.zeros((dct_channel, mel_channel), dtype=np.float32)
    for i in range(dct_channel):
        n = np.linspace(0, mel_channel-1, mel_channel)
        dctcoef[i, :] = np.cos((2 * n + 1) * i * np.pi / (2 * mel_channel))

    w = 1 + 6 * np.sin(np.pi * np.linspace(0, dct_channel-1, dct_channel) / (dct_channel-1))
    w /= w.max()
    w = np.reshape(w, newshape=(dct_channel, 1))

    samples = x.shape[0]
    # frame count kept consistent with feature_extractor.py (the original dropped the leading "1 +")
    frames = 1 + (samples - win_len) // shift_len
    stft = np.zeros((win_len, frames), dtype=np.complex64)
    spectrum = np.zeros((win_len // 2 + 1, frames), dtype=np.float32)

    mfcc = np.zeros((dct_channel, frames), dtype=np.float32)

    if win_type == 'hanning':
        window = np.hanning(win_len)
    elif win_type == 'hamming':
        window = np.hamming(win_len)
    elif win_type == 'triangle':
        window = (1 - (np.abs(win_len - 1 - 2*np.arange(1, win_len+1, 1)) / (win_len+1)))
    else:
        window = np.ones(win_len)

    for i in range(frames):
        one_frame = x[i * shift_len: i * shift_len + win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft[:, i] = np.fft.fft(windowed_frame, win_len)
        spectrum[:, i] = np.power(np.abs(stft[0:win_len // 2 + 1, i]), 2)

    c1 = np.matmul(my_melbank, spectrum)
    c1 = np.where(c1 == 0.0, np.finfo(float).eps, c1)
    mfcc[:dct_channel, :] = np.multiply(np.matmul(dctcoef, np.log(c1)), np.repeat(w, frames, 1))

    if include_delta:
        dtm = np.zeros((dct_channel, frames), dtype=np.float32)
        ddtm = np.zeros((dct_channel, frames), dtype=np.float32)
        for i in range(2, frames-2):
            dtm[:, i] = 2 * mfcc[:, i+2] + mfcc[:, i+1] - mfcc[:, i-1] - 2 * mfcc[:, i-2]
        dtm /= 3.0
        for i in range(2, frames-2):
            ddtm[:, i] = 2 * dtm[:, i+2] + dtm[:, i+1] - dtm[:, i-1] - 2 * dtm[:, i-2]
        ddtm /= 3.0
        mfcc = np.row_stack((mfcc[:, 4:frames-4], dtm[:, 4:frames-4], ddtm[:, 4:frames-4]))

    return mfcc, spectrum


if __name__ == '__main__':
    sr, wav_data = wavfile.read(u"clean.wav")
    # arguments: waveform, sample rate, frame length, frame shift, mel channels, DCT channels, window type, include deltas
    mfcc, spect = mfcc_extractor(wav_data[:32000, ], sr, sr//1000*20, sr//1000*10, 52, 26, 'hanning', True)
    # pyplot.subplot(211)
    # pyplot.imshow(np.log(spect))
    # pyplot.subplot(212)
    pyplot.imshow(mfcc)
    pyplot.show()
    # pyplot.subplot(311)
    # fft2mel = get_fft_mel_mat(320, 16000, 64)
    # pyplot.imshow(fft2mel)
    # plt.subplot(312)
    # plt.hold(True)
    # for i in range(24):
    #     plt.plot(fft2mel[40 + i, :])
    # pyplot.subplot(313)
    # dct_coeff = get_dct_coeff(64, 24)
    # pyplot.imshow(dct_coeff)
    # pyplot.show()
--------------------------------------------------------------------------------
/scikits.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kingback2019/Speech_MFCC_GFCC_Python/80f885b042a206a897c2734ad800dac185e95b96/scikits.zip
--------------------------------------------------------------------------------