├── GMM
│   └── scripts
│       ├── concat_sentence.py
│       ├── convert_bitrate.py
│       ├── extra_mfcc.py
│       ├── extra_mfcc2.py
│       ├── extra_mfcc_multiprocess.py
│       ├── gmm_achieve.py
│       ├── gmm_achieve_2.py
│       ├── gmm_model.py
│       ├── gmm_model_2.py
│       ├── gmm_model_3.py
│       ├── gmm_timit.py
│       ├── mkdir_script.py
│       ├── mute_remove.py
│       └── timit_deal
│           ├── concat_timit_sentence.py
│           └── timit_data_deal.py
├── README.md
└── self-attention
    └── self-attention_speaker_rec.ipynb

/GMM/scripts/concat_sentence.py:
--------------------------------------------------------------------------------
import glob
import os
import numpy as np
import scipy.io.wavfile as wav


# Merge all sentence*.wav files in a folder into one audio file.
def merge_files(path_read_folder, path_write_wav_file):
    # files = os.listdir(path_read_folder)
    merged_signal = []
    for filename in glob.glob(os.path.join(path_read_folder, 'sentence*.wav')):
        print(filename)
        sr, signal = wav.read(filename)
        merged_signal.append(signal)
    # print(len(merged_signal))
    print(merged_signal[0].shape, merged_signal[1].shape)
    merged_signal = np.hstack(merged_signal)
    merged_signal = np.asarray(merged_signal, dtype=np.int16)
    wav.write(path_write_wav_file, sr, merged_signal)


# Merge the recordings of every female and male speaker.
female_siri_path = '../speech/female/female_'
male_siri_path = '../speech/male/male_'
for i in range(9):
    path_read_folder = female_siri_path + str(i + 1)
    path_write_wav_file = path_read_folder + "/merge_result.wav"
    merge_files(path_read_folder, path_write_wav_file)
for i in range(6):
    path_read_folder = male_siri_path + str(i + 1)
    path_write_wav_file = path_read_folder + "/merge_result.wav"
    merge_files(path_read_folder, path_write_wav_file)

--------------------------------------------------------------------------------
/GMM/scripts/convert_bitrate.py:
--------------------------------------------------------------------------------
'''
Convert audio files to 16 kHz mono WAV with ffmpeg.
'''

import os
import subprocess


def mkdirMale():
    input_path = '../speech/male/male_'
    for i in range(6):
        output_path = input_path + str(i + 1)
        os.makedirs(output_path)


def convertBitrate():
    input_path_list = []
    output_path_list = []
    for i in range(6):
        input_path = '../speech/male/out_' + str(i + 1) + '/'
        output_path = '../speech/male/male_' + str(i + 1) + '/'
        input_path_list.append(input_path)
        output_path_list.append(output_path)
    for i in range(6):
        for file in os.listdir(input_path_list[i]):
            input_file = input_path_list[i] + file
            output_file = output_path_list[i] + file[:-4] + '.wav'
            # -ar 16000: resample to 16 kHz; -ac 1: downmix to mono
            cmd = "ffmpeg -i " + input_file + " -ar 16000 -ac 1 " + output_file
            subprocess.call(cmd, shell=True)


if __name__ == "__main__":
    # mkdirMale()
    convertBitrate()

--------------------------------------------------------------------------------
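Building the ffmpeg command as one string with `shell=True` breaks when a path contains spaces. A minimal alternative sketch (same flags as above, argument-list form, no shell involved):

```python
import subprocess

def convert_file(input_file, output_file):
    # -ar 16000 resamples to 16 kHz, -ac 1 downmixes to mono
    subprocess.call(['ffmpeg', '-i', input_file, '-ar', '16000', '-ac', '1', output_file])
```
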
/GMM/scripts/extra_mfcc.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))  # the padded signal
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    # print(params)
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        # print(mfccs.shape)
        for a in mfccs:
            # print(a.shape)
            tmpList.append(a[0])  # keep the first MFCC column for this frame
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)
    np.save(savename, data)


def main():
    target_dir = '../speech/TIMIT/TEST/'
    drs = os.listdir(target_dir)
    i = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        # print(len(samples))
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                print(filename)
                spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1)
                save_name = spk_dir + '/spk_' + str(i + 1) + '_mfcc.npy'
                # print(save_name)
                # extraMFCC(filename, save_name)
                i += 1
                # if os.path.exists(spk_dir):
                #     pass
                # else:
                #     os.makedirs(spk_dir)
                # print(spk_dir)
                # print(filename)
                # s_file = filename.split('\\')
                # save_name = target_dir + filename[25:28] + '/' + s_file[1] + '/' + "merge_result.wav"


# Earlier main() that extracted features for the private female/male dataset:
# def main():
#     start = time.perf_counter()
#     female_siri_path = '../speech/female/female_'
#     male_siri_path = '../speech/male/male_'
#     GMMs = []
#     # female
#     for i in range(9):
#         train_female_siri_file = female_siri_path + str(i+1) + '/merge_result.wav'
#         save_name = '../mfcc_features/female_' + str(i+1) + '.npy'
#         extraMFCC(train_female_siri_file, save_name)
#     # male
#     for i in range(6):
#         train_male_siri_file = male_siri_path + str(i+1) + '/merge_result.wav'
#         save_name = '../mfcc_features/male_' + str(i+1) + '.npy'
#         extraMFCC(train_male_siri_file, save_name)
#     timePointAfterGmm = time.perf_counter()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
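A quick check of the framing arithmetic in `enframe`: 1 s of 16 kHz audio gives `wlen = 16000`, and with `nw = 320`, `inc = 160` we get `nf = ceil((16000 - 320 + 160) / 160) = 99` frames. A minimal sketch, assuming `enframe` from the file above is in scope:

```python
from scipy import signal
import numpy as np

frames = enframe(np.zeros(16000), 320, 160, signal.hann(320))
print(frames.shape)  # (99, 320)
```
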
/GMM/scripts/extra_mfcc2.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob
import multiprocessing


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)
    np.save(savename, data)


def extra_train_MFCC():
    target_dir = '../speech/TIMIT/TRAIN/'
    drs = os.listdir(target_dir)
    i = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            print(samples_path)
            # for filename in glob.glob(os.path.join(samples_path, '*.wav')):
            #     s_file = filename.split('\\')
            #     spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i+1)
            #     save_name = spk_dir + '/' + s_file[2][:-4] + 'mfcc.npy'
            #     extraMFCC(filename, save_name)
            #     print(save_name)
            #     i += 1

    # target_dir = '../speech/TIMIT/TEST/'
    # drs = os.listdir(target_dir)
    # i = 0
    # for dr in drs:
    #     # DR*
    #     drs_path = os.path.join(target_dir, dr)
    #     samples = os.listdir(drs_path)
    #     for sample in samples:
    #         samples_path = os.path.join(drs_path, sample)
    #         for filename in glob.glob(os.path.join(samples_path, '*.wav')):
    #             s_file = filename.split('\\')
    #             spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i+1)
    #             save_name = spk_dir + '/' + s_file[2][:-4] + 'mfcc.npy'
    #             extraMFCC(filename, save_name)
    #             print(save_name)
    #             i += 1


if __name__ == '__main__':
    extra_mfcc = [multiprocessing.Process(target=extra_train_MFCC)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()

--------------------------------------------------------------------------------
/GMM/scripts/extra_mfcc_multiprocess.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob
import multiprocessing


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename, mfcc_num=13):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=mfcc_num)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, mfcc_num)
    print(data.shape)
    np.save(savename, data)


def extra_train_MFCC(start, end, spk_num):
    target_dir = '../speech/TIMIT/TRAIN/'
    drs = os.listdir(target_dir)
    print(drs)

    for dr in drs[start:end]:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            filename = os.path.join(samples_path, 'merge_result.wav')
            spk_dir = '../speech/TIMIT/TRAIN_MFCC/spk_' + str(spk_num + 1)
            save_name = spk_dir + '/spk_' + str(spk_num + 1) + '_13d_mfcc.npy'
            print(save_name)
            extraMFCC(filename, save_name, 13)
            spk_num += 1


def extra_test_MFCC(start, end, spk_num):
    target_dir = '../speech/TIMIT/TEST/'
    drs = os.listdir(target_dir)

    for dr in drs[start:end]:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(spk_num + 1)
                save_name = spk_dir + '/spk_' + str(spk_num + 1) + '_13d_mfcc.npy'
                extraMFCC(filename, save_name)
                print(save_name)
            spk_num += 1


def train_multiprocess():
    # Each process handles two DR* dialect folders; the second argument of each
    # args tuple is the speaker-number offset for that batch.
    extra_mfcc = [multiprocessing.Process(target=extra_train_MFCC, args=(0, 2, 0,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(2, 4, 114,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(4, 6, 258,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(6, 8, 363),)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()


def test_multiprocess():
    extra_mfcc = [multiprocessing.Process(target=extra_test_MFCC, args=(0, 2, 0,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(2, 4, 114,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(4, 6, 258,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(6, 8, 363),)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()


if __name__ == '__main__':
    train_multiprocess()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_achieve.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse


# Ellipse plotting adapted from
# https://github.com/SJinping/Gaussian-ellipse/blob/master/gaussian_%20ellipse.py
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.
    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
    A matplotlib ellipse artist
    """

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    ax.add_artist(ellip)
    return ellip


def plot(data, mu, covariance, class_label):
    plt.scatter(data[:, 0], data[:, 1], c=class_label)
    n_components = len(mu)
    for j in range(n_components):
        plot_cov_ellipse(covariance[j], mu[j])
    plt.show()


class GaussianMixtureModel:

    def __init__(self, n_components, maxIter=1e4, eps=1e-9):
        self.n_components = n_components
        self.class_prior = np.ones(n_components) * 1 / n_components
        self.mu = None
        self.covariance = None
        self.W = None
        self.pdfs = None
        self.eps = eps
        self.maxIter = maxIter

    def __initParameters(self, X):
        '''
        Initialize the model parameters mu, covariance and class_prior:
        draw random means inside the bounding box of the data, assign each
        sample to its nearest mean, and estimate per-cluster priors and
        covariances from that hard assignment.
        '''
        m, n = X.shape
        self.W = np.random.random((m, self.n_components))
        self.mu = np.random.random((self.n_components, n))
        minCol = np.min(X, axis=0)
        maxCol = np.max(X, axis=0)
        self.mu = minCol + self.mu * (maxCol - minCol)
        self.covariance = np.zeros((self.n_components, n, n))
        # squared Euclidean distance from every sample to every mean
        dist = np.tile(np.sum(X * X, axis=1).reshape((m, 1)), (1, self.n_components)) + np.tile(
            np.sum(self.mu * self.mu, axis=1).T,
            (m, 1)) - 2 * np.dot(X, self.mu.T)
        self.pdfs = np.zeros((m, self.n_components))
        labels = np.argmin(dist, axis=1)
        for i in range(self.n_components):
            clusterX = X[labels == i, :]
            self.class_prior[i] = clusterX.shape[0] / m
            self.covariance[i, :, :] = np.cov(clusterX.T)
    def train(self, X):
        '''
        Fit the parameters with the EM algorithm. Iteration stops when
        (1) the iteration count reaches the cap, or (2) the change of the
        log-likelihood falls below the threshold eps.
        '''
        self.__initParameters(X)
        num = 0
        preLogLikelihood = self.__logLikelihood(X)
        while num < self.maxIter:
            self.__expectation(X)
            self.__maximize(X)
            # plot(X, self.mu, self.covariance, y)
            num += 1
            logLikelihood = self.__logLikelihood(X)
            if abs(logLikelihood - preLogLikelihood) < self.eps:
                break
            preLogLikelihood = logLikelihood
        # `y` is the module-level label array defined below; it is only used
        # to color the scatter plot.
        plot(X, self.mu, self.covariance, y)

    # Log-likelihood under the current component priors, mean vectors and
    # covariance matrices.
    def __logLikelihood(self, X):
        for j in range(self.n_components):
            self.pdfs[:, j] = self.class_prior[j] * multivariate_normal.pdf(X, self.mu[j], self.covariance[j])
        return np.mean(np.log(np.sum(self.pdfs, axis=1)))

    # E step of EM: the probability that sample x_i comes from the k-th Gaussian.
    def __expectation(self, X):
        for j in range(self.n_components):
            self.pdfs[:, j] = self.class_prior[j] * multivariate_normal.pdf(X, self.mu[j], self.covariance[j])
        self.W = self.pdfs / np.sum(self.pdfs, axis=1).reshape(-1, 1)

    def __maximize(self, X):
        '''
        M step: N_k is the total responsibility mass of component k;
        update the class priors, component means and covariances.
        '''
        m, n = X.shape
        self.class_prior = np.sum(self.W, axis=0) / np.sum(self.W)
        for j in range(self.n_components):
            self.mu[j] = np.average(X, axis=0, weights=self.W[:, j])
            cov = 0
            for i in range(m):
                tmp = (X[i, :] - self.mu[j, :]).reshape(-1, 1)
                cov += self.W[i, j] * np.dot(tmp, tmp.T)
            self.covariance[j, :, :] = cov / np.sum(self.W[:, j])


# Generate three clusters from three different Gaussians as data for the GMM.
num1, mu1, covar1 = 400, [0.5, 0.5], np.array([[1, 0.5], [0.5, 3]])
X1 = np.random.multivariate_normal(mu1, covar1, num1)
# the second cluster
num2, mu2, covar2 = 600, [5.5, 2.5], np.array([[2, 1], [1, 2]])
X2 = np.random.multivariate_normal(mu2, covar2, num2)
# the third cluster
num3, mu3, covar3 = 1000, [1, 7], np.array([[6, 2], [2, 1]])
X3 = np.random.multivariate_normal(mu3, covar3, num3)
# stack them together
Mydata = np.vstack((X1, X2, X3))
print(Mydata.shape)
# ground-truth labels, used to color the clustering visualization
y = np.hstack((np.zeros(len(X1)), np.ones(len(X2)), 2 * np.ones(len(X3))))
print(len(y))
myGMM = GaussianMixtureModel(3)

myGMM.train(Mydata)

--------------------------------------------------------------------------------
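Since gmm_achieve.py leaves `Mydata` and `myGMM` at module scope, a quick sanity check of the from-scratch EM is to fit sklearn's GMM on the same synthetic data in the same session and compare parameters. A minimal sketch (the comparison is only up to component ordering):

```python
from sklearn.mixture import GaussianMixture

sk_gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0).fit(Mydata)
print(sk_gmm.weights_)   # compare with myGMM.class_prior (component order may differ)
print(sk_gmm.means_)     # compare with myGMM.mu
```
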
/GMM/scripts/gmm_achieve_2.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal as mul_nor_1
from numpy.random import multivariate_normal as mul_nor_2

from scipy.stats import multivariate_normal
from matplotlib.patches import Ellipse


# Ellipse plotting adapted from
# https://github.com/SJinping/Gaussian-ellipse/blob/master/gaussian_%20ellipse.py
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.
    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
    A matplotlib ellipse artist
    """

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    ax.add_artist(ellip)
    return ellip


def plot(data, mu, covariance, class_label):
    plt.scatter(data[:, 0], data[:, 1], c=class_label)
    n_components = len(mu)
    for j in range(n_components):
        plot_cov_ellipse(covariance[j], mu[j])
    plt.show()


# Initialize the model parameters.
# shape is a pair describing the sample matrix: (number of samples, number of features).
# K is the number of mixture components.
def init_params(shape, K):
    N, D = shape
    mu = np.random.rand(K, D)
    cov = np.array([np.eye(D)] * K)
    alpha = np.array([1.0 / K] * K)
    print("Parameters initialized.")
    print("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha


def phi(Y, mu_k, cov_k):
    norm = mul_nor_1(mean=mu_k, cov=cov_k)
    return norm.pdf(Y)


# E step: compute the responsibility of each component for each sample.
# Y is the sample matrix, one sample per row (a column vector when there is a single feature).
# mu holds the component means, one row per component.
# cov is the array of covariance matrices; alpha holds the weight of each Gaussian.
def getExpectation(Y, mu, cov, alpha):
    # number of samples
    N = Y.shape[0]
    # number of components
    K = alpha.shape[0]

    # To avoid inconsistent return types with a single Gaussian or a single
    # sample, require more than one sample and more than one component.
    assert N > 1, "There must be more than one sample!"
    assert K > 1, "There must be more than one gaussian model!"

    # responsibility matrix: rows are samples, columns are components
    gamma = np.mat(np.zeros((N, K)))

    # probability of every sample under every component
    prob = np.zeros((N, K))
    for k in range(K):
        prob[:, k] = phi(Y, mu[k], cov[k])
    prob = np.mat(prob)

    # responsibility of each component for each sample
    for k in range(K):
        gamma[:, k] = alpha[k] * prob[:, k]
    for i in range(N):
        gamma[i, :] /= np.sum(gamma[i, :])
    return gamma


# M step: re-estimate the model parameters.
# Y is the sample matrix, gamma the responsibility matrix.
def maximize(Y, gamma):
    # number of samples and features
    N, D = Y.shape
    # number of components
    K = gamma.shape[1]

    # initialize the parameters
    mu = np.zeros((K, D))
    cov = []
    alpha = np.zeros(K)

    # update the parameters of each component
    for k in range(K):
        # total responsibility of component k over all samples
        Nk = np.sum(gamma[:, k])
        # update mu: responsibility-weighted mean of every feature
        mu[k, :] = np.sum(np.multiply(Y, gamma[:, k]), axis=0) / Nk
        # update cov
        cov_k = (Y - mu[k]).T * np.multiply((Y - mu[k]), gamma[:, k]) / Nk
        cov.append(cov_k)
        # update alpha
        alpha[k] = Nk / N
    cov = np.array(cov)
    return mu, cov, alpha


# EM algorithm for a Gaussian mixture model.
# Given the sample matrix Y, estimate the model parameters.
# K is the number of components, times the number of iterations.
# Returns the per-component parameter arrays and the weight array.
def GMM_EM(Y, K, times):
    mu, cov, alpha = init_params(Y.shape, K)
    for i in range(times):
        gamma = getExpectation(Y, mu, cov, alpha)
        mu, cov, alpha = maximize(Y, gamma)
    print("{sep} Result {sep}".format(sep="-" * 20))
    print("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha

--------------------------------------------------------------------------------
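gmm_achieve_2.py defines `GMM_EM` but ships no driver. A minimal usage sketch on hypothetical data (two well-separated synthetic 2-D clusters; `GMM_EM` and `getExpectation` from the file above, which expect `np.mat` input):

```python
import numpy as np
from numpy.random import multivariate_normal

Y = np.vstack((multivariate_normal([0, 0], np.eye(2), 300),
               multivariate_normal([6, 6], np.eye(2), 500)))
mu, cov, alpha = GMM_EM(np.mat(Y), K=2, times=100)
gamma = getExpectation(np.mat(Y), mu, cov, alpha)   # final responsibilities
labels = np.asarray(gamma.argmax(axis=1)).ravel()   # hard cluster assignments
```
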
/GMM/scripts/gmm_model.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def getGMM(filename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)

    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))


def train_gmm_model():
    start = time.perf_counter()
    female_siri_path = '../speech/female/female_'
    male_siri_path = '../speech/male/male_'
    GMMs = []
    # female
    for i in range(9):
        train_female_siri_file = female_siri_path + str(i + 1) + '/sentence_1.wav'
        model_file = '../models/sentence_female_' + str(i + 1) + '_gmm.model'
        female_siri_gmm = getGMM(train_female_siri_file)
        joblib.dump(female_siri_gmm, model_file)
        print("finished")
    # male
    for i in range(6):
        train_male_siri_file = male_siri_path + str(i + 1) + '/sentence_1.wav'
        model_file = '../models/sentence_male_' + str(i + 1) + '_gmm.model'
        male_siri_gmm = getGMM(train_male_siri_file)
        joblib.dump(male_siri_gmm, model_file)
        print("finished")
    # model = joblib.load(model_file)
    timePointAfterGmm = time.perf_counter()
def test_gmm_model():
    female_siri_path = '../speech/female/female_'
    male_siri_path = '../speech/male/male_'
    # framing parameters for the sampled signal
    nw = 320
    inc = 160
    winfunc = signal.hann(nw)

    test_data_list = []
    gmm_model_list = []
    # female
    for i in range(9):
        test_female_siri_file = female_siri_path + str(i + 1) + '/xiaoai_2.wav'
        # load the model
        model_file = '../models/sentence_female_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        testFrames = enframe(getWaveData(test_female_siri_file), nw, inc, winfunc)
        data = []
        # extract the MFCC features of the test utterance
        for oneframe in testFrames:
            tmpList = list()
            for a in librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24):
                tmpList.append(a[0])
            data.append(tmpList)
        data = np.array(data)
        print(data.shape)
        test_data_list.append(data)
        # maxPro = GMMs[0].score(data)
    # male
    for i in range(6):
        test_male_siri_file = male_siri_path + str(i + 1) + '/xiaoai_2.wav'
        # load the model
        model_file = '../models/sentence_male_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        testFrames = enframe(getWaveData(test_male_siri_file), nw, inc, winfunc)
        data = []
        # extract the MFCC features of the test utterance
        for oneframe in testFrames:
            tmpList = list()
            for a in librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24):
                tmpList.append(a[0])
            data.append(tmpList)
        data = np.array(data)
        print(data.shape)
        test_data_list.append(data)
    # score every test utterance against every speaker model
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            ss = model.score_samples(test_data)
            scores.append(test_score)
            # print("test_score:", ss.shape)
        softmax(scores)
        print("-------------------")


if __name__ == "__main__":
    train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
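A note on the scoring used in `test_gmm_model`: sklearn's `GaussianMixture.score()` returns the *mean* per-frame log-likelihood, so utterances of different lengths are directly comparable. A small sketch, with `model` and `test_data` as in the loop above:

```python
import numpy as np

avg_ll = model.score(test_data)             # scalar: mean per-frame log-likelihood
frame_ll = model.score_samples(test_data)   # (n_frames,): per-frame log-likelihoods
assert np.isclose(avg_ll, frame_ll.mean())
```
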
/GMM/scripts/gmm_model_2.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def getGMM(filename):
    y, sr = librosa.load(filename)
    # extract the MFCC features
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, hop_length=160, win_length=240)
    print(mfccs.shape)

    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(1, covariance_type='full', random_state=0).fit(mfccs.T)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))


def train_gmm_model():
    start = time.perf_counter()
    female_siri_path = '../speech/female/female_'

    for i in range(9):
        train_female_siri_file = female_siri_path + str(i + 1) + '/siri_1.wav'
        model_file = '../models/direct_female_' + str(i + 1) + '_gmm.model'
        female_siri_gmm = getGMM(train_female_siri_file)
        joblib.dump(female_siri_gmm, model_file)
        print("finished")
    # model = joblib.load(model_file)
    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    female_siri_path = '../speech/female/female_'
    # framing parameters for the sampled signal
    nw = 320
    inc = 160
    winfunc = signal.hann(nw)

    test_data_list = []
    gmm_model_list = []
    for i in range(9):
        test_female_siri_file = female_siri_path + str(i + 1) + '/siri_2.wav'
        # load the model
        model_file = '../models/direct_female_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        y, sr = librosa.load(test_female_siri_file)
        # extract the MFCC features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, hop_length=160, win_length=240)

        test_data_list.append(mfccs.T)
        # maxPro = GMMs[0].score(data)

    # score every test utterance against every speaker model
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            scores.append(test_score)
            print("test_score:", test_score)
        softmax(scores)
        print("-------------------")


if __name__ == "__main__":
    train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_model_3.py:
--------------------------------------------------------------------------------
# Train and test the GMMs from the saved MFCC features.

# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def getGMM(filename):
    data = np.load(filename)
    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))
    return scores.index(max(scores))


def train_gmm_model():
    start = time.perf_counter()
    root_dir = '../models/TIMIT_MFCC_24/'
    GMMs = []
    # train one GMM per TRAIN speaker
    for i in range(462):
        spk_mfcc = '../speech/TIMIT/TRAIN_MFCC/' + 'spk_' + str(i + 1) + '/spk_' + str(i + 1) + '_mfcc.npy'
        if os.path.exists(spk_mfcc):
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            timit_gmm = getGMM(spk_mfcc)
            joblib.dump(timit_gmm, model_file)
            print("finished")
        else:
            print("not exists")
        # save_name = '../mfcc_features/female_' + str(i+1) + '.npy'

    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    test_data_list = []
    gmm_model_list = []
    root_dir = '../models/TIMIT_MFCC_24/'

    for i in range(462):
        test_mfcc_path = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1) + '/'
        if os.path.exists(test_mfcc_path):
            mfcc_npy_file = test_mfcc_path + os.listdir(test_mfcc_path)[0]
            # load the model
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            gmm_model = joblib.load(model_file)
            gmm_model_list.append(gmm_model)

            data = np.load(mfcc_npy_file)
            test_data_list.append(data)
    # score every test utterance against every speaker model
    test_right = 0
    i = 0
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            # ss = model.score_samples(test_data)
            scores.append(test_score)
        result = softmax(scores)
        if (i == result):
            test_right += 1
        i += 1
        print("-------------------")
    print("right:{0}, accuracy:{1}".format(test_right, test_right / 462))


if __name__ == "__main__":
    # train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_timit.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import multiprocessing


def getGMM(filename):
    data = np.load(filename)
    # 7 diagonal-covariance components for the 13-d MFCC setup.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(7, covariance_type='diag', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))
    return scores.index(max(scores))


def train_gmm_model(start_num, end_num):
    start = time.perf_counter()
    # To train a different configuration, only change (1) and (2) below.
    # (1) model root directory
    root_dir = '../models/TIMIT_MFCC_24/'
    GMMs = []
    for i in range(start_num, end_num):
        spk_mfcc = '../speech/TIMIT/TRAIN_MFCC/' + 'spk_' + str(i + 1) + '/spk_' + str(i + 1) + '_13d_mfcc.npy'
        if os.path.exists(spk_mfcc):
            # (2) model file name
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            timit_gmm = getGMM(spk_mfcc)
            joblib.dump(timit_gmm, model_file)
            print("finished")
        else:
            print("not exists")
        # save_name = '../mfcc_features/female_' + str(i+1) + '.npy'

    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    test_data_list = []
    gmm_model_list = []
    # (1) model root directory
    root_dir = '../models/TIMIT_MFCC_24/'

    for i in range(462):
        test_mfcc_path = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1) + '/'
        if os.path.exists(test_mfcc_path):
            mfcc_npy_file = test_mfcc_path + os.listdir(test_mfcc_path)[0]
            # mfcc_npy_file = test_mfcc_path + '/spk_' + str(i+1) + '_13d_mfcc.npy'
            # load the model
            # (2) model file name
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            gmm_model = joblib.load(model_file)
            gmm_model_list.append(gmm_model)

            data = np.load(mfcc_npy_file)
            test_data_list.append(data)
    # score every test utterance against every speaker model
    test_right = 0
    i = 0
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            # ss = model.score_samples(test_data)
            scores.append(test_score)
        result = softmax(scores)
        if (i == result):
            test_right += 1
        i += 1
        print("-------------------")
    print("right:{0}, accuracy:{1}".format(test_right, test_right / 462))


def gmm_train_multiprocess():
    train_mfcc = [multiprocessing.Process(target=train_gmm_model, args=(0, 100,)),
                  multiprocessing.Process(target=train_gmm_model, args=(100, 200,)),
                  multiprocessing.Process(target=train_gmm_model, args=(200, 300,)),
                  multiprocessing.Process(target=train_gmm_model, args=(300, 400,)),
                  multiprocessing.Process(target=train_gmm_model, args=(400, 462,))]

    for process in train_mfcc:
        process.daemon = True
        process.start()
    for process in train_mfcc:
        process.join()


if __name__ == "__main__":
    gmm_train_multiprocess()

--------------------------------------------------------------------------------
/GMM/scripts/mkdir_script.py:
--------------------------------------------------------------------------------
import os


def mkdir_mfcc24_gmm_model():
    root_dir = '../models/TIMIT_MFCC_24/'
    for i in range(462):
        spk_model_dir = root_dir + 'spk_' + str(i + 1)
        print(spk_model_dir)
        os.makedirs(spk_model_dir)


def mkdir_mfcc_13d_7_diag_model():
    root_dir = '../models/TIMIT_MFCC_13d_7_diag/'
    for i in range(462):
        spk_model_dir = root_dir + 'spk_' + str(i + 1)
        print(spk_model_dir)
        os.makedirs(spk_model_dir)


def mkdir_timit_test():
    root_dir = '../speech/TIMIT/TEST_MFCC/'
    for i in range(462):
        timit_test_dir = root_dir + 'spk_' + str(i + 1)
        print(timit_test_dir)
        os.makedirs(timit_test_dir)


def justify_multiprocess_nor_success():
    # Verify that the multiprocessing feature extraction produced two files per speaker.
    root_dir = "../speech/TIMIT/TRAIN_MFCC/"
    spk_list = os.listdir(root_dir)
    print(len(spk_list))
    for spk in spk_list:
        content = os.listdir(os.path.join(root_dir, spk))
        if (len(content) == 2):
            pass
        else:
            print("error")


# mkdir_mfcc24_gmm_model()
# mkdir_timit_test()
# justify_multiprocess_nor_success()
mkdir_mfcc_13d_7_diag_model()

--------------------------------------------------------------------------------
/GMM/scripts/mute_remove.py:
--------------------------------------------------------------------------------
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS

[Fs, x] = aIO.read_audio_file("../speech/female/female_1/siri_1.wav")
segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=1.0, weight=0.3, plot=True)

--------------------------------------------------------------------------------
/GMM/scripts/timit_deal/concat_timit_sentence.py:
--------------------------------------------------------------------------------
import glob
import os
import numpy as np
import scipy.io.wavfile as wav

import shutil


def mkdir_files(path_read_folder, path_write_wav_file):
    target_dir = '../../speech/TIMIT/TRAIN/'
    target_dir2 = '../../speech/TIMIT/TEST/'

    drs = os.listdir(path_read_folder)
    s = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(path_read_folder, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, 'SX*_.wav')):
                s_file = filename.split('\\')
                mkdir_dir = target_dir + s_file[1] + '/' + s_file[2]
                target = mkdir_dir + '/' + s_file[3]
                print(target)
                shutil.copy(filename, target)
                # if os.path.exists(mkdir_dir):
                #     pass
                # else:
                #     os.makedirs(mkdir_dir)
                s += 1

    # The same loop was used for the SI*_.wav sentences, copying them into the
    # TEST directory instead and removing the originals:
    # drs = os.listdir(target_dir)
    # s = 0
    # for dr in drs:
    #     # DR*
    #     drs_path = os.path.join(target_dir, dr)
    #     samples = os.listdir(drs_path)
    #     for sample in samples:
    #         samples_path = os.path.join(drs_path, sample)
    #         for filename in glob.glob(os.path.join(samples_path, 'SI*_.wav')):
    #             print(filename)
    #             s_file = filename.split('\\')
    #             target = target_dir2 + filename[25:28] + '/' + s_file[1] + '/' + s_file[2]
    #             print(target)
    #             shutil.copy(filename, target)
    #             os.remove(filename)
    print("sum:", s)
# Merge the five SX sentences and the first SI sentence as the training set;
# the remaining two SI sentences are used as the test set.
def merge_files(path_read_folder, path_write_wav_file):
    target_dir = '../../speech/TIMIT/TRAIN/'
    save_name = ''

    drs = os.listdir(target_dir)
    s = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            merged_signal = []
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                s_file = filename.split('\\')
                save_name = target_dir + filename[25:28] + '/' + s_file[1] + '/' + "merge_result.wav"
                sr, signal = wav.read(filename)
                merged_signal.append(signal)
            print(len(merged_signal))
            merged_signal = np.hstack(merged_signal)
            merged_signal = np.asarray(merged_signal, dtype=np.int16)
            wav.write(save_name, sr, merged_signal)
    print("sum:", s)


path_read_folder = '../../speech/TIMIT2/TRAIN'
path_write_wav_file = '../../speech/male/male_'
merge_files(path_read_folder, path_write_wav_file)

--------------------------------------------------------------------------------
/GMM/scripts/timit_deal/timit_data_deal.py:
--------------------------------------------------------------------------------
from sphfile import SPHFile
import glob
import os


if __name__ == "__main__":
    train_path = '../../speech/TIMIT/TRAIN/*/*/*.WAV'
    test_path = '../../speech/TIMIT/TEST/*/*/*.WAV'
    train_sph_files = glob.glob(train_path)
    test_sph_files = glob.glob(test_path)
    print(len(train_sph_files), "train utterances")
    print(len(test_sph_files), "test utterances")
    # for i in train_sph_files:
    #     sph = SPHFile(i)
    #     sph.write_wav(filename=i.replace(".WAV", "_.wav"))
    #     os.remove(i)
    for i in test_sph_files:
        sph = SPHFile(i)
        sph.write_wav(filename=i.replace(".WAV", "_.wav"))
        # os.remove(i)
    # print("Completed")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Speaker Recognition

Speaker recognition is also known as voiceprint recognition. From the 1960s until today it has been a central topic of biometrics research, evolving from template matching, through statistical approaches, to today's mainstream deep-learning methods. This project provides implementations ranging from classical approaches (GMM, GMM-UBM, GMM-SVM [3], joint factor analysis, i-vector) to deep-learning-based speaker recognition.

## 1. GMM-based speaker recognition

### 1.1 Test environment

- OS: Windows 10
- Python 3.6
- Main open-source libraries: sklearn, librosa, numpy
- Datasets: the TIMIT speech corpus, plus a small self-collected dataset of 15 speakers with **6 utterances per speaker (not public for now)**

### 1.2 Evaluation on TIMIT

The TIMIT corpus was built for acquiring acoustic-phonetic knowledge (model training) and for evaluating automatic speech recognition (ASR) systems (model testing). It was sponsored by the Department of Defense and produced jointly by DARPA-ISTO, MIT, SRI International and Texas Instruments (TI). Speakers: 630 speakers from 8 major dialect regions of the United States, each reading 10 sentences. The 10 sentences fall into three groups:

**SA** — dialect sentences, designed by SRI, 2 in total. Every speaker reads SA1 and SA2, which expose the dialect differences between regions (they are therefore suitable for dialect identification datasets and are usually excluded otherwise).

**SX** — phonetically compact sentences, designed by MIT, 450 in total, with a balanced phoneme distribution that covers as many phoneme pairs as possible. Each speaker reads 5 SX sentences, and each SX sentence is read by 7 different speakers.

**SI** — phonetically diverse sentences, selected by TI from the Brown Corpus and the Playwrights Dialog, 1890 in total, to maximize the diversity of sentence types and phonetic contexts (covering as many allophonic contexts as possible). Each speaker reads 3 SI sentences, and each SI sentence is read by only one speaker.

The 630 speakers are split into TRAIN (462 speakers) and TEST (168 speakers). Only the speech of the 462 TRAIN speakers is used here, so there are 462 speaker classes. The two SA sentences are dialect sentences and are not used. Of the remaining 8 sentences per speaker, the 5 SX sentences and 1 SI sentence form the training set, and the other 2 SI sentences form the test set. The 6 training sentences are concatenated into a single utterance to simplify MFCC extraction.

My split of the TIMIT data: [[Baidu Driver](https://pan.baidu.com/s/1YnPZochiRY0IDfSoFbivqw?pwd=c1fc) | [Google Driver](https://drive.google.com/file/d/1J8YaWN9oFFGzVH6kNPcI8VsXmFe8g5gr/view?usp=sharing)]

You can also download the original TIMIT data and split it yourself: [[Baidu Driver](https://pan.baidu.com/s/1YnPZochiRY0IDfSoFbivqw?pwd=c1fc) | [Google Driver](https://drive.google.com/file/d/180mSIiXN9RVDV2Xn1xcWNkMRm5J5MjN4/view?usp=sharing)]

> ├─TEST (168 speakers)
> │ ├─DR1
> │ │ ├─FCJF0
> │ │ ├─FDAW0
> ......
> │ ├─DR2
> │ │ ├─FAEM0
> │ │ ├─FAJW0
> ......
> │ ├─DR3
> │ │ ├─FALK0
> │ │ ├─FCKE0
> ......
> ├─TEST_MFCC (MFCC features of the test set, 462 speakers)
> │ ├─spk_1
> │ ├─spk_10
> │ ├─spk_100
> ......
> ├─TRAIN (training data, 462 speakers)
> │ ├─DR1
> │ │ ├─FCJF0
> │ │ ├─FDAW0
> ......
> │ ├─DR2
> │ │ ├─MTJG0
> ......
> │ ├─DR3
> │ │ ├─FALK0
> │ │ ├─FCKE0
> ......
> └─TRAIN_MFCC (MFCC features of the training set, 462 speakers)
> ├─spk_1
> ├─spk_10
> ├─spk_100
> ......

**The pipeline implemented in Python is roughly as follows:**

(1) Extract 24-dimensional MFCC features. Each of the 462 speakers' concatenated long utterances (about 20 s) is read in; the MFCC extraction itself is identical to the procedure described for my small dataset, so it is not repeated here. There are two main differences. First, a 20 s utterance yields a feature matrix of roughly (2000, 24), so the features are saved to avoid repeated extraction; librosa returns MFCCs as numpy arrays, so they are stored as .npy files and simply loaded when needed. Second, extracting 24-d MFCCs for 462 speakers is quite time-consuming, so the speakers are split into 4 batches with one process per batch, which improved throughput by roughly 4x.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101414806.png)

(2) GMM training. Each speaker's 24-d MFCC features are used to fit that speaker's GMM. After comparative tuning, 3 mixture components with full covariance matrices worked best. GMM training is likewise parallelized across processes.
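A minimal sketch of steps (1)–(2) for a single speaker, mirroring `gmm_model_3.py` (the paths are the ones used in this repository):

```python
import joblib
import numpy as np
from sklearn.mixture import GaussianMixture

mfcc = np.load('../speech/TIMIT/TRAIN_MFCC/spk_1/spk_1_mfcc.npy')  # (n_frames, 24)
gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(mfcc)
joblib.dump(gmm, '../models/TIMIT_MFCC_24/spk_1/TIMIT_MFCC_24_gmm.model')
```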
(3) Testing the speaker GMMs. One SI sentence (about 2 s) per speaker is used as test data. The 24-d MFCC features of the test utterance are extracted and scored by each of the 462 speaker GMMs. Softmax then normalizes the 462 scores to the [0, 1] interval, i.e. the probability of each speaker under the current GMM; the speaker with the highest probability is the prediction.

(4) Results: on the first SI test sentence, 294 speakers were identified correctly and 168 incorrectly, an accuracy of **63.6%**. On the second SI test sentence, 204 were correct and 258 wrong, an accuracy of **44.2%**.

## 2. Self-attention-based speaker recognition

### 2.1 Test environment

- Google Colab (Tesla T4, 16 GB)

- PyTorch 1.7.1

- Dataset: VoxCeleb (600 speakers selected)

**This part mainly follows homework HW4 of Hung-yi Lee's 2021 deep learning course.** From the open VoxCeleb1 speaker recognition dataset, 600 speakers are selected, and 40-dimensional mel filter-bank features are extracted from their speech as the network input.

For the network architecture we use the self-attention mechanism. The figure below shows the Transformer proposed in "Attention Is All You Need", which consists of an encoder and a decoder; only the encoder (the left side) is used in this network.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101612748.png)

A brief look at the Transformer encoder: it is a stack of identical blocks connected in series, each of which is a self-attention block.

![image-20210605101637276](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101637276.png)

Unlike plain self-attention, the self-attention output here has the corresponding input added back to it (a residual connection), and the sum is passed through Layer Norm. Layer Norm differs from Batch Norm: Batch Norm computes the mean and standard deviation of one feature dimension across the different samples in a batch, whereas Layer Norm computes them across the feature dimensions of a single sample. The normalized result then goes through a fully connected layer, another residual addition and Layer Norm, and is then output.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101642261.png)
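A small PyTorch illustration of the two normalizations, using the (batch, length, d_model) tensor shape this model works with:

```python
import torch
import torch.nn as nn

x = torch.randn(8, 100, 80)                    # (batch, length, d_model)
ln = nn.LayerNorm(80)                          # statistics over the 80 features of each position
y_ln = ln(x)
bn = nn.BatchNorm1d(80)                        # statistics over the batch (and length) per feature
y_bn = bn(x.transpose(1, 2)).transpose(1, 2)   # BatchNorm1d expects (batch, channels, length)
```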
Speaker recognition network code:

```python
import torch
import torch.nn as nn
from torchsummary import summary  # assuming the torchsummary package for the summary() call below


class Classifier(nn.Module):
    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()
        # Project the dimension of features from that of input into d_model.
        self.prenet = nn.Linear(40, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, dim_feedforward=256, nhead=2
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        # Project the dimension of features from d_model into speaker nums.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, n_spks),
        )

    def forward(self, mels):
        """
        args:
            mels: (batch size, length, 40)
        return:
            out: (batch size, n_spks)
        """
        # out: (batch size, length, d_model)
        out = self.prenet(mels)
        # out: (length, batch size, d_model)
        out = out.permute(1, 0, 2)
        # The encoder layer expects features in the shape of (length, batch size, d_model).
        out = self.encoder(out)
        # out: (batch size, length, d_model)
        out = out.transpose(0, 1)
        # mean pooling
        stats = out.mean(dim=1)
        # out: (batch, n_spks)
        out = self.pred_layer(stats)
        return out


net = Classifier()
summary(net.to("cuda"), (2, 40), device="cuda")
```

The network structure is shown below:

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101751501.png)

Next, the data are split into training and validation sets: 90% for training and 10% for validation.

Since speaker recognition here is a classification problem, the loss function is CrossEntropyLoss(). In PyTorch, nn.CrossEntropyLoss() already combines the softmax and the cross entropy, so no separate softmax layer is needed; the model's raw outputs and the labels are passed to it directly. The optimizer is AdamW, an improved variant of Adam with better optimization behavior (a minimal training-step sketch is given at the end of this document).

The training process is shown below. Training runs for 70,000 iterations, with a validation pass every 2,000 iterations. The training loss keeps decreasing and the accuracy keeps rising; at the end of training the accuracy is 91% on the training set and 80% on the validation set.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101804628.png)
--------------------------------------------------------------------------------
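For reference, a minimal sketch of the training step described above; the `Classifier` model from the code block and a dataloader yielding `(mels, labels)` batches are assumed:

```python
import torch
import torch.nn as nn

model = Classifier().to("cuda")
criterion = nn.CrossEntropyLoss()   # applies log-softmax internally, so the model outputs raw logits
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

def train_step(mels, labels):
    optimizer.zero_grad()
    logits = model(mels)             # (batch, n_spks)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    acc = (logits.argmax(dim=-1) == labels).float().mean()
    return loss.item(), acc.item()
```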