├── GMM
│   └── scripts
│       ├── concat_sentence.py
│       ├── convert_bitrate.py
│       ├── extra_mfcc.py
│       ├── extra_mfcc2.py
│       ├── extra_mfcc_multiprocess.py
│       ├── gmm_achieve.py
│       ├── gmm_achieve_2.py
│       ├── gmm_model.py
│       ├── gmm_model_2.py
│       ├── gmm_model_3.py
│       ├── gmm_timit.py
│       ├── mkdir_script.py
│       ├── mute_remove.py
│       └── timit_deal
│           ├── concat_timit_sentence.py
│           └── timit_data_deal.py
├── README.md
└── self-attention
    └── self-attention_speaker_rec.ipynb

/GMM/scripts/concat_sentence.py:
--------------------------------------------------------------------------------
import glob
import os
import numpy as np
import scipy.io.wavfile as wav


# Merge all sentence*.wav files in a folder into one audio file.
def merge_files(path_read_folder, path_write_wav_file):
    # files = os.listdir(path_read_folder)
    merged_signal = []
    for filename in glob.glob(os.path.join(path_read_folder, 'sentence*.wav')):
        print(filename)
        sr, signal = wav.read(filename)
        merged_signal.append(signal)
    # print(len(merged_signal))
    print(merged_signal[0].shape, merged_signal[1].shape)
    merged_signal = np.hstack(merged_signal)
    merged_signal = np.asarray(merged_signal, dtype=np.int16)
    wav.write(path_write_wav_file, sr, merged_signal)


# Merge the recordings of every female and male speaker.
female_siri_path = '../speech/female/female_'
male_siri_path = '../speech/male/male_'
for i in range(9):
    path_read_folder = female_siri_path + str(i + 1)
    path_write_wav_file = path_read_folder + "/merge_result.wav"
    merge_files(path_read_folder, path_write_wav_file)
for i in range(6):
    path_read_folder = male_siri_path + str(i + 1)
    path_write_wav_file = path_read_folder + "/merge_result.wav"
    merge_files(path_read_folder, path_write_wav_file)

--------------------------------------------------------------------------------
/GMM/scripts/convert_bitrate.py:
--------------------------------------------------------------------------------
'''
Convert audio files to 16 kHz mono WAV with ffmpeg.
'''

import os
import subprocess


def mkdirMale():
    input_path = '../speech/male/male_'
    for i in range(6):
        output_path = input_path + str(i + 1)
        os.makedirs(output_path)


def convertBitrate():
    input_path_list = []
    output_path_list = []
    for i in range(6):
        input_path = '../speech/male/out_' + str(i + 1) + '/'
        output_path = '../speech/male/male_' + str(i + 1) + '/'
        input_path_list.append(input_path)
        output_path_list.append(output_path)
    for i in range(6):
        for file in os.listdir(input_path_list[i]):
            input_file = input_path_list[i] + file
            output_file = output_path_list[i] + file[:-4] + '.wav'
            # -ar 16000: resample to 16 kHz; -ac 1: downmix to mono
            cmd = "ffmpeg -i " + input_file + " -ar 16000 -ac 1 " + output_file
            subprocess.call(cmd, shell=True)


if __name__ == "__main__":
    # mkdirMale()
    convertBitrate()

--------------------------------------------------------------------------------
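Building the ffmpeg command as one string with `shell=True` breaks when a path contains spaces. A minimal alternative sketch (same flags as above, argument-list form, no shell involved):

```python
import subprocess

def convert_file(input_file, output_file):
    # -ar 16000 resamples to 16 kHz, -ac 1 downmixes to mono
    subprocess.call(['ffmpeg', '-i', input_file, '-ar', '16000', '-ac', '1', output_file])
```
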
/GMM/scripts/extra_mfcc.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))  # the padded signal
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    # print(params)
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        # print(mfccs.shape)
        for a in mfccs:
            # print(a.shape)
            tmpList.append(a[0])  # keep the first MFCC column for this frame
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)
    np.save(savename, data)


def main():
    target_dir = '../speech/TIMIT/TEST/'
    drs = os.listdir(target_dir)
    i = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        # print(len(samples))
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                print(filename)
                spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1)
                save_name = spk_dir + '/spk_' + str(i + 1) + '_mfcc.npy'
                # print(save_name)
                # extraMFCC(filename, save_name)
                i += 1
                # if os.path.exists(spk_dir):
                #     pass
                # else:
                #     os.makedirs(spk_dir)
                # print(spk_dir)
                # print(filename)
                # s_file = filename.split('\\')
                # save_name = target_dir + filename[25:28] + '/' + s_file[1] + '/' + "merge_result.wav"


# Earlier main() that extracted features for the private female/male dataset:
# def main():
#     start = time.perf_counter()
#     female_siri_path = '../speech/female/female_'
#     male_siri_path = '../speech/male/male_'
#     GMMs = []
#     # female
#     for i in range(9):
#         train_female_siri_file = female_siri_path + str(i+1) + '/merge_result.wav'
#         save_name = '../mfcc_features/female_' + str(i+1) + '.npy'
#         extraMFCC(train_female_siri_file, save_name)
#     # male
#     for i in range(6):
#         train_male_siri_file = male_siri_path + str(i+1) + '/merge_result.wav'
#         save_name = '../mfcc_features/male_' + str(i+1) + '.npy'
#         extraMFCC(train_male_siri_file, save_name)
#     timePointAfterGmm = time.perf_counter()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
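A quick check of the framing arithmetic in `enframe`: 1 s of 16 kHz audio gives `wlen = 16000`, and with `nw = 320`, `inc = 160` we get `nf = ceil((16000 - 320 + 160) / 160) = 99` frames. A minimal sketch, assuming `enframe` from the file above is in scope:

```python
from scipy import signal
import numpy as np

frames = enframe(np.zeros(16000), 320, 160, signal.hann(320))
print(frames.shape)  # (99, 320)
```
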
/GMM/scripts/extra_mfcc2.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob
import multiprocessing


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)
    np.save(savename, data)


def extra_train_MFCC():
    target_dir = '../speech/TIMIT/TRAIN/'
    drs = os.listdir(target_dir)
    i = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            print(samples_path)
            # for filename in glob.glob(os.path.join(samples_path, '*.wav')):
            #     s_file = filename.split('\\')
            #     spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i+1)
            #     save_name = spk_dir + '/' + s_file[2][:-4] + 'mfcc.npy'
            #     extraMFCC(filename, save_name)
            #     print(save_name)
            #     i += 1

    # target_dir = '../speech/TIMIT/TEST/'
    # drs = os.listdir(target_dir)
    # i = 0
    # for dr in drs:
    #     # DR*
    #     drs_path = os.path.join(target_dir, dr)
    #     samples = os.listdir(drs_path)
    #     for sample in samples:
    #         samples_path = os.path.join(drs_path, sample)
    #         for filename in glob.glob(os.path.join(samples_path, '*.wav')):
    #             s_file = filename.split('\\')
    #             spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i+1)
    #             save_name = spk_dir + '/' + s_file[2][:-4] + 'mfcc.npy'
    #             extraMFCC(filename, save_name)
    #             print(save_name)
    #             i += 1


if __name__ == '__main__':
    extra_mfcc = [multiprocessing.Process(target=extra_train_MFCC)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()

--------------------------------------------------------------------------------
/GMM/scripts/extra_mfcc_multiprocess.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import glob
import multiprocessing


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def extraMFCC(filename, savename, mfcc_num=13):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=mfcc_num)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, mfcc_num)
    print(data.shape)
    np.save(savename, data)


def extra_train_MFCC(start, end, spk_num):
    target_dir = '../speech/TIMIT/TRAIN/'
    drs = os.listdir(target_dir)
    print(drs)

    for dr in drs[start:end]:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            filename = os.path.join(samples_path, 'merge_result.wav')
            spk_dir = '../speech/TIMIT/TRAIN_MFCC/spk_' + str(spk_num + 1)
            save_name = spk_dir + '/spk_' + str(spk_num + 1) + '_13d_mfcc.npy'
            print(save_name)
            extraMFCC(filename, save_name, 13)
            spk_num += 1


def extra_test_MFCC(start, end, spk_num):
    target_dir = '../speech/TIMIT/TEST/'
    drs = os.listdir(target_dir)

    for dr in drs[start:end]:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                spk_dir = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(spk_num + 1)
                save_name = spk_dir + '/spk_' + str(spk_num + 1) + '_13d_mfcc.npy'
                extraMFCC(filename, save_name)
                print(save_name)
            spk_num += 1


def train_multiprocess():
    # Each process handles two DR* dialect folders; the second argument of each
    # args tuple is the speaker-number offset for that batch.
    extra_mfcc = [multiprocessing.Process(target=extra_train_MFCC, args=(0, 2, 0,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(2, 4, 114,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(4, 6, 258,),),
                  multiprocessing.Process(target=extra_train_MFCC, args=(6, 8, 363),)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()


def test_multiprocess():
    extra_mfcc = [multiprocessing.Process(target=extra_test_MFCC, args=(0, 2, 0,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(2, 4, 114,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(4, 6, 258,),),
                  multiprocessing.Process(target=extra_test_MFCC, args=(6, 8, 363),)]

    for process in extra_mfcc:
        process.daemon = True
        process.start()
    for process in extra_mfcc:
        process.join()


if __name__ == '__main__':
    train_multiprocess()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_achieve.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse


# Ellipse plotting adapted from
# https://github.com/SJinping/Gaussian-ellipse/blob/master/gaussian_%20ellipse.py
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.
    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
    A matplotlib ellipse artist
    """

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    ax.add_artist(ellip)
    return ellip


def plot(data, mu, covariance, class_label):
    plt.scatter(data[:, 0], data[:, 1], c=class_label)
    n_components = len(mu)
    for j in range(n_components):
        plot_cov_ellipse(covariance[j], mu[j])
    plt.show()


class GaussianMixtureModel:

    def __init__(self, n_components, maxIter=1e4, eps=1e-9):
        self.n_components = n_components
        self.class_prior = np.ones(n_components) * 1 / n_components
        self.mu = None
        self.covariance = None
        self.W = None
        self.pdfs = None
        self.eps = eps
        self.maxIter = maxIter

    def __initParameters(self, X):
        '''
        Initialize the model parameters mu, covariance and class_prior:
        draw random means inside the bounding box of the data, assign each
        sample to its nearest mean, and estimate per-cluster priors and
        covariances from that hard assignment.
        '''
        m, n = X.shape
        self.W = np.random.random((m, self.n_components))
        self.mu = np.random.random((self.n_components, n))
        minCol = np.min(X, axis=0)
        maxCol = np.max(X, axis=0)
        self.mu = minCol + self.mu * (maxCol - minCol)
        self.covariance = np.zeros((self.n_components, n, n))
        # squared Euclidean distance from every sample to every mean
        dist = np.tile(np.sum(X * X, axis=1).reshape((m, 1)), (1, self.n_components)) + np.tile(
            np.sum(self.mu * self.mu, axis=1).T,
            (m, 1)) - 2 * np.dot(X, self.mu.T)
        self.pdfs = np.zeros((m, self.n_components))
        labels = np.argmin(dist, axis=1)
        for i in range(self.n_components):
            clusterX = X[labels == i, :]
            self.class_prior[i] = clusterX.shape[0] / m
            self.covariance[i, :, :] = np.cov(clusterX.T)
    def train(self, X):
        '''
        Fit the parameters with the EM algorithm. Iteration stops when
        (1) the iteration count reaches the cap, or (2) the change of the
        log-likelihood falls below the threshold eps.
        '''
        self.__initParameters(X)
        num = 0
        preLogLikelihood = self.__logLikelihood(X)
        while num < self.maxIter:
            self.__expectation(X)
            self.__maximize(X)
            # plot(X, self.mu, self.covariance, y)
            num += 1
            logLikelihood = self.__logLikelihood(X)
            if abs(logLikelihood - preLogLikelihood) < self.eps:
                break
            preLogLikelihood = logLikelihood
        # `y` is the module-level label array defined below; it is only used
        # to color the scatter plot.
        plot(X, self.mu, self.covariance, y)

    # Log-likelihood under the current component priors, mean vectors and
    # covariance matrices.
    def __logLikelihood(self, X):
        for j in range(self.n_components):
            self.pdfs[:, j] = self.class_prior[j] * multivariate_normal.pdf(X, self.mu[j], self.covariance[j])
        return np.mean(np.log(np.sum(self.pdfs, axis=1)))

    # E step of EM: the probability that sample x_i comes from the k-th Gaussian.
    def __expectation(self, X):
        for j in range(self.n_components):
            self.pdfs[:, j] = self.class_prior[j] * multivariate_normal.pdf(X, self.mu[j], self.covariance[j])
        self.W = self.pdfs / np.sum(self.pdfs, axis=1).reshape(-1, 1)

    def __maximize(self, X):
        '''
        M step: N_k is the total responsibility mass of component k;
        update the class priors, component means and covariances.
        '''
        m, n = X.shape
        self.class_prior = np.sum(self.W, axis=0) / np.sum(self.W)
        for j in range(self.n_components):
            self.mu[j] = np.average(X, axis=0, weights=self.W[:, j])
            cov = 0
            for i in range(m):
                tmp = (X[i, :] - self.mu[j, :]).reshape(-1, 1)
                cov += self.W[i, j] * np.dot(tmp, tmp.T)
            self.covariance[j, :, :] = cov / np.sum(self.W[:, j])


# Generate three clusters from three different Gaussians as data for the GMM.
num1, mu1, covar1 = 400, [0.5, 0.5], np.array([[1, 0.5], [0.5, 3]])
X1 = np.random.multivariate_normal(mu1, covar1, num1)
# the second cluster
num2, mu2, covar2 = 600, [5.5, 2.5], np.array([[2, 1], [1, 2]])
X2 = np.random.multivariate_normal(mu2, covar2, num2)
# the third cluster
num3, mu3, covar3 = 1000, [1, 7], np.array([[6, 2], [2, 1]])
X3 = np.random.multivariate_normal(mu3, covar3, num3)
# stack them together
Mydata = np.vstack((X1, X2, X3))
print(Mydata.shape)
# ground-truth labels, used to color the clustering visualization
y = np.hstack((np.zeros(len(X1)), np.ones(len(X2)), 2 * np.ones(len(X3))))
print(len(y))
myGMM = GaussianMixtureModel(3)

myGMM.train(Mydata)

--------------------------------------------------------------------------------
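Since gmm_achieve.py leaves `Mydata` and `myGMM` at module scope, a quick sanity check of the from-scratch EM is to fit sklearn's GMM on the same synthetic data in the same session and compare parameters. A minimal sketch (the comparison is only up to component ordering):

```python
from sklearn.mixture import GaussianMixture

sk_gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0).fit(Mydata)
print(sk_gmm.weights_)   # compare with myGMM.class_prior (component order may differ)
print(sk_gmm.means_)     # compare with myGMM.mu
```
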
/GMM/scripts/gmm_achieve_2.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal as mul_nor_1
from numpy.random import multivariate_normal as mul_nor_2

from scipy.stats import multivariate_normal
from matplotlib.patches import Ellipse


# Ellipse plotting adapted from
# https://github.com/SJinping/Gaussian-ellipse/blob/master/gaussian_%20ellipse.py
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.
    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    Additional keyword arguments are passed on to the ellipse patch.
    Returns
    -------
    A matplotlib ellipse artist
    """

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

    ax.add_artist(ellip)
    return ellip


def plot(data, mu, covariance, class_label):
    plt.scatter(data[:, 0], data[:, 1], c=class_label)
    n_components = len(mu)
    for j in range(n_components):
        plot_cov_ellipse(covariance[j], mu[j])
    plt.show()


# Initialize the model parameters.
# shape is a pair describing the sample matrix: (number of samples, number of features).
# K is the number of mixture components.
def init_params(shape, K):
    N, D = shape
    mu = np.random.rand(K, D)
    cov = np.array([np.eye(D)] * K)
    alpha = np.array([1.0 / K] * K)
    print("Parameters initialized.")
    print("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha


def phi(Y, mu_k, cov_k):
    norm = mul_nor_1(mean=mu_k, cov=cov_k)
    return norm.pdf(Y)


# E step: compute the responsibility of each component for each sample.
# Y is the sample matrix, one sample per row (a column vector when there is a single feature).
# mu holds the component means, one row per component.
# cov is the array of covariance matrices; alpha holds the weight of each Gaussian.
def getExpectation(Y, mu, cov, alpha):
    # number of samples
    N = Y.shape[0]
    # number of components
    K = alpha.shape[0]

    # To avoid inconsistent return types with a single Gaussian or a single
    # sample, require more than one sample and more than one component.
    assert N > 1, "There must be more than one sample!"
    assert K > 1, "There must be more than one gaussian model!"

    # responsibility matrix: rows are samples, columns are components
    gamma = np.mat(np.zeros((N, K)))

    # probability of every sample under every component
    prob = np.zeros((N, K))
    for k in range(K):
        prob[:, k] = phi(Y, mu[k], cov[k])
    prob = np.mat(prob)

    # responsibility of each component for each sample
    for k in range(K):
        gamma[:, k] = alpha[k] * prob[:, k]
    for i in range(N):
        gamma[i, :] /= np.sum(gamma[i, :])
    return gamma


# M step: re-estimate the model parameters.
# Y is the sample matrix, gamma the responsibility matrix.
def maximize(Y, gamma):
    # number of samples and features
    N, D = Y.shape
    # number of components
    K = gamma.shape[1]

    # initialize the parameters
    mu = np.zeros((K, D))
    cov = []
    alpha = np.zeros(K)

    # update the parameters of each component
    for k in range(K):
        # total responsibility of component k over all samples
        Nk = np.sum(gamma[:, k])
        # update mu: responsibility-weighted mean of every feature
        mu[k, :] = np.sum(np.multiply(Y, gamma[:, k]), axis=0) / Nk
        # update cov
        cov_k = (Y - mu[k]).T * np.multiply((Y - mu[k]), gamma[:, k]) / Nk
        cov.append(cov_k)
        # update alpha
        alpha[k] = Nk / N
    cov = np.array(cov)
    return mu, cov, alpha


# EM algorithm for a Gaussian mixture model.
# Given the sample matrix Y, estimate the model parameters.
# K is the number of components, times the number of iterations.
# Returns the per-component parameter arrays and the weight array.
def GMM_EM(Y, K, times):
    mu, cov, alpha = init_params(Y.shape, K)
    for i in range(times):
        gamma = getExpectation(Y, mu, cov, alpha)
        mu, cov, alpha = maximize(Y, gamma)
    print("{sep} Result {sep}".format(sep="-" * 20))
    print("mu:", mu, "cov:", cov, "alpha:", alpha, sep="\n")
    return mu, cov, alpha

--------------------------------------------------------------------------------
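gmm_achieve_2.py defines `GMM_EM` but ships no driver. A minimal usage sketch on hypothetical data (two well-separated synthetic 2-D clusters; `GMM_EM` and `getExpectation` from the file above, which expect `np.mat` input):

```python
import numpy as np
from numpy.random import multivariate_normal

Y = np.vstack((multivariate_normal([0, 0], np.eye(2), 300),
               multivariate_normal([6, 6], np.eye(2), 500)))
mu, cov, alpha = GMM_EM(np.mat(Y), K=2, times=100)
gamma = getExpectation(np.mat(Y), mu, cov, alpha)   # final responsibilities
labels = np.asarray(gamma.argmax(axis=1)).ravel()   # hard cluster assignments
```
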
/GMM/scripts/gmm_model.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def getGMM(filename):
    nw = 320  # samples per 20 ms frame at 16 kHz
    inc = 160
    wave_data = getWaveData(filename)
    winfunc = signal.hann(nw)
    X = enframe(wave_data, nw, inc, winfunc)
    frameNum = X.shape[0]  # number of frames

    data = []
    for oneframe in X:
        tmpList = list()
        mfccs = librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24)
        for a in mfccs:
            tmpList.append(a[0])
        data.append(tmpList)
    data = np.array(data)
    # data.shape : (frames_num, 24)
    print(data.shape)

    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))


def train_gmm_model():
    start = time.perf_counter()
    female_siri_path = '../speech/female/female_'
    male_siri_path = '../speech/male/male_'
    GMMs = []
    # female
    for i in range(9):
        train_female_siri_file = female_siri_path + str(i + 1) + '/sentence_1.wav'
        model_file = '../models/sentence_female_' + str(i + 1) + '_gmm.model'
        female_siri_gmm = getGMM(train_female_siri_file)
        joblib.dump(female_siri_gmm, model_file)
        print("finished")
    # male
    for i in range(6):
        train_male_siri_file = male_siri_path + str(i + 1) + '/sentence_1.wav'
        model_file = '../models/sentence_male_' + str(i + 1) + '_gmm.model'
        male_siri_gmm = getGMM(train_male_siri_file)
        joblib.dump(male_siri_gmm, model_file)
        print("finished")
    # model = joblib.load(model_file)
    timePointAfterGmm = time.perf_counter()
def test_gmm_model():
    female_siri_path = '../speech/female/female_'
    male_siri_path = '../speech/male/male_'
    # framing parameters for the sampled signal
    nw = 320
    inc = 160
    winfunc = signal.hann(nw)

    test_data_list = []
    gmm_model_list = []
    # female
    for i in range(9):
        test_female_siri_file = female_siri_path + str(i + 1) + '/xiaoai_2.wav'
        # load the model
        model_file = '../models/sentence_female_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        testFrames = enframe(getWaveData(test_female_siri_file), nw, inc, winfunc)
        data = []
        # extract the MFCC features of the test utterance
        for oneframe in testFrames:
            tmpList = list()
            for a in librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24):
                tmpList.append(a[0])
            data.append(tmpList)
        data = np.array(data)
        print(data.shape)
        test_data_list.append(data)
        # maxPro = GMMs[0].score(data)
    # male
    for i in range(6):
        test_male_siri_file = male_siri_path + str(i + 1) + '/xiaoai_2.wav'
        # load the model
        model_file = '../models/sentence_male_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        testFrames = enframe(getWaveData(test_male_siri_file), nw, inc, winfunc)
        data = []
        # extract the MFCC features of the test utterance
        for oneframe in testFrames:
            tmpList = list()
            for a in librosa.feature.mfcc(y=oneframe, sr=16000, n_mfcc=24):
                tmpList.append(a[0])
            data.append(tmpList)
        data = np.array(data)
        print(data.shape)
        test_data_list.append(data)
    # score every test utterance against every speaker model
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            ss = model.score_samples(test_data)
            scores.append(test_score)
            # print("test_score:", ss.shape)
        softmax(scores)
        print("-------------------")


if __name__ == "__main__":
    train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
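A note on the scoring used in `test_gmm_model`: sklearn's `GaussianMixture.score()` returns the *mean* per-frame log-likelihood, so utterances of different lengths are directly comparable. A small sketch, with `model` and `test_data` as in the loop above:

```python
import numpy as np

avg_ll = model.score(test_data)             # scalar: mean per-frame log-likelihood
frame_ll = model.score_samples(test_data)   # (n_frames,): per-frame log-likelihoods
assert np.isclose(avg_ll, frame_ll.mean())
```
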
/GMM/scripts/gmm_model_2.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def getGMM(filename):
    y, sr = librosa.load(filename)
    # extract the MFCC features
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, hop_length=160, win_length=240)
    print(mfccs.shape)

    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(1, covariance_type='full', random_state=0).fit(mfccs.T)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))


def train_gmm_model():
    start = time.perf_counter()
    female_siri_path = '../speech/female/female_'

    for i in range(9):
        train_female_siri_file = female_siri_path + str(i + 1) + '/siri_1.wav'
        model_file = '../models/direct_female_' + str(i + 1) + '_gmm.model'
        female_siri_gmm = getGMM(train_female_siri_file)
        joblib.dump(female_siri_gmm, model_file)
        print("finished")
    # model = joblib.load(model_file)
    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    female_siri_path = '../speech/female/female_'
    # framing parameters for the sampled signal
    nw = 320
    inc = 160
    winfunc = signal.hann(nw)

    test_data_list = []
    gmm_model_list = []
    for i in range(9):
        test_female_siri_file = female_siri_path + str(i + 1) + '/siri_2.wav'
        # load the model
        model_file = '../models/direct_female_' + str(i + 1) + '_gmm.model'
        gmm_model = joblib.load(model_file)
        gmm_model_list.append(gmm_model)

        y, sr = librosa.load(test_female_siri_file)
        # extract the MFCC features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, hop_length=160, win_length=240)

        test_data_list.append(mfccs.T)
        # maxPro = GMMs[0].score(data)

    # score every test utterance against every speaker model
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            scores.append(test_score)
            print("test_score:", test_score)
        softmax(scores)
        print("-------------------")


if __name__ == "__main__":
    train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_model_3.py:
--------------------------------------------------------------------------------
# Train and test the GMMs from the saved MFCC features.

# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math


def enframe(wave_data, nw, inc, winfunc):
    '''Split an audio signal into overlapping frames.
    Args:
        wave_data: the raw audio samples
        nw: frame length in samples (sampling rate x frame duration)
        inc: hop size between adjacent frames, in samples
    '''
    wlen = len(wave_data)  # total signal length
    if wlen <= nw:  # if the signal is shorter than one frame, use a single frame
        nf = 1
    else:  # otherwise compute the total number of frames
        nf = int(np.ceil((1.0 * wlen - nw + inc) / inc))
    pad_length = int((nf - 1) * inc + nw)  # total length of all frames laid out flat
    zeros = np.zeros((pad_length - wlen,))  # zero-pad the tail, as when padding before an FFT
    pad_signal = np.concatenate((wave_data, zeros))
    indices = np.tile(np.arange(0, nw), (nf, 1)) + np.tile(np.arange(0, nf * inc, inc), (nw, 1)).T  # sample index of every point in every frame, an nf x nw matrix
    indices = np.array(indices, dtype=np.int32)
    frames = pad_signal[indices]  # the framed signal
    win = np.tile(winfunc, (nf, 1))  # window function (all ones by default)
    return frames * win  # windowed frame matrix


def getWaveData(filename):
    fw = wave.open(filename, 'rb')
    params = fw.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]
    str_data = fw.readframes(nframes)
    wave_data = np.frombuffer(str_data, dtype=np.int16)
    wave_data = wave_data * 1.0 / (max(abs(wave_data)))  # normalize the amplitude to [-1, 1]
    fw.close()
    return wave_data


def getGMM(filename):
    data = np.load(filename)
    # With 5 training utterances, 3 mixture components gave the best results.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))
    return scores.index(max(scores))


def train_gmm_model():
    start = time.perf_counter()
    root_dir = '../models/TIMIT_MFCC_24/'
    GMMs = []
    # train one GMM per TRAIN speaker
    for i in range(462):
        spk_mfcc = '../speech/TIMIT/TRAIN_MFCC/' + 'spk_' + str(i + 1) + '/spk_' + str(i + 1) + '_mfcc.npy'
        if os.path.exists(spk_mfcc):
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            timit_gmm = getGMM(spk_mfcc)
            joblib.dump(timit_gmm, model_file)
            print("finished")
        else:
            print("not exists")
        # save_name = '../mfcc_features/female_' + str(i+1) + '.npy'

    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    test_data_list = []
    gmm_model_list = []
    root_dir = '../models/TIMIT_MFCC_24/'

    for i in range(462):
        test_mfcc_path = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1) + '/'
        if os.path.exists(test_mfcc_path):
            mfcc_npy_file = test_mfcc_path + os.listdir(test_mfcc_path)[0]
            # load the model
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            gmm_model = joblib.load(model_file)
            gmm_model_list.append(gmm_model)

            data = np.load(mfcc_npy_file)
            test_data_list.append(data)
    # score every test utterance against every speaker model
    test_right = 0
    i = 0
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            # ss = model.score_samples(test_data)
            scores.append(test_score)
        result = softmax(scores)
        if (i == result):
            test_right += 1
        i += 1
        print("-------------------")
    print("right:{0}, accuracy:{1}".format(test_right, test_right / 462))


if __name__ == "__main__":
    # train_gmm_model()
    test_gmm_model()

--------------------------------------------------------------------------------
/GMM/scripts/gmm_timit.py:
--------------------------------------------------------------------------------
# _*_ coding=utf-8 _*_
from scipy import signal
import pylab as pl
from sklearn.mixture import GaussianMixture
import joblib
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import wave
import time
import math
import multiprocessing


def getGMM(filename):
    data = np.load(filename)
    # 7 diagonal-covariance components for the 13-d MFCC setup.
    # A Bayesian GMM would avoid having to choose the component count.
    gmm = GaussianMixture(7, covariance_type='diag', random_state=0).fit(data)
    return gmm


def softmax(scores):
    ss = 0.0
    Sum = 0.0
    for score in scores:
        ss += score

    # scores are (negative) average log-likelihoods; dividing by their sum and
    # flipping the sign maps the best score to the largest value
    scores = [(-1) * float(i) / ss for i in scores]

    for score in scores:
        Sum += math.exp(score)
    print("probability:{0}, index:{1}".format(math.exp(max(scores)) / Sum, scores.index(max(scores))))
    return scores.index(max(scores))


def train_gmm_model(start_num, end_num):
    start = time.perf_counter()
    # To train a different configuration, only change (1) and (2) below.
    # (1) model root directory
    root_dir = '../models/TIMIT_MFCC_24/'
    GMMs = []
    for i in range(start_num, end_num):
        spk_mfcc = '../speech/TIMIT/TRAIN_MFCC/' + 'spk_' + str(i + 1) + '/spk_' + str(i + 1) + '_13d_mfcc.npy'
        if os.path.exists(spk_mfcc):
            # (2) model file name
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            timit_gmm = getGMM(spk_mfcc)
            joblib.dump(timit_gmm, model_file)
            print("finished")
        else:
            print("not exists")
        # save_name = '../mfcc_features/female_' + str(i+1) + '.npy'

    timePointAfterGmm = time.perf_counter()


def test_gmm_model():
    test_data_list = []
    gmm_model_list = []
    # (1) model root directory
    root_dir = '../models/TIMIT_MFCC_24/'

    for i in range(462):
        test_mfcc_path = '../speech/TIMIT/TEST_MFCC/' + 'spk_' + str(i + 1) + '/'
        if os.path.exists(test_mfcc_path):
            mfcc_npy_file = test_mfcc_path + os.listdir(test_mfcc_path)[0]
            # mfcc_npy_file = test_mfcc_path + '/spk_' + str(i+1) + '_13d_mfcc.npy'
            # load the model
            # (2) model file name
            model_file = root_dir + 'spk_' + str(i + 1) + '/' + 'TIMIT_MFCC_24_gmm.model'
            gmm_model = joblib.load(model_file)
            gmm_model_list.append(gmm_model)

            data = np.load(mfcc_npy_file)
            test_data_list.append(data)
    # score every test utterance against every speaker model
    test_right = 0
    i = 0
    for model in gmm_model_list:
        scores = []
        for test_data in test_data_list:
            test_score = model.score(test_data)
            # ss = model.score_samples(test_data)
            scores.append(test_score)
        result = softmax(scores)
        if (i == result):
            test_right += 1
        i += 1
        print("-------------------")
    print("right:{0}, accuracy:{1}".format(test_right, test_right / 462))


def gmm_train_multiprocess():
    train_mfcc = [multiprocessing.Process(target=train_gmm_model, args=(0, 100,)),
                  multiprocessing.Process(target=train_gmm_model, args=(100, 200,)),
                  multiprocessing.Process(target=train_gmm_model, args=(200, 300,)),
                  multiprocessing.Process(target=train_gmm_model, args=(300, 400,)),
                  multiprocessing.Process(target=train_gmm_model, args=(400, 462,))]

    for process in train_mfcc:
        process.daemon = True
        process.start()
    for process in train_mfcc:
        process.join()


if __name__ == "__main__":
    gmm_train_multiprocess()

--------------------------------------------------------------------------------
/GMM/scripts/mkdir_script.py:
--------------------------------------------------------------------------------
import os


def mkdir_mfcc24_gmm_model():
    root_dir = '../models/TIMIT_MFCC_24/'
    for i in range(462):
        spk_model_dir = root_dir + 'spk_' + str(i + 1)
        print(spk_model_dir)
        os.makedirs(spk_model_dir)


def mkdir_mfcc_13d_7_diag_model():
    root_dir = '../models/TIMIT_MFCC_13d_7_diag/'
    for i in range(462):
        spk_model_dir = root_dir + 'spk_' + str(i + 1)
        print(spk_model_dir)
        os.makedirs(spk_model_dir)


def mkdir_timit_test():
    root_dir = '../speech/TIMIT/TEST_MFCC/'
    for i in range(462):
        timit_test_dir = root_dir + 'spk_' + str(i + 1)
        print(timit_test_dir)
        os.makedirs(timit_test_dir)


def justify_multiprocess_nor_success():
    # Verify that the multiprocessing feature extraction produced two files per speaker.
    root_dir = "../speech/TIMIT/TRAIN_MFCC/"
    spk_list = os.listdir(root_dir)
    print(len(spk_list))
    for spk in spk_list:
        content = os.listdir(os.path.join(root_dir, spk))
        if (len(content) == 2):
            pass
        else:
            print("error")


# mkdir_mfcc24_gmm_model()
# mkdir_timit_test()
# justify_multiprocess_nor_success()
mkdir_mfcc_13d_7_diag_model()

--------------------------------------------------------------------------------
/GMM/scripts/mute_remove.py:
--------------------------------------------------------------------------------
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS

[Fs, x] = aIO.read_audio_file("../speech/female/female_1/siri_1.wav")
segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=1.0, weight=0.3, plot=True)

--------------------------------------------------------------------------------
/GMM/scripts/timit_deal/concat_timit_sentence.py:
--------------------------------------------------------------------------------
import glob
import os
import numpy as np
import scipy.io.wavfile as wav

import shutil


def mkdir_files(path_read_folder, path_write_wav_file):
    target_dir = '../../speech/TIMIT/TRAIN/'
    target_dir2 = '../../speech/TIMIT/TEST/'

    drs = os.listdir(path_read_folder)
    s = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(path_read_folder, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            for filename in glob.glob(os.path.join(samples_path, 'SX*_.wav')):
                s_file = filename.split('\\')
                mkdir_dir = target_dir + s_file[1] + '/' + s_file[2]
                target = mkdir_dir + '/' + s_file[3]
                print(target)
                shutil.copy(filename, target)
                # if os.path.exists(mkdir_dir):
                #     pass
                # else:
                #     os.makedirs(mkdir_dir)
                s += 1

    # The same loop was used for the SI*_.wav sentences, copying them into the
    # TEST directory instead and removing the originals:
    # drs = os.listdir(target_dir)
    # s = 0
    # for dr in drs:
    #     # DR*
    #     drs_path = os.path.join(target_dir, dr)
    #     samples = os.listdir(drs_path)
    #     for sample in samples:
    #         samples_path = os.path.join(drs_path, sample)
    #         for filename in glob.glob(os.path.join(samples_path, 'SI*_.wav')):
    #             print(filename)
    #             s_file = filename.split('\\')
    #             target = target_dir2 + filename[25:28] + '/' + s_file[1] + '/' + s_file[2]
    #             print(target)
    #             shutil.copy(filename, target)
    #             os.remove(filename)
    print("sum:", s)
# Merge the five SX sentences and the first SI sentence as the training set;
# the remaining two SI sentences are used as the test set.
def merge_files(path_read_folder, path_write_wav_file):
    target_dir = '../../speech/TIMIT/TRAIN/'
    save_name = ''

    drs = os.listdir(target_dir)
    s = 0
    for dr in drs:
        # DR*
        drs_path = os.path.join(target_dir, dr)
        samples = os.listdir(drs_path)
        for sample in samples:
            samples_path = os.path.join(drs_path, sample)
            merged_signal = []
            for filename in glob.glob(os.path.join(samples_path, '*.wav')):
                s_file = filename.split('\\')
                save_name = target_dir + filename[25:28] + '/' + s_file[1] + '/' + "merge_result.wav"
                sr, signal = wav.read(filename)
                merged_signal.append(signal)
            print(len(merged_signal))
            merged_signal = np.hstack(merged_signal)
            merged_signal = np.asarray(merged_signal, dtype=np.int16)
            wav.write(save_name, sr, merged_signal)
    print("sum:", s)


path_read_folder = '../../speech/TIMIT2/TRAIN'
path_write_wav_file = '../../speech/male/male_'
merge_files(path_read_folder, path_write_wav_file)

--------------------------------------------------------------------------------
/GMM/scripts/timit_deal/timit_data_deal.py:
--------------------------------------------------------------------------------
from sphfile import SPHFile
import glob
import os


if __name__ == "__main__":
    train_path = '../../speech/TIMIT/TRAIN/*/*/*.WAV'
    test_path = '../../speech/TIMIT/TEST/*/*/*.WAV'
    train_sph_files = glob.glob(train_path)
    test_sph_files = glob.glob(test_path)
    print(len(train_sph_files), "train utterances")
    print(len(test_sph_files), "test utterances")
    # for i in train_sph_files:
    #     sph = SPHFile(i)
    #     sph.write_wav(filename=i.replace(".WAV", "_.wav"))
    #     os.remove(i)
    for i in test_sph_files:
        sph = SPHFile(i)
        sph.write_wav(filename=i.replace(".WAV", "_.wav"))
        # os.remove(i)
    # print("Completed")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Speaker Recognition

Speaker recognition is also known as voiceprint recognition. From the 1960s until today it has been a central topic of biometrics research, evolving from template matching, through statistical approaches, to today's mainstream deep-learning methods. This project provides implementations ranging from classical approaches (GMM, GMM-UBM, GMM-SVM [3], joint factor analysis, i-vector) to deep-learning-based speaker recognition.

## 1. GMM-based speaker recognition

### 1.1 Test environment

- OS: Windows 10
- Python 3.6
- Main open-source libraries: sklearn, librosa, numpy
- Datasets: the TIMIT speech corpus, plus a small self-collected dataset of 15 speakers with **6 utterances per speaker (not public for now)**

### 1.2 Evaluation on TIMIT

The TIMIT corpus was built for acquiring acoustic-phonetic knowledge (model training) and for evaluating automatic speech recognition (ASR) systems (model testing). It was sponsored by the Department of Defense and produced jointly by DARPA-ISTO, MIT, SRI International and Texas Instruments (TI). Speakers: 630 speakers from 8 major dialect regions of the United States, each reading 10 sentences. The 10 sentences fall into three groups:

**SA** — dialect sentences, designed by SRI, 2 in total. Every speaker reads SA1 and SA2, which expose the dialect differences between regions (they are therefore suitable for dialect identification datasets and are usually excluded otherwise).

**SX** — phonetically compact sentences, designed by MIT, 450 in total, with a balanced phoneme distribution that covers as many phoneme pairs as possible. Each speaker reads 5 SX sentences, and each SX sentence is read by 7 different speakers.

**SI** — phonetically diverse sentences, selected by TI from the Brown Corpus and the Playwrights Dialog, 1890 in total, to maximize the diversity of sentence types and phonetic contexts (covering as many allophonic contexts as possible). Each speaker reads 3 SI sentences, and each SI sentence is read by only one speaker.

The 630 speakers are split into TRAIN (462 speakers) and TEST (168 speakers). Only the speech of the 462 TRAIN speakers is used here, so there are 462 speaker classes. The two SA sentences are dialect sentences and are not used. Of the remaining 8 sentences per speaker, the 5 SX sentences and 1 SI sentence form the training set, and the other 2 SI sentences form the test set. The 6 training sentences are concatenated into a single utterance to simplify MFCC extraction.

My split of the TIMIT data: [[Baidu Driver](https://pan.baidu.com/s/1YnPZochiRY0IDfSoFbivqw?pwd=c1fc) | [Google Driver](https://drive.google.com/file/d/1J8YaWN9oFFGzVH6kNPcI8VsXmFe8g5gr/view?usp=sharing)]

You can also download the original TIMIT data and split it yourself: [[Baidu Driver](https://pan.baidu.com/s/1YnPZochiRY0IDfSoFbivqw?pwd=c1fc) | [Google Driver](https://drive.google.com/file/d/180mSIiXN9RVDV2Xn1xcWNkMRm5J5MjN4/view?usp=sharing)]

> ├─TEST (168 speakers)
> │ ├─DR1
> │ │ ├─FCJF0
> │ │ ├─FDAW0
> ......
> │ ├─DR2
> │ │ ├─FAEM0
> │ │ ├─FAJW0
> ......
> │ ├─DR3
> │ │ ├─FALK0
> │ │ ├─FCKE0
> ......
> ├─TEST_MFCC (MFCC features of the test set, 462 speakers)
> │ ├─spk_1
> │ ├─spk_10
> │ ├─spk_100
> ......
> ├─TRAIN (training data, 462 speakers)
> │ ├─DR1
> │ │ ├─FCJF0
> │ │ ├─FDAW0
> ......
> │ ├─DR2
> │ │ ├─MTJG0
> ......
> │ ├─DR3
> │ │ ├─FALK0
> │ │ ├─FCKE0
> ......
> └─TRAIN_MFCC (MFCC features of the training set, 462 speakers)
> ├─spk_1
> ├─spk_10
> ├─spk_100
> ......

**The pipeline implemented in Python is roughly as follows:**

(1) Extract 24-dimensional MFCC features. Each of the 462 speakers' concatenated long utterances (about 20 s) is read in; the MFCC extraction itself is identical to the procedure described for my small dataset, so it is not repeated here. There are two main differences. First, a 20 s utterance yields a feature matrix of roughly (2000, 24), so the features are saved to avoid repeated extraction; librosa returns MFCCs as numpy arrays, so they are stored as .npy files and simply loaded when needed. Second, extracting 24-d MFCCs for 462 speakers is quite time-consuming, so the speakers are split into 4 batches with one process per batch, which improved throughput by roughly 4x.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101414806.png)

(2) GMM training. Each speaker's 24-d MFCC features are used to fit that speaker's GMM. After comparative tuning, 3 mixture components with full covariance matrices worked best. GMM training is likewise parallelized across processes.
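A minimal sketch of steps (1)–(2) for a single speaker, mirroring `gmm_model_3.py` (the paths are the ones used in this repository):

```python
import joblib
import numpy as np
from sklearn.mixture import GaussianMixture

mfcc = np.load('../speech/TIMIT/TRAIN_MFCC/spk_1/spk_1_mfcc.npy')  # (n_frames, 24)
gmm = GaussianMixture(3, covariance_type='full', random_state=0).fit(mfcc)
joblib.dump(gmm, '../models/TIMIT_MFCC_24/spk_1/TIMIT_MFCC_24_gmm.model')
```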
(3) Testing the speaker GMMs. One SI sentence (about 2 s) per speaker is used as test data. The 24-d MFCC features of the test utterance are extracted and scored by each of the 462 speaker GMMs. Softmax then normalizes the 462 scores to the [0, 1] interval, i.e. the probability of each speaker under the current GMM; the speaker with the highest probability is the prediction.

(4) Results: on the first SI test sentence, 294 speakers were identified correctly and 168 incorrectly, an accuracy of **63.6%**. On the second SI test sentence, 204 were correct and 258 wrong, an accuracy of **44.2%**.

## 2. Self-attention-based speaker recognition

### 2.1 Test environment

- Google Colab (Tesla T4, 16 GB)

- PyTorch 1.7.1

- Dataset: VoxCeleb (600 speakers selected)

**This part mainly follows homework HW4 of Hung-yi Lee's 2021 deep learning course.** From the open VoxCeleb1 speaker recognition dataset, 600 speakers are selected, and 40-dimensional mel filter-bank features are extracted from their speech as the network input.

For the network architecture we use the self-attention mechanism. The figure below shows the Transformer proposed in "Attention Is All You Need", which consists of an encoder and a decoder; only the encoder (the left side) is used in this network.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101612748.png)

A brief look at the Transformer encoder: it is a stack of identical blocks connected in series, each of which is a self-attention block.

![image-20210605101637276](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101637276.png)

Unlike plain self-attention, the self-attention output here has the corresponding input added back to it (a residual connection), and the sum is passed through Layer Norm. Layer Norm differs from Batch Norm: Batch Norm computes the mean and standard deviation of one feature dimension across the different samples in a batch, whereas Layer Norm computes them across the feature dimensions of a single sample. The normalized result then goes through a fully connected layer, another residual addition and Layer Norm, and is then output.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101642261.png)
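A small PyTorch illustration of the two normalizations, using the (batch, length, d_model) tensor shape this model works with:

```python
import torch
import torch.nn as nn

x = torch.randn(8, 100, 80)                    # (batch, length, d_model)
ln = nn.LayerNorm(80)                          # statistics over the 80 features of each position
y_ln = ln(x)
bn = nn.BatchNorm1d(80)                        # statistics over the batch (and length) per feature
y_bn = bn(x.transpose(1, 2)).transpose(1, 2)   # BatchNorm1d expects (batch, channels, length)
```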
Speaker recognition network code:

```python
import torch
import torch.nn as nn
from torchsummary import summary  # assuming the torchsummary package for the summary() call below


class Classifier(nn.Module):
    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()
        # Project the dimension of features from that of input into d_model.
        self.prenet = nn.Linear(40, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, dim_feedforward=256, nhead=2
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        # Project the dimension of features from d_model into speaker nums.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, n_spks),
        )

    def forward(self, mels):
        """
        args:
            mels: (batch size, length, 40)
        return:
            out: (batch size, n_spks)
        """
        # out: (batch size, length, d_model)
        out = self.prenet(mels)
        # out: (length, batch size, d_model)
        out = out.permute(1, 0, 2)
        # The encoder layer expects features in the shape of (length, batch size, d_model).
        out = self.encoder(out)
        # out: (batch size, length, d_model)
        out = out.transpose(0, 1)
        # mean pooling
        stats = out.mean(dim=1)
        # out: (batch, n_spks)
        out = self.pred_layer(stats)
        return out


net = Classifier()
summary(net.to("cuda"), (2, 40), device="cuda")
```

The network structure is shown below:

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101751501.png)

Next, the data are split into training and validation sets: 90% for training and 10% for validation.

Since speaker recognition here is a classification problem, the loss function is CrossEntropyLoss(). In PyTorch, nn.CrossEntropyLoss() already combines the softmax and the cross entropy, so no separate softmax layer is needed; the model's raw outputs and the labels are passed to it directly. The optimizer is AdamW, an improved variant of Adam with better optimization behavior (a minimal training-step sketch is given at the end of this document).

The training process is shown below. Training runs for 70,000 iterations, with a validation pass every 2,000 iterations. The training loss keeps decreasing and the accuracy keeps rising; at the end of training the accuracy is 91% on the training set and 80% on the validation set.

![](https://cdn.jsdelivr.net/gh/Kevinnan-teen/CDN/image-20210605101804628.png)
--------------------------------------------------------------------------------
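For reference, a minimal sketch of the training step described above; the `Classifier` model from the code block and a dataloader yielding `(mels, labels)` batches are assumed:

```python
import torch
import torch.nn as nn

model = Classifier().to("cuda")
criterion = nn.CrossEntropyLoss()   # applies log-softmax internally, so the model outputs raw logits
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

def train_step(mels, labels):
    optimizer.zero_grad()
    logits = model(mels)             # (batch, n_spks)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    acc = (logits.argmax(dim=-1) == labels).float().mean()
    return loss.item(), acc.item()
```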