├── .gitignore
├── requirements.txt
├── test
│   ├── s1.wav
│   ├── s2.wav
│   ├── s3.wav
│   ├── s4.wav
│   ├── s5.wav
│   ├── s6.wav
│   ├── s7.wav
│   └── s8.wav
├── train
│   ├── s1.wav
│   ├── s2.wav
│   ├── s3.wav
│   ├── s4.wav
│   ├── s5.wav
│   ├── s6.wav
│   ├── s7.wav
│   └── s8.wav
├── ASR_report.pdf
├── src
│   ├── __init__.py
│   ├── test.py
│   ├── LPC.py
│   ├── spectrogram.py
│   ├── LBG.py
│   ├── train.py
│   └── mel_coefficients.py
├── README.md
└── LICENSE

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.out
3 | *.geany
4 | /.env
5 | 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | matplotlib
4 | 

--------------------------------------------------------------------------------
/test/s1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s1.wav

--------------------------------------------------------------------------------
/test/s2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s2.wav

--------------------------------------------------------------------------------
/test/s3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s3.wav

--------------------------------------------------------------------------------
/test/s4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s4.wav

--------------------------------------------------------------------------------
/test/s5.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s5.wav

--------------------------------------------------------------------------------
/test/s6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s6.wav

--------------------------------------------------------------------------------
/test/s7.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s7.wav

--------------------------------------------------------------------------------
/test/s8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/test/s8.wav

--------------------------------------------------------------------------------
/train/s1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s1.wav

--------------------------------------------------------------------------------
/train/s2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s2.wav

--------------------------------------------------------------------------------
/train/s3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s3.wav

--------------------------------------------------------------------------------
/train/s4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s4.wav

--------------------------------------------------------------------------------
/train/s5.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s5.wav

--------------------------------------------------------------------------------
/train/s6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s6.wav

--------------------------------------------------------------------------------
/train/s7.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s7.wav

--------------------------------------------------------------------------------
/train/s8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/train/s8.wav

--------------------------------------------------------------------------------
/ASR_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/orchidas/Speaker-Recognition/HEAD/ASR_report.pdf

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Feb 26 02:37:34 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Speaker-Recognition
2 | Automatic Speaker Recognition algorithms in Python
3 | 
4 | This repository contains Python programs that can be used for Automatic Speaker Recognition (ASR). ASR is done by extracting MFCCs (Mel Frequency Cepstral Coefficients) and LPCs (Linear Predictive Coding coefficients) from each speaker and then forming a speaker-specific codebook
5 | from them by using Vector Quantization (I like to think of it as a fancy name for NN-clustering).
6 | The system is then trained and tested on 8 different speakers.
7 | 
8 | 
9 | To test the algorithm, run test.py from the terminal. Certain parameters can be changed, such as the order of the LPC coefficients, the number of Mel filterbanks and the number of centroids in each codebook.
10 | Everything is included in the repository, including .wav files for testing and training, so cloning it and running test.py should work.
11 | 
12 | A PDF (ASR_report.pdf) has been included that explains the theory and provides links to relevant websites and projects.
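As a rough guide, the same pipeline that test.py runs can also be driven by hand. This is only a minimal sketch: it assumes the interpreter is started from inside `src/` (the scripts build the `train/` and `test/` paths relative to the parent of the working directory), and the number of centroids per codebook is currently hard-coded as `nCentroid = 16` inside `train.training`.

```python
# Sketch of the pipeline used by src/test.py, with the two exposed parameters.
from scipy.io.wavfile import read
from train import training
from mel_coefficients import mfcc
from LPC import lpc

nfiltbank = 12   # number of Mel filterbanks
orderLPC = 15    # order of the LPC model
# builds one MFCC codebook and one LPC codebook per training speaker
codebooks_mfcc, codebooks_lpc = training(nfiltbank, orderLPC)

fs, s = read('../test/s1.wav')       # any utterance from the test folder
mel_coefs = mfcc(s, fs, nfiltbank)   # nfiltbank x nFrames feature matrix
lpc_coefs = lpc(s, fs, orderLPC)     # orderLPC x nFrames feature matrix
```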
13 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) [2016] [Orchisama Das]
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/src/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Feb 26 20:21:03 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | from __future__ import division
9 | import numpy as np
10 | from scipy.io.wavfile import read
11 | from LBG import EUDistance
12 | from mel_coefficients import mfcc
13 | from LPC import lpc
14 | from train import training
15 | import os
16 | 
17 | 
18 | nSpeaker = 8
19 | nfiltbank = 12
20 | orderLPC = 15
21 | (codebooks_mfcc, codebooks_lpc) = training(nfiltbank, orderLPC)
22 | directory = os.path.dirname(os.getcwd()) + '/test';
23 | fname = str()
24 | nCorrect_MFCC = 0
25 | nCorrect_LPC = 0
26 | 
27 | 
28 | def minDistance(features, codebooks):
29 |     speaker = 0
30 |     distmin = np.inf
31 |     for k in range(np.shape(codebooks)[0]):
32 |         D = EUDistance(features, codebooks[k,:,:])
33 |         dist = np.sum(np.min(D, axis = 1))/(np.shape(D)[0])
34 |         if dist < distmin:
35 |             distmin = dist
36 |             speaker = k
37 | 
38 |     return speaker
39 | 
40 | 
41 | for i in range(nSpeaker):
42 |     fname = '/s' + str(i+1) + '.wav'
43 |     print('Now speaker ', str(i+1), 'features are being tested')
44 |     (fs,s) = read(directory + fname)
45 |     mel_coefs = mfcc(s,fs,nfiltbank)
46 |     lpc_coefs = lpc(s, fs, orderLPC)
47 |     sp_mfcc = minDistance(mel_coefs, codebooks_mfcc)
48 |     sp_lpc = minDistance(lpc_coefs, codebooks_lpc)
49 | 
50 |     print('Speaker ', (i+1), ' in test matches with speaker ', (sp_mfcc+1), ' in train for training with MFCC')
51 |     print('Speaker ', (i+1), ' in test matches with speaker ', (sp_lpc+1), ' in train for training with LPC')
52 | 
53 |     if i == sp_mfcc:
54 |         nCorrect_MFCC += 1
55 |     if i == sp_lpc:
56 |         nCorrect_LPC += 1
57 | 
58 | 
59 | percentageCorrect_MFCC = (nCorrect_MFCC/nSpeaker)*100
60 | print('Accuracy of result for training with MFCC is ', percentageCorrect_MFCC, '%')
61 | percentageCorrect_LPC = (nCorrect_LPC/nSpeaker)*100
62 | print('Accuracy of result for training with LPC is ', percentageCorrect_LPC, '%')
63 | 
64 | 
65 | 
66 | 
--------------------------------------------------------------------------------
/src/LPC.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 27 22:01:37 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | #calculate LPC coefficients from sound file
9 | from __future__ import division
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def autocorr(x):
15 |     n = len(x)
16 |     variance = np.var(x)
17 |     x = x - np.mean(x)
18 |     r = np.correlate(x, x, mode = 'full')[-n:] #n numbers from last index (l-n to l)
19 |     result = r/(variance*(np.arange(n, 0, -1)))
20 |     return result
21 | 
22 | 
23 | def createSymmetricMatrix(acf,p):
24 |     R = np.empty((p,p))
25 |     for i in range(p):
26 |         for j in range(p):
27 |             R[i,j] = acf[np.abs(i-j)]
28 |     return R
29 | 
30 | 
31 | def lpc(s,fs,p):
32 | 
33 |     #divide into segments of 25 ms with overlap of 10ms
34 |     nSamples = np.int32(0.025*fs)
35 |     overlap = np.int32(0.01*fs)
36 |     nFrames = np.int32(np.ceil(len(s)/(nSamples-overlap)))
37 | 
38 |     #zero padding so that the last frame is full length
39 |     padding = ((nSamples-overlap)*(nFrames-1) + nSamples) - len(s)
40 |     if padding > 0:
41 |         signal = np.append(s, np.zeros(padding))
42 |     else:
43 |         signal = s
44 |     segment = np.empty((nSamples, nFrames))
45 |     start = 0
46 |     for i in range(nFrames):
47 |         segment[:,i] = signal[start:start+nSamples]
48 |         start = (nSamples-overlap)*(i+1)    #advance by one hop for the next frame
49 | 
50 |     #calculate LPC with Yule-Walker
51 |     lpc_coeffs = np.empty((p, nFrames))
52 |     for i in range(nFrames):
53 |         acf = autocorr(segment[:,i])
54 |         # plt.figure(1)
55 |         # plt.plot(acf)
56 |         # plt.xlabel('lags')
57 |         # plt.ylabel('Autocorrelation coefficients')
58 |         # plt.axis([0, np.size(acf), -1, 1])
59 |         # plt.title('Autocorrelation function')
60 |         # break
61 |         r = -acf[1:p+1].T
62 |         R = createSymmetricMatrix(acf,p)
63 |         lpc_coeffs[:,i] = np.dot(np.linalg.inv(R),r)
64 |         lpc_coeffs[:,i] = lpc_coeffs[:,i]/np.max(np.abs(lpc_coeffs[:,i]))
65 | 
66 |     return lpc_coeffs
67 | 

--------------------------------------------------------------------------------
/src/spectrogram.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Feb 23 23:53:09 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | #calculate short time fourier transform and plot spectrogram
8 | from __future__ import division
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | from scipy.fftpack import fft, fftshift
12 | from scipy.signal import hann
13 | 
14 | 
15 | def nearestPow2(inp):
16 |     power = np.ceil(np.log2(inp))
17 |     return int(2**power)    #return an integer so nfft can still be used as an array size
18 | 
19 | def stft(signal, fs, nfft, overlap):
20 | 
21 |     #plotting time domain signal
22 |     plt.figure(1)
23 |     t = np.arange(0,len(signal)/fs, 1/fs)
24 |     plt.plot(t,signal)
25 |     plt.axis(xmax = 1)
26 |     plt.xlabel('Time in seconds')
27 |     plt.ylabel('Amplitude')
28 |     plt.title('Speech signal')
29 | 
30 |     if not np.log2(nfft).is_integer():
31 |         nfft = nearestPow2(nfft)
32 |     slength = len(signal)
33 |     hop_size = np.int32(overlap * nfft)
34 |     nFrames = int(np.round(len(signal)/(nfft-hop_size)))
35 |     #zero padding to make signal length long enough to have nFrames
36 |     signal = np.append(signal, np.zeros(nfft))
37 |     STFT = np.empty((nfft, nFrames))
38 |     segment = np.zeros(nfft)
39 |     start = 0
40 | 
41 |     for n in range(nFrames):
42 |         segment = signal[start:start+nfft] * hann(nfft)
43 |         padded_seg = np.append(segment,np.zeros(nfft))
44 |         spec = fftshift(fft(padded_seg))
45 |         spec = spec[len(spec)//2:]    #keep the positive-frequency half (integer division for Python 3 indexing)
46 |         spec = abs(spec)/max(abs(spec))
47 |         powerspec = 20*np.log10(spec)
48 |         STFT[:,n] = powerspec
49 |         start = start + nfft - hop_size
50 | 
51 |     #plot spectrogram
52 |     plt.figure(2)
53 |     freq = (fs/(2*nfft)) * np.arange(0,nfft,1)
54 |     time = np.arange(0,nFrames)*(slength/(fs*nFrames))
55 |     plt.imshow(STFT, extent = [0,max(time),0,max(freq)],origin='lower', cmap='jet', interpolation='nearest', aspect='auto')
56 |     plt.ylabel('Frequency in Hz')
57 |     plt.xlabel('Time in seconds')
58 |     plt.axis([0,max(time),0,np.max(freq)])
59 |     plt.title('Spectrogram of speech')
60 |     plt.show()
61 |     return (STFT, time, freq)
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 

--------------------------------------------------------------------------------
/src/LBG.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Feb 25 00:34:54 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | #speaker specific Vector Quantization codebook using LBG algorithm
9 | from __future__ import division
10 | import numpy as np
11 | 
12 | 
13 | def EUDistance(d,c):
14 | 
15 |     # np.shape(d)[0] = np.shape(c)[0]
16 |     n = np.shape(d)[1]
17 |     p = np.shape(c)[1]
18 |     distance = np.empty((n,p))
19 | 
20 |     if n < p:
21 |         for i in range(n):
22 |             copies = np.transpose(np.tile(d[:,i], (p,1)))
23 |             distance[i,:] = np.sum((copies - c)**2, 0)
24 |     else:
25 |         for i in range(p):
26 |             copies = np.transpose(np.tile(c[:,i], (n,1)))
27 |             distance[:,i] = np.sum((d - copies)**2, 0)
28 | 
29 |     distance = np.sqrt(distance)
30 |     return distance
31 | 
32 | 
33 | def lbg(features, M):
34 |     #build a codebook of M centroids with the LBG (split and refine) algorithm
35 |     eps = 0.01
36 |     #start from a single codeword: the mean of all feature vectors
37 |     codebook = np.mean(features, 1)
38 |     distortion = 1
39 |     nCentroid = 1
40 |     while nCentroid < M:
41 |         #double the size of the codebook by perturbing each centroid by +/- eps
42 |         new_codebook = np.empty((len(codebook), nCentroid*2))
43 |         if nCentroid == 1:
44 |             new_codebook[:,0] = codebook*(1+eps)
45 |             new_codebook[:,1] = codebook*(1-eps)
46 |         else:
47 |             for i in range(nCentroid):
48 |                 new_codebook[:,2*i] = codebook[:,i]*(1+eps)
49 |                 new_codebook[:,2*i+1] = codebook[:,i]*(1-eps)
50 | 
51 |         codebook = new_codebook
52 |         nCentroid = np.shape(codebook)[1]
53 |         D = EUDistance(features, codebook)
54 |         distortion = 1
55 |         while np.abs(distortion) > eps:
56 |             #nearest neighbour search
57 |             prev_distance = np.mean(D)
58 |             nearest_codebook = np.argmin(D,axis = 1)
59 |             #print 'nearest neighbour',nearest_codebook
60 | 
61 |             #cluster vectors and find new centroid
62 |             #print np.shape(np.mean(features[:, np.where(nearest_codebook == 0)],2))
63 |             for i in range(nCentroid):
64 |                 codebook[:,i] = np.mean(features[:,np.where(nearest_codebook == i)], 2).T #add along 3rd dimension
65 | 
66 |             #replace all NaN values with 0
67 |             codebook = np.nan_to_num(codebook)
68 |             #print 'this codebook', codebook
69 |             D = EUDistance(features, codebook)
70 |             distortion = (prev_distance - np.mean(D))/prev_distance
71 |             #print 'distortion' , distortion
72 | 
73 | 
74 |     #print 'final codebook', codebook, np.shape(codebook)
75 |     return codebook
76 | 
77 | 
78 | 

--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Feb 26 19:57:32 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | from __future__ import division
9 | import numpy as np
10 | from scipy.io.wavfile import read
11 | from LBG import lbg
12 | from mel_coefficients import mfcc
13 | from LPC import lpc
14 | import matplotlib.pyplot as plt
15 | import os
16 | 
17 | 
18 | def training(nfiltbank, orderLPC):
19 |     nSpeaker = 8
20 |     nCentroid = 16
21 |     codebooks_mfcc = np.empty((nSpeaker,nfiltbank,nCentroid))
22 |     codebooks_lpc = np.empty((nSpeaker, orderLPC, nCentroid))
23 |     directory = os.path.dirname(os.getcwd()) + '/train';
24 |     fname = str()
25 | 
26 |     for i in range(nSpeaker):
27 |         fname = '/s' + str(i+1) + '.wav'
28 |         print('Now speaker ', str(i+1), 'features are being trained')
29 |         (fs,s) = read(directory + fname)
30 |         mel_coeff = mfcc(s, fs, nfiltbank)
31 |         lpc_coeff = lpc(s, fs, orderLPC)
32 |         codebooks_mfcc[i,:,:] = lbg(mel_coeff, nCentroid)
33 |         codebooks_lpc[i,:,:] = lbg(lpc_coeff, nCentroid)
34 | 
35 |         plt.figure(i)
36 |         plt.title('Codebook for speaker ' + str(i+1) + ' with ' + str(nCentroid) + ' centroids')
37 |         for j in range(nCentroid):
38 |             plt.subplot(211)
39 |             plt.stem(codebooks_mfcc[i,:,j])
40 |             plt.ylabel('MFCC')
41 |             plt.subplot(212)
42 |             markerline, stemlines, baseline = plt.stem(codebooks_lpc[i,:,j])
43 |             plt.setp(markerline,'markerfacecolor','r')
44 |             plt.setp(baseline,'color', 'k')
45 |             plt.ylabel('LPC')
46 |             plt.axis(ymin = -1, ymax = 1)
47 |             plt.xlabel('Number of features')
48 | 
49 |     plt.show()
50 |     print('Training complete')
51 | 
52 |     #plotting 5th and 7th dimension MFCC features on a 2D plane
53 |     #comment lines 54 to 71 if you don't want to see codebook
54 |     codebooks = np.empty((2, nfiltbank, nCentroid))
55 |     mel_coeff = np.empty((2, nfiltbank, 68))
56 | 
57 |     for i in range(2):
58 |         fname = '/s' + str(i+2) + '.wav'
59 |         (fs,s) = read(directory + fname)
60 |         mel_coeff[i,:,:] = mfcc(s, fs, nfiltbank)[:,0:68]
61 |         codebooks[i,:,:] = lbg(mel_coeff[i,:,:], nCentroid)
62 | 
63 | 
64 |     plt.figure(nSpeaker + 1)
65 |     s1 = plt.scatter(mel_coeff[0,6,:], mel_coeff[0,4,:],s = 100, color = 'r', marker = 'o')
66 |     c1 = plt.scatter(codebooks[0,6,:], codebooks[0,4,:], s = 100, color = 'r', marker = '+')
67 |     s2 = plt.scatter(mel_coeff[1,6,:], mel_coeff[1,4,:],s = 100, color = 'b', marker = 'o')
68 |     c2 = plt.scatter(codebooks[1,6,:], codebooks[1,4,:], s = 100, color = 'b', marker = '+')
69 |     plt.grid()
70 |     plt.legend((s1, s2, c1, c2), ('Sp1','Sp2','Sp1 centroids', 'Sp2 centroids'), scatterpoints = 1, loc = 'upper left')
71 |     plt.show()
72 | 
73 | 
74 |     return (codebooks_mfcc, codebooks_lpc)
75 | 
76 | 
77 | 

--------------------------------------------------------------------------------
/src/mel_coefficients.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Feb 24 22:48:15 2016
4 | 
5 | @author: ORCHISAMA
6 | """
7 | 
8 | from __future__ import division
9 | from scipy.signal import hamming
10 | from scipy.fftpack import fft, fftshift, dct
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | 
14 | def hertz_to_mel(freq):
15 |     return 1125*np.log(1 + freq/700)
16 | 
17 | def mel_to_hertz(m):
18 |     return 700*(np.exp(m/1125) - 1)
19 | 
20 | #calculate mel frequency filter bank
21 | def mel_filterbank(nfft, nfiltbank, fs):
22 | 
23 |     #set limits of mel scale from 300Hz to 8000Hz
24 |     lower_mel = hertz_to_mel(300)
25 |     upper_mel = hertz_to_mel(8000)
26 |     mel = np.linspace(lower_mel, upper_mel, nfiltbank+2)
27 |     hertz = [mel_to_hertz(m) for m in mel]
28 |     fbins = [int(hz * int(nfft/2+1)/fs) for hz in hertz]
29 |     fbank = np.empty((int(nfft/2+1),nfiltbank))
30 |     for i in range(1,nfiltbank+1):
31 |         for k in range(int(nfft/2 + 1)):
32 |             if k < fbins[i-1]:
33 |                 fbank[k, i-1] = 0
34 |             elif k >= fbins[i-1] and k < fbins[i]:
35 |                 fbank[k,i-1] = (k - fbins[i-1])/(fbins[i] - fbins[i-1])
36 |             elif k >= fbins[i] and k <= fbins[i+1]:
37 |                 fbank[k,i-1] = (fbins[i+1] - k)/(fbins[i+1] - fbins[i])
38 |             else:
39 |                 fbank[k,i-1] = 0
40 | 
41 |     # plotting mel frequency filter banks
42 |     # plt.figure(1)
43 |     # xbins = fs*np.arange(0,nfft/2+1)/(nfft/2+1)
44 |     # for i in range(nfiltbank):
45 |     #     plt.plot(xbins, fbank[:,i])
46 |     # plt.axis(xmax = 8000)
47 |     # plt.xlabel('Frequency in Hz')
48 |     # plt.ylabel('Amplitude')
49 |     # plt.title('Mel Filterbank')
50 |     # plt.show()
51 |     return fbank
52 | 
53 | def mfcc(s,fs, nfiltbank):
54 | 
55 |     #divide into segments of 25 ms with overlap of 10ms
56 |     nSamples = np.int32(0.025*fs)
57 |     overlap = np.int32(0.01*fs)
58 |     nFrames = np.int32(np.ceil(len(s)/(nSamples-overlap)))
59 |     #zero padding so that the last frame is full length
60 |     padding = ((nSamples-overlap)*(nFrames-1) + nSamples) - len(s)
61 |     if padding > 0:
62 |         signal = np.append(s, np.zeros(padding))
63 |     else:
64 |         signal = s
65 |     segment = np.empty((nSamples, nFrames))
66 |     start = 0
67 |     for i in range(nFrames):
68 |         segment[:,i] = signal[start:start+nSamples]
69 |         start = (nSamples-overlap)*(i+1)    #advance by one hop for the next frame
70 | 
71 |     #compute periodogram
72 |     nfft = 512
73 |     periodogram = np.empty((nFrames, int(nfft/2 + 1)))
74 |     for i in range(nFrames):
75 |         x = segment[:,i] * hamming(nSamples)
76 |         spectrum = fftshift(fft(x,nfft))
77 |         periodogram[i,:] = abs(spectrum[int(nfft/2-1):])/nSamples
78 | 
79 |     #calculating mfccs
80 |     fbank = mel_filterbank(nfft, nfiltbank, fs)
81 |     #nfiltbank MFCCs for each frame
82 |     mel_coeff = np.empty((nfiltbank,nFrames))
83 |     for i in range(nfiltbank):
84 |         for k in range(nFrames):
85 |             mel_coeff[i,k] = np.sum(periodogram[k,:]*fbank[:,i])
86 | 
87 |     mel_coeff = np.log10(mel_coeff)
88 |     mel_coeff = dct(mel_coeff)
89 |     #exclude 0th order coefficient (much larger than others)
90 |     mel_coeff[0,:]= np.zeros(nFrames)
91 |     return mel_coeff
92 | 
93 | 
94 | 
--------------------------------------------------------------------------------
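Note that src/spectrogram.py is not imported by train.py or test.py; it is a standalone utility for inspecting a recording. If you want to look at a spectrogram of one of the .wav files, a small driver along the following lines should work. This is only a sketch under assumptions not fixed by the repository: it must be run from inside src/, and the choice of file, FFT length and 50% overlap are arbitrary.

```python
# Hypothetical driver for src/spectrogram.py (run from inside src/).
from scipy.io.wavfile import read
from spectrogram import stft

fs, s = read('../train/s1.wav')   # any utterance from the train or test folder
nfft = 512                        # FFT length; stft rounds non powers of 2 up
overlap = 0.5                     # fraction of nfft shared between consecutive frames
# stft plots the waveform and the spectrogram, and returns the STFT matrix
# together with its time and frequency axes
STFT, time, freq = stft(s, fs, nfft, overlap)
```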