├── female.wav ├── male.wav ├── male_female_pure_mixture.wav ├── README.md ├── MWF.m ├── RTF_based_LCMV_GSC.m └── segmentation.m /female.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tungluai/RTF-based-LCMV-GSC/HEAD/female.wav -------------------------------------------------------------------------------- /male.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tungluai/RTF-based-LCMV-GSC/HEAD/male.wav -------------------------------------------------------------------------------- /male_female_pure_mixture.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tungluai/RTF-based-LCMV-GSC/HEAD/male_female_pure_mixture.wav -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RTF-based-LCMV-GSC 2 | Relative transfer function (RTF) based multichannel speech enhancement. 3 | The diarization and RTF estimation method (implemented in segmentation.m) follows the reference: "Data-Driven Source Separation Based on Simplex Analysis", Bracha Laufer-Goldshtein, 26 Feb 2018. 4 | The estimated RTFs are then used in an LCMV beamformer, implemented in a GSC structure, to perform speech enhancement or speech separation. 
--------------------------------------------------------------------------------
/MWF.m:
--------------------------------------------------------------------------------
% MWF  Speech-distortion-weighted multichannel Wiener filter (SDW-MWF).
%
%   h(:,bin) = (PhiX(:,:,bin) + mu * PhiN(:,:,bin)) \ PhiSN(:,bin)
%
% input:  PhiX,  (Nch, Nch, Nbin) the speech covariance matrix
%         PhiN,  (Nch, Nch, Nbin) the noise covariance matrix
%         PhiSN, (Nch, Nbin)      the noise & speech cross-covariance matrix
%         mu,    the speech distortion / noise reduction trade-off
%                parameter (optional, default 1)
% output: h,     (Nch, Nbin) the beamformer coefficients
% author : Xu Changlai,6/1,2019

function h = MWF(PhiX,PhiN,PhiSN,mu)
% mu is the FOURTH argument, so the default must apply whenever fewer than
% four arguments are supplied (the former "nargin < 3" test left mu
% undefined for a three-argument call).
if nargin < 4
    mu = 1;   % typical value {0, 1}
end

[Nch, ~, Nbin] = size(PhiX);
h = zeros(Nch, Nbin);

for bin = 1:Nbin
    % Diagonal loading when the noise covariance is numerically singular.
    if rcond(PhiN(:,:,bin)) < eps
        PhiN(:,:,bin) = PhiN(:,:,bin) + 1e-10 * eye(Nch);
    end
    h(:,bin) = (PhiX(:,:,bin) + mu * PhiN(:,:,bin)) \ PhiSN(:,bin);
end


--------------------------------------------------------------------------------
/RTF_based_LCMV_GSC.m:
--------------------------------------------------------------------------------
% LCMV-GSC for speech enhancement
% author : Xu Changlai,6/2,2019
%
% Pipeline: segmentation() estimates one RTF per source, the RTFs form the
% LCMV constraint matrix C, and each STFT frame is processed by a GSC
% (fixed beamformer w0 + blocking matrix B) whose noise canceller q is
% updated by an SDW-MWF.

clear all
close all

[speech , fs ] = audioread('male_female_pure_mixture.wav');
speech = speech';                       % -> Nch x Nz
[Nch,Nz] = size(speech);

t_frm     = 64;                         % frame length used here (ms)
t_frm_seg = 64;                         % frame length used by segmentation (ms)
Nfft = floor( fs*t_frm/1000);
if mod(Nfft,2) ~= 0
    % The half-spectrum mirroring and the 50% overlap-add below both
    % require an even frame length.
    error('RTF_based_LCMV_GSC:oddNfft', ...
          'Frame length Nfft = %d must be even.', Nfft);
end
Nbin = floor(Nfft/2+1);
% One hop size for BOTH analysis and synthesis. The analysis side used to
% step by Nbin (= Nfft/2+1) while the overlap-add stepped by Nfft/2, which
% shifted the output by one sample per frame.
Nhop = Nfft/2;
Nfrm = floor(Nz/Nhop)-1;
win = sqrt(hanning(Nfft))';

lambda = 0.98;                          % recursive-averaging forgetting factor
mu_sdw = 1.11;                          % SDW-MWF trade-off parameter

yout  = zeros(1,Nz);
q     = zeros(Nch+1, Nbin);
PhiN  = zeros(Nch+1, Nch+1, Nbin);
PhiS  = zeros(Nch+1, Nch+1, Nbin);
PhiSN = zeros(Nch+1,Nbin);

% diarization and RTF estimation
[RTF,SPP,Mark] = segmentation (speech,fs,75,t_frm_seg);

% resampling C along frequency (identity while t_frm == t_frm_seg)
C1 = shiftdim(RTF,2);                   % Nbin x Nch x nsrce
for nsr = 1 : size(C1,3)
    C2(:,:,nsr) = resample(C1(:,:,nsr),t_frm,t_frm_seg);
end
C = shiftdim(C2,1);                     % Nch x nsrce x Nbin

nsrce = size(C,2);
g = [1;zeros(nsrce-1,1)];
g = flipud(g); % change the enhanced person in the case of 2 speakers

% The fixed beamformer and the blocking matrix depend only on C and g,
% so they are computed once instead of once per frame.
w0     = zeros(Nch, Nbin);
w0norm = zeros(1, Nbin);
B      = zeros(Nch, Nch, Nbin);
for bin = 1:Nbin
    Cb    = C(:,:,bin);
    Cpinv = Cb / (Cb' * Cb);
    w0(:,bin)  = Cpinv * g;
    w0norm(bin) = norm(w0(:,bin));
    B(:,:,bin) = eye(Nch,Nch) - Cpinv * Cb';
end

Yout = zeros(1,Nbin);
for frm = 1 : Nfrm
    idx = (frm-1)*Nhop + (1:Nfft);

    % STFT of all channels for this frame
    Y = fft(repmat(win,Nch,1) .* speech(:,idx), Nfft, 2);

    for bin = 1:Nbin
        % FBF filtering
        Yfbf = w0(:,bin)' * Y(:,bin) / w0norm(bin);
        % BM filtering
        u = B(:,:,bin) * Y(:,bin);

        Yout(bin) = Yfbf - q(:,bin)' * [Yfbf; u];

        % SDW-MWF update of the noise canceller
        S   = [Yout(bin); 1e-10 * ones(Nch,1)];
        res = Yfbf - Yout(bin);
        N   = [res; u];
        PhiS(:,:,bin) = lambda * PhiS(:,:,bin) + (1-lambda) * S * S';
        PhiN(:,:,bin) = lambda * PhiN(:,:,bin) + (1-lambda) * N * N';
        PhiSN(:,bin)  = lambda * PhiSN(:,bin)  + (1-lambda) * N * res';
        q(:,bin) = MWF(PhiS(:,:,bin), PhiN(:,:,bin), PhiSN(:,bin), mu_sdw);
    end

    % overlap-add synthesis (mirror the half spectrum to a full one)
    yout(idx) = yout(idx) + win .* real(ifft([Yout,conj(Yout(end-1:-1:2))]));
end
audiowrite('RTF.wav', yout,fs);

% plot
figure(2);
subplot(3,1,1);
plot(audioread('male.wav'));
hold on
plot(Mark(1,:));
subplot(3,1,2);
plot(audioread('female.wav'));
if size(Mark,1) >= 2                    % only one source may have been found
    hold on
    plot(Mark(2,:));
end
subplot(3,1,3);
plot(yout);

% [ scoresbefore ] = pesq( 'male.wav', 'male_female_pure_mixture.wav' );
% [ scoresafter ] = pesq( 'male.wav', 'RTF.wav' );
% [ scoresideal ] = pesq( 'male.wav', 'male.wav' );
% fprintf('scorebefore: %f\n',scoresbefore);
% fprintf('scoreafter: %f\n',scoresafter);
% fprintf('scoreideal: %f\n',scoresideal);
% fprintf(['improved PESQ score : %f\n'],scoresafter-scoresbefore);
--------------------------------------------------------------------------------
/segmentation.m:
--------------------------------------------------------------------------------
function [RTF,SPP,Mark] = segmentation (speech,fs,ov,t_p_frm)

% reference : Data-Driven Source Separation Based on Simplex Analysis,2018,Bracha
%******input
% speech  : multichannel signal in time domain, Nch x Nz (Nch >= 2)
% fs      : sample rate (Hz)
% ov      : frame overlap in percent (e.g. 75)
% t_p_frm : frame length (ms)

%******output
% RTF  : Nch x nsrce x Nbin relative transfer function (w.r.t. channel 1)
% SPP  : Nfrm x nsrce per-frame source presence probability
% Mark : nsrce x Nz, non-zero on the samples of the frames where the
%        source is considered dominant
% (nsrce, the number of sources, is estimated from the data)

%author : Xu Changlai,6/2,2019

[Nch,Nz] = size(speech);
if Nch < 2
    error('segmentation:needMultichannel', ...
          'At least two channels are required to estimate an RTF.');
end
Nfft = floor( fs*t_p_frm/1000);
Nbin = floor(Nfft/2+1);
Nov  = 100 / (100-ov) ;
Lbin = floor( Nfft/Nov );                   % hop size in samples
% Number of frames that fit entirely inside the signal. Equal to the
% former floor(Nz/Lbin)-(Nov-1) when Nfft is a multiple of Nov, but never
% reads past the end of the signal otherwise.
Nfrm = floor((Nz-Nfft)/Lbin) + 1;
if Nfrm < 1
    error('segmentation:signalTooShort', ...
          'Signal (%d samples) is shorter than one frame (%d samples).', Nz, Nfft);
end
win = sqrt(hanning(Nfft))';

% STFT : Y is Nch x Nfrm x Nfft
Y = zeros(Nch,Nfrm,Nfft);
for frm = 1 : Nfrm
    idx = (frm-1)*Lbin + (1:Nfft);
    Y(:,frm,:) = reshape(fft(repmat(win,Nch,1) .* speech(:,idx), Nfft, 2), Nch,1,Nfft);
end

% Instantaneous RTF estimates on the 0.3 ~ 4.8 kHz band, each averaged
% over the frame and its immediate neighbours (fewer at the two edges).
upbin  = min(floor(4.8*1000/fs*Nfft), Nbin);
lowbin = floor(0.3*1000/fs*Nfft+1);
nband  = upbin - lowbin + 1;
Am = zeros(Nch-1,Nfrm,nband);
for frm = 1: Nfrm
    nb = max(frm-1,1) : min(frm+1,Nfrm);    % neighbouring frames
    for k = 1 : nband
        bin  = lowbin + k - 1;
        yref = Y(1,nb,bin);
        num  = sum(Y(2:end,nb,bin) .* repmat(conj(yref),Nch-1,1), 2);
        den  = sum(abs(yref).^2);
        Am(:,frm,k) = num ./ den;
    end
end

% One real feature vector per frame: band-major stacking of the
% non-reference channels, real parts on top of imaginary parts.
ac = reshape(permute(Am,[3 1 2]), (Nch-1)*nband, Nfrm);
a  = [real(ac);imag(ac)];

% Frame-to-frame correlation matrix (one product instead of a double loop)
W = (a.' * a) / (2*(Nch-1)*nband);
W = (W + W.') / 2;                          % enforce exact symmetry for eig

% EVD on W, eigenvalues sorted ascending explicitly
[U,D] = eig(W);
[eigv,order] = sort(real(diag(D)),'ascend');
U = U(:,order);
norm_eigv = eigv / eigv(end);

% The number of sources is the number of leading eigenvalues above the
% threshold. If none falls below it, every eigenvalue is kept (nsrce used
% to be left undefined in that case).
nsrce = find(norm_eigv(end:-1:1) < .119, 1) - 1;    % 0.11 ~ 0.128
if isempty(nsrce)
    nsrce = numel(norm_eigv);
end
V = U(:, end:-1:end-nsrce+1);               % Nfrm x nsrce, dominant first

% find probability vector : successive simplex vertices among the rows of V
e = zeros(max(nsrce,2), nsrce);
[~,I1] = max(sum(V.^2,2));
e(1,:) = V(I1,:);
V1 = V - repmat(e(1,:),size(V,1),1);
[~,I2] = max(sum(V1.^2,2));
e(2,:) = V(I2,:);
if nsrce > 2
    Er = [];
    for r = 3:nsrce
        er = e(r-1,:)-e(1,:);
        Er = [Er er'];
        temp = pinv(Er' * Er);              % pseudo-inverse
        P = eye(nsrce)- Er * temp * Er';    % projector onto the complement of the known edges
        [~,I] = max(sum((P * V1').^2,1));
        e(r,:) = V(I,:);
    end
end
Q = e';
SPP = (Q \ V')';  % source presence probability

% clustering and estimate the RTF
RTF  = zeros(Nch,nsrce,Nbin);
Mark = zeros(nsrce,Nz);
for n = 1 : nsrce
    Ydom = zeros(Nch,1,Nbin);
    Yref = zeros(1,1,Nbin);
    mark = zeros(1,Nz);
    L = find (SPP(:,n) > .96);%classic .95
    disp(L);
    if isempty(L)
        warning('segmentation:noDominantFrames', ...
                'No frame is dominated by source %d; its RTF is undefined (NaN).', n);
    end
    for i = 1 : length(L)
        yref = Y(1 ,L(i),1:Nbin);
        Ydom = Ydom + Y(: ,L(i),1:Nbin) .* repmat(conj(yref),Nch,1);
        Yref = Yref + yref .* conj(yref);
        mark((L(i)-1)*Lbin+1:(L(i)-1)*Lbin+Nfft) = 1;
    end
    RTF(:,n,:) = Ydom ./ repmat(Yref,Nch,1,1);
    % making marks
    Mark(n,:) = mark/10;
end

figure(1);
plot(speech(1,:));
hold on
for n = 1 : nsrce                           % one trace per detected source
    plot(Mark(n,:));
end
end
--------------------------------------------------------------------------------