├── .gitignore
├── addnoise_asl.m
├── comp_cep.m
├── comp_fwseg.m
├── comp_fwseg_mars.m
├── comp_fwseg_variant.m
├── comp_is.m
├── comp_llr.m
├── comp_pesq.m
├── comp_snr.m
├── comp_wss.m
├── composite.m
├── enhanced.wav
├── estoi.m
├── evaluate_all.m
├── pesq.ubuntu16.bin
├── readme.md
├── readme.pdf
├── sp04.wav
├── sp04_babble_sn10.wav
├── stoi.m
├── toserver.sh
├── wavread.m
└── white_noise.wav

/.gitignore:
--------------------------------------------------------------------------------
1 | .git
--------------------------------------------------------------------------------
/addnoise_asl.m:
--------------------------------------------------------------------------------
1 | function addnoise_asl(cleanfile, noisefile, outfile, snr)
2 | % ----------------------------------------------------------------------
3 | % This function adds noise to a file at a specified SNR level. It uses
4 | % the active speech level to compute the speech energy. The
5 | % active speech level is computed as per ITU-T P.56 standard [1].
6 | %
7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR)
8 | %
9 | % cleanFile.wav - clean input file in .wav format
10 | % noiseFile.wav - file containing the noise signal in .wav format
11 | % noisyFile.wav - resulting noisy file
12 | % SNR - desired SNR in dB
13 | %
14 | % Note that if the variable IRS below (line 38) is set to 1, then it applies the IRS
15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default IRS
16 | % value is 0, i.e., no IRS filtering is applied.
17 | %
18 | % Example call:
19 | % addnoise_asl('sp04.wav','white_noise.wav','sp04_white_5db.wav',5);
20 | %
21 | %
22 | % References:
23 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T
24 | % Recommendation P.56
25 | %
26 | % Author: Yi Hu and Philipos C. Loizou
27 | %
28 | % Copyright (c) 2006 by Philipos C. Loizou
29 | % $Revision: 0.0 $ $Date: 10/09/2006 $
30 | % ----------------------------------------------------------------------
31 |
32 | if nargin ~=4
33 | fprintf('USAGE: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) \n');
34 | fprintf('For more help, type: help addnoise_asl\n\n');
35 | return;
36 | end
37 |
38 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz - 3.2 kHz)
39 |
40 | % wavread gives floating point column data
41 | [clean, srate, nbits]= wavread(cleanfile);
42 | % filter clean speech with irs filter
43 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end;
44 |
45 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits);
46 | % Px is the active speech level ms energy, asl is the active factor, and c0
47 | % is the active speech level threshold.
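% (Added note, not part of the original file:) asl_P56 further down is a
% subfunction, so it is only visible inside addnoise_asl.m. If it were copied
% into its own asl_P56.m, the P.56 active speech level of a recording could be
% inspected directly; a minimal sketch, assuming the repo's wavread.m shim:
%   [s, fs, nb] = wavread('sp04.wav');
%   [Px, act, c0] = asl_P56(s, fs, nb);
%   fprintf('active-level energy %g, activity factor %4.2f\n', Px, act);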
48 | 49 | 50 | x=clean; 51 | x_len= length( x); % length of speech signal 52 | 53 | [noise, srate1, nbits1]= wavread( noisefile); 54 | if (srate1~= srate)| (nbits1~= nbits) 55 | error( 'the formats of the two files dont match!'); 56 | end 57 | noise_len= length( noise); 58 | if (noise_len<= x_len) 59 | error( 'the noise length has to be greater than speech length!'); 60 | end 61 | 62 | rand_start_limit= noise_len- x_len+ 1; 63 | % the start of the noise segment can vary between [1 rand_start_limit] 64 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 65 | % random start of the noise segment 66 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 67 | 68 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 69 | 70 | % this is the randomly selected noise segment that will be added to the 71 | % clean speech x 72 | Pn= noise_segment'* noise_segment/ x_len; 73 | % we need to scale the noise segment samples to obtain the desired snr= 10* 74 | % log10( Px/ (sf^2 * Pn)) 75 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 76 | noise_segment= noise_segment * sf; 77 | 78 | noisy = x+ noise_segment; 79 | 80 | if ( (max( noisy)>= 1) | (min( noisy)< -1)) 81 | error( 'Overflow occurred!\n'); 82 | end; 83 | 84 | 85 | wavwrite( noisy, srate, nbits, outfile); 86 | 87 | fprintf( 1, '\n NOTE: For comparison, the SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 88 | (noise_segment'*noise_segment))); 89 | 90 | 91 | %------------------------------------------------------------------------ 92 | function data_filtered= apply_IRS( data, Fs, nbits); 93 | 94 | n= length( data); 95 | 96 | % now find the next power of 2 which is greater or equal to n 97 | pow_of_2= 2^ (ceil( log2( n))); 98 | 99 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 100 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 101 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 102 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 103 | 104 | [number_of_points, trivial]= size( align_filter_dB); 105 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 106 | 1000); 107 | 108 | x= zeros( 1, pow_of_2); 109 | x( 1: n)= data; 110 | 111 | x_fft= fft( x, pow_of_2); 112 | 113 | freq_resolution= Fs/ pow_of_2; 114 | 115 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 116 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 117 | overallGainFilter; 118 | factor= 10.^ (factorDb/ 20); 119 | 120 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 121 | x_fft= x_fft.* factor; 122 | 123 | y= ifft( x_fft, pow_of_2); 124 | 125 | data_filtered= y( 1: n)'; 126 | 127 | 128 | 129 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 130 | % this implements ITU P.56 method B. 131 | % 'speechfile' is the speech file to calculate active speech level for, 132 | % 'asl' is the active speech level (between 0 and 1), 133 | % 'asl_rms' is the active speech level mean square energy. 
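% (Added clarification:) the outputs actually returned below are
% [asl_ms, asl, c0]: asl_ms is the mean-square energy of the active speech
% (the quantity called 'asl_rms' above), asl is the activity factor in [0, 1]
% (long-term energy divided by active-level energy), and c0 is the active-level
% threshold found by the bisection in bin_interp.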
134 | 135 | % x is the column vector of floating point speech data 136 | 137 | x= x(:); % make sure x is column vector 138 | T= 0.03; % time constant of smoothing, in seconds 139 | H= 0.2; % hangover time in seconds 140 | M= 15.9; 141 | % margin in dB of the difference between threshold and active speech level 142 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 143 | 144 | I= ceil( fs* H); % hangover in samples 145 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 146 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 147 | % vector with thresholds from one quantizing level up to half the maximum 148 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 149 | a( 1: thres_no)= 0; % activity counter for each level threshold 150 | hang( 1: thres_no)= I; % hangover counter for each level threshold 151 | 152 | sq= x'* x; % long-term level square energy of x 153 | x_len= length( x); % length of x 154 | 155 | % use a 2nd order IIR filter to detect the envelope q 156 | x_abs= abs( x); 157 | p= filter( 1-g, [1 -g], x_abs); 158 | q= filter( 1-g, [1 -g], p); 159 | 160 | for k= 1: x_len 161 | for j= 1: thres_no 162 | if (q(k)>= c(j)) 163 | a(j)= a(j)+ 1; 164 | hang(j)= 0; 165 | elseif (hang(j)< I) 166 | a(j)= a(j)+ 1; 167 | hang(j)= hang(j)+ 1; 168 | else 169 | break; 170 | end 171 | end 172 | end 173 | 174 | asl= 0; 175 | asl_rms= 0; 176 | if (a(1)== 0) 177 | return; 178 | else 179 | AdB1= 10* log10( sq/ a(1)+ eps); 180 | end 181 | 182 | CdB1= 20* log10( c(1)+ eps); 183 | if (AdB1- CdB1< M) 184 | return; 185 | end 186 | 187 | AdB(1)= AdB1; 188 | CdB(1)= CdB1; 189 | Delta(1)= AdB1- CdB1; 190 | 191 | for j= 2: thres_no 192 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 193 | CdB(j)= 20* log10( c(j)+ eps); 194 | end 195 | 196 | for j= 2: thres_no 197 | if (a(j) ~= 0) 198 | Delta(j)= AdB(j)- CdB(j); 199 | if (Delta(j)<= M) 200 | % interpolate to find the asl 201 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 202 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 203 | asl_ms= 10^ (asl_ms_log/ 10); 204 | asl= (sq/ x_len)/ asl_ms; 205 | c0= 10^( cl0/ 20); 206 | break; 207 | end 208 | end 209 | end 210 | 211 | 212 | 213 | 214 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 215 | upthr, lwthr, Margin, tol) 216 | 217 | if (tol < 0) 218 | tol = -tol; 219 | end 220 | 221 | % Check if extreme counts are not already the true active value 222 | iterno = 1; 223 | if (abs(upcount - upthr - Margin) < tol) 224 | asl_ms_log= upcount; 225 | cc= upthr; 226 | return; 227 | end 228 | if (abs(lwcount - lwthr - Margin) < tol) 229 | asl_ms_log= lwcount; 230 | cc= lwthr; 231 | return; 232 | end 233 | 234 | % Initialize first middle for given (initial) bounds 235 | midcount = (upcount + lwcount) / 2.0; 236 | midthr = (upthr + lwthr) / 2.0; 237 | 238 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 239 | while ( 1) 240 | 241 | diff= midcount- midthr- Margin; 242 | if (abs(diff)<= tol) 243 | break; 244 | end 245 | 246 | % if tolerance is not met up to 20 iteractions, then relax the 247 | % tolerance by 10% 248 | 249 | iterno= iterno+ 1; 250 | 251 | if (iterno>20) 252 | tol = tol* 1.1; 253 | end 254 | 255 | if (diff> tol) % then new bounds are ... 256 | midcount = (upcount + midcount) / 2.0; 257 | % upper and middle activities 258 | midthr = (upthr + midthr) / 2.0; 259 | % ... and thresholds 260 | elseif (diff< -tol) % then new bounds are ... 
261 | midcount = (midcount + lwcount) / 2.0; 262 | % middle and lower activities 263 | midthr = (midthr + lwthr) / 2.0; 264 | % ... and thresholds 265 | end 266 | 267 | end 268 | % Since the tolerance has been satisfied, midcount is selected 269 | % as the interpolated value with a tol [dB] tolerance. 270 | 271 | asl_ms_log= midcount; 272 | cc= midthr; 273 | 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /comp_cep.m: -------------------------------------------------------------------------------- 1 | function cep_mean= comp_cep(cleanFile, enhdFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Cepstrum Distance Objective Speech Quality Measure 5 | % 6 | % This function implements the cepstrum distance measure used 7 | % in [1] 8 | % 9 | % Usage: CEP=comp_cep(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % CEP - computed cepstrum distance measure 14 | % 15 | % Note that the cepstrum measure is limited in the range [0, 10]. 16 | % 17 | % Example call: CEP =comp_cep('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality 23 | % evaluation for low bit-rate speech coding systems. IEEE J. Select. 24 | % Areas in Comm., 6(2), 262-273. 25 | % 26 | % Author: Philipos C. Loizou 27 | % (LPC routines were written by Bryan Pellom & John Hansen) 28 | % 29 | % Copyright (c) 2006 by Philipos C. Loizou 30 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 31 | 32 | % ---------------------------------------------------------------------- 33 | if nargin~=2 34 | fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n'); 35 | fprintf('For more help, type: help comp_cep\n\n'); 36 | return; 37 | end 38 | 39 | alpha=0.95; 40 | 41 | [data1, Srate1, Nbits1]= wavread(cleanFile); 42 | [data2, Srate2, Nbits2]= wavread(enhdFile); 43 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 44 | error( 'The two files do not match!\n'); 45 | end 46 | 47 | len= min( length( data1), length( data2)); 48 | data1= data1( 1: len)+eps; 49 | data2= data2( 1: len)+eps; 50 | 51 | IS_dist= cepstrum( data1, data2,Srate1); 52 | 53 | IS_len= round( length( IS_dist)* alpha); 54 | IS= sort( IS_dist); 55 | 56 | cep_mean= mean( IS( 1: IS_len)); 57 | 58 | 59 | 60 | 61 | function distortion = cepstrum(clean_speech, processed_speech,sample_rate) 62 | 63 | 64 | % ---------------------------------------------------------------------- 65 | % Check the length of the clean and processed speech. Must be the same. 66 | % ---------------------------------------------------------------------- 67 | 68 | clean_length = length(clean_speech); 69 | processed_length = length(processed_speech); 70 | 71 | if (clean_length ~= processed_length) 72 | disp('Error: Both Speech Files must be same length.'); 73 | return 74 | end 75 | 76 | % ---------------------------------------------------------------------- 77 | % Scale both clean speech and processed speech to have same dynamic 78 | % range. 
Also remove DC component from each signal 79 | % ---------------------------------------------------------------------- 80 | 81 | %clean_speech = clean_speech - mean(clean_speech); 82 | %processed_speech = processed_speech - mean(processed_speech); 83 | 84 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 85 | 86 | % ---------------------------------------------------------------------- 87 | % Global Variables 88 | % ---------------------------------------------------------------------- 89 | 90 | winlength = round(30*sample_rate/1000); %240; % window length in samples 91 | skiprate = floor(winlength/4); % window skip in samples 92 | if sample_rate<10000 93 | P = 10; % LPC Analysis Order 94 | else 95 | P=16; % this could vary depending on sampling frequency. 96 | end 97 | C=10*sqrt(2)/log(10); 98 | % ---------------------------------------------------------------------- 99 | % For each frame of input speech, calculate the Itakura-Saito Measure 100 | % ---------------------------------------------------------------------- 101 | 102 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 103 | start = 1; % starting sample 104 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 105 | 106 | for frame_count = 1:num_frames 107 | 108 | % ---------------------------------------------------------- 109 | % (1) Get the Frames for the test and reference speech. 110 | % Multiply by Hanning Window. 111 | % ---------------------------------------------------------- 112 | 113 | clean_frame = clean_speech(start:start+winlength-1); 114 | processed_frame = processed_speech(start:start+winlength-1); 115 | clean_frame = clean_frame.*window; 116 | processed_frame = processed_frame.*window; 117 | 118 | % ---------------------------------------------------------- 119 | % (2) Get the autocorrelation lags and LPC parameters used 120 | % to compute the IS measure. 121 | % ---------------------------------------------------------- 122 | 123 | [R_clean, Ref_clean, A_clean] = ... 124 | lpcoeff(clean_frame, P); 125 | [R_processed, Ref_processed, A_processed] = ... 126 | lpcoeff(processed_frame, P); 127 | 128 | C_clean=lpc2cep(A_clean); 129 | C_processed=lpc2cep(A_processed); 130 | 131 | % ---------------------------------------------------------- 132 | % (3) Compute the cepstrum-distance measure 133 | % ---------------------------------------------------------- 134 | 135 | 136 | distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2)); 137 | 138 | 139 | start = start + skiprate; 140 | 141 | end 142 | 143 | 144 | 145 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 146 | 147 | % ---------------------------------------------------------- 148 | % (1) Compute Autocorrelation Lags 149 | % ---------------------------------------------------------- 150 | 151 | winlength = max(size(speech_frame)); 152 | for k=1:model_order+1 153 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
154 | .*speech_frame(k:winlength)); 155 | end 156 | 157 | % ---------------------------------------------------------- 158 | % (2) Levinson-Durbin 159 | % ---------------------------------------------------------- 160 | 161 | a = ones(1,model_order); 162 | E(1)=R(1); 163 | for i=1:model_order 164 | a_past(1:i-1) = a(1:i-1); 165 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 166 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 167 | a(i)=rcoeff(i); 168 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 169 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 170 | end 171 | 172 | acorr = R; 173 | refcoeff = rcoeff; 174 | lpparams = [1 -a]; 175 | 176 | %---------------------------------------------- 177 | function [cep]=lpc2cep(a) 178 | % 179 | % converts prediction to cepstrum coefficients 180 | % 181 | % Author: Philipos C. Loizou 182 | 183 | M=length(a); 184 | cep=zeros(1,M-1); 185 | 186 | cep(1)=-a(2); 187 | 188 | for k=2:M-1 189 | ix=1:k-1; 190 | vec1=cep(ix).*a(k-1+1:-1:2).*ix; 191 | cep(k)=-(a(k+1)+sum(vec1)/k); 192 | 193 | end 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /comp_fwseg.m: -------------------------------------------------------------------------------- 1 | function fwseg_dist= comp_fwseg(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency weighted SNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-weighted SNRseg measure [1] 7 | % using a different weighting function, the clean spectrum. 8 | % 9 | % Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % fwSNRseg - computed frequency weighted SNRseg in dB 14 | % 15 | % Note that large numbers of fwSNRseg are better. 16 | % 17 | % Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978). 22 | % A study of complexity and quality of speech waveform coders. Proc. 23 | % IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590. 24 | % 25 | % Author: Philipos C. Loizou 26 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 27 | % 28 | % Copyright (c) 2006 by Philipos C. Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_vec= fwseg( data1, data2,Srate1); 50 | 51 | fwseg_dist=mean(wss_dist_vec); 52 | 53 | 54 | % ---------------------------------------------------------------------- 55 | 56 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 57 | 58 | 59 | % ---------------------------------------------------------------------- 60 | % Check the length of the clean and processed speech. Must be the same. 
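% (Added note:) each frame below is scored with the frequency-weighted
% segmental SNR
%   fwSNRseg(m) = sum_j W_j(m) * 10*log10( E_c(j,m)^2 / (E_c(j,m)-E_p(j,m))^2 ) / sum_j W_j(m),
% where E_c and E_p are the critical-band energies of the clean and processed
% spectra and, in this variant, the weights are W_j(m) = E_c(j,m)^gamma with
% gamma = 0.2; the per-frame result is then clipped to [-10, 35] dB before the
% frames are averaged.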
61 | % ---------------------------------------------------------------------- 62 | 63 | clean_length = length(clean_speech); 64 | processed_length = length(processed_speech); 65 | 66 | if (clean_length ~= processed_length) 67 | disp('Error: Files must have same length.'); 68 | return 69 | end 70 | 71 | 72 | 73 | % ---------------------------------------------------------------------- 74 | % Global Variables 75 | % ---------------------------------------------------------------------- 76 | 77 | 78 | winlength = round(30*sample_rate/1000); % window length in samples 79 | skiprate = floor(winlength/4); % window skip in samples 80 | max_freq = sample_rate/2; % maximum bandwidth 81 | num_crit = 25; % number of critical bands 82 | USE_25=1; 83 | n_fft = 2^nextpow2(2*winlength); 84 | n_fftby2 = n_fft/2; % FFT size/2 85 | gamma=0.2; % power exponent 86 | 87 | % ---------------------------------------------------------------------- 88 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 89 | % ---------------------------------------------------------------------- 90 | 91 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 92 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 93 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 94 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 95 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 96 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 97 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 98 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 99 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 100 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 101 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 102 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 103 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 104 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 105 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 106 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 107 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 108 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 109 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 110 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 111 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 112 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 113 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 114 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 115 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 116 | 117 | W=[ % articulation index weights 118 | 0.003 119 | 0.003 120 | 0.003 121 | 0.007 122 | 0.010 123 | 0.016 124 | 0.016 125 | 0.017 126 | 0.017 127 | 0.022 128 | 0.027 129 | 0.028 130 | 0.030 131 | 0.032 132 | 0.034 133 | 0.035 134 | 0.037 135 | 0.036 136 | 0.036 137 | 0.033 138 | 0.030 139 | 0.029 140 | 0.027 141 | 0.026 142 | 0.026]; 143 | 144 | W=W'; 145 | 146 | if USE_25==0 % use 13 bands 147 | % ----- lump adjacent filters together ---------------- 148 | k=2; 149 | cent_freq2(1)=cent_freq(1); 150 | bandwidth2(1)=bandwidth(1)+bandwidth(2); 151 | W2(1)=W(1); 152 | for i=2:13 153 | cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1); 154 | bandwidth2(i)=bandwidth(k)+bandwidth(k+1); 155 | W2(i)=0.5*(W(k)+W(k+1)); 156 | k=k+2; 157 | end 158 | 159 | sumW=sum(W2); 160 | bw_min = bandwidth2 (1); % minimum critical bandwidth 161 | else 162 | sumW=sum(W); 163 | bw_min=bandwidth(1); 164 | end 165 | 166 | 167 | % ---------------------------------------------------------------------- 168 | % Set up the critical band filters. 
Note here that Gaussianly shaped 169 | % filters are used. Also, the sum of the filter weights are equivalent 170 | % for each critical band filter. Filter less than -30 dB and set to 171 | % zero. 172 | % ---------------------------------------------------------------------- 173 | 174 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 175 | if USE_25==0 176 | 177 | num_crit=length(cent_freq2); 178 | 179 | for i = 1:num_crit 180 | f0 = (cent_freq2 (i) / max_freq) * (n_fftby2); 181 | all_f0(i) = floor(f0); 182 | bw = (bandwidth2 (i) / max_freq) * (n_fftby2); 183 | norm_factor = log(bw_min) - log(bandwidth2(i)); 184 | j = 0:1:n_fftby2-1; 185 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 186 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 187 | end 188 | 189 | else 190 | for i = 1:num_crit 191 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 192 | all_f0(i) = floor(f0); 193 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 194 | norm_factor = log(bw_min) - log(bandwidth(i)); 195 | j = 0:1:n_fftby2-1; 196 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 197 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 198 | end 199 | end 200 | 201 | 202 | 203 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 204 | start = 1; % starting sample 205 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 206 | 207 | for frame_count = 1:num_frames 208 | 209 | % ---------------------------------------------------------- 210 | % (1) Get the Frames for the test and reference speech. 211 | % Multiply by Hanning Window. 212 | % ---------------------------------------------------------- 213 | 214 | clean_frame = clean_speech(start:start+winlength-1); 215 | processed_frame = processed_speech(start:start+winlength-1); 216 | clean_frame = clean_frame.*window; 217 | processed_frame = processed_frame.*window; 218 | 219 | % ---------------------------------------------------------- 220 | % (2) Compute the magnitude Spectrum of Clean and Processed 221 | % ---------------------------------------------------------- 222 | 223 | 224 | clean_spec = abs(fft(clean_frame,n_fft)); 225 | processed_spec = abs(fft(processed_frame,n_fft)); 226 | 227 | % normalize spectra to have area of one 228 | % 229 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 230 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 231 | 232 | % ---------------------------------------------------------- 233 | % (3) Compute Filterbank Output Energies 234 | % ---------------------------------------------------------- 235 | 236 | clean_energy=zeros(1,num_crit); 237 | processed_energy=zeros(1,num_crit); 238 | error_energy=zeros(1,num_crit); 239 | W_freq=zeros(1,num_crit); 240 | 241 | for i = 1:num_crit 242 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 243 | .*crit_filter(i,:)'); 244 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
245 | .*crit_filter(i,:)'); 246 | 247 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 248 | W_freq(i)=(clean_energy(i))^gamma; 249 | 250 | end 251 | SNRlog=10*log10((clean_energy.^2)./error_energy); 252 | 253 | 254 | 255 | fwSNR=sum(W_freq.*SNRlog)/sum(W_freq); 256 | 257 | distortion(frame_count)=min(max(fwSNR,-10),35); 258 | 259 | start = start + skiprate; 260 | 261 | end 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /comp_fwseg_mars.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % MARS Frequency-variant fwSNRseg objective speech quality measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure based 7 | % on MARS analysis (see Chap. 10, Sec. 10.5.4) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] Chapter 10, Sec 10.5.4, 24 | % [2] Chapter 11 25 | % 26 | % Authors: Yi Hu and Philipos C. Loizou 27 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 28 | % 29 | % Copyright (c) 2006 by Philipos C. Loizou 30 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 31 | % ---------------------------------------------------------------------- 32 | 33 | if nargin~=2 34 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n'); 35 | fprintf('For more help, type: help comp_fwseg_mars\n\n'); 36 | return; 37 | end 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_matrix= fwseg( data1, data2,Srate1); 50 | wss_dist=mean(wss_dist_matrix); 51 | 52 | 53 | SIG= sig_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 54 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 55 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 56 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 57 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 58 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 59 | wss_dist( 25)); 60 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 61 | 62 | BAK= bak_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 63 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 64 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 65 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 66 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 67 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 68 | wss_dist( 25)); 69 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 70 | 71 | OVL= ovl_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 
72 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 73 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 74 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 75 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 76 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 77 | wss_dist( 25)); 78 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 79 | 80 | 81 | %------------------------------------------------- 82 | function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 83 | V13, V14, V15, V16, V17, V18, V19, V20, ... 84 | V21, V22, V23, V24, V25, V26, V27, V28) 85 | 86 | BF1 = max(0, V21 - 0.282); 87 | BF2 = max(0, FWSEG_VA + 9.094); 88 | BF3 = max(0, - 9.094 - FWSEG_VA ); 89 | BF5 = max(0, 10.089 - V11 ); 90 | BF7 = max(0, 3.624 - V26 ) * BF3; 91 | BF8 = max(0, V24 - 5.584) * BF5; 92 | BF9 = max(0, 5.584 - V24 ) * BF5; 93 | BF10 = max(0, V19 - 8.030) * BF1; 94 | BF11 = max(0, 8.030 - V19 ) * BF1; 95 | BF12 = max(0, V27 - 4.858) * BF1; 96 | BF13 = max(0, 4.858 - V27 ) * BF1; 97 | BF14 = max(0, FWSEG_VA + 7.282) * BF1; 98 | BF15 = max(0, - 7.282 - FWSEG_VA ) * BF1; 99 | BF17 = max(0, 9.458 - V16 ) * BF10; 100 | BF18 = max(0, V27 - 10.431) * BF11; 101 | BF19 = max(0, 10.431 - V27 ) * BF11; 102 | BF21 = max(0, 11.059 - V22 ) * BF1; 103 | BF22 = max(0, V26 - 8.675) * BF1; 104 | BF23 = max(0, 8.675 - V26 ) * BF1; 105 | BF25 = max(0, 11.195 - V6 ) * BF10; 106 | BF26 = max(0, V8 - 7.138) * BF1; 107 | BF27 = max(0, 7.138 - V8 ) * BF1; 108 | BF29 = max(0, 9.006 - V10 ) * BF26; 109 | BF30 = max(0, V14 - 8.210) * BF15; 110 | BF35 = max(0, 7.026 - V19 ) * BF15; 111 | BF36 = max(0, V11 - 3.424) * BF27; 112 | BF39 = max(0, 5.418 - V17 ) * BF23; 113 | BF40 = max(0, V28 - 6.813); 114 | BF41 = max(0, 6.813 - V28 ); 115 | BF42 = max(0, V26 - 5.998) * BF14; 116 | BF43 = max(0, 5.998 - V26 ) * BF14; 117 | BF44 = max(0, V5 + 0.206) * BF41; 118 | BF45 = max(0, - 0.206 - V5 ) * BF41; 119 | BF46 = max(0, V22 - 7.901) * BF45; 120 | BF49 = max(0, 7.496 - V8 ) * BF44; 121 | BF51 = max(0, 7.904 - V11 ) * BF45; 122 | BF52 = max(0, V26 - 10.938) * BF27; 123 | BF54 = max(0, V9 - 4.507) * BF26; 124 | BF56 = max(0, V28 - 0.549) * BF15; 125 | BF57 = max(0, 0.549 - V28 ) * BF15; 126 | BF58 = max(0, V25 - 3.252) * BF41; 127 | BF59 = max(0, 3.252 - V25 ) * BF41; 128 | BF60 = max(0, V23 - 7.650) * BF58; 129 | BF61 = max(0, 7.650 - V23 ) * BF58; 130 | BF62 = max(0, V25 - 9.931) * BF44; 131 | BF63 = max(0, 9.931 - V25 ) * BF44; 132 | BF64 = max(0, V25 - 4.923) * BF21; 133 | BF65 = max(0, 4.923 - V25 ) * BF21; 134 | BF67 = max(0, 3.746 - V28 ) * BF10; 135 | BF68 = max(0, V11 - 5.346) * BF41; 136 | BF69 = max(0, 5.346 - V11 ) * BF41; 137 | BF70 = max(0, V12 - 9.026) * BF68; 138 | BF71 = max(0, 9.026 - V12 ) * BF68; 139 | BF73 = max(0, - 2.668 - V28 ) * BF21; 140 | BF74 = max(0, V24 - 7.028) * BF41; 141 | BF75 = max(0, 7.028 - V24 ) * BF41; 142 | BF77 = max(0, - 0.224 - V6 ) * BF74; 143 | BF78 = max(0, V5 - 3.884); 144 | BF79 = max(0, 3.884 - V5 ); 145 | BF80 = max(0, V15 - 5.019) * BF78; 146 | BF83 = max(0, - 1.880 - V28 ) * BF13; 147 | BF84 = max(0, V7 - 3.067) * BF12; 148 | BF85 = max(0, 3.067 - V7 ) * BF12; 149 | BF87 = max(0, 5.353 - V6 ); 150 | BF88 = max(0, V13 - 3.405) * BF9; 151 | BF89 = max(0, 3.405 - V13 ) * BF9; 152 | BF91 = max(0, 5.599 - V13 ) * BF45; 153 | BF92 = max(0, V15 - 9.821) * BF8; 154 | BF94 = max(0, V14 + 2.594) * BF79; 155 | BF97 = max(0, 8.635 - V23 ) * BF94; 156 | BF99 = max(0, 1.332 - V24 ) * BF45; 157 | BF100 = 
max(0, V7 - 0.209) * BF1; 158 | 159 | Y = 2.751 + 0.135 * BF1 - 0.037 * BF2 + 0.328 * BF3 - 0.098 * BF5 ... 160 | + 0.988 * BF7 + 0.014 * BF8 - 0.034 * BF11 - 0.011 * BF12 ... 161 | - 0.013 * BF13 - 0.002 * BF17 + 0.014 * BF18 ... 162 | + 0.004 * BF19 - 0.007 * BF21 - 0.017 * BF22 ... 163 | - .895791E-03 * BF25 + 0.011 * BF26 - 0.009 * BF27 ... 164 | - 0.007 * BF29 + 0.052 * BF30 + 0.022 * BF35 ... 165 | - 0.002 * BF36 - 0.005 * BF39 - 0.059 * BF40 ... 166 | - 0.050 * BF41 + 0.001 * BF42 + .743730E-03 * BF43 ... 167 | + 0.011 * BF44 + 0.022 * BF45 + 0.009 * BF46 ... 168 | + 0.004 * BF49 - 0.005 * BF51 + 0.010 * BF52 ... 169 | - 0.001 * BF54 - 0.005 * BF56 - 0.015 * BF57 ... 170 | - 0.032 * BF59 + 0.009 * BF60 - 0.002 * BF61 ... 171 | - 0.009 * BF62 - 0.001 * BF63 + .819374E-03 * BF64 ... 172 | + 0.002 * BF65 + 0.003 * BF67 + 0.024 * BF69 ... 173 | - 0.011 * BF70 - 0.004 * BF71 + 0.013 * BF73 ... 174 | - 0.026 * BF74 + 0.005 * BF75 + 0.253 * BF77 ... 175 | - 0.065 * BF78 + 0.014 * BF80 - 0.010 * BF83 ... 176 | + 0.001 * BF84 + 0.018 * BF85 - 0.050 * BF87 ... 177 | - 0.002 * BF88 - 0.020 * BF89 + 0.003 * BF91 ... 178 | - 0.043 * BF92 + .707581E-03 * BF97 - 0.015 * BF99 ... 179 | - 0.005 * BF100; 180 | 181 | 182 | function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 183 | V13, V14, V15, V16, V17, V18, V19, V20, ... 184 | V21, V22, V23, V24, V25, V26, V27, V28) 185 | 186 | BF1 = max(0, V7 - 9.535); 187 | BF2 = max(0, 9.535 - V7 ); 188 | BF3 = max(0, V27 - 1.578); 189 | BF5 = max(0, V6 - 5.422); 190 | BF6 = max(0, 5.422 - V6 ); 191 | BF8 = max(0, 11.333 - V19 ); 192 | BF10 = max(0, - 6.774 - FWSEG_VA ); 193 | BF11 = max(0, V10 - 6.255) * BF8; 194 | BF12 = max(0, 6.255 - V10 ) * BF8; 195 | BF13 = max(0, V24 - 3.894); 196 | BF15 = max(0, V5 - 3.884); 197 | BF16 = max(0, 3.884 - V5 ); 198 | BF17 = max(0, V28 - 7.918); 199 | BF18 = max(0, 7.918 - V28 ); 200 | BF19 = max(0, V13 - 6.077) * BF18; 201 | BF20 = max(0, 6.077 - V13 ) * BF18; 202 | BF22 = max(0, 6.614 - V20 ) * BF10; 203 | BF23 = max(0, FWSEG_VA + 0.936) * BF8; 204 | BF25 = max(0, V23 - 5.039); 205 | BF26 = max(0, 5.039 - V23 ); 206 | BF28 = max(0, 9.007 - V20 ) * BF25; 207 | BF29 = max(0, V25 - 7.582); 208 | BF30 = max(0, 7.582 - V25 ); 209 | BF31 = max(0, V11 + 3.336) * BF16; 210 | BF32 = max(0, V26 - 1.877); 211 | BF35 = max(0, - 5.749 - FWSEG_VA ) * BF6; 212 | BF36 = max(0, V7 - 4.451) * BF29; 213 | BF37 = max(0, 4.451 - V7 ) * BF29; 214 | BF38 = max(0, V14 - 10.158); 215 | BF39 = max(0, 10.158 - V14 ); 216 | BF41 = max(0, 7.172 - V17 ) * BF39; 217 | BF43 = max(0, 7.810 - V24 ) * BF26; 218 | BF44 = max(0, V8 + 1.636) * BF3; 219 | BF45 = max(0, FWSEG_VA - 10.068) * BF39; 220 | BF47 = max(0, V23 - 4.721) * BF30; 221 | BF48 = max(0, 4.721 - V23 ) * BF30; 222 | BF50 = max(0, - 2.397 - V24 ) * BF16; 223 | BF51 = max(0, V14 - 1.428) * BF17; 224 | BF53 = max(0, V16 + 1.940) * BF18; 225 | BF54 = max(0, V10 - 9.442) * BF18; 226 | BF56 = max(0, V10 + 2.144) * BF16; 227 | BF58 = max(0, 1.969 - V26 ) * BF2; 228 | BF59 = max(0, V19 - 6.089) * BF16; 229 | BF62 = max(0, 8.952 - V21 ) * BF15; 230 | BF63 = max(0, V24 - 7.371) * BF3; 231 | BF65 = max(0, V22 - 8.908) * BF6; 232 | BF66 = max(0, 8.908 - V22 ) * BF6; 233 | BF67 = max(0, V27 - 9.485) * BF30; 234 | BF69 = max(0, V18 - 8.608) * BF10; 235 | BF71 = max(0, V13 - 3.374) * BF25; 236 | BF73 = max(0, V14 - 3.616) * BF13; 237 | BF75 = max(0, V18 - 10.321) * BF32; 238 | BF76 = max(0, 10.321 - V18 ) * BF32; 239 | BF78 = max(0, 3.972 - V15 ) * BF26; 240 | BF79 = max(0, V14 - 7.105) * 
BF26; 241 | BF80 = max(0, 7.105 - V14 ) * BF26; 242 | 243 | Y = 2.638 - 0.089 * BF1 + 0.083 * BF5 - 0.162 * BF6 - 0.037 * BF8 ... 244 | - 0.241 * BF10 + 0.018 * BF11 - 0.008 * BF12 ... 245 | + 0.059 * BF13 - 0.144 * BF17 - 0.116 * BF18 ... 246 | + 0.010 * BF19 - 0.012 * BF20 + 0.085 * BF22 ... 247 | + 0.011 * BF23 + 0.049 * BF25 - 0.159 * BF26 ... 248 | - 0.016 * BF28 - 0.138 * BF29 + 0.010 * BF31 ... 249 | + 0.016 * BF35 + 0.018 * BF36 + 0.246 * BF37 ... 250 | - 0.417 * BF38 + 0.052 * BF39 - 0.005 * BF41 ... 251 | + 0.021 * BF43 + 0.006 * BF44 - 0.047 * BF45 ... 252 | - 0.051 * BF47 - 0.014 * BF48 - 0.113 * BF50 ... 253 | + 0.019 * BF51 + 0.007 * BF53 + 0.017 * BF54 ... 254 | - 0.007 * BF56 - 0.098 * BF58 + 0.011 * BF59 ... 255 | - 0.016 * BF62 - 0.012 * BF63 + 0.113 * BF65 ... 256 | + 0.016 * BF66 + 0.040 * BF67 - 0.065 * BF69 ... 257 | - 0.018 * BF71 + 0.014 * BF73 - 0.009 * BF75 ... 258 | - 0.008 * BF76 - 0.032 * BF78 + 0.032 * BF79 ... 259 | + 0.011 * BF80; 260 | 261 | 262 | function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 263 | V13, V14, V15, V16, V17, V18, V19, V20, ... 264 | V21, V22, V23, V24, V25, V26, V27, V28) 265 | 266 | BF1 = max(0, V21 - 4.671); 267 | BF3 = max(0, V6 - 5.396); 268 | BF4 = max(0, 5.396 - V6 ); 269 | BF7 = max(0, V11 - 7.884); 270 | BF8 = max(0, 7.884 - V11 ); 271 | BF9 = max(0, FWSEG_VA + 7.229) * BF1; 272 | BF10 = max(0, - 7.229 - FWSEG_VA ) * BF1; 273 | BF11 = max(0, V19 - 8.128) * BF1; 274 | BF12 = max(0, 8.128 - V19 ) * BF1; 275 | BF13 = max(0, V28 - 7.918); 276 | BF14 = max(0, 7.918 - V28 ); 277 | BF15 = max(0, V5 + 2.888) * BF14; 278 | BF16 = max(0, - 2.888 - V5 ) * BF14; 279 | BF17 = max(0, V24 - 2.924) * BF8; 280 | BF18 = max(0, 2.924 - V24 ) * BF8; 281 | BF20 = max(0, 9.071 - V16 ) * BF15; 282 | BF21 = max(0, V10 - 6.286) * BF14; 283 | BF22 = max(0, 6.286 - V10 ) * BF14; 284 | BF24 = max(0, V23 - 5.173); 285 | BF25 = max(0, 5.173 - V23 ); 286 | BF26 = max(0, V26 - 8.987); 287 | BF29 = max(0, 12.216 - V27 ) * BF3; 288 | BF30 = max(0, V8 - 4.306) * BF16; 289 | BF34 = max(0, V23 - 7.630) * BF21; 290 | BF35 = max(0, 7.630 - V23 ) * BF21; 291 | BF37 = max(0, 3.638 - V7 ) * BF1; 292 | BF39 = max(0, 8.337 - V21 ) * BF17; 293 | BF41 = max(0, 1.590 - V5 ) * BF11; 294 | BF43 = max(0, 13.993 - V8 ) * BF11; 295 | BF44 = max(0, V14 - 5.993) * BF25; 296 | BF45 = max(0, 5.993 - V14 ) * BF25; 297 | BF46 = max(0, V24 - 1.035); 298 | BF47 = max(0, 1.035 - V24 ); 299 | BF49 = max(0, 8.915 - V23 ) * BF12; 300 | BF51 = max(0, - 0.004 - FWSEG_VA ); 301 | BF52 = max(0, V27 - 6.520) * BF24; 302 | BF53 = max(0, 6.520 - V27 ) * BF24; 303 | BF54 = max(0, V7 - 11.484) * BF8; 304 | BF55 = max(0, 11.484 - V7 ) * BF8; 305 | BF57 = max(0, 5.742 - V17 ) * BF25; 306 | BF58 = max(0, V12 - 6.949) * BF12; 307 | BF59 = max(0, 6.949 - V12 ) * BF12; 308 | BF60 = max(0, V25 - 9.203) * BF45; 309 | BF63 = max(0, 1.887 - V13 ) * BF7; 310 | BF65 = max(0, 9.498 - V26 ) * BF15; 311 | BF66 = max(0, V5 - 6.566) * BF22; 312 | BF71 = max(0, 13.239 - V19 ) * BF46; 313 | BF72 = max(0, V19 - 9.925) * BF55; 314 | BF77 = max(0, 3.430 - V22 ) * BF18; 315 | BF78 = max(0, V27 - 6.513) * BF45; 316 | BF79 = max(0, 6.513 - V27 ) * BF45; 317 | BF81 = max(0, 12.511 - V18 ); 318 | BF82 = max(0, V11 - 6.777) * BF81; 319 | BF83 = max(0, 6.777 - V11 ) * BF81; 320 | BF85 = max(0, 3.433 - V5 ) * BF47; 321 | BF87 = max(0, - 3.524 - FWSEG_VA ) * BF47; 322 | BF88 = max(0, V27 - 11.604) * BF9; 323 | BF91 = max(0, 8.845 - V26 ) * BF52; 324 | BF92 = max(0, V14 - 5.931) * BF82; 325 | BF93 = max(0, 5.931 
- V14 ) * BF82; 326 | BF94 = max(0, V21 - 7.245) * BF25; 327 | BF95 = max(0, 7.245 - V21 ) * BF25; 328 | BF96 = max(0, V14 - 5.323) * BF7; 329 | BF98 = max(0, V10 - 6.248) * BF71; 330 | BF100 = max(0, V18 - 0.602) * BF95; 331 | 332 | Y = 2.936 + 0.047 * BF1 + 0.061 * BF3 - 0.084 * BF4 - 0.139 * BF8 ... 333 | - 0.064 * BF10 - 0.030 * BF12 - 0.103 * BF13 ... 334 | - 0.039 * BF14 + 0.020 * BF17 - 0.002 * BF20 ... 335 | - 0.005 * BF22 - 0.114 * BF25 - 0.090 * BF26 ... 336 | - 0.011 * BF29 + 0.010 * BF30 + 0.009 * BF34 ... 337 | + 0.002 * BF35 + 0.079 * BF37 - 0.006 * BF39 ... 338 | + 0.007 * BF41 - 0.003 * BF43 + 0.017 * BF44 ... 339 | + 0.076 * BF47 + 0.009 * BF49 + 0.016 * BF51 ... 340 | - 0.042 * BF53 - 0.079 * BF54 - 0.030 * BF57 ... 341 | - 0.018 * BF58 - 0.009 * BF59 - 0.119 * BF60 ... 342 | - 0.210 * BF63 - .456802E-03 * BF65 + 0.028 * BF66 ... 343 | + 0.020 * BF72 + 0.011 * BF77 + 0.005 * BF78 ... 344 | + 0.003 * BF79 - 0.049 * BF81 + 0.012 * BF83 ... 345 | - 0.030 * BF85 + 0.070 * BF87 + 0.008 * BF88 ... 346 | - 0.008 * BF91 + 0.010 * BF92 + 0.003 * BF93 ... 347 | + 0.022 * BF94 - 0.038 * BF96 + .933766E-03 * BF98 ... 348 | + 0.002 * BF100; 349 | 350 | 351 | 352 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 353 | 354 | 355 | % ---------------------------------------------------------------------- 356 | % Check the length of the clean and processed speech. Must be the same. 357 | % ---------------------------------------------------------------------- 358 | 359 | clean_length = length(clean_speech); 360 | processed_length = length(processed_speech); 361 | 362 | if (clean_length ~= processed_length) 363 | disp('Error: Files must have same length.'); 364 | return 365 | end 366 | 367 | 368 | 369 | % ---------------------------------------------------------------------- 370 | % Global Variables 371 | % ---------------------------------------------------------------------- 372 | 373 | 374 | winlength = round(30*sample_rate/1000); % window length in samples 375 | skiprate = floor(winlength/4); % window skip in samples 376 | max_freq = sample_rate/2; % maximum bandwidth 377 | num_crit = 25; % number of critical bands 378 | 379 | n_fft = 2^nextpow2(2*winlength); 380 | n_fftby2 = n_fft/2; % FFT size/2 381 | 382 | % ---------------------------------------------------------------------- 383 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 384 | % ---------------------------------------------------------------------- 385 | 386 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 387 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 388 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 389 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 390 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 391 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 392 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 393 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 394 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 395 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 396 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 397 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 398 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 399 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 400 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 401 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 402 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 403 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 404 | cent_freq(19) = 1993.93; bandwidth(19) = 
217.153; 405 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 406 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 407 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 408 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 409 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 410 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 411 | 412 | 413 | bw_min = bandwidth (1); % minimum critical bandwidth 414 | 415 | 416 | % ---------------------------------------------------------------------- 417 | % Set up the critical band filters. Note here that Gaussianly shaped 418 | % filters are used. Also, the sum of the filter weights are equivalent 419 | % for each critical band filter. Filter less than -30 dB and set to 420 | % zero. 421 | % ---------------------------------------------------------------------- 422 | 423 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 424 | 425 | for i = 1:num_crit 426 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 427 | all_f0(i) = floor(f0); 428 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 429 | norm_factor = log(bw_min) - log(bandwidth(i)); 430 | j = 0:1:n_fftby2-1; 431 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 432 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 433 | end 434 | 435 | % ---------------------------------------------------------------------- 436 | % For each frame of input speech, calculate the Weighted Spectral 437 | % Slope Measure 438 | % ---------------------------------------------------------------------- 439 | 440 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 441 | start = 1; % starting sample 442 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 443 | 444 | distortion=zeros(num_frames,num_crit); 445 | for frame_count = 1:num_frames 446 | 447 | % ---------------------------------------------------------- 448 | % (1) Get the Frames for the test and reference speech. 449 | % Multiply by Hanning Window. 450 | % ---------------------------------------------------------- 451 | 452 | clean_frame = clean_speech(start:start+winlength-1); 453 | processed_frame = processed_speech(start:start+winlength-1); 454 | clean_frame = clean_frame.*window; 455 | processed_frame = processed_frame.*window; 456 | 457 | % ---------------------------------------------------------- 458 | % (2) Compute the magnitude Spectrum of Clean and Processed 459 | % ---------------------------------------------------------- 460 | 461 | 462 | clean_spec = abs(fft(clean_frame,n_fft)); 463 | processed_spec = abs(fft(processed_frame,n_fft)); 464 | 465 | % normalize so that spectra have unit area ---- 466 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 467 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 468 | 469 | % ---------------------------------------------------------- 470 | % (3) Compute Filterbank Output Energies 471 | % ---------------------------------------------------------- 472 | 473 | clean_energy=zeros(1,num_crit); 474 | processed_energy=zeros(1,num_crit); 475 | error_energy=zeros(1,num_crit); 476 | 477 | for i = 1:num_crit 478 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 479 | .*crit_filter(i,:)'); 480 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
481 | .*crit_filter(i,:)'); 482 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 483 | end 484 | 485 | 486 | SNRlog=10*log10((clean_energy.^2)./error_energy); 487 | 488 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 489 | 490 | start = start + skiprate; 491 | 492 | end 493 | 494 | -------------------------------------------------------------------------------- /comp_fwseg_variant.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % (see also Chap. 10, Eq. 10.24) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % Author: Philipos C. Loizou 29 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 30 | % 31 | % Copyright (c) 2006 by Philipos C. Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | if nargin~=2 36 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n'); 37 | fprintf('For more help, type: help comp_fwseg_variant\n\n'); 38 | return; 39 | end 40 | 41 | 42 | [data1, Srate1, Nbits1]= wavread(cleanFile); 43 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 44 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 45 | error( 'The two files do not match!\n'); 46 | end 47 | 48 | len= min( length( data1), length( data2)); 49 | data1= data1( 1: len)+eps; 50 | data2= data2( 1: len)+eps; 51 | 52 | wss_dist_matrix= fwseg( data1, data2,Srate1); 53 | wss_dist=mean(wss_dist_matrix); 54 | 55 | % initialize coefficients obtained from multiple linear 56 | % regression analysis 57 | % 58 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 59 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 60 | -0.002,0.017,-0.03,0.073,0.043]; 61 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 62 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 63 | 0.011,-0.002,-0.021,0.043,0.031]; 64 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 65 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 
66 | -0.028,0.019,0.005]; 67 | 68 | SIG=0.567+sum(b_sig.*wss_dist); 69 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 70 | 71 | BAK=1.013+sum(b_bak.*wss_dist); 72 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 73 | 74 | OVL=0.446+sum(b_ovl.*wss_dist); 75 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 76 | 77 | 78 | % ---------------------------------------------------------------------- 79 | 80 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 81 | 82 | 83 | % ---------------------------------------------------------------------- 84 | % Check the length of the clean and processed speech. Must be the same. 85 | % ---------------------------------------------------------------------- 86 | 87 | clean_length = length(clean_speech); 88 | processed_length = length(processed_speech); 89 | 90 | if (clean_length ~= processed_length) 91 | disp('Error: Files must have same length.'); 92 | return 93 | end 94 | 95 | 96 | 97 | % ---------------------------------------------------------------------- 98 | % Global Variables 99 | % ---------------------------------------------------------------------- 100 | 101 | 102 | winlength = round(30*sample_rate/1000); % window length in samples 103 | skiprate = floor(winlength/4); % window skip in samples 104 | max_freq = sample_rate/2; % maximum bandwidth 105 | num_crit = 25; % number of critical bands 106 | 107 | n_fft = 2^nextpow2(2*winlength); 108 | n_fftby2 = n_fft/2; % FFT size/2 109 | 110 | % ---------------------------------------------------------------------- 111 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 112 | % ---------------------------------------------------------------------- 113 | 114 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 115 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 116 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 117 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 118 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 119 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 120 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 121 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 122 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 123 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 124 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 125 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 126 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 127 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 128 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 129 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 130 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 131 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 132 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 133 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 134 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 135 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 136 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 137 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 138 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 139 | 140 | 141 | bw_min = bandwidth (1); % minimum critical bandwidth 142 | 143 | 144 | % ---------------------------------------------------------------------- 145 | % Set up the critical band filters. Note here that Gaussianly shaped 146 | % filters are used. Also, the sum of the filter weights are equivalent 147 | % for each critical band filter. Filter less than -30 dB and set to 148 | % zero. 
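% (Added note:) each critical-band filter constructed below is Gaussian over
% the FFT bins j = 0..n_fftby2-1:
%   H_i(j) = exp( -11*((j - floor(f0_i))/bw_i)^2 + log(bw_min/bw_i) ),
% where f0_i and bw_i are the centre frequency and bandwidth expressed in bins;
% the log(bw_min/bw_i) term equalizes the filter areas, and responses below the
% -30 dB point (min_factor) are set to zero.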
149 | % ---------------------------------------------------------------------- 150 | 151 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 152 | 153 | for i = 1:num_crit 154 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 155 | all_f0(i) = floor(f0); 156 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 157 | norm_factor = log(bw_min) - log(bandwidth(i)); 158 | j = 0:1:n_fftby2-1; 159 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 160 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 161 | end 162 | 163 | % ---------------------------------------------------------------------- 164 | % For each frame of input speech, calculate the Weighted Spectral 165 | % Slope Measure 166 | % ---------------------------------------------------------------------- 167 | 168 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 169 | start = 1; % starting sample 170 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 171 | 172 | distortion=zeros(num_frames,num_crit); 173 | for frame_count = 1:num_frames 174 | 175 | % ---------------------------------------------------------- 176 | % (1) Get the Frames for the test and reference speech. 177 | % Multiply by Hanning Window. 178 | % ---------------------------------------------------------- 179 | 180 | clean_frame = clean_speech(start:start+winlength-1); 181 | processed_frame = processed_speech(start:start+winlength-1); 182 | clean_frame = clean_frame.*window; 183 | processed_frame = processed_frame.*window; 184 | 185 | % ---------------------------------------------------------- 186 | % (2) Compute the magnitude Spectrum of Clean and Processed 187 | % ---------------------------------------------------------- 188 | 189 | 190 | clean_spec = abs(fft(clean_frame,n_fft)); 191 | processed_spec = abs(fft(processed_frame,n_fft)); 192 | 193 | % normalize so that spectra have unit area ---- 194 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 195 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 196 | 197 | % ---------------------------------------------------------- 198 | % (3) Compute Filterbank Output Energies (in dB scale) 199 | % ---------------------------------------------------------- 200 | 201 | clean_energy=zeros(1,num_crit); 202 | processed_energy=zeros(1,num_crit); 203 | error_energy=zeros(1,num_crit); 204 | 205 | for i = 1:num_crit 206 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 207 | .*crit_filter(i,:)'); 208 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 209 | .*crit_filter(i,:)'); 210 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 211 | end 212 | 213 | 214 | SNRlog=10*log10((clean_energy.^2)./error_energy); 215 | 216 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 217 | 218 | start = start + skiprate; 219 | 220 | end 221 | 222 | -------------------------------------------------------------------------------- /comp_is.m: -------------------------------------------------------------------------------- 1 | function is_mean= comp_is(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % Itakura-Saito (IS) Objective Speech Quality Measure 4 | % 5 | % This function implements the Itakura-Saito distance measure 6 | % defined on page 50 of [1] (see Equation 2.26). See also 7 | % Equation 12 (page 1480) of [2]. 
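% (Added note:) with a_c, a_p the LPC coefficient vectors of the clean and
% processed frames, R_c the Toeplitz autocorrelation matrix of the clean frame,
% and sigma_c^2, sigma_p^2 the corresponding LPC gains, the per-frame value
% computed below is
%   d_IS = (sigma_c^2/sigma_p^2) * (a_p*R_c*a_p') / (a_c*R_c*a_c')
%          + log(sigma_p^2/sigma_c^2) - 1,
% limited to a maximum of 100 before the frames are sorted and averaged.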
8 | % 9 | % Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % IS - computed Itakura Saito measure 14 | % 15 | % Note that the IS measure is limited in the range [0, 100]. 16 | % 17 | % Example call: IS =comp_is('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 23 | % Objective Measures of Speech Quality. Prentice Hall 24 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 25 | % ISBN: 0-13-629056-6. 26 | % 27 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 28 | % Speech Coder Performance Evaluation", AT&T Bell 29 | % Laboratories Technical Journal, Vol. 63, No. 8, 30 | % October 1984, pp. 1477-1498. 31 | % 32 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 33 | % Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100] 34 | % 35 | % Copyright (c) 2006 by Philipos C. Loizou 36 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 37 | 38 | % ---------------------------------------------------------------------- 39 | 40 | if nargin~=2 41 | fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n'); 42 | fprintf('For more help, type: help comp_is\n\n'); 43 | return; 44 | end 45 | 46 | alpha=0.95; 47 | 48 | [data1, Srate1, Nbits1]= wavread(cleanFile); 49 | [data2, Srate2, Nbits2]= wavread(enhdFile); 50 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 51 | error( 'The two files do not match!\n'); 52 | end 53 | 54 | len= min( length( data1), length( data2)); 55 | data1= data1( 1: len)+eps; 56 | data2= data2( 1: len)+eps; 57 | 58 | 59 | IS_dist= is( data1, data2,Srate1); 60 | 61 | IS_len= round( length( IS_dist)* alpha); 62 | IS= sort( IS_dist); 63 | 64 | is_mean= mean( IS( 1: IS_len)); 65 | 66 | 67 | 68 | function distortion = is(clean_speech, processed_speech,sample_rate) 69 | 70 | 71 | % ---------------------------------------------------------------------- 72 | % Check the length of the clean and processed speech. Must be the same. 73 | % ---------------------------------------------------------------------- 74 | 75 | clean_length = length(clean_speech); 76 | processed_length = length(processed_speech); 77 | 78 | if (clean_length ~= processed_length) 79 | disp('Error: Both Speech Files must be same length.'); 80 | return 81 | end 82 | 83 | % ---------------------------------------------------------------------- 84 | % Scale both clean speech and processed speech to have same dynamic 85 | % range. Also remove DC component from each signal 86 | % ---------------------------------------------------------------------- 87 | 88 | %clean_speech = clean_speech - mean(clean_speech); 89 | %processed_speech = processed_speech - mean(processed_speech); 90 | 91 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 92 | 93 | % ---------------------------------------------------------------------- 94 | % Global Variables 95 | % ---------------------------------------------------------------------- 96 | 97 | %sample_rate = 8000; % default sample rate 98 | winlength = round(30*sample_rate/1000); %240; % window length in samples 99 | skiprate = floor(winlength/4); % window skip in samples 100 | if sample_rate<10000 101 | P = 10; % LPC Analysis Order 102 | else 103 | P=16; % this could vary depending on sampling frequency. 
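% (Added note:) a common rule of thumb is roughly one LPC pole per kHz of
% sampling rate plus a few extra for the glottal/lip-radiation model, which is
% consistent with the P = 10 (below 10 kHz) and P = 16 (otherwise) choices here.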
104 | end 105 | % ---------------------------------------------------------------------- 106 | % For each frame of input speech, calculate the Itakura-Saito Measure 107 | % ---------------------------------------------------------------------- 108 | 109 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 110 | start = 1; % starting sample 111 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 112 | 113 | for frame_count = 1:num_frames 114 | 115 | % ---------------------------------------------------------- 116 | % (1) Get the Frames for the test and reference speech. 117 | % Multiply by Hanning Window. 118 | % ---------------------------------------------------------- 119 | 120 | clean_frame = clean_speech(start:start+winlength-1); 121 | processed_frame = processed_speech(start:start+winlength-1); 122 | clean_frame = clean_frame.*window; 123 | processed_frame = processed_frame.*window; 124 | 125 | % ---------------------------------------------------------- 126 | % (2) Get the autocorrelation lags and LPC parameters used 127 | % to compute the IS measure. 128 | % ---------------------------------------------------------- 129 | 130 | [R_clean, Ref_clean, A_clean] = ... 131 | lpcoeff(clean_frame, P); 132 | [R_processed, Ref_processed, A_processed] = ... 133 | lpcoeff(processed_frame, P); 134 | 135 | 136 | % ---------------------------------------------------------- 137 | % (3) Compute the IS measure 138 | % ---------------------------------------------------------- 139 | 140 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 141 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 142 | gain_clean = max(R_clean*A_clean',eps); % this is gain 143 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 144 | 145 | 146 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 147 | log(gain_processed/gain_clean)-1; 148 | 149 | distortion(frame_count) = min(ISvalue,100); 150 | start = start + skiprate; 151 | 152 | end 153 | 154 | 155 | 156 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 157 | 158 | % ---------------------------------------------------------- 159 | % (1) Compute Autocorrelation Lags 160 | % ---------------------------------------------------------- 161 | 162 | winlength = max(size(speech_frame)); 163 | for k=1:model_order+1 164 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
165 | .*speech_frame(k:winlength)); 166 | end 167 | 168 | % ---------------------------------------------------------- 169 | % (2) Levinson-Durbin 170 | % ---------------------------------------------------------- 171 | 172 | a = ones(1,model_order); 173 | E(1)=R(1); 174 | for i=1:model_order 175 | a_past(1:i-1) = a(1:i-1); 176 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 177 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 178 | a(i)=rcoeff(i); 179 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 180 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 181 | end 182 | 183 | acorr = R; 184 | refcoeff = rcoeff; 185 | lpparams = [1 -a]; 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /comp_llr.m: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of [1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_llr\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
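%
% (Sketch of what this helper computes: with a_c, a_p the LPC coefficient
% vectors of the clean and processed frame and R_c the Toeplitz
% autocorrelation matrix of the clean frame, each frame contributes
% log( (a_p*R_c*a_p') / (a_c*R_c*a_c') ), capped at 2; comp_llr then
% sorts the frame values and averages the smallest 95% (alpha = 0.95).)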
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /comp_snr.m: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % This function implements the segmental signal-to-noise ratio 6 | % as defined in [1, p. 45] (see Equation 2.12). 7 | % 8 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % SNRovl - overall SNR (dB) 13 | % SNRseg - segmental SNR (dB) 14 | % 15 | % This function returns 2 parameters. The first item is the 16 | % overall SNR for the two speech signals. The second value 17 | % is the segmental signal-to-noise ratio (1 seg-snr per 18 | % frame of input). The segmental SNR is clamped to range 19 | % between 35dB and -10dB (see suggestions in [2]). 20 | % 21 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav') 22 | % 23 | % References: 24 | % 25 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 26 | % Objective Measures of Speech Quality. Prentice Hall 27 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 28 | % ISBN: 0-13-629056-6. 29 | % 30 | % [2] P. E. Papamichalis, Practical Approaches to Speech 31 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 32 | % ISBN: 0-13-689019-9. (see pages 179-181). 33 | % 34 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 35 | % Modified by: Philipos C. Loizou (Oct 2006) 36 | % 37 | % Copyright (c) 2006 by Philipos C. Loizou 38 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 39 | %------------------------------------------------------------------------- 40 | 41 | if nargin ~=2 42 | fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n'); 43 | return; 44 | end 45 | 46 | [data1, Srate1, Nbits1]= wavread(cleanFile); 47 | [data2, Srate2, Nbits2]= wavread(enhdFile); 48 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2) | ( length( data1)~= length( data2))) 49 | error( 'The two files do not match!\n'); 50 | end 51 | 52 | % len= min( length( data1), length( data2)); 53 | % data1= data1( 1: len); 54 | % data2= data2( 1: len); 55 | % data1= (data1 - mean(data1))/std(data1); % MVN 56 | % data2= (data2 - mean(data2))/std(data2); 57 | 58 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 59 | 60 | snr_mean= snr_dist; 61 | segsnr_mean= mean( segsnr_dist); 62 | 63 | 64 | % ========================================================================= 65 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 66 | 67 | % ---------------------------------------------------------------------- 68 | % Check the length of the clean and processed speech. Must be the same. 
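%
% (Sketch of the computation below: the overall SNR is
% 10*log10( sum(clean.^2) / sum((clean-processed).^2) ) over the whole
% signals, while the segmental SNR applies the same ratio to 30-ms
% Hanning-windowed frames with 75% overlap, clamps each frame value to
% [-10, 35] dB, and leaves the averaging over frames to the caller.)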
69 | % ---------------------------------------------------------------------- 70 | 71 | clean_length = length(clean_speech); 72 | processed_length = length(processed_speech); 73 | 74 | if (clean_length ~= processed_length) 75 | disp('Error: Both Speech Files must be same length.'); 76 | return 77 | end 78 | 79 | % ---------------------------------------------------------------------- 80 | % Scale both clean speech and processed speech to have same dynamic 81 | % range. Also remove DC component from each signal 82 | % ---------------------------------------------------------------------- 83 | 84 | %clean_speech = clean_speech - mean(clean_speech); 85 | %processed_speech = processed_speech - mean(processed_speech); 86 | 87 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 88 | 89 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 90 | 91 | % ---------------------------------------------------------------------- 92 | % Global Variables 93 | % ---------------------------------------------------------------------- 94 | 95 | 96 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 97 | skiprate = floor(winlength/4); %60; % window skip in samples 98 | MIN_SNR = -10; % minimum SNR in dB 99 | MAX_SNR = 35; % maximum SNR in dB 100 | 101 | % ---------------------------------------------------------------------- 102 | % For each frame of input speech, calculate the Segmental SNR 103 | % ---------------------------------------------------------------------- 104 | 105 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 106 | start = 1; % starting sample 107 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 108 | 109 | for frame_count = 1: num_frames 110 | 111 | % ---------------------------------------------------------- 112 | % (1) Get the Frames for the test and reference speech. 113 | % Multiply by Hanning Window. 114 | % ---------------------------------------------------------- 115 | 116 | clean_frame = clean_speech(start:start+winlength-1); 117 | processed_frame = processed_speech(start:start+winlength-1); 118 | clean_frame = clean_frame.*window; 119 | processed_frame = processed_frame.*window; 120 | 121 | % ---------------------------------------------------------- 122 | % (2) Compute the Segmental SNR 123 | % ---------------------------------------------------------- 124 | 125 | signal_energy = sum(clean_frame.^2); 126 | noise_energy = sum((clean_frame-processed_frame).^2); 127 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 128 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 129 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 130 | 131 | start = start + skiprate; 132 | 133 | end 134 | 135 | -------------------------------------------------------------------------------- /comp_wss.m: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). 
The intensities within each critical band are
11 | % measured. Then, weighted distances between the measured
12 | % slopes of the log-critical band spectra are computed.
13 | % This measure is also described in Section 2.2.9 (pages 56-58)
14 | % of [2].
15 | %
16 | % Whereas Klatt's original measure used 36 critical-band
17 | % filters to estimate the smoothed short-time spectrum, this
18 | % implementation considers a bank of 25 filters spanning
19 | % the 4 kHz bandwidth.
20 | %
21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
22 | %
23 | % cleanFile.wav - clean input file in .wav format
24 | % enhancedFile - enhanced output file in .wav format
25 | % wss_dist - computed spectral slope distance
26 | %
27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav')
28 | %
29 | % References:
30 | %
31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
32 | % from Critical-Band Spectra: A First Step", Proc. IEEE
33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
34 | %
35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
36 | % Objective Measures of Speech Quality. Prentice Hall
37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988,
38 | % ISBN: 0-13-629056-6.
39 | %
40 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
41 | % Modified by: Philipos C. Loizou (Oct 2006)
42 | %
43 | % Copyright (c) 2006 by Philipos C. Loizou
44 | % $Revision: 0.0 $ $Date: 10/09/2006 $
45 | %
46 | % ----------------------------------------------------------------------
47 | if nargin~=2
48 | fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
49 | fprintf('For more help, type: help comp_wss\n\n');
50 | return;
51 | end
52 |
53 | alpha= 0.95;
54 |
55 | [data1, Srate1, Nbits1]= wavread(cleanFile);
56 | [data2, Srate2, Nbits2]= wavread(enhancedFile);
57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
58 | error( 'The two files do not match!\n');
59 | end
60 |
61 | len= min( length( data1), length( data2));
62 | data1= data1( 1: len)+eps;
63 | data2= data2( 1: len)+eps;
64 |
65 | wss_dist_vec= wss( data1, data2,Srate1);
66 | wss_dist_vec= sort( wss_dist_vec);
67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
68 |
69 |
70 |
71 | function distortion = wss(clean_speech, processed_speech,sample_rate)
72 |
73 |
74 | % ----------------------------------------------------------------------
75 | % Check the length of the clean and processed speech. Must be the same.
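%
% (Sketch of the per-frame value computed below: band energies are taken
% through 25 Gaussian-shaped critical-band filters and converted to dB;
% the spectral slopes are the differences between adjacent band energies,
% and each frame contributes
%    sum_k W(k)*(slope_clean(k) - slope_processed(k))^2 / sum_k W(k),
% where the weights W(k) emphasize spectral peaks via the constants
% Kmax = 20 and Klocmax = 1 and are averaged between the clean and
% processed frame.)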
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | winlength = round(30*sample_rate/1000); % window length in samples 93 | skiprate = floor(winlength/4); % window skip in samples 94 | max_freq = sample_rate/2; % maximum bandwidth 95 | num_crit = 25; % number of critical bands 96 | 97 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 98 | n_fft = 2^nextpow2(2*winlength); 99 | n_fftby2 = n_fft/2; % FFT size/2 100 | Kmax = 20; % value suggested by Klatt, pg 1280 101 | Klocmax = 1; % value suggested by Klatt, pg 1280 102 | 103 | % ---------------------------------------------------------------------- 104 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 105 | % ---------------------------------------------------------------------- 106 | 107 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 108 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 109 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 110 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 111 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 112 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 113 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 114 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 115 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 116 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 117 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 118 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 119 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 120 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 121 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 122 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 123 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 124 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 125 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 126 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 127 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 128 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 129 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 130 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 131 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 132 | 133 | bw_min = bandwidth (1); % minimum critical bandwidth 134 | 135 | % ---------------------------------------------------------------------- 136 | % Set up the critical band filters. Note here that Gaussianly shaped 137 | % filters are used. Also, the sum of the filter weights are equivalent 138 | % for each critical band filter. Filter less than -30 dB and set to 139 | % zero. 
140 | % ---------------------------------------------------------------------- 141 | 142 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 143 | 144 | for i = 1:num_crit 145 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 146 | all_f0(i) = floor(f0); 147 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 148 | norm_factor = log(bw_min) - log(bandwidth(i)); 149 | j = 0:1:n_fftby2-1; 150 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 151 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 152 | end 153 | 154 | % ---------------------------------------------------------------------- 155 | % For each frame of input speech, calculate the Weighted Spectral 156 | % Slope Measure 157 | % ---------------------------------------------------------------------- 158 | 159 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 160 | start = 1; % starting sample 161 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 162 | 163 | for frame_count = 1:num_frames 164 | 165 | % ---------------------------------------------------------- 166 | % (1) Get the Frames for the test and reference speech. 167 | % Multiply by Hanning Window. 168 | % ---------------------------------------------------------- 169 | 170 | clean_frame = clean_speech(start:start+winlength-1); 171 | processed_frame = processed_speech(start:start+winlength-1); 172 | clean_frame = clean_frame.*window; 173 | processed_frame = processed_frame.*window; 174 | 175 | % ---------------------------------------------------------- 176 | % (2) Compute the Power Spectrum of Clean and Processed 177 | % ---------------------------------------------------------- 178 | 179 | if (USE_FFT_SPECTRUM) 180 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 181 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 182 | else 183 | a_vec = zeros(1,n_fft); 184 | a_vec(1:11) = lpc(clean_frame,10); 185 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 186 | 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(processed_frame,10); 189 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | end 191 | 192 | % ---------------------------------------------------------- 193 | % (3) Compute Filterbank Output Energies (in dB scale) 194 | % ---------------------------------------------------------- 195 | 196 | for i = 1:num_crit 197 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 198 | .*crit_filter(i,:)'); 199 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 200 | .*crit_filter(i,:)'); 201 | end 202 | clean_energy = 10*log10(max(clean_energy,1E-10)); 203 | processed_energy = 10*log10(max(processed_energy,1E-10)); 204 | 205 | % ---------------------------------------------------------- 206 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 207 | % ---------------------------------------------------------- 208 | 209 | clean_slope = clean_energy(2:num_crit) - ... 210 | clean_energy(1:num_crit-1); 211 | processed_slope = processed_energy(2:num_crit) - ... 212 | processed_energy(1:num_crit-1); 213 | 214 | % ---------------------------------------------------------- 215 | % (5) Find the nearest peak locations in the spectra to 216 | % each critical band. If the slope is negative, we 217 | % search to the left. If positive, we search to the 218 | % right. 
219 | % ----------------------------------------------------------
220 |
221 | for i = 1:num_crit-1
222 |
223 | % find the peaks in the clean speech signal
224 |
225 | if (clean_slope(i)>0) % search to the right
226 | n = i;
227 | while ((n<num_crit) & (clean_slope(n) > 0))
228 | n = n+1;
229 | end
230 | clean_loc_peak(i) = clean_energy(n-1);
231 | else % search to the left
232 | n = i;
233 | while ((n>0) & (clean_slope(n) <= 0))
234 | n = n-1;
235 | end
236 | clean_loc_peak(i) = clean_energy(n+1);
237 | end
238 |
239 | % find the peaks in the processed speech signal
240 |
241 | if (processed_slope(i)>0) % search to the right
242 | n = i;
243 | while ((n<num_crit) & (processed_slope(n) > 0))
244 | n = n+1;
245 | end
246 | processed_loc_peak(i) = processed_energy(n-1);
247 | else % search to the left
248 | n = i;
249 | while ((n>0) & (processed_slope(n) <= 0))
250 | n = n-1;
251 | end
252 | processed_loc_peak(i) = processed_energy(n+1);
253 | end
254 |
255 | end
256 |
257 | % ----------------------------------------------------------
258 | % (6) Compute the WSS Measure for this frame. This
259 | % includes determination of the weighting function.
260 | % ----------------------------------------------------------
261 |
262 | dBMax_clean = max(clean_energy);
263 | dBMax_processed = max(processed_energy);
264 |
265 | % The weights are calculated by averaging individual
266 | % weighting factors from the clean and processed frame.
267 | % These weights W_clean and W_processed should range
268 | % from 0 to 1 and place more emphasis on spectral
269 | % peaks and less emphasis on slope differences in spectral
270 | % valleys. This procedure is described on page 1280 of
271 | % Klatt's 1982 ICASSP paper.
272 |
273 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
274 | clean_energy(1:num_crit-1));
275 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
276 | clean_energy(1:num_crit-1));
277 | W_clean = Wmax_clean .* Wlocmax_clean;
278 |
279 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
280 | processed_energy(1:num_crit-1));
281 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
282 | processed_energy(1:num_crit-1));
283 | W_processed = Wmax_processed .* Wlocmax_processed;
284 |
285 | W = (W_clean + W_processed)./2.0;
286 |
287 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
288 | processed_slope(1:num_crit-1)).^2);
289 |
290 | % this normalization is not part of Klatt's paper, but helps
291 | % to normalize the measure. Here we scale the measure by the
292 | % sum of the weights.
293 |
294 | distortion(frame_count) = distortion(frame_count)/sum(W);
295 |
296 | start = start + skiprate;
297 |
298 | end
299 |
300 |
-------------------------------------------------------------------------------- /composite.m: --------------------------------------------------------------------------------
1 | function [Csig,Cbak,Covl]= composite(cleanFile, enhancedFile);
2 | % ----------------------------------------------------------------------
3 | % Composite Objective Speech Quality Measure
4 | %
5 | % This function implements the composite objective measure proposed in
6 | % [1].
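%
% (For orientation, the mapping applied near the end of this file is a
% set of linear regressions on the basic measures, roughly:
%    Csig = 3.093 - 1.029*LLR + 0.603*PESQ - 0.009*WSS
%    Cbak = 1.634 + 0.478*PESQ - 0.007*WSS + 0.063*segSNR
%    Covl = 1.594 + 0.805*PESQ - 0.512*LLR - 0.007*WSS
% with each prediction clipped to the MOS range [1, 5]; the raw PESQ
% score is used rather than the MOS-mapped value.)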
7 | % 8 | % Usage: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % sig - predicted rating [1-5] of speech distortion 13 | % bak - predicted rating [1-5] of noise distortion 14 | % ovl - predicted rating [1-5] of overall quality 15 | % 16 | % In addition to the above ratings (sig, bak, & ovl) it returns 17 | % the individual values of the LLR, SNRseg, WSS and PESQ measures. 18 | % 19 | % Example call: [sig,bak,ovl] =composite('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures 25 | % for speech enhancement. Proc. Interspeech, Pittsburg, PA. 26 | % 27 | % Authors: Yi Hu and Philipos C. Loizou 28 | % (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John 29 | % Hansen's implementations) 30 | % 31 | % Copyright (c) 2006 by Philipos C. Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help composite\n\n'); 39 | return; 40 | end 41 | 42 | alpha= 0.95; 43 | 44 | [data1, Srate1, Nbits1]= wavread(cleanFile); 45 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 46 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) | length(data1)~=length(data2) 47 | disp(Srate1); 48 | disp(Srate2); 49 | disp(Nbits1); 50 | disp(Nbits2); 51 | disp(length(data1)); 52 | disp(length(data2)); 53 | error( 'The two files do not match!\n'); 54 | end 55 | 56 | len= min( length( data1), length( data2)); 57 | data1= data1( 1: len)+eps; 58 | data2= data2( 1: len)+eps; 59 | 60 | 61 | % -- compute the WSS measure --- 62 | % 63 | wss_dist_vec= wss( data1, data2,Srate1); 64 | wss_dist_vec= sort( wss_dist_vec); 65 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 66 | 67 | % --- compute the LLR measure --------- 68 | % 69 | LLR_dist= llr( data1, data2,Srate1); 70 | LLRs= sort(LLR_dist); 71 | LLR_len= round( length(LLR_dist)* alpha); 72 | llr_mean= mean( LLRs( 1: LLR_len)); 73 | 74 | % --- compute the SNRseg ---------------- 75 | % 76 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 77 | snr_mean= snr_dist; 78 | segSNR= mean( segsnr_dist); 79 | 80 | 81 | % -- compute the pesq ---- 82 | % 83 | % if Srate1==8000, mode='nb'; 84 | % elseif Srate1 == 16000, mode='wb'; 85 | % else, 86 | % error ('Sampling freq in PESQ needs to be 8 kHz or 16 kHz'); 87 | % end 88 | 89 | 90 | [pesq_mos_scores]= comp_pesq(cleanFile, enhancedFile); 91 | 92 | if length(pesq_mos_scores)==2 93 | pesq_mos=pesq_mos_scores(1); % take the raw PESQ value instead of the 94 | % MOS-mapped value (this composite 95 | % measure was only validated with the raw 96 | % PESQ value) 97 | else 98 | pesq_mos=pesq_mos_scores; 99 | end 100 | 101 | % --- now compute the composite measures ------------------ 102 | % 103 | Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist; 104 | Csig = max(1,Csig); Csig=min(5, Csig); % limit values to [1, 5] 105 | Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR; 106 | Cbak = max(1, Cbak); Cbak=min(5,Cbak); % limit values to [1, 5] 107 | Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist; 108 | Covl = max(1, Covl); Covl=min(5, Covl); % limit values to [1, 5] 109 | 110 | %fprintf('\n LLR=%f SNRseg=%f WSS=%f 
PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos); 111 | 112 | return; %================================================================= 113 | 114 | 115 | function distortion = wss(clean_speech, processed_speech,sample_rate) 116 | 117 | 118 | % ---------------------------------------------------------------------- 119 | % Check the length of the clean and processed speech. Must be the same. 120 | % ---------------------------------------------------------------------- 121 | 122 | clean_length = length(clean_speech); 123 | processed_length = length(processed_speech); 124 | 125 | if (clean_length ~= processed_length) 126 | disp('Error: Files musthave same length.'); 127 | return 128 | end 129 | 130 | 131 | 132 | % ---------------------------------------------------------------------- 133 | % Global Variables 134 | % ---------------------------------------------------------------------- 135 | 136 | winlength = round(30*sample_rate/1000); %240; % window length in samples 137 | skiprate = floor(winlength/4); % window skip in samples 138 | max_freq = sample_rate/2; % maximum bandwidth 139 | num_crit = 25; % number of critical bands 140 | 141 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 142 | n_fft = 2^nextpow2(2*winlength); 143 | n_fftby2 = n_fft/2; % FFT size/2 144 | Kmax = 20; % value suggested by Klatt, pg 1280 145 | Klocmax = 1; % value suggested by Klatt, pg 1280 146 | 147 | % ---------------------------------------------------------------------- 148 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 149 | % ---------------------------------------------------------------------- 150 | 151 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 152 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 153 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 154 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 155 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 156 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 157 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 158 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 159 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 160 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 161 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 162 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 163 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 164 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 165 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 166 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 167 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 168 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 169 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 170 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 171 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 172 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 173 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 174 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 175 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 176 | 177 | bw_min = bandwidth (1); % minimum critical bandwidth 178 | 179 | % ---------------------------------------------------------------------- 180 | % Set up the critical band filters. Note here that Gaussianly shaped 181 | % filters are used. Also, the sum of the filter weights are equivalent 182 | % for each critical band filter. Filter less than -30 dB and set to 183 | % zero. 
184 | % ---------------------------------------------------------------------- 185 | 186 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 187 | 188 | for i = 1:num_crit 189 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 190 | all_f0(i) = floor(f0); 191 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 192 | norm_factor = log(bw_min) - log(bandwidth(i)); 193 | j = 0:1:n_fftby2-1; 194 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 195 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 196 | end 197 | 198 | % ---------------------------------------------------------------------- 199 | % For each frame of input speech, calculate the Weighted Spectral 200 | % Slope Measure 201 | % ---------------------------------------------------------------------- 202 | 203 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 204 | start = 1; % starting sample 205 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 206 | 207 | for frame_count = 1:num_frames 208 | 209 | % ---------------------------------------------------------- 210 | % (1) Get the Frames for the test and reference speech. 211 | % Multiply by Hanning Window. 212 | % ---------------------------------------------------------- 213 | 214 | clean_frame = clean_speech(start:start+winlength-1); 215 | processed_frame = processed_speech(start:start+winlength-1); 216 | clean_frame = clean_frame.*window; 217 | processed_frame = processed_frame.*window; 218 | 219 | % ---------------------------------------------------------- 220 | % (2) Compute the Power Spectrum of Clean and Processed 221 | % ---------------------------------------------------------- 222 | 223 | if (USE_FFT_SPECTRUM) 224 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 225 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 226 | else 227 | a_vec = zeros(1,n_fft); 228 | a_vec(1:11) = lpc(clean_frame,10); 229 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 230 | 231 | a_vec = zeros(1,n_fft); 232 | a_vec(1:11) = lpc(processed_frame,10); 233 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 234 | end 235 | 236 | % ---------------------------------------------------------- 237 | % (3) Compute Filterbank Output Energies (in dB scale) 238 | % ---------------------------------------------------------- 239 | 240 | for i = 1:num_crit 241 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 242 | .*crit_filter(i,:)'); 243 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 244 | .*crit_filter(i,:)'); 245 | end 246 | clean_energy = 10*log10(max(clean_energy,1E-10)); 247 | processed_energy = 10*log10(max(processed_energy,1E-10)); 248 | 249 | % ---------------------------------------------------------- 250 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 251 | % ---------------------------------------------------------- 252 | 253 | clean_slope = clean_energy(2:num_crit) - ... 254 | clean_energy(1:num_crit-1); 255 | processed_slope = processed_energy(2:num_crit) - ... 256 | processed_energy(1:num_crit-1); 257 | 258 | % ---------------------------------------------------------- 259 | % (5) Find the nearest peak locations in the spectra to 260 | % each critical band. If the slope is negative, we 261 | % search to the left. If positive, we search to the 262 | % right. 
263 | % ----------------------------------------------------------
264 |
265 | for i = 1:num_crit-1
266 |
267 | % find the peaks in the clean speech signal
268 |
269 | if (clean_slope(i)>0) % search to the right
270 | n = i;
271 | while ((n<num_crit) & (clean_slope(n) > 0))
272 | n = n+1;
273 | end
274 | clean_loc_peak(i) = clean_energy(n-1);
275 | else % search to the left
276 | n = i;
277 | while ((n>0) & (clean_slope(n) <= 0))
278 | n = n-1;
279 | end
280 | clean_loc_peak(i) = clean_energy(n+1);
281 | end
282 |
283 | % find the peaks in the processed speech signal
284 |
285 | if (processed_slope(i)>0) % search to the right
286 | n = i;
287 | while ((n<num_crit) & (processed_slope(n) > 0))
288 | n = n+1;
289 | end
290 | processed_loc_peak(i) = processed_energy(n-1);
291 | else % search to the left
292 | n = i;
293 | while ((n>0) & (processed_slope(n) <= 0))
294 | n = n-1;
295 | end
296 | processed_loc_peak(i) = processed_energy(n+1);
297 | end
298 |
299 | end
300 |
301 | % ----------------------------------------------------------
302 | % (6) Compute the WSS Measure for this frame. This
303 | % includes determination of the weighting function.
304 | % ----------------------------------------------------------
305 |
306 | dBMax_clean = max(clean_energy);
307 | dBMax_processed = max(processed_energy);
308 |
309 | % The weights are calculated by averaging individual
310 | % weighting factors from the clean and processed frame.
311 | % These weights W_clean and W_processed should range
312 | % from 0 to 1 and place more emphasis on spectral
313 | % peaks and less emphasis on slope differences in spectral
314 | % valleys. This procedure is described on page 1280 of
315 | % Klatt's 1982 ICASSP paper.
316 |
317 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
318 | clean_energy(1:num_crit-1));
319 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
320 | clean_energy(1:num_crit-1));
321 | W_clean = Wmax_clean .* Wlocmax_clean;
322 |
323 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
324 | processed_energy(1:num_crit-1));
325 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
326 | processed_energy(1:num_crit-1));
327 | W_processed = Wmax_processed .* Wlocmax_processed;
328 |
329 | W = (W_clean + W_processed)./2.0;
330 |
331 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
332 | processed_slope(1:num_crit-1)).^2);
333 |
334 | % this normalization is not part of Klatt's paper, but helps
335 | % to normalize the measure. Here we scale the measure by the
336 | % sum of the weights.
337 |
338 | distortion(frame_count) = distortion(frame_count)/sum(W);
339 |
340 | start = start + skiprate;
341 |
342 | end
343 |
344 | %-----------------------------------------------
345 | function distortion = llr(clean_speech, processed_speech,sample_rate)
346 |
347 |
348 | % ----------------------------------------------------------------------
349 | % Check the length of the clean and processed speech. Must be the same.
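%
% (Note: unlike the standalone comp_llr.m, this local copy does not cap
% the per-frame log likelihood ratio at 2; the 95%-trimmed mean over
% frames is taken in the same way.)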
350 | % ---------------------------------------------------------------------- 351 | 352 | clean_length = length(clean_speech); 353 | processed_length = length(processed_speech); 354 | 355 | if (clean_length ~= processed_length) 356 | disp('Error: Both Speech Files must be same length.'); 357 | return 358 | end 359 | 360 | % ---------------------------------------------------------------------- 361 | % Global Variables 362 | % ---------------------------------------------------------------------- 363 | 364 | winlength = round(30*sample_rate/1000); % window length in samples 365 | skiprate = floor(winlength/4); % window skip in samples 366 | if sample_rate<10000 367 | P = 10; % LPC Analysis Order 368 | else 369 | P=16; % this could vary depending on sampling frequency. 370 | end 371 | 372 | % ---------------------------------------------------------------------- 373 | % For each frame of input speech, calculate the Log Likelihood Ratio 374 | % ---------------------------------------------------------------------- 375 | 376 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 377 | start = 1; % starting sample 378 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 379 | 380 | for frame_count = 1:num_frames 381 | 382 | % ---------------------------------------------------------- 383 | % (1) Get the Frames for the test and reference speech. 384 | % Multiply by Hanning Window. 385 | % ---------------------------------------------------------- 386 | 387 | clean_frame = clean_speech(start:start+winlength-1); 388 | processed_frame = processed_speech(start:start+winlength-1); 389 | clean_frame = clean_frame.*window; 390 | processed_frame = processed_frame.*window; 391 | 392 | % ---------------------------------------------------------- 393 | % (2) Get the autocorrelation lags and LPC parameters used 394 | % to compute the LLR measure. 395 | % ---------------------------------------------------------- 396 | 397 | [R_clean, Ref_clean, A_clean] = ... 398 | lpcoeff(clean_frame, P); 399 | [R_processed, Ref_processed, A_processed] = ... 400 | lpcoeff(processed_frame, P); 401 | 402 | % ---------------------------------------------------------- 403 | % (3) Compute the LLR measure 404 | % ---------------------------------------------------------- 405 | 406 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 407 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 408 | distortion(frame_count) = log(numerator/denominator); 409 | start = start + skiprate; 410 | 411 | end 412 | 413 | %--------------------------------------------- 414 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 415 | 416 | % ---------------------------------------------------------- 417 | % (1) Compute Autocorrelation Lags 418 | % ---------------------------------------------------------- 419 | 420 | winlength = max(size(speech_frame)); 421 | for k=1:model_order+1 422 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
423 | .*speech_frame(k:winlength)); 424 | end 425 | 426 | % ---------------------------------------------------------- 427 | % (2) Levinson-Durbin 428 | % ---------------------------------------------------------- 429 | 430 | a = ones(1,model_order); 431 | E(1)=R(1); 432 | for i=1:model_order 433 | a_past(1:i-1) = a(1:i-1); 434 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 435 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 436 | a(i)=rcoeff(i); 437 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 438 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 439 | end 440 | 441 | acorr = R; 442 | refcoeff = rcoeff; 443 | lpparams = [1 -a]; 444 | 445 | 446 | % ---------------------------------------------------------------------- 447 | 448 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 449 | 450 | % ---------------------------------------------------------------------- 451 | % Check the length of the clean and processed speech. Must be the same. 452 | % ---------------------------------------------------------------------- 453 | 454 | clean_length = length(clean_speech); 455 | processed_length = length(processed_speech); 456 | 457 | if (clean_length ~= processed_length) 458 | disp('Error: Both Speech Files must be same length.'); 459 | return 460 | end 461 | 462 | % ---------------------------------------------------------------------- 463 | % Scale both clean speech and processed speech to have same dynamic 464 | % range. Also remove DC component from each signal 465 | % ---------------------------------------------------------------------- 466 | 467 | %clean_speech = clean_speech - mean(clean_speech); 468 | %processed_speech = processed_speech - mean(processed_speech); 469 | 470 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 471 | 472 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 473 | 474 | % ---------------------------------------------------------------------- 475 | % Global Variables 476 | % ---------------------------------------------------------------------- 477 | 478 | winlength = round(30*sample_rate/1000); %240; % window length in samples 479 | skiprate = floor(winlength/4); % window skip in samples 480 | MIN_SNR = -10; % minimum SNR in dB 481 | MAX_SNR = 35; % maximum SNR in dB 482 | 483 | % ---------------------------------------------------------------------- 484 | % For each frame of input speech, calculate the Segmental SNR 485 | % ---------------------------------------------------------------------- 486 | 487 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 488 | start = 1; % starting sample 489 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 490 | 491 | for frame_count = 1: num_frames 492 | 493 | % ---------------------------------------------------------- 494 | % (1) Get the Frames for the test and reference speech. 495 | % Multiply by Hanning Window. 
496 | % ---------------------------------------------------------- 497 | 498 | clean_frame = clean_speech(start:start+winlength-1); 499 | processed_frame = processed_speech(start:start+winlength-1); 500 | clean_frame = clean_frame.*window; 501 | processed_frame = processed_frame.*window; 502 | 503 | % ---------------------------------------------------------- 504 | % (2) Compute the Segmental SNR 505 | % ---------------------------------------------------------- 506 | 507 | signal_energy = sum(clean_frame.^2); 508 | noise_energy = sum((clean_frame-processed_frame).^2); 509 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 510 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 511 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 512 | 513 | start = start + skiprate; 514 | 515 | end 516 | 517 | 518 | 519 | -------------------------------------------------------------------------------- /enhanced.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/enhanced.wav -------------------------------------------------------------------------------- /estoi.m: -------------------------------------------------------------------------------- 1 | function d = estoi(x, y, fs_signal) 2 | % % % % 3 | % from https://github.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/blob/master/estoi.m 4 | % % % % 5 | % d = estoi(x, y, fs_signal) returns the output of the extended short-time 6 | % objective intelligibility (ESTOI) predictor. 7 | % 8 | % Implementation of the Extended Short-Time Objective 9 | % Intelligibility (ESTOI) predictor, described in Jesper Jensen and 10 | % Cees H. Taal, "An Algorithm for Predicting the Intelligibility of 11 | % Speech Masked by Modulated Noise Maskers," IEEE Transactions on 12 | % Audio, Speech and Language Processing, 2016. 13 | % 14 | % Input: 15 | % x: clean reference time domain signal 16 | % y: noisy/processed time domain signal 17 | % fs_signal: sampling rate [Hz] 18 | % 19 | % Output: 20 | % d: intelligibility index 21 | % 22 | % 23 | % Copyright 2016: Aalborg University, Section for Signal and Information Processing. 24 | % The software is free for non-commercial use. 25 | % The software comes WITHOUT ANY WARRANTY. 26 | 27 | 28 | if length(x)~=length(y) 29 | error('x and y should have the same length'); 30 | end 31 | 32 | % initialization 33 | x = x(:); % clean speech column vector 34 | y = y(:); % processed speech column vector 35 | 36 | fs = 10000; % sample rate of proposed intelligibility measure 37 | N_frame = 256; % window support 38 | K = 512; % FFT size 39 | J = 15; % Number of 1/3 octave bands 40 | mn = 150; % Center frequency of first 1/3 octave band in Hz. 
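% (With these settings the 15 one-third-octave bands have center
% frequencies cf = 150*2.^((0:14)/3), i.e. roughly 150 Hz up to about
% 3.8 kHz. The steps below: resample to 10 kHz, drop silent frames,
% take 256-sample Hanning-windowed DFTs with 50% overlap, form band
% envelopes, and for every 30-frame segment mean/variance-normalize the
% rows and then the columns before correlating clean and processed
% segments; the output d is the average of these correlations.)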
41 | [H,fc_thirdoct] = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix 42 | N = 30; % Number of frames for intermediate intelligibility measure 43 | dyn_range = 40; % speech dynamic range 44 | 45 | % resample signals if other samplerate is used than fs 46 | if fs_signal ~= fs 47 | x = resample(x, fs, fs_signal); 48 | y = resample(y, fs, fs_signal); 49 | end 50 | 51 | % remove silent frames 52 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2); 53 | 54 | % apply 1/3 octave band TF-decomposition 55 | x_hat = stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech 56 | y_hat = stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech 57 | 58 | 59 | x_hat = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum 60 | y_hat = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum 61 | 62 | X = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation 63 | Y = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation 64 | 65 | for i = 1:size(x_hat, 2) 66 | X(:, i) = sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave band filtering 67 | Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2); 68 | end 69 | 70 | % loop all segments of length N and obtain intermediate intelligibility measure for each 71 | d1 = zeros(length(N:size(X, 2)),1); % init memory for intermediate intelligibility measure 72 | for m=N:size(X,2) 73 | X_seg = X(:, (m-N+1):m); % region of length N with clean TF-units for all j 74 | Y_seg = Y(:, (m-N+1):m); % region of length N with processed TF-units for all j 75 | X_seg = X_seg + eps*randn(size(X_seg)); % to avoid divide by zero 76 | Y_seg = Y_seg + eps*randn(size(Y_seg)); % to avoid divide by zero 77 | 78 | %% first normalize rows (to give \bar{S}_m) 79 | XX = X_seg - mean(X_seg.').'*ones(1,N); % normalize rows to zero mean 80 | YY = Y_seg - mean(Y_seg.').'*ones(1,N); % normalize rows to zero mean 81 | 82 | YY = diag(1./sqrt(diag(YY*YY')))*YY; % normalize rows to unit length 83 | XX = diag(1./sqrt(diag(XX*XX')))*XX; % normalize rows to unit length 84 | 85 | XX = XX + eps*randn(size(XX)); % to avoid corr.div.by.0 86 | YY = YY + eps*randn(size(YY)); % to avoid corr.div.by.0 87 | 88 | %% then normalize columns (to give \check{S}_m) 89 | YYY = YY - ones(J,1)*mean(YY); % normalize cols to zero mean 90 | XXX = XX - ones(J,1)*mean(XX); % normalize cols to zero mean 91 | 92 | YYY = YYY*diag(1./sqrt(diag(YYY'*YYY))); % normalize cols to unit length 93 | XXX = XXX*diag(1./sqrt(diag(XXX'*XXX))); % normalize cols to unit length 94 | 95 | %compute average of col.correlations (by stacking cols) 96 | d1(m-N+1) = 1/N*XXX(:).'*YYY(:); 97 | end 98 | d = mean(d1); 99 | 100 | 101 | %% 102 | function [A cf] = thirdoct(fs, N_fft, numBands, mn) 103 | % [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix 104 | % inputs: 105 | % FS: samplerate 106 | % N_FFT: FFT size 107 | % NUMBANDS: number of bands 108 | % MN: center frequency of first 1/3 octave band 109 | % outputs: 110 | % A: octave band matrix 111 | % CF: center frequencies 112 | 113 | f = linspace(0, fs, N_fft+1); 114 | f = f(1:(N_fft/2+1)); 115 | k = 0:(numBands-1); 116 | cf = 2.^(k/3)*mn; 117 | fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn); 118 | fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn); 119 | A = zeros(numBands, length(f)); 120 | 121 | for i = 1:(length(cf)) 122 | [a b] = min((f-fl(i)).^2); 123 | fl(i) = f(b); 124 | fl_ii = b; 125 | 126 | [a b] = min((f-fr(i)).^2); 127 | fr(i) = f(b); 128 | fr_ii = b; 129 | 
A(i,fl_ii:(fr_ii-1)) = 1; 130 | end 131 | 132 | rnk = sum(A, 2); 133 | numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1; 134 | A = A(1:numBands, :); 135 | cf = cf(1:numBands); 136 | 137 | %% 138 | function x_stdft = stdft(x, N, K, N_fft) 139 | % X_STDFT = X_STDFT(X, N, K, N_FFT) returns the short-time 140 | % hanning-windowed dft of X with frame-size N, overlap K and DFT size 141 | % N_FFT. The columns and rows of X_STDFT denote the frame-index and 142 | % dft-bin index, respectively. 143 | 144 | frames = 1:K:(length(x)-N); 145 | x_stdft = zeros(length(frames), N_fft); 146 | 147 | w = hanning(N); 148 | x = x(:); 149 | 150 | for i = 1:length(frames) 151 | ii = frames(i):(frames(i)+N-1); 152 | x_stdft(i, :) = fft(x(ii).*w, N_fft); 153 | end 154 | 155 | %% 156 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K) 157 | % [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y 158 | % are segmented with frame-length N and overlap K, where the maximum energy 159 | % of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are the 160 | % reconstructed signals, excluding the frames, where the energy of a frame 161 | % of X is smaller than X_MAX-RANGE 162 | 163 | x = x(:); 164 | y = y(:); 165 | 166 | frames = 1:K:(length(x)-N); 167 | w = hanning(N); 168 | msk = zeros(size(frames)); 169 | 170 | for j = 1:length(frames) 171 | jj = frames(j):(frames(j)+N-1); 172 | msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N)); 173 | end 174 | 175 | msk = (msk-max(msk)+range)>0; 176 | count = 1; 177 | 178 | x_sil = zeros(size(x)); 179 | y_sil = zeros(size(y)); 180 | 181 | for j = 1:length(frames) 182 | if msk(j) 183 | jj_i = frames(j):(frames(j)+N-1); 184 | jj_o = frames(count):(frames(count)+N-1); 185 | x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w; 186 | y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w; 187 | count = count+1; 188 | end 189 | end 190 | 191 | x_sil = x_sil(1:jj_o(end)); 192 | y_sil = y_sil(1:jj_o(end)); -------------------------------------------------------------------------------- /evaluate_all.m: -------------------------------------------------------------------------------- 1 | % get CSIG, CBAK, CVOL, PESQ, SegSNR from two dir list 2 | % [CSIG, CBAK, CVOL, PESQ, SegSNR] = evaluate_all(ref_dir, deg_dir) 3 | 4 | function [Csig, Cbak, Cvol, pesq, SNR, SegSNR] = evaluate_all(ref_dir, deg_dir) 5 | ref_folder = dir(fullfile(ref_dir,'*.wav')); 6 | deg_folder = dir(fullfile(deg_dir,'*.wav')); 7 | ref_names = {ref_folder.name}; 8 | deg_names = {deg_folder.name}; 9 | ref_names = sort(ref_names); 10 | deg_names = sort(deg_names); 11 | disp(ref_names(1:5)); 12 | disp(deg_names(1:5)); 13 | n_refs = length(ref_names); 14 | n_degs = length(deg_names); 15 | assert(n_refs == n_degs, 'n_refs != n_degs'); 16 | csigs = zeros(1, n_refs); 17 | cbaks = zeros(1, n_refs); 18 | cvols = zeros(1, n_refs); 19 | pesqs = zeros(1, n_refs); 20 | snrs = zeros(1, n_refs); 21 | segsnrs = zeros(1,n_refs); 22 | for idx = 1:n_refs 23 | ref_names(idx) = strcat(ref_dir, '/', ref_names(idx)); 24 | deg_names(idx) = strcat(deg_dir, '/', deg_names(idx)); 25 | % disp(ref_names(idx)); 26 | % disp(deg_names(idx)); 27 | ref_file = char(ref_names(idx)); 28 | deg_file = char(deg_names(idx)); 29 | [csig, cbak, cvol] = composite(ref_file, deg_file); 30 | pesq_ = comp_pesq(ref_file, deg_file); 31 | [snr, segsnr] = comp_snr(ref_file, deg_file); 32 | csigs(idx) = csig; 33 | cbaks(idx) = cbak; 34 | cvols(idx) = cvol; 35 | pesqs(idx) = pesq_(1); 36 | snrs(idx) = snr; 37 | segsnrs(idx) = segsnr; 38 | % 
disp(strcat(ref_names(idx),'\n')) 39 | fprintf('\n idx=%d csig=%f cbak=%f cvol=%f pesq=%f snr=%f ssnr=%f\n',idx,csig,cbak,cvol,pesq_(1),snr,segsnr); 40 | end; 41 | 42 | Csig = mean(csigs); 43 | Cbak = mean(cbaks); 44 | Cvol = mean(cvols); 45 | pesq = mean(pesqs); 46 | SNR = mean(snrs); 47 | SegSNR = mean(segsnrs); 48 | end 49 | -------------------------------------------------------------------------------- /pesq.ubuntu16.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/pesq.ubuntu16.bin -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | python version:https://github.com/IMLHF/PHASEN-PyTorch/blob/master/phasen_torch/sepm.py 2 | 3 | This folder contains implementations of objective quality measures 4 | (Chapter 11): 5 | 6 | MATLAB file Description Reference 7 | ----------------------------------------------------------------------------------- 8 | comp_snr.m Overall and segmental SNR [1] 9 | 10 | comp_wss.m Weighted-spectral slope metric [2] 11 | 12 | comp_llr.m Likelihood-ratio measure [3] 13 | 14 | comp_is.m Itakura-Saito measure [3] 15 | comp_cep.m Cepstral distance measure [4] 16 | comp_fwseg Freq. weighted segm. SNR (fwSNRseg) [5],Chap 11 17 | 18 | comp_fwseg_variant Frequency-variant fwSNRseg measure Chap 11 19 | 20 | comp_fwseg_mars Frequency variant fwSNRseg measure Chap 11 21 | based on MARS analysis 22 | 23 | comp_pesq.m PESQ measure (narrowband) ITU-T P.862 [6] 24 | PESQ measure (wideband) ITU-T P.862.2 [7] 25 | 26 | composite.m A composite measure [8] 27 | 28 | 29 | addnoise_asl.m Adds noise to the clean signal at specified SNR 30 | based on active speech level. [9] 31 | 32 | --------------------------------------------------------------------------------- 33 | ## USAGE 34 | 35 | [snr_mean, segsnr_mean]= compSNR(cleanFile.wav, enhdFile.wav); 36 | % where 'snr_mean' is the global overall SNR and 'segsnr_mean' is the segmental SNR. 37 | 38 | wss_mean = comp_wss(cleanFile.wav, enhancedFile.wav); 39 | 40 | llr_mean= comp_llr(cleanFile.wav, enhancedFile.wav); 41 | 42 | is_mean = comp_is(cleanFile.wav, enhancedFile.wav); 43 | 44 | cep_mean = comp_cep(cleanFile.wav, enhancedFile.wav); 45 | 46 | fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav); 47 | 48 | [SIG,BAK,OVL] = comp_fwseg_variant(cleanFile.wav, enhancedFile.wav); 49 | % where 'SIG' is the predicted rating of speech distortion, 50 | % 'BAK' is the predicted rating of background noise distortion, 51 | % 'OVL' is the predicted rating of overall quality. 52 | 53 | [SIG,BAK,OVL] = comp_fwseg_mars(cleanFile.wav, enhancedFile.wav); 54 | 55 | pesq_val = comp_pesq(cleanFile.wav, enhancedFile.wav); 56 | % Only sampling frequencies of 8000 Hz or 16000 Hz are supported. 57 | 58 | [Csig,Cbak,Covl] = composite(cleanFile.wav, enhancedFile.wav); 59 | % where 'Csig' is the predicted rating of speech distortion, 60 | % 'Cbak' is the predicted rating of background noise distortion, 61 | % 'Covl' is the predicted rating of overall quality. 62 | 63 | addnoise_asl(cleanfile.wav, noisefile.wav, outfile.wav, SNRlevel) 64 | 65 | --------------------------------------------------------------------------- 66 | 67 | ## REFERENCES: 68 | 69 | [1] Hansen, J. and Pellom, B. (1998). An effective quality evaluation 70 | protocol for speech enhancement algorithms. Inter. Conf. 
65 | ---------------------------------------------------------------------------
66 |
67 | ## REFERENCES:
68 |
69 | [1] Hansen, J. and Pellom, B. (1998). An effective quality evaluation
70 | protocol for speech enhancement algorithms. Proc. Int. Conf. on Spoken
71 | Language Processing, vol. 7, pp. 2819-2822.
72 |
73 | [2] Klatt, D. (1982). Prediction of perceived phonetic distance from
74 | critical band spectra. Proc. IEEE Int. Conf. Acoust., Speech,
75 | Signal Processing, 7, 1278-1281.
76 |
77 | [3] Quackenbush, S., Barnwell, T., and Clements, M. (1988). Objective
78 | measures of speech quality. Englewood Cliffs, NJ: Prentice-Hall.
79 |
80 | [4] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
81 | evaluation for low bit-rate speech coding systems. IEEE J. Select.
82 | Areas in Comm., 6(2), 262-273.
83 |
84 | [5] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
85 | A study of complexity and quality of speech waveform coders. Proc.
86 | IEEE Int. Conf. Acoust., Speech, Signal Processing, 586-590.
87 |
88 | [6] ITU (2000). Perceptual evaluation of speech quality (PESQ), an
89 | objective method for end-to-end speech quality assessment of
90 | narrowband telephone networks and speech codecs. ITU-T
91 | Recommendation P.862
92 |
93 | [7] ITU (2007). Wideband extension to Recommendation P.862 for the
94 | assessment of wideband telephone networks and speech codecs. ITU-T
95 | Recommendation P.862.2
96 |
97 | [8] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
98 | for speech enhancement. Proc. Interspeech
99 |
100 | [9] ITU-T (1993). Objective measurement of active speech level. ITU-T
101 | Recommendation P.56
102 |
103 |
104 | Copyright (c) 2012 by Philipos C. Loizou
105 |
106 | Revision: 1.0, Date: 05/14/2012
107 |
108 | ------------------------------------------------------------------------------
109 |
--------------------------------------------------------------------------------
/readme.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/readme.pdf
--------------------------------------------------------------------------------
/sp04.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/sp04.wav
--------------------------------------------------------------------------------
/sp04_babble_sn10.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/sp04_babble_sn10.wav
--------------------------------------------------------------------------------
/stoi.m:
--------------------------------------------------------------------------------
1 | function d = stoi(x, y, fs_signal)
2 | % from https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/master/stoi.m
3 | % d = stoi(x, y, fs_signal) returns the output of the short-time
4 | % objective intelligibility (STOI) measure described in [1, 2], where x
5 | % and y denote the clean and processed speech, respectively, with sample
6 | % rate fs_signal in Hz. The output d is expected to have a monotonic
7 | % relation with subjective speech intelligibility, where a higher d
8 | % denotes more intelligible speech. See [1, 2] for more details.
9 | %
10 | % References:
11 | % [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
12 | % Objective Intelligibility Measure for Time-Frequency Weighted Noisy
13 | % Speech', ICASSP 2010, Dallas, Texas.
14 | %
15 | % [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
16 | % Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
17 | % IEEE Transactions on Audio, Speech, and Language Processing, 2011.
18 | %
19 | %
20 | % Copyright 2009: Delft University of Technology, Signal & Information
21 | % Processing Lab. The software is free for non-commercial use. This program
22 | % comes WITHOUT ANY WARRANTY.
23 | %
24 | %
25 | %
26 | % Updates:
27 | % 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'
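% Example call (editor's sketch; the file names are placeholders for any
% pair of equal-length, time-aligned clean/processed signals sharing one
% sample rate):
%   [x, fs] = audioread('sp04.wav');      % clean reference
%   [y, ~]  = audioread('enhanced.wav');  % processed/enhanced signal
%   d = stoi(x, y, fs);                   % higher d ~ more intelligible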
28 |
29 | if length(x)~=length(y)
30 |     error('x and y should have the same length');
31 | end
32 |
33 | % initialization
34 | x = x(:); % clean speech column vector
35 | y = y(:); % processed speech column vector
36 |
37 | fs = 10000; % sample rate of proposed intelligibility measure
38 | N_frame = 256; % window support
39 | K = 512; % FFT size
40 | J = 15; % Number of 1/3 octave bands
41 | mn = 150; % Center frequency of first 1/3 octave band in Hz.
42 | H = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix
43 | N = 30; % Number of frames for intermediate intelligibility measure (Length analysis window)
44 | Beta = -15; % lower SDR-bound
45 | dyn_range = 40; % speech dynamic range
46 |
47 | % resample signals if a sample rate other than fs is used
48 | if fs_signal ~= fs
49 |     x = resample(x, fs, fs_signal);
50 |     y = resample(y, fs, fs_signal);
51 | end
52 |
53 | % remove silent frames
54 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
55 |
56 | % apply 1/3 octave band TF-decomposition
57 | x_hat = stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech
58 | y_hat = stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech
59 |
60 | x_hat = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum
61 | y_hat = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum
62 |
63 | X = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation
64 | Y = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation
65 |
66 | for i = 1:size(x_hat, 2)
67 |     X(:, i) = sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave bands as described in Eq.(1) [1]
68 |     Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2);
69 | end
70 |
71 | % loop over all segments of length N and obtain intermediate intelligibility measure for all TF-regions
72 | d_interm = zeros(J, length(N:size(X, 2))); % init memory for intermediate intelligibility measure
73 | c = 10^(-Beta/20); % constant for clipping procedure
74 |
75 | for m = N:size(X, 2)
76 |     X_seg = X(:, (m-N+1):m); % region with length N of clean TF-units for all j
77 |     Y_seg = Y(:, (m-N+1):m); % region with length N of processed TF-units for all j
78 |     alpha = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2)); % obtain scale factor for normalizing processed TF-region for all j
79 |     aY_seg = Y_seg.*repmat(alpha, [1 N]); % obtain \alpha*Y_j(n) from Eq.(2) [1]
80 |     for j = 1:J
81 |         Y_prime = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c); % apply clipping from Eq.(3)
82 |         d_interm(j, m-N+1) = taa_corr(X_seg(j, :).', Y_prime(:)); % obtain correlation coefficient from Eq.(4) [1]
83 |     end
84 | end
85 |
86 | d = mean(d_interm(:)); % combine all intermediate intelligibility measures as in Eq.(4) [1]
87 |
88 | %%
89 | function [A cf] = thirdoct(fs, N_fft, numBands, mn)
90 | % [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
91 | % inputs:
92 | % FS: sample rate
93 | % N_FFT: FFT size
94 | % NUMBANDS: number of bands
95 | % MN: center frequency of first 1/3 octave band
96 | % outputs:
97 | % A: octave band matrix
98 | % CF: center frequencies
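% Example (editor's sketch): with the values used by stoi above, i.e.
%   [A, cf] = thirdoct(10000, 512, 15, 150);
% A should come out as a 15 x 257 zero/one matrix (one row per band, one
% column per single-sided DFT bin) and cf as the 15 band centers in Hz.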
99 |
100 | f = linspace(0, fs, N_fft+1);
101 | f = f(1:(N_fft/2+1));
102 | k = 0:(numBands-1);
103 | cf = 2.^(k/3)*mn;
104 | fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);
105 | fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);
106 | A = zeros(numBands, length(f));
107 |
108 | for i = 1:(length(cf))
109 |     [a b] = min((f-fl(i)).^2);
110 |     fl(i) = f(b);
111 |     fl_ii = b;
112 |
113 |     [a b] = min((f-fr(i)).^2);
114 |     fr(i) = f(b);
115 |     fr_ii = b;
116 |     A(i,fl_ii:(fr_ii-1)) = 1;
117 | end
118 |
119 | rnk = sum(A, 2);
120 | numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
121 | A = A(1:numBands, :);
122 | cf = cf(1:numBands);
123 |
124 | %%
125 | function x_stdft = stdft(x, N, K, N_fft)
126 | % X_STDFT = X_STDFT(X, N, K, N_FFT) returns the short-time
127 | % hanning-windowed dft of X with frame-size N, overlap K and DFT size
128 | % N_FFT. The rows and columns of X_STDFT denote the frame index and
129 | % DFT-bin index, respectively.
130 |
131 | frames = 1:K:(length(x)-N);
132 | x_stdft = zeros(length(frames), N_fft);
133 |
134 | w = hanning(N);
135 | x = x(:);
136 |
137 | for i = 1:length(frames)
138 |     ii = frames(i):(frames(i)+N-1);
139 |     x_stdft(i, :) = fft(x(ii).*w, N_fft);
140 | end
141 |
142 | %%
143 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
144 | % [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
145 | % are segmented with frame-length N and overlap K, where the maximum energy
146 | % of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are the
147 | % reconstructed signals, excluding the frames where the energy of a frame
148 | % of X is smaller than X_MAX-RANGE
149 |
150 | x = x(:);
151 | y = y(:);
152 |
153 | frames = 1:K:(length(x)-N);
154 | w = hanning(N);
155 | msk = zeros(size(frames));
156 |
157 | for j = 1:length(frames)
158 |     jj = frames(j):(frames(j)+N-1);
159 |     msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N));
160 | end
161 |
162 | msk = (msk-max(msk)+range)>0;
163 | count = 1;
164 |
165 | x_sil = zeros(size(x));
166 | y_sil = zeros(size(y));
167 |
168 | for j = 1:length(frames)
169 |     if msk(j)
170 |         jj_i = frames(j):(frames(j)+N-1);
171 |         jj_o = frames(count):(frames(count)+N-1);
172 |         x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w;
173 |         y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w;
174 |         count = count+1;
175 |     end
176 | end
177 |
178 | x_sil = x_sil(1:jj_o(end));
179 | y_sil = y_sil(1:jj_o(end));
180 |
181 | %%
182 | function rho = taa_corr(x, y)
183 | % RHO = TAA_CORR(X, Y) Returns correlation coefficient between column
184 | % vectors x and y. Gives same results as 'corr' from statistics toolbox.
185 | xn = x-mean(x);
186 | xn = xn/sqrt(sum(xn.^2));
187 | yn = y-mean(y);
188 | yn = yn/sqrt(sum(yn.^2));
189 | rho = sum(xn.*yn);
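% Editor's note (sketch): for column vectors this reproduces the Pearson
% correlation coefficient, e.g. taa_corr([1; 2; 3], [2; 4; 7]) is ~0.9934,
% the same value returned by corr([1; 2; 3], [2; 4; 7]).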
--------------------------------------------------------------------------------
/toserver.sh:
--------------------------------------------------------------------------------
1 | # Usage: ./toserver.sh <user>@<site-or-port> <remote-dir-name>, e.g. ./toserver.sh room@15123 pc_RealIRM_RelativeLossAFD500
2 |
3 | if [ -z "$1" ] || [ -z "$2" ]; then
4 |   echo "Need a destination."
5 |   exit 1
6 | fi
7 | site=${1#*@}
8 | user=${1%@*}
9 | rm _data _log -rf
10 | rm *__pycache__* -rf
11 | rm */__pycache__* -rf
12 | # mv exp ../
13 | # # scp -r -P 15044 ./* xxxx@speaker.is99kdf.xyz:~/lhf/work/irm_test/extract_tfrecord
14 | # scp -r -P 15043 ./* xx@speaker.is99kdf.xyz:~/work/speech_en_test/c001_se
15 |
16 | # mv ../exp ./
17 |
18 | if [ "$site" == "p40" ]; then
19 |   echo "To $user@$site:/home/zhangwenbo5/lihongfeng/$2"
20 |   rsync -avh -e "ssh -p 22 -o ProxyCommand='ssh -p 8695 zhangwenbo5@120.92.114.84 -W %h:%p'" --exclude-from='.gitignore' ./* zhangwenbo5@ksai-P40-2:/home/zhangwenbo5/lihongfeng/$2
21 | elif [ "$site" == "v100-3" ]; then
22 |   echo "To $user@$site:/home/zhangwenbo5/lihongfeng/$2"
23 |   rsync -avh -e "ssh -p 22 -o ProxyCommand='ssh -p 8695 zhangwenbo5@120.92.114.84 -W %h:%p'" --exclude-from='.gitignore' ./* zhangwenbo5@ksai-v100-3:/home/zhangwenbo5/lihongfeng/$2
24 | elif [ "$site" == "15123" ] || [ "$site" == "15041" ] || [ "$site" == "15043" ]; then
25 |   echo "To $user@$site:~/worklhf/$2"
26 |   rsync -avh -e 'ssh -p '$site --exclude-from='.gitignore' ./* $user@speaker.is99kdf.xyz:~/worklhf/$2
27 | fi
28 | # -a : recurse into directories, i.e. copy all files and subdirectories; also enables archive mode and every option it implies (equivalent to -rlptgoD)
29 | # -v : verbose output
30 | # -e ssh : use ssh as the remote shell, so everything is encrypted
31 | # --exclude='*.out' : exclude files matching the pattern, e.g. *.out or *.c
32 |
33 | # scp -r -P 15043 xx@speaker.is99kdf.xyz:/home/xx/work/paper_se_test/pc001_se/exp/rnn_speech_enhancement/nnet_C001/nnet_iter15* ./
34 | # scp -P 15223 xx@speaker.is99kdf.xyz:/fast/worklhf/paper_se_test/C_UNIGRU_RealPSM_RelativeLossAFD100/exp/rnn_speech_enhancement/nnet_C_UNIGRU_RealPSM_RelativeLossAFD100/nnet_iter25* ./
35 |
--------------------------------------------------------------------------------
/wavread.m:
--------------------------------------------------------------------------------
1 | function [ ref_data, ref_sampling_rate, nbits ] = wavread( ref_wav )
2 | [ ref_data, ref_sampling_rate ] = audioread( ref_wav );
3 | info = audioinfo(ref_wav);
4 | nbits = info.BitsPerSample;
5 | end
6 |
--------------------------------------------------------------------------------
/white_noise.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/white_noise.wav
--------------------------------------------------------------------------------