├── .gitignore
├── addnoise_asl.m
├── comp_cep.m
├── comp_fwseg.m
├── comp_fwseg_mars.m
├── comp_fwseg_variant.m
├── comp_is.m
├── comp_llr.m
├── comp_pesq.m
├── comp_snr.m
├── comp_wss.m
├── composite.m
├── enhanced.wav
├── estoi.m
├── evaluate_all.m
├── pesq.ubuntu16.bin
├── readme.md
├── readme.pdf
├── sp04.wav
├── sp04_babble_sn10.wav
├── stoi.m
├── toserver.sh
├── wavread.m
└── white_noise.wav

/.gitignore:
--------------------------------------------------------------------------------
1 | .git
--------------------------------------------------------------------------------
/addnoise_asl.m:
--------------------------------------------------------------------------------
1 | function addnoise_asl(cleanfile, noisefile, outfile, snr)
2 | % ----------------------------------------------------------------------
3 | % This function adds noise to a file at a specified SNR level. It uses
4 | % the active speech level to compute the speech energy. The
5 | % active speech level is computed as per ITU-T P.56 standard [1].
6 | %
7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR)
8 | %
9 | % cleanFile.wav - clean input file in .wav format
10 | % noiseFile.wav - file containing the noise signal in .wav format
11 | % noisyFile.wav - resulting noisy file
12 | % SNR - desired SNR in dB
13 | %
14 | % Note that if the variable IRS below (line 38) is set to 1, then it applies the IRS
15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default IRS
16 | % value is 0, i.e., no IRS filtering is applied.
17 | %
18 | % Example call:
19 | % addnoise_asl('sp04.wav','white_noise.wav','sp04_white_5db.wav',5);
20 | %
21 | %
22 | % References:
23 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T
24 | % Recommendation P.56
25 | %
26 | % Author: Yi Hu and Philipos C. Loizou
27 | %
28 | % Copyright (c) 2006 by Philipos C. Loizou
29 | % $Revision: 0.0 $ $Date: 10/09/2006 $
30 | % ----------------------------------------------------------------------
31 |
32 | if nargin ~=4
33 | fprintf('USAGE: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) \n');
34 | fprintf('For more help, type: help addnoise_asl\n\n');
35 | return;
36 | end
37 |
38 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz - 3.2 kHz)
39 |
40 | % wavread gives floating point column data
41 | [clean, srate, nbits]= wavread(cleanfile);
42 | % filter clean speech with irs filter
43 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end;
44 |
45 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits);
46 | % Px is the active speech level ms energy, asl is the active factor, and c0
47 | % is the active speech level threshold.
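% (Added note, not part of the original file:) asl_P56 further down is a
% subfunction, so it is only visible inside addnoise_asl.m. If it were copied
% into its own asl_P56.m, the P.56 active speech level of a recording could be
% inspected directly; a minimal sketch, assuming the repo's wavread.m shim:
%   [s, fs, nb] = wavread('sp04.wav');
%   [Px, act, c0] = asl_P56(s, fs, nb);
%   fprintf('active-level energy %g, activity factor %4.2f\n', Px, act);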
48 | 49 | 50 | x=clean; 51 | x_len= length( x); % length of speech signal 52 | 53 | [noise, srate1, nbits1]= wavread( noisefile); 54 | if (srate1~= srate)| (nbits1~= nbits) 55 | error( 'the formats of the two files dont match!'); 56 | end 57 | noise_len= length( noise); 58 | if (noise_len<= x_len) 59 | error( 'the noise length has to be greater than speech length!'); 60 | end 61 | 62 | rand_start_limit= noise_len- x_len+ 1; 63 | % the start of the noise segment can vary between [1 rand_start_limit] 64 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 65 | % random start of the noise segment 66 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 67 | 68 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 69 | 70 | % this is the randomly selected noise segment that will be added to the 71 | % clean speech x 72 | Pn= noise_segment'* noise_segment/ x_len; 73 | % we need to scale the noise segment samples to obtain the desired snr= 10* 74 | % log10( Px/ (sf^2 * Pn)) 75 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 76 | noise_segment= noise_segment * sf; 77 | 78 | noisy = x+ noise_segment; 79 | 80 | if ( (max( noisy)>= 1) | (min( noisy)< -1)) 81 | error( 'Overflow occurred!\n'); 82 | end; 83 | 84 | 85 | wavwrite( noisy, srate, nbits, outfile); 86 | 87 | fprintf( 1, '\n NOTE: For comparison, the SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 88 | (noise_segment'*noise_segment))); 89 | 90 | 91 | %------------------------------------------------------------------------ 92 | function data_filtered= apply_IRS( data, Fs, nbits); 93 | 94 | n= length( data); 95 | 96 | % now find the next power of 2 which is greater or equal to n 97 | pow_of_2= 2^ (ceil( log2( n))); 98 | 99 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 100 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 101 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 102 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 103 | 104 | [number_of_points, trivial]= size( align_filter_dB); 105 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 106 | 1000); 107 | 108 | x= zeros( 1, pow_of_2); 109 | x( 1: n)= data; 110 | 111 | x_fft= fft( x, pow_of_2); 112 | 113 | freq_resolution= Fs/ pow_of_2; 114 | 115 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 116 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 117 | overallGainFilter; 118 | factor= 10.^ (factorDb/ 20); 119 | 120 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 121 | x_fft= x_fft.* factor; 122 | 123 | y= ifft( x_fft, pow_of_2); 124 | 125 | data_filtered= y( 1: n)'; 126 | 127 | 128 | 129 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 130 | % this implements ITU P.56 method B. 131 | % 'speechfile' is the speech file to calculate active speech level for, 132 | % 'asl' is the active speech level (between 0 and 1), 133 | % 'asl_rms' is the active speech level mean square energy. 
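% (Added clarification:) the outputs actually returned below are
% [asl_ms, asl, c0]: asl_ms is the mean-square energy of the active speech
% (the quantity called 'asl_rms' above), asl is the activity factor in [0, 1]
% (long-term energy divided by active-level energy), and c0 is the active-level
% threshold found by the bisection in bin_interp.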
134 | 135 | % x is the column vector of floating point speech data 136 | 137 | x= x(:); % make sure x is column vector 138 | T= 0.03; % time constant of smoothing, in seconds 139 | H= 0.2; % hangover time in seconds 140 | M= 15.9; 141 | % margin in dB of the difference between threshold and active speech level 142 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 143 | 144 | I= ceil( fs* H); % hangover in samples 145 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 146 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 147 | % vector with thresholds from one quantizing level up to half the maximum 148 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 149 | a( 1: thres_no)= 0; % activity counter for each level threshold 150 | hang( 1: thres_no)= I; % hangover counter for each level threshold 151 | 152 | sq= x'* x; % long-term level square energy of x 153 | x_len= length( x); % length of x 154 | 155 | % use a 2nd order IIR filter to detect the envelope q 156 | x_abs= abs( x); 157 | p= filter( 1-g, [1 -g], x_abs); 158 | q= filter( 1-g, [1 -g], p); 159 | 160 | for k= 1: x_len 161 | for j= 1: thres_no 162 | if (q(k)>= c(j)) 163 | a(j)= a(j)+ 1; 164 | hang(j)= 0; 165 | elseif (hang(j)< I) 166 | a(j)= a(j)+ 1; 167 | hang(j)= hang(j)+ 1; 168 | else 169 | break; 170 | end 171 | end 172 | end 173 | 174 | asl= 0; 175 | asl_rms= 0; 176 | if (a(1)== 0) 177 | return; 178 | else 179 | AdB1= 10* log10( sq/ a(1)+ eps); 180 | end 181 | 182 | CdB1= 20* log10( c(1)+ eps); 183 | if (AdB1- CdB1< M) 184 | return; 185 | end 186 | 187 | AdB(1)= AdB1; 188 | CdB(1)= CdB1; 189 | Delta(1)= AdB1- CdB1; 190 | 191 | for j= 2: thres_no 192 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 193 | CdB(j)= 20* log10( c(j)+ eps); 194 | end 195 | 196 | for j= 2: thres_no 197 | if (a(j) ~= 0) 198 | Delta(j)= AdB(j)- CdB(j); 199 | if (Delta(j)<= M) 200 | % interpolate to find the asl 201 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 202 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 203 | asl_ms= 10^ (asl_ms_log/ 10); 204 | asl= (sq/ x_len)/ asl_ms; 205 | c0= 10^( cl0/ 20); 206 | break; 207 | end 208 | end 209 | end 210 | 211 | 212 | 213 | 214 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 215 | upthr, lwthr, Margin, tol) 216 | 217 | if (tol < 0) 218 | tol = -tol; 219 | end 220 | 221 | % Check if extreme counts are not already the true active value 222 | iterno = 1; 223 | if (abs(upcount - upthr - Margin) < tol) 224 | asl_ms_log= upcount; 225 | cc= upthr; 226 | return; 227 | end 228 | if (abs(lwcount - lwthr - Margin) < tol) 229 | asl_ms_log= lwcount; 230 | cc= lwthr; 231 | return; 232 | end 233 | 234 | % Initialize first middle for given (initial) bounds 235 | midcount = (upcount + lwcount) / 2.0; 236 | midthr = (upthr + lwthr) / 2.0; 237 | 238 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 239 | while ( 1) 240 | 241 | diff= midcount- midthr- Margin; 242 | if (abs(diff)<= tol) 243 | break; 244 | end 245 | 246 | % if tolerance is not met up to 20 iteractions, then relax the 247 | % tolerance by 10% 248 | 249 | iterno= iterno+ 1; 250 | 251 | if (iterno>20) 252 | tol = tol* 1.1; 253 | end 254 | 255 | if (diff> tol) % then new bounds are ... 256 | midcount = (upcount + midcount) / 2.0; 257 | % upper and middle activities 258 | midthr = (upthr + midthr) / 2.0; 259 | % ... and thresholds 260 | elseif (diff< -tol) % then new bounds are ... 
261 | midcount = (midcount + lwcount) / 2.0; 262 | % middle and lower activities 263 | midthr = (midthr + lwthr) / 2.0; 264 | % ... and thresholds 265 | end 266 | 267 | end 268 | % Since the tolerance has been satisfied, midcount is selected 269 | % as the interpolated value with a tol [dB] tolerance. 270 | 271 | asl_ms_log= midcount; 272 | cc= midthr; 273 | 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /comp_cep.m: -------------------------------------------------------------------------------- 1 | function cep_mean= comp_cep(cleanFile, enhdFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Cepstrum Distance Objective Speech Quality Measure 5 | % 6 | % This function implements the cepstrum distance measure used 7 | % in [1] 8 | % 9 | % Usage: CEP=comp_cep(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % CEP - computed cepstrum distance measure 14 | % 15 | % Note that the cepstrum measure is limited in the range [0, 10]. 16 | % 17 | % Example call: CEP =comp_cep('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality 23 | % evaluation for low bit-rate speech coding systems. IEEE J. Select. 24 | % Areas in Comm., 6(2), 262-273. 25 | % 26 | % Author: Philipos C. Loizou 27 | % (LPC routines were written by Bryan Pellom & John Hansen) 28 | % 29 | % Copyright (c) 2006 by Philipos C. Loizou 30 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 31 | 32 | % ---------------------------------------------------------------------- 33 | if nargin~=2 34 | fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n'); 35 | fprintf('For more help, type: help comp_cep\n\n'); 36 | return; 37 | end 38 | 39 | alpha=0.95; 40 | 41 | [data1, Srate1, Nbits1]= wavread(cleanFile); 42 | [data2, Srate2, Nbits2]= wavread(enhdFile); 43 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 44 | error( 'The two files do not match!\n'); 45 | end 46 | 47 | len= min( length( data1), length( data2)); 48 | data1= data1( 1: len)+eps; 49 | data2= data2( 1: len)+eps; 50 | 51 | IS_dist= cepstrum( data1, data2,Srate1); 52 | 53 | IS_len= round( length( IS_dist)* alpha); 54 | IS= sort( IS_dist); 55 | 56 | cep_mean= mean( IS( 1: IS_len)); 57 | 58 | 59 | 60 | 61 | function distortion = cepstrum(clean_speech, processed_speech,sample_rate) 62 | 63 | 64 | % ---------------------------------------------------------------------- 65 | % Check the length of the clean and processed speech. Must be the same. 66 | % ---------------------------------------------------------------------- 67 | 68 | clean_length = length(clean_speech); 69 | processed_length = length(processed_speech); 70 | 71 | if (clean_length ~= processed_length) 72 | disp('Error: Both Speech Files must be same length.'); 73 | return 74 | end 75 | 76 | % ---------------------------------------------------------------------- 77 | % Scale both clean speech and processed speech to have same dynamic 78 | % range. 
Also remove DC component from each signal 79 | % ---------------------------------------------------------------------- 80 | 81 | %clean_speech = clean_speech - mean(clean_speech); 82 | %processed_speech = processed_speech - mean(processed_speech); 83 | 84 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 85 | 86 | % ---------------------------------------------------------------------- 87 | % Global Variables 88 | % ---------------------------------------------------------------------- 89 | 90 | winlength = round(30*sample_rate/1000); %240; % window length in samples 91 | skiprate = floor(winlength/4); % window skip in samples 92 | if sample_rate<10000 93 | P = 10; % LPC Analysis Order 94 | else 95 | P=16; % this could vary depending on sampling frequency. 96 | end 97 | C=10*sqrt(2)/log(10); 98 | % ---------------------------------------------------------------------- 99 | % For each frame of input speech, calculate the Itakura-Saito Measure 100 | % ---------------------------------------------------------------------- 101 | 102 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 103 | start = 1; % starting sample 104 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 105 | 106 | for frame_count = 1:num_frames 107 | 108 | % ---------------------------------------------------------- 109 | % (1) Get the Frames for the test and reference speech. 110 | % Multiply by Hanning Window. 111 | % ---------------------------------------------------------- 112 | 113 | clean_frame = clean_speech(start:start+winlength-1); 114 | processed_frame = processed_speech(start:start+winlength-1); 115 | clean_frame = clean_frame.*window; 116 | processed_frame = processed_frame.*window; 117 | 118 | % ---------------------------------------------------------- 119 | % (2) Get the autocorrelation lags and LPC parameters used 120 | % to compute the IS measure. 121 | % ---------------------------------------------------------- 122 | 123 | [R_clean, Ref_clean, A_clean] = ... 124 | lpcoeff(clean_frame, P); 125 | [R_processed, Ref_processed, A_processed] = ... 126 | lpcoeff(processed_frame, P); 127 | 128 | C_clean=lpc2cep(A_clean); 129 | C_processed=lpc2cep(A_processed); 130 | 131 | % ---------------------------------------------------------- 132 | % (3) Compute the cepstrum-distance measure 133 | % ---------------------------------------------------------- 134 | 135 | 136 | distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2)); 137 | 138 | 139 | start = start + skiprate; 140 | 141 | end 142 | 143 | 144 | 145 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 146 | 147 | % ---------------------------------------------------------- 148 | % (1) Compute Autocorrelation Lags 149 | % ---------------------------------------------------------- 150 | 151 | winlength = max(size(speech_frame)); 152 | for k=1:model_order+1 153 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
154 | .*speech_frame(k:winlength)); 155 | end 156 | 157 | % ---------------------------------------------------------- 158 | % (2) Levinson-Durbin 159 | % ---------------------------------------------------------- 160 | 161 | a = ones(1,model_order); 162 | E(1)=R(1); 163 | for i=1:model_order 164 | a_past(1:i-1) = a(1:i-1); 165 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 166 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 167 | a(i)=rcoeff(i); 168 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 169 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 170 | end 171 | 172 | acorr = R; 173 | refcoeff = rcoeff; 174 | lpparams = [1 -a]; 175 | 176 | %---------------------------------------------- 177 | function [cep]=lpc2cep(a) 178 | % 179 | % converts prediction to cepstrum coefficients 180 | % 181 | % Author: Philipos C. Loizou 182 | 183 | M=length(a); 184 | cep=zeros(1,M-1); 185 | 186 | cep(1)=-a(2); 187 | 188 | for k=2:M-1 189 | ix=1:k-1; 190 | vec1=cep(ix).*a(k-1+1:-1:2).*ix; 191 | cep(k)=-(a(k+1)+sum(vec1)/k); 192 | 193 | end 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /comp_fwseg.m: -------------------------------------------------------------------------------- 1 | function fwseg_dist= comp_fwseg(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency weighted SNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-weighted SNRseg measure [1] 7 | % using a different weighting function, the clean spectrum. 8 | % 9 | % Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % fwSNRseg - computed frequency weighted SNRseg in dB 14 | % 15 | % Note that large numbers of fwSNRseg are better. 16 | % 17 | % Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978). 22 | % A study of complexity and quality of speech waveform coders. Proc. 23 | % IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590. 24 | % 25 | % Author: Philipos C. Loizou 26 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 27 | % 28 | % Copyright (c) 2006 by Philipos C. Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_vec= fwseg( data1, data2,Srate1); 50 | 51 | fwseg_dist=mean(wss_dist_vec); 52 | 53 | 54 | % ---------------------------------------------------------------------- 55 | 56 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 57 | 58 | 59 | % ---------------------------------------------------------------------- 60 | % Check the length of the clean and processed speech. Must be the same. 
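% (Added note:) each frame below is scored with the frequency-weighted
% segmental SNR
%   fwSNRseg(m) = sum_j W_j(m) * 10*log10( E_c(j,m)^2 / (E_c(j,m)-E_p(j,m))^2 ) / sum_j W_j(m),
% where E_c and E_p are the critical-band energies of the clean and processed
% spectra and, in this variant, the weights are W_j(m) = E_c(j,m)^gamma with
% gamma = 0.2; the per-frame result is then clipped to [-10, 35] dB before the
% frames are averaged.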
61 | % ---------------------------------------------------------------------- 62 | 63 | clean_length = length(clean_speech); 64 | processed_length = length(processed_speech); 65 | 66 | if (clean_length ~= processed_length) 67 | disp('Error: Files must have same length.'); 68 | return 69 | end 70 | 71 | 72 | 73 | % ---------------------------------------------------------------------- 74 | % Global Variables 75 | % ---------------------------------------------------------------------- 76 | 77 | 78 | winlength = round(30*sample_rate/1000); % window length in samples 79 | skiprate = floor(winlength/4); % window skip in samples 80 | max_freq = sample_rate/2; % maximum bandwidth 81 | num_crit = 25; % number of critical bands 82 | USE_25=1; 83 | n_fft = 2^nextpow2(2*winlength); 84 | n_fftby2 = n_fft/2; % FFT size/2 85 | gamma=0.2; % power exponent 86 | 87 | % ---------------------------------------------------------------------- 88 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 89 | % ---------------------------------------------------------------------- 90 | 91 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 92 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 93 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 94 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 95 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 96 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 97 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 98 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 99 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 100 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 101 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 102 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 103 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 104 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 105 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 106 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 107 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 108 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 109 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 110 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 111 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 112 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 113 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 114 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 115 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 116 | 117 | W=[ % articulation index weights 118 | 0.003 119 | 0.003 120 | 0.003 121 | 0.007 122 | 0.010 123 | 0.016 124 | 0.016 125 | 0.017 126 | 0.017 127 | 0.022 128 | 0.027 129 | 0.028 130 | 0.030 131 | 0.032 132 | 0.034 133 | 0.035 134 | 0.037 135 | 0.036 136 | 0.036 137 | 0.033 138 | 0.030 139 | 0.029 140 | 0.027 141 | 0.026 142 | 0.026]; 143 | 144 | W=W'; 145 | 146 | if USE_25==0 % use 13 bands 147 | % ----- lump adjacent filters together ---------------- 148 | k=2; 149 | cent_freq2(1)=cent_freq(1); 150 | bandwidth2(1)=bandwidth(1)+bandwidth(2); 151 | W2(1)=W(1); 152 | for i=2:13 153 | cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1); 154 | bandwidth2(i)=bandwidth(k)+bandwidth(k+1); 155 | W2(i)=0.5*(W(k)+W(k+1)); 156 | k=k+2; 157 | end 158 | 159 | sumW=sum(W2); 160 | bw_min = bandwidth2 (1); % minimum critical bandwidth 161 | else 162 | sumW=sum(W); 163 | bw_min=bandwidth(1); 164 | end 165 | 166 | 167 | % ---------------------------------------------------------------------- 168 | % Set up the critical band filters. 
Note here that Gaussianly shaped 169 | % filters are used. Also, the sum of the filter weights are equivalent 170 | % for each critical band filter. Filter less than -30 dB and set to 171 | % zero. 172 | % ---------------------------------------------------------------------- 173 | 174 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 175 | if USE_25==0 176 | 177 | num_crit=length(cent_freq2); 178 | 179 | for i = 1:num_crit 180 | f0 = (cent_freq2 (i) / max_freq) * (n_fftby2); 181 | all_f0(i) = floor(f0); 182 | bw = (bandwidth2 (i) / max_freq) * (n_fftby2); 183 | norm_factor = log(bw_min) - log(bandwidth2(i)); 184 | j = 0:1:n_fftby2-1; 185 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 186 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 187 | end 188 | 189 | else 190 | for i = 1:num_crit 191 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 192 | all_f0(i) = floor(f0); 193 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 194 | norm_factor = log(bw_min) - log(bandwidth(i)); 195 | j = 0:1:n_fftby2-1; 196 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 197 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 198 | end 199 | end 200 | 201 | 202 | 203 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 204 | start = 1; % starting sample 205 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 206 | 207 | for frame_count = 1:num_frames 208 | 209 | % ---------------------------------------------------------- 210 | % (1) Get the Frames for the test and reference speech. 211 | % Multiply by Hanning Window. 212 | % ---------------------------------------------------------- 213 | 214 | clean_frame = clean_speech(start:start+winlength-1); 215 | processed_frame = processed_speech(start:start+winlength-1); 216 | clean_frame = clean_frame.*window; 217 | processed_frame = processed_frame.*window; 218 | 219 | % ---------------------------------------------------------- 220 | % (2) Compute the magnitude Spectrum of Clean and Processed 221 | % ---------------------------------------------------------- 222 | 223 | 224 | clean_spec = abs(fft(clean_frame,n_fft)); 225 | processed_spec = abs(fft(processed_frame,n_fft)); 226 | 227 | % normalize spectra to have area of one 228 | % 229 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 230 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 231 | 232 | % ---------------------------------------------------------- 233 | % (3) Compute Filterbank Output Energies 234 | % ---------------------------------------------------------- 235 | 236 | clean_energy=zeros(1,num_crit); 237 | processed_energy=zeros(1,num_crit); 238 | error_energy=zeros(1,num_crit); 239 | W_freq=zeros(1,num_crit); 240 | 241 | for i = 1:num_crit 242 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 243 | .*crit_filter(i,:)'); 244 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
245 | .*crit_filter(i,:)'); 246 | 247 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 248 | W_freq(i)=(clean_energy(i))^gamma; 249 | 250 | end 251 | SNRlog=10*log10((clean_energy.^2)./error_energy); 252 | 253 | 254 | 255 | fwSNR=sum(W_freq.*SNRlog)/sum(W_freq); 256 | 257 | distortion(frame_count)=min(max(fwSNR,-10),35); 258 | 259 | start = start + skiprate; 260 | 261 | end 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /comp_fwseg_mars.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_mars(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % MARS Frequency-variant fwSNRseg objective speech quality measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure based 7 | % on MARS analysis (see Chap. 10, Sec. 10.5.4) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_mars('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] Chapter 10, Sec 10.5.4, 24 | % [2] Chapter 11 25 | % 26 | % Authors: Yi Hu and Philipos C. Loizou 27 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 28 | % 29 | % Copyright (c) 2006 by Philipos C. Loizou 30 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 31 | % ---------------------------------------------------------------------- 32 | 33 | if nargin~=2 34 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_mars(cleanFile.wav, enhancedFile.wav)\n'); 35 | fprintf('For more help, type: help comp_fwseg_mars\n\n'); 36 | return; 37 | end 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_matrix= fwseg( data1, data2,Srate1); 50 | wss_dist=mean(wss_dist_matrix); 51 | 52 | 53 | SIG= sig_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 54 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 55 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 56 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 57 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 58 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 59 | wss_dist( 25)); 60 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 61 | 62 | BAK= bak_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 63 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 64 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 65 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 66 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 67 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 68 | wss_dist( 25)); 69 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 70 | 71 | OVL= ovl_mars( wss_dist( 1), wss_dist( 2), wss_dist( 3), wss_dist( 4), ... 
72 | wss_dist( 5), wss_dist( 6), wss_dist( 7), wss_dist( 8), ... 73 | wss_dist( 9), wss_dist( 10), wss_dist( 11), wss_dist( 12), ... 74 | wss_dist( 13), wss_dist( 14), wss_dist( 15), wss_dist( 16), ... 75 | wss_dist( 17), wss_dist( 18), wss_dist( 19), wss_dist( 20), ... 76 | wss_dist( 21), wss_dist( 22), wss_dist( 23), wss_dist( 24), ... 77 | wss_dist( 25)); 78 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 79 | 80 | 81 | %------------------------------------------------- 82 | function Y= bak_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 83 | V13, V14, V15, V16, V17, V18, V19, V20, ... 84 | V21, V22, V23, V24, V25, V26, V27, V28) 85 | 86 | BF1 = max(0, V21 - 0.282); 87 | BF2 = max(0, FWSEG_VA + 9.094); 88 | BF3 = max(0, - 9.094 - FWSEG_VA ); 89 | BF5 = max(0, 10.089 - V11 ); 90 | BF7 = max(0, 3.624 - V26 ) * BF3; 91 | BF8 = max(0, V24 - 5.584) * BF5; 92 | BF9 = max(0, 5.584 - V24 ) * BF5; 93 | BF10 = max(0, V19 - 8.030) * BF1; 94 | BF11 = max(0, 8.030 - V19 ) * BF1; 95 | BF12 = max(0, V27 - 4.858) * BF1; 96 | BF13 = max(0, 4.858 - V27 ) * BF1; 97 | BF14 = max(0, FWSEG_VA + 7.282) * BF1; 98 | BF15 = max(0, - 7.282 - FWSEG_VA ) * BF1; 99 | BF17 = max(0, 9.458 - V16 ) * BF10; 100 | BF18 = max(0, V27 - 10.431) * BF11; 101 | BF19 = max(0, 10.431 - V27 ) * BF11; 102 | BF21 = max(0, 11.059 - V22 ) * BF1; 103 | BF22 = max(0, V26 - 8.675) * BF1; 104 | BF23 = max(0, 8.675 - V26 ) * BF1; 105 | BF25 = max(0, 11.195 - V6 ) * BF10; 106 | BF26 = max(0, V8 - 7.138) * BF1; 107 | BF27 = max(0, 7.138 - V8 ) * BF1; 108 | BF29 = max(0, 9.006 - V10 ) * BF26; 109 | BF30 = max(0, V14 - 8.210) * BF15; 110 | BF35 = max(0, 7.026 - V19 ) * BF15; 111 | BF36 = max(0, V11 - 3.424) * BF27; 112 | BF39 = max(0, 5.418 - V17 ) * BF23; 113 | BF40 = max(0, V28 - 6.813); 114 | BF41 = max(0, 6.813 - V28 ); 115 | BF42 = max(0, V26 - 5.998) * BF14; 116 | BF43 = max(0, 5.998 - V26 ) * BF14; 117 | BF44 = max(0, V5 + 0.206) * BF41; 118 | BF45 = max(0, - 0.206 - V5 ) * BF41; 119 | BF46 = max(0, V22 - 7.901) * BF45; 120 | BF49 = max(0, 7.496 - V8 ) * BF44; 121 | BF51 = max(0, 7.904 - V11 ) * BF45; 122 | BF52 = max(0, V26 - 10.938) * BF27; 123 | BF54 = max(0, V9 - 4.507) * BF26; 124 | BF56 = max(0, V28 - 0.549) * BF15; 125 | BF57 = max(0, 0.549 - V28 ) * BF15; 126 | BF58 = max(0, V25 - 3.252) * BF41; 127 | BF59 = max(0, 3.252 - V25 ) * BF41; 128 | BF60 = max(0, V23 - 7.650) * BF58; 129 | BF61 = max(0, 7.650 - V23 ) * BF58; 130 | BF62 = max(0, V25 - 9.931) * BF44; 131 | BF63 = max(0, 9.931 - V25 ) * BF44; 132 | BF64 = max(0, V25 - 4.923) * BF21; 133 | BF65 = max(0, 4.923 - V25 ) * BF21; 134 | BF67 = max(0, 3.746 - V28 ) * BF10; 135 | BF68 = max(0, V11 - 5.346) * BF41; 136 | BF69 = max(0, 5.346 - V11 ) * BF41; 137 | BF70 = max(0, V12 - 9.026) * BF68; 138 | BF71 = max(0, 9.026 - V12 ) * BF68; 139 | BF73 = max(0, - 2.668 - V28 ) * BF21; 140 | BF74 = max(0, V24 - 7.028) * BF41; 141 | BF75 = max(0, 7.028 - V24 ) * BF41; 142 | BF77 = max(0, - 0.224 - V6 ) * BF74; 143 | BF78 = max(0, V5 - 3.884); 144 | BF79 = max(0, 3.884 - V5 ); 145 | BF80 = max(0, V15 - 5.019) * BF78; 146 | BF83 = max(0, - 1.880 - V28 ) * BF13; 147 | BF84 = max(0, V7 - 3.067) * BF12; 148 | BF85 = max(0, 3.067 - V7 ) * BF12; 149 | BF87 = max(0, 5.353 - V6 ); 150 | BF88 = max(0, V13 - 3.405) * BF9; 151 | BF89 = max(0, 3.405 - V13 ) * BF9; 152 | BF91 = max(0, 5.599 - V13 ) * BF45; 153 | BF92 = max(0, V15 - 9.821) * BF8; 154 | BF94 = max(0, V14 + 2.594) * BF79; 155 | BF97 = max(0, 8.635 - V23 ) * BF94; 156 | BF99 = max(0, 1.332 - V24 ) * BF45; 157 | BF100 = 
max(0, V7 - 0.209) * BF1; 158 | 159 | Y = 2.751 + 0.135 * BF1 - 0.037 * BF2 + 0.328 * BF3 - 0.098 * BF5 ... 160 | + 0.988 * BF7 + 0.014 * BF8 - 0.034 * BF11 - 0.011 * BF12 ... 161 | - 0.013 * BF13 - 0.002 * BF17 + 0.014 * BF18 ... 162 | + 0.004 * BF19 - 0.007 * BF21 - 0.017 * BF22 ... 163 | - .895791E-03 * BF25 + 0.011 * BF26 - 0.009 * BF27 ... 164 | - 0.007 * BF29 + 0.052 * BF30 + 0.022 * BF35 ... 165 | - 0.002 * BF36 - 0.005 * BF39 - 0.059 * BF40 ... 166 | - 0.050 * BF41 + 0.001 * BF42 + .743730E-03 * BF43 ... 167 | + 0.011 * BF44 + 0.022 * BF45 + 0.009 * BF46 ... 168 | + 0.004 * BF49 - 0.005 * BF51 + 0.010 * BF52 ... 169 | - 0.001 * BF54 - 0.005 * BF56 - 0.015 * BF57 ... 170 | - 0.032 * BF59 + 0.009 * BF60 - 0.002 * BF61 ... 171 | - 0.009 * BF62 - 0.001 * BF63 + .819374E-03 * BF64 ... 172 | + 0.002 * BF65 + 0.003 * BF67 + 0.024 * BF69 ... 173 | - 0.011 * BF70 - 0.004 * BF71 + 0.013 * BF73 ... 174 | - 0.026 * BF74 + 0.005 * BF75 + 0.253 * BF77 ... 175 | - 0.065 * BF78 + 0.014 * BF80 - 0.010 * BF83 ... 176 | + 0.001 * BF84 + 0.018 * BF85 - 0.050 * BF87 ... 177 | - 0.002 * BF88 - 0.020 * BF89 + 0.003 * BF91 ... 178 | - 0.043 * BF92 + .707581E-03 * BF97 - 0.015 * BF99 ... 179 | - 0.005 * BF100; 180 | 181 | 182 | function Y= sig_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 183 | V13, V14, V15, V16, V17, V18, V19, V20, ... 184 | V21, V22, V23, V24, V25, V26, V27, V28) 185 | 186 | BF1 = max(0, V7 - 9.535); 187 | BF2 = max(0, 9.535 - V7 ); 188 | BF3 = max(0, V27 - 1.578); 189 | BF5 = max(0, V6 - 5.422); 190 | BF6 = max(0, 5.422 - V6 ); 191 | BF8 = max(0, 11.333 - V19 ); 192 | BF10 = max(0, - 6.774 - FWSEG_VA ); 193 | BF11 = max(0, V10 - 6.255) * BF8; 194 | BF12 = max(0, 6.255 - V10 ) * BF8; 195 | BF13 = max(0, V24 - 3.894); 196 | BF15 = max(0, V5 - 3.884); 197 | BF16 = max(0, 3.884 - V5 ); 198 | BF17 = max(0, V28 - 7.918); 199 | BF18 = max(0, 7.918 - V28 ); 200 | BF19 = max(0, V13 - 6.077) * BF18; 201 | BF20 = max(0, 6.077 - V13 ) * BF18; 202 | BF22 = max(0, 6.614 - V20 ) * BF10; 203 | BF23 = max(0, FWSEG_VA + 0.936) * BF8; 204 | BF25 = max(0, V23 - 5.039); 205 | BF26 = max(0, 5.039 - V23 ); 206 | BF28 = max(0, 9.007 - V20 ) * BF25; 207 | BF29 = max(0, V25 - 7.582); 208 | BF30 = max(0, 7.582 - V25 ); 209 | BF31 = max(0, V11 + 3.336) * BF16; 210 | BF32 = max(0, V26 - 1.877); 211 | BF35 = max(0, - 5.749 - FWSEG_VA ) * BF6; 212 | BF36 = max(0, V7 - 4.451) * BF29; 213 | BF37 = max(0, 4.451 - V7 ) * BF29; 214 | BF38 = max(0, V14 - 10.158); 215 | BF39 = max(0, 10.158 - V14 ); 216 | BF41 = max(0, 7.172 - V17 ) * BF39; 217 | BF43 = max(0, 7.810 - V24 ) * BF26; 218 | BF44 = max(0, V8 + 1.636) * BF3; 219 | BF45 = max(0, FWSEG_VA - 10.068) * BF39; 220 | BF47 = max(0, V23 - 4.721) * BF30; 221 | BF48 = max(0, 4.721 - V23 ) * BF30; 222 | BF50 = max(0, - 2.397 - V24 ) * BF16; 223 | BF51 = max(0, V14 - 1.428) * BF17; 224 | BF53 = max(0, V16 + 1.940) * BF18; 225 | BF54 = max(0, V10 - 9.442) * BF18; 226 | BF56 = max(0, V10 + 2.144) * BF16; 227 | BF58 = max(0, 1.969 - V26 ) * BF2; 228 | BF59 = max(0, V19 - 6.089) * BF16; 229 | BF62 = max(0, 8.952 - V21 ) * BF15; 230 | BF63 = max(0, V24 - 7.371) * BF3; 231 | BF65 = max(0, V22 - 8.908) * BF6; 232 | BF66 = max(0, 8.908 - V22 ) * BF6; 233 | BF67 = max(0, V27 - 9.485) * BF30; 234 | BF69 = max(0, V18 - 8.608) * BF10; 235 | BF71 = max(0, V13 - 3.374) * BF25; 236 | BF73 = max(0, V14 - 3.616) * BF13; 237 | BF75 = max(0, V18 - 10.321) * BF32; 238 | BF76 = max(0, 10.321 - V18 ) * BF32; 239 | BF78 = max(0, 3.972 - V15 ) * BF26; 240 | BF79 = max(0, V14 - 7.105) * 
BF26; 241 | BF80 = max(0, 7.105 - V14 ) * BF26; 242 | 243 | Y = 2.638 - 0.089 * BF1 + 0.083 * BF5 - 0.162 * BF6 - 0.037 * BF8 ... 244 | - 0.241 * BF10 + 0.018 * BF11 - 0.008 * BF12 ... 245 | + 0.059 * BF13 - 0.144 * BF17 - 0.116 * BF18 ... 246 | + 0.010 * BF19 - 0.012 * BF20 + 0.085 * BF22 ... 247 | + 0.011 * BF23 + 0.049 * BF25 - 0.159 * BF26 ... 248 | - 0.016 * BF28 - 0.138 * BF29 + 0.010 * BF31 ... 249 | + 0.016 * BF35 + 0.018 * BF36 + 0.246 * BF37 ... 250 | - 0.417 * BF38 + 0.052 * BF39 - 0.005 * BF41 ... 251 | + 0.021 * BF43 + 0.006 * BF44 - 0.047 * BF45 ... 252 | - 0.051 * BF47 - 0.014 * BF48 - 0.113 * BF50 ... 253 | + 0.019 * BF51 + 0.007 * BF53 + 0.017 * BF54 ... 254 | - 0.007 * BF56 - 0.098 * BF58 + 0.011 * BF59 ... 255 | - 0.016 * BF62 - 0.012 * BF63 + 0.113 * BF65 ... 256 | + 0.016 * BF66 + 0.040 * BF67 - 0.065 * BF69 ... 257 | - 0.018 * BF71 + 0.014 * BF73 - 0.009 * BF75 ... 258 | - 0.008 * BF76 - 0.032 * BF78 + 0.032 * BF79 ... 259 | + 0.011 * BF80; 260 | 261 | 262 | function Y= ovl_mars( FWSEG_VA, V5, V6, V7, V8, V9, V10, V11, V12, ... 263 | V13, V14, V15, V16, V17, V18, V19, V20, ... 264 | V21, V22, V23, V24, V25, V26, V27, V28) 265 | 266 | BF1 = max(0, V21 - 4.671); 267 | BF3 = max(0, V6 - 5.396); 268 | BF4 = max(0, 5.396 - V6 ); 269 | BF7 = max(0, V11 - 7.884); 270 | BF8 = max(0, 7.884 - V11 ); 271 | BF9 = max(0, FWSEG_VA + 7.229) * BF1; 272 | BF10 = max(0, - 7.229 - FWSEG_VA ) * BF1; 273 | BF11 = max(0, V19 - 8.128) * BF1; 274 | BF12 = max(0, 8.128 - V19 ) * BF1; 275 | BF13 = max(0, V28 - 7.918); 276 | BF14 = max(0, 7.918 - V28 ); 277 | BF15 = max(0, V5 + 2.888) * BF14; 278 | BF16 = max(0, - 2.888 - V5 ) * BF14; 279 | BF17 = max(0, V24 - 2.924) * BF8; 280 | BF18 = max(0, 2.924 - V24 ) * BF8; 281 | BF20 = max(0, 9.071 - V16 ) * BF15; 282 | BF21 = max(0, V10 - 6.286) * BF14; 283 | BF22 = max(0, 6.286 - V10 ) * BF14; 284 | BF24 = max(0, V23 - 5.173); 285 | BF25 = max(0, 5.173 - V23 ); 286 | BF26 = max(0, V26 - 8.987); 287 | BF29 = max(0, 12.216 - V27 ) * BF3; 288 | BF30 = max(0, V8 - 4.306) * BF16; 289 | BF34 = max(0, V23 - 7.630) * BF21; 290 | BF35 = max(0, 7.630 - V23 ) * BF21; 291 | BF37 = max(0, 3.638 - V7 ) * BF1; 292 | BF39 = max(0, 8.337 - V21 ) * BF17; 293 | BF41 = max(0, 1.590 - V5 ) * BF11; 294 | BF43 = max(0, 13.993 - V8 ) * BF11; 295 | BF44 = max(0, V14 - 5.993) * BF25; 296 | BF45 = max(0, 5.993 - V14 ) * BF25; 297 | BF46 = max(0, V24 - 1.035); 298 | BF47 = max(0, 1.035 - V24 ); 299 | BF49 = max(0, 8.915 - V23 ) * BF12; 300 | BF51 = max(0, - 0.004 - FWSEG_VA ); 301 | BF52 = max(0, V27 - 6.520) * BF24; 302 | BF53 = max(0, 6.520 - V27 ) * BF24; 303 | BF54 = max(0, V7 - 11.484) * BF8; 304 | BF55 = max(0, 11.484 - V7 ) * BF8; 305 | BF57 = max(0, 5.742 - V17 ) * BF25; 306 | BF58 = max(0, V12 - 6.949) * BF12; 307 | BF59 = max(0, 6.949 - V12 ) * BF12; 308 | BF60 = max(0, V25 - 9.203) * BF45; 309 | BF63 = max(0, 1.887 - V13 ) * BF7; 310 | BF65 = max(0, 9.498 - V26 ) * BF15; 311 | BF66 = max(0, V5 - 6.566) * BF22; 312 | BF71 = max(0, 13.239 - V19 ) * BF46; 313 | BF72 = max(0, V19 - 9.925) * BF55; 314 | BF77 = max(0, 3.430 - V22 ) * BF18; 315 | BF78 = max(0, V27 - 6.513) * BF45; 316 | BF79 = max(0, 6.513 - V27 ) * BF45; 317 | BF81 = max(0, 12.511 - V18 ); 318 | BF82 = max(0, V11 - 6.777) * BF81; 319 | BF83 = max(0, 6.777 - V11 ) * BF81; 320 | BF85 = max(0, 3.433 - V5 ) * BF47; 321 | BF87 = max(0, - 3.524 - FWSEG_VA ) * BF47; 322 | BF88 = max(0, V27 - 11.604) * BF9; 323 | BF91 = max(0, 8.845 - V26 ) * BF52; 324 | BF92 = max(0, V14 - 5.931) * BF82; 325 | BF93 = max(0, 5.931 
- V14 ) * BF82; 326 | BF94 = max(0, V21 - 7.245) * BF25; 327 | BF95 = max(0, 7.245 - V21 ) * BF25; 328 | BF96 = max(0, V14 - 5.323) * BF7; 329 | BF98 = max(0, V10 - 6.248) * BF71; 330 | BF100 = max(0, V18 - 0.602) * BF95; 331 | 332 | Y = 2.936 + 0.047 * BF1 + 0.061 * BF3 - 0.084 * BF4 - 0.139 * BF8 ... 333 | - 0.064 * BF10 - 0.030 * BF12 - 0.103 * BF13 ... 334 | - 0.039 * BF14 + 0.020 * BF17 - 0.002 * BF20 ... 335 | - 0.005 * BF22 - 0.114 * BF25 - 0.090 * BF26 ... 336 | - 0.011 * BF29 + 0.010 * BF30 + 0.009 * BF34 ... 337 | + 0.002 * BF35 + 0.079 * BF37 - 0.006 * BF39 ... 338 | + 0.007 * BF41 - 0.003 * BF43 + 0.017 * BF44 ... 339 | + 0.076 * BF47 + 0.009 * BF49 + 0.016 * BF51 ... 340 | - 0.042 * BF53 - 0.079 * BF54 - 0.030 * BF57 ... 341 | - 0.018 * BF58 - 0.009 * BF59 - 0.119 * BF60 ... 342 | - 0.210 * BF63 - .456802E-03 * BF65 + 0.028 * BF66 ... 343 | + 0.020 * BF72 + 0.011 * BF77 + 0.005 * BF78 ... 344 | + 0.003 * BF79 - 0.049 * BF81 + 0.012 * BF83 ... 345 | - 0.030 * BF85 + 0.070 * BF87 + 0.008 * BF88 ... 346 | - 0.008 * BF91 + 0.010 * BF92 + 0.003 * BF93 ... 347 | + 0.022 * BF94 - 0.038 * BF96 + .933766E-03 * BF98 ... 348 | + 0.002 * BF100; 349 | 350 | 351 | 352 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 353 | 354 | 355 | % ---------------------------------------------------------------------- 356 | % Check the length of the clean and processed speech. Must be the same. 357 | % ---------------------------------------------------------------------- 358 | 359 | clean_length = length(clean_speech); 360 | processed_length = length(processed_speech); 361 | 362 | if (clean_length ~= processed_length) 363 | disp('Error: Files must have same length.'); 364 | return 365 | end 366 | 367 | 368 | 369 | % ---------------------------------------------------------------------- 370 | % Global Variables 371 | % ---------------------------------------------------------------------- 372 | 373 | 374 | winlength = round(30*sample_rate/1000); % window length in samples 375 | skiprate = floor(winlength/4); % window skip in samples 376 | max_freq = sample_rate/2; % maximum bandwidth 377 | num_crit = 25; % number of critical bands 378 | 379 | n_fft = 2^nextpow2(2*winlength); 380 | n_fftby2 = n_fft/2; % FFT size/2 381 | 382 | % ---------------------------------------------------------------------- 383 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 384 | % ---------------------------------------------------------------------- 385 | 386 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 387 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 388 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 389 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 390 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 391 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 392 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 393 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 394 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 395 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 396 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 397 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 398 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 399 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 400 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 401 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 402 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 403 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 404 | cent_freq(19) = 1993.93; bandwidth(19) = 
217.153; 405 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 406 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 407 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 408 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 409 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 410 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 411 | 412 | 413 | bw_min = bandwidth (1); % minimum critical bandwidth 414 | 415 | 416 | % ---------------------------------------------------------------------- 417 | % Set up the critical band filters. Note here that Gaussianly shaped 418 | % filters are used. Also, the sum of the filter weights are equivalent 419 | % for each critical band filter. Filter less than -30 dB and set to 420 | % zero. 421 | % ---------------------------------------------------------------------- 422 | 423 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 424 | 425 | for i = 1:num_crit 426 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 427 | all_f0(i) = floor(f0); 428 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 429 | norm_factor = log(bw_min) - log(bandwidth(i)); 430 | j = 0:1:n_fftby2-1; 431 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 432 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 433 | end 434 | 435 | % ---------------------------------------------------------------------- 436 | % For each frame of input speech, calculate the Weighted Spectral 437 | % Slope Measure 438 | % ---------------------------------------------------------------------- 439 | 440 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 441 | start = 1; % starting sample 442 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 443 | 444 | distortion=zeros(num_frames,num_crit); 445 | for frame_count = 1:num_frames 446 | 447 | % ---------------------------------------------------------- 448 | % (1) Get the Frames for the test and reference speech. 449 | % Multiply by Hanning Window. 450 | % ---------------------------------------------------------- 451 | 452 | clean_frame = clean_speech(start:start+winlength-1); 453 | processed_frame = processed_speech(start:start+winlength-1); 454 | clean_frame = clean_frame.*window; 455 | processed_frame = processed_frame.*window; 456 | 457 | % ---------------------------------------------------------- 458 | % (2) Compute the magnitude Spectrum of Clean and Processed 459 | % ---------------------------------------------------------- 460 | 461 | 462 | clean_spec = abs(fft(clean_frame,n_fft)); 463 | processed_spec = abs(fft(processed_frame,n_fft)); 464 | 465 | % normalize so that spectra have unit area ---- 466 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 467 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 468 | 469 | % ---------------------------------------------------------- 470 | % (3) Compute Filterbank Output Energies 471 | % ---------------------------------------------------------- 472 | 473 | clean_energy=zeros(1,num_crit); 474 | processed_energy=zeros(1,num_crit); 475 | error_energy=zeros(1,num_crit); 476 | 477 | for i = 1:num_crit 478 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 479 | .*crit_filter(i,:)'); 480 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
481 | .*crit_filter(i,:)'); 482 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 483 | end 484 | 485 | 486 | SNRlog=10*log10((clean_energy.^2)./error_energy); 487 | 488 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 489 | 490 | start = start + skiprate; 491 | 492 | end 493 | 494 | -------------------------------------------------------------------------------- /comp_fwseg_variant.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % (see also Chap. 10, Eq. 10.24) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % Author: Philipos C. Loizou 29 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 30 | % 31 | % Copyright (c) 2006 by Philipos C. Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | if nargin~=2 36 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n'); 37 | fprintf('For more help, type: help comp_fwseg_variant\n\n'); 38 | return; 39 | end 40 | 41 | 42 | [data1, Srate1, Nbits1]= wavread(cleanFile); 43 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 44 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 45 | error( 'The two files do not match!\n'); 46 | end 47 | 48 | len= min( length( data1), length( data2)); 49 | data1= data1( 1: len)+eps; 50 | data2= data2( 1: len)+eps; 51 | 52 | wss_dist_matrix= fwseg( data1, data2,Srate1); 53 | wss_dist=mean(wss_dist_matrix); 54 | 55 | % initialize coefficients obtained from multiple linear 56 | % regression analysis 57 | % 58 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 59 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 60 | -0.002,0.017,-0.03,0.073,0.043]; 61 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 62 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 63 | 0.011,-0.002,-0.021,0.043,0.031]; 64 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 65 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 
66 | -0.028,0.019,0.005]; 67 | 68 | SIG=0.567+sum(b_sig.*wss_dist); 69 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 70 | 71 | BAK=1.013+sum(b_bak.*wss_dist); 72 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 73 | 74 | OVL=0.446+sum(b_ovl.*wss_dist); 75 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 76 | 77 | 78 | % ---------------------------------------------------------------------- 79 | 80 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 81 | 82 | 83 | % ---------------------------------------------------------------------- 84 | % Check the length of the clean and processed speech. Must be the same. 85 | % ---------------------------------------------------------------------- 86 | 87 | clean_length = length(clean_speech); 88 | processed_length = length(processed_speech); 89 | 90 | if (clean_length ~= processed_length) 91 | disp('Error: Files must have same length.'); 92 | return 93 | end 94 | 95 | 96 | 97 | % ---------------------------------------------------------------------- 98 | % Global Variables 99 | % ---------------------------------------------------------------------- 100 | 101 | 102 | winlength = round(30*sample_rate/1000); % window length in samples 103 | skiprate = floor(winlength/4); % window skip in samples 104 | max_freq = sample_rate/2; % maximum bandwidth 105 | num_crit = 25; % number of critical bands 106 | 107 | n_fft = 2^nextpow2(2*winlength); 108 | n_fftby2 = n_fft/2; % FFT size/2 109 | 110 | % ---------------------------------------------------------------------- 111 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 112 | % ---------------------------------------------------------------------- 113 | 114 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 115 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 116 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 117 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 118 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 119 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 120 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 121 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 122 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 123 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 124 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 125 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 126 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 127 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 128 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 129 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 130 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 131 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 132 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 133 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 134 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 135 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 136 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 137 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 138 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 139 | 140 | 141 | bw_min = bandwidth (1); % minimum critical bandwidth 142 | 143 | 144 | % ---------------------------------------------------------------------- 145 | % Set up the critical band filters. Note here that Gaussianly shaped 146 | % filters are used. Also, the sum of the filter weights are equivalent 147 | % for each critical band filter. Filter less than -30 dB and set to 148 | % zero. 
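% (Added note:) each critical-band filter constructed below is Gaussian over
% the FFT bins j = 0..n_fftby2-1:
%   H_i(j) = exp( -11*((j - floor(f0_i))/bw_i)^2 + log(bw_min/bw_i) ),
% where f0_i and bw_i are the centre frequency and bandwidth expressed in bins;
% the log(bw_min/bw_i) term equalizes the filter areas, and responses below the
% -30 dB point (min_factor) are set to zero.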
149 | % ---------------------------------------------------------------------- 150 | 151 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 152 | 153 | for i = 1:num_crit 154 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 155 | all_f0(i) = floor(f0); 156 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 157 | norm_factor = log(bw_min) - log(bandwidth(i)); 158 | j = 0:1:n_fftby2-1; 159 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 160 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 161 | end 162 | 163 | % ---------------------------------------------------------------------- 164 | % For each frame of input speech, calculate the Weighted Spectral 165 | % Slope Measure 166 | % ---------------------------------------------------------------------- 167 | 168 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 169 | start = 1; % starting sample 170 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 171 | 172 | distortion=zeros(num_frames,num_crit); 173 | for frame_count = 1:num_frames 174 | 175 | % ---------------------------------------------------------- 176 | % (1) Get the Frames for the test and reference speech. 177 | % Multiply by Hanning Window. 178 | % ---------------------------------------------------------- 179 | 180 | clean_frame = clean_speech(start:start+winlength-1); 181 | processed_frame = processed_speech(start:start+winlength-1); 182 | clean_frame = clean_frame.*window; 183 | processed_frame = processed_frame.*window; 184 | 185 | % ---------------------------------------------------------- 186 | % (2) Compute the magnitude Spectrum of Clean and Processed 187 | % ---------------------------------------------------------- 188 | 189 | 190 | clean_spec = abs(fft(clean_frame,n_fft)); 191 | processed_spec = abs(fft(processed_frame,n_fft)); 192 | 193 | % normalize so that spectra have unit area ---- 194 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 195 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 196 | 197 | % ---------------------------------------------------------- 198 | % (3) Compute Filterbank Output Energies (in dB scale) 199 | % ---------------------------------------------------------- 200 | 201 | clean_energy=zeros(1,num_crit); 202 | processed_energy=zeros(1,num_crit); 203 | error_energy=zeros(1,num_crit); 204 | 205 | for i = 1:num_crit 206 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 207 | .*crit_filter(i,:)'); 208 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 209 | .*crit_filter(i,:)'); 210 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 211 | end 212 | 213 | 214 | SNRlog=10*log10((clean_energy.^2)./error_energy); 215 | 216 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 217 | 218 | start = start + skiprate; 219 | 220 | end 221 | 222 | -------------------------------------------------------------------------------- /comp_is.m: -------------------------------------------------------------------------------- 1 | function is_mean= comp_is(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % Itakura-Saito (IS) Objective Speech Quality Measure 4 | % 5 | % This function implements the Itakura-Saito distance measure 6 | % defined on page 50 of [1] (see Equation 2.26). See also 7 | % Equation 12 (page 1480) of [2]. 
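% (Added note:) with a_c, a_p the LPC coefficient vectors of the clean and
% processed frames, R_c the Toeplitz autocorrelation matrix of the clean frame,
% and sigma_c^2, sigma_p^2 the corresponding LPC gains, the per-frame value
% computed below is
%   d_IS = (sigma_c^2/sigma_p^2) * (a_p*R_c*a_p') / (a_c*R_c*a_c')
%          + log(sigma_p^2/sigma_c^2) - 1,
% limited to a maximum of 100 before the frames are sorted and averaged.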
8 | % 9 | % Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % IS - computed Itakura Saito measure 14 | % 15 | % Note that the IS measure is limited in the range [0, 100]. 16 | % 17 | % Example call: IS =comp_is('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 23 | % Objective Measures of Speech Quality. Prentice Hall 24 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 25 | % ISBN: 0-13-629056-6. 26 | % 27 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 28 | % Speech Coder Performance Evaluation", AT&T Bell 29 | % Laboratories Technical Journal, Vol. 63, No. 8, 30 | % October 1984, pp. 1477-1498. 31 | % 32 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 33 | % Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100] 34 | % 35 | % Copyright (c) 2006 by Philipos C. Loizou 36 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 37 | 38 | % ---------------------------------------------------------------------- 39 | 40 | if nargin~=2 41 | fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n'); 42 | fprintf('For more help, type: help comp_is\n\n'); 43 | return; 44 | end 45 | 46 | alpha=0.95; 47 | 48 | [data1, Srate1, Nbits1]= wavread(cleanFile); 49 | [data2, Srate2, Nbits2]= wavread(enhdFile); 50 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 51 | error( 'The two files do not match!\n'); 52 | end 53 | 54 | len= min( length( data1), length( data2)); 55 | data1= data1( 1: len)+eps; 56 | data2= data2( 1: len)+eps; 57 | 58 | 59 | IS_dist= is( data1, data2,Srate1); 60 | 61 | IS_len= round( length( IS_dist)* alpha); 62 | IS= sort( IS_dist); 63 | 64 | is_mean= mean( IS( 1: IS_len)); 65 | 66 | 67 | 68 | function distortion = is(clean_speech, processed_speech,sample_rate) 69 | 70 | 71 | % ---------------------------------------------------------------------- 72 | % Check the length of the clean and processed speech. Must be the same. 73 | % ---------------------------------------------------------------------- 74 | 75 | clean_length = length(clean_speech); 76 | processed_length = length(processed_speech); 77 | 78 | if (clean_length ~= processed_length) 79 | disp('Error: Both Speech Files must be same length.'); 80 | return 81 | end 82 | 83 | % ---------------------------------------------------------------------- 84 | % Scale both clean speech and processed speech to have same dynamic 85 | % range. Also remove DC component from each signal 86 | % ---------------------------------------------------------------------- 87 | 88 | %clean_speech = clean_speech - mean(clean_speech); 89 | %processed_speech = processed_speech - mean(processed_speech); 90 | 91 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 92 | 93 | % ---------------------------------------------------------------------- 94 | % Global Variables 95 | % ---------------------------------------------------------------------- 96 | 97 | %sample_rate = 8000; % default sample rate 98 | winlength = round(30*sample_rate/1000); %240; % window length in samples 99 | skiprate = floor(winlength/4); % window skip in samples 100 | if sample_rate<10000 101 | P = 10; % LPC Analysis Order 102 | else 103 | P=16; % this could vary depending on sampling frequency. 
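% (Added note:) a common rule of thumb is roughly one LPC pole per kHz of
% sampling rate plus a few extra for the glottal/lip-radiation model, which is
% consistent with the P = 10 (below 10 kHz) and P = 16 (otherwise) choices here.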
104 | end 105 | % ---------------------------------------------------------------------- 106 | % For each frame of input speech, calculate the Itakura-Saito Measure 107 | % ---------------------------------------------------------------------- 108 | 109 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 110 | start = 1; % starting sample 111 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 112 | 113 | for frame_count = 1:num_frames 114 | 115 | % ---------------------------------------------------------- 116 | % (1) Get the Frames for the test and reference speech. 117 | % Multiply by Hanning Window. 118 | % ---------------------------------------------------------- 119 | 120 | clean_frame = clean_speech(start:start+winlength-1); 121 | processed_frame = processed_speech(start:start+winlength-1); 122 | clean_frame = clean_frame.*window; 123 | processed_frame = processed_frame.*window; 124 | 125 | % ---------------------------------------------------------- 126 | % (2) Get the autocorrelation lags and LPC parameters used 127 | % to compute the IS measure. 128 | % ---------------------------------------------------------- 129 | 130 | [R_clean, Ref_clean, A_clean] = ... 131 | lpcoeff(clean_frame, P); 132 | [R_processed, Ref_processed, A_processed] = ... 133 | lpcoeff(processed_frame, P); 134 | 135 | 136 | % ---------------------------------------------------------- 137 | % (3) Compute the IS measure 138 | % ---------------------------------------------------------- 139 | 140 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 141 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 142 | gain_clean = max(R_clean*A_clean',eps); % this is gain 143 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 144 | 145 | 146 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 147 | log(gain_processed/gain_clean)-1; 148 | 149 | distortion(frame_count) = min(ISvalue,100); 150 | start = start + skiprate; 151 | 152 | end 153 | 154 | 155 | 156 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 157 | 158 | % ---------------------------------------------------------- 159 | % (1) Compute Autocorrelation Lags 160 | % ---------------------------------------------------------- 161 | 162 | winlength = max(size(speech_frame)); 163 | for k=1:model_order+1 164 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
165 | .*speech_frame(k:winlength)); 166 | end 167 | 168 | % ---------------------------------------------------------- 169 | % (2) Levinson-Durbin 170 | % ---------------------------------------------------------- 171 | 172 | a = ones(1,model_order); 173 | E(1)=R(1); 174 | for i=1:model_order 175 | a_past(1:i-1) = a(1:i-1); 176 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 177 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 178 | a(i)=rcoeff(i); 179 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 180 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 181 | end 182 | 183 | acorr = R; 184 | refcoeff = rcoeff; 185 | lpparams = [1 -a]; 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /comp_llr.m: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of [1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_llr\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
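%
% (Sketch of what this helper computes: with a_c, a_p the LPC coefficient
% vectors of the clean and processed frame and R_c the Toeplitz
% autocorrelation matrix of the clean frame, each frame contributes
% log( (a_p*R_c*a_p') / (a_c*R_c*a_c') ), capped at 2; comp_llr then
% sorts the frame values and averages the smallest 95% (alpha = 0.95).)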
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /comp_snr.m: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % This function implements the segmental signal-to-noise ratio 6 | % as defined in [1, p. 45] (see Equation 2.12). 7 | % 8 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % SNRovl - overall SNR (dB) 13 | % SNRseg - segmental SNR (dB) 14 | % 15 | % This function returns 2 parameters. The first item is the 16 | % overall SNR for the two speech signals. The second value 17 | % is the segmental signal-to-noise ratio (1 seg-snr per 18 | % frame of input). The segmental SNR is clamped to range 19 | % between 35dB and -10dB (see suggestions in [2]). 20 | % 21 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav') 22 | % 23 | % References: 24 | % 25 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 26 | % Objective Measures of Speech Quality. Prentice Hall 27 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 28 | % ISBN: 0-13-629056-6. 29 | % 30 | % [2] P. E. Papamichalis, Practical Approaches to Speech 31 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 32 | % ISBN: 0-13-689019-9. (see pages 179-181). 33 | % 34 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 35 | % Modified by: Philipos C. Loizou (Oct 2006) 36 | % 37 | % Copyright (c) 2006 by Philipos C. Loizou 38 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 39 | %------------------------------------------------------------------------- 40 | 41 | if nargin ~=2 42 | fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n'); 43 | return; 44 | end 45 | 46 | [data1, Srate1, Nbits1]= wavread(cleanFile); 47 | [data2, Srate2, Nbits2]= wavread(enhdFile); 48 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2) | ( length( data1)~= length( data2))) 49 | error( 'The two files do not match!\n'); 50 | end 51 | 52 | % len= min( length( data1), length( data2)); 53 | % data1= data1( 1: len); 54 | % data2= data2( 1: len); 55 | % data1= (data1 - mean(data1))/std(data1); % MVN 56 | % data2= (data2 - mean(data2))/std(data2); 57 | 58 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 59 | 60 | snr_mean= snr_dist; 61 | segsnr_mean= mean( segsnr_dist); 62 | 63 | 64 | % ========================================================================= 65 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 66 | 67 | % ---------------------------------------------------------------------- 68 | % Check the length of the clean and processed speech. Must be the same. 
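%
% (Sketch of the computation below: the overall SNR is
% 10*log10( sum(clean.^2) / sum((clean-processed).^2) ) over the whole
% signals, while the segmental SNR applies the same ratio to 30-ms
% Hanning-windowed frames with 75% overlap, clamps each frame value to
% [-10, 35] dB, and leaves the averaging over frames to the caller.)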
69 | % ---------------------------------------------------------------------- 70 | 71 | clean_length = length(clean_speech); 72 | processed_length = length(processed_speech); 73 | 74 | if (clean_length ~= processed_length) 75 | disp('Error: Both Speech Files must be same length.'); 76 | return 77 | end 78 | 79 | % ---------------------------------------------------------------------- 80 | % Scale both clean speech and processed speech to have same dynamic 81 | % range. Also remove DC component from each signal 82 | % ---------------------------------------------------------------------- 83 | 84 | %clean_speech = clean_speech - mean(clean_speech); 85 | %processed_speech = processed_speech - mean(processed_speech); 86 | 87 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 88 | 89 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 90 | 91 | % ---------------------------------------------------------------------- 92 | % Global Variables 93 | % ---------------------------------------------------------------------- 94 | 95 | 96 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 97 | skiprate = floor(winlength/4); %60; % window skip in samples 98 | MIN_SNR = -10; % minimum SNR in dB 99 | MAX_SNR = 35; % maximum SNR in dB 100 | 101 | % ---------------------------------------------------------------------- 102 | % For each frame of input speech, calculate the Segmental SNR 103 | % ---------------------------------------------------------------------- 104 | 105 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 106 | start = 1; % starting sample 107 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 108 | 109 | for frame_count = 1: num_frames 110 | 111 | % ---------------------------------------------------------- 112 | % (1) Get the Frames for the test and reference speech. 113 | % Multiply by Hanning Window. 114 | % ---------------------------------------------------------- 115 | 116 | clean_frame = clean_speech(start:start+winlength-1); 117 | processed_frame = processed_speech(start:start+winlength-1); 118 | clean_frame = clean_frame.*window; 119 | processed_frame = processed_frame.*window; 120 | 121 | % ---------------------------------------------------------- 122 | % (2) Compute the Segmental SNR 123 | % ---------------------------------------------------------- 124 | 125 | signal_energy = sum(clean_frame.^2); 126 | noise_energy = sum((clean_frame-processed_frame).^2); 127 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 128 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 129 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 130 | 131 | start = start + skiprate; 132 | 133 | end 134 | 135 | -------------------------------------------------------------------------------- /comp_wss.m: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). 
The intensities within each critical band are
11 | % measured. Then, weighted distances between the measured
12 | % slopes of the log-critical band spectra are computed.
13 | % This measure is also described in Section 2.2.9 (pages 56-58)
14 | % of [2].
15 | %
16 | % Whereas Klatt's original measure used 36 critical-band
17 | % filters to estimate the smoothed short-time spectrum, this
18 | % implementation considers a bank of 25 filters spanning
19 | % the 4 kHz bandwidth.
20 | %
21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav)
22 | %
23 | % cleanFile.wav - clean input file in .wav format
24 | % enhancedFile - enhanced output file in .wav format
25 | % wss_dist - computed spectral slope distance
26 | %
27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav')
28 | %
29 | % References:
30 | %
31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance
32 | % from Critical-Band Spectra: A First Step", Proc. IEEE
33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982.
34 | %
35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements,
36 | % Objective Measures of Speech Quality. Prentice Hall
37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988,
38 | % ISBN: 0-13-629056-6.
39 | %
40 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
41 | % Modified by: Philipos C. Loizou (Oct 2006)
42 | %
43 | % Copyright (c) 2006 by Philipos C. Loizou
44 | % $Revision: 0.0 $ $Date: 10/09/2006 $
45 | %
46 | % ----------------------------------------------------------------------
47 | if nargin~=2
48 | fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n');
49 | fprintf('For more help, type: help comp_wss\n\n');
50 | return;
51 | end
52 |
53 | alpha= 0.95;
54 |
55 | [data1, Srate1, Nbits1]= wavread(cleanFile);
56 | [data2, Srate2, Nbits2]= wavread(enhancedFile);
57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
58 | error( 'The two files do not match!\n');
59 | end
60 |
61 | len= min( length( data1), length( data2));
62 | data1= data1( 1: len)+eps;
63 | data2= data2( 1: len)+eps;
64 |
65 | wss_dist_vec= wss( data1, data2,Srate1);
66 | wss_dist_vec= sort( wss_dist_vec);
67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha)));
68 |
69 |
70 |
71 | function distortion = wss(clean_speech, processed_speech,sample_rate)
72 |
73 |
74 | % ----------------------------------------------------------------------
75 | % Check the length of the clean and processed speech. Must be the same.
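%
% (Sketch of the per-frame value computed below: band energies are taken
% through 25 Gaussian-shaped critical-band filters and converted to dB;
% the spectral slopes are the differences between adjacent band energies,
% and each frame contributes
%    sum_k W(k)*(slope_clean(k) - slope_processed(k))^2 / sum_k W(k),
% where the weights W(k) emphasize spectral peaks via the constants
% Kmax = 20 and Klocmax = 1 and are averaged between the clean and
% processed frame.)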
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | winlength = round(30*sample_rate/1000); % window length in samples 93 | skiprate = floor(winlength/4); % window skip in samples 94 | max_freq = sample_rate/2; % maximum bandwidth 95 | num_crit = 25; % number of critical bands 96 | 97 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 98 | n_fft = 2^nextpow2(2*winlength); 99 | n_fftby2 = n_fft/2; % FFT size/2 100 | Kmax = 20; % value suggested by Klatt, pg 1280 101 | Klocmax = 1; % value suggested by Klatt, pg 1280 102 | 103 | % ---------------------------------------------------------------------- 104 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 105 | % ---------------------------------------------------------------------- 106 | 107 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 108 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 109 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 110 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 111 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 112 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 113 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 114 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 115 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 116 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 117 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 118 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 119 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 120 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 121 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 122 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 123 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 124 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 125 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 126 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 127 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 128 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 129 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 130 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 131 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 132 | 133 | bw_min = bandwidth (1); % minimum critical bandwidth 134 | 135 | % ---------------------------------------------------------------------- 136 | % Set up the critical band filters. Note here that Gaussianly shaped 137 | % filters are used. Also, the sum of the filter weights are equivalent 138 | % for each critical band filter. Filter less than -30 dB and set to 139 | % zero. 
140 | % ---------------------------------------------------------------------- 141 | 142 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 143 | 144 | for i = 1:num_crit 145 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 146 | all_f0(i) = floor(f0); 147 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 148 | norm_factor = log(bw_min) - log(bandwidth(i)); 149 | j = 0:1:n_fftby2-1; 150 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 151 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 152 | end 153 | 154 | % ---------------------------------------------------------------------- 155 | % For each frame of input speech, calculate the Weighted Spectral 156 | % Slope Measure 157 | % ---------------------------------------------------------------------- 158 | 159 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 160 | start = 1; % starting sample 161 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 162 | 163 | for frame_count = 1:num_frames 164 | 165 | % ---------------------------------------------------------- 166 | % (1) Get the Frames for the test and reference speech. 167 | % Multiply by Hanning Window. 168 | % ---------------------------------------------------------- 169 | 170 | clean_frame = clean_speech(start:start+winlength-1); 171 | processed_frame = processed_speech(start:start+winlength-1); 172 | clean_frame = clean_frame.*window; 173 | processed_frame = processed_frame.*window; 174 | 175 | % ---------------------------------------------------------- 176 | % (2) Compute the Power Spectrum of Clean and Processed 177 | % ---------------------------------------------------------- 178 | 179 | if (USE_FFT_SPECTRUM) 180 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 181 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 182 | else 183 | a_vec = zeros(1,n_fft); 184 | a_vec(1:11) = lpc(clean_frame,10); 185 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 186 | 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(processed_frame,10); 189 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | end 191 | 192 | % ---------------------------------------------------------- 193 | % (3) Compute Filterbank Output Energies (in dB scale) 194 | % ---------------------------------------------------------- 195 | 196 | for i = 1:num_crit 197 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 198 | .*crit_filter(i,:)'); 199 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 200 | .*crit_filter(i,:)'); 201 | end 202 | clean_energy = 10*log10(max(clean_energy,1E-10)); 203 | processed_energy = 10*log10(max(processed_energy,1E-10)); 204 | 205 | % ---------------------------------------------------------- 206 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 207 | % ---------------------------------------------------------- 208 | 209 | clean_slope = clean_energy(2:num_crit) - ... 210 | clean_energy(1:num_crit-1); 211 | processed_slope = processed_energy(2:num_crit) - ... 212 | processed_energy(1:num_crit-1); 213 | 214 | % ---------------------------------------------------------- 215 | % (5) Find the nearest peak locations in the spectra to 216 | % each critical band. If the slope is negative, we 217 | % search to the left. If positive, we search to the 218 | % right. 
219 | % ----------------------------------------------------------
220 |
221 | for i = 1:num_crit-1
222 |
223 | % find the peaks in the clean speech signal
224 |
225 | if (clean_slope(i)>0) % search to the right
226 | n = i;
227 | while ((n<num_crit) & (clean_slope(n) > 0))
228 | n = n+1;
229 | end
230 | clean_loc_peak(i) = clean_energy(n-1);
231 | else % search to the left
232 | n = i;
233 | while ((n>0) & (clean_slope(n) <= 0))
234 | n = n-1;
235 | end
236 | clean_loc_peak(i) = clean_energy(n+1);
237 | end
238 |
239 | % find the peaks in the processed speech signal
240 |
241 | if (processed_slope(i)>0) % search to the right
242 | n = i;
243 | while ((n<num_crit) & (processed_slope(n) > 0))
244 | n = n+1;
245 | end
246 | processed_loc_peak(i) = processed_energy(n-1);
247 | else % search to the left
248 | n = i;
249 | while ((n>0) & (processed_slope(n) <= 0))
250 | n = n-1;
251 | end
252 | processed_loc_peak(i) = processed_energy(n+1);
253 | end
254 |
255 | end
256 |
257 | % ----------------------------------------------------------
258 | % (6) Compute the WSS Measure for this frame. This
259 | % includes determination of the weighting function.
260 | % ----------------------------------------------------------
261 |
262 | dBMax_clean = max(clean_energy);
263 | dBMax_processed = max(processed_energy);
264 |
265 | % The weights are calculated by averaging individual
266 | % weighting factors from the clean and processed frame.
267 | % These weights W_clean and W_processed should range
268 | % from 0 to 1 and place more emphasis on spectral
269 | % peaks and less emphasis on slope differences in spectral
270 | % valleys. This procedure is described on page 1280 of
271 | % Klatt's 1982 ICASSP paper.
272 |
273 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
274 | clean_energy(1:num_crit-1));
275 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
276 | clean_energy(1:num_crit-1));
277 | W_clean = Wmax_clean .* Wlocmax_clean;
278 |
279 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
280 | processed_energy(1:num_crit-1));
281 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
282 | processed_energy(1:num_crit-1));
283 | W_processed = Wmax_processed .* Wlocmax_processed;
284 |
285 | W = (W_clean + W_processed)./2.0;
286 |
287 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
288 | processed_slope(1:num_crit-1)).^2);
289 |
290 | % this normalization is not part of Klatt's paper, but helps
291 | % to normalize the measure. Here we scale the measure by the
292 | % sum of the weights.
293 |
294 | distortion(frame_count) = distortion(frame_count)/sum(W);
295 |
296 | start = start + skiprate;
297 |
298 | end
299 |
300 |
-------------------------------------------------------------------------------- /composite.m: --------------------------------------------------------------------------------
1 | function [Csig,Cbak,Covl]= composite(cleanFile, enhancedFile);
2 | % ----------------------------------------------------------------------
3 | % Composite Objective Speech Quality Measure
4 | %
5 | % This function implements the composite objective measure proposed in
6 | % [1].
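%
% (For orientation, the mapping applied near the end of this file is a
% set of linear regressions on the basic measures, roughly:
%    Csig = 3.093 - 1.029*LLR + 0.603*PESQ - 0.009*WSS
%    Cbak = 1.634 + 0.478*PESQ - 0.007*WSS + 0.063*segSNR
%    Covl = 1.594 + 0.805*PESQ - 0.512*LLR - 0.007*WSS
% with each prediction clipped to the MOS range [1, 5]; the raw PESQ
% score is used rather than the MOS-mapped value.)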
7 | % 8 | % Usage: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % sig - predicted rating [1-5] of speech distortion 13 | % bak - predicted rating [1-5] of noise distortion 14 | % ovl - predicted rating [1-5] of overall quality 15 | % 16 | % In addition to the above ratings (sig, bak, & ovl) it returns 17 | % the individual values of the LLR, SNRseg, WSS and PESQ measures. 18 | % 19 | % Example call: [sig,bak,ovl] =composite('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures 25 | % for speech enhancement. Proc. Interspeech, Pittsburg, PA. 26 | % 27 | % Authors: Yi Hu and Philipos C. Loizou 28 | % (the LLR, SNRseg and WSS measures were based on Bryan Pellom and John 29 | % Hansen's implementations) 30 | % 31 | % Copyright (c) 2006 by Philipos C. Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: [sig,bak,ovl]=composite(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help composite\n\n'); 39 | return; 40 | end 41 | 42 | alpha= 0.95; 43 | 44 | [data1, Srate1, Nbits1]= wavread(cleanFile); 45 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 46 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) | length(data1)~=length(data2) 47 | disp(Srate1); 48 | disp(Srate2); 49 | disp(Nbits1); 50 | disp(Nbits2); 51 | disp(length(data1)); 52 | disp(length(data2)); 53 | error( 'The two files do not match!\n'); 54 | end 55 | 56 | len= min( length( data1), length( data2)); 57 | data1= data1( 1: len)+eps; 58 | data2= data2( 1: len)+eps; 59 | 60 | 61 | % -- compute the WSS measure --- 62 | % 63 | wss_dist_vec= wss( data1, data2,Srate1); 64 | wss_dist_vec= sort( wss_dist_vec); 65 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 66 | 67 | % --- compute the LLR measure --------- 68 | % 69 | LLR_dist= llr( data1, data2,Srate1); 70 | LLRs= sort(LLR_dist); 71 | LLR_len= round( length(LLR_dist)* alpha); 72 | llr_mean= mean( LLRs( 1: LLR_len)); 73 | 74 | % --- compute the SNRseg ---------------- 75 | % 76 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 77 | snr_mean= snr_dist; 78 | segSNR= mean( segsnr_dist); 79 | 80 | 81 | % -- compute the pesq ---- 82 | % 83 | % if Srate1==8000, mode='nb'; 84 | % elseif Srate1 == 16000, mode='wb'; 85 | % else, 86 | % error ('Sampling freq in PESQ needs to be 8 kHz or 16 kHz'); 87 | % end 88 | 89 | 90 | [pesq_mos_scores]= comp_pesq(cleanFile, enhancedFile); 91 | 92 | if length(pesq_mos_scores)==2 93 | pesq_mos=pesq_mos_scores(1); % take the raw PESQ value instead of the 94 | % MOS-mapped value (this composite 95 | % measure was only validated with the raw 96 | % PESQ value) 97 | else 98 | pesq_mos=pesq_mos_scores; 99 | end 100 | 101 | % --- now compute the composite measures ------------------ 102 | % 103 | Csig = 3.093 - 1.029*llr_mean + 0.603*pesq_mos-0.009*wss_dist; 104 | Csig = max(1,Csig); Csig=min(5, Csig); % limit values to [1, 5] 105 | Cbak = 1.634 + 0.478 *pesq_mos - 0.007*wss_dist + 0.063*segSNR; 106 | Cbak = max(1, Cbak); Cbak=min(5,Cbak); % limit values to [1, 5] 107 | Covl = 1.594 + 0.805*pesq_mos - 0.512*llr_mean - 0.007*wss_dist; 108 | Covl = max(1, Covl); Covl=min(5, Covl); % limit values to [1, 5] 109 | 110 | %fprintf('\n LLR=%f SNRseg=%f WSS=%f 
PESQ=%f\n',llr_mean,segSNR,wss_dist,pesq_mos); 111 | 112 | return; %================================================================= 113 | 114 | 115 | function distortion = wss(clean_speech, processed_speech,sample_rate) 116 | 117 | 118 | % ---------------------------------------------------------------------- 119 | % Check the length of the clean and processed speech. Must be the same. 120 | % ---------------------------------------------------------------------- 121 | 122 | clean_length = length(clean_speech); 123 | processed_length = length(processed_speech); 124 | 125 | if (clean_length ~= processed_length) 126 | disp('Error: Files musthave same length.'); 127 | return 128 | end 129 | 130 | 131 | 132 | % ---------------------------------------------------------------------- 133 | % Global Variables 134 | % ---------------------------------------------------------------------- 135 | 136 | winlength = round(30*sample_rate/1000); %240; % window length in samples 137 | skiprate = floor(winlength/4); % window skip in samples 138 | max_freq = sample_rate/2; % maximum bandwidth 139 | num_crit = 25; % number of critical bands 140 | 141 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 142 | n_fft = 2^nextpow2(2*winlength); 143 | n_fftby2 = n_fft/2; % FFT size/2 144 | Kmax = 20; % value suggested by Klatt, pg 1280 145 | Klocmax = 1; % value suggested by Klatt, pg 1280 146 | 147 | % ---------------------------------------------------------------------- 148 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 149 | % ---------------------------------------------------------------------- 150 | 151 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 152 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 153 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 154 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 155 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 156 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 157 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 158 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 159 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 160 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 161 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 162 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 163 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 164 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 165 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 166 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 167 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 168 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 169 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 170 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 171 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 172 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 173 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 174 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 175 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 176 | 177 | bw_min = bandwidth (1); % minimum critical bandwidth 178 | 179 | % ---------------------------------------------------------------------- 180 | % Set up the critical band filters. Note here that Gaussianly shaped 181 | % filters are used. Also, the sum of the filter weights are equivalent 182 | % for each critical band filter. Filter less than -30 dB and set to 183 | % zero. 
184 | % ---------------------------------------------------------------------- 185 | 186 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 187 | 188 | for i = 1:num_crit 189 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 190 | all_f0(i) = floor(f0); 191 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 192 | norm_factor = log(bw_min) - log(bandwidth(i)); 193 | j = 0:1:n_fftby2-1; 194 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 195 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 196 | end 197 | 198 | % ---------------------------------------------------------------------- 199 | % For each frame of input speech, calculate the Weighted Spectral 200 | % Slope Measure 201 | % ---------------------------------------------------------------------- 202 | 203 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 204 | start = 1; % starting sample 205 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 206 | 207 | for frame_count = 1:num_frames 208 | 209 | % ---------------------------------------------------------- 210 | % (1) Get the Frames for the test and reference speech. 211 | % Multiply by Hanning Window. 212 | % ---------------------------------------------------------- 213 | 214 | clean_frame = clean_speech(start:start+winlength-1); 215 | processed_frame = processed_speech(start:start+winlength-1); 216 | clean_frame = clean_frame.*window; 217 | processed_frame = processed_frame.*window; 218 | 219 | % ---------------------------------------------------------- 220 | % (2) Compute the Power Spectrum of Clean and Processed 221 | % ---------------------------------------------------------- 222 | 223 | if (USE_FFT_SPECTRUM) 224 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 225 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 226 | else 227 | a_vec = zeros(1,n_fft); 228 | a_vec(1:11) = lpc(clean_frame,10); 229 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 230 | 231 | a_vec = zeros(1,n_fft); 232 | a_vec(1:11) = lpc(processed_frame,10); 233 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 234 | end 235 | 236 | % ---------------------------------------------------------- 237 | % (3) Compute Filterbank Output Energies (in dB scale) 238 | % ---------------------------------------------------------- 239 | 240 | for i = 1:num_crit 241 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 242 | .*crit_filter(i,:)'); 243 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 244 | .*crit_filter(i,:)'); 245 | end 246 | clean_energy = 10*log10(max(clean_energy,1E-10)); 247 | processed_energy = 10*log10(max(processed_energy,1E-10)); 248 | 249 | % ---------------------------------------------------------- 250 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 251 | % ---------------------------------------------------------- 252 | 253 | clean_slope = clean_energy(2:num_crit) - ... 254 | clean_energy(1:num_crit-1); 255 | processed_slope = processed_energy(2:num_crit) - ... 256 | processed_energy(1:num_crit-1); 257 | 258 | % ---------------------------------------------------------- 259 | % (5) Find the nearest peak locations in the spectra to 260 | % each critical band. If the slope is negative, we 261 | % search to the left. If positive, we search to the 262 | % right. 
263 | % ----------------------------------------------------------
264 |
265 | for i = 1:num_crit-1
266 |
267 | % find the peaks in the clean speech signal
268 |
269 | if (clean_slope(i)>0) % search to the right
270 | n = i;
271 | while ((n<num_crit) & (clean_slope(n) > 0))
272 | n = n+1;
273 | end
274 | clean_loc_peak(i) = clean_energy(n-1);
275 | else % search to the left
276 | n = i;
277 | while ((n>0) & (clean_slope(n) <= 0))
278 | n = n-1;
279 | end
280 | clean_loc_peak(i) = clean_energy(n+1);
281 | end
282 |
283 | % find the peaks in the processed speech signal
284 |
285 | if (processed_slope(i)>0) % search to the right
286 | n = i;
287 | while ((n<num_crit) & (processed_slope(n) > 0))
288 | n = n+1;
289 | end
290 | processed_loc_peak(i) = processed_energy(n-1);
291 | else % search to the left
292 | n = i;
293 | while ((n>0) & (processed_slope(n) <= 0))
294 | n = n-1;
295 | end
296 | processed_loc_peak(i) = processed_energy(n+1);
297 | end
298 |
299 | end
300 |
301 | % ----------------------------------------------------------
302 | % (6) Compute the WSS Measure for this frame. This
303 | % includes determination of the weighting function.
304 | % ----------------------------------------------------------
305 |
306 | dBMax_clean = max(clean_energy);
307 | dBMax_processed = max(processed_energy);
308 |
309 | % The weights are calculated by averaging individual
310 | % weighting factors from the clean and processed frame.
311 | % These weights W_clean and W_processed should range
312 | % from 0 to 1 and place more emphasis on spectral
313 | % peaks and less emphasis on slope differences in spectral
314 | % valleys. This procedure is described on page 1280 of
315 | % Klatt's 1982 ICASSP paper.
316 |
317 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ...
318 | clean_energy(1:num_crit-1));
319 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
320 | clean_energy(1:num_crit-1));
321 | W_clean = Wmax_clean .* Wlocmax_clean;
322 |
323 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ...
324 | processed_energy(1:num_crit-1));
325 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ...
326 | processed_energy(1:num_crit-1));
327 | W_processed = Wmax_processed .* Wlocmax_processed;
328 |
329 | W = (W_clean + W_processed)./2.0;
330 |
331 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ...
332 | processed_slope(1:num_crit-1)).^2);
333 |
334 | % this normalization is not part of Klatt's paper, but helps
335 | % to normalize the measure. Here we scale the measure by the
336 | % sum of the weights.
337 |
338 | distortion(frame_count) = distortion(frame_count)/sum(W);
339 |
340 | start = start + skiprate;
341 |
342 | end
343 |
344 | %-----------------------------------------------
345 | function distortion = llr(clean_speech, processed_speech,sample_rate)
346 |
347 |
348 | % ----------------------------------------------------------------------
349 | % Check the length of the clean and processed speech. Must be the same.
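%
% (Note: unlike the standalone comp_llr.m, this local copy does not cap
% the per-frame log likelihood ratio at 2; the 95%-trimmed mean over
% frames is taken in the same way.)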
350 | % ---------------------------------------------------------------------- 351 | 352 | clean_length = length(clean_speech); 353 | processed_length = length(processed_speech); 354 | 355 | if (clean_length ~= processed_length) 356 | disp('Error: Both Speech Files must be same length.'); 357 | return 358 | end 359 | 360 | % ---------------------------------------------------------------------- 361 | % Global Variables 362 | % ---------------------------------------------------------------------- 363 | 364 | winlength = round(30*sample_rate/1000); % window length in samples 365 | skiprate = floor(winlength/4); % window skip in samples 366 | if sample_rate<10000 367 | P = 10; % LPC Analysis Order 368 | else 369 | P=16; % this could vary depending on sampling frequency. 370 | end 371 | 372 | % ---------------------------------------------------------------------- 373 | % For each frame of input speech, calculate the Log Likelihood Ratio 374 | % ---------------------------------------------------------------------- 375 | 376 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 377 | start = 1; % starting sample 378 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 379 | 380 | for frame_count = 1:num_frames 381 | 382 | % ---------------------------------------------------------- 383 | % (1) Get the Frames for the test and reference speech. 384 | % Multiply by Hanning Window. 385 | % ---------------------------------------------------------- 386 | 387 | clean_frame = clean_speech(start:start+winlength-1); 388 | processed_frame = processed_speech(start:start+winlength-1); 389 | clean_frame = clean_frame.*window; 390 | processed_frame = processed_frame.*window; 391 | 392 | % ---------------------------------------------------------- 393 | % (2) Get the autocorrelation lags and LPC parameters used 394 | % to compute the LLR measure. 395 | % ---------------------------------------------------------- 396 | 397 | [R_clean, Ref_clean, A_clean] = ... 398 | lpcoeff(clean_frame, P); 399 | [R_processed, Ref_processed, A_processed] = ... 400 | lpcoeff(processed_frame, P); 401 | 402 | % ---------------------------------------------------------- 403 | % (3) Compute the LLR measure 404 | % ---------------------------------------------------------- 405 | 406 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 407 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 408 | distortion(frame_count) = log(numerator/denominator); 409 | start = start + skiprate; 410 | 411 | end 412 | 413 | %--------------------------------------------- 414 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 415 | 416 | % ---------------------------------------------------------- 417 | % (1) Compute Autocorrelation Lags 418 | % ---------------------------------------------------------- 419 | 420 | winlength = max(size(speech_frame)); 421 | for k=1:model_order+1 422 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
423 | .*speech_frame(k:winlength)); 424 | end 425 | 426 | % ---------------------------------------------------------- 427 | % (2) Levinson-Durbin 428 | % ---------------------------------------------------------- 429 | 430 | a = ones(1,model_order); 431 | E(1)=R(1); 432 | for i=1:model_order 433 | a_past(1:i-1) = a(1:i-1); 434 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 435 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 436 | a(i)=rcoeff(i); 437 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 438 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 439 | end 440 | 441 | acorr = R; 442 | refcoeff = rcoeff; 443 | lpparams = [1 -a]; 444 | 445 | 446 | % ---------------------------------------------------------------------- 447 | 448 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 449 | 450 | % ---------------------------------------------------------------------- 451 | % Check the length of the clean and processed speech. Must be the same. 452 | % ---------------------------------------------------------------------- 453 | 454 | clean_length = length(clean_speech); 455 | processed_length = length(processed_speech); 456 | 457 | if (clean_length ~= processed_length) 458 | disp('Error: Both Speech Files must be same length.'); 459 | return 460 | end 461 | 462 | % ---------------------------------------------------------------------- 463 | % Scale both clean speech and processed speech to have same dynamic 464 | % range. Also remove DC component from each signal 465 | % ---------------------------------------------------------------------- 466 | 467 | %clean_speech = clean_speech - mean(clean_speech); 468 | %processed_speech = processed_speech - mean(processed_speech); 469 | 470 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 471 | 472 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 473 | 474 | % ---------------------------------------------------------------------- 475 | % Global Variables 476 | % ---------------------------------------------------------------------- 477 | 478 | winlength = round(30*sample_rate/1000); %240; % window length in samples 479 | skiprate = floor(winlength/4); % window skip in samples 480 | MIN_SNR = -10; % minimum SNR in dB 481 | MAX_SNR = 35; % maximum SNR in dB 482 | 483 | % ---------------------------------------------------------------------- 484 | % For each frame of input speech, calculate the Segmental SNR 485 | % ---------------------------------------------------------------------- 486 | 487 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 488 | start = 1; % starting sample 489 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 490 | 491 | for frame_count = 1: num_frames 492 | 493 | % ---------------------------------------------------------- 494 | % (1) Get the Frames for the test and reference speech. 495 | % Multiply by Hanning Window. 
496 | % ---------------------------------------------------------- 497 | 498 | clean_frame = clean_speech(start:start+winlength-1); 499 | processed_frame = processed_speech(start:start+winlength-1); 500 | clean_frame = clean_frame.*window; 501 | processed_frame = processed_frame.*window; 502 | 503 | % ---------------------------------------------------------- 504 | % (2) Compute the Segmental SNR 505 | % ---------------------------------------------------------- 506 | 507 | signal_energy = sum(clean_frame.^2); 508 | noise_energy = sum((clean_frame-processed_frame).^2); 509 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 510 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 511 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 512 | 513 | start = start + skiprate; 514 | 515 | end 516 | 517 | 518 | 519 | -------------------------------------------------------------------------------- /enhanced.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/enhanced.wav -------------------------------------------------------------------------------- /estoi.m: -------------------------------------------------------------------------------- 1 | function d = estoi(x, y, fs_signal) 2 | % % % % 3 | % from https://github.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/blob/master/estoi.m 4 | % % % % 5 | % d = estoi(x, y, fs_signal) returns the output of the extended short-time 6 | % objective intelligibility (ESTOI) predictor. 7 | % 8 | % Implementation of the Extended Short-Time Objective 9 | % Intelligibility (ESTOI) predictor, described in Jesper Jensen and 10 | % Cees H. Taal, "An Algorithm for Predicting the Intelligibility of 11 | % Speech Masked by Modulated Noise Maskers," IEEE Transactions on 12 | % Audio, Speech and Language Processing, 2016. 13 | % 14 | % Input: 15 | % x: clean reference time domain signal 16 | % y: noisy/processed time domain signal 17 | % fs_signal: sampling rate [Hz] 18 | % 19 | % Output: 20 | % d: intelligibility index 21 | % 22 | % 23 | % Copyright 2016: Aalborg University, Section for Signal and Information Processing. 24 | % The software is free for non-commercial use. 25 | % The software comes WITHOUT ANY WARRANTY. 26 | 27 | 28 | if length(x)~=length(y) 29 | error('x and y should have the same length'); 30 | end 31 | 32 | % initialization 33 | x = x(:); % clean speech column vector 34 | y = y(:); % processed speech column vector 35 | 36 | fs = 10000; % sample rate of proposed intelligibility measure 37 | N_frame = 256; % window support 38 | K = 512; % FFT size 39 | J = 15; % Number of 1/3 octave bands 40 | mn = 150; % Center frequency of first 1/3 octave band in Hz. 
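% (With these settings the 15 one-third-octave bands have center
% frequencies cf = 150*2.^((0:14)/3), i.e. roughly 150 Hz up to about
% 3.8 kHz. The steps below: resample to 10 kHz, drop silent frames,
% take 256-sample Hanning-windowed DFTs with 50% overlap, form band
% envelopes, and for every 30-frame segment mean/variance-normalize the
% rows and then the columns before correlating clean and processed
% segments; the output d is the average of these correlations.)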
41 | [H,fc_thirdoct] = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix 42 | N = 30; % Number of frames for intermediate intelligibility measure 43 | dyn_range = 40; % speech dynamic range 44 | 45 | % resample signals if other samplerate is used than fs 46 | if fs_signal ~= fs 47 | x = resample(x, fs, fs_signal); 48 | y = resample(y, fs, fs_signal); 49 | end 50 | 51 | % remove silent frames 52 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2); 53 | 54 | % apply 1/3 octave band TF-decomposition 55 | x_hat = stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech 56 | y_hat = stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech 57 | 58 | 59 | x_hat = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum 60 | y_hat = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum 61 | 62 | X = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation 63 | Y = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation 64 | 65 | for i = 1:size(x_hat, 2) 66 | X(:, i) = sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave band filtering 67 | Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2); 68 | end 69 | 70 | % loop all segments of length N and obtain intermediate intelligibility measure for each 71 | d1 = zeros(length(N:size(X, 2)),1); % init memory for intermediate intelligibility measure 72 | for m=N:size(X,2) 73 | X_seg = X(:, (m-N+1):m); % region of length N with clean TF-units for all j 74 | Y_seg = Y(:, (m-N+1):m); % region of length N with processed TF-units for all j 75 | X_seg = X_seg + eps*randn(size(X_seg)); % to avoid divide by zero 76 | Y_seg = Y_seg + eps*randn(size(Y_seg)); % to avoid divide by zero 77 | 78 | %% first normalize rows (to give \bar{S}_m) 79 | XX = X_seg - mean(X_seg.').'*ones(1,N); % normalize rows to zero mean 80 | YY = Y_seg - mean(Y_seg.').'*ones(1,N); % normalize rows to zero mean 81 | 82 | YY = diag(1./sqrt(diag(YY*YY')))*YY; % normalize rows to unit length 83 | XX = diag(1./sqrt(diag(XX*XX')))*XX; % normalize rows to unit length 84 | 85 | XX = XX + eps*randn(size(XX)); % to avoid corr.div.by.0 86 | YY = YY + eps*randn(size(YY)); % to avoid corr.div.by.0 87 | 88 | %% then normalize columns (to give \check{S}_m) 89 | YYY = YY - ones(J,1)*mean(YY); % normalize cols to zero mean 90 | XXX = XX - ones(J,1)*mean(XX); % normalize cols to zero mean 91 | 92 | YYY = YYY*diag(1./sqrt(diag(YYY'*YYY))); % normalize cols to unit length 93 | XXX = XXX*diag(1./sqrt(diag(XXX'*XXX))); % normalize cols to unit length 94 | 95 | %compute average of col.correlations (by stacking cols) 96 | d1(m-N+1) = 1/N*XXX(:).'*YYY(:); 97 | end 98 | d = mean(d1); 99 | 100 | 101 | %% 102 | function [A cf] = thirdoct(fs, N_fft, numBands, mn) 103 | % [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix 104 | % inputs: 105 | % FS: samplerate 106 | % N_FFT: FFT size 107 | % NUMBANDS: number of bands 108 | % MN: center frequency of first 1/3 octave band 109 | % outputs: 110 | % A: octave band matrix 111 | % CF: center frequencies 112 | 113 | f = linspace(0, fs, N_fft+1); 114 | f = f(1:(N_fft/2+1)); 115 | k = 0:(numBands-1); 116 | cf = 2.^(k/3)*mn; 117 | fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn); 118 | fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn); 119 | A = zeros(numBands, length(f)); 120 | 121 | for i = 1:(length(cf)) 122 | [a b] = min((f-fl(i)).^2); 123 | fl(i) = f(b); 124 | fl_ii = b; 125 | 126 | [a b] = min((f-fr(i)).^2); 127 | fr(i) = f(b); 128 | fr_ii = b; 129 | 
A(i,fl_ii:(fr_ii-1)) = 1; 130 | end 131 | 132 | rnk = sum(A, 2); 133 | numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1; 134 | A = A(1:numBands, :); 135 | cf = cf(1:numBands); 136 | 137 | %% 138 | function x_stdft = stdft(x, N, K, N_fft) 139 | % X_STDFT = X_STDFT(X, N, K, N_FFT) returns the short-time 140 | % hanning-windowed dft of X with frame-size N, overlap K and DFT size 141 | % N_FFT. The columns and rows of X_STDFT denote the frame-index and 142 | % dft-bin index, respectively. 143 | 144 | frames = 1:K:(length(x)-N); 145 | x_stdft = zeros(length(frames), N_fft); 146 | 147 | w = hanning(N); 148 | x = x(:); 149 | 150 | for i = 1:length(frames) 151 | ii = frames(i):(frames(i)+N-1); 152 | x_stdft(i, :) = fft(x(ii).*w, N_fft); 153 | end 154 | 155 | %% 156 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K) 157 | % [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y 158 | % are segmented with frame-length N and overlap K, where the maximum energy 159 | % of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are the 160 | % reconstructed signals, excluding the frames, where the energy of a frame 161 | % of X is smaller than X_MAX-RANGE 162 | 163 | x = x(:); 164 | y = y(:); 165 | 166 | frames = 1:K:(length(x)-N); 167 | w = hanning(N); 168 | msk = zeros(size(frames)); 169 | 170 | for j = 1:length(frames) 171 | jj = frames(j):(frames(j)+N-1); 172 | msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N)); 173 | end 174 | 175 | msk = (msk-max(msk)+range)>0; 176 | count = 1; 177 | 178 | x_sil = zeros(size(x)); 179 | y_sil = zeros(size(y)); 180 | 181 | for j = 1:length(frames) 182 | if msk(j) 183 | jj_i = frames(j):(frames(j)+N-1); 184 | jj_o = frames(count):(frames(count)+N-1); 185 | x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w; 186 | y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w; 187 | count = count+1; 188 | end 189 | end 190 | 191 | x_sil = x_sil(1:jj_o(end)); 192 | y_sil = y_sil(1:jj_o(end)); -------------------------------------------------------------------------------- /evaluate_all.m: -------------------------------------------------------------------------------- 1 | % get CSIG, CBAK, CVOL, PESQ, SegSNR from two dir list 2 | % [CSIG, CBAK, CVOL, PESQ, SegSNR] = evaluate_all(ref_dir, deg_dir) 3 | 4 | function [Csig, Cbak, Cvol, pesq, SNR, SegSNR] = evaluate_all(ref_dir, deg_dir) 5 | ref_folder = dir(fullfile(ref_dir,'*.wav')); 6 | deg_folder = dir(fullfile(deg_dir,'*.wav')); 7 | ref_names = {ref_folder.name}; 8 | deg_names = {deg_folder.name}; 9 | ref_names = sort(ref_names); 10 | deg_names = sort(deg_names); 11 | disp(ref_names(1:5)); 12 | disp(deg_names(1:5)); 13 | n_refs = length(ref_names); 14 | n_degs = length(deg_names); 15 | assert(n_refs == n_degs, 'n_refs != n_degs'); 16 | csigs = zeros(1, n_refs); 17 | cbaks = zeros(1, n_refs); 18 | cvols = zeros(1, n_refs); 19 | pesqs = zeros(1, n_refs); 20 | snrs = zeros(1, n_refs); 21 | segsnrs = zeros(1,n_refs); 22 | for idx = 1:n_refs 23 | ref_names(idx) = strcat(ref_dir, '/', ref_names(idx)); 24 | deg_names(idx) = strcat(deg_dir, '/', deg_names(idx)); 25 | % disp(ref_names(idx)); 26 | % disp(deg_names(idx)); 27 | ref_file = char(ref_names(idx)); 28 | deg_file = char(deg_names(idx)); 29 | [csig, cbak, cvol] = composite(ref_file, deg_file); 30 | pesq_ = comp_pesq(ref_file, deg_file); 31 | [snr, segsnr] = comp_snr(ref_file, deg_file); 32 | csigs(idx) = csig; 33 | cbaks(idx) = cbak; 34 | cvols(idx) = cvol; 35 | pesqs(idx) = pesq_(1); 36 | snrs(idx) = snr; 37 | segsnrs(idx) = segsnr; 38 | % 
disp(strcat(ref_names(idx),'\n')) 39 | fprintf('\n idx=%d csig=%f cbak=%f cvol=%f pesq=%f snr=%f ssnr=%f\n',idx,csig,cbak,cvol,pesq_(1),snr,segsnr); 40 | end; 41 | 42 | Csig = mean(csigs); 43 | Cbak = mean(cbaks); 44 | Cvol = mean(cvols); 45 | pesq = mean(pesqs); 46 | SNR = mean(snrs); 47 | SegSNR = mean(segsnrs); 48 | end 49 | -------------------------------------------------------------------------------- /pesq.ubuntu16.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/pesq.ubuntu16.bin -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | python version:https://github.com/IMLHF/PHASEN-PyTorch/blob/master/phasen_torch/sepm.py 2 | 3 | This folder contains implementations of objective quality measures 4 | (Chapter 11): 5 | 6 | MATLAB file Description Reference 7 | ----------------------------------------------------------------------------------- 8 | comp_snr.m Overall and segmental SNR [1] 9 | 10 | comp_wss.m Weighted-spectral slope metric [2] 11 | 12 | comp_llr.m Likelihood-ratio measure [3] 13 | 14 | comp_is.m Itakura-Saito measure [3] 15 | comp_cep.m Cepstral distance measure [4] 16 | comp_fwseg Freq. weighted segm. SNR (fwSNRseg) [5],Chap 11 17 | 18 | comp_fwseg_variant Frequency-variant fwSNRseg measure Chap 11 19 | 20 | comp_fwseg_mars Frequency variant fwSNRseg measure Chap 11 21 | based on MARS analysis 22 | 23 | comp_pesq.m PESQ measure (narrowband) ITU-T P.862 [6] 24 | PESQ measure (wideband) ITU-T P.862.2 [7] 25 | 26 | composite.m A composite measure [8] 27 | 28 | 29 | addnoise_asl.m Adds noise to the clean signal at specified SNR 30 | based on active speech level. [9] 31 | 32 | --------------------------------------------------------------------------------- 33 | ## USAGE 34 | 35 | [snr_mean, segsnr_mean]= compSNR(cleanFile.wav, enhdFile.wav); 36 | % where 'snr_mean' is the global overall SNR and 'segsnr_mean' is the segmental SNR. 37 | 38 | wss_mean = comp_wss(cleanFile.wav, enhancedFile.wav); 39 | 40 | llr_mean= comp_llr(cleanFile.wav, enhancedFile.wav); 41 | 42 | is_mean = comp_is(cleanFile.wav, enhancedFile.wav); 43 | 44 | cep_mean = comp_cep(cleanFile.wav, enhancedFile.wav); 45 | 46 | fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav); 47 | 48 | [SIG,BAK,OVL] = comp_fwseg_variant(cleanFile.wav, enhancedFile.wav); 49 | % where 'SIG' is the predicted rating of speech distortion, 50 | % 'BAK' is the predicted rating of background noise distortion, 51 | % 'OVL' is the predicted rating of overall quality. 52 | 53 | [SIG,BAK,OVL] = comp_fwseg_mars(cleanFile.wav, enhancedFile.wav); 54 | 55 | pesq_val = comp_pesq(cleanFile.wav, enhancedFile.wav); 56 | % Only sampling frequencies of 8000 Hz or 16000 Hz are supported. 57 | 58 | [Csig,Cbak,Covl] = composite(cleanFile.wav, enhancedFile.wav); 59 | % where 'Csig' is the predicted rating of speech distortion, 60 | % 'Cbak' is the predicted rating of background noise distortion, 61 | % 'Covl' is the predicted rating of overall quality. 62 | 63 | addnoise_asl(cleanfile.wav, noisefile.wav, outfile.wav, SNRlevel) 64 | 65 | --------------------------------------------------------------------------- 66 | 67 | ## REFERENCES: 68 | 69 | [1] Hansen, J. and Pellom, B. (1998). An effective quality evaluation 70 | protocol for speech enhancement algorithms. Inter. Conf. 
65 | ---------------------------------------------------------------------------
66 |
67 | ## REFERENCES:
68 |
69 | [1] Hansen, J. and Pellom, B. (1998). An effective quality evaluation
70 | protocol for speech enhancement algorithms. Proc. Int. Conf. on Spoken
71 | Language Processing, vol. 7, pp. 2819-2822.
72 |
73 | [2] Klatt, D. (1982). Prediction of perceived phonetic distance from
74 | critical band spectra. Proc. IEEE Int. Conf. Acoust., Speech,
75 | Signal Processing, 7, 1278-1281.
76 |
77 | [3] Quackenbush, S., Barnwell, T., and Clements, M. (1988). Objective
78 | measures of speech quality. Englewood Cliffs, NJ: Prentice-Hall.
79 |
80 | [4] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
81 | evaluation for low bit-rate speech coding systems. IEEE J. Select.
82 | Areas in Comm., 6(2), 262-273.
83 |
84 | [5] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
85 | A study of complexity and quality of speech waveform coders. Proc.
86 | IEEE Int. Conf. Acoust., Speech, Signal Processing, 586-590.
87 |
88 | [6] ITU (2000). Perceptual evaluation of speech quality (PESQ), an
89 | objective method for end-to-end speech quality assessment of
90 | narrowband telephone networks and speech codecs. ITU-T
91 | Recommendation P.862
92 |
93 | [7] ITU (2007). Wideband extension to Recommendation P.862 for the
94 | assessment of wideband telephone networks and speech codecs. ITU-T
95 | Recommendation P.862.2
96 |
97 | [8] Hu, Y. and Loizou, P. (2006). Evaluation of objective measures
98 | for speech enhancement. Proc. Interspeech
99 |
100 | [9] ITU-T (1993). Objective measurement of active speech level. ITU-T
101 | Recommendation P.56
102 |
103 |
104 | Copyright (c) 2012 by Philipos C. Loizou
105 |
106 | Revision: 1.0, Date: 05/14/2012
107 |
108 | ------------------------------------------------------------------------------
109 |
--------------------------------------------------------------------------------
/readme.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/readme.pdf
--------------------------------------------------------------------------------
/sp04.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/sp04.wav
--------------------------------------------------------------------------------
/sp04_babble_sn10.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/sp04_babble_sn10.wav
--------------------------------------------------------------------------------
/stoi.m:
--------------------------------------------------------------------------------
1 | function d = stoi(x, y, fs_signal)
2 | % from https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/master/stoi.m
3 | % d = stoi(x, y, fs_signal) returns the output of the short-time
4 | % objective intelligibility (STOI) measure described in [1, 2], where x
5 | % and y denote the clean and processed speech, respectively, with sample
6 | % rate fs_signal in Hz. The output d is expected to have a monotonic
7 | % relation with subjective speech intelligibility, where a higher d
8 | % denotes more intelligible speech. See [1, 2] for more details.
9 | %
10 | % References:
11 | % [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
12 | % Objective Intelligibility Measure for Time-Frequency Weighted Noisy
13 | % Speech', ICASSP 2010, Dallas, Texas.
14 | %
15 | % [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
16 | % Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
17 | % IEEE Transactions on Audio, Speech, and Language Processing, 2011.
18 | %
19 | %
20 | % Copyright 2009: Delft University of Technology, Signal & Information
21 | % Processing Lab. The software is free for non-commercial use. This program
22 | % comes WITHOUT ANY WARRANTY.
23 | %
24 | %
25 | %
26 | % Updates:
27 | % 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'
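% Example call (editor's sketch; the file names are placeholders for any
% pair of equal-length, time-aligned clean/processed signals sharing one
% sample rate):
%   [x, fs] = audioread('sp04.wav');      % clean reference
%   [y, ~]  = audioread('enhanced.wav');  % processed/enhanced signal
%   d = stoi(x, y, fs);                   % higher d ~ more intelligible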
28 |
29 | if length(x)~=length(y)
30 |     error('x and y should have the same length');
31 | end
32 |
33 | % initialization
34 | x = x(:); % clean speech column vector
35 | y = y(:); % processed speech column vector
36 |
37 | fs = 10000; % sample rate of proposed intelligibility measure
38 | N_frame = 256; % window support
39 | K = 512; % FFT size
40 | J = 15; % Number of 1/3 octave bands
41 | mn = 150; % Center frequency of first 1/3 octave band in Hz.
42 | H = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix
43 | N = 30; % Number of frames for intermediate intelligibility measure (Length analysis window)
44 | Beta = -15; % lower SDR-bound
45 | dyn_range = 40; % speech dynamic range
46 |
47 | % resample signals if a sample rate other than fs is used
48 | if fs_signal ~= fs
49 |     x = resample(x, fs, fs_signal);
50 |     y = resample(y, fs, fs_signal);
51 | end
52 |
53 | % remove silent frames
54 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
55 |
56 | % apply 1/3 octave band TF-decomposition
57 | x_hat = stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech
58 | y_hat = stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech
59 |
60 | x_hat = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum
61 | y_hat = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum
62 |
63 | X = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation
64 | Y = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation
65 |
66 | for i = 1:size(x_hat, 2)
67 |     X(:, i) = sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave bands as described in Eq.(1) [1]
68 |     Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2);
69 | end
70 |
71 | % loop over all segments of length N and obtain intermediate intelligibility measure for all TF-regions
72 | d_interm = zeros(J, length(N:size(X, 2))); % init memory for intermediate intelligibility measure
73 | c = 10^(-Beta/20); % constant for clipping procedure
74 |
75 | for m = N:size(X, 2)
76 |     X_seg = X(:, (m-N+1):m); % region with length N of clean TF-units for all j
77 |     Y_seg = Y(:, (m-N+1):m); % region with length N of processed TF-units for all j
78 |     alpha = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2)); % obtain scale factor for normalizing processed TF-region for all j
79 |     aY_seg = Y_seg.*repmat(alpha, [1 N]); % obtain \alpha*Y_j(n) from Eq.(2) [1]
80 |     for j = 1:J
81 |         Y_prime = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c); % apply clipping from Eq.(3)
82 |         d_interm(j, m-N+1) = taa_corr(X_seg(j, :).', Y_prime(:)); % obtain correlation coefficient from Eq.(4) [1]
83 |     end
84 | end
85 |
86 | d = mean(d_interm(:)); % combine all intermediate intelligibility measures as in Eq.(4) [1]
87 |
88 | %%
89 | function [A cf] = thirdoct(fs, N_fft, numBands, mn)
90 | % [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
91 | % inputs:
92 | % FS: sample rate
93 | % N_FFT: FFT size
94 | % NUMBANDS: number of bands
95 | % MN: center frequency of first 1/3 octave band
96 | % outputs:
97 | % A: octave band matrix
98 | % CF: center frequencies
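% Example (editor's sketch): with the values used by stoi above, i.e.
%   [A, cf] = thirdoct(10000, 512, 15, 150);
% A should come out as a 15 x 257 zero/one matrix (one row per band, one
% column per single-sided DFT bin) and cf as the 15 band centers in Hz.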
99 |
100 | f = linspace(0, fs, N_fft+1);
101 | f = f(1:(N_fft/2+1));
102 | k = 0:(numBands-1);
103 | cf = 2.^(k/3)*mn;
104 | fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);
105 | fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);
106 | A = zeros(numBands, length(f));
107 |
108 | for i = 1:(length(cf))
109 |     [a b] = min((f-fl(i)).^2);
110 |     fl(i) = f(b);
111 |     fl_ii = b;
112 |
113 |     [a b] = min((f-fr(i)).^2);
114 |     fr(i) = f(b);
115 |     fr_ii = b;
116 |     A(i,fl_ii:(fr_ii-1)) = 1;
117 | end
118 |
119 | rnk = sum(A, 2);
120 | numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
121 | A = A(1:numBands, :);
122 | cf = cf(1:numBands);
123 |
124 | %%
125 | function x_stdft = stdft(x, N, K, N_fft)
126 | % X_STDFT = X_STDFT(X, N, K, N_FFT) returns the short-time
127 | % hanning-windowed dft of X with frame-size N, overlap K and DFT size
128 | % N_FFT. The rows and columns of X_STDFT denote the frame index and
129 | % DFT-bin index, respectively.
130 |
131 | frames = 1:K:(length(x)-N);
132 | x_stdft = zeros(length(frames), N_fft);
133 |
134 | w = hanning(N);
135 | x = x(:);
136 |
137 | for i = 1:length(frames)
138 |     ii = frames(i):(frames(i)+N-1);
139 |     x_stdft(i, :) = fft(x(ii).*w, N_fft);
140 | end
141 |
142 | %%
143 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
144 | % [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
145 | % are segmented with frame-length N and overlap K, where the maximum energy
146 | % of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are the
147 | % reconstructed signals, excluding the frames where the energy of a frame
148 | % of X is smaller than X_MAX-RANGE
149 |
150 | x = x(:);
151 | y = y(:);
152 |
153 | frames = 1:K:(length(x)-N);
154 | w = hanning(N);
155 | msk = zeros(size(frames));
156 |
157 | for j = 1:length(frames)
158 |     jj = frames(j):(frames(j)+N-1);
159 |     msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N));
160 | end
161 |
162 | msk = (msk-max(msk)+range)>0;
163 | count = 1;
164 |
165 | x_sil = zeros(size(x));
166 | y_sil = zeros(size(y));
167 |
168 | for j = 1:length(frames)
169 |     if msk(j)
170 |         jj_i = frames(j):(frames(j)+N-1);
171 |         jj_o = frames(count):(frames(count)+N-1);
172 |         x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w;
173 |         y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w;
174 |         count = count+1;
175 |     end
176 | end
177 |
178 | x_sil = x_sil(1:jj_o(end));
179 | y_sil = y_sil(1:jj_o(end));
180 |
181 | %%
182 | function rho = taa_corr(x, y)
183 | % RHO = TAA_CORR(X, Y) Returns correlation coefficient between column
184 | % vectors x and y. Gives same results as 'corr' from statistics toolbox.
185 | xn = x-mean(x);
186 | xn = xn/sqrt(sum(xn.^2));
187 | yn = y-mean(y);
188 | yn = yn/sqrt(sum(yn.^2));
189 | rho = sum(xn.*yn);
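% Editor's note (sketch): for column vectors this reproduces the Pearson
% correlation coefficient, e.g. taa_corr([1; 2; 3], [2; 4; 7]) is ~0.9934,
% the same value returned by corr([1; 2; 3], [2; 4; 7]).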
--------------------------------------------------------------------------------
/toserver.sh:
--------------------------------------------------------------------------------
1 | # Usage: ./toserver.sh <user>@<site-or-port> <remote-dir-name>, e.g. ./toserver.sh room@15123 pc_RealIRM_RelativeLossAFD500
2 |
3 | if [ -z "$1" ] || [ -z "$2" ]; then
4 |   echo "Need a destination."
5 |   exit 1
6 | fi
7 | site=${1#*@}
8 | user=${1%@*}
9 | rm _data _log -rf
10 | rm *__pycache__* -rf
11 | rm */__pycache__* -rf
12 | # mv exp ../
13 | # # scp -r -P 15044 ./* xxxx@speaker.is99kdf.xyz:~/lhf/work/irm_test/extract_tfrecord
14 | # scp -r -P 15043 ./* xx@speaker.is99kdf.xyz:~/work/speech_en_test/c001_se
15 |
16 | # mv ../exp ./
17 |
18 | if [ "$site" == "p40" ]; then
19 |   echo "To $user@$site:/home/zhangwenbo5/lihongfeng/$2"
20 |   rsync -avh -e "ssh -p 22 -o ProxyCommand='ssh -p 8695 zhangwenbo5@120.92.114.84 -W %h:%p'" --exclude-from='.gitignore' ./* zhangwenbo5@ksai-P40-2:/home/zhangwenbo5/lihongfeng/$2
21 | elif [ "$site" == "v100-3" ]; then
22 |   echo "To $user@$site:/home/zhangwenbo5/lihongfeng/$2"
23 |   rsync -avh -e "ssh -p 22 -o ProxyCommand='ssh -p 8695 zhangwenbo5@120.92.114.84 -W %h:%p'" --exclude-from='.gitignore' ./* zhangwenbo5@ksai-v100-3:/home/zhangwenbo5/lihongfeng/$2
24 | elif [ "$site" == "15123" ] || [ "$site" == "15041" ] || [ "$site" == "15043" ]; then
25 |   echo "To $user@$site:~/worklhf/$2"
26 |   rsync -avh -e 'ssh -p '$site --exclude-from='.gitignore' ./* $user@speaker.is99kdf.xyz:~/worklhf/$2
27 | fi
28 | # -a : recurse into directories, i.e. copy all files and subdirectories; also enables archive mode and every option it implies (equivalent to -rlptgoD)
29 | # -v : verbose output
30 | # -e ssh : use ssh as the remote shell, so everything is encrypted
31 | # --exclude='*.out' : exclude files matching the pattern, e.g. *.out or *.c
32 |
33 | # scp -r -P 15043 xx@speaker.is99kdf.xyz:/home/xx/work/paper_se_test/pc001_se/exp/rnn_speech_enhancement/nnet_C001/nnet_iter15* ./
34 | # scp -P 15223 xx@speaker.is99kdf.xyz:/fast/worklhf/paper_se_test/C_UNIGRU_RealPSM_RelativeLossAFD100/exp/rnn_speech_enhancement/nnet_C_UNIGRU_RealPSM_RelativeLossAFD100/nnet_iter25* ./
35 |
--------------------------------------------------------------------------------
/wavread.m:
--------------------------------------------------------------------------------
1 | function [ ref_data, ref_sampling_rate, nbits ] = wavread( ref_wav )
2 | [ ref_data, ref_sampling_rate ] = audioread( ref_wav );
3 | info = audioinfo(ref_wav);
4 | nbits = info.BitsPerSample;
5 | end
6 |
--------------------------------------------------------------------------------
/white_noise.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IMLHF/Speech-Enhancement-Measures/42490594d431c5c11a0d5b9e2dc24ccdf0f6ec07/white_noise.wav
--------------------------------------------------------------------------------