├── .gitattributes ├── .gitignore ├── GSC.m ├── Leer_Array_Signals.m ├── PESQ ├── DC_block.m ├── FFTNXCorr.m ├── addnoise_asl.asv ├── addnoise_asl.m ├── apply_VAD.asv ├── apply_VAD.m ├── apply_filter.asv ├── apply_filter.m ├── apply_filters.m ├── comp_cep.asv ├── comp_cep.m ├── comp_fwseg.asv ├── comp_fwseg.m ├── comp_fwseg_mars.asv ├── comp_fwseg_mars.m ├── comp_fwseg_variant.asv ├── comp_fwseg_variant.m ├── comp_is.asv ├── comp_is.m ├── comp_llr.asv ├── comp_llr.m ├── comp_snr.asv ├── comp_snr.m ├── comp_wss.asv ├── comp_wss.m ├── composite.asv ├── composite.m ├── crude_align.m ├── enhanced.wav ├── fix_power_level.asv ├── fix_power_level.m ├── id_searchwindows.m ├── id_utterances.m ├── input_filter.m ├── pesq.asv ├── pesq.m ├── pesq_psychoacoustic_model.asv ├── pesq_psychoacoustic_model.m ├── pow_of.m ├── readme.pdf ├── readme.txt ├── setup_global.m ├── sp04.wav ├── sp04_babble_sn10.wav ├── split_align.m ├── time_align.m ├── utterance_locate.m ├── utterance_split.m ├── wavread.m └── white_noise.wav ├── RETO2016_README.txt ├── RETO2016_TOOLS └── signals │ ├── README_Acquisition │ ├── an101-mtms-arr4A.adc │ ├── an102-mtms-arr4A.adc │ ├── an103-mtms-arr4A.adc │ ├── an103-mtms-senn4.adc │ ├── an104-mtms-arr4A.adc │ └── an105-mtms-arr4A.adc ├── ResumenResultados.xlsx ├── array.wav ├── asdf.wav ├── image_2017-02-16_15-18-08.png ├── limpia.wav ├── lms_eq.m ├── offsetcomp.m └── steering_vector.mat /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /GSC.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/GSC.m -------------------------------------------------------------------------------- /Leer_Array_Signals.m: -------------------------------------------------------------------------------- 1 | % LECTURA DE DATOS MULTICANAL 2 | % 16 kHz 3 | % 16 bits por muestra 4 | % 15 canales 5 | % Big-endian 6 | fm = 16000; % Frec. muestreo 7 | nc = 15; % Nº de canales. 8 | fname = 'an103-mtms-arr4A.adc'; 9 | % dir = '/zona_amp/data/Multimic/multimic/15element/'; 10 | % fname = strcat(dir,fname) 11 | [fid,msg] = fopen(fname,'r','b'); 12 | if fid < 0 13 | disp(msg); 14 | else 15 | data = fread(fid,'int16'); 16 | fclose(fid); 17 | end 18 | 19 | % Separa canales. 
20 | nsamp=[]; 21 | for i = 1:nc 22 | x{i} = data(i:nc:end); 23 | x{i} = offsetcomp(x{i}); 24 | nsamp(i)=length(x{i}); 25 | end 26 | Nsamp=min(nsamp); %Numero de muestras a emplear en todas las senales 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /PESQ/DC_block.m: -------------------------------------------------------------------------------- 1 | function mod_data= DC_block( data, Nsamples) 2 | 3 | global Downsample DATAPADDING_MSECS SEARCHBUFFER 4 | 5 | ofs= SEARCHBUFFER* Downsample; 6 | mod_data= data; 7 | 8 | %compute dc component, it is a little weird 9 | facc= sum( data( ofs+ 1: Nsamples- ofs))/ Nsamples; 10 | mod_data( ofs+ 1: Nsamples- ofs)= data( ofs+ 1: Nsamples- ofs)- facc; 11 | 12 | mod_data( ofs+ 1: ofs+ Downsample)= mod_data( ofs+ 1: ofs+ Downsample).* ... 13 | ( 0.5+ (0: Downsample- 1))/ Downsample; 14 | 15 | mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1)= ... 16 | mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1).* ... 
17 | ( 0.5+ (0: Downsample- 1))/ Downsample; 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /PESQ/FFTNXCorr.m: -------------------------------------------------------------------------------- 1 | function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd) 2 | % this function has other simple implementations, current implementation is 3 | % consistent with the C version 4 | 5 | % one way to do this (in time domain) ===== 6 | x1= ref_VAD( startr: startr+ nr- 1); 7 | x2= deg_VAD( startd: startd+ nd- 1); 8 | x1= fliplr( x1); 9 | Y= conv( x2, x1); 10 | % done ===== 11 | 12 | % % the other way to do this (in freq domain)=== 13 | % Nx= 2^ (ceil( log2( max( nr, nd)))); 14 | % x1= zeros( 1, 2* Nx); 15 | % x2= zeros( 1, 2* Nx); 16 | % x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1)); 17 | % x2( 1: nd)= deg_VAD( startd: startd+ nd- 1); 18 | % 19 | % if (nr== 491) 20 | % fid= fopen( 'mat_debug.txt', 'wt'); 21 | % fprintf( fid, '%f\n', x1); 22 | % fclose( fid); 23 | % end 24 | % 25 | % x1_fft= fft( x1, 2* Nx); 26 | % x2_fft= fft( x2, 2* Nx); 27 | % 28 | % tmp1= ifft( x1_fft.* x2_fft, 2* Nx); 29 | % 30 | % Ny= nr+ nd- 1; 31 | % Y= tmp1( 1: Ny); 32 | % % done =========== 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /PESQ/addnoise_asl.asv: -------------------------------------------------------------------------------- 1 | function addnoise_asl(cleanfile, noisefile, outfile, snr) 2 | % ---------------------------------------------------------------------- 3 | % This function adds noise to a file at a specified SNR level. It uses 4 | % the active speech level to compute the speech energy. The 5 | % active speech level is computed as per ITU-T P.56 standard [1]. 
6 | % 7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) 8 | % 9 | % cleanFile.wav - clean input file in .wav format 10 | % noiseFile.wav - file containing the noise signal in .wav format 11 | % noisyFile.wav - resulting noisy file 12 | % SNR - desired SNR in dB 13 | % 14 | % Note that if the variable IRS below is set to 1, then it applies the IRS 15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default 16 | % Example call: 17 | % 18 | % 19 | % References: 20 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T 21 | % Recommendation P. 56 22 | % 23 | % Author: Yi Hu and Philipos C. Loizou 24 | % 25 | % Copyright (c) 2006 by Philipos C. Loizou 26 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 27 | % ---------------------------------------------------------------------- 28 | 29 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz -3.2 kHz) 30 | 31 | % wavread gives floating point column data 32 | [clean, srate, nbits]= wavread(cleanfile); 33 | % filter clean speech with irs filter 34 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end; 35 | 36 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits); 37 | % Px is the active speech level ms energy, asl is the active factor, and c0 38 | % is the active speech level threshold. 
39 | 40 | %noiseonly_len= floor( noiseonly* srate/ 1000); 41 | % <--------- insert noiseonly-msecs of silence 42 | %x= [zeros( noiseonly_len, 1); clean]; 43 | x=clean; 44 | x_len= length( x); % length of speech signal 45 | 46 | [noise, srate1, nbits1]= wavread( noisefile); 47 | if (srate1~= srate)| (nbits1~= nbits) 48 | error( 'the formats of the two files dont match!'); 49 | end 50 | noise_len= length( noise); 51 | if (noise_len<= x_len) 52 | error( 'the noise length has to be greater than speech length!'); 53 | end 54 | 55 | rand_start_limit= noise_len- x_len+ 1; 56 | % the start of the noise segment can vary between [1 rand_start_limit] 57 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 58 | % random start of the noise segment 59 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 60 | 61 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 62 | 63 | % this is the randomly selected noise segment that will be added to the 64 | % clean speech x 65 | Pn= noise_segment'* noise_segment/ x_len; 66 | % we need to scale the noise segment samples to obtain the desired snr= 10* 67 | % log10( Px/ (sf^2 * Pn)) 68 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 69 | noise_segment= noise_segment * sf; 70 | 71 | noisy = x+ noise_segment; 72 | 73 | if ( (max( noisy)>= 1) | (min( noisy)< -1)) 74 | error( 'Overflow occurred!\n'); 75 | end; 76 | 77 | 78 | wavwrite( noisy, srate, nbits, outfile); 79 | 80 | fprintf( 1, 'For comparison, the old SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 81 | (noise_segment'*noise_segment))); 82 | 83 | 84 | %------------------------------------------------------------------------ 85 | function data_filtered= apply_IRS( data, Fs, nbits); 86 | 87 | n= length( data); 88 | 89 | % now find the next power of 2 which is greater or equal to n 90 | pow_of_2= 2^ (ceil( log2( n))); 91 | 92 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 
93 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 94 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 95 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 96 | 97 | [number_of_points, trivial]= size( align_filter_dB); 98 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 99 | 1000); 100 | 101 | x= zeros( 1, pow_of_2); 102 | x( 1: n)= data; 103 | 104 | x_fft= fft( x, pow_of_2); 105 | 106 | freq_resolution= Fs/ pow_of_2; 107 | 108 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 109 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 110 | overallGainFilter; 111 | factor= 10.^ (factorDb/ 20); 112 | 113 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 114 | x_fft= x_fft.* factor; 115 | 116 | y= ifft( x_fft, pow_of_2); 117 | 118 | data_filtered= y( 1: n)'; 119 | 120 | 121 | 122 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 123 | % this implements ITU P.56 method B. 124 | % 'speechfile' is the speech file to calculate active speech level for, 125 | % 'asl' is the active speech level (between 0 and 1), 126 | % 'asl_rms' is the active speech level mean square energy. 
127 | 128 | % x is the column vector of floating point speech data 129 | 130 | x= x(:); % make sure x is column vector 131 | T= 0.03; % time constant of smoothing, in seconds 132 | H= 0.2; % hangover time in seconds 133 | M= 15.9; 134 | % margin in dB of the difference between threshold and active speech level 135 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 136 | 137 | I= ceil( fs* H); % hangover in samples 138 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 139 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 140 | % vector with thresholds from one quantizing level up to half the maximum 141 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 142 | a( 1: thres_no)= 0; % activity counter for each level threshold 143 | hang( 1: thres_no)= I; % hangover counter for each level threshold 144 | 145 | sq= x'* x; % long-term level square energy of x 146 | x_len= length( x); % length of x 147 | 148 | % use a 2nd order IIR filter to detect the envelope q 149 | x_abs= abs( x); 150 | p= filter( 1-g, [1 -g], x_abs); 151 | q= filter( 1-g, [1 -g], p); 152 | 153 | for k= 1: x_len 154 | for j= 1: thres_no 155 | if (q(k)>= c(j)) 156 | a(j)= a(j)+ 1; 157 | hang(j)= 0; 158 | elseif (hang(j)< I) 159 | a(j)= a(j)+ 1; 160 | hang(j)= hang(j)+ 1; 161 | else 162 | break; 163 | end 164 | end 165 | end 166 | 167 | asl= 0; 168 | asl_rms= 0; 169 | if (a(1)== 0) 170 | return; 171 | else 172 | AdB1= 10* log10( sq/ a(1)+ eps); 173 | end 174 | 175 | CdB1= 20* log10( c(1)+ eps); 176 | if (AdB1- CdB1< M) 177 | return; 178 | end 179 | 180 | AdB(1)= AdB1; 181 | CdB(1)= CdB1; 182 | Delta(1)= AdB1- CdB1; 183 | 184 | for j= 2: thres_no 185 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 186 | CdB(j)= 20* log10( c(j)+ eps); 187 | end 188 | 189 | for j= 2: thres_no 190 | if (a(j) ~= 0) 191 | Delta(j)= AdB(j)- CdB(j); 192 | if (Delta(j)<= M) 193 | % interpolate to find the asl 194 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 
195 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 196 | asl_ms= 10^ (asl_ms_log/ 10); 197 | asl= (sq/ x_len)/ asl_ms; 198 | c0= 10^( cl0/ 20); 199 | break; 200 | end 201 | end 202 | end 203 | 204 | 205 | 206 | 207 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 208 | upthr, lwthr, Margin, tol) 209 | 210 | if (tol < 0) 211 | tol = -tol; 212 | end 213 | 214 | % Check if extreme counts are not already the true active value 215 | iterno = 1; 216 | if (abs(upcount - upthr - Margin) < tol) 217 | asl_ms_log= upcount; 218 | cc= upthr; 219 | return; 220 | end 221 | if (abs(lwcount - lwthr - Margin) < tol) 222 | asl_ms_log= lwcount; 223 | cc= lwthr; 224 | return; 225 | end 226 | 227 | % Initialize first middle for given (initial) bounds 228 | midcount = (upcount + lwcount) / 2.0; 229 | midthr = (upthr + lwthr) / 2.0; 230 | 231 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 232 | while ( 1) 233 | 234 | diff= midcount- midthr- Margin; 235 | if (abs(diff)<= tol) 236 | break; 237 | end 238 | 239 | % if tolerance is not met up to 20 iteractions, then relax the 240 | % tolerance by 10% 241 | 242 | iterno= iterno+ 1; 243 | 244 | if (iterno>20) 245 | tol = tol* 1.1; 246 | end 247 | 248 | if (diff> tol) % then new bounds are ... 249 | midcount = (upcount + midcount) / 2.0; 250 | % upper and middle activities 251 | midthr = (upthr + midthr) / 2.0; 252 | % ... and thresholds 253 | elseif (diff< -tol) % then new bounds are ... 254 | midcount = (midcount + lwcount) / 2.0; 255 | % middle and lower activities 256 | midthr = (midthr + lwthr) / 2.0; 257 | % ... and thresholds 258 | end 259 | 260 | end 261 | % Since the tolerance has been satisfied, midcount is selected 262 | % as the interpolated value with a tol [dB] tolerance. 
263 | 264 | asl_ms_log= midcount; 265 | cc= midthr; 266 | 267 | 268 | 269 | 270 | -------------------------------------------------------------------------------- /PESQ/addnoise_asl.m: -------------------------------------------------------------------------------- 1 | function addnoise_asl(cleanfile, noisefile, outfile, snr) 2 | % ---------------------------------------------------------------------- 3 | % This function adds noise to a file at a specified SNR level. It uses 4 | % the active speech level to compute the speech energy. The 5 | % active speech level is computed as per ITU-T P.56 standard [1]. 6 | % 7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) 8 | % 9 | % cleanFile.wav - clean input file in .wav format 10 | % noiseFile.wav - file containing the noise signal in .wav format 11 | % noisyFile.wav - resulting noisy file 12 | % SNR - desired SNR in dB 13 | % 14 | % Note that if the variable IRS below (line 38) is set to 1, then it applies the IRS 15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default IRS 16 | % value is 0, ie, no IRS filtering is applied. 17 | % 18 | % Example call: 19 | % addnoise_asl('sp04.wav','white_noise.wav','sp04_white_5db.wav',5); 20 | % 21 | % 22 | % References: 23 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T 24 | % Recommendation P. 56 25 | % 26 | % Author: Yi Hu and Philipos C. Loizou 27 | % 28 | % Copyright (c) 2006 by Philipos C. 
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin ~=4 33 | fprintf('USAGE: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) \n'); 34 | fprintf('For more help, type: help addnoise_asl\n\n'); 35 | return; 36 | end 37 | 38 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz -3.2 kHz) 39 | 40 | % wavread gives floating point column data 41 | [clean, srate, nbits]= wavread(cleanfile); 42 | % filter clean speech with irs filter 43 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end; 44 | 45 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits); 46 | % Px is the active speech level ms energy, asl is the active factor, and c0 47 | % is the active speech level threshold. 48 | 49 | 50 | x=clean; 51 | x_len= length( x); % length of speech signal 52 | 53 | [noise, srate1, nbits1]= wavread( noisefile); 54 | if (srate1~= srate)| (nbits1~= nbits) 55 | error( 'the formats of the two files dont match!'); 56 | end 57 | noise_len= length( noise); 58 | if (noise_len<= x_len) 59 | error( 'the noise length has to be greater than speech length!'); 60 | end 61 | 62 | rand_start_limit= noise_len- x_len+ 1; 63 | % the start of the noise segment can vary between [1 rand_start_limit] 64 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 65 | % random start of the noise segment 66 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 67 | 68 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 69 | 70 | % this is the randomly selected noise segment that will be added to the 71 | % clean speech x 72 | Pn= noise_segment'* noise_segment/ x_len; 73 | % we need to scale the noise segment samples to obtain the desired snr= 10* 74 | % log10( Px/ (sf^2 * Pn)) 75 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 76 | noise_segment= noise_segment * sf; 77 | 78 | noisy = x+ noise_segment; 79 | 80 | if ( (max( 
noisy)>= 1) | (min( noisy)< -1)) 81 | error( 'Overflow occurred!\n'); 82 | end; 83 | 84 | 85 | wavwrite( noisy, srate, nbits, outfile); 86 | 87 | fprintf( 1, '\n NOTE: For comparison, the SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 88 | (noise_segment'*noise_segment))); 89 | 90 | 91 | %------------------------------------------------------------------------ 92 | function data_filtered= apply_IRS( data, Fs, nbits); 93 | 94 | n= length( data); 95 | 96 | % now find the next power of 2 which is greater or equal to n 97 | pow_of_2= 2^ (ceil( log2( n))); 98 | 99 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 100 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 101 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 102 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 103 | 104 | [number_of_points, trivial]= size( align_filter_dB); 105 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 106 | 1000); 107 | 108 | x= zeros( 1, pow_of_2); 109 | x( 1: n)= data; 110 | 111 | x_fft= fft( x, pow_of_2); 112 | 113 | freq_resolution= Fs/ pow_of_2; 114 | 115 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 116 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 117 | overallGainFilter; 118 | factor= 10.^ (factorDb/ 20); 119 | 120 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 121 | x_fft= x_fft.* factor; 122 | 123 | y= ifft( x_fft, pow_of_2); 124 | 125 | data_filtered= y( 1: n)'; 126 | 127 | 128 | 129 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 130 | % this implements ITU P.56 method B. 131 | % 'speechfile' is the speech file to calculate active speech level for, 132 | % 'asl' is the active speech level (between 0 and 1), 133 | % 'asl_rms' is the active speech level mean square energy. 
134 | 135 | % x is the column vector of floating point speech data 136 | 137 | x= x(:); % make sure x is column vector 138 | T= 0.03; % time constant of smoothing, in seconds 139 | H= 0.2; % hangover time in seconds 140 | M= 15.9; 141 | % margin in dB of the difference between threshold and active speech level 142 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 143 | 144 | I= ceil( fs* H); % hangover in samples 145 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 146 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 147 | % vector with thresholds from one quantizing level up to half the maximum 148 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 149 | a( 1: thres_no)= 0; % activity counter for each level threshold 150 | hang( 1: thres_no)= I; % hangover counter for each level threshold 151 | 152 | sq= x'* x; % long-term level square energy of x 153 | x_len= length( x); % length of x 154 | 155 | % use a 2nd order IIR filter to detect the envelope q 156 | x_abs= abs( x); 157 | p= filter( 1-g, [1 -g], x_abs); 158 | q= filter( 1-g, [1 -g], p); 159 | 160 | for k= 1: x_len 161 | for j= 1: thres_no 162 | if (q(k)>= c(j)) 163 | a(j)= a(j)+ 1; 164 | hang(j)= 0; 165 | elseif (hang(j)< I) 166 | a(j)= a(j)+ 1; 167 | hang(j)= hang(j)+ 1; 168 | else 169 | break; 170 | end 171 | end 172 | end 173 | 174 | asl= 0; 175 | asl_rms= 0; 176 | if (a(1)== 0) 177 | return; 178 | else 179 | AdB1= 10* log10( sq/ a(1)+ eps); 180 | end 181 | 182 | CdB1= 20* log10( c(1)+ eps); 183 | if (AdB1- CdB1< M) 184 | return; 185 | end 186 | 187 | AdB(1)= AdB1; 188 | CdB(1)= CdB1; 189 | Delta(1)= AdB1- CdB1; 190 | 191 | for j= 2: thres_no 192 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 193 | CdB(j)= 20* log10( c(j)+ eps); 194 | end 195 | 196 | for j= 2: thres_no 197 | if (a(j) ~= 0) 198 | Delta(j)= AdB(j)- CdB(j); 199 | if (Delta(j)<= M) 200 | % interpolate to find the asl 201 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 
202 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 203 | asl_ms= 10^ (asl_ms_log/ 10); 204 | asl= (sq/ x_len)/ asl_ms; 205 | c0= 10^( cl0/ 20); 206 | break; 207 | end 208 | end 209 | end 210 | 211 | 212 | 213 | 214 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 215 | upthr, lwthr, Margin, tol) 216 | 217 | if (tol < 0) 218 | tol = -tol; 219 | end 220 | 221 | % Check if extreme counts are not already the true active value 222 | iterno = 1; 223 | if (abs(upcount - upthr - Margin) < tol) 224 | asl_ms_log= upcount; 225 | cc= upthr; 226 | return; 227 | end 228 | if (abs(lwcount - lwthr - Margin) < tol) 229 | asl_ms_log= lwcount; 230 | cc= lwthr; 231 | return; 232 | end 233 | 234 | % Initialize first middle for given (initial) bounds 235 | midcount = (upcount + lwcount) / 2.0; 236 | midthr = (upthr + lwthr) / 2.0; 237 | 238 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 239 | while ( 1) 240 | 241 | diff= midcount- midthr- Margin; 242 | if (abs(diff)<= tol) 243 | break; 244 | end 245 | 246 | % if tolerance is not met up to 20 iteractions, then relax the 247 | % tolerance by 10% 248 | 249 | iterno= iterno+ 1; 250 | 251 | if (iterno>20) 252 | tol = tol* 1.1; 253 | end 254 | 255 | if (diff> tol) % then new bounds are ... 256 | midcount = (upcount + midcount) / 2.0; 257 | % upper and middle activities 258 | midthr = (upthr + midthr) / 2.0; 259 | % ... and thresholds 260 | elseif (diff< -tol) % then new bounds are ... 261 | midcount = (midcount + lwcount) / 2.0; 262 | % middle and lower activities 263 | midthr = (midthr + lwthr) / 2.0; 264 | % ... and thresholds 265 | end 266 | 267 | end 268 | % Since the tolerance has been satisfied, midcount is selected 269 | % as the interpolated value with a tol [dB] tolerance. 
270 | 271 | asl_ms_log= midcount; 272 | cc= midthr; 273 | 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /PESQ/apply_VAD.asv: -------------------------------------------------------------------------------- 1 | function [VAD, logVAD]= apply_VAD( data, Nsamples) 2 | 3 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH 4 | 5 | Nwindows= floor( Nsamples/ Downsample); 6 | %number of 4ms window 7 | 8 | VAD= zeros( 1, Nwindows); 9 | for count= 1: Nwindows 10 | VAD( count)= sum( data( (count-1)* Downsample+ 1: ... 11 | count* Downsample).^ 2)/ Downsample; 12 | end 13 | %VAD is the power of each 4ms window 14 | 15 | LevelThresh = sum( VAD)/ Nwindows; 16 | %LevelThresh is set to mean value of VAD 17 | 18 | LevelMin= max( VAD); 19 | if( LevelMin > 0 ) 20 | LevelMin= LevelMin* 1.0e-4; 21 | else 22 | LevelMin = 1.0; 23 | end 24 | %fprintf( 1, 'LevelMin is %f\n', LevelMin); 25 | 26 | VAD( find( VAD< LevelMin))= LevelMin; 27 | 28 | for iteration= 1: 12 29 | LevelNoise= 0; 30 | len= 0; 31 | StDNoise= 0; 32 | 33 | VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh)); 34 | len= length( VAD_lessthan_LevelThresh); 35 | LevelNoise= sum( VAD_lessthan_LevelThresh); 36 | if (len> 0) 37 | LevelNoise= LevelNoise/ len; 38 | StDNoise= sqrt( sum( ... 
39 | (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len); 40 | end 41 | LevelThresh= 1.001* (LevelNoise+ 2* StDNoise); 42 | end 43 | %fprintf( 1, 'LevelThresh is %f\n', LevelThresh); 44 | 45 | LevelNoise= 0; 46 | LevelSig= 0; 47 | len= 0; 48 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh)); 49 | len= length( VAD_greaterthan_LevelThresh); 50 | LevelSig= sum( VAD_greaterthan_LevelThresh); 51 | 52 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh)); 53 | LevelNoise= sum( VAD_lessorequal_LevelThresh); 54 | 55 | if (len> 0) 56 | LevelSig= LevelSig/ len; 57 | else 58 | LevelThresh= -1; 59 | end 60 | %fprintf( 1, 'LevelSig is %f\n', LevelSig); 61 | 62 | if (len< Nwindows) 63 | LevelNoise= LevelNoise/( Nwindows- len); 64 | else 65 | LevelNoise= 1; 66 | end 67 | %fprintf( 1, 'LevelNoise is %f\n', LevelNoise); 68 | 69 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh)); 70 | VAD(1)= -LevelMin; 71 | VAD(Nwindows)= -LevelMin; 72 | 73 | 74 | start= 0; 75 | finish= 0; 76 | for count= 2: Nwindows 77 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 78 | start = count; 79 | end 80 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 81 | finish = count; 82 | if( (finish - start)<= MINSPEECHLGTH ) 83 | VAD( start: finish- 1)= -VAD( start: finish- 1); 84 | end 85 | end 86 | end 87 | %to make sure finish- start is more than 4 88 | 89 | if( LevelSig >= (LevelNoise* 1000) ) 90 | for count= 2: Nwindows 91 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 92 | start= count; 93 | end 94 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 95 | finish = count; 96 | g = sum( VAD( start: finish- 1)); 97 | if( g< 3.0* LevelThresh* (finish - start) ) 98 | VAD( start: finish- 1)= -VAD( start: finish- 1); 99 | end 100 | end 101 | end 102 | end 103 | 104 | start = 0; 105 | finish = 0; 106 | for count= 2: Nwindows 107 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 108 | start = count; 109 | if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) ) 110 | VAD( finish: 
start- 1)= LevelMin; 111 | end 112 | end 113 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 114 | finish = count; 115 | end 116 | end 117 | 118 | start= 0; 119 | for count= 2: Nwindows 120 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 121 | start= count; 122 | end 123 | end 124 | if( start== 0 ) 125 | VAD= abs(VAD); 126 | VAD(1) = -LevelMin; 127 | VAD(Nwindows) = -LevelMin; 128 | end 129 | 130 | count = 4; 131 | while( count< (Nwindows-1) ) 132 | if( (VAD(count)> 0) && (VAD(count-2) <= 0) ) 133 | VAD(count-2)= VAD(count)* 0.1; 134 | VAD(count-1)= VAD(count)* 0.3; 135 | count= count+ 1; 136 | end 137 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 138 | VAD(count)= VAD(count-1)* 0.3; 139 | VAD(count+ 1)= VAD(count-1)* 0.1; 140 | count= count+ 3; 141 | end 142 | count= count+ 1; 143 | end 144 | 145 | VAD( find( VAD< 0))= 0; 146 | 147 | % fid= fopen( 'mat_vad.txt', 'wt'); 148 | % fprintf( fid, '%f\n', VAD); 149 | % fclose( fid); 150 | 151 | if( LevelThresh<= 0 ) 152 | LevelThresh= LevelMin; 153 | end 154 | %No me queda claro que se hace antes de esto. Es evidente que se calcula el 155 | %nivel umbral (LevelThresh). Pero tambien se le hace 156 | 157 | 158 | %Si VAD (que contiene la energia en 4 ms (32 muestras a 8Khz) NO es superior 159 | %al umbral entonces el envelope es 0 (log(MAX(VAD/LevelThresh,1))) 160 | logVAD( find( VAD<= LevelThresh))= 0; 161 | %Si VAD es superior al umbral, entonces se divide por el Level y se le hace 162 | %el log 163 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh); 164 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD(VAD_greaterthan_LevelThresh)/ LevelThresh); 165 | %LogVAD queda relleno tanto de los que superan como los que no el umbral. 
166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /PESQ/apply_VAD.m: -------------------------------------------------------------------------------- 1 | function [VAD, logVAD]= apply_VAD( data, Nsamples) 2 | 3 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH 4 | 5 | Nwindows= floor( Nsamples/ Downsample); 6 | %number of 4ms window 7 | 8 | VAD= zeros( 1, Nwindows); 9 | for count= 1: Nwindows 10 | VAD( count)= sum( data( (count-1)* Downsample+ 1: ... 11 | count* Downsample).^ 2)/ Downsample; 12 | end 13 | %VAD is the power of each 4ms window 14 | 15 | LevelThresh = sum( VAD)/ Nwindows; 16 | %LevelThresh is set to mean value of VAD 17 | 18 | LevelMin= max( VAD); 19 | if( LevelMin > 0 ) 20 | LevelMin= LevelMin* 1.0e-4; 21 | else 22 | LevelMin = 1.0; 23 | end 24 | %fprintf( 1, 'LevelMin is %f\n', LevelMin); 25 | 26 | VAD( find( VAD< LevelMin))= LevelMin; 27 | 28 | for iteration= 1: 12 29 | LevelNoise= 0; 30 | len= 0; 31 | StDNoise= 0; 32 | 33 | VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh)); 34 | len= length( VAD_lessthan_LevelThresh); 35 | LevelNoise= sum( VAD_lessthan_LevelThresh); 36 | if (len> 0) 37 | LevelNoise= LevelNoise/ len; 38 | StDNoise= sqrt( sum( ... 39 | (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len); 40 | end 41 | LevelThresh= 1.001* (LevelNoise+ 2* StDNoise); 42 | end 43 | %fprintf( 1, 'LevelThresh is %f\n', LevelThresh); 44 | 45 | %HASTA AQUI es asequible, lo siguiente no lo entiendo (pero creo que es 46 | %cocinica. 
47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | 49 | LevelNoise= 0; 50 | LevelSig= 0; 51 | len= 0; 52 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh)); 53 | len= length( VAD_greaterthan_LevelThresh); 54 | LevelSig= sum( VAD_greaterthan_LevelThresh); 55 | 56 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh)); 57 | LevelNoise= sum( VAD_lessorequal_LevelThresh); 58 | 59 | if (len> 0) 60 | LevelSig= LevelSig/ len; 61 | else 62 | LevelThresh= -1; 63 | end 64 | %fprintf( 1, 'LevelSig is %f\n', LevelSig); 65 | 66 | if (len< Nwindows) 67 | LevelNoise= LevelNoise/( Nwindows- len); 68 | else 69 | LevelNoise= 1; 70 | end 71 | %fprintf( 1, 'LevelNoise is %f\n', LevelNoise); 72 | 73 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh)); 74 | VAD(1)= -LevelMin; 75 | VAD(Nwindows)= -LevelMin; 76 | 77 | 78 | start= 0; 79 | finish= 0; 80 | for count= 2: Nwindows 81 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 82 | start = count; 83 | end 84 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 85 | finish = count; 86 | if( (finish - start)<= MINSPEECHLGTH ) 87 | VAD( start: finish- 1)= -VAD( start: finish- 1); 88 | end 89 | end 90 | end 91 | %to make sure finish- start is more than 4 92 | 93 | if( LevelSig >= (LevelNoise* 1000) ) 94 | for count= 2: Nwindows 95 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 96 | start= count; 97 | end 98 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 99 | finish = count; 100 | g = sum( VAD( start: finish- 1)); 101 | if( g< 3.0* LevelThresh* (finish - start) ) 102 | VAD( start: finish- 1)= -VAD( start: finish- 1); 103 | end 104 | end 105 | end 106 | end 107 | 108 | start = 0; 109 | finish = 0; 110 | for count= 2: Nwindows 111 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 112 | start = count; 113 | if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) ) 114 | VAD( finish: start- 1)= LevelMin; 115 | end 116 | end 117 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) 
) 118 | finish = count; 119 | end 120 | end 121 | 122 | start= 0; 123 | for count= 2: Nwindows 124 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 125 | start= count; 126 | end 127 | end 128 | if( start== 0 ) 129 | VAD= abs(VAD); 130 | VAD(1) = -LevelMin; 131 | VAD(Nwindows) = -LevelMin; 132 | end 133 | 134 | count = 4; 135 | while( count< (Nwindows-1) ) 136 | if( (VAD(count)> 0) && (VAD(count-2) <= 0) ) 137 | VAD(count-2)= VAD(count)* 0.1; 138 | VAD(count-1)= VAD(count)* 0.3; 139 | count= count+ 1; 140 | end 141 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 142 | VAD(count)= VAD(count-1)* 0.3; 143 | VAD(count+ 1)= VAD(count-1)* 0.1; 144 | count= count+ 3; 145 | end 146 | count= count+ 1; 147 | end 148 | 149 | VAD( find( VAD< 0))= 0; 150 | 151 | % fid= fopen( 'mat_vad.txt', 'wt'); 152 | % fprintf( fid, '%f\n', VAD); 153 | % fclose( fid); 154 | 155 | if( LevelThresh<= 0 ) 156 | LevelThresh= LevelMin; 157 | end 158 | 159 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 160 | %No me queda claro que se hace antes de esto. Es evidente que se calcula el 161 | %nivel umbral (LevelThresh). Pero tambien se le hace un acomodado a las 162 | %senal VAD que no se especifica en la descripcion del estandar. Senal VAD 163 | %contiene la energia en 4 ms (32 muestras a 8Khz). 164 | 165 | %Si VAD NO es superior al umbral entonces el envelope es 0 (log(MAX(VAD/LevelThresh,1))) 166 | logVAD( find( VAD<= LevelThresh))= 0; 167 | %Si VAD es superior al umbral, entonces se divide por el Level y se le hace 168 | %el log 169 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh); 170 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD(VAD_greaterthan_LevelThresh)/ LevelThresh); 171 | %LogVAD queda relleno tanto de los que superan como los que no el umbral. 
-------------------------------------------------------------------------------- /PESQ/apply_filter.asv: --------------------------------------------------------------------------------
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% Zero-phase, frequency-domain filtering of the active part of `data`.
%
% `data` is assumed to carry SEARCHBUFFER*Downsample guard samples at both
% ends plus DATAPADDING_MSECS of zero padding; only the samples between the
% guards are filtered, the guards pass through unchanged.
% `align_filter_dB` is an N x 2 table [frequency_Hz, gain_dB].  Its response
% is interpolated onto the FFT bins, normalised to 0 dB at 1 kHz, mirrored
% into a real symmetric spectrum and multiplied with the FFT of the signal,
% i.e. an FFT-based filter that leaves the signal phase untouched.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

align_filtered= data;

% Number of samples to filter, and the next power of two for the FFT.
margin= SEARCHBUFFER* Downsample;
n= data_Nsamples- 2* margin+ DATAPADDING_MSECS* (Fs/ 1000);
nfft= 2^ nextpow2( n);

% Filter gain at 1 kHz, used as the 0 dB reference.
gain1kHz= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), 1000);

% Move the active samples into a zero-padded buffer and transform.
buf= zeros( 1, nfft);
buf( 1: n)= data( margin+ 1: margin+ n);
spec= fft( buf, nfft);

% Interpolate the desired magnitude (dB) onto the non-negative bins.
% NOTE(review): bins above the last table frequency yield NaN from interp1 -
% presumably the table always spans [0, Fs/2]; confirm against callers.
binHz= (0: nfft/ 2)* (Fs/ nfft);
gainDb= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), binHz)- gain1kHz;
gain= 10.^ (gainDb/ 20);

% Mirror to a real, symmetric (zero-phase) response and apply it.
gain= [gain, fliplr( gain( 2: nfft/ 2))];
filtered= ifft( spec.* gain, nfft);

align_filtered( margin+ 1: margin+ n)= filtered( 1: n);
-------------------------------------------------------------------------------- /PESQ/apply_filter.m: --------------------------------------------------------------------------------
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% Zero-phase, frequency-domain filtering of the active part of `data`.
%
% `data` is assumed to carry SEARCHBUFFER*Downsample guard samples at both
% ends plus DATAPADDING_MSECS of zero padding; only the samples between the
% guards are filtered, the guards pass through unchanged.
% `align_filter_dB` is an N x 2 table [frequency_Hz, gain_dB].  Its response
% is interpolated onto the FFT bins, normalised to 0 dB at 1 kHz, mirrored
% into a real symmetric spectrum and multiplied with the FFT of the signal,
% i.e. an FFT-based filter that leaves the signal phase untouched.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

align_filtered= data;

% Number of samples to filter, and the next power of two for the FFT.
margin= SEARCHBUFFER* Downsample;
n= data_Nsamples- 2* margin+ DATAPADDING_MSECS* (Fs/ 1000);
nfft= 2^ nextpow2( n);

% Filter gain at 1 kHz, used as the 0 dB reference.
gain1kHz= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), 1000);

% Move the active samples into a zero-padded buffer and transform.
buf= zeros( 1, nfft);
buf( 1: n)= data( margin+ 1: margin+ n);
spec= fft( buf, nfft);

% Interpolate the desired magnitude (dB) onto the non-negative bins.
binHz= (0: nfft/ 2)* (Fs/ nfft);
gainDb= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), binHz)- gain1kHz;
gain= 10.^ (gainDb/ 20);

% Mirror to a real, symmetric (zero-phase) response and apply it.
gain= [gain, fliplr( gain( 2: nfft/ 2))];
filtered= ifft( spec.* gain, nfft);

align_filtered( margin+ 1: margin+ n)= filtered( 1: n);
-------------------------------------------------------------------------------- /PESQ/apply_filters.m: --------------------------------------------------------------------------------
function mod_data= apply_filters( data, Nsamples)
% IIR input filtering: applies the filter described by the global
% second-order sections InIIR_Hsos to `data`.
%
% Each row of InIIR_Hsos holds [b0 b1 b2 a1 a2] for one section (a0 == 1).
% `Nsamples` is accepted for interface compatibility but is not used here.
global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs

% Assemble the [b0 b1 b2 a0 a1 a2] section matrix expected by dfilt.df2sos.
sosMatrix= [InIIR_Hsos( :, 1: 3), ones( InIIR_Nsos, 1), InIIR_Hsos( :, 4: 5)];

% Direct-form II second-order-section filter.
iirdf2= dfilt.df2sos( sosMatrix);
mod_data= filter( iirdf2, data);
-------------------------------------------------------------------------------- /PESQ/comp_cep.asv:
--------------------------------------------------------------------------------
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used in [1].
%
% Usage:  CEP = comp_cep(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   CEP           - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call:  CEP = comp_cep('sp04.wav','enhanced.wav')
%
% References:
%   [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%       evaluation for low bit-rate speech coding systems. IEEE J. Select.
%       Areas in Comm., 6(2), 262-273.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

alpha=0.95;   % fraction of the (sorted) frame distances kept in the mean

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim both signals to a common length; eps guards the LPC analysis
% against all-zero (digital silence) frames.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

IS_dist= cepstrum( data1, data2,Srate1);

% Average after discarding the worst 5% of frames.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

% BUG FIX: the result used to be assigned to `is_mean`, leaving the declared
% output `cep_mean` unset, so MATLAB raised "Output argument ... not assigned"
% whenever the caller read the result.  (comp_cep.m already has this fix.)
cep_mean= mean( IS( 1: IS_len));


function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% Per-frame cepstrum distance between two equal-length signals.
% Returns one clamped distance value per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Both Speech Files must be same length.');
    return
end

% Analysis set-up: 30 ms Hanning-windowed frames with 75% overlap.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;    % LPC analysis order
else
    P=16;      % higher order for wider-band signals
end
C=10*sqrt(2)/log(10);   % cepstrum-distance scale factor

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
start = 1;                                                 % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); % Hanning window

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) LPC parameters of each frame, converted to cepstra.
    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    C_clean=lpc2cep(A_clean);
    C_processed=lpc2cep(A_processed);

    % (3) Cepstrum distance, clamped to 10.
    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end


function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% Autocorrelation method of LPC analysis (Levinson-Durbin recursion).

% (1) Autocorrelation lags 0..model_order.
winlength = max(size(speech_frame));
for k=1:model_order+1
    R(k) = sum(speech_frame(1:winlength-k+1) ...
        .*speech_frame(k:winlength));
end

% (2) Levinson-Durbin recursion.
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
    a_past(1:i-1) = a(1:i-1);
    sum_term = sum(a_past(1:i-1).*R(i:-1:2));
    rcoeff(i)=(R(i+1) - sum_term) / E(i);
    a(i)=rcoeff(i);
    a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
    E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end

acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

%----------------------------------------------
function [cep]=lpc2cep(a)
% Converts LPC prediction coefficients to cepstrum coefficients
% via the standard recursion.
%
% Author: Philipos C. Loizou

M=length(a);
cep=zeros(1,M-1);

cep(1)=-a(2);

for k=2:M-1
    ix=1:k-1;
    vec1=cep(ix).*a(k-1+1:-1:2).*ix;
    cep(k)=-(a(k+1)+sum(vec1)/k);
end

-------------------------------------------------------------------------------- /PESQ/comp_cep.m: --------------------------------------------------------------------------------
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used in [1].
%
% Usage:  CEP = comp_cep(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   CEP           - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call:  CEP = comp_cep('sp04.wav','enhanced.wav')
%
% References:
%   [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%       evaluation for low bit-rate speech coding systems. IEEE J. Select.
%       Areas in Comm., 6(2), 262-273.
%
% Author: Philipos C. Loizou
% (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
% Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

alpha=0.95;   % fraction of the (sorted) frame distances kept in the mean

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim both signals to a common length; eps guards the LPC analysis
% against all-zero (digital silence) frames.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

IS_dist= cepstrum( data1, data2,Srate1);

% Average after discarding the worst 5% of frames.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

cep_mean= mean( IS( 1: IS_len));


function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% Per-frame cepstrum distance between two equal-length signals.
% Returns one clamped distance value per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Both Speech Files must be same length.');
    return
end

% Analysis set-up: 30 ms Hanning-windowed frames with 75% overlap.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;    % LPC analysis order
else
    P=16;      % higher order for wider-band signals
end
C=10*sqrt(2)/log(10);   % cepstrum-distance scale factor

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
start = 1;                                                 % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); % Hanning window

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) LPC parameters of each frame, converted to cepstra.
    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    C_clean=lpc2cep(A_clean);
    C_processed=lpc2cep(A_processed);

    % (3) Cepstrum distance, clamped to 10.
    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end


function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% Autocorrelation method of LPC analysis (Levinson-Durbin recursion).

% (1) Autocorrelation lags 0..model_order.
winlength = max(size(speech_frame));
for k=1:model_order+1
    R(k) = sum(speech_frame(1:winlength-k+1) ...
        .*speech_frame(k:winlength));
end

% (2) Levinson-Durbin recursion.
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
    a_past(1:i-1) = a(1:i-1);
    sum_term = sum(a_past(1:i-1).*R(i:-1:2));
    rcoeff(i)=(R(i+1) - sum_term) / E(i);
    a(i)=rcoeff(i);
    a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
    E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end

acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

%----------------------------------------------
function [cep]=lpc2cep(a)
% Converts LPC prediction coefficients to cepstrum coefficients
% via the standard recursion.
%
% Author: Philipos C. Loizou

M=length(a);
cep=zeros(1,M-1);

cep(1)=-a(2);

for k=2:M-1
    ix=1:k-1;
    vec1=cep(ix).*a(k-1+1:-1:2).*ix;
    cep(k)=-(a(k+1)+sum(vec1)/k);
end

-------------------------------------------------------------------------------- /PESQ/comp_fwseg.asv: --------------------------------------------------------------------------------
function wss_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency-weighted segmental SNR (fwSNRseg) objective measure.
%
% NOTE(review): this is a stale editor autosave (.asv) of comp_fwseg.m;
% its original help text still described comp_cep.  See comp_fwseg.m for
% the corrected documentation and reference.
%
% Usage:  fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
% Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    % NOTE(review): stale autosave - these messages still name comp_cep
    % (kept byte-for-byte; the .m version is the maintained copy).
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end


[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim to a common length; eps avoids division by zero on silence.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

wss_dist_vec= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_vec);


% ----------------------------------------------------------------------

function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% Frame-by-frame frequency-weighted SNR over a critical-band filterbank.
% Returns one clamped fwSNR value (dB) per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Files must have same length.');
    return
end

% Analysis parameters.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
max_freq = sample_rate/2;                % maximum bandwidth
num_crit = 25;                           % number of critical bands
USE_25=1;                                % 1: 25 bands, 0: 13 lumped bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                      % FFT size/2
gamma=0.2;                               % power exponent of the band weights

% Critical band filter definitions (center frequency and bandwidth in Hz).
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;

W=[ % articulation index weights
0.003
0.003
0.003
0.007
0.010
0.016
0.016
0.017
0.017
0.022
0.027
0.028
0.030
0.032
0.034
0.035
0.037
0.036
0.036
0.033
0.030
0.029
0.027
0.026
0.026];

W=W';

if USE_25==0 % use 13 bands: lump adjacent filters together
    k=2;
    cent_freq2(1)=cent_freq(1);
    bandwidth2(1)=bandwidth(1)+bandwidth(2);
    W2(1)=W(1);
    for i=2:13
        cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1);
        bandwidth2(i)=bandwidth(k)+bandwidth(k+1);
        W2(i)=0.5*(W(k)+W(k+1));
        k=k+2;
    end

    sumW=sum(W2);
    bw_min = bandwidth2 (1); % minimum critical bandwidth
else
    sumW=sum(W);
    bw_min=bandwidth(1);
end

% Set up the Gaussian-shaped critical band filters.  The filter weights are
% normalised per band; responses below the -30 dB point are zeroed.
min_factor = exp (-30.0 / (2.0 * 2.303));   % -30 dB point of filter
if USE_25==0

    num_crit=length(cent_freq2);

    for i = 1:num_crit
        f0 = (cent_freq2 (i) / max_freq) * (n_fftby2);
        all_f0(i) = floor(f0);
        bw = (bandwidth2 (i) / max_freq) * (n_fftby2);
        norm_factor = log(bw_min) - log(bandwidth2(i));
        j = 0:1:n_fftby2-1;
        crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
        crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
    end

else
    for i = 1:num_crit
        f0 = (cent_freq (i) / max_freq) * (n_fftby2);
        all_f0(i) = floor(f0);
        bw = (bandwidth (i) / max_freq) * (n_fftby2);
        norm_factor = log(bw_min) - log(bandwidth(i));
        j = 0:1:n_fftby2-1;
        crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
        crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
    end
end


num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1;                                               % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) Magnitude spectra, normalised to unit area.
    clean_spec = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));

    % (3) Filterbank energies, per-band SNR and clean-spectrum weighting.
    clean_energy=zeros(1,num_crit);
    processed_energy=zeros(1,num_crit);
    error_energy=zeros(1,num_crit);
    W_freq=zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');

        error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
        W_freq(i)=(clean_energy(i))^gamma;

    end
    SNRlog=10*log10((clean_energy.^2)./error_energy);

    fwSNR=sum(W_freq.*SNRlog)/sum(W_freq);

    distortion(frame_count)=min(max(fwSNR,-10),35);   % clamp to [-10, 35] dB

    start = start + skiprate;

end

-------------------------------------------------------------------------------- /PESQ/comp_fwseg.m: --------------------------------------------------------------------------------
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency weighted SNRseg Objective Speech Quality Measure
%
% This function implements the frequency-weighted SNRseg measure [1]
% using a different weighting function, the clean spectrum.
%
% Usage:  fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   fwSNRseg      - computed frequency-weighted SNRseg in dB
%
% Note that large numbers of fwSNRseg are better.
%
% Example call:  fwSNRseg = comp_fwseg('sp04.wav','enhanced.wav')
%
% References:
%   [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
%       A study of complexity and quality of speech waveform coders. Proc.
%       IEEE Int. Conf. Acoust., Speech, Signal Processing, 586-590.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_vec= fwseg( data1, data2,Srate1); 50 | fwseg_dist=mean(wss_dist_vec); 51 | 52 | 53 | % ---------------------------------------------------------------------- 54 | 55 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 56 | 57 | 58 | % ---------------------------------------------------------------------- 59 | % Check the length of the clean and processed speech. Must be the same. 
60 | % ---------------------------------------------------------------------- 61 | 62 | clean_length = length(clean_speech); 63 | processed_length = length(processed_speech); 64 | 65 | if (clean_length ~= processed_length) 66 | disp('Error: Files must have same length.'); 67 | return 68 | end 69 | 70 | 71 | 72 | % ---------------------------------------------------------------------- 73 | % Global Variables 74 | % ---------------------------------------------------------------------- 75 | 76 | 77 | winlength = round(30*sample_rate/1000); % window length in samples 78 | skiprate = floor(winlength/4); % window skip in samples 79 | max_freq = sample_rate/2; % maximum bandwidth 80 | num_crit = 25; % number of critical bands 81 | USE_25=1; 82 | n_fft = 2^nextpow2(2*winlength); 83 | n_fftby2 = n_fft/2; % FFT size/2 84 | gamma=0.2; % power exponent 85 | 86 | % ---------------------------------------------------------------------- 87 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 88 | % ---------------------------------------------------------------------- 89 | 90 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 91 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 92 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 93 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 94 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 95 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 96 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 97 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 98 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 99 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 100 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 101 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 102 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 103 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 104 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 105 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 106 | cent_freq(17) = 1610.70; 
bandwidth(17) = 183.457; 107 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 108 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 109 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 110 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 111 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 112 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 113 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 114 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 115 | 116 | W=[ % articulation index weights 117 | 0.003 118 | 0.003 119 | 0.003 120 | 0.007 121 | 0.010 122 | 0.016 123 | 0.016 124 | 0.017 125 | 0.017 126 | 0.022 127 | 0.027 128 | 0.028 129 | 0.030 130 | 0.032 131 | 0.034 132 | 0.035 133 | 0.037 134 | 0.036 135 | 0.036 136 | 0.033 137 | 0.030 138 | 0.029 139 | 0.027 140 | 0.026 141 | 0.026]; 142 | 143 | W=W'; 144 | 145 | if USE_25==0 % use 13 bands 146 | % ----- lump adjacent filters together ---------------- 147 | k=2; 148 | cent_freq2(1)=cent_freq(1); 149 | bandwidth2(1)=bandwidth(1)+bandwidth(2); 150 | W2(1)=W(1); 151 | for i=2:13 152 | cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1); 153 | bandwidth2(i)=bandwidth(k)+bandwidth(k+1); 154 | W2(i)=0.5*(W(k)+W(k+1)); 155 | k=k+2; 156 | end 157 | 158 | sumW=sum(W2); 159 | bw_min = bandwidth2 (1); % minimum critical bandwidth 160 | else 161 | sumW=sum(W); 162 | bw_min=bandwidth(1); 163 | end 164 | 165 | 166 | % ---------------------------------------------------------------------- 167 | % Set up the critical band filters. Note here that Gaussianly shaped 168 | % filters are used. Also, the sum of the filter weights are equivalent 169 | % for each critical band filter. Filter less than -30 dB and set to 170 | % zero. 
171 | % ---------------------------------------------------------------------- 172 | 173 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 174 | if USE_25==0 175 | 176 | num_crit=length(cent_freq2); 177 | 178 | for i = 1:num_crit 179 | f0 = (cent_freq2 (i) / max_freq) * (n_fftby2); 180 | all_f0(i) = floor(f0); 181 | bw = (bandwidth2 (i) / max_freq) * (n_fftby2); 182 | norm_factor = log(bw_min) - log(bandwidth2(i)); 183 | j = 0:1:n_fftby2-1; 184 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 185 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 186 | end 187 | 188 | else 189 | for i = 1:num_crit 190 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 191 | all_f0(i) = floor(f0); 192 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 193 | norm_factor = log(bw_min) - log(bandwidth(i)); 194 | j = 0:1:n_fftby2-1; 195 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 196 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 197 | end 198 | end 199 | 200 | 201 | 202 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 203 | start = 1; % starting sample 204 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 205 | 206 | for frame_count = 1:num_frames 207 | 208 | % ---------------------------------------------------------- 209 | % (1) Get the Frames for the test and reference speech. 210 | % Multiply by Hanning Window. 
211 | % ---------------------------------------------------------- 212 | 213 | clean_frame = clean_speech(start:start+winlength-1); 214 | processed_frame = processed_speech(start:start+winlength-1); 215 | clean_frame = clean_frame.*window; 216 | processed_frame = processed_frame.*window; 217 | 218 | % ---------------------------------------------------------- 219 | % (2) Compute the magnitude Spectrum of Clean and Processed 220 | % ---------------------------------------------------------- 221 | 222 | 223 | clean_spec = abs(fft(clean_frame,n_fft)); 224 | processed_spec = abs(fft(processed_frame,n_fft)); 225 | 226 | % normalize spectra to have area of one 227 | % 228 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 229 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 230 | 231 | % ---------------------------------------------------------- 232 | % (3) Compute Filterbank Output Energies 233 | % ---------------------------------------------------------- 234 | 235 | clean_energy=zeros(1,num_crit); 236 | processed_energy=zeros(1,num_crit); 237 | error_energy=zeros(1,num_crit); 238 | W_freq=zeros(1,num_crit); 239 | 240 | for i = 1:num_crit 241 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 242 | .*crit_filter(i,:)'); 243 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
244 | .*crit_filter(i,:)'); 245 | 246 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 247 | W_freq(i)=(clean_energy(i))^gamma; 248 | 249 | end 250 | SNRlog=10*log10((clean_energy.^2)./error_energy); 251 | 252 | fwSNR=sum(W_freq.*SNRlog)/sum(W_freq); 253 | 254 | distortion(frame_count)=min(max(fwSNR,-10),35); 255 | 256 | start = start + skiprate; 257 | 258 | end 259 | 260 | -------------------------------------------------------------------------------- /PESQ/comp_fwseg_variant.asv: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % 8 | % 9 | % Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % fwSNRseg - computed frequency weighted SNRseg in dB 14 | % 15 | % Note that large numbers of fwSNRseg are better. 16 | % 17 | % Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978). 22 | % A study of complexity and quality of speech waveform coders. Proc. 23 | % IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590. 24 | % 25 | % Author: Philipos C. Loizou 26 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 27 | % 28 | % Copyright (c) 2006 by Philipos C. 
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_matrix= fwseg( data1, data2,Srate1); 50 | wss_dist=mean(wss_dist_matrix); 51 | 52 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 53 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 54 | -0.002,0.017,-0.03,0.073,0.043]; 55 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 56 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 57 | 0.011,-0.002,-0.021,0.043,0.031]; 58 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 59 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 60 | -0.028,0.019,0.005]; 61 | 62 | SIG=0.567+sum(b_sig.*wss_dist); 63 | BAK=1.013+sum(b_bak.*wss_dist); 64 | OVL=0.446+sum(b_ovl.*wss_dist); 65 | 66 | 67 | % ---------------------------------------------------------------------- 68 | 69 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 70 | 71 | 72 | % ---------------------------------------------------------------------- 73 | % Check the length of the clean and processed speech. Must be the same. 
74 | % ---------------------------------------------------------------------- 75 | 76 | clean_length = length(clean_speech); 77 | processed_length = length(processed_speech); 78 | 79 | if (clean_length ~= processed_length) 80 | disp('Error: Files must have same length.'); 81 | return 82 | end 83 | 84 | 85 | 86 | % ---------------------------------------------------------------------- 87 | % Global Variables 88 | % ---------------------------------------------------------------------- 89 | 90 | 91 | winlength = round(30*sample_rate/1000); % window length in samples 92 | skiprate = floor(winlength/4); % window skip in samples 93 | max_freq = sample_rate/2; % maximum bandwidth 94 | num_crit = 25; % number of critical bands 95 | 96 | n_fft = 2^nextpow2(2*winlength); 97 | n_fftby2 = n_fft/2; % FFT size/2 98 | 99 | % ---------------------------------------------------------------------- 100 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 101 | % ---------------------------------------------------------------------- 102 | 103 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 104 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 105 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 106 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 107 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 108 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 109 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 110 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 111 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 112 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 113 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 114 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 115 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 116 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 117 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 118 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 119 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 120 | 
cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 121 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 122 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 123 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 124 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 125 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 126 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 127 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 128 | 129 | 130 | bw_min = bandwidth (1); % minimum critical bandwidth 131 | 132 | 133 | % ---------------------------------------------------------------------- 134 | % Set up the critical band filters. Note here that Gaussianly shaped 135 | % filters are used. Also, the sum of the filter weights are equivalent 136 | % for each critical band filter. Filter less than -30 dB and set to 137 | % zero. 138 | % ---------------------------------------------------------------------- 139 | 140 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 141 | 142 | for i = 1:num_crit 143 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 144 | all_f0(i) = floor(f0); 145 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 146 | norm_factor = log(bw_min) - log(bandwidth(i)); 147 | j = 0:1:n_fftby2-1; 148 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 149 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 150 | end 151 | 152 | % ---------------------------------------------------------------------- 153 | % For each frame of input speech, calculate the Weighted Spectral 154 | % Slope Measure 155 | % ---------------------------------------------------------------------- 156 | 157 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 158 | start = 1; % starting sample 159 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 160 | 161 | distortion=zeros(num_frames,num_crit); 162 | for frame_count = 1:num_frames 163 | 164 | % 
---------------------------------------------------------- 165 | % (1) Get the Frames for the test and reference speech. 166 | % Multiply by Hanning Window. 167 | % ---------------------------------------------------------- 168 | 169 | clean_frame = clean_speech(start:start+winlength-1); 170 | processed_frame = processed_speech(start:start+winlength-1); 171 | clean_frame = clean_frame.*window; 172 | processed_frame = processed_frame.*window; 173 | 174 | % ---------------------------------------------------------- 175 | % (2) Compute the magnitude Spectrum of Clean and Processed 176 | % ---------------------------------------------------------- 177 | 178 | 179 | clean_spec = abs(fft(clean_frame,n_fft)); 180 | processed_spec = abs(fft(processed_frame,n_fft)); 181 | 182 | % normalize so that spectra have unit area ---- 183 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 184 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 185 | 186 | % ---------------------------------------------------------- 187 | % (3) Compute Filterbank Output Energies (in dB scale) 188 | % ---------------------------------------------------------- 189 | 190 | clean_energy=zeros(1,num_crit); 191 | processed_energy=zeros(1,num_crit); 192 | error_energy=zeros(1,num_crit); 193 | 194 | for i = 1:num_crit 195 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 196 | .*crit_filter(i,:)'); 197 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
198 | .*crit_filter(i,:)'); 199 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 200 | end 201 | 202 | 203 | SNRlog=10*log10((clean_energy.^2)./error_energy); 204 | 205 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 206 | 207 | start = start + skiprate; 208 | 209 | end 210 | 211 | -------------------------------------------------------------------------------- /PESQ/comp_fwseg_variant.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % (see also Chap. 10, Eq. 10.24) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % Author: Philipos C. Loizou 29 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 30 | % 31 | % Copyright (c) 2006 by Philipos C. 
Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | if nargin~=2 36 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n'); 37 | fprintf('For more help, type: help comp_fwseg_variant\n\n'); 38 | return; 39 | end 40 | 41 | 42 | [data1, Srate1, Nbits1]= wavread(cleanFile); 43 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 44 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 45 | error( 'The two files do not match!\n'); 46 | end 47 | 48 | len= min( length( data1), length( data2)); 49 | data1= data1( 1: len)+eps; 50 | data2= data2( 1: len)+eps; 51 | 52 | wss_dist_matrix= fwseg( data1, data2,Srate1); 53 | wss_dist=mean(wss_dist_matrix); 54 | 55 | % initialize coefficients obtained from multiple linear 56 | % regression analysis 57 | % 58 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 59 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 60 | -0.002,0.017,-0.03,0.073,0.043]; 61 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 62 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 63 | 0.011,-0.002,-0.021,0.043,0.031]; 64 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 65 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 
66 | -0.028,0.019,0.005]; 67 | 68 | SIG=0.567+sum(b_sig.*wss_dist); 69 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 70 | 71 | BAK=1.013+sum(b_bak.*wss_dist); 72 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 73 | 74 | OVL=0.446+sum(b_ovl.*wss_dist); 75 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 76 | 77 | 78 | % ---------------------------------------------------------------------- 79 | 80 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 81 | 82 | 83 | % ---------------------------------------------------------------------- 84 | % Check the length of the clean and processed speech. Must be the same. 85 | % ---------------------------------------------------------------------- 86 | 87 | clean_length = length(clean_speech); 88 | processed_length = length(processed_speech); 89 | 90 | if (clean_length ~= processed_length) 91 | disp('Error: Files must have same length.'); 92 | return 93 | end 94 | 95 | 96 | 97 | % ---------------------------------------------------------------------- 98 | % Global Variables 99 | % ---------------------------------------------------------------------- 100 | 101 | 102 | winlength = round(30*sample_rate/1000); % window length in samples 103 | skiprate = floor(winlength/4); % window skip in samples 104 | max_freq = sample_rate/2; % maximum bandwidth 105 | num_crit = 25; % number of critical bands 106 | 107 | n_fft = 2^nextpow2(2*winlength); 108 | n_fftby2 = n_fft/2; % FFT size/2 109 | 110 | % ---------------------------------------------------------------------- 111 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 112 | % ---------------------------------------------------------------------- 113 | 114 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 115 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 116 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 117 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 118 | cent_freq(5) = 330.000; 
bandwidth(5) = 70.0000; 119 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 120 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 121 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 122 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 123 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 124 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 125 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 126 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 127 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 128 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 129 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 130 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 131 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 132 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 133 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 134 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 135 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 136 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 137 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 138 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 139 | 140 | 141 | bw_min = bandwidth (1); % minimum critical bandwidth 142 | 143 | 144 | % ---------------------------------------------------------------------- 145 | % Set up the critical band filters. Note here that Gaussianly shaped 146 | % filters are used. Also, the sum of the filter weights are equivalent 147 | % for each critical band filter. Filter less than -30 dB and set to 148 | % zero. 
149 | % ---------------------------------------------------------------------- 150 | 151 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 152 | 153 | for i = 1:num_crit 154 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 155 | all_f0(i) = floor(f0); 156 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 157 | norm_factor = log(bw_min) - log(bandwidth(i)); 158 | j = 0:1:n_fftby2-1; 159 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 160 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 161 | end 162 | 163 | % ---------------------------------------------------------------------- 164 | % For each frame of input speech, calculate the Weighted Spectral 165 | % Slope Measure 166 | % ---------------------------------------------------------------------- 167 | 168 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 169 | start = 1; % starting sample 170 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 171 | 172 | distortion=zeros(num_frames,num_crit); 173 | for frame_count = 1:num_frames 174 | 175 | % ---------------------------------------------------------- 176 | % (1) Get the Frames for the test and reference speech. 177 | % Multiply by Hanning Window. 
178 | % ---------------------------------------------------------- 179 | 180 | clean_frame = clean_speech(start:start+winlength-1); 181 | processed_frame = processed_speech(start:start+winlength-1); 182 | clean_frame = clean_frame.*window; 183 | processed_frame = processed_frame.*window; 184 | 185 | % ---------------------------------------------------------- 186 | % (2) Compute the magnitude Spectrum of Clean and Processed 187 | % ---------------------------------------------------------- 188 | 189 | 190 | clean_spec = abs(fft(clean_frame,n_fft)); 191 | processed_spec = abs(fft(processed_frame,n_fft)); 192 | 193 | % normalize so that spectra have unit area ---- 194 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 195 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 196 | 197 | % ---------------------------------------------------------- 198 | % (3) Compute Filterbank Output Energies (in dB scale) 199 | % ---------------------------------------------------------- 200 | 201 | clean_energy=zeros(1,num_crit); 202 | processed_energy=zeros(1,num_crit); 203 | error_energy=zeros(1,num_crit); 204 | 205 | for i = 1:num_crit 206 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 207 | .*crit_filter(i,:)'); 208 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
209 | .*crit_filter(i,:)'); 210 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 211 | end 212 | 213 | 214 | SNRlog=10*log10((clean_energy.^2)./error_energy); 215 | 216 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 217 | 218 | start = start + skiprate; 219 | 220 | end 221 | 222 | -------------------------------------------------------------------------------- /PESQ/comp_is.asv: -------------------------------------------------------------------------------- 1 | function is_mean= compIS(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Itakura-Saito (IS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Itakura-Saito distance measure 7 | % defined on page 50 of [1] (see Equation 2.26). See also 8 | % Equation 12 (page 1480) of [2]. 9 | % 10 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % llr - computed likelihood ratio 15 | % 16 | % Note that the IS measure is limited in the range [0, 100]. 17 | % 18 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 19 | % 20 | % 21 | % References: 22 | % 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 29 | % Speech Coder Performance Evaluation", AT&T Bell 30 | % Laboratories Technical Journal, Vol. 63, No. 8, 31 | % October 1984, pp. 1477-1498. 
32 | % 33 | % ---------------------------------------------------------------------- 34 | 35 | 36 | alpha=0.95; 37 | 38 | [data1, Srate1, Nbits1]= wavread(cleanFile); 39 | [data2, Srate2, Nbits2]= wavread(enhdFile); 40 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 41 | error( 'The two files do not match!\n'); 42 | end 43 | 44 | len= min( length( data1), length( data2)); 45 | data1= data1( 1: len)+eps; 46 | data2= data2( 1: len)+eps; 47 | 48 | 49 | IS_dist= is( data1, data2,Srate1); 50 | 51 | IS_len= round( length( IS_dist)* alpha); 52 | IS= sort( IS_dist); 53 | 54 | is_mean= mean( IS( 1: IS_len)); 55 | 56 | 57 | 58 | function distortion = is(clean_speech, processed_speech,sample_rate) 59 | 60 | 61 | % ---------------------------------------------------------------------- 62 | % Check the length of the clean and processed speech. Must be the same. 63 | % ---------------------------------------------------------------------- 64 | 65 | clean_length = length(clean_speech); 66 | processed_length = length(processed_speech); 67 | 68 | if (clean_length ~= processed_length) 69 | disp('Error: Both Speech Files must be same length.'); 70 | return 71 | end 72 | 73 | % ---------------------------------------------------------------------- 74 | % Scale both clean speech and processed speech to have same dynamic 75 | % range. 
Also remove DC component from each signal 76 | % ---------------------------------------------------------------------- 77 | 78 | %clean_speech = clean_speech - mean(clean_speech); 79 | %processed_speech = processed_speech - mean(processed_speech); 80 | 81 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 82 | 83 | % ---------------------------------------------------------------------- 84 | % Global Variables 85 | % ---------------------------------------------------------------------- 86 | 87 | %sample_rate = 8000; % default sample rate 88 | winlength = round(30*sample_rate/1000); %240; % window length in samples 89 | skiprate = floor(winlength/4); % window skip in samples 90 | if sample_rate<10000 91 | P = 10; % LPC Analysis Order 92 | else 93 | P=16; % this could vary depending on sampling frequency. 94 | end 95 | % ---------------------------------------------------------------------- 96 | % For each frame of input speech, calculate the Itakura-Saito Measure 97 | % ---------------------------------------------------------------------- 98 | 99 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 100 | start = 1; % starting sample 101 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 102 | 103 | for frame_count = 1:num_frames 104 | 105 | % ---------------------------------------------------------- 106 | % (1) Get the Frames for the test and reference speech. 107 | % Multiply by Hanning Window. 108 | % ---------------------------------------------------------- 109 | 110 | clean_frame = clean_speech(start:start+winlength-1); 111 | processed_frame = processed_speech(start:start+winlength-1); 112 | clean_frame = clean_frame.*window; 113 | processed_frame = processed_frame.*window; 114 | 115 | % ---------------------------------------------------------- 116 | % (2) Get the autocorrelation lags and LPC parameters used 117 | % to compute the IS measure. 
118 | % ---------------------------------------------------------- 119 | 120 | [R_clean, Ref_clean, A_clean] = ... 121 | lpcoeff(clean_frame, P); 122 | [R_processed, Ref_processed, A_processed] = ... 123 | lpcoeff(processed_frame, P); 124 | 125 | 126 | % ---------------------------------------------------------- 127 | % (3) Compute the IS measure 128 | % ---------------------------------------------------------- 129 | 130 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 131 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 132 | gain_clean = max(R_clean*A_clean',eps); % this is gain 133 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 134 | 135 | 136 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 137 | log(gain_processed/gain_clean)-1; 138 | 139 | distortion(frame_count) = min(ISvalue,100); 140 | start = start + skiprate; 141 | 142 | end 143 | 144 | 145 | % ---------------------------------------------------------------------- 146 | % 147 | % Linear Prediction Coefficient Computation 148 | % 149 | % Robust Speech Processing Laboratory 150 | % Duke University, USA 151 | % Copyright (c) 1998 152 | % All Rights Reserved. 153 | % 154 | % Description: 155 | % 156 | % This function returns the autocorrelation lags, reflection 157 | % coefficients, and linear prediction coefficients for a 158 | % given input frame of speech and a desired LP model order. 159 | % it uses the levinson-durbin algorithm as described on page 160 | % 300 (Fig. 5.8) of [1]. 161 | % 162 | % Input/Output: 163 | % 164 | % The input is a reference 8kHz sampled clean frame of speech 165 | % and a desired number of reflection coefficients. The function 166 | % returns the autocorrelation lags, reflection coefficients, 167 | % and linear prediction coefficients in an array. 168 | % 169 | % References: 170 | % 171 | % [1] J. Deller, J. Proakis, J. Hansen, Discrete-Time Processing 172 | % of Speech Signals. 
Macmillan series for Prentice-Hall, 173 | % New York, 1993. 174 | % 175 | % Authors: 176 | % 177 | % Bryan L. Pellom and John H. L. Hansen 178 | % Robust Speech Processing Laboratory, Duke University 179 | % Department of Electrical Engineeering 180 | % 181 | % Last Modified: 182 | % 183 | % July 22, 1998 184 | % 185 | % ---------------------------------------------------------------------- 186 | 187 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 188 | 189 | % ---------------------------------------------------------- 190 | % (1) Compute Autocorrelation Lags 191 | % ---------------------------------------------------------- 192 | 193 | winlength = max(size(speech_frame)); 194 | for k=1:model_order+1 195 | R(k) = sum(speech_frame(1:winlength-k+1) ... 196 | .*speech_frame(k:winlength)); 197 | end 198 | 199 | % ---------------------------------------------------------- 200 | % (2) Levinson-Durbin 201 | % ---------------------------------------------------------- 202 | 203 | a = ones(1,model_order); 204 | E(1)=R(1); 205 | for i=1:model_order 206 | a_past(1:i-1) = a(1:i-1); 207 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 208 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 209 | a(i)=rcoeff(i); 210 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 211 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 212 | end 213 | 214 | acorr = R; 215 | refcoeff = rcoeff; 216 | lpparams = [1 -a]; 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /PESQ/comp_is.m: -------------------------------------------------------------------------------- 1 | function is_mean= comp_is(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % Itakura-Saito (IS) Objective Speech Quality Measure 4 | % 5 | % This function implements the Itakura-Saito distance measure 6 | % defined on page 50 of [1] (see Equation 2.26). See also 7 | % Equation 12 (page 1480) of [2]. 
8 | % 9 | % Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % IS - computed Itakura Saito measure 14 | % 15 | % Note that the IS measure is limited in the range [0, 100]. 16 | % 17 | % Example call: IS =comp_is('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 23 | % Objective Measures of Speech Quality. Prentice Hall 24 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 25 | % ISBN: 0-13-629056-6. 26 | % 27 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 28 | % Speech Coder Performance Evaluation", AT&T Bell 29 | % Laboratories Technical Journal, Vol. 63, No. 8, 30 | % October 1984, pp. 1477-1498. 31 | % 32 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 33 | % Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100] 34 | % 35 | % Copyright (c) 2006 by Philipos C. 
Loizou 36 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 37 | 38 | % ---------------------------------------------------------------------- 39 | 40 | if nargin~=2 41 | fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n'); 42 | fprintf('For more help, type: help comp_is\n\n'); 43 | return; 44 | end 45 | 46 | alpha=0.95; 47 | 48 | [data1, Srate1, Nbits1]= wavread(cleanFile); 49 | [data2, Srate2, Nbits2]= wavread(enhdFile); 50 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 51 | error( 'The two files do not match!\n'); 52 | end 53 | 54 | len= min( length( data1), length( data2)); 55 | data1= data1( 1: len)+eps; 56 | data2= data2( 1: len)+eps; 57 | 58 | 59 | IS_dist= is( data1, data2,Srate1); 60 | 61 | IS_len= round( length( IS_dist)* alpha); 62 | IS= sort( IS_dist); 63 | 64 | is_mean= mean( IS( 1: IS_len)); 65 | 66 | 67 | 68 | function distortion = is(clean_speech, processed_speech,sample_rate) 69 | 70 | 71 | % ---------------------------------------------------------------------- 72 | % Check the length of the clean and processed speech. Must be the same. 73 | % ---------------------------------------------------------------------- 74 | 75 | clean_length = length(clean_speech); 76 | processed_length = length(processed_speech); 77 | 78 | if (clean_length ~= processed_length) 79 | disp('Error: Both Speech Files must be same length.'); 80 | return 81 | end 82 | 83 | % ---------------------------------------------------------------------- 84 | % Scale both clean speech and processed speech to have same dynamic 85 | % range. 
Also remove DC component from each signal 86 | % ---------------------------------------------------------------------- 87 | 88 | %clean_speech = clean_speech - mean(clean_speech); 89 | %processed_speech = processed_speech - mean(processed_speech); 90 | 91 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 92 | 93 | % ---------------------------------------------------------------------- 94 | % Global Variables 95 | % ---------------------------------------------------------------------- 96 | 97 | %sample_rate = 8000; % default sample rate 98 | winlength = round(30*sample_rate/1000); %240; % window length in samples 99 | skiprate = floor(winlength/4); % window skip in samples 100 | if sample_rate<10000 101 | P = 10; % LPC Analysis Order 102 | else 103 | P=16; % this could vary depending on sampling frequency. 104 | end 105 | % ---------------------------------------------------------------------- 106 | % For each frame of input speech, calculate the Itakura-Saito Measure 107 | % ---------------------------------------------------------------------- 108 | 109 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 110 | start = 1; % starting sample 111 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 112 | 113 | for frame_count = 1:num_frames 114 | 115 | % ---------------------------------------------------------- 116 | % (1) Get the Frames for the test and reference speech. 117 | % Multiply by Hanning Window. 118 | % ---------------------------------------------------------- 119 | 120 | clean_frame = clean_speech(start:start+winlength-1); 121 | processed_frame = processed_speech(start:start+winlength-1); 122 | clean_frame = clean_frame.*window; 123 | processed_frame = processed_frame.*window; 124 | 125 | % ---------------------------------------------------------- 126 | % (2) Get the autocorrelation lags and LPC parameters used 127 | % to compute the IS measure. 
128 | % ---------------------------------------------------------- 129 | 130 | [R_clean, Ref_clean, A_clean] = ... 131 | lpcoeff(clean_frame, P); 132 | [R_processed, Ref_processed, A_processed] = ... 133 | lpcoeff(processed_frame, P); 134 | 135 | 136 | % ---------------------------------------------------------- 137 | % (3) Compute the IS measure 138 | % ---------------------------------------------------------- 139 | 140 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 141 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 142 | gain_clean = max(R_clean*A_clean',eps); % this is gain 143 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 144 | 145 | 146 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 147 | log(gain_processed/gain_clean)-1; 148 | 149 | distortion(frame_count) = min(ISvalue,100); 150 | start = start + skiprate; 151 | 152 | end 153 | 154 | 155 | 156 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 157 | 158 | % ---------------------------------------------------------- 159 | % (1) Compute Autocorrelation Lags 160 | % ---------------------------------------------------------- 161 | 162 | winlength = max(size(speech_frame)); 163 | for k=1:model_order+1 164 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
165 | .*speech_frame(k:winlength)); 166 | end 167 | 168 | % ---------------------------------------------------------- 169 | % (2) Levinson-Durbin 170 | % ---------------------------------------------------------- 171 | 172 | a = ones(1,model_order); 173 | E(1)=R(1); 174 | for i=1:model_order 175 | a_past(1:i-1) = a(1:i-1); 176 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 177 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 178 | a(i)=rcoeff(i); 179 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 180 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 181 | end 182 | 183 | acorr = R; 184 | refcoeff = rcoeff; 185 | lpparams = [1 -a]; 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /PESQ/comp_llr.asv: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of [1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. 
Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 
87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 
116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /PESQ/comp_llr.m: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of 
[1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_llr\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 
114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /PESQ/comp_snr.asv: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= compSNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % 6 | % This function implements the segmental signal-to-noise ratio 7 | % 
defined on page 45 of [1] (see Equation 2.12). 8 | % 9 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % SNRovl - overall SNR (dB) 14 | % SNRseg - segmental SNR (dB) 15 | % 16 | % This function returns 2 parameters. The first item is the 17 | % overall SNR for the two speech signals. The second value 18 | % is the segmental signal-to-noise ratio (1 seg-snr per 19 | % frame of input). The segmental SNR is clamped to range 20 | % between 35dB and -10dB see suggestions in [2]. 21 | % 22 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04_babble_sn10.wav','out_log.wav'); 23 | % 24 | % References: 25 | % 26 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 27 | % Objective Measures of Speech Quality. Prentice Hall 28 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 29 | % ISBN: 0-13-629056-6. 30 | % 31 | % [2] P. E. Papamichalis, Practical Approaches to Speech 32 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 33 | % ISBN: 0-13-689019-9. (see pages 179-181). 34 | % 35 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 36 | % Modified by: Philipos C. Loizou (Oct 2006) 37 | % 38 | % Copyright (c) 2006 by Philipos C. 
Loizou 39 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 40 | %------------------------------------------------------------------------- 41 | 42 | 43 | 44 | [data1, Srate1, Nbits1]= wavread(cleanFile); 45 | [data2, Srate2, Nbits2]= wavread(enhdFile); 46 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2)) 47 | error( 'The two files do not match!\n'); 48 | end 49 | 50 | len= min( length( data1), length( data2)); 51 | data1= data1( 1: len); 52 | data2= data2( 1: len); 53 | 54 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 55 | 56 | snr_mean= snr_dist; 57 | segsnr_mean= mean( segsnr_dist); 58 | 59 | 60 | 61 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 62 | 63 | % ---------------------------------------------------------------------- 64 | % Check the length of the clean and processed speech. Must be the same. 65 | % ---------------------------------------------------------------------- 66 | 67 | clean_length = length(clean_speech); 68 | processed_length = length(processed_speech); 69 | 70 | if (clean_length ~= processed_length) 71 | disp('Error: Both Speech Files must be same length.'); 72 | return 73 | end 74 | 75 | % ---------------------------------------------------------------------- 76 | % Scale both clean speech and processed speech to have same dynamic 77 | % range. 
Also remove DC component from each signal 78 | % ---------------------------------------------------------------------- 79 | 80 | %clean_speech = clean_speech - mean(clean_speech); 81 | %processed_speech = processed_speech - mean(processed_speech); 82 | 83 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 84 | 85 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 86 | 87 | % ---------------------------------------------------------------------- 88 | % Global Variables 89 | % ---------------------------------------------------------------------- 90 | 91 | %sample_rate = 8000; % default sample rate 92 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 93 | skiprate = floor(winlength/4); %60; % window skip in samples 94 | MIN_SNR = -10; % minimum SNR in dB 95 | MAX_SNR = 35; % maximum SNR in dB 96 | 97 | % ---------------------------------------------------------------------- 98 | % For each frame of input speech, calculate the Segmental SNR 99 | % ---------------------------------------------------------------------- 100 | 101 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 102 | start = 1; % starting sample 103 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 104 | 105 | for frame_count = 1: num_frames 106 | 107 | % ---------------------------------------------------------- 108 | % (1) Get the Frames for the test and reference speech. 109 | % Multiply by Hanning Window. 
110 | % ---------------------------------------------------------- 111 | 112 | clean_frame = clean_speech(start:start+winlength-1); 113 | processed_frame = processed_speech(start:start+winlength-1); 114 | clean_frame = clean_frame.*window; 115 | processed_frame = processed_frame.*window; 116 | 117 | % ---------------------------------------------------------- 118 | % (2) Compute the Segmental SNR 119 | % ---------------------------------------------------------- 120 | 121 | signal_energy = sum(clean_frame.^2); 122 | noise_energy = sum((clean_frame-processed_frame).^2); 123 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 124 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 125 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 126 | 127 | start = start + skiprate; 128 | 129 | end 130 | 131 | -------------------------------------------------------------------------------- /PESQ/comp_snr.m: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % This function implements the segmental signal-to-noise ratio 6 | % as defined in [1, p. 45] (see Equation 2.12). 7 | % 8 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % SNRovl - overall SNR (dB) 13 | % SNRseg - segmental SNR (dB) 14 | % 15 | % This function returns 2 parameters. The first item is the 16 | % overall SNR for the two speech signals. The second value 17 | % is the segmental signal-to-noise ratio (1 seg-snr per 18 | % frame of input). The segmental SNR is clamped to range 19 | % between 35dB and -10dB (see suggestions in [2]). 
20 | % 21 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav') 22 | % 23 | % References: 24 | % 25 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 26 | % Objective Measures of Speech Quality. Prentice Hall 27 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 28 | % ISBN: 0-13-629056-6. 29 | % 30 | % [2] P. E. Papamichalis, Practical Approaches to Speech 31 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 32 | % ISBN: 0-13-689019-9. (see pages 179-181). 33 | % 34 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 35 | % Modified by: Philipos C. Loizou (Oct 2006) 36 | % 37 | % Copyright (c) 2006 by Philipos C. Loizou 38 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 39 | %------------------------------------------------------------------------- 40 | 41 | if nargin ~=2 42 | fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n'); 43 | return; 44 | end 45 | 46 | [data1, Srate1, Nbits1]= wavread(cleanFile); 47 | [data2, Srate2, Nbits2]= wavread(enhdFile); 48 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2)) 49 | error( 'The two files do not match!\n'); 50 | end 51 | 52 | len= min( length( data1), length( data2)); 53 | data1= data1( 1: len); 54 | data2= data2( 1: len); 55 | 56 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 57 | 58 | snr_mean= snr_dist; 59 | segsnr_mean= mean( segsnr_dist); 60 | 61 | 62 | % ========================================================================= 63 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Scale both clean speech and processed speech to have same dynamic 79 | % range. Also remove DC component from each signal 80 | % ---------------------------------------------------------------------- 81 | 82 | %clean_speech = clean_speech - mean(clean_speech); 83 | %processed_speech = processed_speech - mean(processed_speech); 84 | 85 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 86 | 87 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 88 | 89 | % ---------------------------------------------------------------------- 90 | % Global Variables 91 | % ---------------------------------------------------------------------- 92 | 93 | 94 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 95 | skiprate = floor(winlength/4); %60; % window skip in samples 96 | MIN_SNR = -10; % minimum SNR in dB 97 | MAX_SNR = 35; % maximum SNR in dB 98 | 99 | % ---------------------------------------------------------------------- 100 | % For each frame of input speech, calculate the Segmental SNR 101 | % ---------------------------------------------------------------------- 102 | 103 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 104 | start = 1; % starting sample 105 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 106 | 107 | for frame_count = 1: num_frames 108 | 109 | % ---------------------------------------------------------- 110 | % (1) Get the Frames for the test and reference speech. 111 | % Multiply by Hanning Window. 
112 | % ---------------------------------------------------------- 113 | 114 | clean_frame = clean_speech(start:start+winlength-1); 115 | processed_frame = processed_speech(start:start+winlength-1); 116 | clean_frame = clean_frame.*window; 117 | processed_frame = processed_frame.*window; 118 | 119 | % ---------------------------------------------------------- 120 | % (2) Compute the Segmental SNR 121 | % ---------------------------------------------------------- 122 | 123 | signal_energy = sum(clean_frame.^2); 124 | noise_energy = sum((clean_frame-processed_frame).^2); 125 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 126 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 127 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 128 | 129 | start = start + skiprate; 130 | 131 | end 132 | 133 | -------------------------------------------------------------------------------- /PESQ/comp_wss.asv: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). The intensities within each critical band are 11 | % measured. Then, a weighted distances between the measured 12 | % slopes of the log-critical band spectra are computed. 13 | % This measure is also described in Section 2.2.9 (pages 56-58) 14 | % of [2]. 
15 | % 16 | % Whereas Klatt's original measure used 36 critical-band 17 | % filters to estimate the smoothed short-time spectrum, this 18 | % implementation considers a bank of 25 filters spanning 19 | % the 4 kHz bandwidth. 20 | % 21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav) 22 | % 23 | % cleanFile.wav - clean input file in .wav format 24 | % enhancedFile - enhanced output file in .wav format 25 | % wss_dist - computed spectral slope distance 26 | % 27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav') 28 | % 29 | % References: 30 | % 31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance 32 | % from Critical-Band Spectra: A First Step", Proc. IEEE 33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982. 34 | % 35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 36 | % Objective Measures of Speech Quality. Prentice Hall 37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 38 | % ISBN: 0-13-629056-6. 39 | % 40 | % Authors: 41 | % 42 | % Bryan L. Pellom and John H. L. 
Hansen 43 | % Robust Speech Processing Laboratory, Duke University 44 | % Department of Electrical Engineeering 45 | % 46 | % Last Modified: 47 | % 48 | % July 22, 1998 49 | % 50 | % ---------------------------------------------------------------------- 51 | 52 | 53 | alpha= 0.95; 54 | 55 | [data1, Srate1, Nbits1]= wavread(cleanFile); 56 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 58 | error( 'The two files do not match!\n'); 59 | end 60 | 61 | len= min( length( data1), length( data2)); 62 | data1= data1( 1: len)+eps; 63 | data2= data2( 1: len)+eps; 64 | 65 | wss_dist_vec= wss( data1, data2,Srate1); 66 | wss_dist_vec= sort( wss_dist_vec); 67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 68 | 69 | 70 | 71 | function distortion = wss(clean_speech, processed_speech,sample_rate) 72 | 73 | 74 | % ---------------------------------------------------------------------- 75 | % Check the length of the clean and processed speech. Must be the same. 
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | %ample_rate = 8000; % default sample rate 93 | %winlength = 240; % window length in samples 94 | %skiprate = 60; % window skip in samples 95 | winlength = round(30*sample_rate/1000); % window length in samples 96 | skiprate = floor(winlength/4); % window skip in samples 97 | max_freq = sample_rate/2; % maximum bandwidth 98 | num_crit = 25; % number of critical bands 99 | 100 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 101 | %n_fft = 512; % FFT size 102 | n_fft = 2^nextpow2(2*winlength); 103 | n_fftby2 = n_fft/2; % FFT size/2 104 | Kmax = 20; % value suggested by Klatt, pg 1280 105 | Klocmax = 1; % value suggested by Klatt, pg 1280 106 | 107 | % ---------------------------------------------------------------------- 108 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 109 | % ---------------------------------------------------------------------- 110 | 111 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 112 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 113 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 114 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 115 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 116 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 117 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 118 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 119 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 120 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 121 | cent_freq(11) = 798.717; bandwidth(11) = 
105.411; 122 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 123 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 124 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 125 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 126 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 127 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 128 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 129 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 130 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 131 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 132 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 133 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 134 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 135 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 136 | 137 | bw_min = bandwidth (1); % minimum critical bandwidth 138 | 139 | % ---------------------------------------------------------------------- 140 | % Set up the critical band filters. Note here that Gaussianly shaped 141 | % filters are used. Also, the sum of the filter weights are equivalent 142 | % for each critical band filter. Filter less than -30 dB and set to 143 | % zero. 
144 | % ---------------------------------------------------------------------- 145 | 146 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 147 | 148 | for i = 1:num_crit 149 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 150 | all_f0(i) = floor(f0); 151 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 152 | norm_factor = log(bw_min) - log(bandwidth(i)); 153 | j = 0:1:n_fftby2-1; 154 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 155 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 156 | end 157 | 158 | % ---------------------------------------------------------------------- 159 | % For each frame of input speech, calculate the Weighted Spectral 160 | % Slope Measure 161 | % ---------------------------------------------------------------------- 162 | 163 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 164 | start = 1; % starting sample 165 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 166 | 167 | for frame_count = 1:num_frames 168 | 169 | % ---------------------------------------------------------- 170 | % (1) Get the Frames for the test and reference speech. 171 | % Multiply by Hanning Window. 
172 | % ---------------------------------------------------------- 173 | 174 | clean_frame = clean_speech(start:start+winlength-1); 175 | processed_frame = processed_speech(start:start+winlength-1); 176 | clean_frame = clean_frame.*window; 177 | processed_frame = processed_frame.*window; 178 | 179 | % ---------------------------------------------------------- 180 | % (2) Compute the Power Spectrum of Clean and Processed 181 | % ---------------------------------------------------------- 182 | 183 | if (USE_FFT_SPECTRUM) 184 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 185 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 186 | else 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(clean_frame,10); 189 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | 191 | a_vec = zeros(1,n_fft); 192 | a_vec(1:11) = lpc(processed_frame,10); 193 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 194 | end 195 | 196 | % ---------------------------------------------------------- 197 | % (3) Compute Filterbank Output Energies (in dB scale) 198 | % ---------------------------------------------------------- 199 | 200 | for i = 1:num_crit 201 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 202 | .*crit_filter(i,:)'); 203 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 204 | .*crit_filter(i,:)'); 205 | end 206 | clean_energy = 10*log10(max(clean_energy,1E-10)); 207 | processed_energy = 10*log10(max(processed_energy,1E-10)); 208 | 209 | % ---------------------------------------------------------- 210 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 211 | % ---------------------------------------------------------- 212 | 213 | clean_slope = clean_energy(2:num_crit) - ... 214 | clean_energy(1:num_crit-1); 215 | processed_slope = processed_energy(2:num_crit) - ... 216 | processed_energy(1:num_crit-1); 217 | 218 | % ---------------------------------------------------------- 219 | % (5) Find the nearest peak locations in the spectra to 220 | % each critical band. 
If the slope is negative, we 221 | % search to the left. If positive, we search to the 222 | % right. 223 | % ---------------------------------------------------------- 224 | 225 | for i = 1:num_crit-1 226 | 227 | % find the peaks in the clean speech signal 228 | 229 | if (clean_slope(i)>0) % search to the right 230 | n = i; 231 | while ((n<num_crit) & (clean_slope(n)>0)) 232 | n = n+1; 233 | end 234 | clean_loc_peak(i) = clean_energy(n-1); 235 | else % search to the left 236 | n = i; 237 | while ((n>0) & (clean_slope(n) <= 0)) 238 | n = n-1; 239 | end 240 | clean_loc_peak(i) = clean_energy(n+1); 241 | end 242 | 243 | % find the peaks in the processed speech signal 244 | 245 | if (processed_slope(i)>0) % search to the right 246 | n = i; 247 | while ((n<num_crit) & (processed_slope(n)>0)) 248 | n = n+1; 249 | end 250 | processed_loc_peak(i) = processed_energy(n-1); 251 | else % search to the left 252 | n = i; 253 | while ((n>0) & (processed_slope(n) <= 0)) 254 | n = n-1; 255 | end 256 | processed_loc_peak(i) = processed_energy(n+1); 257 | end 258 | 259 | end 260 | 261 | % ---------------------------------------------------------- 262 | % (6) Compute the WSS Measure for this frame. This 263 | % includes determination of the weighting function. 264 | % ---------------------------------------------------------- 265 | 266 | dBMax_clean = max(clean_energy); 267 | dBMax_processed = max(processed_energy); 268 | 269 | % The weights are calculated by averaging individual 270 | % weighting factors from the clean and processed frame. 271 | % These weights W_clean and W_processed should range 272 | % from 0 to 1 and place more emphasis on spectral 273 | % peaks and less emphasis on slope differences in spectral 274 | % valleys. This procedure is described on page 1280 of 275 | % Klatt's 1982 ICASSP paper. 276 | 277 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ... 278 | clean_energy(1:num_crit-1)); 279 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
280 | clean_energy(1:num_crit-1)); 281 | W_clean = Wmax_clean .* Wlocmax_clean; 282 | 283 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ... 284 | processed_energy(1:num_crit-1)); 285 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ... 286 | processed_energy(1:num_crit-1)); 287 | W_processed = Wmax_processed .* Wlocmax_processed; 288 | 289 | W = (W_clean + W_processed)./2.0; 290 | 291 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ... 292 | processed_slope(1:num_crit-1)).^2); 293 | 294 | % this normalization is not part of Klatt's paper, but helps 295 | % to normalize the measure. Here we scale the measure by the 296 | % sum of the weights. 297 | 298 | distortion(frame_count) = distortion(frame_count)/sum(W); 299 | 300 | start = start + skiprate; 301 | 302 | end 303 | 304 | -------------------------------------------------------------------------------- /PESQ/comp_wss.m: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). The intensities within each critical band are 11 | % measured. Then, a weighted distances between the measured 12 | % slopes of the log-critical band spectra are computed. 13 | % This measure is also described in Section 2.2.9 (pages 56-58) 14 | % of [2]. 15 | % 16 | % Whereas Klatt's original measure used 36 critical-band 17 | % filters to estimate the smoothed short-time spectrum, this 18 | % implementation considers a bank of 25 filters spanning 19 | % the 4 kHz bandwidth. 
20 | % 21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav) 22 | % 23 | % cleanFile.wav - clean input file in .wav format 24 | % enhancedFile - enhanced output file in .wav format 25 | % wss_dist - computed spectral slope distance 26 | % 27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav') 28 | % 29 | % References: 30 | % 31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance 32 | % from Critical-Band Spectra: A First Step", Proc. IEEE 33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982. 34 | % 35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 36 | % Objective Measures of Speech Quality. Prentice Hall 37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 38 | % ISBN: 0-13-629056-6. 39 | % 40 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 41 | % Modified by: Philipos C. Loizou (Oct 2006) 42 | % 43 | % Copyright (c) 2006 by Philipos C. Loizou 44 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 45 | % 46 | % ---------------------------------------------------------------------- 47 | if nargin~=2 48 | fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n'); 49 | fprintf('For more help, type: help comp_wss\n\n'); 50 | return; 51 | end 52 | 53 | alpha= 0.95; 54 | 55 | [data1, Srate1, Nbits1]= wavread(cleanFile); 56 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 58 | error( 'The two files do not match!\n'); 59 | end 60 | 61 | len= min( length( data1), length( data2)); 62 | data1= data1( 1: len)+eps; 63 | data2= data2( 1: len)+eps; 64 | 65 | wss_dist_vec= wss( data1, data2,Srate1); 66 | wss_dist_vec= sort( wss_dist_vec); 67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 68 | 69 | 70 | 71 | function distortion = wss(clean_speech, processed_speech,sample_rate) 72 | 73 | 74 | % ---------------------------------------------------------------------- 75 | % Check the length of the clean and processed speech. Must be the same. 
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | winlength = round(30*sample_rate/1000); % window length in samples 93 | skiprate = floor(winlength/4); % window skip in samples 94 | max_freq = sample_rate/2; % maximum bandwidth 95 | num_crit = 25; % number of critical bands 96 | 97 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 98 | n_fft = 2^nextpow2(2*winlength); 99 | n_fftby2 = n_fft/2; % FFT size/2 100 | Kmax = 20; % value suggested by Klatt, pg 1280 101 | Klocmax = 1; % value suggested by Klatt, pg 1280 102 | 103 | % ---------------------------------------------------------------------- 104 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 105 | % ---------------------------------------------------------------------- 106 | 107 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 108 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 109 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 110 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 111 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 112 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 113 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 114 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 115 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 116 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 117 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 118 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 119 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 120 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 121 | 
cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 122 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 123 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 124 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 125 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 126 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 127 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 128 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 129 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 130 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 131 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 132 | 133 | bw_min = bandwidth (1); % minimum critical bandwidth 134 | 135 | % ---------------------------------------------------------------------- 136 | % Set up the critical band filters. Note here that Gaussianly shaped 137 | % filters are used. Also, the sum of the filter weights are equivalent 138 | % for each critical band filter. Filter less than -30 dB and set to 139 | % zero. 140 | % ---------------------------------------------------------------------- 141 | 142 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 143 | 144 | for i = 1:num_crit 145 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 146 | all_f0(i) = floor(f0); 147 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 148 | norm_factor = log(bw_min) - log(bandwidth(i)); 149 | j = 0:1:n_fftby2-1; 150 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 151 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 152 | end 153 | 154 | % ---------------------------------------------------------------------- 155 | % For each frame of input speech, calculate the Weighted Spectral 156 | % Slope Measure 157 | % ---------------------------------------------------------------------- 158 | 159 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 160 | start = 1; % starting sample 161 | window = 0.5*(1 - 
cos(2*pi*(1:winlength)'/(winlength+1))); 162 | 163 | for frame_count = 1:num_frames 164 | 165 | % ---------------------------------------------------------- 166 | % (1) Get the Frames for the test and reference speech. 167 | % Multiply by Hanning Window. 168 | % ---------------------------------------------------------- 169 | 170 | clean_frame = clean_speech(start:start+winlength-1); 171 | processed_frame = processed_speech(start:start+winlength-1); 172 | clean_frame = clean_frame.*window; 173 | processed_frame = processed_frame.*window; 174 | 175 | % ---------------------------------------------------------- 176 | % (2) Compute the Power Spectrum of Clean and Processed 177 | % ---------------------------------------------------------- 178 | 179 | if (USE_FFT_SPECTRUM) 180 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 181 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 182 | else 183 | a_vec = zeros(1,n_fft); 184 | a_vec(1:11) = lpc(clean_frame,10); 185 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 186 | 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(processed_frame,10); 189 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | end 191 | 192 | % ---------------------------------------------------------- 193 | % (3) Compute Filterbank Output Energies (in dB scale) 194 | % ---------------------------------------------------------- 195 | 196 | for i = 1:num_crit 197 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 198 | .*crit_filter(i,:)'); 199 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 200 | .*crit_filter(i,:)'); 201 | end 202 | clean_energy = 10*log10(max(clean_energy,1E-10)); 203 | processed_energy = 10*log10(max(processed_energy,1E-10)); 204 | 205 | % ---------------------------------------------------------- 206 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 207 | % ---------------------------------------------------------- 208 | 209 | clean_slope = clean_energy(2:num_crit) - ... 
210 | clean_energy(1:num_crit-1); 211 | processed_slope = processed_energy(2:num_crit) - ... 212 | processed_energy(1:num_crit-1); 213 | 214 | % ---------------------------------------------------------- 215 | % (5) Find the nearest peak locations in the spectra to 216 | % each critical band. If the slope is negative, we 217 | % search to the left. If positive, we search to the 218 | % right. 219 | % ---------------------------------------------------------- 220 | 221 | for i = 1:num_crit-1 222 | 223 | % find the peaks in the clean speech signal 224 | 225 | if (clean_slope(i)>0) % search to the right 226 | n = i; 227 | while ((n<num_crit) & (clean_slope(n)>0)) 228 | n = n+1; 229 | end 230 | clean_loc_peak(i) = clean_energy(n-1); 231 | else % search to the left 232 | n = i; 233 | while ((n>0) & (clean_slope(n) <= 0)) 234 | n = n-1; 235 | end 236 | clean_loc_peak(i) = clean_energy(n+1); 237 | end 238 | 239 | % find the peaks in the processed speech signal 240 | 241 | if (processed_slope(i)>0) % search to the right 242 | n = i; 243 | while ((n<num_crit) & (processed_slope(n)>0)) 244 | n = n+1; 245 | end 246 | processed_loc_peak(i) = processed_energy(n-1); 247 | else % search to the left 248 | n = i; 249 | while ((n>0) & (processed_slope(n) <= 0)) 250 | n = n-1; 251 | end 252 | processed_loc_peak(i) = processed_energy(n+1); 253 | end 254 | 255 | end 256 | 257 | % ---------------------------------------------------------- 258 | % (6) Compute the WSS Measure for this frame. This 259 | % includes determination of the weighting function. 260 | % ---------------------------------------------------------- 261 | 262 | dBMax_clean = max(clean_energy); 263 | dBMax_processed = max(processed_energy); 264 | 265 | % The weights are calculated by averaging individual 266 | % weighting factors from the clean and processed frame. 267 | % These weights W_clean and W_processed should range 268 | % from 0 to 1 and place more emphasis on spectral 269 | % peaks and less emphasis on slope differences in spectral 270 | % valleys.
This procedure is described on page 1280 of 271 | % Klatt's 1982 ICASSP paper. 272 | 273 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ... 274 | clean_energy(1:num_crit-1)); 275 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ... 276 | clean_energy(1:num_crit-1)); 277 | W_clean = Wmax_clean .* Wlocmax_clean; 278 | 279 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ... 280 | processed_energy(1:num_crit-1)); 281 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ... 282 | processed_energy(1:num_crit-1)); 283 | W_processed = Wmax_processed .* Wlocmax_processed; 284 | 285 | W = (W_clean + W_processed)./2.0; 286 | 287 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ... 288 | processed_slope(1:num_crit-1)).^2); 289 | 290 | % this normalization is not part of Klatt's paper, but helps 291 | % to normalize the measure. Here we scale the measure by the 292 | % sum of the weights. 293 | 294 | distortion(frame_count) = distortion(frame_count)/sum(W); 295 | 296 | start = start + skiprate; 297 | 298 | end 299 | 300 | -------------------------------------------------------------------------------- /PESQ/crude_align.m: -------------------------------------------------------------------------------- 1 | function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ... 
2 | deg_Nsamples, Utt_id) 3 | 4 | global Downsample 5 | global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst 6 | global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst 7 | global Utt_Delay Utt_DelayConf Utt_Start Utt_End 8 | global MAXNUTTERANCES WHOLE_SIGNAL 9 | global pesq_mos subj_mos cond_nr 10 | 11 | if (Utt_id== WHOLE_SIGNAL ) 12 | nr = floor( ref_Nsamples/ Downsample); 13 | nd = floor( deg_Nsamples/ Downsample); 14 | startr= 1; 15 | startd= 1; 16 | elseif Utt_id== MAXNUTTERANCES 17 | startr= UttSearch_Start(MAXNUTTERANCES); 18 | startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample; 19 | if ( startd< 0 ) 20 | startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample; 21 | startd= 1; 22 | end 23 | 24 | nr= UttSearch_End(MAXNUTTERANCES)- startr; 25 | nd= nr; 26 | 27 | if( startd+ nd> floor( deg_Nsamples/ Downsample) ) 28 | nd= floor( deg_Nsamples/ Downsample)- startd; 29 | end 30 | % fprintf( 'nr,nd is %d,%d\n', nr, nd); 31 | 32 | else 33 | startr= UttSearch_Start(Utt_id); 34 | startd= startr+ Crude_DelayEst/ Downsample; 35 | 36 | if ( startd< 0 ) 37 | startr= 1- Crude_DelayEst/ Downsample; 38 | startd= 1; 39 | end 40 | 41 | nr= UttSearch_End(Utt_id)- startr; 42 | nd = nr; 43 | if( startd+ nd> floor( deg_Nsamples/ Downsample)+ 1) 44 | nd = floor( deg_Nsamples/ Downsample)- startd+ 1; 45 | end 46 | end 47 | 48 | max_Y= 0.0; 49 | I_max_Y= nr; 50 | if( (nr> 1) && (nd> 1) ) 51 | Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd); 52 | [max_Y, I_max_Y]= max( Y); 53 | if (max_Y<= 0) 54 | max_Y= 0; 55 | I_max_Y= nr; 56 | end 57 | end 58 | 59 | % fprintf( 'max_Y, I_max_Y is %f, %d\n', max_Y, I_max_Y); 60 | 61 | if( Utt_id== WHOLE_SIGNAL ) 62 | Crude_DelayEst= (I_max_Y- nr)* Downsample; 63 | Crude_DelayConf= 0.0; 64 | % fprintf( 1, 'I_max_Y, nr, Crude_DelayEst is %f, %f, %f\n', ... 65 | % I_max_Y, nr, Crude_DelayEst); 66 | elseif( Utt_id == MAXNUTTERANCES ) 67 | Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ... 
68 | Utt_DelayEst(MAXNUTTERANCES); 69 | % fprintf( 'startr, startd, nr, nd, I_max, Utt_Delay[%d] is %d, %d, %d, %d, %d, %d\n', ... 70 | % MAXNUTTERANCES, startr, startd, nr, nd, ... 71 | % I_max_Y, Utt_Delay(MAXNUTTERANCES) ); 72 | else 73 | % fprintf( 'I_max_Y, nr is %d, %d\n', I_max_Y, nr); 74 | Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ... 75 | Crude_DelayEst; 76 | end 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /PESQ/enhanced.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/PESQ/enhanced.wav -------------------------------------------------------------------------------- /PESQ/fix_power_level.asv: -------------------------------------------------------------------------------- 1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples) 2 | % this function is used for level normalization, i.e., to fix the power 3 | % level of data to a preset number, and return it to mod_data. 4 | 5 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs 6 | global TARGET_AVG_POWER 7 | TARGET_AVG_POWER= 1e7; 8 | 9 | %Este filtro no coincide con el que propone el estandar (se ve mejor en 10 | %tiempo de ejecucion, por cierto). 11 | align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500; 12 | 250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0; 13 | 800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0; 14 | 3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500]; 15 | 16 | align_filtered= apply_filter( data, data_Nsamples, align_filter_dB); 17 | power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ... 18 | data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ... 
19 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 20 | 21 | global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz); 22 | % fprintf( 1, '\tglobal_scale is %f\n', global_scale); 23 | mod_data= data* global_scale; 24 | -------------------------------------------------------------------------------- /PESQ/fix_power_level.m: -------------------------------------------------------------------------------- 1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples) 2 | % this function is used for level normalization, i.e., to fix the power 3 | % level of data to a preset number, and return it to mod_data. 4 | 5 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs 6 | global TARGET_AVG_POWER 7 | TARGET_AVG_POWER= 1e7; 8 | 9 | %Este filtro no coincide con el que propone el estandar (se ve mejor en 10 | %tiempo de ejecucion, por cierto). 11 | align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500; 12 | 250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0; 13 | 800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0; 14 | 3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500]; 15 | 16 | align_filtered= apply_filter( data, data_Nsamples, align_filter_dB); 17 | %Calcula la potencia (energia por muestra) de la segnal filtrada 18 | power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ... 19 | data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ... 
20 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 21 | 22 | %Calcula la ganancia necesaria para que la senal tenga una potencia (en la 23 | %banda de interes) igual a la indicada por el estandar 24 | global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz); 25 | % fprintf( 1, '\tglobal_scale is %f\n', global_scale); 26 | mod_data= data* global_scale; 27 | -------------------------------------------------------------------------------- /PESQ/id_searchwindows.m: -------------------------------------------------------------------------------- 1 | function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples); 2 | 3 | global MINUTTLENGTH Downsample MINUTTLENGTH SEARCHBUFFER 4 | global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End 5 | 6 | Utt_num = 1; 7 | speech_flag = 0; 8 | 9 | VAD_length= floor( ref_Nsamples/ Downsample); 10 | del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample; 11 | del_deg_end= floor((deg_Nsamples- Crude_DelayEst)/ Downsample)-... 12 | MINUTTLENGTH; 13 | 14 | for count= 1: VAD_length 15 | VAD_value= ref_VAD(count); 16 | if( (VAD_value> 0) && (speech_flag== 0) ) 17 | speech_flag= 1; 18 | this_start= count; 19 | UttSearch_Start(Utt_num)= count- SEARCHBUFFER; 20 | if( UttSearch_Start(Utt_num)< 0 ) 21 | UttSearch_Start(Utt_num)= 0; 22 | end 23 | end 24 | 25 | if( ((VAD_value== 0) || (count == (VAD_length-1))) && ... 26 | (speech_flag == 1) ) 27 | speech_flag = 0; 28 | UttSearch_End(Utt_num) = count + SEARCHBUFFER; 29 | if( UttSearch_End(Utt_num) > VAD_length - 1 ) 30 | UttSearch_End(Utt_num) = VAD_length -1; 31 | end 32 | 33 | if( ((count - this_start) >= MINUTTLENGTH) &&... 34 | (this_start < del_deg_end) &&... 
35 | (count > del_deg_start) ) 36 | Utt_num= Utt_num + 1; 37 | end 38 | end 39 | end 40 | Utt_num= Utt_num- 1; 41 | Nutterances = Utt_num; 42 | 43 | % fprintf( 1, 'Nutterances is %d\n', Nutterances); 44 | 45 | % fid= fopen( 'mat_utt.txt', 'wt'); 46 | % fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances)); 47 | % fprintf( fid, '\n'); 48 | % fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances)); 49 | % fclose(fid); 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /PESQ/id_utterances.m: -------------------------------------------------------------------------------- 1 | function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples) 2 | 3 | global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst 4 | global Downsample SEARCHBUFFER Nutterances Utt_Start 5 | global Utt_End Utt_Delay 6 | 7 | Utt_num = 1; 8 | speech_flag = 0; 9 | VAD_length = floor( ref_Nsamples / Downsample); 10 | % fprintf( 1, 'VAD_length is %d\n', VAD_length); 11 | 12 | del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample; 13 | del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ... 14 | - MINUTTLENGTH; 15 | 16 | for count = 1: VAD_length 17 | VAD_value = ref_VAD(count); 18 | if( (VAD_value > 0.0) && (speech_flag == 0) ) 19 | speech_flag = 1; 20 | this_start = count; 21 | Utt_Start (Utt_num) = count; 22 | end 23 | 24 | if( ((VAD_value == 0) || (count == VAD_length)) && ... 25 | (speech_flag == 1) ) 26 | speech_flag = 0; 27 | Utt_End (Utt_num) = count; 28 | 29 | if( ((count - this_start) >= MINUTTLENGTH) && ... 30 | (this_start < del_deg_end) && ... 
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
% ID_UTTERANCES  Fix the final utterance boundaries after time alignment.
%
% Re-scans the reference VAD track to find utterance start/end frames, then
% adjusts the boundaries so that (a) the first/last utterances stay inside
% the search buffers, and (b) consecutive utterances never overlap once each
% one's own delay (Utt_Delay) is applied.  Communicates entirely through
% globals: reads Nutterances and Utt_Delay, writes Utt_Start, Utt_End and
% Largest_uttsize.
%
% Inputs:
%   ref_Nsamples - number of reference samples
%   ref_VAD      - per-frame VAD values of the reference signal
%   deg_Nsamples - number of degraded samples

global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay

Utt_num = 1;
speech_flag = 0;
VAD_length = floor( ref_Nsamples / Downsample);

% Usable frame range of the degraded signal under the crude delay estimate.
del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
    - MINUTTLENGTH;

% Pass 1: detect speech onsets/offsets exactly as in id_searchwindows, but
% without the SEARCHBUFFER padding.
for count = 1: VAD_length
    VAD_value = ref_VAD(count);
    if( (VAD_value > 0.0) && (speech_flag == 0) )
        speech_flag = 1;
        this_start = count;
        Utt_Start (Utt_num) = count;
    end

    if( ((VAD_value == 0) || (count == VAD_length)) && ...
            (speech_flag == 1) )
        speech_flag = 0;
        Utt_End (Utt_num) = count;

        if( ((count - this_start) >= MINUTTLENGTH) && ...
                (this_start < del_deg_end) && ...
                (count > del_deg_start) )
            Utt_num = Utt_num + 1;
        end
    end
end

% Pin the overall span to just inside the leading/trailing search buffers.
Utt_Start(1) = SEARCHBUFFER+ 1;
Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;

% Pass 2: make consecutive utterances share a single boundary frame at the
% midpoint of the gap between them.
for Utt_num = 2: Nutterances
    this_start = Utt_Start(Utt_num)- 1;
    last_end = Utt_End(Utt_num - 1)- 1;
    count = floor( (this_start + last_end) / 2);
    Utt_Start(Utt_num) = count+ 1;
    Utt_End(Utt_num - 1) = count+ 1;
end

% Pass 3: once each utterance's own delay is applied to the degraded signal,
% keep the first utterance from starting before the search buffer ...
this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
if( this_start < (SEARCHBUFFER * Downsample) )
    count = SEARCHBUFFER + floor( ...
        (Downsample - 1 - Utt_Delay(1)) / Downsample);
    Utt_Start(1) = count+ 1;
end

% ... and the last utterance from running past the trailing buffer.
last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
    Utt_Delay(Nutterances);
if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
    count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
        - SEARCHBUFFER;
    Utt_End(Nutterances) = count+ 1;
end

% Pass 4: if per-utterance delays make neighbours overlap in the degraded
% signal, split the overlap at its midpoint.
for Utt_num = 2: Nutterances
    this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
    last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
    if( this_start < last_end )
        count = floor( (this_start + last_end) / 2);
        this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
            / Downsample);
        last_end = floor( (count - Utt_Delay(Utt_num - 1))...
            / Downsample);
        Utt_Start(Utt_num) = this_start+ 1;
        Utt_End(Utt_num- 1) = last_end+ 1;
    end
end

% Longest utterance, in VAD frames (used to size buffers downstream).
Largest_uttsize= max( Utt_End- Utt_Start);
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples)
% INPUT_FILTER  Condition both signals before the VAD/alignment stage.
%
% Applies the same two-step conditioning to the reference and the degraded
% signal: DC removal (DC_block) followed by the standard input filtering
% (apply_filters).  Returns the two conditioned signals in the same order
% as the inputs.

% Each signal is processed independently; the other signal's length plays
% no role in its conditioning.
mod_ref_data= apply_filters( DC_block( ref_data, ref_Nsamples), ref_Nsamples);
mod_deg_data= apply_filters( DC_block( deg_data, deg_Nsamples), deg_Nsamples);
Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | %Referencias frente a trasteo ;D 36 | % pesq('sp04.wav','sp04_babble_sn10.wav') 37 | % 2.4634 38 | % pesq('sp04.wav','enhanced.wav') 39 | % 2.5658 40 | 41 | if nargin<2 42 | fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n'); 43 | return; 44 | end; 45 | 46 | %Establecemos las siguientes variables globales 47 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL 48 | global Align_Nfft Window 49 | 50 | %Leemos el WAV de REFERENCIA (y obtenemos la frec. de muestreo) 51 | [ref_data,sampling_rate,nbits]= wavread( ref_wav); 52 | if sampling_rate~=8000 & sampling_rate~=16000 53 | error('Sampling frequency needs to be either 8000 or 16000 Hz'); 54 | end 55 | %Leemos el WAV de TEST (ignoramos los datos de fec. muestreo) 56 | deg_data= wavread( deg_wav); 57 | 58 | %Establecemos un conjunto de variables Globales, que dependen de la 59 | %frecuencia de muestreo 60 | setup_global( sampling_rate); 61 | %Esta funcion se encarga de definir las siguientes variables globales 62 | %fundamentales que dependen de la frecuencia de muestreo 63 | 64 | 65 | %Align_Nfft define el tamano de la ventana FFT (512 para 8Khz y 1024 para 66 | %16kHz) 67 | TWOPI= 6.28318530717959; 68 | count=0:Align_Nfft- 1; 69 | Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft)); 70 | % Equivalente a: 71 | % Window= hann( Align_Nfft); %Hanning window 72 | 73 | %Prepara las senales de referencia y degradada 74 | %Duda: reescala las senales a 16 bits (15 bits amp. 1 de signo), porque? 
75 | %Mete un buffer de busqueda al principio y otro al final, mas 320 76 | %milisegundos de padding 77 | ref_data= ref_data'; 78 | ref_data= ref_data* 32768; %2^15 79 | ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample; 80 | ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)]; 81 | 82 | deg_data= deg_data'; 83 | deg_data= deg_data* 32768; %2^15 84 | deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample; 85 | deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)]; 86 | 87 | maxNsamples= max( ref_Nsamples, deg_Nsamples); 88 | 89 | %Las dos senales deben de tener un nivel de ganancia parecido. Para 90 | %igualarlo se calcula la potencia de las senales. Pero solo se considera la 91 | %region del espectro con voz. Aqui desde 300 a 3Khz, aunque esto NO 92 | %COINCIDE con el standar. 93 | ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples); 94 | deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples); 95 | 96 | %Aplica un filtrado que simula la respuesta en frecuencia de un dispositivo 97 | %telefonico estandard. 98 | standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 99 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 100 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 101 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 102 | 103 | ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB); 104 | deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB); 105 | 106 | 107 | % Salvaguardamos los datos para el modelado perceptual 108 | % Mas adelante las variables model_ref y mode_deg se vuelven a volcar sobre 109 | % ref_data y deg_data 110 | model_ref= ref_data; 111 | model_deg= deg_data; 112 | 113 | %Realmente no tengo ni idea de que diablos se le hace aqui a la senal. 
Se 114 | %le toca el DC offset y se filtra por algo (pero es muy dificil imaginar 115 | %que). Supongo que se acomoda la senal para la siguiente etapa (han copiado 116 | %las cocinas del codigo PESQ. 117 | [ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ... 118 | deg_Nsamples); 119 | 120 | %Aqui se calcula la envelope de la senal (log(MAX(E(k)/Ethresh,1))) 121 | %E(k) es la energia en 4 ms y Ethresh un umbral del VAD. ref_VAD se refiere 122 | %a la senal antes del logaritmo y el maximo. 123 | [ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples); 124 | [deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples); 125 | 126 | %Sobre el envelope se calcula el alineamiento en crudo. Basicamente se 127 | %calcula la correlacion cruzada entre las senales y se busca el maximo 128 | crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,WHOLE_SIGNAL); 129 | %NOTA: Los resultados se almacenan en variables globales Crude_DelayEst 130 | %y Crude_DelayConf; 131 | 132 | utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,... 133 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 134 | 135 | ref_data= model_ref; 136 | deg_data= model_deg; 137 | 138 | % make ref_data and deg_data equal length 139 | if (ref_Nsamples< deg_Nsamples) 140 | newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000); 141 | ref_data( newlen)= 0; 142 | elseif (ref_Nsamples> deg_Nsamples) 143 | newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000); 144 | deg_data( newlen)= 0; 145 | end 146 | 147 | %Tras la identificacion de las sentencias y el alineado se procede a la 148 | %evaluacion objetiva del mos mediante un modelo psicoacustico. 149 | 150 | pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ... 
function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
% PESQ objective speech quality measure
%
% Implements the PESQ measure based on ITU-T Recommendation P.862 [1].
%
% Usage:   pval = pesq('cleanFile.wav', 'enhancedFile.wav')
%
%   ref_wav  - clean (reference) input file in .wav format
%   deg_wav  - degraded/enhanced output file in .wav format
%   pesq_mos - PESQ value (MOS-like score)
%
% Only sampling rates of 8 kHz and 16 kHz are supported [1].
%
% References:
%   [1] ITU-T (2000). Perceptual evaluation of speech quality (PESQ), an
%       objective method for end-to-end speech quality assessment of
%       narrowband telephone networks and speech codecs. Recommendation P.862.
%
% Authors: Yi Hu and Philipos C. Loizou
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------

% Reference values for quick regression checks:
%   pesq('sp04.wav','sp04_babble_sn10.wav')  ->  2.4634
%   pesq('sp04.wav','enhanced.wav')          ->  2.5658

if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    return;
end

% Globals shared with the rest of the PESQ implementation.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

% Read the REFERENCE wav (and obtain its sampling rate).
[ref_data,sampling_rate,nbits]= wavread( ref_wav);
% Fixed: use short-circuit && instead of element-wise & for this scalar test.
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end
% Read the TEST wav (its sampling rate is assumed equal to the reference's).
deg_data= wavread( deg_wav);

% Set the globals that depend on the sampling rate (Downsample,
% SEARCHBUFFER, Align_Nfft, ...).
setup_global( sampling_rate);

% Align_Nfft is the FFT window size (512 at 8 kHz, 1024 at 16 kHz).
TWOPI= 6.28318530717959;
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
% Equivalent to: Window= hann( Align_Nfft);  (Hanning window)

% Prepare the reference and degraded signals: rescale to the 16-bit range
% (15 amplitude bits + sign), then prepend/append a search buffer plus
% DATAPADDING_MSECS milliseconds of zero padding.
ref_data= ref_data';
ref_data= ref_data* 32768; %2^15
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

deg_data= deg_data';
deg_data= deg_data* 32768; %2^15
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

% Both signals must have a similar gain level.  The power is computed over
% the speech band only (here 300 Hz - 3 kHz; NOTE: this band does NOT match
% the standard exactly).
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% Filter both signals with the standard IRS receive characteristic
% (frequency [Hz] / gain [dB] pairs), modelling a standard telephone handset.
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);

% Keep copies for the perceptual model; ref_data/deg_data are overwritten by
% the alignment pre-processing below and restored afterwards.
model_ref= ref_data;
model_deg= deg_data;

% Condition the signals for the alignment stage: DC blocking plus the input
% filters inherited from the reference PESQ code.
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);

% Compute the VAD envelope log(max(E(k)/Ethresh, 1)), where E(k) is the
% energy per 4 ms frame and Ethresh a VAD threshold; *_VAD is the envelope
% before the log/max is applied.
[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);

% Crude alignment over the whole signal: cross-correlate the log-VAD
% envelopes and pick the maximum.  Results are stored in the globals
% Crude_DelayEst and Crude_DelayConf.
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,WHOLE_SIGNAL);

% Locate and time-align the individual utterances.
utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% Restore the (IRS-filtered) signals for the perceptual model.
ref_data= model_ref;
deg_data= model_deg;

% Zero-extend the shorter signal so ref_data and deg_data have equal length.
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end

% With the utterances identified and aligned, evaluate the objective MOS
% through the psychoacoustic model.
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );
function power= pow_of( data, start_point, end_point, divisor)
% POW_OF  Normalized power of a signal segment.
%
% Returns the sum of squared samples of data(start_point:end_point),
% divided by 'divisor' (typically the segment length, giving mean power).

segment= data( start_point: end_point);
power= sum( segment.* segment)/ divisor;
>> [snr_mean, segsnr_mean]= comp_snr(cleanFile.wav, enhdFile.wav);
measures of speech quality. Englewood Cliffs, NJ: Prentice-Hall.
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
    Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
    Utt_DelayEst_l, Utt_DelayConf_l)
% SPLIT_ALIGN  Test whether one utterance is better modelled as two.
%
% Tries up to 40 candidate breakpoints inside the utterance
% [Utt_SpeechStart, Utt_SpeechEnd].  For each breakpoint the two halves are
% crude-aligned and then fine-aligned (windowed FFT cross-correlation
% accumulated into a circular histogram H).  If some breakpoint yields two
% halves whose delays differ by at least one Downsample frame and whose
% combined confidence beats both the single-utterance confidence
% (Utt_DelayConf_l) and the best split found so far, the winning split is
% published through the globals Best_ED1/Best_D1/Best_DC1 (first half),
% Best_ED2/Best_D2/Best_DC2 (second half) and Best_BP (breakpoint).
% Slot MAXNUTTERANCES of the utterance globals is used as scratch space for
% the crude_align calls.

global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

% Per-breakpoint results: BPs = breakpoint frames, ED* = crude delay
% estimates, D* = refined delays, DC* = confidences (1 = first half,
% 2 = second half).  At most 40 breakpoints are considered.
Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);


Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;    % scratch utterance slot for crude_align
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel = Align_Nfft / 64;     % half-width of the triangular smoothing kernel
Delta = Align_Nfft / (4 * Downsample);
% Breakpoint step, chosen so that ~40 breakpoints span the utterance.
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;

% Keep breakpoints at least Pad frames away from the utterance edges.
Pad = floor( Utt_Len / 10);
if( Pad < 75 )
    Pad = 75;
end

% Generate the breakpoint candidates Utt_BPs(1..N_BPs-1).
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
    N_BPs= N_BPs+ 1;
    Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
    if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
        break;
    end
end

if( N_BPs <= 1 )
    return;  % utterance too short for any breakpoint
end

% Crude-align both halves for every breakpoint (results read back from the
% scratch slot of Utt_Delay).
for bp = 1: N_BPs- 1
    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_Start_l;
    UttSearch_End(Utt_Test) = Utt_BPs(bp);

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED1(bp) = Utt_Delay(Utt_Test);

    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_BPs(bp);
    UttSearch_End(Utt_Test) = Utt_End_l;

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED2(bp) = Utt_Delay(Utt_Test);
end

% Fine alignment of the FIRST half.  -2.0 marks "not yet processed"; each
% outer pass picks the first unprocessed breakpoint and, because H/Hsum
% accumulate across windows, the inner `while bp < N_BPs-1` loop extends the
% same histogram to later breakpoints that share the same crude delay.
Utt_DC1(1: N_BPs-1) = -2.0;
while( 1 )
    bp = 1;
    while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
        bp = bp+ 1;
    end
    if( bp >= N_BPs )
        break;
    end

    estdelay = Utt_ED1(bp);
    H(1: Align_Nfft)= 0;
    Hsum = 0.0;

    % Window positions in the reference (startr) and degraded (startd)
    % signals, stepping Align_Nfft/4 samples (75% overlap).
    startr = (Utt_Start_l- 1) * Downsample+ 1;
    startd = startr + estdelay;

    if ( startd < 0 )
        startr = -estdelay+ 1;
        startd = 1;
    end

    while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
            ((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
        % Windowed cross-correlation via FFT: ifft(conj(F(x1)).*F(x2)).
        X1= ref_data(startr: startr+ Align_Nfft- 1).* Window;
        X2= deg_data(startd: startd+ Align_Nfft- 1).* Window;

        X1_fft= fft( X1, Align_Nfft );
        X1_fft_conj= conj( X1_fft);
        X2_fft= fft( X2, Align_Nfft );
        X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

        X1= abs( X1);
        % Keep only correlation peaks above 99% of the maximum; spread each
        % peak into H with a triangular kernel (compressed by ^0.125).
        v_max= max( X1)* 0.99;
        n_max = (v_max^ 0.125 )/ kernel;

        for count = 0: Align_Nfft- 1
            if( X1(count+ 1) > v_max )
                Hsum = Hsum+ n_max * kernel;
                for k = 1-kernel: kernel- 1
                    H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                        H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                        n_max* (kernel- abs(k));
                end
            end
        end

        startr = startr+ (Align_Nfft / 4);
        startd = startd+ (Align_Nfft / 4);
    end

    % Histogram peak -> delay refinement; wrap indices above Nfft/2 to
    % negative lags.
    [v_max, I_max] = max( H);
    if( I_max- 1 >= (Align_Nfft/2) )
        I_max = I_max- Align_Nfft;
    end

    Utt_D1(bp) = estdelay + I_max- 1;
    if( Hsum > 0.0 )
        Utt_DC1(bp) = v_max / Hsum;   % peak share of total mass = confidence
    else
        Utt_DC1(bp) = 0.0;
    end

    % Extend the accumulated histogram to later breakpoints with the same
    % crude delay (windows continue from the current startr/startd).
    while( bp < (N_BPs - 1) )
        bp = bp + 1;

        if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
            while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
                    ((startr+ Align_Nfft)<= ...
                    ((Utt_BPs(bp)- 1)* Downsample+ 1) ))
                X1= ref_data( startr: startr+ Align_Nfft- 1).* ...
                    Window;
                X2= deg_data( startd: startd+ Align_Nfft- 1).* ...
                    Window;
                X1_fft= fft( X1, Align_Nfft );
                X1_fft_conj= conj( X1_fft);
                X2_fft= fft( X2, Align_Nfft );
                X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

                X1= abs( X1);
                v_max = 0.99* max( X1);
                n_max = (v_max^ 0.125)/ kernel;

                for count = 0: Align_Nfft- 1
                    if( X1(count+ 1) > v_max )
                        Hsum = Hsum+ n_max * kernel;
                        for k = 1-kernel: kernel-1
                            H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                                H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                                n_max* (kernel- abs(k));
                        end
                    end
                end

                startr = startr+ (Align_Nfft / 4);
                startd = startd+ (Align_Nfft / 4);
            end

            [v_max, I_max] = max( H);
            if( I_max- 1 >= (Align_Nfft/2) )
                I_max = I_max- Align_Nfft;
            end


            Utt_D1(bp) = estdelay + I_max- 1;
            if( Hsum > 0.0 )
                Utt_DC1(bp) = v_max / Hsum;
            else
                Utt_DC1(bp) = 0.0;
            end
        end
    end
end

% Fine alignment of the SECOND half.  Only breakpoints whose first-half
% confidence beat the single-utterance confidence are worth processing
% (marked -2.0); the rest are set to 0 and skipped.
for bp= 1: N_BPs- 1
    if( Utt_DC1(bp) > Utt_DelayConf_l )
        Utt_DC2(bp) = -2.0;
    else
        Utt_DC2(bp) = 0.0;
    end
end

% Same histogram procedure as above, but walking BACKWARDS from the end of
% the utterance towards the breakpoint.
while( 1 )
    bp = N_BPs- 1;
    while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
        bp = bp- 1;
    end
    if( bp < 1 )
        break;
    end

    estdelay = Utt_ED2(bp);
    H( 1: Align_Nfft)= 0;
    Hsum = 0.0;

    startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
    startd = startr + estdelay;

    if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
        startd = deg_Nsamples - Align_Nfft+ 1;
        startr = startd - estdelay;
    end

    while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
        X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
        X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

        X1_fft= fft( X1, Align_Nfft);
        X1_fft_conj= conj( X1_fft);
        X2_fft= fft( X2, Align_Nfft);

        X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
        X1= abs( X1);

        v_max = max( X1)* 0.99;
        n_max = ( v_max^ 0.125 )/ kernel;

        for count = 0: Align_Nfft- 1
            if( X1(count+ 1) > v_max )
                Hsum = Hsum+ n_max * kernel;
                for k = 1-kernel: kernel- 1
                    H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))= ...
                        H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                        n_max* (kernel- abs(k));
                end
            end
        end

        startr = startr- (Align_Nfft / 4);
        startd = startd- (Align_Nfft / 4);
    end

    [v_max, I_max] = max( H);
    if( I_max- 1 >= (Align_Nfft/2) )
        I_max = I_max- Align_Nfft;
    end

    Utt_D2(bp) = estdelay + I_max- 1;
    if( Hsum > 0.0 )
        Utt_DC2(bp) = v_max / Hsum;
    else
        Utt_DC2(bp) = 0.0;
    end

    % Extend backwards to earlier breakpoints sharing the same crude delay.
    while( bp > 1 )
        bp = bp - 1;
        if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
            while( (startd >= 1) && (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
                X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
                X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
                X1_fft_conj= conj( fft( X1, Align_Nfft));
                X2_fft= fft( X2, Align_Nfft);
                X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

                X1= abs( X1);
                v_max = max( X1)* 0.99;
                n_max = (v_max^ 0.125)/ kernel;

                for count = 0: Align_Nfft- 1
                    if( X1(count+ 1) > v_max )
                        Hsum = Hsum+ n_max * kernel;
                        for k = 1-kernel: kernel- 1
                            H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                                H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                                n_max* (kernel- abs(k));
                        end
                    end
                end

                startr = startr- (Align_Nfft / 4);
                startd = startd- (Align_Nfft / 4);
            end

            [v_max, I_max] = max( H);
            if( I_max- 1 >= (Align_Nfft/2) )
                I_max = I_max- Align_Nfft;
            end


            Utt_D2(bp) = estdelay + I_max- 1;
            if( Hsum > 0.0 )
                Utt_DC2(bp) = v_max / Hsum;
            else
                Utt_DC2(bp) = 0.0;
            end
        end
    end
end

% Pick the best breakpoint: the halves must disagree by at least one
% Downsample frame, both must beat the single-utterance confidence, and
% their combined confidence must beat the best split so far.
for bp = 1: N_BPs- 1
    if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
            ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
            (Utt_DC1(bp) > Utt_DelayConf_l) && ...
            (Utt_DC2(bp) > Utt_DelayConf_l) )
        Best_ED1 = Utt_ED1(bp);
        Best_D1 = Utt_D1(bp);
        Best_DC1 = Utt_DC1(bp);
        Best_ED2 = Utt_ED2(bp);
        Best_D2 = Utt_D2(bp);
        Best_DC2 = Utt_DC2(bp);
        Best_BP = Utt_BPs(bp);
    end
end
function time_align(ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples, Utt_id)
% TIME_ALIGN  Refine the delay estimate of one utterance.
%
% Starting from the crude estimate Utt_DelayEst(Utt_id), slides a windowed
% FFT cross-correlation over the utterance's search window, accumulates the
% dominant correlation peaks into a histogram H, smooths it with a
% triangular kernel, and takes the histogram maximum as the delay
% correction.  Writes the refined delay to the global Utt_Delay(Utt_id) and
% its confidence to Utt_DelayConf(Utt_id); returns nothing directly.
%
% Inputs:
%   ref_data/deg_data        - reference/degraded signals (row vectors)
%   ref_Nsamples/deg_Nsamples- their sample counts
%   Utt_id                   - index of the utterance to align

global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window

estdelay = Utt_DelayEst(Utt_id);

H = zeros( 1, Align_Nfft);
X1= zeros( 1, Align_Nfft);
X2= zeros( 1, Align_Nfft);

% Window positions in the reference (startr) and degraded (startd) signals.
startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
if ( startd < 0 )
    % Negative delay would start before the degraded signal: shift both.
    startr = 1 -estdelay;
    startd = 1;
end

% Slide the window in Align_Nfft/4 steps (75% overlap) over the search
% window, accumulating correlation peaks into H.
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
        ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
    X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % find cross-correlation between X1 and X2
    X1_fft= fft( X1, Align_Nfft );
    X1_fft_conj= conj( X1_fft);
    X2_fft= fft( X2, Align_Nfft );
    X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );

    X1= abs( X1);
    v_max = max( X1)* 0.99;

    % Add an amplitude-compressed (^0.125) vote at every lag whose
    % correlation exceeds 99% of this window's maximum.
    X1_greater_vmax= find( X1 > v_max );
    H( X1_greater_vmax )= H( X1_greater_vmax )+ v_max^ 0.125;

    startr = startr+ Align_Nfft/ 4;
    startd = startd+ Align_Nfft/ 4;

end

% Smooth the histogram by circular convolution with a triangular kernel of
% half-width Align_Nfft/64 (built directly in X2), via FFT.
X1= H;
X2= 0;
Hsum = sum( H);

X2(1) = 1.0;
kernel = Align_Nfft / 64;

for count= 2: kernel
    X2( count)= 1- (count- 1)/ kernel;
    X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end

X1_fft= fft( X1, Align_Nfft );
X2_fft= fft( X2, Align_Nfft );

X1= ifft( X1_fft.* X2_fft, Align_Nfft );

% Normalize so the peak value is the fraction of total histogram mass.
if (Hsum> 0)
    H= abs( X1)/ Hsum;
else
    H= 0;
end

% Histogram peak -> delay correction; indices above Nfft/2 wrap to
% negative lags.
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;
end

Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence
X1_fft.* X2_fft, Align_Nfft ); 58 | 59 | if (Hsum> 0) 60 | H= abs( X1)/ Hsum; 61 | else 62 | H= 0; 63 | end 64 | 65 | [v_max, I_max] = max( H); 66 | if( I_max- 1 >= (Align_Nfft/2) ) 67 | I_max = I_max- Align_Nfft; 68 | end 69 | 70 | Utt_Delay(Utt_id) = estdelay + I_max- 1; 71 | Utt_DelayConf(Utt_id) = v_max; % confidence 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /PESQ/utterance_locate.m: -------------------------------------------------------------------------------- 1 | function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,... 2 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 3 | 4 | global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst 5 | 6 | id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples); 7 | 8 | for Utt_id= 1: Nutterances 9 | %fprintf( 1, 'Utt_id is %d\n', Utt_id); 10 | crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id); 11 | time_align(ref_data, ref_Nsamples, ... 12 | deg_data, deg_Nsamples, Utt_id); 13 | end 14 | 15 | id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples); 16 | 17 | 18 | utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 19 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /PESQ/utterance_split.m: -------------------------------------------------------------------------------- 1 | function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 
2 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD) 3 | 4 | global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER 5 | global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start 6 | global Utt_Start Utt_End Largest_uttsize UttSearch_End 7 | global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP 8 | 9 | Utt_id = 1; 10 | while( (Utt_id <= Nutterances) && (Nutterances <= MAXNUTTERANCES) ) 11 | Utt_DelayEst_l = Utt_DelayEst(Utt_id); 12 | Utt_Delay_l = Utt_Delay(Utt_id); 13 | Utt_DelayConf_l = Utt_DelayConf(Utt_id); 14 | Utt_Start_l = Utt_Start(Utt_id); 15 | Utt_End_l = Utt_End(Utt_id); 16 | 17 | Utt_SpeechStart = Utt_Start_l; 18 | % fprintf( 'SpeechStart is %d\n', Utt_SpeechStart); 19 | while( (Utt_SpeechStart < Utt_End_l) && ... 20 | (ref_VAD(Utt_SpeechStart)<= 0.0) ) 21 | Utt_SpeechStart = Utt_SpeechStart + 1; 22 | end %find the SpeechStart for each utterance 23 | Utt_SpeechEnd = Utt_End_l; 24 | % fprintf( 'SpeechEnd is %d\n', Utt_SpeechEnd); 25 | while( (Utt_SpeechEnd > Utt_Start_l) && ... 26 | (ref_VAD(Utt_SpeechEnd) <= 0)) 27 | Utt_SpeechEnd = Utt_SpeechEnd- 1; 28 | end 29 | Utt_SpeechEnd = Utt_SpeechEnd+ 1; 30 | %find SpeechEnd for each utterance 31 | Utt_Len = Utt_SpeechEnd - Utt_SpeechStart; 32 | 33 | % fprintf( 'Utt_Len is %d\n', Utt_Len); 34 | 35 | if( Utt_Len >= 200 ) 36 | split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 37 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ... 38 | Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ... 39 | Utt_DelayEst_l, Utt_DelayConf_l); 40 | % fprintf( '\nBest_ED1, Best_D1, Best_DC1 is %d, %d, %f\n',... 41 | % Best_ED1, Best_D1, Best_DC1); 42 | % fprintf( 'Best_ED2, Best_D2, Best_DC2 is %d, %d, %f\n',... 
43 | % Best_ED2, Best_D2, Best_DC2); 44 | % fprintf( 'Best_BP is %d\n', Best_BP); 45 | 46 | if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) ) 47 | for step = Nutterances: -1: Utt_id+ 1 48 | Utt_DelayEst(step+ 1) = Utt_DelayEst(step); 49 | Utt_Delay(step+ 1) = Utt_Delay(step); 50 | Utt_DelayConf(step+ 1) = Utt_DelayConf(step); 51 | Utt_Start(step+ 1) = Utt_Start(step); 52 | Utt_End(step+ 1) = Utt_End(step); 53 | UttSearch_Start(step+ 1) = Utt_Start( step); 54 | UttSearch_End(step+ 1) = Utt_End( step); 55 | end 56 | 57 | Nutterances = Nutterances+ 1; 58 | 59 | Utt_DelayEst(Utt_id) = Best_ED1; 60 | Utt_Delay(Utt_id) = Best_D1; 61 | Utt_DelayConf(Utt_id) = Best_DC1; 62 | 63 | Utt_DelayEst(Utt_id +1) = Best_ED2; 64 | Utt_Delay(Utt_id +1) = Best_D2; 65 | Utt_DelayConf(Utt_id +1) = Best_DC2; 66 | 67 | UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id); 68 | UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id); 69 | if( Best_D2 < Best_D1 ) 70 | Utt_Start(Utt_id) = Utt_Start_l; 71 | Utt_End(Utt_id) = Best_BP; 72 | Utt_Start(Utt_id +1) = Best_BP; 73 | Utt_End(Utt_id +1) = Utt_End_l; 74 | else 75 | Utt_Start( Utt_id) = Utt_Start_l; 76 | Utt_End( Utt_id) = Best_BP + ... 77 | floor( (Best_D2- Best_D1)/ (2 * Downsample)); 78 | Utt_Start( Utt_id +1) = Best_BP - ... 79 | floor( (Best_D2- Best_D1)/ (2 * Downsample)); 80 | Utt_End( Utt_id +1) = Utt_End_l; 81 | end 82 | 83 | if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ... 84 | Best_D1 < 0 ) 85 | Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ... 86 | floor( (Downsample - 1 - Best_D1) / Downsample); 87 | end 88 | 89 | if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >... 90 | (deg_Nsamples - SEARCHBUFFER * Downsample) ) 91 | Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)... 
92 | / Downsample)- SEARCHBUFFER+ 1; 93 | end 94 | else 95 | Utt_id= Utt_id+ 1; 96 | end 97 | else 98 | Utt_id = Utt_id+ 1; 99 | end 100 | end 101 | 102 | Largest_uttsize = max( Utt_End- Utt_Start); 103 | 104 | % fid= fopen( 'uttinfo_mat.txt', 'wt'); 105 | % fprintf( fid, 'Number of Utterances is:\n'); 106 | % fprintf( fid, '%d\n', Nutterances); 107 | % fprintf( fid, 'Utterance Delay Estimation:\n'); 108 | % fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) ); 109 | % fprintf( fid, 'Utterance Delay:\n'); 110 | % fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances)); 111 | % fprintf( fid, 'Utterance Delay Confidence:\n'); 112 | % fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances)); 113 | % fprintf( fid, 'Utterance Start:\n'); 114 | % fprintf( fid, '%d\n', Utt_Start( 1: Nutterances)); 115 | % fprintf( fid, 'Utterance End:\n'); 116 | % fprintf( fid, '%d\n', Utt_End( 1: Nutterances)); 117 | % fprintf( fid, 'Largest utterance length:\n'); 118 | % fprintf( fid, '%d\n', Largest_uttsize); 119 | % fclose( fid); 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /PESQ/wavread.m: -------------------------------------------------------------------------------- 1 | function [data,Srate,Nbits]=wavread(filename) 2 | 3 | [data,Srate]=audioread(filename); 4 | Nbits=32; 5 | -------------------------------------------------------------------------------- /PESQ/white_noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/PESQ/white_noise.wav -------------------------------------------------------------------------------- /RETO2016_README.txt: -------------------------------------------------------------------------------- 1 | - Objetivo del reto: realzar una señal de voz multicanal. 
2 |  3 | - La calidad de la voz realzada se medirá mediante un test PESQ (proporciona una nota de 0, calidad pésima, a 5, calidad excelente) respecto a una señal limpia de referencia. Como punto de partida se tomará la calidad proporcionada por el canal central (num 8, PESQ=2.1752) y un beamformer Delay-And-Sum (PESQ=2.3741). Programas: PESQ.zip es un archivo-directorio comprimido con el programa PESQ para la evaluación de la calidad. Descomprimir en el directorio de trabajo y consultar el "readme" correspondiente para su uso. 4 |  5 | - Señales a emplear (directorio signals): 6 | * Tipo de array: lineal, 15 canales, no uniforme (espaciados d, 2*d y 4*d, d=4cm). 7 | * La señal multicanal ruidosa (ruido laboratorio) a realzar es "an103-mtms-arr4A.adc". 8 | * La señal de referencia limpia monocanal adquirida con micrófono de proximidad es "an103-mtms-senn4.adc". 9 | * Otras señales: "an10n-mtms-arr4A.adc" (n=1,2,4,5). Grabadas con el mismo array y tipo de ruido. 10 | * Las especificaciones de adquisición de las señales pueden consultarse en el fichero "README_acquisition". 11 |  12 | - Parámetros: 13 | * Fs=16000; %frecuencia de muestreo 14 | * nc=15; %numero de canales 15 | * L=400; %longitud de la STFT 16 |  17 | - Ficheros adicionales: 18 | * Leer_Array_Signals.m: programa de lectura de la señal multicanal. 19 | * offsetcomp.m: función utilizada por el programa de lectura para compensación de componentes DC. 20 | * steering_vector.mat: contiene la variable ds (15x201) con el steering vector a todas las frecuencias posibles (k=1:201). -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/README_Acquisition: -------------------------------------------------------------------------------- 1 |  2 | This directory contains multi-microphone data recorded by Tom Sullivan 3 | (tms@cs.cmu.edu) at Carnegie Mellon University. 4 |  5 | All data is sampled at 16 kHz, 16-bit linear sampling.
6 | 7 | There are 3 directories included: 8 | 9 | 15element -- Recorded at Carnegie Mellon University. 10 | 11 | 8element -- Recorded at Carnegie Mellon University. 12 | 13 | rutdata -- Recorded at Rutgers University. 14 | 15 | The contents of each directory are described individually in more detail 16 | below. 17 | 18 | 19 | 15element 20 | --------- 21 | These utterances were collected with a 15-element array. The array spacing is 22 | such that it is actually three 7-element sub-arrays (with different spacing) 23 | interleaved (ala. Jim Flanagan's array at ATT/Rutgers U.) Some elements are 24 | shared between the sub-arrays. 25 | 26 | If the minimum spacing of this array is N-cm. The array looks roughly like 27 | this: 28 | 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 30 | 31 | Where element 8 is in the exact center of the array. 32 | 33 | Elements 5, 6, 7, 8, 9, 10, and 11 are a 7 element array with a spacing of 34 | N cm between elements. 35 | 36 | Elements 3, 4, 6, 8, 10, 12 and 13 are a 7 element array with a spacing of 37 | 2*N cm between elements. 38 | 39 | Elements 1, 2, 4, 8, 12, 14 and 15 are a 7 element array with a spacing of 40 | 4*N cm between elements. 41 | 42 | Of course you can feel free to combine the elements in any manner you 43 | desire for your own experiments. We used them to study different element 44 | spacing from data that was collected simultaneously to each of the 45 | sub-arrays. 46 | 47 | Within this directory are files of the form: 48 | 49 | {utterance}-{subject}-{microphone_type}.adc 50 | 51 | {utterance} is either an* or cen*, where "*" is a number. These are 52 | alphanumeric and census utterances in the AN4 dataset used often here 53 | at CMU. 54 | 55 | {subject} is the same 4-letter code used to name the sub-directories. 56 | 57 | {microphone_type} is the type of microphone used and a key into the 58 | experimental conditions. "senn" is the Sennheiser HMD414 headset 59 | closetalking microphone, used in every set as a control. 
"arr" is a 60 | microphone array having multiple elements. All array elements are 61 | Panasonic WD-063 noise cancelling electret condenser elements. 62 | 63 | There are 6 data sets collected with this array: 64 | 65 | 1) This set was collected in a noisy computer lab at Carnegie Mellon Univ. 66 | with an array of 15-elements with a minimum spacing of 3 cm. It is denoted 67 | by "arr3A" for the 15 array elements and "senn3" for the closetalk. The 68 | subject sat one meter from the center of the array. 69 | 70 | Ex: 71 | 72 | an101-mtms-arr3A.adc (15-channel array, 3 cm minimum spacing) 73 | an101-mtms-senn3.adc (closetalking control signal for above) 74 | 75 | 2) This set was collected in the same noisy lab as above but with the 76 | 15-element array with a minimum spacing of 4 cm. It is denoted by 77 | "arr4A" and "senn4" for the closetalk. The subject sat one meter from 78 | the center of the array. 79 | 80 | Ex: 81 | 82 | an101-mtms-arr4A.adc (15-channel array, 4 cm minimum spacing) 83 | an101-mtms-senn4.adc (closetalking control signal for above) 84 | 85 | 3) This set was collected in a conference room with the 15-element array 86 | with a minimum spacing of 4 cm. The conference room is larger than the 87 | noisy lab, but didn't have all of the computer fans. It is denoted by 88 | "arrC1A" and "sennC1" for the closetalk. The subject sat one meter from 89 | the center of the array. 90 | 91 | Ex: 92 | 93 | an101-mtms-arrC1A.adc (15-channel array, 4 cm minimum spacing, 1 meter dist.) 94 | an101-mtms-sennC1.adc (closetalking control signal for above) 95 | 96 | 4) This set was collected in the same conference room as 4) above with the 97 | 15-element array with a minimum spacing of 4 cm. It is denoted by 98 | "arrC1A" and "sennC1" for the closetalk. The subject sat three meters from 99 | the center of the array. 100 | 101 | Ex: 102 | 103 | an101-mtms-arrC3A.adc (15-channel array, 4 cm minimum spacing, 3 meter dist.) 
104 | an101-mtms-sennC3.adc (closetalking control signal for above) 105 | 106 | 5) This set was collected the same conference room as above with the 107 | 15-element array with a minimum spacing of 4 cm, but also had an AM talk-radio 108 | jamming signal at approximately 45 degrees off-axis from the center of the 109 | array, competing with the speaker. It is denoted by "arrCR1A" and "sennCR1" 110 | for the closetalk. The subject sat one meter from the center of the array. 111 | 112 | Ex: 113 | 114 | an101-mtms-arrCR1A.adc (15-channel array, 4 cm minimum spacing, 1 meter 115 | dist., radio jamming signal) 116 | an101-mtms-sennCR1.adc (closetalking control signal for above) 117 | 118 | 6) This set was collected the same conference room as above with the 119 | 15-element array with a minimum spacing of 4 cm, but also had an AM talk-radio 120 | jamming signal at approximately 45 degrees off-axis from the center of the 121 | array, competing with the speaker. It is denoted by "arrCR1A" and "sennCR1" 122 | for the closetalk. The subject sat three meters from the center of the array. 123 | 124 | Ex: 125 | 126 | an101-mtms-arrCR3A.adc (15-channel array, 4 cm minimum spacing, 3 meter 127 | dist., radio jamming signal) 128 | an101-mtms-sennCR3.adc (closetalking control signal for above) 129 | 130 | 131 | 8element 132 | -------- 133 | There are ten "subject" sub-directories included. Their names are each four 134 | letters long. The first letter denotes the gender of the speaker ("m" 135 | or "f"). All data we've collected thus far used only male speakers. 136 | The final three letters are the initials of the subject. Hence "mtms" is 137 | a male speaker with the initials TMS. 138 | 139 | Within each subject's sub-directory are files of the form: 140 | 141 | {utterance}-{subject}-{microphone_type}.adc 142 | 143 | {utterance} is either an* or cen*, where "*" is a number. These are 144 | alphanumeric and census utterances in the AN4 dataset used often here 145 | at CMU. 
146 | 147 | {subject} is the same 4-letter code used to name the sub-directories. 148 | 149 | {microphone_type} is the type of microphone used and a key into the 150 | experimental conditions. "senn" is the Sennheiser HMD414 headset 151 | closetalking microphone, used in every set as a control. "arr" is a 152 | microphone area having multiple elements. All array elements are 153 | Panasonic WD-063 noise cancelling electret condenser elements. 154 | 155 | There are 3 different data sets contained in these directories. They are 156 | listed as follows: 157 | 158 | This set contains 10 male speakers each speaking 14 utterances. The 159 | microphone array used had eight (8) elements, and these utterances are 160 | denoted by "arrA" as the {microphone_type}. The 8 elements were spaced 161 | linearly and with a spacing of 7 cm between elements. The subject sat 162 | directly in front of the array at a distance of 1 meter from the center. 163 | A pair of Crown PZM6FS microphones were also used to collect a stereo pair 164 | of Crown PZM signals to compare the performance of a quality set of 165 | omnidirectional microphones to the Panasonic WD-063 array elements. 166 | The closetalking control signal is denoted by "senn" only. The set was 167 | collected in a noisy computer lab at Carnegie Mellon Univ. with many 168 | computer and disk-drive fans. 169 | 170 | Ex: 171 | 172 | an101-mtms-arrA.adc (8-channel array) 173 | an101-mtms-pzmS.adc (stereo pair of PZM6FS microphones) 174 | an101-mtms-senn.adc (closetalking control signal) 175 | 176 | rutdata 177 | ------- 178 | 179 | These data were collected at the CAIP Center at Rutgers University in 1991 180 | with the help of Jim Flanagan and Joe French. 181 | 182 | The experimental setup consisted of two 23-element microphone arrays built 183 | at ATT Bell Labs, a Crown PZM6FS microphone, and a Sennheiser HMD414 head- 184 | mounted closetalking microphone. 
The arrays had 23 microphone elements each 185 | (comprising three interleaved arrays of 11-elements each). One of the arrays 186 | had a bandwidth of 8kHz (standard DARPA speech bandwidth) and the other a 187 | bandwidth of 4kHz (standard telephone speech bandwidth). 188 | 189 | There are two sub-directories: 1meter and 3meters. These directories refer 190 | to the distance of the speaker from the center of the microphone array. 191 | Within each directory are sub-directories, one for each speaker. 192 | 193 | Within each subject's sub-directory are files of the form: 194 | 195 | {utterance}-{subject}-{microphone_type}.adc 196 | 197 | {utterance} is either an* or cen*, where "*" is a number. These are 198 | alphanumeric and census utterances in the AN4 dataset used often here 199 | at CMU. 200 | 201 | {subject} is the same 4-letter code used to name the sub-directories. 202 | 203 | {microphone_type} is the type of microphone used and a key into the 204 | experimental conditions. "sen" is the Sennheiser HMD414 headset 205 | closetalking microphone, used in every set as a control. "pzm" is a 206 | Crown PZM6FS microphone, "arrA(1m or 3m)" is a 8kHZ bandwidth array described 207 | above, and "arrB(1m or 3m) is a 4kHZ bandwidth array as describe above. All 208 | array elements are Panasonic WD-063 noise cancelling electret condenser 209 | elements. 210 | 211 | The arrA and arrB files in this data set are monophonic files. They are 212 | created by combining the array element outputs in real-time via hardware 213 | via delay and sum beamforming. 214 | 215 | ------------------------ 216 | The sentence transcripts for each of the files in "15element" and 8element" 217 | are contained in the "transcripts" directory. 218 | 219 | The sentence transcripts for each of the files in "rutdata" are contained 220 | in the "rutdata/transcripts/trans{1m,3m}" directories. 
221 | ------------------------ 222 | 223 | The file ad.h in this directory is a C "include" file that contains the 224 | structure of the soundfile headers found on the .adc files. 225 | 226 | 227 | -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an101-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an101-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an102-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an102-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an103-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an103-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an103-mtms-senn4.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an103-mtms-senn4.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an104-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an104-mtms-arr4A.adc 
-------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an105-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an105-mtms-arr4A.adc -------------------------------------------------------------------------------- /ResumenResultados.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/ResumenResultados.xlsx -------------------------------------------------------------------------------- /array.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/array.wav -------------------------------------------------------------------------------- /asdf.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/asdf.wav -------------------------------------------------------------------------------- /image_2017-02-16_15-18-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/image_2017-02-16_15-18-08.png -------------------------------------------------------------------------------- /limpia.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/limpia.wav -------------------------------------------------------------------------------- /lms_eq.m: 
--------------------------------------------------------------------------------
% lms_eq  One LMS adaptation step for the GSC adaptive (sidelobe-cancelling) branch.
%
%   [yout, ak] = lms_eq(ak, xbloqueo, xout, mu)
%
%   Inputs:
%     ak       - current adaptive filter weights (per original comments, 14 x L/2)
%     xbloqueo - blocking-matrix outputs, 14 x L/2 -- TODO confirm shape against caller
%     xout     - fixed-beamformer output for the frame (transposed inside)
%     mu       - LMS step size
%   Outputs:
%     yout     - enhanced (error) output for one frame, 1 x L/2
%     ak       - updated weights
%
%   BUGFIX: the constant E was declared but never used, while the weight
%   update divided by xk.^2, which blows up whenever a reference sample is
%   zero. E is now added to the denominator (standard NLMS regularization);
%   behavior is unchanged except for near-zero reference inputs.
function [ yout, ak ] = lms_eq(ak,xbloqueo,xout,mu)

E = 10e-5;                       % regularization constant (avoids division by zero)

xk = xbloqueo;                   % reference inputs, 14 x L/2
yk = sum(ak.*xk);                % adaptive-branch interference estimate, 1 x L/2
yout = xout'-yk;                 % output (error) for one frame, 1 x L/2
err = repmat(yout,14,1).*xk;     % per-channel gradient term, 14 x L/2
ak = ak + mu*err./(xk.^2 + E);   % regularized (NLMS-style) weight update

end

-------------------------------------------------------------------------------- /offsetcomp.m: --------------------------------------------------------------------------------
% offsetcomp  DC-offset compensation for one signal channel.
%
%   xout = offsetcomp(x)
%
%   Removes the mean of x and then applies a first-order DC-blocking filter
%   y(n) = x(n) - x(n-1) + F*y(n-1) with pole F = 0.98. Always returns a
%   column vector (as the original did).
%
%   Performance: xout is now preallocated instead of being grown inside the
%   loop (same output, O(N) instead of O(N^2)).
function xout = offsetcomp(x)

F=0.98;              % DC-blocker pole (notch at DC)
N=length(x);
x=x-mean(x);         % remove the bulk offset first
x_ant=0;             % previous input sample
xof=0;               % previous filter output
xout=zeros(N,1);     % preallocate (was: xout=[xout; xof] in the loop)
for n=1:N
xof=x(n)-x_ant+F*xof;
x_ant=x(n);
xout(n)=xof;
end
-------------------------------------------------------------------------------- /steering_vector.mat: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/steering_vector.mat
--------------------------------------------------------------------------------