├── .gitattributes ├── .gitignore ├── GSC.m ├── Leer_Array_Signals.m ├── PESQ ├── DC_block.m ├── FFTNXCorr.m ├── addnoise_asl.asv ├── addnoise_asl.m ├── apply_VAD.asv ├── apply_VAD.m ├── apply_filter.asv ├── apply_filter.m ├── apply_filters.m ├── comp_cep.asv ├── comp_cep.m ├── comp_fwseg.asv ├── comp_fwseg.m ├── comp_fwseg_mars.asv ├── comp_fwseg_mars.m ├── comp_fwseg_variant.asv ├── comp_fwseg_variant.m ├── comp_is.asv ├── comp_is.m ├── comp_llr.asv ├── comp_llr.m ├── comp_snr.asv ├── comp_snr.m ├── comp_wss.asv ├── comp_wss.m ├── composite.asv ├── composite.m ├── crude_align.m ├── enhanced.wav ├── fix_power_level.asv ├── fix_power_level.m ├── id_searchwindows.m ├── id_utterances.m ├── input_filter.m ├── pesq.asv ├── pesq.m ├── pesq_psychoacoustic_model.asv ├── pesq_psychoacoustic_model.m ├── pow_of.m ├── readme.pdf ├── readme.txt ├── setup_global.m ├── sp04.wav ├── sp04_babble_sn10.wav ├── split_align.m ├── time_align.m ├── utterance_locate.m ├── utterance_split.m ├── wavread.m └── white_noise.wav ├── RETO2016_README.txt ├── RETO2016_TOOLS └── signals │ ├── README_Acquisition │ ├── an101-mtms-arr4A.adc │ ├── an102-mtms-arr4A.adc │ ├── an103-mtms-arr4A.adc │ ├── an103-mtms-senn4.adc │ ├── an104-mtms-arr4A.adc │ └── an105-mtms-arr4A.adc ├── ResumenResultados.xlsx ├── array.wav ├── asdf.wav ├── image_2017-02-16_15-18-08.png ├── limpia.wav ├── lms_eq.m ├── offsetcomp.m └── steering_vector.mat /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /GSC.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/GSC.m -------------------------------------------------------------------------------- /Leer_Array_Signals.m: -------------------------------------------------------------------------------- 1 | % LECTURA DE DATOS MULTICANAL 2 | % 16 kHz 3 | % 16 bits por muestra 4 | % 15 canales 5 | % Big-endian 6 | fm = 16000; % Frec. muestreo 7 | nc = 15; % Nº de canales. 8 | fname = 'an103-mtms-arr4A.adc'; 9 | % dir = '/zona_amp/data/Multimic/multimic/15element/'; 10 | % fname = strcat(dir,fname) 11 | [fid,msg] = fopen(fname,'r','b'); 12 | if fid < 0 13 | disp(msg); 14 | else 15 | data = fread(fid,'int16'); 16 | fclose(fid); 17 | end 18 | 19 | % Separa canales. 
20 | nsamp=[]; 21 | for i = 1:nc 22 | x{i} = data(i:nc:end); 23 | x{i} = offsetcomp(x{i}); 24 | nsamp(i)=length(x{i}); 25 | end 26 | Nsamp=min(nsamp); %Numero de muestras a emplear en todas las senales 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /PESQ/DC_block.m: -------------------------------------------------------------------------------- 1 | function mod_data= DC_block( data, Nsamples) 2 | 3 | global Downsample DATAPADDING_MSECS SEARCHBUFFER 4 | 5 | ofs= SEARCHBUFFER* Downsample; 6 | mod_data= data; 7 | 8 | %compute dc component, it is a little weird 9 | facc= sum( data( ofs+ 1: Nsamples- ofs))/ Nsamples; 10 | mod_data( ofs+ 1: Nsamples- ofs)= data( ofs+ 1: Nsamples- ofs)- facc; 11 | 12 | mod_data( ofs+ 1: ofs+ Downsample)= mod_data( ofs+ 1: ofs+ Downsample).* ... 13 | ( 0.5+ (0: Downsample- 1))/ Downsample; 14 | 15 | mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1)= ... 16 | mod_data( Nsamples- ofs: -1: Nsamples- ofs-Downsample+ 1).* ... 
17 | ( 0.5+ (0: Downsample- 1))/ Downsample; 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /PESQ/FFTNXCorr.m: -------------------------------------------------------------------------------- 1 | function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd) 2 | % this function has other simple implementations, current implementation is 3 | % consistent with the C version 4 | 5 | % one way to do this (in time domain) ===== 6 | x1= ref_VAD( startr: startr+ nr- 1); 7 | x2= deg_VAD( startd: startd+ nd- 1); 8 | x1= fliplr( x1); 9 | Y= conv( x2, x1); 10 | % done ===== 11 | 12 | % % the other way to do this (in freq domain)=== 13 | % Nx= 2^ (ceil( log2( max( nr, nd)))); 14 | % x1= zeros( 1, 2* Nx); 15 | % x2= zeros( 1, 2* Nx); 16 | % x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1)); 17 | % x2( 1: nd)= deg_VAD( startd: startd+ nd- 1); 18 | % 19 | % if (nr== 491) 20 | % fid= fopen( 'mat_debug.txt', 'wt'); 21 | % fprintf( fid, '%f\n', x1); 22 | % fclose( fid); 23 | % end 24 | % 25 | % x1_fft= fft( x1, 2* Nx); 26 | % x2_fft= fft( x2, 2* Nx); 27 | % 28 | % tmp1= ifft( x1_fft.* x2_fft, 2* Nx); 29 | % 30 | % Ny= nr+ nd- 1; 31 | % Y= tmp1( 1: Ny); 32 | % % done =========== 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /PESQ/addnoise_asl.asv: -------------------------------------------------------------------------------- 1 | function addnoise_asl(cleanfile, noisefile, outfile, snr) 2 | % ---------------------------------------------------------------------- 3 | % This function adds noise to a file at a specified SNR level. It uses 4 | % the active speech level to compute the speech energy. The 5 | % active speech level is computed as per ITU-T P.56 standard [1]. 
6 | % 7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) 8 | % 9 | % cleanFile.wav - clean input file in .wav format 10 | % noiseFile.wav - file containing the noise signal in .wav format 11 | % noisyFile.wav - resulting noisy file 12 | % SNR - desired SNR in dB 13 | % 14 | % Note that if the variable IRS below is set to 1, then it applies the IRS 15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default 16 | % Example call: 17 | % 18 | % 19 | % References: 20 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T 21 | % Recommendation P. 56 22 | % 23 | % Author: Yi Hu and Philipos C. Loizou 24 | % 25 | % Copyright (c) 2006 by Philipos C. Loizou 26 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 27 | % ---------------------------------------------------------------------- 28 | 29 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz -3.2 kHz) 30 | 31 | % wavread gives floating point column data 32 | [clean, srate, nbits]= wavread(cleanfile); 33 | % filter clean speech with irs filter 34 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end; 35 | 36 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits); 37 | % Px is the active speech level ms energy, asl is the active factor, and c0 38 | % is the active speech level threshold. 
39 | 40 | %noiseonly_len= floor( noiseonly* srate/ 1000); 41 | % <--------- insert noiseonly-msecs of silence 42 | %x= [zeros( noiseonly_len, 1); clean]; 43 | x=clean; 44 | x_len= length( x); % length of speech signal 45 | 46 | [noise, srate1, nbits1]= wavread( noisefile); 47 | if (srate1~= srate)| (nbits1~= nbits) 48 | error( 'the formats of the two files dont match!'); 49 | end 50 | noise_len= length( noise); 51 | if (noise_len<= x_len) 52 | error( 'the noise length has to be greater than speech length!'); 53 | end 54 | 55 | rand_start_limit= noise_len- x_len+ 1; 56 | % the start of the noise segment can vary between [1 rand_start_limit] 57 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 58 | % random start of the noise segment 59 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 60 | 61 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 62 | 63 | % this is the randomly selected noise segment that will be added to the 64 | % clean speech x 65 | Pn= noise_segment'* noise_segment/ x_len; 66 | % we need to scale the noise segment samples to obtain the desired snr= 10* 67 | % log10( Px/ (sf^2 * Pn)) 68 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 69 | noise_segment= noise_segment * sf; 70 | 71 | noisy = x+ noise_segment; 72 | 73 | if ( (max( noisy)>= 1) | (min( noisy)< -1)) 74 | error( 'Overflow occurred!\n'); 75 | end; 76 | 77 | 78 | wavwrite( noisy, srate, nbits, outfile); 79 | 80 | fprintf( 1, 'For comparison, the old SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 81 | (noise_segment'*noise_segment))); 82 | 83 | 84 | %------------------------------------------------------------------------ 85 | function data_filtered= apply_IRS( data, Fs, nbits); 86 | 87 | n= length( data); 88 | 89 | % now find the next power of 2 which is greater or equal to n 90 | pow_of_2= 2^ (ceil( log2( n))); 91 | 92 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 
93 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 94 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 95 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 96 | 97 | [number_of_points, trivial]= size( align_filter_dB); 98 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 99 | 1000); 100 | 101 | x= zeros( 1, pow_of_2); 102 | x( 1: n)= data; 103 | 104 | x_fft= fft( x, pow_of_2); 105 | 106 | freq_resolution= Fs/ pow_of_2; 107 | 108 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 109 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 110 | overallGainFilter; 111 | factor= 10.^ (factorDb/ 20); 112 | 113 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 114 | x_fft= x_fft.* factor; 115 | 116 | y= ifft( x_fft, pow_of_2); 117 | 118 | data_filtered= y( 1: n)'; 119 | 120 | 121 | 122 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 123 | % this implements ITU P.56 method B. 124 | % 'speechfile' is the speech file to calculate active speech level for, 125 | % 'asl' is the active speech level (between 0 and 1), 126 | % 'asl_rms' is the active speech level mean square energy. 
127 | 128 | % x is the column vector of floating point speech data 129 | 130 | x= x(:); % make sure x is column vector 131 | T= 0.03; % time constant of smoothing, in seconds 132 | H= 0.2; % hangover time in seconds 133 | M= 15.9; 134 | % margin in dB of the difference between threshold and active speech level 135 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 136 | 137 | I= ceil( fs* H); % hangover in samples 138 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 139 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 140 | % vector with thresholds from one quantizing level up to half the maximum 141 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 142 | a( 1: thres_no)= 0; % activity counter for each level threshold 143 | hang( 1: thres_no)= I; % hangover counter for each level threshold 144 | 145 | sq= x'* x; % long-term level square energy of x 146 | x_len= length( x); % length of x 147 | 148 | % use a 2nd order IIR filter to detect the envelope q 149 | x_abs= abs( x); 150 | p= filter( 1-g, [1 -g], x_abs); 151 | q= filter( 1-g, [1 -g], p); 152 | 153 | for k= 1: x_len 154 | for j= 1: thres_no 155 | if (q(k)>= c(j)) 156 | a(j)= a(j)+ 1; 157 | hang(j)= 0; 158 | elseif (hang(j)< I) 159 | a(j)= a(j)+ 1; 160 | hang(j)= hang(j)+ 1; 161 | else 162 | break; 163 | end 164 | end 165 | end 166 | 167 | asl= 0; 168 | asl_rms= 0; 169 | if (a(1)== 0) 170 | return; 171 | else 172 | AdB1= 10* log10( sq/ a(1)+ eps); 173 | end 174 | 175 | CdB1= 20* log10( c(1)+ eps); 176 | if (AdB1- CdB1< M) 177 | return; 178 | end 179 | 180 | AdB(1)= AdB1; 181 | CdB(1)= CdB1; 182 | Delta(1)= AdB1- CdB1; 183 | 184 | for j= 2: thres_no 185 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 186 | CdB(j)= 20* log10( c(j)+ eps); 187 | end 188 | 189 | for j= 2: thres_no 190 | if (a(j) ~= 0) 191 | Delta(j)= AdB(j)- CdB(j); 192 | if (Delta(j)<= M) 193 | % interpolate to find the asl 194 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 
195 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 196 | asl_ms= 10^ (asl_ms_log/ 10); 197 | asl= (sq/ x_len)/ asl_ms; 198 | c0= 10^( cl0/ 20); 199 | break; 200 | end 201 | end 202 | end 203 | 204 | 205 | 206 | 207 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 208 | upthr, lwthr, Margin, tol) 209 | 210 | if (tol < 0) 211 | tol = -tol; 212 | end 213 | 214 | % Check if extreme counts are not already the true active value 215 | iterno = 1; 216 | if (abs(upcount - upthr - Margin) < tol) 217 | asl_ms_log= upcount; 218 | cc= upthr; 219 | return; 220 | end 221 | if (abs(lwcount - lwthr - Margin) < tol) 222 | asl_ms_log= lwcount; 223 | cc= lwthr; 224 | return; 225 | end 226 | 227 | % Initialize first middle for given (initial) bounds 228 | midcount = (upcount + lwcount) / 2.0; 229 | midthr = (upthr + lwthr) / 2.0; 230 | 231 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 232 | while ( 1) 233 | 234 | diff= midcount- midthr- Margin; 235 | if (abs(diff)<= tol) 236 | break; 237 | end 238 | 239 | % if tolerance is not met up to 20 iteractions, then relax the 240 | % tolerance by 10% 241 | 242 | iterno= iterno+ 1; 243 | 244 | if (iterno>20) 245 | tol = tol* 1.1; 246 | end 247 | 248 | if (diff> tol) % then new bounds are ... 249 | midcount = (upcount + midcount) / 2.0; 250 | % upper and middle activities 251 | midthr = (upthr + midthr) / 2.0; 252 | % ... and thresholds 253 | elseif (diff< -tol) % then new bounds are ... 254 | midcount = (midcount + lwcount) / 2.0; 255 | % middle and lower activities 256 | midthr = (midthr + lwthr) / 2.0; 257 | % ... and thresholds 258 | end 259 | 260 | end 261 | % Since the tolerance has been satisfied, midcount is selected 262 | % as the interpolated value with a tol [dB] tolerance. 
263 | 264 | asl_ms_log= midcount; 265 | cc= midthr; 266 | 267 | 268 | 269 | 270 | -------------------------------------------------------------------------------- /PESQ/addnoise_asl.m: -------------------------------------------------------------------------------- 1 | function addnoise_asl(cleanfile, noisefile, outfile, snr) 2 | % ---------------------------------------------------------------------- 3 | % This function adds noise to a file at a specified SNR level. It uses 4 | % the active speech level to compute the speech energy. The 5 | % active speech level is computed as per ITU-T P.56 standard [1]. 6 | % 7 | % Usage: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) 8 | % 9 | % cleanFile.wav - clean input file in .wav format 10 | % noiseFile.wav - file containing the noise signal in .wav format 11 | % noisyFile.wav - resulting noisy file 12 | % SNR - desired SNR in dB 13 | % 14 | % Note that if the variable IRS below (line 38) is set to 1, then it applies the IRS 15 | % filter to bandlimit the signal to 300 Hz - 3.2 kHz. The default IRS 16 | % value is 0, ie, no IRS filtering is applied. 17 | % 18 | % Example call: 19 | % addnoise_asl('sp04.wav','white_noise.wav','sp04_white_5db.wav',5); 20 | % 21 | % 22 | % References: 23 | % [1] ITU-T (1993). Objective measurement of active speech level. ITU-T 24 | % Recommendation P. 56 25 | % 26 | % Author: Yi Hu and Philipos C. Loizou 27 | % 28 | % Copyright (c) 2006 by Philipos C. 
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin ~=4 33 | fprintf('USAGE: addnoise_asl(cleanFile.wav, noiseFile.wav, noisyFile.wav, SNR) \n'); 34 | fprintf('For more help, type: help addnoise_asl\n\n'); 35 | return; 36 | end 37 | 38 | IRS=0; % if 1 apply IRS filter simulating telephone handset bandwidth (300 Hz -3.2 kHz) 39 | 40 | % wavread gives floating point column data 41 | [clean, srate, nbits]= wavread(cleanfile); 42 | % filter clean speech with irs filter 43 | if IRS==1, clean= apply_IRS( clean, srate, nbits); end; 44 | 45 | [Px, asl, c0]= asl_P56 ( clean, srate, nbits); 46 | % Px is the active speech level ms energy, asl is the active factor, and c0 47 | % is the active speech level threshold. 48 | 49 | 50 | x=clean; 51 | x_len= length( x); % length of speech signal 52 | 53 | [noise, srate1, nbits1]= wavread( noisefile); 54 | if (srate1~= srate)| (nbits1~= nbits) 55 | error( 'the formats of the two files dont match!'); 56 | end 57 | noise_len= length( noise); 58 | if (noise_len<= x_len) 59 | error( 'the noise length has to be greater than speech length!'); 60 | end 61 | 62 | rand_start_limit= noise_len- x_len+ 1; 63 | % the start of the noise segment can vary between [1 rand_start_limit] 64 | rand_start= round( (rand_start_limit- 1)* rand( 1)+ 1); 65 | % random start of the noise segment 66 | noise_segment= noise( rand_start: rand_start+ x_len- 1); 67 | 68 | if IRS==1, noise_segment= apply_IRS( noise_segment, srate, nbits); end; 69 | 70 | % this is the randomly selected noise segment that will be added to the 71 | % clean speech x 72 | Pn= noise_segment'* noise_segment/ x_len; 73 | % we need to scale the noise segment samples to obtain the desired snr= 10* 74 | % log10( Px/ (sf^2 * Pn)) 75 | sf= sqrt( Px/Pn/ (10^ (snr/ 10))); % scale factor for noise segment data 76 | noise_segment= noise_segment * sf; 77 | 78 | noisy = x+ noise_segment; 79 | 80 | if ( (max( 
noisy)>= 1) | (min( noisy)< -1)) 81 | error( 'Overflow occurred!\n'); 82 | end; 83 | 84 | 85 | wavwrite( noisy, srate, nbits, outfile); 86 | 87 | fprintf( 1, '\n NOTE: For comparison, the SNR based on long-term RMS level is %4.2f dB.\n\n', 10*log10((x'*x)/ ... 88 | (noise_segment'*noise_segment))); 89 | 90 | 91 | %------------------------------------------------------------------------ 92 | function data_filtered= apply_IRS( data, Fs, nbits); 93 | 94 | n= length( data); 95 | 96 | % now find the next power of 2 which is greater or equal to n 97 | pow_of_2= 2^ (ceil( log2( n))); 98 | 99 | align_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 100 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 101 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 102 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 103 | 104 | [number_of_points, trivial]= size( align_filter_dB); 105 | overallGainFilter= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), ... 106 | 1000); 107 | 108 | x= zeros( 1, pow_of_2); 109 | x( 1: n)= data; 110 | 111 | x_fft= fft( x, pow_of_2); 112 | 113 | freq_resolution= Fs/ pow_of_2; 114 | 115 | factorDb( 1: pow_of_2/2+ 1)= interp1( align_filter_dB( :, 1), ... 116 | align_filter_dB( :, 2), (0: pow_of_2/2)* freq_resolution)- ... 117 | overallGainFilter; 118 | factor= 10.^ (factorDb/ 20); 119 | 120 | factor= [factor, fliplr( factor( 2: pow_of_2/2))]; 121 | x_fft= x_fft.* factor; 122 | 123 | y= ifft( x_fft, pow_of_2); 124 | 125 | data_filtered= y( 1: n)'; 126 | 127 | 128 | 129 | function [asl_ms, asl, c0]= asl_P56 ( x, fs, nbits) 130 | % this implements ITU P.56 method B. 131 | % 'speechfile' is the speech file to calculate active speech level for, 132 | % 'asl' is the active speech level (between 0 and 1), 133 | % 'asl_rms' is the active speech level mean square energy. 
134 | 135 | % x is the column vector of floating point speech data 136 | 137 | x= x(:); % make sure x is column vector 138 | T= 0.03; % time constant of smoothing, in seconds 139 | H= 0.2; % hangover time in seconds 140 | M= 15.9; 141 | % margin in dB of the difference between threshold and active speech level 142 | thres_no= nbits- 1; % number of thresholds, for 16 bit, it's 15 143 | 144 | I= ceil( fs* H); % hangover in samples 145 | g= exp( -1/( fs* T)); % smoothing factor in envelop detection 146 | c( 1: thres_no)= 2.^ (-15: thres_no- 16); 147 | % vector with thresholds from one quantizing level up to half the maximum 148 | % code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5; 149 | a( 1: thres_no)= 0; % activity counter for each level threshold 150 | hang( 1: thres_no)= I; % hangover counter for each level threshold 151 | 152 | sq= x'* x; % long-term level square energy of x 153 | x_len= length( x); % length of x 154 | 155 | % use a 2nd order IIR filter to detect the envelope q 156 | x_abs= abs( x); 157 | p= filter( 1-g, [1 -g], x_abs); 158 | q= filter( 1-g, [1 -g], p); 159 | 160 | for k= 1: x_len 161 | for j= 1: thres_no 162 | if (q(k)>= c(j)) 163 | a(j)= a(j)+ 1; 164 | hang(j)= 0; 165 | elseif (hang(j)< I) 166 | a(j)= a(j)+ 1; 167 | hang(j)= hang(j)+ 1; 168 | else 169 | break; 170 | end 171 | end 172 | end 173 | 174 | asl= 0; 175 | asl_rms= 0; 176 | if (a(1)== 0) 177 | return; 178 | else 179 | AdB1= 10* log10( sq/ a(1)+ eps); 180 | end 181 | 182 | CdB1= 20* log10( c(1)+ eps); 183 | if (AdB1- CdB1< M) 184 | return; 185 | end 186 | 187 | AdB(1)= AdB1; 188 | CdB(1)= CdB1; 189 | Delta(1)= AdB1- CdB1; 190 | 191 | for j= 2: thres_no 192 | AdB(j)= 10* log10( sq/ (a(j)+ eps)+ eps); 193 | CdB(j)= 20* log10( c(j)+ eps); 194 | end 195 | 196 | for j= 2: thres_no 197 | if (a(j) ~= 0) 198 | Delta(j)= AdB(j)- CdB(j); 199 | if (Delta(j)<= M) 200 | % interpolate to find the asl 201 | [asl_ms_log, cl0]= bin_interp( AdB(j), ... 
202 | AdB(j-1), CdB(j), CdB(j-1), M, 0.5); 203 | asl_ms= 10^ (asl_ms_log/ 10); 204 | asl= (sq/ x_len)/ asl_ms; 205 | c0= 10^( cl0/ 20); 206 | break; 207 | end 208 | end 209 | end 210 | 211 | 212 | 213 | 214 | function [asl_ms_log, cc]= bin_interp(upcount, lwcount, ... 215 | upthr, lwthr, Margin, tol) 216 | 217 | if (tol < 0) 218 | tol = -tol; 219 | end 220 | 221 | % Check if extreme counts are not already the true active value 222 | iterno = 1; 223 | if (abs(upcount - upthr - Margin) < tol) 224 | asl_ms_log= upcount; 225 | cc= upthr; 226 | return; 227 | end 228 | if (abs(lwcount - lwthr - Margin) < tol) 229 | asl_ms_log= lwcount; 230 | cc= lwthr; 231 | return; 232 | end 233 | 234 | % Initialize first middle for given (initial) bounds 235 | midcount = (upcount + lwcount) / 2.0; 236 | midthr = (upthr + lwthr) / 2.0; 237 | 238 | % Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) 239 | while ( 1) 240 | 241 | diff= midcount- midthr- Margin; 242 | if (abs(diff)<= tol) 243 | break; 244 | end 245 | 246 | % if tolerance is not met up to 20 iteractions, then relax the 247 | % tolerance by 10% 248 | 249 | iterno= iterno+ 1; 250 | 251 | if (iterno>20) 252 | tol = tol* 1.1; 253 | end 254 | 255 | if (diff> tol) % then new bounds are ... 256 | midcount = (upcount + midcount) / 2.0; 257 | % upper and middle activities 258 | midthr = (upthr + midthr) / 2.0; 259 | % ... and thresholds 260 | elseif (diff< -tol) % then new bounds are ... 261 | midcount = (midcount + lwcount) / 2.0; 262 | % middle and lower activities 263 | midthr = (midthr + lwthr) / 2.0; 264 | % ... and thresholds 265 | end 266 | 267 | end 268 | % Since the tolerance has been satisfied, midcount is selected 269 | % as the interpolated value with a tol [dB] tolerance. 
270 | 271 | asl_ms_log= midcount; 272 | cc= midthr; 273 | 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /PESQ/apply_VAD.asv: -------------------------------------------------------------------------------- 1 | function [VAD, logVAD]= apply_VAD( data, Nsamples) 2 | 3 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH 4 | 5 | Nwindows= floor( Nsamples/ Downsample); 6 | %number of 4ms window 7 | 8 | VAD= zeros( 1, Nwindows); 9 | for count= 1: Nwindows 10 | VAD( count)= sum( data( (count-1)* Downsample+ 1: ... 11 | count* Downsample).^ 2)/ Downsample; 12 | end 13 | %VAD is the power of each 4ms window 14 | 15 | LevelThresh = sum( VAD)/ Nwindows; 16 | %LevelThresh is set to mean value of VAD 17 | 18 | LevelMin= max( VAD); 19 | if( LevelMin > 0 ) 20 | LevelMin= LevelMin* 1.0e-4; 21 | else 22 | LevelMin = 1.0; 23 | end 24 | %fprintf( 1, 'LevelMin is %f\n', LevelMin); 25 | 26 | VAD( find( VAD< LevelMin))= LevelMin; 27 | 28 | for iteration= 1: 12 29 | LevelNoise= 0; 30 | len= 0; 31 | StDNoise= 0; 32 | 33 | VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh)); 34 | len= length( VAD_lessthan_LevelThresh); 35 | LevelNoise= sum( VAD_lessthan_LevelThresh); 36 | if (len> 0) 37 | LevelNoise= LevelNoise/ len; 38 | StDNoise= sqrt( sum( ... 
39 | (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len); 40 | end 41 | LevelThresh= 1.001* (LevelNoise+ 2* StDNoise); 42 | end 43 | %fprintf( 1, 'LevelThresh is %f\n', LevelThresh); 44 | 45 | LevelNoise= 0; 46 | LevelSig= 0; 47 | len= 0; 48 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh)); 49 | len= length( VAD_greaterthan_LevelThresh); 50 | LevelSig= sum( VAD_greaterthan_LevelThresh); 51 | 52 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh)); 53 | LevelNoise= sum( VAD_lessorequal_LevelThresh); 54 | 55 | if (len> 0) 56 | LevelSig= LevelSig/ len; 57 | else 58 | LevelThresh= -1; 59 | end 60 | %fprintf( 1, 'LevelSig is %f\n', LevelSig); 61 | 62 | if (len< Nwindows) 63 | LevelNoise= LevelNoise/( Nwindows- len); 64 | else 65 | LevelNoise= 1; 66 | end 67 | %fprintf( 1, 'LevelNoise is %f\n', LevelNoise); 68 | 69 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh)); 70 | VAD(1)= -LevelMin; 71 | VAD(Nwindows)= -LevelMin; 72 | 73 | 74 | start= 0; 75 | finish= 0; 76 | for count= 2: Nwindows 77 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 78 | start = count; 79 | end 80 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 81 | finish = count; 82 | if( (finish - start)<= MINSPEECHLGTH ) 83 | VAD( start: finish- 1)= -VAD( start: finish- 1); 84 | end 85 | end 86 | end 87 | %to make sure finish- start is more than 4 88 | 89 | if( LevelSig >= (LevelNoise* 1000) ) 90 | for count= 2: Nwindows 91 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 92 | start= count; 93 | end 94 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 95 | finish = count; 96 | g = sum( VAD( start: finish- 1)); 97 | if( g< 3.0* LevelThresh* (finish - start) ) 98 | VAD( start: finish- 1)= -VAD( start: finish- 1); 99 | end 100 | end 101 | end 102 | end 103 | 104 | start = 0; 105 | finish = 0; 106 | for count= 2: Nwindows 107 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 108 | start = count; 109 | if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) ) 110 | VAD( finish: 
start- 1)= LevelMin; 111 | end 112 | end 113 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 114 | finish = count; 115 | end 116 | end 117 | 118 | start= 0; 119 | for count= 2: Nwindows 120 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 121 | start= count; 122 | end 123 | end 124 | if( start== 0 ) 125 | VAD= abs(VAD); 126 | VAD(1) = -LevelMin; 127 | VAD(Nwindows) = -LevelMin; 128 | end 129 | 130 | count = 4; 131 | while( count< (Nwindows-1) ) 132 | if( (VAD(count)> 0) && (VAD(count-2) <= 0) ) 133 | VAD(count-2)= VAD(count)* 0.1; 134 | VAD(count-1)= VAD(count)* 0.3; 135 | count= count+ 1; 136 | end 137 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 138 | VAD(count)= VAD(count-1)* 0.3; 139 | VAD(count+ 1)= VAD(count-1)* 0.1; 140 | count= count+ 3; 141 | end 142 | count= count+ 1; 143 | end 144 | 145 | VAD( find( VAD< 0))= 0; 146 | 147 | % fid= fopen( 'mat_vad.txt', 'wt'); 148 | % fprintf( fid, '%f\n', VAD); 149 | % fclose( fid); 150 | 151 | if( LevelThresh<= 0 ) 152 | LevelThresh= LevelMin; 153 | end 154 | %No me queda claro que se hace antes de esto. Es evidente que se calcula el 155 | %nivel umbral (LevelThresh). Pero tambien se le hace 156 | 157 | 158 | %Si VAD (que contiene la energia en 4 ms (32 muestras a 8Khz) NO es superior 159 | %al umbral entonces el envelope es 0 (log(MAX(VAD/LevelThresh,1))) 160 | logVAD( find( VAD<= LevelThresh))= 0; 161 | %Si VAD es superior al umbral, entonces se divide por el Level y se le hace 162 | %el log 163 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh); 164 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD(VAD_greaterthan_LevelThresh)/ LevelThresh); 165 | %LogVAD queda relleno tanto de los que superan como los que no el umbral. 
166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /PESQ/apply_VAD.m: -------------------------------------------------------------------------------- 1 | function [VAD, logVAD]= apply_VAD( data, Nsamples) 2 | 3 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH 4 | 5 | Nwindows= floor( Nsamples/ Downsample); 6 | %number of 4ms window 7 | 8 | VAD= zeros( 1, Nwindows); 9 | for count= 1: Nwindows 10 | VAD( count)= sum( data( (count-1)* Downsample+ 1: ... 11 | count* Downsample).^ 2)/ Downsample; 12 | end 13 | %VAD is the power of each 4ms window 14 | 15 | LevelThresh = sum( VAD)/ Nwindows; 16 | %LevelThresh is set to mean value of VAD 17 | 18 | LevelMin= max( VAD); 19 | if( LevelMin > 0 ) 20 | LevelMin= LevelMin* 1.0e-4; 21 | else 22 | LevelMin = 1.0; 23 | end 24 | %fprintf( 1, 'LevelMin is %f\n', LevelMin); 25 | 26 | VAD( find( VAD< LevelMin))= LevelMin; 27 | 28 | for iteration= 1: 12 29 | LevelNoise= 0; 30 | len= 0; 31 | StDNoise= 0; 32 | 33 | VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh)); 34 | len= length( VAD_lessthan_LevelThresh); 35 | LevelNoise= sum( VAD_lessthan_LevelThresh); 36 | if (len> 0) 37 | LevelNoise= LevelNoise/ len; 38 | StDNoise= sqrt( sum( ... 39 | (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len); 40 | end 41 | LevelThresh= 1.001* (LevelNoise+ 2* StDNoise); 42 | end 43 | %fprintf( 1, 'LevelThresh is %f\n', LevelThresh); 44 | 45 | %HASTA AQUI es asequible, lo siguiente no lo entiendo (pero creo que es 46 | %cocinica. 
47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | 49 | LevelNoise= 0; 50 | LevelSig= 0; 51 | len= 0; 52 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh)); 53 | len= length( VAD_greaterthan_LevelThresh); 54 | LevelSig= sum( VAD_greaterthan_LevelThresh); 55 | 56 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh)); 57 | LevelNoise= sum( VAD_lessorequal_LevelThresh); 58 | 59 | if (len> 0) 60 | LevelSig= LevelSig/ len; 61 | else 62 | LevelThresh= -1; 63 | end 64 | %fprintf( 1, 'LevelSig is %f\n', LevelSig); 65 | 66 | if (len< Nwindows) 67 | LevelNoise= LevelNoise/( Nwindows- len); 68 | else 69 | LevelNoise= 1; 70 | end 71 | %fprintf( 1, 'LevelNoise is %f\n', LevelNoise); 72 | 73 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh)); 74 | VAD(1)= -LevelMin; 75 | VAD(Nwindows)= -LevelMin; 76 | 77 | 78 | start= 0; 79 | finish= 0; 80 | for count= 2: Nwindows 81 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 82 | start = count; 83 | end 84 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) ) 85 | finish = count; 86 | if( (finish - start)<= MINSPEECHLGTH ) 87 | VAD( start: finish- 1)= -VAD( start: finish- 1); 88 | end 89 | end 90 | end 91 | %to make sure finish- start is more than 4 92 | 93 | if( LevelSig >= (LevelNoise* 1000) ) 94 | for count= 2: Nwindows 95 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 96 | start= count; 97 | end 98 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 99 | finish = count; 100 | g = sum( VAD( start: finish- 1)); 101 | if( g< 3.0* LevelThresh* (finish - start) ) 102 | VAD( start: finish- 1)= -VAD( start: finish- 1); 103 | end 104 | end 105 | end 106 | end 107 | 108 | start = 0; 109 | finish = 0; 110 | for count= 2: Nwindows 111 | if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) ) 112 | start = count; 113 | if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) ) 114 | VAD( finish: start- 1)= LevelMin; 115 | end 116 | end 117 | if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) 
) 118 | finish = count; 119 | end 120 | end 121 | 122 | start= 0; 123 | for count= 2: Nwindows 124 | if( (VAD(count)> 0) && (VAD(count-1)<= 0) ) 125 | start= count; 126 | end 127 | end 128 | if( start== 0 ) 129 | VAD= abs(VAD); 130 | VAD(1) = -LevelMin; 131 | VAD(Nwindows) = -LevelMin; 132 | end 133 | 134 | count = 4; 135 | while( count< (Nwindows-1) ) 136 | if( (VAD(count)> 0) && (VAD(count-2) <= 0) ) 137 | VAD(count-2)= VAD(count)* 0.1; 138 | VAD(count-1)= VAD(count)* 0.3; 139 | count= count+ 1; 140 | end 141 | if( (VAD(count)<= 0) && (VAD(count-1)> 0) ) 142 | VAD(count)= VAD(count-1)* 0.3; 143 | VAD(count+ 1)= VAD(count-1)* 0.1; 144 | count= count+ 3; 145 | end 146 | count= count+ 1; 147 | end 148 | 149 | VAD( find( VAD< 0))= 0; 150 | 151 | % fid= fopen( 'mat_vad.txt', 'wt'); 152 | % fprintf( fid, '%f\n', VAD); 153 | % fclose( fid); 154 | 155 | if( LevelThresh<= 0 ) 156 | LevelThresh= LevelMin; 157 | end 158 | 159 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 160 | %No me queda claro que se hace antes de esto. Es evidente que se calcula el 161 | %nivel umbral (LevelThresh). Pero tambien se le hace un acomodado a las 162 | %senal VAD que no se especifica en la descripcion del estandar. Senal VAD 163 | %contiene la energia en 4 ms (32 muestras a 8Khz). 164 | 165 | %Si VAD NO es superior al umbral entonces el envelope es 0 (log(MAX(VAD/LevelThresh,1))) 166 | logVAD( find( VAD<= LevelThresh))= 0; 167 | %Si VAD es superior al umbral, entonces se divide por el Level y se le hace 168 | %el log 169 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh); 170 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD(VAD_greaterthan_LevelThresh)/ LevelThresh); 171 | %LogVAD queda relleno tanto de los que superan como los que no el umbral. 
-------------------------------------------------------------------------------- /PESQ/apply_filter.asv: --------------------------------------------------------------------------------
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% Zero-phase, frequency-domain filtering of the active part of `data`.
%
% `data` is assumed to carry SEARCHBUFFER*Downsample guard samples at both
% ends plus DATAPADDING_MSECS of zero padding; only the samples between the
% guards are filtered, the guards pass through unchanged.
% `align_filter_dB` is an N x 2 table [frequency_Hz, gain_dB].  Its response
% is interpolated onto the FFT bins, normalised to 0 dB at 1 kHz, mirrored
% into a real symmetric spectrum and multiplied with the FFT of the signal,
% i.e. an FFT-based filter that leaves the signal phase untouched.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

align_filtered= data;

% Number of samples to filter, and the next power of two for the FFT.
margin= SEARCHBUFFER* Downsample;
n= data_Nsamples- 2* margin+ DATAPADDING_MSECS* (Fs/ 1000);
nfft= 2^ nextpow2( n);

% Filter gain at 1 kHz, used as the 0 dB reference.
gain1kHz= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), 1000);

% Move the active samples into a zero-padded buffer and transform.
buf= zeros( 1, nfft);
buf( 1: n)= data( margin+ 1: margin+ n);
spec= fft( buf, nfft);

% Interpolate the desired magnitude (dB) onto the non-negative bins.
% NOTE(review): bins above the last table frequency yield NaN from interp1 -
% presumably the table always spans [0, Fs/2]; confirm against callers.
binHz= (0: nfft/ 2)* (Fs/ nfft);
gainDb= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), binHz)- gain1kHz;
gain= 10.^ (gainDb/ 20);

% Mirror to a real, symmetric (zero-phase) response and apply it.
gain= [gain, fliplr( gain( 2: nfft/ 2))];
filtered= ifft( spec.* gain, nfft);

align_filtered( margin+ 1: margin+ n)= filtered( 1: n);
-------------------------------------------------------------------------------- /PESQ/apply_filter.m: --------------------------------------------------------------------------------
function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
% Zero-phase, frequency-domain filtering of the active part of `data`.
%
% `data` is assumed to carry SEARCHBUFFER*Downsample guard samples at both
% ends plus DATAPADDING_MSECS of zero padding; only the samples between the
% guards are filtered, the guards pass through unchanged.
% `align_filter_dB` is an N x 2 table [frequency_Hz, gain_dB].  Its response
% is interpolated onto the FFT bins, normalised to 0 dB at 1 kHz, mirrored
% into a real symmetric spectrum and multiplied with the FFT of the signal,
% i.e. an FFT-based filter that leaves the signal phase untouched.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs

align_filtered= data;

% Number of samples to filter, and the next power of two for the FFT.
margin= SEARCHBUFFER* Downsample;
n= data_Nsamples- 2* margin+ DATAPADDING_MSECS* (Fs/ 1000);
nfft= 2^ nextpow2( n);

% Filter gain at 1 kHz, used as the 0 dB reference.
gain1kHz= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), 1000);

% Move the active samples into a zero-padded buffer and transform.
buf= zeros( 1, nfft);
buf( 1: n)= data( margin+ 1: margin+ n);
spec= fft( buf, nfft);

% Interpolate the desired magnitude (dB) onto the non-negative bins.
binHz= (0: nfft/ 2)* (Fs/ nfft);
gainDb= interp1( align_filter_dB( :, 1), align_filter_dB( :, 2), binHz)- gain1kHz;
gain= 10.^ (gainDb/ 20);

% Mirror to a real, symmetric (zero-phase) response and apply it.
gain= [gain, fliplr( gain( 2: nfft/ 2))];
filtered= ifft( spec.* gain, nfft);

align_filtered( margin+ 1: margin+ n)= filtered( 1: n);
-------------------------------------------------------------------------------- /PESQ/apply_filters.m: --------------------------------------------------------------------------------
function mod_data= apply_filters( data, Nsamples)
% IIR input filtering: applies the filter described by the global
% second-order sections InIIR_Hsos to `data`.
%
% Each row of InIIR_Hsos holds [b0 b1 b2 a1 a2] for one section (a0 == 1).
% `Nsamples` is accepted for interface compatibility but is not used here.
global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs

% Assemble the [b0 b1 b2 a0 a1 a2] section matrix expected by dfilt.df2sos.
sosMatrix= [InIIR_Hsos( :, 1: 3), ones( InIIR_Nsos, 1), InIIR_Hsos( :, 4: 5)];

% Direct-form II second-order-section filter.
iirdf2= dfilt.df2sos( sosMatrix);
mod_data= filter( iirdf2, data);
-------------------------------------------------------------------------------- /PESQ/comp_cep.asv:
--------------------------------------------------------------------------------
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used in [1].
%
% Usage:  CEP = comp_cep(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   CEP           - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call:  CEP = comp_cep('sp04.wav','enhanced.wav')
%
% References:
%   [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%       evaluation for low bit-rate speech coding systems. IEEE J. Select.
%       Areas in Comm., 6(2), 262-273.
%
% Authors: Bryan L. Pellom and John H. L. Hansen (July 1998)
% Modified by: Philipos C. Loizou (Oct 2006)
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

alpha=0.95;   % fraction of the (sorted) frame distances kept in the mean

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim both signals to a common length; eps guards the LPC analysis
% against all-zero (digital silence) frames.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

IS_dist= cepstrum( data1, data2,Srate1);

% Average after discarding the worst 5% of frames.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

% BUG FIX: the result used to be assigned to `is_mean`, leaving the declared
% output `cep_mean` unset, so MATLAB raised "Output argument ... not assigned"
% whenever the caller read the result.  (comp_cep.m already has this fix.)
cep_mean= mean( IS( 1: IS_len));


function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% Per-frame cepstrum distance between two equal-length signals.
% Returns one clamped distance value per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Both Speech Files must be same length.');
    return
end

% Analysis set-up: 30 ms Hanning-windowed frames with 75% overlap.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;    % LPC analysis order
else
    P=16;      % higher order for wider-band signals
end
C=10*sqrt(2)/log(10);   % cepstrum-distance scale factor

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
start = 1;                                                 % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); % Hanning window

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) LPC parameters of each frame, converted to cepstra.
    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    C_clean=lpc2cep(A_clean);
    C_processed=lpc2cep(A_processed);

    % (3) Cepstrum distance, clamped to 10.
    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end


function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% Autocorrelation method of LPC analysis (Levinson-Durbin recursion).

% (1) Autocorrelation lags 0..model_order.
winlength = max(size(speech_frame));
for k=1:model_order+1
    R(k) = sum(speech_frame(1:winlength-k+1) ...
        .*speech_frame(k:winlength));
end

% (2) Levinson-Durbin recursion.
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
    a_past(1:i-1) = a(1:i-1);
    sum_term = sum(a_past(1:i-1).*R(i:-1:2));
    rcoeff(i)=(R(i+1) - sum_term) / E(i);
    a(i)=rcoeff(i);
    a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
    E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end

acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

%----------------------------------------------
function [cep]=lpc2cep(a)
% Converts LPC prediction coefficients to cepstrum coefficients
% via the standard recursion.
%
% Author: Philipos C. Loizou

M=length(a);
cep=zeros(1,M-1);

cep(1)=-a(2);

for k=2:M-1
    ix=1:k-1;
    vec1=cep(ix).*a(k-1+1:-1:2).*ix;
    cep(k)=-(a(k+1)+sum(vec1)/k);
end

-------------------------------------------------------------------------------- /PESQ/comp_cep.m: --------------------------------------------------------------------------------
function cep_mean= comp_cep(cleanFile, enhdFile);
% ----------------------------------------------------------------------
% Cepstrum Distance Objective Speech Quality Measure
%
% This function implements the cepstrum distance measure used in [1].
%
% Usage:  CEP = comp_cep(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   CEP           - computed cepstrum distance measure
%
% Note that the cepstrum measure is limited in the range [0, 10].
%
% Example call:  CEP = comp_cep('sp04.wav','enhanced.wav')
%
% References:
%   [1] Kitawaki, N., Nagabuchi, H., and Itoh, K. (1988). Objective quality
%       evaluation for low bit-rate speech coding systems. IEEE J. Select.
%       Areas in Comm., 6(2), 262-273.
%
% Author: Philipos C. Loizou
% (LPC routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
% Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end

alpha=0.95;   % fraction of the (sorted) frame distances kept in the mean

[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhdFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim both signals to a common length; eps guards the LPC analysis
% against all-zero (digital silence) frames.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

IS_dist= cepstrum( data1, data2,Srate1);

% Average after discarding the worst 5% of frames.
IS_len= round( length( IS_dist)* alpha);
IS= sort( IS_dist);

cep_mean= mean( IS( 1: IS_len));


function distortion = cepstrum(clean_speech, processed_speech,sample_rate)
% Per-frame cepstrum distance between two equal-length signals.
% Returns one clamped distance value per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Both Speech Files must be same length.');
    return
end

% Analysis set-up: 30 ms Hanning-windowed frames with 75% overlap.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
if sample_rate<10000
    P = 10;    % LPC analysis order
else
    P=16;      % higher order for wider-band signals
end
C=10*sqrt(2)/log(10);   % cepstrum-distance scale factor

num_frames = clean_length/skiprate-(winlength/skiprate);   % number of frames
start = 1;                                                 % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); % Hanning window

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) LPC parameters of each frame, converted to cepstra.
    [R_clean, Ref_clean, A_clean] = ...
        lpcoeff(clean_frame, P);
    [R_processed, Ref_processed, A_processed] = ...
        lpcoeff(processed_frame, P);

    C_clean=lpc2cep(A_clean);
    C_processed=lpc2cep(A_processed);

    % (3) Cepstrum distance, clamped to 10.
    distortion(frame_count) = min(10,C*norm(C_clean-C_processed,2));

    start = start + skiprate;

end


function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order)
% Autocorrelation method of LPC analysis (Levinson-Durbin recursion).

% (1) Autocorrelation lags 0..model_order.
winlength = max(size(speech_frame));
for k=1:model_order+1
    R(k) = sum(speech_frame(1:winlength-k+1) ...
        .*speech_frame(k:winlength));
end

% (2) Levinson-Durbin recursion.
a = ones(1,model_order);
E(1)=R(1);
for i=1:model_order
    a_past(1:i-1) = a(1:i-1);
    sum_term = sum(a_past(1:i-1).*R(i:-1:2));
    rcoeff(i)=(R(i+1) - sum_term) / E(i);
    a(i)=rcoeff(i);
    a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1);
    E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i);
end

acorr = R;
refcoeff = rcoeff;
lpparams = [1 -a];

%----------------------------------------------
function [cep]=lpc2cep(a)
% Converts LPC prediction coefficients to cepstrum coefficients
% via the standard recursion.
%
% Author: Philipos C. Loizou

M=length(a);
cep=zeros(1,M-1);

cep(1)=-a(2);

for k=2:M-1
    ix=1:k-1;
    vec1=cep(ix).*a(k-1+1:-1:2).*ix;
    cep(k)=-(a(k+1)+sum(vec1)/k);
end

-------------------------------------------------------------------------------- /PESQ/comp_fwseg.asv: --------------------------------------------------------------------------------
function wss_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency-weighted segmental SNR (fwSNRseg) objective measure.
%
% NOTE(review): this is a stale editor autosave (.asv) of comp_fwseg.m;
% its original help text still described comp_cep.  See comp_fwseg.m for
% the corrected documentation and reference.
%
% Usage:  fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
% Loizou
% $Revision: 0.0 $  $Date: 10/09/2006 $
% ----------------------------------------------------------------------

if nargin~=2
    % NOTE(review): stale autosave - these messages still name comp_cep
    % (kept byte-for-byte; the .m version is the maintained copy).
    fprintf('USAGE: CEP=comp_cep(cleanFile.wav, enhancedFile.wav)\n');
    fprintf('For more help, type: help comp_cep\n\n');
    return;
end


[data1, Srate1, Nbits1]= wavread(cleanFile);
[data2, Srate2, Nbits2]= wavread(enhancedFile);
if ( Srate1~= Srate2) | ( Nbits1~= Nbits2)
    error( 'The two files do not match!\n');
end

% Trim to a common length; eps avoids division by zero on silence.
len= min( length( data1), length( data2));
data1= data1( 1: len)+eps;
data2= data2( 1: len)+eps;

wss_dist_vec= fwseg( data1, data2,Srate1);
wss_dist=mean(wss_dist_vec);


% ----------------------------------------------------------------------

function distortion = fwseg(clean_speech, processed_speech,sample_rate)
% Frame-by-frame frequency-weighted SNR over a critical-band filterbank.
% Returns one clamped fwSNR value (dB) per 30 ms analysis frame.

% Both signals must already have the same number of samples.
clean_length = length(clean_speech);
processed_length = length(processed_speech);

if (clean_length ~= processed_length)
    disp('Error: Files must have same length.');
    return
end

% Analysis parameters.
winlength = round(30*sample_rate/1000);  % window length in samples
skiprate = floor(winlength/4);           % window skip in samples
max_freq = sample_rate/2;                % maximum bandwidth
num_crit = 25;                           % number of critical bands
USE_25=1;                                % 1: 25 bands, 0: 13 lumped bands
n_fft = 2^nextpow2(2*winlength);
n_fftby2 = n_fft/2;                      % FFT size/2
gamma=0.2;                               % power exponent of the band weights

% Critical band filter definitions (center frequency and bandwidth in Hz).
cent_freq(1) = 50.0000; bandwidth(1) = 70.0000;
cent_freq(2) = 120.000; bandwidth(2) = 70.0000;
cent_freq(3) = 190.000; bandwidth(3) = 70.0000;
cent_freq(4) = 260.000; bandwidth(4) = 70.0000;
cent_freq(5) = 330.000; bandwidth(5) = 70.0000;
cent_freq(6) = 400.000; bandwidth(6) = 70.0000;
cent_freq(7) = 470.000; bandwidth(7) = 70.0000;
cent_freq(8) = 540.000; bandwidth(8) = 77.3724;
cent_freq(9) = 617.372; bandwidth(9) = 86.0056;
cent_freq(10) = 703.378; bandwidth(10) = 95.3398;
cent_freq(11) = 798.717; bandwidth(11) = 105.411;
cent_freq(12) = 904.128; bandwidth(12) = 116.256;
cent_freq(13) = 1020.38; bandwidth(13) = 127.914;
cent_freq(14) = 1148.30; bandwidth(14) = 140.423;
cent_freq(15) = 1288.72; bandwidth(15) = 153.823;
cent_freq(16) = 1442.54; bandwidth(16) = 168.154;
cent_freq(17) = 1610.70; bandwidth(17) = 183.457;
cent_freq(18) = 1794.16; bandwidth(18) = 199.776;
cent_freq(19) = 1993.93; bandwidth(19) = 217.153;
cent_freq(20) = 2211.08; bandwidth(20) = 235.631;
cent_freq(21) = 2446.71; bandwidth(21) = 255.255;
cent_freq(22) = 2701.97; bandwidth(22) = 276.072;
cent_freq(23) = 2978.04; bandwidth(23) = 298.126;
cent_freq(24) = 3276.17; bandwidth(24) = 321.465;
cent_freq(25) = 3597.63; bandwidth(25) = 346.136;

W=[ % articulation index weights
0.003
0.003
0.003
0.007
0.010
0.016
0.016
0.017
0.017
0.022
0.027
0.028
0.030
0.032
0.034
0.035
0.037
0.036
0.036
0.033
0.030
0.029
0.027
0.026
0.026];

W=W';

if USE_25==0 % use 13 bands: lump adjacent filters together
    k=2;
    cent_freq2(1)=cent_freq(1);
    bandwidth2(1)=bandwidth(1)+bandwidth(2);
    W2(1)=W(1);
    for i=2:13
        cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1);
        bandwidth2(i)=bandwidth(k)+bandwidth(k+1);
        W2(i)=0.5*(W(k)+W(k+1));
        k=k+2;
    end

    sumW=sum(W2);
    bw_min = bandwidth2 (1); % minimum critical bandwidth
else
    sumW=sum(W);
    bw_min=bandwidth(1);
end

% Set up the Gaussian-shaped critical band filters.  The filter weights are
% normalised per band; responses below the -30 dB point are zeroed.
min_factor = exp (-30.0 / (2.0 * 2.303));   % -30 dB point of filter
if USE_25==0

    num_crit=length(cent_freq2);

    for i = 1:num_crit
        f0 = (cent_freq2 (i) / max_freq) * (n_fftby2);
        all_f0(i) = floor(f0);
        bw = (bandwidth2 (i) / max_freq) * (n_fftby2);
        norm_factor = log(bw_min) - log(bandwidth2(i));
        j = 0:1:n_fftby2-1;
        crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
        crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
    end

else
    for i = 1:num_crit
        f0 = (cent_freq (i) / max_freq) * (n_fftby2);
        all_f0(i) = floor(f0);
        bw = (bandwidth (i) / max_freq) * (n_fftby2);
        norm_factor = log(bw_min) - log(bandwidth(i));
        j = 0:1:n_fftby2-1;
        crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor);
        crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor);
    end
end


num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames
start = 1;                                               % starting sample
window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1)));

for frame_count = 1:num_frames

    % (1) Windowed frames of the reference and test speech.
    clean_frame = clean_speech(start:start+winlength-1);
    processed_frame = processed_speech(start:start+winlength-1);
    clean_frame = clean_frame.*window;
    processed_frame = processed_frame.*window;

    % (2) Magnitude spectra, normalised to unit area.
    clean_spec = abs(fft(clean_frame,n_fft));
    processed_spec = abs(fft(processed_frame,n_fft));

    clean_spec=clean_spec/sum(clean_spec(1:n_fftby2));
    processed_spec=processed_spec/sum(processed_spec(1:n_fftby2));

    % (3) Filterbank energies, per-band SNR and clean-spectrum weighting.
    clean_energy=zeros(1,num_crit);
    processed_energy=zeros(1,num_crit);
    error_energy=zeros(1,num_crit);
    W_freq=zeros(1,num_crit);

    for i = 1:num_crit
        clean_energy(i) = sum(clean_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');
        processed_energy(i) = sum(processed_spec(1:n_fftby2) ...
            .*crit_filter(i,:)');

        error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps);
        W_freq(i)=(clean_energy(i))^gamma;

    end
    SNRlog=10*log10((clean_energy.^2)./error_energy);

    fwSNR=sum(W_freq.*SNRlog)/sum(W_freq);

    distortion(frame_count)=min(max(fwSNR,-10),35);   % clamp to [-10, 35] dB

    start = start + skiprate;

end

-------------------------------------------------------------------------------- /PESQ/comp_fwseg.m: --------------------------------------------------------------------------------
function fwseg_dist= comp_fwseg(cleanFile, enhancedFile);
% ----------------------------------------------------------------------
% Frequency weighted SNRseg Objective Speech Quality Measure
%
% This function implements the frequency-weighted SNRseg measure [1]
% using a different weighting function, the clean spectrum.
%
% Usage:  fwSNRseg = comp_fwseg(cleanFile.wav, enhancedFile.wav)
%
%   cleanFile.wav - clean input file in .wav format
%   enhancedFile  - enhanced output file in .wav format
%   fwSNRseg      - computed frequency-weighted SNRseg in dB
%
% Note that large numbers of fwSNRseg are better.
%
% Example call:  fwSNRseg = comp_fwseg('sp04.wav','enhanced.wav')
%
% References:
%   [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978).
%       A study of complexity and quality of speech waveform coders. Proc.
%       IEEE Int. Conf. Acoust., Speech, Signal Processing, 586-590.
%
% Author: Philipos C. Loizou
% (critical-band filtering routines were written by Bryan Pellom & John Hansen)
%
% Copyright (c) 2006 by Philipos C.
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_vec= fwseg( data1, data2,Srate1); 50 | fwseg_dist=mean(wss_dist_vec); 51 | 52 | 53 | % ---------------------------------------------------------------------- 54 | 55 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 56 | 57 | 58 | % ---------------------------------------------------------------------- 59 | % Check the length of the clean and processed speech. Must be the same. 
60 | % ---------------------------------------------------------------------- 61 | 62 | clean_length = length(clean_speech); 63 | processed_length = length(processed_speech); 64 | 65 | if (clean_length ~= processed_length) 66 | disp('Error: Files must have same length.'); 67 | return 68 | end 69 | 70 | 71 | 72 | % ---------------------------------------------------------------------- 73 | % Global Variables 74 | % ---------------------------------------------------------------------- 75 | 76 | 77 | winlength = round(30*sample_rate/1000); % window length in samples 78 | skiprate = floor(winlength/4); % window skip in samples 79 | max_freq = sample_rate/2; % maximum bandwidth 80 | num_crit = 25; % number of critical bands 81 | USE_25=1; 82 | n_fft = 2^nextpow2(2*winlength); 83 | n_fftby2 = n_fft/2; % FFT size/2 84 | gamma=0.2; % power exponent 85 | 86 | % ---------------------------------------------------------------------- 87 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 88 | % ---------------------------------------------------------------------- 89 | 90 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 91 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 92 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 93 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 94 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 95 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 96 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 97 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 98 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 99 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 100 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 101 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 102 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 103 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 104 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 105 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 106 | cent_freq(17) = 1610.70; 
bandwidth(17) = 183.457; 107 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 108 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 109 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 110 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 111 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 112 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 113 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 114 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 115 | 116 | W=[ % articulation index weights 117 | 0.003 118 | 0.003 119 | 0.003 120 | 0.007 121 | 0.010 122 | 0.016 123 | 0.016 124 | 0.017 125 | 0.017 126 | 0.022 127 | 0.027 128 | 0.028 129 | 0.030 130 | 0.032 131 | 0.034 132 | 0.035 133 | 0.037 134 | 0.036 135 | 0.036 136 | 0.033 137 | 0.030 138 | 0.029 139 | 0.027 140 | 0.026 141 | 0.026]; 142 | 143 | W=W'; 144 | 145 | if USE_25==0 % use 13 bands 146 | % ----- lump adjacent filters together ---------------- 147 | k=2; 148 | cent_freq2(1)=cent_freq(1); 149 | bandwidth2(1)=bandwidth(1)+bandwidth(2); 150 | W2(1)=W(1); 151 | for i=2:13 152 | cent_freq2(i)=cent_freq2(i-1)+bandwidth2(i-1); 153 | bandwidth2(i)=bandwidth(k)+bandwidth(k+1); 154 | W2(i)=0.5*(W(k)+W(k+1)); 155 | k=k+2; 156 | end 157 | 158 | sumW=sum(W2); 159 | bw_min = bandwidth2 (1); % minimum critical bandwidth 160 | else 161 | sumW=sum(W); 162 | bw_min=bandwidth(1); 163 | end 164 | 165 | 166 | % ---------------------------------------------------------------------- 167 | % Set up the critical band filters. Note here that Gaussianly shaped 168 | % filters are used. Also, the sum of the filter weights are equivalent 169 | % for each critical band filter. Filter less than -30 dB and set to 170 | % zero. 
171 | % ---------------------------------------------------------------------- 172 | 173 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 174 | if USE_25==0 175 | 176 | num_crit=length(cent_freq2); 177 | 178 | for i = 1:num_crit 179 | f0 = (cent_freq2 (i) / max_freq) * (n_fftby2); 180 | all_f0(i) = floor(f0); 181 | bw = (bandwidth2 (i) / max_freq) * (n_fftby2); 182 | norm_factor = log(bw_min) - log(bandwidth2(i)); 183 | j = 0:1:n_fftby2-1; 184 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 185 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 186 | end 187 | 188 | else 189 | for i = 1:num_crit 190 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 191 | all_f0(i) = floor(f0); 192 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 193 | norm_factor = log(bw_min) - log(bandwidth(i)); 194 | j = 0:1:n_fftby2-1; 195 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 196 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 197 | end 198 | end 199 | 200 | 201 | 202 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 203 | start = 1; % starting sample 204 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 205 | 206 | for frame_count = 1:num_frames 207 | 208 | % ---------------------------------------------------------- 209 | % (1) Get the Frames for the test and reference speech. 210 | % Multiply by Hanning Window. 
211 | % ---------------------------------------------------------- 212 | 213 | clean_frame = clean_speech(start:start+winlength-1); 214 | processed_frame = processed_speech(start:start+winlength-1); 215 | clean_frame = clean_frame.*window; 216 | processed_frame = processed_frame.*window; 217 | 218 | % ---------------------------------------------------------- 219 | % (2) Compute the magnitude Spectrum of Clean and Processed 220 | % ---------------------------------------------------------- 221 | 222 | 223 | clean_spec = abs(fft(clean_frame,n_fft)); 224 | processed_spec = abs(fft(processed_frame,n_fft)); 225 | 226 | % normalize spectra to have area of one 227 | % 228 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 229 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 230 | 231 | % ---------------------------------------------------------- 232 | % (3) Compute Filterbank Output Energies 233 | % ---------------------------------------------------------- 234 | 235 | clean_energy=zeros(1,num_crit); 236 | processed_energy=zeros(1,num_crit); 237 | error_energy=zeros(1,num_crit); 238 | W_freq=zeros(1,num_crit); 239 | 240 | for i = 1:num_crit 241 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 242 | .*crit_filter(i,:)'); 243 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
244 | .*crit_filter(i,:)'); 245 | 246 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 247 | W_freq(i)=(clean_energy(i))^gamma; 248 | 249 | end 250 | SNRlog=10*log10((clean_energy.^2)./error_energy); 251 | 252 | fwSNR=sum(W_freq.*SNRlog)/sum(W_freq); 253 | 254 | distortion(frame_count)=min(max(fwSNR,-10),35); 255 | 256 | start = start + skiprate; 257 | 258 | end 259 | 260 | -------------------------------------------------------------------------------- /PESQ/comp_fwseg_variant.asv: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % 8 | % 9 | % Usage: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % fwSNRseg - computed frequency weighted SNRseg in dB 14 | % 15 | % Note that large numbers of fwSNRseg are better. 16 | % 17 | % Example call: fwSNRseg =comp_fwseg('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % [1] Tribolet, J., Noll, P., McDermott, B., and Crochiere, R. E. (1978). 22 | % A study of complexity and quality of speech waveform coders. Proc. 23 | % IEEE Int. Conf. Acoust. , Speech, Signal Processing, 586-590. 24 | % 25 | % Author: Philipos C. Loizou 26 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 27 | % 28 | % Copyright (c) 2006 by Philipos C. 
Loizou 29 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 30 | % ---------------------------------------------------------------------- 31 | 32 | if nargin~=2 33 | fprintf('USAGE: fwSNRseg=comp_fwseg(cleanFile.wav, enhancedFile.wav)\n'); 34 | fprintf('For more help, type: help comp_fwseg\n\n'); 35 | return; 36 | end 37 | 38 | 39 | [data1, Srate1, Nbits1]= wavread(cleanFile); 40 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 41 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 42 | error( 'The two files do not match!\n'); 43 | end 44 | 45 | len= min( length( data1), length( data2)); 46 | data1= data1( 1: len)+eps; 47 | data2= data2( 1: len)+eps; 48 | 49 | wss_dist_matrix= fwseg( data1, data2,Srate1); 50 | wss_dist=mean(wss_dist_matrix); 51 | 52 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 53 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 54 | -0.002,0.017,-0.03,0.073,0.043]; 55 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 56 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 57 | 0.011,-0.002,-0.021,0.043,0.031]; 58 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 59 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 60 | -0.028,0.019,0.005]; 61 | 62 | SIG=0.567+sum(b_sig.*wss_dist); 63 | BAK=1.013+sum(b_bak.*wss_dist); 64 | OVL=0.446+sum(b_ovl.*wss_dist); 65 | 66 | 67 | % ---------------------------------------------------------------------- 68 | 69 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 70 | 71 | 72 | % ---------------------------------------------------------------------- 73 | % Check the length of the clean and processed speech. Must be the same. 
74 | % ---------------------------------------------------------------------- 75 | 76 | clean_length = length(clean_speech); 77 | processed_length = length(processed_speech); 78 | 79 | if (clean_length ~= processed_length) 80 | disp('Error: Files must have same length.'); 81 | return 82 | end 83 | 84 | 85 | 86 | % ---------------------------------------------------------------------- 87 | % Global Variables 88 | % ---------------------------------------------------------------------- 89 | 90 | 91 | winlength = round(30*sample_rate/1000); % window length in samples 92 | skiprate = floor(winlength/4); % window skip in samples 93 | max_freq = sample_rate/2; % maximum bandwidth 94 | num_crit = 25; % number of critical bands 95 | 96 | n_fft = 2^nextpow2(2*winlength); 97 | n_fftby2 = n_fft/2; % FFT size/2 98 | 99 | % ---------------------------------------------------------------------- 100 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 101 | % ---------------------------------------------------------------------- 102 | 103 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 104 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 105 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 106 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 107 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 108 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 109 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 110 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 111 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 112 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 113 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 114 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 115 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 116 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 117 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 118 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 119 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 120 | 
cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 121 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 122 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 123 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 124 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 125 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 126 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 127 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 128 | 129 | 130 | bw_min = bandwidth (1); % minimum critical bandwidth 131 | 132 | 133 | % ---------------------------------------------------------------------- 134 | % Set up the critical band filters. Note here that Gaussianly shaped 135 | % filters are used. Also, the sum of the filter weights are equivalent 136 | % for each critical band filter. Filter less than -30 dB and set to 137 | % zero. 138 | % ---------------------------------------------------------------------- 139 | 140 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 141 | 142 | for i = 1:num_crit 143 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 144 | all_f0(i) = floor(f0); 145 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 146 | norm_factor = log(bw_min) - log(bandwidth(i)); 147 | j = 0:1:n_fftby2-1; 148 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 149 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 150 | end 151 | 152 | % ---------------------------------------------------------------------- 153 | % For each frame of input speech, calculate the Weighted Spectral 154 | % Slope Measure 155 | % ---------------------------------------------------------------------- 156 | 157 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 158 | start = 1; % starting sample 159 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 160 | 161 | distortion=zeros(num_frames,num_crit); 162 | for frame_count = 1:num_frames 163 | 164 | % 
---------------------------------------------------------- 165 | % (1) Get the Frames for the test and reference speech. 166 | % Multiply by Hanning Window. 167 | % ---------------------------------------------------------- 168 | 169 | clean_frame = clean_speech(start:start+winlength-1); 170 | processed_frame = processed_speech(start:start+winlength-1); 171 | clean_frame = clean_frame.*window; 172 | processed_frame = processed_frame.*window; 173 | 174 | % ---------------------------------------------------------- 175 | % (2) Compute the magnitude Spectrum of Clean and Processed 176 | % ---------------------------------------------------------- 177 | 178 | 179 | clean_spec = abs(fft(clean_frame,n_fft)); 180 | processed_spec = abs(fft(processed_frame,n_fft)); 181 | 182 | % normalize so that spectra have unit area ---- 183 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 184 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 185 | 186 | % ---------------------------------------------------------- 187 | % (3) Compute Filterbank Output Energies (in dB scale) 188 | % ---------------------------------------------------------- 189 | 190 | clean_energy=zeros(1,num_crit); 191 | processed_energy=zeros(1,num_crit); 192 | error_energy=zeros(1,num_crit); 193 | 194 | for i = 1:num_crit 195 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 196 | .*crit_filter(i,:)'); 197 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
198 | .*crit_filter(i,:)'); 199 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 200 | end 201 | 202 | 203 | SNRlog=10*log10((clean_energy.^2)./error_energy); 204 | 205 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 206 | 207 | start = start + skiprate; 208 | 209 | end 210 | 211 | -------------------------------------------------------------------------------- /PESQ/comp_fwseg_variant.m: -------------------------------------------------------------------------------- 1 | function [SIG,BAK,OVL]= comp_fwseg_variant(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % Frequency-variant fwSNRseg Objective Speech Quality Measure 5 | % 6 | % This function implements the frequency-variant fwSNRseg measure [1] 7 | % (see also Chap. 10, Eq. 10.24) 8 | % 9 | % 10 | % Usage: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % sig - predicted rating [1-5] of speech distortion 15 | % bak - predicted rating [1-5] of noise distortion 16 | % ovl - predicted rating [1-5] of overall quality 17 | % 18 | % 19 | % Example call: [s,b,o] =comp_fwseg_variant('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % Author: Philipos C. Loizou 29 | % (critical-band filtering routines were written by Bryan Pellom & John Hansen) 30 | % 31 | % Copyright (c) 2006 by Philipos C. 
Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | if nargin~=2 36 | fprintf('USAGE: [sig,bak,ovl]=comp_fwseg_variant(cleanFile.wav, enhancedFile.wav)\n'); 37 | fprintf('For more help, type: help comp_fwseg_variant\n\n'); 38 | return; 39 | end 40 | 41 | 42 | [data1, Srate1, Nbits1]= wavread(cleanFile); 43 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 44 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 45 | error( 'The two files do not match!\n'); 46 | end 47 | 48 | len= min( length( data1), length( data2)); 49 | data1= data1( 1: len)+eps; 50 | data2= data2( 1: len)+eps; 51 | 52 | wss_dist_matrix= fwseg( data1, data2,Srate1); 53 | wss_dist=mean(wss_dist_matrix); 54 | 55 | % initialize coefficients obtained from multiple linear 56 | % regression analysis 57 | % 58 | b_sig=[0.021,-0.028,0.088,-0.031,0.048,-0.049,0.065,0.009,0.011,0.033,... 59 | -0.040,-0.002,0.041,-0.007,0.033,0.018,-0.007,0.044,-0.001,0.021,... 60 | -0.002,0.017,-0.03,0.073,0.043]; 61 | b_ovl=[-0.003,-0.026,0.066,-0.036,0.038,-0.023,0.037,0.022,0.014,0.009,... 62 | -0.03,0.004,0.044,-0.005,0.017,0.018,-0.001,0.051,0.009,0.011,... 63 | 0.011,-0.002,-0.021,0.043,0.031]; 64 | b_bak=[-0.03,-0.022,0.03,-0.048,0.034,0.002,0.006,0.037,0.017,-0.016,-0.008,... 65 | 0.019,0.024,-0.002,0.01,0.03,-0.018,0.046,0.022,0.005,0.03,-0.028,... 
66 | -0.028,0.019,0.005]; 67 | 68 | SIG=0.567+sum(b_sig.*wss_dist); 69 | SIG=max(1,SIG); SIG=min(5, SIG); % limit values to [1, 5] 70 | 71 | BAK=1.013+sum(b_bak.*wss_dist); 72 | BAK=max(1,BAK); BAK=min(5, BAK); % limit values to [1, 5] 73 | 74 | OVL=0.446+sum(b_ovl.*wss_dist); 75 | OVL=max(1,OVL); OVL=min(5, OVL); % limit values to [1, 5] 76 | 77 | 78 | % ---------------------------------------------------------------------- 79 | 80 | function distortion = fwseg(clean_speech, processed_speech,sample_rate) 81 | 82 | 83 | % ---------------------------------------------------------------------- 84 | % Check the length of the clean and processed speech. Must be the same. 85 | % ---------------------------------------------------------------------- 86 | 87 | clean_length = length(clean_speech); 88 | processed_length = length(processed_speech); 89 | 90 | if (clean_length ~= processed_length) 91 | disp('Error: Files must have same length.'); 92 | return 93 | end 94 | 95 | 96 | 97 | % ---------------------------------------------------------------------- 98 | % Global Variables 99 | % ---------------------------------------------------------------------- 100 | 101 | 102 | winlength = round(30*sample_rate/1000); % window length in samples 103 | skiprate = floor(winlength/4); % window skip in samples 104 | max_freq = sample_rate/2; % maximum bandwidth 105 | num_crit = 25; % number of critical bands 106 | 107 | n_fft = 2^nextpow2(2*winlength); 108 | n_fftby2 = n_fft/2; % FFT size/2 109 | 110 | % ---------------------------------------------------------------------- 111 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 112 | % ---------------------------------------------------------------------- 113 | 114 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 115 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 116 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 117 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 118 | cent_freq(5) = 330.000; 
bandwidth(5) = 70.0000; 119 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 120 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 121 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 122 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 123 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 124 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 125 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 126 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 127 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 128 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 129 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 130 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 131 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 132 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 133 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 134 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 135 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 136 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 137 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 138 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 139 | 140 | 141 | bw_min = bandwidth (1); % minimum critical bandwidth 142 | 143 | 144 | % ---------------------------------------------------------------------- 145 | % Set up the critical band filters. Note here that Gaussianly shaped 146 | % filters are used. Also, the sum of the filter weights are equivalent 147 | % for each critical band filter. Filter less than -30 dB and set to 148 | % zero. 
149 | % ---------------------------------------------------------------------- 150 | 151 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 152 | 153 | for i = 1:num_crit 154 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 155 | all_f0(i) = floor(f0); 156 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 157 | norm_factor = log(bw_min) - log(bandwidth(i)); 158 | j = 0:1:n_fftby2-1; 159 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 160 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 161 | end 162 | 163 | % ---------------------------------------------------------------------- 164 | % For each frame of input speech, calculate the Weighted Spectral 165 | % Slope Measure 166 | % ---------------------------------------------------------------------- 167 | 168 | num_frames = floor(clean_length/skiprate-(winlength/skiprate)); % number of frames 169 | start = 1; % starting sample 170 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 171 | 172 | distortion=zeros(num_frames,num_crit); 173 | for frame_count = 1:num_frames 174 | 175 | % ---------------------------------------------------------- 176 | % (1) Get the Frames for the test and reference speech. 177 | % Multiply by Hanning Window. 
178 | % ---------------------------------------------------------- 179 | 180 | clean_frame = clean_speech(start:start+winlength-1); 181 | processed_frame = processed_speech(start:start+winlength-1); 182 | clean_frame = clean_frame.*window; 183 | processed_frame = processed_frame.*window; 184 | 185 | % ---------------------------------------------------------- 186 | % (2) Compute the magnitude Spectrum of Clean and Processed 187 | % ---------------------------------------------------------- 188 | 189 | 190 | clean_spec = abs(fft(clean_frame,n_fft)); 191 | processed_spec = abs(fft(processed_frame,n_fft)); 192 | 193 | % normalize so that spectra have unit area ---- 194 | clean_spec=clean_spec/sum(clean_spec(1:n_fftby2)); 195 | processed_spec=processed_spec/sum(processed_spec(1:n_fftby2)); 196 | 197 | % ---------------------------------------------------------- 198 | % (3) Compute Filterbank Output Energies (in dB scale) 199 | % ---------------------------------------------------------- 200 | 201 | clean_energy=zeros(1,num_crit); 202 | processed_energy=zeros(1,num_crit); 203 | error_energy=zeros(1,num_crit); 204 | 205 | for i = 1:num_crit 206 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 207 | .*crit_filter(i,:)'); 208 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 
209 | .*crit_filter(i,:)'); 210 | error_energy(i)=max((clean_energy(i)-processed_energy(i))^2,eps); 211 | end 212 | 213 | 214 | SNRlog=10*log10((clean_energy.^2)./error_energy); 215 | 216 | distortion(frame_count,:)=min(max(SNRlog,-10),35); 217 | 218 | start = start + skiprate; 219 | 220 | end 221 | 222 | -------------------------------------------------------------------------------- /PESQ/comp_is.asv: -------------------------------------------------------------------------------- 1 | function is_mean= compIS(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Itakura-Saito (IS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Itakura-Saito distance measure 7 | % defined on page 50 of [1] (see Equation 2.26). See also 8 | % Equation 12 (page 1480) of [2]. 9 | % 10 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % llr - computed likelihood ratio 15 | % 16 | % Note that the IS measure is limited in the range [0, 100]. 17 | % 18 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 19 | % 20 | % 21 | % References: 22 | % 23 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 24 | % Objective Measures of Speech Quality. Prentice Hall 25 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 26 | % ISBN: 0-13-629056-6. 27 | % 28 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 29 | % Speech Coder Performance Evaluation", AT&T Bell 30 | % Laboratories Technical Journal, Vol. 63, No. 8, 31 | % October 1984, pp. 1477-1498. 
32 | % 33 | % ---------------------------------------------------------------------- 34 | 35 | 36 | alpha=0.95; 37 | 38 | [data1, Srate1, Nbits1]= wavread(cleanFile); 39 | [data2, Srate2, Nbits2]= wavread(enhdFile); 40 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 41 | error( 'The two files do not match!\n'); 42 | end 43 | 44 | len= min( length( data1), length( data2)); 45 | data1= data1( 1: len)+eps; 46 | data2= data2( 1: len)+eps; 47 | 48 | 49 | IS_dist= is( data1, data2,Srate1); 50 | 51 | IS_len= round( length( IS_dist)* alpha); 52 | IS= sort( IS_dist); 53 | 54 | is_mean= mean( IS( 1: IS_len)); 55 | 56 | 57 | 58 | function distortion = is(clean_speech, processed_speech,sample_rate) 59 | 60 | 61 | % ---------------------------------------------------------------------- 62 | % Check the length of the clean and processed speech. Must be the same. 63 | % ---------------------------------------------------------------------- 64 | 65 | clean_length = length(clean_speech); 66 | processed_length = length(processed_speech); 67 | 68 | if (clean_length ~= processed_length) 69 | disp('Error: Both Speech Files must be same length.'); 70 | return 71 | end 72 | 73 | % ---------------------------------------------------------------------- 74 | % Scale both clean speech and processed speech to have same dynamic 75 | % range. 
Also remove DC component from each signal 76 | % ---------------------------------------------------------------------- 77 | 78 | %clean_speech = clean_speech - mean(clean_speech); 79 | %processed_speech = processed_speech - mean(processed_speech); 80 | 81 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 82 | 83 | % ---------------------------------------------------------------------- 84 | % Global Variables 85 | % ---------------------------------------------------------------------- 86 | 87 | %sample_rate = 8000; % default sample rate 88 | winlength = round(30*sample_rate/1000); %240; % window length in samples 89 | skiprate = floor(winlength/4); % window skip in samples 90 | if sample_rate<10000 91 | P = 10; % LPC Analysis Order 92 | else 93 | P=16; % this could vary depending on sampling frequency. 94 | end 95 | % ---------------------------------------------------------------------- 96 | % For each frame of input speech, calculate the Itakura-Saito Measure 97 | % ---------------------------------------------------------------------- 98 | 99 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 100 | start = 1; % starting sample 101 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 102 | 103 | for frame_count = 1:num_frames 104 | 105 | % ---------------------------------------------------------- 106 | % (1) Get the Frames for the test and reference speech. 107 | % Multiply by Hanning Window. 108 | % ---------------------------------------------------------- 109 | 110 | clean_frame = clean_speech(start:start+winlength-1); 111 | processed_frame = processed_speech(start:start+winlength-1); 112 | clean_frame = clean_frame.*window; 113 | processed_frame = processed_frame.*window; 114 | 115 | % ---------------------------------------------------------- 116 | % (2) Get the autocorrelation lags and LPC parameters used 117 | % to compute the IS measure. 
118 | % ---------------------------------------------------------- 119 | 120 | [R_clean, Ref_clean, A_clean] = ... 121 | lpcoeff(clean_frame, P); 122 | [R_processed, Ref_processed, A_processed] = ... 123 | lpcoeff(processed_frame, P); 124 | 125 | 126 | % ---------------------------------------------------------- 127 | % (3) Compute the IS measure 128 | % ---------------------------------------------------------- 129 | 130 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 131 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 132 | gain_clean = max(R_clean*A_clean',eps); % this is gain 133 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 134 | 135 | 136 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 137 | log(gain_processed/gain_clean)-1; 138 | 139 | distortion(frame_count) = min(ISvalue,100); 140 | start = start + skiprate; 141 | 142 | end 143 | 144 | 145 | % ---------------------------------------------------------------------- 146 | % 147 | % Linear Prediction Coefficient Computation 148 | % 149 | % Robust Speech Processing Laboratory 150 | % Duke University, USA 151 | % Copyright (c) 1998 152 | % All Rights Reserved. 153 | % 154 | % Description: 155 | % 156 | % This function returns the autocorrelation lags, reflection 157 | % coefficients, and linear prediction coefficients for a 158 | % given input frame of speech and a desired LP model order. 159 | % it uses the levinson-durbin algorithm as described on page 160 | % 300 (Fig. 5.8) of [1]. 161 | % 162 | % Input/Output: 163 | % 164 | % The input is a reference 8kHz sampled clean frame of speech 165 | % and a desired number of reflection coefficients. The function 166 | % returns the autocorrelation lags, reflection coefficients, 167 | % and linear prediction coefficients in an array. 168 | % 169 | % References: 170 | % 171 | % [1] J. Deller, J. Proakis, J. Hansen, Discrete-Time Processing 172 | % of Speech Signals. 
Macmillan series for Prentice-Hall, 173 | % New York, 1993. 174 | % 175 | % Authors: 176 | % 177 | % Bryan L. Pellom and John H. L. Hansen 178 | % Robust Speech Processing Laboratory, Duke University 179 | % Department of Electrical Engineeering 180 | % 181 | % Last Modified: 182 | % 183 | % July 22, 1998 184 | % 185 | % ---------------------------------------------------------------------- 186 | 187 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 188 | 189 | % ---------------------------------------------------------- 190 | % (1) Compute Autocorrelation Lags 191 | % ---------------------------------------------------------- 192 | 193 | winlength = max(size(speech_frame)); 194 | for k=1:model_order+1 195 | R(k) = sum(speech_frame(1:winlength-k+1) ... 196 | .*speech_frame(k:winlength)); 197 | end 198 | 199 | % ---------------------------------------------------------- 200 | % (2) Levinson-Durbin 201 | % ---------------------------------------------------------- 202 | 203 | a = ones(1,model_order); 204 | E(1)=R(1); 205 | for i=1:model_order 206 | a_past(1:i-1) = a(1:i-1); 207 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 208 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 209 | a(i)=rcoeff(i); 210 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 211 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 212 | end 213 | 214 | acorr = R; 215 | refcoeff = rcoeff; 216 | lpparams = [1 -a]; 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /PESQ/comp_is.m: -------------------------------------------------------------------------------- 1 | function is_mean= comp_is(cleanFile, enhdFile); 2 | % ---------------------------------------------------------------------- 3 | % Itakura-Saito (IS) Objective Speech Quality Measure 4 | % 5 | % This function implements the Itakura-Saito distance measure 6 | % defined on page 50 of [1] (see Equation 2.26). See also 7 | % Equation 12 (page 1480) of [2]. 
8 | % 9 | % Usage: IS=comp_is(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % IS - computed Itakura Saito measure 14 | % 15 | % Note that the IS measure is limited in the range [0, 100]. 16 | % 17 | % Example call: IS =comp_is('sp04.wav','enhanced.wav') 18 | % 19 | % 20 | % References: 21 | % 22 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 23 | % Objective Measures of Speech Quality. Prentice Hall 24 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 25 | % ISBN: 0-13-629056-6. 26 | % 27 | % [2] B.-H. Juang, "On Using the Itakura-Saito Measures for 28 | % Speech Coder Performance Evaluation", AT&T Bell 29 | % Laboratories Technical Journal, Vol. 63, No. 8, 30 | % October 1984, pp. 1477-1498. 31 | % 32 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 33 | % Modified by: Philipos C. Loizou (Oct 2006) - limited IS to be in [0,100] 34 | % 35 | % Copyright (c) 2006 by Philipos C. 
Loizou 36 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 37 | 38 | % ---------------------------------------------------------------------- 39 | 40 | if nargin~=2 41 | fprintf('USAGE: IS=comp_is(cleanFile.wav, enhancedFile.wav)\n'); 42 | fprintf('For more help, type: help comp_is\n\n'); 43 | return; 44 | end 45 | 46 | alpha=0.95; 47 | 48 | [data1, Srate1, Nbits1]= wavread(cleanFile); 49 | [data2, Srate2, Nbits2]= wavread(enhdFile); 50 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 51 | error( 'The two files do not match!\n'); 52 | end 53 | 54 | len= min( length( data1), length( data2)); 55 | data1= data1( 1: len)+eps; 56 | data2= data2( 1: len)+eps; 57 | 58 | 59 | IS_dist= is( data1, data2,Srate1); 60 | 61 | IS_len= round( length( IS_dist)* alpha); 62 | IS= sort( IS_dist); 63 | 64 | is_mean= mean( IS( 1: IS_len)); 65 | 66 | 67 | 68 | function distortion = is(clean_speech, processed_speech,sample_rate) 69 | 70 | 71 | % ---------------------------------------------------------------------- 72 | % Check the length of the clean and processed speech. Must be the same. 73 | % ---------------------------------------------------------------------- 74 | 75 | clean_length = length(clean_speech); 76 | processed_length = length(processed_speech); 77 | 78 | if (clean_length ~= processed_length) 79 | disp('Error: Both Speech Files must be same length.'); 80 | return 81 | end 82 | 83 | % ---------------------------------------------------------------------- 84 | % Scale both clean speech and processed speech to have same dynamic 85 | % range. 
Also remove DC component from each signal 86 | % ---------------------------------------------------------------------- 87 | 88 | %clean_speech = clean_speech - mean(clean_speech); 89 | %processed_speech = processed_speech - mean(processed_speech); 90 | 91 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 92 | 93 | % ---------------------------------------------------------------------- 94 | % Global Variables 95 | % ---------------------------------------------------------------------- 96 | 97 | %sample_rate = 8000; % default sample rate 98 | winlength = round(30*sample_rate/1000); %240; % window length in samples 99 | skiprate = floor(winlength/4); % window skip in samples 100 | if sample_rate<10000 101 | P = 10; % LPC Analysis Order 102 | else 103 | P=16; % this could vary depending on sampling frequency. 104 | end 105 | % ---------------------------------------------------------------------- 106 | % For each frame of input speech, calculate the Itakura-Saito Measure 107 | % ---------------------------------------------------------------------- 108 | 109 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 110 | start = 1; % starting sample 111 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 112 | 113 | for frame_count = 1:num_frames 114 | 115 | % ---------------------------------------------------------- 116 | % (1) Get the Frames for the test and reference speech. 117 | % Multiply by Hanning Window. 118 | % ---------------------------------------------------------- 119 | 120 | clean_frame = clean_speech(start:start+winlength-1); 121 | processed_frame = processed_speech(start:start+winlength-1); 122 | clean_frame = clean_frame.*window; 123 | processed_frame = processed_frame.*window; 124 | 125 | % ---------------------------------------------------------- 126 | % (2) Get the autocorrelation lags and LPC parameters used 127 | % to compute the IS measure. 
128 | % ---------------------------------------------------------- 129 | 130 | [R_clean, Ref_clean, A_clean] = ... 131 | lpcoeff(clean_frame, P); 132 | [R_processed, Ref_processed, A_processed] = ... 133 | lpcoeff(processed_frame, P); 134 | 135 | 136 | % ---------------------------------------------------------- 137 | % (3) Compute the IS measure 138 | % ---------------------------------------------------------- 139 | 140 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 141 | denominator = max(A_clean*toeplitz(R_clean)*A_clean',eps); 142 | gain_clean = max(R_clean*A_clean',eps); % this is gain 143 | gain_processed = max(R_processed*A_processed',eps); % squared (sigma^2) 144 | 145 | 146 | ISvalue=(gain_clean/gain_processed)*(numerator/denominator) + ... 147 | log(gain_processed/gain_clean)-1; 148 | 149 | distortion(frame_count) = min(ISvalue,100); 150 | start = start + skiprate; 151 | 152 | end 153 | 154 | 155 | 156 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 157 | 158 | % ---------------------------------------------------------- 159 | % (1) Compute Autocorrelation Lags 160 | % ---------------------------------------------------------- 161 | 162 | winlength = max(size(speech_frame)); 163 | for k=1:model_order+1 164 | R(k) = sum(speech_frame(1:winlength-k+1) ... 
165 | .*speech_frame(k:winlength)); 166 | end 167 | 168 | % ---------------------------------------------------------- 169 | % (2) Levinson-Durbin 170 | % ---------------------------------------------------------- 171 | 172 | a = ones(1,model_order); 173 | E(1)=R(1); 174 | for i=1:model_order 175 | a_past(1:i-1) = a(1:i-1); 176 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 177 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 178 | a(i)=rcoeff(i); 179 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 180 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 181 | end 182 | 183 | acorr = R; 184 | refcoeff = rcoeff; 185 | lpparams = [1 -a]; 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /PESQ/comp_llr.asv: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of [1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. 
Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 
87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 
116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /PESQ/comp_llr.m: -------------------------------------------------------------------------------- 1 | function llr_mean= comp_llr(cleanFile, enhancedFile); 2 | 3 | % ---------------------------------------------------------------------- 4 | % 5 | % Log Likelihood Ratio (LLR) Objective Speech Quality Measure 6 | % 7 | % 8 | % This function implements the Log Likelihood Ratio Measure 9 | % defined on page 48 of 
[1] (see Equation 2.18). 10 | % 11 | % Usage: llr=comp_llr(cleanFile.wav, enhancedFile.wav) 12 | % 13 | % cleanFile.wav - clean input file in .wav format 14 | % enhancedFile - enhanced output file in .wav format 15 | % llr - computed likelihood ratio 16 | % 17 | % Note that the LLR measure is limited in the range [0, 2]. 18 | % 19 | % Example call: llr =comp_llr('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % 24 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 25 | % Objective Measures of Speech Quality. Prentice Hall 26 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 27 | % ISBN: 0-13-629056-6. 28 | % 29 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 30 | % Modified by: Philipos C. Loizou (Oct 2006) - limited LLR to be in [0,2] 31 | % 32 | % Copyright (c) 2006 by Philipos C. Loizou 33 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 34 | % ---------------------------------------------------------------------- 35 | 36 | if nargin~=2 37 | fprintf('USAGE: LLR=comp_llr(cleanFile.wav, enhancedFile.wav)\n'); 38 | fprintf('For more help, type: help comp_llr\n\n'); 39 | return; 40 | end 41 | 42 | alpha=0.95; 43 | [data1, Srate1, Nbits1]= wavread(cleanFile); 44 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 45 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 46 | error( 'The two files do not match!\n'); 47 | end 48 | 49 | len= min( length( data1), length( data2)); 50 | data1= data1( 1: len)+eps; 51 | data2= data2( 1: len)+eps; 52 | 53 | IS_dist= llr( data1, data2,Srate1); 54 | 55 | IS_len= round( length( IS_dist)* alpha); 56 | IS= sort( IS_dist); 57 | 58 | llr_mean= mean( IS( 1: IS_len)); 59 | 60 | 61 | 62 | function distortion = llr(clean_speech, processed_speech,sample_rate) 63 | 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Global Variables 79 | % ---------------------------------------------------------------------- 80 | 81 | winlength = round(30*sample_rate/1000); %240; % window length in samples 82 | skiprate = floor(winlength/4); % window skip in samples 83 | if sample_rate<10000 84 | P = 10; % LPC Analysis Order 85 | else 86 | P=16; % this could vary depending on sampling frequency. 87 | end 88 | % ---------------------------------------------------------------------- 89 | % For each frame of input speech, calculate the Log Likelihood Ratio 90 | % ---------------------------------------------------------------------- 91 | 92 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 93 | start = 1; % starting sample 94 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 95 | 96 | for frame_count = 1:num_frames 97 | 98 | % ---------------------------------------------------------- 99 | % (1) Get the Frames for the test and reference speech. 100 | % Multiply by Hanning Window. 101 | % ---------------------------------------------------------- 102 | 103 | clean_frame = clean_speech(start:start+winlength-1); 104 | processed_frame = processed_speech(start:start+winlength-1); 105 | clean_frame = clean_frame.*window; 106 | processed_frame = processed_frame.*window; 107 | 108 | % ---------------------------------------------------------- 109 | % (2) Get the autocorrelation lags and LPC parameters used 110 | % to compute the LLR measure. 111 | % ---------------------------------------------------------- 112 | 113 | [R_clean, Ref_clean, A_clean] = ... 
114 | lpcoeff(clean_frame, P); 115 | [R_processed, Ref_processed, A_processed] = ... 116 | lpcoeff(processed_frame, P); 117 | 118 | % ---------------------------------------------------------- 119 | % (3) Compute the LLR measure 120 | % ---------------------------------------------------------- 121 | 122 | numerator = A_processed*toeplitz(R_clean)*A_processed'; 123 | denominator = A_clean*toeplitz(R_clean)*A_clean'; 124 | distortion(frame_count) = min(2,log(numerator/denominator)); 125 | start = start + skiprate; 126 | 127 | end 128 | 129 | 130 | function [acorr, refcoeff, lpparams] = lpcoeff(speech_frame, model_order) 131 | 132 | % ---------------------------------------------------------- 133 | % (1) Compute Autocorrelation Lags 134 | % ---------------------------------------------------------- 135 | 136 | winlength = max(size(speech_frame)); 137 | for k=1:model_order+1 138 | R(k) = sum(speech_frame(1:winlength-k+1) ... 139 | .*speech_frame(k:winlength)); 140 | end 141 | 142 | % ---------------------------------------------------------- 143 | % (2) Levinson-Durbin 144 | % ---------------------------------------------------------- 145 | 146 | a = ones(1,model_order); 147 | E(1)=R(1); 148 | for i=1:model_order 149 | a_past(1:i-1) = a(1:i-1); 150 | sum_term = sum(a_past(1:i-1).*R(i:-1:2)); 151 | rcoeff(i)=(R(i+1) - sum_term) / E(i); 152 | a(i)=rcoeff(i); 153 | a(1:i-1) = a_past(1:i-1) - rcoeff(i).*a_past(i-1:-1:1); 154 | E(i+1)=(1-rcoeff(i)*rcoeff(i))*E(i); 155 | end 156 | 157 | acorr = R; 158 | refcoeff = rcoeff; 159 | lpparams = [1 -a]; 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /PESQ/comp_snr.asv: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= compSNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % 6 | % This function implements the segmental signal-to-noise ratio 7 | % 
defined on page 45 of [1] (see Equation 2.12). 8 | % 9 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 10 | % 11 | % cleanFile.wav - clean input file in .wav format 12 | % enhancedFile - enhanced output file in .wav format 13 | % SNRovl - overall SNR (dB) 14 | % SNRseg - segmental SNR (dB) 15 | % 16 | % This function returns 2 parameters. The first item is the 17 | % overall SNR for the two speech signals. The second value 18 | % is the segmental signal-to-noise ratio (1 seg-snr per 19 | % frame of input). The segmental SNR is clamped to range 20 | % between 35dB and -10dB see suggestions in [2]. 21 | % 22 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04_babble_sn10.wav','out_log.wav'); 23 | % 24 | % References: 25 | % 26 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 27 | % Objective Measures of Speech Quality. Prentice Hall 28 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 29 | % ISBN: 0-13-629056-6. 30 | % 31 | % [2] P. E. Papamichalis, Practical Approaches to Speech 32 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 33 | % ISBN: 0-13-689019-9. (see pages 179-181). 34 | % 35 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 36 | % Modified by: Philipos C. Loizou (Oct 2006) 37 | % 38 | % Copyright (c) 2006 by Philipos C. 
Loizou 39 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 40 | %------------------------------------------------------------------------- 41 | 42 | 43 | 44 | [data1, Srate1, Nbits1]= wavread(cleanFile); 45 | [data2, Srate2, Nbits2]= wavread(enhdFile); 46 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2)) 47 | error( 'The two files do not match!\n'); 48 | end 49 | 50 | len= min( length( data1), length( data2)); 51 | data1= data1( 1: len); 52 | data2= data2( 1: len); 53 | 54 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 55 | 56 | snr_mean= snr_dist; 57 | segsnr_mean= mean( segsnr_dist); 58 | 59 | 60 | 61 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 62 | 63 | % ---------------------------------------------------------------------- 64 | % Check the length of the clean and processed speech. Must be the same. 65 | % ---------------------------------------------------------------------- 66 | 67 | clean_length = length(clean_speech); 68 | processed_length = length(processed_speech); 69 | 70 | if (clean_length ~= processed_length) 71 | disp('Error: Both Speech Files must be same length.'); 72 | return 73 | end 74 | 75 | % ---------------------------------------------------------------------- 76 | % Scale both clean speech and processed speech to have same dynamic 77 | % range. 
Also remove DC component from each signal 78 | % ---------------------------------------------------------------------- 79 | 80 | %clean_speech = clean_speech - mean(clean_speech); 81 | %processed_speech = processed_speech - mean(processed_speech); 82 | 83 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 84 | 85 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 86 | 87 | % ---------------------------------------------------------------------- 88 | % Global Variables 89 | % ---------------------------------------------------------------------- 90 | 91 | %sample_rate = 8000; % default sample rate 92 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 93 | skiprate = floor(winlength/4); %60; % window skip in samples 94 | MIN_SNR = -10; % minimum SNR in dB 95 | MAX_SNR = 35; % maximum SNR in dB 96 | 97 | % ---------------------------------------------------------------------- 98 | % For each frame of input speech, calculate the Segmental SNR 99 | % ---------------------------------------------------------------------- 100 | 101 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 102 | start = 1; % starting sample 103 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 104 | 105 | for frame_count = 1: num_frames 106 | 107 | % ---------------------------------------------------------- 108 | % (1) Get the Frames for the test and reference speech. 109 | % Multiply by Hanning Window. 
110 | % ---------------------------------------------------------- 111 | 112 | clean_frame = clean_speech(start:start+winlength-1); 113 | processed_frame = processed_speech(start:start+winlength-1); 114 | clean_frame = clean_frame.*window; 115 | processed_frame = processed_frame.*window; 116 | 117 | % ---------------------------------------------------------- 118 | % (2) Compute the Segmental SNR 119 | % ---------------------------------------------------------- 120 | 121 | signal_energy = sum(clean_frame.^2); 122 | noise_energy = sum((clean_frame-processed_frame).^2); 123 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 124 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 125 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 126 | 127 | start = start + skiprate; 128 | 129 | end 130 | 131 | -------------------------------------------------------------------------------- /PESQ/comp_snr.m: -------------------------------------------------------------------------------- 1 | function [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile); 2 | % 3 | % Segmental Signal-to-Noise Ratio Objective Speech Quality Measure 4 | % 5 | % This function implements the segmental signal-to-noise ratio 6 | % as defined in [1, p. 45] (see Equation 2.12). 7 | % 8 | % Usage: [SNRovl, SNRseg]=comp_snr(cleanFile.wav, enhancedFile.wav) 9 | % 10 | % cleanFile.wav - clean input file in .wav format 11 | % enhancedFile - enhanced output file in .wav format 12 | % SNRovl - overall SNR (dB) 13 | % SNRseg - segmental SNR (dB) 14 | % 15 | % This function returns 2 parameters. The first item is the 16 | % overall SNR for the two speech signals. The second value 17 | % is the segmental signal-to-noise ratio (1 seg-snr per 18 | % frame of input). The segmental SNR is clamped to range 19 | % between 35dB and -10dB (see suggestions in [2]). 
20 | % 21 | % Example call: [SNRovl,SNRseg]=comp_SNR('sp04.wav','enhanced.wav') 22 | % 23 | % References: 24 | % 25 | % [1] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 26 | % Objective Measures of Speech Quality. Prentice Hall 27 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 28 | % ISBN: 0-13-629056-6. 29 | % 30 | % [2] P. E. Papamichalis, Practical Approaches to Speech 31 | % Coding, Prentice-Hall, Englewood Cliffs, NJ, 1987. 32 | % ISBN: 0-13-689019-9. (see pages 179-181). 33 | % 34 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 35 | % Modified by: Philipos C. Loizou (Oct 2006) 36 | % 37 | % Copyright (c) 2006 by Philipos C. Loizou 38 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 39 | %------------------------------------------------------------------------- 40 | 41 | if nargin ~=2 42 | fprintf('USAGE: [snr_mean, segsnr_mean]= comp_SNR(cleanFile, enhdFile) \n'); 43 | return; 44 | end 45 | 46 | [data1, Srate1, Nbits1]= wavread(cleanFile); 47 | [data2, Srate2, Nbits2]= wavread(enhdFile); 48 | if (( Srate1~= Srate2) | ( Nbits1~= Nbits2)) 49 | error( 'The two files do not match!\n'); 50 | end 51 | 52 | len= min( length( data1), length( data2)); 53 | data1= data1( 1: len); 54 | data2= data2( 1: len); 55 | 56 | [snr_dist, segsnr_dist]= snr( data1, data2,Srate1); 57 | 58 | snr_mean= snr_dist; 59 | segsnr_mean= mean( segsnr_dist); 60 | 61 | 62 | % ========================================================================= 63 | function [overall_snr, segmental_snr] = snr(clean_speech, processed_speech,sample_rate) 64 | 65 | % ---------------------------------------------------------------------- 66 | % Check the length of the clean and processed speech. Must be the same. 
67 | % ---------------------------------------------------------------------- 68 | 69 | clean_length = length(clean_speech); 70 | processed_length = length(processed_speech); 71 | 72 | if (clean_length ~= processed_length) 73 | disp('Error: Both Speech Files must be same length.'); 74 | return 75 | end 76 | 77 | % ---------------------------------------------------------------------- 78 | % Scale both clean speech and processed speech to have same dynamic 79 | % range. Also remove DC component from each signal 80 | % ---------------------------------------------------------------------- 81 | 82 | %clean_speech = clean_speech - mean(clean_speech); 83 | %processed_speech = processed_speech - mean(processed_speech); 84 | 85 | %processed_speech = processed_speech.*(max(abs(clean_speech))/ max(abs(processed_speech))); 86 | 87 | overall_snr = 10* log10( sum(clean_speech.^2)/sum((clean_speech-processed_speech).^2)); 88 | 89 | % ---------------------------------------------------------------------- 90 | % Global Variables 91 | % ---------------------------------------------------------------------- 92 | 93 | 94 | winlength = round(30*sample_rate/1000); %240; % window length in samples for 30-msecs 95 | skiprate = floor(winlength/4); %60; % window skip in samples 96 | MIN_SNR = -10; % minimum SNR in dB 97 | MAX_SNR = 35; % maximum SNR in dB 98 | 99 | % ---------------------------------------------------------------------- 100 | % For each frame of input speech, calculate the Segmental SNR 101 | % ---------------------------------------------------------------------- 102 | 103 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 104 | start = 1; % starting sample 105 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 106 | 107 | for frame_count = 1: num_frames 108 | 109 | % ---------------------------------------------------------- 110 | % (1) Get the Frames for the test and reference speech. 111 | % Multiply by Hanning Window. 
112 | % ---------------------------------------------------------- 113 | 114 | clean_frame = clean_speech(start:start+winlength-1); 115 | processed_frame = processed_speech(start:start+winlength-1); 116 | clean_frame = clean_frame.*window; 117 | processed_frame = processed_frame.*window; 118 | 119 | % ---------------------------------------------------------- 120 | % (2) Compute the Segmental SNR 121 | % ---------------------------------------------------------- 122 | 123 | signal_energy = sum(clean_frame.^2); 124 | noise_energy = sum((clean_frame-processed_frame).^2); 125 | segmental_snr(frame_count) = 10*log10(signal_energy/(noise_energy+eps)+eps); 126 | segmental_snr(frame_count) = max(segmental_snr(frame_count),MIN_SNR); 127 | segmental_snr(frame_count) = min(segmental_snr(frame_count),MAX_SNR); 128 | 129 | start = start + skiprate; 130 | 131 | end 132 | 133 | -------------------------------------------------------------------------------- /PESQ/comp_wss.asv: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). The intensities within each critical band are 11 | % measured. Then, a weighted distances between the measured 12 | % slopes of the log-critical band spectra are computed. 13 | % This measure is also described in Section 2.2.9 (pages 56-58) 14 | % of [2]. 
15 | % 16 | % Whereas Klatt's original measure used 36 critical-band 17 | % filters to estimate the smoothed short-time spectrum, this 18 | % implementation considers a bank of 25 filters spanning 19 | % the 4 kHz bandwidth. 20 | % 21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav) 22 | % 23 | % cleanFile.wav - clean input file in .wav format 24 | % enhancedFile - enhanced output file in .wav format 25 | % wss_dist - computed spectral slope distance 26 | % 27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav') 28 | % 29 | % References: 30 | % 31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance 32 | % from Critical-Band Spectra: A First Step", Proc. IEEE 33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982. 34 | % 35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 36 | % Objective Measures of Speech Quality. Prentice Hall 37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 38 | % ISBN: 0-13-629056-6. 39 | % 40 | % Authors: 41 | % 42 | % Bryan L. Pellom and John H. L. 
Hansen 43 | % Robust Speech Processing Laboratory, Duke University 44 | % Department of Electrical Engineeering 45 | % 46 | % Last Modified: 47 | % 48 | % July 22, 1998 49 | % 50 | % ---------------------------------------------------------------------- 51 | 52 | 53 | alpha= 0.95; 54 | 55 | [data1, Srate1, Nbits1]= wavread(cleanFile); 56 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 58 | error( 'The two files do not match!\n'); 59 | end 60 | 61 | len= min( length( data1), length( data2)); 62 | data1= data1( 1: len)+eps; 63 | data2= data2( 1: len)+eps; 64 | 65 | wss_dist_vec= wss( data1, data2,Srate1); 66 | wss_dist_vec= sort( wss_dist_vec); 67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 68 | 69 | 70 | 71 | function distortion = wss(clean_speech, processed_speech,sample_rate) 72 | 73 | 74 | % ---------------------------------------------------------------------- 75 | % Check the length of the clean and processed speech. Must be the same. 
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | %ample_rate = 8000; % default sample rate 93 | %winlength = 240; % window length in samples 94 | %skiprate = 60; % window skip in samples 95 | winlength = round(30*sample_rate/1000); % window length in samples 96 | skiprate = floor(winlength/4); % window skip in samples 97 | max_freq = sample_rate/2; % maximum bandwidth 98 | num_crit = 25; % number of critical bands 99 | 100 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 101 | %n_fft = 512; % FFT size 102 | n_fft = 2^nextpow2(2*winlength); 103 | n_fftby2 = n_fft/2; % FFT size/2 104 | Kmax = 20; % value suggested by Klatt, pg 1280 105 | Klocmax = 1; % value suggested by Klatt, pg 1280 106 | 107 | % ---------------------------------------------------------------------- 108 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 109 | % ---------------------------------------------------------------------- 110 | 111 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 112 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 113 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 114 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 115 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 116 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 117 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 118 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 119 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 120 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 121 | cent_freq(11) = 798.717; bandwidth(11) = 
105.411; 122 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 123 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 124 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 125 | cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 126 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 127 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 128 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 129 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 130 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 131 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 132 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 133 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 134 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 135 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 136 | 137 | bw_min = bandwidth (1); % minimum critical bandwidth 138 | 139 | % ---------------------------------------------------------------------- 140 | % Set up the critical band filters. Note here that Gaussianly shaped 141 | % filters are used. Also, the sum of the filter weights are equivalent 142 | % for each critical band filter. Filter less than -30 dB and set to 143 | % zero. 
144 | % ---------------------------------------------------------------------- 145 | 146 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 147 | 148 | for i = 1:num_crit 149 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 150 | all_f0(i) = floor(f0); 151 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 152 | norm_factor = log(bw_min) - log(bandwidth(i)); 153 | j = 0:1:n_fftby2-1; 154 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 155 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 156 | end 157 | 158 | % ---------------------------------------------------------------------- 159 | % For each frame of input speech, calculate the Weighted Spectral 160 | % Slope Measure 161 | % ---------------------------------------------------------------------- 162 | 163 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 164 | start = 1; % starting sample 165 | window = 0.5*(1 - cos(2*pi*(1:winlength)'/(winlength+1))); 166 | 167 | for frame_count = 1:num_frames 168 | 169 | % ---------------------------------------------------------- 170 | % (1) Get the Frames for the test and reference speech. 171 | % Multiply by Hanning Window. 
172 | % ---------------------------------------------------------- 173 | 174 | clean_frame = clean_speech(start:start+winlength-1); 175 | processed_frame = processed_speech(start:start+winlength-1); 176 | clean_frame = clean_frame.*window; 177 | processed_frame = processed_frame.*window; 178 | 179 | % ---------------------------------------------------------- 180 | % (2) Compute the Power Spectrum of Clean and Processed 181 | % ---------------------------------------------------------- 182 | 183 | if (USE_FFT_SPECTRUM) 184 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 185 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 186 | else 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(clean_frame,10); 189 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | 191 | a_vec = zeros(1,n_fft); 192 | a_vec(1:11) = lpc(processed_frame,10); 193 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 194 | end 195 | 196 | % ---------------------------------------------------------- 197 | % (3) Compute Filterbank Output Energies (in dB scale) 198 | % ---------------------------------------------------------- 199 | 200 | for i = 1:num_crit 201 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 202 | .*crit_filter(i,:)'); 203 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 204 | .*crit_filter(i,:)'); 205 | end 206 | clean_energy = 10*log10(max(clean_energy,1E-10)); 207 | processed_energy = 10*log10(max(processed_energy,1E-10)); 208 | 209 | % ---------------------------------------------------------- 210 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 211 | % ---------------------------------------------------------- 212 | 213 | clean_slope = clean_energy(2:num_crit) - ... 214 | clean_energy(1:num_crit-1); 215 | processed_slope = processed_energy(2:num_crit) - ... 216 | processed_energy(1:num_crit-1); 217 | 218 | % ---------------------------------------------------------- 219 | % (5) Find the nearest peak locations in the spectra to 220 | % each critical band. 
If the slope is negative, we 221 | % search to the left. If positive, we search to the 222 | % right. 223 | % ---------------------------------------------------------- 224 | 225 | for i = 1:num_crit-1 226 | 227 | % find the peaks in the clean speech signal 228 | 229 | if (clean_slope(i)>0) % search to the right 230 | n = i; 231 | while ((n<num_crit) & (clean_slope(n)>0)) 232 | n = n+1; 233 | end 234 | clean_loc_peak(i) = clean_energy(n-1); 235 | else % search to the left 236 | n = i; 237 | while ((n>0) & (clean_slope(n) <= 0)) 238 | n = n-1; 239 | end 240 | clean_loc_peak(i) = clean_energy(n+1); 241 | end 242 | 243 | % find the peaks in the processed speech signal 244 | 245 | if (processed_slope(i)>0) % search to the right 246 | n = i; 247 | while ((n<num_crit) & (processed_slope(n)>0)) 248 | n = n+1; 249 | end 250 | processed_loc_peak(i) = processed_energy(n-1); 251 | else % search to the left 252 | n = i; 253 | while ((n>0) & (processed_slope(n) <= 0)) 254 | n = n-1; 255 | end 256 | processed_loc_peak(i) = processed_energy(n+1); 257 | end 258 | 259 | end 260 | 261 | % ---------------------------------------------------------- 262 | % (6) Compute the WSS Measure for this frame. This 263 | % includes determination of the weighting function. 264 | % ---------------------------------------------------------- 265 | 266 | dBMax_clean = max(clean_energy); 267 | dBMax_processed = max(processed_energy); 268 | 269 | % The weights are calculated by averaging individual 270 | % weighting factors from the clean and processed frame. 271 | % These weights W_clean and W_processed should range 272 | % from 0 to 1 and place more emphasis on spectral 273 | % peaks and less emphasis on slope differences in spectral 274 | % valleys. This procedure is described on page 1280 of 275 | % Klatt's 1982 ICASSP paper. 276 | 277 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ... 278 | clean_energy(1:num_crit-1)); 279 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ...
280 | clean_energy(1:num_crit-1)); 281 | W_clean = Wmax_clean .* Wlocmax_clean; 282 | 283 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ... 284 | processed_energy(1:num_crit-1)); 285 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ... 286 | processed_energy(1:num_crit-1)); 287 | W_processed = Wmax_processed .* Wlocmax_processed; 288 | 289 | W = (W_clean + W_processed)./2.0; 290 | 291 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ... 292 | processed_slope(1:num_crit-1)).^2); 293 | 294 | % this normalization is not part of Klatt's paper, but helps 295 | % to normalize the measure. Here we scale the measure by the 296 | % sum of the weights. 297 | 298 | distortion(frame_count) = distortion(frame_count)/sum(W); 299 | 300 | start = start + skiprate; 301 | 302 | end 303 | 304 | -------------------------------------------------------------------------------- /PESQ/comp_wss.m: -------------------------------------------------------------------------------- 1 | function wss_dist= comp_wss(cleanFile, enhancedFile); 2 | % ---------------------------------------------------------------------- 3 | % 4 | % Weighted Spectral Slope (WSS) Objective Speech Quality Measure 5 | % 6 | % This function implements the Weighted Spectral Slope (WSS) 7 | % distance measure originally proposed in [1]. The algorithm 8 | % works by first decomposing the speech signal into a set of 9 | % frequency bands (this is done for both the test and reference 10 | % frame). The intensities within each critical band are 11 | % measured. Then, a weighted distances between the measured 12 | % slopes of the log-critical band spectra are computed. 13 | % This measure is also described in Section 2.2.9 (pages 56-58) 14 | % of [2]. 15 | % 16 | % Whereas Klatt's original measure used 36 critical-band 17 | % filters to estimate the smoothed short-time spectrum, this 18 | % implementation considers a bank of 25 filters spanning 19 | % the 4 kHz bandwidth. 
20 | % 21 | % Usage: wss_dist=comp_wss(cleanFile.wav, enhancedFile.wav) 22 | % 23 | % cleanFile.wav - clean input file in .wav format 24 | % enhancedFile - enhanced output file in .wav format 25 | % wss_dist - computed spectral slope distance 26 | % 27 | % Example call: ws =comp_wss('sp04.wav','enhanced.wav') 28 | % 29 | % References: 30 | % 31 | % [1] D. H. Klatt, "Prediction of Perceived Phonetic Distance 32 | % from Critical-Band Spectra: A First Step", Proc. IEEE 33 | % ICASSP'82, Volume 2, pp. 1278-1281, May, 1982. 34 | % 35 | % [2] S. R. Quackenbush, T. P. Barnwell, and M. A. Clements, 36 | % Objective Measures of Speech Quality. Prentice Hall 37 | % Advanced Reference Series, Englewood Cliffs, NJ, 1988, 38 | % ISBN: 0-13-629056-6. 39 | % 40 | % Authors: Bryan L. Pellom and John H. L. Hansen (July 1998) 41 | % Modified by: Philipos C. Loizou (Oct 2006) 42 | % 43 | % Copyright (c) 2006 by Philipos C. Loizou 44 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 45 | % 46 | % ---------------------------------------------------------------------- 47 | if nargin~=2 48 | fprintf('USAGE: WSS=comp_wss(cleanFile.wav, enhancedFile.wav)\n'); 49 | fprintf('For more help, type: help comp_wss\n\n'); 50 | return; 51 | end 52 | 53 | alpha= 0.95; 54 | 55 | [data1, Srate1, Nbits1]= wavread(cleanFile); 56 | [data2, Srate2, Nbits2]= wavread(enhancedFile); 57 | if ( Srate1~= Srate2) | ( Nbits1~= Nbits2) 58 | error( 'The two files do not match!\n'); 59 | end 60 | 61 | len= min( length( data1), length( data2)); 62 | data1= data1( 1: len)+eps; 63 | data2= data2( 1: len)+eps; 64 | 65 | wss_dist_vec= wss( data1, data2,Srate1); 66 | wss_dist_vec= sort( wss_dist_vec); 67 | wss_dist= mean( wss_dist_vec( 1: round( length( wss_dist_vec)*alpha))); 68 | 69 | 70 | 71 | function distortion = wss(clean_speech, processed_speech,sample_rate) 72 | 73 | 74 | % ---------------------------------------------------------------------- 75 | % Check the length of the clean and processed speech. Must be the same. 
76 | % ---------------------------------------------------------------------- 77 | 78 | clean_length = length(clean_speech); 79 | processed_length = length(processed_speech); 80 | 81 | if (clean_length ~= processed_length) 82 | disp('Error: Files musthave same length.'); 83 | return 84 | end 85 | 86 | 87 | 88 | % ---------------------------------------------------------------------- 89 | % Global Variables 90 | % ---------------------------------------------------------------------- 91 | 92 | winlength = round(30*sample_rate/1000); % window length in samples 93 | skiprate = floor(winlength/4); % window skip in samples 94 | max_freq = sample_rate/2; % maximum bandwidth 95 | num_crit = 25; % number of critical bands 96 | 97 | USE_FFT_SPECTRUM = 1; % defaults to 10th order LP spectrum 98 | n_fft = 2^nextpow2(2*winlength); 99 | n_fftby2 = n_fft/2; % FFT size/2 100 | Kmax = 20; % value suggested by Klatt, pg 1280 101 | Klocmax = 1; % value suggested by Klatt, pg 1280 102 | 103 | % ---------------------------------------------------------------------- 104 | % Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz) 105 | % ---------------------------------------------------------------------- 106 | 107 | cent_freq(1) = 50.0000; bandwidth(1) = 70.0000; 108 | cent_freq(2) = 120.000; bandwidth(2) = 70.0000; 109 | cent_freq(3) = 190.000; bandwidth(3) = 70.0000; 110 | cent_freq(4) = 260.000; bandwidth(4) = 70.0000; 111 | cent_freq(5) = 330.000; bandwidth(5) = 70.0000; 112 | cent_freq(6) = 400.000; bandwidth(6) = 70.0000; 113 | cent_freq(7) = 470.000; bandwidth(7) = 70.0000; 114 | cent_freq(8) = 540.000; bandwidth(8) = 77.3724; 115 | cent_freq(9) = 617.372; bandwidth(9) = 86.0056; 116 | cent_freq(10) = 703.378; bandwidth(10) = 95.3398; 117 | cent_freq(11) = 798.717; bandwidth(11) = 105.411; 118 | cent_freq(12) = 904.128; bandwidth(12) = 116.256; 119 | cent_freq(13) = 1020.38; bandwidth(13) = 127.914; 120 | cent_freq(14) = 1148.30; bandwidth(14) = 140.423; 121 | 
cent_freq(15) = 1288.72; bandwidth(15) = 153.823; 122 | cent_freq(16) = 1442.54; bandwidth(16) = 168.154; 123 | cent_freq(17) = 1610.70; bandwidth(17) = 183.457; 124 | cent_freq(18) = 1794.16; bandwidth(18) = 199.776; 125 | cent_freq(19) = 1993.93; bandwidth(19) = 217.153; 126 | cent_freq(20) = 2211.08; bandwidth(20) = 235.631; 127 | cent_freq(21) = 2446.71; bandwidth(21) = 255.255; 128 | cent_freq(22) = 2701.97; bandwidth(22) = 276.072; 129 | cent_freq(23) = 2978.04; bandwidth(23) = 298.126; 130 | cent_freq(24) = 3276.17; bandwidth(24) = 321.465; 131 | cent_freq(25) = 3597.63; bandwidth(25) = 346.136; 132 | 133 | bw_min = bandwidth (1); % minimum critical bandwidth 134 | 135 | % ---------------------------------------------------------------------- 136 | % Set up the critical band filters. Note here that Gaussianly shaped 137 | % filters are used. Also, the sum of the filter weights are equivalent 138 | % for each critical band filter. Filter less than -30 dB and set to 139 | % zero. 140 | % ---------------------------------------------------------------------- 141 | 142 | min_factor = exp (-30.0 / (2.0 * 2.303)); % -30 dB point of filter 143 | 144 | for i = 1:num_crit 145 | f0 = (cent_freq (i) / max_freq) * (n_fftby2); 146 | all_f0(i) = floor(f0); 147 | bw = (bandwidth (i) / max_freq) * (n_fftby2); 148 | norm_factor = log(bw_min) - log(bandwidth(i)); 149 | j = 0:1:n_fftby2-1; 150 | crit_filter(i,:) = exp (-11 *(((j - floor(f0)) ./bw).^2) + norm_factor); 151 | crit_filter(i,:) = crit_filter(i,:).*(crit_filter(i,:) > min_factor); 152 | end 153 | 154 | % ---------------------------------------------------------------------- 155 | % For each frame of input speech, calculate the Weighted Spectral 156 | % Slope Measure 157 | % ---------------------------------------------------------------------- 158 | 159 | num_frames = clean_length/skiprate-(winlength/skiprate); % number of frames 160 | start = 1; % starting sample 161 | window = 0.5*(1 - 
cos(2*pi*(1:winlength)'/(winlength+1))); 162 | 163 | for frame_count = 1:num_frames 164 | 165 | % ---------------------------------------------------------- 166 | % (1) Get the Frames for the test and reference speech. 167 | % Multiply by Hanning Window. 168 | % ---------------------------------------------------------- 169 | 170 | clean_frame = clean_speech(start:start+winlength-1); 171 | processed_frame = processed_speech(start:start+winlength-1); 172 | clean_frame = clean_frame.*window; 173 | processed_frame = processed_frame.*window; 174 | 175 | % ---------------------------------------------------------- 176 | % (2) Compute the Power Spectrum of Clean and Processed 177 | % ---------------------------------------------------------- 178 | 179 | if (USE_FFT_SPECTRUM) 180 | clean_spec = (abs(fft(clean_frame,n_fft)).^2); 181 | processed_spec = (abs(fft(processed_frame,n_fft)).^2); 182 | else 183 | a_vec = zeros(1,n_fft); 184 | a_vec(1:11) = lpc(clean_frame,10); 185 | clean_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 186 | 187 | a_vec = zeros(1,n_fft); 188 | a_vec(1:11) = lpc(processed_frame,10); 189 | processed_spec = 1.0/(abs(fft(a_vec,n_fft)).^2)'; 190 | end 191 | 192 | % ---------------------------------------------------------- 193 | % (3) Compute Filterbank Output Energies (in dB scale) 194 | % ---------------------------------------------------------- 195 | 196 | for i = 1:num_crit 197 | clean_energy(i) = sum(clean_spec(1:n_fftby2) ... 198 | .*crit_filter(i,:)'); 199 | processed_energy(i) = sum(processed_spec(1:n_fftby2) ... 200 | .*crit_filter(i,:)'); 201 | end 202 | clean_energy = 10*log10(max(clean_energy,1E-10)); 203 | processed_energy = 10*log10(max(processed_energy,1E-10)); 204 | 205 | % ---------------------------------------------------------- 206 | % (4) Compute Spectral Slope (dB[i+1]-dB[i]) 207 | % ---------------------------------------------------------- 208 | 209 | clean_slope = clean_energy(2:num_crit) - ... 
210 | clean_energy(1:num_crit-1); 211 | processed_slope = processed_energy(2:num_crit) - ... 212 | processed_energy(1:num_crit-1); 213 | 214 | % ---------------------------------------------------------- 215 | % (5) Find the nearest peak locations in the spectra to 216 | % each critical band. If the slope is negative, we 217 | % search to the left. If positive, we search to the 218 | % right. 219 | % ---------------------------------------------------------- 220 | 221 | for i = 1:num_crit-1 222 | 223 | % find the peaks in the clean speech signal 224 | 225 | if (clean_slope(i)>0) % search to the right 226 | n = i; 227 | while ((n<num_crit) & (clean_slope(n)>0)) 228 | n = n+1; 229 | end 230 | clean_loc_peak(i) = clean_energy(n-1); 231 | else % search to the left 232 | n = i; 233 | while ((n>0) & (clean_slope(n) <= 0)) 234 | n = n-1; 235 | end 236 | clean_loc_peak(i) = clean_energy(n+1); 237 | end 238 | 239 | % find the peaks in the processed speech signal 240 | 241 | if (processed_slope(i)>0) % search to the right 242 | n = i; 243 | while ((n<num_crit) & (processed_slope(n)>0)) 244 | n = n+1; 245 | end 246 | processed_loc_peak(i) = processed_energy(n-1); 247 | else % search to the left 248 | n = i; 249 | while ((n>0) & (processed_slope(n) <= 0)) 250 | n = n-1; 251 | end 252 | processed_loc_peak(i) = processed_energy(n+1); 253 | end 254 | 255 | end 256 | 257 | % ---------------------------------------------------------- 258 | % (6) Compute the WSS Measure for this frame. This 259 | % includes determination of the weighting function. 260 | % ---------------------------------------------------------- 261 | 262 | dBMax_clean = max(clean_energy); 263 | dBMax_processed = max(processed_energy); 264 | 265 | % The weights are calculated by averaging individual 266 | % weighting factors from the clean and processed frame. 267 | % These weights W_clean and W_processed should range 268 | % from 0 to 1 and place more emphasis on spectral 269 | % peaks and less emphasis on slope differences in spectral 270 | % valleys.
This procedure is described on page 1280 of 271 | % Klatt's 1982 ICASSP paper. 272 | 273 | Wmax_clean = Kmax ./ (Kmax + dBMax_clean - ... 274 | clean_energy(1:num_crit-1)); 275 | Wlocmax_clean = Klocmax ./ ( Klocmax + clean_loc_peak - ... 276 | clean_energy(1:num_crit-1)); 277 | W_clean = Wmax_clean .* Wlocmax_clean; 278 | 279 | Wmax_processed = Kmax ./ (Kmax + dBMax_processed - ... 280 | processed_energy(1:num_crit-1)); 281 | Wlocmax_processed = Klocmax ./ ( Klocmax + processed_loc_peak - ... 282 | processed_energy(1:num_crit-1)); 283 | W_processed = Wmax_processed .* Wlocmax_processed; 284 | 285 | W = (W_clean + W_processed)./2.0; 286 | 287 | distortion(frame_count) = sum(W.*(clean_slope(1:num_crit-1) - ... 288 | processed_slope(1:num_crit-1)).^2); 289 | 290 | % this normalization is not part of Klatt's paper, but helps 291 | % to normalize the measure. Here we scale the measure by the 292 | % sum of the weights. 293 | 294 | distortion(frame_count) = distortion(frame_count)/sum(W); 295 | 296 | start = start + skiprate; 297 | 298 | end 299 | 300 | -------------------------------------------------------------------------------- /PESQ/crude_align.m: -------------------------------------------------------------------------------- 1 | function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ... 
2 | deg_Nsamples, Utt_id) 3 | 4 | global Downsample 5 | global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst 6 | global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst 7 | global Utt_Delay Utt_DelayConf Utt_Start Utt_End 8 | global MAXNUTTERANCES WHOLE_SIGNAL 9 | global pesq_mos subj_mos cond_nr 10 | 11 | if (Utt_id== WHOLE_SIGNAL ) 12 | nr = floor( ref_Nsamples/ Downsample); 13 | nd = floor( deg_Nsamples/ Downsample); 14 | startr= 1; 15 | startd= 1; 16 | elseif Utt_id== MAXNUTTERANCES 17 | startr= UttSearch_Start(MAXNUTTERANCES); 18 | startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample; 19 | if ( startd< 0 ) 20 | startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample; 21 | startd= 1; 22 | end 23 | 24 | nr= UttSearch_End(MAXNUTTERANCES)- startr; 25 | nd= nr; 26 | 27 | if( startd+ nd> floor( deg_Nsamples/ Downsample) ) 28 | nd= floor( deg_Nsamples/ Downsample)- startd; 29 | end 30 | % fprintf( 'nr,nd is %d,%d\n', nr, nd); 31 | 32 | else 33 | startr= UttSearch_Start(Utt_id); 34 | startd= startr+ Crude_DelayEst/ Downsample; 35 | 36 | if ( startd< 0 ) 37 | startr= 1- Crude_DelayEst/ Downsample; 38 | startd= 1; 39 | end 40 | 41 | nr= UttSearch_End(Utt_id)- startr; 42 | nd = nr; 43 | if( startd+ nd> floor( deg_Nsamples/ Downsample)+ 1) 44 | nd = floor( deg_Nsamples/ Downsample)- startd+ 1; 45 | end 46 | end 47 | 48 | max_Y= 0.0; 49 | I_max_Y= nr; 50 | if( (nr> 1) && (nd> 1) ) 51 | Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd); 52 | [max_Y, I_max_Y]= max( Y); 53 | if (max_Y<= 0) 54 | max_Y= 0; 55 | I_max_Y= nr; 56 | end 57 | end 58 | 59 | % fprintf( 'max_Y, I_max_Y is %f, %d\n', max_Y, I_max_Y); 60 | 61 | if( Utt_id== WHOLE_SIGNAL ) 62 | Crude_DelayEst= (I_max_Y- nr)* Downsample; 63 | Crude_DelayConf= 0.0; 64 | % fprintf( 1, 'I_max_Y, nr, Crude_DelayEst is %f, %f, %f\n', ... 65 | % I_max_Y, nr, Crude_DelayEst); 66 | elseif( Utt_id == MAXNUTTERANCES ) 67 | Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ... 
68 | Utt_DelayEst(MAXNUTTERANCES); 69 | % fprintf( 'startr, startd, nr, nd, I_max, Utt_Delay[%d] is %d, %d, %d, %d, %d, %d\n', ... 70 | % MAXNUTTERANCES, startr, startd, nr, nd, ... 71 | % I_max_Y, Utt_Delay(MAXNUTTERANCES) ); 72 | else 73 | % fprintf( 'I_max_Y, nr is %d, %d\n', I_max_Y, nr); 74 | Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ... 75 | Crude_DelayEst; 76 | end 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /PESQ/enhanced.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/PESQ/enhanced.wav -------------------------------------------------------------------------------- /PESQ/fix_power_level.asv: -------------------------------------------------------------------------------- 1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples) 2 | % this function is used for level normalization, i.e., to fix the power 3 | % level of data to a preset number, and return it to mod_data. 4 | 5 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs 6 | global TARGET_AVG_POWER 7 | TARGET_AVG_POWER= 1e7; 8 | 9 | %Este filtro no coincide con el que propone el estandar (se ve mejor en 10 | %tiempo de ejecucion, por cierto). 11 | align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500; 12 | 250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0; 13 | 800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0; 14 | 3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500]; 15 | 16 | align_filtered= apply_filter( data, data_Nsamples, align_filter_dB); 17 | power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ... 18 | data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ... 
19 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 20 | 21 | global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz); 22 | % fprintf( 1, '\tglobal_scale is %f\n', global_scale); 23 | mod_data= data* global_scale; 24 | -------------------------------------------------------------------------------- /PESQ/fix_power_level.m: -------------------------------------------------------------------------------- 1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples) 2 | % this function is used for level normalization, i.e., to fix the power 3 | % level of data to a preset number, and return it to mod_data. 4 | 5 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs 6 | global TARGET_AVG_POWER 7 | TARGET_AVG_POWER= 1e7; 8 | 9 | %Este filtro no coincide con el que propone el estandar (se ve mejor en 10 | %tiempo de ejecucion, por cierto). 11 | align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500; 12 | 250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0; 13 | 800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0; 14 | 3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500]; 15 | 16 | align_filtered= apply_filter( data, data_Nsamples, align_filter_dB); 17 | %Calcula la potencia (energia por muestra) de la segnal filtrada 18 | power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ... 19 | data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ... 
20 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 21 | 22 | %Calcula la ganancia necesaria para que la senal tenga una potencia (en la 23 | %banda de interes) igual a la indicada por el estandar 24 | global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz); 25 | % fprintf( 1, '\tglobal_scale is %f\n', global_scale); 26 | mod_data= data* global_scale; 27 | -------------------------------------------------------------------------------- /PESQ/id_searchwindows.m: -------------------------------------------------------------------------------- 1 | function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples); 2 | 3 | global MINUTTLENGTH Downsample MINUTTLENGTH SEARCHBUFFER 4 | global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End 5 | 6 | Utt_num = 1; 7 | speech_flag = 0; 8 | 9 | VAD_length= floor( ref_Nsamples/ Downsample); 10 | del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample; 11 | del_deg_end= floor((deg_Nsamples- Crude_DelayEst)/ Downsample)-... 12 | MINUTTLENGTH; 13 | 14 | for count= 1: VAD_length 15 | VAD_value= ref_VAD(count); 16 | if( (VAD_value> 0) && (speech_flag== 0) ) 17 | speech_flag= 1; 18 | this_start= count; 19 | UttSearch_Start(Utt_num)= count- SEARCHBUFFER; 20 | if( UttSearch_Start(Utt_num)< 0 ) 21 | UttSearch_Start(Utt_num)= 0; 22 | end 23 | end 24 | 25 | if( ((VAD_value== 0) || (count == (VAD_length-1))) && ... 26 | (speech_flag == 1) ) 27 | speech_flag = 0; 28 | UttSearch_End(Utt_num) = count + SEARCHBUFFER; 29 | if( UttSearch_End(Utt_num) > VAD_length - 1 ) 30 | UttSearch_End(Utt_num) = VAD_length -1; 31 | end 32 | 33 | if( ((count - this_start) >= MINUTTLENGTH) &&... 34 | (this_start < del_deg_end) &&... 
35 | (count > del_deg_start) ) 36 | Utt_num= Utt_num + 1; 37 | end 38 | end 39 | end 40 | Utt_num= Utt_num- 1; 41 | Nutterances = Utt_num; 42 | 43 | % fprintf( 1, 'Nutterances is %d\n', Nutterances); 44 | 45 | % fid= fopen( 'mat_utt.txt', 'wt'); 46 | % fprintf( fid, '%d\n', UttSearch_Start( 1: Nutterances)); 47 | % fprintf( fid, '\n'); 48 | % fprintf( fid, '%d\n', UttSearch_End( 1: Nutterances)); 49 | % fclose(fid); 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /PESQ/id_utterances.m: -------------------------------------------------------------------------------- 1 | function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples) 2 | 3 | global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst 4 | global Downsample SEARCHBUFFER Nutterances Utt_Start 5 | global Utt_End Utt_Delay 6 | 7 | Utt_num = 1; 8 | speech_flag = 0; 9 | VAD_length = floor( ref_Nsamples / Downsample); 10 | % fprintf( 1, 'VAD_length is %d\n', VAD_length); 11 | 12 | del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample; 13 | del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ... 14 | - MINUTTLENGTH; 15 | 16 | for count = 1: VAD_length 17 | VAD_value = ref_VAD(count); 18 | if( (VAD_value > 0.0) && (speech_flag == 0) ) 19 | speech_flag = 1; 20 | this_start = count; 21 | Utt_Start (Utt_num) = count; 22 | end 23 | 24 | if( ((VAD_value == 0) || (count == VAD_length)) && ... 25 | (speech_flag == 1) ) 26 | speech_flag = 0; 27 | Utt_End (Utt_num) = count; 28 | 29 | if( ((count - this_start) >= MINUTTLENGTH) && ... 30 | (this_start < del_deg_end) && ... 
function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
% ID_UTTERANCES  Fix the final utterance boundaries after time alignment.
%
% Re-scans the reference VAD track to find utterance start/end frames, then
% adjusts the boundaries so that (a) the first/last utterances stay inside
% the search buffers, and (b) consecutive utterances never overlap once each
% one's own delay (Utt_Delay) is applied.  Communicates entirely through
% globals: reads Nutterances and Utt_Delay, writes Utt_Start, Utt_End and
% Largest_uttsize.
%
% Inputs:
%   ref_Nsamples - number of reference samples
%   ref_VAD      - per-frame VAD values of the reference signal
%   deg_Nsamples - number of degraded samples

global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst
global Downsample SEARCHBUFFER Nutterances Utt_Start
global Utt_End Utt_Delay

Utt_num = 1;
speech_flag = 0;
VAD_length = floor( ref_Nsamples / Downsample);

% Usable frame range of the degraded signal under the crude delay estimate.
del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
    - MINUTTLENGTH;

% Pass 1: detect speech onsets/offsets exactly as in id_searchwindows, but
% without the SEARCHBUFFER padding.
for count = 1: VAD_length
    VAD_value = ref_VAD(count);
    if( (VAD_value > 0.0) && (speech_flag == 0) )
        speech_flag = 1;
        this_start = count;
        Utt_Start (Utt_num) = count;
    end

    if( ((VAD_value == 0) || (count == VAD_length)) && ...
            (speech_flag == 1) )
        speech_flag = 0;
        Utt_End (Utt_num) = count;

        if( ((count - this_start) >= MINUTTLENGTH) && ...
                (this_start < del_deg_end) && ...
                (count > del_deg_start) )
            Utt_num = Utt_num + 1;
        end
    end
end

% Pin the overall span to just inside the leading/trailing search buffers.
Utt_Start(1) = SEARCHBUFFER+ 1;
Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;

% Pass 2: make consecutive utterances share a single boundary frame at the
% midpoint of the gap between them.
for Utt_num = 2: Nutterances
    this_start = Utt_Start(Utt_num)- 1;
    last_end = Utt_End(Utt_num - 1)- 1;
    count = floor( (this_start + last_end) / 2);
    Utt_Start(Utt_num) = count+ 1;
    Utt_End(Utt_num - 1) = count+ 1;
end

% Pass 3: once each utterance's own delay is applied to the degraded signal,
% keep the first utterance from starting before the search buffer ...
this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
if( this_start < (SEARCHBUFFER * Downsample) )
    count = SEARCHBUFFER + floor( ...
        (Downsample - 1 - Utt_Delay(1)) / Downsample);
    Utt_Start(1) = count+ 1;
end

% ... and the last utterance from running past the trailing buffer.
last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
    Utt_Delay(Nutterances);
if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
    count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
        - SEARCHBUFFER;
    Utt_End(Nutterances) = count+ 1;
end

% Pass 4: if per-utterance delays make neighbours overlap in the degraded
% signal, split the overlap at its midpoint.
for Utt_num = 2: Nutterances
    this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
    last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
    if( this_start < last_end )
        count = floor( (this_start + last_end) / 2);
        this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
            / Downsample);
        last_end = floor( (count - Utt_Delay(Utt_num - 1))...
            / Downsample);
        Utt_Start(Utt_num) = this_start+ 1;
        Utt_End(Utt_num- 1) = last_end+ 1;
    end
end

% Longest utterance, in VAD frames (used to size buffers downstream).
Largest_uttsize= max( Utt_End- Utt_Start);
function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples)
% INPUT_FILTER  Condition both signals before the VAD/alignment stage.
%
% Applies the same two-step conditioning to the reference and the degraded
% signal: DC removal (DC_block) followed by the standard input filtering
% (apply_filters).  Returns the two conditioned signals in the same order
% as the inputs.

% Each signal is processed independently; the other signal's length plays
% no role in its conditioning.
mod_ref_data= apply_filters( DC_block( ref_data, ref_Nsamples), ref_Nsamples);
mod_deg_data= apply_filters( DC_block( deg_data, deg_Nsamples), deg_Nsamples);
Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ---------------------------------------------------------------------- 34 | 35 | %Referencias frente a trasteo ;D 36 | % pesq('sp04.wav','sp04_babble_sn10.wav') 37 | % 2.4634 38 | % pesq('sp04.wav','enhanced.wav') 39 | % 2.5658 40 | 41 | if nargin<2 42 | fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n'); 43 | return; 44 | end; 45 | 46 | %Establecemos las siguientes variables globales 47 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL 48 | global Align_Nfft Window 49 | 50 | %Leemos el WAV de REFERENCIA (y obtenemos la frec. de muestreo) 51 | [ref_data,sampling_rate,nbits]= wavread( ref_wav); 52 | if sampling_rate~=8000 & sampling_rate~=16000 53 | error('Sampling frequency needs to be either 8000 or 16000 Hz'); 54 | end 55 | %Leemos el WAV de TEST (ignoramos los datos de fec. muestreo) 56 | deg_data= wavread( deg_wav); 57 | 58 | %Establecemos un conjunto de variables Globales, que dependen de la 59 | %frecuencia de muestreo 60 | setup_global( sampling_rate); 61 | %Esta funcion se encarga de definir las siguientes variables globales 62 | %fundamentales que dependen de la frecuencia de muestreo 63 | 64 | 65 | %Align_Nfft define el tamano de la ventana FFT (512 para 8Khz y 1024 para 66 | %16kHz) 67 | TWOPI= 6.28318530717959; 68 | count=0:Align_Nfft- 1; 69 | Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft)); 70 | % Equivalente a: 71 | % Window= hann( Align_Nfft); %Hanning window 72 | 73 | %Prepara las senales de referencia y degradada 74 | %Duda: reescala las senales a 16 bits (15 bits amp. 1 de signo), porque? 
75 | %Mete un buffer de busqueda al principio y otro al final, mas 320 76 | %milisegundos de padding 77 | ref_data= ref_data'; 78 | ref_data= ref_data* 32768; %2^15 79 | ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample; 80 | ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)]; 81 | 82 | deg_data= deg_data'; 83 | deg_data= deg_data* 32768; %2^15 84 | deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample; 85 | deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)]; 86 | 87 | maxNsamples= max( ref_Nsamples, deg_Nsamples); 88 | 89 | %Las dos senales deben de tener un nivel de ganancia parecido. Para 90 | %igualarlo se calcula la potencia de las senales. Pero solo se considera la 91 | %region del espectro con voz. Aqui desde 300 a 3Khz, aunque esto NO 92 | %COINCIDE con el standar. 93 | ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples); 94 | deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples); 95 | 96 | %Aplica un filtrado que simula la respuesta en frecuencia de un dispositivo 97 | %telefonico estandard. 98 | standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;... 99 | 250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;... 100 | 1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;... 101 | 3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 102 | 103 | ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB); 104 | deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB); 105 | 106 | 107 | % Salvaguardamos los datos para el modelado perceptual 108 | % Mas adelante las variables model_ref y mode_deg se vuelven a volcar sobre 109 | % ref_data y deg_data 110 | model_ref= ref_data; 111 | model_deg= deg_data; 112 | 113 | %Realmente no tengo ni idea de que diablos se le hace aqui a la senal. 
Se 114 | %le toca el DC offset y se filtra por algo (pero es muy dificil imaginar 115 | %que). Supongo que se acomoda la senal para la siguiente etapa (han copiado 116 | %las cocinas del codigo PESQ. 117 | [ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ... 118 | deg_Nsamples); 119 | 120 | %Aqui se calcula la envelope de la senal (log(MAX(E(k)/Ethresh,1))) 121 | %E(k) es la energia en 4 ms y Ethresh un umbral del VAD. ref_VAD se refiere 122 | %a la senal antes del logaritmo y el maximo. 123 | [ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples); 124 | [deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples); 125 | 126 | %Sobre el envelope se calcula el alineamiento en crudo. Basicamente se 127 | %calcula la correlacion cruzada entre las senales y se busca el maximo 128 | crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,WHOLE_SIGNAL); 129 | %NOTA: Los resultados se almacenan en variables globales Crude_DelayEst 130 | %y Crude_DelayConf; 131 | 132 | utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,... 133 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 134 | 135 | ref_data= model_ref; 136 | deg_data= model_deg; 137 | 138 | % make ref_data and deg_data equal length 139 | if (ref_Nsamples< deg_Nsamples) 140 | newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000); 141 | ref_data( newlen)= 0; 142 | elseif (ref_Nsamples> deg_Nsamples) 143 | newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000); 144 | deg_data( newlen)= 0; 145 | end 146 | 147 | %Tras la identificacion de las sentencias y el alineado se procede a la 148 | %evaluacion objetiva del mos mediante un modelo psicoacustico. 149 | 150 | pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ... 
function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
% PESQ objective speech quality measure
%
% Implements the PESQ measure based on ITU-T Recommendation P.862 [1].
%
% Usage:   pval = pesq('cleanFile.wav', 'enhancedFile.wav')
%
%   ref_wav  - clean (reference) input file in .wav format
%   deg_wav  - degraded/enhanced output file in .wav format
%   pesq_mos - PESQ value (MOS-like score)
%
% Only sampling rates of 8 kHz and 16 kHz are supported [1].
%
% References:
%   [1] ITU-T (2000). Perceptual evaluation of speech quality (PESQ), an
%       objective method for end-to-end speech quality assessment of
%       narrowband telephone networks and speech codecs. Recommendation P.862.
%
% Authors: Yi Hu and Philipos C. Loizou
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------

% Reference values for quick regression checks:
%   pesq('sp04.wav','sp04_babble_sn10.wav')  ->  2.4634
%   pesq('sp04.wav','enhanced.wav')          ->  2.5658

if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    return;
end

% Globals shared with the rest of the PESQ implementation.
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

% Read the REFERENCE wav (and obtain its sampling rate).
[ref_data,sampling_rate,nbits]= wavread( ref_wav);
% Fixed: use short-circuit && instead of element-wise & for this scalar test.
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end
% Read the TEST wav (its sampling rate is assumed equal to the reference's).
deg_data= wavread( deg_wav);

% Set the globals that depend on the sampling rate (Downsample,
% SEARCHBUFFER, Align_Nfft, ...).
setup_global( sampling_rate);

% Align_Nfft is the FFT window size (512 at 8 kHz, 1024 at 16 kHz).
TWOPI= 6.28318530717959;
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
% Equivalent to: Window= hann( Align_Nfft);  (Hanning window)

% Prepare the reference and degraded signals: rescale to the 16-bit range
% (15 amplitude bits + sign), then prepend/append a search buffer plus
% DATAPADDING_MSECS milliseconds of zero padding.
ref_data= ref_data';
ref_data= ref_data* 32768; %2^15
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

deg_data= deg_data';
deg_data= deg_data* 32768; %2^15
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

% Both signals must have a similar gain level.  The power is computed over
% the speech band only (here 300 Hz - 3 kHz; NOTE: this band does NOT match
% the standard exactly).
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% Filter both signals with the standard IRS receive characteristic
% (frequency [Hz] / gain [dB] pairs), modelling a standard telephone handset.
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);

% Keep copies for the perceptual model; ref_data/deg_data are overwritten by
% the alignment pre-processing below and restored afterwards.
model_ref= ref_data;
model_deg= deg_data;

% Condition the signals for the alignment stage: DC blocking plus the input
% filters inherited from the reference PESQ code.
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);

% Compute the VAD envelope log(max(E(k)/Ethresh, 1)), where E(k) is the
% energy per 4 ms frame and Ethresh a VAD threshold; *_VAD is the envelope
% before the log/max is applied.
[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);

% Crude alignment over the whole signal: cross-correlate the log-VAD
% envelopes and pick the maximum.  Results are stored in the globals
% Crude_DelayEst and Crude_DelayConf.
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,WHOLE_SIGNAL);

% Locate and time-align the individual utterances.
utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% Restore the (IRS-filtered) signals for the perceptual model.
ref_data= model_ref;
deg_data= model_deg;

% Zero-extend the shorter signal so ref_data and deg_data have equal length.
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end

% With the utterances identified and aligned, evaluate the objective MOS
% through the psychoacoustic model.
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );
function power= pow_of( data, start_point, end_point, divisor)
% POW_OF  Normalized power of a signal segment.
%
% Returns the sum of squared samples of data(start_point:end_point),
% divided by 'divisor' (typically the segment length, giving mean power).

segment= data( start_point: end_point);
power= sum( segment.* segment)/ divisor;
>> [snr_mean, segsnr_mean]= comp_snr(cleanFile.wav, enhdFile.wav);
measures of speech quality. Englewood Cliffs, NJ: Prentice-Hall.
function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
    Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
    Utt_DelayEst_l, Utt_DelayConf_l)
% SPLIT_ALIGN  Test whether one utterance is better modelled as two.
%
% Tries up to 40 candidate breakpoints inside the utterance
% [Utt_SpeechStart, Utt_SpeechEnd].  For each breakpoint the two halves are
% crude-aligned and then fine-aligned (windowed FFT cross-correlation
% accumulated into a circular histogram H).  If some breakpoint yields two
% halves whose delays differ by at least one Downsample frame and whose
% combined confidence beats both the single-utterance confidence
% (Utt_DelayConf_l) and the best split found so far, the winning split is
% published through the globals Best_ED1/Best_D1/Best_DC1 (first half),
% Best_ED2/Best_D2/Best_DC2 (second half) and Best_BP (breakpoint).
% Slot MAXNUTTERANCES of the utterance globals is used as scratch space for
% the crude_align calls.

global MAXNUTTERANCES Align_Nfft Downsample Window
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

% Per-breakpoint results: BPs = breakpoint frames, ED* = crude delay
% estimates, D* = refined delays, DC* = confidences (1 = first half,
% 2 = second half).  At most 40 breakpoints are considered.
Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);


Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;    % scratch utterance slot for crude_align
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel = Align_Nfft / 64;     % half-width of the triangular smoothing kernel
Delta = Align_Nfft / (4 * Downsample);
% Breakpoint step, chosen so that ~40 breakpoints span the utterance.
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;

% Keep breakpoints at least Pad frames away from the utterance edges.
Pad = floor( Utt_Len / 10);
if( Pad < 75 )
    Pad = 75;
end

% Generate the breakpoint candidates Utt_BPs(1..N_BPs-1).
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
    N_BPs= N_BPs+ 1;
    Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
    if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
        break;
    end
end

if( N_BPs <= 1 )
    return;  % utterance too short for any breakpoint
end

% Crude-align both halves for every breakpoint (results read back from the
% scratch slot of Utt_Delay).
for bp = 1: N_BPs- 1
    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_Start_l;
    UttSearch_End(Utt_Test) = Utt_BPs(bp);

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED1(bp) = Utt_Delay(Utt_Test);

    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_BPs(bp);
    UttSearch_End(Utt_Test) = Utt_End_l;

    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED2(bp) = Utt_Delay(Utt_Test);
end

% Fine alignment of the FIRST half.  -2.0 marks "not yet processed"; each
% outer pass picks the first unprocessed breakpoint and, because H/Hsum
% accumulate across windows, the inner `while bp < N_BPs-1` loop extends the
% same histogram to later breakpoints that share the same crude delay.
Utt_DC1(1: N_BPs-1) = -2.0;
while( 1 )
    bp = 1;
    while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
        bp = bp+ 1;
    end
    if( bp >= N_BPs )
        break;
    end

    estdelay = Utt_ED1(bp);
    H(1: Align_Nfft)= 0;
    Hsum = 0.0;

    % Window positions in the reference (startr) and degraded (startd)
    % signals, stepping Align_Nfft/4 samples (75% overlap).
    startr = (Utt_Start_l- 1) * Downsample+ 1;
    startd = startr + estdelay;

    if ( startd < 0 )
        startr = -estdelay+ 1;
        startd = 1;
    end

    while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
            ((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
        % Windowed cross-correlation via FFT: ifft(conj(F(x1)).*F(x2)).
        X1= ref_data(startr: startr+ Align_Nfft- 1).* Window;
        X2= deg_data(startd: startd+ Align_Nfft- 1).* Window;

        X1_fft= fft( X1, Align_Nfft );
        X1_fft_conj= conj( X1_fft);
        X2_fft= fft( X2, Align_Nfft );
        X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

        X1= abs( X1);
        % Keep only correlation peaks above 99% of the maximum; spread each
        % peak into H with a triangular kernel (compressed by ^0.125).
        v_max= max( X1)* 0.99;
        n_max = (v_max^ 0.125 )/ kernel;

        for count = 0: Align_Nfft- 1
            if( X1(count+ 1) > v_max )
                Hsum = Hsum+ n_max * kernel;
                for k = 1-kernel: kernel- 1
                    H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                        H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                        n_max* (kernel- abs(k));
                end
            end
        end

        startr = startr+ (Align_Nfft / 4);
        startd = startd+ (Align_Nfft / 4);
    end

    % Histogram peak -> delay refinement; wrap indices above Nfft/2 to
    % negative lags.
    [v_max, I_max] = max( H);
    if( I_max- 1 >= (Align_Nfft/2) )
        I_max = I_max- Align_Nfft;
    end

    Utt_D1(bp) = estdelay + I_max- 1;
    if( Hsum > 0.0 )
        Utt_DC1(bp) = v_max / Hsum;   % peak share of total mass = confidence
    else
        Utt_DC1(bp) = 0.0;
    end

    % Extend the accumulated histogram to later breakpoints with the same
    % crude delay (windows continue from the current startr/startd).
    while( bp < (N_BPs - 1) )
        bp = bp + 1;

        if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
            while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
                    ((startr+ Align_Nfft)<= ...
                    ((Utt_BPs(bp)- 1)* Downsample+ 1) ))
                X1= ref_data( startr: startr+ Align_Nfft- 1).* ...
                    Window;
                X2= deg_data( startd: startd+ Align_Nfft- 1).* ...
                    Window;
                X1_fft= fft( X1, Align_Nfft );
                X1_fft_conj= conj( X1_fft);
                X2_fft= fft( X2, Align_Nfft );
                X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

                X1= abs( X1);
                v_max = 0.99* max( X1);
                n_max = (v_max^ 0.125)/ kernel;

                for count = 0: Align_Nfft- 1
                    if( X1(count+ 1) > v_max )
                        Hsum = Hsum+ n_max * kernel;
                        for k = 1-kernel: kernel-1
                            H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                                H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                                n_max* (kernel- abs(k));
                        end
                    end
                end

                startr = startr+ (Align_Nfft / 4);
                startd = startd+ (Align_Nfft / 4);
            end

            [v_max, I_max] = max( H);
            if( I_max- 1 >= (Align_Nfft/2) )
                I_max = I_max- Align_Nfft;
            end


            Utt_D1(bp) = estdelay + I_max- 1;
            if( Hsum > 0.0 )
                Utt_DC1(bp) = v_max / Hsum;
            else
                Utt_DC1(bp) = 0.0;
            end
        end
    end
end

% Fine alignment of the SECOND half.  Only breakpoints whose first-half
% confidence beat the single-utterance confidence are worth processing
% (marked -2.0); the rest are set to 0 and skipped.
for bp= 1: N_BPs- 1
    if( Utt_DC1(bp) > Utt_DelayConf_l )
        Utt_DC2(bp) = -2.0;
    else
        Utt_DC2(bp) = 0.0;
    end
end

% Same histogram procedure as above, but walking BACKWARDS from the end of
% the utterance towards the breakpoint.
while( 1 )
    bp = N_BPs- 1;
    while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
        bp = bp- 1;
    end
    if( bp < 1 )
        break;
    end

    estdelay = Utt_ED2(bp);
    H( 1: Align_Nfft)= 0;
    Hsum = 0.0;

    startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
    startd = startr + estdelay;

    if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
        startd = deg_Nsamples - Align_Nfft+ 1;
        startr = startd - estdelay;
    end

    while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
        X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
        X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

        X1_fft= fft( X1, Align_Nfft);
        X1_fft_conj= conj( X1_fft);
        X2_fft= fft( X2, Align_Nfft);

        X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
        X1= abs( X1);

        v_max = max( X1)* 0.99;
        n_max = ( v_max^ 0.125 )/ kernel;

        for count = 0: Align_Nfft- 1
            if( X1(count+ 1) > v_max )
                Hsum = Hsum+ n_max * kernel;
                for k = 1-kernel: kernel- 1
                    H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))= ...
                        H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                        n_max* (kernel- abs(k));
                end
            end
        end

        startr = startr- (Align_Nfft / 4);
        startd = startd- (Align_Nfft / 4);
    end

    [v_max, I_max] = max( H);
    if( I_max- 1 >= (Align_Nfft/2) )
        I_max = I_max- Align_Nfft;
    end

    Utt_D2(bp) = estdelay + I_max- 1;
    if( Hsum > 0.0 )
        Utt_DC2(bp) = v_max / Hsum;
    else
        Utt_DC2(bp) = 0.0;
    end

    % Extend backwards to earlier breakpoints sharing the same crude delay.
    while( bp > 1 )
        bp = bp - 1;
        if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
            while( (startd >= 1) && (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
                X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
                X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
                X1_fft_conj= conj( fft( X1, Align_Nfft));
                X2_fft= fft( X2, Align_Nfft);
                X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);

                X1= abs( X1);
                v_max = max( X1)* 0.99;
                n_max = (v_max^ 0.125)/ kernel;

                for count = 0: Align_Nfft- 1
                    if( X1(count+ 1) > v_max )
                        Hsum = Hsum+ n_max * kernel;
                        for k = 1-kernel: kernel- 1
                            H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
                                H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
                                n_max* (kernel- abs(k));
                        end
                    end
                end

                startr = startr- (Align_Nfft / 4);
                startd = startd- (Align_Nfft / 4);
            end

            [v_max, I_max] = max( H);
            if( I_max- 1 >= (Align_Nfft/2) )
                I_max = I_max- Align_Nfft;
            end


            Utt_D2(bp) = estdelay + I_max- 1;
            if( Hsum > 0.0 )
                Utt_DC2(bp) = v_max / Hsum;
            else
                Utt_DC2(bp) = 0.0;
            end
        end
    end
end

% Pick the best breakpoint: the halves must disagree by at least one
% Downsample frame, both must beat the single-utterance confidence, and
% their combined confidence must beat the best split so far.
for bp = 1: N_BPs- 1
    if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
            ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
            (Utt_DC1(bp) > Utt_DelayConf_l) && ...
            (Utt_DC2(bp) > Utt_DelayConf_l) )
        Best_ED1 = Utt_ED1(bp);
        Best_D1 = Utt_D1(bp);
        Best_DC1 = Utt_DC1(bp);
        Best_ED2 = Utt_ED2(bp);
        Best_D2 = Utt_D2(bp);
        Best_DC2 = Utt_DC2(bp);
        Best_BP = Utt_BPs(bp);
    end
end
function time_align(ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples, Utt_id)
% TIME_ALIGN  Refine the delay estimate of one utterance.
%
% Starting from the crude estimate Utt_DelayEst(Utt_id), slides a windowed
% FFT cross-correlation over the utterance's search window, accumulates the
% dominant correlation peaks into a histogram H, smooths it with a
% triangular kernel, and takes the histogram maximum as the delay
% correction.  Writes the refined delay to the global Utt_Delay(Utt_id) and
% its confidence to Utt_DelayConf(Utt_id); returns nothing directly.
%
% Inputs:
%   ref_data/deg_data        - reference/degraded signals (row vectors)
%   ref_Nsamples/deg_Nsamples- their sample counts
%   Utt_id                   - index of the utterance to align

global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window

estdelay = Utt_DelayEst(Utt_id);

H = zeros( 1, Align_Nfft);
X1= zeros( 1, Align_Nfft);
X2= zeros( 1, Align_Nfft);

% Window positions in the reference (startr) and degraded (startd) signals.
startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
if ( startd < 0 )
    % Negative delay would start before the degraded signal: shift both.
    startr = 1 -estdelay;
    startd = 1;
end

% Slide the window in Align_Nfft/4 steps (75% overlap) over the search
% window, accumulating correlation peaks into H.
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
        ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
    X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % find cross-correlation between X1 and X2
    X1_fft= fft( X1, Align_Nfft );
    X1_fft_conj= conj( X1_fft);
    X2_fft= fft( X2, Align_Nfft );
    X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );

    X1= abs( X1);
    v_max = max( X1)* 0.99;

    % Add an amplitude-compressed (^0.125) vote at every lag whose
    % correlation exceeds 99% of this window's maximum.
    X1_greater_vmax= find( X1 > v_max );
    H( X1_greater_vmax )= H( X1_greater_vmax )+ v_max^ 0.125;

    startr = startr+ Align_Nfft/ 4;
    startd = startd+ Align_Nfft/ 4;

end

% Smooth the histogram by circular convolution with a triangular kernel of
% half-width Align_Nfft/64 (built directly in X2), via FFT.
X1= H;
X2= 0;
Hsum = sum( H);

X2(1) = 1.0;
kernel = Align_Nfft / 64;

for count= 2: kernel
    X2( count)= 1- (count- 1)/ kernel;
    X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end

X1_fft= fft( X1, Align_Nfft );
X2_fft= fft( X2, Align_Nfft );

X1= ifft( X1_fft.* X2_fft, Align_Nfft );

% Normalize so the peak value is the fraction of total histogram mass.
if (Hsum> 0)
    H= abs( X1)/ Hsum;
else
    H= 0;
end

% Histogram peak -> delay correction; indices above Nfft/2 wrap to
% negative lags.
[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;
end

Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max; % confidence
X1_fft.* X2_fft, Align_Nfft ); 58 | 59 | if (Hsum> 0) 60 | H= abs( X1)/ Hsum; 61 | else 62 | H= 0; 63 | end 64 | 65 | [v_max, I_max] = max( H); 66 | if( I_max- 1 >= (Align_Nfft/2) ) 67 | I_max = I_max- Align_Nfft; 68 | end 69 | 70 | Utt_Delay(Utt_id) = estdelay + I_max- 1; 71 | Utt_DelayConf(Utt_id) = v_max; % confidence 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /PESQ/utterance_locate.m: -------------------------------------------------------------------------------- 1 | function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,... 2 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 3 | 4 | global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst 5 | 6 | id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples); 7 | 8 | for Utt_id= 1: Nutterances 9 | %fprintf( 1, 'Utt_id is %d\n', Utt_id); 10 | crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id); 11 | time_align(ref_data, ref_Nsamples, ... 12 | deg_data, deg_Nsamples, Utt_id); 13 | end 14 | 15 | id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples); 16 | 17 | 18 | utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 19 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD); 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /PESQ/utterance_split.m: -------------------------------------------------------------------------------- 1 | function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 
2 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD) 3 | 4 | global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER 5 | global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start 6 | global Utt_Start Utt_End Largest_uttsize UttSearch_End 7 | global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP 8 | 9 | Utt_id = 1; 10 | while( (Utt_id <= Nutterances) && (Nutterances <= MAXNUTTERANCES) ) 11 | Utt_DelayEst_l = Utt_DelayEst(Utt_id); 12 | Utt_Delay_l = Utt_Delay(Utt_id); 13 | Utt_DelayConf_l = Utt_DelayConf(Utt_id); 14 | Utt_Start_l = Utt_Start(Utt_id); 15 | Utt_End_l = Utt_End(Utt_id); 16 | 17 | Utt_SpeechStart = Utt_Start_l; 18 | % fprintf( 'SpeechStart is %d\n', Utt_SpeechStart); 19 | while( (Utt_SpeechStart < Utt_End_l) && ... 20 | (ref_VAD(Utt_SpeechStart)<= 0.0) ) 21 | Utt_SpeechStart = Utt_SpeechStart + 1; 22 | end %find the SpeechStart for each utterance 23 | Utt_SpeechEnd = Utt_End_l; 24 | % fprintf( 'SpeechEnd is %d\n', Utt_SpeechEnd); 25 | while( (Utt_SpeechEnd > Utt_Start_l) && ... 26 | (ref_VAD(Utt_SpeechEnd) <= 0)) 27 | Utt_SpeechEnd = Utt_SpeechEnd- 1; 28 | end 29 | Utt_SpeechEnd = Utt_SpeechEnd+ 1; 30 | %find SpeechEnd for each utterance 31 | Utt_Len = Utt_SpeechEnd - Utt_SpeechStart; 32 | 33 | % fprintf( 'Utt_Len is %d\n', Utt_Len); 34 | 35 | if( Utt_Len >= 200 ) 36 | split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 37 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ... 38 | Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ... 39 | Utt_DelayEst_l, Utt_DelayConf_l); 40 | % fprintf( '\nBest_ED1, Best_D1, Best_DC1 is %d, %d, %f\n',... 41 | % Best_ED1, Best_D1, Best_DC1); 42 | % fprintf( 'Best_ED2, Best_D2, Best_DC2 is %d, %d, %f\n',... 
43 | % Best_ED2, Best_D2, Best_DC2); 44 | % fprintf( 'Best_BP is %d\n', Best_BP); 45 | 46 | if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) ) 47 | for step = Nutterances: -1: Utt_id+ 1 48 | Utt_DelayEst(step+ 1) = Utt_DelayEst(step); 49 | Utt_Delay(step+ 1) = Utt_Delay(step); 50 | Utt_DelayConf(step+ 1) = Utt_DelayConf(step); 51 | Utt_Start(step+ 1) = Utt_Start(step); 52 | Utt_End(step+ 1) = Utt_End(step); 53 | UttSearch_Start(step+ 1) = Utt_Start( step); 54 | UttSearch_End(step+ 1) = Utt_End( step); 55 | end 56 | 57 | Nutterances = Nutterances+ 1; 58 | 59 | Utt_DelayEst(Utt_id) = Best_ED1; 60 | Utt_Delay(Utt_id) = Best_D1; 61 | Utt_DelayConf(Utt_id) = Best_DC1; 62 | 63 | Utt_DelayEst(Utt_id +1) = Best_ED2; 64 | Utt_Delay(Utt_id +1) = Best_D2; 65 | Utt_DelayConf(Utt_id +1) = Best_DC2; 66 | 67 | UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id); 68 | UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id); 69 | if( Best_D2 < Best_D1 ) 70 | Utt_Start(Utt_id) = Utt_Start_l; 71 | Utt_End(Utt_id) = Best_BP; 72 | Utt_Start(Utt_id +1) = Best_BP; 73 | Utt_End(Utt_id +1) = Utt_End_l; 74 | else 75 | Utt_Start( Utt_id) = Utt_Start_l; 76 | Utt_End( Utt_id) = Best_BP + ... 77 | floor( (Best_D2- Best_D1)/ (2 * Downsample)); 78 | Utt_Start( Utt_id +1) = Best_BP - ... 79 | floor( (Best_D2- Best_D1)/ (2 * Downsample)); 80 | Utt_End( Utt_id +1) = Utt_End_l; 81 | end 82 | 83 | if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ... 84 | Best_D1 < 0 ) 85 | Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ... 86 | floor( (Downsample - 1 - Best_D1) / Downsample); 87 | end 88 | 89 | if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >... 90 | (deg_Nsamples - SEARCHBUFFER * Downsample) ) 91 | Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)... 
92 | / Downsample)- SEARCHBUFFER+ 1; 93 | end 94 | else 95 | Utt_id= Utt_id+ 1; 96 | end 97 | else 98 | Utt_id = Utt_id+ 1; 99 | end 100 | end 101 | 102 | Largest_uttsize = max( Utt_End- Utt_Start); 103 | 104 | % fid= fopen( 'uttinfo_mat.txt', 'wt'); 105 | % fprintf( fid, 'Number of Utterances is:\n'); 106 | % fprintf( fid, '%d\n', Nutterances); 107 | % fprintf( fid, 'Utterance Delay Estimation:\n'); 108 | % fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) ); 109 | % fprintf( fid, 'Utterance Delay:\n'); 110 | % fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances)); 111 | % fprintf( fid, 'Utterance Delay Confidence:\n'); 112 | % fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances)); 113 | % fprintf( fid, 'Utterance Start:\n'); 114 | % fprintf( fid, '%d\n', Utt_Start( 1: Nutterances)); 115 | % fprintf( fid, 'Utterance End:\n'); 116 | % fprintf( fid, '%d\n', Utt_End( 1: Nutterances)); 117 | % fprintf( fid, 'Largest utterance length:\n'); 118 | % fprintf( fid, '%d\n', Largest_uttsize); 119 | % fclose( fid); 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /PESQ/wavread.m: -------------------------------------------------------------------------------- 1 | function [data,Srate,Nbits]=wavread(filename) 2 | 3 | [data,Srate]=audioread(filename); 4 | Nbits=32; 5 | -------------------------------------------------------------------------------- /PESQ/white_noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/PESQ/white_noise.wav -------------------------------------------------------------------------------- /RETO2016_README.txt: -------------------------------------------------------------------------------- 1 | - Objetivo del reto: realzar una señal de voz multicanal. 
2 |  3 | - La calidad de la voz realzada se medirá mediante un test PESQ (proporciona una nota de 0, calidad pésima, a 5, calidad excelente) respecto a una señal limpia de referencia. Como punto de partida se tomará la calidad proporcionada por el canal central (num 8, PESQ=2.1752) y un beamformer Delay-And-Sum (PESQ=2.3741). Programas: PESQ.zip es un archivo-directorio comprimido con el programa PESQ para la evaluación de la calidad. Descomprimir en el directorio de trabajo y consultar el "readme" correspondiente para su uso. 4 |  5 | - Señales a emplear (directorio signals): 6 | * Tipo de array: lineal, 15 canales, no uniforme (espaciados d, 2*d y 4*d, d=4cm). 7 | * La señal multicanal ruidosa (ruido laboratorio) a realzar es "an103-mtms-arr4A.adc". 8 | * La señal de referencia limpia monocanal adquirida con micrófono de proximidad es "an103-mtms-senn4.adc". 9 | * Otras señales: "an10n-mtms-arr4A.adc" (n=1,2,4,5). Grabadas con el mismo array y tipo de ruido. 10 | * Las especificaciones de adquisición de las señales pueden consultarse en el fichero "README_acquisition". 11 |  12 | - Parámetros: 13 | * Fs=16000; %frecuencia de muestreo 14 | * nc=15; %numero de canales 15 | * L=400; %longitud de la STFT 16 |  17 | - Ficheros adicionales: 18 | * Leer_Array_Signals.m: programa de lectura de la señal multicanal. 19 | * offsetcomp.m: función utilizada por el programa de lectura para compensación de componentes DC. 20 | * steering_vector.mat: contiene la variable ds (15x201) con el steering vector a todas las frecuencias posibles (k=1:201). -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/README_Acquisition: -------------------------------------------------------------------------------- 1 |  2 | This directory contains multi-microphone data recorded by Tom Sullivan 3 | (tms@cs.cmu.edu) at Carnegie Mellon University. 4 |  5 | All data is sampled at 16 kHz, 16-bit linear sampling.
6 | 7 | There are 3 directories included: 8 | 9 | 15element -- Recorded at Carnegie Mellon University. 10 | 11 | 8element -- Recorded at Carnegie Mellon University. 12 | 13 | rutdata -- Recorded at Rutgers University. 14 | 15 | The contents of each directory are described individually in more detail 16 | below. 17 | 18 | 19 | 15element 20 | --------- 21 | These utterances were collected with a 15-element array. The array spacing is 22 | such that it is actually three 7-element sub-arrays (with different spacing) 23 | interleaved (ala. Jim Flanagan's array at ATT/Rutgers U.) Some elements are 24 | shared between the sub-arrays. 25 | 26 | If the minimum spacing of this array is N-cm. The array looks roughly like 27 | this: 28 | 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 30 | 31 | Where element 8 is in the exact center of the array. 32 | 33 | Elements 5, 6, 7, 8, 9, 10, and 11 are a 7 element array with a spacing of 34 | N cm between elements. 35 | 36 | Elements 3, 4, 6, 8, 10, 12 and 13 are a 7 element array with a spacing of 37 | 2*N cm between elements. 38 | 39 | Elements 1, 2, 4, 8, 12, 14 and 15 are a 7 element array with a spacing of 40 | 4*N cm between elements. 41 | 42 | Of course you can feel free to combine the elements in any manner you 43 | desire for your own experiments. We used them to study different element 44 | spacing from data that was collected simultaneously to each of the 45 | sub-arrays. 46 | 47 | Within this directory are files of the form: 48 | 49 | {utterance}-{subject}-{microphone_type}.adc 50 | 51 | {utterance} is either an* or cen*, where "*" is a number. These are 52 | alphanumeric and census utterances in the AN4 dataset used often here 53 | at CMU. 54 | 55 | {subject} is the same 4-letter code used to name the sub-directories. 56 | 57 | {microphone_type} is the type of microphone used and a key into the 58 | experimental conditions. "senn" is the Sennheiser HMD414 headset 59 | closetalking microphone, used in every set as a control. 
"arr" is a 60 | microphone array having multiple elements. All array elements are 61 | Panasonic WD-063 noise cancelling electret condenser elements. 62 | 63 | There are 6 data sets collected with this array: 64 | 65 | 1) This set was collected in a noisy computer lab at Carnegie Mellon Univ. 66 | with an array of 15-elements with a minimum spacing of 3 cm. It is denoted 67 | by "arr3A" for the 15 array elements and "senn3" for the closetalk. The 68 | subject sat one meter from the center of the array. 69 | 70 | Ex: 71 | 72 | an101-mtms-arr3A.adc (15-channel array, 3 cm minimum spacing) 73 | an101-mtms-senn3.adc (closetalking control signal for above) 74 | 75 | 2) This set was collected in the same noisy lab as above but with the 76 | 15-element array with a minimum spacing of 4 cm. It is denoted by 77 | "arr4A" and "senn4" for the closetalk. The subject sat one meter from 78 | the center of the array. 79 | 80 | Ex: 81 | 82 | an101-mtms-arr4A.adc (15-channel array, 4 cm minimum spacing) 83 | an101-mtms-senn4.adc (closetalking control signal for above) 84 | 85 | 3) This set was collected in a conference room with the 15-element array 86 | with a minimum spacing of 4 cm. The conference room is larger than the 87 | noisy lab, but didn't have all of the computer fans. It is denoted by 88 | "arrC1A" and "sennC1" for the closetalk. The subject sat one meter from 89 | the center of the array. 90 | 91 | Ex: 92 | 93 | an101-mtms-arrC1A.adc (15-channel array, 4 cm minimum spacing, 1 meter dist.) 94 | an101-mtms-sennC1.adc (closetalking control signal for above) 95 | 96 | 4) This set was collected in the same conference room as 4) above with the 97 | 15-element array with a minimum spacing of 4 cm. It is denoted by 98 | "arrC1A" and "sennC1" for the closetalk. The subject sat three meters from 99 | the center of the array. 100 | 101 | Ex: 102 | 103 | an101-mtms-arrC3A.adc (15-channel array, 4 cm minimum spacing, 3 meter dist.) 
104 | an101-mtms-sennC3.adc (closetalking control signal for above) 105 | 106 | 5) This set was collected the same conference room as above with the 107 | 15-element array with a minimum spacing of 4 cm, but also had an AM talk-radio 108 | jamming signal at approximately 45 degrees off-axis from the center of the 109 | array, competing with the speaker. It is denoted by "arrCR1A" and "sennCR1" 110 | for the closetalk. The subject sat one meter from the center of the array. 111 | 112 | Ex: 113 | 114 | an101-mtms-arrCR1A.adc (15-channel array, 4 cm minimum spacing, 1 meter 115 | dist., radio jamming signal) 116 | an101-mtms-sennCR1.adc (closetalking control signal for above) 117 | 118 | 6) This set was collected the same conference room as above with the 119 | 15-element array with a minimum spacing of 4 cm, but also had an AM talk-radio 120 | jamming signal at approximately 45 degrees off-axis from the center of the 121 | array, competing with the speaker. It is denoted by "arrCR1A" and "sennCR1" 122 | for the closetalk. The subject sat three meters from the center of the array. 123 | 124 | Ex: 125 | 126 | an101-mtms-arrCR3A.adc (15-channel array, 4 cm minimum spacing, 3 meter 127 | dist., radio jamming signal) 128 | an101-mtms-sennCR3.adc (closetalking control signal for above) 129 | 130 | 131 | 8element 132 | -------- 133 | There are ten "subject" sub-directories included. Their names are each four 134 | letters long. The first letter denotes the gender of the speaker ("m" 135 | or "f"). All data we've collected thus far used only male speakers. 136 | The final three letters are the initials of the subject. Hence "mtms" is 137 | a male speaker with the initials TMS. 138 | 139 | Within each subject's sub-directory are files of the form: 140 | 141 | {utterance}-{subject}-{microphone_type}.adc 142 | 143 | {utterance} is either an* or cen*, where "*" is a number. These are 144 | alphanumeric and census utterances in the AN4 dataset used often here 145 | at CMU. 
146 | 147 | {subject} is the same 4-letter code used to name the sub-directories. 148 | 149 | {microphone_type} is the type of microphone used and a key into the 150 | experimental conditions. "senn" is the Sennheiser HMD414 headset 151 | closetalking microphone, used in every set as a control. "arr" is a 152 | microphone area having multiple elements. All array elements are 153 | Panasonic WD-063 noise cancelling electret condenser elements. 154 | 155 | There are 3 different data sets contained in these directories. They are 156 | listed as follows: 157 | 158 | This set contains 10 male speakers each speaking 14 utterances. The 159 | microphone array used had eight (8) elements, and these utterances are 160 | denoted by "arrA" as the {microphone_type}. The 8 elements were spaced 161 | linearly and with a spacing of 7 cm between elements. The subject sat 162 | directly in front of the array at a distance of 1 meter from the center. 163 | A pair of Crown PZM6FS microphones were also used to collect a stereo pair 164 | of Crown PZM signals to compare the performance of a quality set of 165 | omnidirectional microphones to the Panasonic WD-063 array elements. 166 | The closetalking control signal is denoted by "senn" only. The set was 167 | collected in a noisy computer lab at Carnegie Mellon Univ. with many 168 | computer and disk-drive fans. 169 | 170 | Ex: 171 | 172 | an101-mtms-arrA.adc (8-channel array) 173 | an101-mtms-pzmS.adc (stereo pair of PZM6FS microphones) 174 | an101-mtms-senn.adc (closetalking control signal) 175 | 176 | rutdata 177 | ------- 178 | 179 | These data were collected at the CAIP Center at Rutgers University in 1991 180 | with the help of Jim Flanagan and Joe French. 181 | 182 | The experimental setup consisted of two 23-element microphone arrays built 183 | at ATT Bell Labs, a Crown PZM6FS microphone, and a Sennheiser HMD414 head- 184 | mounted closetalking microphone. 
The arrays had 23 microphone elements each 185 | (comprising three interleaved arrays of 11-elements each). One of the arrays 186 | had a bandwidth of 8kHz (standard DARPA speech bandwidth) and the other a 187 | bandwidth of 4kHz (standard telephone speech bandwidth). 188 | 189 | There are two sub-directories: 1meter and 3meters. These directories refer 190 | to the distance of the speaker from the center of the microphone array. 191 | Within each directory are sub-directories, one for each speaker. 192 | 193 | Within each subject's sub-directory are files of the form: 194 | 195 | {utterance}-{subject}-{microphone_type}.adc 196 | 197 | {utterance} is either an* or cen*, where "*" is a number. These are 198 | alphanumeric and census utterances in the AN4 dataset used often here 199 | at CMU. 200 | 201 | {subject} is the same 4-letter code used to name the sub-directories. 202 | 203 | {microphone_type} is the type of microphone used and a key into the 204 | experimental conditions. "sen" is the Sennheiser HMD414 headset 205 | closetalking microphone, used in every set as a control. "pzm" is a 206 | Crown PZM6FS microphone, "arrA(1m or 3m)" is a 8kHZ bandwidth array described 207 | above, and "arrB(1m or 3m) is a 4kHZ bandwidth array as describe above. All 208 | array elements are Panasonic WD-063 noise cancelling electret condenser 209 | elements. 210 | 211 | The arrA and arrB files in this data set are monophonic files. They are 212 | created by combining the array element outputs in real-time via hardware 213 | via delay and sum beamforming. 214 | 215 | ------------------------ 216 | The sentence transcripts for each of the files in "15element" and 8element" 217 | are contained in the "transcripts" directory. 218 | 219 | The sentence transcripts for each of the files in "rutdata" are contained 220 | in the "rutdata/transcripts/trans{1m,3m}" directories. 
221 | ------------------------ 222 | 223 | The file ad.h in this directory is a C "include" file that contains the 224 | structure of the soundfile headers found on the .adc files. 225 | 226 | 227 | -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an101-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an101-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an102-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an102-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an103-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an103-mtms-arr4A.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an103-mtms-senn4.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an103-mtms-senn4.adc -------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an104-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an104-mtms-arr4A.adc 
-------------------------------------------------------------------------------- /RETO2016_TOOLS/signals/an105-mtms-arr4A.adc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/RETO2016_TOOLS/signals/an105-mtms-arr4A.adc -------------------------------------------------------------------------------- /ResumenResultados.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/ResumenResultados.xlsx -------------------------------------------------------------------------------- /array.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/array.wav -------------------------------------------------------------------------------- /asdf.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/asdf.wav -------------------------------------------------------------------------------- /image_2017-02-16_15-18-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/image_2017-02-16_15-18-08.png -------------------------------------------------------------------------------- /limpia.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/limpia.wav -------------------------------------------------------------------------------- /lms_eq.m: 
--------------------------------------------------------------------------------
% lms_eq  One LMS adaptation step for the GSC adaptive (sidelobe-cancelling) branch.
%
%   [yout, ak] = lms_eq(ak, xbloqueo, xout, mu)
%
%   Inputs:
%     ak       - current adaptive filter weights (per original comments, 14 x L/2)
%     xbloqueo - blocking-matrix outputs, 14 x L/2 -- TODO confirm shape against caller
%     xout     - fixed-beamformer output for the frame (transposed inside)
%     mu       - LMS step size
%   Outputs:
%     yout     - enhanced (error) output for one frame, 1 x L/2
%     ak       - updated weights
%
%   BUGFIX: the constant E was declared but never used, while the weight
%   update divided by xk.^2, which blows up whenever a reference sample is
%   zero. E is now added to the denominator (standard NLMS regularization);
%   behavior is unchanged except for near-zero reference inputs.
function [ yout, ak ] = lms_eq(ak,xbloqueo,xout,mu)

E = 10e-5;                       % regularization constant (avoids division by zero)

xk = xbloqueo;                   % reference inputs, 14 x L/2
yk = sum(ak.*xk);                % adaptive-branch interference estimate, 1 x L/2
yout = xout'-yk;                 % output (error) for one frame, 1 x L/2
err = repmat(yout,14,1).*xk;     % per-channel gradient term, 14 x L/2
ak = ak + mu*err./(xk.^2 + E);   % regularized (NLMS-style) weight update

end

-------------------------------------------------------------------------------- /offsetcomp.m: --------------------------------------------------------------------------------
% offsetcomp  DC-offset compensation for one signal channel.
%
%   xout = offsetcomp(x)
%
%   Removes the mean of x and then applies a first-order DC-blocking filter
%   y(n) = x(n) - x(n-1) + F*y(n-1) with pole F = 0.98. Always returns a
%   column vector (as the original did).
%
%   Performance: xout is now preallocated instead of being grown inside the
%   loop (same output, O(N) instead of O(N^2)).
function xout = offsetcomp(x)

F=0.98;              % DC-blocker pole (notch at DC)
N=length(x);
x=x-mean(x);         % remove the bulk offset first
x_ant=0;             % previous input sample
xof=0;               % previous filter output
xout=zeros(N,1);     % preallocate (was: xout=[xout; xof] in the loop)
for n=1:N
xof=x(n)-x_ant+F*xof;
x_ant=x(n);
xout(n)=xof;
end
-------------------------------------------------------------------------------- /steering_vector.mat: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jgarciagimenez/GSC_beamforming/5c49dd51f837679fdd4355d535a372ed2de8d894/steering_vector.mat
--------------------------------------------------------------------------------