├── PESQ
    ├── DC_block.m
    ├── FFTNXCorr.m
    ├── apply_VAD.m
    ├── apply_filter.m
    ├── apply_filters.m
    ├── crude_align.m
    ├── fix_power_level.m
    ├── id_searchwindows.m
    ├── id_utterances.m
    ├── input_filter.m
    ├── pesq.m
    ├── pesq_psychoacoustic_model.m
    ├── pow_of.m
    ├── setup_global.m
    ├── split_align.m
    ├── stoi.m
    ├── time_align.m
    ├── utterance_locate.m
    └── utterance_split.m
├── README.md
├── bss_eval_sources.m
├── estoi.m
├── evaluate_2speaker_ori.m
├── evaluate_2speaker_separated.m
├── pesq.m
├── rusult
    ├── 050a0501_1.7783_442o030z_-1.7783_1.wav
    ├── 050a0501_1.7783_442o030z_-1.7783_2.wav
    ├── 050a0502_1.3461_440o030j_-1.3461_1.wav
    ├── 050a0502_1.3461_440o030j_-1.3461_2.wav
    ├── 050a0502_1.463_420a010o_-1.463_1.wav
    ├── 050a0502_1.463_420a010o_-1.463_2.wav
    ├── 050a0502_1.9707_440c020w_-1.9707_1.wav
    └── 050a0502_1.9707_440c020w_-1.9707_2.wav
└── stoi.m


/PESQ/DC_block.m:
--------------------------------------------------------------------------------
 1 | function mod_data= DC_block( data, Nsamples)
 2 | % DC_BLOCK  Subtract the DC offset from the active part of a padded signal
 3 | % and fade both edges of that part in/out over Downsample samples.
 4 | %
 5 | %   data     - signal vector; samples outside the active range
 6 | %              SEARCHBUFFER*Downsample+1 .. Nsamples-SEARCHBUFFER*Downsample
 7 | %              are returned untouched
 8 | %   Nsamples - number of samples of data to consider
 9 | %   mod_data - copy of data with the offset removed and the edges ramped
10 | 
11 | global Downsample DATAPADDING_MSECS SEARCHBUFFER
12 | 
13 | guard= SEARCHBUFFER* Downsample;
14 | active= (guard+ 1): (Nsamples- guard);
15 | 
16 | % The offset is the sum over the active samples divided by Nsamples, not by
17 | % the number of active samples. The original author flags this as a little
18 | % weird; it is reproduced unchanged.
19 | dc_level= sum( data( active))/ Nsamples;
20 | 
21 | mod_data= data;
22 | mod_data( active)= data( active)- dc_level;
23 | 
24 | % Ramp numerators 0.5, 1.5, ..., Downsample-0.5; the division by Downsample
25 | % is applied after the multiplication, exactly as in the original.
26 | ramp= 0.5+ (0: Downsample- 1);
27 | 
28 | % Fade-in over the first Downsample active samples.
29 | head= (guard+ 1): (guard+ Downsample);
30 | mod_data( head)= mod_data( head).* ramp/ Downsample;
31 | 
32 | % Fade-out over the last Downsample active samples, walked backwards so the
33 | % very last active sample receives the smallest weight.
34 | tail= (Nsamples- guard): -1: (Nsamples- guard- Downsample+ 1);
35 | mod_data( tail)= mod_data( tail).* ramp/ Downsample;


--------------------------------------------------------------------------------
/PESQ/FFTNXCorr.m:
--------------------------------------------------------------------------------
 1 | function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
 2 | % FFTNXCORR  Full cross-correlation of a deg_VAD window with a ref_VAD window.
 3 | %
 4 | %   ref_VAD, startr, nr - reference sequence, 1-based start index, length
 5 | %   deg_VAD, startd, nd - degraded sequence, 1-based start index, length
 6 | %   Y                   - convolution of the degraded window with the
 7 | %                         left-right flipped reference window
 8 | %
 9 | % Despite the name, the active code works in the time domain; per the
10 | % original note this implementation is consistent with the C version.
11 | 
12 | ref_win= ref_VAD( startr: startr+ nr- 1);
13 | deg_win= deg_VAD( startd: startd+ nd- 1);
14 | Y= conv( deg_win, fliplr( ref_win));
15 | 
16 | % Frequency-domain alternative, left disabled by the original author:
17 | % Nx= 2^ (ceil( log2( max( nr, nd))));
18 | % x1= zeros( 1, 2* Nx);
19 | % x2= zeros( 1, 2* Nx);
20 | % x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
21 | % x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
22 | % x1_fft= fft( x1, 2* Nx);
23 | % x2_fft= fft( x2, 2* Nx);
24 | % tmp1= ifft( x1_fft.* x2_fft, 2* Nx);
25 | % Ny= nr+ nd- 1;
26 | % Y= tmp1( 1: Ny);


--------------------------------------------------------------------------------
/PESQ/apply_VAD.m:
--------------------------------------------------------------------------------
  1 | function [VAD, logVAD]= apply_VAD( data, Nsamples)
  2 | % APPLY_VAD  Per-window voice-activity envelope (VAD) of data and its log form (logVAD).
  3 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH
  4 | % While the envelope is being built, entries > 0 mark speech windows and entries <= 0 mark non-speech.
  5 | Nwindows= floor( Nsamples/ Downsample);
  6 | % number of whole windows of Downsample samples (the original note calls them 4 ms windows)
  7 | 
  8 | VAD= zeros( 1, Nwindows);
  9 | for count= 1: Nwindows
 10 |     VAD( count)= sum( data( (count-1)* Downsample+ 1: ...
 11 |         count* Downsample).^ 2)/ Downsample;   
 12 | end
 13 | % VAD now holds the mean power of each window
 14 | 
 15 | LevelThresh = sum( VAD)/ Nwindows;
 16 | % LevelThresh starts as the mean of VAD and is refined iteratively below
 17 | % LevelMin is a floor for the envelope: 1e-4 of the peak window power, or 1.0 for an all-zero signal
 18 | LevelMin= max( VAD);
 19 | if( LevelMin > 0 )
 20 |     LevelMin= LevelMin* 1.0e-4;
 21 | else
 22 |     LevelMin = 1.0;
 23 | end
 24 | %fprintf( 1, 'LevelMin is %f\n', LevelMin);
 25 | % Raise every window that lies below the floor up to LevelMin.
 26 | VAD( find( VAD< LevelMin))= LevelMin;
 27 | % Refine the threshold 12 times: windows at or below it are taken as noise and it is reset just above mean + 2 std of that noise.
 28 | for iteration= 1: 12    
 29 |     LevelNoise= 0;
 30 |     len= 0;
 31 |     StDNoise= 0;    
 32 |     % mean and standard deviation (normalised by len) of the windows currently classed as noise
 33 |     VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh));
 34 |     len= length( VAD_lessthan_LevelThresh);
 35 |     LevelNoise= sum( VAD_lessthan_LevelThresh);
 36 |     if (len> 0)
 37 |         LevelNoise= LevelNoise/ len;
 38 |         StDNoise= sqrt( sum( ...
 39 |         (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len);
 40 |     end
 41 |     LevelThresh= 1.001* (LevelNoise+ 2* StDNoise);  
 42 | end
 43 | %fprintf( 1, 'LevelThresh is %f\n', LevelThresh);
 44 | % Mean level of the windows above (LevelSig) and at/below (LevelNoise) the final threshold.
 45 | LevelNoise= 0;
 46 | LevelSig= 0;
 47 | len= 0;
 48 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh));
 49 | len= length( VAD_greaterthan_LevelThresh);
 50 | LevelSig= sum( VAD_greaterthan_LevelThresh);
 51 | 
 52 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh));
 53 | LevelNoise= sum( VAD_lessorequal_LevelThresh);
 54 | % No window above the threshold: LevelThresh becomes -1 (it is replaced by LevelMin before the log step).
 55 | if (len> 0)
 56 |     LevelSig= LevelSig/ len;
 57 | else
 58 |     LevelThresh= -1;
 59 | end
 60 | %fprintf( 1, 'LevelSig is %f\n', LevelSig);
 61 | % LevelNoise is averaged over the windows not counted in len; it is set to 1 if there are none.
 62 | if (len< Nwindows)
 63 |     LevelNoise= LevelNoise/( Nwindows- len);
 64 | else
 65 |     LevelNoise= 1;
 66 | end
 67 | %fprintf( 1, 'LevelNoise is %f\n', LevelNoise);
 68 | % Flip the sign of the non-speech windows and force the first and last window to be non-speech.
 69 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh));
 70 | VAD(1)= -LevelMin;
 71 | VAD(Nwindows)= -LevelMin;
 72 | 
 73 | % Pass 1: discard speech runs of at most MINSPEECHLGTH windows by flipping them negative.
 74 | start= 0;
 75 | finish= 0;
 76 | for count= 2: Nwindows
 77 |     if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
 78 |         start = count;
 79 |     end
 80 |     if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
 81 |         finish = count;
 82 |         if( (finish - start)<= MINSPEECHLGTH )
 83 |             VAD( start: finish- 1)= -VAD( start: finish- 1);
 84 |         end
 85 |     end
 86 | end
 87 | % after this pass every remaining speech run is longer than MINSPEECHLGTH windows
 88 | % Pass 2 (only when LevelSig >= 1000*LevelNoise): discard runs whose summed level is below 3*LevelThresh per window.
 89 | if( LevelSig >= (LevelNoise* 1000) )
 90 |     for count= 2: Nwindows
 91 |         if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
 92 |             start= count;
 93 |         end
 94 |         if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
 95 |             finish = count;
 96 |             g = sum( VAD( start: finish- 1));
 97 |             if( g< 3.0* LevelThresh* (finish - start) )
 98 |                 VAD( start: finish- 1)= -VAD( start: finish- 1);
 99 |             end
100 |         end
101 |     end
102 | end
103 | % Pass 3: bridge gaps of at most JOINSPEECHLGTH windows between consecutive runs by setting them to LevelMin (speech).
104 | start = 0;
105 | finish = 0;
106 | for count= 2: Nwindows
107 |     if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
108 |         start = count;
109 |         if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) )
110 |             VAD( finish: start- 1)= LevelMin;
111 |         end        
112 |     end
113 |     if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
114 |         finish = count;
115 |     end
116 | end
117 | % If no speech onset is left, treat every window except the first and last as speech.
118 | start= 0;
119 | for count= 2: Nwindows
120 |     if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
121 |         start= count;
122 |     end
123 | end
124 | if( start== 0 )
125 |     VAD= abs(VAD);
126 |     VAD(1) = -LevelMin;
127 |     VAD(Nwindows) = -LevelMin;
128 | end
129 | % Smooth run edges: the two windows before an onset get 0.1x/0.3x of the onset level; the offset window and the next get 0.3x/0.1x of the last speech level.
130 | count = 4;
131 | while( count< (Nwindows-1) )
132 |     if( (VAD(count)> 0) && (VAD(count-2) <= 0) )
133 |         VAD(count-2)= VAD(count)* 0.1;
134 |         VAD(count-1)= VAD(count)* 0.3;
135 |         count= count+ 1;
136 |     end
137 |     if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
138 |         VAD(count)= VAD(count-1)* 0.3;
139 |         VAD(count+ 1)= VAD(count-1)* 0.1;
140 |         count= count+ 3;
141 |     end
142 |     count= count+ 1;
143 | end
144 | % Non-speech windows (still negative at this point) become exactly 0.
145 | VAD( find( VAD< 0))= 0;
146 | 
147 | % fid= fopen( 'mat_vad.txt', 'wt');
148 | % fprintf( fid, '%f\n', VAD);
149 | % fclose( fid);
150 | % Guard the log below against a non-positive threshold.
151 | if( LevelThresh<= 0 )
152 |     LevelThresh= LevelMin;
153 | end
154 | % logVAD is log(VAD/LevelThresh) where VAD exceeds the threshold and 0 elsewhere.
155 | logVAD( find( VAD<= LevelThresh))= 0;
156 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh);
157 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD( ...
158 |     VAD_greaterthan_LevelThresh)/ LevelThresh);
159 | 
160 | 
161 | 
162 | 
163 | 


--------------------------------------------------------------------------------
/PESQ/apply_filter.m:
--------------------------------------------------------------------------------
 1 | function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
 2 | % APPLY_FILTER  Shape the active part of a padded signal with a gain table.
 3 | %
 4 | %   data            - padded signal (row vector)
 5 | %   data_Nsamples   - sample count used to derive the length of the active part
 6 | %   align_filter_dB - two-column table [frequency in Hz, gain in dB]
 7 | %   align_filtered  - copy of data whose active part has been filtered
 8 | %
 9 | % The gain is interpolated from the table at every FFT bin, referenced to
10 | % the table's value at 1000 Hz, and applied as a real, symmetric
11 | % multiplier in the frequency domain.
12 | 
13 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
14 | 
15 | % Active part: n samples starting right after the leading search buffer.
16 | n= data_Nsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000);
17 | active= SEARCHBUFFER* Downsample+ 1: SEARCHBUFFER* Downsample+ n;
18 | 
19 | % FFT length: the next power of two that is greater than or equal to n.
20 | nfft= 2^ (ceil( log2( n)));
21 | half= nfft/ 2;
22 | 
23 | table_hz= align_filter_dB( :, 1);
24 | table_dB= align_filter_dB( :, 2);
25 | gain_at_1k= interp1( table_hz, table_dB, 1000);
26 | 
27 | % Gain for bins 0..nfft/2, then mirrored onto the upper bins.
28 | bin_hz= (0: half)* (Fs/ nfft);
29 | gain_dB= interp1( table_hz, table_dB, bin_hz)- gain_at_1k;
30 | gain= 10.^ (gain_dB/ 20);
31 | gain= [gain, fliplr( gain( 2: half))];
32 | 
33 | padded= zeros( 1, nfft);
34 | padded( 1: n)= data( active);
35 | shaped= ifft( fft( padded, nfft).* gain, nfft);
36 | 
37 | align_filtered= data;
38 | align_filtered( active)= shaped( 1: n);


--------------------------------------------------------------------------------
/PESQ/apply_filters.m:
--------------------------------------------------------------------------------
 1 | function mod_data= apply_filters( data, Nsamples)
 2 | % APPLY_FILTERS  Filter data with the IIR filter stored in InIIR_Hsos.
 3 | %
 4 | %   data     - signal to filter
 5 | %   Nsamples - unused; kept so existing callers keep working
 6 | %   mod_data - filtered signal
 7 | %
 8 | % InIIR_Hsos holds one row per second-order section, five values each:
 9 | % the three numerator coefficients followed by the last two denominator
10 | % coefficients. The leading denominator coefficient is fixed to 1.
11 | 
12 | global InIIR_Hsos InIIR_Nsos DATAPADDING_MSECS Fs
13 | 
14 | % Rows of [b0 b1 b2 1 a1 a2], the layout dfilt.df2sos expects.
15 | unit_a0= ones( InIIR_Nsos, 1);
16 | sosMatrix= [InIIR_Hsos( :, 1: 3), unit_a0, InIIR_Hsos( :, 4: 5)];
17 | 
18 | % Second-order-section, direct form II filter object.
19 | mod_data= filter( dfilt.df2sos( sosMatrix), data);


--------------------------------------------------------------------------------
/PESQ/crude_align.m:
--------------------------------------------------------------------------------
 1 | function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
 2 |     deg_Nsamples, Utt_id)
 3 | % CRUDE_ALIGN  Coarse delay estimate from the cross-correlation of the
 4 | % reference and degraded log-VAD envelopes.
 5 | %
 6 | %   ref_logVAD, deg_logVAD     - log-VAD envelopes, one value per window
 7 | %   ref_Nsamples, deg_Nsamples - signal lengths in samples
 8 | %   Utt_id                     - WHOLE_SIGNAL, MAXNUTTERANCES, or the index
 9 | %                                of an utterance search window
10 | %
11 | % Results are written to globals:
12 | %   Utt_id == WHOLE_SIGNAL   -> Crude_DelayEst, Crude_DelayConf
13 | %   Utt_id == MAXNUTTERANCES -> Utt_Delay(MAXNUTTERANCES)
14 | %   otherwise                -> Utt_DelayEst(Utt_id)
15 | 
16 | global Downsample Crude_DelayEst Crude_DelayConf
17 | global UttSearch_Start UttSearch_End Utt_DelayEst Utt_Delay
18 | global MAXNUTTERANCES WHOLE_SIGNAL
19 | 
20 | deg_windows= floor( deg_Nsamples/ Downsample);
21 | 
22 | if( Utt_id== WHOLE_SIGNAL )
23 |     nr= floor( ref_Nsamples/ Downsample);
24 |     nd= deg_windows;
25 |     startr= 1;
26 |     startd= 1;
27 | else
28 |     % Delay already known for this window: the per-utterance estimate for
29 |     % the MAXNUTTERANCES slot, the whole-signal estimate otherwise.
30 |     if( Utt_id== MAXNUTTERANCES )
31 |         prior_delay= Utt_DelayEst(MAXNUTTERANCES);
32 |     else
33 |         prior_delay= Crude_DelayEst;
34 |     end
35 | 
36 |     startr= UttSearch_Start(Utt_id);
37 |     startd= startr+ prior_delay/ Downsample;
38 | 
39 |     % startd is used as a 1-based index into deg_logVAD, so 0 is already
40 |     % out of range. The former test "startd< 0" let 0 through.
41 |     if( startd< 1 )
42 |         startr= 1- prior_delay/ Downsample;
43 |         startd= 1;
44 |     end
45 | 
46 |     nr= UttSearch_End(Utt_id)- startr;
47 |     nd= nr;
48 | 
49 |     % Keep the degraded window inside deg_logVAD: the last index read is
50 |     % startd+nd-1, which must not exceed deg_windows. Both branches now
51 |     % share this 1-based bound (the MAXNUTTERANCES branch lacked the +1).
52 |     if( startd+ nd> deg_windows+ 1 )
53 |         nd= deg_windows- startd+ 1;
54 |     end
55 | end
56 | 
57 | % Position of the correlation peak; nr (i.e. zero lag) when the windows are
58 | % too short to correlate or the peak is not positive.
59 | max_Y= 0.0;
60 | I_max_Y= nr;
61 | if( (nr> 1) && (nd> 1) )
62 |     Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
63 |     [max_Y, I_max_Y]= max( Y);
64 |     if( max_Y<= 0 )
65 |         max_Y= 0;
66 |         I_max_Y= nr;
67 |     end
68 | end
69 | 
70 | % Convert the lag from windows to samples and store it.
71 | lag_samples= (I_max_Y- nr)* Downsample;
72 | if( Utt_id== WHOLE_SIGNAL )
73 |     Crude_DelayEst= lag_samples;
74 |     Crude_DelayConf= 0.0;
75 | elseif( Utt_id== MAXNUTTERANCES )
76 |     Utt_Delay(MAXNUTTERANCES)= lag_samples+ Utt_DelayEst(MAXNUTTERANCES);
77 | else
78 |     Utt_DelayEst(Utt_id)= lag_samples+ Crude_DelayEst;
79 | end


--------------------------------------------------------------------------------
/PESQ/fix_power_level.m:
--------------------------------------------------------------------------------
 1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
 2 | % FIX_POWER_LEVEL  Level normalisation: scale data so that the power of its
 3 | % band-limited version equals TARGET_AVG_POWER.
 4 | %
 5 | %   data          - padded signal
 6 | %   data_Nsamples - sample count of this signal
 7 | %   maxNsamples   - larger of the reference and degraded sample counts
 8 | %   mod_data      - data multiplied by one global scale factor
 9 | %
10 | % Side effect: sets the global TARGET_AVG_POWER to 1e7.
11 | 
12 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
13 | global TARGET_AVG_POWER
14 | TARGET_AVG_POWER= 1e7;
15 | 
16 | % [Hz, dB] table: 0 dB from 350 Hz to 3250 Hz, -500 dB everywhere else.
17 | align_filter_dB= [ ...
18 |        0, -500;   50, -500;  100, -500;  125, -500;  160, -500; ...
19 |      200, -500;  250, -500;  300, -500;  350,    0;  400,    0; ...
20 |      500,    0;  600,    0;  630,    0;  800,    0; 1000,    0; ...
21 |     1250,    0; 1600,    0; 2000,    0; 2500,    0; 3000,    0; ...
22 |     3250,    0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; ...
23 |     8000, -500];
24 | 
25 | band_limited= apply_filter( data, data_Nsamples, align_filter_dB);
26 | 
27 | % Arguments handed to pow_of: first sample, last sample, normalising length.
28 | padding= DATAPADDING_MSECS* (Fs/ 1000);
29 | first_sample= SEARCHBUFFER* Downsample+ 1;
30 | last_sample= data_Nsamples- SEARCHBUFFER* Downsample+ padding;
31 | norm_length= maxNsamples- 2* SEARCHBUFFER* Downsample+ padding;
32 | power_above_300Hz= pow_of( band_limited, first_sample, last_sample, ...
33 |     norm_length);
34 | 
35 | mod_data= data* sqrt( TARGET_AVG_POWER/ power_above_300Hz);


--------------------------------------------------------------------------------
/PESQ/id_searchwindows.m:
--------------------------------------------------------------------------------
 1 | function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples)
 2 | % ID_SEARCHWINDOWS  Locate the utterance search windows in the reference VAD.
 3 | %
 4 | %   Scans ref_VAD for runs of positive values. For each run a search window,
 5 | %   widened by SEARCHBUFFER windows on both sides, is stored in the globals
 6 | %   UttSearch_Start / UttSearch_End; the number of accepted runs is written
 7 | %   to the global Nutterances. A rejected run does not advance the slot
 8 | %   index, so its entry is overwritten by the next run.
 9 | %
10 | %   ref_VAD      - reference VAD envelope, one value per window
11 | %   ref_Nsamples - reference length in samples
12 | %   deg_VAD      - unused; kept so existing callers keep working
13 | %   deg_Nsamples - degraded length in samples
14 | 
15 | global MINUTTLENGTH Downsample SEARCHBUFFER
16 | global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End
17 | 
18 | Utt_num= 1;
19 | speech_flag= 0;
20 | 
21 | VAD_length= floor( ref_Nsamples/ Downsample);
22 | del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample;
23 | del_deg_end= floor( (deg_Nsamples- Crude_DelayEst)/ Downsample)- ...
24 |     MINUTTLENGTH;
25 | 
26 | for count= 1: VAD_length
27 |     VAD_value= ref_VAD(count);
28 | 
29 |     % A run opens on the first positive window.
30 |     if( (VAD_value> 0) && (speech_flag== 0) )
31 |         speech_flag= 1;
32 |         this_start= count;
33 |         % The start is later used as a 1-based index into the VAD vectors,
34 |         % so it is clamped to 1; the former clamp to 0 produced an invalid
35 |         % MATLAB index for speech beginning inside the leading buffer.
36 |         UttSearch_Start(Utt_num)= max( count- SEARCHBUFFER, 1);
37 |     end
38 | 
39 |     % A run closes on the first zero window or near the end of the signal.
40 |     % NOTE(review): "VAD_length- 1" in the test and in the clamp below looks
41 |     % like a 0-based leftover as well; it is left unchanged because altering
42 |     % it would shift results. Confirm against the reference implementation.
43 |     if( ((VAD_value== 0) || (count== (VAD_length- 1))) && ...
44 |             (speech_flag== 1) )
45 |         speech_flag= 0;
46 |         UttSearch_End(Utt_num)= min( count+ SEARCHBUFFER, VAD_length- 1);
47 | 
48 |         % Accept the run only if it is long enough and lies within the
49 |         % del_deg_start / del_deg_end limits computed above.
50 |         if( ((count- this_start)>= MINUTTLENGTH) && ...
51 |                 (this_start< del_deg_end) && ...
52 |                 (count> del_deg_start) )
53 |             Utt_num= Utt_num+ 1;
54 |         end
55 |     end
56 | end
57 | 
58 | % Utt_num points at the next free slot, one past the last accepted run.
59 | Nutterances= Utt_num- 1;


--------------------------------------------------------------------------------
/PESQ/id_utterances.m:
--------------------------------------------------------------------------------
 1 | function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
 2 | % ID_UTTERANCES  Fill the globals Utt_Start/Utt_End (window indices) from ref_VAD and set Largest_uttsize; Nutterances and Utt_Delay are only read here.
 3 | global Largest_uttsize MINUTTLENGTH MINUTTLENGTH Crude_DelayEst
 4 | global Downsample SEARCHBUFFER Nutterances Utt_Start
 5 | global Utt_End Utt_Delay
 6 | % Utt_num is the slot currently being filled; speech_flag is 1 while inside a run of VAD > 0.
 7 | Utt_num = 1;
 8 | speech_flag = 0;
 9 | VAD_length = floor( ref_Nsamples / Downsample);
10 | % fprintf( 1, 'VAD_length is %d\n', VAD_length);
11 | % Acceptance limits in reference windows; presumably they reject runs that fall outside the degraded signal once Crude_DelayEst is applied (confirm).
12 | del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
13 | del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
14 |     - MINUTTLENGTH;
15 | % Scan ref_VAD: a run opens when VAD rises above 0 and closes at the first zero window or at the last window.
16 | for count = 1: VAD_length 
17 |     VAD_value = ref_VAD(count);
18 |     if( (VAD_value > 0.0) && (speech_flag == 0) ) 
19 |         speech_flag = 1;
20 |         this_start = count;
21 |         Utt_Start (Utt_num) = count;
22 |     end
23 |     % Close the current run.
24 |     if( ((VAD_value == 0) || (count == VAD_length)) && ...
25 |             (speech_flag == 1) ) 
26 |         speech_flag = 0;
27 |         Utt_End (Utt_num) = count;
28 |         % The slot index only advances for accepted runs, so a rejected run is overwritten by the next one.
29 |         if( ((count - this_start) >= MINUTTLENGTH) && ...
30 |                 (this_start < del_deg_end) && ... 
31 |                 (count > del_deg_start) )
32 |             Utt_num = Utt_num + 1;   
33 |         end
34 |     end
35 | end
36 | % Pin the outer edges: the first utterance starts at window SEARCHBUFFER+1, the last one ends at window VAD_length-SEARCHBUFFER+1.
37 | Utt_Start(1) = SEARCHBUFFER+ 1;
38 | Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;
39 | % Make neighbouring utterances contiguous: both boundaries move to the midpoint between the previous end and the next start.
40 | for Utt_num = 2: Nutterances
41 |     this_start = Utt_Start(Utt_num)- 1;
42 |     last_end = Utt_End(Utt_num - 1)- 1;
43 |     count = floor( (this_start + last_end) / 2);
44 |     Utt_Start(Utt_num) = count+ 1;
45 |     Utt_End(Utt_num - 1) = count+ 1;
46 | end
47 | % If the first utterance, shifted by Utt_Delay(1), would start before sample SEARCHBUFFER*Downsample, recompute its start from the delay.
48 | this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
49 | if( this_start < (SEARCHBUFFER * Downsample) )
50 |     count = SEARCHBUFFER + floor( ...
51 |         (Downsample - 1 - Utt_Delay(1)) / Downsample);
52 |     Utt_Start(1) = count+ 1;
53 | end
54 | % Likewise, if the last utterance shifted by its delay would end past deg_Nsamples-SEARCHBUFFER*Downsample+1, recompute its end.
55 | last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
56 |     Utt_Delay(Nutterances);
57 | % fprintf( 'Utt_End(%d) is %d\n', Nutterances, Utt_End(Nutterances));
58 | % fprintf( 'last_end is %d\n', last_end);
59 | % fprintf( 'Utt_Delay(%d) is %d\n', Nutterances, Utt_Delay(Nutterances));
60 | if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
61 |     count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
62 |         - SEARCHBUFFER;
63 |     Utt_End(Nutterances) = count+ 1;
64 | end
65 | % Where two delay-shifted utterances would overlap, split the overlap at its midpoint (in samples) and convert both boundaries back to windows.
66 | for Utt_num = 2: Nutterances
67 |     this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
68 |     last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
69 |     if( this_start < last_end )
70 |         count = floor( (this_start + last_end) / 2);
71 |         this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
72 |             / Downsample);
73 |         last_end = floor( (count - Utt_Delay(Utt_num - 1))...
74 |             / Downsample);
75 |         Utt_Start(Utt_num) = this_start+ 1;
76 |         Utt_End(Utt_num- 1) = last_end+ 1;
77 |     end
78 | end
79 | % Largest difference Utt_End-Utt_Start over all stored entries, in windows.
80 | Largest_uttsize= max( Utt_End- Utt_Start);    
81 |     
82 |     
83 |     
84 |     
85 |   


--------------------------------------------------------------------------------
/PESQ/input_filter.m:
--------------------------------------------------------------------------------
 1 | function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
 2 |     deg_data, deg_Nsamples)
 3 | % INPUT_FILTER  Apply DC_block followed by apply_filters to both signals.
 4 | %   ref_data, deg_data are each processed with their own sample count.
 5 | mod_ref_data= apply_filters( DC_block( ref_data, ref_Nsamples), ref_Nsamples);
 6 | mod_deg_data= apply_filters( DC_block( deg_data, deg_Nsamples), deg_Nsamples);
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/PESQ/pesq.m:
--------------------------------------------------------------------------------
  1 | function [pesq_mos]= pesq(ref_wav, deg_wav)
  2 | 
  3 | % ----------------------------------------------------------------------
  4 | %            PESQ objective speech quality measure
  5 | %
  6 | %   This function implements the PESQ measure based on the ITU standard
  7 | %   P.862 [1].
  8 | %
  9 | %   Usage:  pval = pesq('cleanFile.wav', 'enhancedFile.wav')
 10 | %
 11 | %         cleanFile.wav    - clean (reference) input file in .wav format
 12 | %         enhancedFile.wav - enhanced (degraded) file in .wav format
 13 | %         pval             - PESQ value; empty when fewer than two
 14 | %                            arguments are given
 15 | %
 16 | %    Note that the PESQ routine only supports sampling rates of 8 kHz and
 17 | %    16 kHz [1]; both files must use the same rate, otherwise an error is
 18 | %    raised.
 19 | %
 20 | %  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
 21 | %
 22 | %  References:
 23 | %   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), an
 24 | %       objective method for end-to-end speech quality assessment of
 25 | %       narrowband telephone networks and speech codecs. ITU-T
 26 | %       Recommendation P. 862
 27 | %
 28 | %   Authors: Yi Hu and Philipos C. Loizou
 29 | %
 30 | % Copyright (c) 2006 by Philipos C. Loizou
 31 | % $Revision: 0.0 $  $Date: 10/09/2006 $
 32 | % ----------------------------------------------------------------------
 33 | if nargin<2
 34 |     fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
 35 |     % Assign the output so that "x = pesq()" returns normally instead of
 36 |     % failing with an unassigned-output error.
 37 |     pesq_mos= [];
 38 |     return;
 39 | end
 40 | 
 41 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
 42 | global Align_Nfft Window
 43 | 
 44 | [ref_data, sampling_rate]= audioread( ref_wav);
 45 | if sampling_rate~=8000 && sampling_rate~=16000
 46 |     error('Sampling frequency needs to be either 8000 or 16000 Hz');
 47 | end
 48 | 
 49 | % setup_global is driven by the reference rate only, so a degraded file
 50 | % recorded at another rate would be scored on the wrong time axis.
 51 | [deg_data, deg_sampling_rate]= audioread( deg_wav);
 52 | if deg_sampling_rate~= sampling_rate
 53 |     error('Reference and degraded files must share one sampling frequency (%d Hz vs %d Hz)', ...
 54 |         sampling_rate, deg_sampling_rate);
 55 | end
 56 | 
 57 | setup_global( sampling_rate);
 58 | 
 59 | % Hanning window of length Align_Nfft, built from the explicit formula with
 60 | % a literal 2*pi constant.
 61 | TWOPI= 6.28318530717959;
 62 | count= 0: Align_Nfft- 1;
 63 | Window= 0.5* (1.0- cos( (TWOPI* count)/ Align_Nfft));
 64 | 
 65 | % Transpose to a row, scale by 32768 and pad: SEARCHBUFFER*Downsample zeros
 66 | % in front; DATAPADDING_MSECS worth of samples plus the same buffer behind.
 67 | % *_Nsamples counts the signal plus the two search buffers only.
 68 | ref_data= ref_data';
 69 | ref_data= ref_data* 32768;
 70 | ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
 71 | ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
 72 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
 73 | 
 74 | deg_data= deg_data';
 75 | deg_data= deg_data* 32768;
 76 | deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
 77 | deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
 78 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
 79 | 
 80 | maxNsamples= max( ref_Nsamples, deg_Nsamples);
 81 | 
 82 | ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
 83 | deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
 84 | 
 85 | % [Hz, dB] table handed to apply_filter for both signals.
 86 | standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
 87 |     250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
 88 |     1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
 89 |     3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
 90 | 
 91 | ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
 92 | deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
 93 | 
 94 | % Keep the IRS-filtered signals for the psychoacoustic model; the copies
 95 | % processed below are used for time alignment only.
 96 | model_ref= ref_data;
 97 | model_deg= deg_data;
 98 | 
 99 | [ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
100 |     deg_Nsamples);
101 | 
102 | [ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
103 | [deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
104 | 
105 | crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
106 |     WHOLE_SIGNAL);
107 | 
108 | utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
109 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
110 | 
111 | ref_data= model_ref;
112 | deg_data= model_deg;
113 | 
114 | % Make ref_data and deg_data equal length: writing a zero at index newlen
115 | % grows the shorter vector and zero-fills the gap.
116 | if (ref_Nsamples< deg_Nsamples)
117 |     newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
118 |     ref_data( newlen)= 0;
119 | elseif (ref_Nsamples> deg_Nsamples)
120 |     newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
121 |     deg_data( newlen)= 0;
122 | end
123 | 
124 | pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
125 |     deg_Nsamples );


--------------------------------------------------------------------------------
/PESQ/pesq_psychoacoustic_model.m:
--------------------------------------------------------------------------------
  1 | function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
  2 |     deg_Nsamples )
  3 | 
  4 | global CALIBRATE Nfmax Nb Sl Sp
  5 | global nr_of_hz_bands_per_bark_band centre_of_band_bark
  6 | global width_of_band_hz centre_of_band_hz width_of_band_bark
  7 | global pow_dens_correction_factor abs_thresh_power
  8 | global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances
  9 | global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE 
 10 | global Fs Plot_Frame
 11 | 
 12 | % Plot_Frame= 75; % this is the frame whose spectrum will be plotted
 13 | 
 14 | FALSE= 0;
 15 | TRUE= 1;
 16 | NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20;
 17 | 
 18 | maxNsamples = max (ref_Nsamples, deg_Nsamples);
 19 | Nf = Downsample * 8;
 20 | MAX_NUMBER_OF_BAD_INTERVALS = 1000;
 21 | 
 22 | start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 23 | stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 24 | start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 25 | stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 26 | number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 27 | delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS);
 28 | number_of_bad_intervals= 0;
 29 | there_is_a_bad_frame= FALSE;
 30 | 
 31 | Whanning= hann( Nf, 'periodic');
 32 | Whanning= Whanning';
 33 | 
 34 | D_POW_F = 2;
 35 | D_POW_S = 6;
 36 | D_POW_T = 2;
 37 | A_POW_F = 1;
 38 | A_POW_S = 6;
 39 | A_POW_T = 2;
 40 | D_WEIGHT= 0.1;
 41 | A_WEIGHT= 0.0309;
 42 | 
 43 | CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500;
 44 | samples_to_skip_at_start = 0;
 45 | sum_of_5_samples= 0;
 46 | while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
 47 |         && (samples_to_skip_at_start < maxNsamples / 2))
 48 |     sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start...
 49 |         + SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start...
 50 |         + SEARCHBUFFER * Downsample + 5)));
 51 | 
 52 |     if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
 53 |         samples_to_skip_at_start = samples_to_skip_at_start+ 1;
 54 |     end
 55 | end
 56 | % fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start);
 57 | 
 58 | samples_to_skip_at_end = 0;
 59 | sum_of_5_samples= 0;
 60 | while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ...
 61 |         && (samples_to_skip_at_end < maxNsamples / 2))
 62 |     sum_of_5_samples= sum( abs( ref_data( maxNsamples - ...
 63 |         SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
 64 |         - samples_to_skip_at_end - 4: maxNsamples - ...
 65 |         SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ...
 66 |         - samples_to_skip_at_end)));
 67 |     if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES)
 68 |         samples_to_skip_at_end = samples_to_skip_at_end+ 1;
 69 |     end
 70 | end
 71 | % fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end);
 72 | 
 73 | start_frame = floor( samples_to_skip_at_start/ (Nf/ 2));
 74 | stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ...
 75 |     + DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ...
 76 |     / (Nf/ 2))- 1;
 77 | % number of frames in speech data plus DATAPADDING_MSECS
 78 | % fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame);
 79 | 
 80 | D_disturbance= zeros( stop_frame+ 1, Nb);
 81 | DA_disturbance= zeros( stop_frame+ 1, Nb);
 82 | 
 83 | power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ...
 84 |     maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
 85 |     maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
 86 | power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ...
 87 |     maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),...
 88 |     maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
 89 | % fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg);
 90 | 
 91 | hz_spectrum_ref             = zeros( 1, Nf/ 2);
 92 | hz_spectrum_deg             = zeros( 1, Nf/ 2);
 93 | frame_is_bad                = zeros( 1, stop_frame + 1);
 94 | smeared_frame_is_bad        = zeros( 1, stop_frame + 1);
 95 | silent                      = zeros( 1, stop_frame + 1);
 96 | 
 97 | pitch_pow_dens_ref          = zeros( stop_frame + 1, Nb);
 98 | pitch_pow_dens_deg          = zeros( stop_frame + 1, Nb);
 99 | 
100 | frame_was_skipped           = zeros( 1, stop_frame + 1);
101 | frame_disturbance           = zeros( 1, stop_frame + 1);
102 | frame_disturbance_asym_add  = zeros( 1, stop_frame + 1);
103 | 
104 | avg_pitch_pow_dens_ref      = zeros( 1, Nb);
105 | avg_pitch_pow_dens_deg      = zeros( 1, Nb);
106 | loudness_dens_ref           = zeros( 1, Nb);
107 | loudness_dens_deg           = zeros( 1, Nb);
108 | deadzone                    = zeros( 1, Nb);
109 | disturbance_dens            = zeros( 1, Nb);
110 | disturbance_dens_asym_add   = zeros( 1, Nb);
111 | 
112 | time_weight                 = zeros( 1, stop_frame + 1);
113 | total_power_ref             = zeros( 1, stop_frame + 1);
114 | 
115 | % fid= fopen( 'tmp_mat.txt', 'wt');
116 | 
117 | for frame = 0: stop_frame
118 |     start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2);
119 |     hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ...
120 |         start_sample_ref);
121 | 
122 |     utt = Nutterances;
123 |     while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ...
124 |             > start_sample_ref))
125 |         utt= utt - 1;
126 |     end
127 | 
128 |     if (utt >= 1)
129 |         delay = Utt_Delay(utt);
130 |     else
131 |         delay = Utt_Delay(1);
132 |     end
133 | 
134 |     start_sample_deg = start_sample_ref + delay;
135 | 
136 |     if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ...
137 |             maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000)))
138 |         hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ...
139 |             start_sample_deg);
140 |     else
141 |         hz_spectrum_deg( 1: Nf/ 2)= 0;
142 |     end
143 | 
144 |     pitch_pow_dens_ref( frame+ 1, :)= freq_warping (...
145 |         hz_spectrum_ref, Nb, frame);
146 |     %peak = maximum_of (pitch_pow_dens_ref, 0, Nb);
147 |     pitch_pow_dens_deg( frame+ 1, :)= freq_warping (...
148 |         hz_spectrum_deg, Nb, frame);
149 | 
150 |     total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2);
151 |     total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2);
152 |     silent(frame+ 1) = (total_audible_pow_ref < 1E7);
153 |     
154 | 
155 | end
156 | % fclose( fid);
157 | 
158 | avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ...
159 |     silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ...
160 |     Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1);
161 | avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ...
162 |     silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ...
163 |     Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1);
164 | 
165 | % fid= fopen( 'tmp_mat.txt', 'wt');
166 | % fprintf( fid, '%f\n', avg_pitch_pow_dens_deg);
167 | % fclose( fid);
168 | 
169 | if (CALIBRATE== 0)
170 |     pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ...
171 |         pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
172 |         avg_pitch_pow_dens_deg, 1000);
173 |     if (Plot_Frame>= 0) % plot pitch_pow_dens_ref
174 |         figure;
175 |         subplot( 1, 2, 1);
176 |         plot( centre_of_band_hz, 10* log10( eps+ ...
177 |             pitch_pow_dens_ref( Plot_Frame+ 1, :)));
178 |         axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');   
179 |         title( 'reference signal bark spectrum with frequency compensation');
180 |         subplot( 1, 2, 2);
181 |         plot( centre_of_band_hz, 10* log10( eps+ ...
182 |             pitch_pow_dens_deg( Plot_Frame+ 1, :)));
183 |         axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
184 |         title( 'degraded signal bark spectrum');
185 |     end
186 |         
187 | end
188 | % tmp1= pitch_pow_dens_ref';
189 | 
190 | 
191 | MAX_SCALE = 5.0;
192 | MIN_SCALE = 3e-4;
193 | oldScale = 1;
194 | THRESHOLD_BAD_FRAMES = 30;
195 | for frame = 0: stop_frame
196 |     
197 |     total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1);
198 |     total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1);        
199 |     total_power_ref (1+ frame) = total_audible_pow_ref;
200 |     
201 |     scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3);    
202 |     if (frame > 0) 
203 |         scale = 0.2 * oldScale + 0.8 * scale;
204 |     end
205 |     oldScale = scale;
206 |     
207 |     if (scale > MAX_SCALE) 
208 |         scale = MAX_SCALE;
209 |     elseif (scale < MIN_SCALE) 
210 |         scale = MIN_SCALE;            
211 |     end
212 | 
213 |     pitch_pow_dens_deg( 1+ frame, :) = ...
214 |         pitch_pow_dens_deg( 1+ frame, :) * scale;
215 |     
216 |     if (frame== Plot_Frame)
217 |         figure;
218 |         subplot( 1, 2, 1);
219 |         plot( centre_of_band_hz, 10* log10( eps+ ...
220 |             pitch_pow_dens_ref( Plot_Frame+ 1, :)));
221 |         axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');        
222 |         subplot( 1, 2, 2);
223 |         plot( centre_of_band_hz, 10* log10( eps+ ...
224 |             pitch_pow_dens_deg( Plot_Frame+ 1, :)));
225 |         axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db');
226 |     end
227 | 
228 |     loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref);
229 |     loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg);         
230 |     disturbance_dens = loudness_dens_deg - loudness_dens_ref;
231 |     
232 |     if (frame== Plot_Frame)
233 |         figure;
234 |         subplot( 1, 2, 1);
235 |         plot( centre_of_band_hz, 10* log10( eps+ ...
236 |             loudness_dens_ref));
237 |         axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db'); 
238 |         title( 'reference signal loudness density');
239 |         subplot( 1, 2, 2);
240 |         plot( centre_of_band_hz, 10* log10( eps+ ...
241 |             loudness_dens_deg));
242 |         axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db');
243 |         title( 'degraded signal loudness density');        
244 |     end
245 |     
246 |     for band =1: Nb
247 |         deadzone (band) = 0.25* min (loudness_dens_deg (band), ...
248 |             loudness_dens_ref (band));    
249 |     end
250 | 
251 |     for band = 1: Nb
252 |         d = disturbance_dens (band);
253 |         m = deadzone (band);
254 |         
255 |         if (d > m) 
256 |             disturbance_dens (band) = disturbance_dens (band)- m;
257 | %             disturbance_dens (band) = d- m;
258 |         else
259 |             if (d < -m) 
260 |                 disturbance_dens (band) = disturbance_dens (band)+ m;
261 | %                 disturbance_dens (band) = d+ m;
262 |             else
263 |                 disturbance_dens (band) = 0;
264 |             end
265 |         end
266 |     end
267 |     
268 |     if (frame== Plot_Frame)
269 |         figure;
270 |         subplot( 1, 2, 1);
271 |         plot( centre_of_band_hz, disturbance_dens);
272 |         axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');                
273 |         title( 'disturbance');        
274 |     end
275 |     D_disturbance( frame+ 1, :)= disturbance_dens;
276 | 
277 |     frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F);    
278 |     if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES) 
279 |         there_is_a_bad_frame = TRUE;
280 |     end
281 |     
282 |     disturbance_dens= multiply_with_asymmetry_factor (...
283 |         disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg);
284 |     
285 |     if (frame== Plot_Frame)        
286 |         subplot( 1, 2, 2);
287 |         plot( centre_of_band_hz, disturbance_dens);
288 |         axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db');
289 |         title( 'disturbance after asymmetry processing');
290 |     end
291 |     DA_disturbance( frame+ 1, :)= disturbance_dens;
292 | 
293 | 
294 |     frame_disturbance_asym_add (1+ frame) = ...
295 |         pseudo_Lp (disturbance_dens, A_POW_F);    
296 | end
297 | % fid= fopen( 'tmp_mat.txt', 'wt');
298 | % fprintf( fid, '%f\n', frame_disturbance);
299 | % fclose( fid);
300 | 
301 | frame_was_skipped (1: 1+ stop_frame) = FALSE;
302 | 
303 | for utt = 2: Nutterances
304 |     frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ...
305 |         Utt_Delay(utt))/ (Nf/ 2));
306 |     j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ...
307 |         Utt_Delay(utt-1)))/(Nf/ 2));
308 |     delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1);
309 |     if (frame1 > j) 
310 |         frame1 = j;    
311 |     elseif (frame1 < 0) 
312 |         frame1 = 0;
313 |     end
314 | %     fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ...
315 | %         j, delay_jump);
316 | 
317 |     if (delay_jump < -(Nf/ 2)) 
318 |         frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ...
319 |             + max (0, abs (delay_jump)))/ (Nf/ 2)) + 1; 
320 |         
321 |         for frame = frame1: frame2
322 |             if (frame < stop_frame) 
323 |                 frame_was_skipped (1+ frame) = TRUE;
324 |                 frame_disturbance (1+ frame) = 0;
325 |                 frame_disturbance_asym_add (1+ frame) = 0;
326 |             end
327 |         end
328 |     end
329 | end
330 | 
331 | nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples;
332 | tweaked_deg = zeros( 1, nn);
333 | % fprintf( 'nn is %d\n', nn);
334 | 
335 | for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample
336 |     utt = Nutterances;
337 |     
338 |     while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i)) 
339 |         utt = utt- 1;
340 |     end
341 |     if (utt >= 1) 
342 |         delay = Utt_Delay (utt);        
343 |     else
344 |         delay = Utt_Delay (1);
345 |     end
346 | 
347 |     j = i + delay;
348 |     if (j < SEARCHBUFFER * Downsample+ 1) 
349 |         j = SEARCHBUFFER * Downsample+ 1;
350 |     end
351 |     if (j > nn - SEARCHBUFFER * Downsample) 
352 |         j = nn - SEARCHBUFFER * Downsample;
353 |     end
354 |     tweaked_deg (i) = deg_data (j);
355 | end
356 | 
357 | if (there_is_a_bad_frame) 
358 |     
359 |     for frame = 0: stop_frame
360 |         frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)...
361 |             > THRESHOLD_BAD_FRAMES);       
362 |         smeared_frame_is_bad (1+ frame) = FALSE;
363 |     end
364 |     frame_is_bad (1) = FALSE;
365 |     SMEAR_RANGE = 2;
366 |     
367 |     for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE
368 |         max_itself_and_left = frame_is_bad (1+ frame);
369 |         max_itself_and_right = frame_is_bad (1+ frame);
370 |         
371 |         for i = -SMEAR_RANGE: 0
372 |             if (max_itself_and_left < frame_is_bad (1+ frame+ i)) 
373 |                 max_itself_and_left = frame_is_bad (1+ frame+ i);
374 |             end
375 |         end
376 | 
377 |         for i = 0: SMEAR_RANGE
378 |             if (max_itself_and_right < frame_is_bad (1+ frame + i)) 
379 |                 max_itself_and_right = frame_is_bad (1+ frame + i);
380 |             end
381 |         end
382 | 
383 |         mini = max_itself_and_left;
384 |         if (mini > max_itself_and_right) 
385 |             mini = max_itself_and_right;
386 |         end
387 | 
388 |         smeared_frame_is_bad (1+ frame) = mini;
389 |     end
390 |     
391 |     MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5;
392 |     number_of_bad_intervals = 0;    
393 |     frame = 0; 
394 |     while (frame <= stop_frame) 
395 |         while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame)))
396 |             frame= frame+ 1;
397 |         end
398 | 
399 |         if (frame <= stop_frame) 
400 |             start_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
401 |                 1+ frame;
402 |             
403 |             while ((frame <= stop_frame) && (...
404 |                     smeared_frame_is_bad (1+ frame))) 
405 |                 frame= frame+ 1; 
406 |             end
407 | 
408 |             if (frame <= stop_frame)
409 |                 stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ...
410 |                     1+ frame; 
411 |                 if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ...
412 |                         start_frame_of_bad_interval(1+ number_of_bad_intervals)...
413 |                         >= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL) 
414 |                     number_of_bad_intervals= number_of_bad_intervals+ 1;
415 |                 end
416 |             end
417 |         end
418 |     end
419 | 
420 |     for bad_interval = 0: number_of_bad_intervals - 1
421 |         start_sample_of_bad_interval(1+ bad_interval) = ...
422 |             (start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
423 |             + SEARCHBUFFER * Downsample+ 1;
424 |         stop_sample_of_bad_interval(1+ bad_interval) = ...
425 |             (stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ...
426 |             + Nf + SEARCHBUFFER* Downsample;
427 |         if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1) 
428 |             stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1; 
429 |         end
430 | 
431 |         number_of_samples_in_bad_interval(1+ bad_interval) = ...
432 |             stop_sample_of_bad_interval(1+ bad_interval) - ...
433 |             start_sample_of_bad_interval(1+ bad_interval)+ 1;
434 |     end        
435 | %     fprintf( 'number of bad intervals %d\n', number_of_bad_intervals);
436 | %     fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ...
437 | %         number_of_samples_in_bad_interval(2));
438 | %     fprintf( '%d %d\n', start_sample_of_bad_interval(1), ...
439 | %         start_sample_of_bad_interval(2));
440 | 
441 |     SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4;    
442 |     search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf;
443 |     
444 |     for bad_interval= 0: number_of_bad_intervals- 1
445 |         ref = zeros (1, 2 * search_range_in_samples + ...
446 |             number_of_samples_in_bad_interval (1+ bad_interval));
447 |         deg = zeros (1, 2 * search_range_in_samples + ...
448 |             number_of_samples_in_bad_interval (1+ bad_interval));
449 |         
450 |         ref(1: search_range_in_samples) = 0;
451 | 
452 |         ref (search_range_in_samples+ 1: search_range_in_samples+ ...
453 |                 number_of_samples_in_bad_interval (1+ bad_interval)) = ...
454 |                 ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ...
455 |                 start_sample_of_bad_interval( 1+ bad_interval) + ...
456 |                 number_of_samples_in_bad_interval (1+ bad_interval));
457 |         
458 |         ref (search_range_in_samples + ...
459 |                 number_of_samples_in_bad_interval (1+ bad_interval) + 1: ...
460 |                 search_range_in_samples + ...
461 |                 number_of_samples_in_bad_interval (1+ bad_interval) + ...
462 |                 search_range_in_samples) = 0;
463 |         
464 |         for i = 0: 2 * search_range_in_samples + ...
465 |                 number_of_samples_in_bad_interval (1+ bad_interval) - 1
466 |             j = start_sample_of_bad_interval (1+ bad_interval) - ...
467 |                 search_range_in_samples + i;
468 |             nn = maxNsamples - SEARCHBUFFER * Downsample + ...
469 |                 DATAPADDING_MSECS  * (Fs / 1000);
470 |             if (j <= SEARCHBUFFER * Downsample) 
471 |                 j = SEARCHBUFFER * Downsample+ 1;
472 |             end
473 |             if (j > nn) 
474 |                 j = nn;
475 |             end
476 |             deg (1+ i) = tweaked_deg (j);
477 |         end
478 | 
479 |         [delay_in_samples, best_correlation]= compute_delay ...
480 |             (1, 2 * search_range_in_samples + ...
481 |             number_of_samples_in_bad_interval (1+ bad_interval), ...
482 |             search_range_in_samples, ref, deg);
483 |         delay_in_samples_in_bad_interval (1+ bad_interval) =  ...
484 |             delay_in_samples;
485 | %         fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ...
486 | %             delay_in_samples, best_correlation);
487 | %         
488 |         if (best_correlation < 0.5) 
489 |             delay_in_samples_in_bad_interval  (1+ bad_interval) = 0;
490 |         end
491 |     end
492 | 
493 |     if (number_of_bad_intervals > 0) 
494 |         doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ...
495 |             DATAPADDING_MSECS  * (Fs / 1000));
496 |         for bad_interval= 0: number_of_bad_intervals- 1
497 |             delay = delay_in_samples_in_bad_interval (1+ bad_interval);
498 |         
499 |             for i = start_sample_of_bad_interval (1+ bad_interval): ...
500 |                     stop_sample_of_bad_interval (1+ bad_interval)
501 |                 j = i + delay;
502 |                 if (j < 1) 
503 |                     j = 1;
504 |                 end
505 |                 if (j > maxNsamples) 
506 |                     j = maxNsamples;
507 |                 end
508 |                 h = tweaked_deg (j);
509 |                 doubly_tweaked_deg (i) = h;
510 |             end
511 |         end
512 | 
513 |         untweaked_deg = deg_data;
514 |         deg_data = doubly_tweaked_deg;
515 |         
516 |         for bad_interval= 0: number_of_bad_intervals- 1
517 |             for frame = start_frame_of_bad_interval (1+ bad_interval): ...
518 |                     stop_frame_of_bad_interval (1+ bad_interval)- 1
519 |                 frame= frame- 1;
520 |                 start_sample_ref = SEARCHBUFFER * Downsample + ...
521 |                     frame * Nf / 2+ 1;
522 |                 start_sample_deg = start_sample_ref;
523 |                 hz_spectrum_deg= short_term_fft (Nf, deg_data, ...
524 |                     Whanning, start_sample_deg);    
525 |                 pitch_pow_dens_deg( 1+ frame, :)= freq_warping (...
526 |                     hz_spectrum_deg, Nb, frame);
527 |             end
528 | 
529 |             oldScale = 1;
530 |             for frame = start_frame_of_bad_interval (1+ bad_interval): ...
531 |                     stop_frame_of_bad_interval (1+ bad_interval)- 1
532 |                 frame= frame- 1;    
533 |                 % see implementation for detail why 1 needed to be
534 |                 % subtracted
535 |                 total_audible_pow_ref = total_audible (frame, ...
536 |                     pitch_pow_dens_ref, 1);
537 |                 total_audible_pow_deg = total_audible (frame, ...
538 |                     pitch_pow_dens_deg, 1);        
539 |                 scale = (total_audible_pow_ref + 5e3) / ...
540 |                     (total_audible_pow_deg + 5e3);
541 |                 if (frame > 0) 
542 |                     scale = 0.2 * oldScale + 0.8*scale;
543 |                 end
544 |                 oldScale = scale;
545 |                 if (scale > MAX_SCALE) 
546 |                     scale = MAX_SCALE;
547 |                 end
548 |                 if (scale < MIN_SCALE) 
549 |                     scale = MIN_SCALE;   
550 |                 end
551 | 
552 |                 pitch_pow_dens_deg (1+ frame, :) = ...
553 |                     pitch_pow_dens_deg (1+ frame, :)* scale;
554 |                 loudness_dens_ref= intensity_warping_of (frame, ...
555 |                     pitch_pow_dens_ref); 
556 |                 loudness_dens_deg= intensity_warping_of (frame, ...
557 |                     pitch_pow_dens_deg); 
558 |                 disturbance_dens = loudness_dens_deg - loudness_dens_ref;
559 |                 
560 |                 for band = 1: Nb
561 |                     deadzone(band) = min (loudness_dens_deg(band), ...
562 |                         loudness_dens_ref(band));    
563 |                     deadzone(band) = deadzone(band)* 0.25;
564 |                 end
565 | 
566 |                 for band = 1: Nb
567 |                     d = disturbance_dens (band);
568 |                     m = deadzone (band);
569 |                     
570 |                     if (d > m) 
571 |                         disturbance_dens (band) = ...
572 |                             disturbance_dens (band)- m;
573 |                     else
574 |                         if (d < -m) 
575 |                             disturbance_dens (band) = ...
576 |                                 disturbance_dens (band)+ m;
577 |                         else
578 |                             disturbance_dens (band) = 0;
579 |                         end
580 |                     end
581 |                 end
582 | 
583 |                 frame_disturbance( 1+ frame) = min (...
584 |                     frame_disturbance( 1+ frame), pseudo_Lp(...
585 |                     disturbance_dens, D_POW_F));
586 |                 disturbance_dens= multiply_with_asymmetry_factor ...
587 |                     (disturbance_dens, frame, pitch_pow_dens_ref, ...
588 |                     pitch_pow_dens_deg);
589 |                 frame_disturbance_asym_add(1+ frame) = min (...
590 |                     frame_disturbance_asym_add(1+ frame), ...
591 |                     pseudo_Lp (disturbance_dens, A_POW_F));    
592 |             end
593 |         end
594 |         deg_data = untweaked_deg;
595 |     end
596 | end     
597 | 
598 | for frame = 0: stop_frame
599 |     h = 1;
600 |     if (stop_frame + 1 > 1000) 
601 |         n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)...
602 |             / (Nf / 2)) - 1;
603 |         timeWeightFactor = (n - 1000) / 5500;
604 |         if (timeWeightFactor > 0.5) 
605 |             timeWeightFactor = 0.5;
606 |         end
607 |         h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n;
608 |     end
609 | 
610 |     time_weight (1 +frame) = h;
611 | end
612 | 
613 | % fid= fopen( 'tmp_mat1.txt', 'at');
614 | % fprintf( '\n');
615 | for frame = 0: stop_frame
616 |     h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04; 
617 | %     if (frame== 118)
618 | %         fprintf( '%f\n', h);    
619 | %         fprintf( '%f\n', frame_disturbance( 1+ frame));
620 | %     end
621 |     frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h;
622 |     
623 | %     if (frame== 118)
624 | %         fprintf( '%f\n', frame_disturbance( 1+ frame));
625 | %     end
626 | %         
627 |     frame_disturbance_asym_add( 1+ frame) = ...
628 |         frame_disturbance_asym_add( 1+ frame)/ h;
629 |     if (frame_disturbance( 1+ frame) > 45) 
630 |         frame_disturbance( 1+ frame) = 45;  
631 |     end
632 |     if (frame_disturbance_asym_add( 1+ frame)> 45) 
633 |         frame_disturbance_asym_add( 1+ frame) = 45;
634 |     end
635 | end
636 | % fclose ( fid);
637 | 
638 | d_indicator = Lpq_weight (start_frame, stop_frame, ...
639 |     D_POW_S, D_POW_T, frame_disturbance, time_weight);
640 | a_indicator = Lpq_weight (start_frame, stop_frame, ...
641 |     A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight);       
642 | 
643 | pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator; 
644 | 
645 | if (Plot_Frame> 0)
646 |     figure;
647 |     subplot( 1, 2, 1);
648 |     mesh( 0: stop_frame, centre_of_band_hz, D_disturbance');
649 |     title( 'disturbance');
650 |     subplot( 1, 2, 2);
651 |     mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance');
652 |     title( 'disturbance after asymmetry processing');
653 | end
654 | 
655 | % fid= fopen( 'tmp_mat.txt', 'wt');
656 | % fprintf( fid, 'time_weight\n');
657 | % fprintf( fid, '%f\n', time_weight);
658 | % fprintf( fid, 'frame_disturbance:\n');
659 | % fprintf( fid, '%f\n', frame_disturbance);
660 | % fprintf( fid, 'frame_disturbance_asym_add\n');
661 | % fprintf( fid, '%f\n', frame_disturbance_asym_add);
662 | % fclose( fid);
663 |     
function result_time= Lpq_weight(start_frame, stop_frame, ...
        power_syllable, power_time, frame_disturbance, time_weight)
% LPQ_WEIGHT  Two-stage Lp aggregation of per-frame disturbance values.
%
%   Frames start_frame..stop_frame (0-based; element k is read from
%   frame_disturbance(1+ k)) are covered by windows that are
%   NUMBER_OF_PSQM_FRAMES_PER_SYLLABE frames long and whose start advances
%   by half that length.
%
%   Stage 1: inside each window the disturbances are raised to
%   power_syllable, summed, divided by the full window length and the
%   power_syllable-th root is taken. Window slots past stop_frame add
%   nothing to the sum but are still counted in the divisor.
%
%   Stage 2: every window value is multiplied by
%   time_weight(1+ window_start- start_frame), raised to power_time and
%   accumulated; the total is divided by the accumulated weights (each
%   raised to power_time) and the power_time-th root is returned.

global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE

frames_per_window = NUMBER_OF_PSQM_FRAMES_PER_SYLLABE;
hop = frames_per_window/ 2;

weighted_total = 0;
weight_norm = 0;

for window_start = start_frame: hop: stop_frame
    window_power_sum = 0;
    slots_in_window = 0;

    for offset = 0: frames_per_window- 1
        frame = window_start+ offset;
        slots_in_window = slots_in_window+ 1;
        if (frame > stop_frame)
            % slot lies beyond the last frame: counted, but adds nothing
            continue;
        end
        window_power_sum = window_power_sum+ ...
            frame_disturbance( 1+ frame)^ power_syllable;
    end

    window_value = (window_power_sum/ slots_in_window)^ (1/ power_syllable);
    w = time_weight( 1+ window_start- start_frame);

    weighted_total = weighted_total+ (w* window_value)^ power_time;
    weight_norm = weight_norm+ w^ power_time;
end

result_time = (weighted_total/ weight_norm)^ (1/ power_time);
709 | 
710 |     
function [best_delay, max_correlation] = compute_delay (...
    start_sample, stop_sample, search_range, ...
    time_series1, time_series2) 
% COMPUTE_DELAY  Find the lag that best aligns two series by FFT correlation.
%
%   The magnitudes (abs) of time_series1 and time_series2 over
%   start_sample..stop_sample are zero-padded to a power of two of at
%   least twice the segment length and circularly cross-correlated via
%   FFT. Lags -search_range .. search_range-1 are scanned for the largest
%   correlation magnitude, normalized by sqrt(power1* power2).
%
%   Outputs:
%     best_delay      - lag of the strongest peak minus one sample, or 0
%                       when either segment has (almost) no power.
%     max_correlation - normalized correlation at that peak, or 0 when
%                       either segment has (almost) no power.

n = stop_sample - start_sample+ 1;   
power_of_2 = 2^ (ceil( log2( 2 * n)));

power1 = pow_of (time_series1, start_sample, stop_sample, n)* ...
    n/ power_of_2;
power2 = pow_of (time_series2, start_sample, stop_sample, n)* ...
    n/ power_of_2;
normalization = sqrt (power1 * power2);

best_delay = 0;
max_correlation = 0;

% A (near-)silent segment cannot be aligned: stop here instead of
% dividing by a vanishing normalization below, which would otherwise
% produce arbitrarily large "correlations" from numerical noise.
if ((power1 <= 1e-6) || (power2 <= 1e-6)) 
    return;
end

x1 = zeros( 1, power_of_2);
x2 = zeros( 1, power_of_2);

x1( 1: n)= abs( time_series1( start_sample: ...
    stop_sample));
x2( 1: n)= abs( time_series2( start_sample: ...
    stop_sample));

x1_fft= fft( x1, power_of_2)/ power_of_2;
x2_fft= fft( x2, power_of_2);
x1_fft_conj= conj( x1_fft);
y= ifft( x1_fft_conj.* x2_fft, power_of_2);

% negative lags are stored at the end of the circular correlation
for i = -search_range: -1
    h = abs (y (1+ i + power_of_2)) / normalization;
    if (h > max_correlation) 
        max_correlation = h;
        best_delay= i;
    end
end
% non-negative lags are stored at the beginning
for i = 0: search_range- 1
    h = abs (y (1+i)) / normalization;
    if (h > max_correlation) 
        max_correlation = h;
        best_delay= i;
    end
end
% NOTE(review): one-sample offset kept from the original code; confirm
% against the reference implementation that it is intended.
best_delay= best_delay- 1;
763 |     
function mod_disturbance_dens= multiply_with_asymmetry_factor (...
    disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg) 
% MULTIPLY_WITH_ASYMMETRY_FACTOR  Scale a disturbance vector band by band.
%
%   For every band i in 1..Nb (global) the factor is
%       h = ((deg(1+ frame, i)+ 50)/ (ref(1+ frame, i)+ 50))^ 1.2
%   which is capped at 12 and replaced by 0 when it is below 3. The
%   result is disturbance_dens(i)* h.
%
%   frame is 0-based: row 1+ frame of both pitch_pow_dens matrices is read.
%   Returns a 1-by-Nb row vector (empty when Nb is 0).

global Nb

% preallocate so the output is not grown inside the loop and is defined
% even when there are no bands
mod_disturbance_dens = zeros( 1, Nb);
row = 1+ frame;

for i = 1: Nb
    ratio = (pitch_pow_dens_deg( row, i) + 50)...
        / (pitch_pow_dens_ref( row, i) + 50);
    h = ratio^ 1.2;    
    if (h > 12) 
        h = 12;
    elseif (h < 3) 
        h = 0.0;
    end
    mod_disturbance_dens (i) = disturbance_dens (i) * h;
end
779 | 
780 | 
function loudness_dens = intensity_warping_of (...
    frame, pitch_pow_dens)
% INTENSITY_WARPING_OF  Map one frame of band powers to loudness densities.
%
%   frame is 0-based: row 1+ frame of pitch_pow_dens is read. For every
%   band in 1..Nb the exponent is ZWICKER_POWER* h^ 0.15, where
%   h = 6/ (centre_of_band_bark(band)+ 2) for bands centred below 4
%   (capped at 2) and h = 1 otherwise. A band whose power exceeds
%   abs_thresh_power(band) gets
%       (threshold/ 0.5)^ e* ((0.5+ 0.5* power/ threshold)^ e- 1)
%   and any other band gets 0. Every value is finally scaled by Sl.
%
%   Returns a 1-by-Nb row vector (empty when Nb is 0).
%   Reads globals abs_thresh_power, Sl, Nb and centre_of_band_bark.

global abs_thresh_power Sl Nb centre_of_band_bark
ZWICKER_POWER= 0.23;

% preallocate: avoids growing the output in the loop and leaves it
% defined when there are no bands
loudness_dens = zeros( 1, Nb);

for band = 1: Nb
    threshold = abs_thresh_power (band);
    % named band_power (not "input") so the builtin input() is not shadowed
    band_power = pitch_pow_dens (1+ frame, band);
    
    if (centre_of_band_bark (band) < 4) 
        h =  6 / (centre_of_band_bark (band) + 2);
    else
        h = 1;
    end

    if (h > 2) 
        h = 2;
    end
    h = h^ 0.15;
    modified_zwicker_power = ZWICKER_POWER * h;
    if (band_power > threshold) 
        loudness_dens (band) = ((threshold / 0.5)^ modified_zwicker_power)...
            * ((0.5 + 0.5 * band_power / threshold)^ modified_zwicker_power- 1);
    else
        loudness_dens (band) = 0;
    end

    loudness_dens (band) = loudness_dens (band)* Sl;
end
810 |     
function result= pseudo_Lp (x, p)
% PSEUDO_LP  Band-width weighted Lp-style aggregate of x over bands 2..Nb.
%
%   Each |x(band)| is multiplied by width_of_band_bark(band) and raised to
%   p. The sum of these terms is divided by the summed band widths, the
%   p-th root is taken, and the result is multiplied by the summed band
%   widths again. Band 1 is not included.
%
%   Reads globals Nb and width_of_band_bark.

global Nb width_of_band_bark

width_total = 0;
power_total = 0;

band = 2;
while (band <= Nb)
    band_width = width_of_band_bark( band);
    weighted_magnitude = abs( x( band))* band_width;

    power_total = power_total+ weighted_magnitude^ p;
    width_total = width_total+ band_width;
    band = band+ 1;
end

result = width_total* (power_total/ width_total)^ (1/ p);
826 | 
827 |     
function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ...
    pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ...
    avg_pitch_pow_dens_deg, constant)
% FREQ_RESP_COMPENSATION  Scale each band of the reference by a clamped gain.
%
%   For every band in 1..Nb (global) the gain is
%       x = (avg_pitch_pow_dens_deg(band)+ constant)/ ...
%           (avg_pitch_pow_dens_ref(band)+ constant)
%   limited to the range [0.01, 100]. The first number_of_frames rows of
%   pitch_pow_dens_ref in that band are multiplied by x.
%
%   Returns a number_of_frames-by-Nb matrix (empty when either is 0).

global Nb

% preallocate the full result instead of growing it one element at a time
mod_pitch_pow_dens_ref = zeros( number_of_frames, Nb);
frames = 1: number_of_frames;

for band = 1: Nb
    x = (avg_pitch_pow_dens_deg (band) + constant) / ...
        (avg_pitch_pow_dens_ref (band) + constant);
    if (x > 100.0) 
        x = 100.0;
    elseif (x < 0.01) 
        x = 0.01;
    end

    % whole column at once; same per-element product as a frame loop
    mod_pitch_pow_dens_ref( frames, band) = ...
        pitch_pow_dens_ref( frames, band) * x;
end
848 | 
849 | 
850 | 
function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
    silent, pitch_pow_dens, total_number_of_frames) 
% TIME_AVG_AUDIBLE_OF  Per-band time average of clearly audible power.
%
%   For every band in 1..Nb (global), sums pitch_pow_dens(frame, band)
%   over frames 1..number_of_frames, skipping frames flagged in silent
%   and values not above 100* abs_thresh_power(band). The sum is divided
%   by total_number_of_frames (not by the number of frames that were
%   actually added).
%
%   Returns a 1-by-Nb row vector; all zeros when number_of_frames is 0.

global Nb abs_thresh_power

avg_pitch_pow_dens = zeros( 1, Nb);

for band = 1: Nb
    audible_floor = 100 * abs_thresh_power (band);
    result = 0;
    for frame = 1: number_of_frames
        if (~silent (frame)) 
            h = pitch_pow_dens (frame, band);
            if (h > audible_floor) 
                result = result + h;
            end
        end
    end

    % stored once per band, after the frame loop has finished
    avg_pitch_pow_dens (band) = result/ total_number_of_frames;
end  
869 | 
870 | 
871 | 
function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample)
% SHORT_TERM_FFT  Power spectrum of one windowed frame.
%
%   Takes the Nf samples of data beginning at start_sample, multiplies
%   them element-wise by Whanning, transforms them with fft and returns
%   the squared magnitude of the first Nf/ 2 bins. The first bin is set
%   to 0.

stop_sample = start_sample+ Nf- 1;
windowed_frame = Whanning.* data( start_sample: stop_sample);

frame_fft = fft( windowed_frame);
lower_half = frame_fft( 1: Nf/ 2);

hz_spectrum = abs( lower_half).^ 2;
hz_spectrum( 1) = 0;
878 | 
879 | 
function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame)
% FREQ_WARPING  Collapse a Hz-scale power spectrum into Nb band powers.
%
%   Consecutive bins of hz_spectrum are summed into bands: band k takes
%   the next nr_of_hz_bands_per_bark_band(k) bins. Each band sum is then
%   multiplied by pow_dens_correction_factor(k) and by Sp.
%
%   frame is accepted for interface compatibility and is not used.
%   Returns a 1-by-Nb row vector (empty when Nb is 0).

global nr_of_hz_bands_per_bark_band pow_dens_correction_factor
global Sp

% preallocate: no growth inside the loop, defined even for Nb == 0
pitch_pow_dens = zeros( 1, Nb);

hz_band = 1;
for bark_band = 1: Nb
    n = nr_of_hz_bands_per_bark_band (bark_band);    
    % accumulator named band_power (not "sum") to keep the builtin usable
    band_power = 0;
    for i = 1: n
        band_power = band_power+ hz_spectrum( hz_band);
        hz_band= hz_band+ 1;
    end
    band_power = band_power* pow_dens_correction_factor (bark_band);
    band_power = band_power* Sp;
    pitch_pow_dens (bark_band) = band_power;
end
898 | 
899 | 
function total_audible_pow = total_audible (frame, ...
    pitch_pow_dens, factor)
% TOTAL_AUDIBLE  Sum of the band powers of one frame that exceed a threshold.
%
%   frame is 0-based: row frame+ 1 of pitch_pow_dens is read. Bands
%   2..Nb are considered (band 1 is left out); a band is added only when
%   its power is greater than factor* abs_thresh_power(band).
%
%   Reads globals Nb and abs_thresh_power.

global Nb abs_thresh_power

row = frame+ 1;
total_audible_pow = 0;

for band = 2: Nb
    band_power = pitch_pow_dens( row, band);
    if (~(band_power > factor* abs_thresh_power( band)))
        % at or below the scaled threshold: not counted
        continue;
    end
    total_audible_pow = total_audible_pow+ band_power;
end
913 | 
914 | 
915 | 
916 | 
917 | 
918 | 
919 | 
920 | 
921 | 


--------------------------------------------------------------------------------
/PESQ/pow_of.m:
--------------------------------------------------------------------------------
function power= pow_of( data, start_point, end_point, divisor)
% POW_OF  Sum of squares of data(start_point: end_point) divided by divisor.

segment = data( start_point: end_point);
energy = sum( segment.^ 2);
power = energy/ divisor;


--------------------------------------------------------------------------------
/PESQ/setup_global.m:
--------------------------------------------------------------------------------
  1 | function setup_global( sampling_rate);
  2 | 
  3 | global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft
  4 | global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH
  5 | 
  6 | global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst
  7 | global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst
  8 | global Utt_Delay Utt_DelayConf Utt_Start Utt_End
  9 | global MAXNUTTERANCES WHOLE_SIGNAL
 10 | global pesq_mos subj_mos cond_nr MINUTTLENGTH
 11 | global CALIBRATE Nfmax Nb Sl Sp 
 12 | global nr_of_hz_bands_per_bark_band centre_of_band_bark 
 13 | global width_of_band_hz centre_of_band_hz width_of_band_bark 
 14 | global pow_dens_correction_factor abs_thresh_power
 15 | 
 16 | CALIBRATE= 0;
 17 | Nfmax= 512;
 18 | 
 19 | MAXNUTTERANCES= 50;
 20 | MINUTTLENGTH= 50;
 21 | WHOLE_SIGNAL= -1;
 22 | UttSearch_Star= zeros( 1, MAXNUTTERANCES);
 23 | UttSearch_End= zeros( 1, MAXNUTTERANCES);
 24 | Utt_DelayEst= zeros( 1, MAXNUTTERANCES);
 25 | Utt_Delay= zeros( 1, MAXNUTTERANCES);
 26 | Utt_DelayConf= zeros( 1, MAXNUTTERANCES);
 27 | Utt_Start= zeros( 1, MAXNUTTERANCES);
 28 | Utt_End= zeros( 1, MAXNUTTERANCES);
 29 | 
 30 | DATAPADDING_MSECS= 320;
 31 | SEARCHBUFFER= 75;
 32 | MINSPEECHLGTH= 4;
 33 | JOINSPEECHLGTH= 50;
 34 | 
 35 | Sp_16k = 6.910853e-006;
 36 | Sl_16k = 1.866055e-001;
 37 | fs_16k= 16000;
 38 | Downsample_16k = 64;
 39 | Align_Nfft_16k = 1024;
 40 | InIIR_Nsos_16k = 12;
 41 | InIIR_Hsos_16k = [
 42 |    0.325631521,        -0.086782860,  -0.238848661,  -1.079416490,  0.434583902;
 43 |    0.403961804,        -0.556985881,  0.153024077,   -0.415115835,  0.696590244;
 44 |    4.736162769,        3.287251046,   1.753289019,   -1.859599046,  0.876284034;
 45 |    0.365373469,        0.000000000,   0.000000000,   -0.634626531,  0.000000000;
 46 |    0.884811506,        0.000000000,   0.000000000,   -0.256725271,  0.141536777;
 47 |    0.723593055,        -1.447186099,  0.723593044,   -1.129587469,  0.657232737;
 48 |    1.644910855,        -1.817280902,  1.249658063,   -1.778403899,  0.801724355;
 49 |    0.633692689,        -0.284644314,  -0.319789663,  0.000000000,   0.000000000;
 50 |    1.032763031,        0.268428979,   0.602913323,   0.000000000,   0.000000000;
 51 |    1.001616361,        -0.823749013,  0.439731942,   -0.885778255,  0.000000000;
 52 |    0.752472096,        -0.375388990,  0.188977609,   -0.077258216,  0.247230734;
 53 |    1.023700575,        0.001661628,   0.521284240,   -0.183867259,  0.354324187
 54 |    ];
 55 | 
 56 | Sp_8k = 2.764344e-5;
 57 | Sl_8k = 1.866055e-1;
 58 | fs_8k= 8000;
 59 | Downsample_8k = 32;
 60 | Align_Nfft_8k = 512;
 61 | InIIR_Nsos_8k = 8;
 62 | InIIR_Hsos_8k = [
 63 |     0.885535424,       -0.885535424,  0.000000000,   -0.771070709,  0.000000000;
 64 |     0.895092588,       1.292907193,   0.449260174,   1.268869037,   0.442025372;
 65 |     4.049527940,       -7.865190042,  3.815662102,   -1.746859852,  0.786305963;
 66 |     0.500002353,       -0.500002353,  0.000000000,   0.000000000,   0.000000000;
 67 |     0.565002834,       -0.241585934,  -0.306009671,  0.259688659,   0.249979657;
 68 |     2.115237288,       0.919935084,   1.141240051,   -1.587313419,  0.665935315;
 69 |     0.912224584,       -0.224397719,  -0.641121413,  -0.246029464,  -0.556720590;
 70 |     0.444617727,       -0.307589321,  0.141638062,   -0.996391149,  0.502251622
 71 |     ];
 72 | 
 73 | nr_of_hz_bands_per_bark_band_8k = [
 74 |     1,    1,    1,    1,    1,     1,    1,    1,    2,    1, ...
 75 |     1,    1,    1,    1,    2,     1,    1,    2,    2,    2, ...
 76 |     2,    2,    2,    2,    2,     3,    3,    3,    3,    4, ...
 77 |     3,    4,    5,    4,    5,     6,    6,    7,    8,    9, ...
 78 |     9,    11
 79 |     ];
 80 | 
 81 | centre_of_band_bark_8k = [
 82 |     0.078672,   0.316341,   0.636559,   0.961246,   1.290450, ...
 83 |     1.624217,   1.962597,   2.305636,   2.653383,   3.005889, ...
 84 |     3.363201,   3.725371,   4.092449,   4.464486,   4.841533, ...
 85 |     5.223642,   5.610866,   6.003256,   6.400869,   6.803755, ...
 86 |     7.211971,   7.625571,   8.044611,   8.469146,   8.899232, ...
 87 |     9.334927,   9.776288,   10.223374,  10.676242,  11.134952,...
 88 |     11.599563,  12.070135,  12.546731,  13.029408,  13.518232,...
 89 |     14.013264,  14.514566,  15.022202,  15.536238,  16.056736,...
 90 |     16.583761,  17.117382
 91 |     ];
 92 | 
 93 | centre_of_band_hz_8k = [
 94 |     7.867213,    31.634144,   63.655895,   96.124611,   129.044968,...
 95 |     162.421738,  196.259659,  230.563568,  265.338348,  300.588867,...     
 96 |     336.320129,  372.537140,  409.244934,  446.448578,  484.568604,...     
 97 |     526.600586,  570.303833,  619.423340,  672.121643,  728.525696,...     
 98 |     785.675964,  846.835693,  909.691650,  977.063293,  1049.861694,...     
 99 |     1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,...   
100 |     1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,...     
101 |     2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,... 
102 |     3492.679932, 3820.219238
103 |     ];
104 | 
105 | width_of_band_bark_8k = [
106 |     0.157344,     0.317994,     0.322441,     0.326934,     0.331474, ...    
107 |     0.336061,     0.340697,     0.345381,     0.350114,     0.354897, ...    
108 |     0.359729,     0.364611,     0.369544,     0.374529,     0.379565, ...    
109 |     0.384653,     0.389794,     0.394989,     0.400236,     0.405538, ...    
110 |     0.410894,     0.416306,     0.421773,     0.427297,     0.432877, ...    
111 |     0.438514,     0.444209,     0.449962,     0.455774,     0.461645, ...    
112 |     0.467577,     0.473569,     0.479621,     0.485736,     0.491912, ...    
113 |     0.498151,     0.504454,     0.510819,     0.517250,     0.523745, ...    
114 |     0.530308,     0.536934
115 |     ];
116 | 
117 | width_of_band_hz_8k = [
118 |     15.734426,  31.799433,  32.244064,   32.693359,   33.147385, ...    
119 |     33.606140,  34.069702,  34.538116,   35.011429,   35.489655, ...    
120 |     35.972870,  36.461121,  36.954407,   37.452911,   40.269653, ...    
121 |     42.311859,  45.992554,  51.348511,   55.040527,   56.775208, ...    
122 |     58.699402,  62.445862,  64.820923,   69.195374,   76.745667, ...   
123 |     84.016235,  90.825684,  97.931152,   103.348877,  107.801880, ...    
124 |     113.552246, 121.490601, 130.420410,  143.431763,  158.486816,  ...   
125 |     176.872803, 198.314697, 219.549561,  240.600098,  268.702393,  ...   
126 |     306.060059, 349.937012
127 |     ];
128 | 
129 | pow_dens_correction_factor_8k = [
130 |     100.000000,  99.999992,   100.000000,  100.000008,   100.000008,... 
131 |     100.000015,  99.999992,   99.999969,   50.000027,    100.000000,...     
132 |     99.999969,   100.000015,  99.999947,   100.000061,   53.047077, ...    
133 |     110.000046,  117.991989,  65.000000,   68.760147,    69.999931, ...    
134 |     71.428818,   75.000038,   76.843384,   80.968781,    88.646126, ...    
135 |     63.864388,   68.155350,   72.547775,   75.584831,    58.379192,...     
136 |     80.950836,   64.135651,   54.384785,   73.821884,    64.437073, ...    
137 |     59.176456,   65.521278,   61.399822,   58.144047,    57.004543,...     
138 |     64.126297,   59.248363
139 |     ];
140 | 
141 | abs_thresh_power_8k = [
142 |     51286152,     2454709.500,  70794.593750,  ...
143 |     4897.788574,  1174.897705,  389.045166,  ...
144 |     104.712860,   45.708820,    17.782795,   ...
145 |     9.772372,     4.897789,     3.090296,     ...
146 |     1.905461,     1.258925,     0.977237,     ...
147 |     0.724436,     0.562341,     0.457088,     ...
148 |     0.389045,     0.331131,     0.295121,     ...
149 |     0.269153,     0.257040,     0.251189,     ...
150 |     0.251189,     0.251189,     0.251189,     ...
151 |     0.263027,     0.288403,     0.309030,     ...
152 |     0.338844,     0.371535,     0.398107,     ...
153 |     0.436516,     0.467735,     0.489779,     ...
154 |     0.501187,     0.501187,     0.512861,     ...
155 |     0.524807,     0.524807,     0.524807
156 |     ];
157 | 
158 | nr_of_hz_bands_per_bark_band_16k = [
159 |     1,    1,    1,    1,    1,   1,    1,    1,    2,    1,    ...
160 |     1,    1,    1,    1,    2,   1,    1,    2,    2,    2,    ...
161 |     2,    2,    2,    2,    2,   3,    3,    3,    3,    4,    ...
162 |     3,    4,    5,    4,    5,   6,    6,    7,    8,    9,    ...
163 |     9,    12,   12,   15,   16,  18,   21,   25,   20
164 |     ];
165 | 
166 | centre_of_band_bark_16k = [
167 |     0.078672,   0.316341,   0.636559,    0.961246,     1.290450, ...
168 |     1.624217,   1.962597,   2.305636,    2.653383,     3.005889, ...
169 |     3.363201,   3.725371,   4.092449,    4.464486,     4.841533, ...
170 |     5.223642,   5.610866,   6.003256,    6.400869,     6.803755, ...
171 |     7.211971,   7.625571,   8.044611,    8.469146,     8.899232, ...
172 |     9.334927,   9.776288,   10.223374,   10.676242,    11.134952, ...
173 |     11.599563,  12.070135,  12.546731,   13.029408,    13.518232, ...
174 |     14.013264,  14.514566,  15.022202,   15.536238,    16.056736, ...
175 |     16.583761,  17.117382,  17.657663,   18.204674,    18.758478, ...
176 |     19.319147,  19.886751,  20.461355,   21.043034
177 |     ];
178 | 
179 | centre_of_band_hz_16k = [
180 |     7.867213,     31.634144,    63.655895,    96.124611,   129.044968,...
181 |     162.421738,   196.259659,   230.563568,   265.338348,  300.588867,...
182 |     336.320129,   372.537140,   409.244934,   446.448578,  484.568604,...
183 |     526.600586,   570.303833,   619.423340,   672.121643,  728.525696,...
184 |     785.675964,   846.835693,   909.691650,   977.063293,  1049.861694,...
185 |     1129.635986,  1217.257568,  1312.109497,  1412.501465, 1517.999390,...
186 |     1628.894165,  1746.194336,  1871.568848,  2008.776123, 2158.979248,...
187 |     2326.743164,  2513.787109,  2722.488770,  2952.586670, 3205.835449,...
188 |     3492.679932,  3820.219238,  4193.938477,  4619.846191, 5100.437012,...
189 |     5636.199219,  6234.313477,  6946.734863,  7796.473633
190 |     ];
191 | 
192 | width_of_band_bark_16k = [
193 |     0.157344,     0.317994,     0.322441,     0.326934,     0.331474,...
194 |     0.336061,     0.340697,     0.345381,     0.350114,     0.354897,...
195 |     0.359729,     0.364611,     0.369544,     0.374529,     0.379565,...
196 |     0.384653,     0.389794,     0.394989,     0.400236,     0.405538,...
197 |     0.410894,     0.416306,     0.421773,     0.427297,     0.432877,...
198 |     0.438514,     0.444209,     0.449962,     0.455774,     0.461645,...
199 |     0.467577,     0.473569,     0.479621,     0.485736,     0.491912,...
200 |     0.498151,     0.504454,     0.510819,     0.517250,     0.523745,...
201 |     0.530308,     0.536934,     0.543629,     0.550390,     0.557220,...
202 |     0.564119,     0.571085,     0.578125,     0.585232
203 |     ];
204 | 
205 | width_of_band_hz_16k = [
206 |     15.734426,     31.799433,     32.244064,     32.693359,     ...
207 |     33.147385,     33.606140,     34.069702,     34.538116,   ...
208 |     35.011429,     35.489655,     35.972870,     36.461121,    ... 
209 |     36.954407,     37.452911,     40.269653,     42.311859,   ...
210 |     45.992554,     51.348511,     55.040527,     56.775208,    ...
211 |     58.699402,     62.445862,     64.820923,     69.195374,   ...
212 |     76.745667,     84.016235,     90.825684,     97.931152,   ...
213 |     103.348877,    107.801880,    113.552246,    121.490601,  ...
214 |     130.420410,    143.431763,    158.486816,    176.872803,  ...
215 |     198.314697,    219.549561,    240.600098,    268.702393,  ...
216 |     306.060059,    349.937012,    398.686279,    454.713867,  ...
217 |     506.841797,    564.863770,    637.261230,    794.717285,  ...
218 |     931.068359
219 |     ];
220 | 
221 | pow_dens_correction_factor_16k = [
222 |     100.000000,     99.999992,     100.000000,    100.000008,...
223 |     100.000008,     100.000015,    99.999992,     99.999969,  ...
224 |     50.000027,      100.000000,    99.999969,     100.000015, ...
225 |     99.999947,      100.000061,    53.047077,     110.000046, ...
226 |     117.991989,     65.000000,     68.760147,     69.999931, ...
227 |     71.428818,      75.000038,     76.843384,     80.968781, ...
228 |     88.646126,      63.864388,     68.155350,     72.547775, ...
229 |     75.584831,      58.379192,     80.950836,     64.135651, ...
230 |     54.384785,      73.821884,     64.437073,     59.176456,     ...
231 |     65.521278,      61.399822,     58.144047,     57.004543,     ...
232 |     64.126297,      54.311001,     61.114979,     55.077751,     ...
233 |     56.849335,      55.628868,     53.137054,     54.985844,    ...
234 |     79.546974
235 |     ];
236 | 
237 | abs_thresh_power_16k = [
238 |     51286152.00,  2454709.500,  70794.593750,  ...
239 |     4897.788574,  1174.897705,  389.045166,     ...
240 |     104.712860,   45.708820,    17.782795,    ...
241 |     9.772372,     4.897789,     3.090296,   ...
242 |     1.905461,     1.258925,     0.977237,     ...
243 |     0.724436,     0.562341,     0.457088,     ...
244 |     0.389045,     0.331131,     0.295121,     ...
245 |     0.269153,     0.257040,     0.251189,    ...
246 |     0.251189,     0.251189,     0.251189,    ...
247 |     0.263027,     0.288403,     0.309030,     ...
248 |     0.338844,     0.371535,     0.398107,    ...
249 |     0.436516,     0.467735,     0.489779,    ...
250 |     0.501187,     0.501187,     0.512861,    ...
251 |     0.524807,     0.524807,     0.524807,    ...
252 |     0.512861,     0.478630,     0.426580,    ...
253 |     0.371535,     0.363078,     0.416869,    ...
254 |     0.537032
255 |     ];
256 | 
257 | if (sampling_rate== fs_16k)
258 |     Downsample = Downsample_16k;
259 |     InIIR_Hsos = InIIR_Hsos_16k;
260 |     InIIR_Nsos = InIIR_Nsos_16k;
261 |     Align_Nfft = Align_Nfft_16k;
262 |     Fs= fs_16k;
263 |     
264 |     Nb = 49;
265 |     Sl = Sl_16k;
266 |     Sp = Sp_16k;
267 |     nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k;
268 |     centre_of_band_bark = centre_of_band_bark_16k;
269 |     centre_of_band_hz = centre_of_band_hz_16k;
270 |     width_of_band_bark = width_of_band_bark_16k;
271 |     width_of_band_hz = width_of_band_hz_16k;
272 |     pow_dens_correction_factor = pow_dens_correction_factor_16k;
273 |     abs_thresh_power = abs_thresh_power_16k;
274 |     
275 |     return;
276 | end
277 | 
278 | if (sampling_rate== fs_8k)
279 |     Downsample = Downsample_8k;
280 |     InIIR_Hsos = InIIR_Hsos_8k;
281 |     InIIR_Nsos = InIIR_Nsos_8k;
282 |     Align_Nfft = Align_Nfft_8k;
283 |     Fs= fs_8k;
284 |     
285 |     Nb = 42;
286 |     Sl = Sl_8k;
287 |     Sp = Sp_8k;
288 |     nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k;
289 |     centre_of_band_bark = centre_of_band_bark_8k;
290 |     centre_of_band_hz = centre_of_band_hz_8k;
291 |     width_of_band_bark = width_of_band_bark_8k;
292 |     width_of_band_hz = width_of_band_hz_8k;
293 |     pow_dens_correction_factor = pow_dens_correction_factor_8k;
294 |     abs_thresh_power = abs_thresh_power_8k;
295 |     return;
296 | end
297 | 
298 | 
299 |     
300 |     
301 |     


--------------------------------------------------------------------------------
/PESQ/split_align.m:
--------------------------------------------------------------------------------
  1 | function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
  2 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
  3 |     Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
  4 |     Utt_DelayEst_l, Utt_DelayConf_l)
  5 | 
  6 | global MAXNUTTERANCES Align_Nfft Downsample Window    
  7 | global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End 
  8 | global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
  9 | 
 10 | Utt_BPs= zeros( 1, 41);
 11 | Utt_ED1= zeros( 1, 41);
 12 | Utt_ED2= zeros( 1, 41);
 13 | Utt_D1= zeros( 1, 41);
 14 | Utt_D2= zeros( 1, 41);
 15 | Utt_DC1= zeros( 1, 41);
 16 | Utt_DC2= zeros( 1, 41);
 17 | 
 18 | 
 19 | Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
 20 | Utt_Test = MAXNUTTERANCES;
 21 | Best_DC1 = 0.0;
 22 | Best_DC2 = 0.0;
 23 | kernel = Align_Nfft / 64;
 24 | Delta = Align_Nfft / (4 * Downsample);
 25 | Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
 26 | Step = Step* Delta;
 27 | % fprintf( 'Step is %f\n', Step);
 28 | 
 29 | Pad = floor( Utt_Len / 10);
 30 | if( Pad < 75 ) 
 31 |     Pad = 75;
 32 | end
 33 | 
 34 | Utt_BPs(1) = Utt_SpeechStart + Pad;
 35 | N_BPs = 1;
 36 | while( 1)
 37 |     N_BPs= N_BPs+ 1;
 38 |     Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
 39 |     if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
 40 |         break;
 41 |     end
 42 | end
 43 | 
 44 | if( N_BPs <= 1 ) 
 45 |     return;
 46 | end
 47 | 
 48 | % fprintf( 'Utt_DelayEst_l, Utt_Start_l, N_BPs is %d,%d,%d\n', ...
 49 | %     Utt_DelayEst_l, Utt_Start_l, N_BPs);
 50 | for bp = 1: N_BPs- 1
 51 |     Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
 52 |     UttSearch_Start(Utt_Test) = Utt_Start_l;
 53 |     UttSearch_End(Utt_Test) = Utt_BPs(bp);
 54 | %     fprintf( 'bp,Utt_BPs(%d) is %d,%d\n', bp,bp,Utt_BPs(bp)); 
 55 |     
 56 |     crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
 57 |         deg_Nsamples, MAXNUTTERANCES);
 58 |     Utt_ED1(bp) = Utt_Delay(Utt_Test);
 59 | 
 60 |     Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
 61 |     UttSearch_Start(Utt_Test) = Utt_BPs(bp);
 62 |     UttSearch_End(Utt_Test) = Utt_End_l;
 63 |     
 64 |     crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
 65 |         deg_Nsamples, MAXNUTTERANCES);
 66 |     Utt_ED2(bp) = Utt_Delay(Utt_Test);
 67 | end
 68 | 
 69 | % stream = fopen( 'matmat.txt', 'wt' );	
 70 | % for count= 1: N_BPs- 1 
 71 | %     fprintf( stream, '%d\n', Utt_ED2(count));
 72 | % end
 73 | % fclose( stream );
 74 | 
 75 | 
 76 | Utt_DC1(1: N_BPs-1) = -2.0;
 77 | % stream= fopen( 'what_mmm.txt', 'at');
 78 | while( 1 )
 79 |     bp = 1;
 80 |     while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
 81 |         bp = bp+ 1;
 82 |     end
 83 |     if( bp >= N_BPs )
 84 |         break;
 85 |     end
 86 |     
 87 |     estdelay = Utt_ED1(bp);
 88 | %     fprintf( 'bp,estdelay is %d,%d\n', bp, estdelay);
 89 |     H(1: Align_Nfft)= 0;
 90 |     Hsum = 0.0;
 91 |     
 92 |     startr = (Utt_Start_l- 1) * Downsample+ 1;
 93 |     startd = startr + estdelay;
 94 | %     fprintf( 'startr/startd is %d/%d\n', startr, startd);
 95 |     
 96 |     if ( startd < 0 )
 97 |         startr = -estdelay+ 1;
 98 |         startd = 1;
 99 |     end
100 | 
101 |     while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
102 |             ((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
103 |         X1= ref_data(startr: startr+ Align_Nfft- 1).* Window;
104 |         X2= deg_data(startd: startd+ Align_Nfft- 1).* Window;
105 |         
106 |         X1_fft= fft( X1, Align_Nfft );
107 |         X1_fft_conj= conj( X1_fft);
108 |         X2_fft= fft( X2, Align_Nfft );
109 |         X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
110 |         
111 |         X1= abs( X1);
112 |         v_max= max( X1)* 0.99;        
113 |         n_max = (v_max^ 0.125 )/ kernel;
114 | %         fprintf( stream, '%f %f\n', v_max, n_max);
115 |         
116 |         for count = 0: Align_Nfft- 1
117 |             if( X1(count+ 1) > v_max )
118 |                 Hsum = Hsum+ n_max * kernel;
119 |                 for k = 1-kernel: kernel- 1
120 |                     H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
121 |                         H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
122 |                         n_max* (kernel- abs(k));
123 |                 end
124 |             end
125 |         end
126 | 
127 |         startr = startr+ (Align_Nfft / 4);
128 |         startd = startd+ (Align_Nfft / 4);
129 |     end
130 | 
131 |     [v_max, I_max] = max( H);
132 |     if( I_max- 1 >= (Align_Nfft/2) )
133 |         I_max = I_max- Align_Nfft;
134 |     end
135 | 
136 |     Utt_D1(bp) = estdelay + I_max- 1;
137 |     if( Hsum > 0.0 )
138 | %         if (Utt_Len== 236)
139 | %             fprintf( 'v_max, Hsum is %f, %f\n', v_max, Hsum);
140 | %         end
141 |         Utt_DC1(bp) = v_max / Hsum;
142 |     else
143 |         Utt_DC1(bp) = 0.0;
144 |     end
145 | 
146 | %     fprintf( 'bp/startr/startd is %d/%d/%d\n', bp, startr, startd);
147 |     while( bp < (N_BPs - 1) )
148 |         bp = bp + 1;
149 |         
150 |         if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
151 | %             loopno= 0;
152 |             while(((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
153 |                     ((startr+ Align_Nfft)<= ...
154 |                     ((Utt_BPs(bp)- 1)* Downsample+ 1) ))
155 |                 X1= ref_data( startr: startr+ Align_Nfft- 1).* ...
156 |                     Window;
157 | % %                 if (Utt_Len== 321)
158 | %                     fid= fopen( 'what_mat.txt', 'at');
159 | %                     fprintf( fid, '%f\n', Window);
160 | %                     fclose( fid);
161 | % %                     fprintf( '\n');
162 | % %                 end
163 |                 X2= deg_data( startd: startd+ Align_Nfft- 1).* ...
164 |                     Window;
165 |                 X1_fft= fft( X1, Align_Nfft );
166 |                 X1_fft_conj= conj( X1_fft);
167 |                 X2_fft= fft( X2, Align_Nfft );
168 |                 X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
169 |                 
170 |                 X1= abs( X1);
171 |                 v_max = 0.99* max( X1);
172 |                 n_max = (v_max^ 0.125)/ kernel;
173 | %                 fprintf( 'v_max n_max is %f %f\n', v_max, n_max);
174 |                 
175 |                 for count = 0: Align_Nfft- 1
176 |                     if( X1(count+ 1) > v_max )
177 |                         Hsum = Hsum+ n_max * kernel;
178 |                         for k = 1-kernel: kernel-1
179 |                             H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
180 |                                 H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
181 |                                 n_max* (kernel- abs(k));
182 |                         end
183 |                     end
184 |                 end
185 | 
186 |                 startr = startr+ (Align_Nfft / 4);
187 |                 startd = startd+ (Align_Nfft / 4);
188 |                 
189 | %                 loopno= loopno+ 1;
190 |             end
191 | %             fprintf( 'loopno is %d\n', loopno);
192 | 
193 |             [v_max, I_max] = max( H);
194 | %             fprintf( 'I_max is %d ', I_max);
195 |             if( I_max- 1 >= (Align_Nfft/2) )
196 |                 I_max = I_max- Align_Nfft;
197 |             end
198 |             
199 | 
200 |             Utt_D1(bp) = estdelay + I_max- 1;
201 |             if( Hsum > 0.0 )
202 | %                 fprintf( 'v_max Hsum is %f %f\n', v_max, Hsum);
203 |                 Utt_DC1(bp) = v_max / Hsum;
204 |             else
205 |                 Utt_DC1(bp) = 0.0;
206 |             end
207 |         end
208 |     end
209 | end
210 | % fclose( stream);
211 | 
212 | for bp= 1: N_BPs- 1
213 |     if( Utt_DC1(bp) > Utt_DelayConf_l )
214 |         Utt_DC2(bp) = -2.0;
215 |     else
216 |         Utt_DC2(bp) = 0.0;
217 |     end
218 | end
219 | 
220 | while( 1 )
221 |     bp = N_BPs- 1;
222 |     while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
223 |         bp = bp- 1; 
224 |     end
225 |     if( bp < 1 )
226 |         break;
227 |     end 
228 | 
229 |     estdelay = Utt_ED2(bp);
230 |     H( 1: Align_Nfft)= 0;
231 |     Hsum = 0.0;
232 |     
233 |     startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
234 |     startd = startr + estdelay;
235 |     
236 | %     fprintf( '***NEW startr is %d\n', startr);
237 |     
238 | %     fprintf( 'startr/d, deg_Nsamples is %d/%d, %d\n', startr,startd, ...
239 | %         deg_Nsamples);
240 | %     fprintf( 'deg_data has %d elements\n', numel( deg_data));
241 |     
242 |     if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
243 |         startd = deg_Nsamples - Align_Nfft+ 1;
244 |         startr = startd - estdelay;
245 |     end
246 | 
247 |     while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
248 |         X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
249 |         X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
250 |         
251 |         X1_fft= fft( X1, Align_Nfft);
252 |         X1_fft_conj= conj( X1_fft);
253 |         X2_fft= fft( X2, Align_Nfft);
254 |         
255 |         X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );
256 |         X1= abs( X1);
257 |         
258 |         v_max = max( X1)* 0.99;
259 |         n_max = ( v_max^ 0.125 )/ kernel;
260 |         
261 |         for count = 0: Align_Nfft- 1
262 |             if( X1(count+ 1) > v_max )
263 |                 Hsum = Hsum+ n_max * kernel;
264 |                 for k = 1-kernel: kernel- 1
265 |                     H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))= ...
266 |                         H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
267 |                         n_max* (kernel- abs(k));
268 |                 end
269 |             end
270 |         end
271 | 
272 |         startr = startr- (Align_Nfft / 4);
273 |         startd = startd- (Align_Nfft / 4);
274 |     end
275 | 
276 |     [v_max, I_max] = max( H);
277 |     if( I_max- 1 >= (Align_Nfft/2) )
278 |         I_max = I_max- Align_Nfft;
279 |     end
280 | 
281 |     Utt_D2(bp) = estdelay + I_max- 1;
282 |     if( Hsum > 0.0 )
283 |         Utt_DC2(bp) = v_max / Hsum;
284 |     else
285 |         Utt_DC2(bp) = 0.0;
286 |     end
287 | 
288 |     while( bp > 1 )
289 |         bp = bp - 1;
290 |         if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
291 |             while( (startd >= 1) && (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1)) 
292 |                  X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
293 |                  X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;
294 |                  X1_fft_conj= conj( fft( X1, Align_Nfft));
295 |                  X2_fft= fft( X2, Align_Nfft);
296 |                  X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft);
297 |                  
298 |                  X1= abs( X1);
299 |                  v_max = max( X1)* 0.99;
300 |                  n_max = (v_max^ 0.125)/ kernel;
301 |                  
302 |                  for count = 0: Align_Nfft- 1
303 |                      if( X1(count+ 1) > v_max )
304 |                          Hsum = Hsum+ n_max * kernel;
305 |                          for k = 1-kernel: kernel- 1
306 |                              H(1+ rem( count+ k+ Align_Nfft, Align_Nfft))= ...
307 |                                  H(1+ rem(count+ k+ Align_Nfft, Align_Nfft))+ ...
308 |                                  n_max* (kernel- abs(k));
309 |                          end
310 |                      end
311 |                  end
312 | 
313 |                  startr = startr- (Align_Nfft / 4);
314 |                  startd = startd- (Align_Nfft / 4);
315 |             end
316 | 
317 |             [v_max, I_max] = max( H);
318 |             if( I_max- 1 >= (Align_Nfft/2) )
319 |                 I_max = I_max- Align_Nfft;
320 |             end
321 |             
322 | 
323 |             Utt_D2(bp) = estdelay + I_max- 1;
324 |             if( Hsum > 0.0 )
325 |                 Utt_DC2(bp) = v_max / Hsum;
326 |             else
327 |                 Utt_DC2(bp) = 0.0;
328 |             end
329 |         end
330 |     end
331 | end
332 | 
333 | % fid= fopen( 'uttinfo_mat.txt', 'wt');
334 | % fprintf( fid, '%f\n', Utt_D2);
335 | % fprintf( fid, '\n');
336 | % fprintf( fid, '%f\n', Utt_DC2);
337 | % fclose( fid);
338 | 
339 | % fprintf( 'Utt_Len, N_BPs is %d, %d\n', Utt_Len, N_BPs);
340 | for bp = 1: N_BPs- 1
341 |     if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
342 |             ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
343 |             (Utt_DC1(bp) > Utt_DelayConf_l) && ...
344 |             (Utt_DC2(bp) > Utt_DelayConf_l) )
345 |         Best_ED1 = Utt_ED1(bp);
346 |         Best_D1 = Utt_D1(bp);
347 |         Best_DC1 = Utt_DC1(bp);
348 |         Best_ED2 = Utt_ED2(bp);
349 |         Best_D2 = Utt_D2(bp);
350 |         Best_DC2 = Utt_DC2(bp);
351 |         Best_BP = Utt_BPs(bp);
352 | %         fprintf( 'in loop...');
353 |     end
354 | end
355 | 
356 | % if (Utt_Len== 236)
357 | %     fid= fopen( 'matmat.txt', 'wt');
358 | %     fprintf( fid, 'N_BPs is %d\n', N_BPs);
359 | %     fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
360 | %     fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
361 | %     for bp= 1: N_BPs- 1
362 | %         fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
363 | %             Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
364 | %             Utt_DC1( bp), Utt_BPs( bp));
365 | %     end
366 | %     fclose( fid);
367 | % end
368 | 
369 | 
370 | 
371 | 
372 | 
373 | 
374 | 
375 | 
376 | 
377 | 
378 | 
379 | 
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | 
388 | 
389 | 
390 | 
391 | 


--------------------------------------------------------------------------------
/PESQ/stoi.m:
--------------------------------------------------------------------------------
  1 | function d = stoi(x, y, fs_signal)
  2 | %   d = stoi(x, y, fs_signal) returns the output of the short-time
  3 | %   objective intelligibility (STOI) measure described in [1, 2], where x 
  4 | %   and y denote the clean and processed speech, respectively, with sample
  5 | %   rate fs_signal in Hz. The output d is expected to have a monotonic 
  6 | %   relation with the subjective speech-intelligibility, where a higher d 
  7 | %   denotes better intelligible speech. See [1, 2] for more details.
  8 | %
  9 | %   References:
 10 | %      [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
 11 | %      Objective Intelligibility Measure for Time-Frequency Weighted Noisy
 12 | %      Speech', ICASSP 2010, Texas, Dallas.
 13 | %
 14 | %      [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for 
 15 | %      Intelligibility Prediction of Time-Frequency Weighted Noisy Speech', 
 16 | %      IEEE Transactions on Audio, Speech, and Language Processing, 2011. 
 17 | %
 18 | %
 19 | % Copyright 2009: Delft University of Technology, Signal & Information
 20 | % Processing Lab. The software is free for non-commercial use. This program
 21 | % comes WITHOUT ANY WARRANTY.
 22 | %
 23 | %
 24 | %
 25 | % Updates:
 26 | % 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'
 27 | 
 28 | if length(x)~=length(y)
 29 | %     error('x and y should have the same length');
 30 |     if length(x)>length(y)
 31 |         x=x(1:length(y));
 32 |     else
 33 |         y=y(1:length(x));
 34 |     end
 35 | end
 36 | 
 37 | % initialization
 38 | x           = x(:);                             % clean speech column vector
 39 | y           = y(:);                             % processed speech column vector
 40 | 
 41 | fs          = 10000;                            % sample rate of proposed intelligibility measure
 42 | N_frame    	= 256;                              % window support
 43 | K           = 512;                              % FFT size
 44 | J           = 15;                               % Number of 1/3 octave bands
 45 | mn          = 150;                              % Center frequency of first 1/3 octave band in Hz.
 46 | H           = thirdoct(fs, K, J, mn);           % Get 1/3 octave band matrix
 47 | N           = 30;                               % Number of frames for intermediate intelligibility measure (Length analysis window)
 48 | Beta        = -15;                           	% lower SDR-bound
 49 | dyn_range   = 40;                               % speech dynamic range
 50 | 
 51 | % resample signals if other samplerate is used than fs
 52 | if fs_signal ~= fs
 53 |     x	= resample(x, fs, fs_signal);
 54 |     y 	= resample(y, fs, fs_signal);
 55 | end
 56 | 
 57 | % remove silent frames
 58 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
 59 | 
 60 | % apply 1/3 octave band TF-decomposition
 61 | x_hat     	= stdft(x, N_frame, N_frame/2, K); 	% apply short-time DFT to clean speech
 62 | y_hat     	= stdft(y, N_frame, N_frame/2, K); 	% apply short-time DFT to processed speech
 63 | 
 64 | x_hat       = x_hat(:, 1:(K/2+1)).';         	% take clean single-sided spectrum
 65 | y_hat       = y_hat(:, 1:(K/2+1)).';        	% take processed single-sided spectrum
 66 | 
 67 | X           = zeros(J, size(x_hat, 2));         % init memory for clean speech 1/3 octave band TF-representation 
 68 | Y           = zeros(J, size(y_hat, 2));         % init memory for processed speech 1/3 octave band TF-representation 
 69 | 
 70 | for i = 1:size(x_hat, 2)
 71 |     X(:, i)	= sqrt(H*abs(x_hat(:, i)).^2);      % apply 1/3 octave bands as described in Eq.(1) [1]
 72 |     Y(:, i)	= sqrt(H*abs(y_hat(:, i)).^2);
 73 | end
 74 | 
 75 | % loop al segments of length N and obtain intermediate intelligibility measure for all TF-regions
 76 | d_interm  	= zeros(J, length(N:size(X, 2)));                               % init memory for intermediate intelligibility measure
 77 | c           = 10^(-Beta/20);                                                % constant for clipping procedure
 78 | 
 79 | for m = N:size(X, 2)
 80 |     X_seg  	= X(:, (m-N+1):m);                                              % region with length N of clean TF-units for all j
 81 |     Y_seg  	= Y(:, (m-N+1):m);                                              % region with length N of processed TF-units for all j
 82 |     alpha   = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2));                     % obtain scale factor for normalizing processed TF-region for all j
 83 |     aY_seg 	= Y_seg.*repmat(alpha, [1 N]);                               	% obtain \alpha*Y_j(n) from Eq.(2) [1]
 84 |     for j = 1:J
 85 |       	Y_prime             = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c); % apply clipping from Eq.(3)   	
 86 |         d_interm(j, m-N+1)  = taa_corr(X_seg(j, :).', Y_prime(:));          % obtain correlation coeffecient from Eq.(4) [1]
 87 |     end
 88 | end
 89 |         
 90 | d = mean(d_interm(:));                                                      % combine all intermediate intelligibility measures as in Eq.(4) [1]
 91 | 
 92 | %%
 93 | function  [A cf] = thirdoct(fs, N_fft, numBands, mn)
 94 | %   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns a 0/1 matrix that
 95 | %   sums single-sided DFT bins into 1/3 octave bands.
 96 | %   inputs:
 97 | %       FS:         samplerate
 98 | %       N_FFT:      FFT size
 99 | %       NUMBANDS:   number of bands requested
100 | %       MN:         center frequency of first 1/3 octave band
101 | %   outputs:
102 | %       A:          band matrix, one row per kept band, one column per bin
103 | %       CF:         center frequencies of the kept bands
104 | 
105 | bin_hz      = linspace(0, fs, N_fft+1);
106 | bin_hz      = bin_hz(1:(N_fft/2+1));                % bins from 0 up to fs/2
107 | band        = 0:(numBands-1);
108 | cf          = 2.^(band/3)*mn;
109 | lo_hz       = sqrt(cf.*2.^((band-1)/3)*mn);         % lower band edges
110 | hi_hz       = sqrt(cf.*2.^((band+1)/3)*mn);         % upper band edges
111 | A           = zeros(numBands, length(bin_hz));
112 | 
113 | for row = 1:numel(cf)
114 |     % snap both edges to the nearest DFT bin; the upper edge bin is excluded
115 |     [~, lo_bin]                 = min((bin_hz-lo_hz(row)).^2);
116 |     [~, hi_bin]                 = min((bin_hz-hi_hz(row)).^2);
117 |     A(row, lo_bin:(hi_bin-1))   = 1;
118 | end
119 | 
120 | % keep the bands up to the last one that is non-empty and at least as
121 | % wide (in bins) as the band before it
122 | width       = sum(A, 2);
123 | grows       = width(2:end) >= width(1:(end-1));
124 | numBands    = find(grows & (width(2:end) ~= 0), 1, 'last')+1;
125 | A           = A(1:numBands, :);
126 | cf          = cf(1:numBands);
127 | 
128 | %%
129 | function x_stdft = stdft(x, N, K, N_fft)
130 | %   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
131 | %   hanning-windowed DFT of X. Frames are N samples long, consecutive
132 | %   frames start K samples apart, and each is transformed with an
133 | %   N_FFT-point FFT. Row i of X_STDFT is frame i; columns are DFT bins.
134 | 
135 | starts      = 1:K:(length(x)-N);            % first sample of every frame
136 | n_frames    = length(starts);
137 | x_stdft     = zeros(n_frames, N_fft);
138 | 
139 | x           = x(:);
140 | win         = hanning(N);
141 | for f = 1:n_frames
142 |     seg             = x(starts(f):(starts(f)+N-1));
143 |     x_stdft(f, :)   = fft(seg.*win, N_fft);
144 | end
145 | 
146 | %%
147 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
148 | %   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y are cut
149 | %   into hanning-windowed frames of length N that start K samples apart.
150 | %   With X_MAX the largest frame energy of X in dB, X_SIL and Y_SIL are
151 | %   overlap-added from those frames of X and Y whose X energy is above
152 | %   X_MAX-RANGE. When no frame qualifies (X shorter than N+1 samples, or
153 | %   e.g. an all-zero X) both outputs are empty 0-by-1 vectors, not an error.
154 | x_sil   = zeros(0, 1);                      % returned as-is if nothing is kept
155 | y_sil   = zeros(0, 1);
156 | x       = x(:);
157 | y       = y(:);
158 | frames  = 1:K:(length(x)-N);
159 | if isempty(frames), return; end             % too short for a single frame
160 | w       = hanning(N);
161 | msk     = zeros(size(frames));
162 | for j = 1:length(frames)
163 |     jj      = frames(j):(frames(j)+N-1);
164 |     msk(j) 	= 20*log10(norm(x(jj).*w)./sqrt(N));
165 | end
166 | 
167 | msk     = (msk-max(msk)+range)>0;           % -Inf-(-Inf)=NaN compares false for all-zero X
168 | if ~any(msk), return; end                   % jj_o below would otherwise be undefined
169 | 
170 | count   = 1;
171 | x_sil   = zeros(size(x));
172 | y_sil   = zeros(size(y));
173 | for j = 1:length(frames)
174 |     if msk(j)
175 |         jj_i            = frames(j):(frames(j)+N-1);            % source frame
176 |         jj_o            = frames(count):(frames(count)+N-1);    % next output slot
177 |         x_sil(jj_o)     = x_sil(jj_o) + x(jj_i).*w;
178 |         y_sil(jj_o)  	= y_sil(jj_o) + y(jj_i).*w;
179 |         count           = count+1;
180 |     end
181 | end
182 | x_sil = x_sil(1:jj_o(end));                 % cut after the last written sample
183 | y_sil = y_sil(1:jj_o(end));
184 | 
185 | %%
186 | function rho = taa_corr(x, y)
187 | %   RHO = TAA_CORR(X, Y) returns the correlation coefficient of the column
188 | %   vectors X and Y (each is centred and scaled to unit length first).
189 | x_c     = x-mean(x);
190 | y_c     = y-mean(y);
191 | x_u     = x_c/sqrt(sum(x_c.^2));
192 | y_u     = y_c/sqrt(sum(y_c.^2));
193 | rho     = sum(x_u.*y_u);


--------------------------------------------------------------------------------
/PESQ/time_align.m:
--------------------------------------------------------------------------------
 1 | function time_align(ref_data, ref_Nsamples, ...
 2 |     deg_data, deg_Nsamples, Utt_id)
 3 | % TIME_ALIGN  Refine the delay of utterance Utt_id; writes the globals
 4 | % Utt_Delay(Utt_id) and Utt_DelayConf(Utt_id).
 5 | % Windowed Align_Nfft-sample frames of ref_data and deg_data (the latter
 6 | % offset by the crude estimate Utt_DelayEst(Utt_id)) are cross-correlated
 7 | % via FFT; lags near each frame's peak vote into a histogram H, which is
 8 | % smoothed before its maximum is taken. ref_Nsamples is not used.
 9 | 
10 | global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End 
11 | global Align_Nfft Downsample Window
12 | 
13 | estdelay = Utt_DelayEst(Utt_id);
14 | 
15 | H = zeros( 1, Align_Nfft);
16 | X1= zeros( 1, Align_Nfft);
17 | X2= zeros( 1, Align_Nfft);
18 | 
19 | startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
20 | startd = startr + estdelay;
21 | if ( startd < 1 )   % indices are 1-based, so startd == 0 is already out of range
22 |     startr = 1 -estdelay;
23 |     startd = 1;
24 | end
25 | 
26 | while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
27 |         ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
28 |     X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;    
29 |     X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;  
30 |     
31 |     % find cross-correlation between X1 and X2
32 |     X1_fft= fft( X1, Align_Nfft );
33 |     X1_fft_conj= conj( X1_fft);
34 |     X2_fft= fft( X2, Align_Nfft );    
35 |     X1= ifft( X1_fft_conj.* X2_fft, Align_Nfft );        
36 | 
37 |     X1= abs( X1);     
38 |     v_max = max( X1)* 0.99;             % threshold just below this frame's peak
39 |     
40 |     X1_greater_vmax= find( X1 > v_max );
41 |     H( X1_greater_vmax )= H( X1_greater_vmax )+ v_max^ 0.125;   % weighted vote
42 |     
43 |     startr = startr+ Align_Nfft/ 4;     % frames advance by a quarter of their length
44 |     startd = startd+ Align_Nfft/ 4;
45 | 
46 | end
47 | 
48 | X1= H;
49 | X2= 0;                                  % rebuilt below as the smoothing kernel
50 | Hsum = sum( H);
51 | 
52 | X2(1) = 1.0;
53 | kernel = Align_Nfft / 64;
54 | 
55 | for count= 2: kernel                    % symmetric linear taper around lag 0
56 |     X2( count)= 1- (count- 1)/ kernel;
57 |     X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
58 | end
59 |     
60 | X1_fft= fft( X1, Align_Nfft );
61 | X2_fft= fft( X2, Align_Nfft );
62 | 
63 | X1= ifft( X1_fft.* X2_fft, Align_Nfft );    % circular convolution of H with the kernel
64 | 
65 | if (Hsum> 0)
66 |     H= abs( X1)/ Hsum;
67 | else
68 |     H= 0;                               % no frame was processed: zero confidence
69 | end
70 | 
71 | [v_max, I_max] = max( H);
72 | if( I_max- 1 >= (Align_Nfft/2) )        % upper half of the FFT lags are negative lags
73 |     I_max = I_max- Align_Nfft;
74 | end
75 | 
76 | Utt_Delay(Utt_id) = estdelay + I_max- 1;
77 | Utt_DelayConf(Utt_id) = v_max; % confidence


--------------------------------------------------------------------------------
/PESQ/utterance_locate.m:
--------------------------------------------------------------------------------
 1 | function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
 2 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
 3 | % UTTERANCE_LOCATE  Drive the utterance alignment steps in a fixed order.
 4 | % The function has no outputs; the helpers it calls presumably exchange
 5 | % their results through globals such as the ones declared below.
 6 | 
 7 | global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst
 8 | 
 9 | % 1) search windows, from the two VAD inputs
10 | id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);
11 | 
12 | % 2) per utterance: crude_align on the log-VAD inputs, then time_align
13 | for utt = 1: Nutterances
14 |     crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, utt);
15 |     time_align( ref_data, ref_Nsamples, deg_data, deg_Nsamples, utt);
16 | end
17 | 
18 | % 3) utterance identification, then 4) utterance splitting
19 | id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);
20 | utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
21 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/PESQ/utterance_split.m:
--------------------------------------------------------------------------------
  1 | function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
  2 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
  3 | % Try to split each utterance in two, keeping a split only if both halves have a higher delay confidence than the whole.
  4 | global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
  5 | global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
  6 | global Utt_Start Utt_End Largest_uttsize UttSearch_End
  7 | global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP
  8 | % Best_* are only read in this function; presumably split_align assigns them -- TODO confirm.
  9 | Utt_id = 1;
 10 | while( (Utt_id <= Nutterances) && (Nutterances <= MAXNUTTERANCES) ) % Nutterances may grow inside the loop
 11 |     Utt_DelayEst_l = Utt_DelayEst(Utt_id);
 12 |     Utt_Delay_l = Utt_Delay(Utt_id);
 13 |     Utt_DelayConf_l = Utt_DelayConf(Utt_id);
 14 |     Utt_Start_l = Utt_Start(Utt_id);
 15 |     Utt_End_l = Utt_End(Utt_id);
 16 |     % Skip leading entries where ref_VAD is not positive.
 17 |     Utt_SpeechStart = Utt_Start_l;
 18 | %     fprintf( 'SpeechStart is %d\n', Utt_SpeechStart);
 19 |     while( (Utt_SpeechStart < Utt_End_l) && ...
 20 |             (ref_VAD(Utt_SpeechStart)<= 0.0) )
 21 |         Utt_SpeechStart = Utt_SpeechStart + 1;
 22 |     end %find the SpeechStart for each utterance
 23 |     Utt_SpeechEnd = Utt_End_l;
 24 | %     fprintf( 'SpeechEnd is %d\n', Utt_SpeechEnd);
 25 |     while( (Utt_SpeechEnd > Utt_Start_l) && ...
 26 |             (ref_VAD(Utt_SpeechEnd) <= 0))
 27 |         Utt_SpeechEnd = Utt_SpeechEnd- 1;
 28 |     end
 29 |     Utt_SpeechEnd = Utt_SpeechEnd+ 1;    
 30 |     %find SpeechEnd for each utterance (index after the last positive ref_VAD entry, when one exists)
 31 |     Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
 32 |     % Only utterances whose speech part spans at least 200 ref_VAD entries are candidates.
 33 | %     fprintf( 'Utt_Len is %d\n', Utt_Len);
 34 |     
 35 |     if( Utt_Len >= 200 )
 36 |         split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
 37 |             deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
 38 |             Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
 39 |             Utt_DelayEst_l, Utt_DelayConf_l);
 40 | %         fprintf( '\nBest_ED1, Best_D1, Best_DC1 is %d, %d, %f\n',...
 41 | % 				Best_ED1, Best_D1, Best_DC1);
 42 | %         fprintf( 'Best_ED2, Best_D2, Best_DC2 is %d, %d, %f\n',...
 43 | % 				Best_ED2, Best_D2, Best_DC2);
 44 | %         fprintf( 'Best_BP is %d\n', Best_BP);
 45 |         % Keep the split only if both halves beat the confidence of the unsplit utterance.
 46 |         if( (Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l) )
 47 |             for step = Nutterances: -1: Utt_id+ 1 % shift later entries up one slot
 48 |                 Utt_DelayEst(step+ 1) = Utt_DelayEst(step);
 49 |                 Utt_Delay(step+ 1) = Utt_Delay(step);
 50 |                 Utt_DelayConf(step+ 1) = Utt_DelayConf(step);
 51 |                 Utt_Start(step+ 1) = Utt_Start(step);
 52 |                 Utt_End(step+ 1) = Utt_End(step);
 53 |                 UttSearch_Start(step+ 1) = Utt_Start( step); % note: copied from Utt_Start, not UttSearch_Start
 54 |                 UttSearch_End(step+ 1) = Utt_End( step); % note: copied from Utt_End, not UttSearch_End
 55 |             end
 56 | 
 57 |             Nutterances = Nutterances+ 1;
 58 |             % Slot Utt_id receives the first half, slot Utt_id+1 the second.
 59 |             Utt_DelayEst(Utt_id) = Best_ED1;
 60 |             Utt_Delay(Utt_id) = Best_D1;
 61 |             Utt_DelayConf(Utt_id) = Best_DC1;
 62 |             
 63 |             Utt_DelayEst(Utt_id +1) = Best_ED2;
 64 |             Utt_Delay(Utt_id +1) = Best_D2;
 65 |             Utt_DelayConf(Utt_id +1) = Best_DC2;
 66 |             % Both halves keep the search window of the original utterance.
 67 |             UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
 68 |             UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id);
 69 |             if( Best_D2 < Best_D1 ) % second half has the smaller delay: cut exactly at Best_BP
 70 |                 Utt_Start(Utt_id) = Utt_Start_l;
 71 |                 Utt_End(Utt_id) = Best_BP;
 72 |                 Utt_Start(Utt_id +1) = Best_BP;
 73 |                 Utt_End(Utt_id +1) = Utt_End_l;
 74 |             else % otherwise extend each half past Best_BP by floor((Best_D2-Best_D1)/(2*Downsample))
 75 |                 Utt_Start( Utt_id) = Utt_Start_l;
 76 |                 Utt_End( Utt_id) = Best_BP + ...
 77 |                     floor( (Best_D2- Best_D1)/ (2 * Downsample));
 78 |                 Utt_Start( Utt_id +1) = Best_BP - ...
 79 |                     floor( (Best_D2- Best_D1)/ (2 * Downsample));
 80 |                 Utt_End( Utt_id +1) = Utt_End_l;
 81 |             end
 82 |             % Move the first half's start forward if its delay-shifted sample position would be negative.
 83 |             if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
 84 |                     Best_D1 < 0 )
 85 |                 Utt_Start(Utt_id) = SEARCHBUFFER+ 1+  ...
 86 |                     floor( (Downsample - 1 - Best_D1) / Downsample);
 87 |             end
 88 |             % Pull the second half's end back if its delay-shifted position passes deg_Nsamples minus the search buffer.
 89 |             if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
 90 |                     (deg_Nsamples - SEARCHBUFFER * Downsample) )
 91 |                 Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
 92 |                     / Downsample)- SEARCHBUFFER+ 1;
 93 |             end
 94 |         else
 95 |             Utt_id= Utt_id+ 1; % no split: next utterance (after a split Utt_id is unchanged, so the first half is examined again)
 96 |         end
 97 |     else
 98 |         Utt_id = Utt_id+ 1; % too short to split
 99 |     end
100 | end
101 | 
102 | Largest_uttsize = max( Utt_End- Utt_Start); % taken over the full arrays, not only 1:Nutterances
103 | 
104 | % fid= fopen( 'uttinfo_mat.txt', 'wt');
105 | % fprintf( fid, 'Number of Utterances is:\n');
106 | % fprintf( fid, '%d\n', Nutterances);
107 | % fprintf( fid, 'Utterance Delay Estimation:\n');
108 | % fprintf( fid, '%d\n', Utt_DelayEst( 1: Nutterances) );
109 | % fprintf( fid, 'Utterance Delay:\n');
110 | % fprintf( fid, '%d\n', Utt_Delay( 1: Nutterances));
111 | % fprintf( fid, 'Utterance Delay Confidence:\n');
112 | % fprintf( fid, '%f\n', Utt_DelayConf( 1: Nutterances));
113 | % fprintf( fid, 'Utterance Start:\n');
114 | % fprintf( fid, '%d\n', Utt_Start( 1: Nutterances));
115 | % fprintf( fid, 'Utterance End:\n');
116 | % fprintf( fid, '%d\n', Utt_End( 1: Nutterances));
117 | % fprintf( fid, 'Largest utterance length:\n');
118 | % fprintf( fid, '%d\n', Largest_uttsize);
119 | % fclose( fid);
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Speech-measure-SDR-SAR-STOI-PESQ
 2 | Speech quality measures (SDR, SAR, STOI, ESTOI, PESQ) implemented in MATLAB.
 3 | 
 4 | See the sample files in the `rusult/` folder for reference.
 5 | Author: dakenan1 (262589340@qq.com)
 6 | 
 7 | References:
 8 | * __SDR/SAR/SIR__
 9 |     * Toolbox: [BSS Eval](http://bass-db.gforge.inria.fr/bss_eval/), [The PEASS Toolkit](http://bass-db.gforge.inria.fr/peass/), [craffel/mir_eval/separation.py](https://github.com/craffel/mir_eval/blob/master/mir_eval/separation.py)
10 |     * Paper: [Performance measurement in blind audio source separation](https://ieeexplore.ieee.org/document/1643671/)
11 | * __STOI__
12 |     * Toolbox: [stoi.zip](http://insy.ewi.tudelft.nl/content/short-time-objective-intelligibility-measure)+[actuallyaswin/stoi](https://github.com/actuallyaswin/stoi), [mpariente/pystoi](https://github.com/mpariente/pystoi)
13 |     * Paper: [A short-time objective intelligibility measure for time-frequency weighted noisy speech](https://ieeexplore.ieee.org/document/5495701/)
14 | * __ESTOI__
15 |     * Toolbox: [estoi.m](http://kom.aau.dk/~jje/code/estoi.m)
16 |     * Paper: [An Algorithm for Predicting the Intelligibility of Speech Masked by Modulated Noise Maskers](https://ieeexplore.ieee.org/document/7539284/)
17 | * __PESQ__
18 |     * Toolbox: [pesq.m](https://github.com/JacobD10/SoundZone_Tools/blob/master/pesq2.m), [MATLAB software-composite](http://ecs.utdallas.edu/loizou/speech/software.htm)
19 |     * Paper: [Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs](https://ieeexplore.ieee.org/document/941023/)
20 | 


--------------------------------------------------------------------------------
/bss_eval_sources.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/bss_eval_sources.m


--------------------------------------------------------------------------------
/estoi.m:
--------------------------------------------------------------------------------
  1 | function d = estoi(x, y, fs_signal)
  2 | %   d = estoi(x, y, fs_signal) returns the output of the extended short-time
  3 | %   objective intelligibility (ESTOI) predictor.
  4 | %  
  5 | % Implementation of the Extended Short-Time Objective
  6 | % Intelligibility (ESTOI) predictor, described in Jesper Jensen and
  7 | % Cees H. Taal, "An Algorithm for Predicting the Intelligibility of
  8 | % Speech Masked by Modulated Noise Maskers," IEEE Transactions on
  9 | % Audio, Speech and Language Processing, 2016.
 10 | %
 11 | % Input:
 12 | %        x:         clean reference time domain signal
 13 | %        y:         noisy/processed time domain signal
 14 | %        fs_signal: sampling rate [Hz]
 15 | %
 16 | % Output:
 17 | %        d: intelligibility index
 18 | %
 19 | %
 20 | % Copyright 2016: Aalborg University, Section for Signal and Information Processing. 
 21 | % The software is free for non-commercial use. 
 22 | % The software comes WITHOUT ANY WARRANTY.
 23 | 
 24 | % Inputs of unequal length are rejected; callers must trim the signals first.
 25 | if length(x)~=length(y)
 26 |   error('x and y should have the same length');
 27 | end
 28 | 
 29 | % initialization
 30 | x               = x(:);                   % clean speech column vector
 31 | y               = y(:);                   % processed speech column vector
 32 | 
 33 | fs              = 10000;                  % sample rate of proposed intelligibility measure
 34 | N_frame         = 256;                    % window support
 35 | K               = 512;                    % FFT size
 36 | J               = 15;                     % Number of 1/3 octave bands
 37 | mn              = 150;                    % Center frequency of first 1/3 octave band in Hz.
 38 | [H,fc_thirdoct] = thirdoct(fs, K, J, mn); % Get 1/3 octave band matrix (fc_thirdoct is not used below)
 39 | N               = 30;                     % Number of frames for intermediate intelligibility measure
 40 | dyn_range       = 40;                     % speech dynamic range
 41 | 
 42 | % resample signals if other samplerate is used than fs
 43 | if fs_signal ~= fs
 44 |   x	= resample(x, fs, fs_signal);
 45 |   y 	= resample(y, fs, fs_signal);
 46 | end
 47 | 
 48 | % remove silent frames
 49 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
 50 | 
 51 | % apply 1/3 octave band TF-decomposition
 52 | x_hat     	= stdft(x, N_frame, N_frame/2, K); % apply short-time DFT to clean speech
 53 | y_hat     	= stdft(y, N_frame, N_frame/2, K); % apply short-time DFT to processed speech
 54 | 
 55 | % after the transpose below: rows are DFT bins, columns are frames
 56 | x_hat       = x_hat(:, 1:(K/2+1)).'; % take clean single-sided spectrum
 57 | y_hat       = y_hat(:, 1:(K/2+1)).'; % take processed single-sided spectrum
 58 | 
 59 | X           = zeros(J, size(x_hat, 2)); % init memory for clean speech 1/3 octave band TF-representation
 60 | Y           = zeros(J, size(y_hat, 2)); % init memory for processed speech 1/3 octave band TF-representation
 61 | 
 62 | for i = 1:size(x_hat, 2)
 63 |   X(:, i)	= sqrt(H*abs(x_hat(:, i)).^2); % apply 1/3 octave band filtering
 64 |   Y(:, i)	= sqrt(H*abs(y_hat(:, i)).^2);
 65 | end
 66 | 
 67 | % loop all segments of length N and obtain intermediate intelligibility measure for each
 68 | d1 = zeros(length(N:size(X, 2)),1); % init memory for intermediate intelligibility measure
 69 | for m=N:size(X,2)
 70 |     X_seg  	= X(:, (m-N+1):m); % region of length N with clean TF-units for all j
 71 |     Y_seg  	= Y(:, (m-N+1):m); % region of length N with processed TF-units for all j
 72 |     X_seg = X_seg + eps*randn(size(X_seg)); % to avoid divide by zero
 73 |     Y_seg = Y_seg + eps*randn(size(Y_seg)); % to avoid divide by zero
 74 |     % NOTE: the randn terms make d depend on the random generator state (at eps scale)
 75 |     %% first normalize rows (to give \bar{S}_m)
 76 |     XX = X_seg - mean(X_seg.').'*ones(1,N); % normalize rows to zero mean
 77 |     YY = Y_seg - mean(Y_seg.').'*ones(1,N); % normalize rows to zero mean
 78 |     
 79 |     YY = diag(1./sqrt(diag(YY*YY')))*YY; % normalize rows to unit length
 80 |     XX = diag(1./sqrt(diag(XX*XX')))*XX; % normalize rows to unit length
 81 | 
 82 |     XX = XX + eps*randn(size(XX)); % to avoid corr.div.by.0
 83 |     YY = YY + eps*randn(size(YY)); % to avoid corr.div.by.0
 84 | 
 85 |     %% then normalize columns (to give \check{S}_m)
 86 |     YYY = YY - ones(J,1)*mean(YY); % normalize cols to zero mean
 87 |     XXX = XX - ones(J,1)*mean(XX); % normalize cols to zero mean
 88 | 
 89 |     YYY = YYY*diag(1./sqrt(diag(YYY'*YYY))); % normalize cols to unit length
 90 |     XXX = XXX*diag(1./sqrt(diag(XXX'*XXX))); % normalize cols to unit length
 91 | 
 92 |     %compute average of col.correlations (by stacking cols)
 93 |     d1(m-N+1) = 1/N*XXX(:).'*YYY(:);
 94 | end
 95 | d = mean(d1); % average over all segments (d1 is empty when fewer than N frames remain)
 96 | 
 97 | 
 98 | %%
 99 | function  [A cf] = thirdoct(fs, N_fft, numBands, mn)
100 | %   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns a 0/1 matrix that
101 | %   sums single-sided DFT bins into 1/3 octave bands.
102 | %   inputs:
103 | %       FS:         samplerate
104 | %       N_FFT:      FFT size
105 | %       NUMBANDS:   number of bands requested
106 | %       MN:         center frequency of first 1/3 octave band
107 | %   outputs:
108 | %       A:          band matrix, one row per kept band, one column per bin
109 | %       CF:         center frequencies of the kept bands
110 | 
111 | bin_hz      = linspace(0, fs, N_fft+1);
112 | bin_hz      = bin_hz(1:(N_fft/2+1));                % bins from 0 up to fs/2
113 | band        = 0:(numBands-1);
114 | cf          = 2.^(band/3)*mn;
115 | lo_hz       = sqrt(cf.*2.^((band-1)/3)*mn);         % lower band edges
116 | hi_hz       = sqrt(cf.*2.^((band+1)/3)*mn);         % upper band edges
117 | A           = zeros(numBands, length(bin_hz));
118 | 
119 | for row = 1:numel(cf)
120 |   % snap both edges to the nearest DFT bin; the upper edge bin is excluded
121 |   [~, lo_bin]                 = min((bin_hz-lo_hz(row)).^2);
122 |   [~, hi_bin]                 = min((bin_hz-hi_hz(row)).^2);
123 |   A(row, lo_bin:(hi_bin-1))   = 1;
124 | end
125 | 
126 | % keep the bands up to the last one that is non-empty and at least as
127 | % wide (in bins) as the band before it
128 | width       = sum(A, 2);
129 | grows       = width(2:end) >= width(1:(end-1));
130 | numBands    = find(grows & (width(2:end) ~= 0), 1, 'last')+1;
131 | A           = A(1:numBands, :);
132 | cf          = cf(1:numBands);
133 | 
134 | %%
135 | function x_stdft = stdft(x, N, K, N_fft)
136 | %   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
137 | %   hanning-windowed DFT of X. Frames are N samples long, consecutive
138 | %   frames start K samples apart, and each is transformed with an
139 | %   N_FFT-point FFT. Row i of X_STDFT is frame i; columns are DFT bins.
140 | 
141 | starts      = 1:K:(length(x)-N);            % first sample of every frame
142 | n_frames    = length(starts);
143 | x_stdft     = zeros(n_frames, N_fft);
144 | 
145 | x           = x(:);
146 | win         = hanning(N);
147 | for f = 1:n_frames
148 |   seg             = x(starts(f):(starts(f)+N-1));
149 |   x_stdft(f, :)   = fft(seg.*win, N_fft);
150 | end
151 | 
152 | %%
153 | function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
154 | %   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y are cut
155 | %   into hanning-windowed frames of length N that start K samples apart.
156 | %   With X_MAX the largest frame energy of X in dB, X_SIL and Y_SIL are
157 | %   overlap-added from those frames of X and Y whose X energy is above
158 | %   X_MAX-RANGE. When no frame qualifies (X shorter than N+1 samples, or
159 | %   e.g. an all-zero X) both outputs are empty 0-by-1 vectors, not an error.
160 | x_sil   = zeros(0, 1);                      % returned as-is if nothing is kept
161 | y_sil   = zeros(0, 1);
162 | x       = x(:);
163 | y       = y(:);
164 | frames  = 1:K:(length(x)-N);
165 | if isempty(frames), return; end             % too short for a single frame
166 | w       = hanning(N);
167 | msk     = zeros(size(frames));
168 | for j = 1:length(frames)
169 |   jj      = frames(j):(frames(j)+N-1);
170 |   msk(j) 	= 20*log10(norm(x(jj).*w)./sqrt(N));
171 | end
172 | 
173 | msk     = (msk-max(msk)+range)>0;           % -Inf-(-Inf)=NaN compares false for all-zero X
174 | if ~any(msk), return; end                   % jj_o below would otherwise be undefined
175 | 
176 | count   = 1;
177 | x_sil   = zeros(size(x));
178 | y_sil   = zeros(size(y));
179 | for j = 1:length(frames)
180 |   if msk(j)
181 |     jj_i            = frames(j):(frames(j)+N-1);            % source frame
182 |     jj_o            = frames(count):(frames(count)+N-1);    % next output slot
183 |     x_sil(jj_o)     = x_sil(jj_o) + x(jj_i).*w;
184 |     y_sil(jj_o)  	= y_sil(jj_o) + y(jj_i).*w;
185 |     count           = count+1;
186 |   end
187 | end
188 | x_sil = x_sil(1:jj_o(end));                 % cut after the last written sample
189 | y_sil = y_sil(1:jj_o(end));
190 | 


--------------------------------------------------------------------------------
/evaluate_2speaker_ori.m:
--------------------------------------------------------------------------------
  1 | %====================================================================================
  2 | %               Performance Measurement in Multi-speaker Separation
  3 | %                   Author: Chao Peng, EECS, Peking University
  4 | %            Github: https://github.com/pchao6/LSTM_PIT_Speech_Separation
  5 | %                            Revision 1.0, June 2018
  6 | %====================================================================================
  7 | tic
  8 | addpath('/usr/local/MATLAB/R2016b/toolbox/voicebox'); 
  9 | addpath('PESQ');  %PESQ Toolbox According to ITU-T P.862;
 10 | % Scores the unprocessed mixture against each clean speaker (SDR/SIR/SAR, STOI, ESTOI, PESQ).
 11 | sample_rate = 8000;
 12 | tt_wav_dir = '/home/wuxc/BLSTM-PIT-BSS/LSTM_PIT_Speech_Separation-master/Dataset/WSJ0-mix/mix/data/2speakers_0dB/wav8k/min/tt/';
 13 | model_name = '2speakers_0dB_original';
 14 | mix_wav_dir = [tt_wav_dir '/mix/'];
 15 | spk1_dir = [tt_wav_dir, '/s1/'];
 16 | spk2_dir = [tt_wav_dir, '/s2/'];
 17 | 
 18 | lists = dir(spk2_dir);  %3002*1 struct
 19 | len = length(lists) - 2;  %3000 (assumes the first two dir() entries are '.' and '..')
 20 | SDR =  zeros(len, 2);
 21 | SIR = SDR;
 22 | SAR = SDR;
 23 | STOI = SDR;
 24 | ESTOI = SDR;
 25 | PESQ = SDR;
 26 | error_num_STOI = 0;
 27 | error_num_ESTOI = 0;
 28 | error_num_PESQ = 0;
 29 | 
 30 | 
 31 | for i = 3:len+2 
 32 |     name = lists(i).name;
 33 |     part_name = name(1:end-4);
 34 |     fprintf('Computing Audio:%s, Number:%d ...\n', [part_name '.wav'], i-2)
 35 |     % the single mixture is duplicated so that both speaker columns are scored against it
 36 |     mix_wav1 = audioread([mix_wav_dir part_name '.wav']);  %35328*1 double
 37 |     mix_wav = [mix_wav1, mix_wav1];  %35328*2 double
 38 |     
 39 |     ori_wav1 = audioread([spk1_dir part_name '.wav']);  %35269*1 double
 40 |     ori_wav2 = audioread([spk2_dir part_name '.wav']);  %35269*1 double
 41 |     ori_wav = [ori_wav1, ori_wav2];  %35269*2 double
 42 |     
 43 |     min_len = min(size(ori_wav, 1), size(mix_wav, 1));  %35269
 44 |     mix_wav = mix_wav(1:min_len, :);  %35269*2 double
 45 |     ori_wav = ori_wav(1:min_len, :);  %35269*2 double
 46 |     [SDR(i-2, :),SIR(i-2, :),SAR(i-2, :),perm]=bss_eval_sources(mix_wav', ori_wav');
 47 | 
 48 |     x1 = stoi(ori_wav(:,1), mix_wav(:,1), sample_rate);
 49 |     x2 = stoi(ori_wav(:,2), mix_wav(:,2), sample_rate);
 50 |     if ~isnan(x1) & ~isnan(x2)
 51 |     %if x1 ~= NaN & x2 ~= NaN
 52 |         STOI(i-2, 1) = x1;
 53 |         STOI(i-2, 2) = x2;
 54 |     else
 55 |     	STOI(i-2, 1) = 0;
 56 |         STOI(i-2, 2) = 0;
 57 |     	error_num_STOI = error_num_STOI + 1;
 58 |         fprintf('STOI NaN happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
 59 |     end
 60 |     
 61 |     e1 = estoi(ori_wav(:,1), mix_wav(:,1), sample_rate);
 62 |     e2 = estoi(ori_wav(:,2), mix_wav(:,2), sample_rate);
 63 |     if ~isnan(e1) & ~isnan(e2)
 64 |     % check the ESTOI scores themselves (e1/e2); a failure must not touch the STOI rows
 65 |         ESTOI(i-2, 1) = e1;
 66 |         ESTOI(i-2, 2) = e2;
 67 |     else
 68 |     	ESTOI(i-2, 1) = 0;
 69 |         ESTOI(i-2, 2) = 0;
 70 |     	error_num_ESTOI = error_num_ESTOI + 1;
 71 |         fprintf('ESTOI NaN happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
 72 |     end
 73 | 
 74 |     try
 75 |         PESQ(i-2, 1) = pesq([spk1_dir part_name '.wav'], [mix_wav_dir part_name '.wav']);
 76 |         PESQ(i-2, 2) = pesq([spk2_dir part_name '.wav'], [mix_wav_dir part_name '.wav']);
 77 |     catch ErrorInfo
 78 |         PESQ(i-2, 1) = 0;
 79 |         PESQ(i-2, 2) = 0;
 80 |         disp(ErrorInfo)
 81 |         error_num_PESQ = error_num_PESQ + 1;
 82 |         fprintf('PESQ Error happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
 83 |     end
 84 | end
 85 | 
 86 | if ~exist('matfiles', 'dir'), mkdir('matfiles'); end  % save() below writes into this folder
 87 | fprintf('Model Name: %s.\n', model_name)
 88 | fprintf('The mean SDR is %f.\n', mean(mean(SDR)))
 89 | fprintf('The mean SAR is %f.\n', mean(mean(SAR)))
 90 | fprintf('The mean SIR is %f.\n', mean(mean(SIR)))
 91 | fprintf('Mean STOI is %f.\n', mean(sum(STOI)/(len - error_num_STOI)))
 92 | fprintf('Mean ESTOI is %f.\n', mean(sum(ESTOI)/(len - error_num_ESTOI)))
 93 | fprintf('Mean PESQ is %f.\n', mean(sum(PESQ)/(len - error_num_PESQ)))
 94 | save(['matfiles/evaluate_' model_name], 'SDR', 'SAR', 'SIR', 'STOI', 'ESTOI', 'PESQ', 'lists');
 95 | 
 96 | time_length = toc;
 97 | hour = floor(time_length/3600);
 98 | remaining = mod(time_length, 3600);
 99 | minute = floor(remaining/60);
100 | second = mod(remaining, 60);
101 | fprintf('\nElapsed time is %d hour(s), %d minute(s), %d second(s).\n', hour, minute, floor(second))
102 | 


--------------------------------------------------------------------------------
/evaluate_2speaker_separated.m:
--------------------------------------------------------------------------------
 1 | %====================================================================================
 2 | %               Performance Measurement in Multi-speaker Separation
 3 | %                   Author: Chao Peng, EECS, Peking University
 4 | %            Github: https://github.com/pchao6/LSTM_PIT_Speech_Separation
 5 | %                            Revision 1.0, June 2018
 6 | %====================================================================================
 7 | tic
 8 | addpath('/usr/local/MATLAB/R2016b/toolbox/voicebox');
 9 | addpath('PESQ');  %PESQ Toolbox According to ITU-T P.862;
10 | % Scores the separated outputs <name>_1.wav / <name>_2.wav against the clean speakers.
11 | sample_rate = 8000;
12 | tt_wav_dir = 'SpeechSeparation/mix/data/2speakers_0dB/wav8k/min/tt';
13 | model_name = 'PIT_BLSTM_3_496_2speaker_8KHz_0dB';
14 | rec_wav_dir = ['SpeechSeparation/separated/' model_name '/'];
15 | 
16 | spk1_dir = [tt_wav_dir, '/s1/'];
17 | spk2_dir = [tt_wav_dir, '/s2/'];
18 | 
19 | lists = dir(spk2_dir);
20 | len = length(lists) - 2;  % assumes the first two dir() entries are '.' and '..'
21 | SDR =  zeros(len, 2);
22 | SIR = SDR;
23 | SAR = SDR;
24 | STOI = SDR;
25 | ESTOI = SDR;
26 | PESQ = SDR;
27 | error_num_STOI = 0;
28 | error_num_ESTOI = 0;
29 | error_num_PESQ = 0;
30 | 
31 | fprintf('Model Name: %s.\n', model_name)
32 | for i = 3:len+2 
33 |     name = lists(i).name; 
34 |     part_name = name(1:end-4);
35 |     fprintf('Computing Audio:%s, Number:%d ...\n', [part_name '.wav'], i-2)
36 | 
37 |     rec_wav1 = audioread([rec_wav_dir part_name '_1.wav']);
38 |     rec_wav2 = audioread([rec_wav_dir part_name '_2.wav']);
39 |     rec_wav = [rec_wav1, rec_wav2];
40 |     
41 |     ori_wav1 = audioread([spk1_dir part_name '.wav']);
42 |     ori_wav2 = audioread([spk2_dir part_name '.wav']);
43 |     ori_wav = [ori_wav1, ori_wav2];  %35269*2 double
44 |     
45 |     min_len = min(size(ori_wav, 1), size(rec_wav, 1));  %35269
46 |     rec_wav = rec_wav(1:min_len, :);
47 |     ori_wav = ori_wav(1:min_len, :);
48 |     [SDR(i-2, :),SIR(i-2, :),SAR(i-2, :),perm]=bss_eval_sources(rec_wav', ori_wav');
49 |     % note: STOI/ESTOI/PESQ below pair output k with speaker k; perm is not applied
50 |     x1 = stoi(ori_wav(:,1), rec_wav(:,1), sample_rate);
51 |     x2 = stoi(ori_wav(:,2), rec_wav(:,2), sample_rate);
52 |     if ~isnan(x1) & ~isnan(x2)
53 |         STOI(i-2, 1) = x1;
54 |         STOI(i-2, 2) = x2;
55 |     else
56 |     	STOI(i-2, 1) = 0;
57 |         STOI(i-2, 2) = 0;
58 |     	error_num_STOI = error_num_STOI + 1;
59 |     end
60 |     
61 |     e1 = estoi(ori_wav(:,1), rec_wav(:,1), sample_rate);
62 |     e2 = estoi(ori_wav(:,2), rec_wav(:,2), sample_rate);
63 |     if ~isnan(e1) & ~isnan(e2)  % check the ESTOI scores themselves, not the STOI ones
64 |         ESTOI(i-2, 1) = e1;
65 |         ESTOI(i-2, 2) = e2;
66 |     else
67 |     	ESTOI(i-2, 1) = 0;      % a failed ESTOI must not overwrite the STOI rows
68 |         ESTOI(i-2, 2) = 0;
69 |     	error_num_ESTOI = error_num_ESTOI + 1;
70 |     end
71 | 
72 |     try
73 |         PESQ(i-2, 1) = pesq([spk1_dir part_name '.wav'], [rec_wav_dir part_name '_1.wav']);
74 |         PESQ(i-2, 2) = pesq([spk2_dir part_name '.wav'], [rec_wav_dir part_name '_2.wav']);
75 |     catch ErrorInfo
76 |         PESQ(i-2, 1) = 0;
77 |         PESQ(i-2, 2) = 0;
78 |         disp(ErrorInfo)
79 |         error_num_PESQ = error_num_PESQ + 1;
80 |     end
81 | end
82 | if ~exist('matfiles', 'dir'), mkdir('matfiles'); end  % save() below writes into this folder
83 | fprintf('The mean SDR is %f.\n', mean(mean(SDR)))
84 | fprintf('The mean SAR is %f.\n', mean(mean(SAR)))
85 | fprintf('The mean SIR is %f.\n', mean(mean(SIR)))
86 | fprintf('The mean STOI is %f.\n', mean(sum(STOI)/(len - error_num_STOI)))
87 | fprintf('The mean ESTOI is %f.\n', mean(sum(ESTOI)/(len - error_num_ESTOI)))
88 | fprintf('The mean PESQ is %f.\n', mean(sum(PESQ)/(len - error_num_PESQ)))
89 | save(['matfiles/evaluate_' model_name], 'SDR', 'SAR', 'SIR', 'STOI', 'ESTOI', 'PESQ', 'lists');
90 | 
91 | time_length = toc;
92 | hour = floor(time_length/3600);
93 | remaining = mod(time_length, 3600);
94 | minute = floor(remaining/60);
95 | second = mod(remaining, 60);
96 | fprintf('\nElapsed time is %d hour(s), %d minute(s), %d second(s).\n', hour, minute, floor(second))


--------------------------------------------------------------------------------
/pesq.m:
--------------------------------------------------------------------------------
  1 | function [pesq_mos]= pesq(ref_wav, deg_wav)
  2 | 
  3 | % ----------------------------------------------------------------------
  4 | %            PESQ objective speech quality measure
  5 | %
  6 | %   This function implements the PESQ measure based on the ITU standard
  7 | %   P.862 [1].
  8 | %
  9 | %
 10 | %   Usage:  pval=pesq(cleanFile.wav, enhancedFile.wav)
 11 | %           
 12 | %         cleanFile.wav - clean input file in .wav format
 13 | %         enhancedFile  - enhanced output file in .wav format
 14 | %         pval          - PESQ value
 15 | %
 16 | %    Note that the PESQ routine only supports sampling rates of 8 kHz and
 17 | %    16 kHz [1]
 18 | %
 19 | %  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
 20 | %
 21 | %  
 22 | %  References:
 23 | %   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and 
 24 | %       objective method for end-to-end speech quality assessment of 
 25 | %       narrowband telephone networks and speech codecs. ITU-T
 26 | %       Recommendation P. 862   
 27 | %
 28 | %   Authors: Yi Hu and Philipos C. Loizou 
 29 | %
 30 | %
 31 | % Copyright (c) 2006 by Philipos C. Loizou
 32 | % $Revision: 0.0 $  $Date: 10/09/2006 $
 33 | % ----------------------------------------------------------------------
 34 | if nargin<2
 35 |     fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
 36 |     return;
 37 | end;
 38 | 
 39 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
 40 | global Align_Nfft Window 
 41 | 
 42 | [ref_data,sampling_rate]= audioread( ref_wav);
 43 | if sampling_rate~=8000 && sampling_rate~=16000
 44 |     error('Sampling frequency needs to be either 8000 or 16000 Hz');
 45 | end
 46 | 
 47 | setup_global( sampling_rate);
 48 | 
 49 | % Window= hann( Align_Nfft, 'periodic'); %Hanning window
 50 | % Window= Window'; 
 51 | TWOPI= 6.28318530717959;
 52 | %for count = 0: Align_Nfft- 1
 53 | %    Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
 54 | %end
 55 | 
 56 | count=0:Align_Nfft- 1;
 57 | Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
 58 |   
 59 | 
 60 | 
 61 | ref_data= ref_data';
 62 | ref_data= ref_data* 32768;
 63 | ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
 64 | ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
 65 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
 66 | 
 67 | [deg_data,~]= audioread( deg_wav);
 68 | deg_data= deg_data';
 69 | deg_data= deg_data* 32768;
 70 | deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
 71 | deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
 72 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
 73 | 
 74 | maxNsamples= max( ref_Nsamples, deg_Nsamples);
 75 | 
 76 | ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
 77 | deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
 78 | 
 79 | standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...    
 80 |     250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
 81 |     1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
 82 |     3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200]; 
 83 | 
 84 | ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
 85 | deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
 86 | % 
 87 | 
 88 | 
 89 | 
 90 | % for later use in psychoacoustical model
 91 | model_ref= ref_data;
 92 | model_deg= deg_data;
 93 | 
 94 | [ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
 95 |     deg_Nsamples);
 96 | 
 97 | 
 98 | [ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
 99 | [deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
100 | 
101 | 
102 | crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
103 |     WHOLE_SIGNAL);
104 | 
105 | utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
106 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
107 | 
108 | ref_data= model_ref;
109 | deg_data= model_deg;
110 | 
111 | % make ref_data and deg_data equal length
112 | if (ref_Nsamples< deg_Nsamples)
113 |     newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
114 |     ref_data( newlen)= 0;
115 | elseif (ref_Nsamples> deg_Nsamples)
116 |     newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
117 |     deg_data( newlen)= 0;
118 | end
119 | 
120 | 
121 | pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
122 |     deg_Nsamples );
123 | 
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/rusult/050a0501_1.7783_442o030z_-1.7783_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0501_1.7783_442o030z_-1.7783_1.wav


--------------------------------------------------------------------------------
/rusult/050a0501_1.7783_442o030z_-1.7783_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0501_1.7783_442o030z_-1.7783_2.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.3461_440o030j_-1.3461_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.3461_440o030j_-1.3461_1.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.3461_440o030j_-1.3461_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.3461_440o030j_-1.3461_2.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.463_420a010o_-1.463_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.463_420a010o_-1.463_1.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.463_420a010o_-1.463_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.463_420a010o_-1.463_2.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.9707_440c020w_-1.9707_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.9707_440c020w_-1.9707_1.wav


--------------------------------------------------------------------------------
/rusult/050a0502_1.9707_440c020w_-1.9707_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.9707_440c020w_-1.9707_2.wav


--------------------------------------------------------------------------------
/stoi.m:
--------------------------------------------------------------------------------
  1 | function d = stoi(x, y, fs_signal)
  2 | %   d = stoi(x, y, fs_signal) returns the output of the short-time
  3 | %   objective intelligibility (STOI) measure described in [1, 2], where x 
  4 | %   and y denote the clean and processed speech, respectively, with sample
  5 | %   rate fs_signal in Hz. The output d is expected to have a monotonic 
  6 | %   relation with the subjective speech-intelligibility, where a higher d 
  7 | %   denotes better intelligible speech. See [1, 2] for more details.
  8 | %
  9 | %   References:
 10 | %      [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
 11 | %      Objective Intelligibility Measure for Time-Frequency Weighted Noisy
 12 | %      Speech', ICASSP 2010, Texas, Dallas.
 13 | %
 14 | %      [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for 
 15 | %      Intelligibility Prediction of Time-Frequency Weighted Noisy Speech', 
 16 | %      IEEE Transactions on Audio, Speech, and Language Processing, 2011. 
 17 | %
 18 | %
 19 | % Copyright 2009: Delft University of Technology, Signal & Information
 20 | % Processing Lab. The software is free for non-commercial use. This program
 21 | % comes WITHOUT ANY WARRANTY.
 22 | %
 23 | %
 24 | %
 25 | % Updates:
 26 | % 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'
 27 | 
 28 | if length(x)~=length(y)
 29 |     error('x and y should have the same length');
 30 | end
 31 | 
 32 | % initialization
 33 | x           = x(:);                             % clean speech column vector
 34 | y           = y(:);                             % processed speech column vector
 35 | 
 36 | fs          = 10000;                            % sample rate of proposed intelligibility measure
 37 | N_frame    	= 256;                              % window support
 38 | K           = 512;                              % FFT size
 39 | J           = 15;                               % Number of 1/3 octave bands
 40 | mn          = 150;                              % Center frequency of first 1/3 octave band in Hz.
 41 | H           = thirdoct(fs, K, J, mn);           % Get 1/3 octave band matrix
 42 | N           = 30;                               % Number of frames for intermediate intelligibility measure (Length analysis window)
 43 | Beta        = -15;                           	% lower SDR-bound
 44 | dyn_range   = 40;                               % speech dynamic range
 45 | 
 46 | % resample signals if other samplerate is used than fs
 47 | if fs_signal ~= fs
 48 |     x	= resample(x, fs, fs_signal);
 49 |     y 	= resample(y, fs, fs_signal);
 50 | end
 51 | 
 52 | % remove silent frames
 53 | [x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);
 54 | 
 55 | % apply 1/3 octave band TF-decomposition
 56 | x_hat     	= stdft(x, N_frame, N_frame/2, K); 	% apply short-time DFT to clean speech
 57 | y_hat     	= stdft(y, N_frame, N_frame/2, K); 	% apply short-time DFT to processed speech
 58 | 
 59 | x_hat       = x_hat(:, 1:(K/2+1)).';         	% take clean single-sided spectrum
 60 | y_hat       = y_hat(:, 1:(K/2+1)).';        	% take processed single-sided spectrum
 61 | 
 62 | X           = zeros(J, size(x_hat, 2));         % init memory for clean speech 1/3 octave band TF-representation 
 63 | Y           = zeros(J, size(y_hat, 2));         % init memory for processed speech 1/3 octave band TF-representation 
 64 | 
 65 | for i = 1:size(x_hat, 2)
 66 |     X(:, i)	= sqrt(H*abs(x_hat(:, i)).^2);      % apply 1/3 octave bands as described in Eq.(1) [1]
 67 |     Y(:, i)	= sqrt(H*abs(y_hat(:, i)).^2);
 68 | end
 69 | 
 70 | % loop al segments of length N and obtain intermediate intelligibility measure for all TF-regions
 71 | d_interm  	= zeros(J, length(N:size(X, 2)));                               % init memory for intermediate intelligibility measure
 72 | c           = 10^(-Beta/20);                                                % constant for clipping procedure
 73 | 
 74 | for m = N:size(X, 2)
 75 |     X_seg  	= X(:, (m-N+1):m);                                              % region with length N of clean TF-units for all j
 76 |     Y_seg  	= Y(:, (m-N+1):m);                                              % region with length N of processed TF-units for all j
 77 |     alpha   = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2));                     % obtain scale factor for normalizing processed TF-region for all j
 78 |     aY_seg 	= Y_seg.*repmat(alpha, [1 N]);                               	% obtain \alpha*Y_j(n) from Eq.(2) [1]
 79 |     for j = 1:J
 80 |       	Y_prime             = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c); % apply clipping from Eq.(3)   	
 81 |         d_interm(j, m-N+1)  = taa_corr(X_seg(j, :).', Y_prime(:));          % obtain correlation coeffecient from Eq.(4) [1]
 82 |     end
 83 | end
 84 |         
 85 | d = mean(d_interm(:));                                                      % combine all intermediate intelligibility measures as in Eq.(4) [1]
 86 | 
 87 | %%
 88 | function  [A cf] = thirdoct(fs, N_fft, numBands, mn)
 89 | %   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
 90 | %   inputs:
 91 | %       FS:         samplerate 
 92 | %       N_FFT:      FFT size
 93 | %       NUMBANDS:   number of bands
 94 | %       MN:         center frequency of first 1/3 octave band
 95 | %   outputs:
 96 | %       A:          octave band matrix
 97 | %       CF:         center frequencies
 98 | 
 99 | f               = linspace(0, fs, N_fft+1);
100 | f               = f(1:(N_fft/2+1));
101 | k               = 0:(numBands-1); 
102 | cf              = 2.^(k/3)*mn;
103 | fl              = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);
104 | fr              = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);
105 | A               = zeros(numBands, length(f));
106 | 
107 | for i = 1:(length(cf))
108 |     [a b]                   = min((f-fl(i)).^2);
109 |     fl(i)                   = f(b);
110 |     fl_ii                   = b;
111 | 
112 | 	[a b]                   = min((f-fr(i)).^2);
113 |     fr(i)                   = f(b);
114 |     fr_ii                   = b;
115 |     A(i,fl_ii:(fr_ii-1))	= 1;
116 | end
117 | 
118 | rnk         = sum(A, 2);
119 | numBands  	= find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
120 | A           = A(1:numBands, :);
121 | cf          = cf(1:numBands);
122 | 
123 | %%
function x_stdft = stdft(x, N, K, N_fft)
%   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
%   hanning-windowed DFT of X.
%   inputs:
%       X:          signal
%       N:          frame size in samples
%       K:          step in samples between the starts of successive frames
%       N_FFT:      DFT size
%   output:
%       X_STDFT:    one row per frame, one column per DFT bin

% first sample of every frame
starts      = 1:K:(length(x)-N);
num_frames  = length(starts);

win         = hanning(N);
x           = x(:);
x_stdft     = zeros(num_frames, N_fft);

for f = 1:num_frames
    segment         = x(starts(f):(starts(f)+N-1));
    x_stdft(f, :)   = fft(segment.*win, N_fft);
end
140 | 
141 | %%
function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
%   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) removes the
%   frames in which X is silent from both X and Y.
%   inputs:
%       X, Y:       signals of equal length
%       RANGE:      dynamic range in dB
%       N:          frame length in samples
%       K:          step in samples between the starts of successive frames
%   outputs:
%       X_SIL, Y_SIL: column vectors rebuilt by overlap-adding the
%                   hanning-windowed frames that were kept
%
%   The energy in dB of every hanning-windowed frame of X is computed and
%   the maximum over all frames, say X_MAX, is determined. Frames whose
%   energy does not exceed X_MAX-RANGE are dropped from both signals.
%
%   When no frame is kept (X shorter than one frame, or X all zeros) both
%   outputs are empty 0-by-1 vectors.

x       = x(:);
y       = y(:);

frames  = 1:K:(length(x)-N);

% signal too short to hold a single frame
if isempty(frames)
    x_sil = zeros(0, 1);
    y_sil = zeros(0, 1);
    return;
end

w       = hanning(N);
msk     = zeros(size(frames));

% energy in dB of every windowed frame of the clean signal
for j = 1:length(frames)
    jj      = frames(j):(frames(j)+N-1);
    msk(j) 	= 20*log10(norm(x(jj).*w)./sqrt(N));
end

% keep the frames within RANGE dB of the loudest one; for an all-zero X the
% expression is NaN and the comparison is false for every frame
msk     = (msk-max(msk)+range)>0;
count   = 1;
last    = 0;                            % last output sample written so far

x_sil   = zeros(size(x));
y_sil   = zeros(size(y));

for j = 1:length(frames)
    if msk(j)
        jj_i            = frames(j):(frames(j)+N-1);            % source position
        jj_o            = frames(count):(frames(count)+N-1);    % packed position
        x_sil(jj_o)     = x_sil(jj_o) + x(jj_i).*w;
        y_sil(jj_o)  	= y_sil(jj_o) + y(jj_i).*w;
        last            = jj_o(end);
        count           = count+1;
    end
end

% trim to the samples actually written (empty when no frame was kept)
x_sil = x_sil(1:last);
y_sil = y_sil(1:last);
179 | 
180 | %%
function rho = taa_corr(x, y)
%   RHO = TAA_CORR(X, Y) returns the correlation coefficient between the
%   column vectors X and Y: both are mean-removed and scaled to unit
%   Euclidean norm, and RHO is the sum of their element-wise product.

% remove the means
x_centered  = x-mean(x);
y_centered  = y-mean(y);

% scale to unit norm
x_unit      = x_centered/sqrt(sum(x_centered.^2));
y_unit      = y_centered/sqrt(sum(y_centered.^2));

rho         = sum(x_unit.*y_unit);


--------------------------------------------------------------------------------