├── PESQ
│   ├── DC_block.m
│   ├── FFTNXCorr.m
│   ├── apply_VAD.m
│   ├── apply_filter.m
│   ├── apply_filters.m
│   ├── crude_align.m
│   ├── fix_power_level.m
│   ├── id_searchwindows.m
│   ├── id_utterances.m
│   ├── input_filter.m
│   ├── pesq.m
│   ├── pesq_psychoacoustic_model.m
│   ├── pow_of.m
│   ├── setup_global.m
│   ├── split_align.m
│   ├── stoi.m
│   ├── time_align.m
│   ├── utterance_locate.m
│   └── utterance_split.m
├── README.md
├── bss_eval_sources.m
├── estoi.m
├── evaluate_2speaker_ori.m
├── evaluate_2speaker_separated.m
├── pesq.m
├── rusult
│   ├── 050a0501_1.7783_442o030z_-1.7783_1.wav
│   ├── 050a0501_1.7783_442o030z_-1.7783_2.wav
│   ├── 050a0502_1.3461_440o030j_-1.3461_1.wav
│   ├── 050a0502_1.3461_440o030j_-1.3461_2.wav
│   ├── 050a0502_1.463_420a010o_-1.463_1.wav
│   ├── 050a0502_1.463_420a010o_-1.463_2.wav
│   ├── 050a0502_1.9707_440c020w_-1.9707_1.wav
│   └── 050a0502_1.9707_440c020w_-1.9707_2.wav
└── stoi.m


/PESQ/DC_block.m:
--------------------------------------------------------------------------------
1 | function mod_data= DC_block( data, Nsamples)
2 | % DC_BLOCK  Subtract an offset from the active part of a padded signal and
3 | % taper both ends of that part.
4 | %
5 | %   data     - row vector of samples (it is multiplied element-wise by a
6 | %              row-vector ramp below)
7 | %   Nsamples - sample count used for the region bounds and for the mean
8 | %   mod_data - copy of data; only samples ofs+1 : Nsamples-ofs are changed,
9 | %              where ofs= SEARCHBUFFER* Downsample (both globals)
10 |
11 | global Downsample SEARCHBUFFER
12 |
13 | ofs= SEARCHBUFFER* Downsample;
14 | active= ofs+ 1: Nsamples- ofs;
15 | mod_data= data;
16 |
17 | % The offset is the sum over the active region divided by Nsamples (not by
18 | % the length of the region). The original author flagged this as "a little
19 | % weird"; it is kept unchanged.
20 | facc= sum( data( active))/ Nsamples;
21 | mod_data( active)= data( active)- facc;
22 |
23 | % Ramp weights (0.5, 1.5, ..., Downsample- 0.5)/ Downsample. The division is
24 | % applied after the multiplication, exactly as in the original expression.
25 | ramp_num= 0.5+ (0: Downsample- 1);
26 |
27 | % fade-in over the first Downsample samples of the active region
28 | head= ofs+ 1: ofs+ Downsample;
29 | mod_data( head)= mod_data( head).* ramp_num/ Downsample;
30 |
31 | % fade-out over the last Downsample samples (indices run backwards so the
32 | % smallest weight lands on the final sample)
33 | tail= Nsamples- ofs: -1: Nsamples- ofs- Downsample+ 1;
34 | mod_data( tail)= mod_data( tail).* ramp_num/ Downsample;
--------------------------------------------------------------------------------
/PESQ/FFTNXCorr.m:
--------------------------------------------------------------------------------
1 | function Y= FFTNXCorr( ref_VAD, startr, nr, deg_VAD, startd, nd)
2 | % FFTNXCORR  Cross-correlation of a deg_VAD segment with a ref_VAD segment.
3 | %
4 | %   ref_VAD, deg_VAD - row vectors (the ref segment is reversed with fliplr)
5 | %   startr, nr       - 1-based first index and length of the ref segment
6 | %   startd, nd       - 1-based first index and length of the deg segment
7 | %   Y                - conv( deg segment, reversed ref segment), nr+ nd- 1
8 | %                      values; element nr is the zero-lag term
9 | %
10 | % This function has other simple implementations; the current one is
11 | % consistent with the C version.
12 |
13 | % time-domain implementation
14 | ref_seg= fliplr( ref_VAD( startr: startr+ nr- 1));
15 | deg_seg= deg_VAD( startd: startd+ nd- 1);
16 | Y= conv( deg_seg, ref_seg);
17 |
18 | % % the other way to do this (in freq domain)
19 | % Nx= 2^ (ceil( log2( max( nr, nd))));
20 | % x1= zeros( 1, 2* Nx);
21 | % x2= zeros( 1, 2* Nx);
22 | % x1( 1: nr)= fliplr( ref_VAD( startr: startr+ nr- 1));
23 | % x2( 1: nd)= deg_VAD( startd: startd+ nd- 1);
24 | % x1_fft= fft( x1, 2* Nx);
25 | % x2_fft= fft( x2, 2* Nx);
26 | % tmp1= ifft( x1_fft.* x2_fft, 2* Nx);
27 | % Ny= nr+ nd- 1;
28 | % Y= tmp1( 1: Ny);
--------------------------------------------------------------------------------
/PESQ/apply_VAD.m:
--------------------------------------------------------------------------------
1 | function [VAD, logVAD]= apply_VAD( data, Nsamples)
2 | % APPLY_VAD  Frame-power voice activity detector.
3 | %
4 | %   data     - samples; frames of Downsample samples are read from it
5 | %   Nsamples - number of samples to analyse
6 | %   VAD      - 1 x Nwindows row: frame power where a frame is kept as
7 | %              speech (plus short ramps around each speech run), 0 elsewhere
8 | %   logVAD   - 1 x Nwindows row: log( VAD/ LevelThresh) where
9 | %              VAD> LevelThresh, 0 elsewhere
10 | %
11 | % Reads the globals Downsample, MINSPEECHLGTH and JOINSPEECHLGTH.
12 | % While the decision is being refined, a frame is marked "non-speech" by
13 | % storing its power with a negative sign.
14 |
15 | global Downsample MINSPEECHLGTH JOINSPEECHLGTH
16 |
17 | Nwindows= floor( Nsamples/ Downsample);
18 | % number of frames (described as 4 ms windows by the original author)
19 |
20 | VAD= zeros( 1, Nwindows);
21 | for count= 1: Nwindows
22 |     VAD( count)= sum( data( (count-1)* Downsample+ 1: ...
23 |         count* Downsample).^ 2)/ Downsample;
24 | end
25 | % VAD now holds the mean power of each frame
26 |
27 | LevelThresh = sum( VAD)/ Nwindows;
28 | % initial threshold: mean of the frame powers
29 |
30 | % floor for the frame powers: 1e-4 of the maximum, or 1 for all-zero input
31 | LevelMin= max( VAD);
32 | if( LevelMin > 0 )
33 |     LevelMin= LevelMin* 1.0e-4;
34 | else
35 |     LevelMin = 1.0;
36 | end
37 |
38 | VAD( find( VAD< LevelMin))= LevelMin;
39 |
40 | % 12 refinement passes: new threshold= 1.001* (mean+ 2* std) of the frames
41 | % that are at or below the current threshold
42 | for iteration= 1: 12
43 |     LevelNoise= 0;
44 |     len= 0;
45 |     StDNoise= 0;
46 |
47 |     VAD_lessthan_LevelThresh= VAD( find( VAD<= LevelThresh));
48 |     len= length( VAD_lessthan_LevelThresh);
49 |     LevelNoise= sum( VAD_lessthan_LevelThresh);
50 |     if (len> 0)
51 |         LevelNoise= LevelNoise/ len;
52 |         StDNoise= sqrt( sum( ...
53 |             (VAD_lessthan_LevelThresh- LevelNoise).^ 2)/ len);
54 |     end
55 |     LevelThresh= 1.001* (LevelNoise+ 2* StDNoise);
56 | end
57 |
58 | % mean power of the frames above the threshold (LevelSig) and of the
59 | % frames at or below it (LevelNoise)
60 | LevelNoise= 0;
61 | LevelSig= 0;
62 | len= 0;
63 | VAD_greaterthan_LevelThresh= VAD( find( VAD> LevelThresh));
64 | len= length( VAD_greaterthan_LevelThresh);
65 | LevelSig= sum( VAD_greaterthan_LevelThresh);
66 |
67 | VAD_lessorequal_LevelThresh= VAD( find( VAD<= LevelThresh));
68 | LevelNoise= sum( VAD_lessorequal_LevelThresh);
69 |
70 | if (len> 0)
71 |     LevelSig= LevelSig/ len;
72 | else
73 |     % no frame exceeds the threshold; -1 is replaced by LevelMin further down
74 |     LevelThresh= -1;
75 | end
76 |
77 | if (len< Nwindows)
78 |     LevelNoise= LevelNoise/( Nwindows- len);
79 | else
80 |     LevelNoise= 1;
81 | end
82 |
83 | % mark frames at or below the threshold, and the first and last frame, as
84 | % non-speech (negative value)
85 | VAD( find( VAD<= LevelThresh))= -VAD( find( VAD<= LevelThresh));
86 | VAD(1)= -LevelMin;
87 | VAD(Nwindows)= -LevelMin;
88 |
89 | % pass 1: flip speech runs of MINSPEECHLGTH frames or fewer to non-speech
90 | start= 0;
91 | finish= 0;
92 | for count= 2: Nwindows
93 |     if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
94 |         start = count;
95 |     end
96 |     if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
97 |         finish = count;
98 |         if( (finish - start)<= MINSPEECHLGTH )
99 |             VAD( start: finish- 1)= -VAD( start: finish- 1);
100 |         end
101 |     end
102 | end
103 |
104 | % pass 2 (only when LevelSig >= 1000* LevelNoise): flip speech runs whose
105 | % mean power is below 3* LevelThresh
106 | if( LevelSig >= (LevelNoise* 1000) )
107 |     for count= 2: Nwindows
108 |         if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
109 |             start= count;
110 |         end
111 |         if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
112 |             finish = count;
113 |             g = sum( VAD( start: finish- 1));
114 |             if( g< 3.0* LevelThresh* (finish - start) )
115 |                 VAD( start: finish- 1)= -VAD( start: finish- 1);
116 |             end
117 |         end
118 |     end
119 | end
120 |
121 | % pass 3: fill gaps of JOINSPEECHLGTH frames or fewer between two speech
122 | % runs with LevelMin so that the runs are joined
123 | start = 0;
124 | finish = 0;
125 | for count= 2: Nwindows
126 |     if( (VAD(count) > 0.0) && (VAD(count-1) <= 0.0) )
127 |         start = count;
128 |         if( (finish > 0) && ((start - finish) <= JOINSPEECHLGTH) )
129 |             VAD( finish: start- 1)= LevelMin;
130 |         end
131 |     end
132 |     if( (VAD(count) <= 0.0) && (VAD(count-1) > 0.0) )
133 |         finish = count;
134 |     end
135 | end
136 |
137 | % if no speech onset is left at all, treat everything except the first and
138 | % last frame as speech
139 | start= 0;
140 | for count= 2: Nwindows
141 |     if( (VAD(count)> 0) && (VAD(count-1)<= 0) )
142 |         start= count;
143 |     end
144 | end
145 | if( start== 0 )
146 |     VAD= abs(VAD);
147 |     VAD(1) = -LevelMin;
148 |     VAD(Nwindows) = -LevelMin;
149 | end
150 |
151 | % add two-frame ramps (x0.1, x0.3) before each onset and (x0.3, x0.1)
152 | % after each offset; count is advanced inside the branches so frames that
153 | % were just written are not re-examined
154 | count = 4;
155 | while( count< (Nwindows-1) )
156 |     if( (VAD(count)> 0) && (VAD(count-2) <= 0) )
157 |         VAD(count-2)= VAD(count)* 0.1;
158 |         VAD(count-1)= VAD(count)* 0.3;
159 |         count= count+ 1;
160 |     end
161 |     if( (VAD(count)<= 0) && (VAD(count-1)> 0) )
162 |         VAD(count)= VAD(count-1)* 0.3;
163 |         VAD(count+ 1)= VAD(count-1)* 0.1;
164 |         count= count+ 3;
165 |     end
166 |     count= count+ 1;
167 | end
168 |
169 | % remaining non-speech frames become 0
170 | VAD( find( VAD< 0))= 0;
171 |
172 | if( LevelThresh<= 0 )
173 |     LevelThresh= LevelMin;
174 | end
175 |
176 | % logVAD is preallocated so its size is explicit; entries for frames at or
177 | % below the threshold stay 0
178 | logVAD= zeros( 1, Nwindows);
179 | VAD_greaterthan_LevelThresh= find( VAD> LevelThresh);
180 | logVAD( VAD_greaterthan_LevelThresh)= log( VAD( ...
181 |     VAD_greaterthan_LevelThresh)/ LevelThresh);
--------------------------------------------------------------------------------
/PESQ/apply_filter.m:
--------------------------------------------------------------------------------
1 | function align_filtered= apply_filter( data, data_Nsamples, align_filter_dB)
2 | % APPLY_FILTER  Shape the spectrum of a padded signal with a gain table.
3 | %
4 | %   data            - padded row vector of samples
5 | %   data_Nsamples   - sample count as maintained by the caller
6 | %   align_filter_dB - K x 2 table [frequency, gain in dB]; frequencies are
7 | %                     on the same scale as the global Fs
8 | %   align_filtered  - copy of data in which samples ofs+1 : ofs+n are
9 | %                     replaced by the filtered signal, with
10 | %                     ofs= SEARCHBUFFER* Downsample and
11 | %                     n= data_Nsamples- 2* ofs+ DATAPADDING_MSECS* (Fs/ 1000)
12 | %
13 | % The table is interpolated with interp1 at every FFT bin frequency and
14 | % shifted so that the gain at 1000 is 0 dB; the spectrum is multiplied by
15 | % the resulting real gains.
16 |
17 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
18 |
19 | ofs= SEARCHBUFFER* Downsample;
20 | align_filtered= data;
21 | n= data_Nsamples- 2* ofs+ DATAPADDING_MSECS* (Fs/ 1000);
22 |
23 | % FFT length: the smallest power of 2 that is greater or equal to n
24 | pow_of_2= 2^ (ceil( log2( n)));
25 | half= pow_of_2/ 2;
26 |
27 | table_freq= align_filter_dB( :, 1);
28 | table_dB= align_filter_dB( :, 2);
29 | overallGainFilter= interp1( table_freq, table_dB, 1000);
30 |
31 | x= zeros( 1, pow_of_2);
32 | x( 1: n)= data( ofs+ 1: ofs+ n);
33 | x_fft= fft( x, pow_of_2);
34 |
35 | % gains for bins 0 .. pow_of_2/2, relative to the gain at 1000
36 | freq_resolution= Fs/ pow_of_2;
37 | gain_dB= interp1( table_freq, table_dB, (0: half)* freq_resolution)- ...
38 |     overallGainFilter;
39 | gain= 10.^ (gain_dB/ 20);
40 |
41 | % append the mirrored gains of bins pow_of_2/2-1 .. 1 so that the vector
42 | % covers all pow_of_2 bins symmetrically
43 | gain= [gain, fliplr( gain( 2: half))];
44 |
45 | y= ifft( x_fft.* gain, pow_of_2);
46 | align_filtered( ofs+ 1: ofs+ n)= y( 1: n);
--------------------------------------------------------------------------------
/PESQ/apply_filters.m:
--------------------------------------------------------------------------------
1 | function mod_data= apply_filters( data, Nsamples)
2 | % APPLY_FILTERS  Run data through the IIR filter described by the globals
3 | % InIIR_Hsos / InIIR_Nsos.
4 | %
5 | %   data     - samples to filter
6 | %   Nsamples - not used; kept so existing callers keep working
7 | %   mod_data - filter output
8 | %
9 | % Each row of InIIR_Hsos supplies five coefficients of one second-order
10 | % section: columns 1-3 are b, columns 4-5 are a(2:3); a(1) is set to 1.
11 |
12 | global InIIR_Hsos InIIR_Nsos
13 |
14 | % one row [b(1:3) a(1:3)] per section
15 | sosMatrix= [InIIR_Hsos( :, 1: 3), ones( InIIR_Nsos, 1), InIIR_Hsos( :, 4: 5)];
16 |
17 | % second-order-section direct form II filter object
18 | iirdf2= dfilt.df2sos( sosMatrix);
19 |
20 | mod_data= filter( iirdf2, data);
--------------------------------------------------------------------------------
/PESQ/crude_align.m:
--------------------------------------------------------------------------------
1 | function crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
2 |     deg_Nsamples, Utt_id)
3 | % CRUDE_ALIGN  Estimate a delay by cross-correlating the log-VAD envelopes.
4 | %
5 | %   ref_logVAD, deg_logVAD     - per-frame envelopes (row vectors)
6 | %   ref_Nsamples, deg_Nsamples - sample counts; divided by Downsample to
7 | %                                get frame counts
8 | %   Utt_id - WHOLE_SIGNAL  : correlate everything, write Crude_DelayEst
9 | %                            (and Crude_DelayConf= 0)
10 | %            MAXNUTTERANCES: use the search window stored in that slot,
11 | %                            write Utt_Delay( MAXNUTTERANCES)
12 | %            otherwise     : use search window Utt_id, write
13 | %                            Utt_DelayEst( Utt_id)
14 | %
15 | % Results are written to globals. Delays are (peak index- nr)* Downsample,
16 | % i.e. the lag in frames scaled by the frame length, plus the previous
17 | % estimate where applicable.
18 |
19 | global Downsample
20 | global Crude_DelayEst Crude_DelayConf
21 | global UttSearch_Start UttSearch_End Utt_DelayEst Utt_Delay
22 | global MAXNUTTERANCES WHOLE_SIGNAL
23 |
24 | deg_frames= floor( deg_Nsamples/ Downsample);
25 |
26 | if (Utt_id== WHOLE_SIGNAL )
27 |     nr = floor( ref_Nsamples/ Downsample);
28 |     nd = deg_frames;
29 |     startr= 1;
30 |     startd= 1;
31 | elseif Utt_id== MAXNUTTERANCES
32 |     startr= UttSearch_Start(MAXNUTTERANCES);
33 |     startd= startr+ Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
34 |     % Indices are 1-based: startd== 0 is already out of range. The
35 |     % former test "startd< 0" let that case through and FFTNXCorr then
36 |     % failed on deg_logVAD( 0).
37 |     if ( startd< 1 )
38 |         startr= 1- Utt_DelayEst(MAXNUTTERANCES)/ Downsample;
39 |         startd= 1;
40 |     end
41 |
42 |     nr= UttSearch_End(MAXNUTTERANCES)- startr;
43 |     nd= nr;
44 |
45 |     if( startd+ nd> deg_frames )
46 |         nd= deg_frames- startd;
47 |     end
48 | else
49 |     startr= UttSearch_Start(Utt_id);
50 |     startd= startr+ Crude_DelayEst/ Downsample;
51 |
52 |     % same 1-based guard as above
53 |     if ( startd< 1 )
54 |         startr= 1- Crude_DelayEst/ Downsample;
55 |         startd= 1;
56 |     end
57 |
58 |     nr= UttSearch_End(Utt_id)- startr;
59 |     nd = nr;
60 |     if( startd+ nd> deg_frames+ 1)
61 |         nd = deg_frames- startd+ 1;
62 |     end
63 | end
64 |
65 | % Default: peak at index nr, which corresponds to zero lag. It is kept when
66 | % a segment is too short or the correlation has no positive value.
67 | max_Y= 0.0;
68 | I_max_Y= nr;
69 | if( (nr> 1) && (nd> 1) )
70 |     Y= FFTNXCorr( ref_logVAD, startr, nr, deg_logVAD, startd, nd);
71 |     [max_Y, I_max_Y]= max( Y);
72 |     if (max_Y<= 0)
73 |         max_Y= 0;
74 |         I_max_Y= nr;
75 |     end
76 | end
77 |
78 | if( Utt_id== WHOLE_SIGNAL )
79 |     Crude_DelayEst= (I_max_Y- nr)* Downsample;
80 |     Crude_DelayConf= 0.0;
81 | elseif( Utt_id == MAXNUTTERANCES )
82 |     Utt_Delay(MAXNUTTERANCES)= (I_max_Y- nr)* Downsample+ ...
83 |         Utt_DelayEst(MAXNUTTERANCES);
84 | else
85 |     Utt_DelayEst(Utt_id)= (I_max_Y- nr)* Downsample+ ...
86 |         Crude_DelayEst;
87 | end
--------------------------------------------------------------------------------
/PESQ/fix_power_level.m:
--------------------------------------------------------------------------------
1 | function mod_data= fix_power_level( data, data_Nsamples, maxNsamples)
2 | % FIX_POWER_LEVEL  Level normalization: scale data so that the power of a
3 | % band-limited copy matches the preset TARGET_AVG_POWER.
4 | %
5 | %   data          - padded row vector of samples
6 | %   data_Nsamples - sample count of this signal
7 | %   maxNsamples   - larger of the two sample counts being compared
8 | %   mod_data      - data multiplied by one global scale factor
9 | %
10 | % Side effect: sets the global TARGET_AVG_POWER to 1e7.
11 |
12 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs
13 | global TARGET_AVG_POWER
14 | TARGET_AVG_POWER= 1e7;
15 |
16 | % gain table [frequency, dB]: 0 dB from 350 to 3250, -500 dB at and below
17 | % 300 and at and above 3500
18 | align_filter_dB= [0,-500; 50, -500; 100, -500; 125, -500; 160, -500; 200, -500;
19 |     250, -500; 300, -500; 350, 0; 400, 0; 500, 0; 600, 0; 630, 0;
20 |     800, 0; 1000, 0; 1250, 0; 1600, 0; 2000, 0; 2500, 0; 3000, 0;
21 |     3250, 0; 3500, -500; 4000, -500; 5000, -500; 6300, -500; 8000, -500];
22 |
23 | align_filtered= apply_filter( data, data_Nsamples, align_filter_dB);
24 |
25 | % pow_of is defined in another file; presumably (data, first index, last
26 | % index, divisor) - confirm against pow_of.m
27 | power_above_300Hz = pow_of (align_filtered, SEARCHBUFFER* Downsample+ 1, ...
28 |     data_Nsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000), ...
29 |     maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000));
30 |
31 | global_scale= sqrt( TARGET_AVG_POWER/ power_above_300Hz);
32 | mod_data= data* global_scale;
--------------------------------------------------------------------------------
/PESQ/id_searchwindows.m:
--------------------------------------------------------------------------------
1 | function id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples)
2 | % ID_SEARCHWINDOWS  Find the speech runs in ref_VAD and store a widened
3 | % search window for each accepted run.
4 | %
5 | %   ref_VAD      - per-frame VAD of the reference (> 0 means speech)
6 | %   ref_Nsamples - reference sample count (gives the frame count)
7 | %   deg_VAD      - not used
8 | %   deg_Nsamples - degraded sample count
9 | %
10 | % Writes the globals UttSearch_Start / UttSearch_End (frame indices, the
11 | % run widened by SEARCHBUFFER frames on both sides) and Nutterances.
12 | % A run is accepted when it spans at least MINUTTLENGTH frames and lies
13 | % inside the limits derived from Crude_DelayEst and deg_Nsamples.
14 |
15 | global MINUTTLENGTH Downsample SEARCHBUFFER
16 | global Crude_DelayEst Nutterances UttSearch_Start UttSearch_End
17 |
18 | Utt_num = 1;
19 | speech_flag = 0;
20 |
21 | VAD_length= floor( ref_Nsamples/ Downsample);
22 | del_deg_start= MINUTTLENGTH- Crude_DelayEst/ Downsample;
23 | del_deg_end= floor((deg_Nsamples- Crude_DelayEst)/ Downsample)-...
24 |     MINUTTLENGTH;
25 |
26 | for count= 1: VAD_length
27 |     VAD_value= ref_VAD(count);
28 |
29 |     % speech onset
30 |     if( (VAD_value> 0) && (speech_flag== 0) )
31 |         speech_flag= 1;
32 |         this_start= count;
33 |         % The window start is used as an array index by crude_align, so
34 |         % it must not drop below 1 (it used to be clamped to 0).
35 |         UttSearch_Start(Utt_num)= max( count- SEARCHBUFFER, 1);
36 |     end
37 |
38 |     % speech offset, or next-to-last frame reached while still in speech
39 |     if( ((VAD_value== 0) || (count == (VAD_length-1))) && ...
40 |             (speech_flag == 1) )
41 |         speech_flag = 0;
42 |         UttSearch_End(Utt_num) = min( count+ SEARCHBUFFER, VAD_length- 1);
43 |
44 |         % Only an accepted run advances Utt_num; a rejected run leaves
45 |         % its slot to be overwritten by the next candidate.
46 |         if( ((count - this_start) >= MINUTTLENGTH) &&...
47 |                 (this_start < del_deg_end) &&...
48 |                 (count > del_deg_start) )
49 |             Utt_num= Utt_num + 1;
50 |         end
51 |     end
52 | end
53 |
54 | Nutterances = Utt_num- 1;
--------------------------------------------------------------------------------
/PESQ/id_utterances.m:
--------------------------------------------------------------------------------
1 | function id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples)
2 | % ID_UTTERANCES  Fix the final utterance boundaries (in frames).
3 | %
4 | %   ref_Nsamples - reference sample count (gives the frame count)
5 | %   ref_VAD      - per-frame VAD of the reference (> 0 means speech)
6 | %   deg_Nsamples - degraded sample count
7 | %
8 | % Reads the globals Nutterances and Utt_Delay (set elsewhere) and writes
9 | % Utt_Start, Utt_End and Largest_uttsize.
10 |
11 | global Largest_uttsize MINUTTLENGTH Crude_DelayEst
12 | global Downsample SEARCHBUFFER Nutterances Utt_Start
13 | global Utt_End Utt_Delay
14 |
15 | Utt_num = 1;
16 | speech_flag = 0;
17 | VAD_length = floor( ref_Nsamples / Downsample);
18 |
19 | del_deg_start = MINUTTLENGTH - Crude_DelayEst / Downsample;
20 | del_deg_end = floor((deg_Nsamples- Crude_DelayEst)/ Downsample) ...
21 |     - MINUTTLENGTH;
22 |
23 | % step 1: raw speech runs; a rejected run leaves its slot to be
24 | % overwritten by the next candidate
25 | for count = 1: VAD_length
26 |     VAD_value = ref_VAD(count);
27 |     if( (VAD_value > 0.0) && (speech_flag == 0) )
28 |         speech_flag = 1;
29 |         this_start = count;
30 |         Utt_Start(Utt_num) = count;
31 |     end
32 |
33 |     if( ((VAD_value == 0) || (count == VAD_length)) && ...
34 |             (speech_flag == 1) )
35 |         speech_flag = 0;
36 |         Utt_End(Utt_num) = count;
37 |
38 |         if( ((count - this_start) >= MINUTTLENGTH) && ...
39 |                 (this_start < del_deg_end) && ...
40 |                 (count > del_deg_start) )
41 |             Utt_num = Utt_num + 1;
42 |         end
43 |     end
44 | end
45 |
46 | % step 2: stretch the first and last utterance to the edges of the
47 | % un-padded signal
48 | Utt_Start(1) = SEARCHBUFFER+ 1;
49 | Utt_End(Nutterances) = VAD_length - SEARCHBUFFER+ 1;
50 |
51 | % step 3: neighbouring utterances meet halfway between the end of one and
52 | % the start of the next
53 | for Utt_num = 2: Nutterances
54 |     this_start = Utt_Start(Utt_num)- 1;
55 |     last_end = Utt_End(Utt_num - 1)- 1;
56 |     count = floor( (this_start + last_end) / 2);
57 |     Utt_Start(Utt_num) = count+ 1;
58 |     Utt_End(Utt_num - 1) = count+ 1;
59 | end
60 |
61 | % step 4: after applying its delay, the first utterance must not start
62 | % before sample SEARCHBUFFER* Downsample
63 | this_start = (Utt_Start(1)- 1) * Downsample + Utt_Delay(1);
64 | if( this_start < (SEARCHBUFFER * Downsample) )
65 |     count = SEARCHBUFFER + floor( ...
66 |         (Downsample - 1 - Utt_Delay(1)) / Downsample);
67 |     Utt_Start(1) = count+ 1;
68 | end
69 |
70 | % step 5: the delayed end of the last utterance must not pass
71 | % deg_Nsamples- SEARCHBUFFER* Downsample+ 1
72 | last_end = (Utt_End(Nutterances)- 1) * Downsample + 1 + ...
73 |     Utt_Delay(Nutterances);
74 | if( last_end > (deg_Nsamples - SEARCHBUFFER * Downsample+ 1) )
75 |     count = floor( (deg_Nsamples - Utt_Delay(Nutterances)) / Downsample) ...
76 |         - SEARCHBUFFER;
77 |     Utt_End(Nutterances) = count+ 1;
78 | end
79 |
80 | % step 6: where the delayed positions of two neighbours overlap, move both
81 | % boundaries to the midpoint of the overlap
82 | for Utt_num = 2: Nutterances
83 |     this_start = (Utt_Start(Utt_num)- 1) * Downsample + Utt_Delay(Utt_num);
84 |     last_end = (Utt_End(Utt_num - 1)- 1) * Downsample + Utt_Delay(Utt_num - 1);
85 |     if( this_start < last_end )
86 |         count = floor( (this_start + last_end) / 2);
87 |         this_start = floor( (Downsample- 1+ count- Utt_Delay(Utt_num))...
88 |             / Downsample);
89 |         last_end = floor( (count - Utt_Delay(Utt_num - 1))...
90 |             / Downsample);
91 |         Utt_Start(Utt_num) = this_start+ 1;
92 |         Utt_End(Utt_num- 1) = last_end+ 1;
93 |     end
94 | end
95 |
96 | % Only the first Nutterances entries describe accepted utterances; a slot
97 | % left behind by a rejected candidate must not influence the maximum.
98 | Largest_uttsize= max( Utt_End( 1: Nutterances)- Utt_Start( 1: Nutterances));
--------------------------------------------------------------------------------
/PESQ/input_filter.m:
--------------------------------------------------------------------------------
1 | function [mod_ref_data, mod_deg_data]= input_filter( ref_data, ref_Nsamples, ...
2 |     deg_data, deg_Nsamples)
3 | % INPUT_FILTER  Apply DC_block followed by apply_filters to the reference
4 | % signal and to the degraded signal.
5 |
6 | mod_ref_data= apply_filters( DC_block( ref_data, ref_Nsamples), ref_Nsamples);
7 | mod_deg_data= apply_filters( DC_block( deg_data, deg_Nsamples), deg_Nsamples);
--------------------------------------------------------------------------------
/PESQ/pesq.m:
--------------------------------------------------------------------------------
1 | function [pesq_mos]= pesq(ref_wav, deg_wav)
2 |
3 | % ----------------------------------------------------------------------
4 | %            PESQ objective speech quality measure
5 | %
6 | %   This function implements the PESQ measure based on the ITU standard
7 | %   P.862 [1].
8 | %
9 | %
10 | %   Usage:  pval=pesq(cleanFile.wav, enhancedFile.wav)
11 | %
12 | %         cleanFile.wav - clean input file in .wav format
13 | %         enhancedFile  - enhanced output file in .wav format
14 | %         pval          - PESQ value
15 | %
16 | %    Note that the PESQ routine only supports sampling rates of 8 kHz and
17 | %    16 kHz [1]. Both files must have the same sampling rate; an error is
18 | %    raised otherwise. When called with fewer than two arguments the usage
19 | %    line is printed and [] is returned.
20 | %
21 | %  Example call:  pval = pesq ('sp04.wav','enhanced.wav')
22 | %
23 | %
24 | %  References:
25 | %   [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and
26 | %       objective method for end-to-end speech quality assessment of
27 | %       narrowband telephone networks and speech codecs. ITU-T
28 | %       Recommendation P. 862
29 | %
30 | %   Authors: Yi Hu and Philipos C. Loizou
31 | %
32 | %
33 | % Copyright (c) 2006 by Philipos C. Loizou
34 | % $Revision: 0.0 $  $Date: 10/09/2006 $
35 | % ----------------------------------------------------------------------
36 | if nargin<2
37 |     fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
38 |     % assign the output so that "x= pesq( a)" does not fail with an
39 |     % unassigned-output error after the usage line is printed
40 |     pesq_mos= [];
41 |     return;
42 | end
43 |
44 | global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
45 | global Align_Nfft Window
46 |
47 | [ref_data,sampling_rate]= audioread( ref_wav);
48 | if sampling_rate~=8000 && sampling_rate~=16000
49 |     error('Sampling frequency needs to be either 8000 or 16000 Hz');
50 | end
51 |
52 | setup_global( sampling_rate);
53 |
54 | % Hann window of length Align_Nfft, written out explicitly
55 | TWOPI= 6.28318530717959;
56 | count=0:Align_Nfft- 1;
57 | Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
58 |
59 | % Reference: row vector, scaled by 32768, with SEARCHBUFFER* Downsample
60 | % zeros in front and the same plus DATAPADDING_MSECS worth of zeros behind.
61 | % ref_Nsamples counts the signal plus the two search buffers only.
62 | ref_data= ref_data';
63 | ref_data= ref_data* 32768;
64 | ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
65 | ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
66 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
67 |
68 | % Degraded: same treatment. All constants were set up for the reference
69 | % rate, so a file with a different rate cannot be compared.
70 | [deg_data,deg_sampling_rate]= audioread( deg_wav);
71 | if deg_sampling_rate~= sampling_rate
72 |     error('Reference and degraded files must have the same sampling frequency');
73 | end
74 | deg_data= deg_data';
75 | deg_data= deg_data* 32768;
76 | deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
77 | deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
78 |     zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
79 |
80 | maxNsamples= max( ref_Nsamples, deg_Nsamples);
81 |
82 | ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
83 | deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
84 |
85 | standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
86 |     250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
87 |     1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
88 |     3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
89 |
90 | ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
91 | deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
92 |
93 | % for later use in psychoacoustical model
94 | model_ref= ref_data;
95 | model_deg= deg_data;
96 |
97 | % the copies below are only used for the time alignment
98 | [ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
99 |     deg_Nsamples);
100 |
101 | [ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
102 | [deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
103 |
104 | crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
105 |     WHOLE_SIGNAL);
106 |
107 | utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
108 |     deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
109 |
110 | ref_data= model_ref;
111 | deg_data= model_deg;
112 |
113 | % make ref_data and deg_data equal length (assigning 0 to element newlen
114 | % zero-extends the shorter vector)
115 | if (ref_Nsamples< deg_Nsamples)
116 |     newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
117 |     ref_data( newlen)= 0;
118 | elseif (ref_Nsamples> deg_Nsamples)
119 |     newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
120 |     deg_data( newlen)= 0;
121 | end
122 |
123 | pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
124 |     deg_Nsamples );
--------------------------------------------------------------------------------
/PESQ/pesq_psychoacoustic_model.m:
--------------------------------------------------------------------------------
1 | function pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
2 | deg_Nsamples ) 3 | 4 | global CALIBRATE Nfmax Nb Sl Sp 5 | global nr_of_hz_bands_per_bark_band centre_of_band_bark 6 | global width_of_band_hz centre_of_band_hz width_of_band_bark 7 | global pow_dens_correction_factor abs_thresh_power 8 | global Downsample SEARCHBUFFER DATAPADDING_MSECS Fs Nutterances 9 | global Utt_Start Utt_End Utt_Delay NUMBER_OF_PSQM_FRAMES_PER_SYLLABE 10 | global Fs Plot_Frame 11 | 12 | % Plot_Frame= 75; % this is the frame whose spectrum will be plotted 13 | 14 | FALSE= 0; 15 | TRUE= 1; 16 | NUMBER_OF_PSQM_FRAMES_PER_SYLLABE= 20; 17 | 18 | maxNsamples = max (ref_Nsamples, deg_Nsamples); 19 | Nf = Downsample * 8; 20 | MAX_NUMBER_OF_BAD_INTERVALS = 1000; 21 | 22 | start_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 23 | stop_frame_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 24 | start_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 25 | stop_sample_of_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 26 | number_of_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 27 | delay_in_samples_in_bad_interval= zeros( 1, MAX_NUMBER_OF_BAD_INTERVALS); 28 | number_of_bad_intervals= 0; 29 | there_is_a_bad_frame= FALSE; 30 | 31 | Whanning= hann( Nf, 'periodic'); 32 | Whanning= Whanning'; 33 | 34 | D_POW_F = 2; 35 | D_POW_S = 6; 36 | D_POW_T = 2; 37 | A_POW_F = 1; 38 | A_POW_S = 6; 39 | A_POW_T = 2; 40 | D_WEIGHT= 0.1; 41 | A_WEIGHT= 0.0309; 42 | 43 | CRITERIUM_FOR_SILENCE_OF_5_SAMPLES = 500; 44 | samples_to_skip_at_start = 0; 45 | sum_of_5_samples= 0; 46 | while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ... 47 | && (samples_to_skip_at_start < maxNsamples / 2)) 48 | sum_of_5_samples= sum( abs( ref_data( samples_to_skip_at_start... 49 | + SEARCHBUFFER * Downsample + 1: samples_to_skip_at_start... 
50 | + SEARCHBUFFER * Downsample + 5))); 51 | 52 | if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) 53 | samples_to_skip_at_start = samples_to_skip_at_start+ 1; 54 | end 55 | end 56 | % fprintf( 'samples_to_skip_at_start is %d\n', samples_to_skip_at_start); 57 | 58 | samples_to_skip_at_end = 0; 59 | sum_of_5_samples= 0; 60 | while ((sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) ... 61 | && (samples_to_skip_at_end < maxNsamples / 2)) 62 | sum_of_5_samples= sum( abs( ref_data( maxNsamples - ... 63 | SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ... 64 | - samples_to_skip_at_end - 4: maxNsamples - ... 65 | SEARCHBUFFER* Downsample + DATAPADDING_MSECS* (Fs/ 1000) ... 66 | - samples_to_skip_at_end))); 67 | if (sum_of_5_samples< CRITERIUM_FOR_SILENCE_OF_5_SAMPLES) 68 | samples_to_skip_at_end = samples_to_skip_at_end+ 1; 69 | end 70 | end 71 | % fprintf( 'samples_to_skip_at_end is %d\n', samples_to_skip_at_end); 72 | 73 | start_frame = floor( samples_to_skip_at_start/ (Nf/ 2)); 74 | stop_frame = floor( (maxNsamples- 2* SEARCHBUFFER* Downsample ... 75 | + DATAPADDING_MSECS* (Fs/ 1000)- samples_to_skip_at_end) ... 76 | / (Nf/ 2))- 1; 77 | % number of frames in speech data plus DATAPADDING_MSECS 78 | % fprintf( 'start/end frame is %d/%d\n', start_frame, stop_frame); 79 | 80 | D_disturbance= zeros( stop_frame+ 1, Nb); 81 | DA_disturbance= zeros( stop_frame+ 1, Nb); 82 | 83 | power_ref = pow_of (ref_data, SEARCHBUFFER* Downsample, ... 84 | maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),... 85 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 86 | power_deg = pow_of (deg_data, SEARCHBUFFER * Downsample, ... 87 | maxNsamples- SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000),... 
88 | maxNsamples- 2* SEARCHBUFFER* Downsample+ DATAPADDING_MSECS* (Fs/ 1000)); 89 | % fprintf( 'ref/deg power is %f/%f\n', power_ref, power_deg); 90 | 91 | hz_spectrum_ref = zeros( 1, Nf/ 2); 92 | hz_spectrum_deg = zeros( 1, Nf/ 2); 93 | frame_is_bad = zeros( 1, stop_frame + 1); 94 | smeared_frame_is_bad = zeros( 1, stop_frame + 1); 95 | silent = zeros( 1, stop_frame + 1); 96 | 97 | pitch_pow_dens_ref = zeros( stop_frame + 1, Nb); 98 | pitch_pow_dens_deg = zeros( stop_frame + 1, Nb); 99 | 100 | frame_was_skipped = zeros( 1, stop_frame + 1); 101 | frame_disturbance = zeros( 1, stop_frame + 1); 102 | frame_disturbance_asym_add = zeros( 1, stop_frame + 1); 103 | 104 | avg_pitch_pow_dens_ref = zeros( 1, Nb); 105 | avg_pitch_pow_dens_deg = zeros( 1, Nb); 106 | loudness_dens_ref = zeros( 1, Nb); 107 | loudness_dens_deg = zeros( 1, Nb); 108 | deadzone = zeros( 1, Nb); 109 | disturbance_dens = zeros( 1, Nb); 110 | disturbance_dens_asym_add = zeros( 1, Nb); 111 | 112 | time_weight = zeros( 1, stop_frame + 1); 113 | total_power_ref = zeros( 1, stop_frame + 1); 114 | 115 | % fid= fopen( 'tmp_mat.txt', 'wt'); 116 | 117 | for frame = 0: stop_frame 118 | start_sample_ref = 1+ SEARCHBUFFER * Downsample + frame* (Nf/ 2); 119 | hz_spectrum_ref= short_term_fft (Nf, ref_data, Whanning, ... 120 | start_sample_ref); 121 | 122 | utt = Nutterances; 123 | while ((utt >= 1) && ((Utt_Start(utt)- 1)* Downsample+ 1 ... 124 | > start_sample_ref)) 125 | utt= utt - 1; 126 | end 127 | 128 | if (utt >= 1) 129 | delay = Utt_Delay(utt); 130 | else 131 | delay = Utt_Delay(1); 132 | end 133 | 134 | start_sample_deg = start_sample_ref + delay; 135 | 136 | if ((start_sample_deg > 0) && (start_sample_deg + Nf- 1 < ... 137 | maxNsamples+ DATAPADDING_MSECS* (Fs/ 1000))) 138 | hz_spectrum_deg= short_term_fft (Nf, deg_data, Whanning, ... 139 | start_sample_deg); 140 | else 141 | hz_spectrum_deg( 1: Nf/ 2)= 0; 142 | end 143 | 144 | pitch_pow_dens_ref( frame+ 1, :)= freq_warping (... 
145 | hz_spectrum_ref, Nb, frame); 146 | %peak = maximum_of (pitch_pow_dens_ref, 0, Nb); 147 | pitch_pow_dens_deg( frame+ 1, :)= freq_warping (... 148 | hz_spectrum_deg, Nb, frame); 149 | 150 | total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1E2); 151 | total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1E2); 152 | silent(frame+ 1) = (total_audible_pow_ref < 1E7); 153 | 154 | 155 | end 156 | % fclose( fid); 157 | 158 | avg_pitch_pow_dens_ref= time_avg_audible_of (stop_frame + 1, ... 159 | silent, pitch_pow_dens_ref, floor((maxNsamples- 2* SEARCHBUFFER* ... 160 | Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf / 2))- 1); 161 | avg_pitch_pow_dens_deg= time_avg_audible_of (stop_frame + 1, ... 162 | silent, pitch_pow_dens_deg, floor((maxNsamples- 2* SEARCHBUFFER* ... 163 | Downsample+ DATAPADDING_MSECS* (Fs/ 1000))/ (Nf/ 2))- 1); 164 | 165 | % fid= fopen( 'tmp_mat.txt', 'wt'); 166 | % fprintf( fid, '%f\n', avg_pitch_pow_dens_deg); 167 | % fclose( fid); 168 | 169 | if (CALIBRATE== 0) 170 | pitch_pow_dens_ref= freq_resp_compensation (stop_frame + 1, ... 171 | pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ... 172 | avg_pitch_pow_dens_deg, 1000); 173 | if (Plot_Frame>= 0) % plot pitch_pow_dens_ref 174 | figure; 175 | subplot( 1, 2, 1); 176 | plot( centre_of_band_hz, 10* log10( eps+ ... 177 | pitch_pow_dens_ref( Plot_Frame+ 1, :))); 178 | axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db'); 179 | title( 'reference signal bark spectrum with frequency compensation'); 180 | subplot( 1, 2, 2); 181 | plot( centre_of_band_hz, 10* log10( eps+ ... 
182 | pitch_pow_dens_deg( Plot_Frame+ 1, :))); 183 | axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db'); 184 | title( 'degraded signal bark spectrum'); 185 | end 186 | 187 | end 188 | % tmp1= pitch_pow_dens_ref'; 189 | 190 | 191 | MAX_SCALE = 5.0; 192 | MIN_SCALE = 3e-4; 193 | oldScale = 1; 194 | THRESHOLD_BAD_FRAMES = 30; 195 | for frame = 0: stop_frame 196 | 197 | total_audible_pow_ref = total_audible (frame, pitch_pow_dens_ref, 1); 198 | total_audible_pow_deg = total_audible (frame, pitch_pow_dens_deg, 1); 199 | total_power_ref (1+ frame) = total_audible_pow_ref; 200 | 201 | scale = (total_audible_pow_ref + 5e3)/ (total_audible_pow_deg + 5e3); 202 | if (frame > 0) 203 | scale = 0.2 * oldScale + 0.8 * scale; 204 | end 205 | oldScale = scale; 206 | 207 | if (scale > MAX_SCALE) 208 | scale = MAX_SCALE; 209 | elseif (scale < MIN_SCALE) 210 | scale = MIN_SCALE; 211 | end 212 | 213 | pitch_pow_dens_deg( 1+ frame, :) = ... 214 | pitch_pow_dens_deg( 1+ frame, :) * scale; 215 | 216 | if (frame== Plot_Frame) 217 | figure; 218 | subplot( 1, 2, 1); 219 | plot( centre_of_band_hz, 10* log10( eps+ ... 220 | pitch_pow_dens_ref( Plot_Frame+ 1, :))); 221 | axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db'); 222 | subplot( 1, 2, 2); 223 | plot( centre_of_band_hz, 10* log10( eps+ ... 224 | pitch_pow_dens_deg( Plot_Frame+ 1, :))); 225 | axis( [0 Fs/2 0 95]); %xlabel( 'Hz'); ylabel( 'Db'); 226 | end 227 | 228 | loudness_dens_ref = intensity_warping_of (frame, pitch_pow_dens_ref); 229 | loudness_dens_deg = intensity_warping_of (frame, pitch_pow_dens_deg); 230 | disturbance_dens = loudness_dens_deg - loudness_dens_ref; 231 | 232 | if (frame== Plot_Frame) 233 | figure; 234 | subplot( 1, 2, 1); 235 | plot( centre_of_band_hz, 10* log10( eps+ ... 236 | loudness_dens_ref)); 237 | axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db'); 238 | title( 'reference signal loudness density'); 239 | subplot( 1, 2, 2); 240 | plot( centre_of_band_hz, 10* log10( eps+ ... 
241 | loudness_dens_deg)); 242 | axis( [0 Fs/2 0 15]); %xlabel( 'Hz'); ylabel( 'Db'); 243 | title( 'degraded signal loudness density'); 244 | end 245 | 246 | for band =1: Nb 247 | deadzone (band) = 0.25* min (loudness_dens_deg (band), ... 248 | loudness_dens_ref (band)); 249 | end 250 | 251 | for band = 1: Nb 252 | d = disturbance_dens (band); 253 | m = deadzone (band); 254 | 255 | if (d > m) 256 | disturbance_dens (band) = disturbance_dens (band)- m; 257 | % disturbance_dens (band) = d- m; 258 | else 259 | if (d < -m) 260 | disturbance_dens (band) = disturbance_dens (band)+ m; 261 | % disturbance_dens (band) = d+ m; 262 | else 263 | disturbance_dens (band) = 0; 264 | end 265 | end 266 | end 267 | 268 | if (frame== Plot_Frame) 269 | figure; 270 | subplot( 1, 2, 1); 271 | plot( centre_of_band_hz, disturbance_dens); 272 | axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db'); 273 | title( 'disturbance'); 274 | end 275 | D_disturbance( frame+ 1, :)= disturbance_dens; 276 | 277 | frame_disturbance (1+ frame) = pseudo_Lp (disturbance_dens, D_POW_F); 278 | if (frame_disturbance (1+ frame) > THRESHOLD_BAD_FRAMES) 279 | there_is_a_bad_frame = TRUE; 280 | end 281 | 282 | disturbance_dens= multiply_with_asymmetry_factor (... 283 | disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg); 284 | 285 | if (frame== Plot_Frame) 286 | subplot( 1, 2, 2); 287 | plot( centre_of_band_hz, disturbance_dens); 288 | axis( [0 Fs/2 -1 50]); %xlabel( 'Hz'); ylabel( 'Db'); 289 | title( 'disturbance after asymmetry processing'); 290 | end 291 | DA_disturbance( frame+ 1, :)= disturbance_dens; 292 | 293 | 294 | frame_disturbance_asym_add (1+ frame) = ... 
295 | pseudo_Lp (disturbance_dens, A_POW_F); 296 | end 297 | % fid= fopen( 'tmp_mat.txt', 'wt'); 298 | % fprintf( fid, '%f\n', frame_disturbance); 299 | % fclose( fid); 300 | 301 | frame_was_skipped (1: 1+ stop_frame) = FALSE; 302 | 303 | for utt = 2: Nutterances 304 | frame1 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER )* Downsample+ 1+ ... 305 | Utt_Delay(utt))/ (Nf/ 2)); 306 | j = floor( floor(((Utt_End(utt-1)- 1- SEARCHBUFFER)* Downsample+ 1+ ... 307 | Utt_Delay(utt-1)))/(Nf/ 2)); 308 | delay_jump = Utt_Delay(utt) - Utt_Delay(utt-1); 309 | if (frame1 > j) 310 | frame1 = j; 311 | elseif (frame1 < 0) 312 | frame1 = 0; 313 | end 314 | % fprintf( 'frame1, j, delay_jump is %d, %d, %d\n', frame1, ... 315 | % j, delay_jump); 316 | 317 | if (delay_jump < -(Nf/ 2)) 318 | frame2 = floor (((Utt_Start(utt)- 1- SEARCHBUFFER)* Downsample+ 1 ... 319 | + max (0, abs (delay_jump)))/ (Nf/ 2)) + 1; 320 | 321 | for frame = frame1: frame2 322 | if (frame < stop_frame) 323 | frame_was_skipped (1+ frame) = TRUE; 324 | frame_disturbance (1+ frame) = 0; 325 | frame_disturbance_asym_add (1+ frame) = 0; 326 | end 327 | end 328 | end 329 | end 330 | 331 | nn = DATAPADDING_MSECS* (Fs/ 1000) + maxNsamples; 332 | tweaked_deg = zeros( 1, nn); 333 | % fprintf( 'nn is %d\n', nn); 334 | 335 | for i= SEARCHBUFFER* Downsample+ 1: nn- SEARCHBUFFER* Downsample 336 | utt = Nutterances; 337 | 338 | while ((utt >= 1) && ((Utt_Start (utt)- 1)* Downsample> i)) 339 | utt = utt- 1; 340 | end 341 | if (utt >= 1) 342 | delay = Utt_Delay (utt); 343 | else 344 | delay = Utt_Delay (1); 345 | end 346 | 347 | j = i + delay; 348 | if (j < SEARCHBUFFER * Downsample+ 1) 349 | j = SEARCHBUFFER * Downsample+ 1; 350 | end 351 | if (j > nn - SEARCHBUFFER * Downsample) 352 | j = nn - SEARCHBUFFER * Downsample; 353 | end 354 | tweaked_deg (i) = deg_data (j); 355 | end 356 | 357 | if (there_is_a_bad_frame) 358 | 359 | for frame = 0: stop_frame 360 | frame_is_bad (1+ frame) = (frame_disturbance (1+ frame)... 
361 | > THRESHOLD_BAD_FRAMES); 362 | smeared_frame_is_bad (1+ frame) = FALSE; 363 | end 364 | frame_is_bad (1) = FALSE; 365 | SMEAR_RANGE = 2; 366 | 367 | for frame = SMEAR_RANGE: stop_frame- 1- SMEAR_RANGE 368 | max_itself_and_left = frame_is_bad (1+ frame); 369 | max_itself_and_right = frame_is_bad (1+ frame); 370 | 371 | for i = -SMEAR_RANGE: 0 372 | if (max_itself_and_left < frame_is_bad (1+ frame+ i)) 373 | max_itself_and_left = frame_is_bad (1+ frame+ i); 374 | end 375 | end 376 | 377 | for i = 0: SMEAR_RANGE 378 | if (max_itself_and_right < frame_is_bad (1+ frame + i)) 379 | max_itself_and_right = frame_is_bad (1+ frame + i); 380 | end 381 | end 382 | 383 | mini = max_itself_and_left; 384 | if (mini > max_itself_and_right) 385 | mini = max_itself_and_right; 386 | end 387 | 388 | smeared_frame_is_bad (1+ frame) = mini; 389 | end 390 | 391 | MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL = 5; 392 | number_of_bad_intervals = 0; 393 | frame = 0; 394 | while (frame <= stop_frame) 395 | while ((frame <= stop_frame) && (~smeared_frame_is_bad (1+ frame))) 396 | frame= frame+ 1; 397 | end 398 | 399 | if (frame <= stop_frame) 400 | start_frame_of_bad_interval(1+ number_of_bad_intervals)= ... 401 | 1+ frame; 402 | 403 | while ((frame <= stop_frame) && (... 404 | smeared_frame_is_bad (1+ frame))) 405 | frame= frame+ 1; 406 | end 407 | 408 | if (frame <= stop_frame) 409 | stop_frame_of_bad_interval(1+ number_of_bad_intervals)= ... 410 | 1+ frame; 411 | if (stop_frame_of_bad_interval(1+ number_of_bad_intervals)- ... 412 | start_frame_of_bad_interval(1+ number_of_bad_intervals)... 413 | >= MINIMUM_NUMBER_OF_BAD_FRAMES_IN_BAD_INTERVAL) 414 | number_of_bad_intervals= number_of_bad_intervals+ 1; 415 | end 416 | end 417 | end 418 | end 419 | 420 | for bad_interval = 0: number_of_bad_intervals - 1 421 | start_sample_of_bad_interval(1+ bad_interval) = ... 422 | (start_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ... 
423 | + SEARCHBUFFER * Downsample+ 1; 424 | stop_sample_of_bad_interval(1+ bad_interval) = ... 425 | (stop_frame_of_bad_interval(1+ bad_interval)- 1) * (Nf/ 2) ... 426 | + Nf + SEARCHBUFFER* Downsample; 427 | if (stop_frame_of_bad_interval(1+ bad_interval) > stop_frame+ 1) 428 | stop_frame_of_bad_interval(1+ bad_interval) = stop_frame+ 1; 429 | end 430 | 431 | number_of_samples_in_bad_interval(1+ bad_interval) = ... 432 | stop_sample_of_bad_interval(1+ bad_interval) - ... 433 | start_sample_of_bad_interval(1+ bad_interval)+ 1; 434 | end 435 | % fprintf( 'number of bad intervals %d\n', number_of_bad_intervals); 436 | % fprintf( '%d %d\n', number_of_samples_in_bad_interval(1), ... 437 | % number_of_samples_in_bad_interval(2)); 438 | % fprintf( '%d %d\n', start_sample_of_bad_interval(1), ... 439 | % start_sample_of_bad_interval(2)); 440 | 441 | SEARCH_RANGE_IN_TRANSFORM_LENGTH = 4; 442 | search_range_in_samples= SEARCH_RANGE_IN_TRANSFORM_LENGTH * Nf; 443 | 444 | for bad_interval= 0: number_of_bad_intervals- 1 445 | ref = zeros (1, 2 * search_range_in_samples + ... 446 | number_of_samples_in_bad_interval (1+ bad_interval)); 447 | deg = zeros (1, 2 * search_range_in_samples + ... 448 | number_of_samples_in_bad_interval (1+ bad_interval)); 449 | 450 | ref(1: search_range_in_samples) = 0; 451 | 452 | ref (search_range_in_samples+ 1: search_range_in_samples+ ... 453 | number_of_samples_in_bad_interval (1+ bad_interval)) = ... 454 | ref_data (start_sample_of_bad_interval( 1+ bad_interval) + 1: ... 455 | start_sample_of_bad_interval( 1+ bad_interval) + ... 456 | number_of_samples_in_bad_interval (1+ bad_interval)); 457 | 458 | ref (search_range_in_samples + ... 459 | number_of_samples_in_bad_interval (1+ bad_interval) + 1: ... 460 | search_range_in_samples + ... 461 | number_of_samples_in_bad_interval (1+ bad_interval) + ... 462 | search_range_in_samples) = 0; 463 | 464 | for i = 0: 2 * search_range_in_samples + ... 
465 | number_of_samples_in_bad_interval (1+ bad_interval) - 1 466 | j = start_sample_of_bad_interval (1+ bad_interval) - ... 467 | search_range_in_samples + i; 468 | nn = maxNsamples - SEARCHBUFFER * Downsample + ... 469 | DATAPADDING_MSECS * (Fs / 1000); 470 | if (j <= SEARCHBUFFER * Downsample) 471 | j = SEARCHBUFFER * Downsample+ 1; 472 | end 473 | if (j > nn) 474 | j = nn; 475 | end 476 | deg (1+ i) = tweaked_deg (j); 477 | end 478 | 479 | [delay_in_samples, best_correlation]= compute_delay ... 480 | (1, 2 * search_range_in_samples + ... 481 | number_of_samples_in_bad_interval (1+ bad_interval), ... 482 | search_range_in_samples, ref, deg); 483 | delay_in_samples_in_bad_interval (1+ bad_interval) = ... 484 | delay_in_samples; 485 | % fprintf( 'delay_in_samples, best_correlation is \n\t%d, %f\n', ... 486 | % delay_in_samples, best_correlation); 487 | % 488 | if (best_correlation < 0.5) 489 | delay_in_samples_in_bad_interval (1+ bad_interval) = 0; 490 | end 491 | end 492 | 493 | if (number_of_bad_intervals > 0) 494 | doubly_tweaked_deg = tweaked_deg( 1: maxNsamples + ... 495 | DATAPADDING_MSECS * (Fs / 1000)); 496 | for bad_interval= 0: number_of_bad_intervals- 1 497 | delay = delay_in_samples_in_bad_interval (1+ bad_interval); 498 | 499 | for i = start_sample_of_bad_interval (1+ bad_interval): ... 500 | stop_sample_of_bad_interval (1+ bad_interval) 501 | j = i + delay; 502 | if (j < 1) 503 | j = 1; 504 | end 505 | if (j > maxNsamples) 506 | j = maxNsamples; 507 | end 508 | h = tweaked_deg (j); 509 | doubly_tweaked_deg (i) = h; 510 | end 511 | end 512 | 513 | untweaked_deg = deg_data; 514 | deg_data = doubly_tweaked_deg; 515 | 516 | for bad_interval= 0: number_of_bad_intervals- 1 517 | for frame = start_frame_of_bad_interval (1+ bad_interval): ... 518 | stop_frame_of_bad_interval (1+ bad_interval)- 1 519 | frame= frame- 1; 520 | start_sample_ref = SEARCHBUFFER * Downsample + ... 
521 | frame * Nf / 2+ 1; 522 | start_sample_deg = start_sample_ref; 523 | hz_spectrum_deg= short_term_fft (Nf, deg_data, ... 524 | Whanning, start_sample_deg); 525 | pitch_pow_dens_deg( 1+ frame, :)= freq_warping (... 526 | hz_spectrum_deg, Nb, frame); 527 | end 528 | 529 | oldScale = 1; 530 | for frame = start_frame_of_bad_interval (1+ bad_interval): ... 531 | stop_frame_of_bad_interval (1+ bad_interval)- 1 532 | frame= frame- 1; 533 | % see implementation for detail why 1 needed to be 534 | % subtracted 535 | total_audible_pow_ref = total_audible (frame, ... 536 | pitch_pow_dens_ref, 1); 537 | total_audible_pow_deg = total_audible (frame, ... 538 | pitch_pow_dens_deg, 1); 539 | scale = (total_audible_pow_ref + 5e3) / ... 540 | (total_audible_pow_deg + 5e3); 541 | if (frame > 0) 542 | scale = 0.2 * oldScale + 0.8*scale; 543 | end 544 | oldScale = scale; 545 | if (scale > MAX_SCALE) 546 | scale = MAX_SCALE; 547 | end 548 | if (scale < MIN_SCALE) 549 | scale = MIN_SCALE; 550 | end 551 | 552 | pitch_pow_dens_deg (1+ frame, :) = ... 553 | pitch_pow_dens_deg (1+ frame, :)* scale; 554 | loudness_dens_ref= intensity_warping_of (frame, ... 555 | pitch_pow_dens_ref); 556 | loudness_dens_deg= intensity_warping_of (frame, ... 557 | pitch_pow_dens_deg); 558 | disturbance_dens = loudness_dens_deg - loudness_dens_ref; 559 | 560 | for band = 1: Nb 561 | deadzone(band) = min (loudness_dens_deg(band), ... 562 | loudness_dens_ref(band)); 563 | deadzone(band) = deadzone(band)* 0.25; 564 | end 565 | 566 | for band = 1: Nb 567 | d = disturbance_dens (band); 568 | m = deadzone (band); 569 | 570 | if (d > m) 571 | disturbance_dens (band) = ... 572 | disturbance_dens (band)- m; 573 | else 574 | if (d < -m) 575 | disturbance_dens (band) = ... 576 | disturbance_dens (band)+ m; 577 | else 578 | disturbance_dens (band) = 0; 579 | end 580 | end 581 | end 582 | 583 | frame_disturbance( 1+ frame) = min (... 584 | frame_disturbance( 1+ frame), pseudo_Lp(... 
585 | disturbance_dens, D_POW_F)); 586 | disturbance_dens= multiply_with_asymmetry_factor ... 587 | (disturbance_dens, frame, pitch_pow_dens_ref, ... 588 | pitch_pow_dens_deg); 589 | frame_disturbance_asym_add(1+ frame) = min (... 590 | frame_disturbance_asym_add(1+ frame), ... 591 | pseudo_Lp (disturbance_dens, A_POW_F)); 592 | end 593 | end 594 | deg_data = untweaked_deg; 595 | end 596 | end 597 | 598 | for frame = 0: stop_frame 599 | h = 1; 600 | if (stop_frame + 1 > 1000) 601 | n = floor( (maxNsamples - 2 * SEARCHBUFFER * Downsample)... 602 | / (Nf / 2)) - 1; 603 | timeWeightFactor = (n - 1000) / 5500; 604 | if (timeWeightFactor > 0.5) 605 | timeWeightFactor = 0.5; 606 | end 607 | h = (1.0 - timeWeightFactor) + timeWeightFactor * frame / n; 608 | end 609 | 610 | time_weight (1 +frame) = h; 611 | end 612 | 613 | % fid= fopen( 'tmp_mat1.txt', 'at'); 614 | % fprintf( '\n'); 615 | for frame = 0: stop_frame 616 | h = ((total_power_ref (1+ frame) + 1e5) / 1e7)^ 0.04; 617 | % if (frame== 118) 618 | % fprintf( '%f\n', h); 619 | % fprintf( '%f\n', frame_disturbance( 1+ frame)); 620 | % end 621 | frame_disturbance( 1+ frame) = frame_disturbance( 1+ frame)/ h; 622 | 623 | % if (frame== 118) 624 | % fprintf( '%f\n', frame_disturbance( 1+ frame)); 625 | % end 626 | % 627 | frame_disturbance_asym_add( 1+ frame) = ... 628 | frame_disturbance_asym_add( 1+ frame)/ h; 629 | if (frame_disturbance( 1+ frame) > 45) 630 | frame_disturbance( 1+ frame) = 45; 631 | end 632 | if (frame_disturbance_asym_add( 1+ frame)> 45) 633 | frame_disturbance_asym_add( 1+ frame) = 45; 634 | end 635 | end 636 | % fclose ( fid); 637 | 638 | d_indicator = Lpq_weight (start_frame, stop_frame, ... 639 | D_POW_S, D_POW_T, frame_disturbance, time_weight); 640 | a_indicator = Lpq_weight (start_frame, stop_frame, ... 
641 | A_POW_S, A_POW_T, frame_disturbance_asym_add, time_weight); 642 | 643 | pesq_mos = 4.5 - D_WEIGHT * d_indicator - A_WEIGHT * a_indicator; 644 | 645 | if (Plot_Frame> 0) 646 | figure; 647 | subplot( 1, 2, 1); 648 | mesh( 0: stop_frame, centre_of_band_hz, D_disturbance'); 649 | title( 'disturbance'); 650 | subplot( 1, 2, 2); 651 | mesh( 0: stop_frame, centre_of_band_hz, DA_disturbance'); 652 | title( 'disturbance after asymmetry processing'); 653 | end 654 | 655 | % fid= fopen( 'tmp_mat.txt', 'wt'); 656 | % fprintf( fid, 'time_weight\n'); 657 | % fprintf( fid, '%f\n', time_weight); 658 | % fprintf( fid, 'frame_disturbance:\n'); 659 | % fprintf( fid, '%f\n', frame_disturbance); 660 | % fprintf( fid, 'frame_disturbance_asym_add\n'); 661 | % fprintf( fid, '%f\n', frame_disturbance_asym_add); 662 | % fclose( fid); 663 | 664 | function result_time= Lpq_weight(start_frame, stop_frame, ... 665 | power_syllable, power_time, frame_disturbance, time_weight) 666 | % Two-stage aggregation of per-frame disturbances: an L_p mean (exponent power_syllable) over half-overlapping windows of NUMBER_OF_PSQM_FRAMES_PER_SYLLABE frames, then a time_weight-weighted L_p mean (exponent power_time) across the windows. 667 | global NUMBER_OF_PSQM_FRAMES_PER_SYLLABE 668 | 669 | % fid= fopen( 'tmp_mat1.txt', 'at'); 670 | % fprintf( 'result_time:\n'); 671 | 672 | result_time= 0; 673 | total_time_weight_time = 0; 674 | % fprintf( 'start/end frame: %d/%d\n', start_frame, stop_frame); 675 | for start_frame_of_syllable = start_frame: ... 676 | NUMBER_OF_PSQM_FRAMES_PER_SYLLABE/2: stop_frame 677 | result_syllable = 0; 678 | count_syllable = 0; 679 | % count_syllable also counts frames past stop_frame, so a truncated window is still averaged over the full window length 680 | for frame = start_frame_of_syllable: ... 681 | start_frame_of_syllable + NUMBER_OF_PSQM_FRAMES_PER_SYLLABE- 1 682 | if (frame <= stop_frame) 683 | h = frame_disturbance(1+ frame); 684 | % if (start_frame_of_syllable== 101) 685 | % fprintf( fid, '%f\n', h); 686 | % end 687 | result_syllable = result_syllable+ (h^ power_syllable); 688 | end 689 | count_syllable = count_syllable+ 1; 690 | end 691 | 692 | result_syllable = result_syllable/ count_syllable; 693 | result_syllable = result_syllable^ (1/power_syllable); 694 | 695 | result_time= result_time+ (time_weight (...
696 | 1+ start_frame_of_syllable - start_frame) * ... 697 | result_syllable)^ power_time; 698 | total_time_weight_time = total_time_weight_time+ ... 699 | time_weight (1+ start_frame_of_syllable - start_frame)^ power_time; 700 | 701 | % fprintf( fid, '%f\n', result_time); 702 | end 703 | % fclose (fid); 704 | 705 | % fprintf( 'total_time_weight_time is %f\n', total_time_weight_time); 706 | result_time = result_time/ total_time_weight_time; 707 | result_time= result_time^ (1/ power_time); 708 | % fprintf( 'result_time is %f\n\n', result_time); 709 | 710 | 711 | function [best_delay, max_correlation] = compute_delay (... 712 | start_sample, stop_sample, search_range, ... 713 | time_series1, time_series2) 714 | % Search lags in [-search_range, search_range-1] for the peak of the normalised circular cross-correlation of |time_series1| and |time_series2| over start_sample:stop_sample. Returns the peak lag (minus one, see the last line) and the peak value; both are 0 when either segment is near-silent. 715 | n = stop_sample - start_sample+ 1; 716 | power_of_2 = 2^ (ceil( log2( 2 * n))); 717 | 718 | power1 = pow_of (time_series1, start_sample, stop_sample, n)* ... 719 | n/ power_of_2; 720 | power2 = pow_of (time_series2, start_sample, stop_sample, n)* ... 721 | n/ power_of_2; 722 | normalization = sqrt (power1 * power2); 723 | % fprintf( 'normalization is %f\n', normalization); 724 | % Near-silent segment: report "no correlation" and stop here; falling through would divide by a near-zero normalization and could yield a spurious peak. 725 | if ((power1 <= 1e-6) || (power2 <= 1e-6)) 726 | max_correlation = 0; 727 | best_delay= 0; return; 728 | end 729 | % zero-padded magnitude sequences, length power_of_2 >= 2*n 730 | x1( 1: power_of_2)= 0; 731 | x2( 1: power_of_2)= 0; 732 | y( 1: power_of_2)= 0; 733 | 734 | x1( 1: n)= abs( time_series1( start_sample: ... 735 | stop_sample)); 736 | x2( 1: n)= abs( time_series2( start_sample: ...
737 | stop_sample)); 738 | % cross-correlation via FFT: ifft( conj( X1).* X2) 739 | x1_fft= fft( x1, power_of_2)/ power_of_2; 740 | x2_fft= fft( x2, power_of_2); 741 | x1_fft_conj= conj( x1_fft); 742 | y= ifft( x1_fft_conj.* x2_fft, power_of_2); 743 | 744 | best_delay = 0; 745 | max_correlation = 0; 746 | 747 | % negative lags are read from the tail of y, non-negative lags from its head 748 | for i = -search_range: -1 749 | h = abs (y (1+ i + power_of_2)) / normalization; 750 | if (h > max_correlation) 751 | max_correlation = h; 752 | best_delay= i; 753 | end 754 | end 755 | for i = 0: search_range- 1 756 | h = abs (y (1+i)) / normalization; 757 | if (h > max_correlation) 758 | max_correlation = h; 759 | best_delay= i; 760 | end 761 | end 762 | best_delay= best_delay- 1; 763 | 764 | function mod_disturbance_dens= multiply_with_asymmetry_factor (... 765 | disturbance_dens, frame, pitch_pow_dens_ref, pitch_pow_dens_deg) 766 | % Scale each band's disturbance by ((deg+ 50)/ (ref+ 50))^ 1.2 for the given frame; the factor is capped at 12 and set to 0 when below 3. 767 | global Nb 768 | for i = 1: Nb 769 | ratio = (pitch_pow_dens_deg(1+ frame, i) + 50)... 770 | / (pitch_pow_dens_ref (1+ frame, i) + 50); 771 | h = ratio^ 1.2; 772 | if (h > 12) 773 | h = 12; 774 | elseif (h < 3) 775 | h = 0.0; 776 | end 777 | mod_disturbance_dens (i) = disturbance_dens (i) * h; 778 | end 779 | 780 | 781 | function loudness_dens = intensity_warping_of (... 782 | frame, pitch_pow_dens) 783 | % Convert one frame (0-based index) of pitch power density into per-band loudness density, scaled by Sl; bands at or below abs_thresh_power give 0. 784 | global abs_thresh_power Sl Nb centre_of_band_bark 785 | ZWICKER_POWER= 0.23; 786 | for band = 1: Nb 787 | threshold = abs_thresh_power (band); 788 | input = pitch_pow_dens (1+ frame, band); 789 | 790 | if (centre_of_band_bark (band) < 4) 791 | h = 6 / (centre_of_band_bark (band) + 2); 792 | else 793 | h = 1; 794 | end 795 | 796 | if (h > 2) 797 | h = 2; 798 | end 799 | h = h^ 0.15; 800 | modified_zwicker_power = ZWICKER_POWER * h; 801 | if (input > threshold) 802 | loudness_dens (band) = ((threshold / 0.5)^ modified_zwicker_power)...
803 | * ((0.5 + 0.5 * input / threshold)^ modified_zwicker_power- 1); 804 | else 805 | loudness_dens (band) = 0; 806 | end 807 | 808 | loudness_dens (band) = loudness_dens (band)* Sl; 809 | end 810 | 811 | function result= pseudo_Lp (x, p) 812 | % L_p-style aggregate of x over bands 2..Nb (band 1 is skipped): each |x| is multiplied by its bark width, the p-th powers are averaged by total width, the p-th root is taken and the result is rescaled by the total width. 813 | global Nb width_of_band_bark 814 | totalWeight = 0; 815 | result = 0; 816 | for band = 2: Nb 817 | h = abs (x (band)); 818 | w = width_of_band_bark (band); 819 | weighted = h * w; 820 | 821 | result = result+ weighted^ p; 822 | totalWeight = totalWeight+ w; 823 | end 824 | result = (result/ totalWeight)^ (1/p); 825 | result = result* totalWeight; 826 | 827 | 828 | function mod_pitch_pow_dens_ref= freq_resp_compensation (number_of_frames, ... 829 | pitch_pow_dens_ref, avg_pitch_pow_dens_ref, ... 830 | avg_pitch_pow_dens_deg, constant) 831 | % Multiply every frame of each reference band by (avg_deg+ constant)/ (avg_ref+ constant), with the ratio clamped to [0.01, 100]. 832 | global Nb 833 | 834 | for band = 1: Nb 835 | x = (avg_pitch_pow_dens_deg (band) + constant) / ... 836 | (avg_pitch_pow_dens_ref (band) + constant); 837 | if (x > 100.0) 838 | x = 100.0; 839 | elseif (x < 0.01) 840 | x = 0.01; 841 | end 842 | 843 | for frame = 1: number_of_frames 844 | mod_pitch_pow_dens_ref(frame, band) = ... 845 | pitch_pow_dens_ref(frame, band) * x; 846 | end 847 | end 848 | 849 | 850 | 851 | function avg_pitch_pow_dens= time_avg_audible_of(number_of_frames, ...
852 | silent, pitch_pow_dens, total_number_of_frames) 853 | % For each band, sum the power of the non-silent frames (among the first number_of_frames) that exceed 100x abs_thresh_power, then divide by total_number_of_frames. 854 | global Nb abs_thresh_power 855 | 856 | for band = 1: Nb 857 | result = 0; 858 | for frame = 1: number_of_frames 859 | if (~silent (frame)) 860 | h = pitch_pow_dens (frame, band); 861 | if (h > 100 * abs_thresh_power (band)) 862 | result = result + h; 863 | end 864 | end 865 | end 866 | avg_pitch_pow_dens (band) = result/ total_number_of_frames; 867 | % the average is stored once per band, after the frame loop, so it is defined even when number_of_frames is 0 868 | end 869 | 870 | 871 | 872 | function hz_spectrum= short_term_fft (Nf, data, Whanning, start_sample) 873 | % Power spectrum (first Nf/2 FFT bins) of the Nf samples starting at start_sample, weighted by Whanning; bin 1 (DC) is forced to 0. 874 | x1= data( start_sample: start_sample+ Nf-1).* Whanning; 875 | x1_fft= fft( x1); 876 | hz_spectrum= abs( x1_fft( 1: Nf/ 2)).^ 2; 877 | hz_spectrum( 1)= 0; 878 | 879 | 880 | function pitch_pow_dens= freq_warping( hz_spectrum, Nb, frame) 881 | % Sum consecutive FFT bins into Nb bark bands (bin counts from nr_of_hz_bands_per_bark_band) and scale by the per-band correction factor and Sp. The frame argument is not used. 882 | global nr_of_hz_bands_per_bark_band pow_dens_correction_factor 883 | global Sp 884 | 885 | hz_band = 1; 886 | for bark_band = 1: Nb 887 | n = nr_of_hz_bands_per_bark_band (bark_band); 888 | band_pow = 0; 889 | for i = 1: n 890 | band_pow = band_pow+ hz_spectrum( hz_band); 891 | hz_band= hz_band+ 1; 892 | end 893 | band_pow = band_pow* pow_dens_correction_factor (bark_band); 894 | band_pow = band_pow* Sp; 895 | pitch_pow_dens (bark_band) = band_pow; 896 | 897 | end 898 | 899 | 900 | function total_audible_pow = total_audible (frame, ...
901 | pitch_pow_dens, factor) 902 | % Sum, over bands 2..Nb of the given 0-based frame, the band powers that exceed factor* abs_thresh_power. 903 | global Nb abs_thresh_power 904 | 905 | total_audible_pow = 0; 906 | for band= 2: Nb 907 | h = pitch_pow_dens (frame+ 1,band); 908 | threshold = factor * abs_thresh_power (band); 909 | if (h > threshold) 910 | total_audible_pow = total_audible_pow+ h; 911 | end 912 | end 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | -------------------------------------------------------------------------------- /PESQ/pow_of.m: -------------------------------------------------------------------------------- 1 | function power= pow_of( data, start_point, end_point, divisor) 2 | % Sum of squares of data( start_point: end_point) divided by divisor. 3 | power= sum( data( start_point: end_point).^ 2)/ divisor; -------------------------------------------------------------------------------- /PESQ/setup_global.m: -------------------------------------------------------------------------------- 1 | function setup_global( sampling_rate); 2 | % Initialise the global constants, work arrays and band tables used by the PESQ routines; the rate-dependent globals are selected from the 8 kHz or 16 kHz tables according to sampling_rate. 3 | global Downsample InIIR_Hsos InIIR_Nsos Align_Nfft 4 | global DATAPADDING_MSECS SEARCHBUFFER Fs MINSPEECHLGTH JOINSPEECHLGTH 5 | 6 | global Nutterances Largest_uttsize Nsurf_samples Crude_DelayEst 7 | global Crude_DelayConf UttSearch_Start UttSearch_End Utt_DelayEst 8 | global Utt_Delay Utt_DelayConf Utt_Start Utt_End 9 | global MAXNUTTERANCES WHOLE_SIGNAL 10 | global pesq_mos subj_mos cond_nr MINUTTLENGTH 11 | global CALIBRATE Nfmax Nb Sl Sp 12 | global nr_of_hz_bands_per_bark_band centre_of_band_bark 13 | global width_of_band_hz centre_of_band_hz width_of_band_bark 14 | global pow_dens_correction_factor abs_thresh_power 15 | 16 | CALIBRATE= 0; 17 | Nfmax= 512; 18 | % per-utterance work arrays, reset on every call so no values persist from a previous run 19 | MAXNUTTERANCES= 50; 20 | MINUTTLENGTH= 50; 21 | WHOLE_SIGNAL= -1; 22 | UttSearch_Start= zeros( 1, MAXNUTTERANCES); 23 | UttSearch_End= zeros( 1, MAXNUTTERANCES); 24 | Utt_DelayEst= zeros( 1, MAXNUTTERANCES); 25 | Utt_Delay= zeros( 1, MAXNUTTERANCES); 26 | Utt_DelayConf= zeros( 1, MAXNUTTERANCES); 27 | Utt_Start= zeros( 1, MAXNUTTERANCES); 28 | Utt_End= zeros( 1, MAXNUTTERANCES); 29 | 30 |
DATAPADDING_MSECS= 320; 31 | SEARCHBUFFER= 75; 32 | MINSPEECHLGTH= 4; 33 | JOINSPEECHLGTH= 50; 34 | 35 | Sp_16k = 6.910853e-006; 36 | Sl_16k = 1.866055e-001; 37 | fs_16k= 16000; 38 | Downsample_16k = 64; 39 | Align_Nfft_16k = 1024; 40 | InIIR_Nsos_16k = 12; 41 | InIIR_Hsos_16k = [ 42 | 0.325631521, -0.086782860, -0.238848661, -1.079416490, 0.434583902; 43 | 0.403961804, -0.556985881, 0.153024077, -0.415115835, 0.696590244; 44 | 4.736162769, 3.287251046, 1.753289019, -1.859599046, 0.876284034; 45 | 0.365373469, 0.000000000, 0.000000000, -0.634626531, 0.000000000; 46 | 0.884811506, 0.000000000, 0.000000000, -0.256725271, 0.141536777; 47 | 0.723593055, -1.447186099, 0.723593044, -1.129587469, 0.657232737; 48 | 1.644910855, -1.817280902, 1.249658063, -1.778403899, 0.801724355; 49 | 0.633692689, -0.284644314, -0.319789663, 0.000000000, 0.000000000; 50 | 1.032763031, 0.268428979, 0.602913323, 0.000000000, 0.000000000; 51 | 1.001616361, -0.823749013, 0.439731942, -0.885778255, 0.000000000; 52 | 0.752472096, -0.375388990, 0.188977609, -0.077258216, 0.247230734; 53 | 1.023700575, 0.001661628, 0.521284240, -0.183867259, 0.354324187 54 | ]; 55 | 56 | Sp_8k = 2.764344e-5; 57 | Sl_8k = 1.866055e-1; 58 | fs_8k= 8000; 59 | Downsample_8k = 32; 60 | Align_Nfft_8k = 512; 61 | InIIR_Nsos_8k = 8; 62 | InIIR_Hsos_8k = [ 63 | 0.885535424, -0.885535424, 0.000000000, -0.771070709, 0.000000000; 64 | 0.895092588, 1.292907193, 0.449260174, 1.268869037, 0.442025372; 65 | 4.049527940, -7.865190042, 3.815662102, -1.746859852, 0.786305963; 66 | 0.500002353, -0.500002353, 0.000000000, 0.000000000, 0.000000000; 67 | 0.565002834, -0.241585934, -0.306009671, 0.259688659, 0.249979657; 68 | 2.115237288, 0.919935084, 1.141240051, -1.587313419, 0.665935315; 69 | 0.912224584, -0.224397719, -0.641121413, -0.246029464, -0.556720590; 70 | 0.444617727, -0.307589321, 0.141638062, -0.996391149, 0.502251622 71 | ]; 72 | 73 | nr_of_hz_bands_per_bark_band_8k = [ 74 | 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ... 
75 | 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ... 76 | 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ... 77 | 3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ... 78 | 9, 11 79 | ]; 80 | 81 | centre_of_band_bark_8k = [ 82 | 0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ... 83 | 1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ... 84 | 3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ... 85 | 5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ... 86 | 7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ... 87 | 9.334927, 9.776288, 10.223374, 10.676242, 11.134952,... 88 | 11.599563, 12.070135, 12.546731, 13.029408, 13.518232,... 89 | 14.013264, 14.514566, 15.022202, 15.536238, 16.056736,... 90 | 16.583761, 17.117382 91 | ]; 92 | 93 | centre_of_band_hz_8k = [ 94 | 7.867213, 31.634144, 63.655895, 96.124611, 129.044968,... 95 | 162.421738, 196.259659, 230.563568, 265.338348, 300.588867,... 96 | 336.320129, 372.537140, 409.244934, 446.448578, 484.568604,... 97 | 526.600586, 570.303833, 619.423340, 672.121643, 728.525696,... 98 | 785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,... 99 | 1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,... 100 | 1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,... 101 | 2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,... 102 | 3492.679932, 3820.219238 103 | ]; 104 | 105 | width_of_band_bark_8k = [ 106 | 0.157344, 0.317994, 0.322441, 0.326934, 0.331474, ... 107 | 0.336061, 0.340697, 0.345381, 0.350114, 0.354897, ... 108 | 0.359729, 0.364611, 0.369544, 0.374529, 0.379565, ... 109 | 0.384653, 0.389794, 0.394989, 0.400236, 0.405538, ... 110 | 0.410894, 0.416306, 0.421773, 0.427297, 0.432877, ... 111 | 0.438514, 0.444209, 0.449962, 0.455774, 0.461645, ... 112 | 0.467577, 0.473569, 0.479621, 0.485736, 0.491912, ... 113 | 0.498151, 0.504454, 0.510819, 0.517250, 0.523745, ... 114 | 0.530308, 0.536934 115 | ]; 116 | 117 | width_of_band_hz_8k = [ 118 | 15.734426, 31.799433, 32.244064, 32.693359, 33.147385, ... 
119 | 33.606140, 34.069702, 34.538116, 35.011429, 35.489655, ... 120 | 35.972870, 36.461121, 36.954407, 37.452911, 40.269653, ... 121 | 42.311859, 45.992554, 51.348511, 55.040527, 56.775208, ... 122 | 58.699402, 62.445862, 64.820923, 69.195374, 76.745667, ... 123 | 84.016235, 90.825684, 97.931152, 103.348877, 107.801880, ... 124 | 113.552246, 121.490601, 130.420410, 143.431763, 158.486816, ... 125 | 176.872803, 198.314697, 219.549561, 240.600098, 268.702393, ... 126 | 306.060059, 349.937012 127 | ]; 128 | 129 | pow_dens_correction_factor_8k = [ 130 | 100.000000, 99.999992, 100.000000, 100.000008, 100.000008,... 131 | 100.000015, 99.999992, 99.999969, 50.000027, 100.000000,... 132 | 99.999969, 100.000015, 99.999947, 100.000061, 53.047077, ... 133 | 110.000046, 117.991989, 65.000000, 68.760147, 69.999931, ... 134 | 71.428818, 75.000038, 76.843384, 80.968781, 88.646126, ... 135 | 63.864388, 68.155350, 72.547775, 75.584831, 58.379192,... 136 | 80.950836, 64.135651, 54.384785, 73.821884, 64.437073, ... 137 | 59.176456, 65.521278, 61.399822, 58.144047, 57.004543,... 138 | 64.126297, 59.248363 139 | ]; 140 | 141 | abs_thresh_power_8k = [ 142 | 51286152, 2454709.500, 70794.593750, ... 143 | 4897.788574, 1174.897705, 389.045166, ... 144 | 104.712860, 45.708820, 17.782795, ... 145 | 9.772372, 4.897789, 3.090296, ... 146 | 1.905461, 1.258925, 0.977237, ... 147 | 0.724436, 0.562341, 0.457088, ... 148 | 0.389045, 0.331131, 0.295121, ... 149 | 0.269153, 0.257040, 0.251189, ... 150 | 0.251189, 0.251189, 0.251189, ... 151 | 0.263027, 0.288403, 0.309030, ... 152 | 0.338844, 0.371535, 0.398107, ... 153 | 0.436516, 0.467735, 0.489779, ... 154 | 0.501187, 0.501187, 0.512861, ... 155 | 0.524807, 0.524807, 0.524807 156 | ]; 157 | 158 | nr_of_hz_bands_per_bark_band_16k = [ 159 | 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ... 160 | 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, ... 161 | 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, ... 162 | 3, 4, 5, 4, 5, 6, 6, 7, 8, 9, ... 
163 | 9, 12, 12, 15, 16, 18, 21, 25, 20 164 | ]; 165 | 166 | centre_of_band_bark_16k = [ 167 | 0.078672, 0.316341, 0.636559, 0.961246, 1.290450, ... 168 | 1.624217, 1.962597, 2.305636, 2.653383, 3.005889, ... 169 | 3.363201, 3.725371, 4.092449, 4.464486, 4.841533, ... 170 | 5.223642, 5.610866, 6.003256, 6.400869, 6.803755, ... 171 | 7.211971, 7.625571, 8.044611, 8.469146, 8.899232, ... 172 | 9.334927, 9.776288, 10.223374, 10.676242, 11.134952, ... 173 | 11.599563, 12.070135, 12.546731, 13.029408, 13.518232, ... 174 | 14.013264, 14.514566, 15.022202, 15.536238, 16.056736, ... 175 | 16.583761, 17.117382, 17.657663, 18.204674, 18.758478, ... 176 | 19.319147, 19.886751, 20.461355, 21.043034 177 | ]; 178 | 179 | centre_of_band_hz_16k = [ 180 | 7.867213, 31.634144, 63.655895, 96.124611, 129.044968,... 181 | 162.421738, 196.259659, 230.563568, 265.338348, 300.588867,... 182 | 336.320129, 372.537140, 409.244934, 446.448578, 484.568604,... 183 | 526.600586, 570.303833, 619.423340, 672.121643, 728.525696,... 184 | 785.675964, 846.835693, 909.691650, 977.063293, 1049.861694,... 185 | 1129.635986, 1217.257568, 1312.109497, 1412.501465, 1517.999390,... 186 | 1628.894165, 1746.194336, 1871.568848, 2008.776123, 2158.979248,... 187 | 2326.743164, 2513.787109, 2722.488770, 2952.586670, 3205.835449,... 188 | 3492.679932, 3820.219238, 4193.938477, 4619.846191, 5100.437012,... 189 | 5636.199219, 6234.313477, 6946.734863, 7796.473633 190 | ]; 191 | 192 | width_of_band_bark_16k = [ 193 | 0.157344, 0.317994, 0.322441, 0.326934, 0.331474,... 194 | 0.336061, 0.340697, 0.345381, 0.350114, 0.354897,... 195 | 0.359729, 0.364611, 0.369544, 0.374529, 0.379565,... 196 | 0.384653, 0.389794, 0.394989, 0.400236, 0.405538,... 197 | 0.410894, 0.416306, 0.421773, 0.427297, 0.432877,... 198 | 0.438514, 0.444209, 0.449962, 0.455774, 0.461645,... 199 | 0.467577, 0.473569, 0.479621, 0.485736, 0.491912,... 200 | 0.498151, 0.504454, 0.510819, 0.517250, 0.523745,... 
201 | 0.530308, 0.536934, 0.543629, 0.550390, 0.557220,... 202 | 0.564119, 0.571085, 0.578125, 0.585232 203 | ]; 204 | 205 | width_of_band_hz_16k = [ 206 | 15.734426, 31.799433, 32.244064, 32.693359, ... 207 | 33.147385, 33.606140, 34.069702, 34.538116, ... 208 | 35.011429, 35.489655, 35.972870, 36.461121, ... 209 | 36.954407, 37.452911, 40.269653, 42.311859, ... 210 | 45.992554, 51.348511, 55.040527, 56.775208, ... 211 | 58.699402, 62.445862, 64.820923, 69.195374, ... 212 | 76.745667, 84.016235, 90.825684, 97.931152, ... 213 | 103.348877, 107.801880, 113.552246, 121.490601, ... 214 | 130.420410, 143.431763, 158.486816, 176.872803, ... 215 | 198.314697, 219.549561, 240.600098, 268.702393, ... 216 | 306.060059, 349.937012, 398.686279, 454.713867, ... 217 | 506.841797, 564.863770, 637.261230, 794.717285, ... 218 | 931.068359 219 | ]; 220 | 221 | pow_dens_correction_factor_16k = [ 222 | 100.000000, 99.999992, 100.000000, 100.000008,... 223 | 100.000008, 100.000015, 99.999992, 99.999969, ... 224 | 50.000027, 100.000000, 99.999969, 100.000015, ... 225 | 99.999947, 100.000061, 53.047077, 110.000046, ... 226 | 117.991989, 65.000000, 68.760147, 69.999931, ... 227 | 71.428818, 75.000038, 76.843384, 80.968781, ... 228 | 88.646126, 63.864388, 68.155350, 72.547775, ... 229 | 75.584831, 58.379192, 80.950836, 64.135651, ... 230 | 54.384785, 73.821884, 64.437073, 59.176456, ... 231 | 65.521278, 61.399822, 58.144047, 57.004543, ... 232 | 64.126297, 54.311001, 61.114979, 55.077751, ... 233 | 56.849335, 55.628868, 53.137054, 54.985844, ... 234 | 79.546974 235 | ]; 236 | 237 | abs_thresh_power_16k = [ 238 | 51286152.00, 2454709.500, 70794.593750, ... 239 | 4897.788574, 1174.897705, 389.045166, ... 240 | 104.712860, 45.708820, 17.782795, ... 241 | 9.772372, 4.897789, 3.090296, ... 242 | 1.905461, 1.258925, 0.977237, ... 243 | 0.724436, 0.562341, 0.457088, ... 244 | 0.389045, 0.331131, 0.295121, ... 245 | 0.269153, 0.257040, 0.251189, ... 246 | 0.251189, 0.251189, 0.251189, ... 
247 | 0.263027, 0.288403, 0.309030, ... 248 | 0.338844, 0.371535, 0.398107, ... 249 | 0.436516, 0.467735, 0.489779, ... 250 | 0.501187, 0.501187, 0.512861, ... 251 | 0.524807, 0.524807, 0.524807, ... 252 | 0.512861, 0.478630, 0.426580, ... 253 | 0.371535, 0.363078, 0.416869, ... 254 | 0.537032 255 | ]; 256 | 257 | if (sampling_rate== fs_16k) 258 | Downsample = Downsample_16k; 259 | InIIR_Hsos = InIIR_Hsos_16k; 260 | InIIR_Nsos = InIIR_Nsos_16k; 261 | Align_Nfft = Align_Nfft_16k; 262 | Fs= fs_16k; 263 | 264 | Nb = 49; 265 | Sl = Sl_16k; 266 | Sp = Sp_16k; 267 | nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_16k; 268 | centre_of_band_bark = centre_of_band_bark_16k; 269 | centre_of_band_hz = centre_of_band_hz_16k; 270 | width_of_band_bark = width_of_band_bark_16k; 271 | width_of_band_hz = width_of_band_hz_16k; 272 | pow_dens_correction_factor = pow_dens_correction_factor_16k; 273 | abs_thresh_power = abs_thresh_power_16k; 274 | 275 | return; 276 | end 277 | 278 | if (sampling_rate== fs_8k) 279 | Downsample = Downsample_8k; 280 | InIIR_Hsos = InIIR_Hsos_8k; 281 | InIIR_Nsos = InIIR_Nsos_8k; 282 | Align_Nfft = Align_Nfft_8k; 283 | Fs= fs_8k; 284 | 285 | Nb = 42; 286 | Sl = Sl_8k; 287 | Sp = Sp_8k; 288 | nr_of_hz_bands_per_bark_band = nr_of_hz_bands_per_bark_band_8k; 289 | centre_of_band_bark = centre_of_band_bark_8k; 290 | centre_of_band_hz = centre_of_band_hz_8k; 291 | width_of_band_bark = width_of_band_bark_8k; 292 | width_of_band_hz = width_of_band_hz_8k; 293 | pow_dens_correction_factor = pow_dens_correction_factor_8k; 294 | abs_thresh_power = abs_thresh_power_8k; 295 | return; 296 | end 297 | 298 | 299 | 300 | 301 | -------------------------------------------------------------------------------- /PESQ/split_align.m: -------------------------------------------------------------------------------- 1 | function split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ... 2 | deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ... 
Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
    Utt_DelayEst_l, Utt_DelayConf_l)
% SPLIT_ALIGN  Test whether one utterance should be split into two parts
% that have different delays.
%
%   Candidate break points (at most 40 are evaluated) are laid out between
%   Utt_SpeechStart+Pad and Utt_SpeechEnd-Pad.  For every candidate:
%     1. crude_align is run on the part before and on the part after the
%        break point, giving the coarse delays Utt_ED1 / Utt_ED2;
%     2. a histogram of windowed cross-correlation peaks refines each side
%        to a delay (Utt_D1 / Utt_D2) and a confidence (Utt_DC1 / Utt_DC2).
%   The candidate with the largest Utt_DC1+Utt_DC2 whose two delays differ
%   by at least Downsample samples and whose confidences both exceed
%   Utt_DelayConf_l is published through the globals Best_ED1, Best_D1,
%   Best_DC1, Best_ED2, Best_D2, Best_DC2 and Best_BP.
%
%   Inputs
%     ref_data, deg_data         reference / degraded samples
%     deg_Nsamples               number of degraded samples
%     ref_logVAD, deg_logVAD     passed through to crude_align
%     Utt_Start_l, Utt_End_l     utterance limits; converted to sample
%                                indices with (k-1)*Downsample+1
%     Utt_SpeechStart/End        limits of the speech-active part
%     Utt_DelayEst_l             coarse delay estimate of the utterance
%     Utt_DelayConf_l            confidence both halves have to beat
%     ref_VAD, deg_VAD           not used in this function
%
%   Side effects
%     Best_DC1 and Best_DC2 are reset to 0 on entry; the remaining Best_*
%     globals are written only when a candidate qualifies.  Slot
%     MAXNUTTERANCES of Utt_DelayEst, UttSearch_Start and UttSearch_End is
%     overwritten as scratch space for crude_align.

global MAXNUTTERANCES Align_Nfft Downsample
global Utt_DelayEst Utt_Delay UttSearch_Start UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

Utt_BPs= zeros( 1, 41);
Utt_ED1= zeros( 1, 41);
Utt_ED2= zeros( 1, 41);
Utt_D1= zeros( 1, 41);
Utt_D2= zeros( 1, 41);
Utt_DC1= zeros( 1, 41);
Utt_DC2= zeros( 1, 41);

Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;
Utt_Test = MAXNUTTERANCES;          % scratch slot handed to crude_align
Best_DC1 = 0.0;
Best_DC2 = 0.0;
kernel = Align_Nfft / 64;           % half-width of the triangular smoothing
hop = Align_Nfft / 4;               % analysis windows overlap by 75 %
Delta = Align_Nfft / (4 * Downsample);
Step = floor( ((0.801 * Utt_Len + 40 * Delta - 1)/(40 * Delta)));
Step = Step* Delta;

Pad = floor( Utt_Len / 10);
if( Pad < 75 )
    Pad = 75;
end

% ---- lay out the candidate break points -------------------------------
Utt_BPs(1) = Utt_SpeechStart + Pad;
N_BPs = 1;
while( 1)
    N_BPs= N_BPs+ 1;
    Utt_BPs(N_BPs)= Utt_BPs(N_BPs- 1)+ Step;
    if (~((Utt_BPs(N_BPs) <= (Utt_SpeechEnd- Pad)) && (N_BPs <= 40) ))
        break;
    end
end

if( N_BPs <= 1 )
    return;
end

% ---- coarse delay on both sides of every break point ------------------
for bp = 1: N_BPs- 1
    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_Start_l;
    UttSearch_End(Utt_Test) = Utt_BPs(bp);
    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED1(bp) = Utt_Delay(Utt_Test);

    Utt_DelayEst(Utt_Test) = Utt_DelayEst_l;
    UttSearch_Start(Utt_Test) = Utt_BPs(bp);
    UttSearch_End(Utt_Test) = Utt_End_l;
    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, ...
        deg_Nsamples, MAXNUTTERANCES);
    Utt_ED2(bp) = Utt_Delay(Utt_Test);
end

% ---- first halves: walk forward from the utterance start --------------
Utt_DC1(1: N_BPs-1) = -2.0;         % -2 marks "not refined yet"
while( 1 )
    bp = 1;
    while( (bp <= N_BPs- 1) && (Utt_DC1(bp) > -2.0) )
        bp = bp+ 1;
    end
    if( bp >= N_BPs )
        break;
    end

    estdelay = Utt_ED1(bp);
    H = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_Start_l- 1) * Downsample+ 1;
    startd = startr + estdelay;
    % Indices are 1-based, so 0 is already out of range.  (The original
    % test was "startd < 0" and let startd == 0 through to deg_data(0:...).)
    if ( startd < 1 )
        startr = -estdelay+ 1;
        startd = 1;
    end

    while( ((startd + Align_Nfft) <= 1+ deg_Nsamples) &&...
            ((startr + Align_Nfft) <= (1+ (Utt_BPs(bp)- 1) * Downsample)) )
        [H, Hsum] = split_align_accumulate( H, Hsum, ref_data, deg_data, ...
            startr, startd, kernel);
        startr = startr+ hop;
        startd = startd+ hop;
    end
    [Utt_D1(bp), Utt_DC1(bp)] = split_align_peak( H, Hsum, estdelay);

    % Later break points with the same coarse delay reuse the histogram
    % built so far and only add the windows up to their own position.
    while( bp < (N_BPs - 1) )
        bp = bp + 1;
        if( (Utt_ED1(bp) == estdelay) && (Utt_DC1(bp) <= -2.0) )
            while( ((startd+ Align_Nfft)<= 1+ deg_Nsamples) && ...
                    ((startr+ Align_Nfft)<= ...
                    ((Utt_BPs(bp)- 1)* Downsample+ 1) ))
                [H, Hsum] = split_align_accumulate( H, Hsum, ref_data, ...
                    deg_data, startr, startd, kernel);
                startr = startr+ hop;
                startd = startd+ hop;
            end
            [Utt_D1(bp), Utt_DC1(bp)] = split_align_peak( H, Hsum, estdelay);
        end
    end
end

% Second halves are only refined where the first half was convincing.
for bp= 1: N_BPs- 1
    if( Utt_DC1(bp) > Utt_DelayConf_l )
        Utt_DC2(bp) = -2.0;
    else
        Utt_DC2(bp) = 0.0;
    end
end

% ---- second halves: walk backward from the utterance end --------------
while( 1 )
    bp = N_BPs- 1;
    while( (bp >= 1) && (Utt_DC2(bp) > -2.0) )
        bp = bp- 1;
    end
    if( bp < 1 )
        break;
    end

    estdelay = Utt_ED2(bp);
    H = zeros( 1, Align_Nfft);
    Hsum = 0.0;

    startr = (Utt_End_l- 1)* Downsample+ 1- Align_Nfft;
    startd = startr + estdelay;
    if ( (startd + Align_Nfft) > deg_Nsamples+ 1 )
        startd = deg_Nsamples - Align_Nfft+ 1;
        startr = startd - estdelay;
    end

    while( (startd>= 1) && (startr>= (Utt_BPs(bp)- 1)* Downsample+ 1) )
        [H, Hsum] = split_align_accumulate( H, Hsum, ref_data, deg_data, ...
            startr, startd, kernel);
        startr = startr- hop;
        startd = startd- hop;
    end
    [Utt_D2(bp), Utt_DC2(bp)] = split_align_peak( H, Hsum, estdelay);

    while( bp > 1 )
        bp = bp - 1;
        if( (Utt_ED2(bp) == estdelay) && (Utt_DC2(bp) <= -2.0) )
            while( (startd >= 1) && ...
                    (startr >= (Utt_BPs(bp)- 1) * Downsample+ 1))
                [H, Hsum] = split_align_accumulate( H, Hsum, ref_data, ...
                    deg_data, startr, startd, kernel);
                startr = startr- hop;
                startd = startd- hop;
            end
            [Utt_D2(bp), Utt_DC2(bp)] = split_align_peak( H, Hsum, estdelay);
        end
    end
end

% ---- keep the most convincing split ------------------------------------
for bp = 1: N_BPs- 1
    if( (abs(Utt_D2(bp) - Utt_D1(bp)) >= Downsample) && ...
            ((Utt_DC1(bp)+ Utt_DC2(bp))> (Best_DC1 + Best_DC2)) &&...
            (Utt_DC1(bp) > Utt_DelayConf_l) && ...
            (Utt_DC2(bp) > Utt_DelayConf_l) )
        Best_ED1 = Utt_ED1(bp);
        Best_D1 = Utt_D1(bp);
        Best_DC1 = Utt_DC1(bp);
        Best_ED2 = Utt_ED2(bp);
        Best_D2 = Utt_D2(bp);
        Best_DC2 = Utt_DC2(bp);
        Best_BP = Utt_BPs(bp);
    end
end


function [H, Hsum] = split_align_accumulate( H, Hsum, ref_data, deg_data, ...
    startr, startd, kernel)
% Add the cross-correlation peaks of one Align_Nfft-sample window pair to
% the delay histogram H (triangular spread of half-width kernel, circular
% in the lag index) and add the corresponding weight to Hsum.
global Align_Nfft Window

X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

% circular cross-correlation through the frequency domain
X1= ifft( conj( fft( X1, Align_Nfft)).* fft( X2, Align_Nfft), Align_Nfft);
X1= abs( X1);

v_max = max( X1)* 0.99;             % everything within 1 % of the maximum
n_max = (v_max^ 0.125)/ kernel;

peaks = find( X1 > v_max);
for p = 1: numel( peaks)
    count = peaks(p)- 1;            % zero-based lag, as in the plain loop
    Hsum = Hsum+ n_max * kernel;
    for k = 1-kernel: kernel- 1
        idx = 1+ rem( count+ k+ Align_Nfft, Align_Nfft);
        H(idx)= H(idx)+ n_max* (kernel- abs(k));
    end
end


function [delay, conf] = split_align_peak( H, Hsum, estdelay)
% Turn the histogram into a delay (estdelay plus the signed lag of the
% largest bin) and a confidence (largest bin divided by Hsum, or 0 when
% nothing was accumulated).
global Align_Nfft

[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;      % upper half of the bins = negative lags
end

delay = estdelay + I_max- 1;
if( Hsum > 0.0 )
    conf = v_max / Hsum;
else
    conf = 0.0;
end

% Disabled debug dump of the split_align tables (kept from the original):
% if (Utt_Len== 236)
%     fid= fopen( 'matmat.txt', 'wt');
%     fprintf( fid, 'N_BPs is %d\n', N_BPs);
%     fprintf( fid, 'Utt_DelayConf is %f\n', Utt_DelayConf_l);
%     fprintf( fid, 'ED2\t ED1\t D2\t D1\t DC2\t DC1\t BPs\n');
%     for bp= 1: N_BPs- 1
%         fprintf( fid, '%d\t %d\t %d\t %d\t %f\t %f\t %d\n', Utt_ED2( bp), ...
%             Utt_ED1( bp), Utt_D2(bp), Utt_D1(bp), Utt_DC2(bp),...
%             Utt_DC1( bp), Utt_BPs( bp));
%     end
%     fclose( fid);
% end

--------------------------------------------------------------------------------
/PESQ/stoi.m:
--------------------------------------------------------------------------------
function d = stoi(x, y, fs_signal)
%   d = stoi(x, y, fs_signal) returns the output of the short-time
%   objective intelligibility (STOI) measure described in [1, 2], where x
%   and y denote the clean and processed speech, respectively, with sample
%   rate fs_signal in Hz. The output d is expected to have a monotonic
%   relation with the subjective speech-intelligibility, where a higher d
%   denotes better intelligible speech. See [1, 2] for more details.
%
%   Behaviour specific to this copy:
%     * If x and y differ in length, the longer one is truncated to the
%       length of the shorter one (the original raised an error).
%     * If fewer than 30 analysis frames remain after silent-frame removal
%       no score can be computed; d is NaN and the warning
%       'stoi:tooShort' is issued.
%
%   References:
%      [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
%      Objective Intelligibility Measure for Time-Frequency Weighted Noisy
%      Speech', ICASSP 2010, Texas, Dallas.
%
%      [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
%      Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
%      IEEE Transactions on Audio, Speech, and Language Processing, 2011.
%
%
%   Copyright 2009: Delft University of Technology, Signal & Information
%   Processing Lab. The software is free for non-commercial use. This program
%   comes WITHOUT ANY WARRANTY.
%
%
%
%   Updates:
%      2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'

if length(x)~=length(y)
    % tolerant on purpose: trim instead of raising an error
    if length(x)>length(y)
        x=x(1:length(y));
    else
        y=y(1:length(x));
    end
end

% initialization
x           = x(:);                             % clean speech column vector
y           = y(:);                             % processed speech column vector

fs          = 10000;                            % sample rate of proposed intelligibility measure
N_frame     = 256;                              % window support
K           = 512;                              % FFT size
J           = 15;                               % Number of 1/3 octave bands
mn          = 150;                              % Center frequency of first 1/3 octave band in Hz.
H           = thirdoct(fs, K, J, mn);           % Get 1/3 octave band matrix
N           = 30;                               % Number of frames for intermediate intelligibility measure (Length analysis window)
Beta        = -15;                              % lower SDR-bound
dyn_range   = 40;                               % speech dynamic range
hop         = N_frame/2;                        % frame advance

% resample signals if other samplerate is used than fs
if fs_signal ~= fs
    x = resample(x, fs, fs_signal);
    y = resample(y, fs, fs_signal);
end

% remove silent frames
[x, y] = removeSilentFrames(x, y, dyn_range, N_frame, hop);

% Same frame count stdft will produce.  With fewer than N frames there is
% not a single analysis segment and the mean below would be taken over an
% empty set, so report that explicitly instead of returning a silent NaN.
n_frames = numel(1:hop:(length(x)-N_frame));
if n_frames < N
    warning('stoi:tooShort', ...
        'Only %d frame(s) left after silent-frame removal, at least %d are needed; returning NaN.', ...
        n_frames, N);
    d = NaN;
    return;
end

% apply 1/3 octave band TF-decomposition
x_hat = stdft(x, N_frame, hop, K);              % apply short-time DFT to clean speech
y_hat = stdft(y, N_frame, hop, K);              % apply short-time DFT to processed speech

x_hat = x_hat(:, 1:(K/2+1)).';                  % take clean single-sided spectrum
y_hat = y_hat(:, 1:(K/2+1)).';                  % take processed single-sided spectrum

X = zeros(J, size(x_hat, 2));                   % init memory for clean speech 1/3 octave band TF-representation
Y = zeros(J, size(y_hat, 2));                   % init memory for processed speech 1/3 octave band TF-representation

for i = 1:size(x_hat, 2)
    X(:, i) = sqrt(H*abs(x_hat(:, i)).^2);      % apply 1/3 octave bands as described in Eq.(1) [1]
    Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2);
end

% loop all segments of length N and obtain intermediate intelligibility measure for all TF-regions
d_interm = zeros(J, length(N:size(X, 2)));      % init memory for intermediate intelligibility measure
c        = 10^(-Beta/20);                       % constant for clipping procedure

for m = N:size(X, 2)
    X_seg  = X(:, (m-N+1):m);                   % region with length N of clean TF-units for all j
    Y_seg  = Y(:, (m-N+1):m);                   % region with length N of processed TF-units for all j
    alpha  = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2)); % obtain scale factor for normalizing processed TF-region for all j
    aY_seg = Y_seg.*repmat(alpha, [1 N]);       % obtain \alpha*Y_j(n) from Eq.(2) [1]
    for j = 1:J
        Y_prime = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c);      % apply clipping from Eq.(3)
        d_interm(j, m-N+1) = taa_corr(X_seg(j, :).', Y_prime(:));    % obtain correlation coefficient from Eq.(4) [1]
    end
end

d = mean(d_interm(:));                          % combine all intermediate intelligibility measures as in Eq.(4) [1]

%%
function [A, cf] = thirdoct(fs, N_fft, numBands, mn)
%   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
%   inputs:
%       FS:         samplerate
%       N_FFT:      FFT size
%       NUMBANDS:   number of bands
%       MN:         center frequency of first 1/3 octave band
%   outputs:
%       A:          octave band matrix (one row per band, one column per
%                   bin of the single-sided spectrum, entries 0 or 1)
%       CF:         center frequencies

f  = linspace(0, fs, N_fft+1);
f  = f(1:(N_fft/2+1));
k  = 0:(numBands-1);
cf = 2.^(k/3)*mn;
fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);      % lower band edges
fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);      % upper band edges
A  = zeros(numBands, length(f));

for i = 1:(length(cf))
    % snap both edges to the nearest DFT bin
    [unused, lo_bin] = min((f-fl(i)).^2);
    fl(i) = f(lo_bin);

    [unused, hi_bin] = min((f-fr(i)).^2);
    fr(i) = f(hi_bin);

    A(i, lo_bin:(hi_bin-1)) = 1;
end

% drop trailing bands that are empty or narrower than their predecessor
rnk      = sum(A, 2);
numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
A        = A(1:numBands, :);
cf       = cf(1:numBands);

%%
function x_stdft = stdft(x, N, K, N_fft)
%   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
%   hanning-windowed dft of X with frame-size N, frame advance K and DFT
%   size N_FFT. The rows and columns of X_STDFT denote the frame-index and
%   dft-bin index, respectively.

frames  = 1:K:(length(x)-N);
x_stdft = zeros(length(frames), N_fft);

w = hanning(N);
x = x(:);

for i = 1:length(frames)
    ii = frames(i):(frames(i)+N-1);
    x_stdft(i, :) = fft(x(ii).*w, N_fft);
end

%%
function [x_sil, y_sil] = removeSilentFrames(x, y, range, N, K)
%   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
%   are segmented with frame-length N and frame advance K, where the
%   maximum energy of all frames of X is determined, say X_MAX. X_SIL and
%   Y_SIL are the reconstructed signals, excluding the frames, where the
%   energy of a frame of X is smaller than X_MAX-RANGE.
%
%   Both outputs are empty (0x1) when no frame is retained, i.e. when X is
%   not longer than one frame or when no frame passes the energy test (for
%   example an all-zero X).  The original indexed an undefined variable in
%   that situation.

x = x(:);
y = y(:);

frames = 1:K:(length(x)-N);
if isempty(frames)
    x_sil = zeros(0, 1);
    y_sil = zeros(0, 1);
    return;
end

w   = hanning(N);
msk = zeros(size(frames));

for j = 1:length(frames)
    jj     = frames(j):(frames(j)+N-1);
    msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N));
end

msk   = (msk-max(msk)+range)>0;
count = 1;

x_sil = zeros(size(x));
y_sil = zeros(size(y));

% overlap-add the retained frames back to back
for j = 1:length(frames)
    if msk(j)
        jj_i = frames(j):(frames(j)+N-1);
        jj_o = frames(count):(frames(count)+N-1);
        x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w;
        y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w;
        count = count+1;
    end
end

n_kept = count-1;
if n_kept == 0
    x_sil = zeros(0, 1);
    y_sil = zeros(0, 1);
    return;
end

last  = frames(n_kept)+N-1;                     % end of the last frame written
x_sil = x_sil(1:last);
y_sil = y_sil(1:last);

%%
function rho = taa_corr(x, y)
%   RHO = TAA_CORR(X, Y) Returns correlation coefficient between column
%   vectors x and y. Gives same results as 'corr' from statistics toolbox.
xn  = x-mean(x);
xn  = xn/sqrt(sum(xn.^2));
yn  = y-mean(y);
yn  = yn/sqrt(sum(yn.^2));
rho = sum(xn.*yn);

--------------------------------------------------------------------------------
/PESQ/time_align.m:
--------------------------------------------------------------------------------
function time_align(ref_data, ref_Nsamples, ...
    deg_data, deg_Nsamples, Utt_id)
% TIME_ALIGN  Refine the delay estimate of search window Utt_id.
%
%   Starting from the coarse estimate Utt_DelayEst(Utt_id), overlapping
%   windows of Align_Nfft samples (advance Align_Nfft/4) of the reference
%   and of the delay-shifted degraded signal are cross-correlated.  Every
%   lag whose correlation magnitude is within 1 % of the window maximum
%   adds v_max^0.125 to a lag histogram.  The histogram is smoothed with a
%   triangular kernel, normalised by its sum, and its largest bin yields
%       Utt_Delay(Utt_id)      = coarse estimate + signed lag of the peak
%       Utt_DelayConf(Utt_id)  = height of the normalised peak
%
%   Inputs
%     ref_data, deg_data   reference / degraded samples
%     deg_Nsamples         number of degraded samples
%     Utt_id               index into the global utterance tables
%     ref_Nsamples         not used in this function
%
%   Globals read:    Utt_DelayEst UttSearch_Start UttSearch_End
%                    Align_Nfft Downsample Window
%   Globals written: Utt_Delay Utt_DelayConf

global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start UttSearch_End
global Align_Nfft Downsample Window

estdelay = Utt_DelayEst(Utt_id);

H = zeros( 1, Align_Nfft);

startr = (UttSearch_Start(Utt_id)- 1)* Downsample+ 1;
startd = startr + estdelay;
% Indices are 1-based, so 0 is already out of range.  (The original test
% was "startd < 0" and let startd == 0 through to deg_data(0:...).)
if ( startd < 1 )
    startr = 1 -estdelay;
    startd = 1;
end

% NOTE(review): split_align uses "<= 1+ deg_Nsamples" and
% "<= (End- 1)* Downsample+ 1" for the equivalent loop, i.e. it accepts one
% more window position than the bounds below.  Left unchanged here because
% it affects the scores -- confirm against the reference implementation.
while( ((startd + Align_Nfft) <= deg_Nsamples) && ...
        ((startr + Align_Nfft) <= ((UttSearch_End(Utt_id)- 1) * Downsample)) )
    X1= ref_data( startr: startr+ Align_Nfft- 1).* Window;
    X2= deg_data( startd: startd+ Align_Nfft- 1).* Window;

    % circular cross-correlation between X1 and X2
    X1= ifft( conj( fft( X1, Align_Nfft )).* fft( X2, Align_Nfft ), ...
        Align_Nfft );
    X1= abs( X1);

    v_max = max( X1)* 0.99;
    near_peak = ( X1 > v_max );
    H( near_peak )= H( near_peak )+ v_max^ 0.125;

    startr = startr+ Align_Nfft/ 4;
    startd = startd+ Align_Nfft/ 4;
end

Hsum = sum( H);

% triangular smoothing kernel, symmetric around lag 0 (wraps circularly)
kernel = Align_Nfft / 64;
X2= 0;
X2(1) = 1.0;
for count= 2: kernel
    X2( count)= 1- (count- 1)/ kernel;
    X2( Align_Nfft- count+ 2)= 1- (count- 1)/ kernel;
end

% circular convolution of the histogram with the kernel
X1= ifft( fft( H, Align_Nfft ).* fft( X2, Align_Nfft ), Align_Nfft );

if (Hsum> 0)
    H= abs( X1)/ Hsum;
else
    H= 0;
end

[v_max, I_max] = max( H);
if( I_max- 1 >= (Align_Nfft/2) )
    I_max = I_max- Align_Nfft;      % upper half of the bins = negative lags
end

Utt_Delay(Utt_id) = estdelay + I_max- 1;
Utt_DelayConf(Utt_id) = v_max;      % confidence

--------------------------------------------------------------------------------
/PESQ/utterance_locate.m:
--------------------------------------------------------------------------------
function utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
% UTTERANCE_LOCATE  Run the utterance alignment stages in order:
%   id_searchwindows, then crude_align followed by time_align for each of
%   the Nutterances search windows, then id_utterances and finally
%   utterance_split.  Results are exchanged through global variables.

global Nutterances Utt_Delay Utt_DelayConf Utt_Start Utt_End Utt_DelayEst

id_searchwindows( ref_VAD, ref_Nsamples, deg_VAD, deg_Nsamples);

for Utt_id= 1: Nutterances
    crude_align( ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples, Utt_id);
    time_align(ref_data, ref_Nsamples, ...
        deg_data, deg_Nsamples, Utt_id);
end

id_utterances( ref_Nsamples, ref_VAD, deg_Nsamples);

utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

--------------------------------------------------------------------------------
/PESQ/utterance_split.m:
--------------------------------------------------------------------------------
function utterance_split( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD)
% UTTERANCE_SPLIT  Split utterances whose two halves align better with
% different delays.
%
%   Every utterance in the global tables is trimmed to its speech-active
%   part using ref_VAD.  If that part is at least 200 frames long,
%   split_align searches for a break point.  When both halves reach a
%   confidence above the utterance's own Utt_DelayConf, the utterance is
%   replaced by two entries (later entries move up by one, Nutterances is
%   incremented) and the first of the two is examined again, so it may be
%   split further.  Otherwise the next utterance is examined.
%
%   Splitting stops as soon as Nutterances reaches MAXNUTTERANCES:
%   split_align uses table slot MAXNUTTERANCES as scratch space, so that
%   slot must never hold a real utterance while it runs.  (The original
%   condition "Nutterances <= MAXNUTTERANCES" allowed exactly that.)
%
%   Globals updated: Nutterances, Utt_DelayEst, Utt_Delay, Utt_DelayConf,
%   Utt_Start, Utt_End, UttSearch_Start, UttSearch_End, Largest_uttsize.

global Nutterances MAXNUTTERANCES Downsample SEARCHBUFFER
global Utt_DelayEst Utt_Delay Utt_DelayConf UttSearch_Start
global Utt_Start Utt_End Largest_uttsize UttSearch_End
global Best_ED1 Best_D1 Best_DC1 Best_ED2 Best_D2 Best_DC2 Best_BP

Utt_id = 1;
while( (Utt_id <= Nutterances) && (Nutterances < MAXNUTTERANCES) )
    Utt_DelayEst_l = Utt_DelayEst(Utt_id);
    Utt_DelayConf_l = Utt_DelayConf(Utt_id);
    Utt_Start_l = Utt_Start(Utt_id);
    Utt_End_l = Utt_End(Utt_id);

    % first frame of the utterance with ref_VAD > 0
    Utt_SpeechStart = Utt_Start_l;
    while( (Utt_SpeechStart < Utt_End_l) && ...
            (ref_VAD(Utt_SpeechStart)<= 0.0) )
        Utt_SpeechStart = Utt_SpeechStart + 1;
    end

    % one past the last frame of the utterance with ref_VAD > 0
    Utt_SpeechEnd = Utt_End_l;
    while( (Utt_SpeechEnd > Utt_Start_l) && ...
            (ref_VAD(Utt_SpeechEnd) <= 0))
        Utt_SpeechEnd = Utt_SpeechEnd- 1;
    end
    Utt_SpeechEnd = Utt_SpeechEnd+ 1;

    Utt_Len = Utt_SpeechEnd - Utt_SpeechStart;

    if( Utt_Len < 200 )
        % too short to be worth splitting
        Utt_id = Utt_id+ 1;
        continue;
    end

    split_align( ref_data, ref_Nsamples, ref_VAD, ref_logVAD, ...
        deg_data, deg_Nsamples, deg_VAD, deg_logVAD, ...
        Utt_Start_l, Utt_SpeechStart, Utt_SpeechEnd, Utt_End_l, ...
        Utt_DelayEst_l, Utt_DelayConf_l);

    if( ~((Best_DC1 > Utt_DelayConf_l) && (Best_DC2 > Utt_DelayConf_l)) )
        % no split beats the single-delay alignment
        Utt_id= Utt_id+ 1;
        continue;
    end

    % make room for the second half directly behind Utt_id
    for step = Nutterances: -1: Utt_id+ 1
        Utt_DelayEst(step+ 1) = Utt_DelayEst(step);
        Utt_Delay(step+ 1) = Utt_Delay(step);
        Utt_DelayConf(step+ 1) = Utt_DelayConf(step);
        Utt_Start(step+ 1) = Utt_Start(step);
        Utt_End(step+ 1) = Utt_End(step);
        UttSearch_Start(step+ 1) = Utt_Start( step);
        UttSearch_End(step+ 1) = Utt_End( step);
    end

    Nutterances = Nutterances+ 1;

    Utt_DelayEst(Utt_id) = Best_ED1;
    Utt_Delay(Utt_id) = Best_D1;
    Utt_DelayConf(Utt_id) = Best_DC1;

    Utt_DelayEst(Utt_id +1) = Best_ED2;
    Utt_Delay(Utt_id +1) = Best_D2;
    Utt_DelayConf(Utt_id +1) = Best_DC2;

    UttSearch_Start(Utt_id +1) = UttSearch_Start(Utt_id);
    UttSearch_End(Utt_id +1) = UttSearch_End( Utt_id);

    Utt_Start(Utt_id) = Utt_Start_l;
    Utt_End(Utt_id +1) = Utt_End_l;
    if( Best_D2 < Best_D1 )
        Utt_End(Utt_id) = Best_BP;
        Utt_Start(Utt_id +1) = Best_BP;
    else
        % the second half is delayed at least as much as the first: let
        % the two parts overlap by half the delay difference on each side
        half_gap = floor( (Best_D2- Best_D1)/ (2 * Downsample));
        Utt_End( Utt_id) = Best_BP + half_gap;
        Utt_Start( Utt_id +1) = Best_BP - half_gap;
    end

    % keep the first part from starting before the degraded signal
    if( (Utt_Start(Utt_id)- SEARCHBUFFER- 1)* Downsample+ 1+ ...
            Best_D1 < 0 )
        Utt_Start(Utt_id) = SEARCHBUFFER+ 1+ ...
            floor( (Downsample - 1 - Best_D1) / Downsample);
    end

    % keep the second part from running past the degraded signal
    if( ((Utt_End( Utt_id +1)- 1)* Downsample+ 1 + Best_D2) >...
            (deg_Nsamples - SEARCHBUFFER * Downsample) )
        Utt_End( Utt_id +1) = floor( (deg_Nsamples - Best_D2)...
            / Downsample)- SEARCHBUFFER+ 1;
    end
    % Utt_id is not advanced: the shortened first part is examined again.
end

Largest_uttsize = max( Utt_End- Utt_Start);

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Speech-measure-SDR-SAR-STOI-PESQ
Speech quality measure of SDR、SAR、STOI、ESTOI、PESQ via MATLAB

you can take a reference in /result for your sample
author: dakenan1 262589340@qq.com

References:
* __SDR/SAR/SIR__
  * Toolbox: [BSS Eval](http://bass-db.gforge.inria.fr/bss_eval/), [The PEASS Toolkit](http://bass-db.gforge.inria.fr/peass/), [craffel/mir_eval/separation.py](https://github.com/craffel/mir_eval/blob/master/mir_eval/separation.py)
  * Paper: [Performance measurement in blind audio source separation](https://ieeexplore.ieee.org/document/1643671/)
* __STOI__
  * Toolbox: [stoi.zip](http://insy.ewi.tudelft.nl/content/short-time-objective-intelligibility-measure)+[actuallyaswin/stoi](https://github.com/actuallyaswin/stoi),
[mpariente/pystoi](https://github.com/mpariente/pystoi)
  * Paper: [A short-time objective intelligibility measure for time-frequency weighted noisy speech](https://ieeexplore.ieee.org/document/5495701/)
* __ESTOI__
  * Toolbox: [estoi.m](http://kom.aau.dk/~jje/code/estoi.m)
  * Paper: [An Algorithm for Predicting the Intelligibility of Speech Masked by Modulated Noise Maskers](https://ieeexplore.ieee.org/document/7539284/)
* __PESQ__
  * Toolbox: [pesq.m](https://github.com/JacobD10/SoundZone_Tools/blob/master/pesq2.m), [MATLAB software-composite](http://ecs.utdallas.edu/loizou/speech/software.htm)
  * Paper: [Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs](https://ieeexplore.ieee.org/document/941023/)

--------------------------------------------------------------------------------
/bss_eval_sources.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/bss_eval_sources.m

--------------------------------------------------------------------------------
/estoi.m:
--------------------------------------------------------------------------------
function d = estoi(x, y, fs_signal)
%   d = estoi(x, y, fs_signal) returns the output of the extended short-time
%   objective intelligibility (ESTOI) predictor.
%
% Implementation of the Extended Short-Time Objective
% Intelligibility (ESTOI) predictor, described in Jesper Jensen and
% Cees H. Taal, "An Algorithm for Predicting the Intelligibility of
% Speech Masked by Modulated Noise Maskers," IEEE Transactions on
% Audio, Speech and Language Processing, 2016.
%
% Input:
%        x:         clean reference time domain signal
%        y:         noisy/processed time domain signal
%        fs_signal: sampling rate [Hz]
%
% Output:
%        d: intelligibility index
%
% x and y must have the same length, otherwise an error is raised.  A tiny
% random perturbation (eps*randn) is added inside the computation, so
% repeated calls agree only up to that perturbation.
%
%
% Copyright 2016: Aalborg University, Section for Signal and Information Processing.
% The software is free for non-commercial use.
% The software comes WITHOUT ANY WARRANTY.


if length(x)~=length(y)
    error('x and y should have the same length');
end

% ---- analysis constants -------------------------------------------------
x = x(:);                                   % clean speech as a column
y = y(:);                                   % processed speech as a column

fs        = 10000;                          % internal sample rate [Hz]
N_frame   = 256;                            % frame length [samples]
K         = 512;                            % DFT size
J         = 15;                             % number of 1/3 octave bands
mn        = 150;                            % centre frequency of the lowest band [Hz]
H         = thirdoct(fs, K, J, mn);         % band-grouping matrix
N         = 30;                             % frames per analysis segment
dyn_range = 40;                             % speech dynamic range
hop       = N_frame/2;                      % frame advance [samples]

% ---- bring both signals to the internal rate ---------------------------
if fs_signal ~= fs
    x = resample(x, fs, fs_signal);
    y = resample(y, fs, fs_signal);
end

% ---- discard frames in which the clean signal is silent ----------------
[x y] = removeSilentFrames(x, y, dyn_range, N_frame, hop);

% ---- 1/3 octave band time-frequency decomposition ----------------------
spec_x = stdft(x, N_frame, hop, K);         % frames x bins, clean
spec_y = stdft(y, N_frame, hop, K);         % frames x bins, processed

spec_x = spec_x(:, 1:(K/2+1)).';            % single-sided, bins x frames
spec_y = spec_y(:, 1:(K/2+1)).';

X = zeros(J, size(spec_x, 2));              % band envelopes, clean
Y = zeros(J, size(spec_y, 2));              % band envelopes, processed

for frame = 1:size(spec_x, 2)
    X(:, frame) = sqrt(H*abs(spec_x(:, frame)).^2);
    Y(:, frame) = sqrt(H*abs(spec_y(:, frame)).^2);
end

% ---- one intermediate score per segment of N consecutive frames --------
n_frames = size(X, 2);
d1 = zeros(length(N:n_frames),1);
for m=N:n_frames
    cols  = (m-N+1):m;
    X_seg = X(:, cols);
    Y_seg = Y(:, cols);
    X_seg = X_seg + eps*randn(size(X_seg));      % to avoid divide by zero
    Y_seg = Y_seg + eps*randn(size(Y_seg));      % to avoid divide by zero

    % rows first: zero mean ...
    XX = X_seg - mean(X_seg.').'*ones(1,N);
    YY = Y_seg - mean(Y_seg.').'*ones(1,N);

    % ... and unit length
    YY = diag(1./sqrt(diag(YY*YY')))*YY;
    XX = diag(1./sqrt(diag(XX*XX')))*XX;

    XX = XX + eps*randn(size(XX));               % to avoid corr.div.by.0
    YY = YY + eps*randn(size(YY));               % to avoid corr.div.by.0

    % then columns: zero mean ...
    YYY = YY - ones(J,1)*mean(YY);
    XXX = XX - ones(J,1)*mean(XX);

    % ... and unit length
    YYY = YYY*diag(1./sqrt(diag(YYY'*YYY)));
    XXX = XXX*diag(1./sqrt(diag(XXX'*XXX)));

    % average of the column correlations (columns stacked into one vector)
    d1(m-N+1) = 1/N*XXX(:).'*YYY(:);
end
d = mean(d1);


%%
function [A cf] = thirdoct(fs, N_fft, numBands, mn)
%   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
%   inputs:
%       FS:         samplerate
%       N_FFT:      FFT size
%       NUMBANDS:   number of bands
%       MN:         center frequency of first 1/3 octave band
%   outputs:
%       A:          octave band matrix (one row per band, one column per
%                   bin of the single-sided spectrum, entries 0 or 1)
%       CF:         center frequencies

f  = linspace(0, fs, N_fft+1);
f  = f(1:(N_fft/2+1));
k  = 0:(numBands-1);
cf = 2.^(k/3)*mn;
fl = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);      % lower band edges
fr = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);      % upper band edges
A  = zeros(numBands, length(f));

for band = 1:(length(cf))
    % snap both edges to the nearest DFT bin
    [unused, lo_bin] = min((f-fl(band)).^2);
    fl(band) = f(lo_bin);

    [unused, hi_bin] = min((f-fr(band)).^2);
    fr(band) = f(hi_bin);

    A(band, lo_bin:(hi_bin-1)) = 1;
end

% drop trailing bands that are empty or narrower than their predecessor
rnk      = sum(A, 2);
numBands = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
A        = A(1:numBands, :);
cf       = cf(1:numBands);

%%
function x_stdft = stdft(x, N, K, N_fft)
%   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
%   hanning-windowed dft of X with frame-size N, frame advance K and DFT
%   size N_FFT. The rows and columns of X_STDFT denote the frame-index and
%   dft-bin index, respectively.

starts  = 1:K:(length(x)-N);
x_stdft = zeros(length(starts), N_fft);

w = hanning(N);
x = x(:);

for n = 1:length(starts)
    span = starts(n):(starts(n)+N-1);
    x_stdft(n, :) = fft(x(span).*w, N_fft);
end

%%
function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
%   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
%   are segmented with frame-length N and overlap K, where the maximum energy
%   of all frames of X is determined, say X_MAX.
X_SIL and Y_SIL are the 157 | % reconstructed signals, excluding the frames, where the energy of a frame 158 | % of X is smaller than X_MAX-RANGE 159 | 160 | x = x(:); 161 | y = y(:); 162 | 163 | frames = 1:K:(length(x)-N); 164 | w = hanning(N); 165 | msk = zeros(size(frames)); 166 | 167 | for j = 1:length(frames) 168 | jj = frames(j):(frames(j)+N-1); 169 | msk(j) = 20*log10(norm(x(jj).*w)./sqrt(N)); 170 | end 171 | 172 | msk = (msk-max(msk)+range)>0; 173 | count = 1; 174 | 175 | x_sil = zeros(size(x)); 176 | y_sil = zeros(size(y)); 177 | 178 | for j = 1:length(frames) 179 | if msk(j) 180 | jj_i = frames(j):(frames(j)+N-1); 181 | jj_o = frames(count):(frames(count)+N-1); 182 | x_sil(jj_o) = x_sil(jj_o) + x(jj_i).*w; 183 | y_sil(jj_o) = y_sil(jj_o) + y(jj_i).*w; 184 | count = count+1; 185 | end 186 | end 187 | 188 | x_sil = x_sil(1:jj_o(end)); 189 | y_sil = y_sil(1:jj_o(end)); 190 | -------------------------------------------------------------------------------- /evaluate_2speaker_ori.m: -------------------------------------------------------------------------------- 1 | %==================================================================================== 2 | % Performance Measurement in Multi-speaker Separation 3 | % Author: Chao Peng, EECS, Peking University 4 | % Github: https://github.com/pchao6/LSTM_PIT_Speech_Separation 5 | % Revision 1.0, June 2018 6 | %==================================================================================== 7 | tic 8 | addpath('/usr/local/MATLAB/R2016b/toolbox/voicebox'); 9 | addpath('PESQ'); %PESQ Toolbox According to ITU-T P.862; 10 | 11 | sample_rate = 8000; 12 | tt_wav_dir = '/home/wuxc/BLSTM-PIT-BSS/LSTM_PIT_Speech_Separation-master/Dataset/WSJ0-mix/mix/data/2speakers_0dB/wav8k/min/tt/'; 13 | model_name = '2speakers_0dB_original'; 14 | mix_wav_dir = [tt_wav_dir '/mix/']; 15 | spk1_dir = [tt_wav_dir, '/s1/']; 16 | spk2_dir = [tt_wav_dir, '/s2/']; 17 | 18 | lists = dir(spk2_dir); %3002*1 struct 19 | len = length(lists) - 2; 
%3000  (value of len for the full test set; comment continued from the 'len = ...' statement)
% ---- remainder of evaluate_2speaker_ori.m ------------------------------------------
% Scores the unprocessed mixture against both reference speakers, i.e. the
% baseline obtained without any separation. Results are collected per
% utterance (rows) and per speaker (columns).
SDR = zeros(len, 2);
SIR = SDR;
SAR = SDR;
STOI = SDR;
ESTOI = SDR;
PESQ = SDR;
error_num_STOI = 0;     % utterances for which STOI returned NaN
error_num_ESTOI = 0;    % utterances for which ESTOI returned NaN
error_num_PESQ = 0;     % utterances for which pesq() raised an error


% lists(1:2) are assumed to be the '.' and '..' directory entries, hence the offset of 2
for i = 3:len+2
    name = lists(i).name;
    part_name = name(1:end-4);      % strip the 4-character extension ('.wav')
    fprintf('Computing Audio:%s, Number:%d ...\n', [part_name '.wav'], i-2)

    mix_wav1 = audioread([mix_wav_dir part_name '.wav']);   %35328*1 double
    mix_wav  = [mix_wav1, mix_wav1];                        %35328*2 double (mixture used as the estimate of both speakers)

    ori_wav1 = audioread([spk1_dir part_name '.wav']);      %35269*1 double
    ori_wav2 = audioread([spk2_dir part_name '.wav']);      %35269*1 double
    ori_wav  = [ori_wav1, ori_wav2];                        %35269*2 double

    % truncate both to the common length before scoring
    min_len = min(size(ori_wav, 1), size(mix_wav, 1));      %35269
    mix_wav = mix_wav(1:min_len, :);                        %35269*2 double
    ori_wav = ori_wav(1:min_len, :);                        %35269*2 double
    [SDR(i-2, :),SIR(i-2, :),SAR(i-2, :),perm]=bss_eval_sources(mix_wav', ori_wav');

    x1 = stoi(ori_wav(:,1), mix_wav(:,1), sample_rate);
    x2 = stoi(ori_wav(:,2), mix_wav(:,2), sample_rate);
    if ~isnan(x1) && ~isnan(x2)
        STOI(i-2, 1) = x1;
        STOI(i-2, 2) = x2;
    else
        % store 0 and count the failure so the mean below can exclude it
        STOI(i-2, 1) = 0;
        STOI(i-2, 2) = 0;
        error_num_STOI = error_num_STOI + 1;
        fprintf('STOI NaN happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
    end

    e1 = estoi(ori_wav(:,1), mix_wav(:,1), sample_rate);
    e2 = estoi(ori_wav(:,2), mix_wav(:,2), sample_rate);
    % check the ESTOI values themselves (the original tested the STOI
    % results x1/x2 and zeroed STOI instead of ESTOI in the failure branch)
    if ~isnan(e1) && ~isnan(e2)
        ESTOI(i-2, 1) = e1;
        ESTOI(i-2, 2) = e2;
    else
        ESTOI(i-2, 1) = 0;
        ESTOI(i-2, 2) = 0;
        error_num_ESTOI = error_num_ESTOI + 1;
        fprintf('ESTOI NaN happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
    end

    try
        PESQ(i-2, 1) = pesq([spk1_dir part_name '.wav'], [mix_wav_dir part_name '.wav']);
        PESQ(i-2, 2) = pesq([spk2_dir part_name '.wav'], [mix_wav_dir part_name '.wav']);
    catch ErrorInfo
        PESQ(i-2, 1) = 0;
        PESQ(i-2, 2) = 0;
        disp(ErrorInfo)
        error_num_PESQ = error_num_PESQ + 1;
        fprintf('PESQ Error happens in computing the audio:%s, i=%d.\n', [part_name '.wav'], i-2)
    end
end


fprintf('Model Name: %s.\n', model_name)
fprintf('The mean SDR is %f.\n', mean(mean(SDR)))
fprintf('The mean SAR is %f.\n', mean(mean(SAR)))
fprintf('The mean SIR is %f.\n', mean(mean(SIR)))
% failed utterances were stored as 0, so divide by the number of successes
fprintf('Mean STOI is %f.\n', mean(sum(STOI)/(len - error_num_STOI)))
fprintf('Mean ESTOI is %f.\n', mean(sum(ESTOI)/(len - error_num_ESTOI)))
fprintf('Mean PESQ is %f.\n', mean(sum(PESQ)/(len - error_num_PESQ)))
% make sure the output directory exists so the results of a long run are not lost
if ~exist('matfiles', 'dir')
    mkdir('matfiles');
end
save(['matfiles/evaluate_' model_name], 'SDR', 'SAR', 'SIR', 'STOI', 'ESTOI', 'PESQ', 'lists');

time_length = toc;
hour = floor(time_length/3600);
remaining = mod(time_length, 3600);
minute = floor(remaining/60);
second = mod(remaining, 60);
fprintf('\nElapsed time is %d hour(s), %d minute(s), %d second(s).\n', hour, minute, floor(second))
-------------------------------------------------------------------------------- /evaluate_2speaker_separated.m: --------------------------------------------------------------------------------
%====================================================================================
%   Performance Measurement in Multi-speaker Separation
%   Author: Chao Peng, EECS, Peking University
%   Github: https://github.com/pchao6/LSTM_PIT_Speech_Separation
%   Revision 1.0, June 2018
%====================================================================================
% Scores the separated outputs '<name>_1.wav' / '<name>_2.wav' of a model
% against the reference speakers s1 / s2 with SDR/SIR/SAR, STOI, ESTOI and
% PESQ. Results are collected per utterance (rows) and per speaker (columns).
tic
addpath('/usr/local/MATLAB/R2016b/toolbox/voicebox');
addpath('PESQ');     %PESQ Toolbox According to ITU-T P.862;

sample_rate = 8000;
tt_wav_dir = 'SpeechSeparation/mix/data/2speakers_0dB/wav8k/min/tt';
model_name = 'PIT_BLSTM_3_496_2speaker_8KHz_0dB';
rec_wav_dir = ['SpeechSeparation/separated/' model_name '/'];

spk1_dir = [tt_wav_dir, '/s1/'];
spk2_dir = [tt_wav_dir, '/s2/'];

lists = dir(spk2_dir);
len = length(lists) - 2;    % lists(1:2) are assumed to be the '.' and '..' entries
SDR = zeros(len, 2);
SIR = SDR;
SAR = SDR;
STOI = SDR;
ESTOI = SDR;
PESQ = SDR;
error_num_STOI = 0;     % utterances for which STOI returned NaN
error_num_ESTOI = 0;    % utterances for which ESTOI returned NaN
error_num_PESQ = 0;     % utterances for which pesq() raised an error

fprintf('Model Name: %s.\n', model_name)
for i = 3:len+2
    name = lists(i).name;
    part_name = name(1:end-4);      % strip the 4-character extension ('.wav')
    fprintf('Computing Audio:%s, Number:%d ...\n', [part_name '.wav'], i-2)

    rec_wav1 = audioread([rec_wav_dir part_name '_1.wav']);
    rec_wav2 = audioread([rec_wav_dir part_name '_2.wav']);
    rec_wav  = [rec_wav1, rec_wav2];

    ori_wav1 = audioread([spk1_dir part_name '.wav']);
    ori_wav2 = audioread([spk2_dir part_name '.wav']);
    ori_wav  = [ori_wav1, ori_wav2];                        %35269*2 double

    % truncate both to the common length before scoring
    min_len = min(size(ori_wav, 1), size(rec_wav, 1));      %35269
    rec_wav = rec_wav(1:min_len, :);
    ori_wav = ori_wav(1:min_len, :);
    [SDR(i-2, :),SIR(i-2, :),SAR(i-2, :),perm]=bss_eval_sources(rec_wav', ori_wav');

    x1 = stoi(ori_wav(:,1), rec_wav(:,1), sample_rate);
    x2 = stoi(ori_wav(:,2), rec_wav(:,2), sample_rate);
    if ~isnan(x1) && ~isnan(x2)
        STOI(i-2, 1) = x1;
        STOI(i-2, 2) = x2;
    else
        % store 0 and count the failure so the mean below can exclude it
        STOI(i-2, 1) = 0;
        STOI(i-2, 2) = 0;
        error_num_STOI = error_num_STOI + 1;
    end

    e1 = estoi(ori_wav(:,1), rec_wav(:,1), sample_rate);
    e2 = estoi(ori_wav(:,2), rec_wav(:,2), sample_rate);
    % check the ESTOI values themselves (the original tested the STOI
    % results x1/x2 and zeroed STOI instead of ESTOI in the failure branch)
    if ~isnan(e1) && ~isnan(e2)
        ESTOI(i-2, 1) = e1;
        ESTOI(i-2, 2) = e2;
    else
        ESTOI(i-2, 1) = 0;
        ESTOI(i-2, 2) = 0;
        error_num_ESTOI = error_num_ESTOI + 1;
    end

    try
        PESQ(i-2, 1) = pesq([spk1_dir part_name '.wav'], [rec_wav_dir part_name '_1.wav']);
        PESQ(i-2, 2) = pesq([spk2_dir part_name '.wav'], [rec_wav_dir part_name '_2.wav']);
    catch ErrorInfo
        PESQ(i-2, 1) = 0;
        PESQ(i-2, 2) = 0;
        disp(ErrorInfo)
        error_num_PESQ = error_num_PESQ + 1;
    end
end

fprintf('The mean SDR is %f.\n', mean(mean(SDR)))
fprintf('The mean SAR is %f.\n', mean(mean(SAR)))
fprintf('The mean SIR is %f.\n', mean(mean(SIR)))
% failed utterances were stored as 0, so divide by the number of successes
fprintf('The mean STOI is %f.\n', mean(sum(STOI)/(len - error_num_STOI)))
fprintf('The mean ESTOI is %f.\n', mean(sum(ESTOI)/(len - error_num_ESTOI)))
fprintf('The mean PESQ is %f.\n', mean(sum(PESQ)/(len - error_num_PESQ)))
% make sure the output directory exists so the results of a long run are not lost
if ~exist('matfiles', 'dir')
    mkdir('matfiles');
end
save(['matfiles/evaluate_' model_name], 'SDR', 'SAR', 'SIR', 'STOI', 'ESTOI', 'PESQ', 'lists');

time_length = toc;
hour = floor(time_length/3600);
remaining = mod(time_length, 3600);
minute = floor(remaining/60);
second = mod(remaining, 60);
fprintf('\nElapsed time is %d hour(s), %d minute(s), %d second(s).\n', hour, minute, floor(second))
-------------------------------------------------------------------------------- /pesq.m: -------------------------------------------------------------------------------- 1 | function [pesq_mos]= pesq(ref_wav, deg_wav) 2 | 3 | % ---------------------------------------------------------------------- 4 | % PESQ objective speech quality measure 5 | % 6 | % This function implements the PESQ measure based on the ITU standard 7 | % P.862 [1]. 8 | % 9 | % 10 | % Usage: pval=pesq(cleanFile.wav, enhancedFile.wav) 11 | % 12 | % cleanFile.wav - clean input file in .wav format 13 | % enhancedFile - enhanced output file in .wav format 14 | % pval - PESQ value 15 | % 16 | % Note that the PESQ routine only supports sampling rates of 8 kHz and 17 | % 16 kHz [1] 18 | % 19 | % Example call: pval = pesq ('sp04.wav','enhanced.wav') 20 | % 21 | % 22 | % References: 23 | % [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and 24 | % objective method for end-to-end speech quality assessment of 25 | % narrowband telephone networks and speech codecs. ITU-T 26 | % Recommendation P.
862 27 | % 28 | % Authors: Yi Hu and Philipos C. Loizou 29 | % 30 | % 31 | % Copyright (c) 2006 by Philipos C. Loizou 32 | % $Revision: 0.0 $ $Date: 10/09/2006 $ 33 | % ----------------------------------------------------------------------
% Body of pesq(ref_wav, deg_wav):
%   1. read both files and scale the samples by 32768,
%   2. pad, level-align and filter both signals,
%   3. run VAD, crude alignment and utterance location on the
%      input-filtered copies,
%   4. feed the filtered (pre input-filter) signals to the
%      psychoacoustic model, which yields pesq_mos.
% Raises an error when the reference sampling rate is not 8000 or
% 16000 Hz, or when the two files have different sampling rates.
if nargin<2
    fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
    % NOTE(review): pesq_mos is not assigned on this path, so a caller that
    % requests the output still gets an error from MATLAB.
    return;
end;

% shared state used by the helper routines of the PESQ toolbox
% (presumably initialised by setup_global below -- confirm there)
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window

[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 && sampling_rate~=16000
    error('Sampling frequency needs to be either 8000 or 16000 Hz');
end

setup_global( sampling_rate);

% Window= hann( Align_Nfft, 'periodic'); %Hanning window
% Window= Window';
TWOPI= 6.28318530717959;
%for count = 0: Align_Nfft- 1
%   Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
%end

% vectorised form of the loop kept above for reference
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));



% reference signal: row vector, scaled by 32768, zero padded on both sides
ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

% degraded signal: must share the sampling rate of the reference, because
% all processing constants were set up from the reference rate only
[deg_data,deg_sampling_rate]= audioread( deg_wav);
if deg_sampling_rate~=sampling_rate
    error('Reference and degraded files must have the same sampling frequency');
end
deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
    zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];

maxNsamples= max( ref_Nsamples, deg_Nsamples);

ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);

% rows look like [frequency, gain in dB] pairs -- confirm against apply_filter
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
    250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
    1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
    3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];

ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
%



% for later use in psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;

[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples);


[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);


% both calls return nothing here; their results presumably travel through
% global state -- confirm in crude_align / utterance_locate
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
    WHOLE_SIGNAL);

utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
    deg_data, deg_Nsamples, deg_VAD, deg_logVAD);

% restore the copies taken before input_filter for the perceptual model
ref_data= model_ref;
deg_data= model_deg;

% make ref_data and deg_data equal length
% (assigning 0 at index newlen grows the shorter vector with zeros)
if (ref_Nsamples< deg_Nsamples)
    newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
    newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
    deg_data( newlen)= 0;
end


pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
    deg_Nsamples );
-------------------------------------------------------------------------------- /rusult/050a0501_1.7783_442o030z_-1.7783_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0501_1.7783_442o030z_-1.7783_1.wav -------------------------------------------------------------------------------- /rusult/050a0501_1.7783_442o030z_-1.7783_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0501_1.7783_442o030z_-1.7783_2.wav -------------------------------------------------------------------------------- /rusult/050a0502_1.3461_440o030j_-1.3461_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.3461_440o030j_-1.3461_1.wav -------------------------------------------------------------------------------- /rusult/050a0502_1.3461_440o030j_-1.3461_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.3461_440o030j_-1.3461_2.wav -------------------------------------------------------------------------------- /rusult/050a0502_1.463_420a010o_-1.463_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.463_420a010o_-1.463_1.wav --------------------------------------------------------------------------------
/rusult/050a0502_1.463_420a010o_-1.463_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.463_420a010o_-1.463_2.wav -------------------------------------------------------------------------------- /rusult/050a0502_1.9707_440c020w_-1.9707_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.9707_440c020w_-1.9707_1.wav -------------------------------------------------------------------------------- /rusult/050a0502_1.9707_440c020w_-1.9707_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dakenan1/Speech-measure-SDR-SAR-STOI-PESQ/8d9562631a344ba37e7b4cf0d993b1ca339c8c0b/rusult/050a0502_1.9707_440c020w_-1.9707_2.wav -------------------------------------------------------------------------------- /stoi.m: --------------------------------------------------------------------------------
function d = stoi(x, y, fs_signal)
%   d = stoi(x, y, fs_signal) returns the output of the short-time
%   objective intelligibility (STOI) measure described in [1, 2], where x
%   and y denote the clean and processed speech, respectively, with sample
%   rate fs_signal in Hz. The output d is expected to have a monotonic
%   relation with the subjective speech-intelligibility, where a higher d
%   denotes better intelligible speech. See [1, 2] for more details.
%
%   x and y must have the same length, otherwise an error is raised.
%   d is NaN when fewer than N (= 30) analysis frames are left after
%   silent-frame removal, because the mean is then taken over an empty set.
%
%   References:
%      [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
%      Objective Intelligibility Measure for Time-Frequency Weighted Noisy
%      Speech', ICASSP 2010, Texas, Dallas.
%
%      [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
%      Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
%      IEEE Transactions on Audio, Speech, and Language Processing, 2011.
%
%
% Copyright 2009: Delft University of Technology, Signal & Information
% Processing Lab. The software is free for non-commercial use. This program
% comes WITHOUT ANY WARRANTY.
%
%
%
% Updates:
% 2011-04-26 Using the more efficient 'taa_corr' instead of 'corr'

if length(x)~=length(y)
    error('x and y should have the same length');
end

% initialization
x           = x(:);                     % clean speech column vector
y           = y(:);                     % processed speech column vector

fs          = 10000;                    % sample rate of proposed intelligibility measure
N_frame     = 256;                      % window support
K           = 512;                      % FFT size
J           = 15;                       % Number of 1/3 octave bands
mn          = 150;                      % Center frequency of first 1/3 octave band in Hz.
H           = thirdoct(fs, K, J, mn);   % Get 1/3 octave band matrix
N           = 30;                       % Number of frames for intermediate intelligibility measure (Length analysis window)
Beta        = -15;                      % lower SDR-bound
dyn_range   = 40;                       % speech dynamic range

% resample signals if other samplerate is used than fs
if fs_signal ~= fs
    x = resample(x, fs, fs_signal);
    y = resample(y, fs, fs_signal);
end

% remove silent frames (both outputs are empty when no frame survives)
[x y] = removeSilentFrames(x, y, dyn_range, N_frame, N_frame/2);

% apply 1/3 octave band TF-decomposition
x_hat = stdft(x, N_frame, N_frame/2, K);    % apply short-time DFT to clean speech
y_hat = stdft(y, N_frame, N_frame/2, K);    % apply short-time DFT to processed speech

x_hat = x_hat(:, 1:(K/2+1)).';              % take clean single-sided spectrum (bins x frames)
y_hat = y_hat(:, 1:(K/2+1)).';              % take processed single-sided spectrum (bins x frames)

X = zeros(J, size(x_hat, 2));               % init memory for clean speech 1/3 octave band TF-representation
Y = zeros(J, size(y_hat, 2));               % init memory for processed speech 1/3 octave band TF-representation

for i = 1:size(x_hat, 2)
    X(:, i) = sqrt(H*abs(x_hat(:, i)).^2);  % apply 1/3 octave bands as described in Eq.(1) [1]
    Y(:, i) = sqrt(H*abs(y_hat(:, i)).^2);
end

% loop all segments of length N and obtain intermediate intelligibility measure for all TF-regions
d_interm = zeros(J, length(N:size(X, 2)));  % init memory for intermediate intelligibility measure
c        = 10^(-Beta/20);                   % constant for clipping procedure

for m = N:size(X, 2)
    X_seg  = X(:, (m-N+1):m);                               % region with length N of clean TF-units for all j
    Y_seg  = Y(:, (m-N+1):m);                               % region with length N of processed TF-units for all j
    alpha  = sqrt(sum(X_seg.^2, 2)./sum(Y_seg.^2, 2));      % obtain scale factor for normalizing processed TF-region for all j
    aY_seg = Y_seg.*repmat(alpha, [1 N]);                   % obtain \alpha*Y_j(n) from Eq.(2) [1]
    for j = 1:J
        Y_prime            = min(aY_seg(j, :), X_seg(j, :)+X_seg(j, :)*c);  % apply clipping from Eq.(3)
        d_interm(j, m-N+1) = taa_corr(X_seg(j, :).', Y_prime(:));           % obtain correlation coefficient from Eq.(4) [1]
    end
end

d = mean(d_interm(:));                      % combine all intermediate intelligibility measures as in Eq.(4) [1]

%%
function [A cf] = thirdoct(fs, N_fft, numBands, mn)
%   [A CF] = THIRDOCT(FS, N_FFT, NUMBANDS, MN) returns 1/3 octave band matrix
%   inputs:
%       FS:         samplerate
%       N_FFT:      FFT size
%       NUMBANDS:   number of bands
%       MN:         center frequency of first 1/3 octave band
%   outputs:
%       A:          octave band matrix; row i holds ones on the DFT bins
%                   (of the single-sided spectrum) assigned to band i
%       CF:         center frequencies
%   Both outputs are truncated after the last band selected by the
%   row-sum test at the end of this function.

f               = linspace(0, fs, N_fft+1);
f               = f(1:(N_fft/2+1));         % bin frequencies of the single-sided spectrum
k               = 0:(numBands-1);
cf              = 2.^(k/3)*mn;
fl              = sqrt((2.^(k/3)*mn).*2.^((k-1)/3)*mn);    % lower band edges (geometric mean with previous center)
fr              = sqrt((2.^(k/3)*mn).*2.^((k+1)/3)*mn);    % upper band edges (geometric mean with next center)
A               = zeros(numBands, length(f));

for i = 1:(length(cf))
    % snap the lower edge to the nearest DFT bin
    [a b]                   = min((f-fl(i)).^2);
    fl(i)                   = f(b);
    fl_ii                   = b;

    % snap the upper edge to the nearest DFT bin
    [a b]                   = min((f-fr(i)).^2);
    fr(i)                   = f(b);
    fr_ii                   = b;
    A(i,fl_ii:(fr_ii-1))    = 1;            % upper edge bin itself is excluded
end

rnk         = sum(A, 2);                    % number of bins per band
numBands    = find((rnk(2:end)>=rnk(1:(end-1))) & (rnk(2:end)~=0)~=0, 1, 'last' )+1;
A           = A(1:numBands, :);
cf          = cf(1:numBands);

%%
function x_stdft = stdft(x, N, K, N_fft)
%   X_STDFT = STDFT(X, N, K, N_FFT) returns the short-time
%   hanning-windowed dft of X with frame-size N, frame step K (named
%   "overlap" in the original text; it is used as the distance between
%   frame starts) and DFT size N_FFT. The rows and columns of X_STDFT
%   denote the frame-index and dft-bin index, respectively.
%   An input shorter than N+1 samples yields a 0-by-N_FFT result.

frames      = 1:K:(length(x)-N);
x_stdft     = zeros(length(frames), N_fft);

w           = hanning(N);
x           = x(:);

for i = 1:length(frames)
    ii              = frames(i):(frames(i)+N-1);
    x_stdft(i, :)   = fft(x(ii).*w, N_fft);
end

%%
function [x_sil y_sil] = removeSilentFrames(x, y, range, N, K)
%   [X_SIL Y_SIL] = REMOVESILENTFRAMES(X, Y, RANGE, N, K) X and Y
%   are segmented with frame-length N and frame step K, where the maximum
%   energy of all frames of X is determined, say X_MAX. X_SIL and Y_SIL are
%   the reconstructed signals, excluding the frames, where the energy of a
%   frame of X is smaller than X_MAX-RANGE.
%   When no frame can be formed (signal not longer than N) or no frame
%   passes the energy test (e.g. an all-zero X), both outputs are empty
%   0-by-1 vectors instead of raising an undefined-variable error.

x       = x(:);
y       = y(:);

frames  = 1:K:(length(x)-N);
if isempty(frames)
    % signal too short for a single frame: nothing to keep
    x_sil = zeros(0, 1);
    y_sil = zeros(0, 1);
    return;
end

w       = hanning(N);
msk     = zeros(size(frames));

% frame energies of the clean signal in dB
for j = 1:length(frames)
    jj      = frames(j):(frames(j)+N-1);
    msk(j)  = 20*log10(norm(x(jj).*w)./sqrt(N));
end

msk     = (msk-max(msk)+range)>0;   % true for frames within RANGE dB of the loudest frame
count   = 1;                        % index of the next output frame slot

x_sil   = zeros(size(x));
y_sil   = zeros(size(y));

% overlap-add the kept frames back to back
for j = 1:length(frames)
    if msk(j)
        jj_i            = frames(j):(frames(j)+N-1);
        jj_o            = frames(count):(frames(count)+N-1);
        x_sil(jj_o)     = x_sil(jj_o) + x(jj_i).*w;
        y_sil(jj_o)     = y_sil(jj_o) + y(jj_i).*w;
        count           = count+1;
    end
end

n_kept = count-1;
if n_kept == 0
    % every frame was rejected (all energies -Inf make the mask NaN>0 = false)
    x_sil = zeros(0, 1);
    y_sil = zeros(0, 1);
else
    last_sample = frames(n_kept)+N-1;   % end of the last written output frame
    x_sil = x_sil(1:last_sample);
    y_sil = y_sil(1:last_sample);
end

%%
function rho = taa_corr(x, y)
%   RHO = TAA_CORR(X, Y) Returns correlation coefficient between column
%   vectors x and y. Gives same results as 'corr' from statistics toolbox.
xn    = x-mean(x);
xn    = xn/sqrt(sum(xn.^2));    % zero-mean, unit-norm version of x
yn    = y-mean(y);
yn    = yn/sqrt(sum(yn.^2));    % zero-mean, unit-norm version of y
rho   = sum(xn.*yn);
--------------------------------------------------------------------------------