├── rVAD2.0
    ├── filtbankm.m
    ├── stdspectrum.m
    ├── aurora2read.m
    ├── pitchestm.m
    ├── sflux.m
    ├── README.TXT
    ├── pitchblockdetect.m
    ├── vadbatch_1folder_diffpathes.m
    ├── winenvar.m
    ├── rfft.m
    ├── snre_highenergy.m
    ├── vad.m
    ├── irfft.m
    ├── findpeaks.m
    ├── enframe.m
    ├── snre_vad.m
    ├── estnoiseg.m
    ├── gaussmixp.m
    ├── voicebox.m
    ├── specsub.m
    ├── specsub_noiseseg_lfn.m
    ├── estnoisem.m
    ├── fxpefac.m
    ├── LICENSE
    ├── estnoisem_noiseseg.m
    └── spgrambw.m
├── Aurora2TestSet-ReferenceVAD.zip
├── Aurora2TrainSet-ReferenceVAD.zip
├── LICENSE
├── rVADfast_py_2.0
    ├── README.TXT
    ├── LICENSE
    ├── audio_stream.py
    ├── rVAD_fast.py
    ├── rVAD_fast_stream.py
    └── speechproc.py
└── README.md


/rVAD2.0/filtbankm.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/rVAD2.0/filtbankm.m


--------------------------------------------------------------------------------
/rVAD2.0/stdspectrum.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/rVAD2.0/stdspectrum.m


--------------------------------------------------------------------------------
/Aurora2TestSet-ReferenceVAD.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/Aurora2TestSet-ReferenceVAD.zip


--------------------------------------------------------------------------------
/Aurora2TrainSet-ReferenceVAD.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/Aurora2TrainSet-ReferenceVAD.zip


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The Matlab code rVAD/rVAD2.0/ is under GNU GENERAL PUBLIC LICENSE Version 2
2 | 
3 | The Python code rVAD/rVADfast_py_2.0/ is under MIT License 
4 | 
5 | Refer to LICENSE under each folder for details
6 | 


--------------------------------------------------------------------------------
/rVAD2.0/aurora2read.m:
--------------------------------------------------------------------------------
 1 | function [data_float, fs]=aurora2read(fname)
 2 | %AURORA2READ Read one raw speech file from the Aurora2 database.
 3 | %
 4 | % Input:   fname       path of the file; it is read as 16-bit integers
 5 | %                      with big-endian byte order (the 'b' flag of fopen).
 6 | % Outputs: data_float  samples as doubles, divided by 2^15.
 7 | %          fs          sampling rate, always set to 8000 here.
 8 | 
 9 | fid=fopen(fname,'r','b');
10 | if fid<0
11 |     % fopen signals failure with -1; stop here with a readable message
12 |     % instead of letting fread fail on an invalid file identifier.
13 |     error('aurora2read:cannotOpen','Cannot open file: %s',fname);
14 | end
15 | data=fread(fid,'int16');
16 | fclose(fid);
17 | fs=8000;
18 | 
19 | %str1=strread(fname,'%s','delimiter','.'); 
20 | 
21 | data_float = double(data)/2^15; %% Normalize int16(y) by 2^15
22 | 
23 | % wavwrite(data_float,fs,strcat(str1{1},'.wav'));


--------------------------------------------------------------------------------
/rVADfast_py_2.0/README.TXT:
--------------------------------------------------------------------------------
 1 | Fast noise-robust voice activity detection algorithm (rVAD-fast).
 2 | Version 2.0
 3 | 
 4 | 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan
 5 | 
 6 | Usage: python rVAD_fast.py inWaveFile  outputVadLabel
 7 | 
 8 | Refs:
 9 | [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 
10 | [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
11 | 
12 | Contact:
13 |   Prof Zheng-Hua Tan
14 |   Aalborg University, Denmark
15 |   zt@es.aau.dk
16 |   https://vbn.aau.dk/en/persons/107665  
17 | 


--------------------------------------------------------------------------------
/rVAD2.0/pitchestm.m:
--------------------------------------------------------------------------------
 1 | function [pv01, fx]=pitchestm(data, fs, nfr10, pv01)
 2 | %PITCHESTM Per-frame voicing flags and pitch track derived from fxpefac.
 3 | %
 4 | % Inputs:
 5 | %   data, fs  signal and sampling rate, forwarded to fxpefac
 6 | %   nfr10     length of the returned vectors
 7 | %   pv01      ignored on input; it is overwritten before use
 8 | % Outputs:
 9 | %   pv01  nfr10-by-1 vector, 1 where the voicing value pv exceeds 0.25
10 | %   fx    pitch values from fxpefac, delayed by three frames and padded or
11 | %         cut to nfr10 entries
12 | 
13 | OFFSET=3;       % fxpefac frame k is stored at position k+3
14 | [fx,~,pv,~]=fxpefac(data, fs);
15 | npv=length(pv);
16 | pv01=zeros(nfr10,1);
17 | inrun=false;    % true while inside a run of frames with pv>0.25
18 | for k=1:npv
19 |     if pv(k)>0.25
20 |         pv01(k+OFFSET)=1;
21 |         if ~inrun
22 |             inrun=true;
23 |             runbeg=k;
24 |         end
25 |     elseif inrun
26 |         % The run ended at k-1; clear it when runend-runbeg is below 3.
27 |         inrun=false;
28 |         runend=k-1;
29 |         if runend-runbeg<3
30 |             pv01(runbeg+OFFSET:runend+OFFSET)=0;
31 |         end
32 |     end
33 | end
34 | 
35 | % Fill the first three positions from the fourth one.
36 | pv01(1:OFFSET)=pv01(OFFSET+1);
37 | fxpad(1:OFFSET)=fx(OFFSET+1);
38 | fxpad(OFFSET+1:npv+OFFSET)=fx(1:npv);
39 | 
40 | if npv+OFFSET>=nfr10
41 |     % Enough values: cut both vectors to nfr10 entries.
42 |     pv01=pv01(1:nfr10);
43 |     fxpad=fxpad(1:nfr10);
44 | else
45 |     % Too few values: repeat the last entry up to nfr10.
46 |     pv01(npv+OFFSET+1:nfr10)=pv01(npv+OFFSET);
47 |     fxpad(npv+OFFSET+1:nfr10)=fx(npv);
48 | end
49 | fx=fxpad;


--------------------------------------------------------------------------------
/rVAD2.0/sflux.m:
--------------------------------------------------------------------------------
 1 | function [ft, d, sVar]= sflux(data,flen,fsh10)
 2 | %SFLUX Frame-wise spectral flatness, spectral flux and spectral variation.
 3 | %
 4 | % Inputs:
 5 | %   data   signal passed to enframe
 6 | %   flen   frame length in samples (also the Hamming window length)
 7 | %   fsh10  frame shift in samples
 8 | % Outputs (one value per frame):
 9 | %   ft    spectral flatness: geometric mean / arithmetic mean of the magnitudes
10 | %   d     spectral flux: squared difference between the magnitude spectra of a
11 | %         frame and the following frame, normalised by the product of their norms
12 | %   sVar  spectral variation: 1 - normalised correlation of the same two spectra
13 | %
14 | % The last frame has no successor and is compared with itself.
15 | 
16 | nftt=pow2(nextpow2(flen)); %% FFT point 
17 | 
18 | x=enframe(data,flen,fsh10);
19 | w=hamming(flen);
20 | x=x.*repmat(w',size(x,1),1); 
21 | 
22 | ak=abs(fft(x',nftt)); % magnitude spectrum, one column per frame
23 | ak=ak'; 
24 | ak=ak(:, 1:fix(nftt/2)+1); % keep bins 1 .. nftt/2+1
25 | 
26 | % Row t of ak_1 is the spectrum of frame t+1; the final row repeats the last
27 | % frame. Building it by index also works for a single frame (or none), where
28 | % slicing ak(2:end,:) and then reading its last row raises an index error.
29 | nfr=size(ak,1);
30 | ak_1=ak(min((1:nfr)+1,nfr),:);
31 | 
32 | d= sum((ak - ak_1).^2, 2); % sum_k [ak(t) - ak(t+1)]^2
33 | denA= sqrt(sum(ak.^2, 2)) .* sqrt( sum(ak_1.^2, 2) );
34 | d=(d+eps)./(denA+eps);  
35 | 
36 | %% flatness
37 | win=size(ak,2); % number of bands in spectra
38 | num= exp( (1/win) * sum( log(ak),2) );
39 | den= (1/win) * sum( ak,2);
40 | ft= (num+eps)./(den+eps);
41 | 
42 | %% Spectral variation: 1 minus the normalised correlation of consecutive spectra
43 | num= (sum(ak.*ak_1,2) +eps)./(denA+eps);
44 | sVar= 1- num;


--------------------------------------------------------------------------------
/rVAD2.0/README.TXT:
--------------------------------------------------------------------------------
 1 | Noise-robust voice activity detection algorithm (rVAD).
 2 | Version 2.0
 3 | 
 4 | 28 Nov 2017, Zheng-Hua Tan
 5 | 
 6 | Usage: vad(finwav, fvad) 
 7 |        vad(finwav, fvad, opts) 
 8 |        vad(finwav, fvad, opts, vadThres). 
 9 | 
10 | where finwav is the input WAVE file path and name, fvad is the output VAD file path and name, opts can be 0 for using pitch (default option) or 1 for using flatness (significantly faster at the cost of slightly reduced accuracy), and finally vadThres is the threshold for VAD. Refer to vad.m for more detailed explanation. 
11 | 
12 | The code has been tested on Matlab R2016a.
13 | 
14 | Refs:
15 | [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 
16 | [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
17 | 
18 | Contact:
19 |   Prof Zheng-Hua Tan
20 |   Aalborg University, Denmark
21 |   zt@es.aau.dk
22 |   https://vbn.aau.dk/en/persons/107665  
23 | 


--------------------------------------------------------------------------------
/rVADfast_py_2.0/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | for rVADfast_py (the Python code)
 4 | 
 5 | Copyright (c) 2022 Zheng-Hua Tan and Achintya Kumar Sarkar
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/rVAD2.0/pitchblockdetect.m:
--------------------------------------------------------------------------------
 1 | function [pvblk]=pitchblockdetect(pv01, nfr10, pitch, opts)
 2 | %PITCHBLOCKDETECT Extend runs of flagged frames into padded blocks.
 3 | %
 4 | % Inputs:
 5 | %   pv01   per-frame 0/1 flags; if it is one frame shorter than nfr10 the
 6 | %          last flag is repeated
 7 | %   nfr10  number of frames
 8 | %   pitch  per-frame values, numeric or a string array of numbers; only
 9 | %          read when opts==0
10 | %   opts   0: first clear runs whose rounded deviation from the run mean is
11 | %          zero everywhere (constant pitch) and that are long enough;
12 | %          any other value skips that step
13 | % Output:
14 | %   pvblk  copy of the flags with up to PAD frames set to 1 before the start
15 | %          and from the end of every run
16 | 
17 | PAD=60;
18 | 
19 | if length(pv01)+1==nfr10
20 |     pv01(nfr10)=pv01(nfr10-1);
21 | end
22 | 
23 | if opts==0
24 |     inrun=0;
25 |     for k=1:nfr10
26 |         if inrun==0
27 |             if pv01(k)==1
28 |                 runbeg=k;
29 |                 inrun=1;
30 |             end
31 |         elseif pv01(k)==0 || k==nfr10
32 |             % Run covers runbeg .. runend-1; at the last frame runend is k+1.
33 |             runend=k;
34 |             if k==nfr10; runend=k+1; end
35 |             inrun=0;
36 |             seg=pitch(runbeg:runend-1);
37 |             if isstring(seg); seg=str2double(seg); end
38 |             seg=double(seg(:));
39 |             if sum(abs(round(seg-mean(seg))))==0 && runend-runbeg+1>=10
40 |                 pv01(runbeg:runend-1)=0;
41 |             end
42 |         end
43 |     end
44 | end
45 | 
46 | inrun=0;
47 | pvblk=pv01;
48 | for k=1:nfr10
49 |     if inrun==0
50 |         if pv01(k)==1
51 |             runbeg=k;
52 |             inrun=1;
53 |             pvblk(max(runbeg-PAD,1):runbeg)=1;
54 |         end
55 |     elseif pv01(k)==0 || k==nfr10
56 |         runend=k;
57 |         inrun=0;
58 |         pvblk(runend:min(runend+PAD,nfr10))=1;
59 |     end
60 | end


--------------------------------------------------------------------------------
/rVAD2.0/vadbatch_1folder_diffpathes.m:
--------------------------------------------------------------------------------
 1 | function []=vadbatch_1folder_diffpathes(wavfold,nfiles1,nfiles2)
 2 | %VADBATCH_1FOLDER_DIFFPATHES Run vad on the files of one input sub-folder.
 3 | %
 4 | % Inputs:
 5 | %   wavfold  sub-folder name appended to the hard-coded input and output roots
 6 | %   nfiles1  index of the first file to process (default 1)
 7 | %   nfiles2  index of the last file to process (default: all files)
 8 | %
 9 | % For every selected file the labels are written to <name>.vad in the output
10 | % folder, where <name> is the file name up to its first '.'. When nfiles1 is
11 | % given, '_<nfiles1>' is appended to the output folder name.
12 | %
13 | % vad.m in this folder takes (finwav, fvad, opts, vadThres). A five-argument
14 | % call with a pitch file and two wav outputs does not match that signature,
15 | % so only the input file and the label file are passed and the folders for
16 | % those extra outputs are not created.
17 | 
18 | dinwav=strcat('/data/scratch/najim/RATS/Correct_data/',wavfold,'/');
19 | dvad=strcat('/data/scratch2/zhenghua/RATS/vad/',wavfold);
20 | 
21 | d1=dir(dinwav);
22 | d1=d1(~[d1.isdir]);  % keep files only ('.' and '..' are not always listed first)
23 | n1=length(d1);
24 | if nargin==1
25 |     nfiles1=1; nfiles2=n1;
26 | else
27 |     if nargin==2; nfiles2=n1; end
28 |     dvad=strcat(dvad,'_',num2str(nfiles1));
29 | end
30 | dvad=strcat(dvad,'/');
31 | 
32 | if ~exist(dvad,'dir'); mkdir(dvad); end
33 | if nfiles2>n1; nfiles2=n1; end
34 | for i1=nfiles1:nfiles2
35 |     stem=strtok(d1(i1).name,'.');  % file name up to the first '.'
36 |     finwav=strcat(dinwav,d1(i1).name);
37 |     fvad=strcat(dvad,stem,'.vad');
38 |     disp(finwav);                  % progress output
39 |     vad(finwav,fvad);
40 | end


--------------------------------------------------------------------------------
/rVADfast_py_2.0/audio_stream.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import code
 4 | import pyaudio
 5 | import wave
 6 | 
 7 | # record stream audio, save it and then apply rVADfast to it
 8 | # usage: python3 audio_stream.py
 9 | 
10 | CHUNK = 1024  # frames per stream.read() call, also passed as frames_per_buffer
11 | FORMAT = pyaudio.paInt16  # sample format handed to PyAudio
12 | CHANNELS = 1  # channel count for the input stream and the WAV file
13 | RATE = 44100  # sampling rate passed to PyAudio and written to the WAV file
14 | 
15 | def record():
16 | 
17 | 	p = pyaudio.PyAudio()
18 | 
19 | 	stream = p.open(format=FORMAT,channels=CHANNELS,rate=RATE,input=True,frames_per_buffer=CHUNK)
20 | 
21 | 	print("Start recording")
22 |         print("2. Press Ctrl+C to stop the recording"
23 |         print("3. rVAD will start")
24 |         print("==================================================================\n")
25 |         frames = []
26 | 
27 | 	try:
28 | 		while True:
29 | 			data = stream.read(CHUNK)
30 | 			frames.append(data)
31 | 
32 | 	except KeyboardInterrupt:
33 |             print("Done recording: stored --> output.wav")
34 | 
35 | 	except Exception as e:
36 | 		print(str(e))
37 | 
38 | 	sample_width = p.get_sample_size(FORMAT)
39 | 	
40 | 	stream.stop_stream()
41 | 	stream.close()
42 | 	p.terminate()
43 | 	
44 | 	return sample_width, frames	
45 | 
46 | def record_to_file(file_path):
47 | 	wf = wave.open(file_path, 'wb')
48 | 	wf.setnchannels(CHANNELS)
49 | 
50 | 	sample_width, frames = record()
51 | 
52 | 	wf.setsampwidth(sample_width)
53 | 	wf.setframerate(RATE)
54 | 	wf.writeframes(b''.join(frames))
55 | 	wf.close()
56 | 
57 | if __name__ == '__main__':
58 |         record_to_file('output.wav')
59 |         print("rVAD running...")
60 |         os.system("python3 rVAD_fast.py output.wav output.txt")
61 |         print("Result written")
62 | 


--------------------------------------------------------------------------------
/rVAD2.0/winenvar.m:
--------------------------------------------------------------------------------
 1 | function d=winenvar(n)
 2 | %WINENVAR get windows environment variable [D]=(N)
 3 | %
 4 | % Inputs: N  name of environment variable (e.g. 'temp')
 5 | %
 6 | % Outputs: D  value of variable or [] if non-existent
 7 | %
 8 | % Notes: (1) This is WINDOWS specific and needs to be fixed to work on UNIX
 9 | %        (2) The search is case insensitive (like most of WINDOWS).
10 | %
11 | % Examples: (1) Open a temporary text file:
12 | %               d=winenvar('temp'); fid=fopen(fullfile(d,'temp.txt'),'wt');
13 | 
14 | %   Copyright (c) 2005 Mike Brookes,  mike.brookes@ic.ac.uk
15 | %      Version: $Id: winenvar.m 713 2011-10-16 14:45:43Z dmb $
16 | %
17 | %   VOICEBOX is a MATLAB toolbox for speech processing.
18 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
19 | %
20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
21 | %   This program is free software; you can redistribute it and/or modify
22 | %   it under the terms of the GNU General Public License as published by
23 | %   the Free Software Foundation; either version 2 of the License, or
24 | %   (at your option) any later version.
25 | %
26 | %   This program is distributed in the hope that it will be useful,
27 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29 | %   GNU General Public License for more details.
30 | %
31 | %   You can obtain a copy of the GNU General Public License from
32 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
33 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
34 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
35 | p=['%',n,'%'];                  % placeholder the shell is asked to expand
36 | [s,d]=system(['echo ',p]);
37 | % Strip trailing characters up to and including space (newline etc.); the
38 | % isempty guard avoids indexing d(end) if nothing else remains.
39 | while ~isempty(d) && d(end)<=' '
40 |     d(end)=[];
41 | end
42 | if strcmp(d,p)                  % the placeholder came back unchanged
43 |     d=[];
44 | end


--------------------------------------------------------------------------------
/rVAD2.0/rfft.m:
--------------------------------------------------------------------------------
 1 | function y=rfft(x,n,d)
 2 | %RFFT     Calculate the DFT of real data Y=(X,N,D)
 3 | % Data is truncated/padded to length N if specified.
 4 | %   N even:	(N+2)/2 points are returned with
 5 | % 			the first and last being real
 6 | %   N odd:	(N+1)/2 points are returned with the
 7 | % 			first being real
 8 | % In all cases fix(1+N/2) points are returned
 9 | % D is the dimension along which to do the DFT
10 | 
11 | 
12 | 
13 | %      Copyright (C) Mike Brookes 1998
14 | %      Version: $Id: rfft.m 713 2011-10-16 14:45:43Z dmb $
15 | %
16 | %   VOICEBOX is a MATLAB toolbox for speech processing.
17 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
18 | %
19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
20 | %   This program is free software; you can redistribute it and/or modify
21 | %   it under the terms of the GNU General Public License as published by
22 | %   the Free Software Foundation; either version 2 of the License, or
23 | %   (at your option) any later version.
24 | %
25 | %   This program is distributed in the hope that it will be useful,
26 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
27 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
28 | %   GNU General Public License for more details.
29 | %
30 | %   You can obtain a copy of the GNU General Public License from
31 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
32 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
33 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
34 | 
35 | s=size(x);
36 | if prod(s)==1
37 |     y=x;                        % scalar input is returned as is, without echo
38 | else
39 |     if nargin <3 || isempty(d)
40 |         d=find(s>1,1);          % default: first non-singleton dimension
41 |         if nargin<2
42 |             n=s(d);
43 |         end
44 |     end
45 |     if isempty(n) 
46 |         n=s(d);                 % empty N means "use the data length"
47 |     end
48 |     y=fft(x,n,d);
49 |     % View y as (before d) x n x (after d) so the upper bins can be removed
50 |     y=reshape(y,prod(s(1:d-1)),n,prod(s(d+1:end))); 
51 |     s(d)=1+fix(n/2);
52 |     y(:,s(d)+1:end,:)=[];
53 |     y=reshape(y,s);
54 | end


--------------------------------------------------------------------------------
/rVADfast_py_2.0/rVAD_fast.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | import numpy 
 3 | import pickle
 4 | import os
 5 | import sys
 6 | import math
 7 | import code
 8 | from scipy.signal import lfilter
 9 | import speechproc
10 | from copy import deepcopy
11 | 
12 | # Refs:
13 | #  [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 
14 | #  [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection." 
15 | #  IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
16 | 
17 | # Version: 2.0
18 | # 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan
19 | 
20 | # Usage: python rVAD_fast.py inWaveFile  outputVadLabel
21 | 
22 | 
23 | winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512
24 | ftThres=0.5; vadThres=0.4
25 | opts=1
26 | 
27 | finwav=str(sys.argv[1])
28 | fvad=str(sys.argv[2])
29 | 
30 | fs, data = speechproc.speech_wave(finwav)   
31 | ft, flen, fsh10, nfr10 =speechproc.sflux(data, fs, winlen, ovrlen, nftt)
32 | 
33 | 
34 | # --spectral flatness --
35 | pv01=numpy.zeros(nfr10)
36 | pv01[numpy.less_equal(ft, ftThres)]=1 
37 | pitch=deepcopy(ft)
38 | 
39 | pvblk=speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)
40 | 
41 | 
42 | # --filtering--
43 | ENERGYFLOOR = numpy.exp(-50)
44 | b=numpy.array([0.9770,   -0.9770])
45 | a=numpy.array([1.0000,   -0.9540])
46 | fdata=lfilter(b, a, data, axis=0)
47 | 
48 | 
49 | #--pass 1--
50 | noise_samp, noise_seg, n_noise_samp=speechproc.snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk)
51 | 
52 | #sets noisy segments to zero
53 | for j in range(n_noise_samp):
54 |     fdata[range(int(noise_samp[j,0]),  int(noise_samp[j,1]) +1)] = 0 
55 | 
56 | 
57 | vad_seg=speechproc.snre_vad(fdata,  nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres)
58 | 
59 | numpy.savetxt(fvad, vad_seg.astype(int),  fmt='%i')
60 | print("%s --> %s " %(finwav, fvad))
61 | 
62 | data=None; pv01=None; pitch=None; fdata=None; pvblk=None; vad_seg=None
63 |      
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/rVADfast_py_2.0/rVAD_fast_stream.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import code
 4 | import pyaudio
 5 | import wave
 6 | 
 7 | 
 8 | CHUNK = 1024 # frames per stream.read() call, also passed as frames_per_buffer
 9 | FORMAT = pyaudio.paInt16
10 | CHANNELS = 1 # one channel, so a chunk of CHUNK frames holds 1024 samples
11 | RATE = 44100 # number of samples per second
12 | dur = 5 # duration of each recorded audio chunk (seconds)
13 | 
14 | def record():
15 | 
16 |     p = pyaudio.PyAudio()
17 | 
18 |     stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
19 | 
20 |     print(".......................................")
21 |     print("Start recording of 5s chunk of audio")
22 |     print("Stop recording - press clt+c")
23 |     print(".......................................\n")
24 |     frames = []
25 |     cont=0
26 | 
27 |     try:
28 |          while True:
29 |              data = stream.read(CHUNK)
30 |              frames.append(data)
31 |              cont = cont + CHUNK
32 |              if cont >= dur*RATE:  #for dur seconds audio
33 |                 print('Recorded %d seconds audio' %(dur))
34 |                 print('rVAD going ...')
35 |                 break;
36 | 
37 |     except KeyboardInterrupt:
38 |            print("Done recording: stored --> output.wav")
39 |     except Exception as e:
40 |            print(str(e))
41 | 
42 |     sample_width = p.get_sample_size(FORMAT)
43 | 
44 |     stream.stop_stream()
45 |     stream.close()
46 |     p.terminate()
47 | 
48 |     return sample_width, frames
49 | 
50 | 
51 | def record_to_file(file_path):
52 |     wf = wave.open(file_path, 'wb')
53 |     wf.setnchannels(CHANNELS)
54 | 
55 |     sample_width, frames = record()
56 | 
57 |     wf.setsampwidth(sample_width)
58 |     wf.setframerate(RATE)
59 |     wf.writeframes(b''.join(frames))
60 |     wf.close()
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     part=0
65 |     while True:
66 |        audPart = 'output'+ str(part)  
67 |        record_to_file(audPart + '.wav')
68 |        cmd = 'python3' + " " + 'rVAD_fast.py' + " " + audPart +'.wav' + " " + audPart+'.txt'
69 |        os.system(cmd)
70 |        print('Result for audio chunk%d written' %(part))
71 |        part = part + 1
72 | 


--------------------------------------------------------------------------------
/rVAD2.0/snre_highenergy.m:
--------------------------------------------------------------------------------
 1 | function [noise_samp, n_noise_samp, noise_seg, D, Dsmth, snre_vad, e]=snre_highenergy(dfdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01) 
 2 | 
 3 | % Ref:
 4 | %   Z-H Tan and B Lindberg
 5 | %   Low-Complexity Variable Frame Rate Analysis for Speech Recognition and Voice Activity Detection
 6 | %   IEEE Journal of Selected Topics in Signal Processing, 4(5), Oct. 2010.
 7 | 
 8 | % square root of a posteriori SNR weighted engergy difference, block based 
 9 | %
10 | % Modified 01 Mar 2013
11 | 
12 | Dexpl=18;
13 | Dexpr=18;
14 | segThres = 0.25; 
15 | e=zeros(nfr10,1);
16 | for i=1:nfr10
17 |     for j=1:flen 
18 |         e(i)=e(i)+dfdata((i-1)*fsh10+j)*dfdata((i-1)*fsh10+j);  
19 |     end
20 |     if e(i) <= ENERGYFLOOR
21 |         e(i)=ENERGYFLOOR;
22 |     end
23 | end
24 | 
25 | 
26 | 
27 | emin=ones(nfr10,1);
28 | NESEG = 200;
29 | if nfr10 < NESEG; NESEG=nfr10; end
30 | for i=1:floor(nfr10/NESEG)
31 | 	[eY,eI]=sort(e((i-1)*NESEG+1:i*NESEG));
32 | 	emin((i-1)*NESEG+1:i*NESEG)=eY(floor(NESEG*0.1));
33 |     if i~=1
34 |         emin((i-1)*NESEG+1:i*NESEG)=0.9*emin((i-1)*NESEG)+0.1*emin((i-1)*NESEG+1); 
35 |     end
36 | end
37 | if i*NESEG~=nfr10
38 |     [eY,eI]=sort(e((i-1)*NESEG+1:nfr10));
39 |     emin(i*NESEG+1:nfr10)=eY(floor((nfr10-(i-1)*NESEG)*0.1));
40 |     emin(i*NESEG+1:nfr10)=0.9*emin(i*NESEG)+0.1*emin(i*NESEG+1);
41 | end
42 | 
43 | 
44 | 
45 | 
46 | D=zeros(nfr10,1);   
47 | postsnr=zeros(nfr10,1);
48 | for i=2:nfr10
49 |     postsnr(i) =log10(e(i))-log10(emin(i));
50 |     if postsnr(i)<0
51 |         postsnr(i)=0; 
52 |     end 
53 |     D(i)=sqrt(abs(e(i)-e(i-1))*postsnr(i));
54 | end
55 | D(1)=D(2);
56 | 
57 | Dexp = vertcat(ones(Dexpl,1)*D(1), D, ones(Dexpr,1)*D(nfr10));
58 | Dsmth = zeros(nfr10,1);
59 | for i=1:nfr10
60 |     Dsmth(i)=sum(Dexp(i:i+Dexpl+Dexpr));
61 | end
62 | 
63 | for i=1:floor(nfr10/NESEG)
64 |     Dsmth_max((i-1)*NESEG+1:i*NESEG)=max(e((i-1)*NESEG+1:i*NESEG));
65 | end
66 | if i*NESEG~=nfr10
67 |     Dsmth_max(i*NESEG+1:nfr10)=max(e((i-1)*NESEG+1:nfr10)); 
68 | end
69 | 
70 | snre_vad = zeros(nfr10,1);
71 | for i=1:nfr10
72 |    if Dsmth(i)>Dsmth_max(i)*segThres; snre_vad(i)=1; end
73 | end
74 | 
75 | 
76 | 
77 | % block based processing to remove noise part by using snre_vad1.
78 | sign_vad = 0;
79 | noise_seg=zeros(floor(nfr10/1.6),1);
80 | noise_samp=zeros(nfr10,2);
81 | n_noise_samp=0;
82 | for i=1:nfr10
83 |     if snre_vad(i) == 1 && sign_vad == 0 % start of a segment
84 |         sign_vad = 1;
85 |         nstart=i;
86 |     elseif (snre_vad(i) ==0 || i==nfr10) && sign_vad == 1 % end of a segment
87 |         sign_vad = 0;
88 |         nstop=i-1;
89 |         if sum(pv01(nstart:nstop))==0
90 |             noise_seg(round(nstart/1.6):floor(nstop/1.6)) = 1;
91 |             n_noise_samp=n_noise_samp+1;
92 |             noise_samp(n_noise_samp,:)=[(nstart-1)*fsh10+1 nstop*fsh10];
93 |         end
94 |     end
95 | end
96 | noise_samp(n_noise_samp+1:nfr10,:)=[];
97 | 
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # rVAD
 2 | 
 3 | ## Description
 4 | Matlab and Python libraries for an unsupervised method for robust voice activity detection (rVAD) or speech activity detection (SAD), as presented in [rVAD: An Unsupervised Segment-Based Robust Voice Activity Detection Method, Computer Speech & Language, 2020](https://www.sciencedirect.com/science/article/pii/S0885230819300920) or its [arXiv version](https://arxiv.org/abs/1906.03588). 
 5 | 
 6 | ***The rVAD paper published in Computer Speech & Language won International Speech Communication Association (ISCA) 2022 Best Research Paper Award.***
 7 | 
 8 | The rVAD method consists of two passes of denoising followed by a VAD stage. It has been applied as a preprocessor for a wide range of applications, such as speech recognition, speaker identification, language identification, age and gender identification, self-supervised learning, human-robot interaction, audio archive segmentation, and so on as in [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=fugL2E8AAAAJ&citation_for_view=fugL2E8AAAAJ:-mN3Mh-tlDkC).  
 9 | 
10 | The method is unsupervised to make it applicable to a broad range of acoustic environments, and it is optimized considering both noisy and clean conditions. 
11 | 
12 | rVAD (out of the box) ranked 4th out of 27 supervised/unsupervised systems in a Fearless Steps Speech Activity Detection Challenge. 
13 | 
14 | The rVAD paper was among [the most cited articles from Computer Speech and Language published since 2018](https://www.journals.elsevier.com/computer-speech-and-language/most-cited-articles) (6th place) in 2022 and 2023.
15 | 
16 | ## Source code for rVAD: 
17 | Source code in Matlab for rVAD (including both rVAD and rVAD-fast) is available under the [rVAD2.0](rVAD2.0/) folder. It is straightforward to use: Simply call the function vad.m. Some Matlab functions and their modified versions from the publicly available VoiceBox are included with kind permission of Mike Brookes.  
18 | 
19 | Source code in Python for rVAD-fast is available under the [rVADfast_py_2.0](rVADfast_py_2.0/) folder. Source code for rVAD-fast to take streaming audio in is included too. 
20 | 
21 | rVAD-fast is 10+ times faster than rVAD while rVAD has superior performance. 
22 | 
23 | The rVADfast library is available as a python package installable via:
24 | pip install rVADfast.
25 | See [rVADfast GitHub page](https://github.com/zhenghuatan/rVADfast) for more details. 
26 | 
27 | ## Reference VAD for Aurora 2 database:
28 | The frame-by-frame reference VAD was generated from the clean set of Aurora 2 using forced-alignment speech recognition and has been used as a 'ground truth' for evaluating VAD algorithms. Our study shows that forced-alignment ASR performs as well as a human expert labeler for generating VAD references, as detailed in [Comparison of Forced-Alignment Speech Recognition and Humans for Generating Reference VAD](https://www.isca-speech.org/archive/pdfs/interspeech_2015/kraljevski15_interspeech.pdf). Here are the generated [reference VAD for the training set](Aurora2TrainSet-ReferenceVAD.zip) and the [reference VAD for the test set](Aurora2TestSet-ReferenceVAD.zip). 
29 | 
30 | 


--------------------------------------------------------------------------------
/rVAD2.0/vad.m:
--------------------------------------------------------------------------------
 1 | function []=vad(finwav, fvad, opts, vadThres)
 2 | 
 3 | % Usage: vad(finwav, fvad) 
 4 | %        vad(finwav, fvad, opts) 
 5 | %        vad(finwav, fvad, opts, vadThres). 
 6 | %
 7 | % finwav: The input WAVE file path and name. For a multi-channel file only the first channel is analysed.
 8 | %
 9 | % fvad: The output VAD file path and name (required). If the output is in 0-1 format, each line in the file is the label for that frame (0 for non-speech and 1 for speech), while if the output is in the segment format, each line contains the start frame number and the end frame number for a speech segment. The default is 0-1 format, and one can switch to the segment format by choosing another line of fprintf in the end of this code. The frame shift is 10ms.
10 | %
11 | % opts: 0 for using pitch (default option), and 1 for using flatness (significantly faster at the cost of slightly reduced accuracy). 
12 | %
13 | % vadThres: The threshold for VAD. The default value is 0.4. Increasing vadThres (e.g. to 0.5) makes the VAD more aggressive, i.e. the number of frames to be detected as speech will be reduced.
14 | %
15 | % Refs:
16 | %  [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 
17 | %  [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
18 | %
19 | % 2017-11-28, Zheng-Hua Tan
20 | 
21 | if nargin < 2; error('Usage: vad(finwav, fvad)'); end
22 | if nargin == 2
23 |   opts = 0; vadThres = 0.4; 
24 | elseif nargin == 3
25 |   vadThres = 0.4; 
26 | end
27 | 
28 | [data,fs]= audioread(finwav);
29 | % [data,fs]=wavread(finwav);
30 | % [data, fs]=aurora2read(finwav);
31 | 
32 | % The frame energy code indexes the signal as a vector, so only the first
33 | % channel of a multi-channel file is used.
34 | if size(data,2) > 1; data = data(:,1); end
35 | 
36 | % Parameter setting
37 | ENERGYFLOOR = exp(-50);
38 | flen=floor(fs/40); % 25ms frame length 
39 | % 10ms frame shift; floor keeps it a whole number of samples (it is used as an
40 | % index step) for sampling rates such as 11025 or 22050 Hz.
41 | fsh10=floor(fs/100);
42 | nfr10=floor((length(data)-(flen-fsh10))/fsh10);
43 | 
44 | b=[0.9770   -0.9770]; a=[ 1.0000   -0.9540];
45 | fdata=filter(b,a,data);
46 | 
47 | if opts == 0
48 |   [pv01, pitch]=pitchestm(data, fs, nfr10);
49 | else              % using flatness 
50 |   ftThres = 0.5;  % Default threshold. It can range from 0 to 1. Increasing ftThres increases the number of frames being detected as speech.
51 |   [ft]= sflux(data,flen,fsh10);
52 |   pv01 = (ft <= ftThres);  % <= threshold would give  1( meaning a speech frame)
53 |   pitch=ft;
54 | end
55 | 
56 | pvblk=pitchblockdetect(pv01, nfr10, pitch, opts);
57 | 
58 | [noise_samp, n_noise_samp, noise_seg]=snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01);
59 | 
60 | %% Set high energy segments to zero 
61 | for i=1:n_noise_samp
62 |     fdata(noise_samp(i,1):noise_samp(i,2)) = 0;
63 | end
64 | 
65 | [dfdatarm]=specsub(fdata,fs);
66 | % [dfdatarm]=specsub(fdata,fs,noise_seg,pv01);
67 | 
68 | [vad_seg]=snre_vad(dfdatarm, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres);
69 | 
70 | %% Output VAD results in 0-1 format (1 for speech frames and 0 for non-speech ones) 
71 | if isempty(vad_seg) ==1
72 |    z=zeros(nfr10,1);
73 | else
74 |    % Expand every [start end] row of vad_seg into its list of frame numbers.
75 |    y=[];
76 |    for i=1:size(vad_seg,1)
77 |        y=[ y ; [ vad_seg(i,1):vad_seg(i,2)]' ];
78 |    end
79 |    z=zeros(nfr10,1);
80 |    z([y],1)=1;
81 | 
82 |    if sum(z) ~= size(y,1) % checking
83 |       error('The number of labeled speech frames does not matched the results of detected speech segments!');
84 |    end
85 | end
86 | 
87 | fid=fopen(fvad,'w');
88 | if fid<0
89 |    % fopen returns -1 on failure; report the path instead of failing in fprintf.
90 |    error('vad:cannotOpenOutput','Cannot open output file: %s',fvad);
91 | end
92 | fprintf(fid, '%d\n',z'); % 0-1 VAD output
93 | % fprintf(fid, '%d\n',vad_seg); % segment-label VAD output
94 | fclose(fid);


--------------------------------------------------------------------------------
/rVAD2.0/irfft.m:
--------------------------------------------------------------------------------
 1 | function x=irfft(y,n,d)
 2 | %IRFFT    Inverse fft of a conjugate symmetric spectrum X=(Y,N,D)
 3 | %
 4 | % Inputs:  Y(M)   The first half of a complex spectrum
 5 | %          N      The number of output points to generate (default: 2M-2)
 6 | %          D      The dimension along which to perform the transform
 7 | %                 (default: first non-singleton dimension of Y)
 8 | %
 9 | % Outputs: X(N)   Real inverse dft of Y
10 | %
11 | % This routine calculates the inverse DFT of a conjugate-symmetric spectrum to give a real-valued
12 | % output of dimension N. Only the first half of the spectrum need be supplied: if N is even,
13 | % this includes the Nyquist term and is of dimension M=N/2 + 1 whereas if N is odd then there is
14 | % no Nyquist term and the input is of dimension M=(N+1)/2.
15 | % Note that the default value of N is always even so that N must be given explicitly
16 | % if it is odd. A scalar Y is returned unchanged.
17 | %
18 | % See also the forward transform: RFFT
19 | 
20 | %      Copyright (C) Mike Brookes 2009
21 | %      Version: $Id: irfft.m 713 2011-10-16 14:45:43Z dmb $
22 | %
23 | %   VOICEBOX is a MATLAB toolbox for speech processing.
24 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
25 | %
26 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
27 | %   This program is free software; you can redistribute it and/or modify
28 | %   it under the terms of the GNU General Public License as published by
29 | %   the Free Software Foundation; either version 2 of the License, or
30 | %   (at your option) any later version.
31 | %
32 | %   This program is distributed in the hope that it will be useful,
33 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
34 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
35 | %   GNU General Public License for more details.
36 | %
37 | %   You can obtain a copy of the GNU General Public License from
38 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
39 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
41 | 
42 | s=size(y);
43 | ps=prod(s);     % total number of elements in y
44 | ns=length(s);   % number of dimensions of y
45 | if ps==1
46 |     x=y;        % scalar input is passed through; the ';' stops the value being echoed to the console
47 | else
48 |     if nargin <3 || isempty(d)
49 |         d=find(s>1);
50 |         d=d(1);             % first non-singleton dimension
51 |     end
52 |     m=s(d);
53 |     k=ps/m;     % number of fft's to do
54 |     if d==1
55 |         v=reshape(y,m,k);
56 |     else
57 |         v=reshape(permute(y,[d:ns 1:d-1]),m,k);   % bring dimension d to the front, one spectrum per column
58 |     end
59 |     if nargin<2 || isempty(n)
60 |         n=2*m-2;        % default output length
61 |     else
62 |         mm=1+fix(n/2);          % expected input length
63 |         if mm>m v=[v; zeros(mm-m,k)];   % zero pad
64 |         elseif mm<m v(mm+1:m,:)=[];     % or truncate
65 |         end
66 |         m=mm;
67 |     end
68 |     if rem(n,2)		% odd output length
69 |         x=real(ifft([v;conj(v(m:-1:2,:))],[],1));    % do it the long way
70 |     else			% even output length
71 |         v(m,:)=real(v(m,:));	% force nyquist element real
72 |         w=ones(1,k);            % column replicator used in t(:,w) below
73 |         %  t=[cumprod([-0.5i; exp(2i*pi/n)*ones(m-2,1)]); 0.5i];
74 |         t=-0.5i* exp((2i*pi/n)*(0:m-1)).';
75 |         z=(t(:,w)+0.5).*(conj(flipud(v))-v)+v;
76 |         z(m,:)=[];
77 |         zz=ifft(z,[],1);        % complex ifft of length n/2
78 |         x=zeros(n,k);
79 |         x(1:2:n,:)=real(zz);    % real parts give the odd-numbered output samples
80 |         x(2:2:n,:)=imag(zz);    % imaginary parts give the even-numbered output samples
81 |     end
82 |     s(d)=n;         % change output dimension
83 |     if d==1
84 |         x=reshape(x,s);
85 |     else
86 |         x=permute(reshape(x,s([d:ns 1:d-1])),[ns+2-d:ns 1:ns+1-d]);   % undo the earlier permutation
87 |     end
88 | end
89 | 


--------------------------------------------------------------------------------
/rVAD2.0/findpeaks.m:
--------------------------------------------------------------------------------
  1 | function [k,v]=findpeaks(x,m,w)
  2 | %FINDPEAKS locate peaks (or valleys) of a signal, optionally refined by a parabolic fit [K,V]=(X,M,W)
  3 | %
  4 | %  Inputs:  X   signal to search (unsigned integer types are not supported)
  5 | %           M   mode string, any combination of:
  6 | %                  'q'  refine each peak by quadratic interpolation
  7 | %                  'v'  search for valleys rather than peaks
  8 | %           W   width tolerance: of two peaks no more than W samples apart,
  9 | %               only the higher one is kept
 10 | %
 11 | % Outputs:  K   peak positions in X (may be fractional when M includes 'q')
 12 | %           V   peak heights: interpolated when M includes 'q', otherwise X(K)
 13 | 
 14 | % K and V are always column vectors, whatever the orientation of X.
 15 | % A flat-topped peak is reported at the middle of its plateau. When W is
 16 | % given and two peaks no more than W apart have identical heights, the
 17 | % later one is discarded. Every position satisfies 1<K<length(X).
 18 | %
 19 | % Called without output arguments, the routine plots the result.
 20 | %
 21 | 
 22 | %	   Copyright (C) Mike Brookes 2005
 23 | %      Version: $Id: findpeaks.m 713 2011-10-16 14:45:43Z dmb $
 24 | %
 25 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 26 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 27 | %
 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 29 | %   This program is free software; you can redistribute it and/or modify
 30 | %   it under the terms of the GNU General Public License as published by
 31 | %   the Free Software Foundation; either version 2 of the License, or
 32 | %   (at your option) any later version.
 33 | %
 34 | %   This program is distributed in the hope that it will be useful,
 35 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 36 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 37 | %   GNU General Public License for more details.
 38 | %
 39 | %   You can obtain a copy of the GNU General Public License from
 40 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 41 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 43 | 
 44 | if nargin<2
 45 |     m=' ';
 46 | end
 47 | wantValleys=any(m=='v');
 48 | npts=length(x);
 49 | x=x(:);                 % always work on a column vector
 50 | if wantValleys
 51 |     x=-x;               % valleys of x are the peaks of -x
 52 | end
 53 | stepv=x(2:end)-x(1:end-1);
 54 | up=find(stepv>0);       % positions followed by a rise
 55 | dn=find(stepv<0);       % positions followed by a fall
 56 | 
 57 | k=[];                   % result when the signal has no rise or no fall
 58 | v=[];
 59 | if ~isempty(up) && ~isempty(dn)    % a peak needs at least one rise and one fall
 60 |     upgap=up;
 61 |     upgap(2:end)=up(2:end)-up(1:end-1);
 62 |     acc=ones(npts,1);
 63 |     acc(up+1)=1-upgap;
 64 |     acc(1)=0;
 65 |     sinceUp=cumsum(acc);    % samples elapsed since the most recent rise
 66 | 
 67 |     dngap=dn;
 68 |     dngap(2:end)=dn(2:end)-dn(1:end-1);
 69 |     acc=ones(npts,1);
 70 |     acc(dn+1)=1-dngap;
 71 |     acc(1)=0;
 72 |     sinceDn=cumsum(acc);    % samples elapsed since the most recent fall
 73 | 
 74 |     acc=-ones(npts,1);
 75 |     acc([1; up+1])=[upgap-1; npts-up(end)-1];
 76 |     toUp=cumsum(acc);       % samples remaining until the next rise
 77 | 
 78 |     acc=-ones(npts,1);
 79 |     acc([1; dn+1])=[dngap-1; npts-dn(end)-1];
 80 |     toDn=cumsum(acc);       % samples remaining until the next fall
 81 | 
 82 |     % a peak lies after a rise and before a fall; the last term picks the plateau centre
 83 |     k=find((sinceUp<sinceDn) & (toDn<toUp) & (floor((toDn-sinceUp)/2)==0));
 84 |     v=x(k);
 85 | 
 86 |     if any(m=='q')          % fit a parabola through each peak and its two neighbours
 87 |         halfslope=0.5*(x(k+1)-x(k-1));
 88 |         curv=x(k)-halfslope-x(k-1);
 89 |         sharp=(curv>0);     % false on a plateau
 90 |         v(sharp)=x(k(sharp))+0.25*halfslope(sharp).^2./curv(sharp);
 91 |         k(sharp)=k(sharp)+0.5*halfslope(sharp)./curv(sharp);
 92 |         k(~sharp)=k(~sharp)+(toDn(k(~sharp))-sinceUp(k(~sharp)))/2;    % shift by 0.5 when the plateau width is even
 93 |     end
 94 | 
 95 |     % discard the lower peak of every pair that is too close together
 96 | 
 97 |     if nargin>2
 98 |         near=find(k(2:end)-k(1:end-1)<=w);
 99 |         while any(near)
100 |             near=near+(v(near)>=v(near+1));   % point at the loser of each pair (the later one on a tie)
101 |             k(near)=[];
102 |             v(near)=[];
103 |             near=find(k(2:end)-k(1:end-1)<=w);
104 |         end
105 |     end
106 | end
107 | if wantValleys
108 |     v=-v;    % undo the sign flip applied to x
109 | end
110 | if ~nargout
111 |     if wantValleys
112 |         x=-x;    % plot the signal with its original sign
113 |         marker='v';
114 |     else
115 |         marker='^';
116 |     end
117 |     plot(1:npts,x,'-',k,v,marker);
118 | end
119 | 


--------------------------------------------------------------------------------
/rVAD2.0/enframe.m:
--------------------------------------------------------------------------------
  1 | function [f,t,w]=enframe(x,win,inc,m)
  2 | %ENFRAME split signal up into (overlapping) frames: one per row. [F,T,W]=(X,WIN,INC,M)
  3 | %
  4 | % Usage:  (1) f=enframe(x,n)     % split into frames of length n
  5 | %
  6 | %         (2) f=enframe(x,hamming(n,'periodic'),n/4)     % use a 75% overlapped Hamming window of length n
  7 | %
  8 | %  Inputs:   x    input signal
  9 | %          win    window or window length in samples
 10 | %          inc    frame increment in samples
 11 | %            m    mode input:
 12 | %                  'z'  zero pad to fill up final frame
 13 | %                  'r'  reflect last few samples for final frame
 14 | %                  'A'  calculate window times as the centre of mass
 15 | %                  'E'  calculate window times as the centre of energy
 16 | %
 17 | % Outputs:   f    enframed data - one frame per row
 18 | %            t    fractional time in samples at the centre of each frame
 19 | %            w    window function used
 20 | %
 21 | % By default, the number of frames will be rounded down to the nearest
 22 | % integer and the last few samples of x() will be ignored unless its length
 23 | % is lw more than a multiple of inc. If the 'z' or 'r' options are given,
 24 | % the number of frame will instead be rounded up and no samples will be ignored.
 25 | %
 26 | % Example of frame-based processing:
 27 | %          INC=20       						% set frame increment in samples
 28 | %          NW=INC*2     						% oversample by a factor of 2 (4 is also often used)
 29 | %          S=cos((0:NW*7)*6*pi/NW);				% example input signal
 30 | %          W=sqrt(hamming(NW,'periodic'));  	% sqrt hamming window of period NW
 31 | %          F=enframe(S,W,INC);               	% split into frames
 32 | %          ... process frames ...
 33 | %          X=overlapadd(F,W,INC);               % reconstitute the time waveform (omit "X=" to plot waveform)
 34 | 
 35 | % Bugs/Suggestions:
 36 | %  (1) Possible additional mode options:
 37 | %        'u'  modify window for first and last few frames to ensure WOLA
 38 | %        'a'  normalize window to give a mean of unity after overlaps
 39 | %        'e'  normalize window to give an energy of unity after overlaps
 40 | %        'wm' use Hamming window
 41 | %        'wn' use Hanning window
 42 | %        'x'  include all frames that include any of the x samples
 43 | 
 44 | %	   Copyright (C) Mike Brookes 1997-2012
 45 | %      Version: $Id: enframe.m 1713 2012-03-30 21:27:46Z dmb $
 46 | %
 47 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 48 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 49 | %
 50 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 51 | %   This program is free software; you can redistribute it and/or modify
 52 | %   it under the terms of the GNU General Public License as published by
 53 | %   the Free Software Foundation; either version 2 of the License, or
 54 | %   (at your option) any later version.
 55 | %
 56 | %   This program is distributed in the hope that it will be useful,
 57 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 58 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 59 | %   GNU General Public License for more details.
 60 | %
 61 | %   You can obtain a copy of the GNU General Public License from
 62 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 63 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 64 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 65 | 
 66 | nx=length(x(:));
 67 | if nargin<2 || isempty(win)
 68 |     win=nx;                 % default: a single frame spanning the whole signal
 69 | end
 70 | if nargin<4 || isempty(m)
 71 |     m='';
 72 | end
 73 | nwin=length(win);
 74 | if nwin == 1                % scalar win is a frame length: use a rectangular window
 75 |     lw = win;
 76 |     w = ones(1,lw);
 77 | else                        % vector win is the window itself
 78 |     lw = nwin;
 79 |     w = win(:)';
 80 | end
 81 | if (nargin < 3) || isempty(inc)
 82 |     inc = lw;               % default: non-overlapping frames
 83 | end
 84 | nli=nx-lw+inc;
 85 | nf = fix((nli)/inc);        % number of complete frames
 86 | na=nli-inc*nf;              % left-over count used by the 'z' and 'r' options for the extra frame
 87 | f=zeros(nf,lw);
 88 | indf= inc*(0:(nf-1)).';     % zero-based start offset of each frame
 89 | inds = (1:lw);              % sample offsets within a frame
 90 | f(:) = x(indf(:,ones(1,lw))+inds(ones(nf,1),:));
 91 | if nargin>3 && (any(m=='z') || any(m=='r')) && na>0
 92 |     if any(m=='r')
 93 |         ix=1+mod(nx-na:nx-na+lw-1,2*nx);
 94 |         f(nf+1,:)=x(ix+(ix>nx).*(2*nx+1-2*ix));   % indices past the end are mirrored back into x
 95 |     else
 96 |         f(nf+1,1:na)=x(1+nx-na:nx);               % remainder of the row stays zero
 97 |     end
 98 |     nf=size(f,1);
 99 | end
100 | 
101 | 
102 | if (nwin > 1)   % if we have a non-unity window
103 |     f = f .* w(ones(nf,1),:);
104 | end
105 | if nargout>1
106 |     if any(m=='E')          % centre of energy of the window
107 |         t0=sum((1:lw).*w.^2)/sum(w.^2);
108 |     elseif any(m=='A')      % centre of mass of the window (this branch previously tested 'E' again and was unreachable)
109 |         t0=sum((1:lw).*w)/sum(w);
110 |     else                    % geometric centre of the frame
111 |         t0=(1+lw)/2;
112 |     end
113 |     t=t0+inc*(0:(nf-1)).';
114 | end
115 | 
116 | 
117 | 


--------------------------------------------------------------------------------
/rVAD2.0/snre_vad.m:
--------------------------------------------------------------------------------
  1 | function [vad_seg, D, Dsmth, snre_vad, pv_vad, e, segsnr]=snre_vad(dfdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres)
  2 | 
  3 | % SNRE_VAD  Speech/non-speech decision from an a posteriori SNR weighted energy distance.
  4 | %
  5 | % Inputs:
  6 | %   dfdata       signal samples; frame i covers samples (i-1)*fsh10+1 ... (i-1)*fsh10+flen
  7 | %   nfr10        number of frames
  8 | %   flen         frame length in samples
  9 | %   fsh10        frame shift in samples
 10 | %   ENERGYFLOOR  lower bound applied to every frame energy
 11 | %   pv01         per-frame 0/1 voicing flag supplied by the caller
 12 | %   pvblk        per-frame 0/1 flag; every run of ones is processed as one segment
 13 | %   vadThres     fraction of a segment's pv01-weighted mean smoothed distance used as decision threshold
 14 | %
 15 | % Outputs:
 16 | %   vad_seg      one row [first_frame last_frame] per detected speech segment (0-by-2 if none)
 17 | %   D            weighted energy distance per frame
 18 | %   Dsmth        D summed over a sliding window of Dexpl+Dexpr frames
 19 | %   snre_vad     0/1 decision per frame before the pv01-based trimming/extension
 20 | %   pv_vad       final 0/1 decision per frame
 21 | %   e            frame energies
 22 | %   segsnr       all zeros (never updated in this function; kept for interface compatibility)
 23 | %
 24 | % Ref:
 25 | %   Zheng-Hua Tan and Børge Lindberg
 26 | %   Low-Complexity Variable Frame Rate Analysis for Speech Recognition and Voice Activity Detection
 27 | %   IEEE Journal of Selected Topics in Signal Processing, 4(5), Oct. 2010.
 28 | 
 29 | % a posteriori SNR weighted energy difference
 30 | Dexpl=18;
 31 | Dexpr=18;
 32 | % vadThres = 0.1; %0.1 best  %0.125 %18, 18, 0.1 for maximal, 0.16 for mean, 0.12 for ss-seg
 33 | Dsmth=zeros(nfr10,1); % smoothed energy distance 
 34 | 
 35 | % energy estimation
 36 | e=zeros(nfr10,1);
 37 | for i=1:nfr10
 38 |     for j=1:flen
 39 |         e(i)=e(i)+dfdata((i-1)*fsh10+j)*dfdata((i-1)*fsh10+j);
 40 |     end
 41 |     if e(i) <= ENERGYFLOOR
 42 |         e(i)=ENERGYFLOOR;
 43 |     end
 44 | end
 45 | 
 46 | segsnr=zeros(nfr10,1);
 47 | D=zeros(nfr10,1);
 48 | postsnr=D;
 49 | snre_vad=zeros(nfr10,1);
 50 | sign_pv=0;
 51 | for i=1:nfr10
 52 |     if pvblk(i)==1 && sign_pv==0
 53 |         nstart=i;
 54 |         sign_pv=1; % a pitch segment starts 
 55 |     elseif (pvblk(i)==0 || i==nfr10) && sign_pv==1 
 56 |         nstop=i-1;    % a pitch segment ends
 57 |         if i==nfr10; nstop=i; end
 58 |         sign_pv=0;
 59 |         
 60 |         %if nstart>1 && nstop<nfr10
 61 |         %datai=dfdata((nstart-1)*fsh10+1:nstop*fsh10+flen-fsh10);
 62 |         datai=dfdata((nstart-1)*fsh10+1:(nstop-1)*fsh10+flen-fsh10);
 63 |         
 64 |         %[datai,snr]=specsub_segment(datai,8000);
 65 |         %segsnr(nstart:nstop)=10*log10(snr);
 66 |         %if isinf(segsnr(nstart+1))
 67 |         %    segsnr(nstart:nstop)=200;
 68 |         %end
 69 |         % NOTE(review): e(j) already holds the frame energy from the loop above, so this adds
 70 |         % the energy of the same samples a second time - confirm this is intended.
 71 |         for j=nstart:nstop-1 % the last frame of the segment is handled just below
 72 |             for h=1:flen
 73 |                 e(j)=e(j)+datai((j-nstart)*fsh10+h)*datai((j-nstart)*fsh10+h);
 74 |             end
 75 |             if e(j) <= ENERGYFLOOR; e(j)=ENERGYFLOOR; end
 76 |         end
 77 |         e(nstop)=e(nstop-1);
 78 |         %end
 79 |         
 80 |         eY=sort(e(nstart:nstop));
 81 |         % energy at the 10% point of the sorted segment; max(1,...) keeps the index
 82 |         % valid for segments shorter than 10 frames (floor would give 0)
 83 |         emin=eY(max(1,floor((nstop-nstart+1)*0.1)));
 84 |         for j=nstart+1:nstop
 85 |             postsnr(j) =log10(e(j))-log10(emin); % calculate a posteriori SNR within a pitch segment
 86 |             if postsnr(j)<0
 87 |                 postsnr(j)=0;
 88 |             end
 89 |             D(j)=sqrt(abs(e(j)-e(j-1))*postsnr(j)); % weighted energy distance 
 90 |         end
 91 |         D(nstart)=D(nstart+1);
 92 |        
 93 |         % pad the segment with its edge values so the sliding sum is defined at both ends
 94 |         Dexp = vertcat(ones(Dexpl,1)*D(nstart), D(nstart:nstop), ones(Dexpr,1)*D(nstop));
 95 |         for j=0:nstop-nstart
 96 |             Dsmth(nstart+j)=sum(Dexp(j+1:j+Dexpl+Dexpr));
 97 |         end
 98 |         
 99 |         % Dsmth_thres = max(Dsmth(nstart:nstop));
100 |         Dsmth_thres=sum(Dsmth(nstart:nstop).*pv01(nstart:nstop))/sum(pv01(nstart:nstop));
101 |         
102 |         for j=nstart:nstop
103 |             if Dsmth(j)>Dsmth_thres*vadThres
104 |                 snre_vad(j)=1;
105 |             end
106 |         end
107 |     end
108 | end
109 | pv_vad=snre_vad;
110 | 
111 | % Trim each detected run: frames far away from the nearest pv01 frame become non-speech
112 | nexpl=33;
113 | nexpr=47; % 29 and 39, estimated statistically, 95% ; 33, 47 %98 for voicebox pitch
114 | sign_vad=0;
115 | for i=1:nfr10
116 |     if snre_vad(i)==1 && sign_vad==0
117 |         nstart=i;
118 |         sign_vad=1;
119 |     elseif (snre_vad(i)==0 || i==nfr10) && sign_vad==1
120 |         nstop=i-1;
121 |         if i==nfr10; nstop=i; end
122 |         sign_vad=0;
123 |         for j=nstart:nstop
124 |             if pv01(j)==1
125 |                 break;
126 |             end
127 |         end
128 |         % beyond 33 frames to the left, non speech. The range is empty when the first pv01
129 |         % frame is within nexpl frames of nstart; clamping its end to 1 (as was done before)
130 |         % wrongly cleared frame 1 whenever a run started at frame 1.
131 |         pv_vad(nstart:j-nexpl-1)=0;
132 |         for j=0:(nstop-nstart)
133 |             if pv01(nstop-j)==1
134 |                 break;
135 |             end
136 |         end
137 |         pv_vad(nstop-j+1+nexpr:nstop)=0; % beyond 47 frames to the right, non speech
138 |     end
139 | end
140 | 
141 | % Extend each detected run around its first/last pv01 frame and drop weak runs
142 | nexpl =5; nexpr=12; % 9 and 13, estimated statistically 5%; 5, 12 %2 for voicebox pitch
143 | sign_vad=0;
144 | for i=1:nfr10
145 |     if snre_vad(i)==1 && sign_vad==0
146 |         nstart=i;
147 |         sign_vad=1;
148 |     elseif (snre_vad(i)==0 || i==nfr10) && sign_vad==1
149 |         nstop=i-1;
150 |         if i==nfr10; nstop=i; end
151 |         sign_vad=0;
152 |         if  sum(pv01(nstart:nstop)) > 4
153 |             for j=nstart:nstop
154 |                 if pv01(j)==1
155 |                     break;
156 |                 end
157 |             end
158 |             pv_vad(max(j-nexpl,1):j-1)=1;
159 |             for j=0:(nstop-nstart)
160 |                 if pv01(nstop-j)==1
161 |                     break;
162 |                 end
163 |             end
164 |             pv_vad(nstop-j+1:min(nstop-j+nexpr,nfr10))=1;
165 |         end
166 |         esegment=sum(e(nstart:nstop))/(nstop-nstart+1);
167 |         if esegment < 0.001
168 |             pv_vad(nstart:nstop)=0;
169 |         end
170 |         if sum(pv01(nstart:nstop)) <= 2
171 |             pv_vad(nstart:nstop) = 0;
172 |         end
173 |     end
174 | end
175 | 
176 | % Optional pruning (disabled; it has an impact on long-duration recordings only):
177 | % a detected segment whose mean frame energy is below 5% of the average energy
178 | % of all detected speech frames could be reclassified as non-speech, e.g.
179 | %   eave=sum(e(pv_vad==1))/(sum(pv_vad)+eps);
180 | %   if sum(e(nstart:nstop))/(nstop-nstart+1)<eave*0.05; pv_vad(nstart:nstop)=0; end
181 | 
182 | % Convert the frame labels into [first last] rows. Working from the label edges keeps
183 | % the last frame of a run that reaches the end of the signal and also catches a run
184 | % that consists of the final frame only.
185 | edges=diff([0; pv_vad(:)==1; 0]);
186 | vad_seg=[find(edges==1), find(edges==-1)-1];
187 | 
188 | 


--------------------------------------------------------------------------------
/rVAD2.0/estnoiseg.m:
--------------------------------------------------------------------------------
  1 | function [x,zo]=estnoiseg(yf,tz,pp)
  2 | %ESTNOISEG - estimate MMSE noise spectrum [x,zo]=(yf,tz,pp)
  3 | %
  4 | % Usage:    ninc=round(0.016*fs);   % frame increment [fs=sample frequency]
  5 | %           ovf=2;                  % overlap factor
  6 | %           f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2);
  7 | %           f=f.*conj(f);           % convert to power spectrum
  8 | %           x=estnoiseg(f,ninc/fs); % estimate the noise power spectrum
  9 | %
 10 | % Inputs:
 11 | %   yf      power spectra of the noisy signal, one frame per row
 12 | %   tz      either the frame increment in seconds, or the state structure
 13 | %           returned as ZO by an earlier call (to continue processing)
 14 | %   pp      optional structure overriding any of the algorithm parameters
 15 | %           listed below; only used when TZ is a frame increment
 16 | %
 17 | % Outputs:
 18 | %   x       noise power spectrum estimate, one frame per row
 19 | %   zo      state structure to pass as TZ to the next call
 20 | %
 21 | % Algorithm parameters (equation numbers refer to reference [1]):
 22 | %
 23 | %        pp.tax      % smoothing time constant for noise power estimate [0.0717 seconds](8)
 24 | %        pp.tap      % smoothing time constant for smoothed speech prob [0.152 seconds](23)
 25 | %        pp.psthr    % threshold for smoothed speech probability [0.99] (24)
 26 | %        pp.pnsaf    % noise probability safety value [0.01] (24)
 27 | %        pp.pspri    % prior speech probability [0.5] (18)
 28 | %        pp.asnr     % active SNR in dB [15] (18)
 29 | %        pp.psini    % initial speech probability [0.5] (23)
 30 | %        pp.tavini   % assumed speech absent time at start [0.064 seconds]
 31 | %
 32 | % The data may be processed in chunks of arbitrary size by threading the
 33 | % state through successive calls. Thus the following are equivalent:
 34 | %
 35 | %                   (a) dp=estnoiseg(yp(1:300),tinc);
 36 | %
 37 | %                   (b) [dp(1:100,:),z]=estnoiseg(yp(1:100,:),tinc);
 38 | %                       [dp(101:200,:),z]=estnoiseg(yp(101:200,:),z);
 39 | %                       [dp(201:300,:),z]=estnoiseg(yp(201:300,:),z);
 40 | %
 41 | % Called without output arguments, the routine plots the input and the
 42 | % estimated noise against time and against frequency.
 43 | 
 44 | % This is intended to be a precise implementation of [1] for a frame rate of 62.5 Hz.
 45 | % Time constants are adjusted for other frame rates.
 46 | %
 47 | % Refs:
 48 | %    [1] Gerkmann, T. & Hendriks, R. C.
 49 | %        Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay
 50 | %        IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393
 51 | 
 52 | %	   Copyright (C) Mike Brookes 2012
 53 | %      Version: $Id: estnoiseg.m 3387 2013-08-23 12:32:47Z dmb $
 54 | %
 55 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 56 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 57 | %
 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 59 | %   This program is free software; you can redistribute it and/or modify
 60 | %   it under the terms of the GNU General Public License as published by
 61 | %   the Free Software Foundation; either version 2 of the License, or
 62 | %   (at your option) any later version.
 63 | %
 64 | %   This program is distributed in the hope that it will be useful,
 65 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 66 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 67 | %   GNU General Public License for more details.
 68 | %
 69 | %   You can obtain a copy of the GNU General Public License from
 70 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 71 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 72 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 73 | 
 74 | [nfrm,nbin]=size(yf);       % frames are rows, frequency bins are columns
 75 | x=zeros(nfrm,nbin);         % one noise estimate per input frame
 76 | if isempty(yf) && isstruct(tz)
 77 |     zo=tz;                  % nothing to process: hand the state straight back
 78 |     return
 79 | end
 80 | 
 81 | if isstruct(tz)             % continue from the state of a previous call
 82 |     nseen=tz.nrcum;         % frames processed so far
 83 |     noisepow=tz.xt;         % current smoothed noise power estimate
 84 |     spkprob=tz.pslp;        % smoothed speech presence probability (23)
 85 |     tinc=tz.tinc;           % frame increment in seconds
 86 |     qq=tz.qq;               % algorithm parameters
 87 | else                        % first call: tz is the frame increment
 88 |     tinc=tz;
 89 |     nseen=0;
 90 |     % default parameters, see the table in the help text
 91 |     qq=struct('tax',0.0717, ...     % = -tinc/log(0.8) at 62.5 frames/s (8)
 92 |               'tap',0.152, ...      % = -tinc/log(0.9) at 62.5 frames/s (23)
 93 |               'psthr',0.99, ...
 94 |               'pnsaf',0.01, ...
 95 |               'pspri',0.5, ...
 96 |               'asnr',15, ...
 97 |               'psini',0.5, ...
 98 |               'tavini',0.064);
 99 |     if nargin>=3 && ~isempty(pp)    % copy across every recognised field of pp
100 |         names=fieldnames(qq);
101 |         for n=1:length(names)
102 |             key=names{n};
103 |             if isfield(pp,key)
104 |                 qq.(key)=pp.(key);
105 |             end
106 |         end
107 |     end
108 |     spkprob=repmat(qq.psini,1,nbin);    % initial smoothed speech presence probability
109 |     noisepow=[];                        % stays empty if this first call carries no frames
110 | end
111 | 
112 | % constants used inside the frame loop
113 | 
114 | probThresh=qq.psthr;            % threshold on the smoothed speech probability (24)
115 | noiseSafety=qq.pnsaf;           % noise probability safety value (24)
116 | ax=exp(-tinc/qq.tax);           % noise estimate smoothing factor, 0.8 at 62.5 frames/s (8)
117 | axc=1-ax;
118 | ap=exp(-tinc/qq.tap);           % speech probability smoothing factor, 0.9 at 62.5 frames/s (23)
119 | apc=1-ap;
120 | snrLin=10^(qq.asnr/10);         % speech-present SNR as a power ratio
121 | expScale=1/(1+snrLin)-1;
122 | priorRatio=(1/qq.pspri-1)*(1+snrLin);   % p(noise)/p(speech) (18)
123 | 
124 | if nseen==0 && nfrm>0           % very first frame: seed the noise estimate
125 |     nInit=max(1,min(nfrm,round(1+qq.tavini/tinc)));
126 |     noisepow=qq.psini*mean(yf(1:nInit,:),1);
127 | end
128 | 
129 | % recursive update, one frame at a time
130 | for t=1:nfrm
131 |     frm=yf(t,:);                                                    % noisy speech power spectrum
132 |     pSpeech=(1+priorRatio*exp(expScale*frm./noisepow)).^(-1);       % a-posteriori speech presence prob (18)
133 |     spkprob=ap*spkprob+apc*pSpeech;                                 % smoothed speech presence prob (23)
134 |     pSpeech=min(pSpeech,1-noiseSafety*(spkprob>probThresh));        % limit the probability (24)
135 |     rawNoise=(1-pSpeech).*frm+pSpeech.*noisepow;                    % raw noise estimate (22)
136 |     noisepow=ax*noisepow+axc*rawNoise;                              % smoothed noise estimate (8)
137 |     x(t,:)=noisepow;
138 | end
139 | if nargout>1                    % caller wants the state for a follow-up call
140 |     zo.nrcum=nseen+nfrm;        % frames processed so far
141 |     zo.xt=noisepow;             % smoothed noise power estimate
142 |     zo.pslp=spkprob;            % smoothed speech presence probability
143 |     zo.tinc=tinc;               % frame increment in seconds
144 |     zo.qq=qq;                   % algorithm parameters
145 | end
146 | if ~nargout                     % no outputs requested: plot instead
147 |     clf;
148 |     subplot(212);
149 |     plot((1:nfrm)*tinc,10*log10([sum(yf,2) sum(x,2)]))
150 |     ylabel('Frame Energy (dB)');
151 |     xlabel(sprintf('Time (s)   [%d ms frame incr]',round(tinc*1000)));
152 |     axisenlarge([-1 -1.05]);
153 |     legend('input','noise','Location','Best');
154 |     subplot(211);
155 |     plot(1:nbin,10*log10([sum(yf,1)'/nfrm sum(x,1)'/nfrm]))
156 |     ylabel('Power (dB)');
157 |     xlabel('Frequency bin');
158 |     axisenlarge([-1 -1.05]);
159 |     legend('input','noise','Location','Best');
160 | end
161 | 
162 | 


--------------------------------------------------------------------------------
/rVAD2.0/gaussmixp.m:
--------------------------------------------------------------------------------
  1 | function [lp,rp,kh,kp]=gaussmixp(y,m,v,w,a,b)
  2 | %GAUSSMIXP calculate probability densities from a Gaussian mixture model
  3 | %
  4 | % Inputs: n data values, k mixtures, p parameters, q data vector size
  5 | %
  6 | %   Y(n,q) = input data
  7 | %   M(k,p) = mixture means for x(p)
  8 | %   V(k,p) or V(p,p,k) variances (diagonal or full) [default: ones(k,p)]
  9 | %   W(k,1) = weights [default: 1/k for every mixture]
 10 | %   A(q,p), B(q) = transformation: y=x*a'+b' (where y and x are row vectors)
 11 | %            if A is omitted, it is assumed to be the first q rows of the
 12 | %            identity matrix. B defaults to zero.
 13 | %   Note that most commonly, q=p and A and B are omitted entirely.
 14 | %
 15 | % Outputs
 16 | %
 17 | %  LP(n,1) = log probability density of each data point
 18 | %  RP(n,k) = relative probability of each mixture (each row sums to 1)
 19 | %  KH(n,1) = index of the highest probability mixture
 20 | %  KP(n,1) = relative probability of highest probability mixture
 21 | 
 22 | %      Copyright (C) Mike Brookes 2000-2009
 23 | %      Version: $Id: gaussmixp.m 713 2011-10-16 14:45:43Z dmb $
 24 | %
 25 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 26 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 27 | %
 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 29 | %   This program is free software; you can redistribute it and/or modify
 30 | %   it under the terms of the GNU General Public License as published by
 31 | %   the Free Software Foundation; either version 2 of the License, or
 32 | %   (at your option) any later version.
 33 | %
 34 | %   This program is distributed in the hope that it will be useful,
 35 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 36 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 37 | %   GNU General Public License for more details.
 38 | %
 39 | %   You can obtain a copy of the GNU General Public License from
 40 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 41 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 43 | [n,q]=size(y); % n data points, each of dimension q
 44 | [k,p]=size(m); % k mixtures, each with p parameters
 45 | 
 46 | if nargin<4
 47 |     w=repmat(1/k,k,1); % default: equal mixture weights
 48 |     if nargin<3
 49 |         v=ones(k,p); % default: unit diagonal variances
 50 |     end
 51 | end
 52 | fv=ndims(v)>2 || size(v,1)>k; % full covariance matrix is requested
 53 | if nargin>4         % need to modify distribution means
 54 |     if nargin>5
 55 |         m=m*a'+repmat(b',k,1); % apply y=x*a'+b' to the means
 56 |     else
 57 |         m=m*a'; % no offset given: apply y=x*a' to the means
 58 |     end
 59 |     v1=v; % keep the untransformed variances
 60 |     v=zeros(q,q,k); % transformed covariances are stored as full q*q matrices
 61 |     if fv
 62 |         for ik=1:k
 63 |             v(:,:,ik)=a*v1(:,:,ik)*a'; % transform a full covariance matrix
 64 |         end
 65 |     else
 66 |         for ik=1:k
 67 |             v(:,:,ik)=(a.*repmat(v1(ik,:),q,1))*a'; % equals a*diag(v1(ik,:))*a'
 68 |         end
 69 |         fv=1; % the transformed covariances are now full matrices
 70 |     end
 71 | elseif q<p     % need to select coefficient subset
 72 |     m=m(:,1:q);
 73 |     if fv
 74 |         v=v(1:q,1:q,:);
 75 |     else
 76 |         v=v(:,1:q);
 77 |     end
 78 | end
 79 | 
 80 | memsize=voicebox('memsize');    % set memory size to use
 81 | 
 82 | lp=zeros(n,1); % space for the log density of each data point
 83 | rp=zeros(n,k); % space for the relative mixture probabilities
 84 | wk=ones(k,1); % index vector used to replicate a row k times
 85 | 
 86 | if ~fv          % diagonal covariance
 87 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 88 |     % Diagonal Covariance matrices  %
 89 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 90 | 
 91 |     % If data size is large then do calculations in chunks
 92 | 
 93 |     nb=min(n,max(1,floor(memsize/(8*q*k))));    % chunk size for testing data points
 94 |     nl=ceil(n/nb);                  % number of chunks
 95 |     jx0=n-(nl-1)*nb;                % size of first chunk
 96 |     im=repmat((1:k)',nb,1); % mixture index for every (mixture, data point) pair of a full chunk
 97 |     wnb=ones(1,nb); % index vector used to replicate a column nb times (full chunk)
 98 |     wnj=ones(1,jx0); % index vector used to replicate a column jx0 times (first chunk)
 99 | 
100 |     vi=-0.5*v.^(-1);                % data-independent scale factor in exponent
101 |     lvm=log(w)-0.5*sum(log(v),2);   % log of external scale factor (excluding -0.5*q*log(2pi) term)
102 | 
103 |     % first do partial chunk
104 | 
105 |     jx=jx0; % index of the last data point processed so far
106 |     ii=1:jx; % data points in this chunk
107 |     kk=repmat(ii,k,1); % data point index for every (mixture, data point) pair
108 |     km=repmat(1:k,1,jx); % mixture index for every (mixture, data point) pair
109 |     py=reshape(sum((y(kk(:),:)-m(km(:),:)).^2.*vi(km(:),:),2),k,jx)+lvm(:,wnj); % log weighted density (rows=mixtures, columns=data points)
110 |     mx=max(py,[],1);                % find normalizing factor for each data point to prevent underflow when using exp()
111 |     px=exp(py-mx(wk,:));            % find normalized probability of each mixture for each datapoint
112 |     ps=sum(px,1);                   % total normalized likelihood of each data point
113 |     rp(ii,:)=(px./ps(wk,:))';                % relative mixture probabilities for each data point (columns of px./ps sum to 1, so rows of rp do)
114 |     lp(ii)=log(ps)+mx; % undo the normalization to get the log of the summed weighted densities
115 | 
116 |     for il=2:nl % remaining chunks all contain exactly nb data points
117 |         ix=jx+1; % first data point of this chunk
118 |         jx=jx+nb;                    % increment upper limit
119 |         ii=ix:jx;
120 |         kk=repmat(ii,k,1);
121 |         py=reshape(sum((y(kk(:),:)-m(im,:)).^2.*vi(im,:),2),k,nb)+lvm(:,wnb);
122 |         mx=max(py,[],1);                % find normalizing factor for each data point to prevent underflow when using exp()
123 |         px=exp(py-mx(wk,:));            % find normalized probability of each mixture for each datapoint
124 |         ps=sum(px,1);                   % total normalized likelihood of each data point
125 |         rp(ii,:)=(px./ps(wk,:))';                % relative mixture probabilities for each data point (columns of px./ps sum to 1, so rows of rp do)
126 |         lp(ii)=log(ps)+mx;
127 |     end
128 | else
129 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
130 |     % Full Covariance matrices  %
131 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
132 |     pl=q*(q+1)/2; % number of elements in the lower triangle of a q*q matrix
133 |     lix=1:q^2; % linear index of every matrix element
134 |     cix=repmat(1:q,q,1); % column number of every matrix element
135 |     rix=cix'; % row number of every matrix element
136 |     lix(cix>rix)=[];                                        % index of lower triangular elements
137 |     lixi=zeros(q,q);
138 |     lixi(lix)=1:pl; % number the lower triangular elements
139 |     lixi=lixi'; % copy the numbering into the upper triangle
140 |     lixi(lix)=1:pl;                                        % reverse index to build full matrices
141 |     v=reshape(v,q^2,k); % one column of q^2 covariance elements per mixture
142 |     v=v(lix,:)';                                            % lower triangular in rows
143 | 
144 |     % If data size is large then do calculations in chunks
145 | 
146 |     nb=min(n,max(1,floor(memsize/(24*q*k))));    % chunk size for testing data points
147 |     nl=ceil(n/nb);                  % number of chunks
148 |     jx0=n-(nl-1)*nb;                % size of first chunk
149 |     wnb=ones(1,nb); % index vector used to replicate a column nb times (full chunk)
150 |     wnj=ones(1,jx0); % index vector used to replicate a column jx0 times (first chunk)
151 | 
152 |     vi=zeros(q*k,q);                    % stack of k matrices, each -0.5*inv(v) of size q*q
153 |     vim=zeros(q*k,1);                   % stack of k vectors of the form -0.5*inv(v)*m
154 |     mtk=vim;                             % stack of k vectors of the form m
155 |     lvm=zeros(k,1); % log of external scale factor for each mixture
156 |     wpk=repmat((1:q)',k,1); % row indices that stack k copies of each data vector
157 | 
158 |     for ik=1:k
159 | 
160 |         % these lines added for debugging only
161 |         %             vk=reshape(v(ik,lixi),q,q);
162 |         %             condk(ik)=cond(vk);
163 |         %%%%%%%%%%%%%%%%%%%%
164 |         [uvk,dvk]=eig(reshape(v(ik,lixi),q,q));      % convert lower triangular to full and find eigenvalues
165 |         dvk=diag(dvk); % eigenvalues as a column vector
166 |         vik=-0.5*uvk*diag(dvk.^(-1))*uvk';   % calculate -0.5 times the inverse covariance from the eigendecomposition
167 |         vi((ik-1)*q+(1:q),:)=vik;           % vi contains all the scaled mixture inverses stacked on top of each other
168 |         vim((ik-1)*q+(1:q))=vik*m(ik,:)';   % vim contains vi*m for all mixtures stacked on top of each other
169 |         mtk((ik-1)*q+(1:q))=m(ik,:)';       % mtk contains all mixture means stacked on top of each other
170 |         lvm(ik)=log(w(ik))-0.5*sum(log(dvk));       % lvm contains log(weight)-0.5*log(det(v)) for each mixture (excluding -0.5*q*log(2pi) term)
171 |     end
172 |     %
173 |     %         % first do partial chunk
174 |     %
175 |     jx=jx0; % index of the last data point processed so far
176 |     ii=1:jx; % data points in this chunk
177 |     xii=y(ii,:).'; % data points of this chunk as columns
178 |     py=reshape(sum(reshape((vi*xii-vim(:,wnj)).*(xii(wpk,:)-mtk(:,wnj)),q,jx*k),1),k,jx)+lvm(:,wnj); % log weighted density (rows=mixtures, columns=data points)
179 |     mx=max(py,[],1);                % find normalizing factor for each data point to prevent underflow when using exp()
180 |     px=exp(py-mx(wk,:));  % find normalized probability of each mixture for each datapoint
181 |     ps=sum(px,1);                   % total normalized likelihood of each data point
182 |     rp(ii,:)=(px./ps(wk,:))';                % relative mixture probabilities for each data point (columns of px./ps sum to 1, so rows of rp do)
183 |     lp(ii)=log(ps)+mx; % undo the normalization to get the log of the summed weighted densities
184 | 
185 |     for il=2:nl % remaining chunks all contain exactly nb data points
186 |         ix=jx+1; % first data point of this chunk
187 |         jx=jx+nb;        % increment upper limit
188 |         ii=ix:jx;
189 |         xii=y(ii,:).';
190 |         py=reshape(sum(reshape((vi*xii-vim(:,wnb)).*(xii(wpk,:)-mtk(:,wnb)),q,nb*k),1),k,nb)+lvm(:,wnb);
191 |         mx=max(py,[],1);                % find normalizing factor for each data point to prevent underflow when using exp()
192 |         px=exp(py-mx(wk,:));  % find normalized probability of each mixture for each datapoint
193 |         ps=sum(px,1);                   % total normalized likelihood of each data point
194 |         rp(ii,:)=(px./ps(wk,:))';                % relative mixture probabilities for each data point (columns of px./ps sum to 1, so rows of rp do)
195 |         lp(ii)=log(ps)+mx;
196 |     end
197 | end
198 | lp=lp-0.5*q*log(2*pi); % add the -0.5*q*log(2pi) term that was left out of lvm
199 | if nargout >2
200 |     [kp,kh]=max(rp,[],2); % kp = largest relative probability in each row, kh = index of that mixture
201 | end


--------------------------------------------------------------------------------
/rVAD2.0/voicebox.m:
--------------------------------------------------------------------------------
  1 | function y=voicebox(f,v)
  2 | %VOICEBOX  set global parameters for Voicebox functions Y=(FIELD,VAL)
  3 | %
  4 | %  Inputs:  F   is a field name
  5 | %           V   is a new value for the field (F must be an existing field name)
  6 | %
  7 | % Outputs:  Y   is set equal to the structure of parameters if the f and v inputs
  8 | %               are both present or both absent. If only input f is specified, then
  9 | %               y is set to the value of the corresponding field or [] if it doesn't
 10 | %               exist. With no inputs and no outputs, all fields are printed instead.
 11 | %
 12 | % This routine contains default values for constants that are used by
 13 | % other functions in the VOICEBOX toolbox. Values in the first section below,
 14 | % entitled "System-dependent directory paths" should be set as follows:
 15 | %    PP.dir_temp     directory for storing temporary files
 16 | %    PP.dir_data     default directory to prepend to speech data file names
 17 | %                    when the "d" option is specified in READWAV etc.
 18 | %    PP.shorten      location of SHORTEN executable. SHORTEN is a proprietary file compression
 19 | %                    algorithm that is used for some SPHERE-format files. READSPH will try to
 20 | %                    call an external decoder if it is asked to read such a compressed file.
 21 | %    PP.flac         location of FLAC executable
 22 | %    PP.sfsbin       location of Speech Filing System binaries. If the "c" option is given to
 23 | %                    READSFS, it will try to create a requested item if it is not present in
 24 | %                    the SFS file. This parameter tells it where to find the SFS executables.
 25 | %    PP.sfssuffix    suffix for Speech Filing System binaries. READSFS uses this parameter
 26 | %                    to create the name of an SFS executable (see PP.sfsbin above).
 27 | %    PP.memsize      maximum amount of temporary memory to use (bytes)
 28 | % Other values defined in this routine are the defaults for specific algorithm constants.
 29 | % If you want to change these, please refer to the individual routines for a fuller description.
 30 | 
 31 | % Bugs/Suggestions
 32 | %    (1)  Could allow a * at the end of F to act as a wildcard and return/print a part structure
 33 | 
 34 | %      Copyright (C) Mike Brookes 2003
 35 | %      Version: $Id: voicebox.m 713 2011-10-16 14:45:43Z dmb $
 36 | %
 37 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 38 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 39 | %
 40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 41 | %   This program is free software; you can redistribute it and/or modify
 42 | %   it under the terms of the GNU General Public License as published by
 43 | %   the Free Software Foundation; either version 2 of the License, or
 44 | %   (at your option) any later version.
 45 | %
 46 | %   This program is distributed in the hope that it will be useful,
 47 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 48 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 49 | %   GNU General Public License for more details.
 50 | %
 51 | %   You can obtain a copy of the GNU General Public License from
 52 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 53 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 55 | 
 56 | persistent PP                   % parameter structure: filled in on the first call and kept between calls
 57 | if isempty(PP)
 58 | 
 59 |     % System-dependent directory paths and constants
 60 | 
 61 |     PP.dir_temp='F:\TEMP';                      % directory for storing temporary files
 62 |     PP.dir_data='E:\dmb\data\speech';           % default directory to prepend to speech data file names
 63 |     PP.shorten='C:\bin\shorten.exe';            % location of shorten executable
 64 |     PP.flac='C:\bin\flac.exe';                  % location of flac executable
 65 |     PP.sfsbin='F:\Program Files\SFS\Program';   % location of Speech Filing System binaries
 66 |     PP.sfssuffix='.exe';                        % suffix for Speech Filing System binaries
 67 |     PP.memsize=50e6;                            % Maximum amount of temporary memory to use (Bytes)
 68 | 
 69 |     % DYPSA glottal closure identifier
 70 | 
 71 |     PP.dy_cpfrac=0.3;           % presumed closed phase fraction of larynx cycle
 72 |     PP.dy_cproj=0.2;            % cost of projected candidate
 73 |     PP.dy_cspurt=-0.45;         % cost of a talkspurt
 74 |     PP.dy_dopsp=1;              % Use phase slope projection (1) or not (0)?
 75 |     PP.dy_ewdly=0.0008;         % window delay for energy cost function term [~ energy peak delay from closure] (sec)
 76 |     PP.dy_ewlen=0.003;          % window length for energy cost function term (sec)
 77 |     PP.dy_ewtaper=0.001;        % taper length for energy cost function window (sec)
 78 |     PP.dy_fwlen=0.00045;        % window length used to smooth group delay (sec)
 79 |     PP.dy_fxmax=500;            % max larynx frequency (Hz)
 80 |     PP.dy_fxmin=50;             % min larynx frequency (Hz)
 81 |     PP.dy_fxminf=60;            % min larynx frequency (Hz) [used for Frobenius norm only]
 82 |     PP.dy_gwlen=0.0030;         % group delay evaluation window length (sec)
 83 |     PP.dy_lpcdur=0.020;         % lpc analysis frame length (sec)
 84 |     PP.dy_lpcn=2;               % lpc additional poles
 85 |     PP.dy_lpcnf=0.001;          % lpc poles per Hz (1/Hz)
 86 |     PP.dy_lpcstep=0.010;        % lpc analysis step (sec)
 87 |     PP.dy_nbest=5;              % Number of NBest paths to keep
 88 |     PP.dy_preemph=50;           % pre-emphasis filter frequency (Hz) (to avoid preemphasis, make this very large)
 89 |     PP.dy_spitch=0.2;           % scale factor for pitch deviation cost
 90 |     PP.dy_wener=0.3;            % DP energy weighting
 91 |     PP.dy_wpitch=0.5;           % DP pitch weighting
 92 |     PP.dy_wslope=0.1;           % DP group delay slope weighting
 93 |     PP.dy_wxcorr=0.8;           % DP cross correlation weighting
 94 |     PP.dy_xwlen=0.01;           % cross-correlation length for waveform similarity (sec)
 95 | 
 96 |     % RAPT pitch tracker
 97 | 
 98 |     PP.rapt_f0min=50;           % Min F0 (Hz)
 99 |     PP.rapt_f0max=500;          % Max F0 (Hz)
100 |     PP.rapt_tframe=0.01;        % frame size (s)
101 |     PP.rapt_tlpw=0.005;         % low pass filter window size (s)
102 |     PP.rapt_tcorw=0.0075;       % correlation window size (s)
103 |     PP.rapt_candtr=0.3;         % minimum peak in NCCF
104 |     PP.rapt_lagwt=0.3;          % linear lag taper factor
105 |     PP.rapt_freqwt=0.02;        % cost factor for F0 change
106 |     PP.rapt_vtranc=0.005;       % fixed voice-state transition cost
107 |     PP.rapt_vtrac=0.5;          % delta amplitude modulated transition cost
108 |     PP.rapt_vtrsc=0.5;          % delta spectrum modulated transition cost
109 |     PP.rapt_vobias=0.0;         % bias to encourage voiced hypotheses
110 |     PP.rapt_doublec=0.35;       % cost of exact doubling or halving
111 |     PP.rapt_absnoise=0;         % absolute rms noise level
112 |     PP.rapt_relnoise=2;         % rms noise level relative to noise floor
113 |     PP.rapt_signoise=0.001;     % ratio of peak signal rms to noise floor (0.001 = 60dB)
114 |     PP.rapt_ncands=20;          % max hypotheses at each frame
115 |     PP.rapt_trms=0.03;                      % window length for rms measurement
116 |     PP.rapt_dtrms=0.02;                     % window spacing for rms measurement
117 |     PP.rapt_preemph=-7000;                  % s-plane position of preemphasis zero
118 |     PP.rapt_nfullag=7;                      % number of full lags to try (must be odd)
119 | 
120 |     % now check some of the key values for validity
121 | 
122 |     if exist(PP.dir_temp,'dir')~=7        % check that temp directory exists (exist returns 7 for a folder)
123 |         PP.dir_temp = winenvar('temp');     % else use windows temp directory
124 |     end
125 | 
126 |     fnp=fileparts(mfilename('fullpath'));   % directory that contains this file
127 |     if exist(PP.shorten,'file')~=2        % check that shorten executable exists (exist returns 2 for a file)
128 |         PP.shorten=fullfile(fnp,'shorten.exe'); % next try local directory
129 |         if exist(PP.shorten,'file')~=2        % check if it exists in local directory
130 |             PP.shorten='shorten.exe'; % finally assume it is on the search path
131 |         end
132 |     end
133 | 
134 |     if exist(PP.flac,'file')~=2        % check that flac executable exists (exist returns 2 for a file)
135 |         PP.flac=fullfile(fnp,'flac.exe'); % next try local directory
136 |         if exist(PP.flac,'file')~=2        % check if it exists in local directory
137 |             PP.flac='flac.exe'; % finally assume it is on the search path (must set PP.flac, not PP.shorten)
138 |         end
139 |     end
140 | 
141 | end
142 | if nargin==0
143 |     if nargout==0
144 |         % list all fields
145 |         nn=sort(fieldnames(PP));
146 |         cnn=char(nn);               % padded character matrix so that the field names line up
147 |         fprintf('%d Voicebox parameters:\n',length(nn));
148 | 
149 |         for i=1:length(nn);
150 |             if ischar(PP.(nn{i}))
151 |                 fmt='  %s = %s\n';
152 |             else
153 |                 fmt='  %s = %g\n';
154 |             end
155 |             fprintf(fmt,cnn(i,:),PP.(nn{i}));
156 |         end
157 |     else
158 |         y=PP;                       % return the complete parameter structure
159 |     end
160 | elseif nargin==1
161 |     if isfield(PP,f)
162 |         y=PP.(f);                   % return the value of the requested field
163 |     else
164 |         y=[];                       % unknown field: return empty rather than raising an error
165 |     end
166 | else
167 |     if isfield(PP,f)
168 |         PP.(f)=v;                   % update the stored value; it persists for later calls
169 |         y=PP;
170 |     else
171 |         error('''%s'' is not a valid voicebox field name',f);
172 |     end
173 | end


--------------------------------------------------------------------------------
/rVAD2.0/specsub.m:
--------------------------------------------------------------------------------
  1 | function [ss,gg,tt,ff,zo]=specsub(si,fsz,pp)
  2 | %SPECSUB performs speech enhancement using spectral subtraction [SS,ZO]=(S,FSZ,P)
  3 | %
  4 | % Usage: (1) y=specsub(x,fs);   % enhance the speech using default parameters
  5 | %
  6 | % Inputs:
  7 | %   si      input speech signal
  8 | %   fsz     sample frequency in Hz
  9 | %           Alternatively, the input state from a previous call (see below)
 10 | %   pp      algorithm parameters [optional]
 11 | %
 12 | % Outputs:
 13 | %   ss        output enhanced speech
 14 | %   gg(t,f,i) selected time-frequency values (see pp.tf below)
 15 | %   tt        centre of frames (in seconds)
 16 | %   ff        centre of frequency bins (in Hz)
 17 | %   zo        output state (or the 2nd argument if gg,tt,ff are omitted)
 18 | %
 19 | % The algorithm operation is controlled by a small number of parameters:
 20 | %
 21 | %        pp.of          % overlap factor = (fft length)/(frame increment) [2]
 22 | %        pp.ti          % desired frame increment [0.016 seconds]
 23 | %        pp.ri          % set to 1 to round ti to the nearest power of 2 samples [0]
 24 | %        pp.g           % subtraction domain: 1=magnitude, 2=power [1]
 25 | %        pp.e           % gain exponent [1]
 26 | %        pp.am          % max oversubtraction factor [3]
 27 | %        pp.b           % max noise attenuation in power domain [0.01]
 28 | %        pp.al          % SNR for oversubtraction=am (set this to Inf for fixed a) [-5 dB]
 29 | %        pp.ah          % SNR for oversubtraction=1 [20 dB]
 30 | %        pp.ne          % noise estimation: 0=min statistics, 1=MMSE [0]
 31 | %        pp.bt          % threshold for binary gain or -1 for continuous gain [-1]
 32 | %        pp.mx          % input mixture gain [0]
 33 | %        pp.gh          % maximum gain for noise floor [1]
 34 | %        pp.rf          % round output signal to an exact number of frames [0]
 35 | %        pp.tf          % selects time-frequency planes to output in the gg() variable ['g']
 36 | %                           'i' = input power spectrum
 37 | %                           'I' = input complex spectrum
 38 | %                           'n' = noise power spectrum
 39 | %                           'g' = gain
 40 | %                           'o' = output power spectrum
 41 | %                           'O' = output complex spectrum
 42 | %
 43 | % Following [1], the magnitude-domain gain in each time-frequency bin is given by
 44 | %                          gain=mx+(1-mx)*max((1-(a*N/X)^(g/2))^(e/g),min(gh,(b*N/X)^(e/2)))
 45 | % where N and X are the powers of the noise and noisy speech respectively.
 46 | % The oversubtraction factor varies linearly between a=am for a frame SNR of al down to
 47 | % a=1 for a frame SNR of ah. To obtain a fixed value of a for all values of SNR, set al=Inf.
 48 | % Common exponent combinations are:
 49 | %                      g=1  e=1    Magnitude Domain spectral subtraction
 50 | %                      g=2  e=1    Power Domain spectral subtraction
 51 | %                      g=2  e=2    Wiener filtering
 52 | % Many authors use the parameters alpha=a^(g/2), beta=b^(g/2) and gamma2=e/g instead of a, b and e
 53 | % but this increases interdependence amongst the parameters.
 54 | % If bt>=0 then the max(...) expression above is thresholded to become 0 or 1.
 55 | %
 56 | % In addition it is possible to specify parameters for the noise estimation algorithm
 57 | % which implements reference [2] or [3] according to the setting of pp.ne
 58 | %
 59 | % Minimum statistics noise estimate [2]: pp.ne=0 
 60 | %        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
 61 | %        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
 62 | %        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
 63 | %        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
 64 | %        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
 65 | %        pp.qeqmin    % (23): minimum value of Qeq [2]
 66 | %        pp.qeqmax    % max value of Qeq per frame [14]
 67 | %        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
 68 | %        pp.td        % time to take minimum over [1.536 seconds]
 69 | %        pp.nu        % number of subwindows to use [3]
 70 | %        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
 71 | %        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
 72 | %
 73 | % MMSE noise estimate [3]: pp.ne=1 
 74 | %        pp.tax      % smoothing time constant for noise power estimate [0.0717 seconds](8)
 75 | %        pp.tap      % smoothing time constant for smoothed speech prob [0.152 seconds](23)
 76 | %        pp.psthr    % threshold for smoothed speech probability [0.99] (24)
 77 | %        pp.pnsaf    % noise probability safety value [0.01] (24)
 78 | %        pp.pspri    % prior speech probability [0.5] (18)
 79 | %        pp.asnr     % active SNR in dB [15] (18)
 80 | %        pp.psini    % initial speech probability [0.5] (23)
 81 | %        pp.tavini   % assumed speech absent time at start [0.064 seconds]
 82 | %
 83 | % If convenient, you can call specsub in chunks of arbitrary size. Thus the following are equivalent:
 84 | %
 85 | %                   (a) y=specsub(s,fs);
 86 | %
 87 | %                   (b) [y1,z]=specsub(s(1:1000),fs);
 88 | %                       [y2,z]=specsub(s(1001:2000),z);
 89 | %                       y3=specsub(s(2001:end),z);
 90 | %                       y=[y1; y2; y3];
 91 | %
 92 | % If the number of output arguments is either 2 or 5, the last partial frame of samples will
 93 | % be retained for overlap adding with the output from the next call to specsub().
 94 | %
 95 | % See also ssubmmse() for an alternative gain function
 96 | %
 97 | % Refs:
 98 | %    [1] M. Berouti, R. Schwartz and J. Makhoul
 99 | %        Enhancement of speech corrupted by acoustic noise
100 | %        Proc IEEE ICASSP, 1979, 4, 208-211
101 | %    [2] Rainer Martin.
102 | %        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
103 | %        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.
104 | %    [3] Gerkmann, T. & Hendriks, R. C.
105 | %        Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay
106 | %        IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393
107 | 
108 | %      Copyright (C) Mike Brookes 2004
109 | %      Version: $Id: specsub.m 1720 2012-03-31 17:17:31Z dmb $
110 | %
111 | %   VOICEBOX is a MATLAB toolbox for speech processing.
112 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
113 | %
114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
115 | %   This program is free software; you can redistribute it and/or modify
116 | %   it under the terms of the GNU General Public License as published by
117 | %   the Free Software Foundation; either version 2 of the License, or
118 | %   (at your option) any later version.
119 | %
120 | %   This program is distributed in the hope that it will be useful,
121 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
122 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
123 | %   GNU General Public License for more details.
124 | %
125 | %   You can obtain a copy of the GNU General Public License from
126 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
127 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
128 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
129 | if numel(si)>length(si)
130 |     error('Input speech signal must be a vector not a matrix');
131 | end
132 | if isstruct(fsz)
133 |     fs=fsz.fs;
134 |     qq=fsz.qq;
135 |     qp=fsz.qp;
136 |     ze=fsz.ze;
137 |     s=zeros(length(fsz.si)+length(si(:)),1); % allocate space for speech
138 |     s(1:length(fsz.si))=fsz.si;
139 |     s(length(fsz.si)+1:end)=si(:);
140 | else
141 |     fs=fsz;     % sample frequency
142 |     s=si(:);
143 |     % default algorithm constants
144 | 
145 |     qq.of=2;   % overlap factor = (fft length)/(frame increment)
146 |     qq.ti=16e-3;   % desired frame increment (16 ms)
147 |     qq.ri=0;       % round ni to the nearest power of 2
148 |     qq.g=1;        % subtraction domain: 1=magnitude, 2=power
149 |     qq.e=1;        % gain exponent
150 |     qq.am=3;      % max oversubtraction factor
151 |     qq.b=0.01;      % noise floor
152 |     qq.al=-5;       % SNR for maximum a (set to Inf for fixed a)
153 |     qq.ah=20;       % SNR for minimum a
154 |     qq.bt=-1;       % suppress binary masking
155 |     qq.ne=0;        % noise estimation: 0=min statistics, 1=MMSE [0]
156 |     qq.mx=0;        % no input mixing
157 |     qq.gh=1;        % maximum gain
158 |     qq.tf='g';      % output the gain time-frequency plane by default
159 |     qq.rf=0;
160 |     if nargin>=3 && ~isempty(pp)
161 |         qp=pp;      % save for estnoisem call
162 |         qqn=fieldnames(qq);
163 |         for i=1:length(qqn)
164 |             if isfield(pp,qqn{i})
165 |                 qq.(qqn{i})=pp.(qqn{i});
166 |             end
167 |         end
168 |     else
169 |         qp=struct;  % make an empty structure
170 |     end
171 | end
172 | % derived algorithm constants
173 | if qq.ri
174 |     ni=pow2(nextpow2(qq.ti*fs*sqrt(0.5)));
175 | else
176 |     ni=round(qq.ti*fs);    % frame increment in samples
177 | end
178 | tinc=ni/fs;          % true frame increment time
179 | tf=qq.tf;
180 | rf=qq.rf || nargout==2 || nargout==5;            % round down to an exact number of frames
181 | ne=qq.ne;           % noise estimation: 0=min statistics, 1=MMSE [0]
182 | 
183 | % calculate power spectrum in frames
184 | 
185 | no=round(qq.of);                                   % integer overlap factor
186 | nf=ni*no;           % fft length
187 | w=sqrt(hamming(nf+1))'; w(end)=[]; % for now always use sqrt hamming window
188 | w=w/sqrt(sum(w(1:ni:nf).^2));       % normalize to give overall gain of 1
189 | if rf>0
190 |     rfm='';                         % truncated input to an exact number of frames
191 | else
192 |     rfm='r';
193 | end
194 | [y,tt]=enframe(s,w,ni,rfm);
195 | tt=tt/fs;                           % frame times
196 | yf=rfft(y,nf,2);
197 | yp=yf.*conj(yf);        % power spectrum of input speech
198 | [nr,nf2]=size(yp);              % number of frames
199 | ff=(0:nf2-1)*fs/nf;
200 | if isstruct(fsz)
201 |     if ne>0
202 |         [dp,ze]=estnoiseg(yp,ze);       % estimate the noise using MMSE
203 |     else
204 |         [dp,ze]=estnoisem(yp,ze);       % estimate the noise using minimum statistics
205 |     end
206 |     ssv=fsz.ssv;
207 | else
208 |     if ne>0
209 |         [dp,ze]=estnoiseg(yp,tinc,qp);	% estimate the noise using MMSE
210 |     else
211 |         [dp,ze]=estnoisem(yp,tinc,qp);	% estimate the noise using minimum statistics
212 |     end
213 |     ssv=zeros(ni*(no-1),1);             % dummy saved overlap
214 | end
215 | if ~nr                                  % no data frames
216 |     ss=[];
217 |     gg=[];
218 | else
219 |     mz=yp==0;   %  mask for zero power time-frequency bins (unlikely)
220 |     if qq.al<Inf
221 |         ypf=sum(yp,2);
222 |         dpf=sum(dp,2);
223 |         mzf=dpf==0;     % zero noise frames = very high SNR
224 |         af=1+(qq.am-1)*(min(max(10*log10(ypf./(dpf+mzf)),qq.al),qq.ah)-qq.ah)/(qq.al-qq.ah);
225 |         af(mzf)=1;      % fix the zero noise frames
226 |     else
227 |         af=repmat(qq.am,nr,1);
228 |     end
229 |     switch qq.g
230 |         case 1   % magnitude domain subtraction
231 |             v=sqrt(dp./(yp+mz));
232 |             af=sqrt(af);
233 |             bf=sqrt(qq.b);
234 |         case 2   % power domain subtraction
235 |             v=dp./(yp+mz);
236 |             bf=qq.b;
237 |         otherwise % arbitrary subtraction domain
238 |             v=(dp./(yp+mz)).^(0.5*qq.g);
239 |             af=af.^(0.5*qq.g);
240 |             bf=qq.b^(0.5*qq.g);
241 |     end
242 |     af =repmat(af,1,nf2);       % replicate frame oversubtraction factors for each frequency
243 |     mf=v>=(af+bf).^(-1);        % mask for noise floor limiting
244 |     g=zeros(size(v));           % reserve space for gain matrix
245 |     eg=qq.e/qq.g;               % gain exponent relative to subtraction domain
246 |     gh=qq.gh;
247 |     switch eg
248 |         case 1                          % Normal case
249 |             g(mf)=min(bf*v(mf),gh);      % never give a gain > 1
250 |             g(~mf)=1-af(~mf).*v(~mf);
251 |         case 0.5
252 |             g(mf)=min(sqrt(bf*v(mf)),gh);
253 |             g(~mf)=sqrt(1-af(~mf).*v(~mf));
254 |         otherwise
255 |             g(mf)=min((bf*v(mf)).^eg,gh);
256 |             g(~mf)=(1-af(~mf).*v(~mf)).^eg;
257 |     end
258 |     if qq.bt>=0
259 |         g=g>qq.bt;
260 |     end
261 |     g=qq.mx+(1-qq.mx)*g;   % mix in some of the input
262 |     se=(irfft((yf.*g).',nf).').*repmat(w,nr,1);   % inverse dft and apply output window
263 |     ss=zeros(ni*(nr+no-1),no);                      % space for overlapped output speech
264 |     ss(1:ni*(no-1),end)=ssv;
265 |     for i=1:no
266 |         nm=nf*(1+floor((nr-i)/no));  % number of samples in this set
267 |         ss(1+(i-1)*ni:nm+(i-1)*ni,i)=reshape(se(i:no:nr,:)',nm,1);
268 |     end
269 |     ss=sum(ss,2);
270 |     if nargout>2 && ~isempty(tf)
271 |         gg=zeros(nr,nf2,length(tf));  % make space
272 |         for i=1:length(tf)
273 |             switch tf(i)
274 |                 case 'i'            % 'i' = input power spectrum
275 |                     gg(:,:,i)=yp;
276 |                 case 'I'            % 'i' = input power spectrum
277 |                     gg(:,:,i)=yf;
278 |                 case 'n'            % 'n' = noise power spectrum
279 |                     gg(:,:,i)=dp;
280 |                 case 'g'            % 'g' = gain
281 |                     gg(:,:,i)=g;
282 |                 case 'o'            % 'o' = output power spectrum
283 |                     gg(:,:,i)=yp.*g.^2;
284 |                 case 'O'            % 'o' = output power spectrum
285 |                     gg(:,:,i)=yf.*g;
286 |             end
287 |         end
288 |     end
289 | end
290 | if nargout==2 || nargout==5
291 |     if nr
292 |         zo.ssv=ss(end-ni*(no-1)+1:end);    % save the output tail for next time
293 |         ss(end-ni*(no-1)+1:end)=[];
294 |     else
295 |         zo.ssv=ssv;  %
296 |     end
297 |     zo.si=s(length(ss)+1:end);      % save the tail end of the input speech signal
298 |     zo.fs=fs;                       % save sample frequency
299 |     zo.qq=qq;                       % save local parameters
300 |     zo.qp=qp;                       % save estnoisem parameters
301 |     zo.ze=ze;                       % save state of noise estimation
302 |     if nargout==2
303 |         gg=zo;                      % 2nd of two arguments is zo
304 |     end
305 | elseif rf==0
306 |     ss=ss(1:length(s));             % trim to the correct length if not an exact number of frames
307 | end
308 | if ~nargout && nr>0
309 |     ffax=ff/1000;    ax=zeros(4,1);
310 |     ax(1)=subplot(223);
311 |     imagesc(tt,ffax,20*log10(g)');
312 |     colorbar;
313 |     axis('xy');
314 |     if qq.al==Inf
315 |         title(sprintf('Filter Gain (dB): a=%.2g, b=%.3g',qq.am,qq.b));
316 |     else
317 |         title(sprintf('Filter Gain (dB): a=%.2g (%.0f to %.0fdB), b=%.3g',qq.am,qq.al,qq.ah,qq.b));
318 |     end
319 |     xlabel('Time (s)');
320 |     ylabel('Frequency (kHz)');
321 | 
322 |     ax(2)=subplot(222);
323 |     imagesc(tt,ffax,10*log10(yp)');
324 |     colorbar;
325 |     axis('xy');
326 |     title('Noisy Speech (dB)');
327 |     xlabel('Time (s)');
328 |     ylabel('Frequency (kHz)');
329 | 
330 |     ax(3)=subplot(224);
331 |     imagesc(tt,ffax,10*log10(yp.*g.^2)');
332 |     colorbar;
333 |     axis('xy');
334 |     title(sprintf('Enhanced Speech (dB): g=%.2g, e=%.3g',qq.g,qq.e));
335 |     xlabel('Time (s)');
336 |     ylabel('Frequency (kHz)');
337 | 
338 |     ax(4)=subplot(221);
339 |     imagesc(tt,ffax,10*log10(dp)');
340 |     colorbar;
341 |     axis('xy');
342 |     title('Noise Estimate (dB)');
343 |     xlabel('Time (s)');
344 |     ylabel('Frequency (kHz)');
345 |     linkaxes(ax);
346 | end


--------------------------------------------------------------------------------
/rVADfast_py_2.0/speechproc.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import numpy
  3 | import sys
  4 | import os
  5 | import math
  6 | import struct
  7 | import scipy.io.wavfile as wav
  8 | from scipy.fftpack import dct
  9 | from scipy.signal import lfilter
 10 | from copy import deepcopy
 11 | import code
 12 | 
 13 | # Refs:
 14 | #  [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 
 15 | #  [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection." 
 16 | #  IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
 17 | 
 18 | # Version: 2.0 
 19 | # 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan
 20 | 
 21 | def speech_wave(fileName_):
 22 |     
 23 |      (fs,sig) = wav.read(fileName_)
 24 |      if sig.dtype == 'int16':
 25 |         nb = 16 # -> 16-bit wav files
 26 |      elif sig.dtype == 'int32':
 27 |         nb = 32 # -> 32-bit wav files
 28 |      max_nb = float(2 ** (nb - 1))
 29 |      sig = sig / (max_nb + 1.0)  
 30 |      return fs, sig
 31 |  
 32 | def enframe(speech, fs, winlen, ovrlen):
 33 |     
 34 |      N, flth, foVr = len(speech), int(numpy.fix(fs*winlen)),  int(numpy.fix(fs*ovrlen))
 35 |      
 36 |      if len(speech) < flth:
 37 |         print("speech file length shorter than window length")
 38 |         exit()
 39 |      
 40 | 
 41 |      frames = int(numpy.ceil( (N - flth + foVr)/foVr))
 42 |      slen = (frames-1)*foVr + flth
 43 | 
 44 | 
 45 |      if len(speech) < slen:
 46 |         signal = numpy.concatenate((speech, numpy.zeros((slen - N))))
 47 | 
 48 |      else:
 49 |         signal = deepcopy(speech)
 50 |   
 51 | 
 52 |      idx = numpy.tile(numpy.arange(0,flth),(frames,1)) + numpy.tile(numpy.arange(0,(frames)*foVr,foVr),(flth,1)).T
 53 |      idx = numpy.array(idx,dtype=numpy.int64)
 54 |     
 55 |  
 56 |      return signal[idx]
 57 | 
 58 | 
 59 | def sflux(data, fs, winlen, ovrlen, nftt):
 60 |     
 61 |     eps=numpy.finfo(float).eps
 62 | 
 63 |     xf=enframe(data, fs, winlen, ovrlen) #framing
 64 |     w = numpy.matrix(numpy.hamming(int(fs*winlen)) )
 65 |     w = numpy.tile(w,(numpy.size(xf, axis=0), 1))
 66 | 
 67 |     xf = numpy.multiply (xf, w) #apply window
 68 |     #fft
 69 |     ak=numpy.abs(numpy.fft.fft(xf,nftt))
 70 |     idx = range(0,int(nftt/2) +1)
 71 |     ak=ak[:,idx]
 72 |     Num=numpy.exp( float(1/len(idx)) * numpy.sum(numpy.log(ak+eps), axis=1) ) 
 73 |     Den=float(1/len(idx)) * numpy.sum(ak, axis=1)
 74 |     
 75 |     ft=(Num+eps)/(Den+eps)
 76 | 
 77 | 
 78 |     flen, fsh10 = int(numpy.fix(fs*winlen)),  int(numpy.fix(fs*ovrlen))
 79 |     nfr10=int(numpy.floor((len(data)-(flen-fsh10))/fsh10))
 80 | 
 81 |     #syn frames as per nfr10
 82 |     if nfr10 < len(ft):
 83 |        ft=ft[range(nfr10)]
 84 |     else:
 85 |        ft = numpy.concatenate((ft, numpy.repeat(ft[:1], nfr10 -len(ft), axis=0) ))
 86 | 
 87 | 
 88 |     
 89 |     return ft, flen, fsh10, nfr10
 90 | 
 91 | 
 92 | def snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk):
 93 |     # Find high-energy segments that contain no pitch frames and report them as noise.
 94 |     ## ---*******- important *******
 95 |     # Index [0] of every array padded with 'inf' below is a placeholder that is never used,
 96 |     # so frames and samples are addressed 1-based throughout this function.
 97 |     Dexpl=18;     Dexpr=18 ;     segThres = 0.25
 98 |     # Dexpl/Dexpr: frames of left/right padding used when smoothing D; segThres: fraction of the block maximum
 99 |     fdata_=deepcopy(fdata) ;   pv01_=deepcopy(pv01) ;  pvblk_=deepcopy(pvblk)
100 |     # NOTE(review): inserting 'inf' presumably needs float-typed inputs -- confirm against callers
101 |     fdata_=numpy.insert(fdata_,0,'inf')
102 |     pv01_=numpy.insert(pv01_,0,'inf')
103 |     pvblk_=numpy.insert(pvblk_,0,'inf')  # pvblk_ is not read again in this function
104 | 
105 | 
106 |     # energy estimation: e[i] = sum of squares of the flen samples of frame i, floored at ENERGYFLOOR
107 |     e=numpy.zeros(nfr10,  dtype='float64')
108 |     e=numpy.insert(e,0,'inf')
109 | 
110 |     for i in range(1, nfr10+1):
111 |         for j in range(1, flen+1):
112 |              e[i]=e[i]+numpy.square(fdata_[(i-1)*fsh10+j])
113 |     
114 |         if numpy.less_equal(e[i], ENERGYFLOOR):
115 |              e[i]=ENERGYFLOOR
116 |     # emin: per-frame floor energy, estimated block by block (NESEG frames per block)
117 |     emin=numpy.ones(nfr10)
118 |     emin=numpy.insert(emin,0,'inf')
119 |     NESEG = 200
120 | 
121 |     if numpy.less(nfr10, NESEG):
122 |         NESEG=nfr10
123 | 
124 |     for i in range(1, int(numpy.floor(nfr10/NESEG))+1):
125 |         eY=numpy.sort(e[range((i-1)*NESEG+1, (i*NESEG)+1)])
126 |         eY=numpy.insert(eY,0,'inf')
127 |         # sorted block energy at the 10% position; when NESEG < 10 this index is 0, i.e. the 'inf' placeholder
128 |         emin[range( (i-1)*NESEG+1, i*NESEG+1)]=eY[int(numpy.floor(NESEG*0.1))]
129 |         if numpy.not_equal(i, 1):
130 |              emin[range((i-1)*NESEG+1,i*NESEG+1)]=0.9*emin[(i-1)*NESEG]+0.1*emin[(i-1)*NESEG+1]
131 |     # leftover frames after the last full block; i still holds the last index of the loop above
132 |     if numpy.not_equal(i*NESEG, nfr10):
133 |         eY=numpy.sort(e[range((i-1)*NESEG+1, nfr10+1)])
134 |         eY=numpy.insert(eY,0,'inf')
135 |         # the sort above covers the last full block plus the leftover frames
136 |         emin[range(i*NESEG+1,nfr10+1)]=eY[int(numpy.floor((nfr10-(i-1)*NESEG)*0.1))]
137 |         emin[range(i*NESEG+1,nfr10+1)]=0.9*emin[i*NESEG]+0.1*emin[i*NESEG+1]
138 | 
139 |     # D[i] = sqrt(|e[i]-e[i-1]| * postsnr[i]) with postsnr[i] = max(log10(e[i]) - log10(emin[i]), 0)
140 |     D=numpy.zeros(nfr10)
141 |     D=numpy.insert(D,0,'inf')
142 | 
143 |     postsnr=numpy.zeros(nfr10)
144 |     postsnr=numpy.insert(postsnr,0,'inf')
145 | 
146 |     for i in range(2, nfr10+1):
147 |         postsnr[i] =numpy.log10(e[i])-numpy.log10(emin[i])
148 |         if numpy.less(postsnr[i],0):
149 |              postsnr[i]=0
150 |     
151 |         D[i]=numpy.sqrt(numpy.abs(e[i]-e[i-1])*postsnr[i])
152 |     D[1]=D[2]  # frame 1 has no predecessor, so it copies the value of frame 2
153 | 
154 | 
155 |     # Dexp: D[1..nfr10] with Dexpl copies of D[1] in front and Dexpr copies of D[nfr10] behind
156 |     tm1 = numpy.hstack((numpy.ones(Dexpl)*D[1], D[1:len(D)]))
157 |     Dexp = numpy.hstack((tm1, numpy.ones(Dexpr)*D[nfr10] ))
158 |     Dexp = numpy.insert(Dexp,0,'inf')
159 |   
160 |     Dsmth=numpy.zeros(nfr10, dtype='float64')
161 |     Dsmth=numpy.insert(Dsmth,0,'inf')
162 |   
163 |     Dsmth_max=deepcopy(Dsmth)
164 | 
165 |     # Dsmth[i]: sum of the Dexpl+Dexpr+1 consecutive Dexp values starting at index i
166 |     for i in range(1,nfr10+1):
167 |         Dsmth[i]=sum(Dexp[range(i, i+Dexpl+Dexpr+1)])
168 |     # Dsmth_max: per block, the maximum of e (the maximum of Dsmth is the commented-out alternative)
169 |     for i in range(1, int(numpy.floor(nfr10/NESEG))+1):
170 |         Dsmth_max[range((i-1)*NESEG+1, i*NESEG+1)]= numpy.amax(e[range((i-1)*NESEG+1, i*NESEG+1)]);  #numpy.amax(Dsmth[range((i-1)*NESEG+1, i*NESEG+1)])
171 | 
172 |     # leftover frames: same treatment, again relying on i from the loop above
173 |     if numpy.not_equal(i*NESEG, nfr10):
174 |         Dsmth_max[range(i*NESEG+1, nfr10+1)]=numpy.amax(e[range((i-1)*NESEG+1, nfr10+1)])     #numpy.amax(Dsmth[range((i-1)*NESEG+1, nfr10+1)])
175 |     # snre_vad[i] = 1 where the smoothed measure exceeds segThres times the block maximum
176 |     snre_vad = numpy.zeros(nfr10)
177 |     snre_vad=numpy.insert(snre_vad,0,'inf')
178 | 
179 |     for i in range(1, nfr10+1):
180 |         if numpy.greater(Dsmth[i], Dsmth_max[i]*segThres):
181 |              snre_vad[i]=1
182 | 
183 |     # walk the runs of snre_vad==1; a run without any pitch frame (pv01_ all 0) is recorded as noise
184 |     sign_vad = 0
185 |     noise_seg=numpy.zeros(int(numpy.floor(nfr10/1.6))) ;   noise_seg=numpy.insert(noise_seg,0,'inf')
186 |     # NOTE(review): the factor 1.6 presumably maps these frames onto a coarser frame grid -- confirm
187 |     noise_samp=numpy.zeros((nfr10,2))
188 |     n_noise_samp=-1
189 | 
190 |     for i in range(1, nfr10+1):
191 |         if (snre_vad[i] == 1) and (sign_vad == 0): #% start of a segment
192 |              sign_vad = 1
193 |              nstart=i
194 |         elif ((snre_vad[i] ==0) or (i==nfr10)) and (sign_vad == 1): # % end of a segment
195 |              sign_vad = 0
196 |              nstop=i-1  # also when i==nfr10, so the last frame is never part of a segment
197 |              if numpy.equal(sum(pv01_[range(nstart, nstop+1)]), 0):
198 |                   noise_seg[range(int(numpy.round(nstart/1.6)), int(numpy.floor(nstop/1.6))+1)] = 1
199 |                   n_noise_samp=n_noise_samp+1
200 |                   noise_samp[n_noise_samp,:]=numpy.array([(nstart-1)*fsh10+1, nstop*fsh10])
201 |     # keep only the rows that were filled in
202 |     noise_samp=noise_samp[:n_noise_samp+1,]
203 | 
204 |     # convert back to 0-based: shift the sample indices and drop the [0] placeholder
205 |     noise_samp=noise_samp-1
206 |     noise_seg=noise_seg[1:len(noise_seg)]
207 |  
208 |     return noise_samp, noise_seg, len(noise_samp)   
209 | 
210 | 
211 | 
212 | 
213 | def snre_vad(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres):
214 |     # Frame-level voice activity decision inside the blocks flagged by pvblk; returns a 0/1 array of length nfr10.
215 |     ## ---*******- important *******
216 |     # Index [0] of every array padded with 'inf' below is a placeholder that is never used (1-based indexing).
217 | 
218 |     Dexpl, Dexpr=18, 18
219 |     Dsmth=numpy.zeros(nfr10, dtype='float64'); Dsmth=numpy.insert(Dsmth,0,'inf')    
220 |     # work on padded copies so the caller's arrays are left untouched
221 |     fdata_=deepcopy(fdata)
222 |     pv01_=deepcopy(pv01)
223 |     pvblk_=deepcopy(pvblk)   
224 |     # NOTE(review): inserting 'inf' presumably needs float-typed inputs -- confirm against callers
225 |     fdata_=numpy.insert(fdata_,0,'inf')
226 |     pv01_=numpy.insert(pv01_,0,'inf')
227 |     pvblk_=numpy.insert(pvblk_,0,'inf')
228 | 
229 | 
230 |     # energy estimation: e[i] = sum of squares of the flen samples of frame i, floored at ENERGYFLOOR
231 |     e=numpy.zeros(nfr10,  dtype='float64')
232 |     e=numpy.insert(e,0,'inf')
233 | 
234 |     for i in range(1, nfr10+1):
235 |         for j in range(1, flen+1):
236 |             e[i]=e[i]+ numpy.square(fdata_[(i-1)*fsh10+j])
237 |     
238 |         if numpy.less_equal(e[i], ENERGYFLOOR):
239 |             e[i]=ENERGYFLOOR
240 | 
241 |     # segsnr, segsnrsmth and sign_segsnr are assigned here but not read anywhere below
242 |     segsnr=numpy.zeros(nfr10); segsnr=numpy.insert(segsnr,0,'inf')
243 |     segsnrsmth=1
244 |     sign_segsnr=0
245 |     D=numpy.zeros(nfr10); D=numpy.insert(D,0,'inf')
246 |     postsnr=numpy.zeros(nfr10, dtype='float64'); postsnr=numpy.insert(postsnr,0,'inf')
247 |     snre_vad=numpy.zeros(nfr10); snre_vad=numpy.insert(snre_vad,0,'inf')
248 |     sign_pv=0
249 | 
250 |     # Pass 1: for every run of pvblk_==1 (frames nstart..nstop) compute the smoothed
251 |     # measure Dsmth and set snre_vad=1 where it exceeds vadThres times the mean of
252 |     # Dsmth over the run's pitch frames (pv01_==1).
253 |     for i in range(1, nfr10+1):
254 |         
255 |         if (pvblk_[i]==1) and (sign_pv==0):
256 |              nstart=i
257 |              sign_pv=1
258 | 
259 |         elif ( (pvblk_[i]==0) or (i==nfr10) ) and (sign_pv==1): 
260 | 
261 |              nstop=i-1
262 |              if i==nfr10:
263 |                   nstop=i
264 |              sign_pv=0
265 |              datai=fdata_[range( (nstart-1)*fsh10+1, (nstop-1)*fsh10+flen-fsh10+1) ]
266 |              datai=numpy.insert(datai,0,'inf')
267 |              # NOTE(review): e[j] already holds the frame energy from the loop above and the same samples are added again here -- confirm intended
268 |              for j in range(nstart, nstop-1+1):  #previously it was for j=nstart:nstop-1
269 |                   for h in range(1, flen+1):
270 |                       e[j]=e[j]+ numpy.square(datai[(j-nstart)*fsh10+h] )
271 |                   if numpy.less_equal(e[j], ENERGYFLOOR):
272 |                       e[j]=ENERGYFLOOR
273 |              
274 |              e[nstop]=e[nstop-1]
275 | 
276 |              # emin: sorted run energy at the 10% position (index 0, the 'inf' placeholder, for runs shorter than 10 frames)
277 |              eY=numpy.sort(e[range(nstart, nstop+1)] )
278 |              eY=numpy.insert(eY,0,'inf') #as [0] is discarding
279 | 
280 |              emin=eY[int(numpy.floor((nstop-nstart+1)*0.1))]
281 |              
282 |              # postsnr[j] = max(log10(e[j]) - log10(emin), 0)
283 |              # D[j] = sqrt(|e[j]-e[j-1]| * postsnr[j])
284 | 
285 |              for j in range(nstart+1, nstop+1):
286 |                   
287 |                   postsnr[j] =math.log10(e[j]) - math.log10(emin)
288 | 
289 |                   if numpy.less(postsnr[j], 0):
290 |                       postsnr[j]=0
291 |                   
292 |                   D[j]=math.sqrt(numpy.abs(e[j]-e[j-1])*postsnr[j] )
293 |              
294 |              D[nstart]=D[nstart+1]
295 | 
296 |              # Dexp: D over the run with Dexpl copies of D[nstart] in front and Dexpr copies of D[nstop] behind
297 |              tm1 = numpy.hstack((numpy.ones(Dexpl)*D[nstart], D[range(nstart, nstop+1)]))
298 |              Dexp = numpy.hstack((tm1, numpy.ones(Dexpr)*D[nstop] ))
299 |              
300 |              Dexp = numpy.insert(Dexp,0,'inf')
301 |              # Dsmth: sum of Dexpl+Dexpr consecutive Dexp values per frame of the run
302 |              for j in range(0, nstop-nstart+1):
303 |                   Dsmth[nstart+j]=sum(Dexp[range(j+1, j+Dexpl+Dexpr+1)])
304 |              # NOTE(review): the divisor is the number of pitch frames in the run; it would be 0 for a run without any -- confirm callers rule that out
305 |              Dsmth_thres=sum(Dsmth[range(nstart, nstop+1)]*pv01_[range(nstart, nstop+1)])/sum(pv01_[range(nstart,nstop+1)])
306 | 
307 |              for j in range(nstart, nstop+1):
308 |                   if numpy.greater(Dsmth[j], Dsmth_thres*vadThres):
309 |                       snre_vad[j]=1 
310 |                      
311 |     # pv_vad starts as a copy of snre_vad and is refined by the passes below
312 |     pv_vad=deepcopy(snre_vad)       
313 |         
314 |     # Pass 2: per run of snre_vad==1, clear pv_vad before (first pitch frame - nexpl) and after (last pitch frame + nexpr)
315 |     nexpl=33
316 |     nexpr=47 # % 29 and 39, estimated statistically, 95% ; 33, 47 %98 for voicebox pitch
317 |     sign_vad=0
318 |     for i in range(1, nfr10+1):
319 |         if (snre_vad[i]==1) and (sign_vad==0):
320 |              nstart=i
321 |              sign_vad=1
322 |         elif ((snre_vad[i]==0) or (i==nfr10)) and (sign_vad==1):
323 |              nstop=i-1
324 |              if i==nfr10:
325 |                   nstop=i
326 |              sign_vad=0
327 |              for j in range(nstart, nstop+1):
328 |                   if pv01_[j]==1:
329 |                      break
330 |              # j is the first pitch frame of the run (or nstop when the run has none)
331 |              
332 |              pv_vad[range(nstart, numpy.max([j-nexpl-1,1])+1)]=0
333 |              
334 |              for j in range(0, nstop-nstart+1):
335 |                   if pv01_[nstop-j]==1:
336 |                       break
337 |              # nstop-j is the last pitch frame of the run (or nstart when the run has none)
338 |         
339 |              pv_vad[range(nstop-j+1+nexpr,nstop+1)]=0
340 |     # Pass 3: per run, set pv_vad around the pitch frames when there are more than 4, then drop weak runs
341 |     nexpl =5; nexpr=12 #; % 9 and 13, estimated statistically 5%; 5, 12 %2 for voicebox pitch
342 |     sign_vad=0
343 |     for i in range(1,nfr10+1):
344 |         if (snre_vad[i]==1) and (sign_vad==0):
345 |              nstart=i
346 |              sign_vad=1
347 |         elif ((snre_vad[i]==0) or (i==nfr10) ) and (sign_vad==1):
348 |              nstop=i-1  
349 |              if i==nfr10:
350 |                   nstop=i
351 |              sign_vad=0
352 |              # more than 4 pitch frames: mark nexpl frames before the first and up to nexpr frames after the last one
353 |              if  numpy.greater(sum(pv01_[range(nstart,nstop+1)]), 4):
354 |                   for j in range(nstart,nstop+1):
355 |                      if pv01_[j]==1:
356 |                          break
357 |                   
358 |                   pv_vad[range(numpy.maximum(j-nexpl,1),j-1+1)]=1
359 |                   for j in range(0,nstop-nstart+1):
360 |                      if pv01_[nstop-j]==1:
361 |                          break
362 |                   pv_vad[range(nstop-j+1,min(nstop-j+nexpr,nfr10)+1)]=1
363 |         
364 |              # clear the run when its mean frame energy is below 0.001 ...
365 |              esegment=sum(e[range(nstart,nstop+1)])/(nstop-nstart+1)
366 |              if numpy.less(esegment, 0.001):
367 |                   pv_vad[range(nstart, nstop+1)]=0
368 |              # ... or when it holds at most 2 pitch frames
369 |              if numpy.less_equal(sum(pv01_[range(nstart,nstop+1)]),  2):
370 |                   pv_vad[range(nstart,nstop+1)] = 0
371 |         
372 |     # esum: total energy e over all runs of pv_vad==1
373 |     sign_vad=0
374 |     esum=0
375 |     for i in range(1,nfr10+1):
376 |         if (pv_vad[i]==1) and (sign_vad==0):
377 |              nstart=i
378 |              sign_vad=1
379 |         elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1):
380 |              nstop=i-1
381 |              if i==nfr10:
382 |                   nstop=i
383 |              sign_vad=0
384 |              esum=esum+sum(e[range(nstart, nstop+1)])
385 |              
386 |     # eave: esum divided by the number of pv_vad frames (eps keeps the divisor non-zero)
387 |     eps = numpy.finfo(float).eps
388 | 
389 |     eave=esum/(sum(pv_vad[1:len(pv_vad)])+eps) # except [0] index 'inf'
390 |     
391 |     # The loop below only tracks nstart/nstop: the test that used eave is commented out,
392 |     # so it does not change pv_vad.
393 |     sign_vad=0
394 |     for i in range(1,nfr10+1):
395 |         if (pv_vad[i]==1) and (sign_vad==0):
396 |              nstart=i
397 |              sign_vad=1
398 |         elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1):
399 |              nstop=i-1
400 |              if i==nfr10:
401 |                   nstop=i
402 |              sign_vad=0
403 |             
404 |              #if numpy.less(sum(e[range(nstart,nstop+1)])/(nstop-nstart+1), eave*0.05):
405 |                   #pv_vad[range(nstart,nstop+1)] = 0
406 |         
407 |     # collect the runs of pv_vad==1 as rows [nstart, nstop] (still 1-based here)
408 |     sign_vad=0
409 |     vad_seg=numpy.zeros((nfr10,2), dtype="int64")
410 |     n_vad_seg=-1 #for indexing array
411 |     for i in range(1,nfr10+1):
412 |         if (pv_vad[i]==1) and (sign_vad==0):
413 |              nstart=i
414 |              sign_vad=1
415 |         elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1):
416 |              nstop=i-1  # no i==nfr10 adjustment here, unlike the loops above
417 |              sign_vad=0
418 |              n_vad_seg=n_vad_seg+1
419 |              #print i, n_vad_seg, nstart, nstop
420 |              vad_seg[n_vad_seg,:]=numpy.array([nstart, nstop])
421 |     
422 |     # keep only the rows that were filled in
423 |     vad_seg=vad_seg[:n_vad_seg+1,]
424 | 
425 | 
426 |     # convert the segment bounds back to 0-based frame indices
427 |     vad_seg = vad_seg - 1
428 | 
429 |     #print vad_seg
430 | 
431 |     # make one dimension array of (0/1) 
432 |     xYY=numpy.zeros(nfr10, dtype="int64")
433 |     for i in range(len(vad_seg)):  
434 |         k=range(vad_seg[i,0], vad_seg[i,1]+1)
435 |         xYY[k]=1
436 | 
437 |     vad_seg=xYY
438 | 
439 | 
440 |     return vad_seg
441 | 
442 | 
443 | 
444 | def pitchblockdetect(pv01, pitch, nfr10, opts):
445 |    
446 | 
447 |     pv01_=deepcopy(pv01)
448 | 
449 |     if nfr10 == len(pv01_)+1:
450 |        numpy.append(pv01_, pv01_[nfr10-1])  
451 |     if opts == 0:
452 |         sign_pv=0
453 |         for i in range(0, nfr10):
454 | 
455 |              if ( pv01_[i]==1) and (sign_pv==0):
456 |  
457 |                   nstart, sign_pv =i, 1
458 | 
459 |              elif ( (pv01_[i] == 0) or (i==nfr10-1) ) and (sign_pv==1):
460 | 
461 |                   nstop=i
462 |                   if i==nfr10-1:
463 |                      nstop=i+1
464 |                   sign_pv=0
465 |                   pitchseg=numpy.zeros(nstop-nstart)
466 |                   #print len(pitchseg)
467 |                   for j in range (nstart, nstop):
468 |                      
469 |                      pitchseg[j-nstart]=pitch[j];
470 |         
471 |                   if (sum(numpy.abs( numpy.round( pitchseg-numpy.average(pitchseg) ) ))==0)  and (nstop-nstart+1>=10):
472 |                      pv01_[range(nstart,nstop)]=0 
473 |     #
474 |     sign_pv=0
475 |     pvblk=deepcopy(pv01_)   
476 | 
477 |     #print i
478 |     for i in range(0, nfr10):
479 |         
480 |         if (pv01_[i]==1) and (sign_pv==0):
481 |              #print("i=%s " %(i))
482 |              nstart, sign_pv=i, 1
483 |              pvblk[range(max([nstart-60,0]), nstart+1)]=1
484 |              #print("fm P2: i=%s %s % " %(i,max([nstart-60,0]), nstart+1))
485 |              
486 |         elif ( (pv01_[i] ==0) or (i==nfr10-1 )) and (sign_pv==1):
487 | 
488 |              nstop, sign_pv= i, 0
489 | 
490 |              pvblk[range(nstop, numpy.amin([nstop+60,nfr10-1])+1 )]=1 
491 |              #print("fm P2: i=%s %s %s " %(i,nstop, numpy.amin([nstop+60,nfr10-1])+1 ))
492 |             
493 |     return pvblk 
494 | 
495 | 
496 | 


--------------------------------------------------------------------------------
/rVAD2.0/specsub_noiseseg_lfn.m:
--------------------------------------------------------------------------------
  1 | function [ss,gg,tt,ff,zo]=specsub_noiseseg_lfn(si,fsz,noise_seg,pv01,pp)
  2 | %SPECSUB performs speech enhancement using spectral subtraction [SS,ZO]=(S,FSZ,P)
  3 | %
  4 | % Usage: (1) y=specsub(x,fs);   % enhance the speech using default parameters
  5 | %
  6 | % Inputs:
  7 | %   si      input speech signal
  8 | %   fsz     sample frequency in Hz
  9 | %           Alternatively, the input state from a previous call (see below)
 10 | %   pp      algorithm parameters [optional]
 11 | %
 12 | % Outputs:
 13 | %   ss        output enhanced speech
 14 | %   gg(t,f,i) selected time-frequency values (see pp.tf below)
 15 | %   tt        centre of frames (in seconds)
 16 | %   ff        centre of frequency bins (in Hz)
 17 | %   zo        output state (or the 2nd argument if gg,tt,ff are omitted)
 18 | %
 19 | % The algorithm operation is controlled by a small number of parameters:
 20 | %
 21 | %        pp.of          % overlap factor = (fft length)/(frame increment) [2]
 22 | %        pp.ti          % desired frame increment [0.016 seconds]
 23 | %        pp.ri          % set to 1 to round ti to the nearest power of 2 samples [0]
 24 | %        pp.g           % subtraction domain: 1=magnitude, 2=power [1]
 25 | %        pp.e           % gain exponent [1]
 26 | %        pp.am          % max oversubtraction factor [3]
 27 | %        pp.b           % max noise attenuation in power domain [0.01]
 28 | %        pp.al          % SNR for oversubtraction=am (set this to Inf for fixed a) [-5 dB]
 29 | %        pp.ah          % SNR for oversubtraction=1 [20 dB]
 30 | %        pp.ne          % noise estimation: 0=min statistics, 1=MMSE [0]
 31 | %        pp.bt          % threshold for binary gain or -1 for continuous gain [-1]
 32 | %        pp.mx          % input mixture gain [0]
 33 | %        pp.gh          % maximum gain for noise floor [1]
 34 | %        pp.rf          % round output signal to an exact number of frames [0]
 35 | %        pp.tf          % selects time-frequency planes to output in the gg() variable ['g']
 36 | %                           'i' = input power spectrum
 37 | %                           'I' = input complex spectrum
 38 | %                           'n' = noise power spectrum
 39 | %                           'g' = gain
 40 | %                           'o' = output power spectrum
 41 | %                           'O' = output complex spectrum
 42 | %
 43 | % Following [1], the magnitude-domain gain in each time-frequency bin is given by
 44 | %                          gain=mx+(1-mx)*max((1-(a*N/X)^(g/2))^(e/g),min(gh,(b*N/X)^(e/2)))
 45 | % where N and X are the powers of the noise and noisy speech respectively.
 46 | % The oversubtraction factor varies linearly between a=am for a frame SNR of al down to
 47 | % a=1 for a frame SNR of ah. To obtain a fixed value of a for all values of SNR, set al=Inf.
 48 | % Common exponent combinations are:
 49 | %                      g=1  e=1    Magnitude Domain spectral subtraction
 50 | %                      g=2  e=1    Power Domain spectral subtraction
 51 | %                      g=2  e=2    Wiener filtering
 52 | % Many authors use the parameters alpha=a^(g/2), beta=b^(g/2) and gamma2=e/g instead of a, b and e
 53 | % but this increases interdependence amongst the parameters.
 54 | % If bt>=0 then the max(...) expression above is thresholded to become 0 or 1.
 55 | %
 56 | % In addition it is possible to specify parameters for the noise estimation algorithm
 57 | % which implements reference [2] or [3] according to the setting of pp.ne
 58 | %
 59 | % Minimum statistics noise estimate [2]: pp.ne=0 
 60 | %        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
 61 | %        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
 62 | %        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
 63 | %        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
 64 | %        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
 65 | %        pp.qeqmin    % (23): minimum value of Qeq [2]
 66 | %        pp.qeqmax    % max value of Qeq per frame [14]
 67 | %        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
 68 | %        pp.td        % time to take minimum over [1.536 seconds]
 69 | %        pp.nu        % number of subwindows to use [3]
 70 | %        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
 71 | %        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
 72 | %
 73 | % MMSE noise estimate [3]: pp.ne=1 
 74 | %        pp.tax      % smoothing time constant for noise power estimate [0.0717 seconds](8)
 75 | %        pp.tap      % smoothing time constant for smoothed speech prob [0.152 seconds](23)
 76 | %        pp.psthr    % threshold for smoothed speech probability [0.99] (24)
 77 | %        pp.pnsaf    % noise probability safety value [0.01] (24)
 78 | %        pp.pspri    % prior speech probability [0.5] (18)
 79 | %        pp.asnr     % active SNR in dB [15] (18)
 80 | %        pp.psini    % initial speech probability [0.5] (23)
 81 | %        pp.tavini   % assumed speech absent time at start [0.064 seconds]
 82 | %
 83 | % If convenient, you can call specsub in chunks of arbitrary size. Thus the following are equivalent:
 84 | %
 85 | %                   (a) y=specsub(s,fs);
 86 | %
 87 | %                   (b) [y1,z]=specsub(s(1:1000),fs);
 88 | %                       [y2,z]=specsub(s(1001:2000),z);
 89 | %                       y3=specsub(s(2001:end),z);
 90 | %                       y=[y1; y2; y3];
 91 | %
 92 | % If the number of output arguments is either 2 or 5, the last partial frame of samples will
 93 | % be retained for overlap adding with the output from the next call to specsub().
 94 | %
 95 | % See also ssubmmse() for an alternative gain function
 96 | %
 97 | % Refs:
 98 | %    [1] M. Berouti, R. Schwartz and J. Makhoul
 99 | %        Enhancement of speech corrupted by acoustic noise
100 | %        Proc IEEE ICASSP, 1979, 4, 208-211
101 | %    [2] Rainer Martin.
102 | %        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
103 | %        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.
104 | %    [3] Gerkmann, T. & Hendriks, R. C.
105 | %        Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay
106 | %        IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393
107 | 
108 | %      Copyright (C) Mike Brookes 2004
109 | %      Version: $Id: specsub.m 1720 2012-03-31 17:17:31Z dmb $
110 | %
111 | %   VOICEBOX is a MATLAB toolbox for speech processing.
112 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
113 | %
114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
115 | %   This program is free software; you can redistribute it and/or modify
116 | %   it under the terms of the GNU General Public License as published by
117 | %   the Free Software Foundation; either version 2 of the License, or
118 | %   (at your option) any later version.
119 | %
120 | %   This program is distributed in the hope that it will be useful,
121 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
122 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
123 | %   GNU General Public License for more details.
124 | %
125 | %   You can obtain a copy of the GNU General Public License from
126 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
127 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
128 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
129 | %
130 | % Modified code: Zheng-Hua Tan, 2012
131 | %
132 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
133 | 
134 | if numel(si)>length(si)
135 |     error('Input speech signal must be a vector not a matrix');
136 | end
137 | if isstruct(fsz)
138 |     fs=fsz.fs;
139 |     qq=fsz.qq;
140 |     qp=fsz.qp;
141 |     ze=fsz.ze;
142 |     s=zeros(length(fsz.si)+length(si(:)),1); % allocate space for speech
143 |     s(1:length(fsz.si))=fsz.si;
144 |     s(length(fsz.si)+1:end)=si(:);
145 | else
146 |     fs=fsz;     % sample frequency
147 |     s=si(:);
148 |     % default algorithm constants
149 | 
150 |     qq.of=2;   % overlap factor = (fft length)/(frame increment)
151 |     qq.ti=16e-3;   % desired frame increment (16 ms)
152 |     qq.ri=0;       % round ni to the nearest power of 2
153 |     qq.g=1;        % subtraction domain: 1=magnitude, 2=power
154 |     qq.e=1;        % gain exponent
155 |     qq.am=3;      % max oversubtraction factor
156 |     qq.b=0.01;      % noise floor
157 |     qq.al=-5;       % SNR for maximum a (set to Inf for fixed a)
158 |     qq.ah=20;       % SNR for minimum a
159 |     qq.bt=-1;       % suppress binary masking
160 |     qq.ne=0;        % noise estimation: 0=min statistics, 1=MMSE [0]
161 |     qq.mx=0;        % no input mixing
162 |     qq.gh=1;        % maximum gain
163 |     qq.tf='g';      % output the gain time-frequency plane by default
164 |     qq.rf=0;
165 | %    if nargin>=3 && ~isempty(pp)
166 |     if nargin>=5 && ~isempty(pp)
167 |         qp=pp;      % save for estnoisem call
168 |         qqn=fieldnames(qq);
169 |         for i=1:length(qqn)
170 |             if isfield(pp,qqn{i})
171 |                 qq.(qqn{i})=pp.(qqn{i});
172 |             end
173 |         end
174 |     else
175 |         qp=struct;  % make an empty structure
176 |     end
177 | end
178 | % derived algorithm constants
179 | if qq.ri
180 |     ni=pow2(nextpow2(qq.ti*fs*sqrt(0.5)));
181 | else
182 |     ni=round(qq.ti*fs);    % frame increment in samples
183 | end
184 | tinc=ni/fs;          % true frame increment time
185 | tf=qq.tf;
186 | rf=qq.rf || nargout==2 || nargout==5;            % round down to an exact number of frames
187 | ne=qq.ne;           % noise estimation: 0=min statistics, 1=MMSE [0]
188 | 
189 | % calculate power spectrum in frames
190 | 
191 | no=round(qq.of);                                   % integer overlap factor
192 | nf=ni*no;           % fft length
193 | w=sqrt(hamming(nf+1))'; w(end)=[]; % for now always use sqrt hamming window
194 | w=w/sqrt(sum(w(1:ni:nf).^2));       % normalize to give overall gain of 1
195 | if rf>0
196 |     rfm='';                         % truncated input to an exact number of frames
197 | else
198 |     rfm='r';
199 | end
200 | 
201 | 
202 | [y,tt]=enframe(s,w,ni,rfm);
203 | 
204 | tt=tt/fs;                           % frame times
205 | yf=rfft(y,nf,2);
206 | yp=yf.*conj(yf);        % power spectrum of input speech
207 | [nr,nf2]=size(yp);              % number of frames
208 | ff=(0:nf2-1)*fs/nf;
209 | 
210 | 
211 | if isstruct(fsz)
212 |     if ne>0
213 |         [dp,ze]=estnoiseg(yp,ze);       % estimate the noise using MMSE
214 |     else
215 |         if size(noise_seg)<nr; noise_seg(nr) = 0; end % to match the size with yp. 
216 |         [dp,ze]=estnoisem_noiseseg(yp,noise_seg,pv01,ze);       % estimate the noise using minimum statistics
217 |     end
218 |     ssv=fsz.ssv;
219 | else
220 |     if ne>0
221 |         [dp,ze]=estnoiseg(yp,tinc,qp);	% estimate the noise using MMSE
222 |     else
223 |         if size(noise_seg,1)<nr; noise_seg(nr) = 0; end % to match the size with yp.
224 |         [dp,ze]=estnoisem_noiseseg(yp,noise_seg,pv01,tinc,qp);	% estimate the noise using minimum statistics
225 |     end
226 |     ssv=zeros(ni*(no-1),1);             % dummy saved overlap
227 | end
228 | 
229 | 
230 | 
231 | if ~nr                                  % no data frames
232 |     ss=[];
233 |     gg=[];
234 | else
235 |     mz=yp==0;   %  mask for zero power time-frequency bins (unlikely)
236 |     if qq.al<Inf
237 |         ypf=sum(yp,2);
238 |         dpf=sum(dp,2);
239 |         mzf=dpf==0;     % zero noise frames = very high SNR
240 |         af=1+(qq.am-1)*(min(max(10*log10(ypf./(dpf+mzf)),qq.al),qq.ah)-qq.ah)/(qq.al-qq.ah);
241 |         af(mzf)=1;      % fix the zero noise frames
242 |     else
243 |         af=repmat(qq.am,nr,1);
244 |     end
245 |     switch qq.g
246 |         case 1   % magnitude domain subtraction
247 |             v=sqrt(dp./(yp+mz));
248 |             af=sqrt(af);
249 |             bf=sqrt(qq.b);
250 |         case 2   % power domain subtraction
251 |             v=dp./(yp+mz);
252 |             bf=qq.b;
253 |         otherwise % arbitrary subtraction domain
254 |             v=(dp./(yp+mz)).^(0.5*qq.g);
255 |             af=af.^(0.5*qq.g);
256 |             bf=qq.b^(0.5*qq.g);
257 |     end
258 |     af =repmat(af,1,nf2);       % replicate frame oversubtraction factors for each frequency
259 |     mf=v>=(af+bf).^(-1);        % mask for noise floor limiting
260 |     g=zeros(size(v));           % reserve space for gain matrix
261 |     eg=qq.e/qq.g;               % gain exponent relative to subtraction domain
262 |     gh=qq.gh;
263 |     switch eg
264 |         case 1                          % Normal case
265 |             g(mf)=min(bf*v(mf),gh);      % never give a gain > 1
266 |             g(~mf)=1-af(~mf).*v(~mf);
267 |         case 0.5
268 |             g(mf)=min(sqrt(bf*v(mf)),gh);
269 |             g(~mf)=sqrt(1-af(~mf).*v(~mf));
270 |         otherwise
271 |             g(mf)=min((bf*v(mf)).^eg,gh);
272 |             g(~mf)=(1-af(~mf).*v(~mf)).^eg;
273 |     end
274 |     if qq.bt>=0
275 |         g=g>qq.bt;
276 |     end
277 |     g=qq.mx+(1-qq.mx)*g;    % mix in some of the input
278 |     
279 |     out=yf.*g;  
280 |     out_p=out.*conj(out);
281 |     out_pf=sum(out_p,2);   
282 | 
283 | 
284 |     %--low frequency noise 
285 |     for i=1:nr    
286 |         if sum(yp(i,1:7),2)>sum(yp(i,:),2)/2   %% 7 frequecy bins condition
287 |             yp(i,1:7)=0;
288 |             out_p(i,1:7)=0;
289 |             out(i,1:7)=0+0i;
290 |         end
291 |     end
292 |     
293 | 
294 |     out_dpft=dpf/2;    
295 |     out_smth=ones(nr,1);
296 |     for i=1:nr
297 |         if out_pf(i)<out_dpft(i) %thres
298 |             out_smth(i)=0;
299 |         else
300 |             out_smth(i)=1;
301 |         end
302 |     end
303 |     for i=1:10
304 |         if out_pf(i)<out_dpft(i) %thres
305 |             out(i,:)=0+0i;
306 |         end
307 |     end
308 |     for i=nr-9:nr
309 |         if out_pf(i)<out_dpft(i) %thres
310 |             out(i,:)=0+0i;
311 |         end
312 |     end
313 |     for i=11:nr-10
314 |         if out_pf(i)<out_dpft(i) && sum(out_smth(i-10:i+10))<5
315 |             out(i,:)=0+0i;            
316 |         end
317 |     end
318 | 
319 | 
320 | 
321 |     se=(irfft((out).',nf).').*repmat(w,nr,1);   % inverse dft and apply output window
322 |    
323 |     ss=zeros(ni*(nr+no-1),no);                      % space for overlapped output speech
324 |     ss(1:ni*(no-1),end)=ssv;
325 |     for i=1:no   % e.g. no=2, ni=128, nr=number of frames
326 |         nm=nf*(1+floor((nr-i)/no));  % number of samples in this set
327 |         ss(1+(i-1)*ni:nm+(i-1)*ni,i)=reshape(se(i:no:nr,:)',nm,1);
328 |     end
329 |     ss=sum(ss,2);
330 | 
331 |     if nargout>2 && ~isempty(tf)
332 |         gg=zeros(nr,nf2,length(tf));  % make space
333 |         for i=1:length(tf)
334 |             switch tf(i)
335 |                 case 'i'            % 'i' = input power spectrum
336 |                     gg(:,:,i)=yp;
337 |                 case 'I'            % 'i' = input power spectrum
338 |                     gg(:,:,i)=yf;
339 |                 case 'n'            % 'n' = noise power spectrum
340 |                     gg(:,:,i)=dp;
341 |                 case 'g'            % 'g' = gain
342 |                     gg(:,:,i)=g;
343 |                 case 'o'            % 'o' = output power spectrum
344 |                     gg(:,:,i)=yp.*g.^2;
345 |                 case 'O'            % 'o' = output power spectrum
346 |                     gg(:,:,i)=yf.*g;
347 |             end
348 |         end
349 |     end
350 | end
351 | if nargout==2 || nargout==5
352 |     if nr
353 |         zo.ssv=ss(end-ni*(no-1)+1:end);    % save the output tail for next time
354 |         ss(end-ni*(no-1)+1:end)=[];
355 |     else
356 |         zo.ssv=ssv;  %
357 |     end
358 |     zo.si=s(length(ss)+1:end);      % save the tail end of the input speech signal
359 |     zo.fs=fs;                       % save sample frequency
360 |     zo.qq=qq;                       % save local parameters
361 |     zo.qp=qp;                       % save estnoisem parameters
362 |     zo.ze=ze;                       % save state of noise estimation
363 |     if nargout==2
364 |         gg=zo;                      % 2nd of two arguments is zo
365 |     end
366 | elseif rf==0
367 |     ss=ss(1:length(s));             % trim to the correct length if not an exact number of frames
368 | end
369 | if ~nargout && nr>0
370 |     ffax=ff/1000;    ax=zeros(4,1);
371 |     ax(1)=subplot(223);
372 |     imagesc(tt,ffax,20*log10(g)');
373 |     colorbar;
374 |     axis('xy');
375 |     if qq.al==Inf
376 |         title(sprintf('Filter Gain (dB): a=%.2g, b=%.3g',qq.am,qq.b));
377 |     else
378 |         title(sprintf('Filter Gain (dB): a=%.2g (%.0f to %.0fdB), b=%.3g',qq.am,qq.al,qq.ah,qq.b));
379 |     end
380 |     xlabel('Time (s)');
381 |     ylabel('Frequency (kHz)');
382 | 
383 |     ax(2)=subplot(222);
384 |     imagesc(tt,ffax,10*log10(yp)');
385 |     colorbar;
386 |     axis('xy');
387 |     title('Noisy Speech (dB)');
388 |     xlabel('Time (s)');
389 |     ylabel('Frequency (kHz)');
390 | 
391 |     ax(3)=subplot(224);
392 |     imagesc(tt,ffax,10*log10(yp.*g.^2)');
393 |     colorbar;
394 |     axis('xy');
395 |     title(sprintf('Enhanced Speech (dB): g=%.2g, e=%.3g',qq.g,qq.e));
396 |     xlabel('Time (s)');
397 |     ylabel('Frequency (kHz)');
398 | 
399 |     ax(4)=subplot(221);
400 |     imagesc(tt,ffax,10*log10(dp)');
401 |     colorbar;
402 |     axis('xy');
403 |     title('Noise Estimate (dB)');
404 |     xlabel('Time (s)');
405 |     ylabel('Frequency (kHz)');
406 |     linkaxes(ax);
407 | end
408 | 


--------------------------------------------------------------------------------
/rVAD2.0/estnoisem.m:
--------------------------------------------------------------------------------
  1 | function [x,zo,xs]=estnoisem(yf,tz,pp)
  2 | %ESTNOISEM - estimate noise spectrum using minimum statistics
  3 | %
  4 | % Usage:    ninc=round(0.016*fs);   % frame increment [fs=sample frequency]
  5 | %           ovf=2;                  % overlap factor
  6 | %           f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2);
  7 | %           f=f.*conj(f);           % convert to power spectrum
  8 | %           x=estnoisem(f,ninc/fs); % estimate the noise power spectrum
  9 | %
 10 | % Inputs:
 11 | %   yf      input power spectra (one row per frame)
 12 | %   tz      frame increment in seconds
 13 | %           Alternatively, the input state from a previous call (see below)
 14 | %   pp      algorithm parameters [optional]; ignored when tz is a state structure
 15 | %
 16 | % Outputs:
 17 | %   x       estimated noise power spectra (one row per frame)
 18 | %   zo      output state (pass as tz on the next call to continue processing)
 19 | %   xs      estimated std error of x (one row per frame)
 20 | %           xs seems often to be an underestimate by a factor of 2 or 3
 21 | %
 22 | % The algorithm parameters are defined in reference [1] from which equation
 23 | % numbers are given in parentheses. They are as follows:
 24 | %
 25 | %        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
 26 | %        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
 27 | %        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
 28 | %        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
 29 | %        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
 30 | %        pp.qeqmin    % (23): minimum value of Qeq [2]
 31 | %        pp.qeqmax    % max value of Qeq per frame [14]
 32 | %        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
 33 | %        pp.td        % time to take minimum over [1.536 seconds]
 34 | %        pp.nu        % number of subwindows to use [8]
 35 | %        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
 36 | %        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
 37 | %
 38 | % Example use:      y=enframe(s,w,ni);                  % divide speech signal s(n) into
 39 | %                                                       % overlapping frames using window w(n)
 40 | %                   yf=rfft(y,nf,2);                    % take fourier transform
 41 | %                   dp=estnoisem(yf.*conj(yf),tinc);    % estimate the noise
 42 | %
 43 | % If convenient, you can call estnoisem in chunks of arbitrary size. Thus the following are equivalent:
 44 | %
 45 | %                   (a) dp=estnoisem(yp(1:300,:),tinc);
 46 | %
 47 | %                   (b) [dp(1:100,:),z]=estnoisem(yp(1:100,:),tinc);
 48 | %                       [dp(101:200,:),z]=estnoisem(yp(101:200,:),z);
 49 | %                       [dp(201:300,:),z]=estnoisem(yp(201:300,:),z);
 50 | 
 51 | 
 52 | % This is intended to be a precise implementation of [1] with Table III
 53 | % replaced by the updated table 5 from [2]. The only deliberate algorithm
 54 | % change is the introduction of a minimum value for 1/Qeq in equation (23).
 55 | % This change only affects the first few frames and improves the
 56 | % convergence of the algorithm. A minor improvement was reported in [3] but
 57 | % this has not yet been included.
 58 | %
 59 | % Refs:
 60 | %    [1] Rainer Martin.
 61 | %        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
 62 | %        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.
 63 | %    [2] Rainer Martin.
 64 | %        Bias compensation methods for minimum statistics noise power spectral density estimation
 65 | %        Signal Processing, 2006, 86, 1215-1229
 66 | %    [3] Dirk Mauler and Rainer Martin
 67 | %        Noise power spectral density estimation on highly correlated data
 68 | %        Proc IWAENC, 2006
 69 | 
 70 | %	   Copyright (C) Mike Brookes 2008
 71 | %      Version: $Id: estnoisem.m 1718 2012-03-31 16:40:41Z dmb $
 72 | %
 73 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 74 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 75 | %
 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 77 | %   This program is free software; you can redistribute it and/or modify
 78 | %   it under the terms of the GNU General Public License as published by
 79 | %   the Free Software Foundation; either version 2 of the License, or
 80 | %   (at your option) any later version.
 81 | %
 82 | %   This program is distributed in the hope that it will be useful,
 83 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 84 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 85 | %   GNU General Public License for more details.
 86 | %
 87 | %   You can obtain a copy of the GNU General Public License from
 88 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 89 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 91 | 
 92 | [nr,nrf]=size(yf);          % number of frames and freq bins
 93 | x=zeros(nr,nrf);            % initialize output arrays
 94 | xs=zeros(nr,nrf);           % estimated std error of x (filled in the frame loop)
 95 | if isempty(yf) && isstruct(tz)             % no real data
 96 |     zo=tz;              % just keep the same state
 97 | else
 98 |     if isstruct(tz)       % take parameters from a previous call
 99 |         nrcum=tz.nrcum;  % number of frames processed in earlier calls
100 |         p=tz.p;          % smoothed power spectrum
101 |         ac=tz.ac;               % correction factor (9)
102 |         sn2=tz.sn2;              % estimated noise power
103 |         pb=tz.pb;               % smoothed noisy speech power (20)
104 |         pb2=tz.pb2;             % smoothed periodogram squared (21)
105 |         pminu=tz.pminu;         % current minimum estimate (copied into sn2 mid-window)
106 |         actmin=tz.actmin;   % Running minimum estimate
107 |         actminsub=tz.actminsub;           % sub-window minimum estimate
108 |         subwc=tz.subwc;                   % sub-window frame counter
109 |         actbuf=tz.actbuf;  % buffer to store subwindow minima
110 |         ibuf=tz.ibuf;      % index of the most recently written row of actbuf
111 |         lminflag=tz.lminflag;      % flag to remember local minimum
112 |         tinc=tz.tinc;     % frame increment
113 |         qq=tz.qq;         % parameter structure
114 |     else
115 |         tinc = tz;          % second argument is frame increment
116 |         nrcum=0;            % no frames so far
117 |         % default algorithm constants
118 | 
119 |         qq.taca=0.0449;    % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
120 |         qq.tamax=0.392;    % max smoothing time constant in (3) = -tinc/log(0.96)
121 |         qq.taminh=0.0133;    % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
122 |         qq.tpfall=0.064;   % time constant for P to fall (12)
123 |         qq.tbmax=0.0717;   % max smoothing time constant in (20) = -tinc/log(0.8)
124 |         qq.qeqmin=2;       % minimum value of Qeq (23)
125 |         qq.qeqmax=14;      % max value of Qeq per frame
126 |         qq.av=2.12;             % fudge factor for bc calculation (23 + 13 lines)
127 |         qq.td=1.536;       % time to take minimum over
128 |         qq.nu=8;           % number of subwindows
129 |         qq.qith=[0.03 0.05 0.06 Inf]; % Q-inverse thresholds used to select the maximum noise slope
130 |         qq.nsmdb=[47 31.4 15.7 4.1];  % corresponding maximum permitted +ve noise slopes in dB/s
131 | 
132 |         if nargin>=3 && ~isempty(pp)  % override defaults with any matching fields of pp
133 |             qqn=fieldnames(qq);
134 |             for i=1:length(qqn)
135 |                 if isfield(pp,qqn{i})
136 |                     qq.(qqn{i})=pp.(qqn{i});
137 |                 end
138 |             end
139 |         end
140 |     end
141 | 
142 |     % unpack parameter structure
143 | 
144 |     taca=qq.taca;    % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
145 |     tamax=qq.tamax;    % max smoothing time constant in (3) = -tinc/log(0.96)
146 |     taminh=qq.taminh;    % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
147 |     tpfall=qq.tpfall;   % time constant for P to fall (12)
148 |     tbmax=qq.tbmax;   % max smoothing time constant in (20) = -tinc/log(0.8)
149 |     qeqmin=qq.qeqmin;       % minimum value of Qeq (23)
150 |     qeqmax=qq.qeqmax;      % max value of Qeq per frame
151 |     av=qq.av;             % fudge factor for bc calculation (23 + 13 lines)
152 |     td=qq.td;       % time to take minimum over
153 |     nu=qq.nu;           % number of subwindows
154 |     qith=qq.qith; % Q-inverse thresholds used to select the maximum noise slope
155 |     nsmdb=qq.nsmdb;   % maximum permitted +ve noise slope in dB/s
156 | 
157 |     % derived algorithm constants
158 | 
159 |     aca=exp(-tinc/taca); % smoothing constant for alpha_c in equ (11) = 0.7
160 |     acmax=aca;          % lower limit applied to alpha_c-bar in (10); set equal to aca
161 |     amax=exp(-tinc/tamax); % max smoothing constant in (3) = 0.96
162 |     aminh=exp(-tinc/taminh); % min smoothing constant (upper limit) in (3) = 0.3
163 |     bmax=exp(-tinc/tbmax); % max smoothing constant in (20) = 0.8
164 |     snrexp = -tinc/tpfall;
165 |     nv=round(td/(tinc*nu));    % length of each subwindow in frames
166 |     if nv<4            % algorithm doesn't work for minuscule frames
167 |         nv=4;
168 |         nu=max(round(td/(tinc*nv)),1);
169 |     end
170 |     nd=nu*nv;           % length of total window in frames
171 |     [md,hd]=mhvals(nd); % constants M(D) and H(D) for the full window (hd is not used below)
172 |     [mv,hv]=mhvals(nv); % same constants for a sub-window of nv frames (hv is not used below)
173 |     nsms=10.^(nsmdb*nv*tinc/10);  % [8 4 2 1.2] in paper
174 |     qeqimax=1/qeqmin;  % maximum value of Qeq inverse (23)
175 |     qeqimin=1/qeqmax; % minimum value of Qeq per frame inverse
176 | 
177 |     if isempty(yf)      % provide dummy initialization (empty yf and tz is not a state)
178 |         ac=1;               % correction factor (9)
179 |         subwc=nv;                   % force a buffer switch on first loop
180 |         ibuf=0;
181 |         p=x;          % smoothed power spectrum
182 |         sn2=p;              % estimated noise power
183 |         pb=p;               % smoothed noisy speech power (20)
184 |         pb2=pb.^2;
185 |         pminu=p;
186 |         actmin=repmat(Inf,1,nrf);   % Running minimum estimate
187 |         actminsub=actmin;           % sub-window minimum estimate
188 |         actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
189 |         lminflag=zeros(1,nrf);      % flag to remember local minimum
190 |     else
191 | 
192 |         if ~nrcum       % initialize values for first frame
193 |             p=yf(1,:);          % smoothed power spectrum
194 |             ac=1;               % correction factor (9)
195 |             sn2=p;              % estimated noise power
196 |             pb=p;               % smoothed noisy speech power (20)
197 |             pb2=pb.^2;
198 |             pminu=p;
199 |             actmin=repmat(Inf,1,nrf);   % Running minimum estimate
200 |             actminsub=actmin;           % sub-window minimum estimate
201 |             subwc=nv;                   % force a buffer switch on first loop
202 |             actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
203 |             ibuf=0;
204 |             lminflag=zeros(1,nrf);      % flag to remember local minimum
205 |         end
206 | 
207 |         % loop for each frame
208 | 
209 |         for t=1:nr              % we use t instead of lambda in the paper
210 |             yft=yf(t,:);        % noisy speech power spectrum
211 |             acb=(1+(sum(p)./sum(yft)-1).^2).^(-1);  % alpha_c-bar(t)  (9)
212 |             ac=aca*ac+(1-aca)*max(acb,acmax);       % alpha_c(t)  (10)
213 |             ah=amax*ac.*(1+(p./sn2-1).^2).^(-1);    % alpha_hat: smoothing factor per frequency (11)
214 |             snr=sum(p)/sum(sn2);
215 |             ah=max(ah,min(aminh,snr^snrexp));       % lower limit for alpha_hat (12)
216 | 
217 |             p=ah.*p+(1-ah).*yft;            % smoothed noisy speech power (3)
218 |             b=min(ah.^2,bmax);              % smoothing constant for estimating periodogram variance (22 + 2 lines)
219 |             pb=b.*pb + (1-b).*p;            % smoothed periodogram (20)
220 |             pb2=b.*pb2 + (1-b).*p.^2;     	% smoothed periodogram squared (21)
221 | 
222 |             qeqi=max(min((pb2-pb.^2)./(2*sn2.^2),qeqimax),qeqimin/(t+nrcum));   % Qeq inverse (23)
223 |             qiav=sum(qeqi)/nrf;             % Average over all frequencies (23+12 lines) (ignore non-duplication of DC and nyquist terms)
224 |             bc=1+av*sqrt(qiav);             % bias correction factor (23+11 lines)
225 |             bmind=1+2*(nd-1)*(1-md)./(qeqi.^(-1)-2*md);      % we use the simplified form (17) instead of (15)
226 |             bminv=1+2*(nv-1)*(1-mv)./(qeqi.^(-1)-2*mv);      % same expression but for sub windows
227 |             kmod=bc*p.*bmind<actmin;        % Frequency mask for new minimum
228 |             if any(kmod)
229 |                 actmin(kmod)=bc*p(kmod).*bmind(kmod);
230 |                 actminsub(kmod)=bc*p(kmod).*bminv(kmod);
231 |             end
232 |             if subwc>1 && subwc<nv              % middle of buffer - allow a local minimum
233 |                 lminflag=lminflag | kmod;    	% potential local minimum frequency bins
234 |                 pminu=min(actminsub,pminu);
235 |                 sn2=pminu;
236 |             else                                % subwc<=1 or subwc>=nv; when subwc<=1 nothing below runs and sn2 is left unchanged
237 |                 if subwc>=nv                    % end of buffer - do a buffer switch
238 |                     ibuf=1+rem(ibuf,nu);     	% increment actbuf storage pointer
239 |                     actbuf(ibuf,:)=actmin;    	% save sub-window minimum
240 |                     pminu=min(actbuf,[],1);
241 |                     i=find(qiav<qith);          % thresholds exceeding qiav; the first one selects the slope limit
242 |                     nsm=nsms(i(1));          	% noise slope max
243 |                     lmin=lminflag & ~kmod & actminsub<nsm*pminu & actminsub>pminu; % bins where a flagged local minimum may raise pminu
244 |                     if any(lmin)
245 |                         pminu(lmin)=actminsub(lmin);
246 |                         actbuf(:,lmin)=repmat(pminu(lmin),nu,1);
247 |                     end
248 |                     lminflag(:)=0;
249 |                     actmin(:)=Inf;
250 |                     subwc=0;
251 |                 end
252 |             end
253 |             subwc=subwc+1;
254 |             x(t,:)=sn2;                         % noise estimate for this frame
255 |             qisq=sqrt(qeqi);
256 |             % empirical formula for standard error based on Fig 15 of [2]
257 |             xs(t,:)=sn2.*sqrt(0.266*(nd+100*qisq).*qisq/(1+0.005*nd+6/nd)./(0.5*qeqi.^(-1)+nd-1));
258 |         end
259 |     end
260 |     if nargout>1    % we need to store the state for next time
261 |         zo.nrcum=nrcum+nr;      % number of frames so far
262 |         zo.p=p;          % smoothed power spectrum
263 |         zo.ac=ac;               % correction factor (9)
264 |         zo.sn2=sn2;              % estimated noise power
265 |         zo.pb=pb;               % smoothed noisy speech power (20)
266 |         zo.pb2=pb2;             % smoothed periodogram squared (21)
267 |         zo.pminu=pminu;
268 |         zo.actmin=actmin;   % Running minimum estimate
269 |         zo.actminsub=actminsub;           % sub-window minimum estimate
270 |         zo.subwc=subwc;                   % sub-window frame counter
271 |         zo.actbuf=actbuf;  % buffer to store subwindow minima
272 |         zo.ibuf=ibuf;
273 |         zo.lminflag=lminflag;      % flag to remember local minimum
274 |         zo.tinc=tinc;     % frame increment (NOTE(review): original comment said "must be the last one", but zo.qq follows)
275 |         zo.qq=qq;         % parameter structure
276 |     end
277 |     if ~nargout     % no output arguments requested: plot input and noise estimate instead
278 |         clf;
279 |         subplot(212);   % lower plot: per-frame energy summed over frequency
280 |         plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)]))
281 |         ylabel('Frame Energy (dB)');
282 |         xlabel(sprintf('Time (s)   [%d ms frame incr]',round(tinc*1000)));
283 |         axisenlarge([-1 -1.05]);   % not defined in this file - presumably the VOICEBOX helper; must be on the path
284 |         legend('input','noise','Location','Best');
285 |         subplot(211);   % upper plot: per-bin power averaged over frames
286 |         plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr]))
287 |         ylabel('Power (dB)');
288 |         xlabel('Frequency bin');
289 |         axisenlarge([-1 -1.05]);
290 |         legend('input','noise','Location','Best');
291 |     end
292 | end
293 | 
294 | function [m,h,d]=mhvals(d)
295 | %MHVALS look up the minimum-statistics constants M(D) and H(D)
296 | %
297 | %  [m,h]=mhvals(d)  returns M and H for a scalar window length d (frames).
298 | %                   A length that matches a table row is returned directly;
299 | %                   any other length is interpolated between the two
300 | %                   neighbouring rows on a sqrt(d) scale. A finite length
301 | %                   beyond the last row yields the last row's values.
302 | %                   Lengths below the first row are not supported, because
303 | %                   there is no lower neighbour to interpolate from.
304 | %  [m,h,d]=mhvals   returns the whole table as three column vectors.
305 | %
306 | % Values are taken from Table 5 in
307 | %   R. Martin, "Bias compensation methods for minimum statistics noise power
308 | %   spectral density estimation", Signal Processing Vol 86, pp1215-1229, 2006.
309 | %
310 | % approx: plot(d.^(-0.5),[m 1-d.^(-0.5)],'x-'), plot(d.^0.5,h,'x-')
311 | 
312 | persistent tbl
313 | if isempty(tbl)
314 |     % columns: D, M(D), H(D)
315 |     tbl=[
316 |         1   0       0;
317 |         2   0.26    0.15;
318 |         5   0.48    0.48;
319 |         8   0.58    0.78;
320 |         10  0.61    0.98;
321 |         15  0.668   1.55;
322 |         20  0.705   2;
323 |         30  0.762   2.3;
324 |         40  0.8     2.52;
325 |         60  0.841   3.1;
326 |         80  0.865   3.38;
327 |         120 0.89    4.15;
328 |         140 0.9     4.35;
329 |         160 0.91    4.25;
330 |         180 0.92    3.9;
331 |         220 0.93    4.1;
332 |         260 0.935   4.7;
333 |         300 0.94    5];
334 | end
335 | 
336 | if nargin<1                         % no argument: hand back the complete table
337 |     d=tbl(:,1);
338 |     m=tbl(:,2);
339 |     h=tbl(:,3);
340 |     return
341 | end
342 | 
343 | ihi=find(d<=tbl(:,1),1);            % first row whose length is >= d
344 | if isempty(ihi)                     % d lies beyond the end of the table
345 |     ihi=size(tbl,1);
346 |     ilo=ihi;                        % both ends on the last row: the slope terms vanish
347 | else
348 |     ilo=ihi-1;                      % preceding row
349 | end
350 | 
351 | if d==tbl(ihi,1)                    % exact table entry
352 |     m=tbl(ihi,2);
353 |     h=tbl(ihi,3);
354 |     return
355 | end
356 | 
357 | % interpolate using sqrt(d); the lower abscissa always comes from row ihi-1
358 | % so that the denominators stay non-zero even when ilo==ihi
359 | rtlo=sqrt(tbl(ihi-1,1));
360 | rthi=sqrt(tbl(ihi,1));
361 | rtd=sqrt(d);
362 | dh=tbl(ilo,3)-tbl(ihi,3);
363 | dm=tbl(ilo,2)-tbl(ihi,2);
364 | h=tbl(ihi,3)+(rtd-rthi)*dh/(rtlo-rthi);
365 | m=tbl(ihi,2)+(rthi*rtlo/rtd-rtlo)*dm/(rthi-rtlo);


--------------------------------------------------------------------------------
/rVAD2.0/fxpefac.m:
--------------------------------------------------------------------------------
  1 | function [fx,tx,pv,fv]=fxpefac(s,fs,tinc,m,pp)
  2 | %FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP)
  3 | %
  4 | % Input:   s(ns)      Speech signal
  5 | %          fs         Sample frequency (Hz)
  6 | %          tinc       Time increment between frames (s) [0.01]
  7 | %                     or [start increment end]
  8 | %          m          mode
  9 | %                     'g' plot graph showing waveform and pitch
 10 | %                     'G' plot spectrogram with superimposed pitch
 11 | %                     'x' use external files for algorithm parameter
 12 | %                         initialization: fxpefac_g and fxpefac_w
 13 | %          pp         structure containing algorithm parameters
 14 | %
 15 | % Outputs: fx(nframe)     Estimated pitch (Hz)
 16 | %          tx(nframe)     Time at the centre of each frame (seconds).
 17 | %          pv(nframe)     Probability of the frame of being voiced
 18 | %          fv             structure containing feature vectors
 19 | %                           fv.vuvfea(nframe,2) = voiced/unvoiced GMM features
 20 | 
 21 | % References
 22 | %  [1]  S.Gonzalez and M. Brookes,
 23 | %       A pitch estimation filter robust to high levels of noise (PEFAC), Proc EUSIPCO,Aug 2011.
 24 | 
 25 | % Bugs/Suggestions
 26 | % (1) do long files in chunks
 27 | % (2) option of n-best DP
 28 | 
 29 | %	   Copyright (C) Sira Gonzalez and Mike Brookes 2011
 30 | %      Version: $Id: fxpefac.m 713 2011-10-16 14:45:43Z dmb $
 31 | %
 32 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 33 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 34 | %
 35 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 36 | %   This program is free software; you can redistribute it and/or modify
 37 | %   it under the terms of the GNU General Public License as published by
 38 | %   the Free Software Foundation; either version 2 of the License, or
 39 | %   (at your option) any later version.
 40 | %
 41 | %   This program is distributed in the hope that it will be useful,
 42 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 43 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 44 | %   GNU General Public License for more details.
 45 | %
 46 | %   You can obtain a copy of the GNU General Public License from
 47 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 48 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 50 | 
 51 | persistent w_u m_u v_u w_v m_v v_v dpwtdef
 52 | % initialize persistent variables
 53 | if ~numel(w_u)
 54 | 
 55 |     % voiced/unvoiced decision based on 2-element feature vector
 56 |     % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
 57 |     % (b) sum of the power in the first three peaks
 58 |     %===== VUV
 59 |     if nargin>3 && any(m=='x')
 60 |         fxpefac_g;    % read in GMM parameters
 61 |         fxpefac_w;     % read in Weights parameters
 62 |     else
 63 |         w_u=[0.2123723 0.207788 0.2701817 0.1293616 0.04741722 0.1328791 ]';
 64 | 
 65 |         m_u=[0.2220388 0.4067706 ;
 66 |             0.04567656 0.4016914 ;
 67 |             0.8415278 0.3192158 ;
 68 |             0.2194808 0.1910079 ;
 69 |             1.6347 0.5819833 ;
 70 |             1.181519 0.6996485 ];
 71 | 
 72 |         v_u=reshape([0.01413822 0.003357913 0.003357913 0.01786169 ;
 73 |             0.0009377269 0.0006220489 0.0006220489 0.03422057 ;
 74 |             0.1233703 0.004299293 0.004299293 0.007660504 ;
 75 |             0.01779449 0.002078821 0.002078821 0.001605052 ;
 76 |             1.110173 0.00718649 0.00718649 0.005734435 ;
 77 |             0.5477135 -0.00182316 -0.00182316 0.05659796 ]',[2 2 6]);
 78 | 
 79 |         w_v=[0.07758689 0.2109879 0.1856225 0.06853158 0.2701563 0.1871148 ]';
 80 | 
 81 |         m_v=[1.208656 0.3365564 ;
 82 |             1.216643 0.5971916 ;
 83 |             4.08585 1.240948 ;
 84 |             8.322102 1.349939 ;
 85 |             1.734108 1.168643 ;
 86 |             0.5107205 0.940308 ];
 87 | 
 88 |         v_v=reshape([0.06181574 0.002950501 0.002950501 0.004528442 ;
 89 |             0.2946077 0.01433284 0.01433284 0.02684239 ;
 90 |             2.508473 -0.03310555 -0.03310555 0.1098579 ;
 91 |             14.17252 -0.09009174 -0.09009174 0.07989255 ;
 92 |             0.5834894 -0.07854027 -0.07854027 0.1108958 ;
 93 |             0.05978017 0.005528601 0.005528601 0.1309329 ]',[2 2 6]);
 94 |     end
 95 |     %===== PDP
 96 |     %     dfm = -0.4238; % df mean
 97 |     %     dfv = 3.8968; % df variance (although treated as std dev here)
 98 |     %     delta = 0.15;
 99 |     %     dflpso=[dfm 0.5/(log(10)*dfv^2) -log(2*delta/(dfv*sqrt(2*pi)))/log(10)]; % scale factor & offset for df pdf
100 |     %     dpwtdef=[1.0000, 0.8250, 1.3064, 1.9863]; % default DP weights
101 |     dpwtdef=[1.0000, 0.8250, 0.01868, 0.006773, 98.9, -0.4238]; % default DP weights
102 |     %===== END
103 | 
104 | end
105 | 
106 | 
107 | % Algorithm parameter defaults
108 | 
109 | p.fstep=5;              % frequency resolution of initial spectrogram (Hz)
110 | p.fmax=4000;            % maximum frequency of initial spectrogram (Hz)
111 | p.fres = 20;            % bandwidth of initial spectrogram (Hz)
112 | p.fbanklo = 40;         % low frequency limit of log filterbank (Hz)
113 | p.mpsmooth = 201;       % width of smoothing filter for mean power
114 | % p.maxtranf = 1000;      % maximum value of tranf cost term
115 | p.shortut = 7;          % max utterance length to average power of entire utterance
116 | p.pefact = 1.5;         % shape factor in PEFAC filter
117 | p.numopt = 3;           % number of possible frequencies per frame
118 | p.flim = [60 400];      % range of feasible fundamental frequencies (Hz)
119 | p.w = dpwtdef;          % DP weights
120 | % p.rampk = 1.1;          % constant for relative-amplitude cost term
121 | % p.rampcz = 100;         % relative amplitude cost for missing peak
122 | p.tmf = 2;              % median frequency smoothing interval (s)
123 | p.tinc = 0.01;          % default frame increment (s)
124 | 
125 | % update parameters from pp argument
126 | 
127 | if nargin>=5 && isstruct(pp)
128 |     fnq=fieldnames(pp);
129 |     for i=1:length(fnq)
130 |         if isfield(p,fnq{i})
131 |             p.(fnq{i})=pp.(fnq{i});
132 |         end
133 |     end
134 | end
135 | 
136 | % Sort out input arguments
137 | if nargin>=3  && numel(tinc)>0
138 |     p.tinc = tinc;   % 0.01 s between consecutive time frames
139 | end
140 | if nargin<4
141 |     m='';
142 | end
143 | 
144 | % Spectrogram of the mixture
145 | fmin = 0; fstep = p.fstep; fmax = p.fmax;
146 | fres = p.fres;  % Frequency resolution (Hz)
147 | [tx,f,MIX]=spgrambw(s,fs,fres,[fmin fstep fmax],[],p.tinc);
148 | nframes=length(tx);
149 | txinc=tx(2)-tx(1);  % actual frame increment
150 | %  ==== we could combine spgrambw and filtbankm into a single call to spgrambw or use fft directly ====
151 | % Log-frequency scale
152 | [trans,cf]=filtbankm(length(f),2*length(f)-1,2*f(end),p.fbanklo,f(end),'usl');
153 | O = MIX*trans'; % Original spectrum in Log-frequency scale
154 | 
155 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
156 | % Amplitude Compression
157 | 
158 | % Calculate alpha based on LTASS ratios
159 | ltass = stdspectrum(6,'p',cf);
160 | auxf = [cf(1),(cf(1:end-1)+cf(2:end))./2,cf(end)];
161 | ltass = ltass.*diff(auxf);                  % weight by bin width
162 | 
163 | % estimated ltass
164 | O = O.*repmat(diff(auxf),nframes,1);     % weight spectrum by bin width
165 | 
166 | if tx(end)<p.shortut                        % if it is a short utterance
167 |     eltass = mean(O,1);                     % mean power per each frequency band
168 |     eltass = smooth(eltass,p.mpsmooth);     % smooth in log frequency
169 |     eltass= eltass(:).';                    % force a row vector
170 | 
171 |     % Same mean power per frame as ltass
172 |     cte = mean(ltass)/mean(eltass);
173 |     eltass = eltass.*cte;                   % normalize to have the same mean as LTASS
174 |     O = O.*cte;
175 | 
176 |     % Linear AC
177 |     alpha = (ltass)./(eltass);
178 |     alpha = alpha(:).';
179 |     alpha = repmat(alpha,nframes,1);
180 |     O = O.*alpha;                           % force O to have an average LTASS spectrum
181 |     % ==== should perhaps exclude the silent portions ***
182 | else                                        % long utterance
183 |     tsmo = 2; % time smoothing over 1 sec
184 |     stt = round(tsmo/txinc);
185 |     filttime = [ones(stt,1); zeros(stt-1,1)];
186 |     filtfreq = ones(1,p.mpsmooth);
187 |     eltass = imfilter(O,filttime);
188 |     eltass = imfilter(eltass,filtfreq);     % filter in time and log frequency
189 | 
190 |     % Same mean power per frame than ltass
191 |     cte = repmat(mean(ltass),nframes,1)./mean(eltass,2);
192 |     eltass = eltass.*repmat(cte,1,length(cf));
193 |     O = O.*repmat(cte,1,length(cf));
194 | 
195 |     % Linear AC
196 |     alpha = repmat(ltass,nframes,1)./(eltass);
197 |     O = O.*alpha;
198 | 
199 | end
200 | 
201 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
202 | % Create the filter to detect the harmonics
203 | ini = find(cf>2*cf(1));
204 | sca = cf/cf(ini(1)); % normalize bin frequencies to start at approximately 0.5
205 | sca = sca(sca<10.5 & sca>0.5);  % restrict to 0.5 - 10.5 times fundamental
206 | filh = -log10(p.pefact-cos(2*pi*sca));
207 | filh = filh-mean(filh);  % force filter to be zero mean
208 | posit = find(sca>=1);  % ==== this should just equal ini(1) ====
209 | if ~mod(length(posit),2)
210 |     filh = [filh 0];  % force to be an odd length after central tap
211 | end
212 | negat = find(sca<1);
213 | numz = length(posit)-1-length(negat);
214 | filh = filh./max(filh);
215 | filh = [zeros(1,numz) filh];
216 | 
217 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
218 | % Filter the log-frequency scaled spectrogram
219 | B = imfilter(O,filh);  % ==== no good reason to use imfilter here ====
220 | 
221 | % Feasible frequency range
222 | numopt = p.numopt; % Number of possible fundamental frequencies per frame
223 | flim = p.flim;
224 | pfreq = find(cf>flim(1) & cf<flim(2));
225 | ff = zeros(nframes,numopt);
226 | amp = zeros(nframes,numopt);
227 | for i=1:nframes
228 |     [pos,peak]=findpeaks(B(i,pfreq),[],5/(cf(pfreq(2))-cf(pfreq(1)))); % ==== calculate some out of loop ====
229 |     if numel(pos)
230 |         [peak,ind]=sort(peak,'descend');
231 |         pos = pos(ind);                     % indices of peaks in the B array
232 |         posff = cf(pfreq(pos));             % frequencies of peaks
233 |         fin = min(numopt,length(posff));
234 |         ff(i,1:fin)=posff(1:fin);           % save both frequency and amplitudes
235 |         amp(i,1:fin)=peak(1:fin);
236 |         %     else
237 |         %         ff(i,:)=0;          % ==== unnecessary since they start as zeros ====
238 |         %         amp(i,:)=0;
239 |     end
240 | end
241 | 
242 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
243 | % Probabilitly of the frame of being voiced
244 | 
245 | % voiced/unvoiced decision based on 2-element feature vector
246 | % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
247 | % (b) sum of the power in the first three peaks
248 | 
249 | pow = mean(O,2)*1e-6;
250 | vuvfea = [pow sum(amp,2)./(pow*1e9)];
251 | 
252 | pru=gaussmixp(vuvfea,m_u,v_u,w_u);  % Probability of being unvoiced
253 | prv=gaussmixp(vuvfea,m_v,v_v,w_v);  % Probability of being voiced
254 | 
255 | % pru = exp(pru);
256 | % prv=exp(prv);    % Linear probability
257 | % pv = prv./(prv+pru); % ==== better to write pv=(1+exp(pru-prv)).^(-1) ====
258 | pv=(1+exp(pru-prv)).^(-1);
259 | 
260 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
261 | % Dynamic programming
262 | 
263 | % w(1): relative amp, voiced local cost
264 | % w(2): median pitch deviation cost
265 | % w(3): df cost weight
266 | % w(4): max df cost
267 | % w(5): relative amp cost for missing peaks (very high)
268 | % w(6): df mean
269 | 
270 | w = p.w;
271 | 
272 | % Relative amplitude 
273 | camp = -amp./repmat(max(amp,[],2),1,numopt);  % relative amplitude used as cost
274 | camp(amp==0)=w(5); % If no frequency found
275 | 
276 | % Time interval for the median frequency
277 | tmf = p.tmf; % in sec
278 | inmf = round(tmf/txinc);
279 | 
280 | %--------------------------------------------------------------------------
281 | % FORWARDS
282 | % Initialize values
283 | cost = zeros(nframes,numopt);
284 | prev = zeros(nframes,numopt);
285 | medfx = zeros(nframes,1);
286 | dffact=2/txinc;
287 | 
288 | % First time frame
289 | % cost(1,:) = w(1)*ramp(1,:);
290 | cost(1,:) = w(1)*camp(1,:);  % only one cost term for first frame
291 | fpos = ff(1:min(inmf,end),1);
292 | mf=median(fpos(pv(1:min(inmf,end))>0.6));   % calculate median frequency of first 2 seconds
293 | if isnan(mf)
294 |     mf=median(fpos(pv(1:min(inmf,end))>0.5));
295 |     if isnan(mf)
296 |         mf=median(fpos(pv(1:min(inmf,end))>0.4));
297 |         if isnan(mf)
298 |             mf=median(fpos(pv(1:min(inmf,end))>0.3)); % ==== clumsy way of ensuring that we take the best frames ====
299 |             if isnan(mf)
300 |                 mf=0;
301 |             end
302 |         end
303 |     end
304 | end
305 | medfx(1)=mf;
306 | 
307 | for i=2:nframes              % main dynamic programming loop
308 |     if i>inmf
309 |         fpos = ff(i-inmf:i,1);  % fpos is the highest peak in each frame
310 |         mf=median(fpos(pv(1:inmf)>0.6));  % find median frequency over past 2 seconds
311 |         if isnan(mf)
312 |             mf=median(fpos(pv(1:inmf)>0.5));
313 |             if isnan(mf)
314 |                 mf=median(fpos(pv(1:inmf)>0.4));
315 |                 if isnan(mf)
316 |                     mf=median(fpos(pv(1:inmf)>0.3));% ==== clumsy way of ensuring that we take the best frames ====
317 |                     if isnan(mf)
318 |                         mf=0;
319 |                     end
320 |                 end
321 |             end
322 |         end
323 |     end
324 |     medfx(i)=mf;
325 |     % Frequency difference between candidates and cost
326 |     df = dffact*(repmat(ff(i,:).',1,numopt) - repmat(ff(i-1,:),numopt,1))./(repmat(ff(i,:).',1,numopt) + repmat(ff(i-1,:),numopt,1));
327 |     costdf=w(3)*min((df-w(6)).^2,w(4));
328 | 
329 |     % Cost related to the median pitch
330 |     if mf==0                                   % this test was inverted in the original version
331 |         costf = zeros(1,numopt);
332 |     else
333 |         costf = abs(ff(i,:) - mf)./mf;
334 |     end
335 |     [cost(i,:),prev(i,:)]=min(costdf + repmat(cost(i-1,:),numopt,1),[],2); % ==== should we allow the possibility of skipping frames ? ====
336 |     cost(i,:)=cost(i,:)+w(2)*costf + w(1)*camp(i,:);  % add on costs that are independent of previous path
337 | 
338 | end
339 | 
340 | % Traceback
341 | 
342 | fx=zeros(nframes,1);
343 | best = zeros(nframes,1);
344 | 
345 | nose=find(cost(end,:)==min(cost(end,:))); % ==== bad method (dangerous) ===
346 | best(end)=nose(1);
347 | % ff = [ff zeros(nframes,1)];  % not clear why this was here
348 | fx(end)=ff(end,best(end));
349 | for i=nframes:-1:2
350 |     best(i-1)=prev(i,best(i));
351 |     fx(i-1)=ff(i-1,best(i-1));
352 | end
353 | 
354 | if nargout>=4
355 |     fv.vuvfea=vuvfea;  % voiced-unvoiced features
356 |     fv.best=best;  % selected path
357 |     fv.ff=ff;  % pitch candidates
358 |     fv.amp=amp;  % pitch candidate amplitudes
359 |     fv.medfx=medfx;  % median pitch
360 |     fv.w=w;  % DP weights
361 |     fv.dffact=dffact;  % df scale factor
362 | end
363 | 
364 | if ~nargout || any(m=='g') || any(m=='G')
365 |     nax=0;  % number of axes sets to link
366 |     msk=pv>0.5; % find voiced frames as a mask
367 |     fxg=fx;
368 |     fxg(~msk)=NaN; % allow only good frames
369 |     fxb=fx;
370 |     fxb(msk)=NaN; % allow only bad frames
371 |     if any(m=='G') || ~nargout && ~any(m=='g')
372 |         clf;
373 |         spgrambw(s,fs,'ilcwpf'); % draw spectrogram with log axes
374 |         hold on
375 |         plot(tx,log10(fxg),'-b',tx,log10(fxb),'-r'); % fx track
376 |         yy=get(gca,'ylim');
377 |         plot(tx,yy(1)+yy*[-1;1]*(0.02+0.05*pv),'-k'); % P(V) track
378 |         hold off
379 |         nax=nax+1;
380 |         axh(nax)=gca;
381 |         if any(m=='g')
382 |             figure;   % need a new figure if plotting two graphs
383 |         end
384 |     end
385 |     if any(m=='g')
386 |         ns=length(s);
387 |         [tsr,ix]=sort([(1:ns)/fs 0.5*(tx(1:end-1)+tx(2:end))']); % intermingle speech and frame boundaries
388 |         jx(ix)=1:length(ix); % create inverse index
389 |         sp2fr=jx(1:ns)-(0:ns-1);  % speech sample to frame number
390 |         spmsk=msk(sp2fr);   % speech sample voiced mask
391 |         sg=s;
392 |         sg(~spmsk)=NaN;   % good speech samples only
393 |         sb=s;
394 |         sb(spmsk)=NaN;    % bad speech samples only
395 |         clf;
396 |         subplot(5,1,1);
397 |         plot(tx,pv,'-b',(1:ns)/fs,0.5*mod(cumsum(fx(sp2fr)/fs),1)-0.6,'-b');
398 |         nax=nax+1;
399 |         axh(nax)=gca;
400 |         ylabel('\phi(t), P(V)');
401 |         set(gca,'ylim',[-0.65 1.05]);
402 |         subplot(5,1,2:3);
403 |         plot((1:ns)/fs,sg,'-b',(1:ns)/fs,sb,'-r');
404 |         nax=nax+1;
405 |         axh(nax)=gca;
406 |         subplot(5,1,4:5);
407 |         plot(tx,fxg,'-b',tx,fxb,'-r');
408 |         ylabel('Pitch (Hz)');
409 |         %         semilogy(tx,fxg,'-b',tx,fxb,'-r');
410 |         %         ylabel(['Pitch (' yticksi 'Hz)']);
411 |         set(gca,'ylim',[min(fxg)-30 max(fxg)+30]);
412 |         nax=nax+1;
413 |         axh(nax)=gca;
414 |     end
415 |     if nax>1
416 |         linkaxes(axh,'x');
417 |     end
418 | end
419 | 
function y=smooth(x,n)
%SMOOTH  Centred moving average whose window shrinks at both ends of the data.
%
%  Inputs:   x   vector to be smoothed (row or column; processed as a row)
%            n   window length in samples. A centred window needs an odd
%                length, so an even value is reduced by one; a value larger
%                than length(x) is clipped to the largest odd number that
%                does not exceed length(x).
%
%  Output:   y   row vector with the same number of elements as x.
%                Interior elements are the mean of the n samples centred on
%                them. Within (n-1)/2 samples of either end the window is
%                shortened symmetrically (1,3,5,... samples wide) so that
%                it never extends beyond the data.
%
%  For a row vector x and an odd n<=length(x) the result is identical to
%  that of the earlier implementation, which failed for all other inputs
%  (index out of range for n>length(x), wrong output length for even n).

x=x(:).';                           % force a row so the concatenation below is valid
nx=length(x);
if nx==0                            % nothing to smooth
    y=x;
    return
end
n=min(max(floor(n),1),nx);          % window length must lie in the range 1..nx
n=n-1+mod(n,2);                     % make it odd (an even length is reduced by one)
c=cumsum(x);                        % running sum: every window sum is a difference of two entries
head=c(1:2:n)./(1:2:n);                         % leading edge: windows of 1,3,...,n samples
body=(c(n+1:end)-c(1:end-n))/n;                 % interior: full n-sample windows
tail=(c(end)-c(end-n+2:2:end-1))./(n-2:-2:1);   % trailing edge: windows of n-2,...,3,1 samples
y=[head body tail];
424 | 


--------------------------------------------------------------------------------
/rVAD2.0/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/rVAD2.0/estnoisem_noiseseg.m:
--------------------------------------------------------------------------------
  1 | function [x,zo,xs]=estnoisem_noiseseg(yf,noise_seg,pv01,tz,pp)
  2 | %ESTNOISEM_NOISESEG - estimate noise spectrum using minimum statistics,
  3 | %                     holding the smoothed power on frames flagged in noise_seg
  4 | %
  5 | % Usage:    ninc=round(0.016*fs);   % frame increment [fs=sample frequency]
  6 | %           ovf=2;                  % overlap factor
  7 | %           f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2);
  8 | %           f=f.*conj(f);           % convert to power spectrum
  9 | %           x=estnoisem_noiseseg(f,noise_seg,pv01,ninc/fs); % estimate the noise power spectrum
 10 | %
 11 | % Inputs:
 12 | %   yf         input power spectra (one row per frame)
 13 | %   noise_seg  per-frame flags (at least one entry per row of yf): frames
 14 | %              equal to 1, and the 3 frames on either side of each of them,
 15 | %              do not update the smoothed power
 16 | %   pv01       per-frame flags (presumably 0/1 pitch marks - TODO confirm with
 17 | %              the caller): if the first 10 sum to 1 or more on the first call,
 18 | %              the state is seeded from the minimum over the first 50 frames
 19 | %   tz         frame increment in seconds
 20 | %              Alternatively, the input state from a previous call (see below)
 21 | %   pp         algorithm parameters [optional]
 22 | %
 23 | % Outputs:
 24 | %   x       estimated noise power spectra (one row per frame); rows before
 25 | %           the frame chosen to initialise the estimate are left as zeros
 26 | %   zo      output state
 27 | %   xs      estimated std error of x (one row per frame)
 28 | %           xs seems often to be an underestimate by a factor of 2 or 3
 29 | %
 30 | % The algorithm parameters are defined in reference [1] from which equation
 31 | % numbers are given in parentheses. They are as follows:
 32 | %
 33 | %        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
 34 | %        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
 35 | %        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
 36 | %        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
 37 | %        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
 38 | %        pp.qeqmin    % (23): minimum value of Qeq [2]
 39 | %        pp.qeqmax    % max value of Qeq per frame [14]
 40 | %        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
 41 | %        pp.td        % time to take minimum over [1.536 seconds]
 42 | %        pp.nu        % number of subwindows to use [8]
 43 | %        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
 44 | %        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
 45 | %
 46 | % Example use:      y=enframe(s,w,ni);                  % divide speech signal s(n) into
 47 | %                                                       % overlapping frames using window w(n)
 48 | %                   yf=rfft(y,nf,2);                    % take fourier transform
 49 | %                   dp=estnoisem_noiseseg(yf.*conj(yf),noise_seg,pv01,tinc); % estimate the noise
 50 | %
 51 | % The output state may be passed back in place of tz to process a signal in chunks;
 52 | % noise_seg and pv01 then describe the frames of the current chunk only:
 53 | %
 54 | %                   [dp(1:100,:),z]=estnoisem_noiseseg(yp(1:100,:),ns(1:100),pv(1:100),tinc);
 55 | %                   [dp(101:200,:),z]=estnoisem_noiseseg(yp(101:200,:),ns(101:200),pv(101:200),z);
 56 | %
 57 | % The 3-frame exclusion does not reach across chunk boundaries, the pv01
 58 | % seeding and the choice of a starting frame happen on the first call only,
 59 | % and every later call starts at its first frame, so chunked results need
 60 | % not match a single call exactly.
 61 | 
 62 | % The underlying estnoisem is intended to be a precise implementation of [1] with
 63 | % Table III replaced by the updated table 5 from [2]. Its only deliberate algorithm
 64 | % change is the introduction of a minimum value for 1/Qeq in equation (23).
 65 | % This change only affects the first few frames and improves the
 66 | % convergence of the algorithm. A minor improvement was reported in [3] but
 67 | % this has not yet been included.
 68 | %
 69 | % Refs:
 70 | %    [1] Rainer Martin.
 71 | %        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
 72 | %        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.
 73 | %    [2] Rainer Martin.
 74 | %        Bias compensation methods for minimum statistics noise power spectral density estimation
 75 | %        Signal Processing, 2006, 86, 1215-1229
 76 | %    [3] Dirk Mauler and Rainer Martin
 77 | %        Noise power spectral density estimation on highly correlated data
 78 | %        Proc IWAENC, 2006
 79 | 
 80 | %	   Copyright (C) Mike Brookes 2008
 81 | %      Version: $Id: estnoisem.m 1718 2012-03-31 16:40:41Z dmb $
 82 | %
 83 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 84 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 85 | %
 86 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 87 | %   This program is free software; you can redistribute it and/or modify
 88 | %   it under the terms of the GNU General Public License as published by
 89 | %   the Free Software Foundation; either version 2 of the License, or
 90 | %   (at your option) any later version.
 91 | %
 92 | %   This program is distributed in the hope that it will be useful,
 93 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 94 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 95 | %   GNU General Public License for more details.
 96 | %
 97 | %   You can obtain a copy of the GNU General Public License from
 98 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
 99 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
100 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
101 | %
102 | %   Modified code, Zheng-Hua Tan, 2012
103 | %
104 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
105 | 
106 | [nr,nrf]=size(yf);          % number of frames and freq bins
107 | 
108 | 
109 | x=zeros(nr,nrf);            % initialize output arrays
110 | xs=zeros(nr,nrf);           % will hold std error in the future
111 | if isempty(yf) && isstruct(tz)             % no real data
112 |     zo=tz;              % just keep the same state
113 | else
114 |     if isstruct(tz)       % take parameters from a previous call
115 |         nrcum=tz.nrcum;
116 |         p=tz.p;          % smoothed power spectrum
117 |         ac=tz.ac;               % correction factor (9)
118 |         sn2=tz.sn2;              % estimated noise power
119 |         pb=tz.pb;               % smoothed noisy speech power (20)
120 |         pb2=tz.pb2;
121 |         pminu=tz.pminu;
122 |         actmin=tz.actmin;   % Running minimum estimate
123 |         actminsub=tz.actminsub;           % sub-window minimum estimate
124 |         subwc=tz.subwc;                   % frame counter within the current sub-window
125 |         actbuf=tz.actbuf;  % buffer to store subwindow minima
126 |         ibuf=tz.ibuf;
127 |         lminflag=tz.lminflag;      % flag to remember local minimum
128 |         tinc=tz.tinc;     % frame increment
129 |         qq=tz.qq;         % parameter structure
130 |     else
131 |         tinc = tz;          % fourth argument is frame increment
132 |         nrcum=0;            % no frames so far
133 |         % default algorithm constants
134 |         
135 |         qq.taca=0.0449;    % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
136 |         qq.tamax=0.392;    % max smoothing time constant in (3) = -tinc/log(0.96)
137 |         qq.taminh=0.0133;    % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
138 |         qq.tpfall=0.064;   % time constant for P to fall (12)
139 |         qq.tbmax=0.0717;   % max smoothing time constant in (20) = -tinc/log(0.8)
140 |         qq.qeqmin=2;       % minimum value of Qeq (23)
141 |         qq.qeqmax=14;      % max value of Qeq per frame
142 |         qq.av=2.12;             % fudge factor for bc calculation (23 + 13 lines)
143 |         qq.td=1.536;       % time to take minimum over
144 |         qq.nu=8;           % number of subwindows
145 |         qq.qith=[0.03 0.05 0.06 Inf]; % Q-inverse thresholds selecting the noise slope limit
146 |         qq.nsmdb=[47 31.4 15.7 4.1];  % corresponding maximum noise slopes in dB/s
147 |         
148 |         if nargin>=5 && ~isempty(pp)   % pp is the fifth argument, so it only exists when nargin>=5
149 |             qqn=fieldnames(qq);
150 |             for i=1:length(qqn)
151 |                 if isfield(pp,qqn{i})
152 |                     qq.(qqn{i})=pp.(qqn{i});
153 |                 end
154 |             end
155 |         end
156 |     end
157 |     
158 |     % unpack parameter structure
159 |     
160 |     taca=qq.taca;    % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
161 |     tamax=qq.tamax;    % max smoothing time constant in (3) = -tinc/log(0.96)
162 |     taminh=qq.taminh;    % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
163 |     tpfall=qq.tpfall;   % time constant for P to fall (12)
164 |     tbmax=qq.tbmax;   % max smoothing time constant in (20) = -tinc/log(0.8)
165 |     qeqmin=qq.qeqmin;       % minimum value of Qeq (23)
166 |     qeqmax=qq.qeqmax;      % max value of Qeq per frame
167 |     av=qq.av;             % fudge factor for bc calculation (23 + 13 lines)
168 |     td=qq.td;       % time to take minimum over
169 |     nu=qq.nu;           % number of subwindows
170 |     qith=qq.qith;     % Q-inverse thresholds selecting the noise slope limit
171 |     nsmdb=qq.nsmdb;   % maximum permitted +ve noise slope in dB/s
172 |     
173 |     % derived algorithm constants
174 |     
175 |     aca=exp(-tinc/taca); % smoothing constant for alpha_c in equ (11) = 0.7
176 |     acmax=aca;          % min value of alpha_c = 0.7 in equ (11) also = 0.7
177 |     amax=exp(-tinc/tamax); % max smoothing constant in (3) = 0.96
178 |     aminh=exp(-tinc/taminh); % min smoothing constant (upper limit) in (3) = 0.3
179 |     bmax=exp(-tinc/tbmax); % max smoothing constant in (20) = 0.8
180 |     snrexp = -tinc/tpfall;
181 |     nv=round(td/(tinc*nu));    % length of each subwindow in frames
182 |     if nv<4            % algorithm doesn't work for minuscule frames
183 |         nv=4;
184 |         nu=max(round(td/(tinc*nv)),1);
185 |     end
186 |     nd=nu*nv;           % length of total window in frames
187 |     [md,hd]=mhvals(nd); % calculate the constants M(D) and H(D) from Table III
188 |     [mv,hv]=mhvals(nv); % calculate the constants M(D) and H(D) from Table III
189 |     nsms=10.^(nsmdb*nv*tinc/10);  % [8 4 2 1.2] in paper
190 |     qeqimax=1/qeqmin;  % maximum value of Qeq inverse (23)
191 |     qeqimin=1/qeqmax; % minimum value of Qeq per frame inverse
192 |     
193 | 
194 |     if isempty(yf)      % provide dummy initialization
195 |         ac=1;               % correction factor (9)
196 |         subwc=nv;                   % force a buffer switch on first loop
197 |         ibuf=0;
198 |         p=x;          % smoothed power spectrum
199 |         sn2=p;              % estimated noise power
200 |         pb=p;               % smoothed noisy speech power (20)
201 |         pb2=pb.^2;
202 |         pminu=p;
203 |         actmin=repmat(Inf,1,nrf);   % Running minimum estimate
204 |         actminsub=actmin;           % sub-window minimum estimate
205 |         actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
206 |         lminflag=zeros(1,nrf);      % flag to remember local minimum
207 |     else
208 |         % Frames that must not update the smoothed power: each flagged noise
209 |         % frame together with its 3 left and 3 right neighbours
210 |         xNc=find(noise_seg==1);     % flagged noise frame indexes
211 |         xNc=xNc(:);                 % column, whatever the orientation of noise_seg
212 |         dC=[xNc-3 xNc-2 xNc-1 xNc xNc+1 xNc+2 xNc+3];
213 |         SxY=unique(dC(:));          % sorted with duplicates removed
214 |         By=SxY(SxY>0);              % discard zero or negative frame indexes
215 | 
216 |         % A call continuing from saved state starts at its first frame; only the
217 |         % first call searches for a frame from which to initialise p
218 |         Cx=1;
219 |         if ~nrcum
220 |             Cx=setdiff(1:length(noise_seg),By);  % frames outside every exclusion zone (ascending)
221 |             if ~isempty(Cx)
222 |                 p=yf(Cx(1),:);      % initialise from the first such frame
223 |             else
224 |                 % no frame lies outside the exclusion zones: fall back to the unflagged frames
225 |                 Cx=find(noise_seg==0);
226 |                 if length(Cx)>3
227 |                     Cx(1:3)=[];     % skip the first 3 of them for a slightly better initial p
228 |                 end
229 |                 By=[];              % keeping the exclusion zones would discard every frame
230 |                 if isempty(Cx)      % every frame is flagged
231 |                     Cx=1;   p=yf(1,:);
232 |                     warning('File seems have only noisy segments ..  noise-power-spectra-estimation may not good!');
233 |                 else
234 |                     p=yf(Cx(1),:);  % the 4th unflagged frame when more than 3 exist
235 |                 end
236 |             end
237 | 
238 |             % the rest of the state starts from p
239 |             ac=1;               % correction factor (9)
240 |             sn2=p;              % estimated noise power
241 |             pb=p;               % smoothed noisy speech power (20)
242 |             pb2=pb.^2;
243 |             pminu=p;
244 |             actmin=repmat(Inf,1,nrf);   % Running minimum estimate
245 |             actminsub=actmin;           % sub-window minimum estimate
246 |             subwc=nv;                   % force a buffer switch on first loop
247 |             actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
248 |             ibuf=0;
249 |             lminflag=zeros(1,nrf);      % flag to remember local minimum
250 |         end
251 | 
252 |         % When pv01 is set within the first 10 frames, seed the state from the
253 |         % per-bin minimum of the first 50 frames. Done on the first call only so
254 |         % that restored state is not overwritten; pv01 may be shorter than 10.
255 |         pvlead=~nrcum && sum(pv01(1:min(10,numel(pv01))))>=1;
256 |         if pvlead
257 |             p=min(yf(1:min(50,nr),:),[],1);
258 |             sn2=p; pb=p; pb2=pb.^2; pminu=p;
259 |         end
260 | 
261 |         % loop for each frame
262 |         for t=Cx(1):nr % start from the frame that initialised p  %% we use t instead of lambda in the paper
263 |             yft=yf(t,:);        % noise speech power spectrum
264 |             acb=(1+(sum(p)./sum(yft)-1).^2).^(-1);  % alpha_c-bar(t)  (9)
265 |             ac=aca*ac+(1-aca)*max(acb,acmax);       % alpha_c(t)  (10)
266 |             ah=amax*ac.*(1+(p./sn2-1).^2).^(-1);    % alpha_hat: smoothing factor per frequency (11)
267 |             snr=sum(p)/sum(sn2);
268 |             ah=max(ah,min(aminh,snr^snrexp));       % lower limit for alpha_hat (12)
269 |             
270 |             % p, pb and pb2 are held on flagged frames, inside the exclusion zones "By" and during the seeded lead-in
271 |             if ~(noise_seg(t) || any(By==t) || (t<11 && pvlead))
272 |                 p=ah.*p+(1-ah).*yft;            % smoothed noisy speech power (3)
273 |                 b=min(ah.^2,bmax);              % smoothing constant for estimating periodogram variance (22 + 2 lines)
274 |                 pb=b.*pb + (1-b).*p;            % smoothed periodogram (20)
275 |                 pb2=b.*pb2 + (1-b).*p.^2;     	% smoothed periodogram squared (21)
276 |             end
277 |             
278 |             qeqi=max(min((pb2-pb.^2)./(2*sn2.^2),qeqimax),qeqimin/(t+nrcum));   % Qeq inverse (23)
279 |             qiav=sum(qeqi)/nrf;             % Average over all frequencies (23+12 lines) (ignore non-duplication of DC and nyquist terms)
280 |             bc=1+av*sqrt(qiav);             % bias correction factor (23+11 lines)
281 |             bmind=1+2*(nd-1)*(1-md)./(qeqi.^(-1)-2*md);      % we use the simplified form (17) instead of (15)
282 |             bminv=1+2*(nv-1)*(1-mv)./(qeqi.^(-1)-2*mv);      % same expression but for sub windows
283 |             kmod=bc*p.*bmind<actmin;        % Frequency mask for new minimum
284 |               
285 |             if any(kmod)
286 |                 actmin(kmod)=bc*p(kmod).*bmind(kmod);
287 |                 actminsub(kmod)=bc*p(kmod).*bminv(kmod);
288 |             end
289 |             if subwc>1 && subwc<nv              % middle of buffer - allow a local minimum
290 |                 lminflag=lminflag | kmod;    	% potential local minimum frequency bins
291 |                 pminu=min(actminsub,pminu);
292 |                 sn2=pminu;
293 |             else
294 |                 if subwc>=nv                    % end of buffer - do a buffer switch
295 |                     ibuf=1+rem(ibuf,nu);     	% increment actbuf storage pointer
296 |                     actbuf(ibuf,:)=actmin;    	% save sub-window minimum
297 |                     pminu=min(actbuf,[],1);
298 |                     i=find(qiav<qith);
299 |                     nsm=nsms(i(1));          	% noise slope max
300 |                     lmin=lminflag & ~kmod & actminsub<nsm*pminu & actminsub>pminu;
301 |                     if any(lmin)
302 |                         pminu(lmin)=actminsub(lmin);
303 |                         actbuf(:,lmin)=repmat(pminu(lmin),nu,1);
304 |                     end
305 |                     lminflag(:)=0;
306 |                     actmin(:)=Inf;
307 |                     subwc=0;
308 |                 end
309 |             end
310 |             subwc=subwc+1;
311 |             x(t,:)=sn2;
312 |             qisq=sqrt(qeqi);
313 |             % empirical formula for standard error based on Fig 15 of [2]
314 |             xs(t,:)=sn2.*sqrt(0.266*(nd+100*qisq).*qisq/(1+0.005*nd+6/nd)./(0.5*qeqi.^(-1)+nd-1));
315 |         end
316 |     end
317 | 
318 | 
319 |     if nargout>1    % we need to store the state for next time
320 |         zo.nrcum=nrcum+nr;      % number of frames so far
321 |         zo.p=p;          % smoothed power spectrum
322 |         zo.ac=ac;               % correction factor (9)
323 |         zo.sn2=sn2;              % estimated noise power
324 |         zo.pb=pb;               % smoothed noisy speech power (20)
325 |         zo.pb2=pb2;
326 |         zo.pminu=pminu;
327 |         zo.actmin=actmin;   % Running minimum estimate
328 |         zo.actminsub=actminsub;           % sub-window minimum estimate
329 |         zo.subwc=subwc;                   % frame counter within the current sub-window
330 |         zo.actbuf=actbuf;  % buffer to store subwindow minima
331 |         zo.ibuf=ibuf;
332 |         zo.lminflag=lminflag;      % flag to remember local minimum
333 |         zo.tinc=tinc;     % must be the last one
334 |         zo.qq=qq;
335 |     end
336 |     % with no output arguments, plot frame energies (lower axes) and mean spectra (upper axes)
337 |     if ~nargout
338 |         clf;
339 |         subplot(212);
340 |         plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)]))
341 |         ylabel('Frame Energy (dB)');
342 |         xlabel(sprintf('Time (s)   [%d ms frame incr]',round(tinc*1000)));
343 |         axisenlarge([-1 -1.05]);
344 |         legend('input','noise','Location','Best');
345 |         subplot(211);
346 |         plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr]))
347 |         ylabel('Power (dB)');
348 |         xlabel('Frequency bin');
349 |         axisenlarge([-1 -1.05]);
350 |         legend('input','noise','Location','Best');
351 |     end
352 | end
353 | 
354 | function [m,h,d]=mhvals(d)
355 | %MHVALS  look up the constants M(D) and H(D) for a minimum taken over D frames
356 | % The tabulated values are taken from Table 5 in
357 | %   R. Martin,"Bias compensation methods for minimum statistics noise power
358 | %   spectral density estimation", Signal Processing Vol 86, pp1215-1229, 2006.
359 | % Given d, values between table rows are interpolated and a d beyond the last
360 | % row takes the last row; with no argument the three table columns are returned.
361 | % approx: plot(d.^(-0.5),[m 1-d.^(-0.5)],'x-'), plot(d.^0.5,h,'x-')
362 | persistent tbl
363 | if isempty(tbl)                 % build the [D M H] table on first use
364 |     tbl=[
365 |         1,   0,     0;
366 |         2,   0.26,  0.15;
367 |         5,   0.48,  0.48;
368 |         8,   0.58,  0.78;
369 |         10,  0.61,  0.98;
370 |         15,  0.668, 1.55;
371 |         20,  0.705, 2;
372 |         30,  0.762, 2.3;
373 |         40,  0.8,   2.52;
374 |         60,  0.841, 3.1;
375 |         80,  0.865, 3.38;
376 |         120, 0.89,  4.15;
377 |         140, 0.9,   4.35;
378 |         160, 0.91,  4.25;
379 |         180, 0.92,  3.9;
380 |         220, 0.93,  4.1;
381 |         260, 0.935, 4.7;
382 |         300, 0.94,  5];
383 | end
384 | if nargin<1                     % no argument: hand back the whole table
385 |     d=tbl(:,1);
386 |     m=tbl(:,2);
387 |     h=tbl(:,3);
388 |     return
389 | end
390 | hiRow=find(d<=tbl(:,1),1);      % first row whose D is not below d
391 | if isempty(hiRow)               % d lies beyond the table
392 |     hiRow=size(tbl,1);
393 |     loRow=hiRow;
394 | else
395 |     loRow=hiRow-1;
396 | end
397 | if d==tbl(hiRow,1)              % exact table entry, nothing to interpolate
398 |     m=tbl(hiRow,2);
399 |     h=tbl(hiRow,3);
400 |     return
401 | end
402 | rootLo=sqrt(tbl(hiRow-1,1));    % interpolate using sqrt(d)
403 | rootHi=sqrt(tbl(hiRow,1));
404 | rootD=sqrt(d);
405 | h=tbl(hiRow,3)+(rootD-rootHi)*(tbl(loRow,3)-tbl(hiRow,3))/(rootLo-rootHi);
406 | m=tbl(hiRow,2)+(rootHi*rootLo/rootD-rootLo)*(tbl(loRow,2)-tbl(hiRow,2))/(rootHi-rootLo);
407 | 


--------------------------------------------------------------------------------
/rVAD2.0/spgrambw.m:
--------------------------------------------------------------------------------
  1 | function [t,f,b]=spgrambw(s,fs,varargin)
  2 | %SPGRAMBW Draw spectrogram [T,F,B]=(s,fs,mode,bw,fmax,db,tinc,ann)
  3 | %
  4 | %  Usage: spgrambw(s,fs,'pJcw')  % Plot spectrogram with my favourite set of options
  5 | %         
  6 | %         For examples of the many options available see: 
  7 | %         http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/tutorial/spgrambw/spgram_tut.pdf
  8 | %
  9 | %  Inputs:  S         speech signal, or single-sided power spectrum array, S(NT,NF), in power per Hz
 10 | %           FS        sample fequency (Hz) or [FS T1] where T1 is the time of the first sample
 11 | %                     or, if s is a matrix, [FS T1 FINC F1] where FS is the frame rate, T1 is
 12 | %                     the time of the first sample, FINC is the frequency increment and F1 the
 13 | %                     frequency of the first column.
 14 | %           MODE      optional character string specifying options (see list below)
 15 | %           BW        bandwidth resolution in Hz (DFT window length = 1.81/BW)[default: 200]
 16 | %           FMAX      frequency range [Fmin Fstep Fmax]. If Fstep is omitted
 17 | %                     it is taken to be (Fmax-Fmin)/257, if Fmin is also omitted it is taken
 18 | %                     to be 0 (or 20Hz for mode l), if all three are omitted Fmax is taken to be FS/2.
 19 | %                     If modes m, b, e or l are specified then the units are in mel, bark or erb or
 20 | %                     log10(Hz); this can be over-ridden by the 'h' option.
 21 | %           DB        either dB-range or [dB-min dB-max] [default: 40]
 22 | %           TINC      output frame increment in seconds [0 or missing uses default=0.45/BW]
 23 | %                     or [TFIRST TLAST] or [TFIRST TINC TLAST] where TFIRST/TLAST are the times
 24 | %                     of first/last frames
 25 | %           ANN       annotation cell array: each row contains either
 26 | %                     {time 'text-string' 'font'} or {[t_start t_end] 'text-string' 'font'} where
 27 | %                     the time value is in seconds with s(n) at time offset+n/fs. The font column can
 28 | %                     omitted in which case the system font will be used. MATLAB cannot cope with
 29 | %                     unicode so I recommend the SILDoulosIPA (serifed) or SILSophiaIPA (sans) fonts
 30 | %                     for phonetic symbols; these are now a little hard to find.
 31 | %
 32 | % Outputs:  T(NT)        time axis values (in seconds). Input sample s(n) is at time offset+n/fs.
 33 | %           F(NF)        frequency axis values in Hz or, unless mode=H, other selected frequency units
 34 | %                        according to mode: m=mel, l=log10(Hz), b=bark,e=erb-rate
 35 | %           B(NT,NF)     spectrogram values in power (or clipped dB values if 'd' option given)
 36 | %
 37 | % MODE:  'p' = output power per decade rather than power per Hz [preemphasis]
 38 | %        'P' = output power per mel/bark/erb according to y axis scaling
 39 | %        'd' = output B array is in dB rather than power
 40 | %        'D' = clip the output B array to the limits specified by the "db" input
 41 | %
 42 | %        'm' = mel scale
 43 | %        'b' = bark scale
 44 | %        'e' = erb scale
 45 | %        'l' = log10 Hz frequency scale
 46 | %        'f' = label frequency axis in Hz rather than mel/bark/... 
 47 | %
 48 | %        'h' = units of the FMAX input are in Hz instead of mel/bark
 49 | %              [in this case, the Fstep parameter is used only to determine
 50 | %               the number of filters]
 51 | %        'H' = express the F output in Hz instead of mel/bark/...
 52 | %
 53 | %        'g' = draw a graph even if output arguments are present
 54 | %        'j' = jet colourmap
 55 | %        'J' = "thermal" colourmap that is linear in grayscale. Based on Oliver Woodford's
 56 | %                 real2rgb at http://www.mathworks.com/matlabcentral/fileexchange/23342
 57 | %        'i' = inverted colourmap (white background)
 58 | %        'c' = include a colourbar as an intensity scale
 59 | %        'w' = draw the speech waveform above the spectrogram
 60 | %        'a' = centre-align annotations rather than left-aligning them
 61 | %        't' = add time markers with annotations
 62 | %
 63 | % The BW input gives the 6dB bandwidth of the Hamming window used in the analysis.
 64 | % Equal amplitude frequency components are guaranteed to give separate peaks if they
 65 | % are this far apart. This value also determines the time resolution: the window length is
 66 | % 1.81/BW and the low-pass filter applied to amplitude modulations has a 6-dB bandwidth of
 67 | % BW/2 Hz.
 68 | %
 69 | % The units are power per Hz unless the u
 70 | % option is given in which case power per displayed unit is used
 71 | % or power per decade for the l option.
 72 | 
 73 | %%%% BUGS %%%%%%
 74 | % * allow ANN rows to be a mixture of intervals and instants
 75 | % * allow multiple ANN rows
 76 | % * Do not use triangular interpolation if the output frequencies are the same as an FFT
 77 | % * Place as many subticks as will fit beyond the last tick with the 'f' option
 78 | % * Use a special subtick pattern between ticks that are powers of 10 using the 'f' option
 79 | % * Future options:
 80 | %       ['q' = constant q transform]
 81 | %       ['k' = add a piano keyboard to the frequency scale]
 82 | %       ['z' = use a bipolar colourmap for a matrix input with negative values]
 83 | 
 84 | %      Copyright (C) Mike Brookes 1997-2011
 85 | %      Version: $Id: spgrambw.m 713 2011-10-16 14:45:43Z dmb $
 86 | %
 87 | %   VOICEBOX is a MATLAB toolbox for speech processing.
 88 | %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
 89 | %
 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 91 | %   This program is free software; you can redistribute it and/or modify
 92 | %   it under the terms of the GNU General Public License as published by
 93 | %   the Free Software Foundation; either version 2 of the License, or
 94 | %   (at your option) any later version.
 95 | %
 96 | %   This program is distributed in the hope that it will be useful,
 97 | %   but WITHOUT ANY WARRANTY; without even the implied warranty of
 98 | %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 99 | %   GNU General Public License for more details.
100 | %
101 | %   You can obtain a copy of the GNU General Public License from
102 | %   http://www.gnu.org/copyleft/gpl.html or by writing to
103 | %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
104 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
105 | persistent tcmap
106 | if isempty(tcmap)
107 |     % modified thermal with better grayscale linearity
108 |     tcmap=[ 0 0 0; 7 0 17; 14 0 33; 21 0 50; 29 0 67; 36 0 84; 43 0 100; 50 0 117;
109 |         57 0 134; 64 0 150; 72 0 167; 80 3 164; 89 7 156; 97 11 149; 106 15 142; 114 19 134;
110 |         123 23 127; 131 27 119; 140 31 112; 149 35 105; 157 39 97; 166 43 90; 174 47 82;
111 |         183 51 75; 192 55 68; 200 59 60; 209 63 53; 217 67 45; 226 71 38; 234 75 31;
112 |         243 79 23; 252 83 16; 255 88 12; 255 95 12; 255 102 11; 255 109 11; 255 116 10;
113 |         255 123 10; 255 130 9; 255 137 9; 255 144 8; 255 151 8; 255 158 7; 255 165 7;
114 |         255 172 6; 255 179 6; 255 186 5; 255 193 4; 255 200 4; 255 207 3; 255 214 3; 255 221 2;
115 |         255 228 2; 255 235 1; 255 242 1; 255 249 0; 255 252 22; 255 252 55; 255 253 88;
116 |         255 253 122; 255 254 155; 255 254 188; 255 255 222; 255 255 255]/255;
117 | end
118 | if nargin<2
119 |     error('Usage: SPGRAMBW(s,fs,mode,bw,fmax,db,tinc)');
120 | end
121 | %SPGRAMBW Draw grey-scale spectrogram [T,F,B]=(s,fs,mode,bw,fmax,db,tinc)
122 | %
123 | % first decode the input arguments
124 | %
125 | if size(s,1)==1
126 |     s=s(:);   % force to be a column vector (unless it is a matrix)
127 | end
128 | [ns1,ns2]=size(s);
129 | ap=zeros(1,6);
130 | j=2;
131 | if numel(fs)<2
132 |     fs(2)=1/fs(1);  % first sample or frame is at time 1/fs
133 | end
134 | for i=1:length(varargin)
135 |     if ischar(varargin{i})
136 |         ap(1)=i;
137 |     else
138 |         ap(j)=i;
139 |         j=j+1;
140 |     end
141 | end
142 | if ap(1) && ~isempty(varargin{ap(1)})
143 |     mode=varargin{ap(1)};
144 | else
145 |     mode='';  % default mode
146 | end
147 | if ap(2) && ~isempty(varargin{ap(2)})
148 |     bw=varargin{ap(2)};
149 | else
150 |     bw=200;
151 | end
152 | if ap(3) && ~isempty(varargin{ap(3)})
153 |     fmax=varargin{ap(3)};
154 | else
155 |     fmax=[];
156 | end
157 | if ap(4) && ~isempty(varargin{ap(4)})
158 |     db=varargin{ap(4)};
159 | else
160 |     db=40;
161 | end
162 | if ap(5) && ~isempty(varargin{ap(5)})
163 |     tinc=varargin{ap(5)};
164 | else
165 |     tinc=0;
166 | end
167 | switch numel(tinc)
168 |     case 1
169 |         tinc=[tinc -Inf Inf];
170 |     case 2
171 |         tinc=[0 tinc];
172 |     otherwise
173 |         tinc=tinc([2 1 3]);
174 | end
175 | if tinc(1)<=0
176 |     tinc(1)=1.81/(4*bw); % default frame increment
177 | end
178 | if ap(6)
179 |     ann=varargin{ap(6)};
180 | else
181 |     ann=[];
182 | end
183 | 
184 | % now sort out the mode flags
185 | 
186 | mdsw='  ';           % [yscale preemph]
187 | for i=1:length(mode)
188 |     switch mode(i)
189 |         case {'l','m','b','e'}
190 |             mdsw(1)=mode(i);
191 |         case {'p','P'}
192 |             mdsw(2)=mode(i);
193 |     end
194 | end
195 | if mdsw(2)=='P'
196 |     mdsw(2)=mdsw(1);        % preemphasis is scaling dependent
197 | end
198 | %
199 | % sort out the frequency axis
200 | %
201 | flmin=30;                   % min frequency for 'l' option
202 | nfrq=257;                   % default number of frequency bins
203 | if ns2==1
204 |     fnyq=fs(1)/2;           % default upper frequency limit is fs/2
205 | else                        % input is a power spectrum
206 |     if numel(fs)<3
207 |         fs(3)=fs(1)*0.25;   % default increment is 0.25 times frame increment
208 |     end
209 |     if numel(fs)<4
210 |         fs(4)=0;            % first freq bin is DC by default
211 |     end
212 |     fnyq=fs(4)+(ns2-1)*fs(3);  % default upper frequency limit is highest supplied frequency
213 | end
214 | 
215 | if ~numel(fmax)             % no explicit frequency range
216 |     switch mdsw(1)
217 |         case 'l'
218 |             fx=linspace(log10(flmin),log10(fnyq),nfrq);   % 20  Hz to Nyquist
219 |         case 'm'
220 |             fx=linspace(0,frq2mel(fnyq),nfrq);   % DC to Nyquist
221 |         case 'b'
222 |             fx=linspace(0,frq2bark(fnyq),nfrq);   % DC to Nyquist
223 |         case 'e'
224 |             fx=linspace(0,frq2erb(fnyq),nfrq);   % DC to Nyquist
225 |         otherwise   % linear Hz scale
226 |             fx=(0:nfrq-1)*fnyq/(nfrq-1);
227 |     end
228 | else
229 |     if any(mode=='h')
230 |         switch mdsw(1)
231 |             case 'l'
232 |                 fmaxu=log10(fmax);   % 20  Hz to Nyquist
233 |             case 'm'
234 |                 fmaxu=frq2mel(fmax);   % DC to Nyquist
235 |             case 'b'
236 |                 fmaxu=frq2bark(fmax);   % DC to Nyquist
237 |             case 'e'
238 |                 fmaxu=frq2erb(fmax);   % DC to Nyquist
239 |             otherwise
240 |                 fmaxu=fmax;  % linear Hz scale
241 |         end
242 |     else
243 |         fmaxu=fmax;                 % already in the correct units
244 |     end
245 |     if numel(fmax)<2   % only max value specified
246 |         if mdsw(1)=='l'
247 |             fx=linspace(log10(flmin),fmaxu,nfrq);   % 20  Hz to fmax
248 |         else
249 |             fx=linspace(0,fmaxu,nfrq);   % DC to fmax
250 |         end
251 |     elseif numel(fmax)<3 % min and max values specified
252 |         fx=linspace(fmaxu(1),fmaxu(2),nfrq);   % fmin to fmax
253 |     else
254 |         fmaxu(2)=fmax(2)*(fmaxu(3)-fmaxu(1))/(fmax(3)-fmax(1)); % scale the step size appropriately
255 |         fx=fmaxu(1):fmaxu(2):fmaxu(3);   % fmin to fmax in steps of finc
256 |         nfrq=length(fx);
257 |     end
258 | end
259 | switch mdsw(1)          % convert the frequency range to Hz
260 |     case 'l'
261 |         f=10.^fx;
262 |         frlab='log_{10}Hz';
263 |         frlabf='log';
264 |         frq2y=@log10;
265 |         y2frq=@(x) 10.^x;
266 |     case 'm'
267 |         f=mel2frq(fx);
268 |         frlab='Mel';
269 |         frlabf='Mel';
270 |         frq2y=@frq2mel;
271 |         y2frq=@mel2frq;
272 |     case 'b'
273 |         f=bark2frq(fx);
274 |         frlab='Bark';
275 |         frlabf='Bark';
276 |                 frq2y=@frq2bark;
277 |         y2frq=@bark2frq;
278 |     case 'e'
279 |         f=erb2frq(fx);
280 |         frlab='Erb-rate';
281 |         frlabf='Erb';
282 |         frq2y=@frq2erb;
283 |         y2frq=@erb2frq;
284 |     otherwise
285 |         f=fx;
286 |         frlab='Hz';
287 |                 frq2y=@(x) x;
288 |         y2frq=@(x) x;
289 | end
290 | if ~any(mode=='H')
291 |     f=fx;               % give output frequencies in native units instead of Hz unless 'H' is specified
292 | end
293 | %
294 | % now calculate the spectrogram
295 | %
296 | if ns2==1   % input is a speech signal vector
297 |     winlen = fix(1.81*fs(1)/bw);   % window length
298 |     win=0.54+0.46*cos((1-winlen:2:winlen)*pi/winlen);  % Hamming window
299 |     ninc=max(round(tinc(1)*fs(1)),1);                 % window increment in samples
300 |     %  we need to take account of minimum freq increment + make it exact if possible
301 |     fftlen=pow2(nextpow2(4*winlen));        % enough oversampling to get good interpolation
302 |     win=win/sqrt(sum(win.^2));              % ensure window squared sums to unity
303 |     ix1=max(round((tinc(2)-fs(2))*fs(1)-(winlen-3)/2),1); % first sample required
304 |     ix2=min(ceil((tinc(3)-fs(2))*fs(1)+(winlen+1)/2),ns1); % last sample required
305 |     [sf,t]=enframe(s(ix1:ix2),win,ninc);
306 |     t=fs(2)+(t+ix1-2)/fs(1);                         % time axis
307 |     b=rfft(sf,fftlen,2);
308 |     b=b.*conj(b)*2/fs(1);          % Power per Hz
309 |     b(:,1)=b(:,1)*0.5;   % correct for no negative zero frequency to double the power
310 |     b(:,end)=b(:,end)*0.5;   % correct for no negative nyquist frequency to double the power
311 |     fb=(0:fftlen/2)*fs(1)/fftlen; % fft bin frequencies
312 |     fftfs=fs(1);
313 | else
314 | 
315 |     b=s;
316 |     t=fs(2)+(0:ns1-1)/fs(1);  % frame times
317 |     fb=fs(4)+(0:ns2-1)*fs(3);
318 |     fftlen=[ns2 fs(3) fs(4)]; % for filtbankm: ns2=# input freq bins, freq increment (Hz), first bin freq (Hz)
319 |     fftfs=0;
320 |     %     fftlen=2*(ns2-1);  % assume an even length fft
321 |     %     fftfs=fftlen*fs(3);
322 | end
323 | nfr=numel(t);                   % number of frames
324 | dblab='Power/Hz';
325 | switch mdsw(2)
326 |     case {'p','l'}
327 |         b=b.*repmat(fb*log(10),nfr,1);       % convert to power per decade
328 |         dblab='Power/Decade';
329 |     case 'm'
330 |         b=b.*repmat((1+fb/700)*log(1+1000/700)/1000,nfr,1);       % convert to power per mel
331 |         dblab='Power/Mel';
332 |     case 'b'
333 |         b=b.*repmat((1960+fb).^2/52547.6,nfr,1);       % convert to power per bark
334 |         dblab='Power/Bark';
335 |     case 'e'
336 |         b=b.*repmat(6.23*fb.^2 + 93.39*fb + 28.52,nfr,1);       % convert to power per erb
337 |         dblab='Power/Erb-rate';
338 | end
339 | %
340 | % Now map onto the desired frequency scale
341 | %
342 | b=b*filtbankm(nfrq,fftlen,fftfs,fx(1),fx(end),['cush' mdsw(1)])';
343 | 
344 | if ~nargout || any(mode=='g') ||  any(mode=='d')
345 |     if numel(db)<2          % find clipping limits
346 |         plim=max(b(:))*[0.1^(0.1*db) 1];
347 |     else
348 |         plim=10.^(0.1*db(1:2));
349 |     end
350 |     if plim(2)<=0
351 |         plim(2)=1;
352 |     end
353 |     if plim(1)<=0 || plim(1)==plim(2)
354 |         plim(1)=0.1*plim(2);
355 |     end
356 |     if ~nargout || any(mode=='g')
357 |         bd=10*log10(b);  % save an unclipped log version for plotting
358 |     end
359 |     if any(mode=='D')
360 |         b=min(max(b,plim(1)),plim(2)); % clip the output
361 |     end
362 |     if any(mode=='d')
363 |         b=10*log10(b);    % output the dB version
364 |     end
365 | end
366 | % now plot things
367 | if ~nargout || any(mode=='g')
368 |     cla;  % clear current axis
369 |     imagesc(t,fx,bd');
370 |     axis('xy');
371 |     set(gca,'tickdir','out','clim',10*log10(plim));
372 |     if any(mode=='j')
373 |         colormap('jet');
374 |         map=colormap;
375 |     elseif any(mode=='J')
376 |         map=tcmap;
377 |     else
378 |         map = repmat((0:63)'/63,1,3);
379 |     end
380 |     if any(mode=='i')               % 'i' option = invert the colourmap
381 |         map=map(64:-1:1,:);
382 |     end
383 |     colormap(map);
384 |     if any(mode=='c')                % 'c' option = show a colourbar
385 |         colorbar;
386 |         cblabel([dblab ' (dB)']);
387 |     end
388 |     %
389 |     % Now check if annotations or a waveform are required
390 |     %
391 |     dotaw=[((any(mode=='t') && size(ann,2)>1) || size(ann,2)==1) size(ann,2)>1 (any(mode=='w') && ns2==1)];
392 |         ylim=get(gca,'ylim');
393 |     if  any(dotaw)
394 |         yrange = ylim(2)-ylim(1);
395 |         zlim=ylim;
396 |         toptaw=cumsum([0 dotaw.*[0.05 0.05 0.1]]*yrange)+ylim(2);
397 |         zlim(2)=toptaw(4);
398 |         set(gca,'ylim',zlim,'color',map(1,:));
399 |         if dotaw(3)        % Plot the waveform
400 |             smax=max(s(:));
401 |             smin=min(s(:));
402 |             srange=smax-smin;
403 |             hold on
404 |             plot(fs(2)+(0:length(s)-1)/fs(1),(s-smin)/srange*0.9*(toptaw(4)-toptaw(3))+toptaw(3),'color',map(48,:))
405 |             hold off
406 |         end
407 |         if dotaw(1) || dotaw(2)
408 |             tmk=cell2mat(ann(:,1));
409 |             tmksel=tmk(:,1)<=t(end) & tmk(:,end)>=t(1);
410 |             yix=1+[tmk(tmksel,1)<t(1) ones(sum(tmksel),2) tmk(tmksel,end)>t(end)]';
411 |             tmk(:,1)=max(tmk(:,1),t(1));  % clip to axis limits
412 |             tmk(:,end)=min(tmk(:,end),t(end));
413 |         end
414 |         if dotaw(1) && any(tmksel)  % draw time markers
415 |             ymk=toptaw(1:2)*[0.8 0.4;0.2 0.6];
416 |             switch size(tmk,2)
417 |                 case 0
418 |                 case 1      % isolated marks
419 |                     hold on
420 |                     plot([tmk(tmksel) tmk(tmksel)]',repmat(ymk',1,sum(tmksel)),'color',map(48,:));
421 |                     hold off
422 |                 otherwise % draw durations
423 | 
424 |                     hold on
425 |                     plot(tmk(tmksel,[1 1 2 2])',ymk(yix),'color',map(48,:));
426 |                     hold off
427 |             end
428 |         end
429 |         if dotaw(2) && any(tmksel) % print annotations
430 |             if any(mode=='a')
431 |                 horal='center';
432 |                 tmk=(tmk(:,1)+tmk(:,end))*0.5;
433 |             else
434 |                 horal='left';
435 |                 tmk=tmk(:,1);
436 |             end
437 |             if size(ann,2)>2
438 |                 font='Arial';
439 |                 for i=1:size(ann,1)
440 |                     if tmksel(i)
441 |                         if ~isempty(ann{i,3})
442 |                             font = ann{i,3};
443 |                         end
444 |                         text(tmk(i),toptaw(2),ann{i,2},'color',map(48,:),'fontname',font,'VerticalAlignment','baseline','HorizontalAlignment',horal);
445 |                     end
446 |                 end
447 |             else
448 |                 for i=1:size(ann,1)
449 |                     if tmksel(i)
450 |                         text(tmk(i),toptaw(2),ann{i,2},'color',map(48,:),'VerticalAlignment','baseline','HorizontalAlignment',horal);
451 |                     end
452 |                 end
453 |             end
454 |         end
455 |     end
456 |     xlabel(['Time (' xticksi 's)']);
457 |     if any(mode=='f') && ~strcmp(frlab,'Hz')
458 |         ylabel([frlabf '-scaled frequency (Hz)']);
459 |         ytickhz(frq2y,y2frq);
460 |     else
461 |     ylabel(['Frequency (' yticksi frlab ')']);
462 |     end
463 |     ytick=get(gca,'YTick');
464 |     ytickl=get(gca,'YTickLabel');
465 |     msk=ytick<=ylim(2);
466 |     if any(~msk)
467 |         set(gca,'YTick',ytick(msk),'YTickLabel',ytickl(msk));
468 |     end
469 | end
470 | 
471 | function ytickhz(frq2y,y2frq)
472 | % YTICKHZ label the non-linear y frequency axis of the current axes with tick values in Hz  ytickhz(frq2y,y2frq)
473 | %
474 | % Inputs:  frq2y   function handle converting frequency in Hz to y-axis units
475 | %          y2frq   function handle converting y-axis units to frequency in Hz
476 | % No outputs: the chosen ticks are written to the 'YTick' and 'YTickLabel' properties of the current axes.
477 | % Bugs/Suggestions:
478 | % * Add a penalty for large numbers (e.g. 94 is less "round" than 11)
479 | % * possibly add subticks at 1:2:5 if boundaries are 1 and 10
480 | % * could treat subtick allocation specially if bounding labels are both powers of 10 (work in log spacing rather than spacing directly)
481 | % algorithm constants (the spacings in seps are measured in multiples of the character height chsz)
482 | seps=[0.4 1 3 6]; % spacings: (a) min subtick, (b) min tick, (c) min good tick, (d) max good tick
483 | ww=[0.5 0.6 0.8 0.1 0.3 0.3 0.2];  % weight for (a) last digit=5, (b) power of 10, (c) power of 1000, (d) equal spacing, (e) 1:2:5 labels (f) <seps(3) (g) >seps(4)
484 | nbest=10; % number of possibilities to track
485 | % SI prefixes for 10^-24 to 10^24 in steps of 10^3; the micro sign is given as char(181) so it does not depend on the file encoding
486 | prefix={'y','z','a','f','p','n',char(181),'m','','k','M','G','T','P','E','Z','Y'};
487 | 
488 | ah=gca;
489 | getgca=get(ah);  % Get original axis properties
490 | set(ah,'Units','points','FontUnits','points');
491 | getgcac=get(ah);  % Get axis properties in points units
492 | set(ah,'Units',getgca.Units,'FontUnits',getgca.FontUnits); % return to original values
493 | ylim=getgca.YLim;
494 | yrange=ylim*[-1;1]; % axis extent in y-units: ylim(2)-ylim(1)
495 | chsz= yrange*getgcac.FontSize/getgcac.Position(4); % char height in Y-units
496 | % divide the y-axis up into bins containing at most one label each
497 | maxl=ceil(2*yrange/chsz);  % max number of labels
498 | 
499 | % candidate array [cand(:,[1 2])/1000 cand(:,5) cand(:,6)/1000 cand(:,[7 8])]
500 | % 1,2=bin limits (converted from y-units to Hz below), 3,4=log10 of limits, 5=Hz, 6=cost, 7=mantissa, 8=exponent, 9=sig digits, 10=y-position
501 | cand=zeros(maxl+2,10); % rows 2..maxl+1 are the bins; rows 1 and maxl+2 are imaginary labels beyond the axis ends
502 | yinc=(yrange+chsz*0.0002)/maxl;  % bin spacing (allowing for a tiny bit to ensure the ends are included)
503 | cand(2:end-1,2)=ylim(1)+yinc*(1:maxl)'-chsz*0.0001;
504 | cand(3:end-1,1)=cand(2:end-2,2);
505 | cand(2,1)=cand(2,2)-yinc;
506 | cand(2:end-1,1:2)=y2frq(max(cand(2:end-1,1:2),0)); % bin limits in Hz (negative y values are clipped to 0 first)
507 | 
508 | % find the "roundest" number in each interval
509 | % first deal with intervals containing zero
510 | cand([1 maxl+2],6)=-1;
511 | cand(2,9)=(cand(2,1)<=0);  % mask out interval containing zero
512 | cand(2,6)=-cand(2,9); % a cost of -1 keeps the row out of the cost==0 masks below
513 | msk=cand(:,6)==0;  % find rows without a cost yet
514 | cand(msk,3:4)=log10(cand(msk,1:2));
515 | % find powers of 1000
516 | loglim=ceil(cand(:,3:4)/3);
517 | msk=loglim(:,2)>loglim(:,1);
518 | if any(msk)
519 |     xp=loglim(msk,1);
520 |     wuns=ones(length(xp),1);
521 |     cand(msk,5:9)=[1000.^xp wuns-ww(3) wuns 3*xp wuns];
522 | end
523 | % find powers of 10
524 | loglim=ceil(cand(:,3:4));
525 | msk=~msk & (loglim(:,2)>loglim(:,1));
526 | if any(msk)
527 |     xp=loglim(msk,1);
528 |     wuns=ones(length(xp),1);
529 |     cand(msk,5:9)=[10.^xp wuns-ww(2) wuns xp wuns];
530 | end
531 | % find value with fewest digits
532 | msk=cand(:,6)==0;  % find rows without a cost yet
533 | maxsig=1-floor(log10(10^min(cand(msk,3:4)*[-1;1])-1)); % maximum number of significant figures to consider
534 | pten=10.^(0:maxsig-1);   % row vector of powers of ten
535 | noten=10.^(-floor(cand(msk,3))); % exponent of floating point representation of lower bound
536 | sigdig=sum((ceil(cand(msk,2).*noten*pten)-ceil(cand(msk,1).*noten*pten))==0,2); % number of digits common to the interval bounds
537 | lowman=ceil(cand(msk,1).*noten.*10.^sigdig);
538 | midman=10*floor(lowman/10)+5;
539 | highman=ceil(cand(msk,2).*noten.*10.^sigdig);
540 | mskman=midman>=lowman & midman<highman;   % check if we can include a mantissa ending in 5
541 | lowman(mskman)=midman(mskman);
542 | cand(msk,6:9)=[sigdig+1 lowman floor(cand(msk,3))-sigdig sigdig+1];
543 | cand(msk,5)=cand(msk,7).*10.^cand(msk,8);
544 | cand(msk,6)=cand(msk,6)-(mod(cand(msk,7),10)==5)*ww(1);
545 | cand(2:end-1,10)=frq2y(cand(2:end-1,5));
546 | cand([1 maxl+2],10)=ylim + seps(4)*chsz*[-1 1]; % put imaginary labels at the optimum spacing beyond the axes
547 | % [cand(:,[1 2 5])/1000 cand(:,[6 7 8 9])]
548 | 
549 | % Now do n-best DP to find the best sequence
550 | % ratint(m) = (double increment)/(single increment) at a label with mantissa m in a 1:2:5 run: 2,5,10 -> 8/5; 0.5,1,2 -> 3/2; 1,2,5 -> 4/3
551 | ratint=[8/5 15/10 0 0 4/3]; % entry 2 was 25/10, which no 1:2:5 run ending in a mantissa of 2 can satisfy
552 | costs=Inf(nbest,maxl+2); % cumulative path costs
553 | costs(1,1)=0; % starting node only has one option
554 | prev=ones(nbest,maxl+2); % previous label in path
555 | labcnt=zeros(nbest,maxl+2); % number of labels in path
556 | for i=2:maxl+2
557 |     ntry=nbest*(i-1); % number of previous options
558 |     prevc=reshape(repmat(1:i-1,nbest,1),ntry,1); % previous candidate
559 |     prevprev=1+floor((prev(1:ntry)'-1)/nbest); % previous previous candidate
560 |     msk=prevprev>1+(maxl+2)*(i==maxl+2); % mask for label triplets
561 |     labcnti=labcnt(1:ntry)+1;
562 |     disti=(cand(i,10)-cand(prevc,10))/chsz; % distance to previous label in characters
563 |     costa=max(seps(3)-disti,0)*ww(6)+max(disti-seps(4),0)*ww(7);
564 |     incri=(cand(i,5)-cand(prevc,5)); % label increment
565 |     incrj=(cand(i,5)-cand(prevprev,5)); % double label increment
566 |     if any(msk)
567 |         costa(msk)=costa(msk)- ww(4)*(abs(incrj(msk)-2*incri(msk))<0.01*incri(msk));
568 |         if cand(i,7)==1 || cand(i,7)==2 || cand(i,7)==5 % look for labels 1:2:5
569 |             costa(msk)=costa(msk)- ww(5)*(abs(incrj(msk)-ratint(cand(i,7))*incri(msk))<0.01*incri(msk));
570 |         end
571 |     end
572 |     costa(disti<seps(2))=Inf;
573 |     costi=(costs(1:ntry).*max(labcnt(1:ntry),1)+costa'+cand(i,6))./labcnti; % mean cost per label of each extended path
574 |     [sc,isc]=sort(costi);
575 |     isc=isc(1:nbest);
576 |     costs(:,i)=sc(1:nbest)';
577 |     prev(:,i)=isc';
578 |     labcnt(:,i)=labcnti(isc)';
579 | end
580 | 
581 | % now traceback the best sequence
582 | 
583 | % fprintf('Traceback\n\n');
584 | ichoose=0;
585 | labchoose=[];
586 | for i=1:nbest
587 |     if labcnt(i,maxl+2)>1 && costs(i,maxl+2)<Inf
588 |         lablist=zeros(labcnt(i,maxl+2)-1,1);
589 |         k=prev(i,maxl+2);
590 |         for j=labcnt(i,maxl+2)-1:-1:1
591 |             lablist(j)=1+floor((k-1)/nbest);
592 |             k=prev(k); % prev holds linear indices into the nbest-by-(maxl+2) arrays, so this steps back one label
593 |         end
594 | %         fprintf('Cost=%8.2f :',costs(i,maxl+2));
595 | %         fprintf(' %g',cand(lablist,5))
596 | %         fprintf('\n');
597 |         if ~ichoose || labcnt(ichoose,maxl+2)==1 % costs are sorted ascending, so the first valid path is kept
598 |             ichoose=i;
599 |             labchoose=lablist;
600 |         end
601 |     end
602 | end
603 | 
604 | % now create the labels
605 | 
606 | ntick=length(labchoose);
607 | % sort out the subticks
608 | subpos=[];
609 | if ntick>=2
610 |     for i=1:ntick-1
611 |         clj=cand(labchoose(i:i+1),:);
612 |         sprec=min(clj(1,8)+100*(clj(1,7)==0),clj(2,8)); % subtick precision
613 |         spos=(clj(1,7)*10^(clj(1,8)-sprec):clj(2,7)*10^(clj(2,8)-sprec))*10^sprec;
614 |         nsub=length(spos);
615 |         if nsub==2
616 |             spos=spos*[1 0.5 0;0 0.5 1];
617 |             nsub=3;
618 |         end
619 |         if nsub>=3
620 |             yspos=frq2y(spos);
621 |             for kk=1:3 % try various subdivisions: every 1, 2 or 5
622 |                 k=kk+2*(kk==3);  % 1, 2 and 5
623 |                 if 2*k<=nsub-1 && ~mod(nsub-1,k)  % must divide exactly into nsub
624 |                     if all((yspos(1+k:k:nsub)-yspos(1:k:nsub-k))>=(seps(1)*chsz)) % check they all fit in
625 |                         subpos=[subpos yspos(1+k:k:nsub-k)];
626 |                         if i==1
627 |                             spos=(ceil(cand(2,1)/10^sprec):clj(1,7)*10^(clj(1,8)-sprec))*10^sprec;
628 |                             nsub=length(spos);
629 |                             yspos=frq2y(spos);
630 |                             if nsub>=k+1 && all((yspos(nsub:-k:1+k)-yspos(nsub-k:-k:1))>=(seps(1)*chsz))
631 |                                 subpos=[subpos yspos(nsub-k:-k:1)];
632 |                             end
633 |                         elseif i==ntick-1
634 |                             spos=(clj(2,7)*10^(clj(2,8)-sprec):floor(cand(end-1,2)/10^sprec))*10^sprec;
635 |                             nsub=length(spos);
636 |                             yspos=frq2y(spos);
637 |                             if nsub>=k+1 && all((yspos(1+k:k:nsub)-yspos(1:k:nsub-k))>=(seps(1)*chsz))
638 |                                 subpos=[subpos yspos(1+k:k:nsub)];
639 |                             end
640 |                         end
641 |                         break;
642 |                     end
643 |                 end
644 |             end
645 |         end
646 |     end
647 | end
648 | nsub=length(subpos);
649 | tickpos=[cand(labchoose,10); subpos'];
650 | ticklab=cell(ntick+nsub,1);
651 | sipref=min(max(floor((sum(cand(labchoose,8:9),2)-1)/3),-8),8); % SI prefix index (power of 1000) clamped to -8..8
652 | nzadd=cand(labchoose,8)-3*sipref;  % trailing zeros to add
653 | digzer=cand(labchoose,7).*10.^max(nzadd,0); % label digits including trailing zeros
654 | ndleft=cand(labchoose,9)+nzadd; % digits to the left of the decimal point
655 | for i=1:ntick
656 |     tickint=num2str(digzer(i));
657 |     if nzadd(i)<0
658 |         tickint=[tickint(1:ndleft(i)) '.' tickint(1+ndleft(i):end)];
659 |     end
660 |     ticklab{i} = sprintf('%s%s',tickint,prefix{sipref(i)+9}); % sipref=-8..8 selects prefix{1..17}
661 | end
662 | for i=ntick+1:ntick+nsub
663 |     ticklab{i}=''; % subticks are drawn without a label
664 | end
665 | [tickpos,ix]=sort(tickpos);
666 | ticklab=ticklab(ix);
667 | 
668 | set(ah,'YTick',tickpos','YTickLabel',ticklab);
669 | 
670 | 


--------------------------------------------------------------------------------