├── rVAD2.0 ├── filtbankm.m ├── stdspectrum.m ├── aurora2read.m ├── pitchestm.m ├── sflux.m ├── README.TXT ├── pitchblockdetect.m ├── vadbatch_1folder_diffpathes.m ├── winenvar.m ├── rfft.m ├── snre_highenergy.m ├── vad.m ├── irfft.m ├── findpeaks.m ├── enframe.m ├── snre_vad.m ├── estnoiseg.m ├── gaussmixp.m ├── voicebox.m ├── specsub.m ├── specsub_noiseseg_lfn.m ├── estnoisem.m ├── fxpefac.m ├── LICENSE ├── estnoisem_noiseseg.m └── spgrambw.m ├── Aurora2TestSet-ReferenceVAD.zip ├── Aurora2TrainSet-ReferenceVAD.zip ├── LICENSE ├── rVADfast_py_2.0 ├── README.TXT ├── LICENSE ├── audio_stream.py ├── rVAD_fast.py ├── rVAD_fast_stream.py └── speechproc.py └── README.md /rVAD2.0/filtbankm.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/rVAD2.0/filtbankm.m -------------------------------------------------------------------------------- /rVAD2.0/stdspectrum.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/rVAD2.0/stdspectrum.m -------------------------------------------------------------------------------- /Aurora2TestSet-ReferenceVAD.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/Aurora2TestSet-ReferenceVAD.zip -------------------------------------------------------------------------------- /Aurora2TrainSet-ReferenceVAD.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenghuatan/rVAD/HEAD/Aurora2TrainSet-ReferenceVAD.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Matlab code rVAD/rVAD2.0/ is under GNU GENERAL PUBLIC LICENSE Version 2 2 | 3 | The Python code rVAD/rVADfast_py_2.0/ is under MIT 
License 4 | 5 | Refer to LICENSE under each folder for details 6 | -------------------------------------------------------------------------------- /rVAD2.0/aurora2read.m: -------------------------------------------------------------------------------- 1 | function [data_float, fs]=aurora2read(fname) 2 | 3 | % Read data from the Aurora2 database 4 | 5 | fid=fopen(fname,'r','b'); 6 | data=fread(fid,'int16'); 7 | fclose(fid); 8 | fs=8000; 9 | 10 | %str1=strread(fname,'%s','delimiter','.'); 11 | 12 | data_float = double(data)/2^15; %% Normalize int16(y) by 2^15 13 | 14 | % wavwrite(data_float,fs,strcat(str1{1},'.wav')); 15 | 16 | -------------------------------------------------------------------------------- /rVADfast_py_2.0/README.TXT: -------------------------------------------------------------------------------- 1 | Fast noise-robust voice activity detection algorithm (rVAD-fast). 2 | Version 2.0 3 | 4 | 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan 5 | 6 | Usage: python rVAD_fast.py inWaveFile outputVadLabel 7 | 8 | Refs: 9 | [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 10 | [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010. 
11 | 12 | Contact: 13 | Prof Zheng-Hua Tan 14 | Aalborg University, Denmark 15 | zt@es.aau.dk 16 | https://vbn.aau.dk/en/persons/107665 17 | -------------------------------------------------------------------------------- /rVAD2.0/pitchestm.m: -------------------------------------------------------------------------------- 1 | function [pv01, fx]=pitchestm(data, fs, nfr10, pv01) 2 | % function [pv01, pvblk, pvblkb, pv]=pitchestm(data, fs, nfr10, pv01) 3 | 4 | [fx,tt,pv,fv]=fxpefac(data, fs); % should plus 3 5 | npv=length(pv); 6 | pv01=zeros(nfr10,1); sign_pv=0; 7 | for i=1:npv 8 | if pv(i)>0.25 9 | pv01(i+3) =1; 10 | if sign_pv==0 11 | sign_pv=1; 12 | nstart=i; 13 | end 14 | else 15 | if sign_pv==1 16 | sign_pv=0; 17 | nstop=i-1; 18 | if nstop-nstart<3 19 | pv01(nstart+3:nstop+3)=0; % Remove 2 frames only pitch 20 | end 21 | end 22 | end 23 | end 24 | pv01(1:3)=pv01(4); 25 | fxtmp(1:3)=fx(4); fxtmp(4:npv+3)=fx(1:npv); 26 | if (npv+3) < nfr10 27 | pv01(npv+4:nfr10)=pv01(npv+3); 28 | fxtmp(npv+4:nfr10)=fx(npv); 29 | else 30 | pv01=pv01(1:nfr10); 31 | fxtmp=fxtmp(1:nfr10); 32 | end 33 | fx=fxtmp; 34 | 35 | -------------------------------------------------------------------------------- /rVAD2.0/sflux.m: -------------------------------------------------------------------------------- 1 | function [ft, d, sVar]= sflux(data,flen,fsh10); 2 | 3 | %% output - 4 | % d - spectral flux 5 | % ft - spectral flatness 6 | % sVar - spectral variance 7 | 8 | nftt=pow2(nextpow2(flen)); %% FFT point 9 | 10 | %% sf-> spectral flux, ft-> spectral flatness, sVar-> spectral variance 11 | x=enframe(data,flen,fsh10); 12 | w=hamming(flen); 13 | x=x.*repmat(w',size(x,1),1); 14 | 15 | ak=abs(fft(x',nftt)); % spectrum 16 | ak=ak'; 17 | ak=ak(:, 1:fix(nftt/2)+1); 18 | 19 | ak_1=ak(2:end,:); % ak-1 20 | ak_1=[ ak_1 ; ak_1(end,:)]; % ak(t-1) 21 | 22 | d= sum((ak - ak_1).^2, 2); % sum_k [ak(t) -ak(t-1)] 23 | denA= sqrt(sum(ak.^2, 2)) .* sqrt( sum(ak_1.^2, 2) ); 24 | d=(d+eps)./(denA+eps); 25 | 26 
| %% flatness 27 | win=size(ak,2); % number of bands in spectra 28 | num= exp( (1/win) * sum( log(ak),2) ); 29 | den= (1/win) * sum( ak,2); 30 | ft= (num+eps)./(den+eps); 31 | 32 | %% Spectral Variation is the normalized by the correlation of spectrum between consecutive frames 33 | num= (sum(ak.*ak_1,2) +eps)./(denA+eps); 34 | sVar= 1- num; 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /rVAD2.0/README.TXT: -------------------------------------------------------------------------------- 1 | Noise-robust voice activity detection algorithm (rVAD). 2 | Version 2.0 3 | 4 | 28 Nov 2017, Zheng-Hua Tan 5 | 6 | Usage: vad(finwav, fvad) 7 | vad(finwav, fvad, opts) 8 | vad(finwav, fvad, opts, vadThres). 9 | 10 | where finwav is the input WAVE file path and name, fvad is the output VAD file path and name, opts can be 0 for using pitch (default option) or 1 for using flatness (significantly faster at the cost of slightly reduced accuracy), and finally vadThres is the threshold for VAD. Refer to vad.m for more detailed explanation. 11 | 12 | The code has been tested on Matlab R2016a. 13 | 14 | Refs: 15 | [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 16 | [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010. 
17 | 18 | Contact: 19 | Prof Zheng-Hua Tan 20 | Aalborg University, Denmark 21 | zt@es.aau.dk 22 | https://vbn.aau.dk/en/persons/107665 23 | -------------------------------------------------------------------------------- /rVADfast_py_2.0/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | for rVADfast_py (the Python code) 4 | 5 | Copyright (c) 2022 Zheng-Hua Tan and Achintya Kumar Sarkar 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | -------------------------------------------------------------------------------- /rVAD2.0/pitchblockdetect.m: -------------------------------------------------------------------------------- 1 | function [pvblk]=pitchblockdetect(pv01, nfr10, pitch, opts) 2 | 3 | %pitch block detection 4 | 5 | if nfr10==length(pv01)+1 6 | pv01(nfr10)=pv01(nfr10-1); 7 | end 8 | 9 | if opts == 0 10 | 11 | sign_pv=0; 12 | for i=1:nfr10 13 | if pv01(i)==1 && sign_pv==0 14 | nstart=i; 15 | sign_pv=1; 16 | elseif (pv01(i)==0 || i==nfr10) && sign_pv==1 17 | nstop=i; 18 | if i==nfr10; nstop=i+1; end 19 | sign_pv=0; 20 | pitchseg=zeros(nstop-nstart,1); 21 | for j=nstart:nstop-1 22 | 23 | if isstring(pitch(j)) 24 | pitchseg(j-nstart+1)=str2double(pitch(j)); 25 | else 26 | pitchseg(j-nstart+1)=pitch(j); 27 | end 28 | 29 | end 30 | if sum(abs(round(pitchseg-mean(pitchseg))))==0 && nstop-nstart+1>=10 31 | pv01(nstart:nstop-1)=0; 32 | end 33 | end 34 | end 35 | 36 | end %opts 37 | 38 | 39 | sign_pv=0; 40 | pvblk=pv01; 41 | for i=1:nfr10 42 | if pv01(i)==1 && sign_pv==0 43 | nstart=i; 44 | sign_pv=1; 45 | pvblk(max(nstart-60,1):nstart)=1; 46 | elseif (pv01(i)==0 || i==nfr10) && sign_pv==1 47 | nstop=i; 48 | sign_pv=0; 49 | pvblk(nstop:min(nstop+60,nfr10))=1; 50 | end 51 | end 52 | 53 | -------------------------------------------------------------------------------- /rVAD2.0/vadbatch_1folder_diffpathes.m: -------------------------------------------------------------------------------- 1 | function []=vadbatch_1folder_diffpathes(wavfold,nfiles1,nfiles2) 2 | 3 | dinwav=strcat('/data/scratch/najim/RATS/Correct_data/',wavfold,'/'); 4 | dpitch=strcat('/data/scratch2/najim/RATS/PEF/',wavfold,'/'); 5 | 6 | doutwav1=strcat('/data/scratch2/zhenghua/RATS/wav1/',wavfold); 7 | doutwav2=strcat('/data/scratch2/zhenghua/RATS/wav2/',wavfold); 8 | dvad=strcat('/data/scratch2/zhenghua/RATS/vad/',wavfold); 9 | 10 | d1=dir(dinwav); 11 | n1=length(d1); 12 | if nargin==1 13 | nfiles1=1; nfiles2=n1-2; 14 | elseif 
nargin==2 15 | nfiles2=n1-2; 16 | doutwav1=strcat(doutwav1,'_',num2str(nfiles1)); 17 | doutwav2=strcat(doutwav2,'_',num2str(nfiles1)); 18 | dvad=strcat(dvad,'_',num2str(nfiles1)); 19 | elseif nargin==3 20 | doutwav1=strcat(doutwav1,'_',num2str(nfiles1)); 21 | doutwav2=strcat(doutwav2,'_',num2str(nfiles1)); 22 | dvad=strcat(dvad,'_',num2str(nfiles1)); 23 | end 24 | doutwav1=strcat(doutwav1,'/') 25 | doutwav2=strcat(doutwav2,'/') 26 | dvad=strcat(dvad,'/') 27 | 28 | if ~isdir(doutwav1); mkdir(doutwav1); end 29 | if ~isdir(doutwav2); mkdir(doutwav2); end 30 | if ~isdir(dvad); mkdir(dvad); end 31 | if nfiles2>n1-2; nfiles2=n1-2; end 32 | for i1=2+nfiles1:2+nfiles2 33 | [str1, str2]=strread(d1(i1).name,'%s%s','delimiter','.'); 34 | finwav=strcat(dinwav,d1(i1).name) 35 | fpitch=strcat(dpitch,str1{1},'.PEF'); 36 | foutwav1=strcat(doutwav1,d1(i1).name); 37 | foutwav2=strcat(doutwav2,d1(i1).name); 38 | fvad=strcat(dvad,str1{1},'.vad'); 39 | vad(finwav,fpitch,foutwav1,foutwav2,fvad); 40 | end 41 | 42 | clear all; 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /rVADfast_py_2.0/audio_stream.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import code 4 | import pyaudio 5 | import wave 6 | 7 | # record stream audio, save it and then apply rVADfast to it 8 | # usage: python3 audio_stream.py 9 | 10 | CHUNK = 1024 11 | FORMAT = pyaudio.paInt16 12 | CHANNELS = 1 13 | RATE = 44100 14 | 15 | def record(): 16 | 17 | p = pyaudio.PyAudio() 18 | 19 | stream = p.open(format=FORMAT,channels=CHANNELS,rate=RATE,input=True,frames_per_buffer=CHUNK) 20 | 21 | print("Start recording") 22 | print("2. Press Ctrl+C to stop the recording" 23 | print("3. 
rVAD will start") 24 | print("==================================================================\n") 25 | frames = [] 26 | 27 | try: 28 | while True: 29 | data = stream.read(CHUNK) 30 | frames.append(data) 31 | 32 | except KeyboardInterrupt: 33 | print("Done recording: stored --> output.wav") 34 | 35 | except Exception as e: 36 | print(str(e)) 37 | 38 | sample_width = p.get_sample_size(FORMAT) 39 | 40 | stream.stop_stream() 41 | stream.close() 42 | p.terminate() 43 | 44 | return sample_width, frames 45 | 46 | def record_to_file(file_path): 47 | wf = wave.open(file_path, 'wb') 48 | wf.setnchannels(CHANNELS) 49 | 50 | sample_width, frames = record() 51 | 52 | wf.setsampwidth(sample_width) 53 | wf.setframerate(RATE) 54 | wf.writeframes(b''.join(frames)) 55 | wf.close() 56 | 57 | if __name__ == '__main__': 58 | record_to_file('output.wav') 59 | print("rVAD running...") 60 | os.system("python3 rVAD_fast.py output.wav output.txt") 61 | print("Result written") 62 | -------------------------------------------------------------------------------- /rVAD2.0/winenvar.m: -------------------------------------------------------------------------------- 1 | function d=winenvar(n) 2 | %WINENVAR get windows environment variable [D]=(N) 3 | % 4 | % Inputs: N name of environment variable (e.g. 'temp') 5 | % 6 | % Outputs: D value of variable or [] is non-existant 7 | % 8 | % Notes: (1) This is WINDOWS specific and needs to be fixed to work on UNIX 9 | % (2) The search is case insensitive (like most of WINDOWS). 10 | % 11 | % Examples: (1) Open a temporary text file: 12 | % d=winenar('temp'); fid=fopen(fullfile(d,'temp.txt'),'wt'); 13 | 14 | % Copyright (c) 2005 Mike Brookes, mike.brookes@ic.ac.uk 15 | % Version: $Id: winenvar.m 713 2011-10-16 14:45:43Z dmb $ 16 | % 17 | % VOICEBOX is a MATLAB toolbox for speech processing. 
18 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 19 | % 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | % This program is free software; you can redistribute it and/or modify 22 | % it under the terms of the GNU General Public License as published by 23 | % the Free Software Foundation; either version 2 of the License, or 24 | % (at your option) any later version. 25 | % 26 | % This program is distributed in the hope that it will be useful, 27 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | % GNU General Public License for more details. 30 | % 31 | % You can obtain a copy of the GNU General Public License from 32 | % http://www.gnu.org/copyleft/gpl.html or by writing to 33 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 34 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 35 | p=['%',n,'%']; 36 | [s,d]=system(['echo ',p]); 37 | while d(end)<=' '; 38 | d(end)=[]; 39 | end 40 | if strcmp(d,p) 41 | d=[]; 42 | end -------------------------------------------------------------------------------- /rVAD2.0/rfft.m: -------------------------------------------------------------------------------- 1 | function y=rfft(x,n,d) 2 | %RFFT Calculate the DFT of real data Y=(X,N,D) 3 | % Data is truncated/padded to length N if specified. 4 | % N even: (N+2)/2 points are returned with 5 | % the first and last being real 6 | % N odd: (N+1)/2 points are returned with the 7 | % first being real 8 | % In all cases fix(1+N/2) points are returned 9 | % D is the dimension along which to do the DFT 10 | 11 | 12 | 13 | % Copyright (C) Mike Brookes 1998 14 | % Version: $Id: rfft.m 713 2011-10-16 14:45:43Z dmb $ 15 | % 16 | % VOICEBOX is a MATLAB toolbox for speech processing. 
17 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 18 | % 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | % This program is free software; you can redistribute it and/or modify 21 | % it under the terms of the GNU General Public License as published by 22 | % the Free Software Foundation; either version 2 of the License, or 23 | % (at your option) any later version. 24 | % 25 | % This program is distributed in the hope that it will be useful, 26 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 27 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28 | % GNU General Public License for more details. 29 | % 30 | % You can obtain a copy of the GNU General Public License from 31 | % http://www.gnu.org/copyleft/gpl.html or by writing to 32 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 33 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | 35 | s=size(x); 36 | if prod(s)==1 37 | y=x; % single-element input is returned unchanged; the semicolon keeps the call from echoing y to the console 38 | else 39 | if nargin <3 || isempty(d) 40 | d=find(s>1,1); 41 | if nargin<2 42 | n=s(d); 43 | end 44 | end 45 | if isempty(n) 46 | n=s(d); 47 | end 48 | y=fft(x,n,d); 49 | y=reshape(y,prod(s(1:d-1)),n,prod(s(d+1:end))); 50 | s(d)=1+fix(n/2); 51 | y(:,s(d)+1:end,:)=[]; 52 | y=reshape(y,s); 53 | end 54 | -------------------------------------------------------------------------------- /rVADfast_py_2.0/rVAD_fast.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy 3 | import pickle 4 | import os 5 | import sys 6 | import math 7 | import code 8 | from scipy.signal import lfilter 9 | import speechproc 10 | from copy import deepcopy 11 | 12 | # Refs: 13 | # [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 14 | # [2] Z.-H. 
Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection." 15 | # IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010. 16 | 17 | # Version: 2.0 18 | # 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan 19 | 20 | # Usage: python rVAD_fast_2.0.py inWaveFile outputVadLabel 21 | 22 | 23 | winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512 24 | ftThres=0.5; vadThres=0.4 25 | opts=1 26 | 27 | finwav=str(sys.argv[1]) 28 | fvad=str(sys.argv[2]) 29 | 30 | fs, data = speechproc.speech_wave(finwav) 31 | ft, flen, fsh10, nfr10 =speechproc.sflux(data, fs, winlen, ovrlen, nftt) 32 | 33 | 34 | # --spectral flatness -- 35 | pv01=numpy.zeros(nfr10) 36 | pv01[numpy.less_equal(ft, ftThres)]=1 37 | pitch=deepcopy(ft) 38 | 39 | pvblk=speechproc.pitchblockdetect(pv01, pitch, nfr10, opts) 40 | 41 | 42 | # --filtering-- 43 | ENERGYFLOOR = numpy.exp(-50) 44 | b=numpy.array([0.9770, -0.9770]) 45 | a=numpy.array([1.0000, -0.9540]) 46 | fdata=lfilter(b, a, data, axis=0) 47 | 48 | 49 | #--pass 1-- 50 | noise_samp, noise_seg, n_noise_samp=speechproc.snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk) 51 | 52 | #sets noisy segments to zero 53 | for j in range(n_noise_samp): 54 | fdata[range(int(noise_samp[j,0]), int(noise_samp[j,1]) +1)] = 0 55 | 56 | 57 | vad_seg=speechproc.snre_vad(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres) 58 | 59 | numpy.savetxt(fvad, vad_seg.astype(int), fmt='%i') 60 | print("%s --> %s " %(finwav, fvad)) 61 | 62 | data=None; pv01=None; pitch=None; fdata=None; pvblk=None; vad_seg=None 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /rVADfast_py_2.0/rVAD_fast_stream.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import code 4 | import pyaudio 5 | import wave 6 | 7 | 8 | CHUNK = 1024 ##number of frames in 
buffer 9 | FORMAT = pyaudio.paInt16 10 | CHANNELS = 1 # mono: each frame holds 1 sample, so one CHUNK read returns 1024 samples 11 | RATE = 44100 # sampling rate in samples per second 12 | dur = 5 # duration of each recorded audio chunk (seconds) 13 | 14 | def record(): 15 | # Read CHUNK-frame buffers from a PyAudio input stream until dur seconds are captured or Ctrl+C is pressed; returns (sample_width, frames). 16 | p = pyaudio.PyAudio() 17 | 18 | stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) 19 | 20 | print(".......................................") 21 | print("Start recording of 5s chunk of audio") 22 | print("Stop recording - press clt+c") 23 | print(".......................................\n") 24 | frames = [] 25 | cont=0 26 | 27 | try: 28 | while True: 29 | data = stream.read(CHUNK) 30 | frames.append(data) 31 | cont = cont + CHUNK 32 | if cont >= dur*RATE: # stop once dur seconds of samples have been read 33 | print('Recorded %d seconds audio' %(dur)) 34 | print('rVAD going ...') 35 | break; 36 | 37 | except KeyboardInterrupt: 38 | print("Done recording: stored --> output.wav") # NOTE(review): the message names output.wav, but the caller writes to 'output<part>.wav' 39 | except Exception as e: 40 | print(str(e)) 41 | 42 | sample_width = p.get_sample_size(FORMAT) 43 | 44 | stream.stop_stream() 45 | stream.close() 46 | p.terminate() 47 | 48 | return sample_width, frames 49 | 50 | # Record one chunk via record() and write it to file_path as a WAV file. 51 | def record_to_file(file_path): 52 | wf = wave.open(file_path, 'wb') 53 | wf.setnchannels(CHANNELS) 54 | 55 | sample_width, frames = record() 56 | 57 | wf.setsampwidth(sample_width) 58 | wf.setframerate(RATE) 59 | wf.writeframes(b''.join(frames)) 60 | wf.close() 61 | 62 | 63 | if __name__ == '__main__': 64 | part=0 65 | while True: # NOTE(review): record() catches KeyboardInterrupt itself and this loop has no exit condition, so Ctrl+C during recording only ends the current chunk - the script keeps recording new chunks 66 | audPart = 'output'+ str(part) 67 | record_to_file(audPart + '.wav') 68 | cmd = 'python3' + " " + 'rVAD_fast.py' + " " + audPart +'.wav' + " " + audPart+'.txt' 69 | os.system(cmd) 70 | print('Result for audio chunk%d written' %(part)) 71 | part = part + 1 72 | -------------------------------------------------------------------------------- /rVAD2.0/snre_highenergy.m: -------------------------------------------------------------------------------- 1 | function 
[noise_samp, n_noise_samp, noise_seg, D, Dsmth, snre_vad, e]=snre_highenergy(dfdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01) 2 | 3 | % Ref: 4 | % Z-H Tan and B Lindberg 5 | % Low-Complexity Variable Frame Rate Analysis for Speech Recognition and Voice Activity Detection 6 | % IEEE Journal of Selected Topics in Signal Processing, 4(5), Oct. 2010. 7 | 8 | % square root of a posteriori SNR weighted engergy difference, block based 9 | % 10 | % Modified 01 Mar 2013 11 | 12 | Dexpl=18; 13 | Dexpr=18; 14 | segThres = 0.25; 15 | e=zeros(nfr10,1); 16 | for i=1:nfr10 17 | for j=1:flen 18 | e(i)=e(i)+dfdata((i-1)*fsh10+j)*dfdata((i-1)*fsh10+j); 19 | end 20 | if e(i) <= ENERGYFLOOR 21 | e(i)=ENERGYFLOOR; 22 | end 23 | end 24 | 25 | 26 | 27 | emin=ones(nfr10,1); 28 | NESEG = 200; 29 | if nfr10 < NESEG; NESEG=nfr10; end 30 | for i=1:floor(nfr10/NESEG) 31 | [eY,eI]=sort(e((i-1)*NESEG+1:i*NESEG)); 32 | emin((i-1)*NESEG+1:i*NESEG)=eY(floor(NESEG*0.1)); 33 | if i~=1 34 | emin((i-1)*NESEG+1:i*NESEG)=0.9*emin((i-1)*NESEG)+0.1*emin((i-1)*NESEG+1); 35 | end 36 | end 37 | if i*NESEG~=nfr10 38 | [eY,eI]=sort(e((i-1)*NESEG+1:nfr10)); 39 | emin(i*NESEG+1:nfr10)=eY(floor((nfr10-(i-1)*NESEG)*0.1)); 40 | emin(i*NESEG+1:nfr10)=0.9*emin(i*NESEG)+0.1*emin(i*NESEG+1); 41 | end 42 | 43 | 44 | 45 | 46 | D=zeros(nfr10,1); 47 | postsnr=zeros(nfr10,1); 48 | for i=2:nfr10 49 | postsnr(i) =log10(e(i))-log10(emin(i)); 50 | if postsnr(i)<0 51 | postsnr(i)=0; 52 | end 53 | D(i)=sqrt(abs(e(i)-e(i-1))*postsnr(i)); 54 | end 55 | D(1)=D(2); 56 | 57 | Dexp = vertcat(ones(Dexpl,1)*D(1), D, ones(Dexpr,1)*D(nfr10)); 58 | Dsmth = zeros(nfr10,1); 59 | for i=1:nfr10 60 | Dsmth(i)=sum(Dexp(i:i+Dexpl+Dexpr)); 61 | end 62 | 63 | for i=1:floor(nfr10/NESEG) 64 | Dsmth_max((i-1)*NESEG+1:i*NESEG)=max(e((i-1)*NESEG+1:i*NESEG)); 65 | end 66 | if i*NESEG~=nfr10 67 | Dsmth_max(i*NESEG+1:nfr10)=max(e((i-1)*NESEG+1:nfr10)); 68 | end 69 | 70 | snre_vad = zeros(nfr10,1); 71 | for i=1:nfr10 72 | if Dsmth(i)>Dsmth_max(i)*segThres; 
snre_vad(i)=1; end 73 | end 74 | 75 | 76 | 77 | % block based processing to remove noise part by using snre_vad1. 78 | sign_vad = 0; 79 | noise_seg=zeros(floor(nfr10/1.6),1); 80 | noise_samp=zeros(nfr10,2); 81 | n_noise_samp=0; 82 | for i=1:nfr10 83 | if snre_vad(i) == 1 && sign_vad == 0 % start of a segment 84 | sign_vad = 1; 85 | nstart=i; 86 | elseif (snre_vad(i) ==0 || i==nfr10) && sign_vad == 1 % end of a segment 87 | sign_vad = 0; 88 | nstop=i-1; 89 | if sum(pv01(nstart:nstop))==0 90 | noise_seg(round(nstart/1.6):floor(nstop/1.6)) = 1; 91 | n_noise_samp=n_noise_samp+1; 92 | noise_samp(n_noise_samp,:)=[(nstart-1)*fsh10+1 nstop*fsh10]; 93 | end 94 | end 95 | end 96 | noise_samp(n_noise_samp+1:nfr10,:)=[]; 97 | 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rVAD 2 | 3 | ## Description 4 | Matlab and Python libraries for an unsupervised method for robust voice activity detection (rVAD) or speech activity detection (SAD), as presented in [rVAD: An Unsupervised Segment-Based Robust Voice Activity Detection Method, Computer Speech & Language, 2020](https://www.sciencedirect.com/science/article/pii/S0885230819300920) or its [arXiv version](https://arxiv.org/abs/1906.03588). 5 | 6 | ***The rVAD paper published in Computer Speech & Language won International Speech Communication Association (ISCA) 2022 Best Research Paper Award.*** 7 | 8 | The rVAD method consists of two passes of denoising followed by a VAD stage. It has been applied as a preprocessor for a wide range of applications, such as speech recognition, speaker identification, language identification, age and gender identification, self-supervised learning, human-robot interaction, audio archive segmentation, and so on as in [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=fugL2E8AAAAJ&citation_for_view=fugL2E8AAAAJ:-mN3Mh-tlDkC). 
9 | 10 | The method is unsupervised to make it applicable to a broad range of acoustic environments, and it is optimized considering both noisy and clean conditions. 11 | 12 | The rVAD (out of the box) ranks the 4th place (out of 27 supervised/unsupervised systems) in a Fearless Steps Speech Activity Detection Challenge. 13 | 14 | The rVAD paper is among [the most cited articles from Computer Speech and Language published since 2018](https://www.journals.elsevier.com/computer-speech-and-language/most-cited-articles) (the 6th place), in 2022 and 2023. 15 | 16 | ## Source code for rVAD: 17 | Source code in Matlab for rVAD (including both rVAD and rVAD-fast) is available under the [rVAD2.0](rVAD2.0/) folder. It is straightforward to use: Simply call the function vad.m. Some Matlab functions and their modified versions from the publicly available VoiceBox are included with kind permission of Mike Brookes. 18 | 19 | Source code in Python for rVAD-fast is available under the [rVADfast_py_2.0](rVADfast_py_2.0/) folder. Source code for rVAD-fast to take streaming audio in is included too. 20 | 21 | rVAD-fast is 10+ times faster than rVAD while rVAD has superior performance. 22 | 23 | The rVADfast library is available as a python package installable via: 24 | pip install rVADfast. 25 | See [rVADfast GitHub page](https://github.com/zhenghuatan/rVADfast) for more details. 26 | 27 | ## Reference VAD for Aurora 2 database: 28 | The frame-by-frame reference VAD was generated from the clean set of Aurora 2 using forced-alignment speech recognition and has been used as a 'ground truth' for evaluating VAD algorithms. Our study shows that forced-alignment ASR performs as well as a human expert labeler for generating VAD references, as detailed in [Comparison of Forced-Alignment Speech Recognition and Humans for Generating Reference VAD](https://www.isca-speech.org/archive/pdfs/interspeech_2015/kraljevski15_interspeech.pdf). 
Here are the generated [reference VAD for the training set](Aurora2TrainSet-ReferenceVAD.zip) and the [reference VAD for the test set](Aurora2TestSet-ReferenceVAD.zip). 29 | 30 | -------------------------------------------------------------------------------- /rVAD2.0/vad.m: -------------------------------------------------------------------------------- 1 | function []=vad(finwav, fvad, opts, vadThres) 2 | 3 | % Usage: vad(finwav, fvad) 4 | % vad(finwav, fvad, opts) 5 | % vad(finwav, fvad, opts, vadThres). 6 | % 7 | % finwav: The input WAVE file path and name. 8 | % 9 | % fvad: The output VAD file path and name (required: the function raises an error when called with fewer than two arguments). If the output is in 0-1 format, each line in the file is the label for that frame (0 for non-speech and 1 for speech), while if the output is in the segment format, each line contains the start frame number and the end frame number for a speech segment. The default is 0-1 format, and one can switch to the segment format by choosing the other fprintf line at the end of this code. The frame shift is 10ms. 10 | % 11 | % opts: 0 for using pitch (default option), and 1 for using flatness (significantly faster at the cost of slightly reduced accuracy). 12 | % 13 | % vadThres: The threshold for VAD. The default value is 0.4. Increasing vadThres (e.g. to 0.5) makes the VAD more aggressive, i.e. the number of frames to be detected as speech will be reduced. 14 | % 15 | % Refs: 16 | % [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 17 | % [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection," IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010. 
18 | % 19 | % 2017-11-28, Zheng-Hua Tan 20 | 21 | if nargin < 2; error('Usage: vad(finwav, fvad)'); end 22 | if nargin == 2 23 | opts = 0; vadThres = 0.4; 24 | elseif nargin == 3 25 | vadThres = 0.4; 26 | end 27 | 28 | [data,fs]= audioread(finwav); 29 | % [data,fs]=wavread(finwav); 30 | % [data, fs]=aurora2read(finwav); 31 | 32 | % Parameter setting 33 | ENERGYFLOOR = exp(-50); 34 | flen=floor(fs/40); % 25ms frame length 35 | fsh10=fs/100; % 10ms frame shift 36 | nfr10=floor((length(data)-(flen-fsh10))/fsh10); 37 | 38 | b=[0.9770 -0.9770]; a=[ 1.0000 -0.9540]; 39 | fdata=filter(b,a,data); 40 | 41 | if opts == 0 42 | [pv01, pitch]=pitchestm(data, fs, nfr10); 43 | else % using flatness 44 | ftThres = 0.5; % Default threshold. It can range from 0 to 1. Increasing ftThres increases the number of frames being detected as speech. 45 | [ft]= sflux(data,flen,fsh10); 46 | pv01 = (ft <= ftThres); % <= threshold would give 1( meaning a speech frame) 47 | pitch=ft; 48 | end 49 | 50 | pvblk=pitchblockdetect(pv01, nfr10, pitch, opts); 51 | 52 | [noise_samp, n_noise_samp, noise_seg]=snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01); 53 | 54 | %% Set high energy segments to zero 55 | for i=1:n_noise_samp 56 | fdata(noise_samp(i,1):noise_samp(i,2)) = 0; 57 | end 58 | 59 | [dfdatarm]=specsub(fdata,fs); 60 | % [dfdatarm]=specsub(fdata,fs,noise_seg,pv01); 61 | 62 | [vad_seg]=snre_vad(dfdatarm, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres); 63 | 64 | %% Output VAD results in 0-1 format (1 for speech frames and 0 for non-speech ones) 65 | if isempty(vad_seg) ==1 66 | z=zeros(nfr10,1); 67 | else 68 | y=[]; 69 | for i=1:size(vad_seg,1) 70 | y=[ y ; [ vad_seg(i,1):vad_seg(i,2)]' ]; 71 | end 72 | z=zeros(nfr10,1); 73 | z([y],1)=1; 74 | 75 | if sum(z) ~= size(y,1) % checking 76 | error('The number of labeled speech frames does not matched the results of detected speech segments!'); 77 | end 78 | end 79 | 80 | fid=fopen(fvad,'w'); 81 | fprintf(fid, '%d\n',z'); % 0-1 VAD 
output 82 | % fprintf(fid, '%d\n',vad_seg); % segment-label VAD output 83 | fclose(fid); 84 | 85 | -------------------------------------------------------------------------------- /rVAD2.0/irfft.m: -------------------------------------------------------------------------------- 1 | function x=irfft(y,n,d) 2 | %IRFFT Inverse fft of a conjugate symmetric spectrum X=(Y,N,D) 3 | % 4 | % Inputs: Y(M) The first half of a complex spectrum 5 | % N The number of output points to generate (default: 2M-2) 6 | % D The dimension along which to perorm the transform 7 | % (default: first non-singleton dimension of Y) 8 | % 9 | % Outputs: X(N) Real inverse dft of Y 10 | % 11 | % This routine calculates the inverse DFT of a conjugate-symmetric to give a real-valued 12 | % output of dimension N. Only the first half of the spectrum need be supplied: if N is even, 13 | % this includes the Nyquist term and is of dimension M=N/2 + 1 whereas if N is odd then there is 14 | % no Nyquist term and the input is of dimension M=(N+1)/2. 15 | % Note that the default value of N is always even so that N must be given explicitly 16 | % if it is odd. 17 | % 18 | % See also the forward transform: RFFT 19 | 20 | % Copyright (C) Mike Brookes 2009 21 | % Version: $Id: irfft.m 713 2011-10-16 14:45:43Z dmb $ 22 | % 23 | % VOICEBOX is a MATLAB toolbox for speech processing. 24 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 25 | % 26 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | % This program is free software; you can redistribute it and/or modify 28 | % it under the terms of the GNU General Public License as published by 29 | % the Free Software Foundation; either version 2 of the License, or 30 | % (at your option) any later version. 
31 | % 32 | % This program is distributed in the hope that it will be useful, 33 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | % GNU General Public License for more details. 36 | % 37 | % You can obtain a copy of the GNU General Public License from 38 | % http://www.gnu.org/copyleft/gpl.html or by writing to 39 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 41 | 42 | s=size(y); 43 | ps=prod(s); 44 | ns=length(s); 45 | if ps==1 46 | x=y 47 | else 48 | if nargin <3 || isempty(d) 49 | d=find(s>1); 50 | d=d(1); 51 | end 52 | m=s(d); 53 | k=ps/m; % number of fft's to do 54 | if d==1 55 | v=reshape(y,m,k); 56 | else 57 | v=reshape(permute(y,[d:ns 1:d-1]),m,k); 58 | end 59 | if nargin<2 || isempty(n) 60 | n=2*m-2; % default output length 61 | else 62 | mm=1+fix(n/2); % expected input length 63 | if mm>m v=[v; zeros(mm-m,k)]; % zero pad 64 | elseif mm0); 58 | f=find(dx<0); 59 | 60 | if length(r)>0 & length(f)>0 % we must have at least one rise and one fall 61 | dr=r; 62 | dr(2:end)=r(2:end)-r(1:end-1); 63 | rc=repmat(1,nx,1); 64 | rc(r+1)=1-dr; 65 | rc(1)=0; 66 | rs=cumsum(rc); % = time since the last rise 67 | 68 | df=f; 69 | df(2:end)=f(2:end)-f(1:end-1); 70 | fc=repmat(1,nx,1); 71 | fc(f+1)=1-df; 72 | fc(1)=0; 73 | fs=cumsum(fc); % = time since the last fall 74 | 75 | rp=repmat(-1,nx,1); 76 | rp([1; r+1])=[dr-1; nx-r(end)-1]; 77 | rq=cumsum(rp); % = time to the next rise 78 | 79 | fp=repmat(-1,nx,1); 80 | fp([1; f+1])=[df-1; nx-f(end)-1]; 81 | fq=cumsum(fp); % = time to the next fall 82 | 83 | k=find((rs0); % j=0 on a plateau 90 | v(j)=x(k(j))+0.25*b(j).^2./a(j); 91 | k(j)=k(j)+0.5*b(j)./a(j); 92 | k(~j)=k(~j)+(fq(k(~j))-rs(k(~j)))/2; % add 0.5 to k if plateau has an even width 93 | end 94 | 95 | % now purge nearby peaks 96 | 97 | if nargin>2 98 | 
j=find(k(2:end)-k(1:end-1)<=w); 99 | while any(j) 100 | j=j+(v(j)>=v(j+1)); 101 | k(j)=[]; 102 | v(j)=[]; 103 | j=find(k(2:end)-k(1:end-1)<=w); 104 | end 105 | end 106 | else 107 | k=[]; 108 | v=[]; 109 | end 110 | if any(m=='v') 111 | v=-v; % invert peaks if searching for valleys 112 | end 113 | if ~nargout 114 | if any(m=='v') 115 | x=-x; % re-invert x if searching for valleys 116 | ch='v'; 117 | else 118 | ch='^'; 119 | end 120 | plot(1:nx,x,'-',k,v,ch); 121 | end 122 | -------------------------------------------------------------------------------- /rVAD2.0/enframe.m: -------------------------------------------------------------------------------- 1 | function [f,t,w]=enframe(x,win,inc,m) 2 | %ENFRAME split signal up into (overlapping) frames: one per row. [F,T]=(X,WIN,INC) 3 | % 4 | % Usage: (1) f=enframe(x,n) % split into frames of length n 5 | % 6 | % (2) f=enframe(x,hamming(n,'periodic'),n/4) % use a 75% overlapped Hamming window of length n 7 | % 8 | % Inputs: x input signal 9 | % win window or window length in samples 10 | % inc frame increment in samples 11 | % m mode input: 12 | % 'z' zero pad to fill up final frame 13 | % 'r' reflect last few samples for final frame 14 | % 'A' calculate window times as the centre of mass 15 | % 'E' calculate window times as the centre of energy 16 | % 17 | % Outputs: f enframed data - one frame per row 18 | % t fractional time in samples at the centre of each frame 19 | % w window function used 20 | % 21 | % By default, the number of frames will be rounded down to the nearest 22 | % integer and the last few samples of x() will be ignored unless its length 23 | % is lw more than a multiple of inc. If the 'z' or 'r' options are given, 24 | % the number of frame will instead be rounded up and no samples will be ignored. 
25 | % 26 | % Example of frame-based processing: 27 | % INC=20 % set frame increment in samples 28 | % NW=INC*2 % oversample by a factor of 2 (4 is also often used) 29 | % S=cos((0:NW*7)*6*pi/NW); % example input signal 30 | % W=sqrt(hamming(NW,'periodic')); % sqrt hamming window of period NW 31 | % F=enframe(S,W,INC); % split into frames 32 | % ... process frames ... 33 | % X=overlapadd(F,W,INC); % reconstitute the time waveform (omit "X=" to plot waveform) 34 | 35 | % Bugs/Suggestions: 36 | % (1) Possible additional mode options: 37 | % 'u' modify window for first and last few frames to ensure WOLA 38 | % 'a' normalize window to give a mean of unity after overlaps 39 | % 'e' normalize window to give an energy of unity after overlaps 40 | % 'wm' use Hamming window 41 | % 'wn' use Hanning window 42 | % 'x' include all frames that include any of the x samples 43 | 44 | % Copyright (C) Mike Brookes 1997-2012 45 | % Version: $Id: enframe.m 1713 2012-03-30 21:27:46Z dmb $ 46 | % 47 | % VOICEBOX is a MATLAB toolbox for speech processing. 48 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 49 | % 50 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | % This program is free software; you can redistribute it and/or modify 52 | % it under the terms of the GNU General Public License as published by 53 | % the Free Software Foundation; either version 2 of the License, or 54 | % (at your option) any later version. 55 | % 56 | % This program is distributed in the hope that it will be useful, 57 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 58 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 59 | % GNU General Public License for more details. 60 | % 61 | % You can obtain a copy of the GNU General Public License from 62 | % http://www.gnu.org/copyleft/gpl.html or by writing to 63 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 
64 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 65 | 66 | nx=length(x(:)); 67 | if nargin<2 || isempty(win) 68 | win=nx; 69 | end 70 | if nargin<4 || isempty(m) 71 | m=''; 72 | end 73 | nwin=length(win); 74 | if nwin == 1 75 | lw = win; 76 | w = ones(1,lw); 77 | else 78 | lw = nwin; 79 | w = win(:)'; 80 | end 81 | if (nargin < 3) || isempty(inc) 82 | inc = lw; 83 | end 84 | nli=nx-lw+inc; 85 | nf = fix((nli)/inc); 86 | na=nli-inc*nf; 87 | f=zeros(nf,lw); 88 | indf= inc*(0:(nf-1)).'; 89 | inds = (1:lw); 90 | f(:) = x(indf(:,ones(1,lw))+inds(ones(nf,1),:)); 91 | if nargin>3 && (any(m=='z') || any(m=='r')) && na>0 92 | if any(m=='r') 93 | ix=1+mod(nx-na:nx-na+lw-1,2*nx); 94 | f(nf+1,:)=x(ix+(ix>nx).*(2*nx+1-2*ix)); 95 | else 96 | f(nf+1,1:na)=x(1+nx-na:nx); 97 | end 98 | nf=size(f,1); 99 | end 100 | 101 | 102 | if (nwin > 1) % if we have a non-unity window 103 | f = f .* w(ones(nf,1),:); 104 | end 105 | if nargout>1 106 | if any(m=='E') % 'E': frame time is the centre of energy of the window 107 | t0=sum((1:lw).*w.^2)/sum(w.^2); 108 | elseif any(m=='A') % 'A': frame time is the centre of mass of the window 109 | t0=sum((1:lw).*w)/sum(w); 110 | else % default: frame time is the midpoint of the frame 111 | t0=(1+lw)/2; 112 | end 113 | t=t0+inc*(0:(nf-1)).'; 114 | end 115 | 116 | 117 | -------------------------------------------------------------------------------- /rVAD2.0/snre_vad.m: -------------------------------------------------------------------------------- 1 | function [vad_seg, D, Dsmth, snre_vad, pv_vad, e, segsnr]=snre_vad(dfdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres) 2 | 3 | % Ref: 4 | % Zheng-Hua Tan and Børge Lindberg 5 | % Low-Complexity Variable Frame Rate Analysis for Speech Recognition and Voice Activity Detection 6 | % IEEE Journal of Selected Topics in Signal Processing, 4(5), Oct. 2010. 
7 | 8 | % a posteriori SNR weighted engergy difference 9 | Dexpl=18; 10 | Dexpr=18; 11 | % vadThres = 0.1; %0.1 best %0.125 %18, 18, 0.1 for maximal, 0.16 for mean, 0.12 for ss-seg 12 | Dsmth=zeros(nfr10,1); % smoothed energy distance 13 | 14 | % energy estimation 15 | e=zeros(nfr10,1); 16 | for i=1:nfr10 17 | for j=1:flen 18 | e(i)=e(i)+dfdata((i-1)*fsh10+j)*dfdata((i-1)*fsh10+j); 19 | end 20 | if e(i) <= ENERGYFLOOR 21 | e(i)=ENERGYFLOOR; 22 | end 23 | end 24 | 25 | segsnr=zeros(nfr10,1); 26 | segsnrsmth=1; sign_segsnr=0; 27 | D=zeros(nfr10,1); 28 | postsnr=D; 29 | snre_vad=zeros(nfr10,1); 30 | sign_pv=0; 31 | for i=1:nfr10 32 | if pvblk(i)==1 && sign_pv==0 33 | nstart=i; 34 | sign_pv=1; % a pitch segment starts 35 | elseif (pvblk(i)==0 || i==nfr10) && sign_pv==1 36 | nstop=i-1; % a pitch segment ends 37 | if i==nfr10; nstop=i; end 38 | sign_pv=0; 39 | 40 | %if nstart>1 && nstopDsmth_thres*vadThres 79 | snre_vad(j)=1; 80 | end 81 | end 82 | end 83 | end 84 | pv_vad=snre_vad; 85 | 86 | nexpl=33; 87 | nexpr=47; % 29 and 39, estimated statistically, 95% ; 33, 47 %98 for voicebox pitch 88 | sign_vad=0; 89 | for i=1:nfr10 90 | if snre_vad(i)==1 && sign_vad==0 91 | nstart=i; 92 | sign_vad=1; 93 | elseif (snre_vad(i)==0 || i==nfr10) && sign_vad==1 94 | nstop=i-1; 95 | if i==nfr10; nstop=i; end 96 | sign_vad=0; 97 | for j=nstart:nstop 98 | if pv01(j)==1 99 | break; 100 | end 101 | end 102 | pv_vad(nstart:max(j-nexpl-1,1))=0; % beyond 33 frames to the left, non speech 103 | for j=0:(nstop-nstart) 104 | if pv01(nstop-j)==1 105 | break; 106 | end 107 | end 108 | pv_vad(nstop-j+1+nexpr:nstop)=0; % beyond 47 frames to the right, non speech 109 | end 110 | end 111 | 112 | nexpl =5; nexpr=12; % 9 and 13, estimated statistically 5%; 5, 12 %2 for voicebox pitch 113 | sign_vad=0; 114 | for i=1:nfr10 115 | if snre_vad(i)==1 && sign_vad==0 116 | nstart=i; 117 | sign_vad=1; 118 | elseif (snre_vad(i)==0 || i==nfr10) && sign_vad==1 119 | nstop=i-1; 120 | if i==nfr10; nstop=i; end 121 | 
sign_vad=0; 122 | if sum(pv01(nstart:nstop)) > 4 123 | for j=nstart:nstop 124 | if pv01(j)==1 125 | break; 126 | end 127 | end 128 | pv_vad(max(j-nexpl,1):j-1)=1; 129 | for j=0:(nstop-nstart) 130 | if pv01(nstop-j)==1 131 | break; 132 | end 133 | end 134 | pv_vad(nstop-j+1:min(nstop-j+nexpr,nfr10))=1; 135 | end 136 | esegment=sum(e(nstart:nstop))/(nstop-nstart+1); 137 | if esegment < 0.001 138 | pv_vad(nstart:nstop)=0; 139 | end 140 | if sum(pv01(nstart:nstop)) <= 2 141 | pv_vad(nstart:nstop) = 0; 142 | end 143 | end 144 | end 145 | 146 | sign_vad=0; 147 | esum=0; 148 | for i=1:nfr10 149 | if pv_vad(i)==1 && sign_vad==0 150 | nstart=i; 151 | sign_vad=1; 152 | elseif (pv_vad(i)==0 || i==nfr10) && sign_vad==1 153 | nstop=i-1; 154 | if i==nfr10; nstop=i; end 155 | sign_vad=0; 156 | esum=esum+sum(e(nstart:nstop)); 157 | end 158 | end 159 | eave=esum/(sum(pv_vad)+eps); % average pitch segment energy over the utterance 160 | sign_vad=0; 161 | for i=1:nfr10 162 | if pv_vad(i)==1 && sign_vad==0 163 | nstart=i; 164 | sign_vad=1; 165 | elseif (pv_vad(i)==0 || i==nfr10) && sign_vad==1 166 | nstop=i-1; 167 | if i==nfr10; nstop=i; end 168 | sign_vad=0; 169 | % if sum(e(nstart:nstop))/(nstop-nstart+1)=3 && ~isempty(pp) % update fields from pp input 96 | qqn=fieldnames(qq); 97 | for i=1:length(qqn) 98 | if isfield(pp,qqn{i}) 99 | qq.(qqn{i})=pp.(qqn{i}); 100 | end 101 | end 102 | end 103 | pslp=repmat(qq.psini,1,nrf); % initialize smoothed speech presence prob 104 | xt=[]; % initialize just in case the first call has no data 105 | end 106 | 107 | % unpack parameters needed within the loop 108 | 109 | psthr=qq.psthr; % threshold for smoothed speech probability [0.99] (24) 110 | pnsaf=qq.pnsaf; % noise probability safety value [0.01] (24) 111 | 112 | % derived algorithm constants 113 | 114 | ax=exp(-tinc/qq.tax); % noise output smoothing factor = 0.8 (8) 115 | axc=1-ax; 116 | ap=exp(-tinc/qq.tap); % noise output smoothing factor = 0.9 (23) 117 | apc=1-ap; 118 | 
xih1=10^(qq.asnr/10); % speech-present SNR 119 | xih1r=1/(1+xih1)-1; 120 | pfac=(1/qq.pspri-1)*(1+xih1); % p(noise)/p(speech) (18) 121 | 122 | if nrcum==0 && nr>0 % initialize values for first frame 123 | xt=qq.psini*mean(yf(1:max(1,min(nr,round(1+qq.tavini/tinc))),:),1); % initial noise estimate 124 | end 125 | 126 | % loop for each frame 127 | for t=1:nr 128 | yft=yf(t,:); % noisy speech power spectrum 129 | ph1y=(1+pfac*exp(xih1r*yft./xt)).^(-1); % a-posteriori speech presence prob (18) 130 | pslp=ap*pslp+apc*ph1y; % smoothed speech presence prob (23) 131 | ph1y=min(ph1y,1-pnsaf*(pslp>psthr)); % limit ph1y (24) 132 | xtr=(1-ph1y).*yft+ph1y.*xt; % estimated raw noise spectrum (22) 133 | xt=ax*xt+axc*xtr; % smooth the noise estimate (8) 134 | x(t,:)=xt; % save the noise estimate 135 | end 136 | if nargout>1 % we need to store the state for next time 137 | zo.nrcum=nrcum+nr; % number of frames so far 138 | zo.xt=xt; % smoothed power spectrum 139 | zo.pslp=pslp; % correction factor (9) 140 | zo.tinc=tinc; % must be the last one 141 | zo.qq=qq; 142 | end 143 | if ~nargout 144 | clf; 145 | subplot(212); 146 | plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)])) 147 | ylabel('Frame Energy (dB)'); 148 | xlabel(sprintf('Time (s) [%d ms frame incr]',round(tinc*1000))); 149 | axisenlarge([-1 -1.05]); 150 | legend('input','noise','Location','Best'); 151 | subplot(211); 152 | plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr])) 153 | ylabel('Power (dB)'); 154 | xlabel('Frequency bin'); 155 | axisenlarge([-1 -1.05]); 156 | legend('input','noise','Location','Best'); 157 | end 158 | end 159 | 160 | -------------------------------------------------------------------------------- /rVAD2.0/gaussmixp.m: -------------------------------------------------------------------------------- 1 | function [lp,rp,kh,kp]=gaussmixp(y,m,v,w,a,b) 2 | %GAUSSMIXP calculate probability densities from a Gaussian mixture model 3 | % 4 | % Inputs: n data values, k mixtures, p parameters, q data vector size 5 
| % 6 | % Y(n,q) = input data 7 | % M(k,p) = mixture means for x(p) 8 | % V(k,p) or V(p,p,k) variances (diagonal or full) 9 | % W(k,1) = weights 10 | % A(q,p), B(q) = transformation: y=x*a'+b' (where y and x are row vectors) 11 | % if A is omitted, it is assumed to be the first q rows of the 12 | % identity matrix. B defaults to zero. 13 | % Note that most commonly, q=p and A and B are omitted entirely. 14 | % 15 | % Outputs 16 | % 17 | % LP(n,1) = log probability of each data point 18 | % RP(n,k) = relative probability of each mixture 19 | % KH(n,1) = highest probability mixture 20 | % KP(n,1) = relative probability of highest probability mixture 21 | 22 | % Copyright (C) Mike Brookes 2000-2009 23 | % Version: $Id: gaussmixp.m 713 2011-10-16 14:45:43Z dmb $ 24 | % 25 | % VOICEBOX is a MATLAB toolbox for speech processing. 26 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 27 | % 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | % This program is free software; you can redistribute it and/or modify 30 | % it under the terms of the GNU General Public License as published by 31 | % the Free Software Foundation; either version 2 of the License, or 32 | % (at your option) any later version. 33 | % 34 | % This program is distributed in the hope that it will be useful, 35 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 36 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 37 | % GNU General Public License for more details. 38 | % 39 | % You can obtain a copy of the GNU General Public License from 40 | % http://www.gnu.org/copyleft/gpl.html or by writing to 41 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 
42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 43 | [n,q]=size(y); 44 | [k,p]=size(m); 45 | 46 | if nargin<4 47 | w=repmat(1/k,k,1); 48 | if nargin<3 49 | v=ones(k,p); 50 | end 51 | end 52 | fv=ndims(v)>2 || size(v,1)>k; % full covariance matrix is requested 53 | if nargin>4 % need to modify distribution means 54 | if nargin>5 55 | m=m*a'+repmat(b',k,1); 56 | else 57 | m=m*a'; 58 | end 59 | v1=v; 60 | v=zeros(q,q,k); 61 | if fv 62 | for ik=1:k 63 | v(:,:,ik)=a*v1(:,:,ik)*a'; 64 | end 65 | else 66 | for ik=1:k 67 | v(:,:,ik)=(a.*repmat(v1(ik,:),q,1))*a'; 68 | end 69 | fv=1; 70 | end 71 | elseif q

rix)=[]; % index of lower triangular elements 137 | lixi=zeros(q,q); 138 | lixi(lix)=1:pl; 139 | lixi=lixi'; 140 | lixi(lix)=1:pl; % reverse index to build full matrices 141 | v=reshape(v,q^2,k); 142 | v=v(lix,:)'; % lower triangular in rows 143 | 144 | % If data size is large then do calculations in chunks 145 | 146 | nb=min(n,max(1,floor(memsize/(24*q*k)))); % chunk size for testing data points 147 | nl=ceil(n/nb); % number of chunks 148 | jx0=n-(nl-1)*nb; % size of first chunk 149 | wnb=ones(1,nb); 150 | wnj=ones(1,jx0); 151 | 152 | vi=zeros(q*k,q); % stack of k inverse cov matrices each size q*q 153 | vim=zeros(q*k,1); % stack of k vectors of the form inv(v)*m 154 | mtk=vim; % stack of k vectors of the form m 155 | lvm=zeros(k,1); 156 | wpk=repmat((1:q)',k,1); 157 | 158 | for ik=1:k 159 | 160 | % these lines added for debugging only 161 | % vk=reshape(v(k,lixi),q,q); 162 | % condk(ik)=cond(vk); 163 | %%%%%%%%%%%%%%%%%%%% 164 | [uvk,dvk]=eig(reshape(v(ik,lixi),q,q)); % convert lower triangular to full and find eigenvalues 165 | dvk=diag(dvk); 166 | vik=-0.5*uvk*diag(dvk.^(-1))*uvk'; % calculate inverse 167 | vi((ik-1)*q+(1:q),:)=vik; % vi contains all mixture inverses stacked on top of each other 168 | vim((ik-1)*q+(1:q))=vik*m(ik,:)'; % vim contains vi*m for all mixtures stacked on top of each other 169 | mtk((ik-1)*q+(1:q))=m(ik,:)'; % mtk contains all mixture means stacked on top of each other 170 | lvm(ik)=log(w(ik))-0.5*sum(log(dvk)); % vm contains the weighted sqrt of det(vi) for each mixture 171 | end 172 | % 173 | % % first do partial chunk 174 | % 175 | jx=jx0; 176 | ii=1:jx; 177 | xii=y(ii,:).'; 178 | py=reshape(sum(reshape((vi*xii-vim(:,wnj)).*(xii(wpk,:)-mtk(:,wnj)),q,jx*k),1),k,jx)+lvm(:,wnj); 179 | mx=max(py,[],1); % find normalizing factor for each data point to prevent underflow when using exp() 180 | px=exp(py-mx(wk,:)); % find normalized probability of each mixture for each datapoint 181 | ps=sum(px,1); % total normalized likelihood of each 
data point 182 | rp(ii,:)=(px./ps(wk,:))'; % relative mixture probabilities for each data point (columns sum to 1) 183 | lp(ii)=log(ps)+mx; 184 | 185 | for il=2:nl 186 | ix=jx+1; 187 | jx=jx+nb; % increment upper limit 188 | ii=ix:jx; 189 | xii=y(ii,:).'; 190 | py=reshape(sum(reshape((vi*xii-vim(:,wnb)).*(xii(wpk,:)-mtk(:,wnb)),q,nb*k),1),k,nb)+lvm(:,wnb); 191 | mx=max(py,[],1); % find normalizing factor for each data point to prevent underflow when using exp() 192 | px=exp(py-mx(wk,:)); % find normalized probability of each mixture for each datapoint 193 | ps=sum(px,1); % total normalized likelihood of each data point 194 | rp(ii,:)=(px./ps(wk,:))'; % relative mixture probabilities for each data point (columns sum to 1) 195 | lp(ii)=log(ps)+mx; 196 | end 197 | end 198 | lp=lp-0.5*q*log(2*pi); 199 | if nargout >2 200 | [kp,kh]=max(rp,[],2); 201 | end -------------------------------------------------------------------------------- /rVAD2.0/voicebox.m: -------------------------------------------------------------------------------- 1 | function y=voicebox(f,v) 2 | %VOICEBOX set global parameters for Voicebox functions Y=(FIELD,VAL) 3 | % 4 | % Inputs: F is a field name 5 | % V is a new value for the field 6 | % 7 | % Outputs: Y is set equal to the structure of parameters if the 8 | % f and v inputs are both present or both absent. If only 9 | % input f is specified, then y is set to the value of the 10 | % corresponding field or null if it doesn't exist. 11 | % 12 | % This routine contains default values for constants that are used by 13 | % other functions in the VOICEBOX toolbox. Values in the first section below, 14 | % entitled "System-dependent directory paths" should be set as follows: 15 | % PP.dir_temp directory for storing temporary files 16 | % PP.dir_data default directory to preappend to speech data file names 17 | % when the "d" option is specified in READWAV etc. 18 | % PP.shorten location of SHORTEN executable. 
SHORTEN is a proprietary file compression 19 | % algorithm that is used for some SPHERE-format files. READSPH 20 | % will try to call an external decoder if it is asked to 21 | % read such a compressed file. 22 | % PP.sfsbin location of Speech Filing System binaries. If the "c" option 23 | % is given to READSFS, it will try to create a requested item 24 | % if it is not present in the SFS file. This parameter tells it 25 | % where to find the SFS executables. 26 | % PP.sfssuffix suffix for Speech Filing System binaries. READSFS uses this parameter 27 | % to create the name of an SFS executable (see PP.sfsbin above). 28 | % Other values defined in this routine are the defaults for specific algorithm constants. 29 | % If you want to change these, please refer to the individual routines for a fuller description. 30 | 31 | % Bugs/Suggestions 32 | % (1) Could allow a * at the end of F to act as a wildcard and return/print a part structure 33 | 34 | % Copyright (C) Mike Brookes 2003 35 | % Version: $Id: voicebox.m 713 2011-10-16 14:45:43Z dmb $ 36 | % 37 | % VOICEBOX is a MATLAB toolbox for speech processing. 38 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 39 | % 40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 41 | % This program is free software; you can redistribute it and/or modify 42 | % it under the terms of the GNU General Public License as published by 43 | % the Free Software Foundation; either version 2 of the License, or 44 | % (at your option) any later version. 45 | % 46 | % This program is distributed in the hope that it will be useful, 47 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 48 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 49 | % GNU General Public License for more details. 
50 | % 51 | % You can obtain a copy of the GNU General Public License from 52 | % http://www.gnu.org/copyleft/gpl.html or by writing to 53 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | 56 | persistent PP 57 | if isempty(PP) 58 | 59 | % System-dependent directory paths and constants 60 | 61 | PP.dir_temp='F:\TEMP'; % directory for storing temporary files 62 | PP.dir_data='E:\dmb\data\speech'; % default directory to preappend to speech data file names 63 | PP.shorten='C:\bin\shorten.exe'; % location of shorten executable 64 | PP.flac='C:\bin\flac.exe'; % location of flac executable 65 | PP.sfsbin='F:\Program Files\SFS\Program'; % location of Speech Filing Sysytem binaries 66 | PP.sfssuffix='.exe'; % suffix for Speech Filing Sysytem binaries 67 | PP.memsize=50e6; % Maximum amount of temporary memory to use (Bytes) 68 | 69 | % DYPSA glottal closure identifier 70 | 71 | PP.dy_cpfrac=0.3; % presumed closed phase fraction of larynx cycle 72 | PP.dy_cproj=0.2; % cost of projected candidate 73 | PP.dy_cspurt=-0.45; % cost of a talkspurt 74 | PP.dy_dopsp=1; % Use phase slope projection (1) or not (0)? 
75 | PP.dy_ewdly=0.0008; % window delay for energy cost function term [~ energy peak delay from closure] (sec) 76 | PP.dy_ewlen=0.003; % window length for energy cost function term (sec) 77 | PP.dy_ewtaper=0.001; % taper length for energy cost function window (sec) 78 | PP.dy_fwlen=0.00045; % window length used to smooth group delay (sec) 79 | PP.dy_fxmax=500; % max larynx frequency (Hz) 80 | PP.dy_fxmin=50; % min larynx frequency (Hz) 81 | PP.dy_fxminf=60; % min larynx frequency (Hz) [used for Frobenius norm only] 82 | PP.dy_gwlen=0.0030; % group delay evaluation window length (sec) 83 | PP.dy_lpcdur=0.020; % lpc analysis frame length (sec) 84 | PP.dy_lpcn=2; % lpc additional poles 85 | PP.dy_lpcnf=0.001; % lpc poles per Hz (1/Hz) 86 | PP.dy_lpcstep=0.010; % lpc analysis step (sec) 87 | PP.dy_nbest=5; % Number of NBest paths to keep 88 | PP.dy_preemph=50; % pre-emphasis filter frequency (Hz) (to avoid preemphasis, make this very large) 89 | PP.dy_spitch=0.2; % scale factor for pitch deviation cost 90 | PP.dy_wener=0.3; % DP energy weighting 91 | PP.dy_wpitch=0.5; % DP pitch weighting 92 | PP.dy_wslope=0.1; % DP group delay slope weighting 93 | PP.dy_wxcorr=0.8; % DP cross correlation weighting 94 | PP.dy_xwlen=0.01; % cross-correlation length for waveform similarity (sec) 95 | 96 | % RAPT pitch tracker 97 | 98 | PP.rapt_f0min=50; % Min F0 (Hz) 99 | PP.rapt_f0max=500; % Max F0 (Hz) 100 | PP.rapt_tframe=0.01; % frame size (s) 101 | PP.rapt_tlpw=0.005; % low pass filter window size (s) 102 | PP.rapt_tcorw=0.0075; % correlation window size (s) 103 | PP.rapt_candtr=0.3; % minimum peak in NCCF 104 | PP.rapt_lagwt=0.3; % linear lag taper factor 105 | PP.rapt_freqwt=0.02; % cost factor for F0 change 106 | PP.rapt_vtranc=0.005; % fixed voice-state transition cost 107 | PP.rapt_vtrac=0.5; % delta amplitude modulated transition cost 108 | PP.rapt_vtrsc=0.5; % delta spectrum modulated transition cost 109 | PP.rapt_vobias=0.0; % bias to encourage voiced hypotheses 110 | 
PP.rapt_doublec=0.35; % cost of exact doubling or halving 111 | PP.rapt_absnoise=0; % absolute rms noise level 112 | PP.rapt_relnoise=2; % rms noise level relative to noise floor 113 | PP.rapt_signoise=0.001; % ratio of peak signal rms to noise floor (0.001 = 60dB) 114 | PP.rapt_ncands=20; % max hypotheses at each frame 115 | PP.rapt_trms=0.03; % window length for rms measurement 116 | PP.rapt_dtrms=0.02; % window spacing for rms measurement 117 | PP.rapt_preemph=-7000; % s-plane position of preemphasis zero 118 | PP.rapt_nfullag=7; % number of full lags to try (must be odd) 119 | 120 | % now check some of the key values for validity 121 | 122 | if exist(PP.dir_temp)~=7 % check that temp directory exists 123 | PP.dir_temp = winenvar('temp'); % else use windows temp directory 124 | end 125 | 126 | [fnp,fnn,fne]=fileparts(mfilename('fullpath')); 127 | if exist(PP.shorten)~=2 % check that shorten executable exists 128 | PP.shorten=fullfile(fnp,'shorten.exe'); % next try local directory 129 | if exist(PP.shorten)~=2 % check if it exists in local directory 130 | PP.shorten='shorten.exe'; % finally assume it is on the search path 131 | end 132 | end 133 | 134 | if exist(PP.flac)~=2 % check that flac executable exists 135 | PP.flac=fullfile(fnp,'flac.exe'); % next try local directory 136 | if exist(PP.flac)~=2 % check if it exists in local directory 137 | PP.flac='flac.exe'; % finally assume it is on the search path (must set PP.flac, not PP.shorten) 138 | end 139 | end 140 | 141 | end 142 | if nargin==0 143 | if nargout==0 144 | % list all fields 145 | nn=sort(fieldnames(PP)); 146 | cnn=char(nn); 147 | fprintf('%d Voicebox parameters:\n',length(nn)); 148 | 149 | for i=1:length(nn); 150 | if ischar(PP.(nn{i})) 151 | fmt=' %s = %s\n'; 152 | else 153 | fmt=' %s = %g\n'; 154 | end 155 | fprintf(fmt,cnn(i,:),PP.(nn{i})); 156 | end 157 | else 158 | y=PP; 159 | end 160 | elseif nargin==1 161 | if isfield(PP,f) 162 | y=PP.(f); 163 | else 164 | y=[]; 165 | end 166 | else 167 | if isfield(PP,f) 168 | PP.(f)=v; 
169 | y=PP; 170 | else 171 | error(sprintf('''%s'' is not a valid voicebox field name',f)); 172 | end 173 | end -------------------------------------------------------------------------------- /rVAD2.0/specsub.m: -------------------------------------------------------------------------------- 1 | function [ss,gg,tt,ff,zo]=specsub(si,fsz,pp) 2 | %SPECSUB performs speech enhancement using spectral subtraction [SS,ZO]=(S,FSZ,P) 3 | % 4 | % Usage: (1) y=specsub(x,fs); % enhance the speech using default parameters 5 | % 6 | % Inputs: 7 | % si input speech signal 8 | % fsz sample frequency in Hz 9 | % Alternatively, the input state from a previous call (see below) 10 | % pp algorithm parameters [optional] 11 | % 12 | % Outputs: 13 | % ss output enhanced speech 14 | % gg(t,f,i) selected time-frequency values (see pp.tf below) 15 | % tt centre of frames (in seconds) 16 | % ff centre of frequency bins (in Hz) 17 | % zo output state (or the 2nd argument if gg,tt,ff are omitted) 18 | % 19 | % The algorithm operation is controlled by a small number of parameters: 20 | % 21 | % pp.of % overlap factor = (fft length)/(frame increment) [2] 22 | % pp.ti % desired frame increment [0.016 seconds] 23 | % pp.ri % set to 1 to round ti to the nearest power of 2 samples [0] 24 | % pp.g % subtraction domain: 1=magnitude, 2=power [1] 25 | % pp.e % gain exponent [1] 26 | % pp.am % max oversubtraction factor [3] 27 | % pp.b % max noise attenutaion in power domain [0.01] 28 | % pp.al % SNR for oversubtraction=am (set this to Inf for fixed a) [-5 dB] 29 | % pp.ah % SNR for oversubtraction=1 [20 dB] 30 | % pp.ne % noise estimation: 0=min statistics, 1=MMSE [0] 31 | % pp.bt % threshold for binary gain or -1 for continuous gain [-1] 32 | % pp.mx % input mixture gain [0] 33 | % pp.gh % maximum gain for noise floor [1] 34 | % pp.rf % round output signal to an exact number of frames [0] 35 | % pp.tf % selects time-frequency planes to output in the gg() variable ['g'] 36 | % 'i' = input power spectrum 
37 | % 'I' = input complex spectrum 38 | % 'n' = noise power spectrum 39 | % 'g' = gain 40 | % 'o' = output power spectrum 41 | % 'O' = output complex spectrum 42 | % 43 | % Following [1], the magnitude-domain gain in each time-frequency bin is given by 44 | % gain=mx+(1-mx)*max((1-(a*N/X)^(g/2))^(e/g),min(gh,(b*N/X)^(e/2))) 45 | % where N and X are the powers of the noise and noisy speech respectively. 46 | % The oversubtraction factor varies linearly between a=am for a frame SNR of al down to 47 | % a=1 for a frame SNR of ah. To obtain a fixed value of a for all values of SNR, set al=Inf. 48 | % Common exponent combinations are: 49 | % g=1 e=1 Magnitude Domain spectral subtraction 50 | % g=2 e=1 Power Domain spectral subtraction 51 | % g=2 e=2 Wiener filtering 52 | % Many authors use the parameters alpha=a^(g/2), beta=b^(g/2) and gamma2=e/g instead of a, b and e 53 | % but this increases interdependence amongst the parameters. 54 | % If bt>=0 then the max(...) expression above is thresholded to become 0 or 1. 
55 | % 56 | % In addition it is possible to specify parameters for the noise estimation algorithm 57 | % which implements reference [2] or [3] according to the setting of pp.ne 58 | % 59 | % Minimum statistics noise estimate [2]: pp.ne=0 60 | % pp.taca % (11): smoothing time constant for alpha_c [0.0449 seconds] 61 | % pp.tamax % (3): max smoothing time constant [0.392 seconds] 62 | % pp.taminh % (3): min smoothing time constant (upper limit) [0.0133 seconds] 63 | % pp.tpfall % (12): time constant for P to fall [0.064 seconds] 64 | % pp.tbmax % (20): max smoothing time constant [0.0717 seconds] 65 | % pp.qeqmin % (23): minimum value of Qeq [2] 66 | % pp.qeqmax % max value of Qeq per frame [14] 67 | % pp.av % (23)+13 lines: fudge factor for bc calculation [2.12] 68 | % pp.td % time to take minimum over [1.536 seconds] 69 | % pp.nu % number of subwindows to use [3] 70 | % pp.qith % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ] 71 | % pp.nsmdb % corresponding noise slope thresholds in dB/second [47 31.4 15.7 4.1] 72 | % 73 | % MMSE noise estimate [3]: pp.ne=1 74 | % pp.tax % smoothing time constant for noise power estimate [0.0717 seconds](8) 75 | % pp.tap % smoothing time constant for smoothed speech prob [0.152 seconds](23) 76 | % pp.psthr % threshold for smoothed speech probability [0.99] (24) 77 | % pp.pnsaf % noise probability safety value [0.01] (24) 78 | % pp.pspri % prior speech probability [0.5] (18) 79 | % pp.asnr % active SNR in dB [15] (18) 80 | % pp.psini % initial speech probability [0.5] (23) 81 | % pp.tavini % assumed speech absent time at start [0.064 seconds] 82 | % 83 | % If convenient, you can call specsub in chunks of arbitrary size. 
Thus the following are equivalent: 84 | % 85 | % (a) y=specsub(s,fs); 86 | % 87 | % (b) [y1,z]=specsub(s(1:1000),fs); 88 | % [y2,z]=specsub(s(1001:2000),z); 89 | % y3=specsub(s(2001:end),z); 90 | % y=[y1; y2; y3]; 91 | % 92 | % If the number of output arguments is either 2 or 5, the last partial frame of samples will 93 | % be retained for overlap adding with the output from the next call to specsub(). 94 | % 95 | % See also ssubmmse() for an alternative gain function 96 | % 97 | % Refs: 98 | % [1] M. Berouti, R. Schwartz and J. Makhoul 99 | % Enhancement of speech corrupted by acoustic noise 100 | % Proc IEEE ICASSP, 1979, 4, 208-211 101 | % [2] Rainer Martin. 102 | % Noise power spectral density estimation based on optimal smoothing and minimum statistics. 103 | % IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001. 104 | % [3] Gerkmann, T. & Hendriks, R. C. 105 | % Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay 106 | % IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393 107 | 108 | % Copyright (C) Mike Brookes 2004 109 | % Version: $Id: specsub.m 1720 2012-03-31 17:17:31Z dmb $ 110 | % 111 | % VOICEBOX is a MATLAB toolbox for speech processing. 112 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 113 | % 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | % This program is free software; you can redistribute it and/or modify 116 | % it under the terms of the GNU General Public License as published by 117 | % the Free Software Foundation; either version 2 of the License, or 118 | % (at your option) any later version. 119 | % 120 | % This program is distributed in the hope that it will be useful, 121 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 122 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 123 | % GNU General Public License for more details. 
124 | % 125 | % You can obtain a copy of the GNU General Public License from 126 | % http://www.gnu.org/copyleft/gpl.html or by writing to 127 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 128 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 129 | if numel(si)>length(si) 130 | error('Input speech signal must be a vector not a matrix'); 131 | end 132 | if isstruct(fsz) 133 | fs=fsz.fs; 134 | qq=fsz.qq; 135 | qp=fsz.qp; 136 | ze=fsz.ze; 137 | s=zeros(length(fsz.si)+length(si(:)),1); % allocate space for speech 138 | s(1:length(fsz.si))=fsz.si; 139 | s(length(fsz.si)+1:end)=si(:); 140 | else 141 | fs=fsz; % sample frequency 142 | s=si(:); 143 | % default algorithm constants 144 | 145 | qq.of=2; % overlap factor = (fft length)/(frame increment) 146 | qq.ti=16e-3; % desired frame increment (16 ms) 147 | qq.ri=0; % round ni to the nearest power of 2 148 | qq.g=1; % subtraction domain: 1=magnitude, 2=power 149 | qq.e=1; % gain exponent 150 | qq.am=3; % max oversubtraction factor 151 | qq.b=0.01; % noise floor 152 | qq.al=-5; % SNR for maximum a (set to Inf for fixed a) 153 | qq.ah=20; % SNR for minimum a 154 | qq.bt=-1; % suppress binary masking 155 | qq.ne=0; % noise estimation: 0=min statistics, 1=MMSE [0] 156 | qq.mx=0; % no input mixing 157 | qq.gh=1; % maximum gain 158 | qq.tf='g'; % output the gain time-frequency plane by default 159 | qq.rf=0; 160 | if nargin>=3 && ~isempty(pp) 161 | qp=pp; % save for estnoisem call 162 | qqn=fieldnames(qq); 163 | for i=1:length(qqn) 164 | if isfield(pp,qqn{i}) 165 | qq.(qqn{i})=pp.(qqn{i}); 166 | end 167 | end 168 | else 169 | qp=struct; % make an empty structure 170 | end 171 | end 172 | % derived algorithm constants 173 | if qq.ri 174 | ni=pow2(nextpow2(qq.ti*fs*sqrt(0.5))); 175 | else 176 | ni=round(qq.ti*fs); % frame increment in samples 177 | end 178 | tinc=ni/fs; % true frame increment time 179 | tf=qq.tf; 180 | rf=qq.rf || nargout==2 || nargout==5; % round down 
to an exact number of frames 181 | ne=qq.ne; % noise estimation: 0=min statistics, 1=MMSE [0] 182 | 183 | % calculate power spectrum in frames 184 | 185 | no=round(qq.of); % integer overlap factor 186 | nf=ni*no; % fft length 187 | w=sqrt(hamming(nf+1))'; w(end)=[]; % for now always use sqrt hamming window 188 | w=w/sqrt(sum(w(1:ni:nf).^2)); % normalize to give overall gain of 1 189 | if rf>0 190 | rfm=''; % truncated input to an exact number of frames 191 | else 192 | rfm='r'; 193 | end 194 | [y,tt]=enframe(s,w,ni,rfm); 195 | tt=tt/fs; % frame times 196 | yf=rfft(y,nf,2); 197 | yp=yf.*conj(yf); % power spectrum of input speech 198 | [nr,nf2]=size(yp); % number of frames 199 | ff=(0:nf2-1)*fs/nf; 200 | if isstruct(fsz) 201 | if ne>0 202 | [dp,ze]=estnoiseg(yp,ze); % estimate the noise using MMSE 203 | else 204 | [dp,ze]=estnoisem(yp,ze); % estimate the noise using minimum statistics 205 | end 206 | ssv=fsz.ssv; 207 | else 208 | if ne>0 209 | [dp,ze]=estnoiseg(yp,tinc,qp); % estimate the noise using MMSE 210 | else 211 | [dp,ze]=estnoisem(yp,tinc,qp); % estimate the noise using minimum statistics 212 | end 213 | ssv=zeros(ni*(no-1),1); % dummy saved overlap 214 | end 215 | if ~nr % no data frames 216 | ss=[]; 217 | gg=[]; 218 | else 219 | mz=yp==0; % mask for zero power time-frequency bins (unlikely) 220 | if qq.al=(af+bf).^(-1); % mask for noise floor limiting 244 | g=zeros(size(v)); % reserve space for gain matrix 245 | eg=qq.e/qq.g; % gain exponent relative to subtraction domain 246 | gh=qq.gh; 247 | switch eg 248 | case 1 % Normal case 249 | g(mf)=min(bf*v(mf),gh); % never give a gain > 1 250 | g(~mf)=1-af(~mf).*v(~mf); 251 | case 0.5 252 | g(mf)=min(sqrt(bf*v(mf)),gh); 253 | g(~mf)=sqrt(1-af(~mf).*v(~mf)); 254 | otherwise 255 | g(mf)=min((bf*v(mf)).^eg,gh); 256 | g(~mf)=(1-af(~mf).*v(~mf)).^eg; 257 | end 258 | if qq.bt>=0 259 | g=g>qq.bt; 260 | end 261 | g=qq.mx+(1-qq.mx)*g; % mix in some of the input 262 | se=(irfft((yf.*g).',nf).').*repmat(w,nr,1); % inverse 
dft and apply output window 263 | ss=zeros(ni*(nr+no-1),no); % space for overlapped output speech 264 | ss(1:ni*(no-1),end)=ssv; 265 | for i=1:no 266 | nm=nf*(1+floor((nr-i)/no)); % number of samples in this set 267 | ss(1+(i-1)*ni:nm+(i-1)*ni,i)=reshape(se(i:no:nr,:)',nm,1); 268 | end 269 | ss=sum(ss,2); 270 | if nargout>2 && ~isempty(tf) 271 | gg=zeros(nr,nf2,length(tf)); % make space 272 | for i=1:length(tf) 273 | switch tf(i) 274 | case 'i' % 'i' = input power spectrum 275 | gg(:,:,i)=yp; 276 | case 'I' % 'i' = input power spectrum 277 | gg(:,:,i)=yf; 278 | case 'n' % 'n' = noise power spectrum 279 | gg(:,:,i)=dp; 280 | case 'g' % 'g' = gain 281 | gg(:,:,i)=g; 282 | case 'o' % 'o' = output power spectrum 283 | gg(:,:,i)=yp.*g.^2; 284 | case 'O' % 'o' = output power spectrum 285 | gg(:,:,i)=yf.*g; 286 | end 287 | end 288 | end 289 | end 290 | if nargout==2 || nargout==5 291 | if nr 292 | zo.ssv=ss(end-ni*(no-1)+1:end); % save the output tail for next time 293 | ss(end-ni*(no-1)+1:end)=[]; 294 | else 295 | zo.ssv=ssv; % 296 | end 297 | zo.si=s(length(ss)+1:end); % save the tail end of the input speech signal 298 | zo.fs=fs; % save sample frequency 299 | zo.qq=qq; % save local parameters 300 | zo.qp=qp; % save estnoisem parameters 301 | zo.ze=ze; % save state of noise estimation 302 | if nargout==2 303 | gg=zo; % 2nd of two arguments is zo 304 | end 305 | elseif rf==0 306 | ss=ss(1:length(s)); % trim to the correct length if not an exact number of frames 307 | end 308 | if ~nargout && nr>0 309 | ffax=ff/1000; ax=zeros(4,1); 310 | ax(1)=subplot(223); 311 | imagesc(tt,ffax,20*log10(g)'); 312 | colorbar; 313 | axis('xy'); 314 | if qq.al==Inf 315 | title(sprintf('Filter Gain (dB): a=%.2g, b=%.3g',qq.am,qq.b)); 316 | else 317 | title(sprintf('Filter Gain (dB): a=%.2g (%.0f to %.0fdB), b=%.3g',qq.am,qq.al,qq.ah,qq.b)); 318 | end 319 | xlabel('Time (s)'); 320 | ylabel('Frequency (kHz)'); 321 | 322 | ax(2)=subplot(222); 323 | imagesc(tt,ffax,10*log10(yp)'); 324 | 
colorbar; 325 | axis('xy'); 326 | title('Noisy Speech (dB)'); 327 | xlabel('Time (s)'); 328 | ylabel('Frequency (kHz)'); 329 | 330 | ax(3)=subplot(224); 331 | imagesc(tt,ffax,10*log10(yp.*g.^2)'); 332 | colorbar; 333 | axis('xy'); 334 | title(sprintf('Enhanced Speech (dB): g=%.2g, e=%.3g',qq.g,qq.e)); 335 | xlabel('Time (s)'); 336 | ylabel('Frequency (kHz)'); 337 | 338 | ax(4)=subplot(221); 339 | imagesc(tt,ffax,10*log10(dp)'); 340 | colorbar; 341 | axis('xy'); 342 | title('Noise Estimate (dB)'); 343 | xlabel('Time (s)'); 344 | ylabel('Frequency (kHz)'); 345 | linkaxes(ax); 346 | end -------------------------------------------------------------------------------- /rVADfast_py_2.0/speechproc.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy 3 | import sys 4 | import os 5 | import math 6 | import struct 7 | import scipy.io.wavfile as wav 8 | from scipy.fftpack import dct 9 | from scipy.signal import lfilter 10 | from copy import deepcopy 11 | import code 12 | 13 | # Refs: 14 | # [1] Z.-H. Tan, A.k. Sarkara and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020. 15 | # [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection." 16 | # IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010. 
17 | 18 | # Version: 2.0 19 | # 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan 20 | 21 | def speech_wave(fileName_): 22 | 23 | (fs,sig) = wav.read(fileName_) 24 | if sig.dtype == 'int16': 25 | nb = 16 # -> 16-bit wav files 26 | elif sig.dtype == 'int32': 27 | nb = 32 # -> 32-bit wav files 28 | max_nb = float(2 ** (nb - 1)) 29 | sig = sig / (max_nb + 1.0) 30 | return fs, sig 31 | 32 | def enframe(speech, fs, winlen, ovrlen): 33 | 34 | N, flth, foVr = len(speech), int(numpy.fix(fs*winlen)), int(numpy.fix(fs*ovrlen)) 35 | 36 | if len(speech) < flth: 37 | print("speech file length shorter than window length") 38 | exit() 39 | 40 | 41 | frames = int(numpy.ceil( (N - flth + foVr)/foVr)) 42 | slen = (frames-1)*foVr + flth 43 | 44 | 45 | if len(speech) < slen: 46 | signal = numpy.concatenate((speech, numpy.zeros((slen - N)))) 47 | 48 | else: 49 | signal = deepcopy(speech) 50 | 51 | 52 | idx = numpy.tile(numpy.arange(0,flth),(frames,1)) + numpy.tile(numpy.arange(0,(frames)*foVr,foVr),(flth,1)).T 53 | idx = numpy.array(idx,dtype=numpy.int64) 54 | 55 | 56 | return signal[idx] 57 | 58 | 59 | def sflux(data, fs, winlen, ovrlen, nftt): 60 | 61 | eps=numpy.finfo(float).eps 62 | 63 | xf=enframe(data, fs, winlen, ovrlen) #framing 64 | w = numpy.matrix(numpy.hamming(int(fs*winlen)) ) 65 | w = numpy.tile(w,(numpy.size(xf, axis=0), 1)) 66 | 67 | xf = numpy.multiply (xf, w) #apply window 68 | #fft 69 | ak=numpy.abs(numpy.fft.fft(xf,nftt)) 70 | idx = range(0,int(nftt/2) +1) 71 | ak=ak[:,idx] 72 | Num=numpy.exp( float(1/len(idx)) * numpy.sum(numpy.log(ak+eps), axis=1) ) 73 | Den=float(1/len(idx)) * numpy.sum(ak, axis=1) 74 | 75 | ft=(Num+eps)/(Den+eps) 76 | 77 | 78 | flen, fsh10 = int(numpy.fix(fs*winlen)), int(numpy.fix(fs*ovrlen)) 79 | nfr10=int(numpy.floor((len(data)-(flen-fsh10))/fsh10)) 80 | 81 | #syn frames as per nfr10 82 | if nfr10 < len(ft): 83 | ft=ft[range(nfr10)] 84 | else: 85 | ft = numpy.concatenate((ft, numpy.repeat(ft[:1], nfr10 -len(ft), axis=0) )) 86 | 87 | 88 | 
89 | return ft, flen, fsh10, nfr10 90 | 91 | 92 | def snre_highenergy(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk): 93 | 94 | ## ---*******- important ******* 95 | #here [0] index array element has not used 96 | 97 | Dexpl=18; Dexpr=18 ; segThres = 0.25 98 | 99 | fdata_=deepcopy(fdata) ; pv01_=deepcopy(pv01) ; pvblk_=deepcopy(pvblk) 100 | 101 | fdata_=numpy.insert(fdata_,0,'inf') 102 | pv01_=numpy.insert(pv01_,0,'inf') 103 | pvblk_=numpy.insert(pvblk_,0,'inf') 104 | 105 | 106 | #energy estimation 107 | e=numpy.zeros(nfr10, dtype='float64') 108 | e=numpy.insert(e,0,'inf') 109 | 110 | for i in range(1, nfr10+1): 111 | for j in range(1, flen+1): 112 | e[i]=e[i]+numpy.square(fdata_[(i-1)*fsh10+j]) 113 | 114 | if numpy.less_equal(e[i], ENERGYFLOOR): 115 | e[i]=ENERGYFLOOR 116 | 117 | emin=numpy.ones(nfr10) 118 | emin=numpy.insert(emin,0,'inf') 119 | NESEG = 200 120 | 121 | if numpy.less(nfr10, NESEG): 122 | NESEG=nfr10 123 | 124 | for i in range(1, int(numpy.floor(nfr10/NESEG))+1): 125 | eY=numpy.sort(e[range((i-1)*NESEG+1, (i*NESEG)+1)]) 126 | eY=numpy.insert(eY,0,'inf') 127 | 128 | emin[range( (i-1)*NESEG+1, i*NESEG+1)]=eY[int(numpy.floor(NESEG*0.1))] 129 | if numpy.not_equal(i, 1): 130 | emin[range((i-1)*NESEG+1,i*NESEG+1)]=0.9*emin[(i-1)*NESEG]+0.1*emin[(i-1)*NESEG+1] 131 | 132 | if numpy.not_equal(i*NESEG, nfr10): 133 | eY=numpy.sort(e[range((i-1)*NESEG+1, nfr10+1)]) 134 | eY=numpy.insert(eY,0,'inf') 135 | 136 | emin[range(i*NESEG+1,nfr10+1)]=eY[int(numpy.floor((nfr10-(i-1)*NESEG)*0.1))] 137 | emin[range(i*NESEG+1,nfr10+1)]=0.9*emin[i*NESEG]+0.1*emin[i*NESEG+1] 138 | 139 | 140 | D=numpy.zeros(nfr10) 141 | D=numpy.insert(D,0,'inf') 142 | 143 | postsnr=numpy.zeros(nfr10) 144 | postsnr=numpy.insert(postsnr,0,'inf') 145 | 146 | for i in range(2, nfr10+1): 147 | postsnr[i] =numpy.log10(e[i])-numpy.log10(emin[i]) 148 | if numpy.less(postsnr[i],0): 149 | postsnr[i]=0 150 | 151 | D[i]=numpy.sqrt(numpy.abs(e[i]-e[i-1])*postsnr[i]) 152 | D[1]=D[2] 153 | 154 | 155 | 
156 | tm1 = numpy.hstack((numpy.ones(Dexpl)*D[1], D[1:len(D)])) 157 | Dexp = numpy.hstack((tm1, numpy.ones(Dexpr)*D[nfr10] )) 158 | Dexp = numpy.insert(Dexp,0,'inf') 159 | 160 | Dsmth=numpy.zeros(nfr10, dtype='float64') 161 | Dsmth=numpy.insert(Dsmth,0,'inf') 162 | 163 | Dsmth_max=deepcopy(Dsmth) 164 | 165 | 166 | for i in range(1,nfr10+1): 167 | Dsmth[i]=sum(Dexp[range(i, i+Dexpl+Dexpr+1)]) 168 | 169 | for i in range(1, int(numpy.floor(nfr10/NESEG))+1): 170 | Dsmth_max[range((i-1)*NESEG+1, i*NESEG+1)]= numpy.amax(e[range((i-1)*NESEG+1, i*NESEG+1)]); #numpy.amax(Dsmth[range((i-1)*NESEG+1, i*NESEG+1)]) 171 | 172 | 173 | if numpy.not_equal(i*NESEG, nfr10): 174 | Dsmth_max[range(i*NESEG+1, nfr10+1)]=numpy.amax(e[range((i-1)*NESEG+1, nfr10+1)]) #numpy.amax(Dsmth[range((i-1)*NESEG+1, nfr10+1)]) 175 | 176 | snre_vad = numpy.zeros(nfr10) 177 | snre_vad=numpy.insert(snre_vad,0,'inf') 178 | 179 | for i in range(1, nfr10+1): 180 | if numpy.greater(Dsmth[i], Dsmth_max[i]*segThres): 181 | snre_vad[i]=1 182 | 183 | #block based processing to remove noise part by using snre_vad1. 
184 | sign_vad = 0 185 | noise_seg=numpy.zeros(int(numpy.floor(nfr10/1.6))) ; noise_seg=numpy.insert(noise_seg,0,'inf') 186 | 187 | noise_samp=numpy.zeros((nfr10,2)) 188 | n_noise_samp=-1 189 | 190 | for i in range(1, nfr10+1): 191 | if (snre_vad[i] == 1) and (sign_vad == 0): #% start of a segment 192 | sign_vad = 1 193 | nstart=i 194 | elif ((snre_vad[i] ==0) or (i==nfr10)) and (sign_vad == 1): # % end of a segment 195 | sign_vad = 0 196 | nstop=i-1 197 | if numpy.equal(sum(pv01_[range(nstart, nstop+1)]), 0): 198 | noise_seg[range(int(numpy.round(nstart/1.6)), int(numpy.floor(nstop/1.6))+1)] = 1 199 | n_noise_samp=n_noise_samp+1 200 | noise_samp[n_noise_samp,:]=numpy.array([(nstart-1)*fsh10+1, nstop*fsh10]) 201 | 202 | noise_samp=noise_samp[:n_noise_samp+1,] 203 | 204 | #syn from [0] index 205 | noise_samp=noise_samp-1 206 | noise_seg=noise_seg[1:len(noise_seg)] 207 | 208 | return noise_samp, noise_seg, len(noise_samp) 209 | 210 | 211 | 212 | 213 | def snre_vad(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres): 214 | 215 | ## ---*******- important ******* 216 | #here [0] index array element has not used 217 | 218 | Dexpl, Dexpr=18, 18 219 | Dsmth=numpy.zeros(nfr10, dtype='float64'); Dsmth=numpy.insert(Dsmth,0,'inf') 220 | 221 | fdata_=deepcopy(fdata) 222 | pv01_=deepcopy(pv01) 223 | pvblk_=deepcopy(pvblk) 224 | 225 | fdata_=numpy.insert(fdata_,0,'inf') 226 | pv01_=numpy.insert(pv01_,0,'inf') 227 | pvblk_=numpy.insert(pvblk_,0,'inf') 228 | 229 | 230 | #energy estimation 231 | e=numpy.zeros(nfr10, dtype='float64') 232 | e=numpy.insert(e,0,'inf') 233 | 234 | for i in range(1, nfr10+1): 235 | for j in range(1, flen+1): 236 | e[i]=e[i]+ numpy.square(fdata_[(i-1)*fsh10+j]) 237 | 238 | if numpy.less_equal(e[i], ENERGYFLOOR): 239 | e[i]=ENERGYFLOOR 240 | 241 | 242 | segsnr=numpy.zeros(nfr10); segsnr=numpy.insert(segsnr,0,'inf') 243 | segsnrsmth=1 244 | sign_segsnr=0 245 | D=numpy.zeros(nfr10); D=numpy.insert(D,0,'inf') 246 | postsnr=numpy.zeros(nfr10, 
dtype='float64'); postsnr=numpy.insert(postsnr,0,'inf') 247 | snre_vad=numpy.zeros(nfr10); snre_vad=numpy.insert(snre_vad,0,'inf') 248 | sign_pv=0 249 | 250 | 251 | 252 | 253 | for i in range(1, nfr10+1): 254 | 255 | if (pvblk_[i]==1) and (sign_pv==0): 256 | nstart=i 257 | sign_pv=1 258 | 259 | elif ( (pvblk_[i]==0) or (i==nfr10) ) and (sign_pv==1): 260 | 261 | nstop=i-1 262 | if i==nfr10: 263 | nstop=i 264 | sign_pv=0 265 | datai=fdata_[range( (nstart-1)*fsh10+1, (nstop-1)*fsh10+flen-fsh10+1) ] 266 | datai=numpy.insert(datai,0,'inf') 267 | 268 | for j in range(nstart, nstop-1+1): #previously it was for j=nstart:nstop-1 269 | for h in range(1, flen+1): 270 | e[j]=e[j]+ numpy.square(datai[(j-nstart)*fsh10+h] ) 271 | if numpy.less_equal(e[j], ENERGYFLOOR): 272 | e[j]=ENERGYFLOOR 273 | 274 | e[nstop]=e[nstop-1] 275 | 276 | 277 | eY=numpy.sort(e[range(nstart, nstop+1)] ) 278 | eY=numpy.insert(eY,0,'inf') #as [0] is discarding 279 | 280 | emin=eY[int(numpy.floor((nstop-nstart+1)*0.1))] 281 | 282 | 283 | 284 | 285 | for j in range(nstart+1, nstop+1): 286 | 287 | postsnr[j] =math.log10(e[j]) - math.log10(emin) 288 | 289 | if numpy.less(postsnr[j], 0): 290 | postsnr[j]=0 291 | 292 | D[j]=math.sqrt(numpy.abs(e[j]-e[j-1])*postsnr[j] ) 293 | 294 | D[nstart]=D[nstart+1] 295 | 296 | 297 | tm1 = numpy.hstack((numpy.ones(Dexpl)*D[nstart], D[range(nstart, nstop+1)])) 298 | Dexp = numpy.hstack((tm1, numpy.ones(Dexpr)*D[nstop] )) 299 | 300 | Dexp = numpy.insert(Dexp,0,'inf') 301 | 302 | for j in range(0, nstop-nstart+1): 303 | Dsmth[nstart+j]=sum(Dexp[range(j+1, j+Dexpl+Dexpr+1)]) 304 | 305 | Dsmth_thres=sum(Dsmth[range(nstart, nstop+1)]*pv01_[range(nstart, nstop+1)])/sum(pv01_[range(nstart,nstop+1)]) 306 | 307 | for j in range(nstart, nstop+1): 308 | if numpy.greater(Dsmth[j], Dsmth_thres*vadThres): 309 | snre_vad[j]=1 310 | 311 | # 312 | pv_vad=deepcopy(snre_vad) 313 | 314 | 315 | nexpl=33 316 | nexpr=47 # % 29 and 39, estimated statistically, 95% ; 33, 47 %98 for voicebox pitch 
317 | sign_vad=0 318 | for i in range(1, nfr10+1): 319 | if (snre_vad[i]==1) and (sign_vad==0): 320 | nstart=i 321 | sign_vad=1 322 | elif ((snre_vad[i]==0) or (i==nfr10)) and (sign_vad==1): 323 | nstop=i-1 324 | if i==nfr10: 325 | nstop=i 326 | sign_vad=0 327 | for j in range(nstart, nstop+1): 328 | if pv01_[j]==1: 329 | break 330 | 331 | 332 | pv_vad[range(nstart, numpy.max([j-nexpl-1,1])+1)]=0 333 | 334 | for j in range(0, nstop-nstart+1): 335 | if pv01_[nstop-j]==1: 336 | break 337 | 338 | 339 | pv_vad[range(nstop-j+1+nexpr,nstop+1)]=0 340 | 341 | nexpl =5; nexpr=12 #; % 9 and 13, estimated statistically 5%; 5, 12 %2 for voicebox pitch 342 | sign_vad=0 343 | for i in range(1,nfr10+1): 344 | if (snre_vad[i]==1) and (sign_vad==0): 345 | nstart=i 346 | sign_vad=1 347 | elif ((snre_vad[i]==0) or (i==nfr10) ) and (sign_vad==1): 348 | nstop=i-1 349 | if i==nfr10: 350 | nstop=i 351 | sign_vad=0 352 | 353 | if numpy.greater(sum(pv01_[range(nstart,nstop+1)]), 4): 354 | for j in range(nstart,nstop+1): 355 | if pv01_[j]==1: 356 | break 357 | 358 | pv_vad[range(numpy.maximum(j-nexpl,1),j-1+1)]=1 359 | for j in range(0,nstop-nstart+1): 360 | if pv01_[nstop-j]==1: 361 | break 362 | pv_vad[range(nstop-j+1,min(nstop-j+nexpr,nfr10)+1)]=1 363 | 364 | 365 | esegment=sum(e[range(nstart,nstop+1)])/(nstop-nstart+1) 366 | if numpy.less(esegment, 0.001): 367 | pv_vad[range(nstart, nstop+1)]=0 368 | 369 | if numpy.less_equal(sum(pv01_[range(nstart,nstop+1)]), 2): 370 | pv_vad[range(nstart,nstop+1)] = 0 371 | 372 | 373 | sign_vad=0 374 | esum=0 375 | for i in range(1,nfr10+1): 376 | if (pv_vad[i]==1) and (sign_vad==0): 377 | nstart=i 378 | sign_vad=1 379 | elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1): 380 | nstop=i-1 381 | if i==nfr10: 382 | nstop=i 383 | sign_vad=0 384 | esum=esum+sum(e[range(nstart, nstop+1)]) 385 | 386 | # 387 | eps = numpy.finfo(float).eps 388 | 389 | eave=esum/(sum(pv_vad[1:len(pv_vad)])+eps) # except [0] index 'inf' 390 | 391 | 392 | 393 | sign_vad=0 394 
| for i in range(1,nfr10+1): 395 | if (pv_vad[i]==1) and (sign_vad==0): 396 | nstart=i 397 | sign_vad=1 398 | elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1): 399 | nstop=i-1 400 | if i==nfr10: 401 | nstop=i 402 | sign_vad=0 403 | 404 | #if numpy.less(sum(e[range(nstart,nstop+1)])/(nstop-nstart+1), eave*0.05): 405 | #pv_vad[range(nstart,nstop+1)] = 0 406 | 407 | # 408 | sign_vad=0 409 | vad_seg=numpy.zeros((nfr10,2), dtype="int64") 410 | n_vad_seg=-1 #for indexing array 411 | for i in range(1,nfr10+1): 412 | if (pv_vad[i]==1) and (sign_vad==0): 413 | nstart=i 414 | sign_vad=1 415 | elif ((pv_vad[i]==0) or (i==nfr10)) and (sign_vad==1): 416 | nstop=i-1 417 | sign_vad=0 418 | n_vad_seg=n_vad_seg+1 419 | #print i, n_vad_seg, nstart, nstop 420 | vad_seg[n_vad_seg,:]=numpy.array([nstart, nstop]) 421 | 422 | 423 | vad_seg=vad_seg[:n_vad_seg+1,] 424 | 425 | 426 | #syn from [0] index 427 | vad_seg = vad_seg - 1 428 | 429 | #print vad_seg 430 | 431 | # make one dimension array of (0/1) 432 | xYY=numpy.zeros(nfr10, dtype="int64") 433 | for i in range(len(vad_seg)): 434 | k=range(vad_seg[i,0], vad_seg[i,1]+1) 435 | xYY[k]=1 436 | 437 | vad_seg=xYY 438 | 439 | 440 | return vad_seg 441 | 442 | 443 | 444 | def pitchblockdetect(pv01, pitch, nfr10, opts): 445 | 446 | 447 | pv01_=deepcopy(pv01) 448 | 449 | if nfr10 == len(pv01_)+1: 450 | numpy.append(pv01_, pv01_[nfr10-1]) 451 | if opts == 0: 452 | sign_pv=0 453 | for i in range(0, nfr10): 454 | 455 | if ( pv01_[i]==1) and (sign_pv==0): 456 | 457 | nstart, sign_pv =i, 1 458 | 459 | elif ( (pv01_[i] == 0) or (i==nfr10-1) ) and (sign_pv==1): 460 | 461 | nstop=i 462 | if i==nfr10-1: 463 | nstop=i+1 464 | sign_pv=0 465 | pitchseg=numpy.zeros(nstop-nstart) 466 | #print len(pitchseg) 467 | for j in range (nstart, nstop): 468 | 469 | pitchseg[j-nstart]=pitch[j]; 470 | 471 | if (sum(numpy.abs( numpy.round( pitchseg-numpy.average(pitchseg) ) ))==0) and (nstop-nstart+1>=10): 472 | pv01_[range(nstart,nstop)]=0 473 | # 474 | sign_pv=0 
475 | pvblk=deepcopy(pv01_) 476 | 477 | #print i 478 | for i in range(0, nfr10): 479 | 480 | if (pv01_[i]==1) and (sign_pv==0): 481 | #print("i=%s " %(i)) 482 | nstart, sign_pv=i, 1 483 | pvblk[range(max([nstart-60,0]), nstart+1)]=1 484 | #print("fm P2: i=%s %s % " %(i,max([nstart-60,0]), nstart+1)) 485 | 486 | elif ( (pv01_[i] ==0) or (i==nfr10-1 )) and (sign_pv==1): 487 | 488 | nstop, sign_pv= i, 0 489 | 490 | pvblk[range(nstop, numpy.amin([nstop+60,nfr10-1])+1 )]=1 491 | #print("fm P2: i=%s %s %s " %(i,nstop, numpy.amin([nstop+60,nfr10-1])+1 )) 492 | 493 | return pvblk 494 | 495 | 496 | -------------------------------------------------------------------------------- /rVAD2.0/specsub_noiseseg_lfn.m: -------------------------------------------------------------------------------- 1 | function [ss,gg,tt,ff,zo]=specsub_noiseseg_lfn(si,fsz,noise_seg,pv01,pp) 2 | %SPECSUB performs speech enhancement using spectral subtraction [SS,ZO]=(S,FSZ,P) 3 | % 4 | % Usage: (1) y=specsub(x,fs); % enhance the speech using default parameters 5 | % 6 | % Inputs: 7 | % si input speech signal 8 | % fsz sample frequency in Hz 9 | % Alternatively, the input state from a previous call (see below) 10 | % pp algorithm parameters [optional] 11 | % 12 | % Outputs: 13 | % ss output enhanced speech 14 | % gg(t,f,i) selected time-frequency values (see pp.tf below) 15 | % tt centre of frames (in seconds) 16 | % ff centre of frequency bins (in Hz) 17 | % zo output state (or the 2nd argument if gg,tt,ff are omitted) 18 | % 19 | % The algorithm operation is controlled by a small number of parameters: 20 | % 21 | % pp.of % overlap factor = (fft length)/(frame increment) [2] 22 | % pp.ti % desired frame increment [0.016 seconds] 23 | % pp.ri % set to 1 to round ti to the nearest power of 2 samples [0] 24 | % pp.g % subtraction domain: 1=magnitude, 2=power [1] 25 | % pp.e % gain exponent [1] 26 | % pp.am % max oversubtraction factor [3] 27 | % pp.b % max noise attenutaion in power domain [0.01] 28 | % 
pp.al % SNR for oversubtraction=am (set this to Inf for fixed a) [-5 dB] 29 | % pp.ah % SNR for oversubtraction=1 [20 dB] 30 | % pp.ne % noise estimation: 0=min statistics, 1=MMSE [0] 31 | % pp.bt % threshold for binary gain or -1 for continuous gain [-1] 32 | % pp.mx % input mixture gain [0] 33 | % pp.gh % maximum gain for noise floor [1] 34 | % pp.rf % round output signal to an exact number of frames [0] 35 | % pp.tf % selects time-frequency planes to output in the gg() variable ['g'] 36 | % 'i' = input power spectrum 37 | % 'I' = input complex spectrum 38 | % 'n' = noise power spectrum 39 | % 'g' = gain 40 | % 'o' = output power spectrum 41 | % 'O' = output complex spectrum 42 | % 43 | % Following [1], the magnitude-domain gain in each time-frequency bin is given by 44 | % gain=mx+(1-mx)*max((1-(a*N/X)^(g/2))^(e/g),min(gh,(b*N/X)^(e/2))) 45 | % where N and X are the powers of the noise and noisy speech respectively. 46 | % The oversubtraction factor varies linearly between a=am for a frame SNR of al down to 47 | % a=1 for a frame SNR of ah. To obtain a fixed value of a for all values of SNR, set al=Inf. 48 | % Common exponent combinations are: 49 | % g=1 e=1 Magnitude Domain spectral subtraction 50 | % g=2 e=1 Power Domain spectral subtraction 51 | % g=2 e=2 Wiener filtering 52 | % Many authors use the parameters alpha=a^(g/2), beta=b^(g/2) and gamma2=e/g instead of a, b and e 53 | % but this increases interdependence amongst the parameters. 54 | % If bt>=0 then the max(...) expression above is thresholded to become 0 or 1. 
55 | % 56 | % In addition it is possible to specify parameters for the noise estimation algorithm 57 | % which implements reference [2] or [3] according to the setting of pp.ne 58 | % 59 | % Minimum statistics noise estimate [2]: pp.ne=0 60 | % pp.taca % (11): smoothing time constant for alpha_c [0.0449 seconds] 61 | % pp.tamax % (3): max smoothing time constant [0.392 seconds] 62 | % pp.taminh % (3): min smoothing time constant (upper limit) [0.0133 seconds] 63 | % pp.tpfall % (12): time constant for P to fall [0.064 seconds] 64 | % pp.tbmax % (20): max smoothing time constant [0.0717 seconds] 65 | % pp.qeqmin % (23): minimum value of Qeq [2] 66 | % pp.qeqmax % max value of Qeq per frame [14] 67 | % pp.av % (23)+13 lines: fudge factor for bc calculation [2.12] 68 | % pp.td % time to take minimum over [1.536 seconds] 69 | % pp.nu % number of subwindows to use [3] 70 | % pp.qith % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ] 71 | % pp.nsmdb % corresponding noise slope thresholds in dB/second [47 31.4 15.7 4.1] 72 | % 73 | % MMSE noise estimate [3]: pp.ne=1 74 | % pp.tax % smoothing time constant for noise power estimate [0.0717 seconds](8) 75 | % pp.tap % smoothing time constant for smoothed speech prob [0.152 seconds](23) 76 | % pp.psthr % threshold for smoothed speech probability [0.99] (24) 77 | % pp.pnsaf % noise probability safety value [0.01] (24) 78 | % pp.pspri % prior speech probability [0.5] (18) 79 | % pp.asnr % active SNR in dB [15] (18) 80 | % pp.psini % initial speech probability [0.5] (23) 81 | % pp.tavini % assumed speech absent time at start [0.064 seconds] 82 | % 83 | % If convenient, you can call specsub in chunks of arbitrary size. 
Thus the following are equivalent: 84 | % 85 | % (a) y=specsub(s,fs); 86 | % 87 | % (b) [y1,z]=specsub(s(1:1000),fs); 88 | % [y2,z]=specsub(s(1001:2000),z); 89 | % y3=specsub(s(2001:end),z); 90 | % y=[y1; y2; y3]; 91 | % 92 | % If the number of output arguments is either 2 or 5, the last partial frame of samples will 93 | % be retained for overlap adding with the output from the next call to specsub(). 94 | % 95 | % See also ssubmmse() for an alternative gain function 96 | % 97 | % Refs: 98 | % [1] M. Berouti, R. Schwartz and J. Makhoul 99 | % Enhancement of speech corrupted by acoustic noise 100 | % Proc IEEE ICASSP, 1979, 4, 208-211 101 | % [2] Rainer Martin. 102 | % Noise power spectral density estimation based on optimal smoothing and minimum statistics. 103 | % IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001. 104 | % [3] Gerkmann, T. & Hendriks, R. C. 105 | % Unbiased MMSE-Based Noise Power Estimation With Low Complexity and Low Tracking Delay 106 | % IEEE Trans Audio, Speech, Language Processing, 2012, 20, 1383-1393 107 | 108 | % Copyright (C) Mike Brookes 2004 109 | % Version: $Id: specsub.m 1720 2012-03-31 17:17:31Z dmb $ 110 | % 111 | % VOICEBOX is a MATLAB toolbox for speech processing. 112 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 113 | % 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | % This program is free software; you can redistribute it and/or modify 116 | % it under the terms of the GNU General Public License as published by 117 | % the Free Software Foundation; either version 2 of the License, or 118 | % (at your option) any later version. 119 | % 120 | % This program is distributed in the hope that it will be useful, 121 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 122 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 123 | % GNU General Public License for more details. 
124 | % 125 | % You can obtain a copy of the GNU General Public License from 126 | % http://www.gnu.org/copyleft/gpl.html or by writing to 127 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 128 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 129 | % 130 | % Modified code: Zheng-Hua Tan, 2012 131 | % 132 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 133 | 134 | if numel(si)>length(si) 135 | error('Input speech signal must be a vector not a matrix'); 136 | end 137 | if isstruct(fsz) 138 | fs=fsz.fs; 139 | qq=fsz.qq; 140 | qp=fsz.qp; 141 | ze=fsz.ze; 142 | s=zeros(length(fsz.si)+length(si(:)),1); % allocate space for speech 143 | s(1:length(fsz.si))=fsz.si; 144 | s(length(fsz.si)+1:end)=si(:); 145 | else 146 | fs=fsz; % sample frequency 147 | s=si(:); 148 | % default algorithm constants 149 | 150 | qq.of=2; % overlap factor = (fft length)/(frame increment) 151 | qq.ti=16e-3; % desired frame increment (16 ms) 152 | qq.ri=0; % round ni to the nearest power of 2 153 | qq.g=1; % subtraction domain: 1=magnitude, 2=power 154 | qq.e=1; % gain exponent 155 | qq.am=3; % max oversubtraction factor 156 | qq.b=0.01; % noise floor 157 | qq.al=-5; % SNR for maximum a (set to Inf for fixed a) 158 | qq.ah=20; % SNR for minimum a 159 | qq.bt=-1; % suppress binary masking 160 | qq.ne=0; % noise estimation: 0=min statistics, 1=MMSE [0] 161 | qq.mx=0; % no input mixing 162 | qq.gh=1; % maximum gain 163 | qq.tf='g'; % output the gain time-frequency plane by default 164 | qq.rf=0; 165 | % if nargin>=3 && ~isempty(pp) 166 | if nargin>=5 && ~isempty(pp) 167 | qp=pp; % save for estnoisem call 168 | qqn=fieldnames(qq); 169 | for i=1:length(qqn) 170 | if isfield(pp,qqn{i}) 171 | qq.(qqn{i})=pp.(qqn{i}); 172 | end 173 | end 174 | else 175 | qp=struct; % make an empty structure 176 | end 177 | end 178 | % derived algorithm constants 179 | if qq.ri 180 | ni=pow2(nextpow2(qq.ti*fs*sqrt(0.5))); 181 
| else 182 | ni=round(qq.ti*fs); % frame increment in samples 183 | end 184 | tinc=ni/fs; % true frame increment time 185 | tf=qq.tf; 186 | rf=qq.rf || nargout==2 || nargout==5; % round down to an exact number of frames 187 | ne=qq.ne; % noise estimation: 0=min statistics, 1=MMSE [0] 188 | 189 | % calculate power spectrum in frames 190 | 191 | no=round(qq.of); % integer overlap factor 192 | nf=ni*no; % fft length 193 | w=sqrt(hamming(nf+1))'; w(end)=[]; % for now always use sqrt hamming window 194 | w=w/sqrt(sum(w(1:ni:nf).^2)); % normalize to give overall gain of 1 195 | if rf>0 196 | rfm=''; % truncated input to an exact number of frames 197 | else 198 | rfm='r'; 199 | end 200 | 201 | 202 | [y,tt]=enframe(s,w,ni,rfm); 203 | 204 | tt=tt/fs; % frame times 205 | yf=rfft(y,nf,2); 206 | yp=yf.*conj(yf); % power spectrum of input speech 207 | [nr,nf2]=size(yp); % number of frames 208 | ff=(0:nf2-1)*fs/nf; 209 | 210 | 211 | if isstruct(fsz) 212 | if ne>0 213 | [dp,ze]=estnoiseg(yp,ze); % estimate the noise using MMSE 214 | else 215 | if size(noise_seg)0 221 | [dp,ze]=estnoiseg(yp,tinc,qp); % estimate the noise using MMSE 222 | else 223 | if size(noise_seg,1)=(af+bf).^(-1); % mask for noise floor limiting 260 | g=zeros(size(v)); % reserve space for gain matrix 261 | eg=qq.e/qq.g; % gain exponent relative to subtraction domain 262 | gh=qq.gh; 263 | switch eg 264 | case 1 % Normal case 265 | g(mf)=min(bf*v(mf),gh); % never give a gain > 1 266 | g(~mf)=1-af(~mf).*v(~mf); 267 | case 0.5 268 | g(mf)=min(sqrt(bf*v(mf)),gh); 269 | g(~mf)=sqrt(1-af(~mf).*v(~mf)); 270 | otherwise 271 | g(mf)=min((bf*v(mf)).^eg,gh); 272 | g(~mf)=(1-af(~mf).*v(~mf)).^eg; 273 | end 274 | if qq.bt>=0 275 | g=g>qq.bt; 276 | end 277 | g=qq.mx+(1-qq.mx)*g; % mix in some of the input 278 | 279 | out=yf.*g; 280 | out_p=out.*conj(out); 281 | out_pf=sum(out_p,2); 282 | 283 | 284 | %--low frequency noise 285 | for i=1:nr 286 | if sum(yp(i,1:7),2)>sum(yp(i,:),2)/2 %% 7 frequecy bins condition 287 | 
yp(i,1:7)=0; 288 | out_p(i,1:7)=0; 289 | out(i,1:7)=0+0i; 290 | end 291 | end 292 | 293 | 294 | out_dpft=dpf/2; 295 | out_smth=ones(nr,1); 296 | for i=1:nr 297 | if out_pf(i)2 && ~isempty(tf) 332 | gg=zeros(nr,nf2,length(tf)); % make space 333 | for i=1:length(tf) 334 | switch tf(i) 335 | case 'i' % 'i' = input power spectrum 336 | gg(:,:,i)=yp; 337 | case 'I' % 'i' = input power spectrum 338 | gg(:,:,i)=yf; 339 | case 'n' % 'n' = noise power spectrum 340 | gg(:,:,i)=dp; 341 | case 'g' % 'g' = gain 342 | gg(:,:,i)=g; 343 | case 'o' % 'o' = output power spectrum 344 | gg(:,:,i)=yp.*g.^2; 345 | case 'O' % 'o' = output power spectrum 346 | gg(:,:,i)=yf.*g; 347 | end 348 | end 349 | end 350 | end 351 | if nargout==2 || nargout==5 352 | if nr 353 | zo.ssv=ss(end-ni*(no-1)+1:end); % save the output tail for next time 354 | ss(end-ni*(no-1)+1:end)=[]; 355 | else 356 | zo.ssv=ssv; % 357 | end 358 | zo.si=s(length(ss)+1:end); % save the tail end of the input speech signal 359 | zo.fs=fs; % save sample frequency 360 | zo.qq=qq; % save local parameters 361 | zo.qp=qp; % save estnoisem parameters 362 | zo.ze=ze; % save state of noise estimation 363 | if nargout==2 364 | gg=zo; % 2nd of two arguments is zo 365 | end 366 | elseif rf==0 367 | ss=ss(1:length(s)); % trim to the correct length if not an exact number of frames 368 | end 369 | if ~nargout && nr>0 370 | ffax=ff/1000; ax=zeros(4,1); 371 | ax(1)=subplot(223); 372 | imagesc(tt,ffax,20*log10(g)'); 373 | colorbar; 374 | axis('xy'); 375 | if qq.al==Inf 376 | title(sprintf('Filter Gain (dB): a=%.2g, b=%.3g',qq.am,qq.b)); 377 | else 378 | title(sprintf('Filter Gain (dB): a=%.2g (%.0f to %.0fdB), b=%.3g',qq.am,qq.al,qq.ah,qq.b)); 379 | end 380 | xlabel('Time (s)'); 381 | ylabel('Frequency (kHz)'); 382 | 383 | ax(2)=subplot(222); 384 | imagesc(tt,ffax,10*log10(yp)'); 385 | colorbar; 386 | axis('xy'); 387 | title('Noisy Speech (dB)'); 388 | xlabel('Time (s)'); 389 | ylabel('Frequency (kHz)'); 390 | 391 | ax(3)=subplot(224); 392 | 
imagesc(tt,ffax,10*log10(yp.*g.^2)'); 393 | colorbar; 394 | axis('xy'); 395 | title(sprintf('Enhanced Speech (dB): g=%.2g, e=%.3g',qq.g,qq.e)); 396 | xlabel('Time (s)'); 397 | ylabel('Frequency (kHz)'); 398 | 399 | ax(4)=subplot(221); 400 | imagesc(tt,ffax,10*log10(dp)'); 401 | colorbar; 402 | axis('xy'); 403 | title('Noise Estimate (dB)'); 404 | xlabel('Time (s)'); 405 | ylabel('Frequency (kHz)'); 406 | linkaxes(ax); 407 | end 408 | -------------------------------------------------------------------------------- /rVAD2.0/estnoisem.m: -------------------------------------------------------------------------------- 1 | function [x,zo,xs]=estnoisem(yf,tz,pp) 2 | %ESTNOISEM - estimate noise spectrum using minimum statistics 3 | % 4 | % Usage: ninc=round(0.016*fs); % frame increment [fs=sample frequency] 5 | % ovf=2; % overlap factor 6 | % f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2); 7 | % f=f.*conj(f); % convert to power spectrum 8 | % x=estnoisem(f,ninc/fs); % estimate the noise power spectrum 9 | % 10 | % Inputs: 11 | % yf input power spectra (one row per frame) 12 | % tz frame increment in seconds 13 | % Alternatively, the input state from a previous call (see below) 14 | % pp algorithm parameters [optional] 15 | % 16 | % Outputs: 17 | % x estimated noise power spectra (one row per frame) 18 | % zo output state 19 | % xs estimated std error of x (one row per frame) 20 | % xs seems often to be an underestimate by a factor of 2 or 3 21 | % 22 | % The algorithm parameters are defined in reference [1] from which equation 23 | % numbers are given in parentheses. 
They are as follows: 24 | % 25 | % pp.taca % (11): smoothing time constant for alpha_c [0.0449 seconds] 26 | % pp.tamax % (3): max smoothing time constant [0.392 seconds] 27 | % pp.taminh % (3): min smoothing time constant (upper limit) [0.0133 seconds] 28 | % pp.tpfall % (12): time constant for P to fall [0.064 seconds] 29 | % pp.tbmax % (20): max smoothing time constant [0.0717 seconds] 30 | % pp.qeqmin % (23): minimum value of Qeq [2] 31 | % pp.qeqmax % max value of Qeq per frame [14] 32 | % pp.av % (23)+13 lines: fudge factor for bc calculation [2.12] 33 | % pp.td % time to take minimum over [1.536 seconds] 34 | % pp.nu % number of subwindows to use [8] 35 | % pp.qith % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf] 36 | % pp.nsmdb % corresponding noise slope thresholds in dB/second [47 31.4 15.7 4.1] 37 | % 38 | % Example use: y=enframe(s,w,ni); % divide speech signal s(n) into 39 | % % overlapping frames using window w(n) 40 | % yf=rfft(y,nf,2); % take Fourier transform 41 | % dp=estnoisem(yf.*conj(yf),tinc); % estimate the noise 42 | % 43 | % If convenient, you can call estnoisem in chunks of arbitrary size. Thus the following are equivalent: 44 | % 45 | % (a) dp=estnoisem(yp(1:300,:),tinc); 46 | % 47 | % (b) [dp(1:100,:),z]=estnoisem(yp(1:100,:),tinc); 48 | % [dp(101:200,:),z]=estnoisem(yp(101:200,:),z); 49 | % [dp(201:300,:),z]=estnoisem(yp(201:300,:),z); 50 | 51 | 52 | % This is intended to be a precise implementation of [1] with Table III 53 | % replaced by the updated table 5 from [2]. The only deliberate algorithm 54 | % change is the introduction of a minimum value for 1/Qeq in equation (23). 55 | % This change only affects the first few frames and improves the 56 | % convergence of the algorithm. A minor improvement was reported in [3] but 57 | % this has not yet been included. 58 | % 59 | % Refs: 60 | % [1] Rainer Martin. 61 | % Noise power spectral density estimation based on optimal smoothing and minimum statistics. 62 | % IEEE Trans.
Speech and Audio Processing, 9(5):504-512, July 2001. 63 | % [2] Rainer Martin. 64 | % Bias compensation methods for minimum statistics noise power spectral density estimation 65 | % Signal Processing, 2006, 86, 1215-1229 66 | % [3] Dirk Mauler and Rainer Martin 67 | % Noise power spectral density estimation on highly correlated data 68 | % Proc IWAENC, 2006 69 | 70 | % Copyright (C) Mike Brookes 2008 71 | % Version: $Id: estnoisem.m 1718 2012-03-31 16:40:41Z dmb $ 72 | % 73 | % VOICEBOX is a MATLAB toolbox for speech processing. 74 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 75 | % 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | % This program is free software; you can redistribute it and/or modify 78 | % it under the terms of the GNU General Public License as published by 79 | % the Free Software Foundation; either version 2 of the License, or 80 | % (at your option) any later version. 81 | % 82 | % This program is distributed in the hope that it will be useful, 83 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 84 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 85 | % GNU General Public License for more details. 86 | % 87 | % You can obtain a copy of the GNU General Public License from 88 | % http://www.gnu.org/copyleft/gpl.html or by writing to 89 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 
90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | 92 | [nr,nrf]=size(yf); % number of frames and freq bins 93 | x=zeros(nr,nrf); % initialize output arrays 94 | xs=zeros(nr,nrf); % will hold std error in the future 95 | if isempty(yf) && isstruct(tz) % no real data 96 | zo=tz; % just keep the same state 97 | else 98 | if isstruct(tz) % take parameters from a previous call 99 | nrcum=tz.nrcum; 100 | p=tz.p; % smoothed power spectrum 101 | ac=tz.ac; % correction factor (9) 102 | sn2=tz.sn2; % estimated noise power 103 | pb=tz.pb; % smoothed noisy speech power (20) 104 | pb2=tz.pb2; 105 | pminu=tz.pminu; 106 | actmin=tz.actmin; % Running minimum estimate 107 | actminsub=tz.actminsub; % sub-window minimum estimate 108 | subwc=tz.subwc; % force a buffer switch on first loop 109 | actbuf=tz.actbuf; % buffer to store subwindow minima 110 | ibuf=tz.ibuf; 111 | lminflag=tz.lminflag; % flag to remember local minimum 112 | tinc=tz.tinc; % frame increment 113 | qq=tz.qq; % parameter structure 114 | else 115 | tinc = tz; % second argument is frame increment 116 | nrcum=0; % no frames so far 117 | % default algorithm constants 118 | 119 | qq.taca=0.0449; % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11) 120 | qq.tamax=0.392; % max smoothing time constant in (3) = -tinc/log(0.96) 121 | qq.taminh=0.0133; % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3) 122 | qq.tpfall=0.064; % time constant for P to fall (12) 123 | qq.tbmax=0.0717; % max smoothing time constant in (20) = -tinc/log(0.8) 124 | qq.qeqmin=2; % minimum value of Qeq (23) 125 | qq.qeqmax=14; % max value of Qeq per frame 126 | qq.av=2.12; % fudge factor for bc calculation (23 + 13 lines) 127 | qq.td=1.536; % time to take minimum over 128 | qq.nu=8; % number of subwindows 129 | qq.qith=[0.03 0.05 0.06 Inf]; % noise slope thresholds in dB/s 130 | qq.nsmdb=[47 31.4 15.7 4.1]; 131 | 132 | if nargin>=3 && ~isempty(pp) 133 | qqn=fieldnames(qq); 134 | 
for i=1:length(qqn) 135 | if isfield(pp,qqn{i}) 136 | qq.(qqn{i})=pp.(qqn{i}); 137 | end 138 | end 139 | end 140 | end 141 | 142 | % unpack parameter structure 143 | 144 | taca=qq.taca; % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11) 145 | tamax=qq.tamax; % max smoothing time constant in (3) = -tinc/log(0.96) 146 | taminh=qq.taminh; % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3) 147 | tpfall=qq.tpfall; % time constant for P to fall (12) 148 | tbmax=qq.tbmax; % max smoothing time constant in (20) = -tinc/log(0.8) 149 | qeqmin=qq.qeqmin; % minimum value of Qeq (23) 150 | qeqmax=qq.qeqmax; % max value of Qeq per frame 151 | av=qq.av; % fudge factor for bc calculation (23 + 13 lines) 152 | td=qq.td; % time to take minimum over 153 | nu=qq.nu; % number of subwindows 154 | qith=qq.qith; % noise slope thresholds in dB/s 155 | nsmdb=qq.nsmdb; % maximum permitted +ve noise slope in dB/s 156 | 157 | % derived algorithm constants 158 | 159 | aca=exp(-tinc/taca); % smoothing constant for alpha_c in equ (11) = 0.7 160 | acmax=aca; % min value of alpha_c = 0.7 in equ (11) also = 0.7 161 | amax=exp(-tinc/tamax); % max smoothing constant in (3) = 0.96 162 | aminh=exp(-tinc/taminh); % min smoothing constant (upper limit) in (3) = 0.3 163 | bmax=exp(-tinc/tbmax); % max smoothing constant in (20) = 0.8 164 | snrexp = -tinc/tpfall; 165 | nv=round(td/(tinc*nu)); % length of each subwindow in frames 166 | if nv<4 % algorithm doesn't work for miniscule frames 167 | nv=4; 168 | nu=max(round(td/(tinc*nv)),1); 169 | end 170 | nd=nu*nv; % length of total window in frames 171 | [md,hd]=mhvals(nd); % calculate the constants M(D) and H(D) from Table III 172 | [mv,hv]=mhvals(nv); % calculate the constants M(D) and H(D) from Table III 173 | nsms=10.^(nsmdb*nv*tinc/10); % [8 4 2 1.2] in paper 174 | qeqimax=1/qeqmin; % maximum value of Qeq inverse (23) 175 | qeqimin=1/qeqmax; % minumum value of Qeq per frame inverse 176 | 177 | if isempty(yf) % provide dummy 
initialization 178 | ac=1; % correction factor (9) 179 | subwc=nv; % force a buffer switch on first loop 180 | ibuf=0; 181 | p=x; % smoothed power spectrum 182 | sn2=p; % estimated noise power 183 | pb=p; % smoothed noisy speech power (20) 184 | pb2=pb.^2; 185 | pminu=p; 186 | actmin=repmat(Inf,1,nrf); % Running minimum estimate 187 | actminsub=actmin; % sub-window minimum estimate 188 | actbuf=repmat(Inf,nu,nrf); % buffer to store subwindow minima 189 | lminflag=zeros(1,nrf); % flag to remember local minimum 190 | else 191 | 192 | if ~nrcum % initialize values for first frame 193 | p=yf(1,:); % smoothed power spectrum 194 | ac=1; % correction factor (9) 195 | sn2=p; % estimated noise power 196 | pb=p; % smoothed noisy speech power (20) 197 | pb2=pb.^2; 198 | pminu=p; 199 | actmin=repmat(Inf,1,nrf); % Running minimum estimate 200 | actminsub=actmin; % sub-window minimum estimate 201 | subwc=nv; % force a buffer switch on first loop 202 | actbuf=repmat(Inf,nu,nrf); % buffer to store subwindow minima 203 | ibuf=0; 204 | lminflag=zeros(1,nrf); % flag to remember local minimum 205 | end 206 | 207 | % loop for each frame 208 | 209 | for t=1:nr % we use t instead of lambda in the paper 210 | yft=yf(t,:); % noise speech power spectrum 211 | acb=(1+(sum(p)./sum(yft)-1).^2).^(-1); % alpha_c-bar(t) (9) 212 | ac=aca*ac+(1-aca)*max(acb,acmax); % alpha_c(t) (10) 213 | ah=amax*ac.*(1+(p./sn2-1).^2).^(-1); % alpha_hat: smoothing factor per frequency (11) 214 | snr=sum(p)/sum(sn2); 215 | ah=max(ah,min(aminh,snr^snrexp)); % lower limit for alpha_hat (12) 216 | 217 | p=ah.*p+(1-ah).*yft; % smoothed noisy speech power (3) 218 | b=min(ah.^2,bmax); % smoothing constant for estimating periodogram variance (22 + 2 lines) 219 | pb=b.*pb + (1-b).*p; % smoothed periodogram (20) 220 | pb2=b.*pb2 + (1-b).*p.^2; % smoothed periodogram squared (21) 221 | 222 | qeqi=max(min((pb2-pb.^2)./(2*sn2.^2),qeqimax),qeqimin/(t+nrcum)); % Qeq inverse (23) 223 | qiav=sum(qeqi)/nrf; % Average over all 
frequencies (23+12 lines) (ignore non-duplication of DC and nyquist terms) 224 | bc=1+av*sqrt(qiav); % bias correction factor (23+11 lines) 225 | bmind=1+2*(nd-1)*(1-md)./(qeqi.^(-1)-2*md); % we use the simplified form (17) instead of (15) 226 | bminv=1+2*(nv-1)*(1-mv)./(qeqi.^(-1)-2*mv); % same expression but for sub windows 227 | kmod=bc*p.*bmind1 && subwc=nv % end of buffer - do a buffer switch 238 | ibuf=1+rem(ibuf,nu); % increment actbuf storage pointer 239 | actbuf(ibuf,:)=actmin; % save sub-window minimum 240 | pminu=min(actbuf,[],1); 241 | i=find(qiavpminu; 244 | if any(lmin) 245 | pminu(lmin)=actminsub(lmin); 246 | actbuf(:,lmin)=repmat(pminu(lmin),nu,1); 247 | end 248 | lminflag(:)=0; 249 | actmin(:)=Inf; 250 | subwc=0; 251 | end 252 | end 253 | subwc=subwc+1; 254 | x(t,:)=sn2; 255 | qisq=sqrt(qeqi); 256 | % empirical formula for standard error based on Fig 15 of [2] 257 | xs(t,:)=sn2.*sqrt(0.266*(nd+100*qisq).*qisq/(1+0.005*nd+6/nd)./(0.5*qeqi.^(-1)+nd-1)); 258 | end 259 | end 260 | if nargout>1 % we need to store the state for next time 261 | zo.nrcum=nrcum+nr; % number of frames so far 262 | zo.p=p; % smoothed power spectrum 263 | zo.ac=ac; % correction factor (9) 264 | zo.sn2=sn2; % estimated noise power 265 | zo.pb=pb; % smoothed noisy speech power (20) 266 | zo.pb2=pb2; 267 | zo.pminu=pminu; 268 | zo.actmin=actmin; % Running minimum estimate 269 | zo.actminsub=actminsub; % sub-window minimum estimate 270 | zo.subwc=subwc; % force a buffer switch on first loop 271 | zo.actbuf=actbuf; % buffer to store subwindow minima 272 | zo.ibuf=ibuf; 273 | zo.lminflag=lminflag; % flag to remember local minimum 274 | zo.tinc=tinc; % must be the last one 275 | zo.qq=qq; 276 | end 277 | if ~nargout 278 | clf; 279 | subplot(212); 280 | plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)])) 281 | ylabel('Frame Energy (dB)'); 282 | xlabel(sprintf('Time (s) [%d ms frame incr]',round(tinc*1000))); 283 | axisenlarge([-1 -1.05]); 284 | legend('input','noise','Location','Best'); 
285 | subplot(211); 286 | plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr])) 287 | ylabel('Power (dB)'); 288 | xlabel('Frequency bin'); 289 | axisenlarge([-1 -1.05]); 290 | legend('input','noise','Location','Best'); 291 | end 292 | end 293 | 294 | function [m,h,d]=mhvals(d) 295 | % Values are taken from Table 5 in [2] 296 | %[2] R. Martin,"Bias compensation methods for minimum statistics noise power 297 | % spectral density estimation", Signal Processing Vol 86, pp1215-1229, 2006. 298 | 299 | % approx: plot(d.^(-0.5),[m 1-d.^(-0.5)],'x-'), plot(d.^0.5,h,'x-') 300 | persistent dmh 301 | if isempty(dmh) 302 | dmh=[ 303 | 1 0 0; 304 | 2 0.26 0.15; 305 | 5 0.48 0.48; 306 | 8 0.58 0.78; 307 | 10 0.61 0.98; 308 | 15 0.668 1.55; 309 | 20 0.705 2; 310 | 30 0.762 2.3; 311 | 40 0.8 2.52; 312 | 60 0.841 3.1; 313 | 80 0.865 3.38; 314 | 120 0.89 4.15; 315 | 140 0.9 4.35; 316 | 160 0.91 4.25; 317 | 180 0.92 3.9; 318 | 220 0.93 4.1; 319 | 260 0.935 4.7; 320 | 300 0.94 5]; 321 | end 322 | 323 | if nargin>=1 324 | i=find(d<=dmh(:,1)); 325 | if isempty(i) 326 | i=size(dmh,1); 327 | j=i; 328 | else 329 | i=i(1); 330 | j=i-1; 331 | end 332 | if d==dmh(i,1) 333 | m=dmh(i,2); 334 | h=dmh(i,3); 335 | else 336 | qj=sqrt(dmh(i-1,1)); % interpolate using sqrt(d) 337 | qi=sqrt(dmh(i,1)); 338 | q=sqrt(d); 339 | h=dmh(i,3)+(q-qi)*(dmh(j,3)-dmh(i,3))/(qj-qi); 340 | m=dmh(i,2)+(qi*qj/q-qj)*(dmh(j,2)-dmh(i,2))/(qi-qj); 341 | end 342 | else 343 | d=dmh(:,1); 344 | m=dmh(:,2); 345 | h=dmh(:,3); 346 | end -------------------------------------------------------------------------------- /rVAD2.0/fxpefac.m: -------------------------------------------------------------------------------- 1 | function [fx,tx,pv,fv]=fxpefac(s,fs,tinc,m,pp) 2 | %FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP) 3 | % 4 | % Input: s(ns) Speech signal 5 | % fs Sample frequency (Hz) 6 | % tinc Time increment between frames (s) [0.01] 7 | % or [start increment end] 8 | % m mode 9 | % 'g' plot graph showing waveform 
and pitch 10 | % 'G' plot spectrogram with superimposed pitch 11 | % 'x' use external files for algorithm parameter 12 | % initialization: fxpefac_g and fxpefac_w 13 | % pp structure containing algorithm parameters 14 | % 15 | % Outputs: fx(nframe) Estimated pitch (Hz) 16 | % tx(nframe) Time at the centre of each frame (seconds). 17 | % pv(nframe) Probability of the frame being voiced 18 | % fv structure containing feature vectors 19 | % fv.vuvfea(nframe,2) = voiced/unvoiced GMM features 20 | 21 | % References 22 | % [1] S. Gonzalez and M. Brookes, 23 | % A pitch estimation filter robust to high levels of noise (PEFAC), Proc EUSIPCO, Aug 2011. 24 | 25 | % Bugs/Suggestions 26 | % (1) do long files in chunks 27 | % (2) option of n-best DP 28 | 29 | % Copyright (C) Sira Gonzalez and Mike Brookes 2011 30 | % Version: $Id: fxpefac.m 713 2011-10-16 14:45:43Z dmb $ 31 | % 32 | % VOICEBOX is a MATLAB toolbox for speech processing. 33 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 34 | % 35 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 36 | % This program is free software; you can redistribute it and/or modify 37 | % it under the terms of the GNU General Public License as published by 38 | % the Free Software Foundation; either version 2 of the License, or 39 | % (at your option) any later version. 40 | % 41 | % This program is distributed in the hope that it will be useful, 42 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 43 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 44 | % GNU General Public License for more details. 45 | % 46 | % You can obtain a copy of the GNU General Public License from 47 | % http://www.gnu.org/copyleft/gpl.html or by writing to 48 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | 51 | persistent w_u m_u v_u w_v m_v v_v dpwtdef 52 | % initialize persistent variables 53 | if ~numel(w_u) 54 | 55 | % voiced/unvoiced decision based on 2-element feature vector 56 | % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS) 57 | % (b) sum of the power in the first three peaks 58 | %===== VUV 59 | if nargin>3 && any(m=='x') 60 | fxpefac_g; % read in GMM parameters 61 | fxpefac_w; % read in Weights parameters 62 | else 63 | w_u=[0.2123723 0.207788 0.2701817 0.1293616 0.04741722 0.1328791 ]'; 64 | 65 | m_u=[0.2220388 0.4067706 ; 66 | 0.04567656 0.4016914 ; 67 | 0.8415278 0.3192158 ; 68 | 0.2194808 0.1910079 ; 69 | 1.6347 0.5819833 ; 70 | 1.181519 0.6996485 ]; 71 | 72 | v_u=reshape([0.01413822 0.003357913 0.003357913 0.01786169 ; 73 | 0.0009377269 0.0006220489 0.0006220489 0.03422057 ; 74 | 0.1233703 0.004299293 0.004299293 0.007660504 ; 75 | 0.01779449 0.002078821 0.002078821 0.001605052 ; 76 | 1.110173 0.00718649 0.00718649 0.005734435 ; 77 | 0.5477135 -0.00182316 -0.00182316 0.05659796 ]',[2 2 6]); 78 | 79 | w_v=[0.07758689 0.2109879 0.1856225 0.06853158 0.2701563 0.1871148 ]'; 80 | 81 | m_v=[1.208656 0.3365564 ; 82 | 1.216643 0.5971916 ; 83 | 4.08585 1.240948 ; 84 | 8.322102 1.349939 ; 85 | 1.734108 1.168643 ; 86 | 0.5107205 0.940308 ]; 87 | 88 | v_v=reshape([0.06181574 0.002950501 0.002950501 0.004528442 ; 89 | 0.2946077 0.01433284 0.01433284 0.02684239 ; 90 | 2.508473 -0.03310555 -0.03310555 0.1098579 ; 91 | 14.17252 -0.09009174 -0.09009174 0.07989255 ; 92 | 0.5834894 -0.07854027 -0.07854027 0.1108958 ; 93 | 0.05978017 0.005528601 0.005528601 0.1309329 ]',[2 2 6]); 94 | end 95 | %===== PDP 96 | % dfm = -0.4238; % df mean 97 | % dfv = 3.8968; % df variance (although treated as std dev here) 98 | % delta = 0.15; 99 | % dflpso=[dfm 0.5/(log(10)*dfv^2) -log(2*delta/(dfv*sqrt(2*pi)))/log(10)]; % scale factor & offset for 
df pdf 100 | % dpwtdef=[1.0000, 0.8250, 1.3064, 1.9863]; % default DP weights 101 | dpwtdef=[1.0000, 0.8250, 0.01868, 0.006773, 98.9, -0.4238]; % default DP weights 102 | %===== END 103 | 104 | end 105 | 106 | 107 | % Algorithm parameter defaults 108 | 109 | p.fstep=5; % frequency resolution of initial spectrogram (Hz) 110 | p.fmax=4000; % maximum frequency of initial spectrogram (Hz) 111 | p.fres = 20; % bandwidth of initial spectrogram (Hz) 112 | p.fbanklo = 40; % low frequency limit of log filterbank (Hz) 113 | p.mpsmooth = 201; % width of smoothing filter for mean power 114 | % p.maxtranf = 1000; % maximum value of tranf cost term 115 | p.shortut = 7; % max utterance length to average power of entire utterance 116 | p.pefact = 1.5; % shape factor in PEFAC filter 117 | p.numopt = 3; % number of possible frequencies per frame 118 | p.flim = [60 400]; % range of feasible fundamental frequencies (Hz) 119 | p.w = dpwtdef; % DP weights 120 | % p.rampk = 1.1; % constant for relative-amplitude cost term 121 | % p.rampcz = 100; % relative amplitude cost for missing peak 122 | p.tmf = 2; % median frequency smoothing interval (s) 123 | p.tinc = 0.01; % default frame increment (s) 124 | 125 | % update parameters from pp argument 126 | 127 | if nargin>=5 && isstruct(pp) 128 | fnq=fieldnames(pp); 129 | for i=1:length(fnq) 130 | if isfield(p,fnq{i}) 131 | p.(fnq{i})=pp.(fnq{i}); 132 | end 133 | end 134 | end 135 | 136 | % Sort out input arguments 137 | if nargin>=3 && numel(tinc)>0 138 | p.tinc = tinc; % 0.01 s between consecutive time frames 139 | end 140 | if nargin<4 141 | m=''; 142 | end 143 | 144 | % Spectrogram of the mixture 145 | fmin = 0; fstep = p.fstep; fmax = p.fmax; 146 | fres = p.fres; % Frequency resolution (Hz) 147 | [tx,f,MIX]=spgrambw(s,fs,fres,[fmin fstep fmax],[],p.tinc); 148 | nframes=length(tx); 149 | txinc=tx(2)-tx(1); % actual frame increment 150 | % ==== we could combine spgrambw and filtbankm into a single call to spgrambw or use fft directly ==== 
151 | % Log-frequency scale 152 | [trans,cf]=filtbankm(length(f),2*length(f)-1,2*f(end),p.fbanklo,f(end),'usl'); 153 | O = MIX*trans'; % Original spectrum in Log-frequency scale 154 | 155 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 156 | % Amplitude Compression 157 | 158 | % Calculate alpha based on LTASS ratios 159 | ltass = stdspectrum(6,'p',cf); 160 | auxf = [cf(1),(cf(1:end-1)+cf(2:end))./2,cf(end)]; 161 | ltass = ltass.*diff(auxf); % weight by bin width 162 | 163 | % estimated ltass 164 | O = O.*repmat(diff(auxf),nframes,1); % weight spectrum by bin width 165 | 166 | if tx(end)2*cf(1)); 204 | sca = cf/cf(ini(1)); % normalize bin frequencies to start at approximately 0.5 205 | sca = sca(sca<10.5 & sca>0.5); % restrict to 0.5 - 10.5 times fundamental 206 | filh = -log10(p.pefact-cos(2*pi*sca)); 207 | filh = filh-mean(filh); % force filter to be zero mean 208 | posit = find(sca>=1); % ==== this should just equal ini(1) ==== 209 | if ~mod(length(posit),2) 210 | filh = [filh 0]; % force to be an odd length after central tap 211 | end 212 | negat = find(sca<1); 213 | numz = length(posit)-1-length(negat); 214 | filh = filh./max(filh); 215 | filh = [zeros(1,numz) filh]; 216 | 217 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 218 | % Filter the log-frequency scaled spectrogram 219 | B = imfilter(O,filh); % ==== no good reason to use imfilter here ==== 220 | 221 | % Feasible frequency range 222 | numopt = p.numopt; % Number of possible fundamental frequencies per frame 223 | flim = p.flim; 224 | pfreq = find(cf>flim(1) & cf0.6)); % calculate median frequency of first 2 seconds 293 | if isnan(mf) 294 | mf=median(fpos(pv(1:min(inmf,end))>0.5)); 295 | if isnan(mf) 296 | mf=median(fpos(pv(1:min(inmf,end))>0.4)); 297 | if isnan(mf) 298 | mf=median(fpos(pv(1:min(inmf,end))>0.3)); % ==== clumsy way of ensuring that we take the best frames ==== 299 | if isnan(mf) 300 | mf=0; 301 | end 302 | end 303 | end 304 | 
end 305 | medfx(1)=mf; 306 | 307 | for i=2:nframes % main dynamic programming loop 308 | if i>inmf 309 | fpos = ff(i-inmf:i,1); % fpos is the highest peak in each frame 310 | mf=median(fpos(pv(1:inmf)>0.6)); % find median frequency over past 2 seconds 311 | if isnan(mf) 312 | mf=median(fpos(pv(1:inmf)>0.5)); 313 | if isnan(mf) 314 | mf=median(fpos(pv(1:inmf)>0.4)); 315 | if isnan(mf) 316 | mf=median(fpos(pv(1:inmf)>0.3));% ==== clumsy way of ensuring that we take the best frames ==== 317 | if isnan(mf) 318 | mf=0; 319 | end 320 | end 321 | end 322 | end 323 | end 324 | medfx(i)=mf; 325 | % Frequency difference between candidates and cost 326 | df = dffact*(repmat(ff(i,:).',1,numopt) - repmat(ff(i-1,:),numopt,1))./(repmat(ff(i,:).',1,numopt) + repmat(ff(i-1,:),numopt,1)); 327 | costdf=w(3)*min((df-w(6)).^2,w(4)); 328 | 329 | % Cost related to the median pitch 330 | if mf==0 % this test was inverted in the original version 331 | costf = zeros(1,numopt); 332 | else 333 | costf = abs(ff(i,:) - mf)./mf; 334 | end 335 | [cost(i,:),prev(i,:)]=min(costdf + repmat(cost(i-1,:),numopt,1),[],2); % ==== should we allow the possibility of skipping frames ? 
==== 336 | cost(i,:)=cost(i,:)+w(2)*costf + w(1)*camp(i,:); % add on costs that are independent of previous path 337 | 338 | end 339 | 340 | % Traceback 341 | 342 | fx=zeros(nframes,1); 343 | best = zeros(nframes,1); 344 | 345 | nose=find(cost(end,:)==min(cost(end,:))); % ==== bad method (dangerous) === 346 | best(end)=nose(1); 347 | % ff = [ff zeros(nframes,1)]; % not clear why this was here 348 | fx(end)=ff(end,best(end)); 349 | for i=nframes:-1:2 350 | best(i-1)=prev(i,best(i)); 351 | fx(i-1)=ff(i-1,best(i-1)); 352 | end 353 | 354 | if nargout>=4 355 | fv.vuvfea=vuvfea; % voiced-unvoiced features 356 | fv.best=best; % selected path 357 | fv.ff=ff; % pitch candidates 358 | fv.amp=amp; % pitch candidate amplitudes 359 | fv.medfx=medfx; % median pitch 360 | fv.w=w; % DP weights 361 | fv.dffact=dffact; % df scale factor 362 | end 363 | 364 | if ~nargout || any(m=='g') || any(m=='G') 365 | nax=0; % number of axes sets to link 366 | msk=pv>0.5; % find voiced frames as a mask 367 | fxg=fx; 368 | fxg(~msk)=NaN; % allow only good frames 369 | fxb=fx; 370 | fxb(msk)=NaN; % allow only bad frames 371 | if any(m=='G') || ~nargout && ~any(m=='g') 372 | clf; 373 | spgrambw(s,fs,'ilcwpf'); % draw spectrogram with log axes 374 | hold on 375 | plot(tx,log10(fxg),'-b',tx,log10(fxb),'-r'); % fx track 376 | yy=get(gca,'ylim'); 377 | plot(tx,yy(1)+yy*[-1;1]*(0.02+0.05*pv),'-k'); % P(V) track 378 | hold off 379 | nax=nax+1; 380 | axh(nax)=gca; 381 | if any(m=='g') 382 | figure; % need a new figure if plotting two graphs 383 | end 384 | end 385 | if any(m=='g') 386 | ns=length(s); 387 | [tsr,ix]=sort([(1:ns)/fs 0.5*(tx(1:end-1)+tx(2:end))']); % intermingle speech and frame boundaries 388 | jx(ix)=1:length(ix); % create inverse index 389 | sp2fr=jx(1:ns)-(0:ns-1); % speech sample to frame number 390 | spmsk=msk(sp2fr); % speech sample voiced mask 391 | sg=s; 392 | sg(~spmsk)=NaN; % good speech samples only 393 | sb=s; 394 | sb(spmsk)=NaN; % bad speech samples only 395 | clf; 396 | 
subplot(5,1,1); 397 | plot(tx,pv,'-b',(1:ns)/fs,0.5*mod(cumsum(fx(sp2fr)/fs),1)-0.6,'-b'); 398 | nax=nax+1; 399 | axh(nax)=gca; 400 | ylabel('\phi(t), P(V)'); 401 | set(gca,'ylim',[-0.65 1.05]); 402 | subplot(5,1,2:3); 403 | plot((1:ns)/fs,sg,'-b',(1:ns)/fs,sb,'-r'); 404 | nax=nax+1; 405 | axh(nax)=gca; 406 | subplot(5,1,4:5); 407 | plot(tx,fxg,'-b',tx,fxb,'-r'); 408 | ylabel('Pitch (Hz)'); 409 | % semilogy(tx,fxg,'-b',tx,fxb,'-r'); 410 | % ylabel(['Pitch (' yticksi 'Hz)']); 411 | set(gca,'ylim',[min(fxg)-30 max(fxg)+30]); 412 | nax=nax+1; 413 | axh(nax)=gca; 414 | end 415 | if nax>1 416 | linkaxes(axh,'x'); 417 | end 418 | end 419 | 420 | function y=smooth(x,n) 421 | nx=length(x); 422 | c=cumsum(x); 423 | y=[c(1:2:n)./(1:2:n) (c(n+1:end)-c(1:end-n))/n (c(end)-c(end-n+2:2:end-1))./(n-2:-2:1)]; 424 | -------------------------------------------------------------------------------- /rVAD2.0/LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 
20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 
86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
% -------------------------- /rVAD2.0/estnoisem_noiseseg.m --------------------------

function [x,zo,xs]=estnoisem_noiseseg(yf,noise_seg,pv01,tz,pp)
%ESTNOISEM_NOISESEG - estimate noise spectrum using minimum statistics while
%                     holding the smoothed power fixed on flagged frames
%
% Usage:    ninc=round(0.016*fs);   % frame increment [fs=sample frequency]
%           ovf=2;                  % overlap factor
%           f=rfft(enframe(s,hanning(ovf*ninc,'periodic'),ninc),ovf*ninc,2);
%           f=f.*conj(f);           % convert to power spectrum
%           x=estnoisem_noiseseg(f,noise_seg,pv01,ninc/fs); % estimate the noise power spectrum
%
% Inputs:
%   yf          input power spectra (one row per frame)
%   noise_seg   per-frame flags, at least one entry per row of yf.
%               On the first call, frames with noise_seg==1 together with the
%               3 frames on either side of each are excluded both from the
%               choice of the initialization frame and from the update of the
%               smoothed power p. Any frame with a nonzero flag is excluded
%               from the update of p. If that would exclude every frame, the
%               +/-3 frame guard is dropped and p is initialized from the 4th
%               frame with noise_seg==0 (or from frame 1, with a warning, if
%               there is no such frame).
%   pv01        per-frame flags; only the first 10 entries are read, and only
%               on the first call. If they sum to 1 or more, p is initialized
%               to the per-bin minimum of the first 50 frames and the first 10
%               frames are excluded from the update of p.
%               NOTE(review): presumably a voiced/pitch decision per frame -
%               confirm against the caller.
%   tz          frame increment in seconds
%               Alternatively, the input state from a previous call (see below)
%   pp          algorithm parameters [optional, fifth argument]
%
% Outputs:
%   x           estimated noise power spectra (one row per frame). Rows that
%               precede the initialization frame are left at zero.
%   zo          output state
%   xs          estimated std error of x (one row per frame)
%               xs seems often to be an underestimate by a factor of 2 or 3
%
% The algorithm parameters are defined in reference [1] from which equation
% numbers are given in parentheses. They are as follows:
%
%        pp.taca      % (11): smoothing time constant for alpha_c [0.0449 seconds]
%        pp.tamax     % (3): max smoothing time constant [0.392 seconds]
%        pp.taminh    % (3): min smoothing time constant (upper limit) [0.0133 seconds]
%        pp.tpfall    % (12): time constant for P to fall [0.064 seconds]
%        pp.tbmax     % (20): max smoothing time constant [0.0717 seconds]
%        pp.qeqmin    % (23): minimum value of Qeq [2]
%        pp.qeqmax    % max value of Qeq per frame [14]
%        pp.av        % (23)+13 lines: fudge factor for bc calculation  [2.12]
%        pp.td        % time to take minimum over [1.536 seconds]
%        pp.nu        % number of subwindows to use [8]
%        pp.qith      % Q-inverse thresholds to select maximum noise slope [0.03 0.05 0.06 Inf ]
%        pp.nsmdb     % corresponding noise slope thresholds in dB/second   [47 31.4 15.7 4.1]
%
% Example use:      y=enframe(s,w,ni);                  % divide speech signal s(n) into
%                                                       % overlapping frames using window w(n)
%                   yf=rfft(y,nf,2);                    % take fourier transform
%                   dp=estnoisem_noiseseg(yf.*conj(yf),noise_seg,pv01,tinc); % estimate the noise
%
% The state output zo can be passed back in place of tz to process the data
% in consecutive chunks:
%
%                   [dp1,z]=estnoisem_noiseseg(yp(1:100,:),noise_seg(1:100),pv01(1:100),tinc);
%                   [dp2,z]=estnoisem_noiseseg(yp(101:200,:),noise_seg(101:200),pv01(101:200),z);
%
% On such continuation calls processing starts at the first row of the chunk,
% pv01 is not consulted and the +/-3 frame guard around noise frames is not
% applied; only frames whose own noise_seg flag is nonzero are skipped.

% This is intended to be a precise implementation of [1] with Table III
% replaced by the updated table 5 from [2]. The only deliberate algorithm
% change is the introduction of a minimum value for 1/Qeq in equation (23).
% This change only affects the first few frames and improves the
% convergence of the algorithm. A minor improvement was reported in [3] but
% this has not yet been included.
%
% Refs:
%    [1] Rainer Martin.
%        Noise power spectral density estimation based on optimal smoothing and minimum statistics.
%        IEEE Trans. Speech and Audio Processing, 9(5):504-512, July 2001.
%    [2] Rainer Martin.
%        Bias compensation methods for minimum statistics noise power spectral density estimation
%        Signal Processing, 2006, 86, 1215-1229
%    [3] Dirk Mauler and Rainer Martin
%        Noise power spectral density estimation on highly correlated data
%        Proc IWAENC, 2006

%      Copyright (C) Mike Brookes 2008
%      Version: $Id: estnoisem.m 1718 2012-03-31 16:40:41Z dmb $
%
%   VOICEBOX is a MATLAB toolbox for speech processing.
%   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   http://www.gnu.org/copyleft/gpl.html or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%   Modified code, Zheng-Hua Tan, 2012
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

[nr,nrf]=size(yf);          % number of frames and freq bins

x=zeros(nr,nrf);            % initialize output arrays
xs=zeros(nr,nrf);           % will hold std error in the future
if isempty(yf) && isstruct(tz)             % no real data
    zo=tz;              % just keep the same state
else
    if isstruct(tz)       % take parameters from a previous call
        nrcum=tz.nrcum;
        p=tz.p;                     % smoothed power spectrum
        ac=tz.ac;                   % correction factor (9)
        sn2=tz.sn2;                 % estimated noise power
        pb=tz.pb;                   % smoothed noisy speech power (20)
        pb2=tz.pb2;
        pminu=tz.pminu;
        actmin=tz.actmin;           % Running minimum estimate
        actminsub=tz.actminsub;     % sub-window minimum estimate
        subwc=tz.subwc;             % sub-window frame counter
        actbuf=tz.actbuf;           % buffer to store subwindow minima
        ibuf=tz.ibuf;
        lminflag=tz.lminflag;       % flag to remember local minimum
        tinc=tz.tinc;               % frame increment
        qq=tz.qq;                   % parameter structure
    else
        tinc = tz;          % fourth argument is frame increment
        nrcum=0;            % no frames so far
        % default algorithm constants

        qq.taca=0.0449;    % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
        qq.tamax=0.392;    % max smoothing time constant in (3) = -tinc/log(0.96)
        qq.taminh=0.0133;  % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
        qq.tpfall=0.064;   % time constant for P to fall (12)
        qq.tbmax=0.0717;   % max smoothing time constant in (20) = -tinc/log(0.8)
        qq.qeqmin=2;       % minimum value of Qeq (23)
        qq.qeqmax=14;      % max value of Qeq per frame
        qq.av=2.12;        % fudge factor for bc calculation (23 + 13 lines)
        qq.td=1.536;       % time to take minimum over
        qq.nu=8;           % number of subwindows
        qq.qith=[0.03 0.05 0.06 Inf];   % Q-inverse thresholds used to select the noise slope
        qq.nsmdb=[47 31.4 15.7 4.1];    % corresponding maximum +ve noise slopes in dB/s

        % pp is the FIFTH input, so it only exists when nargin>=5. Testing
        % nargin>=4 made every four-argument call fail on an undefined pp.
        if nargin>=5 && ~isempty(pp)
            qqn=fieldnames(qq);
            for k=1:length(qqn)
                if isfield(pp,qqn{k})        % only recognised fields are copied
                    qq.(qqn{k})=pp.(qqn{k});
                end
            end
        end
    end

    % unpack parameter structure

    taca=qq.taca;      % smoothing time constant for alpha_c = -tinc/log(0.7) in equ (11)
    tamax=qq.tamax;    % max smoothing time constant in (3) = -tinc/log(0.96)
    taminh=qq.taminh;  % min smoothing time constant (upper limit) in (3) = -tinc/log(0.3)
    tpfall=qq.tpfall;  % time constant for P to fall (12)
    tbmax=qq.tbmax;    % max smoothing time constant in (20) = -tinc/log(0.8)
    qeqmin=qq.qeqmin;  % minimum value of Qeq (23)
    qeqmax=qq.qeqmax;  % max value of Qeq per frame
    av=qq.av;          % fudge factor for bc calculation (23 + 13 lines)
    td=qq.td;          % time to take minimum over
    nu=qq.nu;          % number of subwindows
    qith=qq.qith;      % Q-inverse thresholds used to select the noise slope
    nsmdb=qq.nsmdb;    % maximum permitted +ve noise slope in dB/s

    % derived algorithm constants

    aca=exp(-tinc/taca);        % smoothing constant for alpha_c in equ (11) = 0.7
    acmax=aca;                  % min value of alpha_c = 0.7 in equ (11) also = 0.7
    amax=exp(-tinc/tamax);      % max smoothing constant in (3) = 0.96
    aminh=exp(-tinc/taminh);    % min smoothing constant (upper limit) in (3) = 0.3
    bmax=exp(-tinc/tbmax);      % max smoothing constant in (20) = 0.8
    snrexp = -tinc/tpfall;
    nv=round(td/(tinc*nu));     % length of each subwindow in frames
    if nv<4                     % algorithm doesn't work for minuscule frames
        nv=4;
        nu=max(round(td/(tinc*nv)),1);
    end
    nd=nu*nv;                   % length of total window in frames
    [md,hd]=mhvals(nd);         % calculate the constants M(D) and H(D) from Table III
    [mv,hv]=mhvals(nv);         % calculate the constants M(D) and H(D) from Table III
    nsms=10.^(nsmdb*nv*tinc/10);  % [8 4 2 1.2] in paper
    qeqimax=1/qeqmin;           % maximum value of Qeq inverse (23)
    qeqimin=1/qeqmax;           % minimum value of Qeq per frame inverse

    if isempty(yf)              % provide dummy initialization
        ac=1;                   % correction factor (9)
        subwc=nv;               % force a buffer switch on first loop
        ibuf=0;
        p=x;                    % smoothed power spectrum
        sn2=p;                  % estimated noise power
        pb=p;                   % smoothed noisy speech power (20)
        pb2=pb.^2;
        pminu=p;
        actmin=repmat(Inf,1,nrf);   % Running minimum estimate
        actminsub=actmin;           % sub-window minimum estimate
        actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
        lminflag=zeros(1,nrf);      % flag to remember local minimum
    else
        % Defaults used on continuation calls, where no initialization frame
        % search is done (these were previously undefined and caused an error).
        startframe=1;           % first frame processed by the main loop
        By=[];                  % guard frames around flagged noise frames
        pitchstart=false;       % true if pv01 is set within the first 10 frames

        if ~nrcum
            % Initialization: pick the first frame that is neither a flagged
            % noise frame nor within 3 frames of one.
            xNc=find(noise_seg(:)==1);              % flagged noise frame indexes
            By=unique([xNc-3; xNc-2; xNc-1; xNc; xNc+1; xNc+2; xNc+3]);
            By=By(By>0);                            % discard 0 or negative frame indexes
            Cx=setdiff(1:numel(noise_seg),By);      % admissible frames, ascending

            if ~isempty(Cx)
                p=yf(Cx(1),:);      % first admissible frame
            else
                % Every frame is excluded: drop the guard band, otherwise no
                % frame in the file would ever update p.
                By=[];
                Cx=find(noise_seg==0);
                if length(Cx) > 3
                    Cx(1:3)=[];     % skip the first 3 unflagged frames
                end
                if isempty(Cx)      % file contains flagged frames only
                    Cx=1;
                    p=yf(1,:);
                    warning('File seems have only noisy segments .. noise-power-spectra-estimation may not good!');
                else
                    p=yf(Cx(1),:);
                end
            end
            startframe=Cx(1);

            ac=1;                       % correction factor (9)
            sn2=p;                      % estimated noise power
            pb=p;                       % smoothed noisy speech power (20)
            pb2=pb.^2;
            pminu=p;
            actmin=repmat(Inf,1,nrf);   % Running minimum estimate
            actminsub=actmin;           % sub-window minimum estimate
            subwc=nv;                   % force a buffer switch on first loop
            actbuf=repmat(Inf,nu,nrf);  % buffer to store subwindow minima
            ibuf=0;
            lminflag=zeros(1,nrf);      % flag to remember local minimum

            % If pv01 is set anywhere in the first 10 frames, start from the
            % per-bin minimum over the first 50 frames instead. The min() on
            % the index keeps inputs shorter than 10 frames from erroring.
            pitchstart=sum(pv01(1:min(10,numel(pv01))))>=1;
            if pitchstart
                p=min(yf(1:min(50,nr),:),[],1);
                sn2=p; pb=p; pb2=pb.^2; pminu=p;
            end
        end

        % Frames on which the smoothed power p and its moments are held
        % fixed. Computed once rather than re-tested on every iteration.
        frozen=reshape(noise_seg(1:nr),[],1)~=0;    % flagged noise frames
        frozen(By(By<=nr))=true;                    % +/-3 frame guard band
        if pitchstart
            frozen(1:min(10,nr))=true;              % first 10 frames
        end

        % loop for each frame, starting from the initialization frame
        for t=startframe:nr             % we use t instead of lambda in the paper
            yft=yf(t,:);        % noisy speech power spectrum
            acb=(1+(sum(p)./sum(yft)-1).^2).^(-1);    % alpha_c-bar(t)  (9)
            ac=aca*ac+(1-aca)*max(acb,acmax);       % alpha_c(t)  (10)
            ah=amax*ac.*(1+(p./sn2-1).^2).^(-1);    % alpha_hat: smoothing factor per frequency (11)
            snr=sum(p)/sum(sn2);
            ah=max(ah,min(aminh,snr^snrexp));       % lower limit for alpha_hat (12)

            if ~frozen(t)
                p=ah.*p+(1-ah).*yft;            % smoothed noisy speech power (3)
                b=min(ah.^2,bmax);              % smoothing constant for estimating periodogram variance (22 + 2 lines)
                pb=b.*pb + (1-b).*p;            % smoothed periodogram (20)
                pb2=b.*pb2 + (1-b).*p.^2;       % smoothed periodogram squared (21)
            end

            qeqi=max(min((pb2-pb.^2)./(2*sn2.^2),qeqimax),qeqimin/(t+nrcum));   % Qeq inverse (23)
            qiav=sum(qeqi)/nrf;             % Average over all frequencies (23+12 lines) (ignore non-duplication of DC and nyquist terms)
            bc=1+av*sqrt(qiav);             % bias correction factor (23+11 lines)
            bmind=1+2*(nd-1)*(1-md)./(qeqi.^(-1)-2*md);      % we use the simplified form (17) instead of (15)
            bminv=1+2*(nv-1)*(1-mv)./(qeqi.^(-1)-2*mv);      % same expression but for sub windows

            % NOTE(review): the comparison expressions from here down to the
            % computation of lmin had been lost from this copy (everything
            % between a "<" and the next ">" was missing). They are restored
            % from the upstream VOICEBOX estnoisem.m that this file derives
            % from - confirm against the maintained repository.
            kmod=bc*p.*bmind<actmin;        % Frequency mask for new minimum
            if any(kmod)
                actmin(kmod)=bc*p(kmod).*bmind(kmod);
                actminsub(kmod)=bc*p(kmod).*bminv(kmod);
            end
            if subwc>1 && subwc<nv              % middle of buffer - allow a local minimum
                lminflag=lminflag | kmod;       % potential local minimum frequency bins
                pminu=min(actminsub,pminu);
                sn2=pminu;
            else
                if subwc>=nv                    % end of buffer - do a buffer switch
                    ibuf=1+rem(ibuf,nu);        % increment actbuf storage pointer
                    actbuf(ibuf,:)=actmin;      % save sub-window minimum
                    pminu=min(actbuf,[],1);
                    iq=find(qiav<qith);
                    nsm=nsms(iq(1));            % noise slope max
                    lmin=lminflag & ~kmod & actminsub<nsm*pminu & actminsub>pminu;
                    if any(lmin)
                        pminu(lmin)=actminsub(lmin);
                        actbuf(:,lmin)=repmat(pminu(lmin),nu,1);
                    end
                    lminflag(:)=0;
                    actmin(:)=Inf;
                    subwc=0;
                end
            end
            subwc=subwc+1;
            x(t,:)=sn2;
            qisq=sqrt(qeqi);
            % empirical formula for standard error based on Fig 15 of [2]
            xs(t,:)=sn2.*sqrt(0.266*(nd+100*qisq).*qisq/(1+0.005*nd+6/nd)./(0.5*qeqi.^(-1)+nd-1));
        end
    end

    if nargout>1    % we need to store the state for next time
        zo.nrcum=nrcum+nr;      % number of frames so far
        zo.p=p;                     % smoothed power spectrum
        zo.ac=ac;                   % correction factor (9)
        zo.sn2=sn2;                 % estimated noise power
        zo.pb=pb;                   % smoothed noisy speech power (20)
        zo.pb2=pb2;
        zo.pminu=pminu;
        zo.actmin=actmin;           % Running minimum estimate
        zo.actminsub=actminsub;     % sub-window minimum estimate
        zo.subwc=subwc;             % sub-window frame counter
        zo.actbuf=actbuf;           % buffer to store subwindow minima
        zo.ibuf=ibuf;
        zo.lminflag=lminflag;       % flag to remember local minimum
        zo.tinc=tinc;               % must be the last one
        zo.qq=qq;
    end
    if ~nargout                     % no outputs requested: plot the result
        clf;
        subplot(212);
        plot((1:nr)*tinc,10*log10([sum(yf,2) sum(x,2)]))
        ylabel('Frame Energy (dB)');
        xlabel(sprintf('Time (s)   [%d ms frame incr]',round(tinc*1000)));
        axisenlarge([-1 -1.05]);
        legend('input','noise','Location','Best');
        subplot(211);
        plot(1:nrf,10*log10([sum(yf,1)'/nr sum(x,1)'/nr]))
        ylabel('Power (dB)');
        xlabel('Frequency bin');
        axisenlarge([-1 -1.05]);
        legend('input','noise','Location','Best');
    end
end

function [m,h,d]=mhvals(d)
%MHVALS - look up the constants M(D) and H(D) for a window of d frames
% Values are taken from Table 5 in [2]
%[2] R. Martin,"Bias compensation methods for minimum statistics noise power
%               spectral density estimation", Signal Processing Vol 86, pp1215-1229, 2006.
%
% With an argument, returns the scalar m and h for that d, interpolating
% between table rows when d is not tabulated. For d above the last table
% entry the last row is returned. Without an argument, returns the whole
% table as the columns d, m and h.

% approx: plot(d.^(-0.5),[m 1-d.^(-0.5)],'x-'), plot(d.^0.5,h,'x-')
persistent dmh
if isempty(dmh)
    dmh=[
        1 0 0;
        2 0.26 0.15;
        5 0.48 0.48;
        8 0.58 0.78;
        10 0.61 0.98;
        15 0.668 1.55;
        20 0.705 2;
        30 0.762 2.3;
        40 0.8 2.52;
        60 0.841 3.1;
        80 0.865 3.38;
        120 0.89 4.15;
        140 0.9 4.35;
        160 0.91 4.25;
        180 0.92 3.9;
        220 0.93 4.1;
        260 0.935 4.7;
        300 0.94 5];
end

if nargin>=1
    i=find(d<=dmh(:,1));
    if isempty(i)               % d is beyond the table: use the last row
        i=size(dmh,1);
        j=i;
    else
        i=i(1);                 % first row with tabulated D >= d
        j=i-1;
    end
    if d==dmh(i,1)
        m=dmh(i,2);
        h=dmh(i,3);
    else
        % qj deliberately uses row i-1 rather than row j: when d is beyond
        % the table j==i, the value differences below are zero, and taking
        % qj from a different row keeps the denominators nonzero.
        qj=sqrt(dmh(i-1,1));    % interpolate using sqrt(d)
        qi=sqrt(dmh(i,1));
        q=sqrt(d);
        h=dmh(i,3)+(q-qi)*(dmh(j,3)-dmh(i,3))/(qj-qi);
        m=dmh(i,2)+(qi*qj/q-qj)*(dmh(j,2)-dmh(i,2))/(qi-qj);
    end
else
    d=dmh(:,1);
    m=dmh(:,2);
    h=dmh(:,3);
end
407 | -------------------------------------------------------------------------------- /rVAD2.0/spgrambw.m: -------------------------------------------------------------------------------- 1 | function [t,f,b]=spgrambw(s,fs,varargin) 2 | %SPGRAMBW Draw spectrogram [T,F,B]=(s,fs,mode,bw,fmax,db,tinc,ann) 3 | % 4 | % Usage: spgrambw(s,fs,'pJcw') % Plot spectrogram with my favourite set of options 5 | % 6 | % For examples of the many options available see: 7 | % http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/tutorial/spgrambw/spgram_tut.pdf 8 | % 9 | % Inputs: S speech signal, or single-sided power spectrum array, S(NT,NF), in power per Hz 10 | % FS sample frequency (Hz) or [FS T1] where T1 is the time of the first sample 11 | % or, if s is a matrix, [FS T1 FINC F1] where FS is the frame rate, T1 is 12 | % the time of the first sample, FINC is the frequency increment and F1 the 13 | % frequency of the first column. 14 | % MODE optional character string specifying options (see list below) 15 | % BW bandwidth resolution in Hz (DFT window length = 1.81/BW)[default: 200] 16 | % FMAX frequency range [Fmin Fstep Fmax]. If Fstep is omitted 17 | % it is taken to be (Fmax-Fmin)/257, if Fmin is also omitted it is taken 18 | % to be 0 (or 30Hz for mode l), if all three are omitted Fmax is taken to be FS/2. 19 | % If modes m, b, e or l are specified then the units are in mel, bark or erb or 20 | % log10(Hz); this can be over-ridden by the 'h' option. 21 | % DB either dB-range or [dB-min dB-max] [default: 40] 22 | % TINC output frame increment in seconds [0 or missing uses default=0.45/BW] 23 | % or [TFIRST TLAST] or [TFIRST TINC TLAST] where TFIRST/TLAST are the times 24 | % of first/last frames 25 | % ANN annotation cell array: each row contains either 26 | % {time 'text-string' 'font'} or {[t_start t_end] 'text-string' 'font'} where 27 | % the time value is in seconds with s(n) at time offset+n/fs. The font column can be 28 | % omitted in which case the system font will be used.
MATLAB cannot cope with 29 | % unicode so I recommend the SILDoulosIPA (serifed) or SILSophiaIPA (sans) fonts 30 | % for phonetic symbols; these are now a little hard to find. 31 | % 32 | % Outputs: T(NT) time axis values (in seconds). Input sample s(n) is at time offset+n/fs. 33 | % F(NF) frequency axis values in Hz or, unless mode=H, other selected frequency units 34 | % according to mode: m=mel, l=log10(Hz), b=bark,e=erb-rate 35 | % B(NT,NF) spectrogram values in power (or clipped dB values if 'd' option given) 36 | % 37 | % MODE: 'p' = output power per decade rather than power per Hz [preemphasis] 38 | % 'P' = output power per mel/bark/erb according to y axis scaling 39 | % 'd' = output B array is in dB rather than power 40 | % 'D' = clip the output B array to the limits specified by the "db" input 41 | % 42 | % 'm' = mel scale 43 | % 'b' = bark scale 44 | % 'e' = erb scale 45 | % 'l' = log10 Hz frequency scale 46 | % 'f' = label frequency axis in Hz rather than mel/bark/... 47 | % 48 | % 'h' = units of the FMAX input are in Hz instead of mel/bark 49 | % [in this case, the Fstep parameter is used only to determine 50 | % the number of filters] 51 | % 'H' = express the F output in Hz instead of mel/bark/... 52 | % 53 | % 'g' = draw a graph even if output arguments are present 54 | % 'j' = jet colourmap 55 | % 'J' = "thermal" colourmap that is linear in grayscale. Based on Oliver Woodford's 56 | % real2rgb at http://www.mathworks.com/matlabcentral/fileexchange/23342 57 | % 'i' = inverted colourmap (white background) 58 | % 'c' = include a colourbar as an intensity scale 59 | % 'w' = draw the speech waveform above the spectrogram 60 | % 'a' = centre-align annotations rather than left-aligning them 61 | % 't' = add time markers with annotations 62 | % 63 | % The BW input gives the 6dB bandwidth of the Hamming window used in the analysis. 64 | % Equal amplitude frequency components are guaranteed to give separate peaks if they 65 | % are this far apart.
This value also determines the time resolution: the window length is 66 | % 1.81/BW and the low-pass filter applied to amplitude modulations has a 6-dB bandwidth of 67 | % BW/2 Hz. 68 | % 69 | % The units are power per Hz unless the 'P' 70 | % option is given in which case power per displayed unit is used 71 | % or power per decade for the 'l' scale; the 'p' option gives power per decade. 72 | 73 | %%%% BUGS %%%%%% 74 | % * allow ANN rows to be a mixture of intervals and instants 75 | % * allow multiple ANN rows 76 | % * Do not use triangular interpolation if the output frequencies are the same as an FFT 77 | % * Place as many subticks as will fit beyond the last tick with the 'f' option 78 | % * Use a special subtick pattern between ticks that are powers of 10 using the 'f' option 79 | % * Future options: 80 | % ['q' = constant q transform] 81 | % ['k' = add a piano keyboard to the frequency scale] 82 | % ['z' = use a bipolar colourmap for a matrix input with negative values] 83 | 84 | % Copyright (C) Mike Brookes 1997-2011 85 | % Version: $Id: spgrambw.m 713 2011-10-16 14:45:43Z dmb $ 86 | % 87 | % VOICEBOX is a MATLAB toolbox for speech processing. 88 | % Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html 89 | % 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | % This program is free software; you can redistribute it and/or modify 92 | % it under the terms of the GNU General Public License as published by 93 | % the Free Software Foundation; either version 2 of the License, or 94 | % (at your option) any later version. 95 | % 96 | % This program is distributed in the hope that it will be useful, 97 | % but WITHOUT ANY WARRANTY; without even the implied warranty of 98 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 99 | % GNU General Public License for more details.
100 | % 101 | % You can obtain a copy of the GNU General Public License from 102 | % http://www.gnu.org/copyleft/gpl.html or by writing to 103 | % Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA. 104 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 105 | persistent tcmap % 64-row "thermal" colourmap, built on the first call and cached 106 | if isempty(tcmap) 107 | % modified thermal with better grayscale linearity 108 | tcmap=[ 0 0 0; 7 0 17; 14 0 33; 21 0 50; 29 0 67; 36 0 84; 43 0 100; 50 0 117; 109 | 57 0 134; 64 0 150; 72 0 167; 80 3 164; 89 7 156; 97 11 149; 106 15 142; 114 19 134; 110 | 123 23 127; 131 27 119; 140 31 112; 149 35 105; 157 39 97; 166 43 90; 174 47 82; 111 | 183 51 75; 192 55 68; 200 59 60; 209 63 53; 217 67 45; 226 71 38; 234 75 31; 112 | 243 79 23; 252 83 16; 255 88 12; 255 95 12; 255 102 11; 255 109 11; 255 116 10; 113 | 255 123 10; 255 130 9; 255 137 9; 255 144 8; 255 151 8; 255 158 7; 255 165 7; 114 | 255 172 6; 255 179 6; 255 186 5; 255 193 4; 255 200 4; 255 207 3; 255 214 3; 255 221 2; 115 | 255 228 2; 255 235 1; 255 242 1; 255 249 0; 255 252 22; 255 252 55; 255 253 88; 116 | 255 253 122; 255 254 155; 255 254 188; 255 255 222; 255 255 255]/255; 117 | end 118 | if nargin<2 119 | error('Usage: SPGRAMBW(s,fs,mode,bw,fmax,db,tinc)'); 120 | end 121 | %SPGRAMBW Draw grey-scale spectrogram [T,F,B]=(s,fs,mode,bw,fmax,db,tinc) 122 | % 123 | % first decode the input arguments 124 | % 125 | if size(s,1)==1 126 | s=s(:); % force to be a column vector (unless it is a matrix) 127 | end 128 | [ns1,ns2]=size(s); 129 | ap=zeros(1,6); % ap(1)=position in varargin of the (last) character argument (mode); ap(2:6)=positions of the non-character arguments in the order given (bw,fmax,db,tinc,ann) 130 | j=2; 131 | if numel(fs)<2 132 | fs(2)=1/fs(1); % first sample or frame is at time 1/fs 133 | end 134 | for i=1:length(varargin) 135 | if ischar(varargin{i}) 136 | ap(1)=i; 137 | else 138 | ap(j)=i; 139 | j=j+1; 140 | end 141 | end 142 | if ap(1) && ~isempty(varargin{ap(1)}) 143 | mode=varargin{ap(1)}; 144 | else 145 | mode=''; % default mode 146 | end 147 | if ap(2) && ~isempty(varargin{ap(2)}) 148 | bw=varargin{ap(2)}; 149 |
else 150 | bw=200; 151 | end 152 | if ap(3) && ~isempty(varargin{ap(3)}) 153 | fmax=varargin{ap(3)}; 154 | else 155 | fmax=[]; 156 | end 157 | if ap(4) && ~isempty(varargin{ap(4)}) 158 | db=varargin{ap(4)}; 159 | else 160 | db=40; 161 | end 162 | if ap(5) && ~isempty(varargin{ap(5)}) 163 | tinc=varargin{ap(5)}; 164 | else 165 | tinc=0; 166 | end 167 | switch numel(tinc) % normalise tinc to the internal order [increment tfirst tlast] 168 | case 1 169 | tinc=[tinc -Inf Inf]; 170 | case 2 171 | tinc=[0 tinc]; 172 | otherwise 173 | tinc=tinc([2 1 3]); 174 | end 175 | if tinc(1)<=0 176 | tinc(1)=1.81/(4*bw); % default frame increment 177 | end 178 | if ap(6) 179 | ann=varargin{ap(6)}; 180 | else 181 | ann=[]; 182 | end 183 | 184 | % now sort out the mode flags 185 | 186 | mdsw=' '; % [yscale preemph] 187 | for i=1:length(mode) 188 | switch mode(i) 189 | case {'l','m','b','e'} 190 | mdsw(1)=mode(i); 191 | case {'p','P'} 192 | mdsw(2)=mode(i); 193 | end 194 | end 195 | if mdsw(2)=='P' 196 | mdsw(2)=mdsw(1); % preemphasis is scaling dependent 197 | end 198 | % 199 | % sort out the frequency axis 200 | % 201 | flmin=30; % min frequency for 'l' option 202 | nfrq=257; % default number of frequency bins 203 | if ns2==1 204 | fnyq=fs(1)/2; % default upper frequency limit is fs/2 205 | else % input is a power spectrum 206 | if numel(fs)<3 207 | fs(3)=fs(1)*0.25; % default frequency increment is 0.25 times the frame rate fs(1) 208 | end 209 | if numel(fs)<4 210 | fs(4)=0; % first freq bin is DC by default 211 | end 212 | fnyq=fs(4)+(ns2-1)*fs(3); % default upper frequency limit is highest supplied frequency 213 | end 214 | 215 | if ~numel(fmax) % no explicit frequency range 216 | switch mdsw(1) 217 | case 'l' 218 | fx=linspace(log10(flmin),log10(fnyq),nfrq); % flmin to Nyquist 219 | case 'm' 220 | fx=linspace(0,frq2mel(fnyq),nfrq); % DC to Nyquist 221 | case 'b' 222 | fx=linspace(0,frq2bark(fnyq),nfrq); % DC to Nyquist 223 | case 'e' 224 | fx=linspace(0,frq2erb(fnyq),nfrq); % DC to Nyquist 225 | otherwise % linear Hz scale 226 | fx=(0:nfrq-1)*fnyq/(nfrq-1);
227 | end 228 | else 229 | if any(mode=='h') 230 | switch mdsw(1) 231 | case 'l' 232 | fmaxu=log10(fmax); % convert the supplied Hz values to log10(Hz) 233 | case 'm' 234 | fmaxu=frq2mel(fmax); % convert the supplied Hz values to mel 235 | case 'b' 236 | fmaxu=frq2bark(fmax); % convert the supplied Hz values to bark 237 | case 'e' 238 | fmaxu=frq2erb(fmax); % convert the supplied Hz values to erb-rate 239 | otherwise 240 | fmaxu=fmax; % linear Hz scale 241 | end 242 | else 243 | fmaxu=fmax; % already in the correct units 244 | end 245 | if numel(fmax)<2 % only max value specified 246 | if mdsw(1)=='l' 247 | fx=linspace(log10(flmin),fmaxu,nfrq); % flmin to fmax 248 | else 249 | fx=linspace(0,fmaxu,nfrq); % DC to fmax 250 | end 251 | elseif numel(fmax)<3 % min and max values specified 252 | fx=linspace(fmaxu(1),fmaxu(2),nfrq); % fmin to fmax 253 | else 254 | fmaxu(2)=fmax(2)*(fmaxu(3)-fmaxu(1))/(fmax(3)-fmax(1)); % scale the step size appropriately 255 | fx=fmaxu(1):fmaxu(2):fmaxu(3); % fmin to fmax in steps of finc 256 | nfrq=length(fx); 257 | end 258 | end 259 | switch mdsw(1) % convert the frequency range to Hz 260 | case 'l' 261 | f=10.^fx; 262 | frlab='log_{10}Hz'; 263 | frlabf='log'; 264 | frq2y=@log10; 265 | y2frq=@(x) 10.^x; 266 | case 'm' 267 | f=mel2frq(fx); 268 | frlab='Mel'; 269 | frlabf='Mel'; 270 | frq2y=@frq2mel; 271 | y2frq=@mel2frq; 272 | case 'b' 273 | f=bark2frq(fx); 274 | frlab='Bark'; 275 | frlabf='Bark'; 276 | frq2y=@frq2bark; 277 | y2frq=@bark2frq; 278 | case 'e' 279 | f=erb2frq(fx); 280 | frlab='Erb-rate'; 281 | frlabf='Erb'; 282 | frq2y=@frq2erb; 283 | y2frq=@erb2frq; 284 | otherwise 285 | f=fx; 286 | frlab='Hz'; 287 | frq2y=@(x) x; 288 | y2frq=@(x) x; 289 | end 290 | if ~any(mode=='H') 291 | f=fx; % give output frequencies in native units instead of Hz unless 'H' is specified 292 | end 293 | % 294 | % now calculate the spectrogram 295 | % 296 | if ns2==1 % input is a speech signal vector 297 | winlen = fix(1.81*fs(1)/bw); % window length 298 | win=0.54+0.46*cos((1-winlen:2:winlen)*pi/winlen); % Hamming window 299 |
ninc=max(round(tinc(1)*fs(1)),1); % window increment in samples 300 | % we need to take account of minimum freq increment + make it exact if possible 301 | fftlen=pow2(nextpow2(4*winlen)); % enough oversampling to get good interpolation 302 | win=win/sqrt(sum(win.^2)); % ensure window squared sums to unity 303 | ix1=max(round((tinc(2)-fs(2))*fs(1)-(winlen-3)/2),1); % first sample required 304 | ix2=min(ceil((tinc(3)-fs(2))*fs(1)+(winlen+1)/2),ns1); % last sample required 305 | [sf,t]=enframe(s(ix1:ix2),win,ninc); 306 | t=fs(2)+(t+ix1-2)/fs(1); % time axis 307 | b=rfft(sf,fftlen,2); 308 | b=b.*conj(b)*2/fs(1); % Power per Hz 309 | b(:,1)=b(:,1)*0.5; % correct for no negative zero frequency to double the power 310 | b(:,end)=b(:,end)*0.5; % correct for no negative nyquist frequency to double the power 311 | fb=(0:fftlen/2)*fs(1)/fftlen; % fft bin frequencies 312 | fftfs=fs(1); 313 | else 314 | 315 | b=s; 316 | t=fs(2)+(0:ns1-1)/fs(1); % frame times 317 | fb=fs(4)+(0:ns2-1)*fs(3); 318 | fftlen=[ns2 fs(3) fs(4)]; % for filtbankm: ns2=# input freq bins, freq increment (Hz), first bin freq (Hz) 319 | fftfs=0; 320 | % fftlen=2*(ns2-1); % assume an even length fft 321 | % fftfs=fftlen*fs(3); 322 | end 323 | nfr=numel(t); % number of frames 324 | dblab='Power/Hz'; 325 | switch mdsw(2) 326 | case {'p','l'} 327 | b=b.*repmat(fb*log(10),nfr,1); % convert to power per decade 328 | dblab='Power/Decade'; 329 | case 'm' 330 | b=b.*repmat((1+fb/700)*log(1+1000/700)/1000,nfr,1); % convert to power per mel 331 | dblab='Power/Mel'; 332 | case 'b' 333 | b=b.*repmat((1960+fb).^2/52547.6,nfr,1); % convert to power per bark 334 | dblab='Power/Bark'; 335 | case 'e' 336 | b=b.*repmat(6.23*fb.^2 + 93.39*fb + 28.52,nfr,1); % convert to power per erb 337 | dblab='Power/Erb-rate'; 338 | end 339 | % 340 | % Now map onto the desired frequency scale 341 | % 342 | b=b*filtbankm(nfrq,fftlen,fftfs,fx(1),fx(end),['cush' mdsw(1)])'; 343 | 344 | if ~nargout || any(mode=='g') || any(mode=='d') 345 | if
numel(db)<2 % find clipping limits 346 | plim=max(b(:))*[0.1^(0.1*db) 1]; 347 | else 348 | plim=10.^(0.1*db(1:2)); 349 | end 350 | if plim(2)<=0 351 | plim(2)=1; 352 | end 353 | if plim(1)<=0 || plim(1)==plim(2) 354 | plim(1)=0.1*plim(2); 355 | end 356 | if ~nargout || any(mode=='g') 357 | bd=10*log10(b); % save an unclipped log version for plotting 358 | end 359 | if any(mode=='D') 360 | b=min(max(b,plim(1)),plim(2)); % clip the output 361 | end 362 | if any(mode=='d') 363 | b=10*log10(b); % output the dB version 364 | end 365 | end 366 | % now plot things 367 | if ~nargout || any(mode=='g') 368 | cla; % clear current axis 369 | imagesc(t,fx,bd'); 370 | axis('xy'); 371 | set(gca,'tickdir','out','clim',10*log10(plim)); 372 | if any(mode=='j') 373 | colormap('jet'); 374 | map=colormap; 375 | elseif any(mode=='J') 376 | map=tcmap; 377 | else 378 | map = repmat((0:63)'/63,1,3); 379 | end 380 | if any(mode=='i') % 'i' option = invert the colourmap 381 | map=map(64:-1:1,:); % reverses rows 1..64 only, i.e. assumes a 64-row map (true for tcmap and the grey map above; NOTE(review): confirm for 'jet') 382 | end 383 | colormap(map); 384 | if any(mode=='c') % 'c' option = show a colourbar 385 | colorbar; 386 | cblabel([dblab ' (dB)']); 387 | end 388 | % 389 | % Now check if annotations or a waveform are required 390 | % 391 | dotaw=[((any(mode=='t') && size(ann,2)>1) || size(ann,2)==1) size(ann,2)>1 (any(mode=='w') && ns2==1)]; % flags: [draw time markers, print annotation text, plot waveform] 392 | ylim=get(gca,'ylim'); 393 | if any(dotaw) 394 | yrange = ylim(2)-ylim(1); 395 | zlim=ylim; 396 | toptaw=cumsum([0 dotaw.*[0.05 0.05 0.1]]*yrange)+ylim(2); % cumulative upper edges of the marker/text/waveform strips stacked above the spectrogram 397 | zlim(2)=toptaw(4); 398 | set(gca,'ylim',zlim,'color',map(1,:)); 399 | if dotaw(3) % Plot the waveform 400 | smax=max(s(:)); 401 | smin=min(s(:)); 402 | srange=smax-smin; 403 | hold on 404 | plot(fs(2)+(0:length(s)-1)/fs(1),(s-smin)/srange*0.9*(toptaw(4)-toptaw(3))+toptaw(3),'color',map(48,:)) 405 | hold off 406 | end 407 | if dotaw(1) || dotaw(2) 408 | tmk=cell2mat(ann(:,1)); 409 | tmksel=tmk(:,1)<=t(end) & tmk(:,end)>=t(1); 410 | yix=1+[tmk(tmksel,1)t(end)]'; % NOTE(review): this expression juxtaposes two operands with no operator between them, so text appears to have been lost from this copy - confirm against the upstream VOICEBOX source 411 | tmk(:,1)=max(tmk(:,1),t(1)); % clip to axis
limits 412 | tmk(:,end)=min(tmk(:,end),t(end)); 413 | end 414 | if dotaw(1) && any(tmksel) % draw time markers 415 | ymk=toptaw(1:2)*[0.8 0.4;0.2 0.6]; 416 | switch size(tmk,2) 417 | case 0 418 | case 1 % isolated marks 419 | hold on 420 | plot([tmk(tmksel) tmk(tmksel)]',repmat(ymk',1,sum(tmksel)),'color',map(48,:)); 421 | hold off 422 | otherwise % draw durations 423 | 424 | hold on 425 | plot(tmk(tmksel,[1 1 2 2])',ymk(yix),'color',map(48,:)); 426 | hold off 427 | end 428 | end 429 | if dotaw(2) && any(tmksel) % print annotations 430 | if any(mode=='a') 431 | horal='center'; 432 | tmk=(tmk(:,1)+tmk(:,end))*0.5; 433 | else 434 | horal='left'; 435 | tmk=tmk(:,1); 436 | end 437 | if size(ann,2)>2 438 | font='Arial'; 439 | for i=1:size(ann,1) 440 | if tmksel(i) 441 | if ~isempty(ann{i,3}) 442 | font = ann{i,3}; 443 | end 444 | text(tmk(i),toptaw(2),ann{i,2},'color',map(48,:),'fontname',font,'VerticalAlignment','baseline','HorizontalAlignment',horal); 445 | end 446 | end 447 | else 448 | for i=1:size(ann,1) 449 | if tmksel(i) 450 | text(tmk(i),toptaw(2),ann{i,2},'color',map(48,:),'VerticalAlignment','baseline','HorizontalAlignment',horal); 451 | end 452 | end 453 | end 454 | end 455 | end 456 | xlabel(['Time (' xticksi 's)']); 457 | if any(mode=='f') && ~strcmp(frlab,'Hz') 458 | ylabel([frlabf '-scaled frequency (Hz)']); 459 | ytickhz(frq2y,y2frq); 460 | else 461 | ylabel(['Frequency (' yticksi frlab ')']); 462 | end 463 | ytick=get(gca,'YTick'); 464 | ytickl=get(gca,'YTickLabel'); 465 | msk=ytick<=ylim(2); % keep only ticks at or below the spectrogram's own upper y-limit (ylim was read before the axis was extended to zlim) 466 | if any(~msk) 467 | set(gca,'YTick',ytick(msk),'YTickLabel',ytickl(msk)); 468 | end 469 | end 470 | 471 | function ytickhz(frq2y,y2frq) 472 | % label non linear y frequency axis 473 | % 474 | % Bugs/Suggestions: 475 | % * Add a penalty for large numbers (e.g.
94 is less "round" than 11) 476 | % * possibly add subticks at 1:2:5 if boundaries are 1 and 10 477 | % * could treat subtick allocation specially if bounding labels are both powers of 10 478 | % and work in log spacing rather than spacing directly 479 | 480 | % algorithm constants 481 | 482 | seps=[0.4 1 3 6]; % spacings: (a) min subtick, (b) min tick, (c) min good tick, (d) max good tick 483 | ww=[0.5 0.6 0.8 0.1 0.3 0.3 0.2]; % weight for (a) last digit=5, (b) power of 10, (c) power of 1000, (d) equal spacing, (e) 1:2:5 labels, (f) label closer than seps(3), (g) label further than seps(4) 484 | nbest=10; % number of possibilities to track 485 | 486 | prefix={'y','z','a','f','p','n','�','m','','k','M','G','T','P','E','Z','Y'}; % unit prefixes, indexed below as prefix{sipref+9} so entry 9 ('') means no prefix. NOTE(review): the 7th entry is an undecodable character in this copy - presumably the micro sign; confirm against upstream 487 | 488 | ah=gca; 489 | getgca=get(ah); % Get original axis properties 490 | set(ah,'Units','points','FontUnits','points'); 491 | getgcac=get(ah); % Get axis properties in points units 492 | set(ah,'Units',getgca.Units,'FontUnits',getgca.FontUnits); % return to original values 493 | ylim=getgca.YLim; 494 | yrange=ylim*[-1;1]; 495 | chsz= yrange*getgcac.FontSize/getgcac.Position(4); % char height in Y-units 496 | % divide the y-axis up into bins containing at most one label each 497 | maxl=ceil(2*yrange/chsz); % max number of labels 498 | 499 | % candidate array [cand(:,[1 2])/1000 cand(:,5) cand(:,6)/1000 cand(:,[7 8])] 500 | % 1,2=y limits, 3,4=log limits, 5=Hz, 6=cost, 7=mantissa, 8=exponent, 9=sig digits, 10=y-position 501 | cand=zeros(maxl+2,10); 502 | yinc=(yrange+chsz*0.0002)/maxl; % bin spacing (allowing for a tiny bit to ensure the ends are included) 503 | cand(2:end-1,2)=ylim(1)+yinc*(1:maxl)'-chsz*0.0001; 504 | cand(3:end-1,1)=cand(2:end-2,2); 505 | cand(2,1)=cand(2,2)-yinc; 506 | cand(2:end-1,1:2)=y2frq(max(cand(2:end-1,1:2),0)); 507 | 508 | % find the "roundest" number in each interval 509 | % first deal with intervals containing zero 510 | cand([1 maxl+2],6)=-1; 511 | cand(2,9)=(cand(2,1)<=0); % mask out interval containing zero 512 | cand(2,6)=-cand(2,9); 513 |
msk=cand(:,6)==0; % find rows without a cost yet 514 | cand(msk,3:4)=log10(cand(msk,1:2)); 515 | % find powers of 1000 516 | loglim=ceil(cand(:,3:4)/3); 517 | msk=loglim(:,2)>loglim(:,1); 518 | if any(msk) 519 | xp=loglim(msk,1); 520 | wuns=ones(length(xp),1); 521 | cand(msk,5:9)=[1000.^xp wuns-ww(3) wuns 3*xp wuns]; 522 | end 523 | % find powers of 10 524 | loglim=ceil(cand(:,3:4)); 525 | msk=~msk & (loglim(:,2)>loglim(:,1)); 526 | if any(msk) 527 | xp=loglim(msk,1); 528 | wuns=ones(length(xp),1); 529 | cand(msk,5:9)=[10.^xp wuns-ww(2) wuns xp wuns]; 530 | end 531 | % find value with fewest digits 532 | msk=cand(:,6)==0; % find rows without a cost yet 533 | maxsig=1-floor(log10(10^min(cand(msk,3:4)*[-1;1])-1)); % maximum number of significant figures to consider 534 | pten=10.^(0:maxsig-1); % row vector of powers of ten 535 | noten=10.^(-floor(cand(msk,3))); % exponent of floating point representation of lower bound 536 | sigdig=sum((ceil(cand(msk,2).*noten*pten)-ceil(cand(msk,1).*noten*pten))==0,2); % number of digits common to the interval bounds 537 | lowman=ceil(cand(msk,1).*noten.*10.^sigdig); 538 | midman=10*floor(lowman/10)+5; 539 | highman=ceil(cand(msk,2).*noten.*10.^sigdig); 540 | mskman=midman>=lowman & midman1+(maxl+2)*(i==maxl+2); % mask for label triplets. NOTE(review): the line numbering of this copy jumps from 540 to 561 here and ntry, prevc, prevprev, labcnt, costs and ratint are used below without a visible definition - the intervening statements appear to be missing; restore from the upstream VOICEBOX source 561 | labcnti=labcnt(1:ntry)+1; 562 | disti=(cand(i,10)-cand(prevc,10))/chsz; % distance to previous label in characters 563 | costa=max(seps(3)-disti,0)*ww(6)+max(disti-seps(4),0)*ww(7); 564 | incri=(cand(i,5)-cand(prevc,5)); % label increment 565 | incrj=(cand(i,5)-cand(prevprev,5)); % double label increment 566 | if any(msk) 567 | costa(msk)=costa(msk)- ww(4)*(abs(incrj(msk)-2*incri(msk))<0.01*incri(msk)); 568 | if cand(i,7)==1 || cand(i,7)==2 || cand(i,7)==5 % look for labels 1:2:5 569 | costa(msk)=costa(msk)- ww(5)*(abs(incrj(msk)-ratint(cand(i,7))*incri(msk))<0.01*incri(msk)); 570 | end 571 | end 572 | costa(disti1 && costs(i,maxl+2)=2 % NOTE(review): the line numbering of this copy jumps from 572 to 610 here and ntick, labchoose and subpos are used below without a visible definition - the intervening statements appear to be missing; restore from the upstream VOICEBOX source 610 | for i=1:ntick-1 611 | clj=cand(labchoose(i:i+1),:);
612 | sprec=min(clj(1,8)+100*(clj(1,7)==0),clj(2,8)); % subtick precision 613 | spos=(clj(1,7)*10^(clj(1,8)-sprec):clj(2,7)*10^(clj(2,8)-sprec))*10^sprec; 614 | nsub=length(spos); 615 | if nsub==2 616 | spos=spos*[1 0.5 0;0 0.5 1]; 617 | nsub=3; 618 | end 619 | if nsub>=3 620 | yspos=frq2y(spos); 621 | for kk=1:3 % try various subdivisions: every 1, 2 or 5 622 | k=kk+2*(kk==3); % 1, 2 and 5 623 | if 2*k<=nsub-1 && ~mod(nsub-1,k) % must divide exactly into nsub 624 | if all((yspos(1+k:k:nsub)-yspos(1:k:nsub-k))>=(seps(1)*chsz)) % check they all fit in 625 | subpos=[subpos yspos(1+k:k:nsub-k)]; 626 | if i==1 627 | spos=(ceil(cand(2,1)/10^sprec):clj(1,7)*10^(clj(1,8)-sprec))*10^sprec; 628 | nsub=length(spos); 629 | yspos=frq2y(spos); 630 | if nsub>=k+1 && all((yspos(nsub:-k:1+k)-yspos(nsub-k:-k:1))>=(seps(1)*chsz)) 631 | subpos=[subpos yspos(nsub-k:-k:1)]; 632 | end 633 | elseif i==ntick-1 634 | spos=(clj(2,7)*10^(clj(2,8)-sprec):floor(cand(end-1,2)/10^sprec))*10^sprec; 635 | nsub=length(spos); 636 | yspos=frq2y(spos); 637 | if nsub>=k+1 && all((yspos(1+k:k:nsub)-yspos(1:k:nsub-k))>=(seps(1)*chsz)) 638 | subpos=[subpos yspos(1+k:k:nsub)]; 639 | end 640 | end 641 | break; 642 | end 643 | end 644 | end 645 | end 646 | end 647 | end 648 | nsub=length(subpos); 649 | tickpos=[cand(labchoose,10); subpos']; 650 | ticklab=cell(ntick+nsub,1); 651 | sipref=min(max(floor((sum(cand(labchoose,8:9),2)-1)/3),-8),8); % power-of-1000 prefix index for each label, limited to -8..8 to stay inside the prefix list 652 | nzadd=cand(labchoose,8)-3*sipref; % trailing zeros to add 653 | digzer=cand(labchoose,7).*10.^max(nzadd,0); % label digits including trailing zeros 654 | ndleft=cand(labchoose,9)+nzadd; % digits to the left of the decimal point 655 | for i=1:ntick 656 | tickint=num2str(digzer(i)); 657 | if nzadd(i)<0 658 | tickint=[tickint(1:ndleft(i)) '.'
tickint(1+ndleft(i):end)]; 659 | end 660 | ticklab{i} = sprintf('%s%s',tickint,prefix{sipref(i)+9}); 661 | end 662 | for i=ntick+1:ntick+nsub 663 | ticklab{i}=''; 664 | end 665 | [tickpos,ix]=sort(tickpos); 666 | ticklab=ticklab(ix); 667 | 668 | set(ah,'YTick',tickpos','YTickLabel',ticklab); 669 | 670 | --------------------------------------------------------------------------------