├── CMAKE_HELPERS
    ├── FindEigen2.cmake
    └── FindFFTW.cmake
├── CMakeLists.txt
├── LICENSE
├── README.md
├── VAD.h
├── sound.txt
└── voice_detection.cpp


/CMAKE_HELPERS/FindEigen2.cmake:
--------------------------------------------------------------------------------
# - Try to find Eigen2 lib
#
# This module supports requiring a minimum version, e.g. you can do
#   find_package(Eigen2 2.0.3)
# to require version 2.0.3 or newer of Eigen2.
#
# Once done this will define
#
#  EIGEN2_FOUND - system has eigen lib with correct version
#  EIGEN2_INCLUDE_DIR - the eigen include directory
#  EIGEN2_VERSION - eigen version

# Copyright (c) 2006, 2007 Montel Laurent,
# Copyright (c) 2008, 2009 Gael Guennebaud,
# Redistribution and use is allowed according to the terms of the BSD license.

# Default the requested version to 2.0.0 when the caller did not ask for one.
if(NOT Eigen2_FIND_VERSION)
  if(NOT Eigen2_FIND_VERSION_MAJOR)
    set(Eigen2_FIND_VERSION_MAJOR 2)
  endif()
  if(NOT Eigen2_FIND_VERSION_MINOR)
    set(Eigen2_FIND_VERSION_MINOR 0)
  endif()
  if(NOT Eigen2_FIND_VERSION_PATCH)
    set(Eigen2_FIND_VERSION_PATCH 0)
  endif()

  set(Eigen2_FIND_VERSION "${Eigen2_FIND_VERSION_MAJOR}.${Eigen2_FIND_VERSION_MINOR}.${Eigen2_FIND_VERSION_PATCH}")
endif()

# _eigen2_check_version()
#
# Reads the EIGEN_{WORLD,MAJOR,MINOR}_VERSION macros from
# ${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h and sets, in the caller's
# scope (this is deliberately a macro):
#   EIGEN2_WORLD_VERSION / EIGEN2_MAJOR_VERSION / EIGEN2_MINOR_VERSION
#   EIGEN2_VERSION     - "<world>.<major>.<minor>", empty if it cannot be read
#   EIGEN2_VERSION_OK  - TRUE only if world == 2, major <= 10 and the version
#                        is not older than Eigen2_FIND_VERSION
macro(_eigen2_check_version)
  set(EIGEN2_VERSION_OK FALSE)
  set(EIGEN2_VERSION "")
  set(EIGEN2_WORLD_VERSION "")
  set(EIGEN2_MAJOR_VERSION "")
  set(EIGEN2_MINOR_VERSION "")
  set(_eigen2_macros_header "${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h")

  # file(READ) is a hard error on a missing file; treat that as "wrong version"
  # instead of aborting the whole configure step.
  if(EXISTS "${_eigen2_macros_header}")
    file(READ "${_eigen2_macros_header}" _eigen2_version_header)

    # CMAKE_MATCH_1 is only trusted when the corresponding match succeeded.
    string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen2_world_version_match "${_eigen2_version_header}")
    if(NOT "${_eigen2_world_version_match}" STREQUAL "")
      set(EIGEN2_WORLD_VERSION "${CMAKE_MATCH_1}")
    endif()
    string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen2_major_version_match "${_eigen2_version_header}")
    if(NOT "${_eigen2_major_version_match}" STREQUAL "")
      set(EIGEN2_MAJOR_VERSION "${CMAKE_MATCH_1}")
    endif()
    string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen2_minor_version_match "${_eigen2_version_header}")
    if(NOT "${_eigen2_minor_version_match}" STREQUAL "")
      set(EIGEN2_MINOR_VERSION "${CMAKE_MATCH_1}")
    endif()
  endif()

  if(NOT "${EIGEN2_WORLD_VERSION}" STREQUAL ""
     AND NOT "${EIGEN2_MAJOR_VERSION}" STREQUAL ""
     AND NOT "${EIGEN2_MINOR_VERSION}" STREQUAL "")
    set(EIGEN2_VERSION "${EIGEN2_WORLD_VERSION}.${EIGEN2_MAJOR_VERSION}.${EIGEN2_MINOR_VERSION}")

    # CMake has no NOTEQUAL operator; the negation must be spelled NOT ... EQUAL.
    if(EIGEN2_WORLD_VERSION EQUAL 2
       AND NOT EIGEN2_MAJOR_VERSION GREATER 10
       AND NOT EIGEN2_VERSION VERSION_LESS "${Eigen2_FIND_VERSION}")
      set(EIGEN2_VERSION_OK TRUE)
    endif()
  endif()

  if(NOT EIGEN2_VERSION_OK)
    if("${EIGEN2_VERSION}" STREQUAL "")
      message(STATUS "Could not read the Eigen2 version from ${_eigen2_macros_header}")
    else()
      message(STATUS "Eigen2 version ${EIGEN2_VERSION} found in ${EIGEN2_INCLUDE_DIR}, "
                     "but at least version ${Eigen2_FIND_VERSION} is required")
    endif()
  endif()
endmacro()

if(EIGEN2_INCLUDE_DIR)
  # in cache already: do not repeat the "Found Eigen2" message
  set(Eigen2_FIND_QUIETLY TRUE)
endif()

# find_path() is a no-op when EIGEN2_INCLUDE_DIR is already set in the cache.
find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core
    PATHS
    ${INCLUDE_INSTALL_DIR}
    ${KDE4_INCLUDE_DIR}
    PATH_SUFFIXES eigen2
  )

if(EIGEN2_INCLUDE_DIR)
  _eigen2_check_version()
endif()

# Always go through the standard handler so that REQUIRED/QUIET are honoured on
# the cached path as well.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Eigen2 DEFAULT_MSG EIGEN2_INCLUDE_DIR EIGEN2_VERSION_OK)

# Keep the documented upper-case result variable in sync with the
# case-preserving one that the handler sets.
if(DEFINED Eigen2_FOUND)
  set(EIGEN2_FOUND "${Eigen2_FOUND}")
endif()

mark_as_advanced(EIGEN2_INCLUDE_DIR)


--------------------------------------------------------------------------------
/CMAKE_HELPERS/FindFFTW.cmake:
--------------------------------------------------------------------------------
# - Find FFTW
# Find the native FFTW includes and library
#
#  FFTW_INCLUDES    - where to find fftw3.h
#  FFTW_LIBRARIES   - List of libraries when using FFTW.
#  FFTW_FOUND       - True if FFTW found.
if(FFTW_INCLUDES)
  # Already in cache, be silent
  set(FFTW_FIND_QUIETLY TRUE)
endif()

find_path(FFTW_INCLUDES fftw3.h)

# libfftw3-3 is the name used by the pre-built Windows packages.
find_library(FFTW_LIBRARIES NAMES fftw3 libfftw3-3)

# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
# all listed variables are TRUE
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_LIBRARIES FFTW_INCLUDES)

mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES)

# Imported target FFTW::FFTW: carries the library location and its include
# directory, so consumers only need target_link_libraries(<tgt> PRIVATE FFTW::FFTW).
if(FFTW_FOUND AND NOT TARGET FFTW::FFTW)
  add_library(FFTW::FFTW UNKNOWN IMPORTED)
  set_target_properties(FFTW::FFTW PROPERTIES
    IMPORTED_LOCATION "${FFTW_LIBRARIES}"
    INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}"
  )
endif()


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# cmake_minimum_required() must come before project(): it establishes the
# policy defaults that project() relies on.
cmake_minimum_required(VERSION 3.5)
project(vad LANGUAGES CXX)

# Make the bundled find-modules (FindFFTW.cmake, ...) visible to find_package().
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMAKE_HELPERS")

find_package(Eigen3 REQUIRED)
find_package(FFTW REQUIRED)

# VAD.h is listed so that IDE generators show it; it is header-only.
add_executable(vad
  voice_detection.cpp
  VAD.h
)

# The README runs the program as ../bin/vad from the build directory, so the
# executable is still placed in <source>/bin.
set_target_properties(vad PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin"
)

# Warnings for Debug builds only, scoped to this target.
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
  target_compile_options(vad PRIVATE "$<$<CONFIG:Debug>:-Wall>")
endif()

# Eigen >= 3.3 ships a package config with the Eigen3::Eigen target; older
# find-modules only provide EIGEN3_INCLUDE_DIR.
if(TARGET Eigen3::Eigen)
  target_link_libraries(vad PRIVATE Eigen3::Eigen)
else()
  target_include_directories(vad SYSTEM PRIVATE "${EIGEN3_INCLUDE_DIR}")
endif()

# Link the FFTW that find_package() located instead of a hard-coded -lfftw3.
target_link_libraries(vad PRIVATE FFTW::FFTW)

# FFTW depends on the math library on Unix-like systems.
if(UNIX)
  target_link_libraries(vad PRIVATE m)
endif()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Minh Nguyen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repo implements a voice activity detection (VAD) algorithm. The code is based on this paper:

Ramírez, Javier, José C.
Segura, Carmen Benıtez, Angel De La Torre, and Antonio Rubio. 4 | "Efficient voice activity detection algorithms using long-term speech information." 5 | 6 | And the Matlab code version (Thanks for the isrish making such a beautiful program): 7 | 8 | https://github.com/isrish/VAD-LTSD 9 | 10 | 11 | This cpp file take input audio data from txt file and write results to 'example*.txt' file. At the end, I use gnuplot to plot the result and it works exactly the same as the Matlab Code from isrish. 12 | 13 | # Compile and run 14 | You need FFTW and Eigen3 libraries. 15 | ``` 16 | $ mkdir build & cd build 17 | $ cmake .. & make 18 | $ ../bin/./vad 19 | ``` 20 | ## Visualizing the result 21 | ``` 22 | $ gnuplot 23 | $ > set multiplot 24 | $ > plot "sound.txt" with 1:2 lines 25 | $ > filename(n) = sprintf("example_%d.txt", n) 26 | $ > plot for [i=1:10] filename(i) using 1:2 with lines 27 | ``` 28 | -------------------------------------------------------------------------------- /VAD.h: -------------------------------------------------------------------------------- 1 | /* 2 | * VAD.h 3 | * 4 | * Created on: May 27, 2016 5 | * Author: dmngu9 6 | */ 7 | 8 | #ifndef VAD_H_ 9 | #define VAD_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace std; 21 | using namespace Eigen; 22 | 23 | const double PI = 3.14; 24 | 25 | class VAD{ 26 | 27 | private: 28 | int winSize; 29 | int signalSize; 30 | int NFFT2; 31 | int order; 32 | double threshold; 33 | MatrixXd enFrame; 34 | RowVectorXd hamming; 35 | VectorXd averageNoise; 36 | MatrixXd amplitude; 37 | 38 | public: 39 | 40 | VAD(int winSize, int signalSize, int order,double threshold){ 41 | this->winSize = winSize; 42 | this->signalSize = signalSize; 43 | this->order = order; 44 | this->threshold = threshold; 45 | this->enFrame = MatrixXd::Zero(this->winSize, this->signalSize/(this->winSize*0.5)); 46 | this->hamming = createHammingWindow(); 47 
| this->NFFT2 = this->winSize/2; 48 | } 49 | 50 | ~VAD(){} 51 | 52 | RowVectorXd createHammingWindow(){ 53 | double alpha = 0.54; 54 | double beta = 0.46; 55 | RowVectorXd hamming = RowVectorXd::Zero(winSize); 56 | for(int i = 0; i < winSize; i++){ 57 | hamming(i) = alpha - beta*cos((2*PI*i)/(winSize-1)); 58 | } 59 | return hamming; 60 | } 61 | 62 | void buffer(double* signal){ 63 | MatrixXd upper = MatrixXd::Zero(this->winSize/2,this->signalSize/(this->winSize*0.5)); 64 | MatrixXd below = MatrixXd::Zero(this->winSize/2,this->signalSize/(this->winSize*0.5)); 65 | 66 | int j = 0; 67 | int k = 0; 68 | for(int i =0; i < this->signalSize; i++){ 69 | below(j,k) = signal[i]; 70 | j++; 71 | if(j == this->winSize/2){ 72 | j = 0; 73 | k++; 74 | if(k < this->signalSize/(this->winSize*0.5)) 75 | upper.col(k) = below.col(k-1); 76 | } 77 | } 78 | 79 | this->enFrame.topRows(this->winSize/2)= upper; 80 | this->enFrame.bottomRows(this->winSize/2) = below; 81 | } 82 | 83 | VectorXd fft_calc(double* fft_input){ 84 | VectorXd result = VectorXd::Zero(NFFT2); 85 | fftw_complex* fft_result; 86 | fftw_plan p; 87 | 88 | fft_result = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * this->winSize); 89 | p = fftw_plan_dft_r2c_1d(this->winSize, fft_input, fft_result,FFTW_ESTIMATE); 90 | fftw_execute(p); 91 | fftw_destroy_plan(p); 92 | 93 | for(int i = 0; i < this->NFFT2; i++){ 94 | result(i) = sqrt(pow(fft_result[i][0],2)+ pow(fft_result[i][1],2)); 95 | } 96 | fftw_free(fft_result); 97 | return result; 98 | } 99 | 100 | VectorXd computeNoiseAverageSpectrum(){ 101 | VectorXd averageNoiseSpectrum; 102 | int wnum = this->enFrame.cols(); 103 | VectorXd avgAmp = VectorXd::Zero(this->NFFT2); 104 | for(int i = 0; i < wnum; i++){ 105 | VectorXd s = this->enFrame.col(i);//got 6 in each col 106 | double fft_input[this->winSize];//winsize 6 107 | for(int j = 0; j < this->winSize; j++){ 108 | fft_input[j] = this->hamming(j) * s(j);//size of 6 109 | } 110 | 111 | VectorXd temp = fft_calc(fft_input); 112 
| avgAmp += temp; 113 | } 114 | averageNoiseSpectrum = avgAmp/wnum; 115 | return averageNoiseSpectrum; 116 | } 117 | 118 | //signal is enFrame 119 | VectorXd getAmplitude(int index){ 120 | VectorXd amp; 121 | if(amplitude.rows() > index){ 122 | amp = amplitude.row(index); 123 | } 124 | else{ 125 | VectorXd s = this->enFrame.col(index); 126 | double fft_input[this->winSize]; 127 | for(int j = 0; j < this->winSize; j++){ 128 | fft_input[j] = this->hamming(j) * s(j);//size of 6 129 | } 130 | amp = fft_calc(fft_input); 131 | amplitude.conservativeResize(amplitude.rows()+1, this->NFFT2); 132 | amplitude.row(index) = amp; 133 | } 134 | return amp; 135 | } 136 | 137 | VectorXd findMax(VectorXd& a, VectorXd& b){ 138 | VectorXd result = VectorXd::Zero(this->NFFT2); 139 | for(int i = 0; i < this->NFFT2; i++){ 140 | result(i) = (a(i) > b(i)) ? a(i) : b(i); 141 | } 142 | return result; 143 | } 144 | 145 | VectorXd ltse(int index){ 146 | VectorXd maxmag = VectorXd::Zero(this->NFFT2); 147 | VectorXd maxamp; 148 | int i = index - order; 149 | while(i != index+order){ 150 | VectorXd amp = getAmplitude(i); 151 | maxamp = findMax(amp,maxmag); 152 | i++; 153 | } 154 | return maxamp; 155 | } 156 | 157 | double ltsd(int index){ 158 | if(index < (this->order) || (index+order >= this->enFrame.cols())){ 159 | return 0.0; 160 | } 161 | 162 | VectorXd ltseOutput = ltse(index); 163 | ltseOutput = ltseOutput.array().square(); 164 | VectorXd sp = ltseOutput.array()/this->averageNoise.array(); 165 | double sum = 0; 166 | for(int i = 0; i < sp.size(); i++){ 167 | sum += sp(i)/this->NFFT2; 168 | } 169 | 170 | double result = 10 * log10(sum); 171 | 172 | if(result < this->threshold){ 173 | this->averageNoise = 0.54 * this->averageNoise + (1-0.54)*sum*VectorXd::Ones(averageNoise.size()); 174 | } 175 | return result; 176 | } 177 | 178 | vector compute(double* signal){ 179 | buffer(signal); 180 | int wnum = this->enFrame.cols(); 181 | vector ltsds; 182 | this->averageNoise = 
computeNoiseAverageSpectrum().array().square(); 183 | for(int i = 0; i < wnum; i++){ 184 | ltsds.push_back(ltsd(i)); 185 | } 186 | return ltsds; 187 | } 188 | 189 | MatrixXd getNormalizedEnFrame(){ 190 | MatrixXd upper = MatrixXd::Zero(this->winSize/2,this->signalSize/(this->winSize*0.5)); 191 | MatrixXd below = MatrixXd::Zero(this->winSize/2,this->signalSize/(this->winSize*0.5)); 192 | MatrixXd result = MatrixXd::Zero(this->winSize,this->signalSize/(this->winSize*0.5)); 193 | 194 | int j = 0; 195 | int k = 0; 196 | for(int i = 1; i < this->signalSize+1; i++){ 197 | below(j,k) = i; 198 | j++; 199 | if(j == this->winSize/2){ 200 | j = 0; 201 | k++; 202 | if(k < this->signalSize/(this->winSize*0.5)) 203 | upper.col(k) = below.col(k-1); 204 | } 205 | } 206 | 207 | result.topRows(this->winSize/2)= upper; 208 | result.bottomRows(this->winSize/2) = below; 209 | return result; 210 | } 211 | }; 212 | 213 | #endif /* VAD_H_ */ 214 | -------------------------------------------------------------------------------- /voice_detection.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * main.cpp 3 | * 4 | * Created on: May 26, 2016 5 | * Author: dmngu9 6 | */ 7 | #include "VAD.h" 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char** argv){ 14 | 15 | const int fs = 11025; 16 | const int WinSize = 256; 17 | const int order = 5; 18 | const double threshold = -6; 19 | const int uSize = 4; 20 | 21 | ifstream soundFile(argv[1]); 22 | vector sound; 23 | 24 | if(soundFile.is_open()){ 25 | double value; 26 | while(soundFile >> value){ 27 | sound.push_back(value); 28 | if(sound.size() == 50176) 29 | break; 30 | } 31 | soundFile.close(); 32 | } 33 | 34 | double* signal = &sound[0]; 35 | 36 | VAD vad(WinSize,sound.size(),order,threshold); 37 | vector outcome = vad.compute(signal); 38 | MatrixXd enFrame = vad.getNormalizedEnFrame(); 39 | 40 | double maxLevel = 0; 41 | for(int i = 0; i < sound.size(); i++){ 42 
| if(maxLevel < abs(sound[i])) 43 | maxLevel = abs(sound[i]); 44 | } 45 | 46 | maxLevel += 0.01*maxLevel; 47 | vector idx(outcome.size(),0); 48 | for(int i = 0; i < outcome.size(); i++){ 49 | idx[i] = (outcome[i] > 2.5) ? 1 : 0; 50 | } 51 | 52 | VectorXd d = VectorXd::Zero(idx.size()-1); 53 | VectorXd vadStart, vadEnd; 54 | 55 | for(int i = 0; i < outcome.size()-1; i++){ 56 | d(i) = idx[i+1] - idx[i]; 57 | if(d(i) == 1){ 58 | vadStart.conservativeResize(vadStart.size()+1); 59 | vadStart(vadStart.size()-1) = i; 60 | } 61 | else if (d(i) == -1){ 62 | vadEnd.conservativeResize(vadEnd.size()+1); 63 | vadEnd(vadEnd.size()-1) = i; 64 | } 65 | } 66 | 67 | double q = (double) WinSize/fs; 68 | VectorXd temp = vadEnd - vadStart; 69 | VectorXd len = temp*q; 70 | vector VAD_begin, VAD_end; 71 | for(int i = 0; i < len.size(); i++){ 72 | if(len(i) >= (uSize*WinSize/fs)){ 73 | VAD_begin.push_back(vadStart(i)); 74 | VAD_end.push_back(vadEnd(i)); 75 | } 76 | } 77 | 78 | //plot sound wave here 79 | cout << enFrame.row(0) << endl; 80 | for(int i = 0; i < VAD_begin.size(); i++){ 81 | double x_start = enFrame(0,VAD_begin[i]+1) + 0.5*order*WinSize; 82 | double x_end = enFrame(enFrame.rows()-1,VAD_end[i]+1) + 0.5*order*WinSize; 83 | // VectorXd x, y; 84 | // x << x_start, x_end, x_end, x_start, x_start; 85 | // y << maxLevel, maxLevel, -maxLevel, -maxLevel, maxLevel; 86 | 87 | stringstream ss; 88 | ss << "example_" << i << ".txt"; 89 | ofstream myfile; 90 | cout << ss.str(); 91 | myfile.open (ss.str().c_str()); 92 | myfile << x_start << "\t\t\t" << maxLevel << "\n"; 93 | myfile << x_end << "\t\t\t" << maxLevel << "\n"; 94 | myfile << x_end << "\t\t\t" << -maxLevel << "\n"; 95 | myfile << x_start << "\t\t\t" << -maxLevel << "\n"; 96 | myfile << x_start << "\t\t\t" << maxLevel << "\n"; 97 | myfile.close(); 98 | } 99 | return 0; 100 | } 101 | 102 | 103 | 104 | --------------------------------------------------------------------------------