├── .clang-format ├── examples ├── devices.js ├── live.js ├── record.js └── analyze-files.js ├── lib ├── resources │ └── vad.onnx ├── 3rd_party │ ├── vcruntime │ │ ├── x64 │ │ │ ├── vcruntime140.dll │ │ │ └── vcruntime140_1.dll │ │ └── x86 │ │ │ └── vcruntime140.dll │ └── webrtcvad │ │ └── webrtc │ │ ├── rtc_base │ │ ├── system │ │ │ ├── inline.h │ │ │ └── arch.h │ │ ├── compile_assert_c.h │ │ ├── type_traits.h │ │ ├── sanitizer.h │ │ ├── checks.cc │ │ └── numerics │ │ │ └── safe_compare.h │ │ ├── common_audio │ │ ├── third_party │ │ │ └── spl_sqrt_floor │ │ │ │ ├── spl_sqrt_floor.h │ │ │ │ └── spl_sqrt_floor.c │ │ ├── signal_processing │ │ │ ├── spl_inl.c │ │ │ ├── energy.c │ │ │ ├── cross_correlation.c │ │ │ ├── get_scaling_square.c │ │ │ ├── dot_product_with_scale.h │ │ │ ├── resample_by_2_internal.h │ │ │ ├── downsample_fast.c │ │ │ ├── division_operations.c │ │ │ ├── include │ │ │ │ ├── real_fft.h │ │ │ │ └── spl_inl.h │ │ │ ├── complex_bit_reverse.c │ │ │ ├── spl_init.c │ │ │ ├── vector_scaling_operations.c │ │ │ ├── spl_sqrt.c │ │ │ ├── min_max_operations.c │ │ │ ├── resample_48khz.c │ │ │ ├── resample_fractional.c │ │ │ ├── complex_fft_tables.h │ │ │ └── complex_fft.c │ │ └── vad │ │ │ ├── vad_gmm.h │ │ │ ├── vad_filterbank.h │ │ │ ├── vad_sp.h │ │ │ ├── vad_gmm.c │ │ │ ├── include │ │ │ └── webrtc_vad.h │ │ │ ├── webrtc_vad.c │ │ │ ├── vad_core.h │ │ │ └── vad_sp.c │ │ ├── system_wrappers │ │ └── include │ │ │ └── cpu_features_wrapper.h │ │ └── typedefs.h ├── include │ ├── webrtcvad.h │ ├── aligned.h │ ├── devices.h │ ├── microphone.h │ └── chunk_processor.h ├── src │ ├── webrtcvad.cpp │ ├── devices.cpp │ ├── microphone.cpp │ └── chunk_processor.cpp ├── test │ └── main.cpp └── CMakeLists.txt ├── .npmignore ├── .gitignore ├── package.json ├── LICENSE ├── include └── speech_recorder.h ├── setup.sh ├── src ├── index.js └── speech_recorder.cpp ├── README.md └── binding.gyp /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | IncludeBlocks: Preserve 3 | -------------------------------------------------------------------------------- /examples/devices.js: -------------------------------------------------------------------------------- 1 | const { devices } = require("../src/index"); 2 | 3 | console.log(devices()); 4 | -------------------------------------------------------------------------------- /lib/resources/vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serenadeai/speech-recorder/HEAD/lib/resources/vad.onnx -------------------------------------------------------------------------------- /lib/3rd_party/vcruntime/x64/vcruntime140.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serenadeai/speech-recorder/HEAD/lib/3rd_party/vcruntime/x64/vcruntime140.dll -------------------------------------------------------------------------------- /lib/3rd_party/vcruntime/x86/vcruntime140.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serenadeai/speech-recorder/HEAD/lib/3rd_party/vcruntime/x86/vcruntime140.dll -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | build 2 | prebuilds 3 | lib/3rd_party 4 | lib/build 5 | lib/CMakeLists.txt 6 | lib/include 7 | lib/src 8 | lib/test 9 | 
lib/install 10 | -------------------------------------------------------------------------------- /lib/3rd_party/vcruntime/x64/vcruntime140_1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serenadeai/speech-recorder/HEAD/lib/3rd_party/vcruntime/x64/vcruntime140_1.dll -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .DS_Store 3 | *.log 4 | *.wav 5 | *.raw 6 | *.map 7 | bin 8 | build 9 | dist 10 | node_modules 11 | lib/3rd_party/portaudio 12 | lib/3rd_party/onnxruntime 13 | lib/install 14 | prebuilds 15 | -------------------------------------------------------------------------------- /lib/include/webrtcvad.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | extern "C" { 4 | #include "webrtc/common_audio/vad/include/webrtc_vad.h" 5 | } 6 | 7 | namespace speechrecorder { 8 | 9 | class WebrtcVad { 10 | private: 11 | VadInst* instance_ = nullptr; 12 | int level_; 13 | int sampleRate_; 14 | 15 | public: 16 | WebrtcVad(int level, int sampleRate); 17 | ~WebrtcVad(); 18 | bool Process(int16_t* buffer, size_t size); 19 | void Reset(); 20 | }; 21 | 22 | } // namespace speechrecorder 23 | -------------------------------------------------------------------------------- /lib/include/aligned.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef ALIGNED 4 | #if defined(_WIN32) 5 | #define ALIGNED \ 6 | void* operator new(size_t i) { return _aligned_malloc(i, 64); } \ 7 | void operator delete(void* p) { _aligned_free(p); } 8 | #elif defined(__linux__) 9 | #define ALIGNED \ 10 | void* operator new(size_t i) { return aligned_alloc(64, i); } \ 11 | void operator delete(void* p) { free(p); } 12 | #else 13 | #define ALIGNED 14 | #endif 15 | #endif 16 | -------------------------------------------------------------------------------- /examples/live.js: -------------------------------------------------------------------------------- 1 | const { SpeechRecorder } = require("../src/index.js"); 2 | 3 | const recorder = new SpeechRecorder({ 4 | onChunkStart: () => { 5 | console.log(Date.now(), "Chunk start"); 6 | }, 7 | onAudio: ({ speaking, probability, volume }) => { 8 | console.log(Date.now(), speaking, probability, volume); 9 | }, 10 | onChunkEnd: () => { 11 | console.log(Date.now(), "Chunk end"); 12 | }, 13 | }); 14 | 15 | console.log("Recording..."); 16 | recorder.start(); 17 | setTimeout(() => { 18 | console.log("Done!"); 19 | recorder.stop(); 20 | }, 60000); 21 | -------------------------------------------------------------------------------- /lib/src/webrtcvad.cpp: -------------------------------------------------------------------------------- 1 | #include "microphone.h" 2 | #include "webrtcvad.h" 3 | 4 | namespace speechrecorder { 5 | 6 | WebrtcVad::WebrtcVad(int level, int sampleRate) 7 | : level_(level), sampleRate_(sampleRate) { 8 | Reset(); 9 | } 10 | 11 | WebrtcVad::~WebrtcVad() { 12 | if (instance_ != nullptr) { 13 | WebRtcVad_Free(instance_); 14 | } 15 | } 16 | 17 | bool WebrtcVad::Process(int16_t* buffer, size_t size) { 18 | return WebRtcVad_Process(instance_, sampleRate_, buffer, size) == 1; 19 | } 20 | 21 | void WebrtcVad::Reset() { 22 | if (instance_ != nullptr) { 23 | WebRtcVad_Free(instance_); 24 | } 25 | 26 | instance_ = WebRtcVad_Create(); 27 | 
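  // WebRtcVad_Create() returns an uninitialized detector; it still has to be
  // initialized and given an aggressiveness mode (0-3) before Process() will
  // accept frames.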
WebRtcVad_Init(instance_); 28 | WebRtcVad_set_mode(instance_, level_); 29 | } 30 | 31 | } // namespace speechrecorder 32 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "speech-recorder", 3 | "version": "2.1.0", 4 | "description": "A node.js library for streaming audio and speech from the microphone.", 5 | "main": "src/index.js", 6 | "repository": "https://github.com/serenadeai/speech-recorder", 7 | "author": "Serenade", 8 | "license": "MIT", 9 | "gypfile": true, 10 | "binary": { 11 | "napi_versions": [6] 12 | }, 13 | "scripts": { 14 | "build": "bash build.sh", 15 | "clean": "rm -rf build prebuilds lib/build lib/install", 16 | "install": "prebuild-install -r napi || node-gyp rebuild" 17 | }, 18 | "dependencies": { 19 | "bindings": "^1.5.0", 20 | "node-addon-api": "^4.2.0", 21 | "prebuild-install": "^7.0.0" 22 | }, 23 | "devDependencies": { 24 | "prebuild": "^11.0.0", 25 | "wavefile": "^11.0.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /examples/record.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs"); 2 | const { SpeechRecorder } = require("../src/index"); 3 | const { WaveFile } = require("wavefile"); 4 | 5 | if (process.argv.length < 3) { 6 | console.log("Usage: node record.js /path/to/output.wav"); 7 | process.exit(1); 8 | } 9 | 10 | let buffer = []; 11 | const sampleRate = 16000; 12 | const recorder = new SpeechRecorder({ 13 | onAudio: ({ audio }) => { 14 | for (let i = 0; i < audio.length; i++) { 15 | buffer.push(audio[i]); 16 | } 17 | 18 | if (buffer.length >= sampleRate * 5) { 19 | let wav = new WaveFile(); 20 | wav.fromScratch(1, sampleRate, "16", buffer); 21 | fs.writeFileSync(process.argv[2], wav.toBuffer()); 22 | process.exit(1); 23 | } 24 | }, 25 | }); 26 | 27 | console.log("Ready..."); 28 | setTimeout(() => { 29 | console.log("Go!"); 30 | recorder.start(); 31 | }, 1000); 32 | -------------------------------------------------------------------------------- /lib/include/devices.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace speechrecorder { 6 | 7 | struct Device { 8 | int id; 9 | std::string name; 10 | std::string apiName; 11 | int maxInputChannels; 12 | int maxOutputChannels; 13 | double defaultSampleRate; 14 | bool isDefaultInput; 15 | bool isDefaultOutput; 16 | 17 | Device(int id, std::string name, std::string apiName, int maxInputChannels, 18 | int maxOutputChannels, double defaultSampleRate, bool isDefaultInput, 19 | bool isDefaultOutput) 20 | : id(id), 21 | name(name), 22 | apiName(apiName), 23 | maxInputChannels(maxInputChannels), 24 | maxOutputChannels(maxOutputChannels), 25 | defaultSampleRate(defaultSampleRate), 26 | isDefaultInput(isDefaultInput), 27 | isDefaultOutput(isDefaultOutput) {} 28 | }; 29 | 30 | std::vector GetDevices(); 31 | 32 | } // namespace speechrecorder 33 | -------------------------------------------------------------------------------- /lib/include/microphone.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "webrtcvad.h" 11 | 12 | using namespace moodycamel; 13 | 14 | namespace speechrecorder { 15 | 16 | struct MicrophoneCallbackData { 17 | std::vector* 
buffer; 18 | int bufferIndex = 0; 19 | BlockingReaderWriterQueue* queue; 20 | }; 21 | 22 | class Microphone { 23 | private: 24 | std::vector buffer_; 25 | MicrophoneCallbackData callbackData_; 26 | int device_; 27 | int samplesPerFrame_; 28 | int sampleRate_; 29 | PaStream* stream_; 30 | 31 | void HandleError(PaError error, const std::string& message); 32 | 33 | public: 34 | Microphone(int device, int samplesPerFrame, int sampleRate, 35 | BlockingReaderWriterQueue* queue); 36 | void Start(); 37 | void Stop(); 38 | }; 39 | 40 | } // namespace speechrecorder 41 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/system/inline.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef RTC_BASE_SYSTEM_INLINE_H_ 12 | #define RTC_BASE_SYSTEM_INLINE_H_ 13 | 14 | #if defined(_MSC_VER) 15 | 16 | #define RTC_FORCE_INLINE __forceinline 17 | #define RTC_NO_INLINE __declspec(noinline) 18 | 19 | #elif defined(__GNUC__) 20 | 21 | #define RTC_FORCE_INLINE __attribute__((__always_inline__)) 22 | #define RTC_NO_INLINE __attribute__((__noinline__)) 23 | 24 | #else 25 | 26 | #define RTC_FORCE_INLINE 27 | #define RTC_NO_INLINE 28 | 29 | #endif 30 | 31 | #endif // RTC_BASE_SYSTEM_INLINE_H_ 32 | -------------------------------------------------------------------------------- /lib/test/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "chunk_processor.h" 8 | 9 | int main(int argc, char** argv) { 10 | speechrecorder::ChunkProcessorOptions options; 11 | options.onChunkStart = [](std::vector audio) { 12 | std::cout << "Chunk start" << std::endl; 13 | }; 14 | options.onAudio = [](std::vector audio, bool speaking, double volume, 15 | bool speech, double probability, 16 | int consecutiveSilence) { 17 | std::cout << "Speaking: " << speaking << " Volume: " << volume 18 | << " Probability: " << probability << std::endl; 19 | }; 20 | options.onChunkEnd = []() { 21 | std::cout << "Chunk end" << std::endl; 22 | }; 23 | 24 | speechrecorder::ChunkProcessor processor(argv[1], options); 25 | processor.Start(); 26 | std::this_thread::sleep_for(std::chrono::milliseconds(3000)); 27 | processor.Stop(); 28 | std::cout << "Done" << std::endl; 29 | } 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Serenade Labs, Inc. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /lib/src/devices.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include "devices.h" 7 | 8 | namespace speechrecorder { 9 | 10 | std::vector GetDevices() { 11 | Pa_Initialize(); 12 | std::vector result; 13 | 14 | int count = Pa_GetDeviceCount(); 15 | for (int i = 0; i < count; i++) { 16 | const PaDeviceInfo* info = Pa_GetDeviceInfo(i); 17 | bool include = info->maxInputChannels > 0; 18 | 19 | #ifdef WIN32 20 | if (strcmp(Pa_GetHostApiInfo(info->hostApi)->name, "MME") != 0) { 21 | include = false; 22 | } 23 | #endif 24 | 25 | if (include) { 26 | result.emplace_back(i, info->name, Pa_GetHostApiInfo(info->hostApi)->name, 27 | info->maxInputChannels, info->maxOutputChannels, 28 | info->defaultSampleRate, 29 | i == Pa_GetDefaultInputDevice(), 30 | i == Pa_GetDefaultOutputDevice()); 31 | } 32 | } 33 | 34 | return result; 35 | } 36 | 37 | } // namespace speechrecorder 38 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/compile_assert_c.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef RTC_BASE_COMPILE_ASSERT_C_H_ 12 | #define RTC_BASE_COMPILE_ASSERT_C_H_ 13 | 14 | // Use this macro to verify at compile time that certain restrictions are met. 15 | // The argument is the boolean expression to evaluate. 16 | // Example: 17 | // RTC_COMPILE_ASSERT(sizeof(foo) < 128); 18 | // Note: In C++, use static_assert instead! 
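// (The trick: when |expression| evaluates to 0, the switch ends up with two
// identical "case 0" labels, which is rejected at compile time.)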
19 | #define RTC_COMPILE_ASSERT(expression) \ 20 | switch (0) { \ 21 | case 0: \ 22 | case expression:; \ 23 | } 24 | 25 | #endif // RTC_BASE_COMPILE_ASSERT_C_H_ 26 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include 12 | 13 | // 14 | // WebRtcSpl_SqrtFloor(...) 15 | // 16 | // Returns the square root of the input value |value|. The precision of this 17 | // function is rounding down integer precision, i.e., sqrt(8) gives 2 as answer. 18 | // If |value| is a negative number then 0 is returned. 19 | // 20 | // Algorithm: 21 | // 22 | // An iterative 4 cylce/bit routine 23 | // 24 | // Input: 25 | // - value : Value to calculate sqrt of 26 | // 27 | // Return value : Result of the sqrt calculation 28 | // 29 | int32_t WebRtcSpl_SqrtFloor(int32_t value); 30 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/spl_inl.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include 12 | 13 | #include "webrtc/common_audio/signal_processing/include/spl_inl.h" 14 | 15 | // Table used by WebRtcSpl_CountLeadingZeros32_NotBuiltin. For each uint32_t n 16 | // that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at 17 | // index (n * 0x8c0b2891) >> 26 in this table gives the number of zero bits in 18 | // n. 19 | const int8_t kWebRtcSpl_CountLeadingZeros32_Table[64] = { 20 | 32, 8, 17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18, 21 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 26, 25, 24, 22 | 4, 11, 23, 31, 3, 7, 10, 16, 22, 30, -1, -1, 2, 6, 13, 9, 23 | -1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1, 27, 5, 12, 24 | }; 25 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/energy.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains the function WebRtcSpl_Energy(). 
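 * WebRtcSpl_Energy() returns sum(vector[i]^2) >> *scale_factor, so the full
 * energy is roughly (return value) << *scale_factor.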
14 | * The description header can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 19 | 20 | int32_t WebRtcSpl_Energy(int16_t* vector, 21 | size_t vector_length, 22 | int* scale_factor) 23 | { 24 | int32_t en = 0; 25 | size_t i; 26 | int scaling = 27 | WebRtcSpl_GetScalingSquare(vector, vector_length, vector_length); 28 | size_t looptimes = vector_length; 29 | int16_t *vectorptr = vector; 30 | 31 | for (i = 0; i < looptimes; i++) 32 | { 33 | en += (*vectorptr * *vectorptr) >> scaling; 34 | vectorptr++; 35 | } 36 | *scale_factor = scaling; 37 | 38 | return en; 39 | } 40 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/cross_correlation.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 12 | 13 | /* C version of WebRtcSpl_CrossCorrelation() for generic platforms. */ 14 | void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation, 15 | const int16_t* seq1, 16 | const int16_t* seq2, 17 | size_t dim_seq, 18 | size_t dim_cross_correlation, 19 | int right_shifts, 20 | int step_seq2) { 21 | size_t i = 0, j = 0; 22 | 23 | for (i = 0; i < dim_cross_correlation; i++) { 24 | int32_t corr = 0; 25 | for (j = 0; j < dim_seq; j++) 26 | corr += (seq1[j] * seq2[j]) >> right_shifts; 27 | seq2 += step_seq2; 28 | *cross_correlation++ = corr; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /include/speech_recorder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "aligned.h" 10 | #include "chunk_processor.h" 11 | 12 | struct SpeechRecorderCallbackData { 13 | std::string event = ""; 14 | std::vector audio; 15 | bool speaking = false; 16 | double volume = 0.0; 17 | bool speech = false; 18 | double probability = 0.0; 19 | int consecutiveSilence = 0; 20 | }; 21 | 22 | class SpeechRecorder : public Napi::ObjectWrap { 23 | private: 24 | std::thread thread_; 25 | Napi::ThreadSafeFunction threadSafeFunction_; 26 | std::atomic stopped_; 27 | BlockingReaderWriterQueue queue_; 28 | Napi::FunctionReference callback_; 29 | std::function 30 | threadSafeFunctionCallback_; 31 | std::string modelPath_; 32 | speechrecorder::ChunkProcessorOptions options_; 33 | speechrecorder::ChunkProcessor processor_; 34 | std::unique_ptr processFileProcessor_; 35 | 36 | void ProcessFile(const Napi::CallbackInfo& info); 37 | void Start(const Napi::CallbackInfo& info); 38 | void Stop(const Napi::CallbackInfo& info); 39 | 40 | public: 41 | SpeechRecorder(const Napi::CallbackInfo& info); 42 | static Napi::Object Init(Napi::Env env, Napi::Object exports); 43 | 44 | ALIGNED 45 | }; 46 | 47 | Napi::Value GetDevices(const Napi::CallbackInfo& info); 48 | Napi::Object Init(Napi::Env env, Napi::Object exports); 49 | 
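The SpeechRecorder wrapper declared above bridges the audio thread and the JavaScript event loop: ChunkProcessor callbacks push SpeechRecorderCallbackData through the queue, and a Napi::ThreadSafeFunction marshals each event to the JS callback. The actual implementation lives in src/speech_recorder.cpp; the snippet below is only a minimal sketch of that pattern, and EmitEvent plus the payload fields shown are illustrative names, not the library's code.

// Sketch: forwarding an event from a worker thread to the registered JS
// callback via Napi::ThreadSafeFunction (hypothetical helper, for illustration).
#include <napi.h>

#include "speech_recorder.h"

void EmitEvent(Napi::ThreadSafeFunction& tsfn, SpeechRecorderCallbackData* data) {
  // BlockingCall queues work onto the JS event loop; the lambda runs on the
  // main thread, builds the payload object, and then frees |data|.
  tsfn.BlockingCall(data, [](Napi::Env env, Napi::Function jsCallback,
                             SpeechRecorderCallbackData* data) {
    Napi::Object payload = Napi::Object::New(env);
    payload.Set("speaking", Napi::Boolean::New(env, data->speaking));
    payload.Set("probability", Napi::Number::New(env, data->probability));
    payload.Set("volume", Napi::Number::New(env, data->volume));
    payload.Set("consecutiveSilence",
                Napi::Number::New(env, data->consecutiveSilence));
    // A real payload would also carry the audio samples and the raw speech flag.
    jsCallback.Call({Napi::String::New(env, data->event), payload});
    delete data;
  });
}

This matches the (event, data) shape that src/index.js expects when it dispatches to onChunkStart, onAudio, and onChunkEnd.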
-------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/get_scaling_square.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains the function WebRtcSpl_GetScalingSquare(). 14 | * The description header can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 19 | 20 | int16_t WebRtcSpl_GetScalingSquare(int16_t* in_vector, 21 | size_t in_vector_length, 22 | size_t times) 23 | { 24 | int16_t nbits = WebRtcSpl_GetSizeInBits((uint32_t)times); 25 | size_t i; 26 | int16_t smax = -1; 27 | int16_t sabs; 28 | int16_t *sptr = in_vector; 29 | int16_t t; 30 | size_t looptimes = in_vector_length; 31 | 32 | for (i = looptimes; i > 0; i--) 33 | { 34 | sabs = (*sptr > 0 ? *sptr++ : -*sptr++); 35 | smax = (sabs > smax ? sabs : smax); 36 | } 37 | t = WebRtcSpl_NormW32(WEBRTC_SPL_MUL(smax, smax)); 38 | 39 | if (smax == 0) 40 | { 41 | return 0; // Since norm(0) returns 0 42 | } else 43 | { 44 | return (t > nbits) ? 0 : nbits - t; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_gmm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // Gaussian probability calculations internally used in vad_core.c. 12 | 13 | #ifndef COMMON_AUDIO_VAD_VAD_GMM_H_ 14 | #define COMMON_AUDIO_VAD_VAD_GMM_H_ 15 | 16 | #include 17 | 18 | // Calculates the probability for |input|, given that |input| comes from a 19 | // normal distribution with mean and standard deviation (|mean|, |std|). 20 | // 21 | // Inputs: 22 | // - input : input sample in Q4. 23 | // - mean : mean input in the statistical model, Q7. 24 | // - std : standard deviation, Q7. 25 | // 26 | // Output: 27 | // 28 | // - delta : input used when updating the model, Q11. 29 | // |delta| = (|input| - |mean|) / |std|^2. 30 | // 31 | // Return: 32 | // (probability for |input|) = 33 | // 1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2)); 34 | int32_t WebRtcVad_GaussianProbability(int16_t input, 35 | int16_t mean, 36 | int16_t std, 37 | int16_t* delta); 38 | 39 | #endif // COMMON_AUDIO_VAD_VAD_GMM_H_ 40 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/dot_product_with_scale.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 The WebRTC project authors. 
All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_ 12 | #define COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_ 13 | 14 | #include 15 | #include 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | // Calculates the dot product between two (int16_t) vectors. 22 | // 23 | // Input: 24 | // - vector1 : Vector 1 25 | // - vector2 : Vector 2 26 | // - vector_length : Number of samples used in the dot product 27 | // - scaling : The number of right bit shifts to apply on each term 28 | // during calculation to avoid overflow, i.e., the 29 | // output will be in Q(-|scaling|) 30 | // 31 | // Return value : The dot product in Q(-scaling) 32 | int32_t WebRtcSpl_DotProductWithScale(const int16_t* vector1, 33 | const int16_t* vector2, 34 | size_t length, 35 | int scaling); 36 | 37 | #ifdef __cplusplus 38 | } 39 | #endif // __cplusplus 40 | #endif // COMMON_AUDIO_SIGNAL_PROCESSING_DOT_PRODUCT_WITH_SCALE_H_ 41 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/system_wrappers/include/cpu_features_wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef SYSTEM_WRAPPERS_INCLUDE_CPU_FEATURES_WRAPPER_H_ 12 | #define SYSTEM_WRAPPERS_INCLUDE_CPU_FEATURES_WRAPPER_H_ 13 | 14 | #include 15 | 16 | #if defined(__cplusplus) || defined(c_plusplus) 17 | extern "C" { 18 | #endif 19 | 20 | // List of features in x86. 21 | typedef enum { kSSE2, kSSE3 } CPUFeature; 22 | 23 | // List of features in ARM. 24 | enum { 25 | kCPUFeatureARMv7 = (1 << 0), 26 | kCPUFeatureVFPv3 = (1 << 1), 27 | kCPUFeatureNEON = (1 << 2), 28 | kCPUFeatureLDREXSTREX = (1 << 3) 29 | }; 30 | 31 | typedef int (*WebRtc_CPUInfo)(CPUFeature feature); 32 | 33 | // Returns true if the CPU supports the feature. 34 | extern WebRtc_CPUInfo WebRtc_GetCPUInfo; 35 | 36 | // No CPU feature is available => straight C path. 37 | extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM; 38 | 39 | // Return the features in an ARM device. 40 | // It detects the features in the hardware platform, and returns supported 41 | // values in the above enum definition as a bitmask. 
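// For example, a result with both kCPUFeatureARMv7 and kCPUFeatureNEON set
// indicates an ARMv7 core with NEON support.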
42 | extern uint64_t WebRtc_GetCPUFeaturesARM(void); 43 | 44 | #if defined(__cplusplus) || defined(c_plusplus) 45 | } // extern "C" 46 | #endif 47 | 48 | #endif // SYSTEM_WRAPPERS_INCLUDE_CPU_FEATURES_WRAPPER_H_ 49 | -------------------------------------------------------------------------------- /lib/include/chunk_processor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "aligned.h" 12 | #include "microphone.h" 13 | #include "onnxruntime_cxx_api.h" 14 | #include "webrtcvad.h" 15 | 16 | namespace speechrecorder { 17 | 18 | struct ChunkProcessorOptions { 19 | int consecutiveFramesForSilence = 5; 20 | int consecutiveFramesForSpeaking = 1; 21 | int device = -1; 22 | int leadingBufferFrames = 10; 23 | std::function)> onChunkStart = nullptr; 24 | std::function, bool, double, bool, double, int)> 25 | onAudio = nullptr; 26 | std::function onChunkEnd = nullptr; 27 | int samplesPerFrame = 480; 28 | int sampleRate = 16000; 29 | int sileroVadBufferSize = 2000; 30 | int sileroVadRateLimit = 3; 31 | double sileroVadSilenceThreshold = 0.1; 32 | double sileroVadSpeakingThreshold = 0.3; 33 | int webrtcVadLevel = 3; 34 | int webrtcVadBufferSize = 480; 35 | int webrtcVadResultsSize = 10; 36 | }; 37 | 38 | class ChunkProcessor { 39 | private: 40 | std::vector leadingBuffer_; 41 | int consecutiveSilence_ = 0; 42 | int consecutiveSpeaking_ = 0; 43 | int framesUntilSileroVad_ = 0; 44 | Microphone microphone_; 45 | BlockingReaderWriterQueue queue_; 46 | std::vector sileroBuffer_; 47 | double sileroVadProbability_ = 0.0; 48 | bool speaking_ = false; 49 | std::atomic stopped_; 50 | std::mutex toggleLock_; 51 | std::thread startThread_; 52 | std::thread stopThread_; 53 | std::thread queueThread_; 54 | WebrtcVad webrtcVad_; 55 | std::vector webrtcVadBuffer_; 56 | std::vector webrtcVadResults_; 57 | 58 | public: 59 | ChunkProcessorOptions options_; 60 | ChunkProcessor(std::string modelPath, ChunkProcessorOptions options); 61 | ~ChunkProcessor(); 62 | void Process(short* audio); 63 | void Reset(); 64 | void Start(); 65 | void Stop(); 66 | 67 | ALIGNED 68 | }; 69 | 70 | } // namespace speechrecorder 71 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_filterbank.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* 12 | * This file includes feature calculating functionality used in vad_core.c. 
13 | */ 14 | 15 | #ifndef COMMON_AUDIO_VAD_VAD_FILTERBANK_H_ 16 | #define COMMON_AUDIO_VAD_VAD_FILTERBANK_H_ 17 | 18 | #include "webrtc/common_audio/vad/vad_core.h" 19 | 20 | // Takes |data_length| samples of |data_in| and calculates the logarithm of the 21 | // energy of each of the |kNumChannels| = 6 frequency bands used by the VAD: 22 | // 80 Hz - 250 Hz 23 | // 250 Hz - 500 Hz 24 | // 500 Hz - 1000 Hz 25 | // 1000 Hz - 2000 Hz 26 | // 2000 Hz - 3000 Hz 27 | // 3000 Hz - 4000 Hz 28 | // 29 | // The values are given in Q4 and written to |features|. Further, an approximate 30 | // overall energy is returned. The return value is used in 31 | // WebRtcVad_GmmProbability() as a signal indicator, hence it is arbitrary above 32 | // the threshold |kMinEnergy|. 33 | // 34 | // - self [i/o] : State information of the VAD. 35 | // - data_in [i] : Input audio data, for feature extraction. 36 | // - data_length [i] : Audio data size, in number of samples. 37 | // - features [o] : 10 * log10(energy in each frequency band), Q4. 38 | // - returns : Total energy of the signal (NOTE! This value is not 39 | // exact. It is only used in a comparison.) 40 | int16_t WebRtcVad_CalculateFeatures(VadInstT* self, 41 | const int16_t* data_in, 42 | size_t data_length, 43 | int16_t* features); 44 | 45 | #endif // COMMON_AUDIO_VAD_VAD_FILTERBANK_H_ 46 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/system/arch.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // This file contains platform-specific typedefs and defines. 12 | // Much of it is derived from Chromium's build/build_config.h. 13 | 14 | #ifndef RTC_BASE_SYSTEM_ARCH_H_ 15 | #define RTC_BASE_SYSTEM_ARCH_H_ 16 | 17 | // Processor architecture detection. 
For more info on what's defined, see: 18 | // http://msdn.microsoft.com/en-us/library/b0084kay.aspx 19 | // http://www.agner.org/optimize/calling_conventions.pdf 20 | // or with gcc, run: "echo | gcc -E -dM -" 21 | #if defined(_M_X64) || defined(__x86_64__) 22 | #define WEBRTC_ARCH_X86_FAMILY 23 | #define WEBRTC_ARCH_X86_64 24 | #define WEBRTC_ARCH_64_BITS 25 | #define WEBRTC_ARCH_LITTLE_ENDIAN 26 | #elif defined(__aarch64__) 27 | #define WEBRTC_ARCH_ARM_FAMILY 28 | #define WEBRTC_ARCH_64_BITS 29 | #define WEBRTC_ARCH_LITTLE_ENDIAN 30 | #elif defined(_M_IX86) || defined(__i386__) 31 | #define WEBRTC_ARCH_X86_FAMILY 32 | #define WEBRTC_ARCH_X86 33 | #define WEBRTC_ARCH_32_BITS 34 | #define WEBRTC_ARCH_LITTLE_ENDIAN 35 | #elif defined(__ARMEL__) 36 | #define WEBRTC_ARCH_ARM_FAMILY 37 | #define WEBRTC_ARCH_32_BITS 38 | #define WEBRTC_ARCH_LITTLE_ENDIAN 39 | #elif defined(__MIPSEL__) 40 | #define WEBRTC_ARCH_MIPS_FAMILY 41 | #if defined(__LP64__) 42 | #define WEBRTC_ARCH_64_BITS 43 | #else 44 | #define WEBRTC_ARCH_32_BITS 45 | #endif 46 | #define WEBRTC_ARCH_LITTLE_ENDIAN 47 | #elif defined(__pnacl__) 48 | #define WEBRTC_ARCH_32_BITS 49 | #define WEBRTC_ARCH_LITTLE_ENDIAN 50 | #else 51 | #error Please add support for your architecture in typedefs.h 52 | #endif 53 | 54 | #if !(defined(WEBRTC_ARCH_LITTLE_ENDIAN) ^ defined(WEBRTC_ARCH_BIG_ENDIAN)) 55 | #error Define either WEBRTC_ARCH_LITTLE_ENDIAN or WEBRTC_ARCH_BIG_ENDIAN 56 | #endif 57 | 58 | #endif // RTC_BASE_SYSTEM_ARCH_H_ 59 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_sp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // This file includes specific signal processing tools used in vad_core.c. 12 | 13 | #ifndef COMMON_AUDIO_VAD_VAD_SP_H_ 14 | #define COMMON_AUDIO_VAD_VAD_SP_H_ 15 | 16 | #include "webrtc/common_audio/vad/vad_core.h" 17 | 18 | // Downsamples the signal by a factor 2, eg. 32->16 or 16->8. 19 | // 20 | // Inputs: 21 | // - signal_in : Input signal. 22 | // - in_length : Length of input signal in samples. 23 | // 24 | // Input & Output: 25 | // - filter_state : Current filter states of the two all-pass filters. The 26 | // |filter_state| is updated after all samples have been 27 | // processed. 28 | // 29 | // Output: 30 | // - signal_out : Downsampled signal (of length |in_length| / 2). 31 | void WebRtcVad_Downsampling(const int16_t* signal_in, 32 | int16_t* signal_out, 33 | int32_t* filter_state, 34 | size_t in_length); 35 | 36 | // Updates and returns the smoothed feature minimum. As minimum we use the 37 | // median of the five smallest feature values in a 100 frames long window. 38 | // As long as |handle->frame_counter| is zero, that is, we haven't received any 39 | // "valid" data, FindMinimum() outputs the default value of 1600. 40 | // 41 | // Inputs: 42 | // - feature_value : New feature value to update with. 43 | // - channel : Channel number. 44 | // 45 | // Input & Output: 46 | // - handle : State information of the VAD. 
47 | // 48 | // Returns: 49 | // : Smoothed minimum value for a moving window. 50 | int16_t WebRtcVad_FindMinimum(VadInstT* handle, 51 | int16_t feature_value, 52 | int channel); 53 | 54 | #endif // COMMON_AUDIO_VAD_VAD_SP_H_ 55 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | pushd "$HERE" &> /dev/null 6 | 7 | if [[ -z "$1" ]] ; then 8 | echo "Usage: setup.sh x86|x64|arm64" 9 | exit 1 10 | fi 11 | 12 | rm -rf tmp lib/3rd_party/portaudio lib/3rd_party/onnxruntime 13 | 14 | mkdir -p tmp/portaudio 15 | cd tmp/portaudio 16 | curl -Lo portaudio.tgz http://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz 17 | tar xvf portaudio.tgz 18 | 19 | cd portaudio 20 | mkdir dist install 21 | cd dist 22 | 23 | portaudio_cmake="cmake" 24 | if [[ `uname -s` == "MINGW"* ]] ; then 25 | if [[ "$1" == "x86" ]] ; then 26 | portaudio_cmake+=" -A Win32" 27 | elif [[ "$1" == "x64" ]] ; then 28 | portaudio_cmake+=" -A x64" 29 | fi 30 | elif [[ `uname -s` == "Darwin" ]] ; then 31 | portaudio_cmake+=" -DCMAKE_OSX_DEPLOYMENT_TARGET=10.14" 32 | if [[ "$1" == "x64" ]] ; then 33 | portaudio_cmake+=" -DCMAKE_OSX_ARCHITECTURES=x86_64" 34 | elif [[ "$1" == "arm64" ]] ; then 35 | portaudio_cmake+=" -DCMAKE_OSX_ARCHITECTURES=arm64" 36 | fi 37 | fi 38 | 39 | portaudio_cmake+=" .." 40 | eval $portaudio_cmake 41 | cmake --build . --config Release 42 | cmake --install . --prefix ../install 43 | cp -r ../install ../../../../lib/3rd_party/portaudio 44 | 45 | cd ../../.. 46 | mkdir onnxruntime 47 | cd onnxruntime 48 | 49 | if [[ `uname -s` == "MINGW"* ]] ; then 50 | mkdir -p ../../lib/3rd_party/onnxruntime/lib 51 | curl -Lo onnxruntime.zip https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime/1.10.0 52 | unzip onnxruntime.zip 53 | cp -r build/native/include ../../lib/3rd_party/onnxruntime 54 | 55 | path="win-x86" 56 | if [[ "$1" == "x64" ]] ; then 57 | path="win-x64" 58 | fi 59 | 60 | cp runtimes/$path/native/*.dll ../../lib/3rd_party/onnxruntime/lib 61 | cp runtimes/$path/native/*.lib ../../lib/3rd_party/onnxruntime/lib 62 | else 63 | path="onnxruntime-linux-x64-1.10.0" 64 | if [[ `uname -s` == "Darwin" ]] ; then 65 | if [[ "$1" == "x64" ]] ; then 66 | path="onnxruntime-osx-x86_64-1.10.0" 67 | elif [[ "$1" == "arm64" ]] ; then 68 | path="onnxruntime-osx-arm64-1.10.0" 69 | fi 70 | fi 71 | 72 | curl -Lo onnxruntime.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/$path.tgz 73 | tar xvf onnxruntime.tgz 74 | cp -r $path ../../lib/3rd_party/onnxruntime 75 | fi 76 | 77 | cd ../.. 78 | rm -rf tmp 79 | popd &> /dev/null 80 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/resample_by_2_internal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* 12 | * This header file contains some internal resampling functions. 
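 * Naming: DownBy2/UpBy2 halve or double the sampling rate, LPBy2 applies the
 * low-pass filtering only; ShortToInt/IntToShort/IntToInt give the input and
 * output sample formats, and |state| carries filter memory across calls.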
13 | * 14 | */ 15 | 16 | #ifndef COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_ 17 | #define COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_ 18 | 19 | #include 20 | 21 | /******************************************************************* 22 | * resample_by_2_fast.c 23 | * Functions for internal use in the other resample functions 24 | ******************************************************************/ 25 | void WebRtcSpl_DownBy2IntToShort(int32_t* in, 26 | int32_t len, 27 | int16_t* out, 28 | int32_t* state); 29 | 30 | void WebRtcSpl_DownBy2ShortToInt(const int16_t* in, 31 | int32_t len, 32 | int32_t* out, 33 | int32_t* state); 34 | 35 | void WebRtcSpl_UpBy2ShortToInt(const int16_t* in, 36 | int32_t len, 37 | int32_t* out, 38 | int32_t* state); 39 | 40 | void WebRtcSpl_UpBy2IntToInt(const int32_t* in, 41 | int32_t len, 42 | int32_t* out, 43 | int32_t* state); 44 | 45 | void WebRtcSpl_UpBy2IntToShort(const int32_t* in, 46 | int32_t len, 47 | int16_t* out, 48 | int32_t* state); 49 | 50 | void WebRtcSpl_LPBy2ShortToInt(const int16_t* in, 51 | int32_t len, 52 | int32_t* out, 53 | int32_t* state); 54 | 55 | void WebRtcSpl_LPBy2IntToInt(const int32_t* in, 56 | int32_t len, 57 | int32_t* out, 58 | int32_t* state); 59 | 60 | #endif // COMMON_AUDIO_SIGNAL_PROCESSING_RESAMPLE_BY_2_INTERNAL_H_ 61 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Wilco Dijkstra, 1996. The following email exchange establishes the 3 | * license. 4 | * 5 | * From: Wilco Dijkstra 6 | * Date: Fri, Jun 24, 2011 at 3:20 AM 7 | * Subject: Re: sqrt routine 8 | * To: Kevin Ma 9 | * Hi Kevin, 10 | * Thanks for asking. Those routines are public domain (originally posted to 11 | * comp.sys.arm a long time ago), so you can use them freely for any purpose. 12 | * Cheers, 13 | * Wilco 14 | * 15 | * ----- Original Message ----- 16 | * From: "Kevin Ma" 17 | * To: 18 | * Sent: Thursday, June 23, 2011 11:44 PM 19 | * Subject: Fwd: sqrt routine 20 | * Hi Wilco, 21 | * I saw your sqrt routine from several web sites, including 22 | * http://www.finesse.demon.co.uk/steven/sqrt.html. 23 | * Just wonder if there's any copyright information with your Successive 24 | * approximation routines, or if I can freely use it for any purpose. 25 | * Thanks. 26 | * Kevin 27 | */ 28 | 29 | // Minor modifications in code style for WebRTC, 2012. 30 | 31 | #include "webrtc/common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h" 32 | 33 | /* 34 | * Algorithm: 35 | * Successive approximation of the equation (root + delta) ^ 2 = N 36 | * until delta < 1. If delta < 1 we have the integer part of SQRT (N). 37 | * Use delta = 2^i for i = 15 .. 0. 38 | * 39 | * Output precision is 16 bits. Note for large input values (close to 40 | * 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word) 41 | * contains the MSB information (a non-sign value). Do with caution 42 | * if you need to cast the output to int16_t type. 43 | * 44 | * If the input value is negative, it returns 0. 
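 * Examples: WebRtcSpl_SqrtFloor(8) == 2 and WebRtcSpl_SqrtFloor(1 << 30) == 32768.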
45 | */ 46 | 47 | #define WEBRTC_SPL_SQRT_ITER(N) \ 48 | try1 = root + (1 << (N)); \ 49 | if (value >= try1 << (N)) \ 50 | { \ 51 | value -= try1 << (N); \ 52 | root |= 2 << (N); \ 53 | } 54 | 55 | int32_t WebRtcSpl_SqrtFloor(int32_t value) 56 | { 57 | int32_t root = 0, try1; 58 | 59 | WEBRTC_SPL_SQRT_ITER (15); 60 | WEBRTC_SPL_SQRT_ITER (14); 61 | WEBRTC_SPL_SQRT_ITER (13); 62 | WEBRTC_SPL_SQRT_ITER (12); 63 | WEBRTC_SPL_SQRT_ITER (11); 64 | WEBRTC_SPL_SQRT_ITER (10); 65 | WEBRTC_SPL_SQRT_ITER ( 9); 66 | WEBRTC_SPL_SQRT_ITER ( 8); 67 | WEBRTC_SPL_SQRT_ITER ( 7); 68 | WEBRTC_SPL_SQRT_ITER ( 6); 69 | WEBRTC_SPL_SQRT_ITER ( 5); 70 | WEBRTC_SPL_SQRT_ITER ( 4); 71 | WEBRTC_SPL_SQRT_ITER ( 3); 72 | WEBRTC_SPL_SQRT_ITER ( 2); 73 | WEBRTC_SPL_SQRT_ITER ( 1); 74 | WEBRTC_SPL_SQRT_ITER ( 0); 75 | 76 | return root >> 1; 77 | } 78 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/downsample_fast.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 12 | 13 | #include "webrtc/rtc_base/checks.h" 14 | #include "webrtc/rtc_base/sanitizer.h" 15 | 16 | // TODO(Bjornv): Change the function parameter order to WebRTC code style. 17 | // C version of WebRtcSpl_DownsampleFast() for generic platforms. 18 | int WebRtcSpl_DownsampleFastC(const int16_t* data_in, 19 | size_t data_in_length, 20 | int16_t* data_out, 21 | size_t data_out_length, 22 | const int16_t* __restrict coefficients, 23 | size_t coefficients_length, 24 | int factor, 25 | size_t delay) { 26 | int16_t* const original_data_out = data_out; 27 | size_t i = 0; 28 | size_t j = 0; 29 | int32_t out_s32 = 0; 30 | size_t endpos = delay + factor * (data_out_length - 1) + 1; 31 | 32 | // Return error if any of the running conditions doesn't meet. 33 | if (data_out_length == 0 || coefficients_length == 0 34 | || data_in_length < endpos) { 35 | return -1; 36 | } 37 | 38 | rtc_MsanCheckInitialized(coefficients, sizeof(coefficients[0]), 39 | coefficients_length); 40 | 41 | for (i = delay; i < endpos; i += factor) { 42 | out_s32 = 2048; // Round value, 0.5 in Q12. 43 | 44 | for (j = 0; j < coefficients_length; j++) { 45 | // Negative overflow is permitted here, because this is 46 | // auto-regressive filters, and the state for each batch run is 47 | // stored in the "negative" positions of the output vector. 48 | rtc_MsanCheckInitialized(&data_in[(ptrdiff_t) i - (ptrdiff_t) j], 49 | sizeof(data_in[0]), 1); 50 | // out_s32 is in Q12 domain. 51 | out_s32 += coefficients[j] * data_in[(ptrdiff_t) i - (ptrdiff_t) j]; 52 | } 53 | 54 | out_s32 >>= 12; // Q0. 55 | 56 | // Saturate and store the output. 
57 | *data_out++ = WebRtcSpl_SatW32ToW16(out_s32); 58 | } 59 | 60 | RTC_DCHECK_EQ(original_data_out + data_out_length, data_out); 61 | rtc_MsanCheckInitialized(original_data_out, sizeof(original_data_out[0]), 62 | data_out_length); 63 | 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /lib/src/microphone.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "microphone.h" 9 | #include "webrtcvad.h" 10 | 11 | using namespace moodycamel; 12 | 13 | namespace speechrecorder { 14 | 15 | int callback(const void* input, void* output, unsigned long samplesPerFrame, 16 | const PaStreamCallbackTimeInfo* timeInfo, 17 | PaStreamCallbackFlags statusFlags, void* callbackData) { 18 | if (input == nullptr || callbackData == nullptr) { 19 | return paContinue; 20 | } 21 | 22 | MicrophoneCallbackData* data = (MicrophoneCallbackData*)callbackData; 23 | short* audio = (short*)input; 24 | for (int i = 0; i < samplesPerFrame; i++) { 25 | data->buffer->at((data->bufferIndex + i) % data->buffer->size()) = audio[i]; 26 | } 27 | 28 | data->queue->enqueue(data->buffer->data() + data->bufferIndex); 29 | data->bufferIndex = 30 | (data->bufferIndex + samplesPerFrame) % data->buffer->size(); 31 | return paContinue; 32 | } 33 | 34 | Microphone::Microphone(int device, int samplesPerFrame, int sampleRate, 35 | BlockingReaderWriterQueue* queue) 36 | : device_(device), 37 | samplesPerFrame_(samplesPerFrame), 38 | sampleRate_(sampleRate) { 39 | for (int i = 0; i < samplesPerFrame * 10; i++) { 40 | buffer_.push_back(0); 41 | } 42 | 43 | callbackData_ = {&buffer_, 0, queue}; 44 | PaError error = Pa_Initialize(); 45 | if (error != paNoError) { 46 | HandleError(error, "Initialize"); 47 | } 48 | 49 | if (device_ == -1) { 50 | device_ = Pa_GetDefaultInputDevice(); 51 | } 52 | } 53 | 54 | void Microphone::HandleError(PaError error, const std::string& message) { 55 | Pa_Terminate(); 56 | std::cerr << "PortAudio Error: " << message << std::endl 57 | << "Error number: " << error << std::endl 58 | << "Error message: " << Pa_GetErrorText(error) << std::endl; 59 | exit(error); 60 | } 61 | 62 | void Microphone::Start() { 63 | PaError error = paNoError; 64 | PaStreamParameters parameters; 65 | parameters.channelCount = 1; 66 | parameters.sampleFormat = paInt16; 67 | parameters.device = device_; 68 | parameters.suggestedLatency = 69 | Pa_GetDeviceInfo(parameters.device)->defaultLowInputLatency; 70 | parameters.hostApiSpecificStreamInfo = 0; 71 | 72 | error = Pa_OpenStream(&stream_, ¶meters, 0, sampleRate_, samplesPerFrame_, 73 | paClipOff, callback, &callbackData_); 74 | if (error != paNoError) { 75 | HandleError(error, "Open Stream"); 76 | } 77 | 78 | Pa_StartStream(stream_); 79 | if (error != paNoError) { 80 | HandleError(error, "Start Stream"); 81 | } 82 | } 83 | 84 | void Microphone::Stop() { 85 | Pa_AbortStream(stream_); 86 | Pa_CloseStream(stream_); 87 | } 88 | 89 | } // namespace speechrecorder 90 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const { SpeechRecorder, devices } = require("bindings")("speechrecorder.node"); 3 | 4 | class Wrapper { 5 | constructor(options, model) { 6 | options = options ? 
options : {}; 7 | options.consecutiveFramesForSilence = 8 | options.consecutiveFramesForSilence !== undefined ? options.consecutiveFramesForSilence : 10; 9 | options.consecutiveFramesForSpeaking = 10 | options.consecutiveFramesForSpeaking !== undefined ? options.consecutiveFramesForSpeaking : 1; 11 | options.device = options.device !== undefined ? options.device : -1; 12 | options.leadingBufferFrames = 13 | options.leadingBufferFrames !== undefined ? options.leadingBufferFrames : 10; 14 | options.onChunkStart = options.onChunkStart !== undefined ? options.onChunkStart : (data) => {}; 15 | options.onAudio = 16 | options.onAudio !== undefined 17 | ? options.onAudio 18 | : (audio, speaking, volume, speech, probability) => {}; 19 | options.onChunkEnd = options.onChunkEnd !== undefined ? options.onChunkEnd : (data) => {}; 20 | options.samplesPerFrame = options.samplesPerFrame !== undefined ? options.samplesPerFrame : 480; 21 | options.sampleRate = options.sampleRate !== undefined ? options.sampleRate : 16000; 22 | options.sileroVadBufferSize = 23 | options.sileroVadBufferSize !== undefined ? options.sileroVadBufferSize : 2000; 24 | options.sileroVadRateLimit = 25 | options.sileroVadRateLimit !== undefined ? options.sileroVadRateLimit : 3; 26 | options.sileroVadSilenceThreshold = 27 | options.sileroVadSilenceThreshold !== undefined ? options.sileroVadSilenceThreshold : 0.1; 28 | options.sileroVadSpeakingThreshold = 29 | options.sileroVadSpeakingThreshold !== undefined ? options.sileroVadSpeakingThreshold : 0.3; 30 | options.webrtcVadLevel = options.webrtcVadLevel !== undefined ? options.webrtcVadLevel : 3; 31 | options.webrtcVadBufferSize = 32 | options.webrtcVadBufferSize !== undefined ? options.webrtcVadBufferSize : 480; 33 | options.webrtcVadResultsSize = 34 | options.webrtcVadResultsSize !== undefined ? options.webrtcVadResultsSize : 10; 35 | 36 | this.inner = new SpeechRecorder( 37 | model !== undefined ? model : path.join(__dirname, "..", "lib", "resources", "vad.onnx"), 38 | (event, data) => { 39 | if (event == "chunkStart") { 40 | options.onChunkStart({ audio: data.audio }); 41 | } else if (event == "audio") { 42 | options.onAudio({ 43 | audio: data.audio, 44 | speaking: data.speaking, 45 | probability: data.probability, 46 | volume: data.volume, 47 | speech: data.speech, 48 | consecutiveSilence: data.consecutiveSilence, 49 | }); 50 | } else if (event == "chunkEnd") { 51 | options.onChunkEnd(); 52 | } 53 | }, 54 | options 55 | ); 56 | } 57 | 58 | processFile(file) { 59 | this.inner.processFile(path.resolve(file)); 60 | } 61 | 62 | start() { 63 | this.inner.start(); 64 | } 65 | 66 | stop() { 67 | this.inner.stop(); 68 | } 69 | } 70 | 71 | exports.SpeechRecorder = Wrapper; 72 | exports.devices = devices; 73 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_gmm.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 
9 | */ 10 | 11 | #include "webrtc/common_audio/vad/vad_gmm.h" 12 | 13 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 14 | 15 | static const int32_t kCompVar = 22005; 16 | static const int16_t kLog2Exp = 5909; // log2(exp(1)) in Q12. 17 | 18 | // For a normal distribution, the probability of |input| is calculated and 19 | // returned (in Q20). The formula for normal distributed probability is 20 | // 21 | // 1 / s * exp(-(x - m)^2 / (2 * s^2)) 22 | // 23 | // where the parameters are given in the following Q domains: 24 | // m = |mean| (Q7) 25 | // s = |std| (Q7) 26 | // x = |input| (Q4) 27 | // in addition to the probability we output |delta| (in Q11) used when updating 28 | // the noise/speech model. 29 | int32_t WebRtcVad_GaussianProbability(int16_t input, 30 | int16_t mean, 31 | int16_t std, 32 | int16_t* delta) { 33 | int16_t tmp16, inv_std, inv_std2, exp_value = 0; 34 | int32_t tmp32; 35 | 36 | // Calculate |inv_std| = 1 / s, in Q10. 37 | // 131072 = 1 in Q17, and (|std| >> 1) is for rounding instead of truncation. 38 | // Q-domain: Q17 / Q7 = Q10. 39 | tmp32 = (int32_t) 131072 + (int32_t) (std >> 1); 40 | inv_std = (int16_t) WebRtcSpl_DivW32W16(tmp32, std); 41 | 42 | // Calculate |inv_std2| = 1 / s^2, in Q14. 43 | tmp16 = (inv_std >> 2); // Q10 -> Q8. 44 | // Q-domain: (Q8 * Q8) >> 2 = Q14. 45 | inv_std2 = (int16_t)((tmp16 * tmp16) >> 2); 46 | // TODO(bjornv): Investigate if changing to 47 | // inv_std2 = (int16_t)((inv_std * inv_std) >> 6); 48 | // gives better accuracy. 49 | 50 | tmp16 = (input << 3); // Q4 -> Q7 51 | tmp16 = tmp16 - mean; // Q7 - Q7 = Q7 52 | 53 | // To be used later, when updating noise/speech model. 54 | // |delta| = (x - m) / s^2, in Q11. 55 | // Q-domain: (Q14 * Q7) >> 10 = Q11. 56 | *delta = (int16_t)((inv_std2 * tmp16) >> 10); 57 | 58 | // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10. Replacing 59 | // division by two with one shift. 60 | // Q-domain: (Q11 * Q7) >> 8 = Q10. 61 | tmp32 = (*delta * tmp16) >> 9; 62 | 63 | // If the exponent is small enough to give a non-zero probability we calculate 64 | // |exp_value| ~= exp(-(x - m)^2 / (2 * s^2)) 65 | // ~= exp2(-log2(exp(1)) * |tmp32|). 66 | if (tmp32 < kCompVar) { 67 | // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10. 68 | // Q-domain: (Q12 * Q10) >> 12 = Q10. 69 | tmp16 = (int16_t)((kLog2Exp * tmp32) >> 12); 70 | tmp16 = -tmp16; 71 | exp_value = (0x0400 | (tmp16 & 0x03FF)); 72 | tmp16 ^= 0xFFFF; 73 | tmp16 >>= 10; 74 | tmp16 += 1; 75 | // Get |exp_value| = exp(-|tmp32|) in Q10. 76 | exp_value >>= tmp16; 77 | } 78 | 79 | // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20. 80 | // Q-domain: Q10 * Q10 = Q20. 81 | return inv_std * exp_value; 82 | } 83 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/include/webrtc_vad.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* 12 | * This header file includes the VAD API calls. Specific function calls are 13 | * given below. 
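 * Typical call sequence: WebRtcVad_Create(), WebRtcVad_Init() and
 * WebRtcVad_set_mode() once, then WebRtcVad_Process() for each 10, 20 or 30 ms
 * frame, and finally WebRtcVad_Free().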
14 | */ 15 | 16 | #ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT 17 | #define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ 18 | 19 | #include <stddef.h> 20 | #include <stdint.h> 21 | 22 | typedef struct WebRtcVadInst VadInst; 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | // Creates an instance to the VAD structure. 29 | VadInst* WebRtcVad_Create(void); 30 | 31 | // Frees the dynamic memory of a specified VAD instance. 32 | // 33 | // - handle [i] : Pointer to VAD instance that should be freed. 34 | void WebRtcVad_Free(VadInst* handle); 35 | 36 | // Initializes a VAD instance. 37 | // 38 | // - handle [i/o] : Instance that should be initialized. 39 | // 40 | // returns : 0 - (OK), 41 | // -1 - (null pointer or Default mode could not be set). 42 | int WebRtcVad_Init(VadInst* handle); 43 | 44 | // Sets the VAD operating mode. A more aggressive (higher mode) VAD is more 45 | // restrictive in reporting speech. Put in other words the probability of being 46 | // speech when the VAD returns 1 is increased with increasing mode. As a 47 | // consequence also the missed detection rate goes up. 48 | // 49 | // - handle [i/o] : VAD instance. 50 | // - mode [i] : Aggressiveness mode (0, 1, 2, or 3). 51 | // 52 | // returns : 0 - (OK), 53 | // -1 - (null pointer, mode could not be set or the VAD instance 54 | // has not been initialized). 55 | int WebRtcVad_set_mode(VadInst* handle, int mode); 56 | 57 | // Calculates a VAD decision for the |audio_frame|. For valid sampling rates and 58 | // frame lengths, see the description of WebRtcVad_ValidRateAndFrameLength(). 59 | // 60 | // - handle [i/o] : VAD Instance. Needs to be initialized by 61 | // WebRtcVad_Init() before call. 62 | // - fs [i] : Sampling frequency (Hz): 8000, 16000, or 32000 63 | // - audio_frame [i] : Audio frame buffer. 64 | // - frame_length [i] : Length of audio frame buffer in number of samples. 65 | // 66 | // returns : 1 - (Active Voice), 67 | // 0 - (Non-active Voice), 68 | // -1 - (Error) 69 | int WebRtcVad_Process(VadInst* handle, 70 | int fs, 71 | const int16_t* audio_frame, 72 | size_t frame_length); 73 | 74 | // Checks for valid combinations of |rate| and |frame_length|. We support 10, 75 | // 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz. 76 | // 77 | // - rate [i] : Sampling frequency (Hz). 78 | // - frame_length [i] : Speech frame buffer length in number of samples. 79 | // 80 | // returns : 0 - (valid combination), -1 - (invalid combination) 81 | int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length); 82 | 83 | #ifdef __cplusplus 84 | } 85 | #endif 86 | 87 | #endif // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT 88 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/webrtc_vad.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree.
9 | */ 10 | 11 | #include "webrtc/common_audio/vad/include/webrtc_vad.h" 12 | 13 | #include 14 | #include 15 | 16 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 17 | #include "webrtc/common_audio/vad/vad_core.h" 18 | 19 | static const int kInitCheck = 42; 20 | static const int kValidRates[] = { 8000, 16000, 32000, 48000 }; 21 | static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates); 22 | static const int kMaxFrameLengthMs = 30; 23 | 24 | VadInst* WebRtcVad_Create() { 25 | VadInstT* self = (VadInstT*)malloc(sizeof(VadInstT)); 26 | 27 | WebRtcSpl_Init(); 28 | self->init_flag = 0; 29 | 30 | return (VadInst*)self; 31 | } 32 | 33 | void WebRtcVad_Free(VadInst* handle) { 34 | free(handle); 35 | } 36 | 37 | // TODO(bjornv): Move WebRtcVad_InitCore() code here. 38 | int WebRtcVad_Init(VadInst* handle) { 39 | // Initialize the core VAD component. 40 | return WebRtcVad_InitCore((VadInstT*) handle); 41 | } 42 | 43 | // TODO(bjornv): Move WebRtcVad_set_mode_core() code here. 44 | int WebRtcVad_set_mode(VadInst* handle, int mode) { 45 | VadInstT* self = (VadInstT*) handle; 46 | 47 | if (handle == NULL) { 48 | return -1; 49 | } 50 | if (self->init_flag != kInitCheck) { 51 | return -1; 52 | } 53 | 54 | return WebRtcVad_set_mode_core(self, mode); 55 | } 56 | 57 | int WebRtcVad_Process(VadInst* handle, int fs, const int16_t* audio_frame, 58 | size_t frame_length) { 59 | int vad = -1; 60 | VadInstT* self = (VadInstT*) handle; 61 | 62 | if (handle == NULL) { 63 | return -1; 64 | } 65 | 66 | if (self->init_flag != kInitCheck) { 67 | return -1; 68 | } 69 | if (audio_frame == NULL) { 70 | return -1; 71 | } 72 | if (WebRtcVad_ValidRateAndFrameLength(fs, frame_length) != 0) { 73 | return -1; 74 | } 75 | 76 | if (fs == 48000) { 77 | vad = WebRtcVad_CalcVad48khz(self, audio_frame, frame_length); 78 | } else if (fs == 32000) { 79 | vad = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length); 80 | } else if (fs == 16000) { 81 | vad = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length); 82 | } else if (fs == 8000) { 83 | vad = WebRtcVad_CalcVad8khz(self, audio_frame, frame_length); 84 | } 85 | 86 | if (vad > 0) { 87 | vad = 1; 88 | } 89 | return vad; 90 | } 91 | 92 | int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length) { 93 | int return_value = -1; 94 | size_t i; 95 | int valid_length_ms; 96 | size_t valid_length; 97 | 98 | // We only allow 10, 20 or 30 ms frames. Loop through valid frame rates and 99 | // see if we have a matching pair. 
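  // (Editorial note, not in the original source: for example, at fs == 16000
  // the only accepted frame_length values are 160, 320 and 480 samples,
  // i.e. 10, 20 and 30 ms.)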
100 | for (i = 0; i < kRatesSize; i++) { 101 | if (kValidRates[i] == rate) { 102 | for (valid_length_ms = 10; valid_length_ms <= kMaxFrameLengthMs; 103 | valid_length_ms += 10) { 104 | valid_length = (size_t)(kValidRates[i] / 1000 * valid_length_ms); 105 | if (frame_length == valid_length) { 106 | return_value = 0; 107 | break; 108 | } 109 | } 110 | break; 111 | } 112 | } 113 | 114 | return return_value; 115 | } 116 | -------------------------------------------------------------------------------- /examples/analyze-files.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs"); 2 | const path = require("path"); 3 | const { SpeechRecorder } = require("../src/index"); 4 | 5 | const quantile = (elements, q) => { 6 | const sorted = elements.sort((a, b) => a - b); 7 | const p = (sorted.length - 1) * q; 8 | const base = Math.floor(p); 9 | const rest = p - base; 10 | if (sorted[base + 1] !== undefined) { 11 | return sorted[base] + rest * (sorted[base + 1] - sorted[base]); 12 | } else { 13 | return sorted[base]; 14 | } 15 | }; 16 | 17 | if (process.argv.length < 4) { 18 | console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels"); 19 | process.exit(1); 20 | } 21 | 22 | let currentFile; 23 | let samples = 0; 24 | const leadingBufferFrames = 10; 25 | const sampleRate = 16000; 26 | const samplesPerFrame = 480; 27 | let results = {}; 28 | let labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8")); 29 | 30 | const recorder = new SpeechRecorder({ 31 | leadingBufferFrames, 32 | samplesPerFrame, 33 | sampleRate, 34 | onAudio: ({ audio, probability, volume }) => { 35 | samples += audio.length; 36 | }, 37 | 38 | onChunkStart: ({ audio }) => { 39 | results[currentFile].speech.push([]); 40 | results[currentFile].speech[results[currentFile].speech.length - 1].push(samples / sampleRate); 41 | }, 42 | 43 | onChunkEnd: () => { 44 | results[currentFile].speech[results[currentFile].speech.length - 1].push(samples / sampleRate); 45 | }, 46 | }); 47 | 48 | fs.readdir(process.argv[2], async (error, files) => { 49 | for (const file of files) { 50 | if (!file.endsWith(".wav")) { 51 | continue; 52 | } 53 | 54 | currentFile = file; 55 | samples = 0; 56 | results[file] = { speech: [] }; 57 | console.log(`Processing ${file}...`); 58 | recorder.processFile(path.join(process.argv[2], file)); 59 | } 60 | 61 | let speechWindowTooSmall = []; 62 | let noiseWasSpeech = []; 63 | let noise = 0; 64 | let speech = 0; 65 | let extra = []; 66 | for (const i of Object.keys(results)) { 67 | const label = labels[i].speech; 68 | const result = results[i].speech; 69 | 70 | if (label.length == 0) { 71 | noise++; 72 | } else { 73 | speech++; 74 | } 75 | 76 | if (label.length == 0 && result.length > 0) { 77 | console.log("Noise was speech:", i); 78 | console.log("VAD:", result); 79 | noiseWasSpeech.push(i); 80 | } 81 | 82 | if (label.length > 0 && result.length > 0) { 83 | const start = Math.min(...result.map((e) => e[0])); 84 | const stop = Math.max(...result.map((e) => e[1])); 85 | if (isNaN(start) || isNaN(stop)) { 86 | continue; 87 | } 88 | 89 | const tolerance = 0.05; 90 | if ( 91 | start - (leadingBufferFrames * samplesPerFrame) / sampleRate > label[0] + tolerance || 92 | stop < label[1] - tolerance 93 | ) { 94 | console.log("Speech window too small:", i); 95 | console.log("Label:", label); 96 | console.log("VAD:", result, start, stop); 97 | speechWindowTooSmall.push(i); 98 | } else if (stop > label[1]) { 99 | extra.push(stop - label[1]); 100 | } 101 | } 
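    // Note (editorial addition): the subtraction of
    // (leadingBufferFrames * samplesPerFrame) / sampleRate in the check above accounts
    // for the leading buffer, which with the constants defined at the top of this
    // script spans 10 * 480 / 16000 = 0.3 s of audio prepended to each chunk.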
102 | } 103 | 104 | console.log( 105 | `\nSpeech window too small: ${(speechWindowTooSmall.length / speech).toFixed(2)} (${ 106 | speechWindowTooSmall.length 107 | } / ${speech})` 108 | ); 109 | 110 | console.log( 111 | `Noise was speech: ${noise > 0 ? (noiseWasSpeech.length / noise).toFixed(2) : 0} (${ 112 | noiseWasSpeech.length 113 | } / ${noise})` 114 | ); 115 | 116 | if (extra.length > 0) { 117 | console.log( 118 | `Average extra speech: ${(extra.reduce((a, b) => a + b) / extra.length).toFixed(2)}` 119 | ); 120 | console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`); 121 | console.log(`p90 extra speech: ${quantile(extra, 0.9).toFixed(2)}`); 122 | console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`); 123 | } 124 | }); 125 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/division_operations.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains implementations of the divisions 14 | * WebRtcSpl_DivU32U16() 15 | * WebRtcSpl_DivW32W16() 16 | * WebRtcSpl_DivW32W16ResW16() 17 | * WebRtcSpl_DivResultInQ31() 18 | * WebRtcSpl_DivW32HiLow() 19 | * 20 | * The description header can be found in signal_processing_library.h 21 | * 22 | */ 23 | 24 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 25 | #include "webrtc/rtc_base/sanitizer.h" 26 | 27 | uint32_t WebRtcSpl_DivU32U16(uint32_t num, uint16_t den) 28 | { 29 | // Guard against division with 0 30 | if (den != 0) 31 | { 32 | return (uint32_t)(num / den); 33 | } else 34 | { 35 | return (uint32_t)0xFFFFFFFF; 36 | } 37 | } 38 | 39 | int32_t WebRtcSpl_DivW32W16(int32_t num, int16_t den) 40 | { 41 | // Guard against division with 0 42 | if (den != 0) 43 | { 44 | return (int32_t)(num / den); 45 | } else 46 | { 47 | return (int32_t)0x7FFFFFFF; 48 | } 49 | } 50 | 51 | int16_t WebRtcSpl_DivW32W16ResW16(int32_t num, int16_t den) 52 | { 53 | // Guard against division with 0 54 | if (den != 0) 55 | { 56 | return (int16_t)(num / den); 57 | } else 58 | { 59 | return (int16_t)0x7FFF; 60 | } 61 | } 62 | 63 | int32_t WebRtcSpl_DivResultInQ31(int32_t num, int32_t den) 64 | { 65 | int32_t L_num = num; 66 | int32_t L_den = den; 67 | int32_t div = 0; 68 | int k = 31; 69 | int change_sign = 0; 70 | 71 | if (num == 0) 72 | return 0; 73 | 74 | if (num < 0) 75 | { 76 | change_sign++; 77 | L_num = -num; 78 | } 79 | if (den < 0) 80 | { 81 | change_sign++; 82 | L_den = -den; 83 | } 84 | while (k--) 85 | { 86 | div <<= 1; 87 | L_num <<= 1; 88 | if (L_num >= L_den) 89 | { 90 | L_num -= L_den; 91 | div++; 92 | } 93 | } 94 | if (change_sign == 1) 95 | { 96 | div = -div; 97 | } 98 | return div; 99 | } 100 | 101 | int32_t RTC_NO_SANITIZE("signed-integer-overflow") // bugs.webrtc.org/5486 102 | WebRtcSpl_DivW32HiLow(int32_t num, int16_t den_hi, int16_t den_low) 103 | { 104 | int16_t approx, tmp_hi, tmp_low, num_hi, num_low; 105 | int32_t tmpW32; 106 | 107 | approx = (int16_t)WebRtcSpl_DivW32W16((int32_t)0x1FFFFFFF, den_hi);
108 | // result in Q14 (Note: 3FFFFFFF = 0.5 in Q30) 109 | 110 | // tmpW32 = 1/den = approx * (2.0 - den * approx) (in Q30) 111 | tmpW32 = (den_hi * approx << 1) + ((den_low * approx >> 15) << 1); 112 | // tmpW32 = den * approx 113 | 114 | tmpW32 = (int32_t)0x7fffffffL - tmpW32; // result in Q30 (tmpW32 = 2.0-(den*approx)) 115 | // UBSan: 2147483647 - -2 cannot be represented in type 'int' 116 | 117 | // Store tmpW32 in hi and low format 118 | tmp_hi = (int16_t)(tmpW32 >> 16); 119 | tmp_low = (int16_t)((tmpW32 - ((int32_t)tmp_hi << 16)) >> 1); 120 | 121 | // tmpW32 = 1/den in Q29 122 | tmpW32 = (tmp_hi * approx + (tmp_low * approx >> 15)) << 1; 123 | 124 | // 1/den in hi and low format 125 | tmp_hi = (int16_t)(tmpW32 >> 16); 126 | tmp_low = (int16_t)((tmpW32 - ((int32_t)tmp_hi << 16)) >> 1); 127 | 128 | // Store num in hi and low format 129 | num_hi = (int16_t)(num >> 16); 130 | num_low = (int16_t)((num - ((int32_t)num_hi << 16)) >> 1); 131 | 132 | // num * (1/den) by 32 bit multiplication (result in Q28) 133 | 134 | tmpW32 = num_hi * tmp_hi + (num_hi * tmp_low >> 15) + 135 | (num_low * tmp_hi >> 15); 136 | 137 | // Put result in Q31 (convert from Q28) 138 | tmpW32 = WEBRTC_SPL_LSHIFT_W32(tmpW32, 3); 139 | 140 | return tmpW32; 141 | } 142 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | project(speechrecorder) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_OSX_DEPLOYMENT_TARGET 10.14) 6 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 7 | 8 | if(NOT APPLE AND NOT WIN32) 9 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) 10 | set(CMAKE_INSTALL_RPATH "$ORIGIN/") 11 | endif() 12 | 13 | option(BUILD_SHARED_LIBS "Build using shared libraries" ON) 14 | 15 | if(WIN32) 16 | add_compile_options( 17 | -DWEBRTC_WIN 18 | ) 19 | else() 20 | add_compile_options( 21 | -DWEBRTC_POSIX 22 | ) 23 | endif() 24 | 25 | include(FetchContent) 26 | set(FETCHCONTENT_UPDATES_DISCONNECTED ON) 27 | 28 | FetchContent_Declare(drwav 29 | GIT_REPOSITORY https://github.com/mackron/dr_libs 30 | GIT_TAG 9497270f581f43e6b795ce5d98d8764861fb6a50 31 | ) 32 | 33 | FetchContent_Declare(readerwriterqueue 34 | GIT_REPOSITORY https://github.com/cameron314/readerwriterqueue 35 | GIT_TAG v1.0.6 36 | ) 37 | 38 | FetchContent_MakeAvailable(drwav readerwriterqueue) 39 | 40 | include_directories( 41 | include 42 | 3rd_party/webrtcvad 43 | 3rd_party/portaudio/include 44 | 3rd_party/onnxruntime/include 45 | ) 46 | 47 | link_directories( 48 | ${CMAKE_SOURCE_DIR}/3rd_party/portaudio/lib 49 | ${CMAKE_SOURCE_DIR}/3rd_party/onnxruntime/lib 50 | ) 51 | 52 | file(GLOB_RECURSE SOURCES 53 | src/*.cpp 54 | 3rd_party/webrtcvad/*.c 55 | 3rd_party/webrtcvad/*.cc 56 | ) 57 | 58 | set(LIBRARIES 59 | readerwriterqueue 60 | ) 61 | 62 | if(APPLE) 63 | list(APPEND LIBRARIES 64 | "-framework AudioToolbox" 65 | "-framework AudioUnit" 66 | "-framework CoreAudio" 67 | "-framework CoreFoundation" 68 | "-framework CoreServices" 69 | portaudio 70 | onnxruntime.1.10.0 71 | ) 72 | elseif(WIN32) 73 | list(APPEND LIBRARIES 74 | onnxruntime 75 | ) 76 | 77 | if("${CMAKE_GENERATOR_PLATFORM}" STREQUAL "Win32") 78 | list(APPEND LIBRARIES 79 | portaudio_x86 80 | ) 81 | else() 82 | list(APPEND LIBRARIES 83 | portaudio_x64 84 | ) 85 | endif() 86 | else() 87 | list(APPEND LIBRARIES 88 | portaudio 89 | onnxruntime 90 | pthread 91 | ) 92 | endif() 93 | 94 | add_library(speechrecorder ${SOURCES}) 95 | 
target_link_libraries(speechrecorder ${LIBRARIES}) 96 | 97 | add_executable(main test/main.cpp) 98 | target_link_libraries(main speechrecorder) 99 | 100 | install(TARGETS speechrecorder DESTINATION lib) 101 | if (WIN32) 102 | install( 103 | FILES 104 | 3rd_party/onnxruntime/lib/onnxruntime.dll 105 | 3rd_party/onnxruntime/lib/onnxruntime.lib 106 | 3rd_party/onnxruntime/lib/onnxruntime_providers_shared.dll 107 | 3rd_party/onnxruntime/lib/onnxruntime_providers_shared.lib 108 | DESTINATION lib 109 | ) 110 | if("${CMAKE_GENERATOR_PLATFORM}" STREQUAL "Win32") 111 | install( 112 | FILES 113 | 3rd_party/portaudio/bin/portaudio_x86.dll 114 | 3rd_party/portaudio/lib/portaudio_x86.lib 115 | DESTINATION lib 116 | ) 117 | else() 118 | install( 119 | FILES 120 | 3rd_party/portaudio/bin/portaudio_x64.dll 121 | 3rd_party/portaudio/lib/portaudio_x64.lib 122 | DESTINATION lib 123 | ) 124 | endif() 125 | elseif(APPLE) 126 | install( 127 | FILES 128 | 3rd_party/onnxruntime/lib/libonnxruntime.1.10.0.dylib 129 | 3rd_party/portaudio/lib/libportaudio.dylib 130 | PERMISSIONS 131 | OWNER_READ OWNER_WRITE OWNER_EXECUTE 132 | GROUP_READ GROUP_EXECUTE 133 | WORLD_READ WORLD_EXECUTE 134 | DESTINATION lib 135 | ) 136 | else() 137 | install( 138 | FILES 139 | 3rd_party/onnxruntime/lib/libonnxruntime.so 140 | 3rd_party/onnxruntime/lib/libonnxruntime.so.1.10.0 141 | 3rd_party/portaudio/lib/libportaudio.so 142 | PERMISSIONS 143 | OWNER_READ OWNER_WRITE OWNER_EXECUTE 144 | GROUP_READ GROUP_EXECUTE 145 | WORLD_READ WORLD_EXECUTE 146 | DESTINATION lib 147 | ) 148 | endif() 149 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_core.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* 12 | * This header file includes the descriptions of the core VAD calls. 13 | */ 14 | 15 | #ifndef COMMON_AUDIO_VAD_VAD_CORE_H_ 16 | #define COMMON_AUDIO_VAD_VAD_CORE_H_ 17 | 18 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 19 | 20 | enum { kNumChannels = 6 }; // Number of frequency bands (named channels). 21 | enum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM. 22 | enum { kTableSize = kNumChannels * kNumGaussians }; 23 | enum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal. 24 | 25 | typedef struct VadInstT_ { 26 | int vad; 27 | int32_t downsampling_filter_states[4]; 28 | WebRtcSpl_State48khzTo8khz state_48_to_8; 29 | int16_t noise_means[kTableSize]; 30 | int16_t speech_means[kTableSize]; 31 | int16_t noise_stds[kTableSize]; 32 | int16_t speech_stds[kTableSize]; 33 | // TODO(bjornv): Change to |frame_count|. 34 | int32_t frame_counter; 35 | int16_t over_hang; // Over Hang 36 | int16_t num_of_speech; 37 | // TODO(bjornv): Change to |age_vector|. 38 | int16_t index_vector[16 * kNumChannels]; 39 | int16_t low_value_vector[16 * kNumChannels]; 40 | // TODO(bjornv): Change to |median|. 
41 | int16_t mean_value[kNumChannels]; 42 | int16_t upper_state[5]; 43 | int16_t lower_state[5]; 44 | int16_t hp_filter_state[4]; 45 | int16_t over_hang_max_1[3]; 46 | int16_t over_hang_max_2[3]; 47 | int16_t individual[3]; 48 | int16_t total[3]; 49 | 50 | int init_flag; 51 | } VadInstT; 52 | 53 | // Initializes the core VAD component. The default aggressiveness mode is 54 | // controlled by |kDefaultMode| in vad_core.c. 55 | // 56 | // - self [i/o] : Instance that should be initialized 57 | // 58 | // returns : 0 (OK), -1 (null pointer in or if the default mode can't be 59 | // set) 60 | int WebRtcVad_InitCore(VadInstT* self); 61 | 62 | /**************************************************************************** 63 | * WebRtcVad_set_mode_core(...) 64 | * 65 | * This function changes the VAD settings 66 | * 67 | * Input: 68 | * - inst : VAD instance 69 | * - mode : Aggressiveness degree 70 | * 0 (High quality) - 3 (Highly aggressive) 71 | * 72 | * Output: 73 | * - inst : Changed instance 74 | * 75 | * Return value : 0 - Ok 76 | * -1 - Error 77 | */ 78 | 79 | int WebRtcVad_set_mode_core(VadInstT* self, int mode); 80 | 81 | /**************************************************************************** 82 | * WebRtcVad_CalcVad48khz(...) 83 | * WebRtcVad_CalcVad32khz(...) 84 | * WebRtcVad_CalcVad16khz(...) 85 | * WebRtcVad_CalcVad8khz(...) 86 | * 87 | * Calculate probability for active speech and make VAD decision. 88 | * 89 | * Input: 90 | * - inst : Instance that should be initialized 91 | * - speech_frame : Input speech frame 92 | * - frame_length : Number of input samples 93 | * 94 | * Output: 95 | * - inst : Updated filter states etc. 96 | * 97 | * Return value : VAD decision 98 | * 0 - No active speech 99 | * 1-6 - Active speech 100 | */ 101 | int WebRtcVad_CalcVad48khz(VadInstT* inst, 102 | const int16_t* speech_frame, 103 | size_t frame_length); 104 | int WebRtcVad_CalcVad32khz(VadInstT* inst, 105 | const int16_t* speech_frame, 106 | size_t frame_length); 107 | int WebRtcVad_CalcVad16khz(VadInstT* inst, 108 | const int16_t* speech_frame, 109 | size_t frame_length); 110 | int WebRtcVad_CalcVad8khz(VadInstT* inst, 111 | const int16_t* speech_frame, 112 | size_t frame_length); 113 | 114 | #endif // COMMON_AUDIO_VAD_VAD_CORE_H_ 115 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/include/real_fft.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_ 12 | #define COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_ 13 | 14 | #include 15 | 16 | // For ComplexFFT(), the maximum fft order is 10; 17 | // WebRTC APM uses orders of only 7 and 8. 
18 | enum { kMaxFFTOrder = 10 }; 19 | 20 | struct RealFFT; 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | struct RealFFT* WebRtcSpl_CreateRealFFT(int order); 27 | void WebRtcSpl_FreeRealFFT(struct RealFFT* self); 28 | 29 | // Compute an FFT for a real-valued signal of length of 2^order, 30 | // where 1 < order <= MAX_FFT_ORDER. Transform length is determined by the 31 | // specification structure, which must be initialized prior to calling the FFT 32 | // function with WebRtcSpl_CreateRealFFT(). 33 | // The relationship between the input and output sequences can 34 | // be expressed in terms of the DFT, i.e.: 35 | // x[n] = (2^(-scalefactor)/N) . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N) 36 | // n=0,1,2,...N-1 37 | // N=2^order. 38 | // The conjugate-symmetric output sequence is represented using a CCS vector, 39 | // which is of length N+2, and is organized as follows: 40 | // Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1 41 | // Component: R0 0 R1 I1 R2 I2 . . . R[N/2-1] I[N/2-1] R[N/2] 0 42 | // where R[n] and I[n], respectively, denote the real and imaginary components 43 | // for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length. 44 | // Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to 45 | // the foldover frequency. 46 | // 47 | // Input Arguments: 48 | // self - pointer to preallocated and initialized FFT specification structure. 49 | // real_data_in - the input signal. For an ARM Neon platform, it must be 50 | // aligned on a 32-byte boundary. 51 | // 52 | // Output Arguments: 53 | // complex_data_out - the output complex signal with (2^order + 2) 16-bit 54 | // elements. For an ARM Neon platform, it must be different 55 | // from real_data_in, and aligned on a 32-byte boundary. 56 | // 57 | // Return Value: 58 | // 0 - FFT calculation is successful. 59 | // -1 - Error with bad arguments (null pointers). 60 | int WebRtcSpl_RealForwardFFT(struct RealFFT* self, 61 | const int16_t* real_data_in, 62 | int16_t* complex_data_out); 63 | 64 | // Compute the inverse FFT for a conjugate-symmetric input sequence of length of 65 | // 2^order, where 1 < order <= MAX_FFT_ORDER. Transform length is determined by 66 | // the specification structure, which must be initialized prior to calling the 67 | // FFT function with WebRtcSpl_CreateRealFFT(). 68 | // For a transform of length M, the input sequence is represented using a packed 69 | // CCS vector of length M+2, which is explained in the comments for 70 | // WebRtcSpl_RealForwardFFTC above. 71 | // 72 | // Input Arguments: 73 | // self - pointer to preallocated and initialized FFT specification structure. 74 | // complex_data_in - the input complex signal with (2^order + 2) 16-bit 75 | // elements. For an ARM Neon platform, it must be aligned on 76 | // a 32-byte boundary. 77 | // 78 | // Output Arguments: 79 | // real_data_out - the output real signal. For an ARM Neon platform, it must 80 | // be different to complex_data_in, and aligned on a 32-byte 81 | // boundary. 82 | // 83 | // Return Value: 84 | // 0 or a positive number - a value that the elements in the |real_data_out| 85 | // should be shifted left with in order to get 86 | // correct physical values. 87 | // -1 - Error with bad arguments (null pointers). 
88 | int WebRtcSpl_RealInverseFFT(struct RealFFT* self, 89 | const int16_t* complex_data_in, 90 | int16_t* real_data_out); 91 | 92 | #ifdef __cplusplus 93 | } 94 | #endif 95 | 96 | #endif // COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_REAL_FFT_H_ 97 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/type_traits.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 The WebRTC Project Authors. All rights reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef RTC_BASE_TYPE_TRAITS_H_ 12 | #define RTC_BASE_TYPE_TRAITS_H_ 13 | 14 | #include 15 | #include 16 | 17 | namespace rtc { 18 | 19 | // Determines if the given class has zero-argument .data() and .size() methods 20 | // whose return values are convertible to T* and size_t, respectively. 21 | template 22 | class HasDataAndSize { 23 | private: 24 | template < 25 | typename C, 26 | typename std::enable_if< 27 | std::is_convertible().data()), T*>::value && 28 | std::is_convertible().size()), 29 | std::size_t>::value>::type* = nullptr> 30 | static int Test(int); 31 | 32 | template 33 | static char Test(...); 34 | 35 | public: 36 | static constexpr bool value = std::is_same(0)), int>::value; 37 | }; 38 | 39 | namespace test_has_data_and_size { 40 | 41 | template 42 | struct Test1 { 43 | DR data(); 44 | SR size(); 45 | }; 46 | static_assert(HasDataAndSize, int>::value, ""); 47 | static_assert(HasDataAndSize, const int>::value, ""); 48 | static_assert(HasDataAndSize, const int>::value, ""); 49 | static_assert(!HasDataAndSize, int>::value, 50 | "implicit cast of const int* to int*"); 51 | static_assert(!HasDataAndSize, int>::value, 52 | "implicit cast of char* to int*"); 53 | 54 | struct Test2 { 55 | int* data; 56 | size_t size; 57 | }; 58 | static_assert(!HasDataAndSize::value, 59 | ".data and .size aren't functions"); 60 | 61 | struct Test3 { 62 | int* data(); 63 | }; 64 | static_assert(!HasDataAndSize::value, ".size() is missing"); 65 | 66 | class Test4 { 67 | int* data(); 68 | size_t size(); 69 | }; 70 | static_assert(!HasDataAndSize::value, 71 | ".data() and .size() are private"); 72 | 73 | } // namespace test_has_data_and_size 74 | 75 | namespace type_traits_impl { 76 | 77 | // Determines if the given type is an enum that converts implicitly to 78 | // an integral type. 79 | template 80 | struct IsIntEnum { 81 | private: 82 | // This overload is used if the type is an enum, and unary plus 83 | // compiles and turns it into an integral type. 84 | template ::value && 87 | std::is_integral())>::value>::type* = 88 | nullptr> 89 | static int Test(int); 90 | 91 | // Otherwise, this overload is used. 92 | template 93 | static char Test(...); 94 | 95 | public: 96 | static constexpr bool value = 97 | std::is_same::type>(0)), 98 | int>::value; 99 | }; 100 | 101 | } // namespace type_traits_impl 102 | 103 | // Determines if the given type is integral, or an enum that 104 | // converts implicitly to an integral type. 
105 | template 106 | struct IsIntlike { 107 | private: 108 | using X = typename std::remove_reference::type; 109 | 110 | public: 111 | static constexpr bool value = 112 | std::is_integral::value || type_traits_impl::IsIntEnum::value; 113 | }; 114 | 115 | namespace test_enum_intlike { 116 | 117 | enum E1 { e1 }; 118 | enum { e2 }; 119 | enum class E3 { e3 }; 120 | struct S {}; 121 | 122 | static_assert(type_traits_impl::IsIntEnum::value, ""); 123 | static_assert(type_traits_impl::IsIntEnum::value, ""); 124 | static_assert(!type_traits_impl::IsIntEnum::value, ""); 125 | static_assert(!type_traits_impl::IsIntEnum::value, ""); 126 | static_assert(!type_traits_impl::IsIntEnum::value, ""); 127 | static_assert(!type_traits_impl::IsIntEnum::value, ""); 128 | 129 | static_assert(IsIntlike::value, ""); 130 | static_assert(IsIntlike::value, ""); 131 | static_assert(!IsIntlike::value, ""); 132 | static_assert(IsIntlike::value, ""); 133 | static_assert(!IsIntlike::value, ""); 134 | static_assert(!IsIntlike::value, ""); 135 | 136 | } // namespace test_enum_intlike 137 | 138 | } // namespace rtc 139 | 140 | #endif // RTC_BASE_TYPE_TRAITS_H_ 141 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/complex_bit_reverse.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 12 | 13 | /* Tables for data buffer indexes that are bit reversed and thus need to be 14 | * swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap 15 | * operations, while index_7[{1, 3, 5, ...}] are for the right side of the 16 | * operation. Same for index_8. 17 | */ 18 | 19 | /* Indexes for the case of stages == 7. */ 20 | static const int16_t index_7[112] = { 21 | 1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104, 22 | 12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52, 23 | 23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98, 24 | 37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70, 25 | 51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69, 26 | 81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125, 27 | 103, 115, 111, 123 28 | }; 29 | 30 | /* Indexes for the case of stages == 8. 
*/ 31 | static const int16_t index_8[240] = { 32 | 1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80, 33 | 11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20, 34 | 40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184, 35 | 30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41, 36 | 148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76, 37 | 51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62, 38 | 124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82, 39 | 75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87, 40 | 234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101, 41 | 166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142, 42 | 115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131, 43 | 193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201, 44 | 149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171, 45 | 213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227, 46 | 203, 211, 207, 243, 215, 235, 223, 251, 239, 247 47 | }; 48 | 49 | void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) { 50 | /* For any specific value of stages, we know exactly the indexes that are 51 | * bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of 52 | * stages are 7 and 8, so we use tables to save unnecessary iterations and 53 | * calculations for these two cases. 54 | */ 55 | if (stages == 7 || stages == 8) { 56 | int m = 0; 57 | int length = 112; 58 | const int16_t* index = index_7; 59 | 60 | if (stages == 8) { 61 | length = 240; 62 | index = index_8; 63 | } 64 | 65 | /* Decimation in time. Swap the elements with bit-reversed indexes. */ 66 | for (m = 0; m < length; m += 2) { 67 | /* We declare a int32_t* type pointer, to load both the 16-bit real 68 | * and imaginary elements from complex_data in one instruction, reducing 69 | * complexity. 70 | */ 71 | int32_t* complex_data_ptr = (int32_t*)complex_data; 72 | int32_t temp = 0; 73 | 74 | temp = complex_data_ptr[index[m]]; /* Real and imaginary */ 75 | complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]]; 76 | complex_data_ptr[index[m + 1]] = temp; 77 | } 78 | } 79 | else { 80 | int m = 0, mr = 0, l = 0; 81 | int n = 1 << stages; 82 | int nn = n - 1; 83 | 84 | /* Decimation in time - re-order data */ 85 | for (m = 1; m <= nn; ++m) { 86 | int32_t* complex_data_ptr = (int32_t*)complex_data; 87 | int32_t temp = 0; 88 | 89 | /* Find out indexes that are bit-reversed. */ 90 | l = n; 91 | do { 92 | l >>= 1; 93 | } while (l > nn - mr); 94 | mr = (mr & (l - 1)) + l; 95 | 96 | if (mr <= m) { 97 | continue; 98 | } 99 | 100 | /* Swap the elements with bit-reversed indexes. 101 | * This is similar to the loop in the stages == 7 or 8 cases. 102 | */ 103 | temp = complex_data_ptr[m]; /* Real and imaginary */ 104 | complex_data_ptr[m] = complex_data_ptr[mr]; 105 | complex_data_ptr[mr] = temp; 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speech Recorder 2 | 3 | speech-recorder is a cross-platform, native [node.js](https://nodejs.org) [addon](http://nodejs.org/api/addons.html) for getting a stream of audio from a device's microphone. 
Using speech-recorder, you can also get only the audio that corresponds to someone speaking. 4 | 5 | This module is used for speech recognition in [Serenade](https://serenade.ai). Serenade enables you to write code through natural speech, rather than typing. 6 | 7 | ## Installation 8 | 9 | speech-recorder has been tested on Windows 10, macOS 10.14+, and Ubuntu 18.04+ (and may work on other platforms as well). 10 | 11 | To install speech-recorder, run: 12 | 13 | yarn add speech-recorder 14 | 15 | If you're using this library with Electron, you should probably use [electron-rebuild](https://github.com/electron/electron-rebuild). 16 | 17 | ## Usage 18 | 19 | This library uses two voice activity detection mechanisms: a fast first pass (the WebRTC VAD), and a slightly slower, but much more accurate, second pass (the Silero VAD). See below for the various options you can supply to each. 20 | 21 | ### Streaming 22 | 23 | When you start recording, you can register various callbacks. `onAudio` is called when any audio comes in from the microphone. `onChunkStart` is called when a chunk of speech begins, and `onChunkEnd` is called when speech ends. 24 | 25 | const { SpeechRecorder } = require("speech-recorder"); 26 | 27 | const recorder = new SpeechRecorder({ 28 | onChunkStart: ({ audio }) => { 29 | console.log(Date.now(), "Chunk start"); 30 | }, 31 | onAudio: ({ speaking, probability, volume }) => { 32 | console.log(Date.now(), speaking, probability, volume); 33 | }, 34 | onChunkEnd: () => { 35 | console.log(Date.now(), "Chunk end"); 36 | }, 37 | }); 38 | 39 | console.log("Recording for 5 seconds..."); 40 | recorder.start(); 41 | setTimeout(() => { 42 | console.log("Done!"); 43 | recorder.stop(); 44 | }, 5000); 45 | 46 | You can write all audio from the microphone to a file with: 47 | 48 | const { SpeechRecorder } = require("speech-recorder"); 49 | 50 | const writeStream = fs.createWriteStream("audio.raw"); 51 | const recorder = new SpeechRecorder({ 52 | onAudio: ({ audio }) => { 53 | writeStream.write(audio); 54 | } 55 | }); 56 | 57 | Or, just the speech with: 58 | 59 | const { SpeechRecorder } = require("speech-recorder"); 60 | 61 | const writeStream = fs.createWriteStream("audio.raw"); 62 | const recorder = new SpeechRecorder({ 63 | onAudio: ({ audio, speech }) => { 64 | if (speech) { 65 | writeStream.write(audio); 66 | } 67 | } 68 | }); 69 | 70 | ### Devices 71 | 72 | You can get a list of supported devices with: 73 | 74 | const { devices } = require("speech-recorder"); 75 | 76 | console.log(devices()); 77 | 78 | ### Options 79 | 80 | * `consecutiveFramesForSilence`: How many frames of audio must be silent before `onChunkEnd` is fired. Default `10`. 81 | * `consecutiveFramesForSpeaking`: How many frames of audio must be speech before `onChunkStart` is fired. Default `1`. 82 | * `device`: ID of the device to use for input (i.e., from the example above). Specify `-1` to use the system default. Default `-1`. 83 | * `leadingBufferFrames`: How many frames of audio to keep in a buffer that's included in `onChunkStart`. Default `10`. 84 | * `onChunkStart`: Callback to be executed when speech starts. 85 | * `onAudio`: Callback to be executed when any audio comes in. 86 | * `onChunkEnd`: Callback to be executed when speech ends. 87 | * `samplesPerFrame`: How many audio samples to be included in each frame from the microphone. Default `480`. 88 | * `sampleRate`: Audio sample rate. Default `16000`. 89 | * `sileroVadBufferSize`: How many audio samples to pass to the VAD. Default `2000`. 
90 | * `sileroVadRateLimit`: Rate limit, in frames, for how frequently to call the VAD. Default `3`. 91 | * `sileroVadSilenceThreshold`: Probability threshold for speech to transition to silence. Default `0.1`. 92 | * `sileroVadSpeakingThreshold`: Probability threshold for silence to transition to speech. Default `0.3`. 93 | * `webrtcVadLevel`: Aggressiveness for the first-pass VAD filter. `0` is least aggressive, and `3` is most aggressive. Default `3`. 94 | * `webrtcVadBufferSize`: How many audio samples to pass to the first-pass VAD filter. Default `480`. Can only be `160`, `320`, or `480`. 95 | * `webrtcVadResultsSize`: How many first-pass VAD filter results to keep in history. Default `10`. 96 | 97 | ## Building SpeechRecorder 98 | 99 | If you want to build speech-recorder from source, first install the necessary dependencies by running: 100 | 101 | ./setup.sh <arch> 102 | 103 | Where `<arch>` specifies the architecture you'd like to build for and is one of `x86`, `x64`, or `arm64`. If you're not sure, you probably want `x64`. 104 | 105 | Then, you can build speech-recorder with: 106 | 107 | ./build.sh 108 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/sanitizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 The WebRTC Project Authors. All rights reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #ifndef RTC_BASE_SANITIZER_H_ 12 | #define RTC_BASE_SANITIZER_H_ 13 | 14 | #include <stddef.h> // For size_t. 15 | 16 | #ifdef __cplusplus 17 | #include <type_traits> 18 | #endif 19 | 20 | #if defined(__has_feature) 21 | #if __has_feature(address_sanitizer) 22 | #define RTC_HAS_ASAN 1 23 | #endif 24 | #if __has_feature(memory_sanitizer) 25 | #define RTC_HAS_MSAN 1 26 | #endif 27 | #endif 28 | #ifndef RTC_HAS_ASAN 29 | #define RTC_HAS_ASAN 0 30 | #endif 31 | #ifndef RTC_HAS_MSAN 32 | #define RTC_HAS_MSAN 0 33 | #endif 34 | 35 | #if RTC_HAS_ASAN 36 | #include <sanitizer/asan_interface.h> 37 | #endif 38 | #if RTC_HAS_MSAN 39 | #include <sanitizer/msan_interface.h> 40 | #endif 41 | 42 | #ifdef __has_attribute 43 | #if __has_attribute(no_sanitize) 44 | #define RTC_NO_SANITIZE(what) __attribute__((no_sanitize(what))) 45 | #endif 46 | #endif 47 | #ifndef RTC_NO_SANITIZE 48 | #define RTC_NO_SANITIZE(what) 49 | #endif 50 | 51 | // Ask ASan to mark the memory range [ptr, ptr + element_size * num_elements) 52 | // as being unaddressable, so that reads and writes are not allowed. ASan may 53 | // narrow the range to the nearest alignment boundaries. 54 | static inline void rtc_AsanPoison(const volatile void* ptr, 55 | size_t element_size, 56 | size_t num_elements) { 57 | #if RTC_HAS_ASAN 58 | ASAN_POISON_MEMORY_REGION(ptr, element_size * num_elements); 59 | #endif 60 | } 61 | 62 | // Ask ASan to mark the memory range [ptr, ptr + element_size * num_elements) 63 | // as being addressable, so that reads and writes are allowed. ASan may widen 64 | // the range to the nearest alignment boundaries.
65 | static inline void rtc_AsanUnpoison(const volatile void* ptr, 66 | size_t element_size, 67 | size_t num_elements) { 68 | #if RTC_HAS_ASAN 69 | ASAN_UNPOISON_MEMORY_REGION(ptr, element_size * num_elements); 70 | #endif 71 | } 72 | 73 | // Ask MSan to mark the memory range [ptr, ptr + element_size * num_elements) 74 | // as being uninitialized. 75 | static inline void rtc_MsanMarkUninitialized(const volatile void* ptr, 76 | size_t element_size, 77 | size_t num_elements) { 78 | #if RTC_HAS_MSAN 79 | __msan_poison(ptr, element_size * num_elements); 80 | #endif 81 | } 82 | 83 | // Force an MSan check (if any bits in the memory range [ptr, ptr + 84 | // element_size * num_elements) are uninitialized the call will crash with an 85 | // MSan report). 86 | static inline void rtc_MsanCheckInitialized(const volatile void* ptr, 87 | size_t element_size, 88 | size_t num_elements) { 89 | #if RTC_HAS_MSAN 90 | __msan_check_mem_is_initialized(ptr, element_size * num_elements); 91 | #endif 92 | } 93 | 94 | #ifdef __cplusplus 95 | 96 | namespace rtc { 97 | namespace sanitizer_impl { 98 | 99 | template 100 | constexpr bool IsTriviallyCopyable() { 101 | return static_cast(std::is_trivially_copy_constructible::value && 102 | (std::is_trivially_copy_assignable::value || 103 | !std::is_copy_assignable::value) && 104 | std::is_trivially_destructible::value); 105 | } 106 | 107 | } // namespace sanitizer_impl 108 | 109 | template 110 | inline void AsanPoison(const T& mem) { 111 | rtc_AsanPoison(mem.data(), sizeof(mem.data()[0]), mem.size()); 112 | } 113 | 114 | template 115 | inline void AsanUnpoison(const T& mem) { 116 | rtc_AsanUnpoison(mem.data(), sizeof(mem.data()[0]), mem.size()); 117 | } 118 | 119 | template 120 | inline void MsanMarkUninitialized(const T& mem) { 121 | rtc_MsanMarkUninitialized(mem.data(), sizeof(mem.data()[0]), mem.size()); 122 | } 123 | 124 | template 125 | inline T MsanUninitialized(T t) { 126 | #if RTC_HAS_MSAN 127 | // TODO(bugs.webrtc.org/8762): Switch to std::is_trivially_copyable when it 128 | // becomes available in downstream projects. 129 | static_assert(sanitizer_impl::IsTriviallyCopyable(), ""); 130 | #endif 131 | rtc_MsanMarkUninitialized(&t, sizeof(T), 1); 132 | return t; 133 | } 134 | 135 | template 136 | inline void MsanCheckInitialized(const T& mem) { 137 | rtc_MsanCheckInitialized(mem.data(), sizeof(mem.data()[0]), mem.size()); 138 | } 139 | 140 | } // namespace rtc 141 | 142 | #endif // __cplusplus 143 | 144 | #endif // RTC_BASE_SANITIZER_H_ 145 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/spl_init.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* The global function contained in this file initializes SPL function 12 | * pointers, currently only for ARM platforms. 13 | * 14 | * Some code came from common/rtcd.c in the WebM project. 
15 | */ 16 | 17 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 18 | #include "webrtc/system_wrappers/include/cpu_features_wrapper.h" 19 | 20 | /* Declare function pointers. */ 21 | MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16; 22 | MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32; 23 | MaxValueW16 WebRtcSpl_MaxValueW16; 24 | MaxValueW32 WebRtcSpl_MaxValueW32; 25 | MinValueW16 WebRtcSpl_MinValueW16; 26 | MinValueW32 WebRtcSpl_MinValueW32; 27 | CrossCorrelation WebRtcSpl_CrossCorrelation; 28 | DownsampleFast WebRtcSpl_DownsampleFast; 29 | ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound; 30 | 31 | #if (!defined(WEBRTC_HAS_NEON)) && !defined(MIPS32_LE) 32 | /* Initialize function pointers to the generic C version. */ 33 | static void InitPointersToC(void) { 34 | WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C; 35 | WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C; 36 | WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16C; 37 | WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32C; 38 | WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16C; 39 | WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32C; 40 | WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationC; 41 | WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastC; 42 | WebRtcSpl_ScaleAndAddVectorsWithRound = 43 | WebRtcSpl_ScaleAndAddVectorsWithRoundC; 44 | } 45 | #endif 46 | 47 | #if defined(WEBRTC_HAS_NEON) 48 | /* Initialize function pointers to the Neon version. */ 49 | static void InitPointersToNeon(void) { 50 | WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon; 51 | WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32Neon; 52 | WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16Neon; 53 | WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32Neon; 54 | WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16Neon; 55 | WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32Neon; 56 | WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationNeon; 57 | WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastNeon; 58 | WebRtcSpl_ScaleAndAddVectorsWithRound = 59 | WebRtcSpl_ScaleAndAddVectorsWithRoundC; 60 | } 61 | #endif 62 | 63 | #if defined(MIPS32_LE) 64 | /* Initialize function pointers to the MIPS version. 
*/ 65 | static void InitPointersToMIPS(void) { 66 | WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16_mips; 67 | WebRtcSpl_MaxValueW16 = WebRtcSpl_MaxValueW16_mips; 68 | WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32_mips; 69 | WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16_mips; 70 | WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32_mips; 71 | WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelation_mips; 72 | WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFast_mips; 73 | #if defined(MIPS_DSP_R1_LE) 74 | WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32_mips; 75 | WebRtcSpl_ScaleAndAddVectorsWithRound = 76 | WebRtcSpl_ScaleAndAddVectorsWithRound_mips; 77 | #else 78 | WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C; 79 | WebRtcSpl_ScaleAndAddVectorsWithRound = 80 | WebRtcSpl_ScaleAndAddVectorsWithRoundC; 81 | #endif 82 | } 83 | #endif 84 | 85 | static void InitFunctionPointers(void) { 86 | #if defined(WEBRTC_HAS_NEON) 87 | InitPointersToNeon(); 88 | #elif defined(MIPS32_LE) 89 | InitPointersToMIPS(); 90 | #else 91 | InitPointersToC(); 92 | #endif /* WEBRTC_HAS_NEON */ 93 | } 94 | 95 | #if defined(WEBRTC_POSIX) 96 | #include 97 | 98 | static void once(void (*func)(void)) { 99 | static pthread_once_t lock = PTHREAD_ONCE_INIT; 100 | pthread_once(&lock, func); 101 | } 102 | 103 | #elif defined(_WIN32) 104 | #include 105 | 106 | static void once(void (*func)(void)) { 107 | /* Didn't use InitializeCriticalSection() since there's no race-free context 108 | * in which to execute it. 109 | * 110 | * TODO(kma): Change to different implementation (e.g. 111 | * InterlockedCompareExchangePointer) to avoid issues similar to 112 | * http://code.google.com/p/webm/issues/detail?id=467. 113 | */ 114 | static CRITICAL_SECTION lock = {(void *)((size_t)-1), -1, 0, 0, 0, 0}; 115 | static int done = 0; 116 | 117 | EnterCriticalSection(&lock); 118 | if (!done) { 119 | func(); 120 | done = 1; 121 | } 122 | LeaveCriticalSection(&lock); 123 | } 124 | 125 | /* There's no fallback version as an #else block here to ensure thread safety. 126 | * In case of neither pthread for WEBRTC_POSIX nor _WIN32 is present, build 127 | * system should pick it up. 128 | */ 129 | #endif /* WEBRTC_POSIX */ 130 | 131 | void WebRtcSpl_Init(void) { 132 | once(InitFunctionPointers); 133 | } 134 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/typedefs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // This file contains platform-specific typedefs and defines. 12 | // Much of it is derived from Chromium's build/build_config.h. 13 | 14 | #ifndef WEBRTC_TYPEDEFS_H_ 15 | #define WEBRTC_TYPEDEFS_H_ 16 | 17 | // Processor architecture detection. 
For more info on what's defined, see: 18 | // http://msdn.microsoft.com/en-us/library/b0084kay.aspx 19 | // http://www.agner.org/optimize/calling_conventions.pdf 20 | // or with gcc, run: "echo | gcc -E -dM -" 21 | #if defined(_M_X64) || defined(__x86_64__) 22 | #define WEBRTC_ARCH_X86_FAMILY 23 | #define WEBRTC_ARCH_X86_64 24 | #define WEBRTC_ARCH_64_BITS 25 | #define WEBRTC_ARCH_LITTLE_ENDIAN 26 | #elif defined(__aarch64__) 27 | #define WEBRTC_ARCH_64_BITS 28 | #define WEBRTC_ARCH_LITTLE_ENDIAN 29 | #elif defined(_M_IX86) || defined(__i386__) 30 | #define WEBRTC_ARCH_X86_FAMILY 31 | #define WEBRTC_ARCH_X86 32 | #define WEBRTC_ARCH_32_BITS 33 | #define WEBRTC_ARCH_LITTLE_ENDIAN 34 | #elif defined(__ARMEL__) 35 | // TODO(ajm): We'd prefer to control platform defines here, but this is 36 | // currently provided by the Android makefiles. Commented to avoid duplicate 37 | // definition warnings. 38 | //#define WEBRTC_ARCH_ARM 39 | // TODO(ajm): Chromium uses the following two defines. Should we switch? 40 | //#define WEBRTC_ARCH_ARM_FAMILY 41 | //#define WEBRTC_ARCH_ARMEL 42 | #define WEBRTC_ARCH_32_BITS 43 | #define WEBRTC_ARCH_LITTLE_ENDIAN 44 | #elif defined(__MIPSEL__) 45 | #define WEBRTC_ARCH_32_BITS 46 | #define WEBRTC_ARCH_LITTLE_ENDIAN 47 | #elif defined(__pnacl__) 48 | #define WEBRTC_ARCH_32_BITS 49 | #define WEBRTC_ARCH_LITTLE_ENDIAN 50 | #else 51 | #error Please add support for your architecture in typedefs.h 52 | #endif 53 | 54 | #if !(defined(WEBRTC_ARCH_LITTLE_ENDIAN) ^ defined(WEBRTC_ARCH_BIG_ENDIAN)) 55 | #error Define either WEBRTC_ARCH_LITTLE_ENDIAN or WEBRTC_ARCH_BIG_ENDIAN 56 | #endif 57 | 58 | #if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)) || \ 59 | (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)) 60 | #define WEBRTC_CPU_DETECTION 61 | #endif 62 | 63 | #if !defined(_MSC_VER) 64 | #include 65 | #else 66 | // Define C99 equivalent types, since pre-2010 MSVC doesn't provide stdint.h. 67 | typedef signed char int8_t; 68 | typedef signed short int16_t; 69 | typedef signed int int32_t; 70 | typedef __int64 int64_t; 71 | typedef unsigned char uint8_t; 72 | typedef unsigned short uint16_t; 73 | typedef unsigned int uint32_t; 74 | typedef unsigned __int64 uint64_t; 75 | #endif 76 | 77 | // Borrowed from Chromium's base/compiler_specific.h. 78 | // Annotate a virtual method indicating it must be overriding a virtual 79 | // method in the parent class. 80 | // Use like: 81 | // virtual void foo() OVERRIDE; 82 | #if defined(_MSC_VER) 83 | #define OVERRIDE override 84 | #elif defined(__clang__) 85 | // Clang defaults to C++03 and warns about using override. Squelch that. 86 | // Intentionally no push/pop here so all users of OVERRIDE ignore the warning 87 | // too. This is like passing -Wno-c++11-extensions, except that GCC won't die 88 | // (because it won't see this pragma). 89 | #pragma clang diagnostic ignored "-Wc++11-extensions" 90 | #define OVERRIDE override 91 | #elif defined(__GNUC__) && __cplusplus >= 201103 && \ 92 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) >= 40700 93 | // GCC 4.7 supports explicit virtual overrides when C++11 support is enabled. 94 | #define OVERRIDE override 95 | #else 96 | #define OVERRIDE 97 | #endif 98 | 99 | // Annotate a function indicating the caller must examine the return value. 100 | // Use like: 101 | // int foo() WARN_UNUSED_RESULT; 102 | // TODO(ajm): Hack to avoid multiple definitions until the base/ of webrtc and 103 | // libjingle are merged. 
104 | #if !defined(WARN_UNUSED_RESULT) 105 | #if defined(__GNUC__) 106 | #define WARN_UNUSED_RESULT __attribute__((warn_unused_result)) 107 | #else 108 | #define WARN_UNUSED_RESULT 109 | #endif 110 | #endif // WARN_UNUSED_RESULT 111 | 112 | // Put after a variable that might not be used, to prevent compiler warnings: 113 | // int result ATTRIBUTE_UNUSED = DoSomething(); 114 | // assert(result == 17); 115 | #ifndef ATTRIBUTE_UNUSED 116 | #if defined(__GNUC__) || defined(__clang__) 117 | #define ATTRIBUTE_UNUSED __attribute__((unused)) 118 | #else 119 | #define ATTRIBUTE_UNUSED 120 | #endif 121 | #endif 122 | 123 | // Macro to be used for switch-case fallthrough (required for enabling 124 | // -Wimplicit-fallthrough warning on Clang). 125 | #ifndef FALLTHROUGH 126 | #if defined(__clang__) 127 | #define FALLTHROUGH() [[clang::fallthrough]] 128 | #else 129 | #define FALLTHROUGH() do { } while (0) 130 | #endif 131 | #endif 132 | 133 | // Annotate a function that will not return control flow to the caller. 134 | #if defined(_MSC_VER) 135 | #define NO_RETURN __declspec(noreturn) 136 | #elif defined(__GNUC__) 137 | #define NO_RETURN __attribute__((noreturn)) 138 | #else 139 | #define NO_RETURN 140 | #endif 141 | 142 | #endif // WEBRTC_TYPEDEFS_H_ 143 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/rtc_base/checks.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2006 The WebRTC Project Authors. All rights reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // Most of this was borrowed (with minor modifications) from V8's and Chromium's 12 | // src/base/logging.cc. 13 | 14 | #include <cstdarg> 15 | #include <cstdio> 16 | #include <cstring> 17 | 18 | #if defined(WEBRTC_ANDROID) 19 | #define RTC_LOG_TAG_ANDROID "rtc" 20 | #include <android/log.h> // NOLINT 21 | #endif 22 | 23 | #if defined(WEBRTC_WIN) 24 | #include <windows.h> 25 | #endif 26 | 27 | #if defined(WEBRTC_WIN) 28 | #define LAST_SYSTEM_ERROR (::GetLastError()) 29 | #elif defined(__native_client__) && __native_client__ 30 | #define LAST_SYSTEM_ERROR (0) 31 | #elif defined(WEBRTC_POSIX) 32 | #include <errno.h> 33 | #define LAST_SYSTEM_ERROR (errno) 34 | #endif // WEBRTC_WIN 35 | 36 | #include "webrtc/rtc_base/checks.h" 37 | 38 | namespace { 39 | #if defined(__GNUC__) 40 | __attribute__((__format__(__printf__, 2, 3))) 41 | #endif 42 | void AppendFormat(std::string* s, const char* fmt, ...) { 43 | va_list args, copy; 44 | va_start(args, fmt); 45 | va_copy(copy, args); 46 | const int predicted_length = std::vsnprintf(nullptr, 0, fmt, copy); 47 | va_end(copy); 48 | 49 | if (predicted_length > 0) { 50 | const size_t size = s->size(); 51 | s->resize(size + predicted_length); 52 | // Pass "+ 1" to vsnprintf to include space for the '\0'. 53 | std::vsnprintf(&((*s)[size]), predicted_length + 1, fmt, args); 54 | } 55 | va_end(args); 56 | } 57 | } 58 | 59 | namespace rtc { 60 | namespace webrtc_checks_impl { 61 | 62 | // Reads one argument from args, appends it to s and advances fmt. 63 | // Returns true iff an argument was successfully parsed.
64 | bool ParseArg(va_list* args, const CheckArgType** fmt, std::string* s) { 65 | if (**fmt == CheckArgType::kEnd) 66 | return false; 67 | 68 | switch (**fmt) { 69 | case CheckArgType::kInt: 70 | AppendFormat(s, "%d", va_arg(*args, int)); 71 | break; 72 | case CheckArgType::kLong: 73 | AppendFormat(s, "%ld", va_arg(*args, long)); 74 | break; 75 | case CheckArgType::kLongLong: 76 | AppendFormat(s, "%lld", va_arg(*args, long long)); 77 | break; 78 | case CheckArgType::kUInt: 79 | AppendFormat(s, "%u", va_arg(*args, unsigned)); 80 | break; 81 | case CheckArgType::kULong: 82 | AppendFormat(s, "%lu", va_arg(*args, unsigned long)); 83 | break; 84 | case CheckArgType::kULongLong: 85 | AppendFormat(s, "%llu", va_arg(*args, unsigned long long)); 86 | break; 87 | case CheckArgType::kDouble: 88 | AppendFormat(s, "%g", va_arg(*args, double)); 89 | break; 90 | case CheckArgType::kLongDouble: 91 | AppendFormat(s, "%Lg", va_arg(*args, long double)); 92 | break; 93 | case CheckArgType::kCharP: 94 | s->append(va_arg(*args, const char*)); 95 | break; 96 | case CheckArgType::kStdString: 97 | s->append(*va_arg(*args, const std::string*)); 98 | break; 99 | case CheckArgType::kVoidP: 100 | AppendFormat(s, "%p", va_arg(*args, const void*)); 101 | break; 102 | default: 103 | s->append("[Invalid CheckArgType]"); 104 | return false; 105 | } 106 | (*fmt)++; 107 | return true; 108 | } 109 | 110 | RTC_NORETURN void FatalLog(const char* file, 111 | int line, 112 | const char* message, 113 | const CheckArgType* fmt, 114 | ...) { 115 | va_list args; 116 | va_start(args, fmt); 117 | 118 | std::string s; 119 | AppendFormat(&s, 120 | "\n\n" 121 | "#\n" 122 | "# Fatal error in: %s, line %d\n" 123 | "# last system error: %u\n" 124 | "# Check failed: %s", 125 | file, line, LAST_SYSTEM_ERROR, message); 126 | 127 | if (*fmt == CheckArgType::kCheckOp) { 128 | // This log message was generated by RTC_CHECK_OP, so we have to complete 129 | // the error message using the operands that have been passed as the first 130 | // two arguments. 131 | fmt++; 132 | 133 | std::string s1, s2; 134 | if (ParseArg(&args, &fmt, &s1) && ParseArg(&args, &fmt, &s2)) 135 | AppendFormat(&s, " (%s vs. %s)\n# ", s1.c_str(), s2.c_str()); 136 | } else { 137 | s.append("\n# "); 138 | } 139 | 140 | // Append all the user-supplied arguments to the message. 141 | while (ParseArg(&args, &fmt, &s)) 142 | ; 143 | 144 | va_end(args); 145 | 146 | const char* output = s.c_str(); 147 | 148 | #if defined(WEBRTC_ANDROID) 149 | __android_log_print(ANDROID_LOG_ERROR, RTC_LOG_TAG_ANDROID, "%s\n", output); 150 | #endif 151 | 152 | fflush(stdout); 153 | fprintf(stderr, "%s", output); 154 | fflush(stderr); 155 | abort(); 156 | } 157 | 158 | } // namespace webrtc_checks_impl 159 | } // namespace rtc 160 | 161 | // Function to call from the C version of the RTC_CHECK and RTC_DCHECK macros. 162 | RTC_NORETURN void rtc_FatalMessage(const char* file, int line, 163 | const char* msg) { 164 | static constexpr rtc::webrtc_checks_impl::CheckArgType t[] = { 165 | rtc::webrtc_checks_impl::CheckArgType::kEnd}; 166 | FatalLog(file, line, msg, t); 167 | } 168 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/vector_scaling_operations.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 
3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains implementations of the functions 14 | * WebRtcSpl_VectorBitShiftW16() 15 | * WebRtcSpl_VectorBitShiftW32() 16 | * WebRtcSpl_VectorBitShiftW32ToW16() 17 | * WebRtcSpl_ScaleVector() 18 | * WebRtcSpl_ScaleVectorWithSat() 19 | * WebRtcSpl_ScaleAndAddVectors() 20 | * WebRtcSpl_ScaleAndAddVectorsWithRoundC() 21 | */ 22 | 23 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 24 | 25 | void WebRtcSpl_VectorBitShiftW16(int16_t *res, size_t length, 26 | const int16_t *in, int16_t right_shifts) 27 | { 28 | size_t i; 29 | 30 | if (right_shifts > 0) 31 | { 32 | for (i = length; i > 0; i--) 33 | { 34 | (*res++) = ((*in++) >> right_shifts); 35 | } 36 | } else 37 | { 38 | for (i = length; i > 0; i--) 39 | { 40 | (*res++) = ((*in++) * (1 << (-right_shifts))); 41 | } 42 | } 43 | } 44 | 45 | void WebRtcSpl_VectorBitShiftW32(int32_t *out_vector, 46 | size_t vector_length, 47 | const int32_t *in_vector, 48 | int16_t right_shifts) 49 | { 50 | size_t i; 51 | 52 | if (right_shifts > 0) 53 | { 54 | for (i = vector_length; i > 0; i--) 55 | { 56 | (*out_vector++) = ((*in_vector++) >> right_shifts); 57 | } 58 | } else 59 | { 60 | for (i = vector_length; i > 0; i--) 61 | { 62 | (*out_vector++) = ((*in_vector++) << (-right_shifts)); 63 | } 64 | } 65 | } 66 | 67 | void WebRtcSpl_VectorBitShiftW32ToW16(int16_t* out, size_t length, 68 | const int32_t* in, int right_shifts) { 69 | size_t i; 70 | int32_t tmp_w32; 71 | 72 | if (right_shifts >= 0) { 73 | for (i = length; i > 0; i--) { 74 | tmp_w32 = (*in++) >> right_shifts; 75 | (*out++) = WebRtcSpl_SatW32ToW16(tmp_w32); 76 | } 77 | } else { 78 | int left_shifts = -right_shifts; 79 | for (i = length; i > 0; i--) { 80 | tmp_w32 = (*in++) << left_shifts; 81 | (*out++) = WebRtcSpl_SatW32ToW16(tmp_w32); 82 | } 83 | } 84 | } 85 | 86 | void WebRtcSpl_ScaleVector(const int16_t *in_vector, int16_t *out_vector, 87 | int16_t gain, size_t in_vector_length, 88 | int16_t right_shifts) 89 | { 90 | // Performs vector operation: out_vector = (gain*in_vector)>>right_shifts 91 | size_t i; 92 | const int16_t *inptr; 93 | int16_t *outptr; 94 | 95 | inptr = in_vector; 96 | outptr = out_vector; 97 | 98 | for (i = 0; i < in_vector_length; i++) 99 | { 100 | *outptr++ = (int16_t)((*inptr++ * gain) >> right_shifts); 101 | } 102 | } 103 | 104 | void WebRtcSpl_ScaleVectorWithSat(const int16_t *in_vector, int16_t *out_vector, 105 | int16_t gain, size_t in_vector_length, 106 | int16_t right_shifts) 107 | { 108 | // Performs vector operation: out_vector = (gain*in_vector)>>right_shifts 109 | size_t i; 110 | const int16_t *inptr; 111 | int16_t *outptr; 112 | 113 | inptr = in_vector; 114 | outptr = out_vector; 115 | 116 | for (i = 0; i < in_vector_length; i++) { 117 | *outptr++ = WebRtcSpl_SatW32ToW16((*inptr++ * gain) >> right_shifts); 118 | } 119 | } 120 | 121 | void WebRtcSpl_ScaleAndAddVectors(const int16_t *in1, int16_t gain1, int shift1, 122 | const int16_t *in2, int16_t gain2, int shift2, 123 | int16_t *out, size_t vector_length) 124 | { 125 | // Performs vector operation: out = (gain1*in1)>>shift1 + (gain2*in2)>>shift2 126 | size_t i; 127 | const int16_t *in1ptr; 128 | const 
int16_t *in2ptr; 129 | int16_t *outptr; 130 | 131 | in1ptr = in1; 132 | in2ptr = in2; 133 | outptr = out; 134 | 135 | for (i = 0; i < vector_length; i++) 136 | { 137 | *outptr++ = (int16_t)((gain1 * *in1ptr++) >> shift1) + 138 | (int16_t)((gain2 * *in2ptr++) >> shift2); 139 | } 140 | } 141 | 142 | // C version of WebRtcSpl_ScaleAndAddVectorsWithRound() for generic platforms. 143 | int WebRtcSpl_ScaleAndAddVectorsWithRoundC(const int16_t* in_vector1, 144 | int16_t in_vector1_scale, 145 | const int16_t* in_vector2, 146 | int16_t in_vector2_scale, 147 | int right_shifts, 148 | int16_t* out_vector, 149 | size_t length) { 150 | size_t i = 0; 151 | int round_value = (1 << right_shifts) >> 1; 152 | 153 | if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL || 154 | length == 0 || right_shifts < 0) { 155 | return -1; 156 | } 157 | 158 | for (i = 0; i < length; i++) { 159 | out_vector[i] = (int16_t)(( 160 | in_vector1[i] * in_vector1_scale + in_vector2[i] * in_vector2_scale + 161 | round_value) >> right_shifts); 162 | } 163 | 164 | return 0; 165 | } 166 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/spl_sqrt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains the function WebRtcSpl_Sqrt(). 
14 | * The description header can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include "webrtc/rtc_base/checks.h" 19 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 20 | 21 | int32_t WebRtcSpl_SqrtLocal(int32_t in); 22 | 23 | int32_t WebRtcSpl_SqrtLocal(int32_t in) 24 | { 25 | 26 | int16_t x_half, t16; 27 | int32_t A, B, x2; 28 | 29 | /* The following block performs: 30 | y=in/2 31 | x=y-2^30 32 | x_half=x/2^31 33 | t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4) 34 | + 0.875*((x_half)^5) 35 | */ 36 | 37 | B = in / 2; 38 | 39 | B = B - ((int32_t)0x40000000); // B = in/2 - 1/2 40 | x_half = (int16_t)(B >> 16); // x_half = x/2 = (in-1)/2 41 | B = B + ((int32_t)0x40000000); // B = 1 + x/2 42 | B = B + ((int32_t)0x40000000); // Add 0.5 twice (since 1.0 does not exist in Q31) 43 | 44 | x2 = ((int32_t)x_half) * ((int32_t)x_half) * 2; // A = (x/2)^2 45 | A = -x2; // A = -(x/2)^2 46 | B = B + (A >> 1); // B = 1 + x/2 - 0.5*(x/2)^2 47 | 48 | A >>= 16; 49 | A = A * A * 2; // A = (x/2)^4 50 | t16 = (int16_t)(A >> 16); 51 | B += -20480 * t16 * 2; // B = B - 0.625*A 52 | // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 53 | 54 | A = x_half * t16 * 2; // A = (x/2)^5 55 | t16 = (int16_t)(A >> 16); 56 | B += 28672 * t16 * 2; // B = B + 0.875*A 57 | // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 + 0.875*(x/2)^5 58 | 59 | t16 = (int16_t)(x2 >> 16); 60 | A = x_half * t16 * 2; // A = x/2^3 61 | 62 | B = B + (A >> 1); // B = B + 0.5*A 63 | // After this, B = 1 + x/2 - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5 64 | 65 | B = B + ((int32_t)32768); // Round off bit 66 | 67 | return B; 68 | } 69 | 70 | int32_t WebRtcSpl_Sqrt(int32_t value) 71 | { 72 | /* 73 | Algorithm: 74 | 75 | Six term Taylor Series is used here to compute the square root of a number 76 | y^0.5 = (1+x)^0.5 where x = y-1 77 | = 1+(x/2)-0.5*((x/2)^2+0.5*((x/2)^3-0.625*((x/2)^4+0.875*((x/2)^5) 78 | 0.5 <= x < 1 79 | 80 | Example of how the algorithm works, with ut=sqrt(in), and 81 | with in=73632 and ut=271 (even shift value case): 82 | 83 | in=73632 84 | y= in/131072 85 | x=y-1 86 | t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5) 87 | ut=t*(1/sqrt(2))*512 88 | 89 | or: 90 | 91 | in=73632 92 | in2=73632*2^14 93 | y= in2/2^31 94 | x=y-1 95 | t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5) 96 | ut=t*(1/sqrt(2)) 97 | ut2=ut*2^9 98 | 99 | which gives: 100 | 101 | in = 73632 102 | in2 = 1206386688 103 | y = 0.56176757812500 104 | x = -0.43823242187500 105 | t = 0.74973506527313 106 | ut = 0.53014274874797 107 | ut2 = 2.714330873589594e+002 108 | 109 | or: 110 | 111 | in=73632 112 | in2=73632*2^14 113 | y=in2/2 114 | x=y-2^30 115 | x_half=x/2^31 116 | t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4) 117 | + 0.875*((x_half)^5) 118 | ut=t*(1/sqrt(2)) 119 | ut2=ut*2^9 120 | 121 | which gives: 122 | 123 | in = 73632 124 | in2 = 1206386688 125 | y = 603193344 126 | x = -470548480 127 | x_half = -0.21911621093750 128 | t = 0.74973506527313 129 | ut = 0.53014274874797 130 | ut2 = 2.714330873589594e+002 131 | 132 | */ 133 | 134 | int16_t x_norm, nshift, t16, sh; 135 | int32_t A; 136 | 137 | int16_t k_sqrt_2 = 23170; // 1/sqrt2 (==5a82) 138 | 139 | A = value; 140 | 141 | // The convention in this function is to calculate sqrt(abs(A)). Negate the 142 | // input if it is negative. 
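// Tracing the worked example above: WebRtcSpl_Sqrt(73632) normalizes the input
// (sh = 14, an even shift), runs the Taylor-series core WebRtcSpl_SqrtLocal(),
// scales by 1/sqrt(2) because the shift count is even, then de-normalizes and
// returns 271 (the true square root is ~271.35).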
143 | if (A < 0) { 144 | if (A == WEBRTC_SPL_WORD32_MIN) { 145 | // This number cannot be held in an int32_t after negating. 146 | // Map it to the maximum positive value. 147 | A = WEBRTC_SPL_WORD32_MAX; 148 | } else { 149 | A = -A; 150 | } 151 | } else if (A == 0) { 152 | return 0; // sqrt(0) = 0 153 | } 154 | 155 | sh = WebRtcSpl_NormW32(A); // # shifts to normalize A 156 | A = WEBRTC_SPL_LSHIFT_W32(A, sh); // Normalize A 157 | if (A < (WEBRTC_SPL_WORD32_MAX - 32767)) 158 | { 159 | A = A + ((int32_t)32768); // Round off bit 160 | } else 161 | { 162 | A = WEBRTC_SPL_WORD32_MAX; 163 | } 164 | 165 | x_norm = (int16_t)(A >> 16); // x_norm = AH 166 | 167 | nshift = (sh / 2); 168 | RTC_DCHECK_GE(nshift, 0); 169 | 170 | A = (int32_t)WEBRTC_SPL_LSHIFT_W32((int32_t)x_norm, 16); 171 | A = WEBRTC_SPL_ABS_W32(A); // A = abs(x_norm<<16) 172 | A = WebRtcSpl_SqrtLocal(A); // A = sqrt(A) 173 | 174 | if (2 * nshift == sh) { 175 | // Even shift value case 176 | 177 | t16 = (int16_t)(A >> 16); // t16 = AH 178 | 179 | A = k_sqrt_2 * t16 * 2; // A = 1/sqrt(2)*t16 180 | A = A + ((int32_t)32768); // Round off 181 | A = A & ((int32_t)0x7fff0000); // Round off 182 | 183 | A >>= 15; // A = A>>16 184 | 185 | } else 186 | { 187 | A >>= 16; // A = A>>16 188 | } 189 | 190 | A = A & ((int32_t)0x0000ffff); 191 | A >>= nshift; // De-normalize the result. 192 | 193 | return A; 194 | } 195 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/include/spl_inl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | // This header file includes the inline functions in 12 | // the fix point signal processing library. 13 | 14 | #ifndef COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_ 15 | #define COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_ 16 | 17 | #include "webrtc/rtc_base/compile_assert_c.h" 18 | 19 | extern const int8_t kWebRtcSpl_CountLeadingZeros32_Table[64]; 20 | 21 | // Don't call this directly except in tests! 22 | static __inline int WebRtcSpl_CountLeadingZeros32_NotBuiltin(uint32_t n) { 23 | // Normalize n by rounding up to the nearest number that is a sequence of 0 24 | // bits followed by a sequence of 1 bits. This number has the same number of 25 | // leading zeros as the original n. There are exactly 33 such values. 26 | n |= n >> 1; 27 | n |= n >> 2; 28 | n |= n >> 4; 29 | n |= n >> 8; 30 | n |= n >> 16; 31 | 32 | // Multiply the modified n with a constant selected (by exhaustive search) 33 | // such that each of the 33 possible values of n give a product whose 6 most 34 | // significant bits are unique. Then look up the answer in the table. 35 | return kWebRtcSpl_CountLeadingZeros32_Table[(n * 0x8c0b2891) >> 26]; 36 | } 37 | 38 | // Don't call this directly except in tests! 39 | static __inline int WebRtcSpl_CountLeadingZeros64_NotBuiltin(uint64_t n) { 40 | const int leading_zeros = n >> 32 == 0 ? 
32 : 0; 41 | return leading_zeros + WebRtcSpl_CountLeadingZeros32_NotBuiltin( 42 | (uint32_t)(n >> (32 - leading_zeros))); 43 | } 44 | 45 | // Returns the number of leading zero bits in the argument. 46 | static __inline int WebRtcSpl_CountLeadingZeros32(uint32_t n) { 47 | #ifdef __GNUC__ 48 | RTC_COMPILE_ASSERT(sizeof(unsigned int) == sizeof(uint32_t)); 49 | return n == 0 ? 32 : __builtin_clz(n); 50 | #else 51 | return WebRtcSpl_CountLeadingZeros32_NotBuiltin(n); 52 | #endif 53 | } 54 | 55 | // Returns the number of leading zero bits in the argument. 56 | static __inline int WebRtcSpl_CountLeadingZeros64(uint64_t n) { 57 | #ifdef __GNUC__ 58 | RTC_COMPILE_ASSERT(sizeof(unsigned long long) == sizeof(uint64_t)); // NOLINT 59 | return n == 0 ? 64 : __builtin_clzll(n); 60 | #else 61 | return WebRtcSpl_CountLeadingZeros64_NotBuiltin(n); 62 | #endif 63 | } 64 | 65 | #ifdef WEBRTC_ARCH_ARM_V7 66 | #include "webrtc/common_audio/signal_processing/include/spl_inl_armv7.h" 67 | #else 68 | 69 | #if defined(MIPS32_LE) 70 | #include "webrtc/common_audio/signal_processing/include/spl_inl_mips.h" 71 | #endif 72 | 73 | #if !defined(MIPS_DSP_R1_LE) 74 | static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) { 75 | int16_t out16 = (int16_t)value32; 76 | 77 | if (value32 > 32767) 78 | out16 = 32767; 79 | else if (value32 < -32768) 80 | out16 = -32768; 81 | 82 | return out16; 83 | } 84 | 85 | static __inline int32_t WebRtcSpl_AddSatW32(int32_t a, int32_t b) { 86 | // Do the addition in unsigned numbers, since signed overflow is undefined 87 | // behavior. 88 | const int32_t sum = (int32_t)((uint32_t)a + (uint32_t)b); 89 | 90 | // a + b can't overflow if a and b have different signs. If they have the 91 | // same sign, a + b also has the same sign iff it didn't overflow. 92 | if ((a < 0) == (b < 0) && (a < 0) != (sum < 0)) { 93 | // The direction of the overflow is obvious from the sign of a + b. 94 | return sum < 0 ? INT32_MAX : INT32_MIN; 95 | } 96 | return sum; 97 | } 98 | 99 | static __inline int32_t WebRtcSpl_SubSatW32(int32_t a, int32_t b) { 100 | // Do the subtraction in unsigned numbers, since signed overflow is undefined 101 | // behavior. 102 | const int32_t diff = (int32_t)((uint32_t)a - (uint32_t)b); 103 | 104 | // a - b can't overflow if a and b have the same sign. If they have different 105 | // signs, a - b has the same sign as a iff it didn't overflow. 106 | if ((a < 0) != (b < 0) && (a < 0) != (diff < 0)) { 107 | // The direction of the overflow is obvious from the sign of a - b. 108 | return diff < 0 ? INT32_MAX : INT32_MIN; 109 | } 110 | return diff; 111 | } 112 | 113 | static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) { 114 | return WebRtcSpl_SatW32ToW16((int32_t)a + (int32_t)b); 115 | } 116 | 117 | static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) { 118 | return WebRtcSpl_SatW32ToW16((int32_t)var1 - (int32_t)var2); 119 | } 120 | #endif // #if !defined(MIPS_DSP_R1_LE) 121 | 122 | #if !defined(MIPS32_LE) 123 | static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) { 124 | return 32 - WebRtcSpl_CountLeadingZeros32(n); 125 | } 126 | 127 | // Return the number of steps a can be left-shifted without overflow, 128 | // or 0 if a == 0. 129 | static __inline int16_t WebRtcSpl_NormW32(int32_t a) { 130 | return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a < 0 ? ~a : a) - 1; 131 | } 132 | 133 | // Return the number of steps a can be left-shifted without overflow, 134 | // or 0 if a == 0. 
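// For example, WebRtcSpl_NormU32(1) == 31 and WebRtcSpl_NormU32(0x80000000u) == 0:
// for unsigned input the shift headroom is exactly the number of leading zero bits.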
135 | static __inline int16_t WebRtcSpl_NormU32(uint32_t a) { 136 | return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a); 137 | } 138 | 139 | // Return the number of steps a can be left-shifted without overflow, 140 | // or 0 if a == 0. 141 | static __inline int16_t WebRtcSpl_NormW16(int16_t a) { 142 | const int32_t a32 = a; 143 | return a == 0 ? 0 : WebRtcSpl_CountLeadingZeros32(a < 0 ? ~a32 : a32) - 17; 144 | } 145 | 146 | static __inline int32_t WebRtc_MulAccumW16(int16_t a, int16_t b, int32_t c) { 147 | return (a * b + c); 148 | } 149 | #endif // #if !defined(MIPS32_LE) 150 | 151 | #endif // WEBRTC_ARCH_ARM_V7 152 | 153 | #endif // COMMON_AUDIO_SIGNAL_PROCESSING_INCLUDE_SPL_INL_H_ 154 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/min_max_operations.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | /* 12 | * This file contains the implementation of functions 13 | * WebRtcSpl_MaxAbsValueW16C() 14 | * WebRtcSpl_MaxAbsValueW32C() 15 | * WebRtcSpl_MaxValueW16C() 16 | * WebRtcSpl_MaxValueW32C() 17 | * WebRtcSpl_MinValueW16C() 18 | * WebRtcSpl_MinValueW32C() 19 | * WebRtcSpl_MaxAbsIndexW16() 20 | * WebRtcSpl_MaxIndexW16() 21 | * WebRtcSpl_MaxIndexW32() 22 | * WebRtcSpl_MinIndexW16() 23 | * WebRtcSpl_MinIndexW32() 24 | * 25 | */ 26 | 27 | #include 28 | 29 | #include "webrtc/rtc_base/checks.h" 30 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 31 | 32 | // TODO(bjorn/kma): Consolidate function pairs (e.g. combine 33 | // WebRtcSpl_MaxAbsValueW16C and WebRtcSpl_MaxAbsIndexW16 into a single one.) 34 | // TODO(kma): Move the next six functions into min_max_operations_c.c. 35 | 36 | // Maximum absolute value of word16 vector. C version for generic platforms. 37 | int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, size_t length) { 38 | size_t i = 0; 39 | int absolute = 0, maximum = 0; 40 | 41 | RTC_DCHECK_GT(length, 0); 42 | 43 | for (i = 0; i < length; i++) { 44 | absolute = abs((int)vector[i]); 45 | 46 | if (absolute > maximum) { 47 | maximum = absolute; 48 | } 49 | } 50 | 51 | // Guard the case for abs(-32768). 52 | if (maximum > WEBRTC_SPL_WORD16_MAX) { 53 | maximum = WEBRTC_SPL_WORD16_MAX; 54 | } 55 | 56 | return (int16_t)maximum; 57 | } 58 | 59 | // Maximum absolute value of word32 vector. C version for generic platforms. 60 | int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, size_t length) { 61 | // Use uint32_t for the local variables, to accommodate the return value 62 | // of abs(0x80000000), which is 0x80000000. 63 | 64 | uint32_t absolute = 0, maximum = 0; 65 | size_t i = 0; 66 | 67 | RTC_DCHECK_GT(length, 0); 68 | 69 | for (i = 0; i < length; i++) { 70 | absolute = abs((int)vector[i]); 71 | if (absolute > maximum) { 72 | maximum = absolute; 73 | } 74 | } 75 | 76 | maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX); 77 | 78 | return (int32_t)maximum; 79 | } 80 | 81 | // Maximum value of word16 vector. C version for generic platforms. 
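// Note that, unlike WebRtcSpl_MaxAbsValueW16C() above, this returns the largest
// signed element itself: for {-9, -5, -3} the maximum value is -3, while the
// maximum absolute value is 9.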
82 | int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, size_t length) { 83 | int16_t maximum = WEBRTC_SPL_WORD16_MIN; 84 | size_t i = 0; 85 | 86 | RTC_DCHECK_GT(length, 0); 87 | 88 | for (i = 0; i < length; i++) { 89 | if (vector[i] > maximum) 90 | maximum = vector[i]; 91 | } 92 | return maximum; 93 | } 94 | 95 | // Maximum value of word32 vector. C version for generic platforms. 96 | int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, size_t length) { 97 | int32_t maximum = WEBRTC_SPL_WORD32_MIN; 98 | size_t i = 0; 99 | 100 | RTC_DCHECK_GT(length, 0); 101 | 102 | for (i = 0; i < length; i++) { 103 | if (vector[i] > maximum) 104 | maximum = vector[i]; 105 | } 106 | return maximum; 107 | } 108 | 109 | // Minimum value of word16 vector. C version for generic platforms. 110 | int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, size_t length) { 111 | int16_t minimum = WEBRTC_SPL_WORD16_MAX; 112 | size_t i = 0; 113 | 114 | RTC_DCHECK_GT(length, 0); 115 | 116 | for (i = 0; i < length; i++) { 117 | if (vector[i] < minimum) 118 | minimum = vector[i]; 119 | } 120 | return minimum; 121 | } 122 | 123 | // Minimum value of word32 vector. C version for generic platforms. 124 | int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, size_t length) { 125 | int32_t minimum = WEBRTC_SPL_WORD32_MAX; 126 | size_t i = 0; 127 | 128 | RTC_DCHECK_GT(length, 0); 129 | 130 | for (i = 0; i < length; i++) { 131 | if (vector[i] < minimum) 132 | minimum = vector[i]; 133 | } 134 | return minimum; 135 | } 136 | 137 | // Index of maximum absolute value in a word16 vector. 138 | size_t WebRtcSpl_MaxAbsIndexW16(const int16_t* vector, size_t length) { 139 | // Use type int for local variables, to accomodate the value of abs(-32768). 140 | 141 | size_t i = 0, index = 0; 142 | int absolute = 0, maximum = 0; 143 | 144 | RTC_DCHECK_GT(length, 0); 145 | 146 | for (i = 0; i < length; i++) { 147 | absolute = abs((int)vector[i]); 148 | 149 | if (absolute > maximum) { 150 | maximum = absolute; 151 | index = i; 152 | } 153 | } 154 | 155 | return index; 156 | } 157 | 158 | // Index of maximum value in a word16 vector. 159 | size_t WebRtcSpl_MaxIndexW16(const int16_t* vector, size_t length) { 160 | size_t i = 0, index = 0; 161 | int16_t maximum = WEBRTC_SPL_WORD16_MIN; 162 | 163 | RTC_DCHECK_GT(length, 0); 164 | 165 | for (i = 0; i < length; i++) { 166 | if (vector[i] > maximum) { 167 | maximum = vector[i]; 168 | index = i; 169 | } 170 | } 171 | 172 | return index; 173 | } 174 | 175 | // Index of maximum value in a word32 vector. 176 | size_t WebRtcSpl_MaxIndexW32(const int32_t* vector, size_t length) { 177 | size_t i = 0, index = 0; 178 | int32_t maximum = WEBRTC_SPL_WORD32_MIN; 179 | 180 | RTC_DCHECK_GT(length, 0); 181 | 182 | for (i = 0; i < length; i++) { 183 | if (vector[i] > maximum) { 184 | maximum = vector[i]; 185 | index = i; 186 | } 187 | } 188 | 189 | return index; 190 | } 191 | 192 | // Index of minimum value in a word16 vector. 193 | size_t WebRtcSpl_MinIndexW16(const int16_t* vector, size_t length) { 194 | size_t i = 0, index = 0; 195 | int16_t minimum = WEBRTC_SPL_WORD16_MAX; 196 | 197 | RTC_DCHECK_GT(length, 0); 198 | 199 | for (i = 0; i < length; i++) { 200 | if (vector[i] < minimum) { 201 | minimum = vector[i]; 202 | index = i; 203 | } 204 | } 205 | 206 | return index; 207 | } 208 | 209 | // Index of minimum value in a word32 vector. 
210 | size_t WebRtcSpl_MinIndexW32(const int32_t* vector, size_t length) { 211 | size_t i = 0, index = 0; 212 | int32_t minimum = WEBRTC_SPL_WORD32_MAX; 213 | 214 | RTC_DCHECK_GT(length, 0); 215 | 216 | for (i = 0; i < length; i++) { 217 | if (vector[i] < minimum) { 218 | minimum = vector[i]; 219 | index = i; 220 | } 221 | } 222 | 223 | return index; 224 | } 225 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/vad/vad_sp.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | #include "webrtc/common_audio/vad/vad_sp.h" 12 | 13 | #include "webrtc/rtc_base/checks.h" 14 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 15 | #include "webrtc/common_audio/vad/vad_core.h" 16 | 17 | // Allpass filter coefficients, upper and lower, in Q13. 18 | // Upper: 0.64, Lower: 0.17. 19 | static const int16_t kAllPassCoefsQ13[2] = { 5243, 1392 }; // Q13. 20 | static const int16_t kSmoothingDown = 6553; // 0.2 in Q15. 21 | static const int16_t kSmoothingUp = 32439; // 0.99 in Q15. 22 | 23 | // TODO(bjornv): Move this function to vad_filterbank.c. 24 | // Downsampling filter based on splitting filter and allpass functions. 25 | void WebRtcVad_Downsampling(const int16_t* signal_in, 26 | int16_t* signal_out, 27 | int32_t* filter_state, 28 | size_t in_length) { 29 | int16_t tmp16_1 = 0, tmp16_2 = 0; 30 | int32_t tmp32_1 = filter_state[0]; 31 | int32_t tmp32_2 = filter_state[1]; 32 | size_t n = 0; 33 | // Downsampling by 2 gives half length. 34 | size_t half_length = (in_length >> 1); 35 | 36 | // Filter coefficients in Q13, filter state in Q0. 37 | for (n = 0; n < half_length; n++) { 38 | // All-pass filtering upper branch. 39 | tmp16_1 = (int16_t) ((tmp32_1 >> 1) + 40 | ((kAllPassCoefsQ13[0] * *signal_in) >> 14)); 41 | *signal_out = tmp16_1; 42 | tmp32_1 = (int32_t)(*signal_in++) - ((kAllPassCoefsQ13[0] * tmp16_1) >> 12); 43 | 44 | // All-pass filtering lower branch. 45 | tmp16_2 = (int16_t) ((tmp32_2 >> 1) + 46 | ((kAllPassCoefsQ13[1] * *signal_in) >> 14)); 47 | *signal_out++ += tmp16_2; 48 | tmp32_2 = (int32_t)(*signal_in++) - ((kAllPassCoefsQ13[1] * tmp16_2) >> 12); 49 | } 50 | // Store the filter states. 51 | filter_state[0] = tmp32_1; 52 | filter_state[1] = tmp32_2; 53 | } 54 | 55 | // Inserts |feature_value| into |low_value_vector|, if it is one of the 16 56 | // smallest values the last 100 frames. Then calculates and returns the median 57 | // of the five smallest values. 58 | int16_t WebRtcVad_FindMinimum(VadInstT* self, 59 | int16_t feature_value, 60 | int channel) { 61 | int i = 0, j = 0; 62 | int position = -1; 63 | // Offset to beginning of the 16 minimum values in memory. 64 | const int offset = (channel << 4); 65 | int16_t current_median = 1600; 66 | int16_t alpha = 0; 67 | int32_t tmp32 = 0; 68 | // Pointer to memory for the 16 minimum values and the age of each value of 69 | // the |channel|. 
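// Each channel owns a contiguous block of 16 slots, so |offset| == channel * 16
// indexes that channel's block in both arrays: |index_vector| holds the age (in
// frames) of each stored value and |low_value_vector| holds the 16 smallest
// feature values seen over roughly the last 100 frames.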
70 | int16_t* age = &self->index_vector[offset]; 71 | int16_t* smallest_values = &self->low_value_vector[offset]; 72 | 73 | RTC_DCHECK_LT(channel, kNumChannels); 74 | 75 | // Each value in |smallest_values| is getting 1 loop older. Update |age|, and 76 | // remove old values. 77 | for (i = 0; i < 16; i++) { 78 | if (age[i] != 100) { 79 | age[i]++; 80 | } else { 81 | // Too old value. Remove from memory and shift larger values downwards. 82 | for (j = i; j < 16; j++) { 83 | smallest_values[j] = smallest_values[j + 1]; 84 | age[j] = age[j + 1]; 85 | } 86 | age[15] = 101; 87 | smallest_values[15] = 10000; 88 | } 89 | } 90 | 91 | // Check if |feature_value| is smaller than any of the values in 92 | // |smallest_values|. If so, find the |position| where to insert the new value 93 | // (|feature_value|). 94 | if (feature_value < smallest_values[7]) { 95 | if (feature_value < smallest_values[3]) { 96 | if (feature_value < smallest_values[1]) { 97 | if (feature_value < smallest_values[0]) { 98 | position = 0; 99 | } else { 100 | position = 1; 101 | } 102 | } else if (feature_value < smallest_values[2]) { 103 | position = 2; 104 | } else { 105 | position = 3; 106 | } 107 | } else if (feature_value < smallest_values[5]) { 108 | if (feature_value < smallest_values[4]) { 109 | position = 4; 110 | } else { 111 | position = 5; 112 | } 113 | } else if (feature_value < smallest_values[6]) { 114 | position = 6; 115 | } else { 116 | position = 7; 117 | } 118 | } else if (feature_value < smallest_values[15]) { 119 | if (feature_value < smallest_values[11]) { 120 | if (feature_value < smallest_values[9]) { 121 | if (feature_value < smallest_values[8]) { 122 | position = 8; 123 | } else { 124 | position = 9; 125 | } 126 | } else if (feature_value < smallest_values[10]) { 127 | position = 10; 128 | } else { 129 | position = 11; 130 | } 131 | } else if (feature_value < smallest_values[13]) { 132 | if (feature_value < smallest_values[12]) { 133 | position = 12; 134 | } else { 135 | position = 13; 136 | } 137 | } else if (feature_value < smallest_values[14]) { 138 | position = 14; 139 | } else { 140 | position = 15; 141 | } 142 | } 143 | 144 | // If we have detected a new small value, insert it at the correct position 145 | // and shift larger values up. 146 | if (position > -1) { 147 | for (i = 15; i > position; i--) { 148 | smallest_values[i] = smallest_values[i - 1]; 149 | age[i] = age[i - 1]; 150 | } 151 | smallest_values[position] = feature_value; 152 | age[position] = 1; 153 | } 154 | 155 | // Get |current_median|. 156 | if (self->frame_counter > 2) { 157 | current_median = smallest_values[2]; 158 | } else if (self->frame_counter > 0) { 159 | current_median = smallest_values[0]; 160 | } 161 | 162 | // Smooth the median value. 163 | if (self->frame_counter > 0) { 164 | if (current_median < self->mean_value[channel]) { 165 | alpha = kSmoothingDown; // 0.2 in Q15. 166 | } else { 167 | alpha = kSmoothingUp; // 0.99 in Q15. 168 | } 169 | } 170 | tmp32 = (alpha + 1) * self->mean_value[channel]; 171 | tmp32 += (WEBRTC_SPL_WORD16_MAX - alpha) * current_median; 172 | tmp32 += 16384; 173 | self->mean_value[channel] = (int16_t) (tmp32 >> 15); 174 | 175 | return self->mean_value[channel]; 176 | } 177 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/resample_48khz.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 
3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains resampling functions between 48 kHz and nb/wb. 14 | * The description header can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include 19 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 20 | #include "webrtc/common_audio/signal_processing/resample_by_2_internal.h" 21 | 22 | //////////////////////////// 23 | ///// 48 kHz -> 16 kHz ///// 24 | //////////////////////////// 25 | 26 | // 48 -> 16 resampler 27 | void WebRtcSpl_Resample48khzTo16khz(const int16_t* in, int16_t* out, 28 | WebRtcSpl_State48khzTo16khz* state, int32_t* tmpmem) 29 | { 30 | ///// 48 --> 48(LP) ///// 31 | // int16_t in[480] 32 | // int32_t out[480] 33 | ///// 34 | WebRtcSpl_LPBy2ShortToInt(in, 480, tmpmem + 16, state->S_48_48); 35 | 36 | ///// 48 --> 32 ///// 37 | // int32_t in[480] 38 | // int32_t out[320] 39 | ///// 40 | // copy state to and from input array 41 | memcpy(tmpmem + 8, state->S_48_32, 8 * sizeof(int32_t)); 42 | memcpy(state->S_48_32, tmpmem + 488, 8 * sizeof(int32_t)); 43 | WebRtcSpl_Resample48khzTo32khz(tmpmem + 8, tmpmem, 160); 44 | 45 | ///// 32 --> 16 ///// 46 | // int32_t in[320] 47 | // int16_t out[160] 48 | ///// 49 | WebRtcSpl_DownBy2IntToShort(tmpmem, 320, out, state->S_32_16); 50 | } 51 | 52 | // initialize state of 48 -> 16 resampler 53 | void WebRtcSpl_ResetResample48khzTo16khz(WebRtcSpl_State48khzTo16khz* state) 54 | { 55 | memset(state->S_48_48, 0, 16 * sizeof(int32_t)); 56 | memset(state->S_48_32, 0, 8 * sizeof(int32_t)); 57 | memset(state->S_32_16, 0, 8 * sizeof(int32_t)); 58 | } 59 | 60 | //////////////////////////// 61 | ///// 16 kHz -> 48 kHz ///// 62 | //////////////////////////// 63 | 64 | // 16 -> 48 resampler 65 | void WebRtcSpl_Resample16khzTo48khz(const int16_t* in, int16_t* out, 66 | WebRtcSpl_State16khzTo48khz* state, int32_t* tmpmem) 67 | { 68 | ///// 16 --> 32 ///// 69 | // int16_t in[160] 70 | // int32_t out[320] 71 | ///// 72 | WebRtcSpl_UpBy2ShortToInt(in, 160, tmpmem + 16, state->S_16_32); 73 | 74 | ///// 32 --> 24 ///// 75 | // int32_t in[320] 76 | // int32_t out[240] 77 | // copy state to and from input array 78 | ///// 79 | memcpy(tmpmem + 8, state->S_32_24, 8 * sizeof(int32_t)); 80 | memcpy(state->S_32_24, tmpmem + 328, 8 * sizeof(int32_t)); 81 | WebRtcSpl_Resample32khzTo24khz(tmpmem + 8, tmpmem, 80); 82 | 83 | ///// 24 --> 48 ///// 84 | // int32_t in[240] 85 | // int16_t out[480] 86 | ///// 87 | WebRtcSpl_UpBy2IntToShort(tmpmem, 240, out, state->S_24_48); 88 | } 89 | 90 | // initialize state of 16 -> 48 resampler 91 | void WebRtcSpl_ResetResample16khzTo48khz(WebRtcSpl_State16khzTo48khz* state) 92 | { 93 | memset(state->S_16_32, 0, 8 * sizeof(int32_t)); 94 | memset(state->S_32_24, 0, 8 * sizeof(int32_t)); 95 | memset(state->S_24_48, 0, 8 * sizeof(int32_t)); 96 | } 97 | 98 | //////////////////////////// 99 | ///// 48 kHz -> 8 kHz ///// 100 | //////////////////////////// 101 | 102 | // 48 -> 8 resampler 103 | void WebRtcSpl_Resample48khzTo8khz(const int16_t* in, int16_t* out, 104 | WebRtcSpl_State48khzTo8khz* state, int32_t* tmpmem) 105 | { 106 | ///// 48 --> 24 ///// 107 | // int16_t in[480] 108 | // int32_t out[240] 109 | 
///// 110 | WebRtcSpl_DownBy2ShortToInt(in, 480, tmpmem + 256, state->S_48_24); 111 | 112 | ///// 24 --> 24(LP) ///// 113 | // int32_t in[240] 114 | // int32_t out[240] 115 | ///// 116 | WebRtcSpl_LPBy2IntToInt(tmpmem + 256, 240, tmpmem + 16, state->S_24_24); 117 | 118 | ///// 24 --> 16 ///// 119 | // int32_t in[240] 120 | // int32_t out[160] 121 | ///// 122 | // copy state to and from input array 123 | memcpy(tmpmem + 8, state->S_24_16, 8 * sizeof(int32_t)); 124 | memcpy(state->S_24_16, tmpmem + 248, 8 * sizeof(int32_t)); 125 | WebRtcSpl_Resample48khzTo32khz(tmpmem + 8, tmpmem, 80); 126 | 127 | ///// 16 --> 8 ///// 128 | // int32_t in[160] 129 | // int16_t out[80] 130 | ///// 131 | WebRtcSpl_DownBy2IntToShort(tmpmem, 160, out, state->S_16_8); 132 | } 133 | 134 | // initialize state of 48 -> 8 resampler 135 | void WebRtcSpl_ResetResample48khzTo8khz(WebRtcSpl_State48khzTo8khz* state) 136 | { 137 | memset(state->S_48_24, 0, 8 * sizeof(int32_t)); 138 | memset(state->S_24_24, 0, 16 * sizeof(int32_t)); 139 | memset(state->S_24_16, 0, 8 * sizeof(int32_t)); 140 | memset(state->S_16_8, 0, 8 * sizeof(int32_t)); 141 | } 142 | 143 | //////////////////////////// 144 | ///// 8 kHz -> 48 kHz ///// 145 | //////////////////////////// 146 | 147 | // 8 -> 48 resampler 148 | void WebRtcSpl_Resample8khzTo48khz(const int16_t* in, int16_t* out, 149 | WebRtcSpl_State8khzTo48khz* state, int32_t* tmpmem) 150 | { 151 | ///// 8 --> 16 ///// 152 | // int16_t in[80] 153 | // int32_t out[160] 154 | ///// 155 | WebRtcSpl_UpBy2ShortToInt(in, 80, tmpmem + 264, state->S_8_16); 156 | 157 | ///// 16 --> 12 ///// 158 | // int32_t in[160] 159 | // int32_t out[120] 160 | ///// 161 | // copy state to and from input array 162 | memcpy(tmpmem + 256, state->S_16_12, 8 * sizeof(int32_t)); 163 | memcpy(state->S_16_12, tmpmem + 416, 8 * sizeof(int32_t)); 164 | WebRtcSpl_Resample32khzTo24khz(tmpmem + 256, tmpmem + 240, 40); 165 | 166 | ///// 12 --> 24 ///// 167 | // int32_t in[120] 168 | // int16_t out[240] 169 | ///// 170 | WebRtcSpl_UpBy2IntToInt(tmpmem + 240, 120, tmpmem, state->S_12_24); 171 | 172 | ///// 24 --> 48 ///// 173 | // int32_t in[240] 174 | // int16_t out[480] 175 | ///// 176 | WebRtcSpl_UpBy2IntToShort(tmpmem, 240, out, state->S_24_48); 177 | } 178 | 179 | // initialize state of 8 -> 48 resampler 180 | void WebRtcSpl_ResetResample8khzTo48khz(WebRtcSpl_State8khzTo48khz* state) 181 | { 182 | memset(state->S_8_16, 0, 8 * sizeof(int32_t)); 183 | memset(state->S_16_12, 0, 8 * sizeof(int32_t)); 184 | memset(state->S_12_24, 0, 8 * sizeof(int32_t)); 185 | memset(state->S_24_48, 0, 8 * sizeof(int32_t)); 186 | } 187 | -------------------------------------------------------------------------------- /binding.gyp: -------------------------------------------------------------------------------- 1 | { 2 | "targets": [ 3 | { 4 | "target_name": "speechrecorder", 5 | "sources": ["src/speech_recorder.cpp"], 6 | "cflags!": [ 7 | "-fno-exceptions", 8 | "-fno-rtti", 9 | ], 10 | "cflags_cc!": [ 11 | "-fno-exceptions", 12 | "-fno-rtti", 13 | ], 14 | "include_dirs": [ 15 | " 18 | // rtc::SafeGe // >= 19 | // 20 | // They each accept two arguments of arbitrary types, and in almost all cases, 21 | // they simply call the appropriate comparison operator. However, if both 22 | // arguments are integers, they don't compare them using C++'s quirky rules, 23 | // but instead adhere to the true mathematical definitions. 
It is as if the 24 | // arguments were first converted to infinite-range signed integers, and then 25 | // compared, although of course nothing expensive like that actually takes 26 | // place. In practice, for signed/signed and unsigned/unsigned comparisons and 27 | // some mixed-signed comparisons with a compile-time constant, the overhead is 28 | // zero; in the remaining cases, it is just a few machine instructions (no 29 | // branches). 30 | 31 | #ifndef RTC_BASE_NUMERICS_SAFE_COMPARE_H_ 32 | #define RTC_BASE_NUMERICS_SAFE_COMPARE_H_ 33 | 34 | #include 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | #include "webrtc/rtc_base/type_traits.h" 41 | 42 | namespace rtc { 43 | 44 | namespace safe_cmp_impl { 45 | 46 | template 47 | struct LargerIntImpl : std::false_type {}; 48 | template <> 49 | struct LargerIntImpl : std::true_type { 50 | using type = int16_t; 51 | }; 52 | template <> 53 | struct LargerIntImpl : std::true_type { 54 | using type = int32_t; 55 | }; 56 | template <> 57 | struct LargerIntImpl : std::true_type { 58 | using type = int64_t; 59 | }; 60 | 61 | // LargerInt::value is true iff there's a signed type that's larger 62 | // than T1 (and no larger than the larger of T2 and int*, for performance 63 | // reasons); and if there is such a type, LargerInt::type is an alias 64 | // for it. 65 | template 66 | struct LargerInt 67 | : LargerIntImpl {}; 70 | 71 | template 72 | constexpr typename std::make_unsigned::type MakeUnsigned(T a) { 73 | return static_cast::type>(a); 74 | } 75 | 76 | // Overload for when both T1 and T2 have the same signedness. 77 | template ::value == 81 | std::is_signed::value>::type* = nullptr> 82 | constexpr bool Cmp(T1 a, T2 b) { 83 | return Op::Op(a, b); 84 | } 85 | 86 | // Overload for signed - unsigned comparison that can be promoted to a bigger 87 | // signed type. 88 | template ::value && 92 | std::is_unsigned::value && 93 | LargerInt::value>::type* = nullptr> 94 | constexpr bool Cmp(T1 a, T2 b) { 95 | return Op::Op(a, static_cast::type>(b)); 96 | } 97 | 98 | // Overload for unsigned - signed comparison that can be promoted to a bigger 99 | // signed type. 100 | template ::value && 104 | std::is_signed::value && 105 | LargerInt::value>::type* = nullptr> 106 | constexpr bool Cmp(T1 a, T2 b) { 107 | return Op::Op(static_cast::type>(a), b); 108 | } 109 | 110 | // Overload for signed - unsigned comparison that can't be promoted to a bigger 111 | // signed type. 112 | template ::value && 116 | std::is_unsigned::value && 117 | !LargerInt::value>::type* = nullptr> 118 | constexpr bool Cmp(T1 a, T2 b) { 119 | return a < 0 ? Op::Op(-1, 0) : Op::Op(safe_cmp_impl::MakeUnsigned(a), b); 120 | } 121 | 122 | // Overload for unsigned - signed comparison that can't be promoted to a bigger 123 | // signed type. 124 | template ::value && 128 | std::is_signed::value && 129 | !LargerInt::value>::type* = nullptr> 130 | constexpr bool Cmp(T1 a, T2 b) { 131 | return b < 0 ? 
Op::Op(0, -1) : Op::Op(a, safe_cmp_impl::MakeUnsigned(b)); 132 | } 133 | 134 | #define RTC_SAFECMP_MAKE_OP(name, op) \ 135 | struct name { \ 136 | template \ 137 | static constexpr bool Op(T1 a, T2 b) { \ 138 | return a op b; \ 139 | } \ 140 | }; 141 | RTC_SAFECMP_MAKE_OP(EqOp, ==) 142 | RTC_SAFECMP_MAKE_OP(NeOp, !=) 143 | RTC_SAFECMP_MAKE_OP(LtOp, <) 144 | RTC_SAFECMP_MAKE_OP(LeOp, <=) 145 | RTC_SAFECMP_MAKE_OP(GtOp, >) 146 | RTC_SAFECMP_MAKE_OP(GeOp, >=) 147 | #undef RTC_SAFECMP_MAKE_OP 148 | 149 | } // namespace safe_cmp_impl 150 | 151 | #define RTC_SAFECMP_MAKE_FUN(name) \ 152 | template \ 153 | constexpr \ 154 | typename std::enable_if::value && IsIntlike::value, \ 155 | bool>::type Safe##name(T1 a, T2 b) { \ 156 | /* Unary plus here turns enums into real integral types. */ \ 157 | return safe_cmp_impl::Cmp(+a, +b); \ 158 | } \ 159 | template \ 160 | constexpr \ 161 | typename std::enable_if::value || !IsIntlike::value, \ 162 | bool>::type Safe##name(const T1& a, \ 163 | const T2& b) { \ 164 | return safe_cmp_impl::name##Op::Op(a, b); \ 165 | } 166 | RTC_SAFECMP_MAKE_FUN(Eq) 167 | RTC_SAFECMP_MAKE_FUN(Ne) 168 | RTC_SAFECMP_MAKE_FUN(Lt) 169 | RTC_SAFECMP_MAKE_FUN(Le) 170 | RTC_SAFECMP_MAKE_FUN(Gt) 171 | RTC_SAFECMP_MAKE_FUN(Ge) 172 | #undef RTC_SAFECMP_MAKE_FUN 173 | 174 | } // namespace rtc 175 | 176 | #endif // RTC_BASE_NUMERICS_SAFE_COMPARE_H_ 177 | -------------------------------------------------------------------------------- /lib/src/chunk_processor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "chunk_processor.h" 9 | 10 | namespace speechrecorder { 11 | 12 | static std::mutex ortMutex_; 13 | static std::unique_ptr ortEnv_; 14 | static std::unique_ptr ortMemory_; 15 | static std::unique_ptr ortSession_; 16 | 17 | ChunkProcessor::ChunkProcessor(std::string modelPath, 18 | ChunkProcessorOptions options) 19 | : options_(options), 20 | queue_(), 21 | stopped_(false), 22 | microphone_(options.device, options.samplesPerFrame, options.sampleRate, 23 | &queue_), 24 | webrtcVad_(options.webrtcVadLevel, options.sampleRate) { 25 | queueThread_ = std::thread([&, modelPath] { 26 | ortMutex_.lock(); 27 | if (!ortSession_) { 28 | ortEnv_ = std::make_unique(ORT_LOGGING_LEVEL_WARNING, 29 | "SpeechRecorder::ChunkProcessor"); 30 | ortMemory_ = std::make_unique(Ort::MemoryInfo::CreateCpu( 31 | OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault)); 32 | 33 | Ort::SessionOptions sessionOptions; 34 | sessionOptions.SetIntraOpNumThreads(1); 35 | #ifdef _WIN32 36 | std::wstring wstring(modelPath.begin(), modelPath.end()); 37 | ortSession_ = std::make_unique(*ortEnv_, wstring.c_str(), 38 | sessionOptions); 39 | 40 | #else 41 | ortSession_ = std::make_unique(*ortEnv_, modelPath.c_str(), 42 | sessionOptions); 43 | #endif 44 | } 45 | ortMutex_.unlock(); 46 | while (true) { 47 | short* audio; 48 | queue_.wait_dequeue(audio); 49 | // null pointer means the destructor wants us to stop the thread. 50 | if (audio == nullptr) { 51 | return; 52 | } 53 | if (!stopped_) { 54 | Process(audio); 55 | } 56 | } 57 | }); 58 | } 59 | 60 | ChunkProcessor::~ChunkProcessor() { 61 | // shutdown the queue thread. 
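// Enqueueing a null pointer wakes the worker thread's blocking wait_dequeue()
// call; the worker treats nullptr as a stop sentinel and returns, so the join()
// below cannot block forever even when no audio is being captured.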
62 | stopped_ = true; 63 | queue_.enqueue(nullptr); 64 | queueThread_.join(); 65 | 66 | if (stopThread_.joinable()) { 67 | stopThread_.join(); 68 | } 69 | if (startThread_.joinable()) { 70 | startThread_.join(); 71 | } 72 | } 73 | 74 | void ChunkProcessor::Process(short* input) { 75 | std::vector frame; 76 | const short* iterator = (const short*)input; 77 | unsigned long long sum = 0; 78 | for (unsigned long i = 0; i < options_.samplesPerFrame; i++) { 79 | const short value = *iterator++; 80 | frame.push_back(value); 81 | leadingBuffer_.push_back(value); 82 | sileroBuffer_.push_back((float)value / (float)SHRT_MAX); 83 | webrtcVadBuffer_.push_back(value); 84 | sum += value * value; 85 | } 86 | 87 | double volume = sqrt((double)sum / (double)options_.samplesPerFrame); 88 | if (leadingBuffer_.size() > 89 | options_.leadingBufferFrames * options_.samplesPerFrame) { 90 | leadingBuffer_.erase( 91 | leadingBuffer_.begin(), 92 | leadingBuffer_.begin() + 93 | (leadingBuffer_.size() - 94 | (options_.leadingBufferFrames * options_.samplesPerFrame))); 95 | } 96 | 97 | if (sileroBuffer_.size() > options_.sileroVadBufferSize) { 98 | sileroBuffer_.erase(sileroBuffer_.begin(), 99 | sileroBuffer_.begin() + (sileroBuffer_.size() - 100 | options_.sileroVadBufferSize)); 101 | } 102 | 103 | // typically, the number of samples per frame will be larger than the 104 | // webrtcvad buffer size, so continually append the new audio to the end of 105 | // the buffer, and process the buffer from left to right until it's too small 106 | // for a webrtcvad call 107 | while (webrtcVadBuffer_.size() >= options_.webrtcVadBufferSize) { 108 | std::vector buffer( 109 | webrtcVadBuffer_.begin(), 110 | webrtcVadBuffer_.begin() + options_.webrtcVadBufferSize); 111 | webrtcVadResults_.push_back( 112 | webrtcVad_.Process(buffer.data(), options_.webrtcVadBufferSize)); 113 | webrtcVadBuffer_.erase( 114 | webrtcVadBuffer_.begin(), 115 | webrtcVadBuffer_.begin() + options_.webrtcVadBufferSize); 116 | } 117 | 118 | if (webrtcVadResults_.size() > options_.webrtcVadResultsSize) { 119 | webrtcVadResults_.erase( 120 | webrtcVadResults_.begin(), 121 | webrtcVadResults_.begin() + 122 | (webrtcVadResults_.size() - options_.webrtcVadResultsSize)); 123 | } 124 | 125 | if (framesUntilSileroVad_ > 0) { 126 | framesUntilSileroVad_--; 127 | } 128 | 129 | // if we're speaking or any past webrtcvad result within the window is true, 130 | // then use the result from the silero vad 131 | double probability = 0.0; 132 | if (speaking_ || webrtcVadResults_.size() != options_.webrtcVadResultsSize || 133 | std::any_of(webrtcVadResults_.begin(), webrtcVadResults_.end(), 134 | [](bool e) { return e; })) { 135 | if (framesUntilSileroVad_ == 0) { 136 | framesUntilSileroVad_ = options_.sileroVadRateLimit; 137 | 138 | std::vector inputDimensions; 139 | inputDimensions.push_back(1); 140 | inputDimensions.push_back(sileroBuffer_.size()); 141 | 142 | std::vector inputTensors; 143 | inputTensors.push_back(Ort::Value::CreateTensor( 144 | *ortMemory_, sileroBuffer_.data(), sileroBuffer_.size(), 145 | inputDimensions.data(), inputDimensions.size())); 146 | 147 | std::vector outputTensorValues(2); 148 | std::vector outputDimensions; 149 | outputDimensions.push_back(1); 150 | outputDimensions.push_back(2); 151 | 152 | std::vector outputTensors; 153 | outputTensors.push_back(Ort::Value::CreateTensor( 154 | *ortMemory_, outputTensorValues.data(), outputTensorValues.size(), 155 | outputDimensions.data(), outputDimensions.size())); 156 | 157 | std::vector inputNames{"input"}; 
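// The VAD model (see the sileroVad* options) is fed a single [1, N] float tensor
// named "input", holding the samples normalized by SHRT_MAX above, and writes a
// [1, 2] float tensor named "output"; element [1] is read back below as the
// speech probability.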
158 | std::vector outputNames{"output"}; 159 | ortSession_->Run(Ort::RunOptions{nullptr}, inputNames.data(), 160 | inputTensors.data(), 1, outputNames.data(), 161 | outputTensors.data(), 1); 162 | 163 | sileroVadProbability_ = outputTensorValues[1]; 164 | } 165 | 166 | probability = sileroVadProbability_; 167 | } 168 | 169 | bool speaking = speaking_ ? probability > options_.sileroVadSilenceThreshold 170 | : probability > options_.sileroVadSpeakingThreshold; 171 | if (speaking) { 172 | consecutiveSilence_ = 0; 173 | consecutiveSpeaking_++; 174 | } else { 175 | consecutiveSilence_++; 176 | consecutiveSpeaking_ = 0; 177 | } 178 | 179 | if (!speaking_ && 180 | consecutiveSpeaking_ == options_.consecutiveFramesForSpeaking) { 181 | speaking_ = true; 182 | if (options_.onChunkStart != nullptr) { 183 | options_.onChunkStart(leadingBuffer_); 184 | } 185 | } 186 | 187 | if (options_.onAudio != nullptr) { 188 | options_.onAudio(frame, speaking_, volume, speaking, probability, 189 | consecutiveSilence_); 190 | } 191 | 192 | if (speaking_ && 193 | consecutiveSilence_ == options_.consecutiveFramesForSilence) { 194 | speaking_ = false; 195 | leadingBuffer_.clear(); 196 | if (options_.onChunkEnd != nullptr) { 197 | options_.onChunkEnd(); 198 | } 199 | } 200 | } 201 | 202 | void ChunkProcessor::Reset() { 203 | consecutiveSilence_ = 0; 204 | consecutiveSpeaking_ = 0; 205 | framesUntilSileroVad_ = 0; 206 | leadingBuffer_.clear(); 207 | speaking_ = false; 208 | webrtcVad_.Reset(); 209 | webrtcVadBuffer_.clear(); 210 | webrtcVadResults_.clear(); 211 | short* audio; 212 | while (queue_.try_dequeue(audio)) { 213 | } 214 | } 215 | 216 | void ChunkProcessor::Start() { 217 | toggleLock_.lock(); 218 | startThread_ = std::thread([&] { 219 | Reset(); 220 | microphone_.Start(); 221 | stopped_ = false; 222 | toggleLock_.unlock(); 223 | }); 224 | } 225 | 226 | void ChunkProcessor::Stop() { 227 | toggleLock_.lock(); 228 | stopThread_ = std::thread([&] { 229 | stopped_ = true; 230 | microphone_.Stop(); 231 | toggleLock_.unlock(); 232 | }); 233 | } 234 | 235 | } // namespace speechrecorder 236 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/resample_fractional.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains the resampling functions between 48, 44, 32 and 24 kHz. 
14 | * The description headers can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 19 | 20 | // interpolation coefficients 21 | static const int16_t kCoefficients48To32[2][8] = { 22 | {778, -2050, 1087, 23285, 12903, -3783, 441, 222}, 23 | {222, 441, -3783, 12903, 23285, 1087, -2050, 778} 24 | }; 25 | 26 | static const int16_t kCoefficients32To24[3][8] = { 27 | {767, -2362, 2434, 24406, 10620, -3838, 721, 90}, 28 | {386, -381, -2646, 19062, 19062, -2646, -381, 386}, 29 | {90, 721, -3838, 10620, 24406, 2434, -2362, 767} 30 | }; 31 | 32 | static const int16_t kCoefficients44To32[4][9] = { 33 | {117, -669, 2245, -6183, 26267, 13529, -3245, 845, -138}, 34 | {-101, 612, -2283, 8532, 29790, -5138, 1789, -524, 91}, 35 | {50, -292, 1016, -3064, 32010, 3933, -1147, 315, -53}, 36 | {-156, 974, -3863, 18603, 21691, -6246, 2353, -712, 126} 37 | }; 38 | 39 | // Resampling ratio: 2/3 40 | // input: int32_t (normalized, not saturated) :: size 3 * K 41 | // output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 2 * K 42 | // K: number of blocks 43 | 44 | void WebRtcSpl_Resample48khzTo32khz(const int32_t *In, int32_t *Out, size_t K) 45 | { 46 | ///////////////////////////////////////////////////////////// 47 | // Filter operation: 48 | // 49 | // Perform resampling (3 input samples -> 2 output samples); 50 | // process in sub blocks of size 3 samples. 51 | int32_t tmp; 52 | size_t m; 53 | 54 | for (m = 0; m < K; m++) 55 | { 56 | tmp = 1 << 14; 57 | tmp += kCoefficients48To32[0][0] * In[0]; 58 | tmp += kCoefficients48To32[0][1] * In[1]; 59 | tmp += kCoefficients48To32[0][2] * In[2]; 60 | tmp += kCoefficients48To32[0][3] * In[3]; 61 | tmp += kCoefficients48To32[0][4] * In[4]; 62 | tmp += kCoefficients48To32[0][5] * In[5]; 63 | tmp += kCoefficients48To32[0][6] * In[6]; 64 | tmp += kCoefficients48To32[0][7] * In[7]; 65 | Out[0] = tmp; 66 | 67 | tmp = 1 << 14; 68 | tmp += kCoefficients48To32[1][0] * In[1]; 69 | tmp += kCoefficients48To32[1][1] * In[2]; 70 | tmp += kCoefficients48To32[1][2] * In[3]; 71 | tmp += kCoefficients48To32[1][3] * In[4]; 72 | tmp += kCoefficients48To32[1][4] * In[5]; 73 | tmp += kCoefficients48To32[1][5] * In[6]; 74 | tmp += kCoefficients48To32[1][6] * In[7]; 75 | tmp += kCoefficients48To32[1][7] * In[8]; 76 | Out[1] = tmp; 77 | 78 | // update pointers 79 | In += 3; 80 | Out += 2; 81 | } 82 | } 83 | 84 | // Resampling ratio: 3/4 85 | // input: int32_t (normalized, not saturated) :: size 4 * K 86 | // output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 3 * K 87 | // K: number of blocks 88 | 89 | void WebRtcSpl_Resample32khzTo24khz(const int32_t *In, int32_t *Out, size_t K) 90 | { 91 | ///////////////////////////////////////////////////////////// 92 | // Filter operation: 93 | // 94 | // Perform resampling (4 input samples -> 3 output samples); 95 | // process in sub blocks of size 4 samples. 
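// Each output sample below is an 8-tap FIR over the (normalized) input using one
// of the three kCoefficients32To24 phases, started from the 1 << 14 rounding
// offset; the pointers then advance by 4 input and 3 output samples per block.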
96 | size_t m; 97 | int32_t tmp; 98 | 99 | for (m = 0; m < K; m++) 100 | { 101 | tmp = 1 << 14; 102 | tmp += kCoefficients32To24[0][0] * In[0]; 103 | tmp += kCoefficients32To24[0][1] * In[1]; 104 | tmp += kCoefficients32To24[0][2] * In[2]; 105 | tmp += kCoefficients32To24[0][3] * In[3]; 106 | tmp += kCoefficients32To24[0][4] * In[4]; 107 | tmp += kCoefficients32To24[0][5] * In[5]; 108 | tmp += kCoefficients32To24[0][6] * In[6]; 109 | tmp += kCoefficients32To24[0][7] * In[7]; 110 | Out[0] = tmp; 111 | 112 | tmp = 1 << 14; 113 | tmp += kCoefficients32To24[1][0] * In[1]; 114 | tmp += kCoefficients32To24[1][1] * In[2]; 115 | tmp += kCoefficients32To24[1][2] * In[3]; 116 | tmp += kCoefficients32To24[1][3] * In[4]; 117 | tmp += kCoefficients32To24[1][4] * In[5]; 118 | tmp += kCoefficients32To24[1][5] * In[6]; 119 | tmp += kCoefficients32To24[1][6] * In[7]; 120 | tmp += kCoefficients32To24[1][7] * In[8]; 121 | Out[1] = tmp; 122 | 123 | tmp = 1 << 14; 124 | tmp += kCoefficients32To24[2][0] * In[2]; 125 | tmp += kCoefficients32To24[2][1] * In[3]; 126 | tmp += kCoefficients32To24[2][2] * In[4]; 127 | tmp += kCoefficients32To24[2][3] * In[5]; 128 | tmp += kCoefficients32To24[2][4] * In[6]; 129 | tmp += kCoefficients32To24[2][5] * In[7]; 130 | tmp += kCoefficients32To24[2][6] * In[8]; 131 | tmp += kCoefficients32To24[2][7] * In[9]; 132 | Out[2] = tmp; 133 | 134 | // update pointers 135 | In += 4; 136 | Out += 3; 137 | } 138 | } 139 | 140 | // 141 | // fractional resampling filters 142 | // Fout = 11/16 * Fin 143 | // Fout = 8/11 * Fin 144 | // 145 | 146 | // compute two inner-products and store them to output array 147 | static void WebRtcSpl_ResampDotProduct(const int32_t *in1, const int32_t *in2, 148 | const int16_t *coef_ptr, int32_t *out1, 149 | int32_t *out2) 150 | { 151 | int32_t tmp1 = 16384; 152 | int32_t tmp2 = 16384; 153 | int16_t coef; 154 | 155 | coef = coef_ptr[0]; 156 | tmp1 += coef * in1[0]; 157 | tmp2 += coef * in2[-0]; 158 | 159 | coef = coef_ptr[1]; 160 | tmp1 += coef * in1[1]; 161 | tmp2 += coef * in2[-1]; 162 | 163 | coef = coef_ptr[2]; 164 | tmp1 += coef * in1[2]; 165 | tmp2 += coef * in2[-2]; 166 | 167 | coef = coef_ptr[3]; 168 | tmp1 += coef * in1[3]; 169 | tmp2 += coef * in2[-3]; 170 | 171 | coef = coef_ptr[4]; 172 | tmp1 += coef * in1[4]; 173 | tmp2 += coef * in2[-4]; 174 | 175 | coef = coef_ptr[5]; 176 | tmp1 += coef * in1[5]; 177 | tmp2 += coef * in2[-5]; 178 | 179 | coef = coef_ptr[6]; 180 | tmp1 += coef * in1[6]; 181 | tmp2 += coef * in2[-6]; 182 | 183 | coef = coef_ptr[7]; 184 | tmp1 += coef * in1[7]; 185 | tmp2 += coef * in2[-7]; 186 | 187 | coef = coef_ptr[8]; 188 | *out1 = tmp1 + coef * in1[8]; 189 | *out2 = tmp2 + coef * in2[-8]; 190 | } 191 | 192 | // Resampling ratio: 8/11 193 | // input: int32_t (normalized, not saturated) :: size 11 * K 194 | // output: int32_t (shifted 15 positions to the left, + offset 16384) :: size 8 * K 195 | // K: number of blocks 196 | 197 | void WebRtcSpl_Resample44khzTo32khz(const int32_t *In, int32_t *Out, size_t K) 198 | { 199 | ///////////////////////////////////////////////////////////// 200 | // Filter operation: 201 | // 202 | // Perform resampling (11 input samples -> 8 output samples); 203 | // process in sub blocks of size 11 samples. 
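// Out[0] is simply In[3] scaled up (plus the rounding offset); Out[4] uses the
// kCoefficients44To32[3] phase directly, and WebRtcSpl_ResampDotProduct() fills
// the remaining pairs two at a time by running one input pointer forward and the
// other backward over the same 9-tap coefficient row.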
204 | int32_t tmp; 205 | size_t m; 206 | 207 | for (m = 0; m < K; m++) 208 | { 209 | tmp = 1 << 14; 210 | 211 | // first output sample 212 | Out[0] = ((int32_t)In[3] << 15) + tmp; 213 | 214 | // sum and accumulate filter coefficients and input samples 215 | tmp += kCoefficients44To32[3][0] * In[5]; 216 | tmp += kCoefficients44To32[3][1] * In[6]; 217 | tmp += kCoefficients44To32[3][2] * In[7]; 218 | tmp += kCoefficients44To32[3][3] * In[8]; 219 | tmp += kCoefficients44To32[3][4] * In[9]; 220 | tmp += kCoefficients44To32[3][5] * In[10]; 221 | tmp += kCoefficients44To32[3][6] * In[11]; 222 | tmp += kCoefficients44To32[3][7] * In[12]; 223 | tmp += kCoefficients44To32[3][8] * In[13]; 224 | Out[4] = tmp; 225 | 226 | // sum and accumulate filter coefficients and input samples 227 | WebRtcSpl_ResampDotProduct(&In[0], &In[17], kCoefficients44To32[0], &Out[1], &Out[7]); 228 | 229 | // sum and accumulate filter coefficients and input samples 230 | WebRtcSpl_ResampDotProduct(&In[2], &In[15], kCoefficients44To32[1], &Out[2], &Out[6]); 231 | 232 | // sum and accumulate filter coefficients and input samples 233 | WebRtcSpl_ResampDotProduct(&In[3], &In[14], kCoefficients44To32[2], &Out[3], &Out[5]); 234 | 235 | // update pointers 236 | In += 11; 237 | Out += 8; 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/complex_fft_tables.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 
9 | */ 10 | 11 | #ifndef COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_ 12 | #define COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_ 13 | 14 | #include 15 | 16 | static const int16_t kSinTable1024[] = { 17 | 0, 201, 402, 603, 804, 1005, 1206, 1406, 1607, 18 | 1808, 2009, 2209, 2410, 2610, 2811, 3011, 3211, 3411, 19 | 3611, 3811, 4011, 4210, 4409, 4608, 4807, 5006, 5205, 20 | 5403, 5601, 5799, 5997, 6195, 6392, 6589, 6786, 6982, 21 | 7179, 7375, 7571, 7766, 7961, 8156, 8351, 8545, 8739, 22 | 8932, 9126, 9319, 9511, 9703, 9895, 10087, 10278, 10469, 23 | 10659, 10849, 11038, 11227, 11416, 11604, 11792, 11980, 12166, 24 | 12353, 12539, 12724, 12909, 13094, 13278, 13462, 13645, 13827, 25 | 14009, 14191, 14372, 14552, 14732, 14911, 15090, 15268, 15446, 26 | 15623, 15799, 15975, 16150, 16325, 16499, 16672, 16845, 17017, 27 | 17189, 17360, 17530, 17699, 17868, 18036, 18204, 18371, 18537, 28 | 18702, 18867, 19031, 19194, 19357, 19519, 19680, 19840, 20000, 29 | 20159, 20317, 20474, 20631, 20787, 20942, 21096, 21249, 21402, 30 | 21554, 21705, 21855, 22004, 22153, 22301, 22448, 22594, 22739, 31 | 22883, 23027, 23169, 23311, 23452, 23592, 23731, 23869, 24006, 32 | 24143, 24278, 24413, 24546, 24679, 24811, 24942, 25072, 25201, 33 | 25329, 25456, 25582, 25707, 25831, 25954, 26077, 26198, 26318, 34 | 26437, 26556, 26673, 26789, 26905, 27019, 27132, 27244, 27355, 35 | 27466, 27575, 27683, 27790, 27896, 28001, 28105, 28208, 28309, 36 | 28410, 28510, 28608, 28706, 28802, 28897, 28992, 29085, 29177, 37 | 29268, 29358, 29446, 29534, 29621, 29706, 29790, 29873, 29955, 38 | 30036, 30116, 30195, 30272, 30349, 30424, 30498, 30571, 30643, 39 | 30713, 30783, 30851, 30918, 30984, 31049, 31113, 31175, 31236, 40 | 31297, 31356, 31413, 31470, 31525, 31580, 31633, 31684, 31735, 41 | 31785, 31833, 31880, 31926, 31970, 32014, 32056, 32097, 32137, 42 | 32176, 32213, 32249, 32284, 32318, 32350, 32382, 32412, 32441, 43 | 32468, 32495, 32520, 32544, 32567, 32588, 32609, 32628, 32646, 44 | 32662, 32678, 32692, 32705, 32717, 32727, 32736, 32744, 32751, 45 | 32757, 32761, 32764, 32766, 32767, 32766, 32764, 32761, 32757, 46 | 32751, 32744, 32736, 32727, 32717, 32705, 32692, 32678, 32662, 47 | 32646, 32628, 32609, 32588, 32567, 32544, 32520, 32495, 32468, 48 | 32441, 32412, 32382, 32350, 32318, 32284, 32249, 32213, 32176, 49 | 32137, 32097, 32056, 32014, 31970, 31926, 31880, 31833, 31785, 50 | 31735, 31684, 31633, 31580, 31525, 31470, 31413, 31356, 31297, 51 | 31236, 31175, 31113, 31049, 30984, 30918, 30851, 30783, 30713, 52 | 30643, 30571, 30498, 30424, 30349, 30272, 30195, 30116, 30036, 53 | 29955, 29873, 29790, 29706, 29621, 29534, 29446, 29358, 29268, 54 | 29177, 29085, 28992, 28897, 28802, 28706, 28608, 28510, 28410, 55 | 28309, 28208, 28105, 28001, 27896, 27790, 27683, 27575, 27466, 56 | 27355, 27244, 27132, 27019, 26905, 26789, 26673, 26556, 26437, 57 | 26318, 26198, 26077, 25954, 25831, 25707, 25582, 25456, 25329, 58 | 25201, 25072, 24942, 24811, 24679, 24546, 24413, 24278, 24143, 59 | 24006, 23869, 23731, 23592, 23452, 23311, 23169, 23027, 22883, 60 | 22739, 22594, 22448, 22301, 22153, 22004, 21855, 21705, 21554, 61 | 21402, 21249, 21096, 20942, 20787, 20631, 20474, 20317, 20159, 62 | 20000, 19840, 19680, 19519, 19357, 19194, 19031, 18867, 18702, 63 | 18537, 18371, 18204, 18036, 17868, 17699, 17530, 17360, 17189, 64 | 17017, 16845, 16672, 16499, 16325, 16150, 15975, 15799, 15623, 65 | 15446, 15268, 15090, 14911, 14732, 14552, 14372, 14191, 14009, 66 | 13827, 13645, 13462, 13278, 13094, 12909, 12724, 12539, 
12353, 67 | 12166, 11980, 11792, 11604, 11416, 11227, 11038, 10849, 10659, 68 | 10469, 10278, 10087, 9895, 9703, 9511, 9319, 9126, 8932, 69 | 8739, 8545, 8351, 8156, 7961, 7766, 7571, 7375, 7179, 70 | 6982, 6786, 6589, 6392, 6195, 5997, 5799, 5601, 5403, 71 | 5205, 5006, 4807, 4608, 4409, 4210, 4011, 3811, 3611, 72 | 3411, 3211, 3011, 2811, 2610, 2410, 2209, 2009, 1808, 73 | 1607, 1406, 1206, 1005, 804, 603, 402, 201, 0, 74 | -201, -402, -603, -804, -1005, -1206, -1406, -1607, -1808, 75 | -2009, -2209, -2410, -2610, -2811, -3011, -3211, -3411, -3611, 76 | -3811, -4011, -4210, -4409, -4608, -4807, -5006, -5205, -5403, 77 | -5601, -5799, -5997, -6195, -6392, -6589, -6786, -6982, -7179, 78 | -7375, -7571, -7766, -7961, -8156, -8351, -8545, -8739, -8932, 79 | -9126, -9319, -9511, -9703, -9895, -10087, -10278, -10469, -10659, 80 | -10849, -11038, -11227, -11416, -11604, -11792, -11980, -12166, -12353, 81 | -12539, -12724, -12909, -13094, -13278, -13462, -13645, -13827, -14009, 82 | -14191, -14372, -14552, -14732, -14911, -15090, -15268, -15446, -15623, 83 | -15799, -15975, -16150, -16325, -16499, -16672, -16845, -17017, -17189, 84 | -17360, -17530, -17699, -17868, -18036, -18204, -18371, -18537, -18702, 85 | -18867, -19031, -19194, -19357, -19519, -19680, -19840, -20000, -20159, 86 | -20317, -20474, -20631, -20787, -20942, -21096, -21249, -21402, -21554, 87 | -21705, -21855, -22004, -22153, -22301, -22448, -22594, -22739, -22883, 88 | -23027, -23169, -23311, -23452, -23592, -23731, -23869, -24006, -24143, 89 | -24278, -24413, -24546, -24679, -24811, -24942, -25072, -25201, -25329, 90 | -25456, -25582, -25707, -25831, -25954, -26077, -26198, -26318, -26437, 91 | -26556, -26673, -26789, -26905, -27019, -27132, -27244, -27355, -27466, 92 | -27575, -27683, -27790, -27896, -28001, -28105, -28208, -28309, -28410, 93 | -28510, -28608, -28706, -28802, -28897, -28992, -29085, -29177, -29268, 94 | -29358, -29446, -29534, -29621, -29706, -29790, -29873, -29955, -30036, 95 | -30116, -30195, -30272, -30349, -30424, -30498, -30571, -30643, -30713, 96 | -30783, -30851, -30918, -30984, -31049, -31113, -31175, -31236, -31297, 97 | -31356, -31413, -31470, -31525, -31580, -31633, -31684, -31735, -31785, 98 | -31833, -31880, -31926, -31970, -32014, -32056, -32097, -32137, -32176, 99 | -32213, -32249, -32284, -32318, -32350, -32382, -32412, -32441, -32468, 100 | -32495, -32520, -32544, -32567, -32588, -32609, -32628, -32646, -32662, 101 | -32678, -32692, -32705, -32717, -32727, -32736, -32744, -32751, -32757, 102 | -32761, -32764, -32766, -32767, -32766, -32764, -32761, -32757, -32751, 103 | -32744, -32736, -32727, -32717, -32705, -32692, -32678, -32662, -32646, 104 | -32628, -32609, -32588, -32567, -32544, -32520, -32495, -32468, -32441, 105 | -32412, -32382, -32350, -32318, -32284, -32249, -32213, -32176, -32137, 106 | -32097, -32056, -32014, -31970, -31926, -31880, -31833, -31785, -31735, 107 | -31684, -31633, -31580, -31525, -31470, -31413, -31356, -31297, -31236, 108 | -31175, -31113, -31049, -30984, -30918, -30851, -30783, -30713, -30643, 109 | -30571, -30498, -30424, -30349, -30272, -30195, -30116, -30036, -29955, 110 | -29873, -29790, -29706, -29621, -29534, -29446, -29358, -29268, -29177, 111 | -29085, -28992, -28897, -28802, -28706, -28608, -28510, -28410, -28309, 112 | -28208, -28105, -28001, -27896, -27790, -27683, -27575, -27466, -27355, 113 | -27244, -27132, -27019, -26905, -26789, -26673, -26556, -26437, -26318, 114 | -26198, -26077, -25954, -25831, -25707, -25582, -25456, -25329, -25201, 115 | 
-25072, -24942, -24811, -24679, -24546, -24413, -24278, -24143, -24006, 116 | -23869, -23731, -23592, -23452, -23311, -23169, -23027, -22883, -22739, 117 | -22594, -22448, -22301, -22153, -22004, -21855, -21705, -21554, -21402, 118 | -21249, -21096, -20942, -20787, -20631, -20474, -20317, -20159, -20000, 119 | -19840, -19680, -19519, -19357, -19194, -19031, -18867, -18702, -18537, 120 | -18371, -18204, -18036, -17868, -17699, -17530, -17360, -17189, -17017, 121 | -16845, -16672, -16499, -16325, -16150, -15975, -15799, -15623, -15446, 122 | -15268, -15090, -14911, -14732, -14552, -14372, -14191, -14009, -13827, 123 | -13645, -13462, -13278, -13094, -12909, -12724, -12539, -12353, -12166, 124 | -11980, -11792, -11604, -11416, -11227, -11038, -10849, -10659, -10469, 125 | -10278, -10087, -9895, -9703, -9511, -9319, -9126, -8932, -8739, 126 | -8545, -8351, -8156, -7961, -7766, -7571, -7375, -7179, -6982, 127 | -6786, -6589, -6392, -6195, -5997, -5799, -5601, -5403, -5205, 128 | -5006, -4807, -4608, -4409, -4210, -4011, -3811, -3611, -3411, 129 | -3211, -3011, -2811, -2610, -2410, -2209, -2009, -1808, -1607, 130 | -1406, -1206, -1005, -804, -603, -402, -201}; 131 | 132 | #endif // COMMON_AUDIO_SIGNAL_PROCESSING_COMPLEX_FFT_TABLES_H_ 133 | -------------------------------------------------------------------------------- /lib/3rd_party/webrtcvad/webrtc/common_audio/signal_processing/complex_fft.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 | * 4 | * Use of this source code is governed by a BSD-style license 5 | * that can be found in the LICENSE file in the root of the source 6 | * tree. An additional intellectual property rights grant can be found 7 | * in the file PATENTS. All contributing project authors may 8 | * be found in the AUTHORS file in the root of the source tree. 9 | */ 10 | 11 | 12 | /* 13 | * This file contains the function WebRtcSpl_ComplexFFT(). 14 | * The description header can be found in signal_processing_library.h 15 | * 16 | */ 17 | 18 | #include "webrtc/common_audio/signal_processing/complex_fft_tables.h" 19 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 20 | #include "webrtc/rtc_base/system/arch.h" 21 | 22 | #define CFFTSFT 14 23 | #define CFFTRND 1 24 | #define CFFTRND2 16384 25 | 26 | #define CIFFTSFT 14 27 | #define CIFFTRND 1 28 | 29 | 30 | int WebRtcSpl_ComplexFFT(int16_t frfi[], int stages, int mode) 31 | { 32 | int i, j, l, k, istep, n, m; 33 | int16_t wr, wi; 34 | int32_t tr32, ti32, qr32, qi32; 35 | 36 | /* The 1024-value is a constant given from the size of kSinTable1024[], 37 | * and should not be changed depending on the input parameter 'stages' 38 | */ 39 | n = 1 << stages; 40 | if (n > 1024) 41 | return -1; 42 | 43 | l = 1; 44 | k = 10 - 1; /* Constant for given kSinTable1024[]. Do not change 45 | depending on the input parameter 'stages' */ 46 | 47 | if (mode == 0) 48 | { 49 | // mode==0: Low-complexity and Low-accuracy mode 50 | while (l < n) 51 | { 52 | istep = l << 1; 53 | 54 | for (m = 0; m < l; ++m) 55 | { 56 | j = m << k; 57 | 58 | /* The 256-value is a constant given as 1/4 of the size of 59 | * kSinTable1024[], and should not be changed depending on the input 60 | * parameter 'stages'. 
It will result in 0 <= j < N_SINE_WAVE/2 61 | */ 62 | wr = kSinTable1024[j + 256]; 63 | wi = -kSinTable1024[j]; 64 | 65 | for (i = m; i < n; i += istep) 66 | { 67 | j = i + l; 68 | 69 | tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15; 70 | 71 | ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15; 72 | 73 | qr32 = (int32_t)frfi[2 * i]; 74 | qi32 = (int32_t)frfi[2 * i + 1]; 75 | frfi[2 * j] = (int16_t)((qr32 - tr32) >> 1); 76 | frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> 1); 77 | frfi[2 * i] = (int16_t)((qr32 + tr32) >> 1); 78 | frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> 1); 79 | } 80 | } 81 | 82 | --k; 83 | l = istep; 84 | 85 | } 86 | 87 | } else 88 | { 89 | // mode==1: High-complexity and High-accuracy mode 90 | while (l < n) 91 | { 92 | istep = l << 1; 93 | 94 | for (m = 0; m < l; ++m) 95 | { 96 | j = m << k; 97 | 98 | /* The 256-value is a constant given as 1/4 of the size of 99 | * kSinTable1024[], and should not be changed depending on the input 100 | * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2 101 | */ 102 | wr = kSinTable1024[j + 256]; 103 | wi = -kSinTable1024[j]; 104 | 105 | #ifdef WEBRTC_ARCH_ARM_V7 106 | int32_t wri = 0; 107 | __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) : 108 | "r"((int32_t)wr), "r"((int32_t)wi)); 109 | #endif 110 | 111 | for (i = m; i < n; i += istep) 112 | { 113 | j = i + l; 114 | 115 | #ifdef WEBRTC_ARCH_ARM_V7 116 | register int32_t frfi_r; 117 | __asm __volatile( 118 | "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd]," 119 | " lsl #16\n\t" 120 | "smlsd %[tr32], %[wri], %[frfi_r], %[cfftrnd]\n\t" 121 | "smladx %[ti32], %[wri], %[frfi_r], %[cfftrnd]\n\t" 122 | :[frfi_r]"=&r"(frfi_r), 123 | [tr32]"=&r"(tr32), 124 | [ti32]"=r"(ti32) 125 | :[frfi_even]"r"((int32_t)frfi[2*j]), 126 | [frfi_odd]"r"((int32_t)frfi[2*j +1]), 127 | [wri]"r"(wri), 128 | [cfftrnd]"r"(CFFTRND)); 129 | #else 130 | tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CFFTRND; 131 | 132 | ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CFFTRND; 133 | #endif 134 | 135 | tr32 >>= 15 - CFFTSFT; 136 | ti32 >>= 15 - CFFTSFT; 137 | 138 | qr32 = ((int32_t)frfi[2 * i]) * (1 << CFFTSFT); 139 | qi32 = ((int32_t)frfi[2 * i + 1]) * (1 << CFFTSFT); 140 | 141 | frfi[2 * j] = (int16_t)( 142 | (qr32 - tr32 + CFFTRND2) >> (1 + CFFTSFT)); 143 | frfi[2 * j + 1] = (int16_t)( 144 | (qi32 - ti32 + CFFTRND2) >> (1 + CFFTSFT)); 145 | frfi[2 * i] = (int16_t)( 146 | (qr32 + tr32 + CFFTRND2) >> (1 + CFFTSFT)); 147 | frfi[2 * i + 1] = (int16_t)( 148 | (qi32 + ti32 + CFFTRND2) >> (1 + CFFTSFT)); 149 | } 150 | } 151 | 152 | --k; 153 | l = istep; 154 | } 155 | } 156 | return 0; 157 | } 158 | 159 | int WebRtcSpl_ComplexIFFT(int16_t frfi[], int stages, int mode) 160 | { 161 | size_t i, j, l, istep, n, m; 162 | int k, scale, shift; 163 | int16_t wr, wi; 164 | int32_t tr32, ti32, qr32, qi32; 165 | int32_t tmp32, round2; 166 | 167 | /* The 1024-value is a constant given from the size of kSinTable1024[], 168 | * and should not be changed depending on the input parameter 'stages' 169 | */ 170 | n = ((size_t)1) << stages; 171 | if (n > 1024) 172 | return -1; 173 | 174 | scale = 0; 175 | 176 | l = 1; 177 | k = 10 - 1; /* Constant for given kSinTable1024[]. 
Do not change 178 | depending on the input parameter 'stages' */ 179 | 180 | while (l < n) 181 | { 182 | // variable scaling, depending upon data 183 | shift = 0; 184 | round2 = 8192; 185 | 186 | tmp32 = WebRtcSpl_MaxAbsValueW16(frfi, 2 * n); 187 | if (tmp32 > 13573) 188 | { 189 | shift++; 190 | scale++; 191 | round2 <<= 1; 192 | } 193 | if (tmp32 > 27146) 194 | { 195 | shift++; 196 | scale++; 197 | round2 <<= 1; 198 | } 199 | 200 | istep = l << 1; 201 | 202 | if (mode == 0) 203 | { 204 | // mode==0: Low-complexity and Low-accuracy mode 205 | for (m = 0; m < l; ++m) 206 | { 207 | j = m << k; 208 | 209 | /* The 256-value is a constant given as 1/4 of the size of 210 | * kSinTable1024[], and should not be changed depending on the input 211 | * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2 212 | */ 213 | wr = kSinTable1024[j + 256]; 214 | wi = kSinTable1024[j]; 215 | 216 | for (i = m; i < n; i += istep) 217 | { 218 | j = i + l; 219 | 220 | tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15; 221 | 222 | ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15; 223 | 224 | qr32 = (int32_t)frfi[2 * i]; 225 | qi32 = (int32_t)frfi[2 * i + 1]; 226 | frfi[2 * j] = (int16_t)((qr32 - tr32) >> shift); 227 | frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> shift); 228 | frfi[2 * i] = (int16_t)((qr32 + tr32) >> shift); 229 | frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> shift); 230 | } 231 | } 232 | } else 233 | { 234 | // mode==1: High-complexity and High-accuracy mode 235 | 236 | for (m = 0; m < l; ++m) 237 | { 238 | j = m << k; 239 | 240 | /* The 256-value is a constant given as 1/4 of the size of 241 | * kSinTable1024[], and should not be changed depending on the input 242 | * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2 243 | */ 244 | wr = kSinTable1024[j + 256]; 245 | wi = kSinTable1024[j]; 246 | 247 | #ifdef WEBRTC_ARCH_ARM_V7 248 | int32_t wri = 0; 249 | __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) : 250 | "r"((int32_t)wr), "r"((int32_t)wi)); 251 | #endif 252 | 253 | for (i = m; i < n; i += istep) 254 | { 255 | j = i + l; 256 | 257 | #ifdef WEBRTC_ARCH_ARM_V7 258 | register int32_t frfi_r; 259 | __asm __volatile( 260 | "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd], lsl #16\n\t" 261 | "smlsd %[tr32], %[wri], %[frfi_r], %[cifftrnd]\n\t" 262 | "smladx %[ti32], %[wri], %[frfi_r], %[cifftrnd]\n\t" 263 | :[frfi_r]"=&r"(frfi_r), 264 | [tr32]"=&r"(tr32), 265 | [ti32]"=r"(ti32) 266 | :[frfi_even]"r"((int32_t)frfi[2*j]), 267 | [frfi_odd]"r"((int32_t)frfi[2*j +1]), 268 | [wri]"r"(wri), 269 | [cifftrnd]"r"(CIFFTRND) 270 | ); 271 | #else 272 | 273 | tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CIFFTRND; 274 | 275 | ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CIFFTRND; 276 | #endif 277 | tr32 >>= 15 - CIFFTSFT; 278 | ti32 >>= 15 - CIFFTSFT; 279 | 280 | qr32 = ((int32_t)frfi[2 * i]) * (1 << CIFFTSFT); 281 | qi32 = ((int32_t)frfi[2 * i + 1]) * (1 << CIFFTSFT); 282 | 283 | frfi[2 * j] = (int16_t)( 284 | (qr32 - tr32 + round2) >> (shift + CIFFTSFT)); 285 | frfi[2 * j + 1] = (int16_t)( 286 | (qi32 - ti32 + round2) >> (shift + CIFFTSFT)); 287 | frfi[2 * i] = (int16_t)( 288 | (qr32 + tr32 + round2) >> (shift + CIFFTSFT)); 289 | frfi[2 * i + 1] = (int16_t)( 290 | (qi32 + ti32 + round2) >> (shift + CIFFTSFT)); 291 | } 292 | } 293 | 294 | } 295 | --k; 296 | l = istep; 297 | } 298 | return scale; 299 | } 300 | -------------------------------------------------------------------------------- /src/speech_recorder.cpp: 
-------------------------------------------------------------------------------- 1 | #include <napi.h> 2 | 3 | #include <chrono> 4 | #include <memory> 5 | #include <string> 6 | #include <thread> 7 | #include <vector> 8 | 9 | #include "chunk_processor.h" 10 | #include "devices.h" 11 | #include "portaudio.h" 12 | #include "speech_recorder.h" 13 | 14 | #define DR_WAV_IMPLEMENTATION 15 | #include "dr_wav.h" 16 | 17 | Napi::Object SpeechRecorder::Init(Napi::Env env, Napi::Object exports) { 18 | Napi::Function f = DefineClass( 19 | env, "SpeechRecorder", 20 | { 21 | InstanceMethod<&SpeechRecorder::ProcessFile>( 22 | "processFile", static_cast<napi_property_attributes>( 23 | napi_writable | napi_configurable)), 24 | InstanceMethod<&SpeechRecorder::Start>( 25 | "start", static_cast<napi_property_attributes>( 26 | napi_writable | napi_configurable)), 27 | InstanceMethod<&SpeechRecorder::Stop>( 28 | "stop", static_cast<napi_property_attributes>(napi_writable | 29 | napi_configurable)), 30 | }); 31 | 32 | Napi::FunctionReference* constructor = new Napi::FunctionReference(); 33 | *constructor = Napi::Persistent(f); 34 | 35 | exports.Set("SpeechRecorder", f); 36 | env.SetInstanceData(constructor); 37 | 38 | exports.Set(Napi::String::New(env, "devices"), 39 | Napi::Function::New(env, GetDevices)); 40 | return exports; 41 | } 42 | 43 | SpeechRecorder::SpeechRecorder(const Napi::CallbackInfo& info) 44 | : Napi::ObjectWrap<SpeechRecorder>(info), 45 | stopped_(true), 46 | queue_(), 47 | callback_(Napi::Persistent(info[1].As<Napi::Function>())), 48 | threadSafeFunctionCallback_([&](Napi::Env env, Napi::Function jsCallback, 49 | SpeechRecorderCallbackData* data) { 50 | Napi::Object object = Napi::Object::New(env); 51 | object.Set("speaking", Napi::Boolean::New(env, data->speaking)); 52 | object.Set("volume", Napi::Number::New(env, data->volume)); 53 | object.Set("speech", Napi::Boolean::New(env, data->speech)); 54 | object.Set("probability", Napi::Number::New(env, data->probability)); 55 | object.Set("consecutiveSilence", 56 | Napi::Number::New(env, (double)data->consecutiveSilence)); 57 | 58 | if (data->audio.size() > 0) { 59 | Napi::Int16Array buffer = 60 | Napi::Int16Array::New(env, data->audio.size()); 61 | for (size_t i = 0; i < data->audio.size(); i++) { 62 | buffer[i] = data->audio[i]; 63 | } 64 | 65 | object.Set("audio", buffer); 66 | } 67 | 68 | jsCallback.Call({Napi::String::New(env, data->event), object}); 69 | delete data; 70 | }), 71 | modelPath_(info[0].As<Napi::String>().Utf8Value()), 72 | options_({ 73 | info[2] 74 | .As<Napi::Object>() 75 | .Get("consecutiveFramesForSilence") 76 | .As<Napi::Number>() 77 | .Int32Value(), 78 | info[2] 79 | .As<Napi::Object>() 80 | .Get("consecutiveFramesForSpeaking") 81 | .As<Napi::Number>() 82 | .Int32Value(), 83 | info[2] 84 | .As<Napi::Object>() 85 | .Get("device") 86 | .As<Napi::Number>() 87 | .Int32Value(), 88 | info[2] 89 | .As<Napi::Object>() 90 | .Get("leadingBufferFrames") 91 | .As<Napi::Number>() 92 | .Int32Value(), 93 | [&](std::vector<short> audio) { 94 | SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData(); 95 | data->event = "chunkStart"; 96 | data->audio = audio; 97 | queue_.enqueue(data); 98 | }, 99 | [&](std::vector<short> audio, bool speaking, double volume, 100 | bool speech, double probability, int consecutiveSilence) { 101 | SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData(); 102 | data->event = "audio"; 103 | data->audio = audio; 104 | data->speaking = speaking; 105 | data->volume = volume; 106 | data->speech = speech; 107 | data->probability = probability; 108 | data->consecutiveSilence = consecutiveSilence; 109 | queue_.enqueue(data); 110 | }, 111 | [&]() { 112 | SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData(); 113 | data->event = "chunkEnd"; 114 | queue_.enqueue(data); 115 | }, 116 |
info[2] 117 | .As<Napi::Object>() 118 | .Get("samplesPerFrame") 119 | .As<Napi::Number>() 120 | .Int32Value(), 121 | info[2] 122 | .As<Napi::Object>() 123 | .Get("sampleRate") 124 | .As<Napi::Number>() 125 | .Int32Value(), 126 | info[2] 127 | .As<Napi::Object>() 128 | .Get("sileroVadBufferSize") 129 | .As<Napi::Number>() 130 | .Int32Value(), 131 | info[2] 132 | .As<Napi::Object>() 133 | .Get("sileroVadRateLimit") 134 | .As<Napi::Number>() 135 | .Int32Value(), 136 | info[2] 137 | .As<Napi::Object>() 138 | .Get("sileroVadSilenceThreshold") 139 | .As<Napi::Number>() 140 | .DoubleValue(), 141 | info[2] 142 | .As<Napi::Object>() 143 | .Get("sileroVadSpeakingThreshold") 144 | .As<Napi::Number>() 145 | .DoubleValue(), 146 | info[2] 147 | .As<Napi::Object>() 148 | .Get("webrtcVadLevel") 149 | .As<Napi::Number>() 150 | .Int32Value(), 151 | info[2] 152 | .As<Napi::Object>() 153 | .Get("webrtcVadBufferSize") 154 | .As<Napi::Number>() 155 | .Int32Value(), 156 | info[2] 157 | .As<Napi::Object>() 158 | .Get("webrtcVadResultsSize") 159 | .As<Napi::Number>() 160 | .Int32Value(), 161 | }), 162 | processor_(modelPath_, options_) {} 163 | 164 | void SpeechRecorder::ProcessFile(const Napi::CallbackInfo& info) { 165 | Napi::Env env = info.Env(); 166 | std::string path = info[0].As<Napi::String>().Utf8Value(); 167 | 168 | // we don't want to create two processors on startup, because loading the 169 | // silero model is expensive, so lazily create this instance only if this 170 | // method is actually called (which is probably not common) 171 | if (!processFileProcessor_) { 172 | speechrecorder::ChunkProcessorOptions options = options_; 173 | 174 | options.onChunkStart = [&](std::vector<short> audio) { 175 | Napi::Object object = Napi::Object::New(env); 176 | if (audio.size() > 0) { 177 | Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size()); 178 | for (size_t i = 0; i < audio.size(); i++) { 179 | buffer[i] = audio[i]; 180 | } 181 | 182 | object.Set("audio", buffer); 183 | } 184 | 185 | callback_.Value().Call({Napi::String::New(env, "chunkStart"), object}); 186 | }; 187 | 188 | options.onAudio = [&](std::vector<short> audio, bool speaking, 189 | double volume, bool speech, double probability, 190 | int consecutiveSilence) { 191 | Napi::Object object = Napi::Object::New(env); 192 | object.Set("speaking", Napi::Boolean::New(env, speaking)); 193 | object.Set("volume", Napi::Number::New(env, volume)); 194 | object.Set("speech", Napi::Boolean::New(env, speech)); 195 | object.Set("probability", Napi::Number::New(env, probability)); 196 | object.Set("consecutiveSilence", 197 | Napi::Number::New(env, (double)consecutiveSilence)); 198 | 199 | if (audio.size() > 0) { 200 | Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size()); 201 | for (size_t i = 0; i < audio.size(); i++) { 202 | buffer[i] = audio[i]; 203 | } 204 | 205 | object.Set("audio", buffer); 206 | callback_.Value().Call({Napi::String::New(env, "audio"), object}); 207 | } 208 | }; 209 | 210 | options.onChunkEnd = [&] { 211 | callback_.Value().Call({Napi::String::New(env, "chunkEnd")}); 212 | }; 213 | 214 | processFileProcessor_ = 215 | std::make_unique<speechrecorder::ChunkProcessor>(modelPath_, options); 216 | } 217 | 218 | unsigned int channels; 219 | unsigned int sampleRate; 220 | drwav_uint64 frames; 221 | short* data = drwav_open_file_and_read_pcm_frames_s16( 222 | path.c_str(), &channels, &sampleRate, &frames, nullptr); 223 | 224 | processFileProcessor_->Reset(); 225 | int size = (int)frames; 226 | for (int i = 0; i < size; i += options_.samplesPerFrame) { 227 | std::vector<short> buffer; 228 | for (int j = 0; j < options_.samplesPerFrame; j++) { 229 | if (i + j < size) { 230 | buffer.push_back(data[i + j]); 231 | } 232 | } 233 | 234 | if (buffer.size() == (size_t)options_.samplesPerFrame) { 235 | processFileProcessor_->Process(buffer.data());
236 | } 237 | } 238 | 239 | drwav_free(data, nullptr); 240 | } 241 | 242 | void SpeechRecorder::Start(const Napi::CallbackInfo& info) { 243 | stopped_ = false; 244 | threadSafeFunction_ = Napi::ThreadSafeFunction::New( 245 | info.Env(), callback_.Value(), "Speech Recorder Start", 0, 1, 246 | [&](Napi::Env env) { 247 | thread_.join(); 248 | }); 249 | 250 | thread_ = std::thread([&] { 251 | while (!stopped_) { 252 | SpeechRecorderCallbackData* data; 253 | bool element = queue_.try_dequeue(data); 254 | if (element) { 255 | threadSafeFunction_.BlockingCall(data, threadSafeFunctionCallback_); 256 | } 257 | 258 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 259 | } 260 | 261 | threadSafeFunction_.Release(); 262 | }); 263 | 264 | processor_.Start(); 265 | } 266 | 267 | void SpeechRecorder::Stop(const Napi::CallbackInfo& info) { 268 | stopped_ = true; 269 | processor_.Stop(); 270 | } 271 | 272 | Napi::Value GetDevices(const Napi::CallbackInfo& info) { 273 | Napi::Env env = info.Env(); 274 | 275 | std::vector<speechrecorder::Device> devices = speechrecorder::GetDevices(); 276 | Napi::Array result = Napi::Array::New(env, devices.size()); 277 | for (size_t i = 0; i < devices.size(); i++) { 278 | Napi::Object e = Napi::Object::New(env); 279 | e.Set("id", devices[i].id); 280 | e.Set("name", devices[i].name); 281 | e.Set("apiName", devices[i].apiName); 282 | e.Set("maxInputChannels", devices[i].maxInputChannels); 283 | e.Set("maxOutputChannels", devices[i].maxOutputChannels); 284 | e.Set("defaultSampleRate", devices[i].defaultSampleRate); 285 | e.Set("isDefaultInput", devices[i].isDefaultInput); 286 | e.Set("isDefaultOutput", devices[i].isDefaultOutput); 287 | result[i] = e; 288 | } 289 | 290 | return result; 291 | } 292 | 293 | Napi::Object Init(Napi::Env env, Napi::Object exports) { 294 | SpeechRecorder::Init(env, exports); 295 | return exports; 296 | } 297 | 298 | NODE_API_MODULE(addon, Init); 299 | --------------------------------------------------------------------------------
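The Start()/Stop() implementation above follows a common node-addon-api pattern: a native worker thread drains a queue of events and hands each one to a Napi::ThreadSafeFunction, which marshals the call back onto the JavaScript thread before invoking the user's callback, while the finalizer joins the worker once the function has been released. The sketch below is a minimal, self-contained illustration of that same pattern, assuming only napi.h (node-addon-api); the Ticker class, the "tick" event name, the InitTicker helper, and the module name are hypothetical and are not part of speech-recorder.

#include <napi.h>

#include <atomic>
#include <chrono>
#include <thread>

// Illustrative sketch only (not part of speech-recorder): a worker thread
// produces integer "ticks" and a Napi::ThreadSafeFunction forwards each one
// to a JavaScript callback on the main thread.
class Ticker : public Napi::ObjectWrap<Ticker> {
 public:
  static Napi::Object Init(Napi::Env env, Napi::Object exports) {
    Napi::Function f = DefineClass(env, "Ticker",
                                   {InstanceMethod("start", &Ticker::Start),
                                    InstanceMethod("stop", &Ticker::Stop)});
    exports.Set("Ticker", f);
    return exports;
  }

  Ticker(const Napi::CallbackInfo& info)
      : Napi::ObjectWrap<Ticker>(info),
        stopped_(true),
        callback_(Napi::Persistent(info[0].As<Napi::Function>())) {}

  void Start(const Napi::CallbackInfo& info) {
    stopped_ = false;

    // The finalizer runs on the JavaScript thread after Release() has been
    // called and all pending calls have drained, so joining here is safe.
    tsfn_ = Napi::ThreadSafeFunction::New(
        info.Env(), callback_.Value(), "Ticker", 0, 1,
        [this](Napi::Env) { thread_.join(); });

    thread_ = std::thread([this] {
      int count = 0;
      while (!stopped_) {
        int* data = new int(count++);
        // BlockingCall queues the payload; the lambda runs on the JS thread.
        tsfn_.BlockingCall(data, [](Napi::Env env, Napi::Function cb, int* n) {
          cb.Call({Napi::String::New(env, "tick"), Napi::Number::New(env, *n)});
          delete n;
        });
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
      }

      tsfn_.Release();
    });
  }

  void Stop(const Napi::CallbackInfo&) { stopped_ = true; }

 private:
  std::atomic<bool> stopped_;
  std::thread thread_;
  Napi::FunctionReference callback_;
  Napi::ThreadSafeFunction tsfn_;
};

Napi::Object InitTicker(Napi::Env env, Napi::Object exports) {
  return Ticker::Init(env, exports);
}

NODE_API_MODULE(ticker_sketch, InitTicker)

As in SpeechRecorder::Start above, Release() is invoked from the worker thread and the join happens in the finalizer, which only runs on the JavaScript thread after every queued call has been delivered.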