├── .gitignore ├── cmake └── CPM.cmake ├── CMakeLists.txt ├── LICENSE ├── test ├── CMakeLists.txt ├── chowdsp_convolution_test.c └── chowdsp_convolution_test.cpp ├── .github └── workflows │ ├── coverage.yml │ └── test.yml ├── .clang-format ├── README.md ├── chowdsp_convolution.h └── chowdsp_convolution.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build*/ 2 | 3 | .focus-config 4 | *.raddbg_project 5 | .vscode/ 6 | .idea/ 7 | .zed/ 8 | 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /cmake/CPM.cmake: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors 4 | 5 | set(CPM_DOWNLOAD_VERSION 0.40.2) 6 | set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") 7 | 8 | if(CPM_SOURCE_CACHE) 9 | set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 10 | elseif(DEFINED ENV{CPM_SOURCE_CACHE}) 11 | set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 12 | else() 13 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 14 | endif() 15 | 16 | # Expand relative path. 
This is important if the provided path contains a tilde (~) 17 | get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) 18 | 19 | file(DOWNLOAD 20 | https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake 21 | ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} 22 | ) 23 | 24 | include(${CPM_DOWNLOAD_LOCATION}) 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | project(chowdsp_convolution VERSION 0.1.0 LANGUAGES C CXX) 3 | 4 | # Build switches; all of them default to OFF. 5 | option(CHOWDSP_CONVOLUTION_TESTING "Build the chowdsp_convolution tests" OFF) 6 | option(CHOWDSP_CONVOLUTION_COVERAGE "Add code coverage flags when building the tests" OFF) 7 | option(CHOWDSP_CONVOLUTION_ASAN "Add address sanitizer flags to the global C/C++ and linker flags" OFF) 8 | 9 | # These are the global flag variables, so every target configured from this directory and its subdirectories (fetched dependencies included) picks them up. 10 | if(CHOWDSP_CONVOLUTION_ASAN) 11 | message(STATUS "Setting flags for address sanitizer: -fsanitize=address -g") 12 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -g") 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -g") 14 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address") 15 | endif() 16 | 17 | # Re-use the chowdsp_fft target if one already exists; otherwise fetch it with CPM. 18 | if(TARGET chowdsp_fft) 19 | get_target_property(chowdsp_fft_dir chowdsp_fft SOURCE_DIR) 20 | message(STATUS "Using chowdsp_fft from ${chowdsp_fft_dir}") 21 | else() 22 | message(STATUS "Using chowdsp_fft from CPM") 23 | include("${CMAKE_CURRENT_LIST_DIR}/cmake/CPM.cmake") 24 | unset(JUCE_MODULES_DIR CACHE) # this causes problems with the tests CMake config?
25 | CPMAddPackage("gh:Chowdhury-DSP/chowdsp_fft#main") 26 | endif() 27 | 28 | add_library(chowdsp_convolution STATIC) 29 | target_sources(chowdsp_convolution 30 | PRIVATE 31 | chowdsp_convolution.h 32 | chowdsp_convolution.cpp 33 | ) 34 | target_include_directories(chowdsp_convolution PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 35 | target_link_libraries(chowdsp_convolution PUBLIC chowdsp_fft) 36 | target_compile_features(chowdsp_convolution PRIVATE cxx_std_20) 37 | 38 | if(CHOWDSP_CONVOLUTION_TESTING) 39 | add_subdirectory(test) 40 | endif() 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2025, Jatin Chowdhury (jatin@chowdsp.com) 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/CPM.cmake) 2 | 3 | CPMAddPackage( 4 | NAME juce 5 | GIT_REPOSITORY https://github.com/juce-framework/juce 6 | GIT_TAG 8.0.7 7 | OPTIONS "JUCE_MODULES_ONLY ON" 8 | ) 9 | CPMAddPackage("gh:Chowdhury-DSP/chowdsp_utils#next") 10 | CPMAddPackage("gh:Chowdhury-DSP/chowdsp_fft#main") 11 | 12 | if(NOT TARGET chowdsp_fft_juce) 13 | juce_add_module(${chowdsp_fft_SOURCE_DIR}/chowdsp_fft_juce) 14 | endif() 15 | 16 | add_executable(chowdsp_convolution_test chowdsp_convolution_test.cpp) 17 | target_link_libraries(chowdsp_convolution_test 18 | PRIVATE 19 | # juce::juce_dsp 20 | juce::juce_audio_basics 21 | juce::juce_audio_formats 22 | chowdsp::chowdsp_buffers 23 | chowdsp::chowdsp_data_structures 24 | chowdsp_fft_juce 25 | chowdsp_convolution 26 | ) 27 | target_compile_definitions(chowdsp_convolution_test 28 | PRIVATE 29 | JUCE_MODULE_AVAILABLE_juce_dsp=1 # chowdsp_pffft includes juce_dsp internally!
30 | JUCE_USE_CURL=0 31 | $<IF:$<CONFIG:Debug>,BUILD_DEBUG=1,BUILD_RELEASE=1> # exactly one of the two is defined, chosen per build configuration 32 | ) 33 | target_compile_features(chowdsp_convolution_test PRIVATE cxx_std_20) 34 | 35 | if(CHOWDSP_CONVOLUTION_COVERAGE) 36 | message(STATUS "Appending code coverage compiler flags: -g --coverage") 37 | target_compile_options(chowdsp_convolution PUBLIC -g --coverage -fprofile-arcs -ftest-coverage) 38 | target_compile_options(chowdsp_convolution_test PUBLIC -g --coverage -fprofile-arcs -ftest-coverage) 39 | # The PUBLIC compile options above also instrument every other consumer of the library (e.g. the C test), so the link option has to propagate with them. 40 | target_link_options(chowdsp_convolution PUBLIC --coverage) 41 | target_link_options(chowdsp_convolution_test PUBLIC --coverage) 42 | endif() 43 | 44 | add_executable(chowdsp_convolution_c_test chowdsp_convolution_test.c) 45 | target_link_libraries(chowdsp_convolution_c_test PRIVATE chowdsp_convolution) 46 | target_compile_features(chowdsp_convolution_c_test PRIVATE c_std_11) 47 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - develop 8 | push: 9 | branches: 10 | - main 11 | - ci 12 | 13 | workflow_dispatch: 14 | 15 | jobs: 16 | build_and_test: 17 | name: Test library with coverage 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false # show all errors for each platform (vs.
cancel jobs on error) 21 | matrix: 22 | include: 23 | - name: Linux 24 | os: ubuntu-22.04 25 | nparallel: 4 26 | 27 | steps: 28 | - name: Install Linux Deps 29 | if: runner.os == 'Linux' 30 | run: | 31 | sudo apt-get update 32 | sudo apt install libasound2-dev libcurl4-openssl-dev libx11-dev libxinerama-dev libxext-dev libfreetype6-dev libwebkit2gtk-4.0-dev libglu1-mesa-dev libjack-jackd2-dev 33 | 34 | - name: Install lcov (Linux) 35 | if: runner.os == 'Linux' 36 | run: sudo apt install lcov 37 | 38 | - name: Install Ninja 39 | uses: seanmiddleditch/gha-setup-ninja@master 40 | 41 | - name: Get latest CMake 42 | uses: lukka/get-cmake@latest 43 | 44 | - name: Checkout code 45 | uses: actions/checkout@v2 46 | 47 | - name: Cmake Configure 48 | run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_CONVOLUTION_TESTING=ON -DCHOWDSP_CONVOLUTION_COVERAGE=ON 49 | 50 | - name: Build Test 51 | run: cmake --build build --config Debug --parallel --target chowdsp_convolution_test 52 | 53 | - name: Run Test 54 | run: ./build/test/Debug/chowdsp_convolution_test 55 | 56 | - name: Collect Coverage Data 57 | run: | 58 | lcov --version 59 | lcov --directory . 
--capture --output-file coverage.info 60 | lcov --remove coverage.info '/usr/*' "${HOME}"'/.cache/*' '/Applications/Xcode*' '*build*' '*test*' --output-file coverage.info 61 | 62 | - name: Report Coverage Data 63 | run: lcov --list coverage.info 64 | 65 | - name: Upload coverage to Codecov 66 | uses: codecov/codecov-action@v4 67 | with: 68 | fail_ci_if_error: true 69 | token: ${{ secrets.CODECOV_TOKEN }} 70 | files: coverage.info 71 | verbose: true 72 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -4 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: false 5 | AlignConsecutiveDeclarations: false 6 | AlignEscapedNewlines: Left 7 | AlignOperands: Align 8 | AlignTrailingComments: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: Never 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: All 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakAfterDefinitionReturnType: None 16 | AlwaysBreakAfterReturnType: None 17 | AlwaysBreakBeforeMultilineStrings: false 18 | AlwaysBreakTemplateDeclarations: Yes 19 | BinPackArguments: false 20 | BinPackParameters: false 21 | BreakAfterJavaFieldAnnotations: false 22 | BreakBeforeBinaryOperators: NonAssignment 23 | BreakBeforeBraces: Allman 24 | BreakBeforeTernaryOperators: true 25 | BreakConstructorInitializersBeforeComma: false 26 | BreakStringLiterals: false 27 | ColumnLimit: 0 28 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 29 | ConstructorInitializerIndentWidth: 4 30 | ContinuationIndentWidth: 4 31 | Cpp11BracedListStyle: false 32 | DerivePointerAlignment: false 33 | DisableFormat: false 34 | ExperimentalAutoDetectBinPacking: false 35 | ForEachMacros: [ 'forEachXmlChildElement' ] 36 | IndentCaseLabels: true 37 | 
IndentWidth: 4 38 | IndentWrappedFunctionNames: true 39 | KeepEmptyLinesAtTheStartOfBlocks: false 40 | Language: Cpp 41 | MaxEmptyLinesToKeep: 1 42 | NamespaceIndentation: Inner 43 | PointerAlignment: Left 44 | ReflowComments: false 45 | SortIncludes: false 46 | SpaceAfterCStyleCast: true 47 | SpaceAfterLogicalNot: true 48 | SpaceBeforeAssignmentOperators: true 49 | SpaceBeforeCpp11BracedList: true 50 | SpaceBeforeParens: NonEmptyParentheses 51 | SpaceInEmptyParentheses: false 52 | SpaceBeforeInheritanceColon: true 53 | SpacesInAngles: false 54 | SpacesInCStyleCastParentheses: false 55 | SpacesInContainerLiterals: true 56 | SpacesInParentheses: false 57 | SpacesInSquareBrackets: false 58 | Standard: "c++17" 59 | TabWidth: 4 60 | UseTab: Never 61 | --- 62 | Language: ObjC 63 | BasedOnStyle: Chromium 64 | AlignTrailingComments: true 65 | BreakBeforeBraces: Allman 66 | ColumnLimit: 0 67 | IndentWidth: 4 68 | KeepEmptyLinesAtTheStartOfBlocks: false 69 | ObjCSpaceAfterProperty: true 70 | ObjCSpaceBeforeProtocolList: true 71 | PointerAlignment: Left 72 | SpacesBeforeTrailingComments: 1 73 | TabWidth: 4 74 | UseTab: Never 75 | ... 
76 | -------------------------------------------------------------------------------- /test/chowdsp_convolution_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | int main() 9 | { 10 | printf("Running C tests\n"); 11 | 12 | // setup config 13 | const int block_size = 512; 14 | struct Convolution_Config conv_config; 15 | create_config (&conv_config, block_size, NULL); 16 | float* fft_scratch = (float*) aligned_malloc (conv_config.fft_size * sizeof (float)); 17 | 18 | // load IR (ideal impulse with delay) 19 | const int ir_size = 200; 20 | const int delay_samples = 100; 21 | float* ir = (float*) calloc(ir_size, sizeof (float)); 22 | ir[delay_samples] = 1.0f; 23 | struct IR_Uniform conv_ir; 24 | create_ir (&conv_config, &conv_ir, ir, ir_size, fft_scratch, NULL); 25 | free (ir); 26 | 27 | // set up process state 28 | struct Process_Uniform_State conv_state; 29 | create_process_state(&conv_config, &conv_ir, &conv_state, NULL); 30 | 31 | // set up i/o buffers 32 | const int num_blocks = 1000; 33 | const int data_size = block_size * num_blocks; 34 | float* test_input_data = malloc (data_size * sizeof (float)); 35 | float* test_output_data = malloc (data_size * sizeof (float)); 36 | for (int i = 0; i < data_size; ++i) 37 | test_input_data[i] = sinf(314.0f * (float) i / (float) data_size); 38 | 39 | // process convolution 40 | for (int i = 0; i < num_blocks; ++i) 41 | { 42 | const float* block_in = test_input_data + (i * block_size); 43 | float* block_out = test_output_data + (i * block_size); 44 | process_samples (&conv_config, 45 | &conv_ir, 46 | &conv_state, 47 | block_in, 48 | block_out, 49 | block_size, 50 | fft_scratch); 51 | } 52 | 53 | // compute error 54 | float error_accum = 0.0f; 55 | float max_error = 0.0f; 56 | for (int i = 0; i < data_size; ++i) 57 | { 58 | const float ref = i < delay_samples ? 
0.0f : test_input_data[i - delay_samples]; 59 | const float test = test_output_data[i]; 60 | const float err = fabsf (ref - test); 61 | 62 | if (err > max_error) 63 | max_error = err; 64 | error_accum += err * err; 65 | } 66 | const float mse = error_accum / (float) data_size; 67 | printf("Max Error: %f\n", max_error); 68 | printf("Mean-squared: %f\n", mse); 69 | printf("COMPLETE!\n"); 70 | 71 | // cleanup 72 | free (test_input_data); 73 | free (test_output_data); 74 | aligned_free (fft_scratch); 75 | destroy_process_state (&conv_state); 76 | destroy_ir (&conv_ir); 77 | destroy_config (&conv_config); 78 | 79 | return 0; 80 | } 81 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - develop 8 | push: 9 | branches: 10 | - main 11 | - ci 12 | 13 | workflow_dispatch: 14 | 15 | jobs: 16 | build_and_test: 17 | name: Test library on ${{ matrix.name }} 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false # show all errors for each platform (vs. 
cancel jobs on error) 21 | matrix: 22 | include: 23 | - name: Linux 24 | os: ubuntu-22.04 25 | cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15" 26 | nparallel: 4 27 | - name: Linux ASan 28 | os: ubuntu-22.04 29 | cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15 -DCHOWDSP_CONVOLUTION_ASAN=ON" 30 | nparallel: 4 31 | - name: Windows-x64 32 | os: windows-2022 33 | cmake_args: -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl 34 | nparallel: 4 35 | - name: MacOS 36 | os: macos-14 37 | cmake_args: "-D\"CMAKE_OSX_ARCHITECTURES=arm64;x86_64\"" 38 | nparallel: 4 39 | - name: MacOS ASan 40 | os: macos-14 41 | cmake_args: "-DCHOWDSP_CONVOLUTION_ASAN=ON" 42 | nparallel: 4 43 | 44 | steps: 45 | - name: Install Linux Deps 46 | if: runner.os == 'Linux' 47 | run: | 48 | sudo apt-get update 49 | sudo apt install libasound2-dev libcurl4-openssl-dev libx11-dev libxinerama-dev libxext-dev libfreetype6-dev libwebkit2gtk-4.0-dev libglu1-mesa-dev libjack-jackd2-dev 50 | sudo apt-add-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" 51 | sudo apt install clang-15 -y 52 | 53 | - name: Install Ninja 54 | uses: seanmiddleditch/gha-setup-ninja@master 55 | 56 | - name: Get latest CMake 57 | uses: lukka/get-cmake@latest 58 | 59 | - uses: rui314/setup-mold@v1 60 | if: runner.os == 'Linux' 61 | 62 | - name: Upgrade LLVM 63 | if: runner.os == 'Windows' 64 | run: choco upgrade llvm --version=18.1.8 --allow-downgrade 65 | 66 | - name: Add msbuild to PATH 67 | if: runner.os == 'Windows' 68 | uses: microsoft/setup-msbuild@v2 69 | 70 | - name: Setup MSVC devcmd (x64) 71 | if: matrix.name == 'Windows-x64' 72 | uses: ilammy/msvc-dev-cmd@v1 73 | 74 | - name: Checkout code 75 | uses: actions/checkout@v2 76 | 77 | - name: Cmake Configure 78 | run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_CONVOLUTION_TESTING=ON ${{ matrix.cmake_args }} 79 | 80 | - name: Build Test (Debug) 81 
| run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_test 82 | 83 | - name: Run Test (Debug) 84 | run: ./build/test/Debug/chowdsp_convolution_test 85 | 86 | - name: Build Test (Release) 87 | run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_test 88 | 89 | - name: Run Test (Release) 90 | run: ./build/test/Release/chowdsp_convolution_test 91 | 92 | - name: Build Test C (Debug) 93 | run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_c_test 94 | 95 | - name: Run Test C (Debug) 96 | run: ./build/test/Debug/chowdsp_convolution_c_test 97 | 98 | - name: Build Test C (Release) 99 | run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_c_test 100 | 101 | - name: Run Test C (Release) 102 | run: ./build/test/Release/chowdsp_convolution_c_test 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chowdsp_convolution 2 | 3 | [![Test](https://github.com/Chowdhury-DSP/chowdsp_convolution/actions/workflows/test.yml/badge.svg)](https://github.com/Chowdhury-DSP/chowdsp_convolution/actions/workflows/test.yml) 4 | [![codecov](https://codecov.io/gh/Chowdhury-DSP/chowdsp_convolution/graph/badge.svg?token=3WCDKPHA58)](https://codecov.io/gh/Chowdhury-DSP/chowdsp_convolution) 5 | 6 | `chowdsp_convolution` is a library for performing frequency-domain 7 | convolution using [`chowdsp_fft`](https://github.com/Chowdhury-DSP/chowdsp_fft). 8 | The library currently supports uniformly-partitioned convolutions, 9 | as well as 2-stage non-uniformly-partitioned convolutions.
10 | 11 | **N.B.: This library is still in early development, and 12 | there will likely be breaking changes.** If you have 13 | suggestions for ways to improve the API, or features to 14 | add, please create a GitHub Issue. 15 | 16 | ## Usage 17 | 18 | ### Basic Usage (mono IR, mono i/o, uniform partitioning) 19 | 20 | First, create a `Config` object: 21 | 22 | ```cpp 23 | chowdsp::convolution::Config config {}; 24 | chowdsp::convolution::create_config (&config, maximum_block_size); 25 | ``` 26 | 27 | We'll also allocate some "scratch" data that will be used for computing 28 | FFTs under the hood: 29 | 30 | ```cpp 31 | float* fft_scratch = static_cast<float*> (chowdsp::fft::aligned_malloc (config.fft_size * sizeof (float))); 32 | ``` 33 | 34 | Next, create a partitioned IR: 35 | 36 | ```cpp 37 | chowdsp::convolution::IR_Uniform ir {}; 38 | chowdsp::convolution::create_ir (&config, &ir, my_ir.data(), my_ir.size(), fft_scratch); 39 | ``` 40 | 41 | Then we'll create a convolution "state". 42 | ```cpp 43 | chowdsp::convolution::Process_Uniform_State state {}; 44 | chowdsp::convolution::create_process_state (&config, &ir, &state); 45 | ``` 46 | 47 | Now we're ready to process some data: 48 | 49 | ```cpp 50 | chowdsp::convolution::process_samples (&config, &ir, &state, data, data, num_samples, fft_scratch); 51 | ``` 52 | 53 | Alternatively, we could use `process_samples_with_latency()` which is 54 | faster, but adds `config.block_size` samples of latency. 55 | 56 | Finally, let's clean up all our memory allocations: 57 | 58 | ```cpp 59 | chowdsp::fft::aligned_free (fft_scratch); 60 | chowdsp::convolution::destroy_process_state (&state); 61 | chowdsp::convolution::destroy_ir (&ir); 62 | chowdsp::convolution::destroy_config (&config); 63 | ``` 64 | 65 | ### Multi-Channel Processing (mono IR) 66 | 67 | Let's say that you want to convolve a stereo audio stream with a mono IR. 68 | We can use `create_multichannel_process_state()` to create a processing state 69 | with a given number of channels.
70 | 71 | ```cpp 72 | chowdsp::convolution::Process_Uniform_State stereo_state {}; 73 | chowdsp::convolution::create_multichannel_process_state (&config, &ir, &stereo_state, 2); 74 | ``` 75 | 76 | To process our audio, we'll want to use `process_samples_multichannel()` 77 | (or `process_samples_with_latency_multichannel()`). 78 | 79 | ```cpp 80 | float* channel_data[2] { 81 | left_channel_data, 82 | right_channel_data, 83 | }; 84 | chowdsp::convolution::process_samples_multichannel (&config, &ir, &stereo_state, channel_data, channel_data, num_samples, 2, fft_scratch); 85 | ``` 86 | 87 | ### Multi-Channel IRs 88 | 89 | Let's create a stereo, uniform-partitioned IR: 90 | 91 | ```cpp 92 | float* ir_data[2] { 93 | left_ir_data, 94 | right_ir_data, 95 | }; 96 | chowdsp::convolution::IR_Uniform ir {}; 97 | chowdsp::convolution::create_multichannel_ir (&config, &ir, ir_data, ir_num_samples, 2, fft_scratch); 98 | ``` 99 | 100 | Now if we call `create_process_state()`, the state will automatically be created 101 | for the same number of channels as the IR. 102 | ```cpp 103 | chowdsp::convolution::Process_Uniform_State state {}; 104 | chowdsp::convolution::create_process_state (&config, &ir, &state); 105 | ``` 106 | 107 | Then (as before), we can do our multi-channel processing: 108 | 109 | ```cpp 110 | float* channel_data[2] { 111 | left_channel_data, 112 | right_channel_data, 113 | }; 114 | chowdsp::convolution::process_samples_multichannel (&config, &ir, &state, channel_data, channel_data, num_samples, 2, fft_scratch); 115 | ``` 116 | 117 | ### Multi-Threaded Usage 118 | 119 | What should you do if you're looking to load an impulse response 120 | on some thread *other* than the audio thread, while the audio 121 | thread is still running? The basic idea is that you should: 122 | - Create an `IR_Uniform` object on your background thread.
123 | - Create one `Process_Uniform_State` object per-channel on your background thread 124 | - This step may be skipped if the new IR is the same length as the one currently on the audio thread. 125 | - Pass these objects to your audio thread (e.g. via a lock-free queue) 126 | - Pass the old IR and state objects to your background thread where they can be safely destroyed. 127 | 128 | Note that the `Config` object is thread-safe, so you may use the 129 | same config on both your audio thread and background thread (e.g. 130 | when calling `create_ir()` or `load_ir()`). However, the `fft_scratch` 131 | is **not** thread-safe, so make sure to allocate a dedicated `fft_scratch` 132 | for each thread. 133 | 134 | ## License 135 | 136 | `chowdsp_convolution` is licensed under the BSD 3-clause license. Enjoy! 137 | 138 | ### Disclaimer 139 | 140 | This implementation is *loosely* based on some code from the 141 | [JUCE](https://github.com/juce-framework/juce) library. Personally, 142 | I think that I've changed enough of the code that this library should 143 | be considered an original work, rather than a "fork" of the JUCE 144 | implementation. That said, if you want to use this library in a 145 | commercial product and you don't have a JUCE license, I'd recommend 146 | looking through both codebases and deciding for yourself. 147 | 148 | -- Jatin 149 | -------------------------------------------------------------------------------- /chowdsp_convolution.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | #include 5 | 6 | extern "C" 7 | { 8 | namespace chowdsp::convolution 9 | { 10 | #else 11 | #include 12 | #include 13 | #endif 14 | 15 | /** 16 | * Convolution configuration. 17 | * This depends only on the maximum block size. 
18 | */ 19 | struct Convolution_Config 20 | { 21 | int block_size; 22 | int fft_size; 23 | void* fft; 24 | }; 25 | #ifdef __cplusplus 26 | using Config = Convolution_Config; 27 | #endif 28 | 29 | /** State for a uniform-partitioned IR. */ 30 | struct IR_Uniform 31 | { 32 | float* segments; 33 | int num_segments; 34 | int max_num_segments; 35 | int num_channels; 36 | }; 37 | 38 | /** State for processing a uniform-partitioned IR */ 39 | struct Process_Uniform_State 40 | { 41 | struct State_Data 42 | { 43 | float* segments; 44 | float* input_data; 45 | float* output_data; 46 | float* output_temp_data; 47 | float* overlap_data; 48 | }* state_data; 49 | int max_num_segments; 50 | int current_segment; 51 | int input_data_pos; 52 | int num_channels; 53 | }; 54 | 55 | /** State for processing a multi-channel uniform-partitioned IR */ 56 | struct Process_Multichannel_Uniform_State 57 | { 58 | struct Process_Uniform_State state; 59 | int num_channels; 60 | }; 61 | 62 | /** State for a mono non-uniform-partitioned IR. */ 63 | struct IR_Non_Uniform 64 | { 65 | struct IR_Uniform head; 66 | struct IR_Uniform tail; 67 | const struct Convolution_Config* head_config; 68 | const struct Convolution_Config* tail_config; 69 | int head_size; 70 | }; 71 | 72 | /** State for processing a mono non-uniform-partitioned IR */ 73 | struct Process_Non_Uniform_State 74 | { 75 | struct Process_Uniform_State head; 76 | struct Process_Uniform_State tail; 77 | const struct Convolution_Config* head_config; 78 | const struct Convolution_Config* tail_config; 79 | }; 80 | 81 | /** Returns the required FFT size for a given block size. */ 82 | int convolution_fft_size (int max_block_size); 83 | 84 | /** The number of bytes required for `create_config()` with in-place construction. */ 85 | size_t config_bytes_required (int max_block_size); 86 | 87 | /** 88 | * Creates a convolution config for a given maximum block size. 
89 | * If no `place_data` pointer is provided, the config will allocate 90 | * its own memory, and the user must call `destroy_config()` to free 91 | * that memory. If a `place_data` pointer is provided, the config will 92 | * be constructed in-place, using the provided memory, and the user is 93 | * responsible for managing that memory themselves. The `place_data` pointer 94 | * must provide the number of bytes determined by `config_bytes_required()`, 95 | * and should be aligned to 64 bytes. 96 | */ 97 | void create_config (struct Convolution_Config*, int max_block_size, void* place_data 98 | #ifdef __cplusplus 99 | = nullptr 100 | #endif 101 | ); 102 | 103 | /** De-allocates the config's internal data. */ 104 | void destroy_config (struct Convolution_Config*); 105 | 106 | /** 107 | * Returns the number of bytes required to call `create_ir()` 108 | * or `create_zero_ir()` with `place_data`. 109 | */ 110 | size_t ir_bytes_required (int max_block_size, int ir_num_samples); 111 | 112 | /** 113 | * Creates a monophonic IR. 114 | * 115 | * The fft_scratch pointer should point to 116 | * an array of config->fft_size floats, and should 117 | * have 64-byte alignment. 118 | * 119 | * If `place_data` is provided, the IR will be constructed in-place. 120 | * Otherwise, memory will be allocated, and the user must call `destroy_ir()` 121 | * to free that memory. `place_data` should be aligned to 64 bytes. 122 | */ 123 | void create_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* ir, int ir_num_samples, float* fft_scratch, void* place_data 124 | #ifdef __cplusplus 125 | = nullptr 126 | #endif 127 | ); 128 | 129 | /** 130 | * Creates a mono IR of a given size. 131 | * The IR will be filled with zeros. 132 | * 133 | * See the requirements for `place_data` for `create_ir()`. 
134 | */ 135 | void create_zero_ir (const struct Convolution_Config*, struct IR_Uniform*, int ir_num_samples, void* place_data 136 | #ifdef __cplusplus 137 | = nullptr 138 | #endif 139 | ); 140 | 141 | /** 142 | * Loads IR data. 143 | * `ir_num_samples` must be less than or equal to the number of samples 144 | * the IR was created to expect. 145 | */ 146 | void load_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* ir, int ir_num_samples, float* fft_scratch); 147 | 148 | /** 149 | * Returns the number of bytes required to call `create_multichannel_ir()` 150 | * or `create_zero_multichannel_ir()` with `place_data`. 151 | */ 152 | size_t multichannel_ir_bytes_required (int max_block_size, int ir_num_samples, int num_channels); 153 | 154 | /** 155 | * Creates a multi-channel uniform-partitioned IR. 156 | * 157 | * The fft_scratch pointer should point to 158 | * an array of config->fft_size floats, and should 159 | * have 64-byte alignment. 160 | * 161 | * See the requirements for `place_data` for `create_ir()`. 162 | */ 163 | void create_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* const* ir, int ir_num_samples, int num_channels, float* fft_scratch, void* place_data 164 | #ifdef __cplusplus 165 | = nullptr 166 | #endif 167 | ); 168 | 169 | /** 170 | * Creates a multi-channel IR of a given size. 171 | * The IR will be filled with zeros. 172 | * 173 | * See the requirements for `place_data` for `create_ir()`. 174 | */ 175 | void create_zero_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, int ir_num_samples, int num_channels, void* place_data 176 | #ifdef __cplusplus 177 | = nullptr 178 | #endif 179 | ); 180 | 181 | /** 182 | * Loads multi-channel IR data. 183 | * `ir_num_samples` must be less than or equal to the number of samples 184 | * the IR was created to expect.
185 | */ 186 | void load_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* const* ir, int ir_num_samples, int num_channels, float* fft_scratch); 187 | 188 | /** De-allocates the IR's internal data. */ 189 | void destroy_ir (struct IR_Uniform*); 190 | 191 | /** 192 | * Returns the number of bytes required to call `create_process_state()` 193 | * with `place_data`. 194 | */ 195 | size_t process_state_bytes_required (int max_block_size, int ir_num_samples); 196 | 197 | /** 198 | * Creates a process state object for a given IR. 199 | * The process state will be created to process the same number of channels as the IR contains. 200 | * 201 | * If `place_data` is provided, the state will be constructed in-place. 202 | * Otherwise, memory will be allocated, and the user must call `destroy_process_state()` 203 | * to free that memory. `place_data` should be aligned to 64 bytes. 204 | */ 205 | void create_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*, void* place_data 206 | #ifdef __cplusplus 207 | = nullptr 208 | #endif 209 | ); 210 | 211 | /** 212 | * Returns the number of bytes required to call 213 | * `create_multichannel_process_state()` with `place_data`. 214 | */ 215 | size_t multichannel_process_state_bytes_required (int max_block_size, int ir_num_samples, int num_channels); 216 | 217 | /** 218 | * Creates a process state object for a given IR, with a specific number of channels. 219 | * This is useful for convolving a monophonic IR with multiple channels. 220 | * 221 | * See the requirements for `place_data` for `create_process_state()`. 222 | */ 223 | void create_multichannel_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*, int num_channels, void* place_data 224 | #ifdef __cplusplus 225 | = nullptr 226 | #endif 227 | ); 228 | 229 | /** Zeros the process state. 
*/ 230 | void reset_process_state (const struct Convolution_Config*, struct Process_Uniform_State*); 231 | 232 | /** Zeros the process state's segments. NOTE(review): presumably limited to the segments used by the given IR -- confirm against the implementation. */ 233 | void reset_process_state_segments (const struct Convolution_Config*, struct Process_Uniform_State*, const struct IR_Uniform*); 234 | 235 | /** De-allocates the state's internal data. */ 236 | void destroy_process_state (struct Process_Uniform_State*); 237 | 238 | /** 239 | * Creates a monophonic non-uniform IR. 240 | * 241 | * The scratch pointer should point to an allocated block 242 | * of at least get_required_nuir_scratch_bytes() bytes, and should 243 | * have 64-byte alignment. 244 | */ 245 | void create_nuir (struct IR_Non_Uniform*, const float* ir, int ir_num_samples, float* scratch); 246 | 247 | /** 248 | * Creates a mono non-uniform IR of a given size. 249 | * The IR will be filled with zeros. 250 | */ 251 | void create_zero_nuir (struct IR_Non_Uniform*, int ir_num_samples); 252 | 253 | /** Returns the scratch size (in bytes) required for this non-uniform IR. */ 254 | int get_required_nuir_scratch_bytes (const struct IR_Non_Uniform*); 255 | 256 | /** 257 | * Loads IR data. 258 | * `ir_num_samples` must be less than or equal to the number of samples 259 | * the IR was created to expect. 260 | */ 261 | void load_nuir (struct IR_Non_Uniform*, const float* ir, int ir_num_samples, float* scratch); 262 | 263 | /** De-allocates the IR's internal data. */ 264 | void destroy_nuir (struct IR_Non_Uniform*); 265 | 266 | /** Creates a mono process state object for a given IR. */ 267 | void create_nuir_process_state (const struct IR_Non_Uniform*, struct Process_Non_Uniform_State*); 268 | 269 | /** Zeros the process state. */ 270 | void reset_nuir_process_state (struct Process_Non_Uniform_State*); 271 | 272 | /** De-allocates the state's internal data. */ 273 | void destroy_nuir_process_state (struct Process_Non_Uniform_State*); 274 | 275 | /** 276 | * Performs convolution processing for a given IR and state.
277 | * 278 | * The fft_scratch pointer should point to 279 | * an array of config->fft_size floats, and should 280 | * have 64-byte alignment. 281 | */ 282 | void process_samples (const struct Convolution_Config*, 283 | const struct IR_Uniform*, 284 | struct Process_Uniform_State*, 285 | const float* in, 286 | float* out, 287 | int N, 288 | float* fft_scratch); 289 | 290 | /** 291 | * Similar to process_samples(), but with an added 292 | * config->block_size samples of latency. In exchange, 293 | * the convolution processing will be a little bit 294 | * faster, especially when processing with odd block 295 | * sizes. 296 | */ 297 | void process_samples_with_latency (const struct Convolution_Config*, 298 | const struct IR_Uniform*, 299 | struct Process_Uniform_State*, 300 | const float* in, 301 | float* out, 302 | int N, 303 | float* fft_scratch); 304 | 305 | /** 306 | * Performs convolution processing for a given multi-channel IR and state. 307 | * 308 | * The fft_scratch pointer should point to 309 | * an array of config->fft_size floats, and should 310 | * have 64-byte alignment. 311 | */ 312 | void process_samples_multichannel (const struct Convolution_Config*, 313 | const struct IR_Uniform*, 314 | struct Process_Uniform_State*, 315 | const float* const* in, 316 | float* const* out, 317 | int N, 318 | int num_channels, 319 | float* fft_scratch); 320 | 321 | /** 322 | * Similar to process_samples_multichannel(), but with an added 323 | * config->block_size samples of latency. In exchange, 324 | * the convolution processing will be a little bit 325 | * faster, especially when processing with odd block 326 | * sizes. 
327 | */ 328 | void process_samples_with_latency_multichannel (const struct Convolution_Config*, 329 | const struct IR_Uniform*, 330 | struct Process_Uniform_State*, 331 | const float* const* in, 332 | float* const* out, 333 | int N, 334 | int num_channels, 335 | float* fft_scratch); 336 | 337 | /** 338 | * Performs convolution processing for a given non-uniform IR and state. 339 | * 340 | * The scratch pointer should point to an allocated block 341 | * of at least get_required_nuir_scratch_bytes(), and should 342 | * have 64-byte alignment. 343 | */ 344 | void process_samples_non_uniform (const struct IR_Non_Uniform*, 345 | struct Process_Non_Uniform_State*, 346 | const float* in, 347 | float* out, 348 | int N, 349 | float* scratch); 350 | 351 | #ifdef __cplusplus 352 | } // namespace chowdsp::convolution 353 | } // extern "C" 354 | #endif 355 | -------------------------------------------------------------------------------- /test/chowdsp_convolution_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include // NOLINT 13 | template class chowdsp::Buffer; 14 | using Convolution_Internal_Buffer = chowdsp::Buffer; 15 | 16 | struct ConvolutionEngine 17 | { 18 | ConvolutionEngine (const float* samples, 19 | size_t numSamples, 20 | size_t maxBlockSize) 21 | : blockSize ((size_t) juce::nextPowerOfTwo ((int) maxBlockSize)), 22 | fftSize (blockSize > 128 ? 2 * blockSize : 4 * blockSize), 23 | fftObject (std::make_unique (juce::roundToInt (std::log2 (fftSize)))), 24 | numSegments (numSamples / (fftSize - blockSize) + 1u), 25 | numInputSegments ((blockSize > 128 ? 
numSegments : 3 * numSegments)), 26 | bufferInput (1, static_cast (fftSize)), 27 | bufferOutput (1, static_cast (fftSize * 2)), 28 | bufferTempOutput (1, static_cast (fftSize * 2)), 29 | bufferOverlap (1, static_cast (fftSize)) 30 | { 31 | bufferOutput.clear(); 32 | 33 | auto updateSegmentsIfNecessary = [this] (size_t numSegmentsToUpdate, 34 | std::vector& segments) 35 | { 36 | if (numSegmentsToUpdate == 0 37 | || numSegmentsToUpdate != (size_t) segments.size() 38 | || (size_t) segments[0].getNumSamples() != fftSize * 2) 39 | { 40 | segments.clear(); 41 | 42 | for (size_t i = 0; i < numSegmentsToUpdate; ++i) 43 | segments.push_back ({ 1, static_cast (fftSize * 2) }); // NOLINT 44 | } 45 | }; 46 | 47 | updateSegmentsIfNecessary (numInputSegments, buffersInputSegments); 48 | updateSegmentsIfNecessary (numSegments, buffersImpulseSegments); 49 | 50 | auto FFTTempObject = std::make_unique (juce::roundToInt (std::log2 (fftSize))); 51 | size_t currentPtr = 0; 52 | 53 | for (auto& buf : buffersImpulseSegments) 54 | { 55 | buf.clear(); 56 | 57 | auto* impulseResponse = buf.getWritePointer (0); 58 | 59 | if (&buf == &buffersImpulseSegments.front()) 60 | impulseResponse[0] = 1.0f; 61 | 62 | juce::FloatVectorOperations::copy (impulseResponse, 63 | samples + currentPtr, 64 | static_cast (juce::jmin (fftSize - blockSize, numSamples - currentPtr))); 65 | 66 | FFTTempObject->performRealOnlyForwardTransform (impulseResponse); 67 | prepareForConvolution (impulseResponse); 68 | 69 | currentPtr += (fftSize - blockSize); 70 | } 71 | 72 | reset(); 73 | } 74 | 75 | void reset() 76 | { 77 | bufferInput.clear(); 78 | bufferOverlap.clear(); 79 | bufferTempOutput.clear(); 80 | bufferOutput.clear(); 81 | 82 | for (auto& buf : buffersInputSegments) 83 | buf.clear(); 84 | 85 | currentSegment = 0; 86 | inputDataPos = 0; 87 | } 88 | 89 | void processSamples (const float* input, float* output, size_t numSamples) 90 | { 91 | // Overlap-add, zero latency convolution algorithm with uniform 
partitioning 92 | size_t numSamplesProcessed = 0; 93 | 94 | auto indexStep = numInputSegments / numSegments; 95 | 96 | auto* inputData = bufferInput.getWritePointer (0); 97 | auto* outputTempData = bufferTempOutput.getWritePointer (0); 98 | auto* outputData = bufferOutput.getWritePointer (0); 99 | auto* overlapData = bufferOverlap.getWritePointer (0); 100 | 101 | while (numSamplesProcessed < numSamples) 102 | { 103 | const bool inputDataWasEmpty = (inputDataPos == 0); 104 | auto numSamplesToProcess = juce::jmin (numSamples - numSamplesProcessed, blockSize - inputDataPos); 105 | 106 | juce::FloatVectorOperations::copy (inputData + inputDataPos, input + numSamplesProcessed, static_cast (numSamplesToProcess)); 107 | 108 | auto* inputSegmentData = buffersInputSegments[currentSegment].getWritePointer (0); 109 | juce::FloatVectorOperations::copy (inputSegmentData, inputData, static_cast (fftSize)); 110 | 111 | fftObject->performRealOnlyForwardTransform (inputSegmentData); 112 | prepareForConvolution (inputSegmentData); 113 | 114 | // Complex multiplication 115 | if (inputDataWasEmpty) 116 | { 117 | juce::FloatVectorOperations::fill (outputTempData, 0, static_cast (fftSize + 1)); 118 | 119 | auto index = currentSegment; 120 | 121 | for (size_t i = 1; i < numSegments; ++i) 122 | { 123 | index += indexStep; 124 | 125 | if (index >= numInputSegments) 126 | index -= numInputSegments; 127 | 128 | convolutionProcessingAndAccumulate (buffersInputSegments[index].getWritePointer (0), 129 | buffersImpulseSegments[i].getWritePointer (0), 130 | outputTempData); 131 | } 132 | } 133 | 134 | juce::FloatVectorOperations::copy (outputData, outputTempData, static_cast (fftSize + 1)); 135 | 136 | convolutionProcessingAndAccumulate (inputSegmentData, 137 | buffersImpulseSegments.front().getWritePointer (0), 138 | outputData); 139 | 140 | updateSymmetricFrequencyDomainData (outputData); 141 | fftObject->performRealOnlyInverseTransform (outputData); 142 | 143 | // Add overlap 144 | 
juce::FloatVectorOperations::add (&output[numSamplesProcessed], &outputData[inputDataPos], &overlapData[inputDataPos], (int) numSamplesToProcess); 145 | 146 | // Input buffer full => Next block 147 | inputDataPos += numSamplesToProcess; 148 | 149 | if (inputDataPos == blockSize) 150 | { 151 | // Input buffer is empty again now 152 | juce::FloatVectorOperations::fill (inputData, 0.0f, static_cast (fftSize)); 153 | 154 | inputDataPos = 0; 155 | 156 | // Extra step for segSize > blockSize 157 | juce::FloatVectorOperations::add (&(outputData[blockSize]), &(overlapData[blockSize]), static_cast (fftSize - 2 * blockSize)); 158 | 159 | // Save the overlap 160 | juce::FloatVectorOperations::copy (overlapData, &(outputData[blockSize]), static_cast (fftSize - blockSize)); 161 | 162 | currentSegment = (currentSegment > 0) ? (currentSegment - 1) : (numInputSegments - 1); 163 | } 164 | 165 | numSamplesProcessed += numSamplesToProcess; 166 | } 167 | } 168 | 169 | void processSamplesWithAddedLatency (const float* input, float* output, size_t numSamples) 170 | { 171 | // Overlap-add, zero latency convolution algorithm with uniform partitioning 172 | size_t numSamplesProcessed = 0; 173 | 174 | auto indexStep = numInputSegments / numSegments; 175 | 176 | auto* inputData = bufferInput.getWritePointer (0); 177 | auto* outputTempData = bufferTempOutput.getWritePointer (0); 178 | auto* outputData = bufferOutput.getWritePointer (0); 179 | auto* overlapData = bufferOverlap.getWritePointer (0); 180 | 181 | while (numSamplesProcessed < numSamples) 182 | { 183 | auto numSamplesToProcess = juce::jmin (numSamples - numSamplesProcessed, blockSize - inputDataPos); 184 | 185 | juce::FloatVectorOperations::copy (inputData + inputDataPos, input + numSamplesProcessed, static_cast (numSamplesToProcess)); 186 | 187 | juce::FloatVectorOperations::copy (output + numSamplesProcessed, outputData + inputDataPos, static_cast (numSamplesToProcess)); 188 | 189 | numSamplesProcessed += numSamplesToProcess; 190 
| inputDataPos += numSamplesToProcess; 191 | 192 | // processing itself when needed (with latency) 193 | if (inputDataPos == blockSize) 194 | { 195 | // Copy input data in input segment 196 | auto* inputSegmentData = buffersInputSegments[currentSegment].getWritePointer (0); 197 | juce::FloatVectorOperations::copy (inputSegmentData, inputData, static_cast (fftSize)); 198 | 199 | fftObject->performRealOnlyForwardTransform (inputSegmentData); 200 | prepareForConvolution (inputSegmentData); 201 | 202 | // Complex multiplication 203 | juce::FloatVectorOperations::fill (outputTempData, 0, static_cast (fftSize + 1)); 204 | 205 | auto index = currentSegment; 206 | 207 | for (size_t i = 1; i < numSegments; ++i) 208 | { 209 | index += indexStep; 210 | 211 | if (index >= numInputSegments) 212 | index -= numInputSegments; 213 | 214 | convolutionProcessingAndAccumulate (buffersInputSegments[index].getWritePointer (0), 215 | buffersImpulseSegments[i].getWritePointer (0), 216 | outputTempData); 217 | } 218 | 219 | juce::FloatVectorOperations::copy (outputData, outputTempData, static_cast (fftSize + 1)); 220 | 221 | convolutionProcessingAndAccumulate (inputSegmentData, 222 | buffersImpulseSegments.front().getWritePointer (0), 223 | outputData); 224 | 225 | updateSymmetricFrequencyDomainData (outputData); 226 | fftObject->performRealOnlyInverseTransform (outputData); 227 | 228 | // Add overlap 229 | juce::FloatVectorOperations::add (outputData, overlapData, static_cast (blockSize)); 230 | 231 | // Input buffer is empty again now 232 | juce::FloatVectorOperations::fill (inputData, 0.0f, static_cast (fftSize)); 233 | 234 | // Extra step for segSize > blockSize 235 | juce::FloatVectorOperations::add (&(outputData[blockSize]), &(overlapData[blockSize]), static_cast (fftSize - 2 * blockSize)); 236 | 237 | // Save the overlap 238 | juce::FloatVectorOperations::copy (overlapData, &(outputData[blockSize]), static_cast (fftSize - blockSize)); 239 | 240 | currentSegment = (currentSegment > 
0) ? (currentSegment - 1) : (numInputSegments - 1); 241 | 242 | inputDataPos = 0; 243 | } 244 | } 245 | } 246 | 247 | // After each FFT, this function is called to allow convolution to be performed with only 4 SIMD functions calls. 248 | void prepareForConvolution (float* samples) noexcept 249 | { 250 | auto FFTSizeDiv2 = fftSize / 2; 251 | 252 | for (size_t i = 0; i < FFTSizeDiv2; i++) 253 | samples[i] = samples[i << 1]; 254 | 255 | samples[FFTSizeDiv2] = 0; 256 | 257 | for (size_t i = 1; i < FFTSizeDiv2; i++) 258 | samples[i + FFTSizeDiv2] = -samples[((fftSize - i) << 1) + 1]; 259 | } 260 | 261 | // Does the convolution operation itself only on half of the frequency domain samples. 262 | void convolutionProcessingAndAccumulate (const float* input, const float* impulse, float* output) 263 | { 264 | auto FFTSizeDiv2 = fftSize / 2; 265 | jassert (juce::isPowerOfTwo (FFTSizeDiv2) && FFTSizeDiv2 > 8); 266 | jassert (juce::snapPointerToAlignment (input, (size_t) 32) == input); 267 | jassert (juce::snapPointerToAlignment (impulse, (size_t) 32) == impulse); 268 | jassert (juce::snapPointerToAlignment (output, (size_t) 32) == output); 269 | 270 | { 271 | juce::FloatVectorOperations::addWithMultiply (output, input, impulse, static_cast (FFTSizeDiv2)); 272 | juce::FloatVectorOperations::subtractWithMultiply (output, &(input[FFTSizeDiv2]), &(impulse[FFTSizeDiv2]), static_cast (FFTSizeDiv2)); 273 | 274 | juce::FloatVectorOperations::addWithMultiply (&(output[FFTSizeDiv2]), input, &(impulse[FFTSizeDiv2]), static_cast (FFTSizeDiv2)); 275 | juce::FloatVectorOperations::addWithMultiply (&(output[FFTSizeDiv2]), &(input[FFTSizeDiv2]), impulse, static_cast (FFTSizeDiv2)); 276 | } 277 | 278 | output[fftSize] += input[fftSize] * impulse[fftSize]; 279 | } 280 | 281 | // Undoes the re-organization of samples from the function prepareForConvolution. 
282 | // Then takes the conjugate of the frequency domain first half of samples to fill the 283 | // second half, so that the inverse transform will return real samples in the time domain. 284 | void updateSymmetricFrequencyDomainData (float* samples) noexcept 285 | { 286 | auto FFTSizeDiv2 = fftSize / 2; 287 | 288 | for (size_t i = 1; i < FFTSizeDiv2; i++) 289 | { 290 | samples[(fftSize - i) << 1] = samples[i]; 291 | samples[((fftSize - i) << 1) + 1] = -samples[FFTSizeDiv2 + i]; 292 | } 293 | 294 | samples[1] = 0.f; 295 | 296 | for (size_t i = 1; i < FFTSizeDiv2; i++) 297 | { 298 | samples[i << 1] = samples[(fftSize - i) << 1]; 299 | samples[(i << 1) + 1] = -samples[((fftSize - i) << 1) + 1]; 300 | } 301 | } 302 | 303 | //============================================================================== 304 | const size_t blockSize; 305 | const size_t fftSize; 306 | const std::unique_ptr fftObject; 307 | const size_t numSegments; 308 | const size_t numInputSegments; 309 | size_t currentSegment = 0, inputDataPos = 0; 310 | 311 | Convolution_Internal_Buffer bufferInput, bufferOutput, bufferTempOutput, bufferOverlap; 312 | std::vector buffersInputSegments, buffersImpulseSegments; 313 | }; 314 | 315 | std::vector generate (size_t N, std::mt19937& rng) 316 | { 317 | std::vector data {}; 318 | data.resize (N); 319 | 320 | std::uniform_real_distribution dist { -1.0f, 1.0f }; 321 | for (auto& x : data) 322 | x = dist (rng); 323 | 324 | return data; 325 | } 326 | 327 | static bool test_convolution (int ir_length_samples, int block_size, int num_blocks, bool latency, bool preallocate) 328 | { 329 | std::cout << "Running test with IR length: " << ir_length_samples 330 | << ", block size: " << block_size 331 | << ", latency: " << (latency ? 
"ON" : "OFF") << '\n'; 332 | 333 | std::mt19937 rng { 0x12345 }; 334 | auto ir = generate (ir_length_samples, rng); 335 | const auto input = generate (block_size * num_blocks, rng); 336 | std::vector ref_output (input.size()); 337 | 338 | ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size }; 339 | auto start = std::chrono::high_resolution_clock::now(); 340 | for (int i = 0; i < num_blocks; ++i) 341 | { 342 | const auto* block_in = input.data() + (i * block_size); 343 | auto* block_out_ref = ref_output.data() + (i * block_size); 344 | if (latency) 345 | reference_engine.processSamplesWithAddedLatency (block_in, block_out_ref, block_size); 346 | else 347 | reference_engine.processSamples (block_in, block_out_ref, block_size); 348 | } 349 | auto duration = std::chrono::high_resolution_clock::now() - start; 350 | auto ref_duration_seconds = std::chrono::duration (duration).count(); 351 | std::cout << " juce::dsp::Convolution: " << ref_duration_seconds << " seconds" << std::endl; 352 | 353 | std::vector test_output (input.size()); 354 | 355 | const auto fft_size = chowdsp::convolution::convolution_fft_size (block_size); 356 | const auto config_bytes = chowdsp::convolution::config_bytes_required (block_size); 357 | const auto ir_bytes = chowdsp::convolution::ir_bytes_required (block_size, (int) ir.size()); 358 | const auto state_bytes = chowdsp::convolution::process_state_bytes_required (block_size, (int) ir.size()); 359 | size_t bytes_needed = config_bytes // config 360 | + fft_size * sizeof (float) // fft scratch 361 | + ir_bytes // ir 362 | + state_bytes; // state 363 | chowdsp::ArenaAllocator<> arena { bytes_needed + 64 }; 364 | 365 | chowdsp::convolution::Config conv_config {}; 366 | chowdsp::convolution::destroy_config (&conv_config); // destroying an empty config should be okay... 367 | chowdsp::convolution::create_config (&conv_config, block_size, preallocate ? 
arena.allocate_bytes (config_bytes, 64) : nullptr); 368 | auto* fft_scratch = arena.allocate (conv_config.fft_size, 64); 369 | 370 | chowdsp::convolution::IR_Uniform conv_ir {}; 371 | chowdsp::convolution::destroy_ir (&conv_ir); // destroying an empty IR should be okay... 372 | chowdsp::convolution::create_ir (&conv_config, 373 | &conv_ir, 374 | ir.data(), 375 | (int) ir.size(), 376 | fft_scratch, 377 | preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr); 378 | 379 | chowdsp::convolution::Process_Uniform_State conv_state {}; 380 | chowdsp::convolution::destroy_process_state (&conv_state); // destroying an empty state should be okay... 381 | chowdsp::convolution::create_process_state (&conv_config, &conv_ir, &conv_state, preallocate ? arena.allocate_bytes (state_bytes, 64) : nullptr); 382 | 383 | start = std::chrono::high_resolution_clock::now(); 384 | for (int i = 0; i < num_blocks; ++i) 385 | { 386 | const auto* block_in = input.data() + (i * block_size); 387 | auto* block_out_test = test_output.data() + (i * block_size); 388 | if (latency) 389 | { 390 | chowdsp::convolution::process_samples_with_latency ( 391 | &conv_config, 392 | &conv_ir, 393 | &conv_state, 394 | block_in, 395 | block_out_test, 396 | block_size, 397 | fft_scratch); 398 | } 399 | else 400 | { 401 | chowdsp::convolution::process_samples (&conv_config, 402 | &conv_ir, 403 | &conv_state, 404 | block_in, 405 | block_out_test, 406 | block_size, 407 | fft_scratch); 408 | } 409 | } 410 | duration = std::chrono::high_resolution_clock::now() - start; 411 | auto test_duration_seconds = std::chrono::duration (duration).count(); 412 | std::cout << " chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl; 413 | std::cout << " chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n"; 414 | 415 | if (! 
preallocate) 416 | { 417 | chowdsp::convolution::destroy_process_state (&conv_state); 418 | chowdsp::convolution::destroy_ir (&conv_ir); 419 | chowdsp::convolution::destroy_config (&conv_config); 420 | } 421 | 422 | float error_accum {}; 423 | float max_error {}; 424 | for (int i = 0; i < test_output.size(); ++i) 425 | { 426 | const auto ref = ref_output[i]; 427 | const auto test = test_output[i]; 428 | const auto err = ref - test; 429 | max_error = std::max (max_error, std::abs (err)); 430 | error_accum += err * err; 431 | } 432 | const auto mse = error_accum / static_cast (test_output.size()); 433 | std::cout << " Max error: " << max_error << '\n'; 434 | std::cout << " Mean-squared error: " << mse << '\n'; 435 | 436 | return max_error < 5.0e-4f && mse < 1.0e-9f; 437 | } 438 | 439 | static bool test_convolution_multi_channel (int ir_length_samples, 440 | int block_size, 441 | int num_blocks, 442 | bool latency, 443 | int num_channels, 444 | bool mono_ir, 445 | bool preallocate) 446 | { 447 | std::cout << "Running test with IR length: " << ir_length_samples 448 | << ", block size: " << block_size 449 | << ", latency: " << (latency ? "ON" : "OFF") 450 | << ", # channels: " << num_channels 451 | << ", mono IR: " << (mono_ir ? 
"ON" : "OFF") << '\n'; 452 | 453 | std::mt19937 rng { 0x12345 }; 454 | auto ir = generate (ir_length_samples, rng); 455 | const auto input = generate (block_size * num_blocks, rng); 456 | std::vector ref_output (input.size()); 457 | 458 | ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size }; 459 | auto start = std::chrono::high_resolution_clock::now(); 460 | for (int i = 0; i < num_blocks; ++i) 461 | { 462 | const auto* block_in = input.data() + (i * block_size); 463 | auto* block_out_ref = ref_output.data() + (i * block_size); 464 | if (latency) 465 | reference_engine.processSamplesWithAddedLatency (block_in, block_out_ref, block_size); 466 | else 467 | reference_engine.processSamples (block_in, block_out_ref, block_size); 468 | } 469 | auto duration = std::chrono::high_resolution_clock::now() - start; 470 | auto ref_duration_seconds = std::chrono::duration (duration).count(); 471 | std::cout << " juce::dsp::Convolution: " << ref_duration_seconds << " seconds" << std::endl; 472 | 473 | std::vector multi_channel_ir {}; 474 | for (int ch = 0; ch < num_channels; ++ch) 475 | multi_channel_ir.push_back (ir.data()); 476 | 477 | std::vector test_output_flat (input.size() * num_channels); 478 | std::vector test_input { (size_t) num_channels, nullptr }; 479 | std::vector test_output { (size_t) num_channels, nullptr }; 480 | 481 | const auto fft_size = chowdsp::convolution::convolution_fft_size (block_size); 482 | const auto config_bytes = chowdsp::convolution::config_bytes_required (block_size); 483 | const auto ir_bytes = mono_ir ? 
chowdsp::convolution::ir_bytes_required (block_size, (int) ir.size()) 484 | : chowdsp::convolution::multichannel_ir_bytes_required (block_size, (int) ir.size(), num_channels); 485 | const auto state_bytes = chowdsp::convolution::multichannel_process_state_bytes_required (block_size, (int) ir.size(), num_channels); 486 | size_t bytes_needed = config_bytes // config 487 | + fft_size * sizeof (float) // fft scratch 488 | + ir_bytes // ir 489 | + state_bytes; // state 490 | chowdsp::ArenaAllocator<> arena { bytes_needed + 64 }; 491 | 492 | chowdsp::convolution::Config conv_config {}; 493 | chowdsp::convolution::create_config (&conv_config, block_size, preallocate ? arena.allocate_bytes (config_bytes, 64) : nullptr); 494 | auto* fft_scratch = arena.allocate (conv_config.fft_size, 64); 495 | 496 | chowdsp::convolution::IR_Uniform conv_ir {}; 497 | if (mono_ir) 498 | { 499 | chowdsp::convolution::create_ir (&conv_config, 500 | &conv_ir, 501 | ir.data(), 502 | ir_length_samples, 503 | fft_scratch, 504 | preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr); 505 | } 506 | else 507 | { 508 | chowdsp::convolution::create_multichannel_ir (&conv_config, 509 | &conv_ir, 510 | multi_channel_ir.data(), 511 | ir_length_samples, 512 | num_channels, 513 | fft_scratch, 514 | preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr); 515 | } 516 | 517 | chowdsp::convolution::Process_Uniform_State conv_state {}; 518 | chowdsp::convolution::create_multichannel_process_state (&conv_config, &conv_ir, &conv_state, num_channels, preallocate ? 
arena.allocate_bytes (state_bytes, 64) : nullptr); 519 | 520 | start = std::chrono::high_resolution_clock::now(); 521 | for (int i = 0; i < num_blocks; ++i) 522 | { 523 | for (int ch = 0; ch < num_channels; ++ch) 524 | { 525 | test_input[ch] = input.data() + (i * block_size); 526 | test_output[ch] = test_output_flat.data() + (input.size() * ch) + (i * block_size); 527 | } 528 | 529 | if (latency) 530 | { 531 | chowdsp::convolution::process_samples_with_latency_multichannel ( 532 | &conv_config, 533 | &conv_ir, 534 | &conv_state, 535 | test_input.data(), 536 | test_output.data(), 537 | block_size, 538 | num_channels, 539 | fft_scratch); 540 | } 541 | else 542 | { 543 | chowdsp::convolution::process_samples_multichannel (&conv_config, 544 | &conv_ir, 545 | &conv_state, 546 | test_input.data(), 547 | test_output.data(), 548 | block_size, 549 | num_channels, 550 | fft_scratch); 551 | } 552 | } 553 | duration = std::chrono::high_resolution_clock::now() - start; 554 | auto test_duration_seconds = std::chrono::duration (duration).count(); 555 | std::cout << " chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl; 556 | std::cout << " chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n"; 557 | 558 | if (! 
preallocate) 559 | { 560 | chowdsp::convolution::destroy_ir (&conv_ir); 561 | chowdsp::convolution::destroy_process_state (&conv_state); 562 | chowdsp::convolution::destroy_config (&conv_config); 563 | } 564 | 565 | float error_accum {}; 566 | float max_error {}; 567 | for (int ch = 0; ch < num_channels; ++ch) 568 | { 569 | for (int i = 0; i < input.size(); ++i) 570 | { 571 | const auto ref = ref_output[i]; 572 | const auto test = test_output_flat[ch * input.size() + i]; 573 | const auto err = ref - test; 574 | max_error = std::max (max_error, std::abs (err)); 575 | error_accum += err * err; 576 | } 577 | } 578 | const auto mse = error_accum / static_cast (test_output_flat.size()); 579 | std::cout << " Max error: " << max_error << '\n'; 580 | std::cout << " Mean-squared error: " << mse << '\n'; 581 | 582 | return max_error < 5.0e-4f && mse < 1.0e-9f; 583 | } 584 | 585 | static bool test_convolution_non_uniform (int ir_length_samples, int block_size, int num_blocks, int head_size) 586 | { 587 | std::cout << "Running test with IR length: " << ir_length_samples 588 | << ", block size: " << block_size 589 | << ", head size: " << head_size << '\n'; 590 | 591 | std::mt19937 rng { 0x12345 }; 592 | auto ir = generate (ir_length_samples, rng); 593 | const auto input = generate (block_size * num_blocks, rng); 594 | std::vector ref_output (input.size()); 595 | 596 | ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size }; 597 | auto start = std::chrono::high_resolution_clock::now(); 598 | for (int i = 0; i < num_blocks; ++i) 599 | { 600 | const auto* block_in = input.data() + (i * block_size); 601 | auto* block_out_ref = ref_output.data() + (i * block_size); 602 | reference_engine.processSamples (block_in, block_out_ref, block_size); 603 | } 604 | auto duration = std::chrono::high_resolution_clock::now() - start; 605 | auto ref_duration_seconds = std::chrono::duration (duration).count(); 606 | std::cout << " juce::dsp::Convolution: " << 
ref_duration_seconds << " seconds" << std::endl; 607 | 608 | std::vector test_output (input.size()); 609 | chowdsp::convolution::Config head_config {}; 610 | chowdsp::convolution::create_config (&head_config, block_size); 611 | chowdsp::convolution::Config tail_config {}; 612 | chowdsp::convolution::create_config (&tail_config, head_size); 613 | 614 | chowdsp::convolution::IR_Non_Uniform conv_ir {}; 615 | chowdsp::convolution::destroy_nuir (&conv_ir); 616 | conv_ir.head_config = &head_config; 617 | conv_ir.tail_config = &tail_config; 618 | conv_ir.head_size = head_size; 619 | auto* scratch = (float*) chowdsp::fft::aligned_malloc (chowdsp::convolution::get_required_nuir_scratch_bytes (&conv_ir)); 620 | 621 | chowdsp::convolution::create_nuir (&conv_ir, 622 | ir.data(), 623 | (int) ir.size(), 624 | scratch); 625 | 626 | chowdsp::convolution::Process_Non_Uniform_State conv_state {}; 627 | chowdsp::convolution::destroy_nuir_process_state (&conv_state); // destroying an empty state should be okay... 
628 | chowdsp::convolution::create_nuir_process_state (&conv_ir, &conv_state); 629 | 630 | start = std::chrono::high_resolution_clock::now(); 631 | for (int i = 0; i < num_blocks; ++i) 632 | { 633 | const auto* block_in = input.data() + (i * block_size); 634 | auto* block_out_test = test_output.data() + (i * block_size); 635 | chowdsp::convolution::process_samples_non_uniform (&conv_ir, 636 | &conv_state, 637 | block_in, 638 | block_out_test, 639 | block_size, 640 | scratch); 641 | } 642 | duration = std::chrono::high_resolution_clock::now() - start; 643 | auto test_duration_seconds = std::chrono::duration (duration).count(); 644 | std::cout << " chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl; 645 | std::cout << " chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n"; 646 | 647 | chowdsp::fft::aligned_free (scratch); 648 | chowdsp::convolution::destroy_nuir (&conv_ir); 649 | chowdsp::convolution::destroy_nuir_process_state (&conv_state); 650 | chowdsp::convolution::destroy_config (&head_config); 651 | chowdsp::convolution::destroy_config (&tail_config); 652 | 653 | float error_accum {}; 654 | float max_error {}; 655 | for (int i = 0; i < test_output.size(); ++i) 656 | { 657 | const auto ref = ref_output[i]; 658 | const auto test = test_output[i]; 659 | const auto err = ref - test; 660 | max_error = std::max (max_error, std::abs (err)); 661 | error_accum += err * err; 662 | } 663 | const auto mse = error_accum / static_cast (test_output.size()); 664 | std::cout << " Max error: " << max_error << '\n'; 665 | std::cout << " Mean-squared error: " << mse << '\n'; 666 | 667 | return max_error < 5.0e-4f && mse < 1.0e-9f; 668 | } 669 | 670 | int main() 671 | { 672 | auto success = true; 673 | for (bool preallocate : { false, true }) 674 | { 675 | for (bool latency : { false, true }) 676 | { 677 | success &= test_convolution (6000, 2048, 4, latency, preallocate); 678 | success &= test_convolution (6000, 512, 20, latency, 
preallocate); 679 | success &= test_convolution (6000, 511, 20, latency, preallocate); 680 | success &= test_convolution (6000, 32, 400, latency, preallocate); 681 | success &= test_convolution (100, 2048, 2, latency, preallocate); 682 | success &= test_convolution (100, 512, 4, latency, preallocate); 683 | success &= test_convolution (100, 511, 4, latency, preallocate); 684 | success &= test_convolution (100, 32, 10, latency, preallocate); 685 | 686 | success &= test_convolution_multi_channel (6000, 2048, 4, latency, 2, false, preallocate); 687 | success &= test_convolution_multi_channel (100, 32, 10, latency, 4, false, preallocate); 688 | success &= test_convolution_multi_channel (6000, 512, 4, latency, 2, true, preallocate); 689 | success &= test_convolution_multi_channel (100, 511, 10, latency, 4, true, preallocate); 690 | } 691 | } 692 | 693 | success &= test_convolution_non_uniform (6000, 2048, 4, 2048); 694 | success &= test_convolution_non_uniform (6000, 512, 20, 1024); 695 | success &= test_convolution_non_uniform (6000, 511, 20, 1024); 696 | success &= test_convolution_non_uniform (6000, 32, 400, 1024); 697 | success &= test_convolution_non_uniform (200, 32, 10, 64); 698 | 699 | std::cout << "Speed comparisons:\n"; 700 | success &= test_convolution (48'000, 512, 10'000, false, true); 701 | success &= test_convolution (48'000, 512, 10'000, true, true); 702 | success &= test_convolution_non_uniform (48'000, 512, 10'000, 2048); 703 | 704 | return success ? 0 : 1; 705 | } 706 | -------------------------------------------------------------------------------- /chowdsp_convolution.cpp: -------------------------------------------------------------------------------- 1 | #include "chowdsp_convolution.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace chowdsp::convolution 9 | { 10 | static int min_int (int a, int b) 11 | { 12 | return (b < a) ? b : a; 13 | } 14 | 15 | static int max_int (int a, int b) 16 | { 17 | return (b > a) ? 
b : a; 18 | } 19 | 20 | static int next_pow2 (int v) noexcept 21 | { 22 | --v; 23 | v |= (v >> 1); 24 | v |= (v >> 2); 25 | v |= (v >> 4); 26 | v |= (v >> 8); 27 | v |= (v >> 16); 28 | return v + 1; 29 | } 30 | 31 | static int pad_floats (int N) 32 | { 33 | static constexpr int pad_len = 16; 34 | const auto N_div = (N + pad_len - 1) / pad_len; 35 | return N_div * pad_len; 36 | } 37 | 38 | static int pad_bytes (int N) 39 | { 40 | static constexpr int pad_len = 64; 41 | const auto N_div = (N + pad_len - 1) / pad_len; 42 | return N_div * pad_len; 43 | } 44 | 45 | static void get_block_and_fft_sizes (int max_block_size, int& block_size, int& fft_size) 46 | { 47 | block_size = next_pow2 (max_block_size); 48 | fft_size = block_size > 128 ? 2 * block_size : 4 * block_size; 49 | } 50 | 51 | int convolution_fft_size (int max_block_size) 52 | { 53 | [[maybe_unused]] int block_size, fft_size; 54 | get_block_and_fft_sizes (max_block_size, block_size, fft_size); 55 | return fft_size; 56 | } 57 | 58 | void create_config (Config* config, int max_block_size, void* data) 59 | { 60 | get_block_and_fft_sizes (max_block_size, config->block_size, config->fft_size); 61 | if (data == nullptr) 62 | config->fft = fft::fft_new_setup (config->fft_size, fft::FFT_REAL); 63 | else 64 | config->fft = fft::fft_new_setup_preallocated (config->fft_size, fft::FFT_REAL, data); 65 | } 66 | 67 | size_t config_bytes_required (int max_block_size) 68 | { 69 | const auto fft_size = convolution_fft_size (max_block_size); 70 | return fft::fft_bytes_required (fft_size, fft::FFT_REAL); 71 | } 72 | 73 | void destroy_config (Config* config) 74 | { 75 | if (config->fft != nullptr) 76 | fft::fft_destroy_setup (config->fft); 77 | *config = {}; 78 | } 79 | 80 | //================================================================================================================ 81 | void create_ir (const Config* config, IR_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch, void* data) 82 | { 83 | 
create_zero_ir (config, ir, ir_num_samples, data); 84 | load_ir (config, ir, ir_data, ir_num_samples, fft_scratch); 85 | } 86 | 87 | static int get_num_segments (int fft_size, int block_size, int ir_num_samples) 88 | { 89 | return (ir_num_samples / (fft_size - block_size)) + 1; 90 | } 91 | 92 | static int get_num_segments (const Config* config, int ir_num_samples) 93 | { 94 | return get_num_segments (config->fft_size, config->block_size, ir_num_samples); 95 | } 96 | 97 | static float* get_segment (const Config* config, float* segments, int segment_idx) 98 | { 99 | return segments + config->fft_size * segment_idx; 100 | } 101 | 102 | static void create_zero_ir_num_segments (const Config* config, IR_Uniform* ir, int num_segments, void* data) 103 | { 104 | const auto segment_num_samples = config->fft_size; 105 | 106 | if (data == nullptr) 107 | { 108 | size_t bytes_needed = segment_num_samples * num_segments * sizeof (float); 109 | data = fft::aligned_malloc (bytes_needed); 110 | } 111 | 112 | ir->max_num_segments = num_segments; 113 | ir->num_segments = num_segments; 114 | 115 | ir->segments = static_cast (data); 116 | memset (ir->segments, 0, ir->num_segments * segment_num_samples * sizeof (float)); 117 | } 118 | 119 | size_t ir_bytes_required (int max_block_size, int ir_num_samples) 120 | { 121 | int block_size, fft_size; 122 | get_block_and_fft_sizes (max_block_size, block_size, fft_size); 123 | const auto num_segments = get_num_segments (fft_size, block_size, ir_num_samples); 124 | const auto segment_num_samples = fft_size; 125 | return segment_num_samples * num_segments * sizeof (float); 126 | } 127 | 128 | void create_zero_ir (const Config* config, IR_Uniform* ir, int ir_num_samples, void* data) 129 | { 130 | create_zero_ir_num_segments (config, ir, get_num_segments (config, ir_num_samples), data); 131 | ir->num_channels = 1; 132 | } 133 | 134 | void load_ir (const Config* config, IR_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch) 135 
| { 136 | assert (ir->num_channels == 1); 137 | 138 | const auto num_segments = get_num_segments (config, ir_num_samples); 139 | assert (num_segments <= ir->max_num_segments); // IR is too large for the allocated number of segments 140 | ir->num_segments = num_segments; 141 | 142 | int current_ptr {}; 143 | for (int seg_idx = 0; seg_idx < ir->num_segments; ++seg_idx) 144 | { 145 | float* segment = get_segment (config, ir->segments, seg_idx); 146 | const auto segment_n = min_int (config->fft_size - config->block_size, ir_num_samples - current_ptr); 147 | memcpy (segment, ir_data + current_ptr, segment_n * sizeof (float)); 148 | memset (segment + segment_n, 0, (config->fft_size - segment_n) * sizeof (float)); 149 | fft::fft_transform_unordered (config->fft, 150 | segment, 151 | segment, 152 | fft_scratch, 153 | fft::FFT_FORWARD); 154 | current_ptr += segment_n; 155 | } 156 | } 157 | 158 | void destroy_ir (IR_Uniform* ir) 159 | { 160 | fft::aligned_free (ir->segments); 161 | *ir = {}; 162 | } 163 | 164 | size_t multichannel_ir_bytes_required (int max_block_size, int ir_num_samples, int num_channels) 165 | { 166 | return ir_bytes_required (max_block_size, ir_num_samples) * num_channels; 167 | } 168 | 169 | void create_multichannel_ir (const Config* config, IR_Uniform* ir, const float* const* ir_data, int ir_num_samples, int num_channels, float* fft_scratch, void* data) 170 | { 171 | create_zero_multichannel_ir (config, ir, ir_num_samples, num_channels, data); 172 | load_multichannel_ir (config, ir, ir_data, ir_num_samples, num_channels, fft_scratch); 173 | } 174 | 175 | void create_zero_multichannel_ir (const Config* config, IR_Uniform* ir, int ir_num_samples, int num_channels, void* data) 176 | { 177 | const auto mono_ir_num_segments = get_num_segments (config, ir_num_samples); 178 | 179 | create_zero_ir_num_segments (config, ir, mono_ir_num_segments * num_channels, data); 180 | assert (ir->num_segments % num_channels == 0); 181 | const auto actual_num_segments = 
ir->num_segments / num_channels; 182 | ir->num_segments = actual_num_segments; 183 | ir->max_num_segments = actual_num_segments; 184 | ir->num_channels = num_channels; 185 | } 186 | 187 | void load_multichannel_ir (const Config* config, IR_Uniform* ir, const float* const* ir_data, int ir_num_samples, int num_channels, float* fft_scratch) 188 | { 189 | assert (num_channels == ir->num_channels); 190 | 191 | int new_num_segments = 0; 192 | for (int ch = 0; ch < num_channels; ++ch) 193 | { 194 | IR_Uniform this_channel_ir { 195 | .segments = get_segment (config, ir->segments, ch * ir->max_num_segments), 196 | .num_segments = ir->num_segments, 197 | .max_num_segments = ir->max_num_segments, 198 | .num_channels = 1, 199 | }; 200 | load_ir (config, &this_channel_ir, ir_data[ch], ir_num_samples, fft_scratch); 201 | new_num_segments = this_channel_ir.num_segments; 202 | } 203 | ir->num_segments = new_num_segments; 204 | } 205 | 206 | //================================================================================================================ 207 | static int state_max_num_segments (int block_size, int ir_num_segments) 208 | { 209 | return block_size > 128 ? 
ir_num_segments : 3 * ir_num_segments; 210 | } 211 | 212 | static size_t state_data_bytes_needed (int fft_size, int block_size, int ir_num_segments, int num_channels) 213 | { 214 | size_t bytes_needed {}; 215 | 216 | const auto segment_num_samples = fft_size; 217 | const auto max_num_segments = state_max_num_segments (block_size, ir_num_segments); 218 | bytes_needed += segment_num_samples * max_num_segments * sizeof (float); 219 | 220 | bytes_needed += fft_size * sizeof (float); // input data 221 | bytes_needed += fft_size * sizeof (float); // output data 222 | bytes_needed += fft_size * sizeof (float); // output temp data 223 | bytes_needed += fft_size * sizeof (float); // overlap data 224 | return bytes_needed * num_channels; 225 | } 226 | 227 | static void state_data_partition_memory (const Config* config, Process_Uniform_State* state, Process_Uniform_State::State_Data& state_data, float*& data) 228 | { 229 | const auto segment_num_samples = config->fft_size; 230 | 231 | state_data.segments = data; 232 | data += segment_num_samples * state->max_num_segments; 233 | state_data.input_data = data; 234 | data += config->fft_size; 235 | state_data.output_data = data; 236 | data += config->fft_size; 237 | state_data.output_temp_data = data; 238 | data += config->fft_size; 239 | state_data.overlap_data = data; 240 | data += config->fft_size; 241 | } 242 | 243 | size_t multichannel_process_state_bytes_required (int max_block_size, int ir_num_samples, int num_channels) 244 | { 245 | int block_size, fft_size; 246 | get_block_and_fft_sizes (max_block_size, block_size, fft_size); 247 | const auto ir_num_segments = get_num_segments (fft_size, block_size, ir_num_samples); 248 | return state_data_bytes_needed (fft_size, block_size, ir_num_segments, num_channels) 249 | + pad_bytes (sizeof (Process_Uniform_State::State_Data) * num_channels); 250 | } 251 | 252 | void create_multichannel_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state, int 
num_channels, void* data) 253 | { 254 | using State_Data = Process_Uniform_State::State_Data; 255 | state->num_channels = num_channels; 256 | state->max_num_segments = state_max_num_segments (config->block_size, ir->max_num_segments); 257 | 258 | const auto state_bytes_needed = state_data_bytes_needed (config->fft_size, config->block_size, ir->max_num_segments, num_channels); 259 | if (data == nullptr) 260 | data = fft::aligned_malloc (state_bytes_needed + num_channels * sizeof (State_Data)); 261 | state->state_data = reinterpret_cast (static_cast (data) + state_bytes_needed); 262 | 263 | auto* float_data = static_cast (data); 264 | for (int ch = 0; ch < state->num_channels; ++ch) 265 | state_data_partition_memory (config, state, state->state_data[ch], float_data); 266 | assert (static_cast (float_data) == static_cast (state->state_data)); 267 | 268 | reset_process_state (config, state); 269 | } 270 | 271 | size_t process_state_bytes_required (int block_size, int ir_num_samples) 272 | { 273 | return multichannel_process_state_bytes_required (block_size, ir_num_samples, 1); 274 | } 275 | 276 | void create_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state, void* data) 277 | { 278 | create_multichannel_process_state (config, ir, state, ir->num_channels, data); 279 | } 280 | 281 | void reset_process_state (const Config* config, Process_Uniform_State* state) 282 | { 283 | state->current_segment = 0; 284 | state->input_data_pos = 0; 285 | 286 | const auto segment_num_samples = config->fft_size; 287 | for (int ch = 0; ch < state->num_channels; ++ch) 288 | { 289 | auto& state_data = state->state_data[ch]; 290 | memset (state_data.segments, 291 | 0, 292 | segment_num_samples * state->max_num_segments * sizeof (float)); 293 | 294 | memset (state_data.input_data, 0, config->fft_size * sizeof (float)); 295 | memset (state_data.output_data, 0, config->fft_size * sizeof (float)); 296 | memset (state_data.output_temp_data, 0, config->fft_size 
* sizeof (float)); 297 | memset (state_data.overlap_data, 0, config->fft_size * sizeof (float)); 298 | } 299 | } 300 | 301 | void reset_process_state_segments (const Convolution_Config* config, Process_Uniform_State* state, const IR_Uniform* ir) 302 | { 303 | const auto segment_num_samples = config->fft_size; 304 | for (int ch = 0; ch < state->num_channels; ++ch) 305 | { 306 | auto& state_data = state->state_data[ch]; 307 | memset (state_data.segments + segment_num_samples * ir->num_segments, 308 | 0, 309 | segment_num_samples * (state->max_num_segments - ir->num_segments) * sizeof (float)); 310 | } 311 | } 312 | 313 | void destroy_process_state (Process_Uniform_State* state) 314 | { 315 | if (state->state_data != nullptr) 316 | fft::aligned_free (state->state_data[0].segments); 317 | *state = {}; 318 | } 319 | 320 | //================================================================================================================ 321 | int get_required_nuir_scratch_bytes (const IR_Non_Uniform* ir) 322 | { 323 | assert (ir->head_config != nullptr); 324 | assert (ir->tail_config != nullptr); 325 | return static_cast ((max_int (ir->head_config->fft_size, 326 | ir->tail_config->fft_size) 327 | + pad_floats (ir->head_config->block_size)) 328 | * sizeof (float)); 329 | } 330 | 331 | void create_nuir (IR_Non_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch) 332 | { 333 | create_zero_nuir (ir, ir_num_samples); 334 | load_nuir (ir, ir_data, ir_num_samples, fft_scratch); 335 | } 336 | 337 | void create_zero_nuir (IR_Non_Uniform* ir, int ir_num_samples) 338 | { 339 | assert (ir->head_config != nullptr); 340 | assert (ir->tail_config != nullptr); 341 | assert (ir->head_size >= ir->head_config->block_size); 342 | assert (ir->tail_config->block_size == ir->head_size); 343 | assert (ir_num_samples >= 2 * ir->head_size); 344 | 345 | const auto head_num_segments = get_num_segments (ir->head_config, ir->head_size); 346 | const auto head_segments_length = 
head_num_segments * ir->head_config->fft_size; 347 | const auto tail_num_segments = get_num_segments (ir->tail_config, ir_num_samples - ir->head_size); 348 | const auto tail_segments_length = tail_num_segments * ir->tail_config->fft_size; 349 | const auto total_segments_length = head_segments_length + tail_segments_length; 350 | 351 | auto* segment_data = static_cast (fft::aligned_malloc (total_segments_length * sizeof (float))); 352 | memset (segment_data, 0, total_segments_length * sizeof (float)); 353 | 354 | ir->head.segments = segment_data; 355 | ir->head.num_segments = head_num_segments; 356 | ir->head.max_num_segments = head_num_segments; 357 | ir->head.num_channels = 1; 358 | ir->tail.segments = segment_data + head_segments_length; 359 | ir->tail.num_segments = tail_num_segments; 360 | ir->tail.max_num_segments = tail_num_segments; 361 | ir->tail.num_channels = 1; 362 | } 363 | 364 | void load_nuir (IR_Non_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch) 365 | { 366 | load_ir (ir->head_config, &ir->head, ir_data, min_int (ir_num_samples, ir->head_size), fft_scratch); 367 | load_ir (ir->tail_config, &ir->tail, ir_data + ir->head_size, max_int (ir_num_samples - ir->head_size, 0), fft_scratch); 368 | } 369 | 370 | void destroy_nuir (IR_Non_Uniform* ir) 371 | { 372 | fft::aligned_free (ir->head.segments); 373 | *ir = {}; 374 | } 375 | 376 | //================================================================================================================ 377 | void create_nuir_process_state (const IR_Non_Uniform* ir, Process_Non_Uniform_State* state) 378 | { 379 | using State_Data = Process_Uniform_State::State_Data; 380 | 381 | state->head.num_channels = 1; // @TODO 382 | state->head_config = ir->head_config; 383 | state->tail.num_channels = 1; // @TODO 384 | state->tail_config = ir->tail_config; 385 | 386 | state->head.max_num_segments = state_max_num_segments (ir->head_config->block_size, ir->head.max_num_segments); 387 | 
state->tail.max_num_segments = state_max_num_segments (ir->tail_config->block_size, ir->tail.max_num_segments); 388 | 389 | const auto head_state_bytes_needed = state_data_bytes_needed (state->head_config->fft_size, 390 | state->head_config->block_size, 391 | ir->head.max_num_segments, 392 | 1); 393 | const auto tail_state_bytes_needed = state_data_bytes_needed (state->tail_config->fft_size, 394 | state->tail_config->block_size, 395 | ir->tail.max_num_segments, 396 | 1); 397 | auto* data = fft::aligned_malloc (head_state_bytes_needed + tail_state_bytes_needed + 2 * sizeof (State_Data)); 398 | state->head.state_data = reinterpret_cast (static_cast (data) + head_state_bytes_needed + tail_state_bytes_needed); 399 | state->tail.state_data = state->head.state_data + 1; 400 | 401 | auto* float_data = static_cast (data); 402 | 403 | state_data_partition_memory (state->head_config, &state->head, state->head.state_data[0], float_data); 404 | state_data_partition_memory (state->tail_config, &state->tail, state->tail.state_data[0], float_data); 405 | assert (static_cast (float_data) == static_cast (state->head.state_data)); 406 | 407 | reset_process_state (state->head_config, &state->head); 408 | reset_process_state (state->tail_config, &state->tail); 409 | } 410 | 411 | void reset_nuir_process_state (Process_Non_Uniform_State* state) 412 | { 413 | reset_process_state (state->head_config, &state->head); 414 | reset_process_state (state->tail_config, &state->tail); 415 | } 416 | 417 | void destroy_nuir_process_state (Process_Non_Uniform_State* state) 418 | { 419 | destroy_process_state (&state->head); 420 | *state = {}; 421 | } 422 | 423 | //================================================================================================================ 424 | static void process_samples_mono (const Config* config, 425 | const IR_Uniform* ir, 426 | Process_Uniform_State* state, 427 | const float* input, 428 | float* output, 429 | int num_samples, 430 | float* fft_scratch) 431 | 
{ 432 | const auto fft_inv_scale = 1.0f / static_cast (config->fft_size); 433 | const auto state_num_segments = config->block_size > 128 ? ir->num_segments : 3 * ir->num_segments; 434 | auto index_step = state_num_segments / ir->num_segments; 435 | state->current_segment = (state->current_segment >= state_num_segments) ? 0 : state->current_segment; 436 | auto* state_data = state->state_data; 437 | 438 | int num_samples_processed = 0; 439 | while (num_samples_processed < num_samples) 440 | { 441 | const auto input_data_was_empty = state->input_data_pos == 0; 442 | const auto samples_to_process = min_int (num_samples - num_samples_processed, 443 | config->block_size - state->input_data_pos); 444 | 445 | memcpy (state_data->input_data + state->input_data_pos, 446 | input + num_samples_processed, 447 | samples_to_process * sizeof (float)); 448 | 449 | auto* input_segment_data = get_segment (config, state_data->segments, state->current_segment); 450 | memcpy (input_segment_data, state_data->input_data, config->fft_size * sizeof (float)); 451 | 452 | fft::fft_transform_unordered (config->fft, 453 | input_segment_data, 454 | input_segment_data, 455 | fft_scratch, 456 | fft::FFT_FORWARD); 457 | 458 | // Complex multiplication 459 | if (input_data_was_empty) 460 | { 461 | memset (state_data->output_temp_data, 0, config->fft_size * sizeof (float)); 462 | 463 | auto index = state->current_segment; 464 | for (int seg_idx = 1; seg_idx < ir->num_segments; ++seg_idx) 465 | { 466 | index += index_step; 467 | if (index >= state_num_segments) 468 | index -= state_num_segments; 469 | 470 | const auto* input_segment = get_segment (config, state_data->segments, index); 471 | const auto* ir_segment = get_segment (config, ir->segments, seg_idx); 472 | fft::fft_convolve_unordered (config->fft, 473 | input_segment, 474 | ir_segment, 475 | state_data->output_temp_data, 476 | fft_inv_scale); 477 | } 478 | } 479 | 480 | memcpy (state_data->output_data, state_data->output_temp_data, 
config->fft_size * sizeof (float)); 481 | 482 | fft::fft_convolve_unordered (config->fft, 483 | input_segment_data, 484 | ir->segments, 485 | state_data->output_data, 486 | fft_inv_scale); 487 | fft::fft_transform_unordered (config->fft, 488 | state_data->output_data, 489 | state_data->output_data, 490 | fft_scratch, 491 | fft::FFT_BACKWARD); 492 | 493 | // Add overlap 494 | { 495 | // Using SIMD for this operation is tricky, because 496 | // we can't guarantee that the pointers will be aligned. 497 | 498 | // const auto vec_width_x2 = 2 * fft::fft_simd_width_bytes (config->fft) / static_cast (sizeof (float)); 499 | // const auto n_samples_vec = (samples_to_process / vec_width_x2) * vec_width_x2; 500 | // fft::fft_accumulate (config->fft, 501 | // state->output_data + state->input_data_pos, 502 | // state->overlap_data + state->input_data_pos, 503 | // output + num_samples_processed, 504 | // n_samples_vec); 505 | // for (int i = n_samples_vec; i < samples_to_process; ++i) // extra data that can't be SIMD-ed 506 | // output[num_samples_processed + i] = state->output_data[state->input_data_pos + i] + state->overlap_data[state->input_data_pos + i]; 507 | 508 | for (int i = 0; i < samples_to_process; ++i) 509 | output[num_samples_processed + i] = state_data->output_data[state->input_data_pos + i] + state_data->overlap_data[state->input_data_pos + i]; 510 | } 511 | 512 | // Input buffer full => Next block 513 | state->input_data_pos += samples_to_process; 514 | 515 | if (state->input_data_pos == config->block_size) 516 | { 517 | // Input buffer is empty again now 518 | memset (state_data->input_data, 0, config->fft_size * sizeof (float)); 519 | 520 | state->input_data_pos = 0; 521 | 522 | // Extra step for segSize > blockSize 523 | const auto extra_block_samples = config->fft_size - 2 * config->block_size; 524 | if (extra_block_samples > 0) 525 | { 526 | fft::fft_accumulate (config->fft, 527 | state_data->overlap_data + config->block_size, 528 | state_data->output_data 
+ config->block_size, 529 | state_data->output_data + config->block_size, 530 | extra_block_samples); 531 | } 532 | 533 | // Save the overlap 534 | memcpy (state_data->overlap_data, 535 | state_data->output_data + config->block_size, 536 | (config->fft_size - config->block_size) * sizeof (float)); 537 | 538 | state->current_segment = (state->current_segment > 0) ? (state->current_segment - 1) : (state_num_segments - 1); 539 | } 540 | 541 | num_samples_processed += samples_to_process; 542 | } 543 | } 544 | 545 | void process_samples (const Config* config, 546 | const IR_Uniform* ir, 547 | Process_Uniform_State* state, 548 | const float* input, 549 | float* output, 550 | int num_samples, 551 | float* fft_scratch) 552 | { 553 | assert (ir->num_channels == 1); 554 | process_samples_mono (config, ir, state, input, output, num_samples, fft_scratch); 555 | } 556 | 557 | static void process_samples_with_latency_mono (const Config* config, 558 | const IR_Uniform* ir, 559 | Process_Uniform_State* state, 560 | const float* input, 561 | float* output, 562 | int num_samples, 563 | float* fft_scratch) 564 | { 565 | const auto fft_inv_scale = 1.0f / static_cast (config->fft_size); 566 | const auto state_num_segments = config->block_size > 128 ? ir->num_segments : 3 * ir->num_segments; 567 | auto index_step = state_num_segments / ir->num_segments; 568 | state->current_segment = (state->current_segment >= state_num_segments) ? 
0 : state->current_segment; 569 | auto* state_data = state->state_data; 570 | 571 | int num_samples_processed = 0; 572 | while (num_samples_processed < num_samples) 573 | { 574 | const auto samples_to_process = min_int (num_samples - num_samples_processed, 575 | config->block_size - state->input_data_pos); 576 | 577 | memcpy (state_data->input_data + state->input_data_pos, 578 | input + num_samples_processed, 579 | samples_to_process * sizeof (float)); 580 | 581 | memcpy (output + num_samples_processed, 582 | state_data->output_data + state->input_data_pos, 583 | samples_to_process * sizeof (float)); 584 | 585 | num_samples_processed += samples_to_process; 586 | state->input_data_pos += samples_to_process; 587 | 588 | if (state->input_data_pos == config->block_size) 589 | { 590 | // Copy input data in input segment 591 | auto* input_segment_data = get_segment (config, state_data->segments, state->current_segment); 592 | memcpy (input_segment_data, state_data->input_data, config->fft_size * sizeof (float)); 593 | 594 | fft::fft_transform_unordered (config->fft, 595 | input_segment_data, 596 | input_segment_data, 597 | fft_scratch, 598 | fft::FFT_FORWARD); 599 | 600 | // Complex multiplication 601 | memset (state_data->output_temp_data, 0, config->fft_size * sizeof (float)); 602 | 603 | auto index = state->current_segment; 604 | for (int seg_idx = 1; seg_idx < ir->num_segments; ++seg_idx) 605 | { 606 | index += index_step; 607 | if (index >= state_num_segments) 608 | index -= state_num_segments; 609 | 610 | const auto* input_segment = get_segment (config, state_data->segments, index); 611 | const auto* ir_segment = get_segment (config, ir->segments, seg_idx); 612 | fft::fft_convolve_unordered (config->fft, 613 | input_segment, 614 | ir_segment, 615 | state_data->output_temp_data, 616 | fft_inv_scale); 617 | } 618 | 619 | memcpy (state_data->output_data, state_data->output_temp_data, config->fft_size * sizeof (float)); 620 | 621 | fft::fft_convolve_unordered 
(config->fft, 622 | input_segment_data, 623 | ir->segments, 624 | state_data->output_data, 625 | fft_inv_scale); 626 | fft::fft_transform_unordered (config->fft, 627 | state_data->output_data, 628 | state_data->output_data, 629 | fft_scratch, 630 | fft::FFT_BACKWARD); 631 | 632 | // Add overlap 633 | fft::fft_accumulate (config->fft, 634 | state_data->overlap_data, 635 | state_data->output_data, 636 | state_data->output_data, 637 | config->block_size); 638 | 639 | // Input buffer is empty again now 640 | memset (state_data->input_data, 0, config->fft_size * sizeof (float)); 641 | 642 | // Extra step for segSize > blockSize 643 | const auto extra_block_samples = config->fft_size - 2 * config->block_size; 644 | if (extra_block_samples > 0) 645 | { 646 | fft::fft_accumulate (config->fft, 647 | state_data->overlap_data + config->block_size, 648 | state_data->output_data + config->block_size, 649 | state_data->output_data + config->block_size, 650 | extra_block_samples); 651 | } 652 | 653 | // Save the overlap 654 | memcpy (state_data->overlap_data, 655 | state_data->output_data + config->block_size, 656 | (config->fft_size - config->block_size) * sizeof (float)); 657 | 658 | state->current_segment = (state->current_segment > 0) ? 
(state->current_segment - 1) : (state_num_segments - 1); 659 | 660 | state->input_data_pos = 0; 661 | } 662 | } 663 | } 664 | 665 | void process_samples_with_latency (const Config* config, 666 | const IR_Uniform* ir, 667 | Process_Uniform_State* state, 668 | const float* input, 669 | float* output, 670 | int num_samples, 671 | float* fft_scratch) 672 | { 673 | assert (ir->num_channels == 1); 674 | process_samples_with_latency_mono (config, ir, state, input, output, num_samples, fft_scratch); 675 | } 676 | 677 | static void process_multichannel (const Config* config, 678 | const IR_Uniform* ir, 679 | Process_Uniform_State* state, 680 | const float* const* in, 681 | float* const* out, 682 | int N, 683 | int num_channels, 684 | float* fft_scratch, 685 | bool with_latency) 686 | { 687 | assert (ir->num_channels == 1 || ir->num_channels == state->num_channels); 688 | assert (state->num_channels == num_channels); 689 | 690 | for (int ch = 0; ch < num_channels; ++ch) 691 | { 692 | IR_Uniform mono_ir { 693 | .segments = ir->num_channels == 1 694 | ? 
ir->segments 695 | : get_segment (config, ir->segments, ch * ir->max_num_segments), 696 | .num_segments = ir->num_segments, 697 | .max_num_segments = ir->max_num_segments, 698 | .num_channels = 1, 699 | }; 700 | 701 | Process_Uniform_State mono_state { 702 | .state_data = state->state_data + ch, 703 | .max_num_segments = state->max_num_segments, 704 | .current_segment = state->current_segment, 705 | .input_data_pos = state->input_data_pos, 706 | .num_channels = 1, 707 | }; 708 | 709 | if (with_latency) 710 | process_samples_with_latency_mono (config, &mono_ir, &mono_state, in[ch], out[ch], N, fft_scratch); 711 | else 712 | process_samples_mono (config, &mono_ir, &mono_state, in[ch], out[ch], N, fft_scratch); 713 | 714 | if (ch == num_channels - 1) 715 | { 716 | state->current_segment = mono_state.current_segment; 717 | state->input_data_pos = mono_state.input_data_pos; 718 | } 719 | } 720 | } 721 | 722 | void process_samples_multichannel (const Config* config, 723 | const IR_Uniform* ir, 724 | Process_Uniform_State* state, 725 | const float* const* in, 726 | float* const* out, 727 | int N, 728 | int num_channels, 729 | float* fft_scratch) 730 | { 731 | process_multichannel (config, ir, state, in, out, N, num_channels, fft_scratch, false); 732 | } 733 | 734 | void process_samples_with_latency_multichannel (const Config* config, 735 | const IR_Uniform* ir, 736 | Process_Uniform_State* state, 737 | const float* const* in, 738 | float* const* out, 739 | int N, 740 | int num_channels, 741 | float* fft_scratch) 742 | { 743 | process_multichannel (config, ir, state, in, out, N, num_channels, fft_scratch, true); 744 | } 745 | 746 | void process_samples_non_uniform (const IR_Non_Uniform* ir, 747 | Process_Non_Uniform_State* state, 748 | const float* in, 749 | float* out, 750 | int N, 751 | float* scratch) 752 | { 753 | auto* tail_out = scratch; 754 | scratch += pad_floats (N); 755 | 756 | process_samples_with_latency (ir->tail_config, 757 | &ir->tail, 758 | &state->tail, 
759 | in, 760 | tail_out, 761 | N, 762 | scratch); 763 | 764 | process_samples (ir->head_config, 765 | &ir->head, 766 | &state->head, 767 | in, 768 | out, 769 | N, 770 | scratch); 771 | 772 | for (int n = 0; n < N; ++n) 773 | out[n] += tail_out[n]; 774 | } 775 | } // namespace chowdsp::convolution 776 | --------------------------------------------------------------------------------