├── .gitignore
├── cmake
    └── CPM.cmake
├── CMakeLists.txt
├── LICENSE
├── test
    ├── CMakeLists.txt
    ├── chowdsp_convolution_test.c
    └── chowdsp_convolution_test.cpp
├── .github
    └── workflows
    │   ├── coverage.yml
    │   └── test.yml
├── .clang-format
├── README.md
├── chowdsp_convolution.h
└── chowdsp_convolution.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | build*/
 2 | 
 3 | .focus-config
 4 | *.raddbg_project
 5 | .vscode/
 6 | .idea/
 7 | .zed/
 8 | 
 9 | .DS_Store
10 | 


--------------------------------------------------------------------------------
/cmake/CPM.cmake:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: MIT
 2 | #
 3 | # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
 4 | 
 5 | set(CPM_DOWNLOAD_VERSION 0.40.2) # CPM.cmake release to download
 6 | set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") # SHA256 the download is checked against
 7 | # Storage location: the CPM_SOURCE_CACHE variable, else the environment variable of that name, else the build tree.
 8 | if(CPM_SOURCE_CACHE)
 9 |     set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
10 | elseif(DEFINED ENV{CPM_SOURCE_CACHE})
11 |     set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
12 | else()
13 |     set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
14 | endif()
15 | 
16 | # Expand relative path. This is important if the provided path contains a tilde (~)
17 | get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
18 | # Download the release script; EXPECTED_HASH makes the download fail if the hash does not match.
19 | file(DOWNLOAD
20 |     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
21 |     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
22 | )
23 | # Run the downloaded script.
24 | include(${CPM_DOWNLOAD_LOCATION})
25 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.20)
 2 | project(chowdsp_convolution VERSION 0.1.0 LANGUAGES C CXX)
 3 | 
 4 | # Build options. All default to OFF; enable with -D<option>=ON.
 5 | option(CHOWDSP_CONVOLUTION_ASAN "Build with AddressSanitizer instrumentation" OFF)
 6 | option(CHOWDSP_CONVOLUTION_TESTING "Build the chowdsp_convolution tests" OFF)
 7 | option(CHOWDSP_CONVOLUTION_COVERAGE "Build the tests with code coverage instrumentation" OFF)
 8 | 
 9 | # The sanitizer flags are applied at directory scope (rather than per target)
10 | # so that everything configured below this point is instrumented as well.
11 | if(CHOWDSP_CONVOLUTION_ASAN)
12 |     message(STATUS "Setting flags for address sanitizer: -fsanitize=address -g")
13 |     string(APPEND CMAKE_C_FLAGS " -fsanitize=address -g")
14 |     string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address -g")
15 |     string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
16 | endif()
17 | 
18 | # Re-use chowdsp_fft when a target of that name already exists;
19 | # otherwise fetch it with CPM.
20 | if(TARGET chowdsp_fft)
21 |     get_target_property(chowdsp_fft_dir chowdsp_fft SOURCE_DIR)
22 |     message(STATUS "Using chowdsp_fft from ${chowdsp_fft_dir}")
23 | else()
24 |     message(STATUS "Using chowdsp_fft from CPM")
25 |     include("${CMAKE_CURRENT_LIST_DIR}/cmake/CPM.cmake")
26 |     unset(JUCE_MODULES_DIR CACHE) # this causes problems with the tests CMake config?
27 |     CPMAddPackage("gh:Chowdhury-DSP/chowdsp_fft#main")
28 | endif()
29 | 
30 | add_library(chowdsp_convolution STATIC)
31 | target_sources(chowdsp_convolution
32 |     PRIVATE
33 |         chowdsp_convolution.h
34 |         chowdsp_convolution.cpp
35 | )
36 | target_include_directories(chowdsp_convolution PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
37 | target_link_libraries(chowdsp_convolution PUBLIC chowdsp_fft)
38 | # The public header uses a C++17 nested namespace definition, so C++ consumers
39 | # need at least C++17; the implementation itself is built as C++20.
40 | target_compile_features(chowdsp_convolution PUBLIC cxx_std_17)
41 | target_compile_features(chowdsp_convolution PRIVATE cxx_std_20)
42 | 
43 | if(CHOWDSP_CONVOLUTION_TESTING)
44 |     add_subdirectory(test)
45 | endif()
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2025, Jatin Chowdhury (jatin@chowdsp.com)
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/CPM.cmake")
 2 | 
 3 | # Test-only dependencies, fetched with CPM.
 4 | CPMAddPackage(
 5 |     NAME juce
 6 |     GIT_REPOSITORY https://github.com/juce-framework/juce
 7 |     GIT_TAG 8.0.7
 8 |     OPTIONS "JUCE_MODULES_ONLY ON"
 9 | )
10 | CPMAddPackage("gh:Chowdhury-DSP/chowdsp_utils#next")
11 | CPMAddPackage("gh:Chowdhury-DSP/chowdsp_fft#main")
12 | 
13 | # Register the chowdsp_fft JUCE module unless a target of that name already exists.
14 | if(NOT TARGET chowdsp_fft_juce)
15 |     juce_add_module("${chowdsp_fft_SOURCE_DIR}/chowdsp_fft_juce")
16 | endif()
17 | 
18 | # C++ test executable.
19 | add_executable(chowdsp_convolution_test chowdsp_convolution_test.cpp)
20 | target_link_libraries(chowdsp_convolution_test
21 |     PRIVATE
22 |         juce::juce_audio_basics
23 |         juce::juce_audio_formats
24 |         chowdsp::chowdsp_buffers
25 |         chowdsp::chowdsp_data_structures
26 |         chowdsp_fft_juce
27 |         chowdsp_convolution
28 | )
29 | target_compile_definitions(chowdsp_convolution_test
30 |     PRIVATE
31 |         JUCE_MODULE_AVAILABLE_juce_dsp=1 # chowdsp_pffft includes juce_dsp internally!
32 |         JUCE_USE_CURL=0
33 |         $<IF:$<CONFIG:DEBUG>,BUILD_DEBUG=1,BUILD_RELEASE=1>
34 | )
35 | target_compile_features(chowdsp_convolution_test PRIVATE cxx_std_20)
36 | 
37 | if(CHOWDSP_CONVOLUTION_COVERAGE)
38 |     message(STATUS "Appending code coverage compiler flags: -g --coverage")
39 |     # `--coverage` already implies -fprofile-arcs -ftest-coverage when compiling.
40 |     # The library's options are PUBLIC so that every target linking the
41 |     # instrumented library (including the C test) also links the coverage runtime.
42 |     target_compile_options(chowdsp_convolution PUBLIC -g --coverage)
43 |     target_link_options(chowdsp_convolution PUBLIC --coverage)
44 |     target_compile_options(chowdsp_convolution_test PRIVATE -g --coverage)
45 |     target_link_options(chowdsp_convolution_test PRIVATE --coverage)
46 | endif()
47 | 
48 | # C test executable, built from the C source against the same library.
49 | add_executable(chowdsp_convolution_c_test chowdsp_convolution_test.c)
50 | target_link_libraries(chowdsp_convolution_c_test PRIVATE chowdsp_convolution)
51 | target_compile_features(chowdsp_convolution_c_test PRIVATE c_std_11)
52 | 


--------------------------------------------------------------------------------
/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
 1 |  name: Coverage
 2 | 
 3 |  on:
 4 |    pull_request:
 5 |      branches:
 6 |        - main
 7 |        - develop
 8 |    push:
 9 |      branches:
10 |        - main
11 |        - ci
12 | 
13 |    workflow_dispatch:
14 | 
15 |  jobs:
16 |    build_and_test:
17 |      name: Test library with coverage
18 |      runs-on: ${{ matrix.os }}
19 |      strategy:
20 |        fail-fast: false # show all errors for each platform (vs. cancel jobs on error)
21 |        matrix:
22 |          include:
23 |            - name: Linux
24 |              os: ubuntu-22.04
25 |              nparallel: 4
26 | 
27 |      steps:
28 |        - name: Install Linux Deps
29 |          if: runner.os == 'Linux'
30 |          run: |
31 |            sudo apt-get update
32 |            sudo apt install libasound2-dev libcurl4-openssl-dev libx11-dev libxinerama-dev libxext-dev libfreetype6-dev libwebkit2gtk-4.0-dev libglu1-mesa-dev libjack-jackd2-dev
33 | 
34 |        - name: Install lcov (Linux)
35 |          if: runner.os == 'Linux'
36 |          run: sudo apt install lcov
37 | 
38 |        - name: Install Ninja
39 |          uses: seanmiddleditch/gha-setup-ninja@master
40 | 
41 |        - name: Get latest CMake
42 |          uses: lukka/get-cmake@latest
43 | 
44 |        - name: Checkout code
45 |          uses: actions/checkout@v4 # v2 targets a Node.js runtime that GitHub has deprecated
46 | 
47 |        - name: Cmake Configure
48 |          run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_CONVOLUTION_TESTING=ON -DCHOWDSP_CONVOLUTION_COVERAGE=ON
49 | 
50 |        - name: Build Test
51 |          run: cmake --build build --config Debug --parallel --target chowdsp_convolution_test
52 | 
53 |        - name: Run Test
54 |          run: ./build/test/Debug/chowdsp_convolution_test
55 | 
56 |        - name: Collect Coverage Data
57 |          run: |
58 |            lcov --version
59 |            lcov --directory . --capture --output-file coverage.info
60 |            lcov --remove coverage.info '/usr/*' "${HOME}"'/.cache/*' '/Applications/Xcode*' '*build*' '*test*' --output-file coverage.info
61 | 
62 |        - name: Report Coverage Data
63 |          run: lcov --list coverage.info
64 | 
65 |        - name: Upload coverage to Codecov
66 |          uses: codecov/codecov-action@v4
67 |          with:
68 |            fail_ci_if_error: true
69 |            token: ${{ secrets.CODECOV_TOKEN }}
70 |            files: coverage.info
71 |            verbose: true
72 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | AccessModifierOffset: -4
 3 | AlignAfterOpenBracket: Align
 4 | AlignConsecutiveAssignments: false
 5 | AlignConsecutiveDeclarations: false
 6 | AlignEscapedNewlines: Left
 7 | AlignOperands: Align
 8 | AlignTrailingComments: false
 9 | AllowAllParametersOfDeclarationOnNextLine: false
10 | AllowShortBlocksOnASingleLine: Never
11 | AllowShortCaseLabelsOnASingleLine: false
12 | AllowShortFunctionsOnASingleLine: All
13 | AllowShortIfStatementsOnASingleLine: Never
14 | AllowShortLoopsOnASingleLine: false
15 | AlwaysBreakAfterDefinitionReturnType: None
16 | AlwaysBreakAfterReturnType: None
17 | AlwaysBreakBeforeMultilineStrings: false
18 | AlwaysBreakTemplateDeclarations: Yes
19 | BinPackArguments: false
20 | BinPackParameters: false
21 | BreakAfterJavaFieldAnnotations: false
22 | BreakBeforeBinaryOperators: NonAssignment
23 | BreakBeforeBraces: Allman
24 | BreakBeforeTernaryOperators: true
25 | BreakConstructorInitializersBeforeComma: false
26 | BreakStringLiterals: false
27 | ColumnLimit: 0
28 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
29 | ConstructorInitializerIndentWidth: 4
30 | ContinuationIndentWidth: 4
31 | Cpp11BracedListStyle: false
32 | DerivePointerAlignment: false
33 | DisableFormat: false
34 | ExperimentalAutoDetectBinPacking: false
35 | ForEachMacros: [ 'forEachXmlChildElement' ]
36 | IndentCaseLabels: true
37 | IndentWidth: 4
38 | IndentWrappedFunctionNames: true
39 | KeepEmptyLinesAtTheStartOfBlocks: false
40 | Language: Cpp
41 | MaxEmptyLinesToKeep: 1
42 | NamespaceIndentation: Inner
43 | PointerAlignment: Left
44 | ReflowComments: false
45 | SortIncludes: false
46 | SpaceAfterCStyleCast: true
47 | SpaceAfterLogicalNot: true
48 | SpaceBeforeAssignmentOperators: true
49 | SpaceBeforeCpp11BracedList: true
50 | SpaceBeforeParens: NonEmptyParentheses
51 | SpaceInEmptyParentheses: false
52 | SpaceBeforeInheritanceColon: true
53 | SpacesInAngles: false
54 | SpacesInCStyleCastParentheses: false
55 | SpacesInContainerLiterals: true
56 | SpacesInParentheses: false
57 | SpacesInSquareBrackets: false
58 | Standard: "c++17"
59 | TabWidth: 4
60 | UseTab: Never
61 | ---
62 | Language: ObjC
63 | BasedOnStyle: Chromium
64 | AlignTrailingComments: true
65 | BreakBeforeBraces: Allman
66 | ColumnLimit: 0
67 | IndentWidth: 4
68 | KeepEmptyLinesAtTheStartOfBlocks: false
69 | ObjCSpaceAfterProperty: true
70 | ObjCSpaceBeforeProtocolList: true
71 | PointerAlignment: Left
72 | SpacesBeforeTrailingComments: 1
73 | TabWidth: 4
74 | UseTab: Never
75 | ...
76 | 


--------------------------------------------------------------------------------
/test/chowdsp_convolution_test.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <math.h>
 4 | 
 5 | #include <chowdsp_convolution.h>
 6 | #include <chowdsp_fft.h>
 7 | 
 8 | int main()
 9 | {
10 |     printf("Running C tests\n");
11 | 
12 |     // setup config
13 |     const int block_size = 512;
14 |     struct Convolution_Config conv_config;
15 |     create_config (&conv_config, block_size, NULL);
16 |     float* fft_scratch = (float*) aligned_malloc (conv_config.fft_size * sizeof (float));
17 | 
18 |     // load IR (ideal impulse with delay)
19 |     const int ir_size = 200;
20 |     const int delay_samples = 100;
21 |     float* ir = (float*) calloc(ir_size, sizeof (float));
22 |     ir[delay_samples] = 1.0f;
23 |     struct IR_Uniform conv_ir;
24 |     create_ir (&conv_config, &conv_ir, ir, ir_size, fft_scratch, NULL);
25 |     free (ir);
26 | 
27 |     // set up process state
28 |     struct Process_Uniform_State conv_state;
29 |     create_process_state(&conv_config, &conv_ir, &conv_state, NULL);
30 | 
31 |     // set up i/o buffers
32 |     const int num_blocks = 1000;
33 |     const int data_size = block_size * num_blocks;
34 |     float* test_input_data = malloc (data_size * sizeof (float));
35 |     float* test_output_data = malloc (data_size * sizeof (float));
36 |     for (int i = 0; i < data_size; ++i)
37 |         test_input_data[i] = sinf(314.0f * (float) i / (float) data_size);
38 | 
39 |     // process convolution
40 |     for (int i = 0; i < num_blocks; ++i)
41 |     {
42 |         const float* block_in = test_input_data + (i * block_size);
43 |         float* block_out = test_output_data + (i * block_size);
44 |         process_samples (&conv_config,
45 |                          &conv_ir,
46 |                          &conv_state,
47 |                          block_in,
48 |                          block_out,
49 |                          block_size,
50 |                          fft_scratch);
51 |     }
52 | 
53 |     // compute error
54 |     float error_accum = 0.0f;
55 |     float max_error = 0.0f;
56 |     for (int i = 0; i < data_size; ++i)
57 |     {
58 |         const float ref = i < delay_samples ? 0.0f : test_input_data[i - delay_samples];
59 |         const float test = test_output_data[i];
60 |         const float err = fabsf (ref - test);
61 | 
62 |         if (err > max_error)
63 |             max_error = err;
64 |         error_accum += err * err;
65 |     }
66 |     const float mse = error_accum / (float) data_size;
67 |     printf("Max Error: %f\n", max_error);
68 |     printf("Mean-squared: %f\n", mse);
69 |     printf("COMPLETE!\n");
70 | 
71 |     // cleanup
72 |     free (test_input_data);
73 |     free (test_output_data);
74 |     aligned_free (fft_scratch);
75 |     destroy_process_state (&conv_state);
76 |     destroy_ir (&conv_ir);
77 |     destroy_config (&conv_config);
78 | 
79 |     return 0;
80 | }
81 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
  1 | name: Test
  2 | 
  3 | on:
  4 |   pull_request:
  5 |     branches:
  6 |       - main
  7 |       - develop
  8 |   push:
  9 |     branches:
 10 |       - main
 11 |       - ci
 12 | 
 13 |   workflow_dispatch:
 14 | 
 15 | jobs:
 16 |   build_and_test:
 17 |     name: Test library on ${{ matrix.name }}
 18 |     runs-on: ${{ matrix.os }}
 19 |     strategy:
 20 |       fail-fast: false # show all errors for each platform (vs. cancel jobs on error)
 21 |       matrix:
 22 |         include:
 23 |           - name: Linux
 24 |             os: ubuntu-22.04
 25 |             cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15"
 26 |             nparallel: 4
 27 |           - name: Linux ASan
 28 |             os: ubuntu-22.04
 29 |             cmake_args: "-DCMAKE_LINKER_TYPE=MOLD -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15 -DCHOWDSP_CONVOLUTION_ASAN=ON"
 30 |             nparallel: 4
 31 |           - name: Windows-x64
 32 |             os: windows-2022
 33 |             cmake_args: -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl
 34 |             nparallel: 4
 35 |           - name: MacOS
 36 |             os: macos-14
 37 |             cmake_args: "-D\"CMAKE_OSX_ARCHITECTURES=arm64;x86_64\""
 38 |             nparallel: 4
 39 |           - name: MacOS ASan
 40 |             os: macos-14
 41 |             cmake_args: "-DCHOWDSP_CONVOLUTION_ASAN=ON"
 42 |             nparallel: 4
 43 | 
 44 |     steps:
 45 |       - name: Install Linux Deps
 46 |         if: runner.os == 'Linux'
 47 |         run: |
 48 |           sudo apt-get update
 49 |           sudo apt install libasound2-dev libcurl4-openssl-dev libx11-dev libxinerama-dev libxext-dev libfreetype6-dev libwebkit2gtk-4.0-dev libglu1-mesa-dev libjack-jackd2-dev
 50 |           sudo apt-add-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main"
 51 |           sudo apt install clang-15 -y
 52 | 
 53 |       - name: Install Ninja
 54 |         uses: seanmiddleditch/gha-setup-ninja@master
 55 | 
 56 |       - name: Get latest CMake
 57 |         uses: lukka/get-cmake@latest
 58 | 
 59 |       - uses: rui314/setup-mold@v1
 60 |         if: runner.os == 'Linux'
 61 | 
 62 |       - name: Upgrade LLVM
 63 |         if: runner.os == 'Windows'
 64 |         run: choco upgrade llvm --version=18.1.8 --allow-downgrade
 65 | 
 66 |       - name: Add msbuild to PATH
 67 |         if: runner.os == 'Windows'
 68 |         uses: microsoft/setup-msbuild@v2
 69 | 
 70 |       - name: Setup MSVC devcmd (x64)
 71 |         if: matrix.name == 'Windows-x64'
 72 |         uses: ilammy/msvc-dev-cmd@v1
 73 | 
 74 |       - name: Checkout code
 75 |         uses: actions/checkout@v4 # v2 targets a Node.js runtime that GitHub has deprecated
 76 | 
 77 |       - name: Cmake Configure
 78 |         run: cmake -Bbuild -G"Ninja Multi-Config" -DCHOWDSP_CONVOLUTION_TESTING=ON ${{ matrix.cmake_args }}
 79 | 
 80 |       - name: Build Test (Debug)
 81 |         run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_test
 82 | 
 83 |       - name: Run Test (Debug)
 84 |         run: ./build/test/Debug/chowdsp_convolution_test
 85 | 
 86 |       - name: Build Test (Release)
 87 |         run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_test
 88 | 
 89 |       - name: Run Test (Release)
 90 |         run: ./build/test/Release/chowdsp_convolution_test
 91 | 
 92 |       - name: Build Test C (Debug)
 93 |         run: cmake --build build --config Debug --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_c_test
 94 | 
 95 |       - name: Run Test C (Debug)
 96 |         run: ./build/test/Debug/chowdsp_convolution_c_test
 97 | 
 98 |       - name: Build Test C (Release)
 99 |         run: cmake --build build --config Release --parallel ${{ matrix.nparallel }} --target chowdsp_convolution_c_test
100 | 
101 |       - name: Run Test C (Release)
102 |         run: ./build/test/Release/chowdsp_convolution_c_test
103 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # chowdsp_convolution
  2 | 
  3 | [![Test](https://github.com/Chowdhury-DSP/chowdsp_convolution/actions/workflows/test.yml/badge.svg)](https://github.com/Chowdhury-DSP/chowdsp_convolution/actions/workflows/test.yml)
  4 | [![codecov](https://codecov.io/gh/Chowdhury-DSP/chowdsp_convolution/graph/badge.svg?token=3WCDKPHA58)](https://codecov.io/gh/Chowdhury-DSP/chowdsp_convolution)
  5 | 
  6 | `chowdsp_convolution` is a library for performing frequency-domain
  7 | convolution using [`chowdsp_fft`](https://github.com/Chowdhury-DSP/chowdsp_fft).
  8 | The library currently supports uniformly-partitioned convolutions,
  9 | as well as 2-stage non-uniformly-partitioned convolutions.
 10 | 
 11 | **N.B.: This library is still in early development, and
 12 | there will likely be breaking changes.** If you have
 13 | suggestions for ways to improve the API, or features to
 14 | add please create a GitHub Issue.
 15 | 
 16 | ## Usage
 17 | 
 18 | ### Basic Usage (mono IR, mono i/o, uniform partitioning)
 19 | 
 20 | First, create a `Config` object:
 21 | 
 22 | ```cpp
 23 | chowdsp::convolution::Config config {};
 24 | chowdsp::convolution::create_config (&config, maximum_block_size);
 25 | ```
 26 | 
 27 | We'll also allocate some "scratch" data that will be used for computing
 28 | FFTs under the hood:
 29 | 
 30 | ```cpp
 31 | float* fft_scratch = chowdsp::fft::aligned_malloc (config.fft_size * sizeof (float));
 32 | ```
 33 | 
 34 | Next, create a partitioned IR:
 35 | 
 36 | ```cpp
 37 | chowdsp::convolution::IR_Uniform ir {};
 38 | chowdsp::convolution::create_ir (&config, &ir, my_ir.data(), my_ir.size(), fft_scratch);
 39 | ```
 40 | 
 41 | Then we'll create a convolution "state".
 42 | ```cpp
 43 | chowdsp::convolution::Process_Uniform_State state {};
 44 | chowdsp::convolution::create_process_state (&config, &ir, &state);
 45 | ```
 46 | 
 47 | Now we're ready to process some data:
 48 | 
 49 | ```cpp
 50 | chowdsp::convolution::process_samples (&config, &ir, &state, data, data, num_samples, fft_scratch);
 51 | ```
 52 | 
 53 | Alternatively, we could use `process_samples_with_latency()` which is
 54 | faster, but adds `config.block_size` samples of latency.
 55 | 
 56 | Finally, let's clean up all our memory allocation:
 57 | 
 58 | ```cpp
 59 | chowdsp::fft::aligned_free (fft_scratch);
 60 | chowdsp::convolution::destroy_process_state (&state);
 61 | chowdsp::convolution::destroy_ir (&ir);
 62 | chowdsp::convolution::destroy_config (&config);
 63 | ```
 64 | 
 65 | ### Multi-Channel Processing (mono IR)
 66 | 
 67 | Let's say that you want to convolve a stereo audio stream with a mono IR.
 68 | We can use `create_multichannel_process_state()` to create a processing state
 69 | with a given number of channels.
 70 | 
 71 | ```cpp
 72 | chowdsp::convolution::Process_Uniform_State stereo_state {};
 73 | chowdsp::convolution::create_multichannel_process_state (&config, &ir, &stereo_state, 2);
 74 | ```
 75 | 
 76 | To process our audio, we'll want to use `process_samples_multichannel()`
 77 | (or `process_samples_multichannel_with_latency()`).
 78 | 
 79 | ```cpp
 80 | float* channel_data[2] {
 81 |     left_channel_data,
 82 |     right_channel_data,
 83 | };
 84 | chowdsp::convolution::process_samples_multichannel (&config, &ir, &stereo_state, channel_data, channel_data, num_samples, 2, fft_scratch);
 85 | ```
 86 | 
 87 | ### Multi-Channel IRs
 88 | 
 89 | Let's create a stereo, uniform-partitioned IR:
 90 | 
 91 | ```cpp
 92 | float* ir_data[2] {
 93 |     left_ir_data,
 94 |     right_ir_data,
 95 | };
 96 | chowdsp::convolution::IR_Uniform ir {};
 97 | chowdsp::convolution::create_multichannel_ir (&config, &ir, ir_data, ir_num_samples, 2, fft_scratch);
 98 | ```
 99 | 
100 | Now if we call `create_process_state()`, the state will automatically be created
101 | for the same number of channels as the IR.
102 | ```cpp
103 | chowdsp::convolution::Process_Uniform_State state {};
104 | chowdsp::convolution::create_process_state (&config, &ir, &state);
105 | ```
106 | 
107 | Then (as before), we can do our multi-channel processing:
108 | 
109 | ```cpp
110 | float* channel_data[2] {
111 |     left_channel_data,
112 |     right_channel_data,
113 | };
114 | chowdsp::convolution::process_samples_multichannel (&config, &ir, &state, channel_data, channel_data, num_samples, 2, fft_scratch);
115 | ```
116 | 
117 | ### Multi-Threaded Usage
118 | 
119 | What should you do if you're looking to load an impulse response
120 | on some thread *other* than the audio thread, while the audio
121 | thread is still running? The basic idea is that you should:
122 | - Create an `IR_Uniform` object on your background thread.
123 | - Create one `Process_Uniform_State` object per-channel on your background thread
124 |   - This step may be skipped if the new IR is the same length as the one currently on the audio thread.
125 | - Pass these objects to your audio thread (e.g. via a lock-free queue)
126 | - Pass the old IR and state objects to your background thread where they can be safely destroyed.
127 | 
128 | Note that the `Config` object is thread-safe, so you may use the
129 | same config on both your audio thread and background thread (e.g.
130 | when calling `create_ir()` or `load_ir()`). However, the `fft_scratch`
131 | is **not** thread-safe, so make sure to allocate a dedicated `fft_scratch`
132 | for each thread.
133 | 
134 | ## License
135 | 
136 | `chowdsp_convolution` is licensed under the BSD 3-clause license. Enjoy!
137 | 
138 | ### Disclaimer
139 | 
140 | This implementation is *loosely* based on some code from the
141 | [JUCE](https://github.com/juce-framework/juce) library. Personally,
142 | I think that I've changed enough of the code that this library should
143 | be considered an original work, rather than a "fork" of the JUCE
144 | implementation. That said, if you want to use this library in a
145 | commercial product and you don't have a JUCE license, I'd recommend
146 | looking through both codebases and deciding for yourself.
147 | 
148 | -- Jatin
149 | 


--------------------------------------------------------------------------------
/chowdsp_convolution.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #ifdef __cplusplus
  4 | #include <cstddef>
  5 | 
  6 | extern "C"
  7 | {
  8 | namespace chowdsp::convolution
  9 | {
 10 | #else
 11 | #include <stddef.h>
 12 | #include <stdbool.h>
 13 | #endif
 14 | 
 15 | /**
 16 |  * Convolution configuration.
 17 |  * This depends only on the maximum block size.
 18 |  */
 19 | struct Convolution_Config
 20 | {
 21 |     int block_size;
 22 |     int fft_size;
 23 |     void* fft;
 24 | };
 25 | #ifdef __cplusplus
 26 | using Config = Convolution_Config;
 27 | #endif
 28 | 
 29 | /** State for a uniform-partitioned IR. */
 30 | struct IR_Uniform
 31 | {
 32 |     float* segments;
 33 |     int num_segments;
 34 |     int max_num_segments;
 35 |     int num_channels;
 36 | };
 37 | 
 38 | /** State for processing a uniform-partitioned IR */
 39 | struct Process_Uniform_State
 40 | {
 41 |     struct State_Data
 42 |     {
 43 |         float* segments;
 44 |         float* input_data;
 45 |         float* output_data;
 46 |         float* output_temp_data;
 47 |         float* overlap_data;
 48 |     }* state_data;
 49 |     int max_num_segments;
 50 |     int current_segment;
 51 |     int input_data_pos;
 52 |     int num_channels;
 53 | };
 54 | 
 55 | /** State for processing a multi-channel uniform-partitioned IR */
 56 | struct Process_Multichannel_Uniform_State
 57 | {
 58 |     struct Process_Uniform_State state;
 59 |     int num_channels;
 60 | };
 61 | 
 62 | /** State for a mono non-uniform-partitioned IR. */
 63 | struct IR_Non_Uniform
 64 | {
 65 |     struct IR_Uniform head;
 66 |     struct IR_Uniform tail;
 67 |     const struct Convolution_Config* head_config;
 68 |     const struct Convolution_Config* tail_config;
 69 |     int head_size;
 70 | };
 71 | 
 72 | /** State for processing a mono non-uniform-partitioned IR */
 73 | struct Process_Non_Uniform_State
 74 | {
 75 |     struct Process_Uniform_State head;
 76 |     struct Process_Uniform_State tail;
 77 |     const struct Convolution_Config* head_config;
 78 |     const struct Convolution_Config* tail_config;
 79 | };
 80 | 
 81 | /** Returns the required FFT size for a given block size. */
 82 | int convolution_fft_size (int max_block_size);
 83 | 
 84 | /** The number of bytes required for `create_config()` with in-place construction. */
 85 | size_t config_bytes_required (int max_block_size);
 86 | 
 87 | /**
 88 |  * Creates a convolution config for a given maximum block size.
 89 |  * If no `place_data` pointer is provided, the config will allocate
 90 |  * its own memory, and the user must call `destroy_config()` to free
 91 |  * that memory. If a `place_data` pointer is provided, the config will
 92 |  * be constructed in-place, using the provided memory, and the user is
 93 |  * responsible for managing that memory themselves. The `place_data` pointer
 94 |  * must provide the number of bytes determined by `config_bytes_required()`,
 95 |  * and should be aligned to 64 bytes.
 96 |  */
 97 | void create_config (struct Convolution_Config*, int max_block_size, void* place_data
 98 | #ifdef __cplusplus
 99 |      = nullptr
100 | #endif
101 | );
102 | 
103 | /** De-allocates the config's internal data. */
104 | void destroy_config (struct Convolution_Config*);
105 | 
106 | /**
107 |  * Returns the number of bytes required to call `create_ir()`
108 |  * or `create_zero_ir()` with `place_data`.
109 |  */
110 | size_t ir_bytes_required (int max_block_size, int ir_num_samples);
111 | 
112 | /**
113 |  * Creates a monophonic IR.
114 |  *
115 |  * The fft_scratch pointer should point to
116 |  * an array of config->fft_size floats, and should
117 |  * have 64-byte alignment.
118 |  *
119 |  * If `place_data` is provided, the IR will be constructed in-place.
120 |  * Otherwise, memory will be allocated, and the user must call `destroy_ir()`
121 |  * to free that memory. `place_data` should be aligned to 64 bytes.
122 |  */
123 | void create_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* ir, int ir_num_samples, float* fft_scratch, void* place_data
124 | #ifdef __cplusplus
125 |      = nullptr
126 | #endif
127 | );
128 | 
129 | /**
130 |  * Creates a mono IR of a given size.
131 |  * The IR will be filled with zeros.
132 |  *
133 |  * See the requirements for `place_data` for `create_ir()`.
134 |  */
135 | void create_zero_ir (const struct Convolution_Config*, struct IR_Uniform*, int ir_num_samples, void* place_data
136 | #ifdef __cplusplus
137 |      = nullptr
138 | #endif
139 | );
140 | 
141 | /**
142 |  * Loads IR data.
143 |  * `ir_num_samples` must be less than or equal to the number of samples
144 |  * the IR was created to expect.
145 |  */
146 | void load_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* ir, int ir_num_samples, float* fft_scratch);
147 | 
148 | /**
149 |  * Returns the number of bytes required to call `create_multichannel_ir()`
150 |  * or `create_zero_multichannel_ir()` with `place_data`.
151 |  */
152 | size_t multichannel_ir_bytes_required (int max_block_size, int ir_num_samples, int num_channels);
153 | 
154 | /**
155 |  * Creates a multi-channel uniform-partitioned IR.
156 |  *
157 |  * The fft_scratch pointer should point to
158 |  * an array of config->fft_size floats, and should
159 |  * have 64-byte alignment.
160 |  *
161 |  * See the requirements for `place_data` for `create_ir()`.
162 |  */
163 | void create_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* const* ir, int ir_num_samples, int num_channels, float* fft_scratch, void* place_data
164 | #ifdef __cplusplus
165 |      = nullptr
166 | #endif
167 | );
168 | 
169 | /**
170 |  * Creates a multi-channel IR of a given size.
171 |  * The IR will be filled with zeros.
172 |  *
173 |  * See the requirements for `place_data` for `create_ir()`.
174 |  */
175 | void create_zero_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, int ir_num_samples, int num_channels, void* place_data
176 | #ifdef __cplusplus
177 |      = nullptr
178 | #endif
179 | );
180 | 
181 | /**
182 |  * Loads IR data.
183 |  * `ir_num_samples` must be less than or equal to the number of samples
184 |  * the IR was created to expect.
185 |  */
186 | void load_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*, const float* const* ir, int ir_num_samples, int num_channels, float* fft_scratch);
187 | 
188 | /** De-allocates the IR's internal data. */
189 | void destroy_ir (struct IR_Uniform*);
190 | 
191 | /**
192 |  * Returns the number of bytes required to call `create_process_state()`
193 |  * with `place_data`.
194 |  */
195 | size_t process_state_bytes_required (int max_block_size, int ir_num_samples);
196 | 
197 | /**
198 |  * Creates a process state object for a given IR.
199 |  * The process state will be created to process the same number of channels as the IR contains.
200 |  *
201 |  * If `place_data` is provided, the state will be constructed in-place.
202 |  * Otherwise, memory will be allocated, and the user must call `destroy_process_state()`
203 |  * to free that memory. `place_data` should be aligned to 64 bytes.
204 |  */
205 | void create_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*, void* place_data
206 | #ifdef __cplusplus
207 |      = nullptr
208 | #endif
209 | );
210 | 
211 | /**
212 |  * Returns the number of bytes required to call
213 |  * `create_multichannel_process_state()` with `place_data`.
214 |  */
215 | size_t multichannel_process_state_bytes_required (int max_block_size, int ir_num_samples, int num_channels);
216 | 
217 | /**
218 |  * Creates a process state object for a given IR, with a specific number of channels.
219 |  * This is useful for convolving a monophonic IR with multiple channels.
220 |  *
221 |  * See the requirements for `place_data` for `create_process_state()`.
222 |  */
223 | void create_multichannel_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*, int num_channels, void* place_data
224 | #ifdef __cplusplus
225 |      = nullptr
226 | #endif
227 | );
228 | 
229 | /** Zeros the process state. */
230 | void reset_process_state (const struct Convolution_Config*, struct Process_Uniform_State*);
231 | 
232 | /** Zeros the process state. */
233 | void reset_process_state_segments (const struct Convolution_Config*, struct Process_Uniform_State*, const struct IR_Uniform*);
234 | 
235 | /** De-allocates the state's internal data. */
236 | void destroy_process_state (struct Process_Uniform_State*);
237 | 
238 | /**
239 |  * Creates a monophonic non-uniform IR.
240 |  *
241 |  * The scratch pointer should point to an allocated block
242 |  * of at least get_required_nuir_scratch_bytes(), and should
243 |  * have 64-byte alignment.
244 |  */
245 | void create_nuir (struct IR_Non_Uniform*, const float* ir, int ir_num_samples, float* scratch);
246 | 
247 | /**
248 |  * Creates a mono non-uniform IR of a given size.
249 |  * The IR will be filled with zeros.
250 |  */
251 | void create_zero_nuir (struct IR_Non_Uniform*, int ir_num_samples);
252 | 
253 | /** Returns the required scratch size needed for this non-uniform IR. */
254 | int get_required_nuir_scratch_bytes (const struct IR_Non_Uniform*);
255 | 
256 | /**
257 |  * Loads IR data.
258 |  * `ir_num_samples` must be less than or equal to the number of samples
259 |  * the IR was created to expect.
260 |  */
261 | void load_nuir (struct IR_Non_Uniform*, const float* ir, int ir_num_samples, float* scratch);
262 | 
263 | /** De-allocates the IR's internal data. */
264 | void destroy_nuir (struct IR_Non_Uniform*);
265 | 
266 | /** Creates a mono process state object for a given IR. */
267 | void create_nuir_process_state (const struct IR_Non_Uniform*, struct Process_Non_Uniform_State*);
268 | 
269 | /** Zeros the process state. */
270 | void reset_nuir_process_state (struct Process_Non_Uniform_State*);
271 | 
272 | /** De-allocates the state's internal data. */
273 | void destroy_nuir_process_state (struct Process_Non_Uniform_State*);
274 | 
275 | /**
276 |  * Performs convolution processing for a given IR and state.
277 |  *
278 |  * The fft_scratch pointer should point to
279 |  * an array of config->fft_size floats, and should
280 |  * have 64-byte alignment.
281 |  */
282 | void process_samples (const struct Convolution_Config*,
283 |                       const struct IR_Uniform*,
284 |                       struct Process_Uniform_State*,
285 |                       const float* in,
286 |                       float* out,
287 |                       int N,
288 |                       float* fft_scratch);
289 | 
290 | /**
291 |  * Similar to process_samples(), but with an added
292 |  * config->block_size samples of latency. In exchange,
293 |  * the convolution processing will be a little bit
294 |  * faster, especially when processing with odd block
295 |  * sizes.
296 |  */
297 | void process_samples_with_latency (const struct Convolution_Config*,
298 |                                    const struct IR_Uniform*,
299 |                                    struct Process_Uniform_State*,
300 |                                    const float* in,
301 |                                    float* out,
302 |                                    int N,
303 |                                    float* fft_scratch);
304 | 
305 | /**
306 |  * Performs convolution processing for a given multi-channel IR and state.
307 |  *
308 |  * The fft_scratch pointer should point to
309 |  * an array of config->fft_size floats, and should
310 |  * have 64-byte alignment.
311 |  */
312 | void process_samples_multichannel (const struct Convolution_Config*,
313 |                                    const struct IR_Uniform*,
314 |                                    struct Process_Uniform_State*,
315 |                                    const float* const* in,
316 |                                    float* const* out,
317 |                                    int N,
318 |                                    int num_channels,
319 |                                    float* fft_scratch);
320 | 
321 | /**
322 |  * Similar to process_samples_multichannel(), but with an added
323 |  * config->block_size samples of latency. In exchange,
324 |  * the convolution processing will be a little bit
325 |  * faster, especially when processing with odd block
326 |  * sizes.
327 |  */
328 | void process_samples_with_latency_multichannel (const struct Convolution_Config*,
329 |                                                 const struct IR_Uniform*,
330 |                                                 struct Process_Uniform_State*,
331 |                                                 const float* const* in,
332 |                                                 float* const* out,
333 |                                                 int N,
334 |                                                 int num_channels,
335 |                                                 float* fft_scratch);
336 | 
337 | /**
338 |  * Performs convolution processing for a given non-uniform IR and state.
339 |  *
340 |  * The scratch pointer should point to an allocated block
341 |  * of at least get_required_nuir_scratch_bytes(), and should
342 |  * have 64-byte alignment.
343 |  */
344 | void process_samples_non_uniform (const struct IR_Non_Uniform*,
345 |                                   struct Process_Non_Uniform_State*,
346 |                                   const float* in,
347 |                                   float* out,
348 |                                   int N,
349 |                                   float* scratch);
350 | 
351 | #ifdef __cplusplus
352 | } // namespace chowdsp::convolution
353 | } // extern "C"
354 | #endif
355 | 


--------------------------------------------------------------------------------
/test/chowdsp_convolution_test.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <iostream>
  3 | #include <random>
  4 | 
  5 | #include <juce_dsp/juce_dsp.h>
  6 | 
  7 | #include <chowdsp_convolution.h>
  8 | #include <chowdsp_fft.h>
  9 | 
 10 | #include <chowdsp_data_structures/chowdsp_data_structures.h>
 11 | #include <chowdsp_buffers/chowdsp_buffers.h>
 12 | #include <chowdsp_buffers/Buffers/chowdsp_Buffer.cpp> // NOLINT
 13 | template class chowdsp::Buffer<float, 32>;
 14 | using Convolution_Internal_Buffer = chowdsp::Buffer<float, 32>;
 15 | 
 16 | struct ConvolutionEngine
 17 | {
 18 |     ConvolutionEngine (const float* samples,
 19 |                        size_t numSamples,
 20 |                        size_t maxBlockSize)
 21 |         : blockSize ((size_t) juce::nextPowerOfTwo ((int) maxBlockSize)),
 22 |           fftSize (blockSize > 128 ? 2 * blockSize : 4 * blockSize),
 23 |           fftObject (std::make_unique<juce::dsp::FFT> (juce::roundToInt (std::log2 (fftSize)))),
 24 |           numSegments (numSamples / (fftSize - blockSize) + 1u),
 25 |           numInputSegments ((blockSize > 128 ? numSegments : 3 * numSegments)),
 26 |           bufferInput (1, static_cast<int> (fftSize)),
 27 |           bufferOutput (1, static_cast<int> (fftSize * 2)),
 28 |           bufferTempOutput (1, static_cast<int> (fftSize * 2)),
 29 |           bufferOverlap (1, static_cast<int> (fftSize))
 30 |     {
 31 |         bufferOutput.clear();
 32 | 
 33 |         auto updateSegmentsIfNecessary = [this] (size_t numSegmentsToUpdate,
 34 |                                                  std::vector<Convolution_Internal_Buffer>& segments)
 35 |         {
 36 |             if (numSegmentsToUpdate == 0
 37 |                 || numSegmentsToUpdate != (size_t) segments.size()
 38 |                 || (size_t) segments[0].getNumSamples() != fftSize * 2)
 39 |             {
 40 |                 segments.clear();
 41 | 
 42 |                 for (size_t i = 0; i < numSegmentsToUpdate; ++i)
 43 |                     segments.push_back ({ 1, static_cast<int> (fftSize * 2) }); // NOLINT
 44 |             }
 45 |         };
 46 | 
 47 |         updateSegmentsIfNecessary (numInputSegments, buffersInputSegments);
 48 |         updateSegmentsIfNecessary (numSegments, buffersImpulseSegments);
 49 | 
 50 |         auto FFTTempObject = std::make_unique<juce::dsp::FFT> (juce::roundToInt (std::log2 (fftSize)));
 51 |         size_t currentPtr = 0;
 52 | 
 53 |         for (auto& buf : buffersImpulseSegments)
 54 |         {
 55 |             buf.clear();
 56 | 
 57 |             auto* impulseResponse = buf.getWritePointer (0);
 58 | 
 59 |             if (&buf == &buffersImpulseSegments.front())
 60 |                 impulseResponse[0] = 1.0f;
 61 | 
 62 |             juce::FloatVectorOperations::copy (impulseResponse,
 63 |                                                samples + currentPtr,
 64 |                                                static_cast<int> (juce::jmin (fftSize - blockSize, numSamples - currentPtr)));
 65 | 
 66 |             FFTTempObject->performRealOnlyForwardTransform (impulseResponse);
 67 |             prepareForConvolution (impulseResponse);
 68 | 
 69 |             currentPtr += (fftSize - blockSize);
 70 |         }
 71 | 
 72 |         reset();
 73 |     }
 74 | 
 75 |     void reset()
 76 |     {
 77 |         bufferInput.clear();
 78 |         bufferOverlap.clear();
 79 |         bufferTempOutput.clear();
 80 |         bufferOutput.clear();
 81 | 
 82 |         for (auto& buf : buffersInputSegments)
 83 |             buf.clear();
 84 | 
 85 |         currentSegment = 0;
 86 |         inputDataPos = 0;
 87 |     }
 88 | 
 89 |     void processSamples (const float* input, float* output, size_t numSamples)
 90 |     {
 91 |         // Overlap-add, zero latency convolution algorithm with uniform partitioning
 92 |         size_t numSamplesProcessed = 0;
 93 | 
 94 |         auto indexStep = numInputSegments / numSegments;
 95 | 
 96 |         auto* inputData = bufferInput.getWritePointer (0);
 97 |         auto* outputTempData = bufferTempOutput.getWritePointer (0);
 98 |         auto* outputData = bufferOutput.getWritePointer (0);
 99 |         auto* overlapData = bufferOverlap.getWritePointer (0);
100 | 
101 |         while (numSamplesProcessed < numSamples)
102 |         {
103 |             const bool inputDataWasEmpty = (inputDataPos == 0);
104 |             auto numSamplesToProcess = juce::jmin (numSamples - numSamplesProcessed, blockSize - inputDataPos);
105 | 
106 |             juce::FloatVectorOperations::copy (inputData + inputDataPos, input + numSamplesProcessed, static_cast<int> (numSamplesToProcess));
107 | 
108 |             auto* inputSegmentData = buffersInputSegments[currentSegment].getWritePointer (0);
109 |             juce::FloatVectorOperations::copy (inputSegmentData, inputData, static_cast<int> (fftSize));
110 | 
111 |             fftObject->performRealOnlyForwardTransform (inputSegmentData);
112 |             prepareForConvolution (inputSegmentData);
113 | 
114 |             // Complex multiplication
115 |             if (inputDataWasEmpty)
116 |             {
117 |                 juce::FloatVectorOperations::fill (outputTempData, 0, static_cast<int> (fftSize + 1));
118 | 
119 |                 auto index = currentSegment;
120 | 
121 |                 for (size_t i = 1; i < numSegments; ++i)
122 |                 {
123 |                     index += indexStep;
124 | 
125 |                     if (index >= numInputSegments)
126 |                         index -= numInputSegments;
127 | 
128 |                     convolutionProcessingAndAccumulate (buffersInputSegments[index].getWritePointer (0),
129 |                                                         buffersImpulseSegments[i].getWritePointer (0),
130 |                                                         outputTempData);
131 |                 }
132 |             }
133 | 
134 |             juce::FloatVectorOperations::copy (outputData, outputTempData, static_cast<int> (fftSize + 1));
135 | 
136 |             convolutionProcessingAndAccumulate (inputSegmentData,
137 |                                                 buffersImpulseSegments.front().getWritePointer (0),
138 |                                                 outputData);
139 | 
140 |             updateSymmetricFrequencyDomainData (outputData);
141 |             fftObject->performRealOnlyInverseTransform (outputData);
142 | 
143 |             // Add overlap
144 |             juce::FloatVectorOperations::add (&output[numSamplesProcessed], &outputData[inputDataPos], &overlapData[inputDataPos], (int) numSamplesToProcess);
145 | 
146 |             // Input buffer full => Next block
147 |             inputDataPos += numSamplesToProcess;
148 | 
149 |             if (inputDataPos == blockSize)
150 |             {
151 |                 // Input buffer is empty again now
152 |                 juce::FloatVectorOperations::fill (inputData, 0.0f, static_cast<int> (fftSize));
153 | 
154 |                 inputDataPos = 0;
155 | 
156 |                 // Extra step for segSize > blockSize
157 |                 juce::FloatVectorOperations::add (&(outputData[blockSize]), &(overlapData[blockSize]), static_cast<int> (fftSize - 2 * blockSize));
158 | 
159 |                 // Save the overlap
160 |                 juce::FloatVectorOperations::copy (overlapData, &(outputData[blockSize]), static_cast<int> (fftSize - blockSize));
161 | 
162 |                 currentSegment = (currentSegment > 0) ? (currentSegment - 1) : (numInputSegments - 1);
163 |             }
164 | 
165 |             numSamplesProcessed += numSamplesToProcess;
166 |         }
167 |     }
168 | 
169 |     void processSamplesWithAddedLatency (const float* input, float* output, size_t numSamples)
170 |     {
171 |         // Overlap-add convolution algorithm with uniform partitioning and one block (blockSize samples) of added latency
172 |         size_t numSamplesProcessed = 0;
173 | 
174 |         auto indexStep = numInputSegments / numSegments;
175 | 
176 |         auto* inputData = bufferInput.getWritePointer (0);
177 |         auto* outputTempData = bufferTempOutput.getWritePointer (0);
178 |         auto* outputData = bufferOutput.getWritePointer (0);
179 |         auto* overlapData = bufferOverlap.getWritePointer (0);
180 | 
181 |         while (numSamplesProcessed < numSamples)
182 |         {
183 |             auto numSamplesToProcess = juce::jmin (numSamples - numSamplesProcessed, blockSize - inputDataPos);
184 | 
185 |             juce::FloatVectorOperations::copy (inputData + inputDataPos, input + numSamplesProcessed, static_cast<int> (numSamplesToProcess));
186 | 
187 |             juce::FloatVectorOperations::copy (output + numSamplesProcessed, outputData + inputDataPos, static_cast<int> (numSamplesToProcess));
188 | 
189 |             numSamplesProcessed += numSamplesToProcess;
190 |             inputDataPos += numSamplesToProcess;
191 | 
192 |             // processing itself when needed (with latency)
193 |             if (inputDataPos == blockSize)
194 |             {
195 |                 // Copy input data in input segment
196 |                 auto* inputSegmentData = buffersInputSegments[currentSegment].getWritePointer (0);
197 |                 juce::FloatVectorOperations::copy (inputSegmentData, inputData, static_cast<int> (fftSize));
198 | 
199 |                 fftObject->performRealOnlyForwardTransform (inputSegmentData);
200 |                 prepareForConvolution (inputSegmentData);
201 | 
202 |                 // Complex multiplication
203 |                 juce::FloatVectorOperations::fill (outputTempData, 0, static_cast<int> (fftSize + 1));
204 | 
205 |                 auto index = currentSegment;
206 | 
207 |                 for (size_t i = 1; i < numSegments; ++i)
208 |                 {
209 |                     index += indexStep;
210 | 
211 |                     if (index >= numInputSegments)
212 |                         index -= numInputSegments;
213 | 
214 |                     convolutionProcessingAndAccumulate (buffersInputSegments[index].getWritePointer (0),
215 |                                                         buffersImpulseSegments[i].getWritePointer (0),
216 |                                                         outputTempData);
217 |                 }
218 | 
219 |                 juce::FloatVectorOperations::copy (outputData, outputTempData, static_cast<int> (fftSize + 1));
220 | 
221 |                 convolutionProcessingAndAccumulate (inputSegmentData,
222 |                                                     buffersImpulseSegments.front().getWritePointer (0),
223 |                                                     outputData);
224 | 
225 |                 updateSymmetricFrequencyDomainData (outputData);
226 |                 fftObject->performRealOnlyInverseTransform (outputData);
227 | 
228 |                 // Add overlap
229 |                 juce::FloatVectorOperations::add (outputData, overlapData, static_cast<int> (blockSize));
230 | 
231 |                 // Input buffer is empty again now
232 |                 juce::FloatVectorOperations::fill (inputData, 0.0f, static_cast<int> (fftSize));
233 | 
234 |                 // Extra step for segSize > blockSize
235 |                 juce::FloatVectorOperations::add (&(outputData[blockSize]), &(overlapData[blockSize]), static_cast<int> (fftSize - 2 * blockSize));
236 | 
237 |                 // Save the overlap
238 |                 juce::FloatVectorOperations::copy (overlapData, &(outputData[blockSize]), static_cast<int> (fftSize - blockSize));
239 | 
240 |                 currentSegment = (currentSegment > 0) ? (currentSegment - 1) : (numInputSegments - 1);
241 | 
242 |                 inputDataPos = 0;
243 |             }
244 |         }
245 |     }
246 | 
247 |     // After each FFT, this function is called to allow convolution to be performed with only 4 SIMD function calls.
248 |     void prepareForConvolution (float* samples) noexcept
249 |     {
250 |         auto FFTSizeDiv2 = fftSize / 2;
251 | 
252 |         for (size_t i = 0; i < FFTSizeDiv2; i++)
253 |             samples[i] = samples[i << 1];
254 | 
255 |         samples[FFTSizeDiv2] = 0;
256 | 
257 |         for (size_t i = 1; i < FFTSizeDiv2; i++)
258 |             samples[i + FFTSizeDiv2] = -samples[((fftSize - i) << 1) + 1];
259 |     }
260 | 
261 |     // Does the convolution operation itself only on half of the frequency domain samples.
262 |     void convolutionProcessingAndAccumulate (const float* input, const float* impulse, float* output)
263 |     {
264 |         auto FFTSizeDiv2 = fftSize / 2;
265 |         jassert (juce::isPowerOfTwo (FFTSizeDiv2) && FFTSizeDiv2 > 8);
266 |         jassert (juce::snapPointerToAlignment (input, (size_t) 32) == input);
267 |         jassert (juce::snapPointerToAlignment (impulse, (size_t) 32) == impulse);
268 |         jassert (juce::snapPointerToAlignment (output, (size_t) 32) == output);
269 | 
270 |         {
271 |             juce::FloatVectorOperations::addWithMultiply (output, input, impulse, static_cast<int> (FFTSizeDiv2));
272 |             juce::FloatVectorOperations::subtractWithMultiply (output, &(input[FFTSizeDiv2]), &(impulse[FFTSizeDiv2]), static_cast<int> (FFTSizeDiv2));
273 | 
274 |             juce::FloatVectorOperations::addWithMultiply (&(output[FFTSizeDiv2]), input, &(impulse[FFTSizeDiv2]), static_cast<int> (FFTSizeDiv2));
275 |             juce::FloatVectorOperations::addWithMultiply (&(output[FFTSizeDiv2]), &(input[FFTSizeDiv2]), impulse, static_cast<int> (FFTSizeDiv2));
276 |         }
277 | 
278 |         output[fftSize] += input[fftSize] * impulse[fftSize];
279 |     }
280 | 
281 |     // Undoes the re-organization of samples from the function prepareForConvolution.
282 |     // Then takes the conjugate of the frequency domain first half of samples to fill the
283 |     // second half, so that the inverse transform will return real samples in the time domain.
284 |     void updateSymmetricFrequencyDomainData (float* samples) noexcept
285 |     {
286 |         auto FFTSizeDiv2 = fftSize / 2;
287 | 
288 |         for (size_t i = 1; i < FFTSizeDiv2; i++)
289 |         {
290 |             samples[(fftSize - i) << 1] = samples[i];
291 |             samples[((fftSize - i) << 1) + 1] = -samples[FFTSizeDiv2 + i];
292 |         }
293 | 
294 |         samples[1] = 0.f;
295 | 
296 |         for (size_t i = 1; i < FFTSizeDiv2; i++)
297 |         {
298 |             samples[i << 1] = samples[(fftSize - i) << 1];
299 |             samples[(i << 1) + 1] = -samples[((fftSize - i) << 1) + 1];
300 |         }
301 |     }
302 | 
303 |     //==============================================================================
304 |     const size_t blockSize;
305 |     const size_t fftSize;
306 |     const std::unique_ptr<juce::dsp::FFT> fftObject;
307 |     const size_t numSegments;
308 |     const size_t numInputSegments;
309 |     size_t currentSegment = 0, inputDataPos = 0;
310 | 
311 |     Convolution_Internal_Buffer bufferInput, bufferOutput, bufferTempOutput, bufferOverlap;
312 |     std::vector<Convolution_Internal_Buffer> buffersInputSegments, buffersImpulseSegments;
313 | };
314 | 
315 | std::vector<float> generate (size_t N, std::mt19937& rng)
316 | {
317 |     std::vector<float> data {};
318 |     data.resize (N);
319 | 
320 |     std::uniform_real_distribution<float> dist { -1.0f, 1.0f };
321 |     for (auto& x : data)
322 |         x = dist (rng);
323 | 
324 |     return data;
325 | }
326 | 
327 | static bool test_convolution (int ir_length_samples, int block_size, int num_blocks, bool latency, bool preallocate)
328 | {
329 |     std::cout << "Running test with IR length: " << ir_length_samples
330 |               << ", block size: " << block_size
331 |               << ", latency: " << (latency ? "ON" : "OFF") << '\n';
332 | 
333 |     std::mt19937 rng { 0x12345 };
334 |     auto ir = generate (ir_length_samples, rng);
335 |     const auto input = generate (block_size * num_blocks, rng);
336 |     std::vector<float> ref_output (input.size());
337 | 
338 |     ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size };
339 |     auto start = std::chrono::high_resolution_clock::now();
340 |     for (int i = 0; i < num_blocks; ++i)
341 |     {
342 |         const auto* block_in = input.data() + (i * block_size);
343 |         auto* block_out_ref = ref_output.data() + (i * block_size);
344 |         if (latency)
345 |             reference_engine.processSamplesWithAddedLatency (block_in, block_out_ref, block_size);
346 |         else
347 |             reference_engine.processSamples (block_in, block_out_ref, block_size);
348 |     }
349 |     auto duration = std::chrono::high_resolution_clock::now() - start;
350 |     auto ref_duration_seconds = std::chrono::duration<float> (duration).count();
351 |     std::cout << "  juce::dsp::Convolution: " << ref_duration_seconds << " seconds" << std::endl;
352 | 
353 |     std::vector<float> test_output (input.size());
354 | 
355 |     const auto fft_size = chowdsp::convolution::convolution_fft_size (block_size);
356 |     const auto config_bytes = chowdsp::convolution::config_bytes_required (block_size);
357 |     const auto ir_bytes = chowdsp::convolution::ir_bytes_required (block_size, (int) ir.size());
358 |     const auto state_bytes = chowdsp::convolution::process_state_bytes_required (block_size, (int) ir.size());
359 |     size_t bytes_needed = config_bytes // config
360 |                           + fft_size * sizeof (float) // fft scratch
361 |                           + ir_bytes // ir
362 |                           + state_bytes; // state
363 |     chowdsp::ArenaAllocator<> arena { bytes_needed + 64 };
364 | 
365 |     chowdsp::convolution::Config conv_config {};
366 |     chowdsp::convolution::destroy_config (&conv_config); // destroying an empty config should be okay...
367 |     chowdsp::convolution::create_config (&conv_config, block_size, preallocate ? arena.allocate_bytes (config_bytes, 64) : nullptr);
368 |     auto* fft_scratch = arena.allocate<float> (conv_config.fft_size, 64);
369 | 
370 |     chowdsp::convolution::IR_Uniform conv_ir {};
371 |     chowdsp::convolution::destroy_ir (&conv_ir); // destroying an empty IR should be okay...
372 |     chowdsp::convolution::create_ir (&conv_config,
373 |                                      &conv_ir,
374 |                                      ir.data(),
375 |                                      (int) ir.size(),
376 |                                      fft_scratch,
377 |                                      preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr);
378 | 
379 |     chowdsp::convolution::Process_Uniform_State conv_state {};
380 |     chowdsp::convolution::destroy_process_state (&conv_state); // destroying an empty state should be okay...
381 |     chowdsp::convolution::create_process_state (&conv_config, &conv_ir, &conv_state, preallocate ? arena.allocate_bytes (state_bytes, 64) : nullptr);
382 | 
383 |     start = std::chrono::high_resolution_clock::now();
384 |     for (int i = 0; i < num_blocks; ++i)
385 |     {
386 |         const auto* block_in = input.data() + (i * block_size);
387 |         auto* block_out_test = test_output.data() + (i * block_size);
388 |         if (latency)
389 |         {
390 |             chowdsp::convolution::process_samples_with_latency (
391 |                 &conv_config,
392 |                 &conv_ir,
393 |                 &conv_state,
394 |                 block_in,
395 |                 block_out_test,
396 |                 block_size,
397 |                 fft_scratch);
398 |         }
399 |         else
400 |         {
401 |             chowdsp::convolution::process_samples (&conv_config,
402 |                                                    &conv_ir,
403 |                                                    &conv_state,
404 |                                                    block_in,
405 |                                                    block_out_test,
406 |                                                    block_size,
407 |                                                    fft_scratch);
408 |         }
409 |     }
410 |     duration = std::chrono::high_resolution_clock::now() - start;
411 |     auto test_duration_seconds = std::chrono::duration<float> (duration).count();
412 |     std::cout << "  chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl;
413 |     std::cout << "  chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n";
414 | 
415 |     if (! preallocate)
416 |     {
417 |         chowdsp::convolution::destroy_process_state (&conv_state);
418 |         chowdsp::convolution::destroy_ir (&conv_ir);
419 |         chowdsp::convolution::destroy_config (&conv_config);
420 |     }
421 | 
422 |     float error_accum {};
423 |     float max_error {};
424 |     for (int i = 0; i < test_output.size(); ++i)
425 |     {
426 |         const auto ref = ref_output[i];
427 |         const auto test = test_output[i];
428 |         const auto err = ref - test;
429 |         max_error = std::max (max_error, std::abs (err));
430 |         error_accum += err * err;
431 |     }
432 |     const auto mse = error_accum / static_cast<float> (test_output.size());
433 |     std::cout << "  Max error: " << max_error << '\n';
434 |     std::cout << "  Mean-squared error: " << mse << '\n';
435 | 
436 |     return max_error < 5.0e-4f && mse < 1.0e-9f;
437 | }
438 | 
// Compares chowdsp's multi-channel uniform convolution against the mono
// reference ConvolutionEngine, feeding every channel the same input and the
// same impulse response.
//
// ir_length_samples  length of the randomly generated impulse response
// block_size         number of samples passed to each process call
// num_blocks         number of blocks processed (input length = block_size * num_blocks)
// latency            selects the "with latency" processing path for both engines
// num_channels       number of channels run through the chowdsp engine
// mono_ir            when true a single mono IR is created (create_ir), otherwise
//                    a multi-channel IR holding num_channels copies of the same data
// preallocate        when true, config/IR/state memory is taken from a local arena
//                    instead of being allocated by the library
//
// Returns true when the max absolute error is below 5.0e-4 and the
// mean-squared error is below 1.0e-9 across all channels.
static bool test_convolution_multi_channel (int ir_length_samples,
                                            int block_size,
                                            int num_blocks,
                                            bool latency,
                                            int num_channels,
                                            bool mono_ir,
                                            bool preallocate)
{
    std::cout << "Running test with IR length: " << ir_length_samples
              << ", block size: " << block_size
              << ", latency: " << (latency ? "ON" : "OFF")
              << ", # channels: " << num_channels
              << ", mono IR: " << (mono_ir ? "ON" : "OFF") << '\n';

    // Fixed seed so that the IR and input are reproducible between runs.
    std::mt19937 rng { 0x12345 };
    auto ir = generate (ir_length_samples, rng);
    const auto input = generate (block_size * num_blocks, rng);
    std::vector<float> ref_output (input.size());

    // Reference pass: one mono run, timed, block by block.
    ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size };
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < num_blocks; ++i)
    {
        const auto* block_in = input.data() + (i * block_size);
        auto* block_out_ref = ref_output.data() + (i * block_size);
        if (latency)
            reference_engine.processSamplesWithAddedLatency (block_in, block_out_ref, block_size);
        else
            reference_engine.processSamples (block_in, block_out_ref, block_size);
    }
    auto duration = std::chrono::high_resolution_clock::now() - start;
    auto ref_duration_seconds = std::chrono::duration<float> (duration).count();
    std::cout << "  juce::dsp::Convolution: " << ref_duration_seconds << " seconds" << std::endl;

    // Every channel of the multi-channel IR points at the same mono IR data.
    std::vector<float*> multi_channel_ir {};
    for (int ch = 0; ch < num_channels; ++ch)
        multi_channel_ir.push_back (ir.data());

    // Output is stored channel after channel in one flat buffer; the pointer
    // arrays are refreshed for every block inside the processing loop.
    std::vector<float> test_output_flat (input.size() * num_channels);
    std::vector<const float*> test_input { (size_t) num_channels, nullptr };
    std::vector<float*> test_output { (size_t) num_channels, nullptr };

    // Byte budget for the arena: one region per sub-allocation made below.
    const auto fft_size = chowdsp::convolution::convolution_fft_size (block_size);
    const auto config_bytes = chowdsp::convolution::config_bytes_required (block_size);
    const auto ir_bytes = mono_ir ? chowdsp::convolution::ir_bytes_required (block_size, (int) ir.size())
                                  : chowdsp::convolution::multichannel_ir_bytes_required (block_size, (int) ir.size(), num_channels);
    const auto state_bytes = chowdsp::convolution::multichannel_process_state_bytes_required (block_size, (int) ir.size(), num_channels);
    size_t bytes_needed = config_bytes // config
                          + fft_size * sizeof (float) // fft scratch
                          + ir_bytes // ir
                          + state_bytes; // state
    // NOTE(review): the extra 64 bytes are presumably slack for the 64-byte
    // aligned sub-allocations below -- confirm against ArenaAllocator.
    chowdsp::ArenaAllocator<> arena { bytes_needed + 64 };

    // The FFT scratch buffer always comes from the arena; config, IR and state
    // only do so when `preallocate` is set (nullptr lets the library allocate).
    chowdsp::convolution::Config conv_config {};
    chowdsp::convolution::create_config (&conv_config, block_size, preallocate ? arena.allocate_bytes (config_bytes, 64) : nullptr);
    auto* fft_scratch = arena.allocate<float> (conv_config.fft_size, 64);

    chowdsp::convolution::IR_Uniform conv_ir {};
    if (mono_ir)
    {
        chowdsp::convolution::create_ir (&conv_config,
                                         &conv_ir,
                                         ir.data(),
                                         ir_length_samples,
                                         fft_scratch,
                                         preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr);
    }
    else
    {
        chowdsp::convolution::create_multichannel_ir (&conv_config,
                                                      &conv_ir,
                                                      multi_channel_ir.data(),
                                                      ir_length_samples,
                                                      num_channels,
                                                      fft_scratch,
                                                      preallocate ? arena.allocate_bytes (ir_bytes, 64) : nullptr);
    }

    chowdsp::convolution::Process_Uniform_State conv_state {};
    chowdsp::convolution::create_multichannel_process_state (&conv_config, &conv_ir, &conv_state, num_channels, preallocate ? arena.allocate_bytes (state_bytes, 64) : nullptr);

    // Timed pass through the chowdsp engine.
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < num_blocks; ++i)
    {
        // All channels read the same input block; each writes to its own
        // region of the flat output buffer.
        for (int ch = 0; ch < num_channels; ++ch)
        {
            test_input[ch] = input.data() + (i * block_size);
            test_output[ch] = test_output_flat.data() + (input.size() * ch) + (i * block_size);
        }

        if (latency)
        {
            chowdsp::convolution::process_samples_with_latency_multichannel (
                &conv_config,
                &conv_ir,
                &conv_state,
                test_input.data(),
                test_output.data(),
                block_size,
                num_channels,
                fft_scratch);
        }
        else
        {
            chowdsp::convolution::process_samples_multichannel (&conv_config,
                                                                &conv_ir,
                                                                &conv_state,
                                                                test_input.data(),
                                                                test_output.data(),
                                                                block_size,
                                                                num_channels,
                                                                fft_scratch);
        }
    }
    duration = std::chrono::high_resolution_clock::now() - start;
    auto test_duration_seconds = std::chrono::duration<float> (duration).count();
    std::cout << "  chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl;
    std::cout << "  chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n";

    // The destroy functions are only called for memory the library allocated
    // itself; in the preallocated case the storage belongs to the arena.
    if (! preallocate)
    {
        chowdsp::convolution::destroy_ir (&conv_ir);
        chowdsp::convolution::destroy_process_state (&conv_state);
        chowdsp::convolution::destroy_config (&conv_config);
    }

    // Every channel is compared against the single mono reference output.
    float error_accum {};
    float max_error {};
    for (int ch = 0; ch < num_channels; ++ch)
    {
        for (int i = 0; i < input.size(); ++i)
        {
            const auto ref = ref_output[i];
            const auto test = test_output_flat[ch * input.size() + i];
            const auto err = ref - test;
            max_error = std::max (max_error, std::abs (err));
            error_accum += err * err;
        }
    }
    const auto mse = error_accum / static_cast<float> (test_output_flat.size());
    std::cout << "  Max error: " << max_error << '\n';
    std::cout << "  Mean-squared error: " << mse << '\n';

    return max_error < 5.0e-4f && mse < 1.0e-9f;
}
584 | 
585 | static bool test_convolution_non_uniform (int ir_length_samples, int block_size, int num_blocks, int head_size)
586 | {
587 |     std::cout << "Running test with IR length: " << ir_length_samples
588 |               << ", block size: " << block_size
589 |               << ", head size: " << head_size << '\n';
590 | 
591 |     std::mt19937 rng { 0x12345 };
592 |     auto ir = generate (ir_length_samples, rng);
593 |     const auto input = generate (block_size * num_blocks, rng);
594 |     std::vector<float> ref_output (input.size());
595 | 
596 |     ConvolutionEngine reference_engine { ir.data(), ir.size(), (size_t) block_size };
597 |     auto start = std::chrono::high_resolution_clock::now();
598 |     for (int i = 0; i < num_blocks; ++i)
599 |     {
600 |         const auto* block_in = input.data() + (i * block_size);
601 |         auto* block_out_ref = ref_output.data() + (i * block_size);
602 |         reference_engine.processSamples (block_in, block_out_ref, block_size);
603 |     }
604 |     auto duration = std::chrono::high_resolution_clock::now() - start;
605 |     auto ref_duration_seconds = std::chrono::duration<float> (duration).count();
606 |     std::cout << "  juce::dsp::Convolution: " << ref_duration_seconds << " seconds" << std::endl;
607 | 
608 |     std::vector<float> test_output (input.size());
609 |     chowdsp::convolution::Config head_config {};
610 |     chowdsp::convolution::create_config (&head_config, block_size);
611 |     chowdsp::convolution::Config tail_config {};
612 |     chowdsp::convolution::create_config (&tail_config, head_size);
613 | 
614 |     chowdsp::convolution::IR_Non_Uniform conv_ir {};
615 |     chowdsp::convolution::destroy_nuir (&conv_ir);
616 |     conv_ir.head_config = &head_config;
617 |     conv_ir.tail_config = &tail_config;
618 |     conv_ir.head_size = head_size;
619 |     auto* scratch = (float*) chowdsp::fft::aligned_malloc (chowdsp::convolution::get_required_nuir_scratch_bytes (&conv_ir));
620 | 
621 |     chowdsp::convolution::create_nuir (&conv_ir,
622 |                                        ir.data(),
623 |                                        (int) ir.size(),
624 |                                        scratch);
625 | 
626 |     chowdsp::convolution::Process_Non_Uniform_State conv_state {};
627 |     chowdsp::convolution::destroy_nuir_process_state (&conv_state); // destroying an empty state should be okay...
628 |     chowdsp::convolution::create_nuir_process_state (&conv_ir, &conv_state);
629 | 
630 |     start = std::chrono::high_resolution_clock::now();
631 |     for (int i = 0; i < num_blocks; ++i)
632 |     {
633 |         const auto* block_in = input.data() + (i * block_size);
634 |         auto* block_out_test = test_output.data() + (i * block_size);
635 |         chowdsp::convolution::process_samples_non_uniform (&conv_ir,
636 |                                                            &conv_state,
637 |                                                            block_in,
638 |                                                            block_out_test,
639 |                                                            block_size,
640 |                                                            scratch);
641 |     }
642 |     duration = std::chrono::high_resolution_clock::now() - start;
643 |     auto test_duration_seconds = std::chrono::duration<float> (duration).count();
644 |     std::cout << "  chowdsp_convolution: " << test_duration_seconds << " seconds" << std::endl;
645 |     std::cout << "  chowdsp is " << ref_duration_seconds / test_duration_seconds << "x faster\n";
646 | 
647 |     chowdsp::fft::aligned_free (scratch);
648 |     chowdsp::convolution::destroy_nuir (&conv_ir);
649 |     chowdsp::convolution::destroy_nuir_process_state (&conv_state);
650 |     chowdsp::convolution::destroy_config (&head_config);
651 |     chowdsp::convolution::destroy_config (&tail_config);
652 | 
653 |     float error_accum {};
654 |     float max_error {};
655 |     for (int i = 0; i < test_output.size(); ++i)
656 |     {
657 |         const auto ref = ref_output[i];
658 |         const auto test = test_output[i];
659 |         const auto err = ref - test;
660 |         max_error = std::max (max_error, std::abs (err));
661 |         error_accum += err * err;
662 |     }
663 |     const auto mse = error_accum / static_cast<float> (test_output.size());
664 |     std::cout << "  Max error: " << max_error << '\n';
665 |     std::cout << "  Mean-squared error: " << mse << '\n';
666 | 
667 |     return max_error < 5.0e-4f && mse < 1.0e-9f;
668 | }
669 | 
670 | int main()
671 | {
672 |     auto success = true;
673 |     for (bool preallocate : { false, true })
674 |     {
675 |         for (bool latency : { false, true })
676 |         {
677 |             success &= test_convolution (6000, 2048, 4, latency, preallocate);
678 |             success &= test_convolution (6000, 512, 20, latency, preallocate);
679 |             success &= test_convolution (6000, 511, 20, latency, preallocate);
680 |             success &= test_convolution (6000, 32, 400, latency, preallocate);
681 |             success &= test_convolution (100, 2048, 2, latency, preallocate);
682 |             success &= test_convolution (100, 512, 4, latency, preallocate);
683 |             success &= test_convolution (100, 511, 4, latency, preallocate);
684 |             success &= test_convolution (100, 32, 10, latency, preallocate);
685 | 
686 |             success &= test_convolution_multi_channel (6000, 2048, 4, latency, 2, false, preallocate);
687 |             success &= test_convolution_multi_channel (100, 32, 10, latency, 4, false, preallocate);
688 |             success &= test_convolution_multi_channel (6000, 512, 4, latency, 2, true, preallocate);
689 |             success &= test_convolution_multi_channel (100, 511, 10, latency, 4, true, preallocate);
690 |         }
691 |     }
692 | 
693 |     success &= test_convolution_non_uniform (6000, 2048, 4, 2048);
694 |     success &= test_convolution_non_uniform (6000, 512, 20, 1024);
695 |     success &= test_convolution_non_uniform (6000, 511, 20, 1024);
696 |     success &= test_convolution_non_uniform (6000, 32, 400, 1024);
697 |     success &= test_convolution_non_uniform (200, 32, 10, 64);
698 | 
699 |     std::cout << "Speed comparisons:\n";
700 |     success &= test_convolution (48'000, 512, 10'000, false, true);
701 |     success &= test_convolution (48'000, 512, 10'000, true, true);
702 |     success &= test_convolution_non_uniform (48'000, 512, 10'000, 2048);
703 | 
704 |     return success ? 0 : 1;
705 | }
706 | 


--------------------------------------------------------------------------------
/chowdsp_convolution.cpp:
--------------------------------------------------------------------------------
  1 | #include "chowdsp_convolution.h"
  2 | 
  3 | #include <cassert>
  4 | #include <cstring>
  5 | 
  6 | #include <chowdsp_fft.h>
  7 | 
  8 | namespace chowdsp::convolution
  9 | {
 10 | static int min_int (int a, int b)
 11 | {
 12 |     return (b < a) ? b : a;
 13 | }
 14 | 
 15 | static int max_int (int a, int b)
 16 | {
 17 |     return (b > a) ? b : a;
 18 | }
 19 | 
 20 | static int next_pow2 (int v) noexcept
 21 | {
 22 |     --v;
 23 |     v |= (v >> 1);
 24 |     v |= (v >> 2);
 25 |     v |= (v >> 4);
 26 |     v |= (v >> 8);
 27 |     v |= (v >> 16);
 28 |     return v + 1;
 29 | }
 30 | 
 31 | static int pad_floats (int N)
 32 | {
 33 |     static constexpr int pad_len = 16;
 34 |     const auto N_div = (N + pad_len - 1) / pad_len;
 35 |     return N_div * pad_len;
 36 | }
 37 | 
 38 | static int pad_bytes (int N)
 39 | {
 40 |     static constexpr int pad_len = 64;
 41 |     const auto N_div = (N + pad_len - 1) / pad_len;
 42 |     return N_div * pad_len;
 43 | }
 44 | 
 45 | static void get_block_and_fft_sizes (int max_block_size, int& block_size, int& fft_size)
 46 | {
 47 |     block_size = next_pow2 (max_block_size);
 48 |     fft_size = block_size > 128 ? 2 * block_size : 4 * block_size;
 49 | }
 50 | 
 51 | int convolution_fft_size (int max_block_size)
 52 | {
 53 |     [[maybe_unused]] int block_size, fft_size;
 54 |     get_block_and_fft_sizes (max_block_size, block_size, fft_size);
 55 |     return fft_size;
 56 | }
 57 | 
 58 | void create_config (Config* config, int max_block_size, void* data)
 59 | {
 60 |     get_block_and_fft_sizes (max_block_size, config->block_size, config->fft_size);
 61 |     if (data == nullptr)
 62 |         config->fft = fft::fft_new_setup (config->fft_size, fft::FFT_REAL);
 63 |     else
 64 |         config->fft = fft::fft_new_setup_preallocated (config->fft_size, fft::FFT_REAL, data);
 65 | }
 66 | 
 67 | size_t config_bytes_required (int max_block_size)
 68 | {
 69 |     const auto fft_size = convolution_fft_size (max_block_size);
 70 |     return fft::fft_bytes_required (fft_size, fft::FFT_REAL);
 71 | }
 72 | 
 73 | void destroy_config (Config* config)
 74 | {
 75 |     if (config->fft != nullptr)
 76 |         fft::fft_destroy_setup (config->fft);
 77 |     *config = {};
 78 | }
 79 | 
 80 | //================================================================================================================
 81 | void create_ir (const Config* config, IR_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch, void* data)
 82 | {
 83 |     create_zero_ir (config, ir, ir_num_samples, data);
 84 |     load_ir (config, ir, ir_data, ir_num_samples, fft_scratch);
 85 | }
 86 | 
 87 | static int get_num_segments (int fft_size, int block_size, int ir_num_samples)
 88 | {
 89 |     return (ir_num_samples / (fft_size - block_size)) + 1;
 90 | }
 91 | 
 92 | static int get_num_segments (const Config* config, int ir_num_samples)
 93 | {
 94 |     return get_num_segments (config->fft_size, config->block_size, ir_num_samples);
 95 | }
 96 | 
 97 | static float* get_segment (const Config* config, float* segments, int segment_idx)
 98 | {
 99 |     return segments + config->fft_size * segment_idx;
100 | }
101 | 
102 | static void create_zero_ir_num_segments (const Config* config, IR_Uniform* ir, int num_segments, void* data)
103 | {
104 |     const auto segment_num_samples = config->fft_size;
105 | 
106 |     if (data == nullptr)
107 |     {
108 |         size_t bytes_needed = segment_num_samples * num_segments * sizeof (float);
109 |         data = fft::aligned_malloc (bytes_needed);
110 |     }
111 | 
112 |     ir->max_num_segments = num_segments;
113 |     ir->num_segments = num_segments;
114 | 
115 |     ir->segments = static_cast<float*> (data);
116 |     memset (ir->segments, 0, ir->num_segments * segment_num_samples * sizeof (float));
117 | }
118 | 
119 | size_t ir_bytes_required (int max_block_size, int ir_num_samples)
120 | {
121 |     int block_size, fft_size;
122 |     get_block_and_fft_sizes (max_block_size, block_size, fft_size);
123 |     const auto num_segments = get_num_segments (fft_size, block_size, ir_num_samples);
124 |     const auto segment_num_samples = fft_size;
125 |     return segment_num_samples * num_segments * sizeof (float);
126 | }
127 | 
128 | void create_zero_ir (const Config* config, IR_Uniform* ir, int ir_num_samples, void* data)
129 | {
130 |     create_zero_ir_num_segments (config, ir, get_num_segments (config, ir_num_samples), data);
131 |     ir->num_channels = 1;
132 | }
133 | 
134 | void load_ir (const Config* config, IR_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch)
135 | {
136 |     assert (ir->num_channels == 1);
137 | 
138 |     const auto num_segments = get_num_segments (config, ir_num_samples);
139 |     assert (num_segments <= ir->max_num_segments); // IR is too large for the allocated number of segments
140 |     ir->num_segments = num_segments;
141 | 
142 |     int current_ptr {};
143 |     for (int seg_idx = 0; seg_idx < ir->num_segments; ++seg_idx)
144 |     {
145 |         float* segment = get_segment (config, ir->segments, seg_idx);
146 |         const auto segment_n = min_int (config->fft_size - config->block_size, ir_num_samples - current_ptr);
147 |         memcpy (segment, ir_data + current_ptr, segment_n * sizeof (float));
148 |         memset (segment + segment_n, 0, (config->fft_size - segment_n) * sizeof (float));
149 |         fft::fft_transform_unordered (config->fft,
150 |                                       segment,
151 |                                       segment,
152 |                                       fft_scratch,
153 |                                       fft::FFT_FORWARD);
154 |         current_ptr += segment_n;
155 |     }
156 | }
157 | 
158 | void destroy_ir (IR_Uniform* ir)
159 | {
160 |     fft::aligned_free (ir->segments);
161 |     *ir = {};
162 | }
163 | 
164 | size_t multichannel_ir_bytes_required (int max_block_size, int ir_num_samples, int num_channels)
165 | {
166 |     return ir_bytes_required (max_block_size, ir_num_samples) * num_channels;
167 | }
168 | 
169 | void create_multichannel_ir (const Config* config, IR_Uniform* ir, const float* const* ir_data, int ir_num_samples, int num_channels, float* fft_scratch, void* data)
170 | {
171 |     create_zero_multichannel_ir (config, ir, ir_num_samples, num_channels, data);
172 |     load_multichannel_ir (config, ir, ir_data, ir_num_samples, num_channels, fft_scratch);
173 | }
174 | 
175 | void create_zero_multichannel_ir (const Config* config, IR_Uniform* ir, int ir_num_samples, int num_channels, void* data)
176 | {
177 |     const auto mono_ir_num_segments = get_num_segments (config, ir_num_samples);
178 | 
179 |     create_zero_ir_num_segments (config, ir, mono_ir_num_segments * num_channels, data);
180 |     assert (ir->num_segments % num_channels == 0);
181 |     const auto actual_num_segments = ir->num_segments / num_channels;
182 |     ir->num_segments = actual_num_segments;
183 |     ir->max_num_segments = actual_num_segments;
184 |     ir->num_channels = num_channels;
185 | }
186 | 
187 | void load_multichannel_ir (const Config* config, IR_Uniform* ir, const float* const* ir_data, int ir_num_samples, int num_channels, float* fft_scratch)
188 | {
189 |     assert (num_channels == ir->num_channels);
190 | 
191 |     int new_num_segments = 0;
192 |     for (int ch = 0; ch < num_channels; ++ch)
193 |     {
194 |         IR_Uniform this_channel_ir {
195 |             .segments = get_segment (config, ir->segments, ch * ir->max_num_segments),
196 |             .num_segments = ir->num_segments,
197 |             .max_num_segments = ir->max_num_segments,
198 |             .num_channels = 1,
199 |         };
200 |         load_ir (config, &this_channel_ir, ir_data[ch], ir_num_samples, fft_scratch);
201 |         new_num_segments = this_channel_ir.num_segments;
202 |     }
203 |     ir->num_segments = new_num_segments;
204 | }
205 | 
206 | //================================================================================================================
// Number of input-history segments a process state holds for an IR with
// `ir_num_segments` segments: the same count for block sizes above 128,
// three times as many otherwise.
static int state_max_num_segments (int block_size, int ir_num_segments)
{
    if (block_size > 128)
        return ir_num_segments;
    return 3 * ir_num_segments;
}
211 | 
212 | static size_t state_data_bytes_needed (int fft_size, int block_size, int ir_num_segments, int num_channels)
213 | {
214 |     size_t bytes_needed {};
215 | 
216 |     const auto segment_num_samples = fft_size;
217 |     const auto max_num_segments = state_max_num_segments (block_size, ir_num_segments);
218 |     bytes_needed += segment_num_samples * max_num_segments * sizeof (float);
219 | 
220 |     bytes_needed += fft_size * sizeof (float); // input data
221 |     bytes_needed += fft_size * sizeof (float); // output data
222 |     bytes_needed += fft_size * sizeof (float); // output temp data
223 |     bytes_needed += fft_size * sizeof (float); // overlap data
224 |     return bytes_needed * num_channels;
225 | }
226 | 
227 | static void state_data_partition_memory (const Config* config, Process_Uniform_State* state, Process_Uniform_State::State_Data& state_data, float*& data)
228 | {
229 |     const auto segment_num_samples = config->fft_size;
230 | 
231 |     state_data.segments = data;
232 |     data += segment_num_samples * state->max_num_segments;
233 |     state_data.input_data = data;
234 |     data += config->fft_size;
235 |     state_data.output_data = data;
236 |     data += config->fft_size;
237 |     state_data.output_temp_data = data;
238 |     data += config->fft_size;
239 |     state_data.overlap_data = data;
240 |     data += config->fft_size;
241 | }
242 | 
243 | size_t multichannel_process_state_bytes_required (int max_block_size, int ir_num_samples, int num_channels)
244 | {
245 |     int block_size, fft_size;
246 |     get_block_and_fft_sizes (max_block_size, block_size, fft_size);
247 |     const auto ir_num_segments = get_num_segments (fft_size, block_size, ir_num_samples);
248 |     return state_data_bytes_needed (fft_size, block_size, ir_num_segments, num_channels)
249 |            + pad_bytes (sizeof (Process_Uniform_State::State_Data) * num_channels);
250 | }
251 | 
252 | void create_multichannel_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state, int num_channels, void* data)
253 | {
254 |     using State_Data = Process_Uniform_State::State_Data;
255 |     state->num_channels = num_channels;
256 |     state->max_num_segments = state_max_num_segments (config->block_size, ir->max_num_segments);
257 | 
258 |     const auto state_bytes_needed = state_data_bytes_needed (config->fft_size, config->block_size, ir->max_num_segments, num_channels);
259 |     if (data == nullptr)
260 |         data = fft::aligned_malloc (state_bytes_needed + num_channels * sizeof (State_Data));
261 |     state->state_data = reinterpret_cast<State_Data*> (static_cast<std::byte*> (data) + state_bytes_needed);
262 | 
263 |     auto* float_data = static_cast<float*> (data);
264 |     for (int ch = 0; ch < state->num_channels; ++ch)
265 |         state_data_partition_memory (config, state, state->state_data[ch], float_data);
266 |     assert (static_cast<void*> (float_data) == static_cast<void*> (state->state_data));
267 | 
268 |     reset_process_state (config, state);
269 | }
270 | 
271 | size_t process_state_bytes_required (int block_size, int ir_num_samples)
272 | {
273 |     return multichannel_process_state_bytes_required (block_size, ir_num_samples, 1);
274 | }
275 | 
276 | void create_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state, void* data)
277 | {
278 |     create_multichannel_process_state (config, ir, state, ir->num_channels, data);
279 | }
280 | 
281 | void reset_process_state (const Config* config, Process_Uniform_State* state)
282 | {
283 |     state->current_segment = 0;
284 |     state->input_data_pos = 0;
285 | 
286 |     const auto segment_num_samples = config->fft_size;
287 |     for (int ch = 0; ch < state->num_channels; ++ch)
288 |     {
289 |         auto& state_data = state->state_data[ch];
290 |         memset (state_data.segments,
291 |                 0,
292 |                 segment_num_samples * state->max_num_segments * sizeof (float));
293 | 
294 |         memset (state_data.input_data, 0, config->fft_size * sizeof (float));
295 |         memset (state_data.output_data, 0, config->fft_size * sizeof (float));
296 |         memset (state_data.output_temp_data, 0, config->fft_size * sizeof (float));
297 |         memset (state_data.overlap_data, 0, config->fft_size * sizeof (float));
298 |     }
299 | }
300 | 
301 | void reset_process_state_segments (const Convolution_Config* config, Process_Uniform_State* state, const IR_Uniform* ir)
302 | {
303 |     const auto segment_num_samples = config->fft_size;
304 |     for (int ch = 0; ch < state->num_channels; ++ch)
305 |     {
306 |         auto& state_data = state->state_data[ch];
307 |         memset (state_data.segments + segment_num_samples * ir->num_segments,
308 |                 0,
309 |                 segment_num_samples * (state->max_num_segments - ir->num_segments) * sizeof (float));
310 |     }
311 | }
312 | 
313 | void destroy_process_state (Process_Uniform_State* state)
314 | {
315 |     if (state->state_data != nullptr)
316 |         fft::aligned_free (state->state_data[0].segments);
317 |     *state = {};
318 | }
319 | 
320 | //================================================================================================================
321 | int get_required_nuir_scratch_bytes (const IR_Non_Uniform* ir)
322 | {
323 |     assert (ir->head_config != nullptr);
324 |     assert (ir->tail_config != nullptr);
325 |     return static_cast<int> ((max_int (ir->head_config->fft_size,
326 |                                        ir->tail_config->fft_size)
327 |                               + pad_floats (ir->head_config->block_size))
328 |                              * sizeof (float));
329 | }
330 | 
331 | void create_nuir (IR_Non_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch)
332 | {
333 |     create_zero_nuir (ir, ir_num_samples);
334 |     load_nuir (ir, ir_data, ir_num_samples, fft_scratch);
335 | }
336 | 
337 | void create_zero_nuir (IR_Non_Uniform* ir, int ir_num_samples)
338 | {
339 |     assert (ir->head_config != nullptr);
340 |     assert (ir->tail_config != nullptr);
341 |     assert (ir->head_size >= ir->head_config->block_size);
342 |     assert (ir->tail_config->block_size == ir->head_size);
343 |     assert (ir_num_samples >= 2 * ir->head_size);
344 | 
345 |     const auto head_num_segments = get_num_segments (ir->head_config, ir->head_size);
346 |     const auto head_segments_length = head_num_segments * ir->head_config->fft_size;
347 |     const auto tail_num_segments = get_num_segments (ir->tail_config, ir_num_samples - ir->head_size);
348 |     const auto tail_segments_length = tail_num_segments * ir->tail_config->fft_size;
349 |     const auto total_segments_length = head_segments_length + tail_segments_length;
350 | 
351 |     auto* segment_data = static_cast<float*> (fft::aligned_malloc (total_segments_length * sizeof (float)));
352 |     memset (segment_data, 0, total_segments_length * sizeof (float));
353 | 
354 |     ir->head.segments = segment_data;
355 |     ir->head.num_segments = head_num_segments;
356 |     ir->head.max_num_segments = head_num_segments;
357 |     ir->head.num_channels = 1;
358 |     ir->tail.segments = segment_data + head_segments_length;
359 |     ir->tail.num_segments = tail_num_segments;
360 |     ir->tail.max_num_segments = tail_num_segments;
361 |     ir->tail.num_channels = 1;
362 | }
363 | 
364 | void load_nuir (IR_Non_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch)
365 | {
366 |     load_ir (ir->head_config, &ir->head, ir_data, min_int (ir_num_samples, ir->head_size), fft_scratch);
367 |     load_ir (ir->tail_config, &ir->tail, ir_data + ir->head_size, max_int (ir_num_samples - ir->head_size, 0), fft_scratch);
368 | }
369 | 
370 | void destroy_nuir (IR_Non_Uniform* ir)
371 | {
372 |     fft::aligned_free (ir->head.segments);
373 |     *ir = {};
374 | }
375 | 
376 | //================================================================================================================
377 | void create_nuir_process_state (const IR_Non_Uniform* ir, Process_Non_Uniform_State* state)
378 | {
379 |     using State_Data = Process_Uniform_State::State_Data;
380 | 
381 |     state->head.num_channels = 1; // @TODO
382 |     state->head_config = ir->head_config;
383 |     state->tail.num_channels = 1; // @TODO
384 |     state->tail_config = ir->tail_config;
385 | 
386 |     state->head.max_num_segments = state_max_num_segments (ir->head_config->block_size, ir->head.max_num_segments);
387 |     state->tail.max_num_segments = state_max_num_segments (ir->tail_config->block_size, ir->tail.max_num_segments);
388 | 
389 |     const auto head_state_bytes_needed = state_data_bytes_needed (state->head_config->fft_size,
390 |                                                                   state->head_config->block_size,
391 |                                                                   ir->head.max_num_segments,
392 |                                                                   1);
393 |     const auto tail_state_bytes_needed = state_data_bytes_needed (state->tail_config->fft_size,
394 |                                                                   state->tail_config->block_size,
395 |                                                                   ir->tail.max_num_segments,
396 |                                                                   1);
397 |     auto* data = fft::aligned_malloc (head_state_bytes_needed + tail_state_bytes_needed + 2 * sizeof (State_Data));
398 |     state->head.state_data = reinterpret_cast<State_Data*> (static_cast<std::byte*> (data) + head_state_bytes_needed + tail_state_bytes_needed);
399 |     state->tail.state_data = state->head.state_data + 1;
400 | 
401 |     auto* float_data = static_cast<float*> (data);
402 | 
403 |     state_data_partition_memory (state->head_config, &state->head, state->head.state_data[0], float_data);
404 |     state_data_partition_memory (state->tail_config, &state->tail, state->tail.state_data[0], float_data);
405 |     assert (static_cast<void*> (float_data) == static_cast<void*> (state->head.state_data));
406 | 
407 |     reset_process_state (state->head_config, &state->head);
408 |     reset_process_state (state->tail_config, &state->tail);
409 | }
410 | 
411 | void reset_nuir_process_state (Process_Non_Uniform_State* state)
412 | {
413 |     reset_process_state (state->head_config, &state->head);
414 |     reset_process_state (state->tail_config, &state->tail);
415 | }
416 | 
417 | void destroy_nuir_process_state (Process_Non_Uniform_State* state)
418 | {
419 |     destroy_process_state (&state->head);
420 |     *state = {};
421 | }
422 | 
423 | //================================================================================================================
424 | static void process_samples_mono (const Config* config,
425 |                                   const IR_Uniform* ir,
426 |                                   Process_Uniform_State* state,
427 |                                   const float* input,
428 |                                   float* output,
429 |                                   int num_samples,
430 |                                   float* fft_scratch)
431 | {
432 |     const auto fft_inv_scale = 1.0f / static_cast<float> (config->fft_size);
433 |     const auto state_num_segments = config->block_size > 128 ? ir->num_segments : 3 * ir->num_segments;
434 |     auto index_step = state_num_segments / ir->num_segments;
435 |     state->current_segment = (state->current_segment >= state_num_segments) ? 0 : state->current_segment;
436 |     auto* state_data = state->state_data;
437 | 
438 |     int num_samples_processed = 0;
439 |     while (num_samples_processed < num_samples)
440 |     {
441 |         const auto input_data_was_empty = state->input_data_pos == 0;
442 |         const auto samples_to_process = min_int (num_samples - num_samples_processed,
443 |                                                  config->block_size - state->input_data_pos);
444 | 
445 |         memcpy (state_data->input_data + state->input_data_pos,
446 |                 input + num_samples_processed,
447 |                 samples_to_process * sizeof (float));
448 | 
449 |         auto* input_segment_data = get_segment (config, state_data->segments, state->current_segment);
450 |         memcpy (input_segment_data, state_data->input_data, config->fft_size * sizeof (float));
451 | 
452 |         fft::fft_transform_unordered (config->fft,
453 |                                       input_segment_data,
454 |                                       input_segment_data,
455 |                                       fft_scratch,
456 |                                       fft::FFT_FORWARD);
457 | 
458 |         // Complex multiplication
459 |         if (input_data_was_empty)
460 |         {
461 |             memset (state_data->output_temp_data, 0, config->fft_size * sizeof (float));
462 | 
463 |             auto index = state->current_segment;
464 |             for (int seg_idx = 1; seg_idx < ir->num_segments; ++seg_idx)
465 |             {
466 |                 index += index_step;
467 |                 if (index >= state_num_segments)
468 |                     index -= state_num_segments;
469 | 
470 |                 const auto* input_segment = get_segment (config, state_data->segments, index);
471 |                 const auto* ir_segment = get_segment (config, ir->segments, seg_idx);
472 |                 fft::fft_convolve_unordered (config->fft,
473 |                                              input_segment,
474 |                                              ir_segment,
475 |                                              state_data->output_temp_data,
476 |                                              fft_inv_scale);
477 |             }
478 |         }
479 | 
480 |         memcpy (state_data->output_data, state_data->output_temp_data, config->fft_size * sizeof (float));
481 | 
482 |         fft::fft_convolve_unordered (config->fft,
483 |                                      input_segment_data,
484 |                                      ir->segments,
485 |                                      state_data->output_data,
486 |                                      fft_inv_scale);
487 |         fft::fft_transform_unordered (config->fft,
488 |                                       state_data->output_data,
489 |                                       state_data->output_data,
490 |                                       fft_scratch,
491 |                                       fft::FFT_BACKWARD);
492 | 
493 |         // Add overlap
494 |         {
495 |             // Using SIMD for this operation is tricky, because
496 |             // we can't guarantee that the pointers will be aligned.
497 | 
498 |             // const auto vec_width_x2 = 2 * fft::fft_simd_width_bytes (config->fft) / static_cast<int> (sizeof (float));
499 |             // const auto n_samples_vec = (samples_to_process / vec_width_x2) * vec_width_x2;
500 |             // fft::fft_accumulate (config->fft,
501 |             //                      state->output_data + state->input_data_pos,
502 |             //                      state->overlap_data + state->input_data_pos,
503 |             //                      output + num_samples_processed,
504 |             //                      n_samples_vec);
505 |             // for (int i = n_samples_vec; i < samples_to_process; ++i) // extra data that can't be SIMD-ed
506 |             //     output[num_samples_processed + i] = state->output_data[state->input_data_pos + i] + state->overlap_data[state->input_data_pos + i];
507 | 
508 |             for (int i = 0; i < samples_to_process; ++i)
509 |                 output[num_samples_processed + i] = state_data->output_data[state->input_data_pos + i] + state_data->overlap_data[state->input_data_pos + i];
510 |         }
511 | 
512 |         // Input buffer full => Next block
513 |         state->input_data_pos += samples_to_process;
514 | 
515 |         if (state->input_data_pos == config->block_size)
516 |         {
517 |             // Input buffer is empty again now
518 |             memset (state_data->input_data, 0, config->fft_size * sizeof (float));
519 | 
520 |             state->input_data_pos = 0;
521 | 
522 |             // Extra step for segSize > blockSize
523 |             const auto extra_block_samples = config->fft_size - 2 * config->block_size;
524 |             if (extra_block_samples > 0)
525 |             {
526 |                 fft::fft_accumulate (config->fft,
527 |                                      state_data->overlap_data + config->block_size,
528 |                                      state_data->output_data + config->block_size,
529 |                                      state_data->output_data + config->block_size,
530 |                                      extra_block_samples);
531 |             }
532 | 
533 |             // Save the overlap
534 |             memcpy (state_data->overlap_data,
535 |                     state_data->output_data + config->block_size,
536 |                     (config->fft_size - config->block_size) * sizeof (float));
537 | 
538 |             state->current_segment = (state->current_segment > 0) ? (state->current_segment - 1) : (state_num_segments - 1);
539 |         }
540 | 
541 |         num_samples_processed += samples_to_process;
542 |     }
543 | }
544 | 
545 | void process_samples (const Config* config,
546 |                       const IR_Uniform* ir,
547 |                       Process_Uniform_State* state,
548 |                       const float* input,
549 |                       float* output,
550 |                       int num_samples,
551 |                       float* fft_scratch)
552 | {
553 |     assert (ir->num_channels == 1);
554 |     process_samples_mono (config, ir, state, input, output, num_samples, fft_scratch);
555 | }
556 | 
557 | static void process_samples_with_latency_mono (const Config* config,
558 |                                                const IR_Uniform* ir,
559 |                                                Process_Uniform_State* state,
560 |                                                const float* input,
561 |                                                float* output,
562 |                                                int num_samples,
563 |                                                float* fft_scratch)
564 | {
565 |     const auto fft_inv_scale = 1.0f / static_cast<float> (config->fft_size);
566 |     const auto state_num_segments = config->block_size > 128 ? ir->num_segments : 3 * ir->num_segments;
567 |     auto index_step = state_num_segments / ir->num_segments;
568 |     state->current_segment = (state->current_segment >= state_num_segments) ? 0 : state->current_segment;
569 |     auto* state_data = state->state_data;
570 | 
571 |     int num_samples_processed = 0;
572 |     while (num_samples_processed < num_samples)
573 |     {
574 |         const auto samples_to_process = min_int (num_samples - num_samples_processed,
575 |                                                  config->block_size - state->input_data_pos);
576 | 
577 |         memcpy (state_data->input_data + state->input_data_pos,
578 |                 input + num_samples_processed,
579 |                 samples_to_process * sizeof (float));
580 | 
581 |         memcpy (output + num_samples_processed,
582 |                 state_data->output_data + state->input_data_pos,
583 |                 samples_to_process * sizeof (float));
584 | 
585 |         num_samples_processed += samples_to_process;
586 |         state->input_data_pos += samples_to_process;
587 | 
588 |         if (state->input_data_pos == config->block_size)
589 |         {
590 |             // Copy input data in input segment
591 |             auto* input_segment_data = get_segment (config, state_data->segments, state->current_segment);
592 |             memcpy (input_segment_data, state_data->input_data, config->fft_size * sizeof (float));
593 | 
594 |             fft::fft_transform_unordered (config->fft,
595 |                                           input_segment_data,
596 |                                           input_segment_data,
597 |                                           fft_scratch,
598 |                                           fft::FFT_FORWARD);
599 | 
600 |             // Complex multiplication
601 |             memset (state_data->output_temp_data, 0, config->fft_size * sizeof (float));
602 | 
603 |             auto index = state->current_segment;
604 |             for (int seg_idx = 1; seg_idx < ir->num_segments; ++seg_idx)
605 |             {
606 |                 index += index_step;
607 |                 if (index >= state_num_segments)
608 |                     index -= state_num_segments;
609 | 
610 |                 const auto* input_segment = get_segment (config, state_data->segments, index);
611 |                 const auto* ir_segment = get_segment (config, ir->segments, seg_idx);
612 |                 fft::fft_convolve_unordered (config->fft,
613 |                                              input_segment,
614 |                                              ir_segment,
615 |                                              state_data->output_temp_data,
616 |                                              fft_inv_scale);
617 |             }
618 | 
619 |             memcpy (state_data->output_data, state_data->output_temp_data, config->fft_size * sizeof (float));
620 | 
621 |             fft::fft_convolve_unordered (config->fft,
622 |                                          input_segment_data,
623 |                                          ir->segments,
624 |                                          state_data->output_data,
625 |                                          fft_inv_scale);
626 |             fft::fft_transform_unordered (config->fft,
627 |                                           state_data->output_data,
628 |                                           state_data->output_data,
629 |                                           fft_scratch,
630 |                                           fft::FFT_BACKWARD);
631 | 
632 |             // Add overlap
633 |             fft::fft_accumulate (config->fft,
634 |                                  state_data->overlap_data,
635 |                                  state_data->output_data,
636 |                                  state_data->output_data,
637 |                                  config->block_size);
638 | 
639 |             // Input buffer is empty again now
640 |             memset (state_data->input_data, 0, config->fft_size * sizeof (float));
641 | 
642 |             // Extra step for segSize > blockSize
643 |             const auto extra_block_samples = config->fft_size - 2 * config->block_size;
644 |             if (extra_block_samples > 0)
645 |             {
646 |                 fft::fft_accumulate (config->fft,
647 |                                      state_data->overlap_data + config->block_size,
648 |                                      state_data->output_data + config->block_size,
649 |                                      state_data->output_data + config->block_size,
650 |                                      extra_block_samples);
651 |             }
652 | 
653 |             // Save the overlap
654 |             memcpy (state_data->overlap_data,
655 |                     state_data->output_data + config->block_size,
656 |                     (config->fft_size - config->block_size) * sizeof (float));
657 | 
658 |             state->current_segment = (state->current_segment > 0) ? (state->current_segment - 1) : (state_num_segments - 1);
659 | 
660 |             state->input_data_pos = 0;
661 |         }
662 |     }
663 | }
664 | 
665 | void process_samples_with_latency (const Config* config,
666 |                                    const IR_Uniform* ir,
667 |                                    Process_Uniform_State* state,
668 |                                    const float* input,
669 |                                    float* output,
670 |                                    int num_samples,
671 |                                    float* fft_scratch)
672 | {
673 |     assert (ir->num_channels == 1);
674 |     process_samples_with_latency_mono (config, ir, state, input, output, num_samples, fft_scratch);
675 | }
676 | 
677 | static void process_multichannel (const Config* config,
678 |                                   const IR_Uniform* ir,
679 |                                   Process_Uniform_State* state,
680 |                                   const float* const* in,
681 |                                   float* const* out,
682 |                                   int N,
683 |                                   int num_channels,
684 |                                   float* fft_scratch,
685 |                                   bool with_latency)
686 | {
687 |     assert (ir->num_channels == 1 || ir->num_channels == state->num_channels);
688 |     assert (state->num_channels == num_channels);
689 | 
690 |     for (int ch = 0; ch < num_channels; ++ch)
691 |     {
692 |         IR_Uniform mono_ir {
693 |             .segments = ir->num_channels == 1
694 |                             ? ir->segments
695 |                             : get_segment (config, ir->segments, ch * ir->max_num_segments),
696 |             .num_segments = ir->num_segments,
697 |             .max_num_segments = ir->max_num_segments,
698 |             .num_channels = 1,
699 |         };
700 | 
701 |         Process_Uniform_State mono_state {
702 |             .state_data = state->state_data + ch,
703 |             .max_num_segments = state->max_num_segments,
704 |             .current_segment = state->current_segment,
705 |             .input_data_pos = state->input_data_pos,
706 |             .num_channels = 1,
707 |         };
708 | 
709 |         if (with_latency)
710 |             process_samples_with_latency_mono (config, &mono_ir, &mono_state, in[ch], out[ch], N, fft_scratch);
711 |         else
712 |             process_samples_mono (config, &mono_ir, &mono_state, in[ch], out[ch], N, fft_scratch);
713 | 
714 |         if (ch == num_channels - 1)
715 |         {
716 |             state->current_segment = mono_state.current_segment;
717 |             state->input_data_pos = mono_state.input_data_pos;
718 |         }
719 |     }
720 | }
721 | 
722 | void process_samples_multichannel (const Config* config,
723 |                                    const IR_Uniform* ir,
724 |                                    Process_Uniform_State* state,
725 |                                    const float* const* in,
726 |                                    float* const* out,
727 |                                    int N,
728 |                                    int num_channels,
729 |                                    float* fft_scratch)
730 | {
731 |     process_multichannel (config, ir, state, in, out, N, num_channels, fft_scratch, false);
732 | }
733 | 
734 | void process_samples_with_latency_multichannel (const Config* config,
735 |                                                 const IR_Uniform* ir,
736 |                                                 Process_Uniform_State* state,
737 |                                                 const float* const* in,
738 |                                                 float* const* out,
739 |                                                 int N,
740 |                                                 int num_channels,
741 |                                                 float* fft_scratch)
742 | {
743 |     process_multichannel (config, ir, state, in, out, N, num_channels, fft_scratch, true);
744 | }
745 | 
746 | void process_samples_non_uniform (const IR_Non_Uniform* ir,
747 |                                   Process_Non_Uniform_State* state,
748 |                                   const float* in,
749 |                                   float* out,
750 |                                   int N,
751 |                                   float* scratch)
752 | {
753 |     auto* tail_out = scratch;
754 |     scratch += pad_floats (N);
755 | 
756 |     process_samples_with_latency (ir->tail_config,
757 |                                   &ir->tail,
758 |                                   &state->tail,
759 |                                   in,
760 |                                   tail_out,
761 |                                   N,
762 |                                   scratch);
763 | 
764 |     process_samples (ir->head_config,
765 |                      &ir->head,
766 |                      &state->head,
767 |                      in,
768 |                      out,
769 |                      N,
770 |                      scratch);
771 | 
772 |     for (int n = 0; n < N; ++n)
773 |         out[n] += tail_out[n];
774 | }
775 | } // namespace chowdsp::convolution
776 | 


--------------------------------------------------------------------------------