├── .github └── workflows │ ├── linux.yml │ ├── linux_arm64.yml │ └── windows.yml ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── common └── config.h.in ├── cpu_source ├── CMakeLists.txt ├── cpu_dispatch.h.in ├── dfttest2_cpu.h ├── getframe_impl.cpp ├── kernel.hpp └── source.cpp ├── cuda_source ├── kernel.hpp ├── source.cpp └── win32.cpp ├── dfttest2.py ├── gcc_source ├── CMakeLists.txt ├── dfttest2_cpu.h ├── getframe_impl.cpp ├── kernel.hpp └── source.cpp ├── hip_source ├── kernel.hpp └── source.cpp ├── hiprtc_source ├── dft_kernels.hpp ├── kernel.hpp └── source.cpp └── nvrtc_source ├── dft_kernels.hpp ├── kernel.hpp └── source.cpp /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'cpu_source/*' 8 | - 'cuda_source/*' 9 | - 'nvrtc_source/*' 10 | - '.github/workflows/linux.yml' 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build-linux: 15 | runs-on: ubuntu-22.04 16 | steps: 17 | - name: Checkout repo 18 | uses: actions/checkout@v3 19 | with: 20 | submodules: true 21 | 22 | - name: Setup Ninja 23 | run: pip3 install ninja 24 | 25 | - name: Setup CUDA 26 | run: | 27 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 28 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 29 | sudo apt-get update 30 | sudo apt-get install -y cuda-nvcc-11-8 cuda-cudart-dev-11-8 cuda-nvrtc-dev-11-8 libcufft-dev-11-8 31 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 32 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 33 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 34 | 35 | - name: Download VapourSynth headers 36 | run: | 37 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 38 | unzip -q vs.zip 39 | mv vapoursynth*/ vapoursynth 40 | 41 | - name: Configure 42 | run: cmake -S . -B build -G Ninja 43 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 44 | -D ENABLE_CUDA=ON 45 | -D USE_NVRTC_STATIC=ON 46 | -D ENABLE_CPU=ON 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_CXX_COMPILER=g++-12 49 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 50 | 51 | - name: Build 52 | run: cmake --build build --config Release --verbose 53 | 54 | - name: Install 55 | run: cmake --install build --prefix artifact 56 | 57 | - name: Setup HIP 58 | run: | 59 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 60 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.0.2 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 61 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 62 | sudo apt update 63 | sudo apt install hip-runtime-amd rocm-device-libs hipfft-dev 64 | 65 | - name: Configure (HIP) 66 | run: cmake -S . 
-B build_hip -G Ninja 67 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 68 | -D ENABLE_CUDA=OFF 69 | -D ENABLE_CPU=OFF 70 | -D ENABLE_HIP=ON 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 73 | -D CMAKE_PREFIX_PATH=/opt/rocm 74 | 75 | - name: Build (HIP) 76 | run: cmake --build build_hip --config Release --verbose 77 | 78 | - name: Install (HIP) 79 | run: cmake --install build_hip --prefix artifact 80 | 81 | - name: Upload 82 | uses: actions/upload-artifact@v3 83 | with: 84 | name: vs-dfttest2-Linux 85 | path: artifact 86 | -------------------------------------------------------------------------------- /.github/workflows/linux_arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'gcc_source/*' 8 | - '.github/workflows/linux_arm64.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04-arm 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v4 17 | with: 18 | submodules: true 19 | 20 | - name: Setup clang 21 | run: | 22 | wget https://apt.llvm.org/llvm.sh 23 | chmod +x llvm.sh 24 | sudo ./llvm.sh all 25 | 26 | - name: Setup Ninja 27 | run: pip3 install ninja 28 | 29 | - name: Download VapourSynth headers 30 | run: | 31 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 32 | unzip -q vs.zip 33 | mv vapoursynth*/ vapoursynth 34 | 35 | - name: Configure 36 | run: cmake -S . -B build -G Ninja 37 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 38 | -D ENABLE_CUDA=OFF 39 | -D ENABLE_CPU=OFF 40 | -D ENABLE_GCC=ON 41 | -D CMAKE_BUILD_TYPE=Release 42 | -D CMAKE_CXX_COMPILER=clang++ 43 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 44 | 45 | - name: Build 46 | run: cmake --build build --config Release --verbose 47 | 48 | - name: Install 49 | run: cmake --install build --prefix artifact 50 | 51 | - name: Upload 52 | uses: actions/upload-artifact@v4 53 | with: 54 | name: vs-dfttest2-Linux 55 | path: artifact 56 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'cpu_source/*' 8 | - 'cuda_source/*' 9 | - 'nvrtc_source/*' 10 | - '.github/workflows/windows.yml' 11 | workflow_dispatch: 12 | inputs: 13 | tag: 14 | description: 'which tag to upload to' 15 | default: '' 16 | 17 | jobs: 18 | build-windows: 19 | runs-on: windows-2022 20 | 21 | defaults: 22 | run: 23 | shell: cmd 24 | 25 | steps: 26 | - name: Checkout repo 27 | uses: actions/checkout@v4 28 | with: 29 | submodules: true 30 | 31 | - name: Setup MSVC 32 | uses: ilammy/msvc-dev-cmd@v1 33 | 34 | - name: Setup Ninja 35 | run: pip install ninja 36 | 37 | - name: Cache CUDA 38 | id: cache-cuda 39 | uses: actions/cache@v4 40 | with: 41 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 42 | key: ${{ runner.os }}-cuda-12.4.1 43 | 44 | - name: Setup CUDA 45 | if: steps.cache-cuda.outputs.cache-hit != 'true' 46 | run: | 47 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe 48 | cuda_installer.exe -s nvcc_12.4 cudart_12.4 nvrtc_dev_12.4 cufft_12.4 cufft_dev_12.4 49 | 50 | - name: Download VapourSynth headers 51 | run: | 52 | curl -s -o vs.zip -L 
https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 53 | unzip -q vs.zip 54 | mv vapoursynth-*/ vapoursynth/ 55 | 56 | - name: Configure (CUDA) 57 | run: cmake -S . -B build_cuda -G Ninja -LA 58 | -D CMAKE_BUILD_TYPE=Release 59 | -D ENABLE_CUDA=ON 60 | -D USE_NVRTC_STATIC=ON 61 | -D ENABLE_CPU=OFF 62 | -D VS_INCLUDE_DIR="%cd%\vapoursynth\include" 63 | -D CMAKE_CXX_FLAGS="/fp:fast /EHsc" 64 | -D CMAKE_SHARED_LINKER_FLAGS="/DELAYLOAD:cufft64_11.dll" 65 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 66 | env: 67 | CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 68 | 69 | - name: Build (CUDA) 70 | run: cmake --build build_cuda --verbose 71 | 72 | - name: Install (CUDA) 73 | run: cmake --install build_cuda --prefix install_cuda 74 | 75 | - name: Prepare for upload (CUDA) 76 | run: | 77 | mkdir artifact_cuda 78 | copy install_cuda\lib\*.dll artifact_cuda 79 | mkdir cufft 80 | copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cufft64_*.dll" cufft 81 | 82 | - name: Upload (CUDA) 83 | uses: actions/upload-artifact@v4 84 | with: 85 | name: vs-dfttest2-cuda-windows 86 | path: artifact_cuda 87 | 88 | - name: Upload cufft 89 | uses: actions/upload-artifact@v4 90 | if: false 91 | with: 92 | name: cufft-windows 93 | path: cufft 94 | 95 | - name: Configure (CPU) 96 | shell: bash 97 | run: cmake -S . -B build_cpu -G Ninja -LA 98 | -D CMAKE_BUILD_TYPE=Release 99 | -D ENABLE_CUDA=OFF 100 | -D ENABLE_CPU=ON 101 | -D VS_INCLUDE_DIR="$(pwd)/vapoursynth/include" 102 | -D CMAKE_CXX_COMPILER=clang++ 103 | -D CMAKE_CXX_FLAGS="-ffast-math" 104 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 105 | 106 | - name: Build (CPU) 107 | run: cmake --build build_cpu --verbose 108 | 109 | - name: Install (CPU) 110 | run: cmake --install build_cpu --prefix install_cpu 111 | 112 | - name: Prepare for upload (CPU) 113 | run: | 114 | mkdir artifact_cpu 115 | copy install_cpu\lib\*.dll artifact_cpu 116 | 117 | - name: Upload (CPU) 118 | uses: actions/upload-artifact@v4 119 | with: 120 | name: vs-dfttest2-cpu-windows 121 | path: artifact_cpu 122 | 123 | - name: Compress artifact for release 124 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 125 | run: | 126 | cd artifact_cuda 127 | 128 | mkdir vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} 129 | xcopy dfttest2_cuda.dll vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} /f 130 | xcopy dfttest2_nvrtc.dll vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} /f 131 | 7z a -t7z -mx=9 ../vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }}.7z vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} 132 | 133 | cd ../cufft 134 | 135 | mkdir vsmlrt-cuda 136 | xcopy cufft64_*.dll vsmlrt-cuda /f 137 | 7z a -t7z -mx=9 ../cufft-windows-${{ github.event.inputs.tag }}.7z vsmlrt-cuda 138 | 139 | cd ../artifact_cpu 140 | 141 | mkdir vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} 142 | xcopy dfttest2_cpu.dll vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} /f 143 | 7z a -t7z -mx=9 ../vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }}.7z vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} 144 | 145 | - name: Release 146 | uses: softprops/action-gh-release@v2 147 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 148 | with: 149 | tag_name: ${{ github.event.inputs.tag }} 150 | files: | 151 | vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }}.7z 152 | vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }}.7z 153 | cufft-windows-${{ 
github.event.inputs.tag }}.7z 154 | dfttest2.py 155 | fail_on_unmatched_files: true 156 | generate_release_notes: false 157 | prerelease: true 158 | 159 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "cpu_source/vectorclass"] 2 | path = cpu_source/vectorclass 3 | url = https://github.com/vectorclass/version2 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22.0) 2 | 3 | project(vs-dfttest2 VERSION 0.1 LANGUAGES CXX) 4 | 5 | set(ENABLE_CUDA ON CACHE BOOL "Whether to compile with CUDA backends") 6 | set(ENABLE_CPU ON CACHE BOOL "Whether to compile with x86 backend") 7 | set(ENABLE_GCC OFF CACHE BOOL "Whether to compile with gcc vector extension backend") 8 | set(ENABLE_HIP OFF CACHE BOOL "Whether to compile with HIP backends") 9 | 10 | if(NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) 12 | endif() 13 | 14 | if(ENABLE_CUDA) 15 | set(USE_NVRTC_STATIC ON CACHE BOOL "Whether to use NVRTC static library") 16 | 17 | find_package(CUDAToolkit REQUIRED) 18 | 19 | add_library(dfttest2_cuda MODULE 20 | cuda_source/source.cpp 21 | cuda_source/win32.cpp 22 | ) 23 | add_library(dfttest2_nvrtc MODULE 24 | nvrtc_source/source.cpp 25 | ) 26 | 27 | set_target_properties(dfttest2_cuda PROPERTIES 28 | CXX_EXTENSIONS OFF 29 | CXX_STANDARD 20 30 | CXX_STANDARD_REQUIRED ON 31 | ) 32 | set_target_properties(dfttest2_nvrtc PROPERTIES 33 | CXX_EXTENSIONS OFF 34 | CXX_STANDARD 20 35 | CXX_STANDARD_REQUIRED ON 36 | ) 37 | 38 | target_link_libraries(dfttest2_cuda PRIVATE CUDA::cuda_driver CUDA::cufft) 39 | target_link_libraries(dfttest2_nvrtc PRIVATE CUDA::cuda_driver) 40 | 41 | if( 42 | USE_NVRTC_STATIC AND ( 43 | CUDAToolkit_VERSION_MAJOR GREATER_EQUAL "12" OR ( 44 | CUDAToolkit_VERSION_MAJOR EQUAL "11" AND 45 | CUDAToolkit_VERSION_MINOR GREATER_EQUAL "5" 46 | ) 47 | ) 48 | ) 49 | target_link_directories(dfttest2_cuda PRIVATE "${CUDAToolkit_LIBRARY_DIR}") 50 | target_link_libraries(dfttest2_cuda PRIVATE nvrtc_static nvrtc-builtins_static nvptxcompiler_static) 51 | target_link_directories(dfttest2_nvrtc PRIVATE "${CUDAToolkit_LIBRARY_DIR}") 52 | target_link_libraries(dfttest2_nvrtc PRIVATE nvrtc_static nvrtc-builtins_static nvptxcompiler_static) 53 | 54 | if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 55 | set_property(TARGET dfttest2_cuda PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded") 56 | set_property(TARGET dfttest2_nvrtc PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded") 57 | endif() 58 | 59 | if(CMAKE_SYSTEM_NAME STREQUAL "Windows") 60 | target_link_libraries(dfttest2_cuda PRIVATE Ws2_32) 61 | target_link_libraries(dfttest2_nvrtc PRIVATE Ws2_32) 62 | endif() 63 | else() 64 | if(USE_NVRTC_STATIC) 65 | message(WARNING "NVRTC static library is not used") 66 | endif() 67 | target_link_libraries(dfttest2_cuda PRIVATE CUDA::nvrtc) 68 | target_link_libraries(dfttest2_nvrtc PRIVATE CUDA::nvrtc) 69 | endif() 70 | endif() # ENABLE_CUDA 71 | 72 | if(ENABLE_HIP) 73 | find_package(hip REQUIRED config) 74 | find_package(hipfft REQUIRED config) 75 | find_package(hiprtc REQUIRED config) 76 | 77 | add_library(dfttest2_hip MODULE 78 | hip_source/source.cpp 79 | ) 80 | add_library(dfttest2_hiprtc MODULE 81 | hiprtc_source/source.cpp 82 | ) 83 | 84 | 
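# Note on the two HIP targets: dfttest2_hip links against hipFFT for its frequency transforms,
# while dfttest2_hiprtc only needs hipRTC and compiles its DFT kernels at run time
# (see hiprtc_source/dft_kernels.hpp). Both are configured below like the CUDA plugins (C++20, no extensions).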
set_target_properties(dfttest2_hip PROPERTIES 85 | CXX_EXTENSIONS OFF 86 | CXX_STANDARD 20 87 | CXX_STANDARD_REQUIRED ON 88 | ) 89 | set_target_properties(dfttest2_hiprtc PROPERTIES 90 | CXX_EXTENSIONS OFF 91 | CXX_STANDARD 20 92 | CXX_STANDARD_REQUIRED ON 93 | ) 94 | 95 | target_link_libraries(dfttest2_hip PRIVATE hip::host hip::hipfft hiprtc::hiprtc) 96 | target_link_libraries(dfttest2_hiprtc PRIVATE hip::host hiprtc::hiprtc) 97 | endif() # ENABLE_HIP 98 | 99 | find_package(PkgConfig QUIET MODULE) 100 | 101 | if(PKG_CONFIG_FOUND) 102 | pkg_search_module(VS vapoursynth) 103 | 104 | if(VS_FOUND) 105 | message(STATUS "Found VapourSynth r${VS_VERSION}") 106 | 107 | cmake_path(APPEND install_dir ${VS_LIBDIR} vapoursynth) 108 | 109 | if(ENABLE_CUDA) 110 | target_include_directories(dfttest2_cuda PRIVATE ${VS_INCLUDE_DIRS}) 111 | target_include_directories(dfttest2_nvrtc PRIVATE ${VS_INCLUDE_DIRS}) 112 | 113 | install(TARGETS dfttest2_cuda LIBRARY DESTINATION ${install_dir}) 114 | install(TARGETS dfttest2_nvrtc LIBRARY DESTINATION ${install_dir}) 115 | endif() # ENABLE_CUDA 116 | 117 | if(ENABLE_HIP) 118 | target_include_directories(dfttest2_hip PRIVATE ${VS_INCLUDE_DIRS}) 119 | target_include_directories(dfttest2_hiprtc PRIVATE ${VS_INCLUDE_DIRS}) 120 | 121 | install(TARGETS dfttest2_hip LIBRARY DESTINATION ${install_dir}) 122 | install(TARGETS dfttest2_hiprtc LIBRARY DESTINATION ${install_dir}) 123 | endif() # ENABLE_HIP 124 | endif() 125 | endif() 126 | 127 | if(NOT VS_FOUND) 128 | set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers") 129 | 130 | if(VS_INCLUDE_DIR STREQUAL "") 131 | message(WARNING "VapourSynth not found") 132 | endif() 133 | 134 | if(ENABLE_CUDA) 135 | target_include_directories(dfttest2_cuda PRIVATE ${VS_INCLUDE_DIR}) 136 | target_include_directories(dfttest2_nvrtc PRIVATE ${VS_INCLUDE_DIR}) 137 | 138 | install(TARGETS dfttest2_cuda LIBRARY DESTINATION lib) 139 | install(TARGETS dfttest2_nvrtc LIBRARY DESTINATION lib) 140 | endif() # ENABLE_CUDA 141 | 142 | if(ENABLE_HIP) 143 | target_include_directories(dfttest2_hip PRIVATE ${VS_INCLUDE_DIR}) 144 | target_include_directories(dfttest2_hiprtc PRIVATE ${VS_INCLUDE_DIR}) 145 | 146 | install(TARGETS dfttest2_hip LIBRARY DESTINATION lib) 147 | install(TARGETS dfttest2_hiprtc LIBRARY DESTINATION lib) 148 | endif() # ENABLE_HIP 149 | endif() 150 | 151 | find_package(Git QUIET) 152 | 153 | if(GIT_FOUND) 154 | execute_process( 155 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 156 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 157 | OUTPUT_VARIABLE VCS_TAG 158 | ) 159 | if(VCS_TAG) 160 | string(STRIP ${VCS_TAG} VCS_TAG) 161 | endif() 162 | endif() 163 | 164 | if(VCS_TAG) 165 | message(STATUS "vs-dfttest2 ${VCS_TAG}") 166 | else() 167 | message(WARNING "unknown plugin version") 168 | set(VCS_TAG "unknown") 169 | endif() 170 | 171 | configure_file(common/config.h.in config.h) 172 | 173 | if(ENABLE_CUDA) 174 | target_include_directories(dfttest2_cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 175 | target_include_directories(dfttest2_nvrtc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 176 | 177 | if (WIN32) 178 | target_link_options(dfttest2_cuda PRIVATE "delayimp.lib" ${CMAKE_SHARED_LINKER_FLAGS}) 179 | endif() 180 | endif() # ENABLE_CUDA 181 | 182 | if(ENABLE_CPU) 183 | add_subdirectory(cpu_source) 184 | endif() # ENABLE_CPU 185 | 186 | if(ENABLE_GCC) 187 | add_subdirectory(gcc_source) 188 | endif() # ENABLE_GCC 189 | 190 | if(ENABLE_HIP) 191 | target_include_directories(dfttest2_hip PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 192 | 
target_include_directories(dfttest2_hiprtc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 193 | endif() # ENABLE_HIP 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-dfttest2 2 | DFTTest re-implementation (CUDA and x86) 3 | 4 | ## Usage 5 | ```python3 6 | from dfttest2 import DFTTest 7 | output = DFTTest(input) 8 | ``` 9 | 10 | See also [VapourSynth-DFTTest](https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest) 11 | 12 | ## Compilation 13 | ```bash 14 | # additional options: -D ENABLE_CUDA=ON -D ENABLE_CPU=ON 15 | cmake -S . -B build 16 | 17 | cmake --build build 18 | 19 | cmake --install build 20 | ``` 21 | 22 | If the VapourSynth library cannot be found by pkg-config, then the CMake variable `VS_INCLUDE_DIR` should be set. 23 | -------------------------------------------------------------------------------- /common/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /cpu_source/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(VCL_HOME "${CMAKE_CURRENT_SOURCE_DIR}/vectorclass" CACHE PATH "Path to vector class v2 headers") 2 | 3 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 4 | set(CPU_DISPATCH_TARGETS "sse2;avx2;avx512f" CACHE STRING "Dispatch targets") 5 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") 6 | set(CPU_DISPATCH_TARGETS "AVX;AVX2;AVX512" CACHE STRING "Dispatch targets") 7 | endif() 8 | 9 | message(STATUS "cpu targets: ${CPU_DISPATCH_TARGETS}") 10 | 11 | add_library(dfttest2_cpu MODULE source.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vectorclass/instrset_detect.cpp) 12 | 13 | set_target_properties(dfttest2_cpu PROPERTIES 14 | CXX_EXTENSIONS OFF 15 | CXX_STANDARD 20 16 | CXX_STANDARD_REQUIRED ON 17 | ) 18 | 19 | target_include_directories(dfttest2_cpu PRIVATE ${VCL_HOME}) 20 | 21 | if(PKG_CONFIG_FOUND AND VS_FOUND) 22 | target_include_directories(dfttest2_cpu PRIVATE ${VS_INCLUDE_DIRS}) 23 | install(TARGETS dfttest2_cpu LIBRARY DESTINATION ${install_dir}) 24 | else() 25 | target_include_directories(dfttest2_cpu PRIVATE ${VS_INCLUDE_DIR}) 26 | install(TARGETS dfttest2_cpu LIBRARY DESTINATION lib) 27 | endif() 28 | 29 | target_include_directories(dfttest2_cpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
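# The block below builds one getframe_impl_<arch> object library per entry in CPU_DISPATCH_TARGETS,
# each compiled with the matching -m... / /arch: flags and with uniquely named
# DFTTestGetFrame_<arch> / supported_arch_<arch> entry points. The generated cpu_dispatch.h and the
# GETFRAME_PTRS / SUPPORTED_ARCH_PTRS / SUPPORTED_ARCH_STRS definitions let source.cpp select the
# best implementation at run time (or honour the "opt" parameter). CPU_DISPATCH_TARGETS is a cache
# variable, so the target list can be overridden at configure time, for example:
#   cmake -S . -B build -D ENABLE_CPU=ON -D CPU_DISPATCH_TARGETS="sse2;avx2"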
30 | 31 | 32 | if(CPU_DISPATCH_TARGETS) 33 | target_compile_definitions(dfttest2_cpu PRIVATE HAS_DISPATCH) 34 | 35 | set(GETFRAME_DECLARATIONS "") 36 | set(GETFRAME_PTRS "") 37 | set(SUPPORTED_ARCH_DECLARATIONS "") 38 | set(SUPPORTED_ARCH_PTRS "") 39 | set(SUPPORTED_ARCH_STRS "") 40 | 41 | foreach(arch_option ${CPU_DISPATCH_TARGETS}) 42 | set(raw_arch_option ${arch_option}) 43 | string(REPLACE "=" "_" arch ${arch_option}) 44 | string(REPLACE "-" "_" arch ${arch}) 45 | 46 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 47 | if(${arch_option} STREQUAL avx2) 48 | set(arch_option ${arch_option} -mfma) 49 | elseif(${arch_option} STREQUAL avx512f) 50 | if(WIN32) 51 | # according to vcl2, 52 | # MS compiler cannot generate code for AVX512F without AVX512DQ 53 | set(arch_option ${arch_option} -mfma -mavx512vl -mavx512bw -mavx512dq) 54 | else() 55 | set(arch_option ${arch_option} -mfma) 56 | endif() 57 | endif() 58 | endif() 59 | 60 | set(current_target getframe_impl_${arch}) 61 | set(current_getframe_name DFTTestGetFrame_${arch}) 62 | set(current_supported_arch_name supported_arch_${arch}) 63 | 64 | add_library(${current_target} OBJECT getframe_impl.cpp) 65 | 66 | set_target_properties(${current_target} PROPERTIES 67 | CXX_EXTENSIONS OFF 68 | CXX_STANDARD 20 69 | CXX_STANDARD_REQUIRED ON 70 | ) 71 | 72 | target_compile_definitions(${current_target} PRIVATE HAS_DISPATCH) 73 | target_compile_definitions(${current_target} PRIVATE DFTTEST_GETFRAME_NAME=${current_getframe_name}) 74 | target_compile_definitions(${current_target} PRIVATE SUPPORTED_ARCH_NAME=${current_supported_arch_name}) 75 | 76 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 77 | target_compile_options(${current_target} PRIVATE -m${arch_option}) 78 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") 79 | target_compile_options(${current_target} PRIVATE /arch:${arch_option}) 80 | endif() 81 | 82 | target_include_directories(${current_target} PRIVATE ${VCL_HOME}) 83 | 84 | if(PKG_CONFIG_FOUND AND VS_FOUND) 85 | target_include_directories(${current_target} PRIVATE ${VS_INCLUDE_DIRS}) 86 | else() 87 | target_include_directories(${current_target} PRIVATE ${VS_INCLUDE_DIR}) 88 | endif() 89 | 90 | target_include_directories(${current_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 91 | 92 | string(APPEND GETFRAME_DECLARATIONS " 93 | extern const VSFrameRef *VS_CC ${current_getframe_name}( 94 | int n, int activationReason, void **instanceData, void **frameData, 95 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 96 | ) noexcept;\n") 97 | 98 | string(APPEND GETFRAME_PTRS ${current_getframe_name},) 99 | 100 | string(APPEND SUPPORTED_ARCH_DECLARATIONS "extern bool ${current_supported_arch_name}() noexcept;\n") 101 | 102 | string(APPEND SUPPORTED_ARCH_PTRS ${current_supported_arch_name},) 103 | 104 | string(APPEND SUPPORTED_ARCH_STRS \"${raw_arch_option}\",) 105 | 106 | target_link_libraries(dfttest2_cpu PRIVATE ${current_target}) 107 | endforeach() 108 | 109 | configure_file(cpu_dispatch.h.in cpu_dispatch.h @ONLY) 110 | 111 | target_compile_definitions(dfttest2_cpu PRIVATE GETFRAME_PTRS=${GETFRAME_PTRS}) 112 | target_compile_definitions(dfttest2_cpu PRIVATE SUPPORTED_ARCH_PTRS=${SUPPORTED_ARCH_PTRS}) 113 | target_compile_definitions(dfttest2_cpu PRIVATE SUPPORTED_ARCH_STRS=${SUPPORTED_ARCH_STRS}) 114 | target_include_directories(dfttest2_cpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 115 | else() 
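# No dispatch targets configured: compile a single getframe_impl object with the toolchain's
# default instruction set; source.cpp then uses the plain DFTTestGetFrame / supported_arch
# symbols declared in dfttest2_cpu.h.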
116 | add_library(getframe_impl OBJECT getframe_impl.cpp) 117 | 118 | set_target_properties(getframe_impl PROPERTIES 119 | CXX_EXTENSIONS OFF 120 | CXX_STANDARD 20 121 | CXX_STANDARD_REQUIRED ON 122 | ) 123 | 124 | target_include_directories(getframe_impl PRIVATE ${VCL_HOME}) 125 | 126 | if(PKG_CONFIG_FOUND AND VS_FOUND) 127 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIRS}) 128 | else() 129 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIR}) 130 | endif() 131 | 132 | target_link_libraries(dfttest2_cpu PRIVATE getframe_impl) 133 | endif() 134 | -------------------------------------------------------------------------------- /cpu_source/cpu_dispatch.h.in: -------------------------------------------------------------------------------- 1 | #ifdef HAS_DISPATCH 2 | 3 | #ifndef CPU_DISPATCH_H 4 | #define CPU_DISPATCH_H 5 | 6 | #include 7 | 8 | @GETFRAME_DECLARATIONS@ 9 | 10 | @SUPPORTED_ARCH_DECLARATIONS@ 11 | 12 | #endif // CPU_DISPATCH_H 13 | 14 | #endif // HAS_DISPATCH 15 | -------------------------------------------------------------------------------- /cpu_source/dfttest2_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef DFTTEST2_CPU_H 2 | #define DFTTEST2_CPU_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | 18 | static inline void vs_aligned_free_float(float * ptr) { 19 | vs_aligned_free(static_cast(ptr)); 20 | } 21 | 22 | 23 | struct DFTTestThreadData { 24 | uint8_t * padded; // shape: (pad_height, pad_width) 25 | float * padded2; // shape: (pad_height, pad_width) 26 | }; 27 | 28 | 29 | struct DFTTestData { 30 | VSNodeRef * node; 31 | int radius; 32 | int block_size; 33 | int block_step; 34 | std::array process; 35 | bool zero_mean; 36 | std::unique_ptr window { nullptr, &vs_aligned_free_float }; 37 | std::unique_ptr window_freq { nullptr, &vs_aligned_free_float }; 38 | std::unique_ptr sigma { nullptr, &vs_aligned_free_float }; 39 | int filter_type; 40 | float sigma2; 41 | float pmin; 42 | float pmax; 43 | 44 | std::atomic num_uninitialized_threads; 45 | std::unordered_map thread_data; 46 | std::shared_mutex thread_data_lock; 47 | }; 48 | 49 | #if defined HAS_DISPATCH 50 | #include 51 | #else // HAS_DISPATCH 52 | extern const VSFrameRef *VS_CC DFTTestGetFrame( 53 | int n, int activationReason, void **instanceData, void **frameData, 54 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 55 | ) noexcept; 56 | 57 | extern bool supported_arch() noexcept; 58 | 59 | extern const char * target_arch() noexcept; 60 | #endif // HAS_DISPATCH 61 | 62 | #endif // DFTTEST2_CPU_H 63 | -------------------------------------------------------------------------------- /cpu_source/getframe_impl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "dfttest2_cpu.h" 9 | #include "kernel.hpp" 10 | 11 | 12 | static inline int calc_pad_size(int size, int block_size, int block_step) { 13 | return ( 14 | size 15 | + ((size % block_size) ? 
block_size - size % block_size : 0) 16 | + std::max(block_size - block_step, block_step) * 2 17 | ); 18 | } 19 | 20 | 21 | static inline int calc_pad_num(int size, int block_size, int block_step) { 22 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 23 | } 24 | 25 | 26 | template 27 | static inline void reflection_padding_impl( 28 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 29 | const T * VS_RESTRICT src, // shape: (height, stride) 30 | int width, int height, int stride, 31 | int block_size, int block_step 32 | ) { 33 | 34 | int pad_width = calc_pad_size(width, block_size, block_step); 35 | int pad_height = calc_pad_size(height, block_size, block_step); 36 | 37 | int offset_y = (pad_height - height) / 2; 38 | int offset_x = (pad_width - width) / 2; 39 | 40 | vs_bitblt( 41 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 42 | src, stride * sizeof(T), 43 | width * sizeof(T), height 44 | ); 45 | 46 | // copy left and right regions 47 | for (int y = offset_y; y < offset_y + height; y++) { 48 | auto dst_line = &dst[y * pad_width]; 49 | 50 | for (int x = 0; x < offset_x; x++) { 51 | dst_line[x] = dst_line[offset_x * 2 - x]; 52 | } 53 | 54 | for (int x = offset_x + width; x < pad_width; x++) { 55 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 56 | } 57 | } 58 | 59 | // copy top region 60 | for (int y = 0; y < offset_y; y++) { 61 | std::memcpy( 62 | &dst[y * pad_width], 63 | &dst[(offset_y * 2 - y) * pad_width], 64 | pad_width * sizeof(T) 65 | ); 66 | } 67 | 68 | // copy bottom region 69 | for (int y = offset_y + height; y < pad_height; y++) { 70 | std::memcpy( 71 | &dst[y * pad_width], 72 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 73 | pad_width * sizeof(T) 74 | ); 75 | } 76 | } 77 | 78 | 79 | static inline void reflection_padding( 80 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 81 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 82 | int width, int height, int stride, 83 | int block_size, int block_step, 84 | int bytes_per_sample 85 | ) { 86 | 87 | if (bytes_per_sample == 1) { 88 | reflection_padding_impl( 89 | static_cast(dst), 90 | static_cast(src), 91 | width, height, stride, 92 | block_size, block_step 93 | ); 94 | } else if (bytes_per_sample == 2) { 95 | reflection_padding_impl( 96 | reinterpret_cast(dst), 97 | reinterpret_cast(src), 98 | width, height, stride, 99 | block_size, block_step 100 | ); 101 | } else if (bytes_per_sample == 4) { 102 | reflection_padding_impl( 103 | reinterpret_cast(dst), 104 | reinterpret_cast(src), 105 | width, height, stride, 106 | block_size, block_step 107 | ); 108 | } 109 | } 110 | 111 | 112 | static inline void load_block( 113 | Vec16f * VS_RESTRICT block, 114 | const uint8_t * VS_RESTRICT shifted_src, 115 | int radius, 116 | int block_size, 117 | int block_step, 118 | int width, 119 | int height, 120 | const Vec16f * VS_RESTRICT window, 121 | int bits_per_sample 122 | ) { 123 | 124 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 125 | if (bits_per_sample == 32) { 126 | scale = 255.0f; 127 | } 128 | 129 | int bytes_per_sample = (bits_per_sample + 7) / 8; 130 | 131 | assert(block_size == 16); 132 | block_size = 16; // unsafe 133 | 134 | int offset_x = calc_pad_size(width, block_size, block_step); 135 | int offset_y = calc_pad_size(height, block_size, block_step); 136 | 137 | if (bytes_per_sample == 1) { 138 | for (int i = 0; i < 2 * radius + 1; i++) { 139 | for (int j = 0; j < block_size; j++) { 140 | auto vec_input = 
Vec16uc().load((const uint8_t *) shifted_src + (i * offset_y + j) * offset_x); 141 | auto vec_input_f = to_float(Vec16i(extend(extend(vec_input)))); 142 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 143 | } 144 | } 145 | } 146 | if (bytes_per_sample == 2) { 147 | for (int i = 0; i < 2 * radius + 1; i++) { 148 | for (int j = 0; j < block_size; j++) { 149 | auto vec_input = Vec16us().load((const uint16_t *) shifted_src + (i * offset_y + j) * offset_x); 150 | auto vec_input_f = to_float(Vec16i(extend(vec_input))); 151 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 152 | } 153 | } 154 | } 155 | if (bytes_per_sample == 4) { 156 | for (int i = 0; i < 2 * radius + 1; i++) { 157 | for (int j = 0; j < block_size; j++) { 158 | auto vec_input_f = Vec16f().load((const float *) shifted_src + (i * offset_y + j) * offset_x); 159 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 160 | } 161 | } 162 | } 163 | } 164 | 165 | 166 | static inline void store_block( 167 | float * VS_RESTRICT shifted_dst, 168 | const Vec16f * VS_RESTRICT shifted_block, 169 | int block_size, 170 | int block_step, 171 | int width, 172 | const Vec16f * VS_RESTRICT shifted_window 173 | ) { 174 | 175 | assert(block_size == 16); 176 | block_size = 16; // unsafe 177 | 178 | for (int i = 0; i < block_size; i++) { 179 | Vec16f acc = Vec16f().load((const float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 180 | acc = mul_add(shifted_block[i], shifted_window[i], acc); 181 | acc.store((float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 182 | } 183 | } 184 | 185 | 186 | static inline void store_frame( 187 | uint8_t * VS_RESTRICT dst, 188 | const float * VS_RESTRICT shifted_src, 189 | int width, 190 | int height, 191 | int dst_stride, 192 | int src_stride, 193 | int bits_per_sample 194 | ) { 195 | 196 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 197 | if (bits_per_sample == 32) { 198 | scale = 255.0f; 199 | } 200 | 201 | int bytes_per_sample = (bits_per_sample + 7) / 8; 202 | int peak = (1 << bits_per_sample) - 1; 203 | 204 | if (bytes_per_sample == 1) { 205 | auto dstp = (uint8_t *) dst; 206 | for (int y = 0; y < height; y++) { 207 | for (int x = 0; x < width; x++) { 208 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 209 | dstp[y * dst_stride + x] = static_cast(clamped); 210 | } 211 | } 212 | } 213 | if (bytes_per_sample == 2) { 214 | auto dstp = (uint16_t *) dst; 215 | for (int y = 0; y < height; y++) { 216 | for (int x = 0; x < width; x++) { 217 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 218 | dstp[y * dst_stride + x] = static_cast(clamped); 219 | } 220 | } 221 | } 222 | if (bytes_per_sample == 4) { 223 | auto dstp = (float *) dst; 224 | for (int y = 0; y < height; y++) { 225 | for (int x = 0; x < width; x++) { 226 | dstp[y * dst_stride + x] = shifted_src[y * src_stride + x] / scale; 227 | } 228 | } 229 | } 230 | } 231 | 232 | 233 | const VSFrameRef * VS_CC 234 | #ifndef HAS_DISPATCH 235 | DFTTestGetFrame 236 | #else // HAS_DISPATCH 237 | DFTTEST_GETFRAME_NAME 238 | #endif // HAS_DISPATCH 239 | ( 240 | int n, int activationReason, void **instanceData, void **frameData, 241 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 242 | ) noexcept { 243 | 244 | auto d = static_cast(*instanceData); 245 | 246 | if (activationReason == arInitial) { 247 | int start = 
std::max(n - d->radius, 0); 248 | auto vi = vsapi->getVideoInfo(d->node); 249 | int end = std::min(n + d->radius, vi->numFrames - 1); 250 | for (int i = start; i <= end; i++) { 251 | vsapi->requestFrameFilter(i, d->node, frameCtx); 252 | } 253 | return nullptr; 254 | } else if (activationReason != arAllFramesReady) { 255 | return nullptr; 256 | } 257 | 258 | auto mxcsr = get_control_word(); 259 | no_subnormals(); 260 | 261 | auto vi = vsapi->getVideoInfo(d->node); 262 | 263 | DFTTestThreadData thread_data; 264 | 265 | auto thread_id = std::this_thread::get_id(); 266 | if (d->num_uninitialized_threads.load(std::memory_order_acquire) == 0) { 267 | const auto & const_data = d->thread_data; 268 | thread_data = const_data.at(thread_id); 269 | } else { 270 | bool initialized = true; 271 | 272 | d->thread_data_lock.lock_shared(); 273 | try { 274 | const auto & const_data = d->thread_data; 275 | thread_data = const_data.at(thread_id); 276 | } catch (const std::out_of_range &) { 277 | initialized = false; 278 | } 279 | d->thread_data_lock.unlock_shared(); 280 | 281 | if (!initialized) { 282 | auto padded_size = ( 283 | (2 * d->radius + 1) * 284 | calc_pad_size(vi->height, d->block_size, d->block_step) * 285 | calc_pad_size(vi->width, d->block_size, d->block_step) * 286 | vi->format->bytesPerSample 287 | ); 288 | 289 | thread_data.padded = static_cast(std::malloc(padded_size)); 290 | thread_data.padded2 = static_cast(std::malloc( 291 | calc_pad_size(vi->height, d->block_size, d->block_step) * 292 | calc_pad_size(vi->width, d->block_size, d->block_step) * 293 | sizeof(float) 294 | )); 295 | 296 | { 297 | std::lock_guard _ { d->thread_data_lock }; 298 | d->thread_data.emplace(thread_id, thread_data); 299 | } 300 | 301 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order_release); 302 | } 303 | } 304 | 305 | std::vectorfreeFrame)>> src_frames; 306 | src_frames.reserve(2 * d->radius + 1); 307 | for (int i = n - d->radius; i <= n + d->radius; i++) { 308 | src_frames.emplace_back( 309 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 310 | vsapi->freeFrame 311 | ); 312 | } 313 | 314 | auto & src_center_frame = src_frames[d->radius]; 315 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 316 | 317 | const VSFrameRef * fr[] { 318 | d->process[0] ? nullptr : src_center_frame.get(), 319 | d->process[1] ? nullptr : src_center_frame.get(), 320 | d->process[2] ? 
nullptr : src_center_frame.get() 321 | }; 322 | const int pl[] { 0, 1, 2 }; 323 | std::unique_ptrfreeFrame)> dst_frame { 324 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 325 | vsapi->freeFrame 326 | }; 327 | 328 | for (int plane = 0; plane < format->numPlanes; plane++) { 329 | if (!d->process[plane]) { 330 | continue; 331 | } 332 | 333 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 334 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 335 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 336 | 337 | int padded_size_spatial = ( 338 | calc_pad_size(height, d->block_size, d->block_step) * 339 | calc_pad_size(width, d->block_size, d->block_step) 340 | ); 341 | 342 | std::memset(thread_data.padded2, 0, 343 | calc_pad_size(height, d->block_size, d->block_step) * 344 | calc_pad_size(width, d->block_size, d->block_step) * 345 | sizeof(float) 346 | ); 347 | 348 | for (int i = 0; i < 2 * d->radius + 1; i++) { 349 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 350 | reflection_padding( 351 | &thread_data.padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 352 | srcp, 353 | width, height, stride, 354 | d->block_size, d->block_step, 355 | vi->format->bytesPerSample 356 | ); 357 | } 358 | 359 | for (int i = 0; i < calc_pad_num(height, d->block_size, d->block_step); i++) { 360 | for (int j = 0; j < calc_pad_num(width, d->block_size, d->block_step); j++) { 361 | assert(d->block_size == 16); 362 | constexpr int block_size = 16; 363 | 364 | Vec16f block[7 * block_size * 2]; 365 | 366 | int offset_x = calc_pad_size(width, d->block_size, d->block_step); 367 | 368 | load_block( 369 | block, 370 | &thread_data.padded[(i * offset_x + j) * d->block_step * vi->format->bytesPerSample], 371 | d->radius, d->block_size, d->block_step, 372 | width, height, 373 | reinterpret_cast(d->window.get()), 374 | vi->format->bitsPerSample 375 | ); 376 | 377 | fused( 378 | block, 379 | reinterpret_cast(d->sigma.get()), 380 | d->sigma2, 381 | d->pmin, 382 | d->pmax, 383 | d->filter_type, 384 | d->zero_mean, 385 | reinterpret_cast(d->window_freq.get()), 386 | d->radius 387 | ); 388 | 389 | store_block( 390 | &thread_data.padded2[(i * offset_x + j) * d->block_step], 391 | &block[d->radius * block_size * 2], 392 | block_size, 393 | d->block_step, 394 | width, 395 | reinterpret_cast(&d->window[d->radius * block_size * 2 * 16]) 396 | ); 397 | } 398 | } 399 | 400 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 401 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 402 | int offset_y = (pad_height - height) / 2; 403 | int offset_x = (pad_width - width) / 2; 404 | 405 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 406 | store_frame( 407 | dstp, 408 | &thread_data.padded2[(offset_y * pad_width + offset_x)], 409 | width, 410 | height, 411 | stride, 412 | pad_width, 413 | vi->format->bitsPerSample 414 | ); 415 | } 416 | 417 | set_control_word(mxcsr); 418 | 419 | return dst_frame.release(); 420 | } 421 | 422 | 423 | #ifndef HAS_DISPATCH 424 | bool supported_arch() noexcept { 425 | #else // HAS_DISPATCH 426 | bool SUPPORTED_ARCH_NAME() noexcept { 427 | #endif // HAS_DISPATCH 428 | 429 | return instrset_detect() >= INSTRSET; 430 | } 431 | 432 | #ifndef HAS_DISPATCH 433 | const char * target_arch() noexcept { 434 | #if 0 <= INSTRSET && INSTRSET <= 10 435 | constexpr std::array dispatch_targets { 436 | "80386", "sse", "sse2", "sse3", 
"sse4.1", "sse4.2", 437 | "avx", "avx2", "avx512f", "avx512bw/dq/vl" 438 | }; 439 | return dispatch_targets[INSTRSET]; 440 | #else // 0 <= INSTRSET && INSTRSET <= 10 441 | return "unknown"; 442 | #endif // 0 <= INSTRSET && INSTRSET <= 10 443 | } 444 | #endif // HAS_DISPATCH 445 | -------------------------------------------------------------------------------- /cpu_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #if __cpp_lib_math_constants 12 | #include 13 | #endif // __cpp_lib_math_constants 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "dfttest2_cpu.h" 24 | #include "kernel.hpp" 25 | 26 | #include // generated by cmake, defines "VERSION" 27 | 28 | 29 | template 30 | #if __cpp_concepts 31 | requires 32 | (std::is_same_v || std::is_same_v>) 33 | #endif // __cpp_concepts 34 | static void dft( 35 | std::complex * VS_RESTRICT dst, 36 | const T_in * VS_RESTRICT src, 37 | int n, 38 | int stride 39 | ) { 40 | #if __cpp_lib_math_constants 41 | const auto pi = std::numbers::pi_v; 42 | #else // __cpp_lib_math_constants 43 | const auto pi = static_cast(M_PI); 44 | #endif // __cpp_lib_math_constants 45 | 46 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 47 | for (int i = 0; i < out_num; i++) { 48 | std::complex sum {}; 49 | for (int j = 0; j < n; j++) { 50 | auto imag = -2 * i * j * pi / n; 51 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 52 | sum += src[j * stride] * weight; 53 | } 54 | dst[i * stride] = sum; 55 | } 56 | } 57 | 58 | 59 | static void VS_CC DFTTestInit( 60 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 61 | VSCore *core, const VSAPI *vsapi 62 | ) noexcept { 63 | 64 | auto d = static_cast(*instanceData); 65 | 66 | auto vi = vsapi->getVideoInfo(d->node); 67 | vsapi->setVideoInfo(vi, 1, node); 68 | } 69 | 70 | 71 | static void VS_CC DFTTestFree( 72 | void *instanceData, VSCore *core, const VSAPI *vsapi 73 | ) noexcept { 74 | 75 | auto d = static_cast(instanceData); 76 | 77 | vsapi->freeNode(d->node); 78 | 79 | for (const auto & [_, thread_data] : d->thread_data) { 80 | std::free(thread_data.padded2); 81 | std::free(thread_data.padded); 82 | } 83 | 84 | delete d; 85 | } 86 | 87 | 88 | static void VS_CC DFTTestCreate( 89 | const VSMap *in, VSMap *out, void *userData, 90 | VSCore *core, const VSAPI *vsapi 91 | ) noexcept { 92 | 93 | auto d = std::make_unique(); 94 | 95 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 96 | 97 | auto set_error = [vsapi, out, &d](const char * error_message) -> void { 98 | vsapi->freeNode(d->node); 99 | vsapi->setError(out, error_message); 100 | return ; 101 | }; 102 | 103 | auto vi = vsapi->getVideoInfo(d->node); 104 | if (!isConstantFormat(vi)) { 105 | return set_error("only constant format input is supported"); 106 | } 107 | if (vi->format->sampleType == stInteger && vi->format->bytesPerSample > 2) { 108 | return set_error("only 8-16 bit integer format input is supported"); 109 | } 110 | if (vi->format->sampleType == stFloat && vi->format->bitsPerSample != 32) { 111 | return set_error("only 32-bit float format input is supported"); 112 | } 113 | 114 | int error; 115 | 116 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 117 | if (error) { 118 | d->radius = 0; 119 | } 120 | 121 | if (d->radius < 0 || d->radius > 3) { 122 | return 
set_error("\"radius\" must be in [0, 1, 2, 3]"); 123 | } 124 | 125 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 126 | if (error) { 127 | d->block_size = 16; 128 | } 129 | 130 | if (d->block_size != 16) { 131 | return set_error("\"block_size\" must be 16"); 132 | } 133 | 134 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 135 | if (error) { 136 | d->block_step = d->block_size; 137 | } 138 | 139 | int num_planes_args = vsapi->propNumElements(in, "planes"); 140 | d->process.fill(num_planes_args <= 0); 141 | for (int i = 0; i < num_planes_args; ++i) { 142 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 143 | 144 | if (plane < 0 || plane >= vi->format->numPlanes) { 145 | return set_error("plane index out of range"); 146 | } 147 | 148 | if (d->process[plane]) { 149 | return set_error("plane specified twice"); 150 | } 151 | 152 | d->process[plane] = true; 153 | } 154 | 155 | { 156 | auto ptr = vs_aligned_malloc( 157 | (2 * d->radius + 1) * d->block_size * d->block_size * sizeof(float), 158 | 64 159 | ); 160 | if (ptr == nullptr) { 161 | return set_error("alloc error"); 162 | } 163 | d->window.reset(ptr); 164 | } 165 | { 166 | auto window = vsapi->propGetFloatArray(in, "window", nullptr); 167 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size * d->block_size / 16; i++) { 168 | to_float(Vec8d().load(&window[i * 16])).store_a(&d->window[i * 16]); 169 | to_float(Vec8d().load(&window[i * 16 + 8])).store_a(&d->window[i * 16 + 8]); 170 | } 171 | } 172 | 173 | { 174 | auto ptr = vs_aligned_malloc( 175 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * sizeof(float), 176 | 64 177 | ); 178 | if (ptr == nullptr) { 179 | return set_error("alloc error"); 180 | } 181 | d->sigma.reset(ptr); 182 | } 183 | { 184 | auto sigma = vsapi->propGetFloatArray(in, "sigma", nullptr); 185 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 186 | float sigma_padded[16] {}; 187 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 188 | sigma_padded[j] = static_cast(sigma[i * (d->block_size / 2 + 1) + j]); 189 | } 190 | Vec16f().load(&sigma_padded[0]).store_a(&d->sigma[i * 16]); 191 | } 192 | } 193 | 194 | d->sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, nullptr)); 195 | d->pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, nullptr)); 196 | d->pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, nullptr)); 197 | 198 | d->filter_type = static_cast(vsapi->propGetInt(in, "filter_type", 0, nullptr)); 199 | 200 | d->zero_mean = !!vsapi->propGetInt(in, "zero_mean", 0, &error); 201 | if (error) { 202 | d->zero_mean = true; 203 | } 204 | if (d->zero_mean) { 205 | { 206 | auto ptr = vs_aligned_malloc( 207 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * 2 * sizeof(float), 208 | 64 209 | ); 210 | if (ptr == nullptr) { 211 | return set_error("alloc error"); 212 | } 213 | d->window_freq.reset(ptr); 214 | } 215 | auto window_freq = vsapi->propGetFloatArray(in, "window_freq", nullptr); 216 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 217 | float sigma_padded[32] {}; 218 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 219 | sigma_padded[j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2]); 220 | sigma_padded[16 + j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2 + 1]); 221 | } 222 | Vec16f().load(&sigma_padded[0]).store_a(&d->window_freq[i * 2 * 16]); 223 | Vec16f().load(&sigma_padded[16]).store_a(&d->window_freq[(i * 2 
+ 1) * 16]); 224 | } 225 | } 226 | 227 | VSCoreInfo info; 228 | vsapi->getCoreInfo2(core, &info); 229 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order::relaxed); 230 | d->thread_data.reserve(info.numThreads); 231 | 232 | #ifndef HAS_DISPATCH 233 | if (!supported_arch()) { 234 | return set_error("unsupported cpu architecture"); 235 | } 236 | 237 | vsapi->createFilter( 238 | in, out, "DFTTest", 239 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 240 | fmParallel, 0, d.release(), core 241 | ); 242 | #else 243 | auto opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &error)); 244 | if (error) { 245 | opt = 0; 246 | } 247 | 248 | constexpr std::array getframe_candidates { GETFRAME_PTRS }; 249 | 250 | if (opt == 0) { 251 | constexpr std::array supported_arch_candidates { SUPPORTED_ARCH_PTRS }; 252 | 253 | bool found_supported_impl = false; 254 | 255 | for (int i = static_cast(getframe_candidates.size()) - 1; i >= 0; i--) { 256 | if (supported_arch_candidates[i]()) { 257 | vsapi->createFilter( 258 | in, out, "DFTTest", 259 | DFTTestInit, getframe_candidates[i], DFTTestFree, 260 | fmParallel, 0, d.release(), core 261 | ); 262 | 263 | found_supported_impl = true; 264 | break; 265 | } 266 | } 267 | 268 | if (!found_supported_impl) { 269 | return set_error("unsupported cpu architecture"); 270 | } 271 | } else { 272 | if (0 < opt && opt < static_cast(getframe_candidates.size() + 1)) { 273 | vsapi->createFilter( 274 | in, out, "DFTTest", 275 | DFTTestInit, getframe_candidates[opt - 1], DFTTestFree, 276 | fmParallel, 0, d.release(), core 277 | ); 278 | } else { 279 | return set_error("invalid \"opt\""); 280 | } 281 | } 282 | #endif // HAS_DISPATCH 283 | } 284 | 285 | 286 | static void VS_CC RDFT( 287 | const VSMap *in, VSMap *out, void *userData, 288 | VSCore *core, const VSAPI *vsapi 289 | ) noexcept { 290 | 291 | auto set_error = [vsapi, out](const char * error_message) -> void { 292 | vsapi->setError(out, error_message); 293 | }; 294 | 295 | int ndim = vsapi->propNumElements(in, "shape"); 296 | if (ndim != 1 && ndim != 2 && ndim != 3) { 297 | return set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 298 | } 299 | 300 | std::array shape {}; 301 | { 302 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 303 | for (int i = 0; i < ndim; i++) { 304 | shape[i] = int64ToIntS(shape_array[i]); 305 | } 306 | } 307 | 308 | int size = 1; 309 | for (int i = 0; i < ndim; i++) { 310 | size *= shape[i]; 311 | } 312 | if (vsapi->propNumElements(in, "data") != size) { 313 | return set_error("cannot reshape array"); 314 | } 315 | 316 | int complex_size = shape[ndim - 1] / 2 + 1; 317 | for (int i = 0; i < ndim - 1; i++) { 318 | complex_size *= shape[i]; 319 | } 320 | 321 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 322 | 323 | auto output = std::make_unique []>(complex_size); 324 | 325 | if (ndim == 1) { 326 | dft(output.get(), input, size, 1); 327 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 328 | } else if (ndim == 2) { 329 | for (int i = 0; i < shape[0]; i++) { 330 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 331 | } 332 | 333 | auto output2 = std::make_unique []>(complex_size); 334 | 335 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 336 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 337 | } 338 | 339 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 340 | } else { 341 | for (int i = 0; i < shape[0] * shape[1]; i++) { 
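// first pass of the separable 3-D RDFT: real-to-complex 1-D transform along the innermost
// axis for each of the shape[0] * shape[1] rows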
342 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 343 | } 344 | 345 | auto output2 = std::make_unique []>(complex_size); 346 | 347 | for (int i = 0; i < shape[0]; i++) { 348 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 349 | dft( 350 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 351 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 352 | shape[1], 353 | (shape[2] / 2 + 1) 354 | ); 355 | } 356 | } 357 | 358 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 359 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 360 | } 361 | 362 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 363 | } 364 | } 365 | 366 | 367 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 368 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 369 | 370 | #ifdef HAS_DISPATCH 371 | constexpr std::array dispatch_targets { "auto", SUPPORTED_ARCH_STRS }; 372 | 373 | for (int i = 0; i < static_cast(dispatch_targets.size()); i++) { 374 | vsapi->propSetData(out, "dispatch_targets", dispatch_targets[i], -1, paAppend); 375 | } 376 | #else // HAS_DISPATCH 377 | vsapi->propSetData(out, "dispatch_targets", target_arch(), -1, paReplace); 378 | #endif // HAS_DISPATCH 379 | } 380 | 381 | 382 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 383 | VSConfigPlugin configFunc, 384 | VSRegisterFunction registerFunc, 385 | VSPlugin *plugin 386 | ) { 387 | 388 | configFunc( 389 | "io.github.amusementclub.dfttest2_cpu", 390 | "dfttest2_cpu", 391 | "DFTTest2 (CPU)", 392 | VAPOURSYNTH_API_VERSION, 1, plugin 393 | ); 394 | 395 | registerFunc( 396 | "DFTTest", 397 | "clip:clip;" 398 | "window:float[];" 399 | "sigma:float[];" 400 | "sigma2:float;" 401 | "pmin:float;" 402 | "pmax:float;" 403 | "filter_type:int;" 404 | "radius:int:opt;" 405 | "block_size:int:opt;" 406 | "block_step:int:opt;" 407 | "zero_mean:int:opt;" 408 | "window_freq:float[]:opt;" 409 | "planes:int[]:opt;" 410 | "opt:int:opt;", 411 | DFTTestCreate, nullptr, plugin 412 | ); 413 | 414 | registerFunc( 415 | "RDFT", 416 | "data:float[];" 417 | "shape:int[];", 418 | RDFT, nullptr, plugin 419 | ); 420 | 421 | registerFunc( 422 | "Version", 423 | "", 424 | Version, nullptr, plugin 425 | ); 426 | } 427 | -------------------------------------------------------------------------------- /cuda_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | #ifndef __CUDACC_RTC__ 6 | #include 7 | #endif // __CUDACC_RTC__ 8 | 9 | __device__ 10 | extern void filter(float2 & value, int x, int y, int z); 11 | 12 | // ZERO_MEAN 13 | // RADIUS 14 | // BLOCK_SIZE 15 | // BLOCK_STEP 16 | // IN_PLACE 17 | // WARPS_PER_BLOCK 18 | // WARP_SIZE 19 | // TYPE 20 | // SCALE 21 | // PEAK (optional) 22 | 23 | #if ZERO_MEAN 24 | // __device__ const float window_freq[]; // frequency response of the window 25 | #endif // ZERO_MEAN 26 | 27 | __device__ 28 | static int calc_pad_size(int size, int block_size, int block_step) { 29 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 30 | } 31 | 32 | __device__ 33 | static int calc_pad_num(int size, int block_size, int block_step) { 34 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 35 | } 36 | 37 | __device__ 38 | static float to_float(TYPE x) { 39 | return static_cast(x) * static_cast(SCALE); 40 | } 41 | 42 | __device__ 43 | static TYPE from_float(float x) { 44 | #ifdef PEAK 45 | x /= static_cast(SCALE); 46 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 47 | return static_cast(__float2int_rz(x)); 48 | #else // PEAK // only integral types define it 49 | return static_cast(x / static_cast(SCALE)); 50 | #endif // PEAK 51 | } 52 | 53 | extern "C" 54 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 55 | __global__ 56 | void im2col( 57 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 58 | float * __restrict__ dstp, 59 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 60 | int width, 61 | int height 62 | ) { 63 | 64 | int radius = static_cast(RADIUS); 65 | int block_size = static_cast(BLOCK_SIZE); 66 | int padded_block_size = IN_PLACE ? (block_size / 2 + 1) * 2 : block_size; 67 | int block_step = static_cast(BLOCK_STEP); 68 | 69 | int horizontal_num = calc_pad_num(width, block_size, block_step); 70 | int vertical_num = calc_pad_num(height, block_size, block_step); 71 | int horizontal_size = calc_pad_size(width, block_size, block_step); 72 | int vertical_size = calc_pad_size(height, block_size, block_step); 73 | int num_blocks = vertical_num * horizontal_num; 74 | 75 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 76 | int ix = i % horizontal_num; 77 | int iy = i / horizontal_num; 78 | auto dst = &dstp[i * (2 * radius + 1) * block_size * padded_block_size]; 79 | for (int j = 0; j < 2 * radius + 1; j++) { 80 | auto src = &srcp[(j * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 81 | for (int k = threadIdx.x % WARP_SIZE; k < block_size * block_size; k += WARP_SIZE) { 82 | int kx = k % block_size; 83 | int ky = k / block_size; 84 | float val = to_float(src[ky * horizontal_size + kx]) * window[j * block_size * block_size + k]; 85 | #if IN_PLACE == 1 86 | dst[(j * block_size + k / block_size) * padded_block_size + k % block_size] = val; 87 | #else 88 | dst[j * block_size * block_size + k] = val; 89 | #endif 90 | } 91 | } 92 | } 93 | } 94 | 95 | extern "C" 96 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 97 | __global__ 98 | void frequency_filtering( 99 | float2 * data, 100 | int num_blocks 101 | ) { 102 | 103 | int radius = static_cast(RADIUS); 104 | int block_size_1d = static_cast(BLOCK_SIZE); 105 | 106 | // each warp is responsible for a single block 107 | // assume that blockDim.x % WARP_SIZE == 0 108 | 109 | int block_size_x = block_size_1d / 2 + 1; 110 | int block_size_2d = block_size_1d * block_size_x; 111 | int block_size_3d = (2 * radius + 1) * block_size_2d; 112 | 113 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 114 | #if ZERO_MEAN 115 | float gf; 116 | if (threadIdx.x % WARP_SIZE == 0) { 117 | gf = data[i * block_size_3d].x / window_freq[0]; 118 | } 119 | gf = __shfl_sync(0xFFFFFFFF, gf, 0); 120 | #endif // ZERO_MEAN 121 | 122 | for (int j = threadIdx.x % WARP_SIZE; j < block_size_3d; j += WARP_SIZE) { 123 | float2 local_data = data[i * 
block_size_3d + j]; 124 | 125 | #if ZERO_MEAN 126 | // remove mean 127 | float val1 = gf * window_freq[j * 2]; 128 | float val2 = gf * window_freq[j * 2 + 1]; 129 | local_data.x -= val1; 130 | local_data.y -= val2; 131 | #endif // ZERO_MEAN 132 | 133 | filter( 134 | local_data, 135 | j % block_size_x, 136 | (j % block_size_2d) / block_size_x, 137 | (j % block_size_3d) / block_size_2d 138 | ); 139 | 140 | #if ZERO_MEAN 141 | // add mean 142 | local_data.x += val1; 143 | local_data.y += val2; 144 | #endif // ZERO_MEAN 145 | 146 | data[i * block_size_3d + j] = local_data; 147 | } 148 | } 149 | } 150 | 151 | extern "C" 152 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 153 | __global__ 154 | void col2im( 155 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 156 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 157 | const float * __restrict__ src, 158 | int width, 159 | int height 160 | ) { 161 | 162 | int radius = static_cast(RADIUS); 163 | int block_size = static_cast(BLOCK_SIZE); 164 | int padded_block_size = IN_PLACE ? (block_size / 2 + 1) * 2 : block_size; 165 | int block_step = static_cast(BLOCK_STEP); 166 | 167 | // each thread is responsible for a single pixel 168 | int horizontal_size = calc_pad_size(width, block_size, block_step); 169 | int horizontal_num = calc_pad_num(width, block_size, block_step); 170 | int vertical_size = calc_pad_size(height, block_size, block_step); 171 | int vertical_num = calc_pad_num(height, block_size, block_step); 172 | int pad_x = (horizontal_size - width) / 2; 173 | int pad_y = (vertical_size - height) / 2; 174 | 175 | int x = blockIdx.x * blockDim.x + threadIdx.x; 176 | int y = blockIdx.y * blockDim.y + threadIdx.y; 177 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 178 | return ; 179 | } 180 | 181 | float sum {}; 182 | 183 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 184 | int i2 = min(y / block_step, vertical_num - 1); 185 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 186 | int j2 = min(x / block_step, horizontal_num - 1); 187 | 188 | for (int i = i1; i <= i2; i++) { 189 | int offset_y = y - i * block_step; 190 | for (int j = j1; j <= j2; j++) { 191 | int offset_x = x - j * block_step; 192 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * padded_block_size + offset_x; 193 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 194 | sum += src[src_offset] * window[window_offset]; 195 | } 196 | } 197 | 198 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 199 | } 200 | )"""; 201 | 202 | #endif // KERNEL_HPP 203 | -------------------------------------------------------------------------------- /cuda_source/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DLL_DIR L"vsmlrt-cuda" 10 | 11 | #include 12 | 13 | namespace { 14 | namespace fs = std::filesystem; 15 | static fs::path dllDir() { 16 | static const std::wstring res = []() -> std::wstring { 17 | HMODULE mod = 0; 18 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 19 | std::vector buf; 20 | size_t n = 0; 21 | do { 22 | buf.resize(buf.size() + MAX_PATH); 23 | n = 
GetModuleFileNameW(mod, buf.data(), buf.size()); 24 | } while (n >= buf.size()); 25 | buf.resize(n); 26 | std::wstring path(buf.begin(), buf.end()); 27 | return path; 28 | } 29 | throw std::runtime_error("unable to locate myself"); 30 | }(); 31 | return fs::path(res).parent_path(); 32 | } 33 | 34 | FARPROC loadDLLs(std::string dll) { 35 | fs::path p = dllDir() / DLL_DIR / dll; 36 | std::wstring s = p; 37 | HMODULE h = nullptr; 38 | h = LoadLibraryW(s.c_str()); 39 | if (getenv("VS_DFTTEST2_VERBOSE")) 40 | std::wcerr << L"vs-dfttest2: preloading " << p << L": " << h << std::endl; 41 | if (!h) { 42 | std::wcerr << L"vs-dfttest2: failed to preload " << p << std::endl; 43 | h = LoadLibraryA(dll.c_str()); 44 | } 45 | if (!h) 46 | std::cerr << "vs-dfttest2: failed to preload " << dll << std::endl; 47 | return (FARPROC)h; 48 | } 49 | 50 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 51 | switch (reason) { 52 | case dliNoteStartProcessing: 53 | case dliNoteEndProcessing: 54 | // Nothing to do here. 55 | break; 56 | case dliNotePreLoadLibrary: { 57 | //std::cerr << "loading " << info->szDll << std::endl; 58 | std::string dll {info->szDll}; 59 | if (dll.find("cufft64") != std::string::npos) 60 | return loadDLLs(dll); 61 | break; 62 | } 63 | case dliNotePreGetProcAddress: 64 | // Nothing to do here. 65 | break; 66 | case dliFailLoadLib: 67 | case dliFailGetProc: 68 | // Returning NULL from error notifications will cause the delay load 69 | // runtime to raise a VcppException structured exception, that some code 70 | // might want to handle. 71 | return NULL; 72 | break; 73 | default: 74 | abort(); // unreachable. 75 | break; 76 | } 77 | // Returning NULL causes the delay load machinery to perform default 78 | // processing for this notification. 
79 | return NULL; 80 | } 81 | } // namespace 82 | 83 | extern "C" { 84 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 85 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 86 | }; 87 | #endif 88 | -------------------------------------------------------------------------------- /dfttest2.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | 3 | from dataclasses import dataclass 4 | import math 5 | from string import Template 6 | import typing 7 | 8 | import vapoursynth as vs 9 | from vapoursynth import core 10 | 11 | 12 | __all__ = ["DFTTest", "DFTTest2", "Backend"] 13 | 14 | 15 | class Backend: 16 | @dataclass(frozen=False) 17 | class cuFFT: 18 | device_id: int = 0 19 | in_place: bool = True 20 | 21 | @dataclass(frozen=False) 22 | class NVRTC: 23 | device_id: int = 0 24 | num_streams: int = 1 25 | 26 | @dataclass(frozen=False) 27 | class CPU: 28 | opt: int = 0 29 | 30 | @dataclass(frozen=False) 31 | class GCC: 32 | pass 33 | 34 | @dataclass(frozen=False) 35 | class hipFFT: 36 | device_id: int = 0 37 | in_place: bool = True 38 | 39 | @dataclass(frozen=False) 40 | class HIPRTC: 41 | device_id: int = 0 42 | num_streams: int = 1 43 | 44 | backendT = typing.Union[Backend.cuFFT, Backend.NVRTC, Backend.CPU, Backend.GCC, Backend.hipFFT, Backend.HIPRTC] 45 | 46 | 47 | def init_backend(backend: backendT) -> backendT: 48 | if backend is Backend.cuFFT: # type: ignore 49 | backend = Backend.cuFFT() 50 | elif backend is Backend.NVRTC: # type: ignore 51 | backend = Backend.NVRTC() 52 | elif backend is Backend.CPU: # type: ignore 53 | backend = Backend.CPU() 54 | elif backend is Backend.GCC: # type: ignore 55 | backend = Backend.GCC() 56 | elif backend is Backend.hipFFT: # type: ignore 57 | backend = Backend.hipFFT() 58 | elif backend is Backend.HIPRTC: # type: ignore 59 | backend = Backend.HIPRTC() 60 | return backend 61 | 62 | 63 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 64 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L518 65 | def normalize( 66 | window: typing.Sequence[float], 67 | size: int, 68 | step: int 69 | ) -> typing.List[float]: 70 | 71 | nw = [0.0] * size 72 | for q in range(size): 73 | for h in range(q, -1, -step): 74 | nw[q] += window[h] ** 2 75 | for h in range(q + step, size, step): 76 | nw[q] += window[h] ** 2 77 | return [window[q] / math.sqrt(nw[q]) for q in range(size)] 78 | 79 | 80 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 81 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L462 82 | def get_window_value(location: float, size: int, mode: int, beta: float) -> float: 83 | temp = math.pi * location / size 84 | if mode == 0: # hanning 85 | return 0.5 * (1 - math.cos(2 * temp)) 86 | elif mode == 1: # hamming 87 | return 0.53836 - 0.46164 * math.cos(2 * temp) 88 | elif mode == 2: # blackman 89 | return 0.42 - 0.5 * math.cos(2 * temp) + 0.08 * math.cos(4 * temp) 90 | elif mode == 3: # 4 term blackman-harris 91 | return ( 92 | 0.35875 93 | - 0.48829 * math.cos(2 * temp) 94 | + 0.14128 * math.cos(4 * temp) 95 | - 0.01168 * math.cos(6 * temp) 96 | ) 97 | elif mode == 4: # kaiser-bessel 98 | def i0(p: float) -> float: 99 | p /= 2 100 | n = t = d = 1.0 101 | k = 1 102 | while True: 103 | n *= p 104 | d *= k 105 | v = n / d 106 | t += v * v 107 | k += 1 108 | if k >= 15 or v <= 1e-8: 109 | break 110 | return t 111 | v = 2 * location / size - 1 112 | return i0(math.pi * beta * math.sqrt(1 - v * v)) / i0(math.pi * beta) 
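    # Note on the branch above: i0() evaluates the zeroth-order modified Bessel
    # function I0 with a truncated power series (at most 14 terms past the
    # leading 1, stopping earlier once the term factor drops below 1e-8), so the
    # Kaiser-Bessel window value is I0(pi * beta * sqrt(1 - v^2)) / I0(pi * beta)
    # with v = 2 * location / size - 1.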
113 | elif mode == 5: # 7 term blackman-harris 114 | return ( 115 | 0.27105140069342415 116 | - 0.433297939234486060 * math.cos(2 * temp) 117 | + 0.218122999543110620 * math.cos(4 * temp) 118 | - 0.065925446388030898 * math.cos(6 * temp) 119 | + 0.010811742098372268 * math.cos(8 * temp) 120 | - 7.7658482522509342e-4 * math.cos(10 * temp) 121 | + 1.3887217350903198e-5 * math.cos(12 * temp) 122 | ) 123 | elif mode == 6: # flat top 124 | return ( 125 | 0.2810639 126 | - 0.5208972 * math.cos(2 * temp) 127 | + 0.1980399 * math.cos(4 * temp) 128 | ) 129 | elif mode == 7: # rectangular 130 | return 1.0 131 | elif mode == 8: # Bartlett 132 | return 1 - 2 * abs(location - size / 2) / size 133 | elif mode == 9: # bartlett-hann 134 | return 0.62 - 0.48 * (location / size - 0.5) - 0.38 * math.cos(2 * temp) 135 | elif mode == 10: # nuttall 136 | return ( 137 | 0.355768 138 | - 0.487396 * math.cos(2 * temp) 139 | + 0.144232 * math.cos(4 * temp) 140 | - 0.012604 * math.cos(6 * temp) 141 | ) 142 | elif mode == 11: # blackman-nuttall 143 | return ( 144 | 0.3635819 145 | - 0.4891775 * math.cos(2 * temp) 146 | + 0.1365995 * math.cos(4 * temp) 147 | - 0.0106411 * math.cos(6 * temp) 148 | ) 149 | else: 150 | raise ValueError("unknown window") 151 | 152 | 153 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 154 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L461 155 | def get_window( 156 | radius: int, 157 | block_size: int, 158 | block_step: int, 159 | spatial_window_mode: int, 160 | spatial_beta: float, 161 | temporal_window_mode: int, 162 | temporal_beta: float 163 | ) -> typing.List[float]: 164 | 165 | temporal_window = [ 166 | get_window_value( 167 | location = i + 0.5, 168 | size = 2 * radius + 1, 169 | mode = temporal_window_mode, 170 | beta = temporal_beta 171 | ) for i in range(2 * radius + 1) 172 | ] 173 | 174 | spatial_window = [ 175 | get_window_value( 176 | location = i + 0.5, 177 | size = block_size, 178 | mode = spatial_window_mode, 179 | beta = spatial_beta 180 | ) for i in range(block_size) 181 | ] 182 | 183 | spatial_window = normalize( 184 | window=spatial_window, 185 | size=block_size, 186 | step=block_step 187 | ) 188 | 189 | window = [] 190 | for t_val in temporal_window: 191 | for s_val1 in spatial_window: 192 | for s_val2 in spatial_window: 193 | value = t_val * s_val1 * s_val2 194 | 195 | # normalize for unnormalized FFT implementation 196 | value /= math.sqrt(2 * radius + 1) * block_size 197 | 198 | window.append(value) 199 | 200 | return window 201 | 202 | 203 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 204 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L581 205 | def get_location( 206 | position: float, 207 | length: int 208 | ) -> float: 209 | 210 | if length == 1: 211 | return 0.0 212 | elif position > length // 2: 213 | return (length - position) / (length // 2) 214 | else: 215 | return position / (length // 2) 216 | 217 | 218 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 219 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L581 220 | def get_sigma( 221 | position: float, 222 | length: int, 223 | func: typing.Callable[[float], float] 224 | ) -> float: 225 | 226 | if length == 1: 227 | return 1.0 228 | else: 229 | return func(get_location(position, length)) 230 | 231 | 232 | def DFTTest2( 233 | clip: vs.VideoNode, 234 | ftype: typing.Literal[0, 1, 2, 3, 4] = 0, 235 | sigma: typing.Union[float, typing.Sequence[typing.Callable[[float], float]]] 
= 8.0, 236 | sigma2: float = 8.0, 237 | pmin: float = 0.0, 238 | pmax: float = 500.0, 239 | sbsize: int = 16, 240 | sosize: int = 12, 241 | tbsize: int = 3, 242 | swin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 0, 243 | twin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 7, 244 | sbeta: float = 2.5, 245 | tbeta: float = 2.5, 246 | zmean: bool = True, 247 | f0beta: float = 1.0, 248 | ssystem: typing.Literal[0, 1] = 0, 249 | planes: typing.Optional[typing.Union[int, typing.Sequence[int]]] = None, 250 | backend: backendT = Backend.cuFFT() 251 | ) -> vs.VideoNode: 252 | """ this interface is not stable """ 253 | 254 | # translate parameters 255 | if ftype == 0: 256 | if abs(f0beta - 1) < 0.00005: 257 | filter_type = 0 258 | elif abs(f0beta - 0.5) < 0.0005: 259 | filter_type = 6 260 | else: 261 | filter_type = 5 262 | else: 263 | filter_type = ftype 264 | 265 | radius = (tbsize - 1) // 2 266 | block_size = sbsize 267 | block_step = sbsize - sosize 268 | spatial_window_mode = swin 269 | temporal_window_mode = twin 270 | spatial_beta = sbeta 271 | temporal_beta = tbeta 272 | zero_mean = zmean 273 | backend = init_backend(backend) 274 | 275 | if isinstance(backend, (Backend.CPU, Backend.NVRTC, Backend.GCC, Backend.HIPRTC)): 276 | if radius not in range(4): 277 | raise ValueError("invalid radius (tbsize)") 278 | if block_size != 16: 279 | raise ValueError("invalid block_size (sbsize)") 280 | 281 | # compute constants 282 | try: 283 | sigma_scalar = float(sigma) # type: ignore 284 | sigma_is_scalar = True 285 | except: 286 | # compute sigma_array 287 | 288 | sigma_is_scalar = False 289 | 290 | sigma_funcs = typing.cast(typing.Sequence[typing.Callable[[float], float]], sigma) 291 | if callable(sigma_funcs): 292 | sigma_funcs = [sigma_funcs] 293 | else: 294 | sigma_funcs = list(sigma_funcs) 295 | sigma_funcs.extend([sigma_funcs[-1]] * 3) 296 | sigma_func_x, sigma_func_y, sigma_func_t = sigma_funcs[:3] 297 | 298 | sigma_array = [] 299 | 300 | if ssystem == 0: 301 | for t in range(2 * radius + 1): 302 | sigma_t = get_sigma(position=t, length=2*radius+1, func=sigma_func_t) 303 | for y in range(block_size): 304 | sigma_y = get_sigma(position=y, length=block_size, func=sigma_func_y) 305 | for x in range(block_size // 2 + 1): 306 | sigma_x = get_sigma(position=x, length=block_size, func=sigma_func_x) 307 | 308 | sigma = sigma_t * sigma_y * sigma_x 309 | sigma_array.append(sigma) 310 | else: 311 | for t in range(2 * radius + 1): 312 | loc_t = get_location(position=t, length=2*radius+1) 313 | for y in range(block_size): 314 | loc_y = get_location(position=y, length=block_size) 315 | for x in range(block_size // 2 + 1): 316 | loc_x = get_location(position=x, length=block_size) 317 | 318 | ndim = 3 if radius > 0 else 2 319 | location = math.sqrt((loc_t * loc_t + loc_y * loc_y + loc_x * loc_x) / ndim) 320 | sigma = sigma_func_t(location) 321 | sigma_array.append(sigma) 322 | 323 | window = get_window( 324 | radius=radius, 325 | block_size=block_size, 326 | block_step=block_step, 327 | spatial_window_mode=spatial_window_mode, 328 | temporal_window_mode=temporal_window_mode, 329 | spatial_beta=spatial_beta, 330 | temporal_beta=temporal_beta 331 | ) 332 | 333 | wscale = math.fsum(w * w for w in window) 334 | 335 | if ftype < 2: 336 | if sigma_is_scalar: 337 | sigma_scalar *= wscale 338 | else: 339 | sigma_array = [s * wscale for s in sigma_array] 340 | sigma2 *= wscale 341 | 342 | pmin *= wscale 343 | pmax *= wscale 344 | 345 | if isinstance(backend, Backend.cuFFT): 346 | rdft = 
core.dfttest2_cuda.RDFT 347 | elif isinstance(backend, Backend.NVRTC): 348 | rdft = core.dfttest2_nvrtc.RDFT 349 | elif isinstance(backend, Backend.CPU): 350 | rdft = core.dfttest2_cpu.RDFT 351 | elif isinstance(backend, Backend.GCC): 352 | rdft = core.dfttest2_gcc.RDFT 353 | elif isinstance(backend, Backend.hipFFT): 354 | rdft = core.dfttest2_hip.RDFT 355 | elif isinstance(backend, Backend.HIPRTC): 356 | rdft = core.dfttest2_hiprtc.RDFT 357 | else: 358 | raise TypeError("unknown backend") 359 | 360 | if radius == 0: 361 | window_freq = rdft( 362 | data=[w * 255 for w in window], 363 | shape=(block_size, block_size) 364 | ) 365 | else: 366 | window_freq = rdft( 367 | data=[w * 255 for w in window], 368 | shape=(2 * radius + 1, block_size, block_size) 369 | ) 370 | 371 | if isinstance(backend, Backend.CPU): 372 | return core.dfttest2_cpu.DFTTest( 373 | clip, 374 | window=window, 375 | sigma=[sigma_scalar] * (2 * radius + 1) * block_size * (block_size // 2 + 1) if sigma_is_scalar else sigma_array, 376 | sigma2=sigma2, 377 | pmin=pmin, 378 | pmax=pmax, 379 | radius=radius, 380 | block_size=block_size, 381 | block_step=block_step, 382 | planes=planes, 383 | filter_type=filter_type, 384 | window_freq=window_freq, 385 | opt=backend.opt 386 | ) 387 | elif isinstance(backend, Backend.GCC): 388 | return core.dfttest2_gcc.DFTTest( 389 | clip, 390 | window=window, 391 | sigma=[sigma_scalar] * (2 * radius + 1) * block_size * (block_size // 2 + 1) if sigma_is_scalar else sigma_array, 392 | sigma2=sigma2, 393 | pmin=pmin, 394 | pmax=pmax, 395 | radius=radius, 396 | block_size=block_size, 397 | block_step=block_step, 398 | planes=planes, 399 | filter_type=filter_type, 400 | window_freq=window_freq 401 | ) 402 | 403 | if isinstance(backend, Backend.cuFFT): 404 | to_single = core.dfttest2_cuda.ToSingle 405 | elif isinstance(backend, Backend.NVRTC): 406 | to_single = core.dfttest2_nvrtc.ToSingle 407 | elif isinstance(backend, Backend.hipFFT): 408 | to_single = core.dfttest2_hip.ToSingle 409 | elif isinstance(backend, Backend.HIPRTC): 410 | to_single = core.dfttest2_hiprtc.ToSingle 411 | else: 412 | raise TypeError("unknown backend") 413 | 414 | kernel = Template( 415 | """ 416 | #define FILTER_TYPE ${filter_type} 417 | #define ZERO_MEAN ${zero_mean} 418 | #define SIGMA_IS_SCALAR ${sigma_is_scalar} 419 | 420 | #if ZERO_MEAN 421 | __device__ static const float window_freq[] { ${window_freq} }; 422 | #endif // ZERO_MEAN 423 | 424 | __device__ static const float window[] { ${window} }; 425 | 426 | __device__ 427 | static void filter(float2 & value, int x, int y, int t) { 428 | #if SIGMA_IS_SCALAR 429 | float sigma = static_cast(${sigma}); 430 | #else // SIGMA_IS_SCALAR 431 | __device__ static const float sigma_array[] { ${sigma} }; 432 | float sigma = sigma_array[(t * BLOCK_SIZE + y) * (BLOCK_SIZE / 2 + 1) + x]; 433 | #endif // SIGMA_IS_SCALAR 434 | [[maybe_unused]] float sigma2 = static_cast(${sigma2}); 435 | [[maybe_unused]] float pmin = static_cast(${pmin}); 436 | [[maybe_unused]] float pmax = static_cast(${pmax}); 437 | [[maybe_unused]] float multiplier {}; 438 | 439 | #if FILTER_TYPE == 2 440 | value.x *= sigma; 441 | value.y *= sigma; 442 | return ; 443 | #endif 444 | 445 | float psd = value.x * value.x + value.y * value.y; 446 | 447 | #if FILTER_TYPE == 1 448 | if (psd < sigma) { 449 | value.x = 0.0f; 450 | value.y = 0.0f; 451 | } 452 | return ; 453 | #elif FILTER_TYPE == 0 454 | multiplier = fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f); 455 | #elif FILTER_TYPE == 3 456 | if (psd >= pmin && psd <= pmax) { 
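    // ftype 3 (see the DFTTest docstring): psd = re*re + im*im was computed
    // above; coefficients whose psd falls inside [pmin, pmax] are scaled by
    // sigma, while the else branch below falls back to sigma2.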
457 | multiplier = sigma; 458 | } else { 459 | multiplier = sigma2; 460 | } 461 | #elif FILTER_TYPE == 4 462 | multiplier = sigma * sqrtf(psd * (pmax / ((psd + pmin) * (psd + pmax) + 1e-15f))); 463 | #elif FILTER_TYPE == 5 464 | multiplier = powf(fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f), pmin); 465 | #else 466 | multiplier = sqrtf(fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f)); 467 | #endif 468 | 469 | value.x *= multiplier; 470 | value.y *= multiplier; 471 | } 472 | """ 473 | ).substitute( 474 | sigma_is_scalar=int(sigma_is_scalar), 475 | sigma=( 476 | to_single(sigma_scalar) 477 | if sigma_is_scalar 478 | else ','.join(str(to_single(x)) for x in sigma_array) 479 | ), 480 | sigma2=to_single(sigma2), 481 | pmin=to_single(pmin), 482 | pmax=to_single(pmax), 483 | filter_type=int(filter_type), 484 | window_freq=','.join(str(to_single(x)) for x in window_freq), 485 | zero_mean=int(zero_mean), 486 | window=','.join(str(to_single(x)) for x in window), 487 | ) 488 | 489 | if isinstance(backend, Backend.cuFFT): 490 | return core.dfttest2_cuda.DFTTest( 491 | clip, 492 | kernel=kernel, 493 | radius=radius, 494 | block_size=block_size, 495 | block_step=block_step, 496 | planes=planes, 497 | in_place=backend.in_place, 498 | device_id=backend.device_id 499 | ) 500 | elif isinstance(backend, Backend.NVRTC): 501 | return core.dfttest2_nvrtc.DFTTest( 502 | clip, 503 | kernel=kernel, 504 | radius=radius, 505 | block_size=block_size, 506 | block_step=block_step, 507 | planes=planes, 508 | in_place=False, 509 | device_id=backend.device_id, 510 | num_streams=backend.num_streams 511 | ) 512 | if isinstance(backend, Backend.hipFFT): 513 | return core.dfttest2_hip.DFTTest( 514 | clip, 515 | kernel=kernel, 516 | radius=radius, 517 | block_size=block_size, 518 | block_step=block_step, 519 | planes=planes, 520 | in_place=backend.in_place, 521 | device_id=backend.device_id 522 | ) 523 | elif isinstance(backend, Backend.HIPRTC): 524 | return core.dfttest2_hiprtc.DFTTest( 525 | clip, 526 | kernel=kernel, 527 | radius=radius, 528 | block_size=block_size, 529 | block_step=block_step, 530 | planes=planes, 531 | in_place=False, 532 | device_id=backend.device_id, 533 | num_streams=backend.num_streams 534 | ) 535 | else: 536 | raise TypeError("unknown backend") 537 | 538 | 539 | def select_backend( 540 | backend: typing.Optional[backendT], 541 | sbsize: int, 542 | tbsize: int 543 | ) -> backendT: 544 | 545 | if backend is not None: 546 | return backend 547 | 548 | if sbsize == 16 and tbsize in [1, 3, 5, 7]: 549 | if hasattr(core, "dfttest2_nvrtc"): 550 | return Backend.NVRTC() 551 | elif hasattr(core, "dfttest2_hiprtc"): 552 | return Backend.HIPRTC() 553 | elif hasattr(core, "dfttest2_cuda"): 554 | return Backend.cuFFT() 555 | elif hasattr(core, "dfttest2_hip"): 556 | return Backend.hipFFT() 557 | elif hasattr(core, "dfttest2_cpu"): 558 | return Backend.CPU() 559 | else: 560 | return Backend.GCC() 561 | else: 562 | if hasattr(core, "dfttest2_cuda"): 563 | return Backend.cuFFT() 564 | else: 565 | return Backend.hipFFT() 566 | 567 | 568 | FREQ = float 569 | SIGMA = float 570 | def flatten( 571 | data: typing.Optional[typing.Union[ 572 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 573 | typing.Sequence[float] 574 | ]] 575 | ) -> typing.Optional[typing.List[float]]: 576 | 577 | import itertools as it 578 | import numbers 579 | 580 | if data is None: 581 | return None 582 | elif isinstance(data[0], numbers.Real): 583 | return data 584 | else: 585 | data = typing.cast(typing.Sequence[typing.Tuple[FREQ, SIGMA]], data) 586 | 
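        # "data" is the [(freq, sigma), ...] form documented for
        # slocation/ssx/ssy/sst; chaining the pairs produces the flat
        # [freq, sigma, freq, sigma, ...] list that to_func() below splits back
        # into locations (data[::2]) and sigmas (data[1::2]).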
return list(it.chain.from_iterable(data)) 587 | 588 | 589 | def to_func( 590 | data: typing.Optional[typing.Sequence[float]], 591 | norm: typing.Callable[[float], float], 592 | sigma: float 593 | ) -> typing.Callable[[float], float]: 594 | 595 | if data is None: 596 | return lambda _: norm(sigma) 597 | 598 | locations = data[::2] 599 | sigmas = data[1::2] 600 | packs = list(zip(locations, sigmas)) 601 | packs = sorted(packs, key=lambda group: group[0]) 602 | 603 | def func(x: float) -> float: 604 | length = len(packs) 605 | for i in range(length - 1): 606 | if x <= packs[i + 1][0]: 607 | weight = (x - packs[i][0]) / (packs[i + 1][0] - packs[i][0]) 608 | return (1 - weight) * norm(packs[i][1]) + weight * norm(packs[i + 1][1]) 609 | raise ValueError() 610 | 611 | return func 612 | 613 | 614 | def DFTTest( 615 | clip: vs.VideoNode, 616 | ftype: typing.Literal[0, 1, 2, 3, 4] = 0, 617 | sigma: float = 8.0, 618 | sigma2: float = 8.0, 619 | pmin: float = 0.0, 620 | pmax: float = 500.0, 621 | sbsize: int = 16, 622 | smode: typing.Literal[0, 1] = 1, 623 | sosize: int = 12, 624 | tbsize: int = 3, 625 | # tmode=0, tosize=0 626 | swin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 0, 627 | twin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 7, 628 | sbeta: float = 2.5, 629 | tbeta: float = 2.5, 630 | zmean: bool = True, 631 | f0beta: float = 1.0, 632 | nlocation: typing.Optional[typing.Sequence[int]] = None, 633 | alpha: typing.Optional[float] = None, 634 | slocation: typing.Optional[typing.Union[ 635 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 636 | typing.Sequence[float] 637 | ]] = None, 638 | ssx: typing.Optional[typing.Union[ 639 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 640 | typing.Sequence[float] 641 | ]] = None, 642 | ssy: typing.Optional[typing.Union[ 643 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 644 | typing.Sequence[float] 645 | ]] = None, 646 | sst: typing.Optional[typing.Union[ 647 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 648 | typing.Sequence[float] 649 | ]] = None, 650 | ssystem: typing.Literal[0, 1] = 0, 651 | planes: typing.Optional[typing.Union[int, typing.Sequence[int]]] = None, 652 | backend: typing.Optional[backendT] = None 653 | ) -> vs.VideoNode: 654 | """ 2D/3D frequency domain denoiser 655 | 656 | The interface is compatible with core.dfttest.DFTTest by HolyWu. 657 | 658 | Args: 659 | clip: Clip to process. 660 | 661 | Any format with either integer sample type of 8-16 bit depth 662 | or float sample type of 32 bit depth is supported. 663 | 664 | ftype: Controls the filter type. 665 | 666 | Possible settings are: 667 | 0: generalized wiener filter 668 | mult = max((psd - sigma) / psd, 0) ^ f0beta 669 | 670 | 1: hard threshold 671 | mult = psd < sigma ? 0.0 : 1.0 672 | 673 | 2: multiplier 674 | mult = sigma 675 | 676 | 3: multiplier switched based on psd value 677 | mult = (psd >= pmin && psd <= pmax) ? sigma : sigma2 678 | 679 | 4: multiplier modified based on psd value and range 680 | mult = sigma * sqrt((psd * pmax) / ((psd + pmin) * (psd + pmax))) 681 | 682 | The real and imaginary parts of each complex dft coefficient are multiplied 683 | by the corresponding 'mult' value. 684 | 685 | ** psd = magnitude squared = real*real + imag*imag 686 | 687 | sigma, sigma2: Value of sigma and sigma2. 688 | If using the slocation parameter then the sigma parameter is ignored. 689 | 690 | pmin, pmax: Used as described in the ftype parameter description. 691 | 692 | sbsize: Sets the length of the sides of the spatial window. 693 | Must be 1 or greater. 
Must be odd if using smode=0. 694 | 695 | smode: Sets the mode for spatial operation. 696 | Currently only smode=1 is implemented. 697 | 698 | sosize: Sets the spatial overlap amount. 699 | Must be in the range 0 to sbsize-1 (inclusive). 700 | If sosize is greater than sbsize>>1, then sbsize%(sbsize-sosize) must equal 0. 701 | In other words, overlap greater than 50% requires that sbsize-sosize be a divisor of sbsize. 702 | 703 | tbsize: Sets the length of the temporal dimension (i.e. number of frames). 704 | Must be at least 1. Must be odd if using tmode=0. 705 | 706 | tmode: Sets the mode for temporal operation. 707 | Currently only tmode=0 is implemented. 708 | 709 | tosize: Sets the temporal overlap amount. 710 | Must be in the range 0 to tbsize-1 (inclusive). 711 | If tosize is greater than tbsize>>1, then tbsize%(tbsize-tosize) must equal 0. 712 | In other words, overlap greater than 50% requires that tbsize-tosize be a divisor of tbsize. 713 | 714 | swin, twin: Sets the type of analysis/synthesis window to be used for spatial (swin) and 715 | temporal (twin) processing. Possible settings: 716 | 717 | 0: hanning 718 | 1: hamming 719 | 2: blackman 720 | 3: 4 term blackman-harris 721 | 4: kaiser-bessel 722 | 5: 7 term blackman-harris 723 | 6: flat top 724 | 7: rectangular 725 | 8: Bartlett 726 | 9: Bartlett-Hann 727 | 10: Nuttall 728 | 11: Blackman-Nuttall 729 | 730 | sbeta, tbeta: Sets the beta value for the kaiser-bessel window type. 731 | sbeta goes with swin, tbeta goes with twin. 732 | Not used unless the corresponding window value is set to 4. 733 | 734 | zmean: Controls whether the window mean is subtracted out (zero'd) 735 | prior to filtering in the frequency domain. 736 | 737 | f0beta: Power term in ftype=0. 738 | 739 | nlocation: Currently not implemented. 740 | 741 | slocation/ssx/ssy/sst: Used to specify functions of sigma based on frequency. 742 | Check the original documentation for details. 743 | 744 | Note that in the current implementation, 745 | "slocation = [(0.0, 1.0), (1.0, 10.0)]" 746 | is equivalent to 747 | "slocation = [0.0, 1.0, 1.0, 10.0]" 748 | 749 | ssystem: Method of sigma computation. 750 | Check the original documentation for details. 751 | 752 | planes: Sets which planes will be processed. 753 | Any unprocessed planes will be simply copied. 754 | 755 | backend: Backend implementation to use. 756 | All available backends can be found in the dfttest2.Backend "namespace": 757 | dfttest2.Backend.{CPU, cuFFT, NVRTC, GCC, hipFFT, HIPRTC} 758 | 759 | The CPU, NVRTC, HIPRTC and GCC backends require sbsize=16. 760 | The cuFFT and NVRTC backends require a CUDA-enabled system. 761 | The hipFFT and HIPRTC backends require a HIP-enabled (AMD) system.
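        A minimal usage sketch (illustrative only; it assumes VapourSynth is
        available and that at least one dfttest2 plugin backend is installed):

            import vapoursynth as vs
            from dfttest2 import DFTTest, Backend

            src = vs.core.std.BlankClip(format=vs.YUV420P8)  # stand-in for a real source clip
            flt = DFTTest(src, sigma=8.0, tbsize=3, backend=Backend.CPU())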
762 | 763 | Speed: NVRTC == HIPRTC >> cuFFT > hipFFT > CPU == GCC 764 | """ 765 | 766 | if ( 767 | not isinstance(clip, vs.VideoNode) or 768 | clip.width == 0 or 769 | clip.height == 0 or 770 | clip.format is None or 771 | (clip.format.sample_type == vs.INTEGER and clip.format.bits_per_sample > 16) or 772 | (clip.format.sample_type == vs.FLOAT and clip.format.bits_per_sample != 32) 773 | ): 774 | raise ValueError("only constant format 8-16 bit integer and 32 bit float input supported") 775 | 776 | if ftype < 0 or ftype > 4: 777 | raise ValueError("ftype must be 0, 1, 2, 3, or 4") 778 | 779 | if sbsize < 1: 780 | raise ValueError("sbsize must be greater than or equal to 1") 781 | 782 | if smode != 1: 783 | raise ValueError('"smode" must be 1') 784 | 785 | if sosize > sbsize // 2 and (sbsize % (sbsize - sosize) != 0): 786 | raise ValueError("spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize") 787 | 788 | if tbsize < 1: 789 | raise ValueError('"tbsize" must be at least 1') 790 | 791 | if swin < 0 or swin > 11: 792 | raise ValueError("swin must be between 0 and 11 (inclusive)") 793 | 794 | if twin < 0 or twin > 11: 795 | raise ValueError("twin must be between 0 and 11 (inclusive)") 796 | 797 | if nlocation is not None: 798 | raise ValueError('"nlocation" must be None') 799 | 800 | if slocation and len(slocation) % 2 != 0: 801 | raise ValueError("number of elements in slocation must be a multiple of 2") 802 | 803 | if ssx and len(ssx) % 2 != 0: 804 | raise ValueError("number of elements in ssx must be a multiple of 2") 805 | 806 | if ssy and len(ssy) % 2 != 0: 807 | raise ValueError("number of elements in ssy must be a multiple of 2") 808 | 809 | if sst and len(sst) % 2 != 0: 810 | raise ValueError("number of elements in sst must be a multiple of 2") 811 | 812 | if ssystem < 0 or ssystem > 1: 813 | raise ValueError("ssystem must be 0 or 1") 814 | 815 | def norm(x: float) -> float: 816 | if slocation is not None and ssystem == 1: 817 | return x 818 | elif tbsize == 1: 819 | return math.sqrt(x) 820 | else: 821 | return x ** (1 / 3) 822 | 823 | _sigma: typing.Union[float, typing.Sequence[typing.Callable[[float], float]]] 824 | 825 | if slocation is not None: 826 | _sigma = [to_func(flatten(slocation), norm, sigma)] * 3 827 | elif any(ss is not None for ss in (ssx, ssy, sst)): 828 | _sigma = [to_func(flatten(ss), norm, sigma) for ss in (ssx, ssy, sst)] 829 | else: 830 | _sigma = sigma 831 | 832 | return DFTTest2( 833 | clip = clip, 834 | ftype = ftype, 835 | sigma = _sigma, 836 | sigma2 = sigma2, 837 | pmin = pmin, 838 | pmax = pmax, 839 | sbsize = sbsize, 840 | sosize = sosize, 841 | tbsize = tbsize, 842 | swin = swin, 843 | twin = twin, 844 | sbeta = sbeta, 845 | tbeta = tbeta, 846 | zmean = zmean, 847 | f0beta = f0beta, 848 | ssystem = ssystem, 849 | planes = planes, 850 | backend = select_backend(backend, sbsize, tbsize) 851 | ) 852 | -------------------------------------------------------------------------------- /gcc_source/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(dfttest2_gcc MODULE source.cpp) 2 | 3 | set_target_properties(dfttest2_gcc PROPERTIES 4 | CXX_EXTENSIONS OFF 5 | CXX_STANDARD 20 6 | CXX_STANDARD_REQUIRED ON 7 | ) 8 | 9 | target_include_directories(dfttest2_gcc PRIVATE ${VCL_HOME}) 10 | 11 | if(PKG_CONFIG_FOUND AND VS_FOUND) 12 | target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIRS}) 13 | install(TARGETS dfttest2_gcc LIBRARY DESTINATION ${install_dir}) 14 | else() 
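# Fallback when pkg-config did not locate VapourSynth: use the user-supplied
# VS_INCLUDE_DIR and install into a plain "lib" directory.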
15 | target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIR}) 16 | install(TARGETS dfttest2_gcc LIBRARY DESTINATION lib) 17 | endif() 18 | 19 | target_include_directories(dfttest2_gcc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) 20 | 21 | 22 | add_library(getframe_impl OBJECT getframe_impl.cpp) 23 | 24 | set_target_properties(getframe_impl PROPERTIES 25 | CXX_EXTENSIONS OFF 26 | CXX_STANDARD 20 27 | CXX_STANDARD_REQUIRED ON 28 | ) 29 | 30 | target_include_directories(getframe_impl PRIVATE ${VCL_HOME}) 31 | 32 | if(PKG_CONFIG_FOUND AND VS_FOUND) 33 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIRS}) 34 | else() 35 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIR}) 36 | endif() 37 | 38 | target_link_libraries(dfttest2_gcc PRIVATE getframe_impl) 39 | -------------------------------------------------------------------------------- /gcc_source/dfttest2_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef DFTTEST2_CPU_H 2 | #define DFTTEST2_CPU_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | 16 | static inline void vs_aligned_free_float(float * ptr) { 17 | vs_aligned_free(static_cast(ptr)); 18 | } 19 | 20 | 21 | struct DFTTestThreadData { 22 | uint8_t * padded; // shape: (pad_height, pad_width) 23 | float * padded2; // shape: (pad_height, pad_width) 24 | }; 25 | 26 | 27 | struct DFTTestData { 28 | VSNodeRef * node; 29 | int radius; 30 | int block_size; 31 | int block_step; 32 | std::array process; 33 | bool zero_mean; 34 | std::unique_ptr window { nullptr, &vs_aligned_free_float }; 35 | std::unique_ptr window_freq { nullptr, &vs_aligned_free_float }; 36 | std::unique_ptr sigma { nullptr, &vs_aligned_free_float }; 37 | int filter_type; 38 | float sigma2; 39 | float pmin; 40 | float pmax; 41 | 42 | std::atomic num_uninitialized_threads; 43 | std::unordered_map thread_data; 44 | std::shared_mutex thread_data_lock; 45 | }; 46 | 47 | extern const VSFrameRef *VS_CC DFTTestGetFrame( 48 | int n, int activationReason, void **instanceData, void **frameData, 49 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 50 | ) noexcept; 51 | 52 | #endif // DFTTEST2_CPU_H 53 | -------------------------------------------------------------------------------- /gcc_source/getframe_impl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "dfttest2_cpu.h" 9 | #include "kernel.hpp" 10 | 11 | 12 | typedef unsigned char Vec16uc __attribute__((__vector_size__(16), __aligned__(16))); 13 | typedef unsigned char Vec16uc_u __attribute__((__vector_size__(16), __aligned__(1))); 14 | typedef unsigned short Vec16us __attribute__((__vector_size__(32), __aligned__(32))); 15 | typedef unsigned short Vec16us_u __attribute__((__vector_size__(32), __aligned__(1))); 16 | typedef float Vec16f_u __attribute__((__vector_size__(64), __aligned__(1))); 17 | 18 | 19 | static inline Vec16uc __attribute__((__always_inline__)) load_16uc(const unsigned char * p) { 20 | struct loadu { 21 | Vec16uc_u v; 22 | } __attribute__((__packed__, __may_alias__)); 23 | 24 | return ((const struct loadu*) p)->v; 25 | } 26 | 27 | 28 | static inline Vec16us __attribute__((__always_inline__)) load_16us(const unsigned short * p) { 29 | struct loadu { 30 | Vec16us_u v; 31 | } __attribute__((__packed__, __may_alias__)); 32 | 33 | return ((const struct loadu*) 
p)->v; 34 | } 35 | 36 | 37 | static inline Vec16f __attribute__((__always_inline__)) load_16f(const float * p) { 38 | struct loadu_16f { 39 | Vec16f_u v; 40 | } __attribute__((__packed__, __may_alias__)); 41 | 42 | return ((const struct loadu_16f*) p)->v; 43 | } 44 | 45 | 46 | static inline void __attribute__((__always_inline__)) store_16f(float * p, Vec16f a) { 47 | struct storeu_ps { 48 | Vec16f_u v; 49 | } __attribute__((__packed__, __may_alias__)); 50 | 51 | ((struct storeu_ps*) p)->v = a; 52 | } 53 | 54 | 55 | static inline int calc_pad_size(int size, int block_size, int block_step) { 56 | return ( 57 | size 58 | + ((size % block_size) ? block_size - size % block_size : 0) 59 | + std::max(block_size - block_step, block_step) * 2 60 | ); 61 | } 62 | 63 | 64 | static inline int calc_pad_num(int size, int block_size, int block_step) { 65 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 66 | } 67 | 68 | 69 | template 70 | static inline void reflection_padding_impl( 71 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 72 | const T * VS_RESTRICT src, // shape: (height, stride) 73 | int width, int height, int stride, 74 | int block_size, int block_step 75 | ) { 76 | 77 | int pad_width = calc_pad_size(width, block_size, block_step); 78 | int pad_height = calc_pad_size(height, block_size, block_step); 79 | 80 | int offset_y = (pad_height - height) / 2; 81 | int offset_x = (pad_width - width) / 2; 82 | 83 | vs_bitblt( 84 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 85 | src, stride * sizeof(T), 86 | width * sizeof(T), height 87 | ); 88 | 89 | // copy left and right regions 90 | for (int y = offset_y; y < offset_y + height; y++) { 91 | auto dst_line = &dst[y * pad_width]; 92 | 93 | for (int x = 0; x < offset_x; x++) { 94 | dst_line[x] = dst_line[offset_x * 2 - x]; 95 | } 96 | 97 | for (int x = offset_x + width; x < pad_width; x++) { 98 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 99 | } 100 | } 101 | 102 | // copy top region 103 | for (int y = 0; y < offset_y; y++) { 104 | std::memcpy( 105 | &dst[y * pad_width], 106 | &dst[(offset_y * 2 - y) * pad_width], 107 | pad_width * sizeof(T) 108 | ); 109 | } 110 | 111 | // copy bottom region 112 | for (int y = offset_y + height; y < pad_height; y++) { 113 | std::memcpy( 114 | &dst[y * pad_width], 115 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 116 | pad_width * sizeof(T) 117 | ); 118 | } 119 | } 120 | 121 | 122 | static inline void reflection_padding( 123 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 124 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 125 | int width, int height, int stride, 126 | int block_size, int block_step, 127 | int bytes_per_sample 128 | ) { 129 | 130 | if (bytes_per_sample == 1) { 131 | reflection_padding_impl( 132 | static_cast(dst), 133 | static_cast(src), 134 | width, height, stride, 135 | block_size, block_step 136 | ); 137 | } else if (bytes_per_sample == 2) { 138 | reflection_padding_impl( 139 | reinterpret_cast(dst), 140 | reinterpret_cast(src), 141 | width, height, stride, 142 | block_size, block_step 143 | ); 144 | } else if (bytes_per_sample == 4) { 145 | reflection_padding_impl( 146 | reinterpret_cast(dst), 147 | reinterpret_cast(src), 148 | width, height, stride, 149 | block_size, block_step 150 | ); 151 | } 152 | } 153 | 154 | 155 | static inline void load_block( 156 | Vec16f * VS_RESTRICT block, 157 | const uint8_t * VS_RESTRICT shifted_src, 158 | int radius, 159 | int block_size, 160 | int 
block_step, 161 | int width, 162 | int height, 163 | const Vec16f * VS_RESTRICT window, 164 | int bits_per_sample 165 | ) { 166 | 167 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 168 | if (bits_per_sample == 32) { 169 | scale = 255.0f; 170 | } 171 | 172 | int bytes_per_sample = (bits_per_sample + 7) / 8; 173 | 174 | assert(block_size == 16); 175 | block_size = 16; // unsafe 176 | 177 | int offset_x = calc_pad_size(width, block_size, block_step); 178 | int offset_y = calc_pad_size(height, block_size, block_step); 179 | 180 | if (bytes_per_sample == 1) { 181 | for (int i = 0; i < 2 * radius + 1; i++) { 182 | for (int j = 0; j < block_size; j++) { 183 | auto vec_input = load_16uc((const uint8_t *) shifted_src + (i * offset_y + j) * offset_x); 184 | auto vec_input_f = __builtin_convertvector(vec_input, Vec16f); 185 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 186 | } 187 | } 188 | } 189 | if (bytes_per_sample == 2) { 190 | for (int i = 0; i < 2 * radius + 1; i++) { 191 | for (int j = 0; j < block_size; j++) { 192 | auto vec_input = load_16us((const uint16_t *) shifted_src + (i * offset_y + j) * offset_x); 193 | auto vec_input_f = __builtin_convertvector(vec_input, Vec16f); 194 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 195 | } 196 | } 197 | } 198 | if (bytes_per_sample == 4) { 199 | for (int i = 0; i < 2 * radius + 1; i++) { 200 | for (int j = 0; j < block_size; j++) { 201 | auto vec_input_f = load_16f((const float *) shifted_src + (i * offset_y + j) * offset_x); 202 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 203 | } 204 | } 205 | } 206 | } 207 | 208 | 209 | static inline void store_block( 210 | float * VS_RESTRICT shifted_dst, 211 | const Vec16f * VS_RESTRICT shifted_block, 212 | int block_size, 213 | int block_step, 214 | int width, 215 | int height, 216 | const Vec16f * VS_RESTRICT shifted_window 217 | ) { 218 | 219 | assert(block_size == 16); 220 | block_size = 16; // unsafe 221 | 222 | for (int i = 0; i < block_size; i++) { 223 | Vec16f acc = load_16f((const float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 224 | acc = FMA(shifted_block[i], shifted_window[i], acc); 225 | store_16f((float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step)), acc); 226 | } 227 | } 228 | 229 | 230 | static inline void store_frame( 231 | uint8_t * VS_RESTRICT dst, 232 | const float * VS_RESTRICT shifted_src, 233 | int width, 234 | int height, 235 | int dst_stride, 236 | int src_stride, 237 | int bits_per_sample 238 | ) { 239 | 240 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 241 | if (bits_per_sample == 32) { 242 | scale = 255.0f; 243 | } 244 | 245 | int bytes_per_sample = (bits_per_sample + 7) / 8; 246 | int peak = (1 << bits_per_sample) - 1; 247 | 248 | if (bytes_per_sample == 1) { 249 | auto dstp = (uint8_t *) dst; 250 | for (int y = 0; y < height; y++) { 251 | for (int x = 0; x < width; x++) { 252 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 253 | dstp[y * dst_stride + x] = static_cast(clamped); 254 | } 255 | } 256 | } 257 | if (bytes_per_sample == 2) { 258 | auto dstp = (uint16_t *) dst; 259 | for (int y = 0; y < height; y++) { 260 | for (int x = 0; x < width; x++) { 261 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 262 | dstp[y * dst_stride + x] = static_cast(clamped); 263 | } 264 | } 265 | } 266 | if (bytes_per_sample == 
4) { 267 | auto dstp = (float *) dst; 268 | for (int y = 0; y < height; y++) { 269 | for (int x = 0; x < width; x++) { 270 | dstp[y * dst_stride + x] = shifted_src[y * src_stride + x] / scale; 271 | } 272 | } 273 | } 274 | } 275 | 276 | 277 | const VSFrameRef * VS_CC 278 | #ifndef HAS_DISPATCH 279 | DFTTestGetFrame 280 | #else // HAS_DISPATCH 281 | DFTTEST_GETFRAME_NAME 282 | #endif // HAS_DISPATCH 283 | ( 284 | int n, int activationReason, void **instanceData, void **frameData, 285 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 286 | ) noexcept { 287 | 288 | auto d = static_cast(*instanceData); 289 | 290 | if (activationReason == arInitial) { 291 | int start = std::max(n - d->radius, 0); 292 | auto vi = vsapi->getVideoInfo(d->node); 293 | int end = std::min(n + d->radius, vi->numFrames - 1); 294 | for (int i = start; i <= end; i++) { 295 | vsapi->requestFrameFilter(i, d->node, frameCtx); 296 | } 297 | return nullptr; 298 | } else if (activationReason != arAllFramesReady) { 299 | return nullptr; 300 | } 301 | 302 | auto vi = vsapi->getVideoInfo(d->node); 303 | 304 | DFTTestThreadData thread_data; 305 | 306 | auto thread_id = std::this_thread::get_id(); 307 | if (d->num_uninitialized_threads.load(std::memory_order_acquire) == 0) { 308 | const auto & const_data = d->thread_data; 309 | thread_data = const_data.at(thread_id); 310 | } else { 311 | bool initialized = true; 312 | 313 | d->thread_data_lock.lock_shared(); 314 | try { 315 | const auto & const_data = d->thread_data; 316 | thread_data = const_data.at(thread_id); 317 | } catch (const std::out_of_range &) { 318 | initialized = false; 319 | } 320 | d->thread_data_lock.unlock_shared(); 321 | 322 | if (!initialized) { 323 | auto padded_size = ( 324 | (2 * d->radius + 1) * 325 | calc_pad_size(vi->height, d->block_size, d->block_step) * 326 | calc_pad_size(vi->width, d->block_size, d->block_step) * 327 | vi->format->bytesPerSample 328 | ); 329 | 330 | thread_data.padded = static_cast(std::malloc(padded_size)); 331 | thread_data.padded2 = static_cast(std::malloc( 332 | calc_pad_size(vi->height, d->block_size, d->block_step) * 333 | calc_pad_size(vi->width, d->block_size, d->block_step) * 334 | sizeof(float) 335 | )); 336 | 337 | { 338 | std::lock_guard _ { d->thread_data_lock }; 339 | d->thread_data.emplace(thread_id, thread_data); 340 | } 341 | 342 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order_release); 343 | } 344 | } 345 | 346 | std::vectorfreeFrame)>> src_frames; 347 | src_frames.reserve(2 * d->radius + 1); 348 | for (int i = n - d->radius; i <= n + d->radius; i++) { 349 | src_frames.emplace_back( 350 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 351 | vsapi->freeFrame 352 | ); 353 | } 354 | 355 | auto & src_center_frame = src_frames[d->radius]; 356 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 357 | 358 | const VSFrameRef * fr[] { 359 | d->process[0] ? nullptr : src_center_frame.get(), 360 | d->process[1] ? nullptr : src_center_frame.get(), 361 | d->process[2] ? 
nullptr : src_center_frame.get() 362 | }; 363 | const int pl[] { 0, 1, 2 }; 364 | std::unique_ptrfreeFrame)> dst_frame { 365 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 366 | vsapi->freeFrame 367 | }; 368 | 369 | for (int plane = 0; plane < format->numPlanes; plane++) { 370 | if (!d->process[plane]) { 371 | continue; 372 | } 373 | 374 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 375 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 376 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 377 | 378 | int padded_size_spatial = ( 379 | calc_pad_size(height, d->block_size, d->block_step) * 380 | calc_pad_size(width, d->block_size, d->block_step) 381 | ); 382 | 383 | std::memset(thread_data.padded2, 0, 384 | calc_pad_size(height, d->block_size, d->block_step) * 385 | calc_pad_size(width, d->block_size, d->block_step) * 386 | sizeof(float) 387 | ); 388 | 389 | for (int i = 0; i < 2 * d->radius + 1; i++) { 390 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 391 | reflection_padding( 392 | &thread_data.padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 393 | srcp, 394 | width, height, stride, 395 | d->block_size, d->block_step, 396 | vi->format->bytesPerSample 397 | ); 398 | } 399 | 400 | for (int i = 0; i < calc_pad_num(height, d->block_size, d->block_step); i++) { 401 | for (int j = 0; j < calc_pad_num(width, d->block_size, d->block_step); j++) { 402 | assert(d->block_size == 16); 403 | constexpr int block_size = 16; 404 | 405 | Vec16f block[7 * block_size * 2]; 406 | 407 | int offset_x = calc_pad_size(width, d->block_size, d->block_step); 408 | 409 | load_block( 410 | block, 411 | &thread_data.padded[(i * offset_x + j) * d->block_step * vi->format->bytesPerSample], 412 | d->radius, d->block_size, d->block_step, 413 | width, height, 414 | reinterpret_cast(d->window.get()), 415 | vi->format->bitsPerSample 416 | ); 417 | 418 | fused( 419 | block, 420 | reinterpret_cast(d->sigma.get()), 421 | d->sigma2, 422 | d->pmin, 423 | d->pmax, 424 | d->filter_type, 425 | d->zero_mean, 426 | reinterpret_cast(d->window_freq.get()), 427 | d->radius 428 | ); 429 | 430 | store_block( 431 | &thread_data.padded2[(i * offset_x + j) * d->block_step], 432 | &block[d->radius * block_size * 2], 433 | block_size, 434 | d->block_step, 435 | width, 436 | height, 437 | reinterpret_cast(&d->window[d->radius * block_size * 2 * 16]) 438 | ); 439 | } 440 | } 441 | 442 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 443 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 444 | int offset_y = (pad_height - height) / 2; 445 | int offset_x = (pad_width - width) / 2; 446 | 447 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 448 | store_frame( 449 | dstp, 450 | &thread_data.padded2[(offset_y * pad_width + offset_x)], 451 | width, 452 | height, 453 | stride, 454 | pad_width, 455 | vi->format->bitsPerSample 456 | ); 457 | } 458 | 459 | return dst_frame.release(); 460 | } 461 | -------------------------------------------------------------------------------- /gcc_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #if __cpp_lib_math_constants 12 | #include 13 | #endif // __cpp_lib_math_constants 14 | #include 15 | #include 16 | #include 17 | #include 18 
| #include 19 | 20 | #include 21 | #include 22 | 23 | #include "dfttest2_cpu.h" 24 | #include "kernel.hpp" 25 | 26 | #include // generated by cmake, defines "VERSION" 27 | 28 | 29 | template 30 | #if __cpp_concepts 31 | requires 32 | (std::is_same_v || std::is_same_v>) 33 | #endif // __cpp_concepts 34 | static void dft( 35 | std::complex * VS_RESTRICT dst, 36 | const T_in * VS_RESTRICT src, 37 | int n, 38 | int stride 39 | ) { 40 | #if __cpp_lib_math_constants 41 | const auto pi = std::numbers::pi_v; 42 | #else // __cpp_lib_math_constants 43 | const auto pi = static_cast(M_PI); 44 | #endif // __cpp_lib_math_constants 45 | 46 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 47 | for (int i = 0; i < out_num; i++) { 48 | std::complex sum {}; 49 | for (int j = 0; j < n; j++) { 50 | auto imag = -2 * i * j * pi / n; 51 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 52 | sum += src[j * stride] * weight; 53 | } 54 | dst[i * stride] = sum; 55 | } 56 | } 57 | 58 | 59 | static void VS_CC DFTTestInit( 60 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 61 | VSCore *core, const VSAPI *vsapi 62 | ) noexcept { 63 | 64 | auto d = static_cast(*instanceData); 65 | 66 | auto vi = vsapi->getVideoInfo(d->node); 67 | vsapi->setVideoInfo(vi, 1, node); 68 | } 69 | 70 | 71 | static void VS_CC DFTTestFree( 72 | void *instanceData, VSCore *core, const VSAPI *vsapi 73 | ) noexcept { 74 | 75 | auto d = static_cast(instanceData); 76 | 77 | vsapi->freeNode(d->node); 78 | 79 | for (const auto & [_, thread_data] : d->thread_data) { 80 | std::free(thread_data.padded2); 81 | std::free(thread_data.padded); 82 | } 83 | 84 | delete d; 85 | } 86 | 87 | 88 | static void VS_CC DFTTestCreate( 89 | const VSMap *in, VSMap *out, void *userData, 90 | VSCore *core, const VSAPI *vsapi 91 | ) noexcept { 92 | 93 | auto d = std::make_unique(); 94 | 95 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 96 | 97 | auto set_error = [vsapi, out, &d](const char * error_message) -> void { 98 | vsapi->freeNode(d->node); 99 | vsapi->setError(out, error_message); 100 | return ; 101 | }; 102 | 103 | auto vi = vsapi->getVideoInfo(d->node); 104 | if (!isConstantFormat(vi)) { 105 | return set_error("only constant format input is supported"); 106 | } 107 | if (vi->format->sampleType == stInteger && vi->format->bytesPerSample > 2) { 108 | return set_error("only 8-16 bit integer format input is supported"); 109 | } 110 | if (vi->format->sampleType == stFloat && vi->format->bitsPerSample != 32) { 111 | return set_error("only 32-bit float format input is supported"); 112 | } 113 | 114 | int error; 115 | 116 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 117 | if (error) { 118 | d->radius = 0; 119 | } 120 | 121 | if (d->radius < 0 || d->radius > 3) { 122 | return set_error("\"radius\" must be in [0, 1, 2, 3]"); 123 | } 124 | 125 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 126 | if (error) { 127 | d->block_size = 16; 128 | } 129 | 130 | if (d->block_size != 16) { 131 | return set_error("\"block_size\" must be 16"); 132 | } 133 | 134 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 135 | if (error) { 136 | d->block_step = d->block_size; 137 | } 138 | 139 | int num_planes_args = vsapi->propNumElements(in, "planes"); 140 | d->process.fill(num_planes_args <= 0); 141 | for (int i = 0; i < num_planes_args; ++i) { 142 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 143 | 144 | if (plane < 0 || plane >= 
vi->format->numPlanes) { 145 | return set_error("plane index out of range"); 146 | } 147 | 148 | if (d->process[plane]) { 149 | return set_error("plane specified twice"); 150 | } 151 | 152 | d->process[plane] = true; 153 | } 154 | 155 | { 156 | auto ptr = vs_aligned_malloc( 157 | (2 * d->radius + 1) * d->block_size * d->block_size * sizeof(float), 158 | 64 159 | ); 160 | if (ptr == nullptr) { 161 | return set_error("alloc error"); 162 | } 163 | d->window.reset(ptr); 164 | } 165 | { 166 | auto window = vsapi->propGetFloatArray(in, "window", nullptr); 167 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size * d->block_size / 16; i++) { 168 | for (int j = 0; j < 16; j++) { 169 | d->window[i * 16 + j] = static_cast(window[i * 16 + j]); 170 | } 171 | } 172 | } 173 | 174 | { 175 | auto ptr = vs_aligned_malloc( 176 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * sizeof(float), 177 | 64 178 | ); 179 | if (ptr == nullptr) { 180 | return set_error("alloc error"); 181 | } 182 | d->sigma.reset(ptr); 183 | } 184 | { 185 | auto sigma = vsapi->propGetFloatArray(in, "sigma", nullptr); 186 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 187 | float sigma_padded[16] {}; 188 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 189 | sigma_padded[j] = static_cast(sigma[i * (d->block_size / 2 + 1) + j]); 190 | } 191 | for (int j = 0; j < 16; j++) { 192 | d->sigma[i * 16 + j] = sigma_padded[j]; 193 | } 194 | } 195 | } 196 | 197 | d->sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, nullptr)); 198 | d->pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, nullptr)); 199 | d->pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, nullptr)); 200 | 201 | d->filter_type = static_cast(vsapi->propGetInt(in, "filter_type", 0, nullptr)); 202 | 203 | d->zero_mean = !!vsapi->propGetInt(in, "zero_mean", 0, &error); 204 | if (error) { 205 | d->zero_mean = true; 206 | } 207 | if (d->zero_mean) { 208 | { 209 | auto ptr = vs_aligned_malloc( 210 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * 2 * sizeof(float), 211 | 64 212 | ); 213 | if (ptr == nullptr) { 214 | return set_error("alloc error"); 215 | } 216 | d->window_freq.reset(ptr); 217 | } 218 | auto window_freq = vsapi->propGetFloatArray(in, "window_freq", nullptr); 219 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 220 | float sigma_padded[32] {}; 221 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 222 | sigma_padded[j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2]); 223 | sigma_padded[16 + j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2 + 1]); 224 | } 225 | for (int j = 0; j < 32; j++) { 226 | d->window_freq[i * 2 * 16 + j] = sigma_padded[j]; 227 | } 228 | } 229 | } 230 | 231 | VSCoreInfo info; 232 | vsapi->getCoreInfo2(core, &info); 233 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order_relaxed); 234 | d->thread_data.reserve(info.numThreads); 235 | 236 | vsapi->createFilter( 237 | in, out, "DFTTest", 238 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 239 | fmParallel, 0, d.release(), core 240 | ); 241 | } 242 | 243 | 244 | static void VS_CC RDFT( 245 | const VSMap *in, VSMap *out, void *userData, 246 | VSCore *core, const VSAPI *vsapi 247 | ) noexcept { 248 | 249 | auto set_error = [vsapi, out](const char * error_message) -> void { 250 | vsapi->setError(out, error_message); 251 | }; 252 | 253 | int ndim = vsapi->propNumElements(in, "shape"); 254 | if (ndim != 1 && ndim != 2 && ndim != 3) { 255 | return 
set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 256 | } 257 | 258 | std::array shape {}; 259 | { 260 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 261 | for (int i = 0; i < ndim; i++) { 262 | shape[i] = int64ToIntS(shape_array[i]); 263 | } 264 | } 265 | 266 | int size = 1; 267 | for (int i = 0; i < ndim; i++) { 268 | size *= shape[i]; 269 | } 270 | if (vsapi->propNumElements(in, "data") != size) { 271 | return set_error("cannot reshape array"); 272 | } 273 | 274 | int complex_size = shape[ndim - 1] / 2 + 1; 275 | for (int i = 0; i < ndim - 1; i++) { 276 | complex_size *= shape[i]; 277 | } 278 | 279 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 280 | 281 | auto output = std::make_unique []>(complex_size); 282 | 283 | if (ndim == 1) { 284 | dft(output.get(), input, size, 1); 285 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 286 | } else if (ndim == 2) { 287 | for (int i = 0; i < shape[0]; i++) { 288 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 289 | } 290 | 291 | auto output2 = std::make_unique []>(complex_size); 292 | 293 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 294 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 295 | } 296 | 297 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 298 | } else { 299 | for (int i = 0; i < shape[0] * shape[1]; i++) { 300 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 301 | } 302 | 303 | auto output2 = std::make_unique []>(complex_size); 304 | 305 | for (int i = 0; i < shape[0]; i++) { 306 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 307 | dft( 308 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 309 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 310 | shape[1], 311 | (shape[2] / 2 + 1) 312 | ); 313 | } 314 | } 315 | 316 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 317 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 318 | } 319 | 320 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 321 | } 322 | } 323 | 324 | 325 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 326 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 327 | } 328 | 329 | 330 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 331 | VSConfigPlugin configFunc, 332 | VSRegisterFunction registerFunc, 333 | VSPlugin *plugin 334 | ) { 335 | 336 | configFunc( 337 | "io.github.amusementclub.dfttest2_gcc", 338 | "dfttest2_gcc", 339 | "DFTTest2 (GCC vector extension)", 340 | VAPOURSYNTH_API_VERSION, 1, plugin 341 | ); 342 | 343 | registerFunc( 344 | "DFTTest", 345 | "clip:clip;" 346 | "window:float[];" 347 | "sigma:float[];" 348 | "sigma2:float;" 349 | "pmin:float;" 350 | "pmax:float;" 351 | "filter_type:int;" 352 | "radius:int:opt;" 353 | "block_size:int:opt;" 354 | "block_step:int:opt;" 355 | "zero_mean:int:opt;" 356 | "window_freq:float[]:opt;" 357 | "planes:int[]:opt;", 358 | DFTTestCreate, nullptr, plugin 359 | ); 360 | 361 | registerFunc( 362 | "RDFT", 363 | "data:float[];" 364 | "shape:int[];", 365 | RDFT, nullptr, plugin 366 | ); 367 | 368 | registerFunc( 369 | "Version", 370 | "", 371 | Version, nullptr, plugin 372 | ); 373 | } 374 | -------------------------------------------------------------------------------- /hip_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define 
KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // IN_PLACE 13 | // WARPS_PER_BLOCK 14 | // WARP_SIZE 15 | // TYPE 16 | // SCALE 17 | // PEAK (optional) 18 | 19 | #if ZERO_MEAN 20 | // __device__ const float window_freq[]; // frequency response of the window 21 | #endif // ZERO_MEAN 22 | 23 | __device__ 24 | static int calc_pad_size(int size, int block_size, int block_step) { 25 | return size + ((size % block_size) ? block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 26 | } 27 | 28 | __device__ 29 | static int calc_pad_num(int size, int block_size, int block_step) { 30 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 31 | } 32 | 33 | __device__ 34 | static float to_float(TYPE x) { 35 | return static_cast(x) * static_cast(SCALE); 36 | } 37 | 38 | __device__ 39 | static TYPE from_float(float x) { 40 | #ifdef PEAK 41 | x /= static_cast(SCALE); 42 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 43 | return static_cast(__float2int_rz(x)); 44 | #else // PEAK // only integral types define it 45 | return static_cast(x / static_cast(SCALE)); 46 | #endif // PEAK 47 | } 48 | 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void im2col( 53 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 54 | float * __restrict__ dstp, 55 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 56 | int width, 57 | int height 58 | ) { 59 | 60 | int radius = static_cast(RADIUS); 61 | int block_size = static_cast(BLOCK_SIZE); 62 | int padded_block_size = IN_PLACE ? 
(block_size / 2 + 1) * 2 : block_size; 63 | int block_step = static_cast(BLOCK_STEP); 64 | 65 | int horizontal_num = calc_pad_num(width, block_size, block_step); 66 | int vertical_num = calc_pad_num(height, block_size, block_step); 67 | int horizontal_size = calc_pad_size(width, block_size, block_step); 68 | int vertical_size = calc_pad_size(height, block_size, block_step); 69 | int num_blocks = vertical_num * horizontal_num; 70 | 71 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 72 | int ix = i % horizontal_num; 73 | int iy = i / horizontal_num; 74 | auto dst = &dstp[i * (2 * radius + 1) * block_size * padded_block_size]; 75 | for (int j = 0; j < 2 * radius + 1; j++) { 76 | auto src = &srcp[(j * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 77 | for (int k = threadIdx.x % WARP_SIZE; k < block_size * block_size; k += WARP_SIZE) { 78 | int kx = k % block_size; 79 | int ky = k / block_size; 80 | float val = to_float(src[ky * horizontal_size + kx]) * window[j * block_size * block_size + k]; 81 | #if IN_PLACE == 1 82 | dst[(j * block_size + k / block_size) * padded_block_size + k % block_size] = val; 83 | #else 84 | dst[j * block_size * block_size + k] = val; 85 | #endif 86 | } 87 | } 88 | } 89 | } 90 | 91 | extern "C" 92 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 93 | __global__ 94 | void frequency_filtering( 95 | float2 * data, 96 | int num_blocks 97 | ) { 98 | 99 | int radius = static_cast(RADIUS); 100 | int block_size_1d = static_cast(BLOCK_SIZE); 101 | 102 | // each warp is responsible for a single block 103 | // assume that blockDim.x % WARP_SIZE == 0 104 | 105 | int block_size_x = block_size_1d / 2 + 1; 106 | int block_size_2d = block_size_1d * block_size_x; 107 | int block_size_3d = (2 * radius + 1) * block_size_2d; 108 | 109 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 110 | #if ZERO_MEAN 111 | float gf; 112 | if (threadIdx.x % WARP_SIZE == 0) { 113 | gf = data[i * block_size_3d].x / window_freq[0]; 114 | } 115 | gf = __shfl(gf, 0); 116 | #endif // ZERO_MEAN 117 | 118 | for (int j = threadIdx.x % WARP_SIZE; j < block_size_3d; j += WARP_SIZE) { 119 | float2 local_data = data[i * block_size_3d + j]; 120 | 121 | #if ZERO_MEAN 122 | // remove mean 123 | float val1 = gf * window_freq[j * 2]; 124 | float val2 = gf * window_freq[j * 2 + 1]; 125 | local_data.x -= val1; 126 | local_data.y -= val2; 127 | #endif // ZERO_MEAN 128 | 129 | filter( 130 | local_data, 131 | j % block_size_x, 132 | (j % block_size_2d) / block_size_x, 133 | (j % block_size_3d) / block_size_2d 134 | ); 135 | 136 | #if ZERO_MEAN 137 | // add mean 138 | local_data.x += val1; 139 | local_data.y += val2; 140 | #endif // ZERO_MEAN 141 | 142 | data[i * block_size_3d + j] = local_data; 143 | } 144 | } 145 | } 146 | 147 | extern "C" 148 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 149 | __global__ 150 | void col2im( 151 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 152 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 153 | const float * __restrict__ src, 154 | int width, 155 | int height 156 | ) { 157 | 158 | int radius = static_cast(RADIUS); 159 | int block_size = static_cast(BLOCK_SIZE); 160 | int padded_block_size = IN_PLACE ? 
(block_size / 2 + 1) * 2 : block_size; 161 | int block_step = static_cast(BLOCK_STEP); 162 | 163 | // each thread is responsible for a single pixel 164 | int horizontal_size = calc_pad_size(width, block_size, block_step); 165 | int horizontal_num = calc_pad_num(width, block_size, block_step); 166 | int vertical_size = calc_pad_size(height, block_size, block_step); 167 | int vertical_num = calc_pad_num(height, block_size, block_step); 168 | int pad_x = (horizontal_size - width) / 2; 169 | int pad_y = (vertical_size - height) / 2; 170 | 171 | int x = blockIdx.x * blockDim.x + threadIdx.x; 172 | int y = blockIdx.y * blockDim.y + threadIdx.y; 173 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 174 | return ; 175 | } 176 | 177 | float sum {}; 178 | 179 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 180 | int i2 = min(y / block_step, vertical_num - 1); 181 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 182 | int j2 = min(x / block_step, horizontal_num - 1); 183 | 184 | for (int i = i1; i <= i2; i++) { 185 | int offset_y = y - i * block_step; 186 | for (int j = j1; j <= j2; j++) { 187 | int offset_x = x - j * block_step; 188 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * padded_block_size + offset_x; 189 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 190 | sum += src[src_offset] * window[window_offset]; 191 | } 192 | } 193 | 194 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 195 | } 196 | )"""; 197 | 198 | #endif // KERNEL_HPP 199 | -------------------------------------------------------------------------------- /hiprtc_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // WARPS_PER_BLOCK 13 | // WARP_SIZE 14 | // TYPE 15 | // SCALE 16 | // PEAK (optional) 17 | 18 | #if ZERO_MEAN 19 | // __device__ const float window_freq[]; // frequency response of the window 20 | #endif // ZERO_MEAN 21 | 22 | __device__ 23 | static int calc_pad_size(int size, int block_size, int block_step) { 24 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 25 | } 26 | 27 | __device__ 28 | static int calc_pad_num(int size, int block_size, int block_step) { 29 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 30 | } 31 | 32 | __device__ 33 | static float to_float(TYPE x) { 34 | return static_cast(x) * static_cast(SCALE); 35 | } 36 | 37 | __device__ 38 | static TYPE from_float(float x) { 39 | #ifdef PEAK 40 | x /= static_cast(SCALE); 41 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 42 | return static_cast(__float2int_rz(x)); 43 | #else // PEAK // only integral types define it 44 | return static_cast(x / static_cast(SCALE)); 45 | #endif // PEAK 46 | } 47 | 48 | // im2col + rdft + frequency_filtering + irdft 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void fused( 53 | float * __restrict__ dstp, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 54 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 55 | int width, 56 | int height 57 | ) { 58 | 59 | constexpr int radius = static_cast(RADIUS); 60 | constexpr int block_size = static_cast(BLOCK_SIZE); 61 | constexpr int block_step = static_cast(BLOCK_STEP); 62 | 63 | int horizontal_num = calc_pad_num(width, block_size, block_step); 64 | int vertical_num = calc_pad_num(height, block_size, block_step); 65 | int horizontal_size = calc_pad_size(width, block_size, block_step); 66 | int vertical_size = calc_pad_size(height, block_size, block_step); 67 | int num_blocks = vertical_num * horizontal_num; 68 | 69 | constexpr int warp_size = static_cast(WARP_SIZE); 70 | constexpr int warps_per_block = static_cast(WARPS_PER_BLOCK); 71 | constexpr int transpose_stride = (warp_size % block_size == 0) ? 
block_size + 1 : block_size; 72 | __shared__ float2 shared_transpose_buffer[warps_per_block * block_size * transpose_stride]; 73 | 74 | int warp_id = threadIdx.x / warp_size; 75 | int lane_id = threadIdx.x % warp_size; 76 | 77 | if (lane_id >= block_size) { 78 | return; 79 | } 80 | 81 | auto transpose_buffer = &shared_transpose_buffer[warp_id * block_size * transpose_stride]; 82 | 83 | for (int block_id = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; block_id < num_blocks; block_id += gridDim.x * WARPS_PER_BLOCK) { 84 | int ix = block_id % horizontal_num; 85 | int iy = block_id / horizontal_num; 86 | 87 | constexpr int active_mask = (1 << block_size) - 1; 88 | float2 thread_data[(2 * radius + 1) * block_size]; 89 | 90 | // im2col 91 | #pragma unroll 92 | for (int i = 0; i < 2 * radius + 1; i++) { 93 | auto src = &srcp[(i * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 94 | auto local_thread_data = &thread_data[i * block_size]; 95 | #pragma unroll 96 | for (int j = 0; j < block_size; j++) { 97 | ((float *) local_thread_data)[j] = to_float(src[j * horizontal_size + lane_id]) * window[(i * block_size + j) * block_size + lane_id]; 98 | } 99 | } 100 | 101 | // rdft 102 | #pragma unroll 103 | for (int i = 0; i < 2 * radius + 1; i++) { 104 | auto local_thread_data = &thread_data[i * block_size]; 105 | 106 | __syncthreads(); 107 | // transpose store of real data 108 | #pragma unroll 109 | for (int j = 0; j < block_size; j++) { 110 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j]; 111 | } 112 | 113 | __syncthreads(); 114 | // transpose load of real data 115 | #pragma unroll 116 | for (int j = 0; j < block_size; j++) { 117 | ((float *) local_thread_data)[j] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 118 | } 119 | 120 | __syncthreads(); 121 | rdft((float *) local_thread_data); 122 | 123 | // transpose store of complex data 124 | #pragma unroll 125 | for (int j = 0; j < block_size / 2 + 1; j++) { 126 | transpose_buffer[lane_id * transpose_stride + j] = local_thread_data[j]; 127 | } 128 | 129 | __syncthreads(); 130 | if (lane_id < block_size / 2 + 1) { 131 | // transpose load of complex data 132 | #pragma unroll 133 | for (int j = 0; j < block_size; j++) { 134 | local_thread_data[j] = transpose_buffer[j * transpose_stride + lane_id]; 135 | } 136 | 137 | dft((float *) local_thread_data); 138 | } 139 | } 140 | 141 | if (lane_id < block_size / 2 + 1) { 142 | #pragma unroll 143 | for (int i = 0; i < block_size; i++) { 144 | dft<2 * radius + 1>((float *) &thread_data[i], block_size); 145 | } 146 | } 147 | 148 | // frequency_filtering 149 | if (lane_id < block_size / 2 + 1) { 150 | #if ZERO_MEAN 151 | float gf; 152 | if (lane_id == 0) { 153 | gf = thread_data[0].x / window_freq[0]; 154 | } 155 | gf = __shfl(gf, 0); 156 | #endif // ZERO_MEAN 157 | #pragma unroll 158 | for (int i = 0; i < 2 * radius + 1; i++) { 159 | #pragma unroll 160 | for (int j = 0; j < block_size; j++) { 161 | float2 local_data = thread_data[i * block_size + j]; 162 | 163 | #if ZERO_MEAN 164 | // remove mean 165 | float val1 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2]; 166 | float val2 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2 + 1]; 167 | local_data.x -= val1; 168 | local_data.y -= val2; 169 | #endif // ZERO_MEAN 170 | 171 | filter(local_data, lane_id, j, i); 172 | 173 | #if ZERO_MEAN 174 | // add mean 175 | local_data.x += val1; 176 | local_data.y += val2; 177 | 
#endif // ZERO_MEAN 178 | 179 | thread_data[i * block_size + j] = local_data; 180 | } 181 | } 182 | } 183 | 184 | // irdft 185 | if (lane_id < block_size / 2 + 1) { 186 | #pragma unroll 187 | for (int i = 0; i < block_size; i++) { 188 | idft<2 * radius + 1>((float *) &thread_data[i], block_size); 189 | } 190 | } 191 | 192 | // this is not a full 3d irdft, because only a single slice is required 193 | auto local_thread_data = &thread_data[radius * block_size]; 194 | 195 | __syncthreads(); 196 | if (lane_id < block_size / 2 + 1) { 197 | idft((float *) local_thread_data); 198 | 199 | // transpose store of complex data 200 | #pragma unroll 201 | for (int j = 0; j < block_size; j++) { 202 | transpose_buffer[j * transpose_stride + lane_id] = local_thread_data[j]; 203 | } 204 | } 205 | 206 | __syncthreads(); 207 | #pragma unroll 208 | for (int j = 0; j < block_size / 2 + 1; j++) { 209 | // transpose load of complex data 210 | local_thread_data[j].x = transpose_buffer[lane_id * transpose_stride + j].x; 211 | local_thread_data[j].y = transpose_buffer[lane_id * transpose_stride + j].y; 212 | } 213 | 214 | irdft((float *) local_thread_data); 215 | 216 | __syncthreads(); 217 | #pragma unroll 218 | for (int j = 0; j < block_size; j++) { 219 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j == 0 ? j : block_size - j]; 220 | } 221 | 222 | __syncthreads(); 223 | auto local_dst = &dstp[(block_id * (2 * radius + 1) + radius) * block_size * block_size]; 224 | #pragma unroll 225 | for (int j = 0; j < block_size; j++) { 226 | local_dst[j * block_size + lane_id] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 227 | } 228 | } 229 | } 230 | 231 | extern "C" 232 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 233 | __global__ 234 | void col2im( 235 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 236 | const float * __restrict__ src, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 237 | int width, 238 | int height 239 | ) { 240 | 241 | int radius = static_cast(RADIUS); 242 | int block_size = static_cast(BLOCK_SIZE); 243 | int block_step = static_cast(BLOCK_STEP); 244 | 245 | // each thread is responsible for a single pixel 246 | int horizontal_size = calc_pad_size(width, block_size, block_step); 247 | int horizontal_num = calc_pad_num(width, block_size, block_step); 248 | int vertical_size = calc_pad_size(height, block_size, block_step); 249 | int vertical_num = calc_pad_num(height, block_size, block_step); 250 | int pad_x = (horizontal_size - width) / 2; 251 | int pad_y = (vertical_size - height) / 2; 252 | 253 | int x = blockIdx.x * blockDim.x + threadIdx.x; 254 | int y = blockIdx.y * blockDim.y + threadIdx.y; 255 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 256 | return ; 257 | } 258 | 259 | float sum {}; 260 | 261 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 262 | int i2 = min(y / block_step, vertical_num - 1); 263 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 264 | int j2 = min(x / block_step, horizontal_num - 1); 265 | 266 | for (int i = i1; i <= i2; i++) { 267 | int offset_y = y - i * block_step; 268 | for (int j = j1; j <= j2; j++) { 269 | int offset_x = x - j * block_step; 270 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * block_size + offset_x; 271 | auto window_offset = (radius * block_size + offset_y) * 
block_size + offset_x; 272 | sum += src[src_offset] * window[window_offset]; 273 | } 274 | } 275 | 276 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 277 | } 278 | )"""; 279 | 280 | #endif // KERNEL_HPP 281 | -------------------------------------------------------------------------------- /hiprtc_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | #include "dft_kernels.hpp" 30 | #include "kernel.hpp" 31 | 32 | #include // generated by cmake 33 | 34 | // real/complex-input DFT 35 | template 36 | requires 37 | (std::is_same_v || std::is_same_v>) 38 | static void dft( 39 | std::complex * VS_RESTRICT dst, 40 | const T_in * VS_RESTRICT src, 41 | int n, 42 | int stride 43 | ) { 44 | 45 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 46 | for (int i = 0; i < out_num; i++) { 47 | std::complex sum {}; 48 | for (int j = 0; j < n; j++) { 49 | auto imag = -2 * i * j * std::numbers::pi_v / n; 50 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 51 | sum += src[j * stride] * weight; 52 | } 53 | dst[i * stride] = sum; 54 | } 55 | } 56 | 57 | static bool success(hipError_t result) { 58 | return result == hipSuccess; 59 | } 60 | static bool success(hiprtcResult result) { 61 | return result == HIPRTC_SUCCESS; 62 | } 63 | 64 | static const char * get_error(hipError_t error) { 65 | return hipGetErrorString(error); 66 | } 67 | 68 | static const char * get_error(hiprtcResult error) { 69 | return hiprtcGetErrorString(error); 70 | } 71 | 72 | #define showError(expr) show_error_impl(expr, # expr, __LINE__) 73 | template 74 | static void show_error_impl(T result, const char * source, int line_no) { 75 | if (!success(result)) [[unlikely]] { 76 | std::fprintf(stderr, "[%d] %s failed: %s\n", line_no, source, get_error(result)); 77 | } 78 | } 79 | 80 | #define checkError(expr) do { \ 81 | if (auto result = expr; !success(result)) [[unlikely]] { \ 82 | std::ostringstream error; \ 83 | error << '[' << __LINE__ << "] '" # expr "' failed: " << get_error(result); \ 84 | return set_error(error.str().c_str()); \ 85 | } \ 86 | } while (0) 87 | 88 | static void hipStreamDestroyCustom(hipStream_t stream) { 89 | showError(hipStreamDestroy(stream)); 90 | } 91 | 92 | static void hipEventDestroyCustom(hipEvent_t event) { 93 | showError(hipEventDestroy(event)); 94 | } 95 | 96 | static void hipFreeCustom(hipDeviceptr_t p) { 97 | showError(hipFree(p)); 98 | } 99 | 100 | static void hipModuleUnloadCustom(hipModule_t module) { 101 | showError(hipModuleUnload(module)); 102 | } 103 | 104 | static void hiprtcDestroyProgramCustom(hiprtcProgram * program) { 105 | showError(hiprtcDestroyProgram(program)); 106 | } 107 | 108 | struct node_freer { 109 | const VSAPI * & vsapi; 110 | VSNodeRef * node {}; 111 | void release() { 112 | node = nullptr; 113 | } 114 | ~node_freer() { 115 | if (node) { 116 | vsapi->freeNode(node); 117 | } 118 | } 119 | }; 120 | 121 | template 122 | requires 123 | std::default_initializable && 124 | std::is_trivially_copy_assignable_v && 125 | std::convertible_to && 126 | std::invocable 127 | struct Resource { 128 | T data; 129 | 130 | [[nodiscard]] constexpr 
Resource() noexcept = default; 131 | 132 | [[nodiscard]] constexpr Resource(T x) noexcept : data(x) {} 133 | 134 | [[nodiscard]] constexpr Resource(Resource&& other) noexcept 135 | : data(std::exchange(other.data, T{})) 136 | { } 137 | 138 | Resource& operator=(Resource&& other) noexcept { 139 | if (this == &other) return *this; 140 | deleter_(data); 141 | data = std::exchange(other.data, T{}); 142 | return *this; 143 | } 144 | 145 | Resource operator=(Resource other) = delete; 146 | 147 | Resource(const Resource& other) = delete; 148 | 149 | constexpr operator T() const noexcept { 150 | return data; 151 | } 152 | 153 | constexpr auto deleter_(T x) noexcept { 154 | if (x) { 155 | deleter(x); 156 | x = T{}; 157 | } 158 | } 159 | 160 | Resource& operator=(T x) noexcept { 161 | deleter_(data); 162 | data = x; 163 | return *this; 164 | } 165 | 166 | constexpr ~Resource() noexcept { 167 | deleter_(data); 168 | } 169 | }; 170 | 171 | template 172 | static T square(const T & x) { 173 | return x * x; 174 | } 175 | 176 | static int calc_pad_size(int size, int block_size, int block_step) { 177 | return ( 178 | size 179 | + ((size % block_size) ? block_size - size % block_size : 0) 180 | + std::max(block_size - block_step, block_step) * 2 181 | ); 182 | } 183 | 184 | static int calc_pad_num(int size, int block_size, int block_step) { 185 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 186 | } 187 | 188 | template 189 | static void reflection_padding_impl( 190 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 191 | const T * VS_RESTRICT src, // shape: (height, stride) 192 | int width, int height, int stride, 193 | int block_size, int block_step 194 | ) { 195 | 196 | int pad_width = calc_pad_size(width, block_size, block_step); 197 | int pad_height = calc_pad_size(height, block_size, block_step); 198 | 199 | int offset_y = (pad_height - height) / 2; 200 | int offset_x = (pad_width - width) / 2; 201 | 202 | vs_bitblt( 203 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 204 | src, stride * sizeof(T), 205 | width * sizeof(T), height 206 | ); 207 | 208 | // copy left and right regions 209 | for (int y = offset_y; y < offset_y + height; y++) { 210 | auto dst_line = &dst[y * pad_width]; 211 | 212 | for (int x = 0; x < offset_x; x++) { 213 | dst_line[x] = dst_line[offset_x * 2 - x]; 214 | } 215 | 216 | for (int x = offset_x + width; x < pad_width; x++) { 217 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 218 | } 219 | } 220 | 221 | // copy top region 222 | for (int y = 0; y < offset_y; y++) { 223 | std::memcpy( 224 | &dst[y * pad_width], 225 | &dst[(offset_y * 2 - y) * pad_width], 226 | pad_width * sizeof(T) 227 | ); 228 | } 229 | 230 | // copy bottom region 231 | for (int y = offset_y + height; y < pad_height; y++) { 232 | std::memcpy( 233 | &dst[y * pad_width], 234 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 235 | pad_width * sizeof(T) 236 | ); 237 | } 238 | } 239 | 240 | static void reflection_padding( 241 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 242 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 243 | int width, int height, int stride, 244 | int block_size, int block_step, 245 | int bytes_per_sample 246 | ) { 247 | 248 | if (bytes_per_sample == 1) { 249 | reflection_padding_impl( 250 | static_cast(dst), 251 | static_cast(src), 252 | width, height, stride, 253 | block_size, block_step 254 | ); 255 | } else if (bytes_per_sample == 2) { 256 | reflection_padding_impl( 257 | 
reinterpret_cast(dst), 258 | reinterpret_cast(src), 259 | width, height, stride, 260 | block_size, block_step 261 | ); 262 | } else if (bytes_per_sample == 4) { 263 | reflection_padding_impl( 264 | reinterpret_cast(dst), 265 | reinterpret_cast(src), 266 | width, height, stride, 267 | block_size, block_step 268 | ); 269 | } 270 | } 271 | 272 | static std::variant compile( 273 | const char * user_kernel, 274 | hipDevice_t device, 275 | int radius, 276 | int block_size, 277 | int block_step, 278 | bool in_place, 279 | int warp_size, 280 | int warps_per_block, 281 | int sample_type, 282 | int bits_per_sample 283 | ) { 284 | 285 | auto set_error = [](const char * error_message) -> std::string { 286 | return std::string{ error_message }; 287 | }; 288 | 289 | hipDeviceProp_t prop; 290 | checkError(hipGetDeviceProperties(&prop, device)); 291 | 292 | constexpr bool generate_bitcode = false; 293 | 294 | std::ostringstream kernel_source; 295 | kernel_source << "#define RADIUS " << radius << '\n'; 296 | kernel_source << "#define BLOCK_SIZE " << block_size << '\n'; 297 | kernel_source << "#define BLOCK_STEP " << block_step << '\n'; 298 | kernel_source << "#define IN_PLACE " << (int) in_place << '\n'; 299 | kernel_source << "#define WARP_SIZE " << warp_size << '\n'; 300 | kernel_source << "#define WARPS_PER_BLOCK " << warps_per_block << '\n'; 301 | if (sample_type == stInteger) { 302 | int bytes_per_sample = bits_per_sample / 8; 303 | const char * type {}; 304 | if (bytes_per_sample == 1) { 305 | type = "unsigned char"; 306 | } else if (bytes_per_sample == 2) { 307 | type = "unsigned short"; 308 | } else if (bytes_per_sample == 4) { 309 | type = "unsigned int"; 310 | } 311 | kernel_source << "#define TYPE " << type << '\n'; 312 | kernel_source << "#define SCALE " << 1.0 / (1 << (bits_per_sample - 8)) << '\n'; 313 | kernel_source << "#define PEAK " << ((1 << bits_per_sample) - 1) << '\n'; 314 | } else if (sample_type == stFloat) { 315 | if (bits_per_sample == 32) { 316 | kernel_source << "#define TYPE float\n"; 317 | } 318 | kernel_source << "#define SCALE 255.0\n"; 319 | } 320 | kernel_source << user_kernel << '\n'; 321 | kernel_source << fft_header; 322 | for (const auto & impl : rdft_implementations) { 323 | kernel_source << impl; 324 | } 325 | for (const auto & impl : dft_implementations) { 326 | kernel_source << impl; 327 | } 328 | for (const auto & impl : idft_implementations) { 329 | kernel_source << impl; 330 | } 331 | for (const auto & impl : irdft_implementations) { 332 | kernel_source << impl; 333 | } 334 | kernel_source << kernel_implementation; 335 | 336 | hiprtcProgram program; 337 | checkError(hiprtcCreateProgram(&program, kernel_source.str().c_str(), nullptr, 0, nullptr, nullptr)); 338 | Resource destroyer { &program }; 339 | 340 | const std::string arch_str = std::string("--offload-arch=") + prop.gcnArchName; 341 | 342 | const char * opts[] = { 343 | arch_str.c_str(), 344 | "-std=c++17", 345 | "-ffast-math", 346 | "-mno-wavefrontsize64", // rdna only 347 | }; 348 | 349 | auto compilation = hiprtcCompileProgram(program, (int) std::extent_v, opts); 350 | 351 | size_t log_size; 352 | showError(hiprtcGetProgramLogSize(program, &log_size)); 353 | 354 | std::string error_message; 355 | if (log_size > 1) { 356 | error_message.resize(log_size); 357 | showError(hiprtcGetProgramLog(program, error_message.data())); 358 | } 359 | 360 | if (success(compilation)) { 361 | if (log_size > 1) { 362 | std::fprintf(stderr, "hiprtc: %s\n", error_message.c_str()); 363 | } 364 | } else { 365 | return 
error_message; 366 | } 367 | 368 | std::unique_ptr image; 369 | if (generate_bitcode) { 370 | size_t bitcode_size; 371 | checkError(hiprtcGetBitcodeSize(program, &bitcode_size)); 372 | image = std::make_unique_for_overwrite(bitcode_size); 373 | checkError(hiprtcGetBitcode(program, image.get())); 374 | } else { 375 | size_t code_size; 376 | checkError(hiprtcGetCodeSize(program, &code_size)); 377 | image = std::make_unique_for_overwrite(code_size); 378 | checkError(hiprtcGetCode(program, image.get())); 379 | } 380 | 381 | hipModule_t module; 382 | checkError(hipModuleLoadData(&module, image.get())); 383 | 384 | return module; 385 | } 386 | 387 | 388 | struct ticket_semaphore { 389 | std::atomic ticket {}; 390 | std::atomic current {}; 391 | 392 | void acquire() noexcept { 393 | intptr_t tk { ticket.fetch_add(1, std::memory_order::acquire) }; 394 | while (true) { 395 | intptr_t curr { current.load(std::memory_order::acquire) }; 396 | if (tk <= curr) { 397 | return; 398 | } 399 | current.wait(curr, std::memory_order::relaxed); 400 | } 401 | } 402 | 403 | void release() noexcept { 404 | current.fetch_add(1, std::memory_order::release); 405 | current.notify_all(); 406 | } 407 | }; 408 | 409 | 410 | struct DFTTestThreadData { 411 | uint8_t * h_padded; // shape: (pad_height, pad_width) 412 | }; 413 | 414 | 415 | struct DFTTestStreamData { 416 | Resource stream; 417 | 418 | Resource event; 419 | 420 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 421 | Resource d_spatial; 422 | 423 | Resource d_padded; // shape: (pad_height, pad_width) 424 | }; 425 | 426 | 427 | struct DFTTestData { 428 | VSNodeRef * node; 429 | int radius; 430 | int block_size; 431 | int block_step; 432 | std::array process; 433 | hipDevice_t device; // device_id 434 | bool in_place; 435 | 436 | int warp_size; 437 | 438 | int warps_per_block = 1; 439 | 440 | ticket_semaphore semaphore; 441 | std::vector stream_data; 442 | std::vector ticket; 443 | std::mutex ticket_lock; 444 | 445 | Resource module; 446 | hipFunction_t fused_kernel; 447 | int fused_num_blocks; 448 | hipFunction_t col2im_kernel; 449 | 450 | std::atomic num_uninitialized_threads; 451 | std::unordered_map thread_data; 452 | std::shared_mutex thread_data_lock; 453 | }; 454 | 455 | static void VS_CC DFTTestInit( 456 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 457 | VSCore *core, const VSAPI *vsapi 458 | ) noexcept { 459 | 460 | auto d = static_cast(*instanceData); 461 | 462 | auto vi = vsapi->getVideoInfo(d->node); 463 | vsapi->setVideoInfo(vi, 1, node); 464 | } 465 | 466 | static const VSFrameRef *VS_CC DFTTestGetFrame( 467 | int n, int activationReason, void **instanceData, void **frameData, 468 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 469 | ) noexcept { 470 | 471 | auto d = static_cast(*instanceData); 472 | 473 | if (activationReason == arInitial) { 474 | int start = std::max(n - d->radius, 0); 475 | auto vi = vsapi->getVideoInfo(d->node); 476 | int end = std::min(n + d->radius, vi->numFrames - 1); 477 | for (int i = start; i <= end; i++) { 478 | vsapi->requestFrameFilter(i, d->node, frameCtx); 479 | } 480 | return nullptr; 481 | } else if (activationReason != arAllFramesReady) { 482 | return nullptr; 483 | } 484 | 485 | auto set_error = [vsapi, frameCtx](const char * error_message) -> std::nullptr_t { 486 | vsapi->setFilterError(error_message, frameCtx); 487 | return nullptr; 488 | }; 489 | 490 | checkError(hipSetDevice(d->device)); 491 | 492 | auto vi = vsapi->getVideoInfo(d->node); 493 | 494 | 
DFTTestThreadData thread_data; 495 | 496 | auto thread_id = std::this_thread::get_id(); 497 | if (d->num_uninitialized_threads.load(std::memory_order::acquire) == 0) { 498 | const auto & const_data = d->thread_data; 499 | thread_data = const_data.at(thread_id); 500 | } else { 501 | bool initialized = true; 502 | 503 | d->thread_data_lock.lock_shared(); 504 | try { 505 | const auto & const_data = d->thread_data; 506 | thread_data = const_data.at(thread_id); 507 | } catch (const std::out_of_range &) { 508 | initialized = false; 509 | } 510 | d->thread_data_lock.unlock_shared(); 511 | 512 | if (!initialized) { 513 | auto padded_size = ( 514 | (2 * d->radius + 1) * 515 | calc_pad_size(vi->height, d->block_size, d->block_step) * 516 | calc_pad_size(vi->width, d->block_size, d->block_step) * 517 | vi->format->bytesPerSample 518 | ); 519 | 520 | checkError(hipHostMalloc((void **) &thread_data.h_padded, padded_size, 0)); 521 | 522 | { 523 | std::lock_guard _ { d->thread_data_lock }; 524 | d->thread_data.emplace(thread_id, thread_data); 525 | } 526 | 527 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order::release); 528 | } 529 | } 530 | 531 | std::vectorfreeFrame)>> src_frames; 532 | src_frames.reserve(2 * d->radius + 1); 533 | for (int i = n - d->radius; i <= n + d->radius; i++) { 534 | src_frames.emplace_back( 535 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 536 | vsapi->freeFrame 537 | ); 538 | } 539 | 540 | auto & src_center_frame = src_frames[d->radius]; 541 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 542 | 543 | const VSFrameRef * fr[] { 544 | d->process[0] ? nullptr : src_center_frame.get(), 545 | d->process[1] ? nullptr : src_center_frame.get(), 546 | d->process[2] ? nullptr : src_center_frame.get() 547 | }; 548 | const int pl[] { 0, 1, 2 }; 549 | std::unique_ptrfreeFrame)> dst_frame { 550 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 551 | vsapi->freeFrame 552 | }; 553 | 554 | for (int plane = 0; plane < format->numPlanes; plane++) { 555 | if (!d->process[plane]) { 556 | continue; 557 | } 558 | 559 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 560 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 561 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 562 | 563 | int padded_size_spatial = ( 564 | calc_pad_size(height, d->block_size, d->block_step) * 565 | calc_pad_size(width, d->block_size, d->block_step) 566 | ); 567 | 568 | for (int i = 0; i < 2 * d->radius + 1; i++) { 569 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 570 | reflection_padding( 571 | &thread_data.h_padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 572 | srcp, 573 | width, height, stride, 574 | d->block_size, d->block_step, 575 | vi->format->bytesPerSample 576 | ); 577 | } 578 | 579 | { 580 | d->semaphore.acquire(); 581 | 582 | int ticket; 583 | { 584 | std::lock_guard lock { d->ticket_lock }; 585 | ticket = d->ticket.back(); 586 | d->ticket.pop_back(); 587 | } 588 | 589 | auto & stream_data = d->stream_data[ticket]; 590 | 591 | int padded_bytes = (2 * d->radius + 1) * padded_size_spatial * vi->format->bytesPerSample; 592 | checkError(hipMemcpyHtoDAsync(stream_data.d_padded.data, thread_data.h_padded, padded_bytes, stream_data.stream)); 593 | { 594 | void * params[] { &stream_data.d_spatial.data, &stream_data.d_padded.data, &width, &height }; 595 | checkError(hipModuleLaunchKernel( 596 | 
d->fused_kernel, 597 | d->fused_num_blocks, 1, 1, 598 | d->warps_per_block * d->warp_size, 1, 1, 599 | 0, 600 | stream_data.stream, 601 | params, nullptr 602 | )); 603 | } 604 | { 605 | void * params[] { &stream_data.d_padded.data, &stream_data.d_spatial.data, &width, &height }; 606 | unsigned int vertical_size = calc_pad_size(height, d->block_size, d->block_step); 607 | unsigned int horizontal_size = calc_pad_size(width, d->block_size, d->block_step); 608 | unsigned int grid_x = (horizontal_size + d->warp_size - 1) / d->warp_size; 609 | unsigned int grid_y = (vertical_size + d->warps_per_block - 1) / d->warps_per_block; 610 | checkError(hipModuleLaunchKernel( 611 | d->col2im_kernel, 612 | grid_x, grid_y, 1, 613 | d->warp_size, d->warps_per_block, 1, 614 | 0, 615 | stream_data.stream, 616 | params, nullptr 617 | )); 618 | } 619 | { 620 | unsigned int pad_width = calc_pad_size(width, d->block_size, d->block_step); 621 | unsigned int pad_height = calc_pad_size(height, d->block_size, d->block_step); 622 | const HIP_MEMCPY3D config { 623 | .srcXInBytes = (pad_width - width) / 2 * vi->format->bytesPerSample, 624 | .srcY = (pad_height - height) / 2, 625 | .srcZ = (unsigned int) d->radius, 626 | .srcMemoryType = hipMemoryTypeDevice, 627 | .srcDevice = stream_data.d_padded.data, 628 | .srcPitch = pad_width * vi->format->bytesPerSample, 629 | .srcHeight = pad_height, 630 | .dstXInBytes = (pad_width - width) / 2 * vi->format->bytesPerSample, 631 | .dstY = (pad_height - height) / 2, 632 | .dstZ = 0, // vs_bitblt(dstp) copies from the 0-th slice 633 | .dstMemoryType = hipMemoryTypeHost, 634 | .dstHost = thread_data.h_padded, 635 | .dstPitch = pad_width * vi->format->bytesPerSample, 636 | .dstHeight = pad_height, 637 | .WidthInBytes = (unsigned int) width * vi->format->bytesPerSample, 638 | .Height = (unsigned int) height, 639 | .Depth = 1 640 | }; 641 | checkError(hipDrvMemcpy3DAsync(&config, stream_data.stream)); 642 | } 643 | 644 | checkError(hipEventRecord(stream_data.event, stream_data.stream)); 645 | checkError(hipEventSynchronize(stream_data.event)); 646 | 647 | { 648 | std::lock_guard lock { d->ticket_lock }; 649 | d->ticket.emplace_back(ticket); 650 | } 651 | d->semaphore.release(); 652 | } 653 | 654 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 655 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 656 | int offset_y = (pad_height - height) / 2; 657 | int offset_x = (pad_width - width) / 2; 658 | 659 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 660 | auto input = &thread_data.h_padded[(offset_y * pad_width + offset_x) * vi->format->bytesPerSample]; 661 | vs_bitblt( 662 | dstp, stride * vi->format->bytesPerSample, 663 | input, pad_width * vi->format->bytesPerSample, 664 | width * vi->format->bytesPerSample, height 665 | ); 666 | } 667 | 668 | return dst_frame.release(); 669 | } 670 | 671 | static void VS_CC DFTTestFree( 672 | void *instanceData, VSCore *core, const VSAPI *vsapi 673 | ) noexcept { 674 | 675 | auto d = static_cast(instanceData); 676 | 677 | vsapi->freeNode(d->node); 678 | 679 | for (const auto & [_, thread_data] : d->thread_data) { 680 | showError(hipHostFree(thread_data.h_padded)); 681 | } 682 | 683 | delete d; 684 | } 685 | 686 | static void VS_CC DFTTestCreate( 687 | const VSMap *in, VSMap *out, void *userData, 688 | VSCore *core, const VSAPI *vsapi 689 | ) noexcept { 690 | 691 | auto d = std::make_unique(); 692 | 693 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 694 | node_freer node_freer { vsapi, d->node 
}; 695 | 696 | auto set_error = [vsapi, out](const char * error_message) -> void { 697 | vsapi->setError(out, error_message); 698 | return ; 699 | }; 700 | 701 | auto vi = vsapi->getVideoInfo(d->node); 702 | 703 | auto user_kernel = vsapi->propGetData(in, "kernel", 0, nullptr); 704 | 705 | int error; 706 | 707 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 708 | if (error) { 709 | d->radius = 0; 710 | } 711 | 712 | if (d->radius < 0 || d->radius > 3) { 713 | return set_error("\"radius\" must be in [0, 1, 2, 3]"); 714 | } 715 | 716 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 717 | if (error) { 718 | d->block_size = 16; 719 | } 720 | 721 | if (d->block_size != 16) { 722 | return set_error("\"block_size\" must be 16"); 723 | } 724 | 725 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 726 | if (error) { 727 | d->block_step = d->block_size; 728 | } 729 | 730 | int num_planes_args = vsapi->propNumElements(in, "planes"); 731 | d->process.fill(num_planes_args <= 0); 732 | for (int i = 0; i < num_planes_args; ++i) { 733 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 734 | 735 | if (plane < 0 || plane >= vi->format->numPlanes) { 736 | return set_error("plane index out of range"); 737 | } 738 | 739 | if (d->process[plane]) { 740 | return set_error("plane specified twice"); 741 | } 742 | 743 | d->process[plane] = true; 744 | } 745 | 746 | d->in_place = !!(vsapi->propGetInt(in, "in_place", 0, &error)); 747 | if (error) { 748 | d->in_place = true; 749 | } 750 | if (d->in_place) { 751 | return set_error("\"in_place\" not supported yet"); 752 | } 753 | 754 | d->device = int64ToIntS(vsapi->propGetInt(in, "device_id", 0, &error)); 755 | if (error) { 756 | d->device = 0; 757 | } 758 | 759 | int num_streams = int64ToIntS(vsapi->propGetInt(in, "num_streams", 0, &error)); 760 | if (error) { 761 | num_streams = 1; 762 | } 763 | d->semaphore.current.store(num_streams - 1, std::memory_order::relaxed); 764 | d->ticket.reserve(num_streams); 765 | for (int i = 0; i < num_streams; i++) { 766 | d->ticket.emplace_back(i); 767 | } 768 | 769 | checkError(hipSetDevice(d->device)); 770 | 771 | checkError(hipDeviceGetAttribute(&d->warp_size, hipDeviceAttributeWarpSize, d->device)); 772 | 773 | auto compilation = compile( 774 | user_kernel, 775 | d->device, 776 | d->radius, d->block_size, d->block_step, d->in_place, 777 | d->warp_size, d->warps_per_block, 778 | vi->format->sampleType, vi->format->bitsPerSample 779 | ); 780 | if (std::holds_alternative(compilation)) { 781 | std::ostringstream message; 782 | message << '[' << __LINE__ << "] compile(): " << std::get(compilation); 783 | vsapi->setError(out, message.str().c_str()); 784 | return ; 785 | } 786 | d->module = std::get(compilation); 787 | 788 | int num_sms; 789 | checkError(hipDeviceGetAttribute(&num_sms, hipDeviceAttributeMultiprocessorCount, d->device)); 790 | 791 | checkError(hipModuleGetFunction(&d->fused_kernel, d->module, "fused")); 792 | { 793 | int max_blocks_per_sm; 794 | checkError(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( 795 | &max_blocks_per_sm, 796 | d->fused_kernel, 797 | d->warps_per_block * d->warp_size, 798 | 0 799 | )); 800 | d->fused_num_blocks = num_sms * max_blocks_per_sm; 801 | } 802 | 803 | checkError(hipModuleGetFunction(&d->col2im_kernel, d->module, "col2im")); 804 | 805 | d->stream_data.resize(num_streams); 806 | for (int i = 0; i < num_streams; i++) { 807 | auto & stream_data = d->stream_data[i]; 808 | 809 | 
checkError(hipStreamCreateWithFlags(&stream_data.stream.data, hipStreamNonBlocking)); 810 | 811 | checkError(hipEventCreateWithFlags( 812 | &stream_data.event.data, 813 | hipEventBlockingSync | hipEventDisableTiming 814 | )); 815 | 816 | size_t padded_bytes = ( 817 | (2 * d->radius + 1) * 818 | calc_pad_size(vi->height, d->block_size, d->block_step) * 819 | calc_pad_size(vi->width, d->block_size, d->block_step) * 820 | vi->format->bytesPerSample 821 | ); 822 | checkError(hipMalloc(&stream_data.d_padded.data, padded_bytes)); 823 | 824 | if (!d->in_place) { 825 | size_t spatial_bytes = ( 826 | calc_pad_num(vi->height, d->block_size, d->block_step) * 827 | calc_pad_num(vi->width, d->block_size, d->block_step) * 828 | (2 * d->radius + 1) * 829 | square(d->block_size) * 830 | sizeof(float) 831 | ); 832 | checkError(hipMalloc(&stream_data.d_spatial.data, spatial_bytes)); 833 | } 834 | } 835 | 836 | VSCoreInfo info; 837 | vsapi->getCoreInfo2(core, &info); 838 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order_relaxed); 839 | d->thread_data.reserve(info.numThreads); 840 | 841 | vsapi->createFilter( 842 | in, out, "DFTTest", 843 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 844 | fmParallel, 0, d.release(), core 845 | ); 846 | } 847 | 848 | static void VS_CC RDFT( 849 | const VSMap *in, VSMap *out, void *userData, 850 | VSCore *core, const VSAPI *vsapi 851 | ) noexcept { 852 | 853 | auto set_error = [vsapi, out](const char * error_message) -> void { 854 | vsapi->setError(out, error_message); 855 | }; 856 | 857 | int ndim = vsapi->propNumElements(in, "shape"); 858 | if (ndim != 1 && ndim != 2 && ndim != 3) { 859 | return set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 860 | } 861 | 862 | std::array shape {}; 863 | { 864 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 865 | for (int i = 0; i < ndim; i++) { 866 | shape[i] = int64ToIntS(shape_array[i]); 867 | } 868 | } 869 | 870 | int size = 1; 871 | for (int i = 0; i < ndim; i++) { 872 | size *= shape[i]; 873 | } 874 | if (vsapi->propNumElements(in, "data") != size) { 875 | return set_error("cannot reshape array"); 876 | } 877 | 878 | int complex_size = shape[ndim - 1] / 2 + 1; 879 | for (int i = 0; i < ndim - 1; i++) { 880 | complex_size *= shape[i]; 881 | } 882 | 883 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 884 | 885 | auto output = std::make_unique_for_overwrite []>(complex_size); 886 | 887 | if (ndim == 1) { 888 | dft(output.get(), input, size, 1); 889 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 890 | } else if (ndim == 2) { 891 | for (int i = 0; i < shape[0]; i++) { 892 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 893 | } 894 | 895 | auto output2 = std::make_unique_for_overwrite []>(complex_size); 896 | 897 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 898 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 899 | } 900 | 901 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 902 | } else { 903 | for (int i = 0; i < shape[0] * shape[1]; i++) { 904 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 905 | } 906 | 907 | auto output2 = std::make_unique_for_overwrite []>(complex_size); 908 | 909 | for (int i = 0; i < shape[0]; i++) { 910 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 911 | dft( 912 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 913 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 914 | shape[1], 915 | 
(shape[2] / 2 + 1) 916 | ); 917 | } 918 | } 919 | 920 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 921 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 922 | } 923 | 924 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 925 | } 926 | } 927 | 928 | static void VS_CC ToSingle( 929 | const VSMap *in, VSMap *out, void *userData, 930 | VSCore *core, const VSAPI *vsapi 931 | ) noexcept { 932 | 933 | auto data = vsapi->propGetFloatArray(in, "data", nullptr); 934 | int num = vsapi->propNumElements(in, "data"); 935 | 936 | auto converted_data = std::make_unique_for_overwrite(num); 937 | for (int i = 0; i < num; i++) { 938 | converted_data[i] = static_cast(data[i]); 939 | } 940 | 941 | if (num == 1) { 942 | vsapi->propSetFloat(out, "ret", converted_data[0], paReplace); 943 | } else { 944 | vsapi->propSetFloatArray(out, "ret", converted_data.get(), num); 945 | } 946 | } 947 | 948 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 949 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 950 | }; 951 | 952 | VS_EXTERNAL_API(void) 953 | VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) { 954 | configFunc( 955 | "io.github.amusementclub.dfttest2_hiprtc", 956 | "dfttest2_hiprtc", 957 | "DFTTest2 (HIPRTC)", 958 | VAPOURSYNTH_API_VERSION, 1, plugin 959 | ); 960 | 961 | registerFunc( 962 | "DFTTest", 963 | "clip:clip;" 964 | "kernel:data[];" 965 | "radius:int:opt;" 966 | "block_size:int:opt;" 967 | "block_step:int:opt;" 968 | "planes:int[]:opt;" 969 | "in_place:int:opt;" 970 | "device_id:int:opt;" 971 | "num_streams:int:opt;", 972 | DFTTestCreate, nullptr, plugin 973 | ); 974 | 975 | registerFunc( 976 | "RDFT", 977 | "data:float[];" 978 | "shape:int[];", 979 | RDFT, nullptr, plugin 980 | ); 981 | 982 | registerFunc( 983 | "ToSingle", 984 | "data:float[];", 985 | ToSingle, nullptr, plugin 986 | ); 987 | 988 | registerFunc( 989 | "Version", 990 | "", 991 | Version, nullptr, plugin 992 | ); 993 | } 994 | -------------------------------------------------------------------------------- /nvrtc_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // WARPS_PER_BLOCK 13 | // WARP_SIZE 14 | // TYPE 15 | // SCALE 16 | // PEAK (optional) 17 | 18 | #if ZERO_MEAN 19 | // __device__ const float window_freq[]; // frequency response of the window 20 | #endif // ZERO_MEAN 21 | 22 | __device__ 23 | static int calc_pad_size(int size, int block_size, int block_step) { 24 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 25 | } 26 | 27 | __device__ 28 | static int calc_pad_num(int size, int block_size, int block_step) { 29 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 30 | } 31 | 32 | __device__ 33 | static float to_float(TYPE x) { 34 | return static_cast(x) * static_cast(SCALE); 35 | } 36 | 37 | __device__ 38 | static TYPE from_float(float x) { 39 | #ifdef PEAK 40 | x /= static_cast(SCALE); 41 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 42 | return static_cast(__float2int_rz(x)); 43 | #else // PEAK // only integral types define it 44 | return static_cast(x / static_cast(SCALE)); 45 | #endif // PEAK 46 | } 47 | 48 | // im2col + rdft + frequency_filtering + irdft 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void fused( 53 | float * __restrict__ dstp, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 54 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 55 | int width, 56 | int height 57 | ) { 58 | 59 | constexpr int radius = static_cast(RADIUS); 60 | constexpr int block_size = static_cast(BLOCK_SIZE); 61 | constexpr int block_step = static_cast(BLOCK_STEP); 62 | 63 | int horizontal_num = calc_pad_num(width, block_size, block_step); 64 | int vertical_num = calc_pad_num(height, block_size, block_step); 65 | int horizontal_size = calc_pad_size(width, block_size, block_step); 66 | int vertical_size = calc_pad_size(height, block_size, block_step); 67 | int num_blocks = vertical_num * horizontal_num; 68 | 69 | constexpr int warp_size = static_cast(WARP_SIZE); 70 | constexpr int warps_per_block = static_cast(WARPS_PER_BLOCK); 71 | constexpr int transpose_stride = (warp_size % block_size == 0) ? 
block_size + 1 : block_size; 72 | __shared__ float2 shared_transpose_buffer[warps_per_block * block_size * transpose_stride]; 73 | 74 | int warp_id = threadIdx.x / warp_size; 75 | int lane_id = threadIdx.x % warp_size; 76 | auto transpose_buffer = &shared_transpose_buffer[warp_id * block_size * transpose_stride]; 77 | 78 | for (int block_id = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; block_id < num_blocks; block_id += gridDim.x * WARPS_PER_BLOCK) { 79 | int ix = block_id % horizontal_num; 80 | int iy = block_id / horizontal_num; 81 | 82 | if (lane_id < block_size) { 83 | constexpr int active_mask = (1 << block_size) - 1; 84 | float2 thread_data[(2 * radius + 1) * block_size]; 85 | 86 | // im2col 87 | #pragma unroll 88 | for (int i = 0; i < 2 * radius + 1; i++) { 89 | auto src = &srcp[(i * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 90 | auto local_thread_data = &thread_data[i * block_size]; 91 | #pragma unroll 92 | for (int j = 0; j < block_size; j++) { 93 | ((float *) local_thread_data)[j] = to_float(src[j * horizontal_size + lane_id]) * window[(i * block_size + j) * block_size + lane_id]; 94 | } 95 | } 96 | 97 | // rdft 98 | #pragma unroll 99 | for (int i = 0; i < 2 * radius + 1; i++) { 100 | auto local_thread_data = &thread_data[i * block_size]; 101 | 102 | __syncwarp(active_mask); 103 | // transpose store of real data 104 | #pragma unroll 105 | for (int j = 0; j < block_size; j++) { 106 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j]; 107 | } 108 | 109 | __syncwarp(active_mask); 110 | // transpose load of real data 111 | #pragma unroll 112 | for (int j = 0; j < block_size; j++) { 113 | ((float *) local_thread_data)[j] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 114 | } 115 | 116 | __syncwarp(active_mask); 117 | rdft((float *) local_thread_data); 118 | 119 | // transpose store of complex data 120 | #pragma unroll 121 | for (int j = 0; j < block_size / 2 + 1; j++) { 122 | transpose_buffer[lane_id * transpose_stride + j] = local_thread_data[j]; 123 | } 124 | 125 | __syncwarp(active_mask); 126 | if (lane_id < block_size / 2 + 1) { 127 | // transpose load of complex data 128 | #pragma unroll 129 | for (int j = 0; j < block_size; j++) { 130 | local_thread_data[j] = transpose_buffer[j * transpose_stride + lane_id]; 131 | } 132 | 133 | __syncwarp((1 << (block_size / 2 + 1)) - 1); 134 | dft((float *) local_thread_data); 135 | } 136 | } 137 | 138 | if (lane_id < block_size / 2 + 1) { 139 | #pragma unroll 140 | for (int i = 0; i < block_size; i++) { 141 | dft<2 * radius + 1>((float *) &thread_data[i], block_size); 142 | } 143 | } 144 | 145 | // frequency_filtering 146 | if (lane_id < block_size / 2 + 1) { 147 | #if ZERO_MEAN 148 | float gf; 149 | if (lane_id == 0) { 150 | gf = thread_data[0].x / window_freq[0]; 151 | } 152 | gf = __shfl_sync((1 << (block_size / 2 + 1)) - 1, gf, 0); 153 | #endif // ZERO_MEAN 154 | #pragma unroll 155 | for (int i = 0; i < 2 * radius + 1; i++) { 156 | #pragma unroll 157 | for (int j = 0; j < block_size; j++) { 158 | float2 local_data = thread_data[i * block_size + j]; 159 | 160 | #if ZERO_MEAN 161 | // remove mean 162 | float val1 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2]; 163 | float val2 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2 + 1]; 164 | local_data.x -= val1; 165 | local_data.y -= val2; 166 | #endif // ZERO_MEAN 167 | 168 | filter(local_data, lane_id, j, i); 169 | 170 | #if 
ZERO_MEAN 171 | // add mean 172 | local_data.x += val1; 173 | local_data.y += val2; 174 | #endif // ZERO_MEAN 175 | 176 | thread_data[i * block_size + j] = local_data; 177 | } 178 | } 179 | } 180 | 181 | // irdft 182 | if (lane_id < block_size / 2 + 1) { 183 | #pragma unroll 184 | for (int i = 0; i < block_size; i++) { 185 | idft<2 * radius + 1>((float *) &thread_data[i], block_size); 186 | } 187 | } 188 | 189 | // this is not a full 3d irdft, because only a single slice is required 190 | auto local_thread_data = &thread_data[radius * block_size]; 191 | 192 | if (lane_id < block_size / 2 + 1) { 193 | __syncwarp((1 << (block_size / 2 + 1)) - 1); 194 | idft((float *) local_thread_data); 195 | 196 | // transpose store of complex data 197 | #pragma unroll 198 | for (int j = 0; j < block_size; j++) { 199 | transpose_buffer[j * transpose_stride + lane_id] = local_thread_data[j]; 200 | } 201 | } 202 | 203 | __syncwarp(active_mask); 204 | #pragma unroll 205 | for (int j = 0; j < block_size / 2 + 1; j++) { 206 | // transpose load of complex data 207 | local_thread_data[j].x = transpose_buffer[lane_id * transpose_stride + j].x; 208 | local_thread_data[j].y = transpose_buffer[lane_id * transpose_stride + j].y; 209 | } 210 | 211 | __syncwarp(active_mask); 212 | irdft((float *) local_thread_data); 213 | 214 | #pragma unroll 215 | for (int j = 0; j < block_size; j++) { 216 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j == 0 ? j : block_size - j]; 217 | } 218 | 219 | __syncwarp(active_mask); 220 | auto local_dst = &dstp[(block_id * (2 * radius + 1) + radius) * block_size * block_size]; 221 | #pragma unroll 222 | for (int j = 0; j < block_size; j++) { 223 | local_dst[j * block_size + lane_id] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 224 | } 225 | } 226 | } 227 | } 228 | 229 | extern "C" 230 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 231 | __global__ 232 | void col2im( 233 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 234 | const float * __restrict__ src, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 235 | int width, 236 | int height 237 | ) { 238 | 239 | int radius = static_cast(RADIUS); 240 | int block_size = static_cast(BLOCK_SIZE); 241 | int block_step = static_cast(BLOCK_STEP); 242 | 243 | // each thread is responsible for a single pixel 244 | int horizontal_size = calc_pad_size(width, block_size, block_step); 245 | int horizontal_num = calc_pad_num(width, block_size, block_step); 246 | int vertical_size = calc_pad_size(height, block_size, block_step); 247 | int vertical_num = calc_pad_num(height, block_size, block_step); 248 | int pad_x = (horizontal_size - width) / 2; 249 | int pad_y = (vertical_size - height) / 2; 250 | 251 | int x = blockIdx.x * blockDim.x + threadIdx.x; 252 | int y = blockIdx.y * blockDim.y + threadIdx.y; 253 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 254 | return ; 255 | } 256 | 257 | float sum {}; 258 | 259 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 260 | int i2 = min(y / block_step, vertical_num - 1); 261 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 262 | int j2 = min(x / block_step, horizontal_num - 1); 263 | 264 | for (int i = i1; i <= i2; i++) { 265 | int offset_y = y - i * block_step; 266 | for (int j = j1; j <= j2; j++) { 267 | int offset_x = x - j * block_step; 268 | auto src_offset = (((i * 
horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * block_size + offset_x; 269 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 270 | sum += src[src_offset] * window[window_offset]; 271 | } 272 | } 273 | 274 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 275 | } 276 | )"""; 277 | 278 | #endif // KERNEL_HPP 279 | --------------------------------------------------------------------------------
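Editor's note: the section above ends with three near-identical col2im kernels (HIP, HIPRTC, NVRTC) that scatter the filtered, windowed blocks back into the padded plane by overlap-add. The host-side sketch below spells out the same gather on the CPU: for every output pixel it visits each block whose footprint covers that pixel and accumulates the windowed sample from the central temporal slice. It is an illustrative reconstruction, not code from this repository: calc_pad_size/calc_pad_num mirror the device functions verbatim, while col2im_reference, its buffer layouts, and the plain-float input (the GPU kernels additionally convert via to_float/from_float) are assumptions made for the example.

// Host-side reference for the col2im overlap-add performed by the GPU kernels above.
// Illustrative sketch only; col2im_reference and the std::vector buffers are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>

static int calc_pad_size(int size, int block_size, int block_step) {
    // same formula as the device-side calc_pad_size
    return size
        + ((size % block_size) ? block_size - size % block_size : 0)
        + std::max(block_size - block_step, block_step) * 2;
}

static int calc_pad_num(int size, int block_size, int block_step) {
    // number of block positions along one dimension of the padded plane
    return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1;
}

// src layout:    (vertical_num, horizontal_num, 2*radius+1, block_size, block_size)
// window layout: (2*radius+1, block_size, block_size)
// dst layout:    (vertical_size, horizontal_size); only the central temporal slice is written
static void col2im_reference(
    std::vector<float> &dst,
    const std::vector<float> &src,
    const std::vector<float> &window,
    int width, int height,
    int radius, int block_size, int block_step
) {
    int horizontal_size = calc_pad_size(width, block_size, block_step);
    int horizontal_num = calc_pad_num(width, block_size, block_step);
    int vertical_size = calc_pad_size(height, block_size, block_step);
    int vertical_num = calc_pad_num(height, block_size, block_step);
    int pad_x = (horizontal_size - width) / 2;
    int pad_y = (vertical_size - height) / 2;

    dst.assign((std::size_t) vertical_size * horizontal_size, 0.0f);

    for (int y = pad_y; y < pad_y + height; y++) {
        for (int x = pad_x; x < pad_x + width; x++) {
            float sum = 0.0f;

            // range of blocks whose footprint covers pixel (x, y);
            // clamped to 0 here defensively, matching the kernel's "implicitly non-negative" note
            int i1 = std::max((y - block_size + block_step) / block_step, 0);
            int i2 = std::min(y / block_step, vertical_num - 1);
            int j1 = std::max((x - block_size + block_step) / block_step, 0);
            int j2 = std::min(x / block_step, horizontal_num - 1);

            for (int i = i1; i <= i2; i++) {
                int offset_y = y - i * block_step;
                for (int j = j1; j <= j2; j++) {
                    int offset_x = x - j * block_step;
                    // same indexing as the kernel: pick the radius-th temporal slice of block (i, j)
                    std::size_t src_offset =
                        (((std::size_t) (i * horizontal_num + j) * (2 * radius + 1) + radius)
                            * block_size + offset_y) * block_size + offset_x;
                    std::size_t window_offset =
                        ((std::size_t) radius * block_size + offset_y) * block_size + offset_x;
                    sum += src[src_offset] * window[window_offset];
                }
            }

            dst[(std::size_t) y * horizontal_size + x] = sum;
        }
    }
}

Note that, like the kernels, this sketch applies no explicit normalization to the accumulated sum; any normalization is presumably folded into the window coefficients supplied by the caller.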