├── .github └── workflows │ ├── linux.yml │ ├── linux_arm64.yml │ └── windows.yml ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── common └── config.h.in ├── cpu_source ├── CMakeLists.txt ├── cpu_dispatch.h.in ├── dfttest2_cpu.h ├── getframe_impl.cpp ├── kernel.hpp └── source.cpp ├── cuda_source ├── kernel.hpp ├── source.cpp └── win32.cpp ├── dfttest2.py ├── gcc_source ├── CMakeLists.txt ├── dfttest2_cpu.h ├── getframe_impl.cpp ├── kernel.hpp └── source.cpp ├── hip_source ├── kernel.hpp └── source.cpp ├── hiprtc_source ├── dft_kernels.hpp ├── kernel.hpp └── source.cpp └── nvrtc_source ├── dft_kernels.hpp ├── kernel.hpp └── source.cpp /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'cpu_source/*' 8 | - 'cuda_source/*' 9 | - 'nvrtc_source/*' 10 | - '.github/workflows/linux.yml' 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build-linux: 15 | runs-on: ubuntu-22.04 16 | steps: 17 | - name: Checkout repo 18 | uses: actions/checkout@v3 19 | with: 20 | submodules: true 21 | 22 | - name: Setup Ninja 23 | run: pip3 install ninja 24 | 25 | - name: Setup CUDA 26 | run: | 27 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 28 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 29 | sudo apt-get update 30 | sudo apt-get install -y cuda-nvcc-11-8 cuda-cudart-dev-11-8 cuda-nvrtc-dev-11-8 libcufft-dev-11-8 31 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 32 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 33 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 34 | 35 | - name: Download VapourSynth headers 36 | run: | 37 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 38 | unzip -q vs.zip 39 | mv vapoursynth*/ vapoursynth 40 | 41 | - name: Configure 42 | run: cmake -S . -B build -G Ninja 43 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 44 | -D ENABLE_CUDA=ON 45 | -D USE_NVRTC_STATIC=ON 46 | -D ENABLE_CPU=ON 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_CXX_COMPILER=g++-12 49 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 50 | 51 | - name: Build 52 | run: cmake --build build --config Release --verbose 53 | 54 | - name: Install 55 | run: cmake --install build --prefix artifact 56 | 57 | - name: Setup HIP 58 | run: | 59 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 60 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.0.2 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 61 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 62 | sudo apt update 63 | sudo apt install hip-runtime-amd rocm-device-libs hipfft-dev 64 | 65 | - name: Configure (HIP) 66 | run: cmake -S . 
-B build_hip -G Ninja 67 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 68 | -D ENABLE_CUDA=OFF 69 | -D ENABLE_CPU=OFF 70 | -D ENABLE_HIP=ON 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 73 | -D CMAKE_PREFIX_PATH=/opt/rocm 74 | 75 | - name: Build (HIP) 76 | run: cmake --build build_hip --config Release --verbose 77 | 78 | - name: Install (HIP) 79 | run: cmake --install build_hip --prefix artifact 80 | 81 | - name: Upload 82 | uses: actions/upload-artifact@v3 83 | with: 84 | name: vs-dfttest2-Linux 85 | path: artifact 86 | -------------------------------------------------------------------------------- /.github/workflows/linux_arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'gcc_source/*' 8 | - '.github/workflows/linux_arm64.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04-arm 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v4 17 | with: 18 | submodules: true 19 | 20 | - name: Setup clang 21 | run: | 22 | wget https://apt.llvm.org/llvm.sh 23 | chmod +x llvm.sh 24 | sudo ./llvm.sh all 25 | 26 | - name: Setup Ninja 27 | run: pip3 install ninja 28 | 29 | - name: Download VapourSynth headers 30 | run: | 31 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 32 | unzip -q vs.zip 33 | mv vapoursynth*/ vapoursynth 34 | 35 | - name: Configure 36 | run: cmake -S . -B build -G Ninja 37 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 38 | -D ENABLE_CUDA=OFF 39 | -D ENABLE_CPU=OFF 40 | -D ENABLE_GCC=ON 41 | -D CMAKE_BUILD_TYPE=Release 42 | -D CMAKE_CXX_COMPILER=clang++ 43 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 44 | 45 | - name: Build 46 | run: cmake --build build --config Release --verbose 47 | 48 | - name: Install 49 | run: cmake --install build --prefix artifact 50 | 51 | - name: Upload 52 | uses: actions/upload-artifact@v4 53 | with: 54 | name: vs-dfttest2-Linux 55 | path: artifact 56 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'CMakeLists.txt' 7 | - 'cpu_source/*' 8 | - 'cuda_source/*' 9 | - 'nvrtc_source/*' 10 | - '.github/workflows/windows.yml' 11 | workflow_dispatch: 12 | inputs: 13 | tag: 14 | description: 'which tag to upload to' 15 | default: '' 16 | 17 | jobs: 18 | build-windows: 19 | runs-on: windows-2022 20 | 21 | defaults: 22 | run: 23 | shell: cmd 24 | 25 | steps: 26 | - name: Checkout repo 27 | uses: actions/checkout@v4 28 | with: 29 | submodules: true 30 | 31 | - name: Setup MSVC 32 | uses: ilammy/msvc-dev-cmd@v1 33 | 34 | - name: Setup Ninja 35 | run: pip install ninja 36 | 37 | - name: Cache CUDA 38 | id: cache-cuda 39 | uses: actions/cache@v4 40 | with: 41 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 42 | key: ${{ runner.os }}-cuda-12.4.1 43 | 44 | - name: Setup CUDA 45 | if: steps.cache-cuda.outputs.cache-hit != 'true' 46 | run: | 47 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe 48 | cuda_installer.exe -s nvcc_12.4 cudart_12.4 nvrtc_dev_12.4 cufft_12.4 cufft_dev_12.4 49 | 50 | - name: Download VapourSynth headers 51 | run: | 52 | curl -s -o vs.zip -L 
https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 53 | unzip -q vs.zip 54 | mv vapoursynth-*/ vapoursynth/ 55 | 56 | - name: Configure (CUDA) 57 | run: cmake -S . -B build_cuda -G Ninja -LA 58 | -D CMAKE_BUILD_TYPE=Release 59 | -D ENABLE_CUDA=ON 60 | -D USE_NVRTC_STATIC=ON 61 | -D ENABLE_CPU=OFF 62 | -D VS_INCLUDE_DIR="%cd%\vapoursynth\include" 63 | -D CMAKE_CXX_FLAGS="/fp:fast /EHsc" 64 | -D CMAKE_SHARED_LINKER_FLAGS="/DELAYLOAD:cufft64_11.dll" 65 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 66 | env: 67 | CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 68 | 69 | - name: Build (CUDA) 70 | run: cmake --build build_cuda --verbose 71 | 72 | - name: Install (CUDA) 73 | run: cmake --install build_cuda --prefix install_cuda 74 | 75 | - name: Prepare for upload (CUDA) 76 | run: | 77 | mkdir artifact_cuda 78 | copy install_cuda\lib\*.dll artifact_cuda 79 | mkdir cufft 80 | copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cufft64_*.dll" cufft 81 | 82 | - name: Upload (CUDA) 83 | uses: actions/upload-artifact@v4 84 | with: 85 | name: vs-dfttest2-cuda-windows 86 | path: artifact_cuda 87 | 88 | - name: Upload cufft 89 | uses: actions/upload-artifact@v4 90 | if: false 91 | with: 92 | name: cufft-windows 93 | path: cufft 94 | 95 | - name: Configure (CPU) 96 | shell: bash 97 | run: cmake -S . -B build_cpu -G Ninja -LA 98 | -D CMAKE_BUILD_TYPE=Release 99 | -D ENABLE_CUDA=OFF 100 | -D ENABLE_CPU=ON 101 | -D VS_INCLUDE_DIR="$(pwd)/vapoursynth/include" 102 | -D CMAKE_CXX_COMPILER=clang++ 103 | -D CMAKE_CXX_FLAGS="-ffast-math" 104 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 105 | 106 | - name: Build (CPU) 107 | run: cmake --build build_cpu --verbose 108 | 109 | - name: Install (CPU) 110 | run: cmake --install build_cpu --prefix install_cpu 111 | 112 | - name: Prepare for upload (CPU) 113 | run: | 114 | mkdir artifact_cpu 115 | copy install_cpu\lib\*.dll artifact_cpu 116 | 117 | - name: Upload (CPU) 118 | uses: actions/upload-artifact@v4 119 | with: 120 | name: vs-dfttest2-cpu-windows 121 | path: artifact_cpu 122 | 123 | - name: Compress artifact for release 124 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 125 | run: | 126 | cd artifact_cuda 127 | 128 | mkdir vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} 129 | xcopy dfttest2_cuda.dll vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} /f 130 | xcopy dfttest2_nvrtc.dll vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} /f 131 | 7z a -t7z -mx=9 ../vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }}.7z vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }} 132 | 133 | cd ../cufft 134 | 135 | mkdir vsmlrt-cuda 136 | xcopy cufft64_*.dll vsmlrt-cuda /f 137 | 7z a -t7z -mx=9 ../cufft-windows-${{ github.event.inputs.tag }}.7z vsmlrt-cuda 138 | 139 | cd ../artifact_cpu 140 | 141 | mkdir vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} 142 | xcopy dfttest2_cpu.dll vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} /f 143 | 7z a -t7z -mx=9 ../vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }}.7z vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }} 144 | 145 | - name: Release 146 | uses: softprops/action-gh-release@v2 147 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 148 | with: 149 | tag_name: ${{ github.event.inputs.tag }} 150 | files: | 151 | vs-dfttest2-cuda-windows-${{ github.event.inputs.tag }}.7z 152 | vs-dfttest2-cpu-windows-${{ github.event.inputs.tag }}.7z 153 | cufft-windows-${{ 
github.event.inputs.tag }}.7z 154 | dfttest2.py 155 | fail_on_unmatched_files: true 156 | generate_release_notes: false 157 | prerelease: true 158 | 159 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "cpu_source/vectorclass"] 2 | path = cpu_source/vectorclass 3 | url = https://github.com/vectorclass/version2 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22.0) 2 | 3 | project(vs-dfttest2 VERSION 0.1 LANGUAGES CXX) 4 | 5 | set(ENABLE_CUDA ON CACHE BOOL "Whether to compile with CUDA backends") 6 | set(ENABLE_CPU ON CACHE BOOL "Whether to compile with x86 backend") 7 | set(ENABLE_GCC OFF CACHE BOOL "Whether to compile with gcc vector extension backend") 8 | set(ENABLE_HIP OFF CACHE BOOL "Whether to compile with HIP backends") 9 | 10 | if(NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) 12 | endif() 13 | 14 | if(ENABLE_CUDA) 15 | set(USE_NVRTC_STATIC ON CACHE BOOL "Whether to use NVRTC static library") 16 | 17 | find_package(CUDAToolkit REQUIRED) 18 | 19 | add_library(dfttest2_cuda MODULE 20 | cuda_source/source.cpp 21 | cuda_source/win32.cpp 22 | ) 23 | add_library(dfttest2_nvrtc MODULE 24 | nvrtc_source/source.cpp 25 | ) 26 | 27 | set_target_properties(dfttest2_cuda PROPERTIES 28 | CXX_EXTENSIONS OFF 29 | CXX_STANDARD 20 30 | CXX_STANDARD_REQUIRED ON 31 | ) 32 | set_target_properties(dfttest2_nvrtc PROPERTIES 33 | CXX_EXTENSIONS OFF 34 | CXX_STANDARD 20 35 | CXX_STANDARD_REQUIRED ON 36 | ) 37 | 38 | target_link_libraries(dfttest2_cuda PRIVATE CUDA::cuda_driver CUDA::cufft) 39 | target_link_libraries(dfttest2_nvrtc PRIVATE CUDA::cuda_driver) 40 | 41 | if( 42 | USE_NVRTC_STATIC AND ( 43 | CUDAToolkit_VERSION_MAJOR GREATER_EQUAL "12" OR ( 44 | CUDAToolkit_VERSION_MAJOR EQUAL "11" AND 45 | CUDAToolkit_VERSION_MINOR GREATER_EQUAL "5" 46 | ) 47 | ) 48 | ) 49 | target_link_directories(dfttest2_cuda PRIVATE "${CUDAToolkit_LIBRARY_DIR}") 50 | target_link_libraries(dfttest2_cuda PRIVATE nvrtc_static nvrtc-builtins_static nvptxcompiler_static) 51 | target_link_directories(dfttest2_nvrtc PRIVATE "${CUDAToolkit_LIBRARY_DIR}") 52 | target_link_libraries(dfttest2_nvrtc PRIVATE nvrtc_static nvrtc-builtins_static nvptxcompiler_static) 53 | 54 | if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 55 | set_property(TARGET dfttest2_cuda PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded") 56 | set_property(TARGET dfttest2_nvrtc PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded") 57 | endif() 58 | 59 | if(CMAKE_SYSTEM_NAME STREQUAL "Windows") 60 | target_link_libraries(dfttest2_cuda PRIVATE Ws2_32) 61 | target_link_libraries(dfttest2_nvrtc PRIVATE Ws2_32) 62 | endif() 63 | else() 64 | if(USE_NVRTC_STATIC) 65 | message(WARNING "NVRTC static library is not used") 66 | endif() 67 | target_link_libraries(dfttest2_cuda PRIVATE CUDA::nvrtc) 68 | target_link_libraries(dfttest2_nvrtc PRIVATE CUDA::nvrtc) 69 | endif() 70 | endif() # ENABLE_CUDA 71 | 72 | if(ENABLE_HIP) 73 | find_package(hip REQUIRED config) 74 | find_package(hipfft REQUIRED config) 75 | find_package(hiprtc REQUIRED config) 76 | 77 | add_library(dfttest2_hip MODULE 78 | hip_source/source.cpp 79 | ) 80 | add_library(dfttest2_hiprtc MODULE 81 | hiprtc_source/source.cpp 82 | ) 83 | 84 | 
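# Note on the two HIP targets: dfttest2_hip links against hipFFT for its frequency transforms,
# while dfttest2_hiprtc only needs hipRTC and compiles its DFT kernels at run time
# (see hiprtc_source/dft_kernels.hpp). Both are configured below like the CUDA plugins (C++20, no extensions).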
set_target_properties(dfttest2_hip PROPERTIES 85 | CXX_EXTENSIONS OFF 86 | CXX_STANDARD 20 87 | CXX_STANDARD_REQUIRED ON 88 | ) 89 | set_target_properties(dfttest2_hiprtc PROPERTIES 90 | CXX_EXTENSIONS OFF 91 | CXX_STANDARD 20 92 | CXX_STANDARD_REQUIRED ON 93 | ) 94 | 95 | target_link_libraries(dfttest2_hip PRIVATE hip::host hip::hipfft hiprtc::hiprtc) 96 | target_link_libraries(dfttest2_hiprtc PRIVATE hip::host hiprtc::hiprtc) 97 | endif() # ENABLE_HIP 98 | 99 | find_package(PkgConfig QUIET MODULE) 100 | 101 | if(PKG_CONFIG_FOUND) 102 | pkg_search_module(VS vapoursynth) 103 | 104 | if(VS_FOUND) 105 | message(STATUS "Found VapourSynth r${VS_VERSION}") 106 | 107 | cmake_path(APPEND install_dir ${VS_LIBDIR} vapoursynth) 108 | 109 | if(ENABLE_CUDA) 110 | target_include_directories(dfttest2_cuda PRIVATE ${VS_INCLUDE_DIRS}) 111 | target_include_directories(dfttest2_nvrtc PRIVATE ${VS_INCLUDE_DIRS}) 112 | 113 | install(TARGETS dfttest2_cuda LIBRARY DESTINATION ${install_dir}) 114 | install(TARGETS dfttest2_nvrtc LIBRARY DESTINATION ${install_dir}) 115 | endif() # ENABLE_CUDA 116 | 117 | if(ENABLE_HIP) 118 | target_include_directories(dfttest2_hip PRIVATE ${VS_INCLUDE_DIRS}) 119 | target_include_directories(dfttest2_hiprtc PRIVATE ${VS_INCLUDE_DIRS}) 120 | 121 | install(TARGETS dfttest2_hip LIBRARY DESTINATION ${install_dir}) 122 | install(TARGETS dfttest2_hiprtc LIBRARY DESTINATION ${install_dir}) 123 | endif() # ENABLE_HIP 124 | endif() 125 | endif() 126 | 127 | if(NOT VS_FOUND) 128 | set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers") 129 | 130 | if(VS_INCLUDE_DIR STREQUAL "") 131 | message(WARNING "VapourSynth not found") 132 | endif() 133 | 134 | if(ENABLE_CUDA) 135 | target_include_directories(dfttest2_cuda PRIVATE ${VS_INCLUDE_DIR}) 136 | target_include_directories(dfttest2_nvrtc PRIVATE ${VS_INCLUDE_DIR}) 137 | 138 | install(TARGETS dfttest2_cuda LIBRARY DESTINATION lib) 139 | install(TARGETS dfttest2_nvrtc LIBRARY DESTINATION lib) 140 | endif() # ENABLE_CUDA 141 | 142 | if(ENABLE_HIP) 143 | target_include_directories(dfttest2_hip PRIVATE ${VS_INCLUDE_DIR}) 144 | target_include_directories(dfttest2_hiprtc PRIVATE ${VS_INCLUDE_DIR}) 145 | 146 | install(TARGETS dfttest2_hip LIBRARY DESTINATION lib) 147 | install(TARGETS dfttest2_hiprtc LIBRARY DESTINATION lib) 148 | endif() # ENABLE_HIP 149 | endif() 150 | 151 | find_package(Git QUIET) 152 | 153 | if(GIT_FOUND) 154 | execute_process( 155 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 156 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 157 | OUTPUT_VARIABLE VCS_TAG 158 | ) 159 | if(VCS_TAG) 160 | string(STRIP ${VCS_TAG} VCS_TAG) 161 | endif() 162 | endif() 163 | 164 | if(VCS_TAG) 165 | message(STATUS "vs-dfttest2 ${VCS_TAG}") 166 | else() 167 | message(WARNING "unknown plugin version") 168 | set(VCS_TAG "unknown") 169 | endif() 170 | 171 | configure_file(common/config.h.in config.h) 172 | 173 | if(ENABLE_CUDA) 174 | target_include_directories(dfttest2_cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 175 | target_include_directories(dfttest2_nvrtc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 176 | 177 | if (WIN32) 178 | target_link_options(dfttest2_cuda PRIVATE "delayimp.lib" ${CMAKE_SHARED_LINKER_FLAGS}) 179 | endif() 180 | endif() # ENABLE_CUDA 181 | 182 | if(ENABLE_CPU) 183 | add_subdirectory(cpu_source) 184 | endif() # ENABLE_CPU 185 | 186 | if(ENABLE_GCC) 187 | add_subdirectory(gcc_source) 188 | endif() # ENABLE_GCC 189 | 190 | if(ENABLE_HIP) 191 | target_include_directories(dfttest2_hip PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 192 | 
target_include_directories(dfttest2_hiprtc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 193 | endif() # ENABLE_HIP 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-dfttest2 2 | DFTTest re-implementation (CUDA and x86) 3 | 4 | ## Usage 5 | ```python3 6 | from dfttest2 import DFTTest 7 | output = DFTTest(input) 8 | ``` 9 | 10 | See also [VapourSynth-DFTTest](https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest) 11 | 12 | ## Compilation 13 | ```bash 14 | # additional options: -D ENABLE_CUDA=ON -D ENABLE_CPU=ON 15 | cmake -S . -B build 16 | 17 | cmake --build build 18 | 19 | cmake --install build 20 | ``` 21 | 22 | If the VapourSynth library cannot be found by pkg-config, then the CMake variable `VS_INCLUDE_DIR` should be set. 23 | -------------------------------------------------------------------------------- /common/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /cpu_source/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(VCL_HOME "${CMAKE_CURRENT_SOURCE_DIR}/vectorclass" CACHE PATH "Path to vector class v2 headers") 2 | 3 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 4 | set(CPU_DISPATCH_TARGETS "sse2;avx2;avx512f" CACHE STRING "Dispatch targets") 5 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") 6 | set(CPU_DISPATCH_TARGETS "AVX;AVX2;AVX512" CACHE STRING "Dispatch targets") 7 | endif() 8 | 9 | message(STATUS "cpu targets: ${CPU_DISPATCH_TARGETS}") 10 | 11 | add_library(dfttest2_cpu MODULE source.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vectorclass/instrset_detect.cpp) 12 | 13 | set_target_properties(dfttest2_cpu PROPERTIES 14 | CXX_EXTENSIONS OFF 15 | CXX_STANDARD 20 16 | CXX_STANDARD_REQUIRED ON 17 | ) 18 | 19 | target_include_directories(dfttest2_cpu PRIVATE ${VCL_HOME}) 20 | 21 | if(PKG_CONFIG_FOUND AND VS_FOUND) 22 | target_include_directories(dfttest2_cpu PRIVATE ${VS_INCLUDE_DIRS}) 23 | install(TARGETS dfttest2_cpu LIBRARY DESTINATION ${install_dir}) 24 | else() 25 | target_include_directories(dfttest2_cpu PRIVATE ${VS_INCLUDE_DIR}) 26 | install(TARGETS dfttest2_cpu LIBRARY DESTINATION lib) 27 | endif() 28 | 29 | target_include_directories(dfttest2_cpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
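# The block below builds one getframe_impl_<arch> object library per entry in CPU_DISPATCH_TARGETS,
# each compiled with the matching -m... / /arch: flags and with uniquely named
# DFTTestGetFrame_<arch> / supported_arch_<arch> entry points. The generated cpu_dispatch.h and the
# GETFRAME_PTRS / SUPPORTED_ARCH_PTRS / SUPPORTED_ARCH_STRS definitions let source.cpp select the
# best implementation at run time (or honour the "opt" parameter). CPU_DISPATCH_TARGETS is a cache
# variable, so the target list can be overridden at configure time, for example:
#   cmake -S . -B build -D ENABLE_CPU=ON -D CPU_DISPATCH_TARGETS="sse2;avx2"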
30 | 31 | 32 | if(CPU_DISPATCH_TARGETS) 33 | target_compile_definitions(dfttest2_cpu PRIVATE HAS_DISPATCH) 34 | 35 | set(GETFRAME_DECLARATIONS "") 36 | set(GETFRAME_PTRS "") 37 | set(SUPPORTED_ARCH_DECLARATIONS "") 38 | set(SUPPORTED_ARCH_PTRS "") 39 | set(SUPPORTED_ARCH_STRS "") 40 | 41 | foreach(arch_option ${CPU_DISPATCH_TARGETS}) 42 | set(raw_arch_option ${arch_option}) 43 | string(REPLACE "=" "_" arch ${arch_option}) 44 | string(REPLACE "-" "_" arch ${arch}) 45 | 46 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 47 | if(${arch_option} STREQUAL avx2) 48 | set(arch_option ${arch_option} -mfma) 49 | elseif(${arch_option} STREQUAL avx512f) 50 | if(WIN32) 51 | # according to vcl2, 52 | # MS compiler cannot generate code for AVX512F without AVX512DQ 53 | set(arch_option ${arch_option} -mfma -mavx512vl -mavx512bw -mavx512dq) 54 | else() 55 | set(arch_option ${arch_option} -mfma) 56 | endif() 57 | endif() 58 | endif() 59 | 60 | set(current_target getframe_impl_${arch}) 61 | set(current_getframe_name DFTTestGetFrame_${arch}) 62 | set(current_supported_arch_name supported_arch_${arch}) 63 | 64 | add_library(${current_target} OBJECT getframe_impl.cpp) 65 | 66 | set_target_properties(${current_target} PROPERTIES 67 | CXX_EXTENSIONS OFF 68 | CXX_STANDARD 20 69 | CXX_STANDARD_REQUIRED ON 70 | ) 71 | 72 | target_compile_definitions(${current_target} PRIVATE HAS_DISPATCH) 73 | target_compile_definitions(${current_target} PRIVATE DFTTEST_GETFRAME_NAME=${current_getframe_name}) 74 | target_compile_definitions(${current_target} PRIVATE SUPPORTED_ARCH_NAME=${current_supported_arch_name}) 75 | 76 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU") 77 | target_compile_options(${current_target} PRIVATE -m${arch_option}) 78 | elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") 79 | target_compile_options(${current_target} PRIVATE /arch:${arch_option}) 80 | endif() 81 | 82 | target_include_directories(${current_target} PRIVATE ${VCL_HOME}) 83 | 84 | if(PKG_CONFIG_FOUND AND VS_FOUND) 85 | target_include_directories(${current_target} PRIVATE ${VS_INCLUDE_DIRS}) 86 | else() 87 | target_include_directories(${current_target} PRIVATE ${VS_INCLUDE_DIR}) 88 | endif() 89 | 90 | target_include_directories(${current_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 91 | 92 | string(APPEND GETFRAME_DECLARATIONS " 93 | extern const VSFrameRef *VS_CC ${current_getframe_name}( 94 | int n, int activationReason, void **instanceData, void **frameData, 95 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 96 | ) noexcept;\n") 97 | 98 | string(APPEND GETFRAME_PTRS ${current_getframe_name},) 99 | 100 | string(APPEND SUPPORTED_ARCH_DECLARATIONS "extern bool ${current_supported_arch_name}() noexcept;\n") 101 | 102 | string(APPEND SUPPORTED_ARCH_PTRS ${current_supported_arch_name},) 103 | 104 | string(APPEND SUPPORTED_ARCH_STRS \"${raw_arch_option}\",) 105 | 106 | target_link_libraries(dfttest2_cpu PRIVATE ${current_target}) 107 | endforeach() 108 | 109 | configure_file(cpu_dispatch.h.in cpu_dispatch.h @ONLY) 110 | 111 | target_compile_definitions(dfttest2_cpu PRIVATE GETFRAME_PTRS=${GETFRAME_PTRS}) 112 | target_compile_definitions(dfttest2_cpu PRIVATE SUPPORTED_ARCH_PTRS=${SUPPORTED_ARCH_PTRS}) 113 | target_compile_definitions(dfttest2_cpu PRIVATE SUPPORTED_ARCH_STRS=${SUPPORTED_ARCH_STRS}) 114 | target_include_directories(dfttest2_cpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 115 | else() 
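# No dispatch targets configured: compile a single getframe_impl object with the toolchain's
# default instruction set; source.cpp then uses the plain DFTTestGetFrame / supported_arch
# symbols declared in dfttest2_cpu.h.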
116 | add_library(getframe_impl OBJECT getframe_impl.cpp) 117 | 118 | set_target_properties(getframe_impl PROPERTIES 119 | CXX_EXTENSIONS OFF 120 | CXX_STANDARD 20 121 | CXX_STANDARD_REQUIRED ON 122 | ) 123 | 124 | target_include_directories(getframe_impl PRIVATE ${VCL_HOME}) 125 | 126 | if(PKG_CONFIG_FOUND AND VS_FOUND) 127 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIRS}) 128 | else() 129 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIR}) 130 | endif() 131 | 132 | target_link_libraries(dfttest2_cpu PRIVATE getframe_impl) 133 | endif() 134 | -------------------------------------------------------------------------------- /cpu_source/cpu_dispatch.h.in: -------------------------------------------------------------------------------- 1 | #ifdef HAS_DISPATCH 2 | 3 | #ifndef CPU_DISPATCH_H 4 | #define CPU_DISPATCH_H 5 | 6 | #include 7 | 8 | @GETFRAME_DECLARATIONS@ 9 | 10 | @SUPPORTED_ARCH_DECLARATIONS@ 11 | 12 | #endif // CPU_DISPATCH_H 13 | 14 | #endif // HAS_DISPATCH 15 | -------------------------------------------------------------------------------- /cpu_source/dfttest2_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef DFTTEST2_CPU_H 2 | #define DFTTEST2_CPU_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | 18 | static inline void vs_aligned_free_float(float * ptr) { 19 | vs_aligned_free(static_cast(ptr)); 20 | } 21 | 22 | 23 | struct DFTTestThreadData { 24 | uint8_t * padded; // shape: (pad_height, pad_width) 25 | float * padded2; // shape: (pad_height, pad_width) 26 | }; 27 | 28 | 29 | struct DFTTestData { 30 | VSNodeRef * node; 31 | int radius; 32 | int block_size; 33 | int block_step; 34 | std::array process; 35 | bool zero_mean; 36 | std::unique_ptr window { nullptr, &vs_aligned_free_float }; 37 | std::unique_ptr window_freq { nullptr, &vs_aligned_free_float }; 38 | std::unique_ptr sigma { nullptr, &vs_aligned_free_float }; 39 | int filter_type; 40 | float sigma2; 41 | float pmin; 42 | float pmax; 43 | 44 | std::atomic num_uninitialized_threads; 45 | std::unordered_map thread_data; 46 | std::shared_mutex thread_data_lock; 47 | }; 48 | 49 | #if defined HAS_DISPATCH 50 | #include 51 | #else // HAS_DISPATCH 52 | extern const VSFrameRef *VS_CC DFTTestGetFrame( 53 | int n, int activationReason, void **instanceData, void **frameData, 54 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 55 | ) noexcept; 56 | 57 | extern bool supported_arch() noexcept; 58 | 59 | extern const char * target_arch() noexcept; 60 | #endif // HAS_DISPATCH 61 | 62 | #endif // DFTTEST2_CPU_H 63 | -------------------------------------------------------------------------------- /cpu_source/getframe_impl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "dfttest2_cpu.h" 9 | #include "kernel.hpp" 10 | 11 | 12 | static inline int calc_pad_size(int size, int block_size, int block_step) { 13 | return ( 14 | size 15 | + ((size % block_size) ? 
block_size - size % block_size : 0) 16 | + std::max(block_size - block_step, block_step) * 2 17 | ); 18 | } 19 | 20 | 21 | static inline int calc_pad_num(int size, int block_size, int block_step) { 22 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 23 | } 24 | 25 | 26 | template 27 | static inline void reflection_padding_impl( 28 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 29 | const T * VS_RESTRICT src, // shape: (height, stride) 30 | int width, int height, int stride, 31 | int block_size, int block_step 32 | ) { 33 | 34 | int pad_width = calc_pad_size(width, block_size, block_step); 35 | int pad_height = calc_pad_size(height, block_size, block_step); 36 | 37 | int offset_y = (pad_height - height) / 2; 38 | int offset_x = (pad_width - width) / 2; 39 | 40 | vs_bitblt( 41 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 42 | src, stride * sizeof(T), 43 | width * sizeof(T), height 44 | ); 45 | 46 | // copy left and right regions 47 | for (int y = offset_y; y < offset_y + height; y++) { 48 | auto dst_line = &dst[y * pad_width]; 49 | 50 | for (int x = 0; x < offset_x; x++) { 51 | dst_line[x] = dst_line[offset_x * 2 - x]; 52 | } 53 | 54 | for (int x = offset_x + width; x < pad_width; x++) { 55 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 56 | } 57 | } 58 | 59 | // copy top region 60 | for (int y = 0; y < offset_y; y++) { 61 | std::memcpy( 62 | &dst[y * pad_width], 63 | &dst[(offset_y * 2 - y) * pad_width], 64 | pad_width * sizeof(T) 65 | ); 66 | } 67 | 68 | // copy bottom region 69 | for (int y = offset_y + height; y < pad_height; y++) { 70 | std::memcpy( 71 | &dst[y * pad_width], 72 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 73 | pad_width * sizeof(T) 74 | ); 75 | } 76 | } 77 | 78 | 79 | static inline void reflection_padding( 80 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 81 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 82 | int width, int height, int stride, 83 | int block_size, int block_step, 84 | int bytes_per_sample 85 | ) { 86 | 87 | if (bytes_per_sample == 1) { 88 | reflection_padding_impl( 89 | static_cast(dst), 90 | static_cast(src), 91 | width, height, stride, 92 | block_size, block_step 93 | ); 94 | } else if (bytes_per_sample == 2) { 95 | reflection_padding_impl( 96 | reinterpret_cast(dst), 97 | reinterpret_cast(src), 98 | width, height, stride, 99 | block_size, block_step 100 | ); 101 | } else if (bytes_per_sample == 4) { 102 | reflection_padding_impl( 103 | reinterpret_cast(dst), 104 | reinterpret_cast(src), 105 | width, height, stride, 106 | block_size, block_step 107 | ); 108 | } 109 | } 110 | 111 | 112 | static inline void load_block( 113 | Vec16f * VS_RESTRICT block, 114 | const uint8_t * VS_RESTRICT shifted_src, 115 | int radius, 116 | int block_size, 117 | int block_step, 118 | int width, 119 | int height, 120 | const Vec16f * VS_RESTRICT window, 121 | int bits_per_sample 122 | ) { 123 | 124 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 125 | if (bits_per_sample == 32) { 126 | scale = 255.0f; 127 | } 128 | 129 | int bytes_per_sample = (bits_per_sample + 7) / 8; 130 | 131 | assert(block_size == 16); 132 | block_size = 16; // unsafe 133 | 134 | int offset_x = calc_pad_size(width, block_size, block_step); 135 | int offset_y = calc_pad_size(height, block_size, block_step); 136 | 137 | if (bytes_per_sample == 1) { 138 | for (int i = 0; i < 2 * radius + 1; i++) { 139 | for (int j = 0; j < block_size; j++) { 140 | auto vec_input = 
Vec16uc().load((const uint8_t *) shifted_src + (i * offset_y + j) * offset_x); 141 | auto vec_input_f = to_float(Vec16i(extend(extend(vec_input)))); 142 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 143 | } 144 | } 145 | } 146 | if (bytes_per_sample == 2) { 147 | for (int i = 0; i < 2 * radius + 1; i++) { 148 | for (int j = 0; j < block_size; j++) { 149 | auto vec_input = Vec16us().load((const uint16_t *) shifted_src + (i * offset_y + j) * offset_x); 150 | auto vec_input_f = to_float(Vec16i(extend(vec_input))); 151 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 152 | } 153 | } 154 | } 155 | if (bytes_per_sample == 4) { 156 | for (int i = 0; i < 2 * radius + 1; i++) { 157 | for (int j = 0; j < block_size; j++) { 158 | auto vec_input_f = Vec16f().load((const float *) shifted_src + (i * offset_y + j) * offset_x); 159 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 160 | } 161 | } 162 | } 163 | } 164 | 165 | 166 | static inline void store_block( 167 | float * VS_RESTRICT shifted_dst, 168 | const Vec16f * VS_RESTRICT shifted_block, 169 | int block_size, 170 | int block_step, 171 | int width, 172 | const Vec16f * VS_RESTRICT shifted_window 173 | ) { 174 | 175 | assert(block_size == 16); 176 | block_size = 16; // unsafe 177 | 178 | for (int i = 0; i < block_size; i++) { 179 | Vec16f acc = Vec16f().load((const float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 180 | acc = mul_add(shifted_block[i], shifted_window[i], acc); 181 | acc.store((float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 182 | } 183 | } 184 | 185 | 186 | static inline void store_frame( 187 | uint8_t * VS_RESTRICT dst, 188 | const float * VS_RESTRICT shifted_src, 189 | int width, 190 | int height, 191 | int dst_stride, 192 | int src_stride, 193 | int bits_per_sample 194 | ) { 195 | 196 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 197 | if (bits_per_sample == 32) { 198 | scale = 255.0f; 199 | } 200 | 201 | int bytes_per_sample = (bits_per_sample + 7) / 8; 202 | int peak = (1 << bits_per_sample) - 1; 203 | 204 | if (bytes_per_sample == 1) { 205 | auto dstp = (uint8_t *) dst; 206 | for (int y = 0; y < height; y++) { 207 | for (int x = 0; x < width; x++) { 208 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 209 | dstp[y * dst_stride + x] = static_cast(clamped); 210 | } 211 | } 212 | } 213 | if (bytes_per_sample == 2) { 214 | auto dstp = (uint16_t *) dst; 215 | for (int y = 0; y < height; y++) { 216 | for (int x = 0; x < width; x++) { 217 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 218 | dstp[y * dst_stride + x] = static_cast(clamped); 219 | } 220 | } 221 | } 222 | if (bytes_per_sample == 4) { 223 | auto dstp = (float *) dst; 224 | for (int y = 0; y < height; y++) { 225 | for (int x = 0; x < width; x++) { 226 | dstp[y * dst_stride + x] = shifted_src[y * src_stride + x] / scale; 227 | } 228 | } 229 | } 230 | } 231 | 232 | 233 | const VSFrameRef * VS_CC 234 | #ifndef HAS_DISPATCH 235 | DFTTestGetFrame 236 | #else // HAS_DISPATCH 237 | DFTTEST_GETFRAME_NAME 238 | #endif // HAS_DISPATCH 239 | ( 240 | int n, int activationReason, void **instanceData, void **frameData, 241 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 242 | ) noexcept { 243 | 244 | auto d = static_cast(*instanceData); 245 | 246 | if (activationReason == arInitial) { 247 | int start = 
std::max(n - d->radius, 0); 248 | auto vi = vsapi->getVideoInfo(d->node); 249 | int end = std::min(n + d->radius, vi->numFrames - 1); 250 | for (int i = start; i <= end; i++) { 251 | vsapi->requestFrameFilter(i, d->node, frameCtx); 252 | } 253 | return nullptr; 254 | } else if (activationReason != arAllFramesReady) { 255 | return nullptr; 256 | } 257 | 258 | auto mxcsr = get_control_word(); 259 | no_subnormals(); 260 | 261 | auto vi = vsapi->getVideoInfo(d->node); 262 | 263 | DFTTestThreadData thread_data; 264 | 265 | auto thread_id = std::this_thread::get_id(); 266 | if (d->num_uninitialized_threads.load(std::memory_order_acquire) == 0) { 267 | const auto & const_data = d->thread_data; 268 | thread_data = const_data.at(thread_id); 269 | } else { 270 | bool initialized = true; 271 | 272 | d->thread_data_lock.lock_shared(); 273 | try { 274 | const auto & const_data = d->thread_data; 275 | thread_data = const_data.at(thread_id); 276 | } catch (const std::out_of_range &) { 277 | initialized = false; 278 | } 279 | d->thread_data_lock.unlock_shared(); 280 | 281 | if (!initialized) { 282 | auto padded_size = ( 283 | (2 * d->radius + 1) * 284 | calc_pad_size(vi->height, d->block_size, d->block_step) * 285 | calc_pad_size(vi->width, d->block_size, d->block_step) * 286 | vi->format->bytesPerSample 287 | ); 288 | 289 | thread_data.padded = static_cast(std::malloc(padded_size)); 290 | thread_data.padded2 = static_cast(std::malloc( 291 | calc_pad_size(vi->height, d->block_size, d->block_step) * 292 | calc_pad_size(vi->width, d->block_size, d->block_step) * 293 | sizeof(float) 294 | )); 295 | 296 | { 297 | std::lock_guard _ { d->thread_data_lock }; 298 | d->thread_data.emplace(thread_id, thread_data); 299 | } 300 | 301 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order_release); 302 | } 303 | } 304 | 305 | std::vectorfreeFrame)>> src_frames; 306 | src_frames.reserve(2 * d->radius + 1); 307 | for (int i = n - d->radius; i <= n + d->radius; i++) { 308 | src_frames.emplace_back( 309 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 310 | vsapi->freeFrame 311 | ); 312 | } 313 | 314 | auto & src_center_frame = src_frames[d->radius]; 315 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 316 | 317 | const VSFrameRef * fr[] { 318 | d->process[0] ? nullptr : src_center_frame.get(), 319 | d->process[1] ? nullptr : src_center_frame.get(), 320 | d->process[2] ? 
nullptr : src_center_frame.get() 321 | }; 322 | const int pl[] { 0, 1, 2 }; 323 | std::unique_ptrfreeFrame)> dst_frame { 324 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 325 | vsapi->freeFrame 326 | }; 327 | 328 | for (int plane = 0; plane < format->numPlanes; plane++) { 329 | if (!d->process[plane]) { 330 | continue; 331 | } 332 | 333 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 334 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 335 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 336 | 337 | int padded_size_spatial = ( 338 | calc_pad_size(height, d->block_size, d->block_step) * 339 | calc_pad_size(width, d->block_size, d->block_step) 340 | ); 341 | 342 | std::memset(thread_data.padded2, 0, 343 | calc_pad_size(height, d->block_size, d->block_step) * 344 | calc_pad_size(width, d->block_size, d->block_step) * 345 | sizeof(float) 346 | ); 347 | 348 | for (int i = 0; i < 2 * d->radius + 1; i++) { 349 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 350 | reflection_padding( 351 | &thread_data.padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 352 | srcp, 353 | width, height, stride, 354 | d->block_size, d->block_step, 355 | vi->format->bytesPerSample 356 | ); 357 | } 358 | 359 | for (int i = 0; i < calc_pad_num(height, d->block_size, d->block_step); i++) { 360 | for (int j = 0; j < calc_pad_num(width, d->block_size, d->block_step); j++) { 361 | assert(d->block_size == 16); 362 | constexpr int block_size = 16; 363 | 364 | Vec16f block[7 * block_size * 2]; 365 | 366 | int offset_x = calc_pad_size(width, d->block_size, d->block_step); 367 | 368 | load_block( 369 | block, 370 | &thread_data.padded[(i * offset_x + j) * d->block_step * vi->format->bytesPerSample], 371 | d->radius, d->block_size, d->block_step, 372 | width, height, 373 | reinterpret_cast(d->window.get()), 374 | vi->format->bitsPerSample 375 | ); 376 | 377 | fused( 378 | block, 379 | reinterpret_cast(d->sigma.get()), 380 | d->sigma2, 381 | d->pmin, 382 | d->pmax, 383 | d->filter_type, 384 | d->zero_mean, 385 | reinterpret_cast(d->window_freq.get()), 386 | d->radius 387 | ); 388 | 389 | store_block( 390 | &thread_data.padded2[(i * offset_x + j) * d->block_step], 391 | &block[d->radius * block_size * 2], 392 | block_size, 393 | d->block_step, 394 | width, 395 | reinterpret_cast(&d->window[d->radius * block_size * 2 * 16]) 396 | ); 397 | } 398 | } 399 | 400 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 401 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 402 | int offset_y = (pad_height - height) / 2; 403 | int offset_x = (pad_width - width) / 2; 404 | 405 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 406 | store_frame( 407 | dstp, 408 | &thread_data.padded2[(offset_y * pad_width + offset_x)], 409 | width, 410 | height, 411 | stride, 412 | pad_width, 413 | vi->format->bitsPerSample 414 | ); 415 | } 416 | 417 | set_control_word(mxcsr); 418 | 419 | return dst_frame.release(); 420 | } 421 | 422 | 423 | #ifndef HAS_DISPATCH 424 | bool supported_arch() noexcept { 425 | #else // HAS_DISPATCH 426 | bool SUPPORTED_ARCH_NAME() noexcept { 427 | #endif // HAS_DISPATCH 428 | 429 | return instrset_detect() >= INSTRSET; 430 | } 431 | 432 | #ifndef HAS_DISPATCH 433 | const char * target_arch() noexcept { 434 | #if 0 <= INSTRSET && INSTRSET <= 10 435 | constexpr std::array dispatch_targets { 436 | "80386", "sse", "sse2", "sse3", 
"sse4.1", "sse4.2", 437 | "avx", "avx2", "avx512f", "avx512bw/dq/vl" 438 | }; 439 | return dispatch_targets[INSTRSET]; 440 | #else // 0 <= INSTRSET && INSTRSET <= 10 441 | return "unknown"; 442 | #endif // 0 <= INSTRSET && INSTRSET <= 10 443 | } 444 | #endif // HAS_DISPATCH 445 | -------------------------------------------------------------------------------- /cpu_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #if __cpp_lib_math_constants 12 | #include 13 | #endif // __cpp_lib_math_constants 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #include "dfttest2_cpu.h" 24 | #include "kernel.hpp" 25 | 26 | #include // generated by cmake, defines "VERSION" 27 | 28 | 29 | template 30 | #if __cpp_concepts 31 | requires 32 | (std::is_same_v || std::is_same_v>) 33 | #endif // __cpp_concepts 34 | static void dft( 35 | std::complex * VS_RESTRICT dst, 36 | const T_in * VS_RESTRICT src, 37 | int n, 38 | int stride 39 | ) { 40 | #if __cpp_lib_math_constants 41 | const auto pi = std::numbers::pi_v; 42 | #else // __cpp_lib_math_constants 43 | const auto pi = static_cast(M_PI); 44 | #endif // __cpp_lib_math_constants 45 | 46 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 47 | for (int i = 0; i < out_num; i++) { 48 | std::complex sum {}; 49 | for (int j = 0; j < n; j++) { 50 | auto imag = -2 * i * j * pi / n; 51 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 52 | sum += src[j * stride] * weight; 53 | } 54 | dst[i * stride] = sum; 55 | } 56 | } 57 | 58 | 59 | static void VS_CC DFTTestInit( 60 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 61 | VSCore *core, const VSAPI *vsapi 62 | ) noexcept { 63 | 64 | auto d = static_cast(*instanceData); 65 | 66 | auto vi = vsapi->getVideoInfo(d->node); 67 | vsapi->setVideoInfo(vi, 1, node); 68 | } 69 | 70 | 71 | static void VS_CC DFTTestFree( 72 | void *instanceData, VSCore *core, const VSAPI *vsapi 73 | ) noexcept { 74 | 75 | auto d = static_cast(instanceData); 76 | 77 | vsapi->freeNode(d->node); 78 | 79 | for (const auto & [_, thread_data] : d->thread_data) { 80 | std::free(thread_data.padded2); 81 | std::free(thread_data.padded); 82 | } 83 | 84 | delete d; 85 | } 86 | 87 | 88 | static void VS_CC DFTTestCreate( 89 | const VSMap *in, VSMap *out, void *userData, 90 | VSCore *core, const VSAPI *vsapi 91 | ) noexcept { 92 | 93 | auto d = std::make_unique(); 94 | 95 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 96 | 97 | auto set_error = [vsapi, out, &d](const char * error_message) -> void { 98 | vsapi->freeNode(d->node); 99 | vsapi->setError(out, error_message); 100 | return ; 101 | }; 102 | 103 | auto vi = vsapi->getVideoInfo(d->node); 104 | if (!isConstantFormat(vi)) { 105 | return set_error("only constant format input is supported"); 106 | } 107 | if (vi->format->sampleType == stInteger && vi->format->bytesPerSample > 2) { 108 | return set_error("only 8-16 bit integer format input is supported"); 109 | } 110 | if (vi->format->sampleType == stFloat && vi->format->bitsPerSample != 32) { 111 | return set_error("only 32-bit float format input is supported"); 112 | } 113 | 114 | int error; 115 | 116 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 117 | if (error) { 118 | d->radius = 0; 119 | } 120 | 121 | if (d->radius < 0 || d->radius > 3) { 122 | return 
set_error("\"radius\" must be in [0, 1, 2, 3]"); 123 | } 124 | 125 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 126 | if (error) { 127 | d->block_size = 16; 128 | } 129 | 130 | if (d->block_size != 16) { 131 | return set_error("\"block_size\" must be 16"); 132 | } 133 | 134 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 135 | if (error) { 136 | d->block_step = d->block_size; 137 | } 138 | 139 | int num_planes_args = vsapi->propNumElements(in, "planes"); 140 | d->process.fill(num_planes_args <= 0); 141 | for (int i = 0; i < num_planes_args; ++i) { 142 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 143 | 144 | if (plane < 0 || plane >= vi->format->numPlanes) { 145 | return set_error("plane index out of range"); 146 | } 147 | 148 | if (d->process[plane]) { 149 | return set_error("plane specified twice"); 150 | } 151 | 152 | d->process[plane] = true; 153 | } 154 | 155 | { 156 | auto ptr = vs_aligned_malloc( 157 | (2 * d->radius + 1) * d->block_size * d->block_size * sizeof(float), 158 | 64 159 | ); 160 | if (ptr == nullptr) { 161 | return set_error("alloc error"); 162 | } 163 | d->window.reset(ptr); 164 | } 165 | { 166 | auto window = vsapi->propGetFloatArray(in, "window", nullptr); 167 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size * d->block_size / 16; i++) { 168 | to_float(Vec8d().load(&window[i * 16])).store_a(&d->window[i * 16]); 169 | to_float(Vec8d().load(&window[i * 16 + 8])).store_a(&d->window[i * 16 + 8]); 170 | } 171 | } 172 | 173 | { 174 | auto ptr = vs_aligned_malloc( 175 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * sizeof(float), 176 | 64 177 | ); 178 | if (ptr == nullptr) { 179 | return set_error("alloc error"); 180 | } 181 | d->sigma.reset(ptr); 182 | } 183 | { 184 | auto sigma = vsapi->propGetFloatArray(in, "sigma", nullptr); 185 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 186 | float sigma_padded[16] {}; 187 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 188 | sigma_padded[j] = static_cast(sigma[i * (d->block_size / 2 + 1) + j]); 189 | } 190 | Vec16f().load(&sigma_padded[0]).store_a(&d->sigma[i * 16]); 191 | } 192 | } 193 | 194 | d->sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, nullptr)); 195 | d->pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, nullptr)); 196 | d->pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, nullptr)); 197 | 198 | d->filter_type = static_cast(vsapi->propGetInt(in, "filter_type", 0, nullptr)); 199 | 200 | d->zero_mean = !!vsapi->propGetInt(in, "zero_mean", 0, &error); 201 | if (error) { 202 | d->zero_mean = true; 203 | } 204 | if (d->zero_mean) { 205 | { 206 | auto ptr = vs_aligned_malloc( 207 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * 2 * sizeof(float), 208 | 64 209 | ); 210 | if (ptr == nullptr) { 211 | return set_error("alloc error"); 212 | } 213 | d->window_freq.reset(ptr); 214 | } 215 | auto window_freq = vsapi->propGetFloatArray(in, "window_freq", nullptr); 216 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 217 | float sigma_padded[32] {}; 218 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 219 | sigma_padded[j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2]); 220 | sigma_padded[16 + j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2 + 1]); 221 | } 222 | Vec16f().load(&sigma_padded[0]).store_a(&d->window_freq[i * 2 * 16]); 223 | Vec16f().load(&sigma_padded[16]).store_a(&d->window_freq[(i * 2 
+ 1) * 16]); 224 | } 225 | } 226 | 227 | VSCoreInfo info; 228 | vsapi->getCoreInfo2(core, &info); 229 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order::relaxed); 230 | d->thread_data.reserve(info.numThreads); 231 | 232 | #ifndef HAS_DISPATCH 233 | if (!supported_arch()) { 234 | return set_error("unsupported cpu architecture"); 235 | } 236 | 237 | vsapi->createFilter( 238 | in, out, "DFTTest", 239 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 240 | fmParallel, 0, d.release(), core 241 | ); 242 | #else 243 | auto opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &error)); 244 | if (error) { 245 | opt = 0; 246 | } 247 | 248 | constexpr std::array getframe_candidates { GETFRAME_PTRS }; 249 | 250 | if (opt == 0) { 251 | constexpr std::array supported_arch_candidates { SUPPORTED_ARCH_PTRS }; 252 | 253 | bool found_supported_impl = false; 254 | 255 | for (int i = static_cast(getframe_candidates.size()) - 1; i >= 0; i--) { 256 | if (supported_arch_candidates[i]()) { 257 | vsapi->createFilter( 258 | in, out, "DFTTest", 259 | DFTTestInit, getframe_candidates[i], DFTTestFree, 260 | fmParallel, 0, d.release(), core 261 | ); 262 | 263 | found_supported_impl = true; 264 | break; 265 | } 266 | } 267 | 268 | if (!found_supported_impl) { 269 | return set_error("unsupported cpu architecture"); 270 | } 271 | } else { 272 | if (0 < opt && opt < static_cast(getframe_candidates.size() + 1)) { 273 | vsapi->createFilter( 274 | in, out, "DFTTest", 275 | DFTTestInit, getframe_candidates[opt - 1], DFTTestFree, 276 | fmParallel, 0, d.release(), core 277 | ); 278 | } else { 279 | return set_error("invalid \"opt\""); 280 | } 281 | } 282 | #endif // HAS_DISPATCH 283 | } 284 | 285 | 286 | static void VS_CC RDFT( 287 | const VSMap *in, VSMap *out, void *userData, 288 | VSCore *core, const VSAPI *vsapi 289 | ) noexcept { 290 | 291 | auto set_error = [vsapi, out](const char * error_message) -> void { 292 | vsapi->setError(out, error_message); 293 | }; 294 | 295 | int ndim = vsapi->propNumElements(in, "shape"); 296 | if (ndim != 1 && ndim != 2 && ndim != 3) { 297 | return set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 298 | } 299 | 300 | std::array shape {}; 301 | { 302 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 303 | for (int i = 0; i < ndim; i++) { 304 | shape[i] = int64ToIntS(shape_array[i]); 305 | } 306 | } 307 | 308 | int size = 1; 309 | for (int i = 0; i < ndim; i++) { 310 | size *= shape[i]; 311 | } 312 | if (vsapi->propNumElements(in, "data") != size) { 313 | return set_error("cannot reshape array"); 314 | } 315 | 316 | int complex_size = shape[ndim - 1] / 2 + 1; 317 | for (int i = 0; i < ndim - 1; i++) { 318 | complex_size *= shape[i]; 319 | } 320 | 321 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 322 | 323 | auto output = std::make_unique []>(complex_size); 324 | 325 | if (ndim == 1) { 326 | dft(output.get(), input, size, 1); 327 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 328 | } else if (ndim == 2) { 329 | for (int i = 0; i < shape[0]; i++) { 330 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 331 | } 332 | 333 | auto output2 = std::make_unique []>(complex_size); 334 | 335 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 336 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 337 | } 338 | 339 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 340 | } else { 341 | for (int i = 0; i < shape[0] * shape[1]; i++) { 
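// first pass of the separable 3-D RDFT: real-to-complex 1-D transform along the innermost
// axis for each of the shape[0] * shape[1] rows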
342 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 343 | } 344 | 345 | auto output2 = std::make_unique []>(complex_size); 346 | 347 | for (int i = 0; i < shape[0]; i++) { 348 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 349 | dft( 350 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 351 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 352 | shape[1], 353 | (shape[2] / 2 + 1) 354 | ); 355 | } 356 | } 357 | 358 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 359 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 360 | } 361 | 362 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 363 | } 364 | } 365 | 366 | 367 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 368 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 369 | 370 | #ifdef HAS_DISPATCH 371 | constexpr std::array dispatch_targets { "auto", SUPPORTED_ARCH_STRS }; 372 | 373 | for (int i = 0; i < static_cast(dispatch_targets.size()); i++) { 374 | vsapi->propSetData(out, "dispatch_targets", dispatch_targets[i], -1, paAppend); 375 | } 376 | #else // HAS_DISPATCH 377 | vsapi->propSetData(out, "dispatch_targets", target_arch(), -1, paReplace); 378 | #endif // HAS_DISPATCH 379 | } 380 | 381 | 382 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 383 | VSConfigPlugin configFunc, 384 | VSRegisterFunction registerFunc, 385 | VSPlugin *plugin 386 | ) { 387 | 388 | configFunc( 389 | "io.github.amusementclub.dfttest2_cpu", 390 | "dfttest2_cpu", 391 | "DFTTest2 (CPU)", 392 | VAPOURSYNTH_API_VERSION, 1, plugin 393 | ); 394 | 395 | registerFunc( 396 | "DFTTest", 397 | "clip:clip;" 398 | "window:float[];" 399 | "sigma:float[];" 400 | "sigma2:float;" 401 | "pmin:float;" 402 | "pmax:float;" 403 | "filter_type:int;" 404 | "radius:int:opt;" 405 | "block_size:int:opt;" 406 | "block_step:int:opt;" 407 | "zero_mean:int:opt;" 408 | "window_freq:float[]:opt;" 409 | "planes:int[]:opt;" 410 | "opt:int:opt;", 411 | DFTTestCreate, nullptr, plugin 412 | ); 413 | 414 | registerFunc( 415 | "RDFT", 416 | "data:float[];" 417 | "shape:int[];", 418 | RDFT, nullptr, plugin 419 | ); 420 | 421 | registerFunc( 422 | "Version", 423 | "", 424 | Version, nullptr, plugin 425 | ); 426 | } 427 | -------------------------------------------------------------------------------- /cuda_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | #ifndef __CUDACC_RTC__ 6 | #include 7 | #endif // __CUDACC_RTC__ 8 | 9 | __device__ 10 | extern void filter(float2 & value, int x, int y, int z); 11 | 12 | // ZERO_MEAN 13 | // RADIUS 14 | // BLOCK_SIZE 15 | // BLOCK_STEP 16 | // IN_PLACE 17 | // WARPS_PER_BLOCK 18 | // WARP_SIZE 19 | // TYPE 20 | // SCALE 21 | // PEAK (optional) 22 | 23 | #if ZERO_MEAN 24 | // __device__ const float window_freq[]; // frequency response of the window 25 | #endif // ZERO_MEAN 26 | 27 | __device__ 28 | static int calc_pad_size(int size, int block_size, int block_step) { 29 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 30 | } 31 | 32 | __device__ 33 | static int calc_pad_num(int size, int block_size, int block_step) { 34 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 35 | } 36 | 37 | __device__ 38 | static float to_float(TYPE x) { 39 | return static_cast(x) * static_cast(SCALE); 40 | } 41 | 42 | __device__ 43 | static TYPE from_float(float x) { 44 | #ifdef PEAK 45 | x /= static_cast(SCALE); 46 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 47 | return static_cast(__float2int_rz(x)); 48 | #else // PEAK // only integral types define it 49 | return static_cast(x / static_cast(SCALE)); 50 | #endif // PEAK 51 | } 52 | 53 | extern "C" 54 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 55 | __global__ 56 | void im2col( 57 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 58 | float * __restrict__ dstp, 59 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 60 | int width, 61 | int height 62 | ) { 63 | 64 | int radius = static_cast(RADIUS); 65 | int block_size = static_cast(BLOCK_SIZE); 66 | int padded_block_size = IN_PLACE ? (block_size / 2 + 1) * 2 : block_size; 67 | int block_step = static_cast(BLOCK_STEP); 68 | 69 | int horizontal_num = calc_pad_num(width, block_size, block_step); 70 | int vertical_num = calc_pad_num(height, block_size, block_step); 71 | int horizontal_size = calc_pad_size(width, block_size, block_step); 72 | int vertical_size = calc_pad_size(height, block_size, block_step); 73 | int num_blocks = vertical_num * horizontal_num; 74 | 75 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 76 | int ix = i % horizontal_num; 77 | int iy = i / horizontal_num; 78 | auto dst = &dstp[i * (2 * radius + 1) * block_size * padded_block_size]; 79 | for (int j = 0; j < 2 * radius + 1; j++) { 80 | auto src = &srcp[(j * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 81 | for (int k = threadIdx.x % WARP_SIZE; k < block_size * block_size; k += WARP_SIZE) { 82 | int kx = k % block_size; 83 | int ky = k / block_size; 84 | float val = to_float(src[ky * horizontal_size + kx]) * window[j * block_size * block_size + k]; 85 | #if IN_PLACE == 1 86 | dst[(j * block_size + k / block_size) * padded_block_size + k % block_size] = val; 87 | #else 88 | dst[j * block_size * block_size + k] = val; 89 | #endif 90 | } 91 | } 92 | } 93 | } 94 | 95 | extern "C" 96 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 97 | __global__ 98 | void frequency_filtering( 99 | float2 * data, 100 | int num_blocks 101 | ) { 102 | 103 | int radius = static_cast(RADIUS); 104 | int block_size_1d = static_cast(BLOCK_SIZE); 105 | 106 | // each warp is responsible for a single block 107 | // assume that blockDim.x % WARP_SIZE == 0 108 | 109 | int block_size_x = block_size_1d / 2 + 1; 110 | int block_size_2d = block_size_1d * block_size_x; 111 | int block_size_3d = (2 * radius + 1) * block_size_2d; 112 | 113 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 114 | #if ZERO_MEAN 115 | float gf; 116 | if (threadIdx.x % WARP_SIZE == 0) { 117 | gf = data[i * block_size_3d].x / window_freq[0]; 118 | } 119 | gf = __shfl_sync(0xFFFFFFFF, gf, 0); 120 | #endif // ZERO_MEAN 121 | 122 | for (int j = threadIdx.x % WARP_SIZE; j < block_size_3d; j += WARP_SIZE) { 123 | float2 local_data = data[i * 
block_size_3d + j]; 124 | 125 | #if ZERO_MEAN 126 | // remove mean 127 | float val1 = gf * window_freq[j * 2]; 128 | float val2 = gf * window_freq[j * 2 + 1]; 129 | local_data.x -= val1; 130 | local_data.y -= val2; 131 | #endif // ZERO_MEAN 132 | 133 | filter( 134 | local_data, 135 | j % block_size_x, 136 | (j % block_size_2d) / block_size_x, 137 | (j % block_size_3d) / block_size_2d 138 | ); 139 | 140 | #if ZERO_MEAN 141 | // add mean 142 | local_data.x += val1; 143 | local_data.y += val2; 144 | #endif // ZERO_MEAN 145 | 146 | data[i * block_size_3d + j] = local_data; 147 | } 148 | } 149 | } 150 | 151 | extern "C" 152 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 153 | __global__ 154 | void col2im( 155 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 156 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 157 | const float * __restrict__ src, 158 | int width, 159 | int height 160 | ) { 161 | 162 | int radius = static_cast(RADIUS); 163 | int block_size = static_cast(BLOCK_SIZE); 164 | int padded_block_size = IN_PLACE ? (block_size / 2 + 1) * 2 : block_size; 165 | int block_step = static_cast(BLOCK_STEP); 166 | 167 | // each thread is responsible for a single pixel 168 | int horizontal_size = calc_pad_size(width, block_size, block_step); 169 | int horizontal_num = calc_pad_num(width, block_size, block_step); 170 | int vertical_size = calc_pad_size(height, block_size, block_step); 171 | int vertical_num = calc_pad_num(height, block_size, block_step); 172 | int pad_x = (horizontal_size - width) / 2; 173 | int pad_y = (vertical_size - height) / 2; 174 | 175 | int x = blockIdx.x * blockDim.x + threadIdx.x; 176 | int y = blockIdx.y * blockDim.y + threadIdx.y; 177 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 178 | return ; 179 | } 180 | 181 | float sum {}; 182 | 183 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 184 | int i2 = min(y / block_step, vertical_num - 1); 185 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 186 | int j2 = min(x / block_step, horizontal_num - 1); 187 | 188 | for (int i = i1; i <= i2; i++) { 189 | int offset_y = y - i * block_step; 190 | for (int j = j1; j <= j2; j++) { 191 | int offset_x = x - j * block_step; 192 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * padded_block_size + offset_x; 193 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 194 | sum += src[src_offset] * window[window_offset]; 195 | } 196 | } 197 | 198 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 199 | } 200 | )"""; 201 | 202 | #endif // KERNEL_HPP 203 | -------------------------------------------------------------------------------- /cuda_source/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DLL_DIR L"vsmlrt-cuda" 10 | 11 | #include 12 | 13 | namespace { 14 | namespace fs = std::filesystem; 15 | static fs::path dllDir() { 16 | static const std::wstring res = []() -> std::wstring { 17 | HMODULE mod = 0; 18 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 19 | std::vector buf; 20 | size_t n = 0; 21 | do { 22 | buf.resize(buf.size() + MAX_PATH); 23 | n = 
GetModuleFileNameW(mod, buf.data(), buf.size()); 24 | } while (n >= buf.size()); 25 | buf.resize(n); 26 | std::wstring path(buf.begin(), buf.end()); 27 | return path; 28 | } 29 | throw std::runtime_error("unable to locate myself"); 30 | }(); 31 | return fs::path(res).parent_path(); 32 | } 33 | 34 | FARPROC loadDLLs(std::string dll) { 35 | fs::path p = dllDir() / DLL_DIR / dll; 36 | std::wstring s = p; 37 | HMODULE h = nullptr; 38 | h = LoadLibraryW(s.c_str()); 39 | if (getenv("VS_DFTTEST2_VERBOSE")) 40 | std::wcerr << L"vs-dfttest2: preloading " << p << L": " << h << std::endl; 41 | if (!h) { 42 | std::wcerr << L"vs-dfttest2: failed to preload " << p << std::endl; 43 | h = LoadLibraryA(dll.c_str()); 44 | } 45 | if (!h) 46 | std::cerr << "vs-dfttest2: failed to preload " << dll << std::endl; 47 | return (FARPROC)h; 48 | } 49 | 50 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 51 | switch (reason) { 52 | case dliNoteStartProcessing: 53 | case dliNoteEndProcessing: 54 | // Nothing to do here. 55 | break; 56 | case dliNotePreLoadLibrary: { 57 | //std::cerr << "loading " << info->szDll << std::endl; 58 | std::string dll {info->szDll}; 59 | if (dll.find("cufft64") != std::string::npos) 60 | return loadDLLs(dll); 61 | break; 62 | } 63 | case dliNotePreGetProcAddress: 64 | // Nothing to do here. 65 | break; 66 | case dliFailLoadLib: 67 | case dliFailGetProc: 68 | // Returning NULL from error notifications will cause the delay load 69 | // runtime to raise a VcppException structured exception, that some code 70 | // might want to handle. 71 | return NULL; 72 | break; 73 | default: 74 | abort(); // unreachable. 75 | break; 76 | } 77 | // Returning NULL causes the delay load machinery to perform default 78 | // processing for this notification. 
79 | return NULL; 80 | } 81 | } // namespace 82 | 83 | extern "C" { 84 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 85 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 86 | }; 87 | #endif 88 | -------------------------------------------------------------------------------- /dfttest2.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.0" 2 | 3 | from dataclasses import dataclass 4 | import math 5 | from string import Template 6 | import typing 7 | 8 | import vapoursynth as vs 9 | from vapoursynth import core 10 | 11 | 12 | __all__ = ["DFTTest", "DFTTest2", "Backend"] 13 | 14 | 15 | class Backend: 16 | @dataclass(frozen=False) 17 | class cuFFT: 18 | device_id: int = 0 19 | in_place: bool = True 20 | 21 | @dataclass(frozen=False) 22 | class NVRTC: 23 | device_id: int = 0 24 | num_streams: int = 1 25 | 26 | @dataclass(frozen=False) 27 | class CPU: 28 | opt: int = 0 29 | 30 | @dataclass(frozen=False) 31 | class GCC: 32 | pass 33 | 34 | @dataclass(frozen=False) 35 | class hipFFT: 36 | device_id: int = 0 37 | in_place: bool = True 38 | 39 | @dataclass(frozen=False) 40 | class HIPRTC: 41 | device_id: int = 0 42 | num_streams: int = 1 43 | 44 | backendT = typing.Union[Backend.cuFFT, Backend.NVRTC, Backend.CPU, Backend.GCC, Backend.hipFFT, Backend.HIPRTC] 45 | 46 | 47 | def init_backend(backend: backendT) -> backendT: 48 | if backend is Backend.cuFFT: # type: ignore 49 | backend = Backend.cuFFT() 50 | elif backend is Backend.NVRTC: # type: ignore 51 | backend = Backend.NVRTC() 52 | elif backend is Backend.CPU: # type: ignore 53 | backend = Backend.CPU() 54 | elif backend is Backend.GCC: # type: ignore 55 | backend = Backend.GCC() 56 | elif backend is Backend.hipFFT: # type: ignore 57 | backend = Backend.hipFFT() 58 | elif backend is Backend.HIPRTC: # type: ignore 59 | backend = Backend.HIPRTC() 60 | return backend 61 | 62 | 63 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 64 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L518 65 | def normalize( 66 | window: typing.Sequence[float], 67 | size: int, 68 | step: int 69 | ) -> typing.List[float]: 70 | 71 | nw = [0.0] * size 72 | for q in range(size): 73 | for h in range(q, -1, -step): 74 | nw[q] += window[h] ** 2 75 | for h in range(q + step, size, step): 76 | nw[q] += window[h] ** 2 77 | return [window[q] / math.sqrt(nw[q]) for q in range(size)] 78 | 79 | 80 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 81 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L462 82 | def get_window_value(location: float, size: int, mode: int, beta: float) -> float: 83 | temp = math.pi * location / size 84 | if mode == 0: # hanning 85 | return 0.5 * (1 - math.cos(2 * temp)) 86 | elif mode == 1: # hamming 87 | return 0.53836 - 0.46164 * math.cos(2 * temp) 88 | elif mode == 2: # blackman 89 | return 0.42 - 0.5 * math.cos(2 * temp) + 0.08 * math.cos(4 * temp) 90 | elif mode == 3: # 4 term blackman-harris 91 | return ( 92 | 0.35875 93 | - 0.48829 * math.cos(2 * temp) 94 | + 0.14128 * math.cos(4 * temp) 95 | - 0.01168 * math.cos(6 * temp) 96 | ) 97 | elif mode == 4: # kaiser-bessel 98 | def i0(p: float) -> float: 99 | p /= 2 100 | n = t = d = 1.0 101 | k = 1 102 | while True: 103 | n *= p 104 | d *= k 105 | v = n / d 106 | t += v * v 107 | k += 1 108 | if k >= 15 or v <= 1e-8: 109 | break 110 | return t 111 | v = 2 * location / size - 1 112 | return i0(math.pi * beta * math.sqrt(1 - v * v)) / i0(math.pi * beta) 
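    # Note on the branch above: i0() evaluates the zeroth-order modified Bessel
    # function I0 with a truncated power series (at most 14 terms past the
    # leading 1, stopping earlier once the term factor drops below 1e-8), so the
    # Kaiser-Bessel window value is I0(pi * beta * sqrt(1 - v^2)) / I0(pi * beta)
    # with v = 2 * location / size - 1.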
113 | elif mode == 5: # 7 term blackman-harris 114 | return ( 115 | 0.27105140069342415 116 | - 0.433297939234486060 * math.cos(2 * temp) 117 | + 0.218122999543110620 * math.cos(4 * temp) 118 | - 0.065925446388030898 * math.cos(6 * temp) 119 | + 0.010811742098372268 * math.cos(8 * temp) 120 | - 7.7658482522509342e-4 * math.cos(10 * temp) 121 | + 1.3887217350903198e-5 * math.cos(12 * temp) 122 | ) 123 | elif mode == 6: # flat top 124 | return ( 125 | 0.2810639 126 | - 0.5208972 * math.cos(2 * temp) 127 | + 0.1980399 * math.cos(4 * temp) 128 | ) 129 | elif mode == 7: # rectangular 130 | return 1.0 131 | elif mode == 8: # Bartlett 132 | return 1 - 2 * abs(location - size / 2) / size 133 | elif mode == 9: # bartlett-hann 134 | return 0.62 - 0.48 * (location / size - 0.5) - 0.38 * math.cos(2 * temp) 135 | elif mode == 10: # nuttall 136 | return ( 137 | 0.355768 138 | - 0.487396 * math.cos(2 * temp) 139 | + 0.144232 * math.cos(4 * temp) 140 | - 0.012604 * math.cos(6 * temp) 141 | ) 142 | elif mode == 11: # blackman-nuttall 143 | return ( 144 | 0.3635819 145 | - 0.4891775 * math.cos(2 * temp) 146 | + 0.1365995 * math.cos(4 * temp) 147 | - 0.0106411 * math.cos(6 * temp) 148 | ) 149 | else: 150 | raise ValueError("unknown window") 151 | 152 | 153 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 154 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L461 155 | def get_window( 156 | radius: int, 157 | block_size: int, 158 | block_step: int, 159 | spatial_window_mode: int, 160 | spatial_beta: float, 161 | temporal_window_mode: int, 162 | temporal_beta: float 163 | ) -> typing.List[float]: 164 | 165 | temporal_window = [ 166 | get_window_value( 167 | location = i + 0.5, 168 | size = 2 * radius + 1, 169 | mode = temporal_window_mode, 170 | beta = temporal_beta 171 | ) for i in range(2 * radius + 1) 172 | ] 173 | 174 | spatial_window = [ 175 | get_window_value( 176 | location = i + 0.5, 177 | size = block_size, 178 | mode = spatial_window_mode, 179 | beta = spatial_beta 180 | ) for i in range(block_size) 181 | ] 182 | 183 | spatial_window = normalize( 184 | window=spatial_window, 185 | size=block_size, 186 | step=block_step 187 | ) 188 | 189 | window = [] 190 | for t_val in temporal_window: 191 | for s_val1 in spatial_window: 192 | for s_val2 in spatial_window: 193 | value = t_val * s_val1 * s_val2 194 | 195 | # normalize for unnormalized FFT implementation 196 | value /= math.sqrt(2 * radius + 1) * block_size 197 | 198 | window.append(value) 199 | 200 | return window 201 | 202 | 203 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 204 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L581 205 | def get_location( 206 | position: float, 207 | length: int 208 | ) -> float: 209 | 210 | if length == 1: 211 | return 0.0 212 | elif position > length // 2: 213 | return (length - position) / (length // 2) 214 | else: 215 | return position / (length // 2) 216 | 217 | 218 | # https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DFTTest/blob/ 219 | # bc5e0186a7f309556f20a8e9502f2238e39179b8/DFTTest/DFTTest.cpp#L581 220 | def get_sigma( 221 | position: float, 222 | length: int, 223 | func: typing.Callable[[float], float] 224 | ) -> float: 225 | 226 | if length == 1: 227 | return 1.0 228 | else: 229 | return func(get_location(position, length)) 230 | 231 | 232 | def DFTTest2( 233 | clip: vs.VideoNode, 234 | ftype: typing.Literal[0, 1, 2, 3, 4] = 0, 235 | sigma: typing.Union[float, typing.Sequence[typing.Callable[[float], float]]] 
= 8.0, 236 | sigma2: float = 8.0, 237 | pmin: float = 0.0, 238 | pmax: float = 500.0, 239 | sbsize: int = 16, 240 | sosize: int = 12, 241 | tbsize: int = 3, 242 | swin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 0, 243 | twin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 7, 244 | sbeta: float = 2.5, 245 | tbeta: float = 2.5, 246 | zmean: bool = True, 247 | f0beta: float = 1.0, 248 | ssystem: typing.Literal[0, 1] = 0, 249 | planes: typing.Optional[typing.Union[int, typing.Sequence[int]]] = None, 250 | backend: backendT = Backend.cuFFT() 251 | ) -> vs.VideoNode: 252 | """ this interface is not stable """ 253 | 254 | # translate parameters 255 | if ftype == 0: 256 | if abs(f0beta - 1) < 0.00005: 257 | filter_type = 0 258 | elif abs(f0beta - 0.5) < 0.0005: 259 | filter_type = 6 260 | else: 261 | filter_type = 5 262 | else: 263 | filter_type = ftype 264 | 265 | radius = (tbsize - 1) // 2 266 | block_size = sbsize 267 | block_step = sbsize - sosize 268 | spatial_window_mode = swin 269 | temporal_window_mode = twin 270 | spatial_beta = sbeta 271 | temporal_beta = tbeta 272 | zero_mean = zmean 273 | backend = init_backend(backend) 274 | 275 | if isinstance(backend, (Backend.CPU, Backend.NVRTC, Backend.GCC, Backend.HIPRTC)): 276 | if radius not in range(4): 277 | raise ValueError("invalid radius (tbsize)") 278 | if block_size != 16: 279 | raise ValueError("invalid block_size (sbsize)") 280 | 281 | # compute constants 282 | try: 283 | sigma_scalar = float(sigma) # type: ignore 284 | sigma_is_scalar = True 285 | except: 286 | # compute sigma_array 287 | 288 | sigma_is_scalar = False 289 | 290 | sigma_funcs = typing.cast(typing.Sequence[typing.Callable[[float], float]], sigma) 291 | if callable(sigma_funcs): 292 | sigma_funcs = [sigma_funcs] 293 | else: 294 | sigma_funcs = list(sigma_funcs) 295 | sigma_funcs.extend([sigma_funcs[-1]] * 3) 296 | sigma_func_x, sigma_func_y, sigma_func_t = sigma_funcs[:3] 297 | 298 | sigma_array = [] 299 | 300 | if ssystem == 0: 301 | for t in range(2 * radius + 1): 302 | sigma_t = get_sigma(position=t, length=2*radius+1, func=sigma_func_t) 303 | for y in range(block_size): 304 | sigma_y = get_sigma(position=y, length=block_size, func=sigma_func_y) 305 | for x in range(block_size // 2 + 1): 306 | sigma_x = get_sigma(position=x, length=block_size, func=sigma_func_x) 307 | 308 | sigma = sigma_t * sigma_y * sigma_x 309 | sigma_array.append(sigma) 310 | else: 311 | for t in range(2 * radius + 1): 312 | loc_t = get_location(position=t, length=2*radius+1) 313 | for y in range(block_size): 314 | loc_y = get_location(position=y, length=block_size) 315 | for x in range(block_size // 2 + 1): 316 | loc_x = get_location(position=x, length=block_size) 317 | 318 | ndim = 3 if radius > 0 else 2 319 | location = math.sqrt((loc_t * loc_t + loc_y * loc_y + loc_x * loc_x) / ndim) 320 | sigma = sigma_func_t(location) 321 | sigma_array.append(sigma) 322 | 323 | window = get_window( 324 | radius=radius, 325 | block_size=block_size, 326 | block_step=block_step, 327 | spatial_window_mode=spatial_window_mode, 328 | temporal_window_mode=temporal_window_mode, 329 | spatial_beta=spatial_beta, 330 | temporal_beta=temporal_beta 331 | ) 332 | 333 | wscale = math.fsum(w * w for w in window) 334 | 335 | if ftype < 2: 336 | if sigma_is_scalar: 337 | sigma_scalar *= wscale 338 | else: 339 | sigma_array = [s * wscale for s in sigma_array] 340 | sigma2 *= wscale 341 | 342 | pmin *= wscale 343 | pmax *= wscale 344 | 345 | if isinstance(backend, Backend.cuFFT): 346 | rdft = 
core.dfttest2_cuda.RDFT 347 | elif isinstance(backend, Backend.NVRTC): 348 | rdft = core.dfttest2_nvrtc.RDFT 349 | elif isinstance(backend, Backend.CPU): 350 | rdft = core.dfttest2_cpu.RDFT 351 | elif isinstance(backend, Backend.GCC): 352 | rdft = core.dfttest2_gcc.RDFT 353 | elif isinstance(backend, Backend.hipFFT): 354 | rdft = core.dfttest2_hip.RDFT 355 | elif isinstance(backend, Backend.HIPRTC): 356 | rdft = core.dfttest2_hiprtc.RDFT 357 | else: 358 | raise TypeError("unknown backend") 359 | 360 | if radius == 0: 361 | window_freq = rdft( 362 | data=[w * 255 for w in window], 363 | shape=(block_size, block_size) 364 | ) 365 | else: 366 | window_freq = rdft( 367 | data=[w * 255 for w in window], 368 | shape=(2 * radius + 1, block_size, block_size) 369 | ) 370 | 371 | if isinstance(backend, Backend.CPU): 372 | return core.dfttest2_cpu.DFTTest( 373 | clip, 374 | window=window, 375 | sigma=[sigma_scalar] * (2 * radius + 1) * block_size * (block_size // 2 + 1) if sigma_is_scalar else sigma_array, 376 | sigma2=sigma2, 377 | pmin=pmin, 378 | pmax=pmax, 379 | radius=radius, 380 | block_size=block_size, 381 | block_step=block_step, 382 | planes=planes, 383 | filter_type=filter_type, 384 | window_freq=window_freq, 385 | opt=backend.opt 386 | ) 387 | elif isinstance(backend, Backend.GCC): 388 | return core.dfttest2_gcc.DFTTest( 389 | clip, 390 | window=window, 391 | sigma=[sigma_scalar] * (2 * radius + 1) * block_size * (block_size // 2 + 1) if sigma_is_scalar else sigma_array, 392 | sigma2=sigma2, 393 | pmin=pmin, 394 | pmax=pmax, 395 | radius=radius, 396 | block_size=block_size, 397 | block_step=block_step, 398 | planes=planes, 399 | filter_type=filter_type, 400 | window_freq=window_freq 401 | ) 402 | 403 | if isinstance(backend, Backend.cuFFT): 404 | to_single = core.dfttest2_cuda.ToSingle 405 | elif isinstance(backend, Backend.NVRTC): 406 | to_single = core.dfttest2_nvrtc.ToSingle 407 | elif isinstance(backend, Backend.hipFFT): 408 | to_single = core.dfttest2_hip.ToSingle 409 | elif isinstance(backend, Backend.HIPRTC): 410 | to_single = core.dfttest2_hiprtc.ToSingle 411 | else: 412 | raise TypeError("unknown backend") 413 | 414 | kernel = Template( 415 | """ 416 | #define FILTER_TYPE ${filter_type} 417 | #define ZERO_MEAN ${zero_mean} 418 | #define SIGMA_IS_SCALAR ${sigma_is_scalar} 419 | 420 | #if ZERO_MEAN 421 | __device__ static const float window_freq[] { ${window_freq} }; 422 | #endif // ZERO_MEAN 423 | 424 | __device__ static const float window[] { ${window} }; 425 | 426 | __device__ 427 | static void filter(float2 & value, int x, int y, int t) { 428 | #if SIGMA_IS_SCALAR 429 | float sigma = static_cast(${sigma}); 430 | #else // SIGMA_IS_SCALAR 431 | __device__ static const float sigma_array[] { ${sigma} }; 432 | float sigma = sigma_array[(t * BLOCK_SIZE + y) * (BLOCK_SIZE / 2 + 1) + x]; 433 | #endif // SIGMA_IS_SCALAR 434 | [[maybe_unused]] float sigma2 = static_cast(${sigma2}); 435 | [[maybe_unused]] float pmin = static_cast(${pmin}); 436 | [[maybe_unused]] float pmax = static_cast(${pmax}); 437 | [[maybe_unused]] float multiplier {}; 438 | 439 | #if FILTER_TYPE == 2 440 | value.x *= sigma; 441 | value.y *= sigma; 442 | return ; 443 | #endif 444 | 445 | float psd = value.x * value.x + value.y * value.y; 446 | 447 | #if FILTER_TYPE == 1 448 | if (psd < sigma) { 449 | value.x = 0.0f; 450 | value.y = 0.0f; 451 | } 452 | return ; 453 | #elif FILTER_TYPE == 0 454 | multiplier = fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f); 455 | #elif FILTER_TYPE == 3 456 | if (psd >= pmin && psd <= pmax) { 
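    // ftype 3 (see the DFTTest docstring): psd = re*re + im*im was computed
    // above; coefficients whose psd falls inside [pmin, pmax] are scaled by
    // sigma, while the else branch below falls back to sigma2.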
457 | multiplier = sigma; 458 | } else { 459 | multiplier = sigma2; 460 | } 461 | #elif FILTER_TYPE == 4 462 | multiplier = sigma * sqrtf(psd * (pmax / ((psd + pmin) * (psd + pmax) + 1e-15f))); 463 | #elif FILTER_TYPE == 5 464 | multiplier = powf(fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f), pmin); 465 | #else 466 | multiplier = sqrtf(fmaxf((psd - sigma) / (psd + 1e-15f), 0.0f)); 467 | #endif 468 | 469 | value.x *= multiplier; 470 | value.y *= multiplier; 471 | } 472 | """ 473 | ).substitute( 474 | sigma_is_scalar=int(sigma_is_scalar), 475 | sigma=( 476 | to_single(sigma_scalar) 477 | if sigma_is_scalar 478 | else ','.join(str(to_single(x)) for x in sigma_array) 479 | ), 480 | sigma2=to_single(sigma2), 481 | pmin=to_single(pmin), 482 | pmax=to_single(pmax), 483 | filter_type=int(filter_type), 484 | window_freq=','.join(str(to_single(x)) for x in window_freq), 485 | zero_mean=int(zero_mean), 486 | window=','.join(str(to_single(x)) for x in window), 487 | ) 488 | 489 | if isinstance(backend, Backend.cuFFT): 490 | return core.dfttest2_cuda.DFTTest( 491 | clip, 492 | kernel=kernel, 493 | radius=radius, 494 | block_size=block_size, 495 | block_step=block_step, 496 | planes=planes, 497 | in_place=backend.in_place, 498 | device_id=backend.device_id 499 | ) 500 | elif isinstance(backend, Backend.NVRTC): 501 | return core.dfttest2_nvrtc.DFTTest( 502 | clip, 503 | kernel=kernel, 504 | radius=radius, 505 | block_size=block_size, 506 | block_step=block_step, 507 | planes=planes, 508 | in_place=False, 509 | device_id=backend.device_id, 510 | num_streams=backend.num_streams 511 | ) 512 | if isinstance(backend, Backend.hipFFT): 513 | return core.dfttest2_hip.DFTTest( 514 | clip, 515 | kernel=kernel, 516 | radius=radius, 517 | block_size=block_size, 518 | block_step=block_step, 519 | planes=planes, 520 | in_place=backend.in_place, 521 | device_id=backend.device_id 522 | ) 523 | elif isinstance(backend, Backend.HIPRTC): 524 | return core.dfttest2_hiprtc.DFTTest( 525 | clip, 526 | kernel=kernel, 527 | radius=radius, 528 | block_size=block_size, 529 | block_step=block_step, 530 | planes=planes, 531 | in_place=False, 532 | device_id=backend.device_id, 533 | num_streams=backend.num_streams 534 | ) 535 | else: 536 | raise TypeError("unknown backend") 537 | 538 | 539 | def select_backend( 540 | backend: typing.Optional[backendT], 541 | sbsize: int, 542 | tbsize: int 543 | ) -> backendT: 544 | 545 | if backend is not None: 546 | return backend 547 | 548 | if sbsize == 16 and tbsize in [1, 3, 5, 7]: 549 | if hasattr(core, "dfttest2_nvrtc"): 550 | return Backend.NVRTC() 551 | elif hasattr(core, "dfttest2_hiprtc"): 552 | return Backend.HIPRTC() 553 | elif hasattr(core, "dfttest2_cuda"): 554 | return Backend.cuFFT() 555 | elif hasattr(core, "dfttest2_hip"): 556 | return Backend.hipFFT() 557 | elif hasattr(core, "dfttest2_cpu"): 558 | return Backend.CPU() 559 | else: 560 | return Backend.GCC() 561 | else: 562 | if hasattr(core, "dfttest2_cuda"): 563 | return Backend.cuFFT() 564 | else: 565 | return Backend.hipFFT() 566 | 567 | 568 | FREQ = float 569 | SIGMA = float 570 | def flatten( 571 | data: typing.Optional[typing.Union[ 572 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 573 | typing.Sequence[float] 574 | ]] 575 | ) -> typing.Optional[typing.List[float]]: 576 | 577 | import itertools as it 578 | import numbers 579 | 580 | if data is None: 581 | return None 582 | elif isinstance(data[0], numbers.Real): 583 | return data 584 | else: 585 | data = typing.cast(typing.Sequence[typing.Tuple[FREQ, SIGMA]], data) 586 | 
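        # "data" is the [(freq, sigma), ...] form documented for
        # slocation/ssx/ssy/sst; chaining the pairs produces the flat
        # [freq, sigma, freq, sigma, ...] list that to_func() below splits back
        # into locations (data[::2]) and sigmas (data[1::2]).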
return list(it.chain.from_iterable(data)) 587 | 588 | 589 | def to_func( 590 | data: typing.Optional[typing.Sequence[float]], 591 | norm: typing.Callable[[float], float], 592 | sigma: float 593 | ) -> typing.Callable[[float], float]: 594 | 595 | if data is None: 596 | return lambda _: norm(sigma) 597 | 598 | locations = data[::2] 599 | sigmas = data[1::2] 600 | packs = list(zip(locations, sigmas)) 601 | packs = sorted(packs, key=lambda group: group[0]) 602 | 603 | def func(x: float) -> float: 604 | length = len(packs) 605 | for i in range(length - 1): 606 | if x <= packs[i + 1][0]: 607 | weight = (x - packs[i][0]) / (packs[i + 1][0] - packs[i][0]) 608 | return (1 - weight) * norm(packs[i][1]) + weight * norm(packs[i + 1][1]) 609 | raise ValueError() 610 | 611 | return func 612 | 613 | 614 | def DFTTest( 615 | clip: vs.VideoNode, 616 | ftype: typing.Literal[0, 1, 2, 3, 4] = 0, 617 | sigma: float = 8.0, 618 | sigma2: float = 8.0, 619 | pmin: float = 0.0, 620 | pmax: float = 500.0, 621 | sbsize: int = 16, 622 | smode: typing.Literal[0, 1] = 1, 623 | sosize: int = 12, 624 | tbsize: int = 3, 625 | # tmode=0, tosize=0 626 | swin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 0, 627 | twin: typing.Literal[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] = 7, 628 | sbeta: float = 2.5, 629 | tbeta: float = 2.5, 630 | zmean: bool = True, 631 | f0beta: float = 1.0, 632 | nlocation: typing.Optional[typing.Sequence[int]] = None, 633 | alpha: typing.Optional[float] = None, 634 | slocation: typing.Optional[typing.Union[ 635 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 636 | typing.Sequence[float] 637 | ]] = None, 638 | ssx: typing.Optional[typing.Union[ 639 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 640 | typing.Sequence[float] 641 | ]] = None, 642 | ssy: typing.Optional[typing.Union[ 643 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 644 | typing.Sequence[float] 645 | ]] = None, 646 | sst: typing.Optional[typing.Union[ 647 | typing.Sequence[typing.Tuple[FREQ, SIGMA]], 648 | typing.Sequence[float] 649 | ]] = None, 650 | ssystem: typing.Literal[0, 1] = 0, 651 | planes: typing.Optional[typing.Union[int, typing.Sequence[int]]] = None, 652 | backend: typing.Optional[backendT] = None 653 | ) -> vs.VideoNode: 654 | """ 2D/3D frequency domain denoiser 655 | 656 | The interface is compatible with core.dfttest.DFTTest by HolyWu. 657 | 658 | Args: 659 | clip: Clip to process. 660 | 661 | Any format with either integer sample type of 8-16 bit depth 662 | or float sample type of 32 bit depth is supported. 663 | 664 | ftype: Controls the filter type. 665 | 666 | Possible settings are: 667 | 0: generalized wiener filter 668 | mult = max((psd - sigma) / psd, 0) ^ f0beta 669 | 670 | 1: hard threshold 671 | mult = psd < sigma ? 0.0 : 1.0 672 | 673 | 2: multiplier 674 | mult = sigma 675 | 676 | 3: multiplier switched based on psd value 677 | mult = (psd >= pmin && psd <= pmax) ? sigma : sigma2 678 | 679 | 4: multiplier modified based on psd value and range 680 | mult = sigma * sqrt((psd * pmax) / ((psd + pmin) * (psd + pmax))) 681 | 682 | The real and imaginary parts of each complex dft coefficient are multiplied 683 | by the corresponding 'mult' value. 684 | 685 | ** psd = magnitude squared = real*real + imag*imag 686 | 687 | sigma, sigma2: Value of sigma and sigma2. 688 | If using the slocation parameter then the sigma parameter is ignored. 689 | 690 | pmin, pmax: Used as described in the ftype parameter description. 691 | 692 | sbsize: Sets the length of the sides of the spatial window. 693 | Must be 1 or greater. 
Must be odd if using smode=0. 694 | 695 | smode: Sets the mode for spatial operation. 696 | Currently only smode=1 is implemented. 697 | 698 | sosize: Sets the spatial overlap amount. 699 | Must be in the range 0 to sbsize-1 (inclusive). 700 | If sosize is greater than sbsize>>1, then sbsize%(sbsize-sosize) must equal 0. 701 | In other words, overlap greater than 50% requires that sbsize-sosize be a divisor of sbsize. 702 | 703 | tbsize: Sets the length of the temporal dimension (i.e. number of frames). 704 | Must be at least 1. Must be odd if using tmode=0. 705 | 706 | tmode: Sets the mode for temporal operation. 707 | Currently only tmode=0 is implemented. 708 | 709 | tosize: Sets the temporal overlap amount. 710 | Must be in the range 0 to tbsize-1 (inclusive). 711 | If tosize is greater than tbsize>>1, then tbsize%(tbsize-tosize) must equal 0. 712 | In other words, overlap greater than 50% requires that tbsize-tosize be a divisor of tbsize. 713 | 714 | swin, twin: Sets the type of analysis/synthesis window to be used for spatial (swin) and 715 | temporal (twin) processing. Possible settings: 716 | 717 | 0: hanning 718 | 1: hamming 719 | 2: blackman 720 | 3: 4 term blackman-harris 721 | 4: kaiser-bessel 722 | 5: 7 term blackman-harris 723 | 6: flat top 724 | 7: rectangular 725 | 8: Bartlett 726 | 9: Bartlett-Hann 727 | 10: Nuttall 728 | 11: Blackman-Nuttall 729 | 730 | sbeta, tbeta: Sets the beta value for the kaiser-bessel window type. 731 | sbeta goes with swin, tbeta goes with twin. 732 | Not used unless the corresponding window value is set to 4. 733 | 734 | zmean: Controls whether the window mean is subtracted out (zero'd) 735 | prior to filtering in the frequency domain. 736 | 737 | f0beta: Power term in ftype=0. 738 | 739 | nlocation: Currently not implemented. 740 | 741 | slocation/ssx/ssy/sst: Used to specify functions of sigma based on frequency. 742 | Check the original documentation for details. 743 | 744 | Note that in the current implementation, 745 | "slocation = [(0.0, 1.0), (1.0, 10.0)]" 746 | is equivalent to 747 | "slocation = [0.0, 1.0, 1.0, 10.0]" 748 | 749 | ssystem: Method of sigma computation. 750 | Check the original documentation for details. 751 | 752 | planes: Sets which planes will be processed. 753 | Any unprocessed planes will be simply copied. 754 | 755 | backend: Backend implementation to use. 756 | All available backends can be found in the dfttest2.Backend "namespace": 757 | dfttest2.Backend.{CPU, cuFFT, NVRTC, GCC, hipFFT, HIPRTC} 758 | 759 | The CPU, NVRTC, HIPRTC and GCC backends require sbsize=16. 760 | The cuFFT and NVRTC backends require a CUDA-enabled system. 761 | The hipFFT and HIPRTC backends require a HIP-enabled (AMD) system.
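        A minimal usage sketch (illustrative only; it assumes VapourSynth is
        available and that at least one dfttest2 plugin backend is installed):

            import vapoursynth as vs
            from dfttest2 import DFTTest, Backend

            src = vs.core.std.BlankClip(format=vs.YUV420P8)  # stand-in for a real source clip
            flt = DFTTest(src, sigma=8.0, tbsize=3, backend=Backend.CPU())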
762 | 763 | Speed: NVRTC == HIPRTC >> cuFFT > hipFFT > CPU == GCC 764 | """ 765 | 766 | if ( 767 | not isinstance(clip, vs.VideoNode) or 768 | clip.width == 0 or 769 | clip.height == 0 or 770 | clip.format is None or 771 | (clip.format.sample_type == vs.INTEGER and clip.format.bits_per_sample > 16) or 772 | (clip.format.sample_type == vs.FLOAT and clip.format.bits_per_sample != 32) 773 | ): 774 | raise ValueError("only constant format 8-16 bit integer and 32 bit float input supported") 775 | 776 | if ftype < 0 or ftype > 4: 777 | raise ValueError("ftype must be 0, 1, 2, 3, or 4") 778 | 779 | if sbsize < 1: 780 | raise ValueError("sbsize must be greater than or equal to 1") 781 | 782 | if smode != 1: 783 | raise ValueError('"smode" must be 1') 784 | 785 | if sosize > sbsize // 2 and (sbsize % (sbsize - sosize) != 0): 786 | raise ValueError("spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize") 787 | 788 | if tbsize < 1: 789 | raise ValueError('"tbsize" must be at least 1') 790 | 791 | if swin < 0 or swin > 11: 792 | raise ValueError("swin must be between 0 and 11 (inclusive)") 793 | 794 | if twin < 0 or twin > 11: 795 | raise ValueError("twin must be between 0 and 11 (inclusive)") 796 | 797 | if nlocation is not None: 798 | raise ValueError('"nlocation" must be None') 799 | 800 | if slocation and len(slocation) % 2 != 0: 801 | raise ValueError("number of elements in slocation must be a multiple of 2") 802 | 803 | if ssx and len(ssx) % 2 != 0: 804 | raise ValueError("number of elements in ssx must be a multiple of 2") 805 | 806 | if ssy and len(ssy) % 2 != 0: 807 | raise ValueError("number of elements in ssy must be a multiple of 2") 808 | 809 | if sst and len(sst) % 2 != 0: 810 | raise ValueError("number of elements in sst must be a multiple of 2") 811 | 812 | if ssystem < 0 or ssystem > 1: 813 | raise ValueError("ssystem must be 0 or 1") 814 | 815 | def norm(x: float) -> float: 816 | if slocation is not None and ssystem == 1: 817 | return x 818 | elif tbsize == 1: 819 | return math.sqrt(x) 820 | else: 821 | return x ** (1 / 3) 822 | 823 | _sigma: typing.Union[float, typing.Sequence[typing.Callable[[float], float]]] 824 | 825 | if slocation is not None: 826 | _sigma = [to_func(flatten(slocation), norm, sigma)] * 3 827 | elif any(ss is not None for ss in (ssx, ssy, sst)): 828 | _sigma = [to_func(flatten(ss), norm, sigma) for ss in (ssx, ssy, sst)] 829 | else: 830 | _sigma = sigma 831 | 832 | return DFTTest2( 833 | clip = clip, 834 | ftype = ftype, 835 | sigma = _sigma, 836 | sigma2 = sigma2, 837 | pmin = pmin, 838 | pmax = pmax, 839 | sbsize = sbsize, 840 | sosize = sosize, 841 | tbsize = tbsize, 842 | swin = swin, 843 | twin = twin, 844 | sbeta = sbeta, 845 | tbeta = tbeta, 846 | zmean = zmean, 847 | f0beta = f0beta, 848 | ssystem = ssystem, 849 | planes = planes, 850 | backend = select_backend(backend, sbsize, tbsize) 851 | ) 852 | -------------------------------------------------------------------------------- /gcc_source/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(dfttest2_gcc MODULE source.cpp) 2 | 3 | set_target_properties(dfttest2_gcc PROPERTIES 4 | CXX_EXTENSIONS OFF 5 | CXX_STANDARD 20 6 | CXX_STANDARD_REQUIRED ON 7 | ) 8 | 9 | target_include_directories(dfttest2_gcc PRIVATE ${VCL_HOME}) 10 | 11 | if(PKG_CONFIG_FOUND AND VS_FOUND) 12 | target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIRS}) 13 | install(TARGETS dfttest2_gcc LIBRARY DESTINATION ${install_dir}) 14 | else() 
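# Fallback when pkg-config did not locate VapourSynth: use the user-supplied
# VS_INCLUDE_DIR and install into a plain "lib" directory.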
15 | target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIR}) 16 | install(TARGETS dfttest2_gcc LIBRARY DESTINATION lib) 17 | endif() 18 | 19 | target_include_directories(dfttest2_gcc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) 20 | 21 | 22 | add_library(getframe_impl OBJECT getframe_impl.cpp) 23 | 24 | set_target_properties(getframe_impl PROPERTIES 25 | CXX_EXTENSIONS OFF 26 | CXX_STANDARD 20 27 | CXX_STANDARD_REQUIRED ON 28 | ) 29 | 30 | target_include_directories(getframe_impl PRIVATE ${VCL_HOME}) 31 | 32 | if(PKG_CONFIG_FOUND AND VS_FOUND) 33 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIRS}) 34 | else() 35 | target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIR}) 36 | endif() 37 | 38 | target_link_libraries(dfttest2_gcc PRIVATE getframe_impl) 39 | -------------------------------------------------------------------------------- /gcc_source/dfttest2_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef DFTTEST2_CPU_H 2 | #define DFTTEST2_CPU_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | 16 | static inline void vs_aligned_free_float(float * ptr) { 17 | vs_aligned_free(static_cast(ptr)); 18 | } 19 | 20 | 21 | struct DFTTestThreadData { 22 | uint8_t * padded; // shape: (pad_height, pad_width) 23 | float * padded2; // shape: (pad_height, pad_width) 24 | }; 25 | 26 | 27 | struct DFTTestData { 28 | VSNodeRef * node; 29 | int radius; 30 | int block_size; 31 | int block_step; 32 | std::array process; 33 | bool zero_mean; 34 | std::unique_ptr window { nullptr, &vs_aligned_free_float }; 35 | std::unique_ptr window_freq { nullptr, &vs_aligned_free_float }; 36 | std::unique_ptr sigma { nullptr, &vs_aligned_free_float }; 37 | int filter_type; 38 | float sigma2; 39 | float pmin; 40 | float pmax; 41 | 42 | std::atomic num_uninitialized_threads; 43 | std::unordered_map thread_data; 44 | std::shared_mutex thread_data_lock; 45 | }; 46 | 47 | extern const VSFrameRef *VS_CC DFTTestGetFrame( 48 | int n, int activationReason, void **instanceData, void **frameData, 49 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 50 | ) noexcept; 51 | 52 | #endif // DFTTEST2_CPU_H 53 | -------------------------------------------------------------------------------- /gcc_source/getframe_impl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "dfttest2_cpu.h" 9 | #include "kernel.hpp" 10 | 11 | 12 | typedef unsigned char Vec16uc __attribute__((__vector_size__(16), __aligned__(16))); 13 | typedef unsigned char Vec16uc_u __attribute__((__vector_size__(16), __aligned__(1))); 14 | typedef unsigned short Vec16us __attribute__((__vector_size__(32), __aligned__(32))); 15 | typedef unsigned short Vec16us_u __attribute__((__vector_size__(32), __aligned__(1))); 16 | typedef float Vec16f_u __attribute__((__vector_size__(64), __aligned__(1))); 17 | 18 | 19 | static inline Vec16uc __attribute__((__always_inline__)) load_16uc(const unsigned char * p) { 20 | struct loadu { 21 | Vec16uc_u v; 22 | } __attribute__((__packed__, __may_alias__)); 23 | 24 | return ((const struct loadu*) p)->v; 25 | } 26 | 27 | 28 | static inline Vec16us __attribute__((__always_inline__)) load_16us(const unsigned short * p) { 29 | struct loadu { 30 | Vec16us_u v; 31 | } __attribute__((__packed__, __may_alias__)); 32 | 33 | return ((const struct loadu*) 
p)->v; 34 | } 35 | 36 | 37 | static inline Vec16f __attribute__((__always_inline__)) load_16f(const float * p) { 38 | struct loadu_16f { 39 | Vec16f_u v; 40 | } __attribute__((__packed__, __may_alias__)); 41 | 42 | return ((const struct loadu_16f*) p)->v; 43 | } 44 | 45 | 46 | static inline void __attribute__((__always_inline__)) store_16f(float * p, Vec16f a) { 47 | struct storeu_ps { 48 | Vec16f_u v; 49 | } __attribute__((__packed__, __may_alias__)); 50 | 51 | ((struct storeu_ps*) p)->v = a; 52 | } 53 | 54 | 55 | static inline int calc_pad_size(int size, int block_size, int block_step) { 56 | return ( 57 | size 58 | + ((size % block_size) ? block_size - size % block_size : 0) 59 | + std::max(block_size - block_step, block_step) * 2 60 | ); 61 | } 62 | 63 | 64 | static inline int calc_pad_num(int size, int block_size, int block_step) { 65 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 66 | } 67 | 68 | 69 | template 70 | static inline void reflection_padding_impl( 71 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 72 | const T * VS_RESTRICT src, // shape: (height, stride) 73 | int width, int height, int stride, 74 | int block_size, int block_step 75 | ) { 76 | 77 | int pad_width = calc_pad_size(width, block_size, block_step); 78 | int pad_height = calc_pad_size(height, block_size, block_step); 79 | 80 | int offset_y = (pad_height - height) / 2; 81 | int offset_x = (pad_width - width) / 2; 82 | 83 | vs_bitblt( 84 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 85 | src, stride * sizeof(T), 86 | width * sizeof(T), height 87 | ); 88 | 89 | // copy left and right regions 90 | for (int y = offset_y; y < offset_y + height; y++) { 91 | auto dst_line = &dst[y * pad_width]; 92 | 93 | for (int x = 0; x < offset_x; x++) { 94 | dst_line[x] = dst_line[offset_x * 2 - x]; 95 | } 96 | 97 | for (int x = offset_x + width; x < pad_width; x++) { 98 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 99 | } 100 | } 101 | 102 | // copy top region 103 | for (int y = 0; y < offset_y; y++) { 104 | std::memcpy( 105 | &dst[y * pad_width], 106 | &dst[(offset_y * 2 - y) * pad_width], 107 | pad_width * sizeof(T) 108 | ); 109 | } 110 | 111 | // copy bottom region 112 | for (int y = offset_y + height; y < pad_height; y++) { 113 | std::memcpy( 114 | &dst[y * pad_width], 115 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 116 | pad_width * sizeof(T) 117 | ); 118 | } 119 | } 120 | 121 | 122 | static inline void reflection_padding( 123 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 124 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 125 | int width, int height, int stride, 126 | int block_size, int block_step, 127 | int bytes_per_sample 128 | ) { 129 | 130 | if (bytes_per_sample == 1) { 131 | reflection_padding_impl( 132 | static_cast(dst), 133 | static_cast(src), 134 | width, height, stride, 135 | block_size, block_step 136 | ); 137 | } else if (bytes_per_sample == 2) { 138 | reflection_padding_impl( 139 | reinterpret_cast(dst), 140 | reinterpret_cast(src), 141 | width, height, stride, 142 | block_size, block_step 143 | ); 144 | } else if (bytes_per_sample == 4) { 145 | reflection_padding_impl( 146 | reinterpret_cast(dst), 147 | reinterpret_cast(src), 148 | width, height, stride, 149 | block_size, block_step 150 | ); 151 | } 152 | } 153 | 154 | 155 | static inline void load_block( 156 | Vec16f * VS_RESTRICT block, 157 | const uint8_t * VS_RESTRICT shifted_src, 158 | int radius, 159 | int block_size, 160 | int 
block_step, 161 | int width, 162 | int height, 163 | const Vec16f * VS_RESTRICT window, 164 | int bits_per_sample 165 | ) { 166 | 167 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 168 | if (bits_per_sample == 32) { 169 | scale = 255.0f; 170 | } 171 | 172 | int bytes_per_sample = (bits_per_sample + 7) / 8; 173 | 174 | assert(block_size == 16); 175 | block_size = 16; // unsafe 176 | 177 | int offset_x = calc_pad_size(width, block_size, block_step); 178 | int offset_y = calc_pad_size(height, block_size, block_step); 179 | 180 | if (bytes_per_sample == 1) { 181 | for (int i = 0; i < 2 * radius + 1; i++) { 182 | for (int j = 0; j < block_size; j++) { 183 | auto vec_input = load_16uc((const uint8_t *) shifted_src + (i * offset_y + j) * offset_x); 184 | auto vec_input_f = __builtin_convertvector(vec_input, Vec16f); 185 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 186 | } 187 | } 188 | } 189 | if (bytes_per_sample == 2) { 190 | for (int i = 0; i < 2 * radius + 1; i++) { 191 | for (int j = 0; j < block_size; j++) { 192 | auto vec_input = load_16us((const uint16_t *) shifted_src + (i * offset_y + j) * offset_x); 193 | auto vec_input_f = __builtin_convertvector(vec_input, Vec16f); 194 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 195 | } 196 | } 197 | } 198 | if (bytes_per_sample == 4) { 199 | for (int i = 0; i < 2 * radius + 1; i++) { 200 | for (int j = 0; j < block_size; j++) { 201 | auto vec_input_f = load_16f((const float *) shifted_src + (i * offset_y + j) * offset_x); 202 | block[i * block_size * 2 + j] = scale * window[i * block_size + j] * vec_input_f; 203 | } 204 | } 205 | } 206 | } 207 | 208 | 209 | static inline void store_block( 210 | float * VS_RESTRICT shifted_dst, 211 | const Vec16f * VS_RESTRICT shifted_block, 212 | int block_size, 213 | int block_step, 214 | int width, 215 | int height, 216 | const Vec16f * VS_RESTRICT shifted_window 217 | ) { 218 | 219 | assert(block_size == 16); 220 | block_size = 16; // unsafe 221 | 222 | for (int i = 0; i < block_size; i++) { 223 | Vec16f acc = load_16f((const float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step))); 224 | acc = FMA(shifted_block[i], shifted_window[i], acc); 225 | store_16f((float *) shifted_dst + (i * calc_pad_size(width, block_size, block_step)), acc); 226 | } 227 | } 228 | 229 | 230 | static inline void store_frame( 231 | uint8_t * VS_RESTRICT dst, 232 | const float * VS_RESTRICT shifted_src, 233 | int width, 234 | int height, 235 | int dst_stride, 236 | int src_stride, 237 | int bits_per_sample 238 | ) { 239 | 240 | float scale = 1.0f / (1 << (bits_per_sample - 8)); 241 | if (bits_per_sample == 32) { 242 | scale = 255.0f; 243 | } 244 | 245 | int bytes_per_sample = (bits_per_sample + 7) / 8; 246 | int peak = (1 << bits_per_sample) - 1; 247 | 248 | if (bytes_per_sample == 1) { 249 | auto dstp = (uint8_t *) dst; 250 | for (int y = 0; y < height; y++) { 251 | for (int x = 0; x < width; x++) { 252 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 253 | dstp[y * dst_stride + x] = static_cast(clamped); 254 | } 255 | } 256 | } 257 | if (bytes_per_sample == 2) { 258 | auto dstp = (uint16_t *) dst; 259 | for (int y = 0; y < height; y++) { 260 | for (int x = 0; x < width; x++) { 261 | auto clamped = std::clamp(static_cast(shifted_src[y * src_stride + x] / scale + 0.5f), 0, peak); 262 | dstp[y * dst_stride + x] = static_cast(clamped); 263 | } 264 | } 265 | } 266 | if (bytes_per_sample == 
4) { 267 | auto dstp = (float *) dst; 268 | for (int y = 0; y < height; y++) { 269 | for (int x = 0; x < width; x++) { 270 | dstp[y * dst_stride + x] = shifted_src[y * src_stride + x] / scale; 271 | } 272 | } 273 | } 274 | } 275 | 276 | 277 | const VSFrameRef * VS_CC 278 | #ifndef HAS_DISPATCH 279 | DFTTestGetFrame 280 | #else // HAS_DISPATCH 281 | DFTTEST_GETFRAME_NAME 282 | #endif // HAS_DISPATCH 283 | ( 284 | int n, int activationReason, void **instanceData, void **frameData, 285 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 286 | ) noexcept { 287 | 288 | auto d = static_cast(*instanceData); 289 | 290 | if (activationReason == arInitial) { 291 | int start = std::max(n - d->radius, 0); 292 | auto vi = vsapi->getVideoInfo(d->node); 293 | int end = std::min(n + d->radius, vi->numFrames - 1); 294 | for (int i = start; i <= end; i++) { 295 | vsapi->requestFrameFilter(i, d->node, frameCtx); 296 | } 297 | return nullptr; 298 | } else if (activationReason != arAllFramesReady) { 299 | return nullptr; 300 | } 301 | 302 | auto vi = vsapi->getVideoInfo(d->node); 303 | 304 | DFTTestThreadData thread_data; 305 | 306 | auto thread_id = std::this_thread::get_id(); 307 | if (d->num_uninitialized_threads.load(std::memory_order_acquire) == 0) { 308 | const auto & const_data = d->thread_data; 309 | thread_data = const_data.at(thread_id); 310 | } else { 311 | bool initialized = true; 312 | 313 | d->thread_data_lock.lock_shared(); 314 | try { 315 | const auto & const_data = d->thread_data; 316 | thread_data = const_data.at(thread_id); 317 | } catch (const std::out_of_range &) { 318 | initialized = false; 319 | } 320 | d->thread_data_lock.unlock_shared(); 321 | 322 | if (!initialized) { 323 | auto padded_size = ( 324 | (2 * d->radius + 1) * 325 | calc_pad_size(vi->height, d->block_size, d->block_step) * 326 | calc_pad_size(vi->width, d->block_size, d->block_step) * 327 | vi->format->bytesPerSample 328 | ); 329 | 330 | thread_data.padded = static_cast(std::malloc(padded_size)); 331 | thread_data.padded2 = static_cast(std::malloc( 332 | calc_pad_size(vi->height, d->block_size, d->block_step) * 333 | calc_pad_size(vi->width, d->block_size, d->block_step) * 334 | sizeof(float) 335 | )); 336 | 337 | { 338 | std::lock_guard _ { d->thread_data_lock }; 339 | d->thread_data.emplace(thread_id, thread_data); 340 | } 341 | 342 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order_release); 343 | } 344 | } 345 | 346 | std::vectorfreeFrame)>> src_frames; 347 | src_frames.reserve(2 * d->radius + 1); 348 | for (int i = n - d->radius; i <= n + d->radius; i++) { 349 | src_frames.emplace_back( 350 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 351 | vsapi->freeFrame 352 | ); 353 | } 354 | 355 | auto & src_center_frame = src_frames[d->radius]; 356 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 357 | 358 | const VSFrameRef * fr[] { 359 | d->process[0] ? nullptr : src_center_frame.get(), 360 | d->process[1] ? nullptr : src_center_frame.get(), 361 | d->process[2] ? 
nullptr : src_center_frame.get() 362 | }; 363 | const int pl[] { 0, 1, 2 }; 364 | std::unique_ptrfreeFrame)> dst_frame { 365 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 366 | vsapi->freeFrame 367 | }; 368 | 369 | for (int plane = 0; plane < format->numPlanes; plane++) { 370 | if (!d->process[plane]) { 371 | continue; 372 | } 373 | 374 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 375 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 376 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 377 | 378 | int padded_size_spatial = ( 379 | calc_pad_size(height, d->block_size, d->block_step) * 380 | calc_pad_size(width, d->block_size, d->block_step) 381 | ); 382 | 383 | std::memset(thread_data.padded2, 0, 384 | calc_pad_size(height, d->block_size, d->block_step) * 385 | calc_pad_size(width, d->block_size, d->block_step) * 386 | sizeof(float) 387 | ); 388 | 389 | for (int i = 0; i < 2 * d->radius + 1; i++) { 390 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 391 | reflection_padding( 392 | &thread_data.padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 393 | srcp, 394 | width, height, stride, 395 | d->block_size, d->block_step, 396 | vi->format->bytesPerSample 397 | ); 398 | } 399 | 400 | for (int i = 0; i < calc_pad_num(height, d->block_size, d->block_step); i++) { 401 | for (int j = 0; j < calc_pad_num(width, d->block_size, d->block_step); j++) { 402 | assert(d->block_size == 16); 403 | constexpr int block_size = 16; 404 | 405 | Vec16f block[7 * block_size * 2]; 406 | 407 | int offset_x = calc_pad_size(width, d->block_size, d->block_step); 408 | 409 | load_block( 410 | block, 411 | &thread_data.padded[(i * offset_x + j) * d->block_step * vi->format->bytesPerSample], 412 | d->radius, d->block_size, d->block_step, 413 | width, height, 414 | reinterpret_cast(d->window.get()), 415 | vi->format->bitsPerSample 416 | ); 417 | 418 | fused( 419 | block, 420 | reinterpret_cast(d->sigma.get()), 421 | d->sigma2, 422 | d->pmin, 423 | d->pmax, 424 | d->filter_type, 425 | d->zero_mean, 426 | reinterpret_cast(d->window_freq.get()), 427 | d->radius 428 | ); 429 | 430 | store_block( 431 | &thread_data.padded2[(i * offset_x + j) * d->block_step], 432 | &block[d->radius * block_size * 2], 433 | block_size, 434 | d->block_step, 435 | width, 436 | height, 437 | reinterpret_cast(&d->window[d->radius * block_size * 2 * 16]) 438 | ); 439 | } 440 | } 441 | 442 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 443 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 444 | int offset_y = (pad_height - height) / 2; 445 | int offset_x = (pad_width - width) / 2; 446 | 447 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 448 | store_frame( 449 | dstp, 450 | &thread_data.padded2[(offset_y * pad_width + offset_x)], 451 | width, 452 | height, 453 | stride, 454 | pad_width, 455 | vi->format->bitsPerSample 456 | ); 457 | } 458 | 459 | return dst_frame.release(); 460 | } 461 | -------------------------------------------------------------------------------- /gcc_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #if __cpp_lib_math_constants 12 | #include 13 | #endif // __cpp_lib_math_constants 14 | #include 15 | #include 16 | #include 17 | #include 18 
| #include 19 | 20 | #include 21 | #include 22 | 23 | #include "dfttest2_cpu.h" 24 | #include "kernel.hpp" 25 | 26 | #include // generated by cmake, defines "VERSION" 27 | 28 | 29 | template 30 | #if __cpp_concepts 31 | requires 32 | (std::is_same_v || std::is_same_v>) 33 | #endif // __cpp_concepts 34 | static void dft( 35 | std::complex * VS_RESTRICT dst, 36 | const T_in * VS_RESTRICT src, 37 | int n, 38 | int stride 39 | ) { 40 | #if __cpp_lib_math_constants 41 | const auto pi = std::numbers::pi_v; 42 | #else // __cpp_lib_math_constants 43 | const auto pi = static_cast(M_PI); 44 | #endif // __cpp_lib_math_constants 45 | 46 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 47 | for (int i = 0; i < out_num; i++) { 48 | std::complex sum {}; 49 | for (int j = 0; j < n; j++) { 50 | auto imag = -2 * i * j * pi / n; 51 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 52 | sum += src[j * stride] * weight; 53 | } 54 | dst[i * stride] = sum; 55 | } 56 | } 57 | 58 | 59 | static void VS_CC DFTTestInit( 60 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 61 | VSCore *core, const VSAPI *vsapi 62 | ) noexcept { 63 | 64 | auto d = static_cast(*instanceData); 65 | 66 | auto vi = vsapi->getVideoInfo(d->node); 67 | vsapi->setVideoInfo(vi, 1, node); 68 | } 69 | 70 | 71 | static void VS_CC DFTTestFree( 72 | void *instanceData, VSCore *core, const VSAPI *vsapi 73 | ) noexcept { 74 | 75 | auto d = static_cast(instanceData); 76 | 77 | vsapi->freeNode(d->node); 78 | 79 | for (const auto & [_, thread_data] : d->thread_data) { 80 | std::free(thread_data.padded2); 81 | std::free(thread_data.padded); 82 | } 83 | 84 | delete d; 85 | } 86 | 87 | 88 | static void VS_CC DFTTestCreate( 89 | const VSMap *in, VSMap *out, void *userData, 90 | VSCore *core, const VSAPI *vsapi 91 | ) noexcept { 92 | 93 | auto d = std::make_unique(); 94 | 95 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 96 | 97 | auto set_error = [vsapi, out, &d](const char * error_message) -> void { 98 | vsapi->freeNode(d->node); 99 | vsapi->setError(out, error_message); 100 | return ; 101 | }; 102 | 103 | auto vi = vsapi->getVideoInfo(d->node); 104 | if (!isConstantFormat(vi)) { 105 | return set_error("only constant format input is supported"); 106 | } 107 | if (vi->format->sampleType == stInteger && vi->format->bytesPerSample > 2) { 108 | return set_error("only 8-16 bit integer format input is supported"); 109 | } 110 | if (vi->format->sampleType == stFloat && vi->format->bitsPerSample != 32) { 111 | return set_error("only 32-bit float format input is supported"); 112 | } 113 | 114 | int error; 115 | 116 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 117 | if (error) { 118 | d->radius = 0; 119 | } 120 | 121 | if (d->radius < 0 || d->radius > 3) { 122 | return set_error("\"radius\" must be in [0, 1, 2, 3]"); 123 | } 124 | 125 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 126 | if (error) { 127 | d->block_size = 16; 128 | } 129 | 130 | if (d->block_size != 16) { 131 | return set_error("\"block_size\" must be 16"); 132 | } 133 | 134 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 135 | if (error) { 136 | d->block_step = d->block_size; 137 | } 138 | 139 | int num_planes_args = vsapi->propNumElements(in, "planes"); 140 | d->process.fill(num_planes_args <= 0); 141 | for (int i = 0; i < num_planes_args; ++i) { 142 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 143 | 144 | if (plane < 0 || plane >= 
vi->format->numPlanes) { 145 | return set_error("plane index out of range"); 146 | } 147 | 148 | if (d->process[plane]) { 149 | return set_error("plane specified twice"); 150 | } 151 | 152 | d->process[plane] = true; 153 | } 154 | 155 | { 156 | auto ptr = vs_aligned_malloc( 157 | (2 * d->radius + 1) * d->block_size * d->block_size * sizeof(float), 158 | 64 159 | ); 160 | if (ptr == nullptr) { 161 | return set_error("alloc error"); 162 | } 163 | d->window.reset(ptr); 164 | } 165 | { 166 | auto window = vsapi->propGetFloatArray(in, "window", nullptr); 167 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size * d->block_size / 16; i++) { 168 | for (int j = 0; j < 16; j++) { 169 | d->window[i * 16 + j] = static_cast(window[i * 16 + j]); 170 | } 171 | } 172 | } 173 | 174 | { 175 | auto ptr = vs_aligned_malloc( 176 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * sizeof(float), 177 | 64 178 | ); 179 | if (ptr == nullptr) { 180 | return set_error("alloc error"); 181 | } 182 | d->sigma.reset(ptr); 183 | } 184 | { 185 | auto sigma = vsapi->propGetFloatArray(in, "sigma", nullptr); 186 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 187 | float sigma_padded[16] {}; 188 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 189 | sigma_padded[j] = static_cast(sigma[i * (d->block_size / 2 + 1) + j]); 190 | } 191 | for (int j = 0; j < 16; j++) { 192 | d->sigma[i * 16 + j] = sigma_padded[j]; 193 | } 194 | } 195 | } 196 | 197 | d->sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, nullptr)); 198 | d->pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, nullptr)); 199 | d->pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, nullptr)); 200 | 201 | d->filter_type = static_cast(vsapi->propGetInt(in, "filter_type", 0, nullptr)); 202 | 203 | d->zero_mean = !!vsapi->propGetInt(in, "zero_mean", 0, &error); 204 | if (error) { 205 | d->zero_mean = true; 206 | } 207 | if (d->zero_mean) { 208 | { 209 | auto ptr = vs_aligned_malloc( 210 | (2 * d->radius + 1) * d->block_size * (d->block_size / 2 + 1 + 15) * 2 * sizeof(float), 211 | 64 212 | ); 213 | if (ptr == nullptr) { 214 | return set_error("alloc error"); 215 | } 216 | d->window_freq.reset(ptr); 217 | } 218 | auto window_freq = vsapi->propGetFloatArray(in, "window_freq", nullptr); 219 | for (int i = 0; i < (2 * d->radius + 1) * d->block_size; i++) { 220 | float sigma_padded[32] {}; 221 | for (int j = 0; j < d->block_size / 2 + 1; j++) { 222 | sigma_padded[j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2]); 223 | sigma_padded[16 + j] = static_cast(window_freq[(i * (d->block_size / 2 + 1) + j) * 2 + 1]); 224 | } 225 | for (int j = 0; j < 32; j++) { 226 | d->window_freq[i * 2 * 16 + j] = sigma_padded[j]; 227 | } 228 | } 229 | } 230 | 231 | VSCoreInfo info; 232 | vsapi->getCoreInfo2(core, &info); 233 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order_relaxed); 234 | d->thread_data.reserve(info.numThreads); 235 | 236 | vsapi->createFilter( 237 | in, out, "DFTTest", 238 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 239 | fmParallel, 0, d.release(), core 240 | ); 241 | } 242 | 243 | 244 | static void VS_CC RDFT( 245 | const VSMap *in, VSMap *out, void *userData, 246 | VSCore *core, const VSAPI *vsapi 247 | ) noexcept { 248 | 249 | auto set_error = [vsapi, out](const char * error_message) -> void { 250 | vsapi->setError(out, error_message); 251 | }; 252 | 253 | int ndim = vsapi->propNumElements(in, "shape"); 254 | if (ndim != 1 && ndim != 2 && ndim != 3) { 255 | return 
set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 256 | } 257 | 258 | std::array shape {}; 259 | { 260 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 261 | for (int i = 0; i < ndim; i++) { 262 | shape[i] = int64ToIntS(shape_array[i]); 263 | } 264 | } 265 | 266 | int size = 1; 267 | for (int i = 0; i < ndim; i++) { 268 | size *= shape[i]; 269 | } 270 | if (vsapi->propNumElements(in, "data") != size) { 271 | return set_error("cannot reshape array"); 272 | } 273 | 274 | int complex_size = shape[ndim - 1] / 2 + 1; 275 | for (int i = 0; i < ndim - 1; i++) { 276 | complex_size *= shape[i]; 277 | } 278 | 279 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 280 | 281 | auto output = std::make_unique []>(complex_size); 282 | 283 | if (ndim == 1) { 284 | dft(output.get(), input, size, 1); 285 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 286 | } else if (ndim == 2) { 287 | for (int i = 0; i < shape[0]; i++) { 288 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 289 | } 290 | 291 | auto output2 = std::make_unique []>(complex_size); 292 | 293 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 294 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 295 | } 296 | 297 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 298 | } else { 299 | for (int i = 0; i < shape[0] * shape[1]; i++) { 300 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 301 | } 302 | 303 | auto output2 = std::make_unique []>(complex_size); 304 | 305 | for (int i = 0; i < shape[0]; i++) { 306 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 307 | dft( 308 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 309 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 310 | shape[1], 311 | (shape[2] / 2 + 1) 312 | ); 313 | } 314 | } 315 | 316 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 317 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 318 | } 319 | 320 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 321 | } 322 | } 323 | 324 | 325 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 326 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 327 | } 328 | 329 | 330 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 331 | VSConfigPlugin configFunc, 332 | VSRegisterFunction registerFunc, 333 | VSPlugin *plugin 334 | ) { 335 | 336 | configFunc( 337 | "io.github.amusementclub.dfttest2_gcc", 338 | "dfttest2_gcc", 339 | "DFTTest2 (GCC vector extension)", 340 | VAPOURSYNTH_API_VERSION, 1, plugin 341 | ); 342 | 343 | registerFunc( 344 | "DFTTest", 345 | "clip:clip;" 346 | "window:float[];" 347 | "sigma:float[];" 348 | "sigma2:float;" 349 | "pmin:float;" 350 | "pmax:float;" 351 | "filter_type:int;" 352 | "radius:int:opt;" 353 | "block_size:int:opt;" 354 | "block_step:int:opt;" 355 | "zero_mean:int:opt;" 356 | "window_freq:float[]:opt;" 357 | "planes:int[]:opt;", 358 | DFTTestCreate, nullptr, plugin 359 | ); 360 | 361 | registerFunc( 362 | "RDFT", 363 | "data:float[];" 364 | "shape:int[];", 365 | RDFT, nullptr, plugin 366 | ); 367 | 368 | registerFunc( 369 | "Version", 370 | "", 371 | Version, nullptr, plugin 372 | ); 373 | } 374 | -------------------------------------------------------------------------------- /hip_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define 
KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // IN_PLACE 13 | // WARPS_PER_BLOCK 14 | // WARP_SIZE 15 | // TYPE 16 | // SCALE 17 | // PEAK (optional) 18 | 19 | #if ZERO_MEAN 20 | // __device__ const float window_freq[]; // frequency response of the window 21 | #endif // ZERO_MEAN 22 | 23 | __device__ 24 | static int calc_pad_size(int size, int block_size, int block_step) { 25 | return size + ((size % block_size) ? block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 26 | } 27 | 28 | __device__ 29 | static int calc_pad_num(int size, int block_size, int block_step) { 30 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 31 | } 32 | 33 | __device__ 34 | static float to_float(TYPE x) { 35 | return static_cast(x) * static_cast(SCALE); 36 | } 37 | 38 | __device__ 39 | static TYPE from_float(float x) { 40 | #ifdef PEAK 41 | x /= static_cast(SCALE); 42 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 43 | return static_cast(__float2int_rz(x)); 44 | #else // PEAK // only integral types define it 45 | return static_cast(x / static_cast(SCALE)); 46 | #endif // PEAK 47 | } 48 | 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void im2col( 53 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 54 | float * __restrict__ dstp, 55 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 56 | int width, 57 | int height 58 | ) { 59 | 60 | int radius = static_cast(RADIUS); 61 | int block_size = static_cast(BLOCK_SIZE); 62 | int padded_block_size = IN_PLACE ? 
(block_size / 2 + 1) * 2 : block_size; 63 | int block_step = static_cast(BLOCK_STEP); 64 | 65 | int horizontal_num = calc_pad_num(width, block_size, block_step); 66 | int vertical_num = calc_pad_num(height, block_size, block_step); 67 | int horizontal_size = calc_pad_size(width, block_size, block_step); 68 | int vertical_size = calc_pad_size(height, block_size, block_step); 69 | int num_blocks = vertical_num * horizontal_num; 70 | 71 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 72 | int ix = i % horizontal_num; 73 | int iy = i / horizontal_num; 74 | auto dst = &dstp[i * (2 * radius + 1) * block_size * padded_block_size]; 75 | for (int j = 0; j < 2 * radius + 1; j++) { 76 | auto src = &srcp[(j * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 77 | for (int k = threadIdx.x % WARP_SIZE; k < block_size * block_size; k += WARP_SIZE) { 78 | int kx = k % block_size; 79 | int ky = k / block_size; 80 | float val = to_float(src[ky * horizontal_size + kx]) * window[j * block_size * block_size + k]; 81 | #if IN_PLACE == 1 82 | dst[(j * block_size + k / block_size) * padded_block_size + k % block_size] = val; 83 | #else 84 | dst[j * block_size * block_size + k] = val; 85 | #endif 86 | } 87 | } 88 | } 89 | } 90 | 91 | extern "C" 92 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 93 | __global__ 94 | void frequency_filtering( 95 | float2 * data, 96 | int num_blocks 97 | ) { 98 | 99 | int radius = static_cast(RADIUS); 100 | int block_size_1d = static_cast(BLOCK_SIZE); 101 | 102 | // each warp is responsible for a single block 103 | // assume that blockDim.x % WARP_SIZE == 0 104 | 105 | int block_size_x = block_size_1d / 2 + 1; 106 | int block_size_2d = block_size_1d * block_size_x; 107 | int block_size_3d = (2 * radius + 1) * block_size_2d; 108 | 109 | for (int i = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; i < num_blocks; i += gridDim.x * WARPS_PER_BLOCK) { 110 | #if ZERO_MEAN 111 | float gf; 112 | if (threadIdx.x % WARP_SIZE == 0) { 113 | gf = data[i * block_size_3d].x / window_freq[0]; 114 | } 115 | gf = __shfl(gf, 0); 116 | #endif // ZERO_MEAN 117 | 118 | for (int j = threadIdx.x % WARP_SIZE; j < block_size_3d; j += WARP_SIZE) { 119 | float2 local_data = data[i * block_size_3d + j]; 120 | 121 | #if ZERO_MEAN 122 | // remove mean 123 | float val1 = gf * window_freq[j * 2]; 124 | float val2 = gf * window_freq[j * 2 + 1]; 125 | local_data.x -= val1; 126 | local_data.y -= val2; 127 | #endif // ZERO_MEAN 128 | 129 | filter( 130 | local_data, 131 | j % block_size_x, 132 | (j % block_size_2d) / block_size_x, 133 | (j % block_size_3d) / block_size_2d 134 | ); 135 | 136 | #if ZERO_MEAN 137 | // add mean 138 | local_data.x += val1; 139 | local_data.y += val2; 140 | #endif // ZERO_MEAN 141 | 142 | data[i * block_size_3d + j] = local_data; 143 | } 144 | } 145 | } 146 | 147 | extern "C" 148 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 149 | __global__ 150 | void col2im( 151 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 152 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, padded_block_size) 153 | const float * __restrict__ src, 154 | int width, 155 | int height 156 | ) { 157 | 158 | int radius = static_cast(RADIUS); 159 | int block_size = static_cast(BLOCK_SIZE); 160 | int padded_block_size = IN_PLACE ? 
(block_size / 2 + 1) * 2 : block_size; 161 | int block_step = static_cast(BLOCK_STEP); 162 | 163 | // each thread is responsible for a single pixel 164 | int horizontal_size = calc_pad_size(width, block_size, block_step); 165 | int horizontal_num = calc_pad_num(width, block_size, block_step); 166 | int vertical_size = calc_pad_size(height, block_size, block_step); 167 | int vertical_num = calc_pad_num(height, block_size, block_step); 168 | int pad_x = (horizontal_size - width) / 2; 169 | int pad_y = (vertical_size - height) / 2; 170 | 171 | int x = blockIdx.x * blockDim.x + threadIdx.x; 172 | int y = blockIdx.y * blockDim.y + threadIdx.y; 173 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 174 | return ; 175 | } 176 | 177 | float sum {}; 178 | 179 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 180 | int i2 = min(y / block_step, vertical_num - 1); 181 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 182 | int j2 = min(x / block_step, horizontal_num - 1); 183 | 184 | for (int i = i1; i <= i2; i++) { 185 | int offset_y = y - i * block_step; 186 | for (int j = j1; j <= j2; j++) { 187 | int offset_x = x - j * block_step; 188 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * padded_block_size + offset_x; 189 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 190 | sum += src[src_offset] * window[window_offset]; 191 | } 192 | } 193 | 194 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 195 | } 196 | )"""; 197 | 198 | #endif // KERNEL_HPP 199 | -------------------------------------------------------------------------------- /hiprtc_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // WARPS_PER_BLOCK 13 | // WARP_SIZE 14 | // TYPE 15 | // SCALE 16 | // PEAK (optional) 17 | 18 | #if ZERO_MEAN 19 | // __device__ const float window_freq[]; // frequency response of the window 20 | #endif // ZERO_MEAN 21 | 22 | __device__ 23 | static int calc_pad_size(int size, int block_size, int block_step) { 24 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 25 | } 26 | 27 | __device__ 28 | static int calc_pad_num(int size, int block_size, int block_step) { 29 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 30 | } 31 | 32 | __device__ 33 | static float to_float(TYPE x) { 34 | return static_cast(x) * static_cast(SCALE); 35 | } 36 | 37 | __device__ 38 | static TYPE from_float(float x) { 39 | #ifdef PEAK 40 | x /= static_cast(SCALE); 41 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 42 | return static_cast(__float2int_rz(x)); 43 | #else // PEAK // only integral types define it 44 | return static_cast(x / static_cast(SCALE)); 45 | #endif // PEAK 46 | } 47 | 48 | // im2col + rdft + frequency_filtering + irdft 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void fused( 53 | float * __restrict__ dstp, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 54 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 55 | int width, 56 | int height 57 | ) { 58 | 59 | constexpr int radius = static_cast(RADIUS); 60 | constexpr int block_size = static_cast(BLOCK_SIZE); 61 | constexpr int block_step = static_cast(BLOCK_STEP); 62 | 63 | int horizontal_num = calc_pad_num(width, block_size, block_step); 64 | int vertical_num = calc_pad_num(height, block_size, block_step); 65 | int horizontal_size = calc_pad_size(width, block_size, block_step); 66 | int vertical_size = calc_pad_size(height, block_size, block_step); 67 | int num_blocks = vertical_num * horizontal_num; 68 | 69 | constexpr int warp_size = static_cast(WARP_SIZE); 70 | constexpr int warps_per_block = static_cast(WARPS_PER_BLOCK); 71 | constexpr int transpose_stride = (warp_size % block_size == 0) ? 
block_size + 1 : block_size; 72 | __shared__ float2 shared_transpose_buffer[warps_per_block * block_size * transpose_stride]; 73 | 74 | int warp_id = threadIdx.x / warp_size; 75 | int lane_id = threadIdx.x % warp_size; 76 | 77 | if (lane_id >= block_size) { 78 | return; 79 | } 80 | 81 | auto transpose_buffer = &shared_transpose_buffer[warp_id * block_size * transpose_stride]; 82 | 83 | for (int block_id = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; block_id < num_blocks; block_id += gridDim.x * WARPS_PER_BLOCK) { 84 | int ix = block_id % horizontal_num; 85 | int iy = block_id / horizontal_num; 86 | 87 | constexpr int active_mask = (1 << block_size) - 1; 88 | float2 thread_data[(2 * radius + 1) * block_size]; 89 | 90 | // im2col 91 | #pragma unroll 92 | for (int i = 0; i < 2 * radius + 1; i++) { 93 | auto src = &srcp[(i * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 94 | auto local_thread_data = &thread_data[i * block_size]; 95 | #pragma unroll 96 | for (int j = 0; j < block_size; j++) { 97 | ((float *) local_thread_data)[j] = to_float(src[j * horizontal_size + lane_id]) * window[(i * block_size + j) * block_size + lane_id]; 98 | } 99 | } 100 | 101 | // rdft 102 | #pragma unroll 103 | for (int i = 0; i < 2 * radius + 1; i++) { 104 | auto local_thread_data = &thread_data[i * block_size]; 105 | 106 | __syncthreads(); 107 | // transpose store of real data 108 | #pragma unroll 109 | for (int j = 0; j < block_size; j++) { 110 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j]; 111 | } 112 | 113 | __syncthreads(); 114 | // transpose load of real data 115 | #pragma unroll 116 | for (int j = 0; j < block_size; j++) { 117 | ((float *) local_thread_data)[j] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 118 | } 119 | 120 | __syncthreads(); 121 | rdft((float *) local_thread_data); 122 | 123 | // transpose store of complex data 124 | #pragma unroll 125 | for (int j = 0; j < block_size / 2 + 1; j++) { 126 | transpose_buffer[lane_id * transpose_stride + j] = local_thread_data[j]; 127 | } 128 | 129 | __syncthreads(); 130 | if (lane_id < block_size / 2 + 1) { 131 | // transpose load of complex data 132 | #pragma unroll 133 | for (int j = 0; j < block_size; j++) { 134 | local_thread_data[j] = transpose_buffer[j * transpose_stride + lane_id]; 135 | } 136 | 137 | dft((float *) local_thread_data); 138 | } 139 | } 140 | 141 | if (lane_id < block_size / 2 + 1) { 142 | #pragma unroll 143 | for (int i = 0; i < block_size; i++) { 144 | dft<2 * radius + 1>((float *) &thread_data[i], block_size); 145 | } 146 | } 147 | 148 | // frequency_filtering 149 | if (lane_id < block_size / 2 + 1) { 150 | #if ZERO_MEAN 151 | float gf; 152 | if (lane_id == 0) { 153 | gf = thread_data[0].x / window_freq[0]; 154 | } 155 | gf = __shfl(gf, 0); 156 | #endif // ZERO_MEAN 157 | #pragma unroll 158 | for (int i = 0; i < 2 * radius + 1; i++) { 159 | #pragma unroll 160 | for (int j = 0; j < block_size; j++) { 161 | float2 local_data = thread_data[i * block_size + j]; 162 | 163 | #if ZERO_MEAN 164 | // remove mean 165 | float val1 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2]; 166 | float val2 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2 + 1]; 167 | local_data.x -= val1; 168 | local_data.y -= val2; 169 | #endif // ZERO_MEAN 170 | 171 | filter(local_data, lane_id, j, i); 172 | 173 | #if ZERO_MEAN 174 | // add mean 175 | local_data.x += val1; 176 | local_data.y += val2; 177 | 
#endif // ZERO_MEAN 178 | 179 | thread_data[i * block_size + j] = local_data; 180 | } 181 | } 182 | } 183 | 184 | // irdft 185 | if (lane_id < block_size / 2 + 1) { 186 | #pragma unroll 187 | for (int i = 0; i < block_size; i++) { 188 | idft<2 * radius + 1>((float *) &thread_data[i], block_size); 189 | } 190 | } 191 | 192 | // this is not a full 3d irdft, because only a single slice is required 193 | auto local_thread_data = &thread_data[radius * block_size]; 194 | 195 | __syncthreads(); 196 | if (lane_id < block_size / 2 + 1) { 197 | idft((float *) local_thread_data); 198 | 199 | // transpose store of complex data 200 | #pragma unroll 201 | for (int j = 0; j < block_size; j++) { 202 | transpose_buffer[j * transpose_stride + lane_id] = local_thread_data[j]; 203 | } 204 | } 205 | 206 | __syncthreads(); 207 | #pragma unroll 208 | for (int j = 0; j < block_size / 2 + 1; j++) { 209 | // transpose load of complex data 210 | local_thread_data[j].x = transpose_buffer[lane_id * transpose_stride + j].x; 211 | local_thread_data[j].y = transpose_buffer[lane_id * transpose_stride + j].y; 212 | } 213 | 214 | irdft((float *) local_thread_data); 215 | 216 | __syncthreads(); 217 | #pragma unroll 218 | for (int j = 0; j < block_size; j++) { 219 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j == 0 ? j : block_size - j]; 220 | } 221 | 222 | __syncthreads(); 223 | auto local_dst = &dstp[(block_id * (2 * radius + 1) + radius) * block_size * block_size]; 224 | #pragma unroll 225 | for (int j = 0; j < block_size; j++) { 226 | local_dst[j * block_size + lane_id] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 227 | } 228 | } 229 | } 230 | 231 | extern "C" 232 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 233 | __global__ 234 | void col2im( 235 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 236 | const float * __restrict__ src, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 237 | int width, 238 | int height 239 | ) { 240 | 241 | int radius = static_cast(RADIUS); 242 | int block_size = static_cast(BLOCK_SIZE); 243 | int block_step = static_cast(BLOCK_STEP); 244 | 245 | // each thread is responsible for a single pixel 246 | int horizontal_size = calc_pad_size(width, block_size, block_step); 247 | int horizontal_num = calc_pad_num(width, block_size, block_step); 248 | int vertical_size = calc_pad_size(height, block_size, block_step); 249 | int vertical_num = calc_pad_num(height, block_size, block_step); 250 | int pad_x = (horizontal_size - width) / 2; 251 | int pad_y = (vertical_size - height) / 2; 252 | 253 | int x = blockIdx.x * blockDim.x + threadIdx.x; 254 | int y = blockIdx.y * blockDim.y + threadIdx.y; 255 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 256 | return ; 257 | } 258 | 259 | float sum {}; 260 | 261 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 262 | int i2 = min(y / block_step, vertical_num - 1); 263 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 264 | int j2 = min(x / block_step, horizontal_num - 1); 265 | 266 | for (int i = i1; i <= i2; i++) { 267 | int offset_y = y - i * block_step; 268 | for (int j = j1; j <= j2; j++) { 269 | int offset_x = x - j * block_step; 270 | auto src_offset = (((i * horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * block_size + offset_x; 271 | auto window_offset = (radius * block_size + offset_y) * 
block_size + offset_x; 272 | sum += src[src_offset] * window[window_offset]; 273 | } 274 | } 275 | 276 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 277 | } 278 | )"""; 279 | 280 | #endif // KERNEL_HPP 281 | -------------------------------------------------------------------------------- /hiprtc_source/source.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | #include "dft_kernels.hpp" 30 | #include "kernel.hpp" 31 | 32 | #include // generated by cmake 33 | 34 | // real/complex-input DFT 35 | template 36 | requires 37 | (std::is_same_v || std::is_same_v>) 38 | static void dft( 39 | std::complex * VS_RESTRICT dst, 40 | const T_in * VS_RESTRICT src, 41 | int n, 42 | int stride 43 | ) { 44 | 45 | int out_num = std::is_floating_point_v ? (n / 2 + 1) : n; 46 | for (int i = 0; i < out_num; i++) { 47 | std::complex sum {}; 48 | for (int j = 0; j < n; j++) { 49 | auto imag = -2 * i * j * std::numbers::pi_v / n; 50 | auto weight = std::complex(std::cos(imag), std::sin(imag)); 51 | sum += src[j * stride] * weight; 52 | } 53 | dst[i * stride] = sum; 54 | } 55 | } 56 | 57 | static bool success(hipError_t result) { 58 | return result == hipSuccess; 59 | } 60 | static bool success(hiprtcResult result) { 61 | return result == HIPRTC_SUCCESS; 62 | } 63 | 64 | static const char * get_error(hipError_t error) { 65 | return hipGetErrorString(error); 66 | } 67 | 68 | static const char * get_error(hiprtcResult error) { 69 | return hiprtcGetErrorString(error); 70 | } 71 | 72 | #define showError(expr) show_error_impl(expr, # expr, __LINE__) 73 | template 74 | static void show_error_impl(T result, const char * source, int line_no) { 75 | if (!success(result)) [[unlikely]] { 76 | std::fprintf(stderr, "[%d] %s failed: %s\n", line_no, source, get_error(result)); 77 | } 78 | } 79 | 80 | #define checkError(expr) do { \ 81 | if (auto result = expr; !success(result)) [[unlikely]] { \ 82 | std::ostringstream error; \ 83 | error << '[' << __LINE__ << "] '" # expr "' failed: " << get_error(result); \ 84 | return set_error(error.str().c_str()); \ 85 | } \ 86 | } while (0) 87 | 88 | static void hipStreamDestroyCustom(hipStream_t stream) { 89 | showError(hipStreamDestroy(stream)); 90 | } 91 | 92 | static void hipEventDestroyCustom(hipEvent_t event) { 93 | showError(hipEventDestroy(event)); 94 | } 95 | 96 | static void hipFreeCustom(hipDeviceptr_t p) { 97 | showError(hipFree(p)); 98 | } 99 | 100 | static void hipModuleUnloadCustom(hipModule_t module) { 101 | showError(hipModuleUnload(module)); 102 | } 103 | 104 | static void hiprtcDestroyProgramCustom(hiprtcProgram * program) { 105 | showError(hiprtcDestroyProgram(program)); 106 | } 107 | 108 | struct node_freer { 109 | const VSAPI * & vsapi; 110 | VSNodeRef * node {}; 111 | void release() { 112 | node = nullptr; 113 | } 114 | ~node_freer() { 115 | if (node) { 116 | vsapi->freeNode(node); 117 | } 118 | } 119 | }; 120 | 121 | template 122 | requires 123 | std::default_initializable && 124 | std::is_trivially_copy_assignable_v && 125 | std::convertible_to && 126 | std::invocable 127 | struct Resource { 128 | T data; 129 | 130 | [[nodiscard]] constexpr 
Resource() noexcept = default; 131 | 132 | [[nodiscard]] constexpr Resource(T x) noexcept : data(x) {} 133 | 134 | [[nodiscard]] constexpr Resource(Resource&& other) noexcept 135 | : data(std::exchange(other.data, T{})) 136 | { } 137 | 138 | Resource& operator=(Resource&& other) noexcept { 139 | if (this == &other) return *this; 140 | deleter_(data); 141 | data = std::exchange(other.data, T{}); 142 | return *this; 143 | } 144 | 145 | Resource operator=(Resource other) = delete; 146 | 147 | Resource(const Resource& other) = delete; 148 | 149 | constexpr operator T() const noexcept { 150 | return data; 151 | } 152 | 153 | constexpr auto deleter_(T x) noexcept { 154 | if (x) { 155 | deleter(x); 156 | x = T{}; 157 | } 158 | } 159 | 160 | Resource& operator=(T x) noexcept { 161 | deleter_(data); 162 | data = x; 163 | return *this; 164 | } 165 | 166 | constexpr ~Resource() noexcept { 167 | deleter_(data); 168 | } 169 | }; 170 | 171 | template 172 | static T square(const T & x) { 173 | return x * x; 174 | } 175 | 176 | static int calc_pad_size(int size, int block_size, int block_step) { 177 | return ( 178 | size 179 | + ((size % block_size) ? block_size - size % block_size : 0) 180 | + std::max(block_size - block_step, block_step) * 2 181 | ); 182 | } 183 | 184 | static int calc_pad_num(int size, int block_size, int block_step) { 185 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 186 | } 187 | 188 | template 189 | static void reflection_padding_impl( 190 | T * VS_RESTRICT dst, // shape: (pad_height, pad_width) 191 | const T * VS_RESTRICT src, // shape: (height, stride) 192 | int width, int height, int stride, 193 | int block_size, int block_step 194 | ) { 195 | 196 | int pad_width = calc_pad_size(width, block_size, block_step); 197 | int pad_height = calc_pad_size(height, block_size, block_step); 198 | 199 | int offset_y = (pad_height - height) / 2; 200 | int offset_x = (pad_width - width) / 2; 201 | 202 | vs_bitblt( 203 | &dst[offset_y * pad_width + offset_x], pad_width * sizeof(T), 204 | src, stride * sizeof(T), 205 | width * sizeof(T), height 206 | ); 207 | 208 | // copy left and right regions 209 | for (int y = offset_y; y < offset_y + height; y++) { 210 | auto dst_line = &dst[y * pad_width]; 211 | 212 | for (int x = 0; x < offset_x; x++) { 213 | dst_line[x] = dst_line[offset_x * 2 - x]; 214 | } 215 | 216 | for (int x = offset_x + width; x < pad_width; x++) { 217 | dst_line[x] = dst_line[2 * (offset_x + width) - 2 - x]; 218 | } 219 | } 220 | 221 | // copy top region 222 | for (int y = 0; y < offset_y; y++) { 223 | std::memcpy( 224 | &dst[y * pad_width], 225 | &dst[(offset_y * 2 - y) * pad_width], 226 | pad_width * sizeof(T) 227 | ); 228 | } 229 | 230 | // copy bottom region 231 | for (int y = offset_y + height; y < pad_height; y++) { 232 | std::memcpy( 233 | &dst[y * pad_width], 234 | &dst[(2 * (offset_y + height) - 2 - y) * pad_width], 235 | pad_width * sizeof(T) 236 | ); 237 | } 238 | } 239 | 240 | static void reflection_padding( 241 | uint8_t * VS_RESTRICT dst, // shape: (pad_height, pad_width) 242 | const uint8_t * VS_RESTRICT src, // shape: (height, stride) 243 | int width, int height, int stride, 244 | int block_size, int block_step, 245 | int bytes_per_sample 246 | ) { 247 | 248 | if (bytes_per_sample == 1) { 249 | reflection_padding_impl( 250 | static_cast(dst), 251 | static_cast(src), 252 | width, height, stride, 253 | block_size, block_step 254 | ); 255 | } else if (bytes_per_sample == 2) { 256 | reflection_padding_impl( 257 | 
reinterpret_cast(dst), 258 | reinterpret_cast(src), 259 | width, height, stride, 260 | block_size, block_step 261 | ); 262 | } else if (bytes_per_sample == 4) { 263 | reflection_padding_impl( 264 | reinterpret_cast(dst), 265 | reinterpret_cast(src), 266 | width, height, stride, 267 | block_size, block_step 268 | ); 269 | } 270 | } 271 | 272 | static std::variant compile( 273 | const char * user_kernel, 274 | hipDevice_t device, 275 | int radius, 276 | int block_size, 277 | int block_step, 278 | bool in_place, 279 | int warp_size, 280 | int warps_per_block, 281 | int sample_type, 282 | int bits_per_sample 283 | ) { 284 | 285 | auto set_error = [](const char * error_message) -> std::string { 286 | return std::string{ error_message }; 287 | }; 288 | 289 | hipDeviceProp_t prop; 290 | checkError(hipGetDeviceProperties(&prop, device)); 291 | 292 | constexpr bool generate_bitcode = false; 293 | 294 | std::ostringstream kernel_source; 295 | kernel_source << "#define RADIUS " << radius << '\n'; 296 | kernel_source << "#define BLOCK_SIZE " << block_size << '\n'; 297 | kernel_source << "#define BLOCK_STEP " << block_step << '\n'; 298 | kernel_source << "#define IN_PLACE " << (int) in_place << '\n'; 299 | kernel_source << "#define WARP_SIZE " << warp_size << '\n'; 300 | kernel_source << "#define WARPS_PER_BLOCK " << warps_per_block << '\n'; 301 | if (sample_type == stInteger) { 302 | int bytes_per_sample = bits_per_sample / 8; 303 | const char * type {}; 304 | if (bytes_per_sample == 1) { 305 | type = "unsigned char"; 306 | } else if (bytes_per_sample == 2) { 307 | type = "unsigned short"; 308 | } else if (bytes_per_sample == 4) { 309 | type = "unsigned int"; 310 | } 311 | kernel_source << "#define TYPE " << type << '\n'; 312 | kernel_source << "#define SCALE " << 1.0 / (1 << (bits_per_sample - 8)) << '\n'; 313 | kernel_source << "#define PEAK " << ((1 << bits_per_sample) - 1) << '\n'; 314 | } else if (sample_type == stFloat) { 315 | if (bits_per_sample == 32) { 316 | kernel_source << "#define TYPE float\n"; 317 | } 318 | kernel_source << "#define SCALE 255.0\n"; 319 | } 320 | kernel_source << user_kernel << '\n'; 321 | kernel_source << fft_header; 322 | for (const auto & impl : rdft_implementations) { 323 | kernel_source << impl; 324 | } 325 | for (const auto & impl : dft_implementations) { 326 | kernel_source << impl; 327 | } 328 | for (const auto & impl : idft_implementations) { 329 | kernel_source << impl; 330 | } 331 | for (const auto & impl : irdft_implementations) { 332 | kernel_source << impl; 333 | } 334 | kernel_source << kernel_implementation; 335 | 336 | hiprtcProgram program; 337 | checkError(hiprtcCreateProgram(&program, kernel_source.str().c_str(), nullptr, 0, nullptr, nullptr)); 338 | Resource destroyer { &program }; 339 | 340 | const std::string arch_str = std::string("--offload-arch=") + prop.gcnArchName; 341 | 342 | const char * opts[] = { 343 | arch_str.c_str(), 344 | "-std=c++17", 345 | "-ffast-math", 346 | "-mno-wavefrontsize64", // rdna only 347 | }; 348 | 349 | auto compilation = hiprtcCompileProgram(program, (int) std::extent_v, opts); 350 | 351 | size_t log_size; 352 | showError(hiprtcGetProgramLogSize(program, &log_size)); 353 | 354 | std::string error_message; 355 | if (log_size > 1) { 356 | error_message.resize(log_size); 357 | showError(hiprtcGetProgramLog(program, error_message.data())); 358 | } 359 | 360 | if (success(compilation)) { 361 | if (log_size > 1) { 362 | std::fprintf(stderr, "hiprtc: %s\n", error_message.c_str()); 363 | } 364 | } else { 365 | return 
error_message; 366 | } 367 | 368 | std::unique_ptr image; 369 | if (generate_bitcode) { 370 | size_t bitcode_size; 371 | checkError(hiprtcGetBitcodeSize(program, &bitcode_size)); 372 | image = std::make_unique_for_overwrite(bitcode_size); 373 | checkError(hiprtcGetBitcode(program, image.get())); 374 | } else { 375 | size_t code_size; 376 | checkError(hiprtcGetCodeSize(program, &code_size)); 377 | image = std::make_unique_for_overwrite(code_size); 378 | checkError(hiprtcGetCode(program, image.get())); 379 | } 380 | 381 | hipModule_t module; 382 | checkError(hipModuleLoadData(&module, image.get())); 383 | 384 | return module; 385 | } 386 | 387 | 388 | struct ticket_semaphore { 389 | std::atomic ticket {}; 390 | std::atomic current {}; 391 | 392 | void acquire() noexcept { 393 | intptr_t tk { ticket.fetch_add(1, std::memory_order::acquire) }; 394 | while (true) { 395 | intptr_t curr { current.load(std::memory_order::acquire) }; 396 | if (tk <= curr) { 397 | return; 398 | } 399 | current.wait(curr, std::memory_order::relaxed); 400 | } 401 | } 402 | 403 | void release() noexcept { 404 | current.fetch_add(1, std::memory_order::release); 405 | current.notify_all(); 406 | } 407 | }; 408 | 409 | 410 | struct DFTTestThreadData { 411 | uint8_t * h_padded; // shape: (pad_height, pad_width) 412 | }; 413 | 414 | 415 | struct DFTTestStreamData { 416 | Resource stream; 417 | 418 | Resource event; 419 | 420 | // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 421 | Resource d_spatial; 422 | 423 | Resource d_padded; // shape: (pad_height, pad_width) 424 | }; 425 | 426 | 427 | struct DFTTestData { 428 | VSNodeRef * node; 429 | int radius; 430 | int block_size; 431 | int block_step; 432 | std::array process; 433 | hipDevice_t device; // device_id 434 | bool in_place; 435 | 436 | int warp_size; 437 | 438 | int warps_per_block = 1; 439 | 440 | ticket_semaphore semaphore; 441 | std::vector stream_data; 442 | std::vector ticket; 443 | std::mutex ticket_lock; 444 | 445 | Resource module; 446 | hipFunction_t fused_kernel; 447 | int fused_num_blocks; 448 | hipFunction_t col2im_kernel; 449 | 450 | std::atomic num_uninitialized_threads; 451 | std::unordered_map thread_data; 452 | std::shared_mutex thread_data_lock; 453 | }; 454 | 455 | static void VS_CC DFTTestInit( 456 | VSMap *in, VSMap *out, void **instanceData, VSNode *node, 457 | VSCore *core, const VSAPI *vsapi 458 | ) noexcept { 459 | 460 | auto d = static_cast(*instanceData); 461 | 462 | auto vi = vsapi->getVideoInfo(d->node); 463 | vsapi->setVideoInfo(vi, 1, node); 464 | } 465 | 466 | static const VSFrameRef *VS_CC DFTTestGetFrame( 467 | int n, int activationReason, void **instanceData, void **frameData, 468 | VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi 469 | ) noexcept { 470 | 471 | auto d = static_cast(*instanceData); 472 | 473 | if (activationReason == arInitial) { 474 | int start = std::max(n - d->radius, 0); 475 | auto vi = vsapi->getVideoInfo(d->node); 476 | int end = std::min(n + d->radius, vi->numFrames - 1); 477 | for (int i = start; i <= end; i++) { 478 | vsapi->requestFrameFilter(i, d->node, frameCtx); 479 | } 480 | return nullptr; 481 | } else if (activationReason != arAllFramesReady) { 482 | return nullptr; 483 | } 484 | 485 | auto set_error = [vsapi, frameCtx](const char * error_message) -> std::nullptr_t { 486 | vsapi->setFilterError(error_message, frameCtx); 487 | return nullptr; 488 | }; 489 | 490 | checkError(hipSetDevice(d->device)); 491 | 492 | auto vi = vsapi->getVideoInfo(d->node); 493 | 494 | 
DFTTestThreadData thread_data; 495 | 496 | auto thread_id = std::this_thread::get_id(); 497 | if (d->num_uninitialized_threads.load(std::memory_order::acquire) == 0) { 498 | const auto & const_data = d->thread_data; 499 | thread_data = const_data.at(thread_id); 500 | } else { 501 | bool initialized = true; 502 | 503 | d->thread_data_lock.lock_shared(); 504 | try { 505 | const auto & const_data = d->thread_data; 506 | thread_data = const_data.at(thread_id); 507 | } catch (const std::out_of_range &) { 508 | initialized = false; 509 | } 510 | d->thread_data_lock.unlock_shared(); 511 | 512 | if (!initialized) { 513 | auto padded_size = ( 514 | (2 * d->radius + 1) * 515 | calc_pad_size(vi->height, d->block_size, d->block_step) * 516 | calc_pad_size(vi->width, d->block_size, d->block_step) * 517 | vi->format->bytesPerSample 518 | ); 519 | 520 | checkError(hipHostMalloc((void **) &thread_data.h_padded, padded_size, 0)); 521 | 522 | { 523 | std::lock_guard _ { d->thread_data_lock }; 524 | d->thread_data.emplace(thread_id, thread_data); 525 | } 526 | 527 | d->num_uninitialized_threads.fetch_sub(1, std::memory_order::release); 528 | } 529 | } 530 | 531 | std::vectorfreeFrame)>> src_frames; 532 | src_frames.reserve(2 * d->radius + 1); 533 | for (int i = n - d->radius; i <= n + d->radius; i++) { 534 | src_frames.emplace_back( 535 | vsapi->getFrameFilter(std::clamp(i, 0, vi->numFrames - 1), d->node, frameCtx), 536 | vsapi->freeFrame 537 | ); 538 | } 539 | 540 | auto & src_center_frame = src_frames[d->radius]; 541 | auto format = vsapi->getFrameFormat(src_center_frame.get()); 542 | 543 | const VSFrameRef * fr[] { 544 | d->process[0] ? nullptr : src_center_frame.get(), 545 | d->process[1] ? nullptr : src_center_frame.get(), 546 | d->process[2] ? nullptr : src_center_frame.get() 547 | }; 548 | const int pl[] { 0, 1, 2 }; 549 | std::unique_ptrfreeFrame)> dst_frame { 550 | vsapi->newVideoFrame2(format, vi->width, vi->height, fr, pl, src_center_frame.get(), core), 551 | vsapi->freeFrame 552 | }; 553 | 554 | for (int plane = 0; plane < format->numPlanes; plane++) { 555 | if (!d->process[plane]) { 556 | continue; 557 | } 558 | 559 | int width = vsapi->getFrameWidth(src_center_frame.get(), plane); 560 | int height = vsapi->getFrameHeight(src_center_frame.get(), plane); 561 | int stride = vsapi->getStride(src_center_frame.get(), plane) / vi->format->bytesPerSample; 562 | 563 | int padded_size_spatial = ( 564 | calc_pad_size(height, d->block_size, d->block_step) * 565 | calc_pad_size(width, d->block_size, d->block_step) 566 | ); 567 | 568 | for (int i = 0; i < 2 * d->radius + 1; i++) { 569 | auto srcp = vsapi->getReadPtr(src_frames[i].get(), plane); 570 | reflection_padding( 571 | &thread_data.h_padded[(i * padded_size_spatial) * vi->format->bytesPerSample], 572 | srcp, 573 | width, height, stride, 574 | d->block_size, d->block_step, 575 | vi->format->bytesPerSample 576 | ); 577 | } 578 | 579 | { 580 | d->semaphore.acquire(); 581 | 582 | int ticket; 583 | { 584 | std::lock_guard lock { d->ticket_lock }; 585 | ticket = d->ticket.back(); 586 | d->ticket.pop_back(); 587 | } 588 | 589 | auto & stream_data = d->stream_data[ticket]; 590 | 591 | int padded_bytes = (2 * d->radius + 1) * padded_size_spatial * vi->format->bytesPerSample; 592 | checkError(hipMemcpyHtoDAsync(stream_data.d_padded.data, thread_data.h_padded, padded_bytes, stream_data.stream)); 593 | { 594 | void * params[] { &stream_data.d_spatial.data, &stream_data.d_padded.data, &width, &height }; 595 | checkError(hipModuleLaunchKernel( 596 | 
d->fused_kernel, 597 | d->fused_num_blocks, 1, 1, 598 | d->warps_per_block * d->warp_size, 1, 1, 599 | 0, 600 | stream_data.stream, 601 | params, nullptr 602 | )); 603 | } 604 | { 605 | void * params[] { &stream_data.d_padded.data, &stream_data.d_spatial.data, &width, &height }; 606 | unsigned int vertical_size = calc_pad_size(height, d->block_size, d->block_step); 607 | unsigned int horizontal_size = calc_pad_size(width, d->block_size, d->block_step); 608 | unsigned int grid_x = (horizontal_size + d->warp_size - 1) / d->warp_size; 609 | unsigned int grid_y = (vertical_size + d->warps_per_block - 1) / d->warps_per_block; 610 | checkError(hipModuleLaunchKernel( 611 | d->col2im_kernel, 612 | grid_x, grid_y, 1, 613 | d->warp_size, d->warps_per_block, 1, 614 | 0, 615 | stream_data.stream, 616 | params, nullptr 617 | )); 618 | } 619 | { 620 | unsigned int pad_width = calc_pad_size(width, d->block_size, d->block_step); 621 | unsigned int pad_height = calc_pad_size(height, d->block_size, d->block_step); 622 | const HIP_MEMCPY3D config { 623 | .srcXInBytes = (pad_width - width) / 2 * vi->format->bytesPerSample, 624 | .srcY = (pad_height - height) / 2, 625 | .srcZ = (unsigned int) d->radius, 626 | .srcMemoryType = hipMemoryTypeDevice, 627 | .srcDevice = stream_data.d_padded.data, 628 | .srcPitch = pad_width * vi->format->bytesPerSample, 629 | .srcHeight = pad_height, 630 | .dstXInBytes = (pad_width - width) / 2 * vi->format->bytesPerSample, 631 | .dstY = (pad_height - height) / 2, 632 | .dstZ = 0, // vs_bitblt(dstp) copies from the 0-th slice 633 | .dstMemoryType = hipMemoryTypeHost, 634 | .dstHost = thread_data.h_padded, 635 | .dstPitch = pad_width * vi->format->bytesPerSample, 636 | .dstHeight = pad_height, 637 | .WidthInBytes = (unsigned int) width * vi->format->bytesPerSample, 638 | .Height = (unsigned int) height, 639 | .Depth = 1 640 | }; 641 | checkError(hipDrvMemcpy3DAsync(&config, stream_data.stream)); 642 | } 643 | 644 | checkError(hipEventRecord(stream_data.event, stream_data.stream)); 645 | checkError(hipEventSynchronize(stream_data.event)); 646 | 647 | { 648 | std::lock_guard lock { d->ticket_lock }; 649 | d->ticket.emplace_back(ticket); 650 | } 651 | d->semaphore.release(); 652 | } 653 | 654 | int pad_width = calc_pad_size(width, d->block_size, d->block_step); 655 | int pad_height = calc_pad_size(height, d->block_size, d->block_step); 656 | int offset_y = (pad_height - height) / 2; 657 | int offset_x = (pad_width - width) / 2; 658 | 659 | auto dstp = vsapi->getWritePtr(dst_frame.get(), plane); 660 | auto input = &thread_data.h_padded[(offset_y * pad_width + offset_x) * vi->format->bytesPerSample]; 661 | vs_bitblt( 662 | dstp, stride * vi->format->bytesPerSample, 663 | input, pad_width * vi->format->bytesPerSample, 664 | width * vi->format->bytesPerSample, height 665 | ); 666 | } 667 | 668 | return dst_frame.release(); 669 | } 670 | 671 | static void VS_CC DFTTestFree( 672 | void *instanceData, VSCore *core, const VSAPI *vsapi 673 | ) noexcept { 674 | 675 | auto d = static_cast(instanceData); 676 | 677 | vsapi->freeNode(d->node); 678 | 679 | for (const auto & [_, thread_data] : d->thread_data) { 680 | showError(hipHostFree(thread_data.h_padded)); 681 | } 682 | 683 | delete d; 684 | } 685 | 686 | static void VS_CC DFTTestCreate( 687 | const VSMap *in, VSMap *out, void *userData, 688 | VSCore *core, const VSAPI *vsapi 689 | ) noexcept { 690 | 691 | auto d = std::make_unique(); 692 | 693 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 694 | node_freer node_freer { vsapi, d->node 
}; 695 | 696 | auto set_error = [vsapi, out](const char * error_message) -> void { 697 | vsapi->setError(out, error_message); 698 | return ; 699 | }; 700 | 701 | auto vi = vsapi->getVideoInfo(d->node); 702 | 703 | auto user_kernel = vsapi->propGetData(in, "kernel", 0, nullptr); 704 | 705 | int error; 706 | 707 | d->radius = int64ToIntS(vsapi->propGetInt(in, "radius", 0, &error)); 708 | if (error) { 709 | d->radius = 0; 710 | } 711 | 712 | if (d->radius < 0 || d->radius > 3) { 713 | return set_error("\"radius\" must be in [0, 1, 2, 3]"); 714 | } 715 | 716 | d->block_size = int64ToIntS(vsapi->propGetInt(in, "block_size", 0, &error)); 717 | if (error) { 718 | d->block_size = 16; 719 | } 720 | 721 | if (d->block_size != 16) { 722 | return set_error("\"block_size\" must be 16"); 723 | } 724 | 725 | d->block_step = int64ToIntS(vsapi->propGetInt(in, "block_step", 0, &error)); 726 | if (error) { 727 | d->block_step = d->block_size; 728 | } 729 | 730 | int num_planes_args = vsapi->propNumElements(in, "planes"); 731 | d->process.fill(num_planes_args <= 0); 732 | for (int i = 0; i < num_planes_args; ++i) { 733 | int plane = static_cast(vsapi->propGetInt(in, "planes", i, nullptr)); 734 | 735 | if (plane < 0 || plane >= vi->format->numPlanes) { 736 | return set_error("plane index out of range"); 737 | } 738 | 739 | if (d->process[plane]) { 740 | return set_error("plane specified twice"); 741 | } 742 | 743 | d->process[plane] = true; 744 | } 745 | 746 | d->in_place = !!(vsapi->propGetInt(in, "in_place", 0, &error)); 747 | if (error) { 748 | d->in_place = true; 749 | } 750 | if (d->in_place) { 751 | return set_error("\"in_place\" not supported yet"); 752 | } 753 | 754 | d->device = int64ToIntS(vsapi->propGetInt(in, "device_id", 0, &error)); 755 | if (error) { 756 | d->device = 0; 757 | } 758 | 759 | int num_streams = int64ToIntS(vsapi->propGetInt(in, "num_streams", 0, &error)); 760 | if (error) { 761 | num_streams = 1; 762 | } 763 | d->semaphore.current.store(num_streams - 1, std::memory_order::relaxed); 764 | d->ticket.reserve(num_streams); 765 | for (int i = 0; i < num_streams; i++) { 766 | d->ticket.emplace_back(i); 767 | } 768 | 769 | checkError(hipSetDevice(d->device)); 770 | 771 | checkError(hipDeviceGetAttribute(&d->warp_size, hipDeviceAttributeWarpSize, d->device)); 772 | 773 | auto compilation = compile( 774 | user_kernel, 775 | d->device, 776 | d->radius, d->block_size, d->block_step, d->in_place, 777 | d->warp_size, d->warps_per_block, 778 | vi->format->sampleType, vi->format->bitsPerSample 779 | ); 780 | if (std::holds_alternative(compilation)) { 781 | std::ostringstream message; 782 | message << '[' << __LINE__ << "] compile(): " << std::get(compilation); 783 | vsapi->setError(out, message.str().c_str()); 784 | return ; 785 | } 786 | d->module = std::get(compilation); 787 | 788 | int num_sms; 789 | checkError(hipDeviceGetAttribute(&num_sms, hipDeviceAttributeMultiprocessorCount, d->device)); 790 | 791 | checkError(hipModuleGetFunction(&d->fused_kernel, d->module, "fused")); 792 | { 793 | int max_blocks_per_sm; 794 | checkError(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( 795 | &max_blocks_per_sm, 796 | d->fused_kernel, 797 | d->warps_per_block * d->warp_size, 798 | 0 799 | )); 800 | d->fused_num_blocks = num_sms * max_blocks_per_sm; 801 | } 802 | 803 | checkError(hipModuleGetFunction(&d->col2im_kernel, d->module, "col2im")); 804 | 805 | d->stream_data.resize(num_streams); 806 | for (int i = 0; i < num_streams; i++) { 807 | auto & stream_data = d->stream_data[i]; 808 | 809 | 
checkError(hipStreamCreateWithFlags(&stream_data.stream.data, hipStreamNonBlocking)); 810 | 811 | checkError(hipEventCreateWithFlags( 812 | &stream_data.event.data, 813 | hipEventBlockingSync | hipEventDisableTiming 814 | )); 815 | 816 | size_t padded_bytes = ( 817 | (2 * d->radius + 1) * 818 | calc_pad_size(vi->height, d->block_size, d->block_step) * 819 | calc_pad_size(vi->width, d->block_size, d->block_step) * 820 | vi->format->bytesPerSample 821 | ); 822 | checkError(hipMalloc(&stream_data.d_padded.data, padded_bytes)); 823 | 824 | if (!d->in_place) { 825 | size_t spatial_bytes = ( 826 | calc_pad_num(vi->height, d->block_size, d->block_step) * 827 | calc_pad_num(vi->width, d->block_size, d->block_step) * 828 | (2 * d->radius + 1) * 829 | square(d->block_size) * 830 | sizeof(float) 831 | ); 832 | checkError(hipMalloc(&stream_data.d_spatial.data, spatial_bytes)); 833 | } 834 | } 835 | 836 | VSCoreInfo info; 837 | vsapi->getCoreInfo2(core, &info); 838 | d->num_uninitialized_threads.store(info.numThreads, std::memory_order_relaxed); 839 | d->thread_data.reserve(info.numThreads); 840 | 841 | vsapi->createFilter( 842 | in, out, "DFTTest", 843 | DFTTestInit, DFTTestGetFrame, DFTTestFree, 844 | fmParallel, 0, d.release(), core 845 | ); 846 | } 847 | 848 | static void VS_CC RDFT( 849 | const VSMap *in, VSMap *out, void *userData, 850 | VSCore *core, const VSAPI *vsapi 851 | ) noexcept { 852 | 853 | auto set_error = [vsapi, out](const char * error_message) -> void { 854 | vsapi->setError(out, error_message); 855 | }; 856 | 857 | int ndim = vsapi->propNumElements(in, "shape"); 858 | if (ndim != 1 && ndim != 2 && ndim != 3) { 859 | return set_error("\"shape\" must be an array of ints with 1, 2 or 3 values"); 860 | } 861 | 862 | std::array shape {}; 863 | { 864 | auto shape_array = vsapi->propGetIntArray(in, "shape", nullptr); 865 | for (int i = 0; i < ndim; i++) { 866 | shape[i] = int64ToIntS(shape_array[i]); 867 | } 868 | } 869 | 870 | int size = 1; 871 | for (int i = 0; i < ndim; i++) { 872 | size *= shape[i]; 873 | } 874 | if (vsapi->propNumElements(in, "data") != size) { 875 | return set_error("cannot reshape array"); 876 | } 877 | 878 | int complex_size = shape[ndim - 1] / 2 + 1; 879 | for (int i = 0; i < ndim - 1; i++) { 880 | complex_size *= shape[i]; 881 | } 882 | 883 | auto input = vsapi->propGetFloatArray(in, "data", nullptr); 884 | 885 | auto output = std::make_unique_for_overwrite []>(complex_size); 886 | 887 | if (ndim == 1) { 888 | dft(output.get(), input, size, 1); 889 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 890 | } else if (ndim == 2) { 891 | for (int i = 0; i < shape[0]; i++) { 892 | dft(&output[i * (shape[1] / 2 + 1)], &input[i * shape[1]], shape[1], 1); 893 | } 894 | 895 | auto output2 = std::make_unique_for_overwrite []>(complex_size); 896 | 897 | for (int i = 0; i < shape[1] / 2 + 1; i++) { 898 | dft(&output2[i], &output[i], shape[0], shape[1] / 2 + 1); 899 | } 900 | 901 | vsapi->propSetFloatArray(out, "ret", (const double *) output2.get(), complex_size * 2); 902 | } else { 903 | for (int i = 0; i < shape[0] * shape[1]; i++) { 904 | dft(&output[i * (shape[2] / 2 + 1)], &input[i * shape[2]], shape[2], 1); 905 | } 906 | 907 | auto output2 = std::make_unique_for_overwrite []>(complex_size); 908 | 909 | for (int i = 0; i < shape[0]; i++) { 910 | for (int j = 0; j < shape[2] / 2 + 1; j++) { 911 | dft( 912 | &output2[i * shape[1] * (shape[2] / 2 + 1) + j], 913 | &output[i * shape[1] * (shape[2] / 2 + 1) + j], 914 | shape[1], 915 | 
(shape[2] / 2 + 1) 916 | ); 917 | } 918 | } 919 | 920 | for (int i = 0; i < shape[1] * (shape[2] / 2 + 1); i++) { 921 | dft(&output[i], &output2[i], shape[0], shape[1] * (shape[2] / 2 + 1)); 922 | } 923 | 924 | vsapi->propSetFloatArray(out, "ret", (const double *) output.get(), complex_size * 2); 925 | } 926 | } 927 | 928 | static void VS_CC ToSingle( 929 | const VSMap *in, VSMap *out, void *userData, 930 | VSCore *core, const VSAPI *vsapi 931 | ) noexcept { 932 | 933 | auto data = vsapi->propGetFloatArray(in, "data", nullptr); 934 | int num = vsapi->propNumElements(in, "data"); 935 | 936 | auto converted_data = std::make_unique_for_overwrite(num); 937 | for (int i = 0; i < num; i++) { 938 | converted_data[i] = static_cast(data[i]); 939 | } 940 | 941 | if (num == 1) { 942 | vsapi->propSetFloat(out, "ret", converted_data[0], paReplace); 943 | } else { 944 | vsapi->propSetFloatArray(out, "ret", converted_data.get(), num); 945 | } 946 | } 947 | 948 | static void Version(const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 949 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 950 | }; 951 | 952 | VS_EXTERNAL_API(void) 953 | VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) { 954 | configFunc( 955 | "io.github.amusementclub.dfttest2_hiprtc", 956 | "dfttest2_hiprtc", 957 | "DFTTest2 (HIPRTC)", 958 | VAPOURSYNTH_API_VERSION, 1, plugin 959 | ); 960 | 961 | registerFunc( 962 | "DFTTest", 963 | "clip:clip;" 964 | "kernel:data[];" 965 | "radius:int:opt;" 966 | "block_size:int:opt;" 967 | "block_step:int:opt;" 968 | "planes:int[]:opt;" 969 | "in_place:int:opt;" 970 | "device_id:int:opt;" 971 | "num_streams:int:opt;", 972 | DFTTestCreate, nullptr, plugin 973 | ); 974 | 975 | registerFunc( 976 | "RDFT", 977 | "data:float[];" 978 | "shape:int[];", 979 | RDFT, nullptr, plugin 980 | ); 981 | 982 | registerFunc( 983 | "ToSingle", 984 | "data:float[];", 985 | ToSingle, nullptr, plugin 986 | ); 987 | 988 | registerFunc( 989 | "Version", 990 | "", 991 | Version, nullptr, plugin 992 | ); 993 | } 994 | -------------------------------------------------------------------------------- /nvrtc_source/kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_HPP 2 | #define KERNEL_HPP 3 | 4 | static const auto kernel_implementation = R"""( 5 | __device__ 6 | extern void filter(float2 & value, int x, int y, int z); 7 | 8 | // ZERO_MEAN 9 | // RADIUS 10 | // BLOCK_SIZE 11 | // BLOCK_STEP 12 | // WARPS_PER_BLOCK 13 | // WARP_SIZE 14 | // TYPE 15 | // SCALE 16 | // PEAK (optional) 17 | 18 | #if ZERO_MEAN 19 | // __device__ const float window_freq[]; // frequency response of the window 20 | #endif // ZERO_MEAN 21 | 22 | __device__ 23 | static int calc_pad_size(int size, int block_size, int block_step) { 24 | return size + ((size % block_size) ? 
block_size - size % block_size : 0) + max(block_size - block_step, block_step) * 2; 25 | } 26 | 27 | __device__ 28 | static int calc_pad_num(int size, int block_size, int block_step) { 29 | return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1; 30 | } 31 | 32 | __device__ 33 | static float to_float(TYPE x) { 34 | return static_cast(x) * static_cast(SCALE); 35 | } 36 | 37 | __device__ 38 | static TYPE from_float(float x) { 39 | #ifdef PEAK 40 | x /= static_cast(SCALE); 41 | x = fmaxf(0.0f, fminf(x + 0.5f, static_cast(PEAK))); 42 | return static_cast(__float2int_rz(x)); 43 | #else // PEAK // only integral types define it 44 | return static_cast(x / static_cast(SCALE)); 45 | #endif // PEAK 46 | } 47 | 48 | // im2col + rdft + frequency_filtering + irdft 49 | extern "C" 50 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 51 | __global__ 52 | void fused( 53 | float * __restrict__ dstp, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 54 | const TYPE * __restrict__ srcp, // shape: (2*radius+1, vertical_size, horizontal_size) 55 | int width, 56 | int height 57 | ) { 58 | 59 | constexpr int radius = static_cast(RADIUS); 60 | constexpr int block_size = static_cast(BLOCK_SIZE); 61 | constexpr int block_step = static_cast(BLOCK_STEP); 62 | 63 | int horizontal_num = calc_pad_num(width, block_size, block_step); 64 | int vertical_num = calc_pad_num(height, block_size, block_step); 65 | int horizontal_size = calc_pad_size(width, block_size, block_step); 66 | int vertical_size = calc_pad_size(height, block_size, block_step); 67 | int num_blocks = vertical_num * horizontal_num; 68 | 69 | constexpr int warp_size = static_cast(WARP_SIZE); 70 | constexpr int warps_per_block = static_cast(WARPS_PER_BLOCK); 71 | constexpr int transpose_stride = (warp_size % block_size == 0) ? 
block_size + 1 : block_size; 72 | __shared__ float2 shared_transpose_buffer[warps_per_block * block_size * transpose_stride]; 73 | 74 | int warp_id = threadIdx.x / warp_size; 75 | int lane_id = threadIdx.x % warp_size; 76 | auto transpose_buffer = &shared_transpose_buffer[warp_id * block_size * transpose_stride]; 77 | 78 | for (int block_id = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE; block_id < num_blocks; block_id += gridDim.x * WARPS_PER_BLOCK) { 79 | int ix = block_id % horizontal_num; 80 | int iy = block_id / horizontal_num; 81 | 82 | if (lane_id < block_size) { 83 | constexpr int active_mask = (1 << block_size) - 1; 84 | float2 thread_data[(2 * radius + 1) * block_size]; 85 | 86 | // im2col 87 | #pragma unroll 88 | for (int i = 0; i < 2 * radius + 1; i++) { 89 | auto src = &srcp[(i * vertical_size + iy * block_step) * horizontal_size + ix * block_step]; 90 | auto local_thread_data = &thread_data[i * block_size]; 91 | #pragma unroll 92 | for (int j = 0; j < block_size; j++) { 93 | ((float *) local_thread_data)[j] = to_float(src[j * horizontal_size + lane_id]) * window[(i * block_size + j) * block_size + lane_id]; 94 | } 95 | } 96 | 97 | // rdft 98 | #pragma unroll 99 | for (int i = 0; i < 2 * radius + 1; i++) { 100 | auto local_thread_data = &thread_data[i * block_size]; 101 | 102 | __syncwarp(active_mask); 103 | // transpose store of real data 104 | #pragma unroll 105 | for (int j = 0; j < block_size; j++) { 106 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j]; 107 | } 108 | 109 | __syncwarp(active_mask); 110 | // transpose load of real data 111 | #pragma unroll 112 | for (int j = 0; j < block_size; j++) { 113 | ((float *) local_thread_data)[j] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 114 | } 115 | 116 | __syncwarp(active_mask); 117 | rdft((float *) local_thread_data); 118 | 119 | // transpose store of complex data 120 | #pragma unroll 121 | for (int j = 0; j < block_size / 2 + 1; j++) { 122 | transpose_buffer[lane_id * transpose_stride + j] = local_thread_data[j]; 123 | } 124 | 125 | __syncwarp(active_mask); 126 | if (lane_id < block_size / 2 + 1) { 127 | // transpose load of complex data 128 | #pragma unroll 129 | for (int j = 0; j < block_size; j++) { 130 | local_thread_data[j] = transpose_buffer[j * transpose_stride + lane_id]; 131 | } 132 | 133 | __syncwarp((1 << (block_size / 2 + 1)) - 1); 134 | dft((float *) local_thread_data); 135 | } 136 | } 137 | 138 | if (lane_id < block_size / 2 + 1) { 139 | #pragma unroll 140 | for (int i = 0; i < block_size; i++) { 141 | dft<2 * radius + 1>((float *) &thread_data[i], block_size); 142 | } 143 | } 144 | 145 | // frequency_filtering 146 | if (lane_id < block_size / 2 + 1) { 147 | #if ZERO_MEAN 148 | float gf; 149 | if (lane_id == 0) { 150 | gf = thread_data[0].x / window_freq[0]; 151 | } 152 | gf = __shfl_sync((1 << (block_size / 2 + 1)) - 1, gf, 0); 153 | #endif // ZERO_MEAN 154 | #pragma unroll 155 | for (int i = 0; i < 2 * radius + 1; i++) { 156 | #pragma unroll 157 | for (int j = 0; j < block_size; j++) { 158 | float2 local_data = thread_data[i * block_size + j]; 159 | 160 | #if ZERO_MEAN 161 | // remove mean 162 | float val1 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2]; 163 | float val2 = gf * window_freq[((i * block_size + j) * (block_size / 2 + 1) + lane_id) * 2 + 1]; 164 | local_data.x -= val1; 165 | local_data.y -= val2; 166 | #endif // ZERO_MEAN 167 | 168 | filter(local_data, lane_id, j, i); 169 | 170 | #if 
ZERO_MEAN 171 | // add mean 172 | local_data.x += val1; 173 | local_data.y += val2; 174 | #endif // ZERO_MEAN 175 | 176 | thread_data[i * block_size + j] = local_data; 177 | } 178 | } 179 | } 180 | 181 | // irdft 182 | if (lane_id < block_size / 2 + 1) { 183 | #pragma unroll 184 | for (int i = 0; i < block_size; i++) { 185 | idft<2 * radius + 1>((float *) &thread_data[i], block_size); 186 | } 187 | } 188 | 189 | // this is not a full 3d irdft, because only a single slice is required 190 | auto local_thread_data = &thread_data[radius * block_size]; 191 | 192 | if (lane_id < block_size / 2 + 1) { 193 | __syncwarp((1 << (block_size / 2 + 1)) - 1); 194 | idft((float *) local_thread_data); 195 | 196 | // transpose store of complex data 197 | #pragma unroll 198 | for (int j = 0; j < block_size; j++) { 199 | transpose_buffer[j * transpose_stride + lane_id] = local_thread_data[j]; 200 | } 201 | } 202 | 203 | __syncwarp(active_mask); 204 | #pragma unroll 205 | for (int j = 0; j < block_size / 2 + 1; j++) { 206 | // transpose load of complex data 207 | local_thread_data[j].x = transpose_buffer[lane_id * transpose_stride + j].x; 208 | local_thread_data[j].y = transpose_buffer[lane_id * transpose_stride + j].y; 209 | } 210 | 211 | __syncwarp(active_mask); 212 | irdft((float *) local_thread_data); 213 | 214 | #pragma unroll 215 | for (int j = 0; j < block_size; j++) { 216 | ((float *) transpose_buffer)[j * transpose_stride + lane_id] = ((float *) local_thread_data)[j == 0 ? j : block_size - j]; 217 | } 218 | 219 | __syncwarp(active_mask); 220 | auto local_dst = &dstp[(block_id * (2 * radius + 1) + radius) * block_size * block_size]; 221 | #pragma unroll 222 | for (int j = 0; j < block_size; j++) { 223 | local_dst[j * block_size + lane_id] = ((float *) transpose_buffer)[lane_id * transpose_stride + j]; 224 | } 225 | } 226 | } 227 | } 228 | 229 | extern "C" 230 | __launch_bounds__(WARPS_PER_BLOCK * WARP_SIZE) 231 | __global__ 232 | void col2im( 233 | TYPE * __restrict__ dst, // shape: (2*radius+1, vertical_size, horizontal_size) 234 | const float * __restrict__ src, // shape: (vertical_num, horizontal_num, 2*radius+1, block_size, block_size) 235 | int width, 236 | int height 237 | ) { 238 | 239 | int radius = static_cast(RADIUS); 240 | int block_size = static_cast(BLOCK_SIZE); 241 | int block_step = static_cast(BLOCK_STEP); 242 | 243 | // each thread is responsible for a single pixel 244 | int horizontal_size = calc_pad_size(width, block_size, block_step); 245 | int horizontal_num = calc_pad_num(width, block_size, block_step); 246 | int vertical_size = calc_pad_size(height, block_size, block_step); 247 | int vertical_num = calc_pad_num(height, block_size, block_step); 248 | int pad_x = (horizontal_size - width) / 2; 249 | int pad_y = (vertical_size - height) / 2; 250 | 251 | int x = blockIdx.x * blockDim.x + threadIdx.x; 252 | int y = blockIdx.y * blockDim.y + threadIdx.y; 253 | if (y < pad_y || y >= pad_y + height || x < pad_x || x >= pad_x + width) { 254 | return ; 255 | } 256 | 257 | float sum {}; 258 | 259 | int i1 = (y - block_size + block_step) / block_step; // i1 is implicitly greater than 0 260 | int i2 = min(y / block_step, vertical_num - 1); 261 | int j1 = (x - block_size + block_step) / block_step; // j1 is implicitly greater than 0 262 | int j2 = min(x / block_step, horizontal_num - 1); 263 | 264 | for (int i = i1; i <= i2; i++) { 265 | int offset_y = y - i * block_step; 266 | for (int j = j1; j <= j2; j++) { 267 | int offset_x = x - j * block_step; 268 | auto src_offset = (((i * 
horizontal_num + j) * (2 * radius + 1) + radius) * block_size + offset_y) * block_size + offset_x; 269 | auto window_offset = (radius * block_size + offset_y) * block_size + offset_x; 270 | sum += src[src_offset] * window[window_offset]; 271 | } 272 | } 273 | 274 | dst[(radius * vertical_size + y) * horizontal_size + x] = from_float(sum); 275 | } 276 | )"""; 277 | 278 | #endif // KERNEL_HPP 279 | --------------------------------------------------------------------------------
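Editor's note: the section above ends with three near-identical col2im kernels (HIP, HIPRTC, NVRTC) that scatter the filtered, windowed blocks back into the padded plane by overlap-add. The host-side sketch below spells out the same gather on the CPU: for every output pixel it visits each block whose footprint covers that pixel and accumulates the windowed sample from the central temporal slice. It is an illustrative reconstruction, not code from this repository: calc_pad_size/calc_pad_num mirror the device functions verbatim, while col2im_reference, its buffer layouts, and the plain-float input (the GPU kernels additionally convert via to_float/from_float) are assumptions made for the example.

// Host-side reference for the col2im overlap-add performed by the GPU kernels above.
// Illustrative sketch only; col2im_reference and the std::vector buffers are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>

static int calc_pad_size(int size, int block_size, int block_step) {
    // same formula as the device-side calc_pad_size
    return size
        + ((size % block_size) ? block_size - size % block_size : 0)
        + std::max(block_size - block_step, block_step) * 2;
}

static int calc_pad_num(int size, int block_size, int block_step) {
    // number of block positions along one dimension of the padded plane
    return (calc_pad_size(size, block_size, block_step) - block_size) / block_step + 1;
}

// src layout:    (vertical_num, horizontal_num, 2*radius+1, block_size, block_size)
// window layout: (2*radius+1, block_size, block_size)
// dst layout:    (vertical_size, horizontal_size); only the central temporal slice is written
static void col2im_reference(
    std::vector<float> &dst,
    const std::vector<float> &src,
    const std::vector<float> &window,
    int width, int height,
    int radius, int block_size, int block_step
) {
    int horizontal_size = calc_pad_size(width, block_size, block_step);
    int horizontal_num = calc_pad_num(width, block_size, block_step);
    int vertical_size = calc_pad_size(height, block_size, block_step);
    int vertical_num = calc_pad_num(height, block_size, block_step);
    int pad_x = (horizontal_size - width) / 2;
    int pad_y = (vertical_size - height) / 2;

    dst.assign((std::size_t) vertical_size * horizontal_size, 0.0f);

    for (int y = pad_y; y < pad_y + height; y++) {
        for (int x = pad_x; x < pad_x + width; x++) {
            float sum = 0.0f;

            // range of blocks whose footprint covers pixel (x, y);
            // clamped to 0 here defensively, matching the kernel's "implicitly non-negative" note
            int i1 = std::max((y - block_size + block_step) / block_step, 0);
            int i2 = std::min(y / block_step, vertical_num - 1);
            int j1 = std::max((x - block_size + block_step) / block_step, 0);
            int j2 = std::min(x / block_step, horizontal_num - 1);

            for (int i = i1; i <= i2; i++) {
                int offset_y = y - i * block_step;
                for (int j = j1; j <= j2; j++) {
                    int offset_x = x - j * block_step;
                    // same indexing as the kernel: pick the radius-th temporal slice of block (i, j)
                    std::size_t src_offset =
                        (((std::size_t) (i * horizontal_num + j) * (2 * radius + 1) + radius)
                            * block_size + offset_y) * block_size + offset_x;
                    std::size_t window_offset =
                        ((std::size_t) radius * block_size + offset_y) * block_size + offset_x;
                    sum += src[src_offset] * window[window_offset];
                }
            }

            dst[(std::size_t) y * horizontal_size + x] = sum;
        }
    }
}

Note that, like the kernels, this sketch applies no explicit normalization to the accumulated sum; any normalization is presumably folded into the window coefficients supplied by the caller.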