├── DFT_Metal_private.m ├── DFT.cpp ├── FFT.h ├── DFT_Metal_private.h ├── DFT.h ├── LICENSE ├── FFT.cpp ├── README.md ├── main.cpp ├── DFT.metal ├── FFT.metal ├── FFT.cu ├── DFT.cu ├── FFT.mm └── DFT.mm /DFT_Metal_private.m: -------------------------------------------------------------------------------- 1 | // 2 | // DFT.m 3 | // InokiFFT 4 | // 5 | // Created by inoki on 1/28/22. 6 | // 7 | 8 | #import 9 | #import 10 | 11 | #include "DFT_Metal_private.h" 12 | 13 | id GetMetalSystemDevice() { 14 | return MTLCreateSystemDefaultDevice(); 15 | } 16 | -------------------------------------------------------------------------------- /DFT.cpp: -------------------------------------------------------------------------------- 1 | 2 | #define _USE_MATH_DEFINES // for C++ 3 | #include 4 | #include 5 | 6 | void calculateDFT(std::complex* in, std::complex* out, size_t num) 7 | { 8 | for (int i = 0; i < num; i++) 9 | { 10 | out[i] = 0; 11 | for (int j = 0; j < num; j++) 12 | { 13 | out[i] += in[j] * std::exp(std::complex(0, - 2 * M_PI * i * j / num)); 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /FFT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void calculateFFT(std::complex* in, std::complex* out, size_t num); 6 | 7 | #ifdef __HAS_CUDA__ 8 | void calculateFFTCUDA(std::complex* in, std::complex* out, size_t num); 9 | #endif 10 | 11 | #ifdef HAS_METAL 12 | void calculateFFTMetal(std::complex* inBuffer, std::complex* outBuffer, size_t num); 13 | #endif 14 | -------------------------------------------------------------------------------- /DFT_Metal_private.h: -------------------------------------------------------------------------------- 1 | // 2 | // DFT_Metal_private.h 3 | // InokiFFT 4 | // 5 | // Created by inoki on 1/28/22. 6 | // 7 | 8 | #ifndef DFT_Metal_private_h 9 | #define DFT_Metal_private_h 10 | 11 | #import 12 | #import 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | /* The Metal MTLCreateSystemDefaultDevice has C-style ABI, which cannot be properly linked by C++ linker, use this workaround. */ 18 | id GetMetalSystemDevice(); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | 25 | #endif /* DFT_Metal_private_h */ 26 | -------------------------------------------------------------------------------- /DFT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void calculateDFT(std::complex *in, std::complex *out, size_t num); 6 | #ifdef __HAS_CUDA__ 7 | void calculateDFTCUDA(std::complex* in, std::complex* out, size_t num); 8 | void calculateDFTCUDALargeMem(std::complex* in, std::complex* out, size_t num); 9 | #endif 10 | 11 | #ifdef HAS_METAL 12 | void calculateDFTMetal(std::complex* inBuffer, std::complex* outBuffer, size_t num); 13 | void calculateDFTMetalLargeMem(std::complex* inBuffer, std::complex* outBuffer, size_t num); 14 | #endif 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Inoki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /FFT.cpp: -------------------------------------------------------------------------------- 1 | 2 | #define _USE_MATH_DEFINES // for C++ 3 | #include 4 | #include 5 | #include 6 | 7 | unsigned int bitReverse(unsigned int x, int log2n) 8 | { 9 | int n = 0; 10 | int mask = 0x1; 11 | for (int i = 0; i < log2n; i++) 12 | { 13 | n <<= 1; 14 | n |= (x & 1); 15 | x >>= 1; 16 | } 17 | return n; 18 | } 19 | 20 | void fft_impl(std::complex *a, std::complex* b, int num) 21 | { 22 | std::complex J(0, 1); 23 | int n = num; 24 | int log2n = 0; 25 | while (num > 1) { 26 | log2n++; 27 | num /= 2; 28 | } 29 | for (unsigned int i = 0; i < n; ++i) { 30 | b[bitReverse(i, log2n)] = a[i]; 31 | } 32 | for (int s = 1; s <= log2n; ++s) { 33 | int m = 1 << s; 34 | int m2 = m >> 1; 35 | std::complex w(1, 0); 36 | std::complex wm = std::exp(-J * (float)(M_PI / m2)); 37 | for (int j = 0; j < m2; ++j) { 38 | for (int k = j; k < n; k += m) { 39 | std::complex t = w * b[k + m2]; 40 | std::complex u = b[k]; 41 | b[k] = u + t; 42 | b[k + m2] = u - t; 43 | } 44 | w *= wm; 45 | } 46 | } 47 | } 48 | 49 | void calculateFFT(std::complex* in, std::complex* out, size_t num) 50 | { 51 | fft_impl(in, out, num); 52 | } 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Discrete Fourier Transform (DFT/FFT) implementations 2 | 3 | This project has experimental implementations of DFT/FFT in CUDA and Apple Metal. Use it at your own risk (remember to check the array border if you would like to use them in your own project). 4 | 5 | ## DFT Implementations 6 | - `DFT.cpp` - CPU DFT implementation 7 | - `DFT.cu` - CUDA DFT implementations (with or without precomputed complex roots) 8 | - `DFT.metal` / `DFT.mm` - Apple Metal DFT implementations (with or without precomputed complex roots) 9 | 10 | ## FFT Implementations (Cooley-Tukey Algorithm) 11 | - `FFT.cpp` - CPU FFT implementation using iterative Cooley-Tukey algorithm 12 | - `FFT.cu` - CUDA FFT implementation with parallel bit-reversal and butterfly operations 13 | - `FFT.metal` / `FFT.mm` - Apple Metal FFT implementation with parallel bit-reversal and butterfly operations 14 | 15 | ## Building 16 | 17 | ### CPU only 18 | ```bash 19 | g++ -o fourier main.cpp DFT.cpp FFT.cpp -std=c++11 20 | ``` 21 | 22 | ### With CUDA 23 | ```bash 24 | nvcc -o fourier main.cpp DFT.cpp FFT.cpp DFT.cu FFT.cu -D__HAS_CUDA__ 25 | ``` 26 | 27 | ### With Metal (macOS) 28 | ```bash 29 | clang++ -o fourier main.cpp DFT.cpp FFT.cpp DFT.mm FFT.mm DFT_Metal_private.m \ 30 | -framework Foundation -framework Metal -DHAS_METAL -std=c++11 31 | ``` 32 | 33 | ## Usage 34 | 35 | The FFT implementations require input sizes that are powers of 2 (e.g., 64, 128, 256, 512, etc.). 36 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | // InokiFourier.cpp 2 | // 3 | 4 | #include 5 | #include "DFT.h" 6 | #include "FFT.h" 7 | 8 | #define FFT_LEN 128 9 | 10 | int main() 11 | { 12 | std::complex inBuffer[128], outBuffer[128]; 13 | for (int i = 0; i < FFT_LEN; i++) 14 | { 15 | switch (i % 8) 16 | { 17 | case 0: 18 | inBuffer[i] = 1; 19 | break; 20 | case 1: case 7: 21 | inBuffer[i] = std::sqrt(2) / 2; 22 | break; 23 | case 2: case 6: 24 | inBuffer[i] = 0; 25 | break; 26 | case 3: case 5: 27 | inBuffer[i] = -std::sqrt(2) / 2; 28 | break; 29 | case 4: 30 | inBuffer[i] = -1; 31 | break; 32 | default: 33 | break; 34 | } 35 | } 36 | 37 | #ifdef __HAS_CUDA__ 38 | std::cout << "Using CUDA implementation" << std::endl; 39 | std::cout << "DFT result:" << std::endl; 40 | calculateDFTCUDA(inBuffer, outBuffer, FFT_LEN); 41 | for (int i = 0; i < FFT_LEN; i++) 42 | { 43 | std::cout << std::abs(outBuffer[i]) << " "; 44 | } 45 | std::cout << std::endl; 46 | 47 | std::cout << "FFT result:" << std::endl; 48 | calculateFFTCUDA(inBuffer, outBuffer, FFT_LEN); 49 | #elif defined(HAS_METAL) 50 | std::cout << "Using Metal implementation" << std::endl; 51 | std::cout << "DFT result:" << std::endl; 52 | calculateDFTMetal(inBuffer, outBuffer, FFT_LEN); 53 | for (int i = 0; i < FFT_LEN; i++) 54 | { 55 | std::cout << std::abs(outBuffer[i]) << " "; 56 | } 57 | std::cout << std::endl; 58 | 59 | std::cout << "FFT result:" << std::endl; 60 | calculateFFTMetal(inBuffer, outBuffer, FFT_LEN); 61 | #else 62 | std::cout << "Using CPU implementation" << std::endl; 63 | std::cout << "DFT result:" << std::endl; 64 | calculateDFT(inBuffer, outBuffer, FFT_LEN); 65 | for (int i = 0; i < FFT_LEN; i++) 66 | { 67 | std::cout << std::abs(outBuffer[i]) << " "; 68 | } 69 | std::cout << std::endl; 70 | 71 | std::cout << "FFT result:" << std::endl; 72 | calculateFFT(inBuffer, outBuffer, FFT_LEN); 73 | #endif 74 | 75 | float maxOut = -1; 76 | int maxI = 0; 77 | for (int i = 0; i < FFT_LEN; i++) 78 | { 79 | float outAbs = std::abs(outBuffer[i]); 80 | std::cout << outAbs << " "; 81 | if (outAbs > maxOut) 82 | { 83 | maxOut = outAbs; 84 | maxI = i; 85 | } 86 | } 87 | std::cout << std::endl << "Max: " << maxOut << " at " << maxI << std::endl; 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /DFT.metal: -------------------------------------------------------------------------------- 1 | // 2 | // DFT.metal 3 | // InokiFFT 4 | // 5 | // Created by inoki on 1/28/22. 6 | // 7 | 8 | #include 9 | using namespace metal; 10 | 11 | float2 complexAdd(const float2 inA, const float2 inB) 12 | { 13 | float2 out; 14 | out[0] = inA[0] + inB[0]; 15 | out[1] = inA[1] + inB[1]; 16 | return out; 17 | } 18 | 19 | float2 complexMul(const float2 inA, const float2 inB) 20 | { 21 | float2 out; 22 | out[0] = inA[0] * inB[0] - inA[1] * inB[1]; 23 | out[1] = inA[0] * inB[1] + inA[1] * inB[0]; 24 | return out; 25 | } 26 | 27 | float2 complexExp(const float2 in) 28 | { 29 | float2 out; 30 | float t = exp(in[0]); 31 | out[0] = t * cos(in[1]); 32 | out[1] = t * sin(in[1]); 33 | return out; 34 | } 35 | 36 | /// This is a Metal Shading Language (MSL) function 37 | kernel void computeDFTMetal(device const float *in, 38 | device float *out, 39 | device const int *num, 40 | uint index [[thread_position_in_grid]]) 41 | { 42 | if (index < (uint)*num) { 43 | int i = index * 2; 44 | out[i] = 0; 45 | out[i + 1] = 0; 46 | float2 temp1, temp2; 47 | for (int j = 0; j < *num; j++) 48 | { 49 | temp2[0] = 0; temp2[1] = -2 * 3.14159265 * index * j / *num; 50 | temp2 = complexExp(temp2); 51 | 52 | temp1[0] = in[j * 2]; temp1[1] = in[j * 2 + 1]; 53 | temp1 = complexMul(temp1, temp2); 54 | 55 | // Copy back 56 | out[i] += temp1[0]; 57 | out[i + 1] += temp1[1]; 58 | } 59 | } 60 | } 61 | 62 | kernel void computeDFTMetalWithPrecomputedRoot(device const float *in, 63 | device const float *roots, 64 | device float *out, 65 | device const int *num, 66 | uint index [[thread_position_in_grid]]) 67 | { 68 | if (index < (uint)*num) { 69 | int i = index * 2; 70 | out[i] = 0; 71 | out[i + 1] = 0; 72 | float2 temp1, temp2; 73 | for (int j = 0; j < *num; j++) 74 | { 75 | temp2[0] = roots[2 * ((index * j) % (*num))]; 76 | temp2[1] = roots[2 * ((index * j) % (*num)) + 1]; 77 | 78 | temp1[0] = in[j * 2]; temp1[1] = in[j * 2 + 1]; 79 | temp1 = complexMul(temp1, temp2); 80 | 81 | // Copy back 82 | out[i] += temp1[0]; 83 | out[i + 1] += temp1[1]; 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /FFT.metal: -------------------------------------------------------------------------------- 1 | // 2 | // FFT.metal 3 | // InokiFFT 4 | // 5 | // FFT implementation using Cooley-Tukey algorithm for Metal 6 | // 7 | 8 | #include 9 | using namespace metal; 10 | 11 | // Complex number operations 12 | float2 complexAdd(const float2 inA, const float2 inB) 13 | { 14 | return float2(inA[0] + inB[0], inA[1] + inB[1]); 15 | } 16 | 17 | float2 complexSub(const float2 inA, const float2 inB) 18 | { 19 | return float2(inA[0] - inB[0], inA[1] - inB[1]); 20 | } 21 | 22 | float2 complexMul(const float2 inA, const float2 inB) 23 | { 24 | return float2(inA[0] * inB[0] - inA[1] * inB[1], 25 | inA[0] * inB[1] + inA[1] * inB[0]); 26 | } 27 | 28 | // Bit reverse function 29 | uint bitReverse(uint x, int log2n) 30 | { 31 | uint n = 0; 32 | for (int i = 0; i < log2n; i++) 33 | { 34 | n <<= 1; 35 | n |= (x & 1); 36 | x >>= 1; 37 | } 38 | return n; 39 | } 40 | 41 | // Kernel for bit-reverse permutation 42 | kernel void fftBitReversePermute(device const float *in [[buffer(0)]], 43 | device float *out [[buffer(1)]], 44 | device const int *num [[buffer(2)]], 45 | device const int *log2n [[buffer(3)]], 46 | uint index [[thread_position_in_grid]]) 47 | { 48 | int n = *num; 49 | if (index < (uint)n) 50 | { 51 | uint reversedIndex = bitReverse(index, *log2n); 52 | // Copy complex number (2 floats) 53 | out[reversedIndex * 2] = in[index * 2]; 54 | out[reversedIndex * 2 + 1] = in[index * 2 + 1]; 55 | } 56 | } 57 | 58 | // Kernel for FFT butterfly operation at a given stage 59 | kernel void fftButterfly(device float *data [[buffer(0)]], 60 | device const int *num [[buffer(1)]], 61 | device const int *stage [[buffer(2)]], 62 | uint index [[thread_position_in_grid]]) 63 | { 64 | int n = *num; 65 | int s = *stage; 66 | 67 | int m = 1 << s; // 2^stage 68 | int m2 = m >> 1; // m/2 69 | 70 | // Each thread handles one butterfly operation 71 | // We need n/2 butterflies per stage 72 | if (index < (uint)(n / 2)) 73 | { 74 | // Find which group this thread belongs to and position within group 75 | int group = index / m2; 76 | int j = index % m2; 77 | 78 | // Calculate the two indices for butterfly 79 | int k = group * m + j; 80 | int k2 = k + m2; 81 | 82 | // Calculate twiddle factor: W_m^j = exp(-2*pi*i*j/m) 83 | float angle = -2.0f * 3.14159265358979323846f * j / m; 84 | float2 w = float2(cos(angle), sin(angle)); 85 | 86 | // Load data 87 | float2 u = float2(data[k * 2], data[k * 2 + 1]); 88 | float2 t_val = float2(data[k2 * 2], data[k2 * 2 + 1]); 89 | 90 | // Butterfly operation 91 | float2 t = complexMul(w, t_val); 92 | 93 | float2 result1 = complexAdd(u, t); 94 | float2 result2 = complexSub(u, t); 95 | 96 | // Store results 97 | data[k * 2] = result1[0]; 98 | data[k * 2 + 1] = result1[1]; 99 | data[k2 * 2] = result2[0]; 100 | data[k2 * 2 + 1] = result2[1]; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /FFT.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define M_PI 3.14159265358979323846 4 | 5 | #include "cuda_runtime.h" 6 | #include "device_launch_parameters.h" 7 | #include "cuComplex.h" 8 | 9 | // Bit reverse function for index permutation 10 | __device__ unsigned int bitReverseCUDA(unsigned int x, int log2n) 11 | { 12 | unsigned int n = 0; 13 | for (int i = 0; i < log2n; i++) 14 | { 15 | n <<= 1; 16 | n |= (x & 1); 17 | x >>= 1; 18 | } 19 | return n; 20 | } 21 | 22 | // Kernel for bit-reverse permutation 23 | __global__ void bitReversePermuteKernel(cuComplex* in, cuComplex* out, int num, int log2n) 24 | { 25 | int i = blockIdx.x * blockDim.x + threadIdx.x; 26 | if (i < num) 27 | { 28 | out[bitReverseCUDA(i, log2n)] = in[i]; 29 | } 30 | } 31 | 32 | // Kernel for FFT butterfly operation at a given stage 33 | __global__ void fftButterflyKernel(cuComplex* data, int num, int stage) 34 | { 35 | int i = blockIdx.x * blockDim.x + threadIdx.x; 36 | 37 | int m = 1 << stage; // 2^stage 38 | int m2 = m >> 1; // m/2 39 | 40 | // Each thread handles one butterfly operation 41 | // We need num/2 butterflies per stage 42 | if (i < num / 2) 43 | { 44 | // Find which group this thread belongs to and position within group 45 | int group = i / m2; 46 | int j = i % m2; 47 | 48 | // Calculate the two indices for butterfly 49 | int k = group * m + j; 50 | int k2 = k + m2; 51 | 52 | // Calculate twiddle factor: W_m^j = exp(-2*pi*i*j/m) 53 | float angle = -2.0f * M_PI * j / m; 54 | cuComplex w; 55 | w.x = cosf(angle); 56 | w.y = sinf(angle); 57 | 58 | // Butterfly operation 59 | cuComplex t = cuCmulf(w, data[k2]); 60 | cuComplex u = data[k]; 61 | 62 | data[k] = cuCaddf(u, t); 63 | data[k2] = cuCsubf(u, t); 64 | } 65 | } 66 | 67 | void calculateFFTCUDA(std::complex* in, std::complex* out, size_t num) 68 | { 69 | if (num == 0) return; 70 | 71 | // Calculate log2(num) 72 | int log2n = 0; 73 | size_t temp = num; 74 | while (temp > 1) { 75 | log2n++; 76 | temp /= 2; 77 | } 78 | 79 | // Verify num is power of 2 80 | if ((1 << log2n) != num) { 81 | fprintf(stderr, "FFT size must be a power of 2\n"); 82 | return; 83 | } 84 | 85 | // Allocate device memory 86 | cuComplex* d_in; 87 | cuComplex* d_data; 88 | cudaMalloc(&d_in, num * sizeof(cuComplex)); 89 | cudaMalloc(&d_data, num * sizeof(cuComplex)); 90 | 91 | // Copy input to device 92 | cudaMemcpy(d_in, in, num * sizeof(cuComplex), cudaMemcpyHostToDevice); 93 | 94 | // Calculate grid and block dimensions 95 | int blockSize = 256; 96 | int numBlocks = (num + blockSize - 1) / blockSize; 97 | int numBlocksHalf = ((num / 2) + blockSize - 1) / blockSize; 98 | 99 | // Step 1: Bit-reverse permutation 100 | bitReversePermuteKernel<<>>(d_in, d_data, num, log2n); 101 | 102 | cudaError_t cudaStatus = cudaGetLastError(); 103 | if (cudaStatus != cudaSuccess) { 104 | fprintf(stderr, "Bit-reverse kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 105 | goto Error; 106 | } 107 | 108 | cudaStatus = cudaDeviceSynchronize(); 109 | if (cudaStatus != cudaSuccess) { 110 | fprintf(stderr, "cudaDeviceSynchronize returned error code %d after bit-reverse kernel!\n", cudaStatus); 111 | goto Error; 112 | } 113 | 114 | // Step 2: FFT butterfly stages 115 | for (int s = 1; s <= log2n; s++) { 116 | fftButterflyKernel<<>>(d_data, num, s); 117 | 118 | cudaStatus = cudaGetLastError(); 119 | if (cudaStatus != cudaSuccess) { 120 | fprintf(stderr, "FFT butterfly kernel launch failed at stage %d: %s\n", s, cudaGetErrorString(cudaStatus)); 121 | goto Error; 122 | } 123 | 124 | cudaStatus = cudaDeviceSynchronize(); 125 | if (cudaStatus != cudaSuccess) { 126 | fprintf(stderr, "cudaDeviceSynchronize returned error code %d after FFT stage %d!\n", cudaStatus, s); 127 | goto Error; 128 | } 129 | } 130 | 131 | // Copy result back to host 132 | cudaMemcpy(out, d_data, num * sizeof(cuComplex), cudaMemcpyDeviceToHost); 133 | 134 | Error: 135 | cudaFree(d_in); 136 | cudaFree(d_data); 137 | } 138 | -------------------------------------------------------------------------------- /DFT.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define M_PI 3.14159265 4 | 5 | #include "cuda_runtime.h" 6 | #include "device_launch_parameters.h" 7 | #include "cuComplex.h" 8 | 9 | __device__ __forceinline__ cuComplex cuComplexExp(cuComplex z) 10 | { 11 | cuComplex res; 12 | float t = expf(z.x); 13 | sincosf(z.y, &res.y, &res.x); 14 | res.x *= t; 15 | res.y *= t; 16 | return res; 17 | } 18 | 19 | // Kernel definition 20 | __global__ void calculateDFTCUDAKernel(cuComplex* in, cuComplex* out, size_t num) 21 | { 22 | int i = threadIdx.x; 23 | if (i < num) 24 | { 25 | out[i].x = 0; 26 | out[i].y = 0; 27 | for (int j = 0; j < num; j++) 28 | { 29 | out[i] = cuCaddf(out[i], 30 | cuCmulf(in[j], cuComplexExp(make_cuComplex(0, -2 * M_PI * i * j / num))) 31 | ); 32 | } 33 | } 34 | } 35 | 36 | void calculateDFTCUDA(std::complex* in, std::complex* out, size_t num) 37 | { 38 | if (num == 0) return; 39 | 40 | // Allocate vectors in device memory 41 | cuComplex* d_in; 42 | cudaMalloc(&d_in, num * sizeof(cuComplex)); 43 | cuComplex* d_out; 44 | cudaMalloc(&d_out, num * sizeof(cuComplex)); 45 | 46 | // Copy vectors from host memory to device memory 47 | cudaMemcpy(d_in, in, num * sizeof(cuComplex), cudaMemcpyHostToDevice); 48 | 49 | calculateDFTCUDAKernel<<<1, num>>>(d_in, d_out, num); 50 | 51 | cudaError_t cudaStatus; 52 | 53 | // Check for any errors launching the kernel 54 | cudaStatus = cudaGetLastError(); 55 | if (cudaStatus != cudaSuccess) { 56 | fprintf(stderr, "DFT Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 57 | goto Error; 58 | } 59 | 60 | // cudaDeviceSynchronize waits for the kernel to finish, and returns 61 | // any errors encountered during the launch. 62 | cudaStatus = cudaDeviceSynchronize(); 63 | if (cudaStatus != cudaSuccess) { 64 | fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching DFT Kernel!\n", cudaStatus); 65 | goto Error; 66 | } 67 | 68 | // Copy back the results 69 | cudaMemcpy(out, d_out, num * sizeof(cuComplex), cudaMemcpyDeviceToHost); 70 | 71 | Error: 72 | cudaFree(d_in); 73 | cudaFree(d_out); 74 | } 75 | 76 | // Kernel definition 77 | __global__ void calculateDFTCUDAKernelWithPrecomputedRoot(cuComplex* in, cuComplex* roots, cuComplex* out, size_t num) 78 | { 79 | int i = threadIdx.x; 80 | if (i < num) 81 | { 82 | out[i].x = 0; 83 | out[i].y = 0; 84 | for (int j = 0; j < num; j++) 85 | { 86 | out[i] = cuCaddf(out[i], cuCmulf(in[j], roots[i * j % num])); 87 | } 88 | } 89 | } 90 | 91 | void calculateDFTCUDALargeMem(std::complex* in, std::complex* out, size_t num) 92 | { 93 | if (num == 0) return; 94 | 95 | // Allocate vectors in device memory 96 | cuComplex* d_in; 97 | cudaMalloc(&d_in, num * sizeof(cuComplex)); 98 | cuComplex* d_out; 99 | cudaMalloc(&d_out, num * sizeof(cuComplex)); 100 | cuComplex* d_roots; 101 | cudaMalloc(&d_roots, num * sizeof(cuComplex)); 102 | 103 | // Copy vectors from host memory to device memory 104 | cudaMemcpy(d_in, in, num * sizeof(cuComplex), cudaMemcpyHostToDevice); 105 | 106 | // Precompute DFT 107 | cuComplex* roots = (cuComplex *)malloc(sizeof(cuComplex) * num); 108 | for (int i = 0; i < num; i++) 109 | { 110 | std::complex root = std::exp(std::complex(0, -2 * M_PI * i / num)); 111 | roots[i] = make_cuComplex(root.real(), root.imag()); 112 | } 113 | cudaMemcpy(d_roots, roots, num * sizeof(cuComplex), cudaMemcpyHostToDevice); 114 | free(roots); 115 | 116 | calculateDFTCUDAKernelWithPrecomputedRoot<<<1, num >>>(d_in, d_roots, d_out, num); 117 | 118 | cudaError_t cudaStatus; 119 | 120 | // Check for any errors launching the kernel 121 | cudaStatus = cudaGetLastError(); 122 | if (cudaStatus != cudaSuccess) { 123 | fprintf(stderr, "DFT Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 124 | goto Error; 125 | } 126 | 127 | // cudaDeviceSynchronize waits for the kernel to finish, and returns 128 | // any errors encountered during the launch. 129 | cudaStatus = cudaDeviceSynchronize(); 130 | if (cudaStatus != cudaSuccess) { 131 | fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching DFT Kernel!\n", cudaStatus); 132 | goto Error; 133 | } 134 | 135 | // Copy back the results 136 | cudaMemcpy(out, d_out, num * sizeof(cuComplex), cudaMemcpyDeviceToHost); 137 | 138 | Error: 139 | cudaFree(d_in); 140 | cudaFree(d_out); 141 | cudaFree(d_roots); 142 | } 143 | -------------------------------------------------------------------------------- /FFT.mm: -------------------------------------------------------------------------------- 1 | // 2 | // FFT.mm 3 | // InokiFFT 4 | // 5 | // FFT implementation using Metal 6 | // 7 | 8 | #import 9 | #import 10 | 11 | #include 12 | #include 13 | 14 | #include "DFT_Metal_private.h" 15 | 16 | static id initFFTFunction(NSString *functionName, id device) { 17 | // Load the shader files with a .metal file extension in the project 18 | id defaultLibrary = [device newDefaultLibrary]; 19 | if (defaultLibrary == nil) 20 | { 21 | NSLog(@"Failed to find the default library."); 22 | return nil; 23 | } 24 | id loadedFunction = [defaultLibrary newFunctionWithName:functionName]; 25 | return loadedFunction; 26 | } 27 | 28 | void calculateFFTMetal(std::complex* inBuffer, std::complex* outBuffer, size_t num) { 29 | @autoreleasepool { 30 | if (num == 0) return; 31 | 32 | // Calculate log2(num) 33 | int log2n = 0; 34 | size_t temp = num; 35 | while (temp > 1) { 36 | log2n++; 37 | temp /= 2; 38 | } 39 | 40 | // Verify num is power of 2 41 | if ((1 << log2n) != num) { 42 | NSLog(@"FFT size must be a power of 2"); 43 | return; 44 | } 45 | 46 | id device = GetMetalSystemDevice(); 47 | 48 | // Load bit-reverse permutation kernel 49 | id bitReverseFunc = initFFTFunction(@"fftBitReversePermute", device); 50 | if (bitReverseFunc == nil) 51 | { 52 | NSLog(@"Failed to find the fftBitReversePermute function."); 53 | return; 54 | } 55 | 56 | // Load butterfly kernel 57 | id butterflyFunc = initFFTFunction(@"fftButterfly", device); 58 | if (butterflyFunc == nil) 59 | { 60 | NSLog(@"Failed to find the fftButterfly function."); 61 | return; 62 | } 63 | 64 | NSError* error = nil; 65 | 66 | id bitReversePSO = [device newComputePipelineStateWithFunction:bitReverseFunc error:&error]; 67 | if (bitReversePSO == nil) 68 | { 69 | NSLog(@"Failed to create bit-reverse pipeline state object, error %@.", error); 70 | return; 71 | } 72 | 73 | id butterflyPSO = [device newComputePipelineStateWithFunction:butterflyFunc error:&error]; 74 | if (butterflyPSO == nil) 75 | { 76 | NSLog(@"Failed to create butterfly pipeline state object, error %@.", error); 77 | return; 78 | } 79 | 80 | id commandQueue = [device newCommandQueue]; 81 | if (commandQueue == nil) 82 | { 83 | NSLog(@"Failed to find the command queue."); 84 | return; 85 | } 86 | 87 | // Prepare input buffer 88 | id bufferIn = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 89 | float *inDataPtr = (float *)bufferIn.contents; 90 | 91 | for (unsigned long index = 0; index < num; index++) 92 | { 93 | inDataPtr[index * 2] = inBuffer[index].real(); 94 | inDataPtr[index * 2 + 1] = inBuffer[index].imag(); 95 | } 96 | 97 | // Prepare working buffer (for bit-reversed data and butterfly operations) 98 | id bufferData = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 99 | 100 | // Prepare num buffer 101 | id bufferNum = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared]; 102 | *((int *)bufferNum.contents) = (int)num; 103 | 104 | // Prepare log2n buffer 105 | id bufferLog2n = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared]; 106 | *((int *)bufferLog2n.contents) = log2n; 107 | 108 | // Prepare stage buffer 109 | id bufferStage = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared]; 110 | 111 | // Step 1: Bit-reverse permutation 112 | { 113 | id commandBuffer = [commandQueue commandBuffer]; 114 | assert(commandBuffer != nil); 115 | 116 | id computeEncoder = [commandBuffer computeCommandEncoder]; 117 | assert(computeEncoder != nil); 118 | 119 | [computeEncoder setComputePipelineState:bitReversePSO]; 120 | [computeEncoder setBuffer:bufferIn offset:0 atIndex:0]; 121 | [computeEncoder setBuffer:bufferData offset:0 atIndex:1]; 122 | [computeEncoder setBuffer:bufferNum offset:0 atIndex:2]; 123 | [computeEncoder setBuffer:bufferLog2n offset:0 atIndex:3]; 124 | 125 | MTLSize gridSize = MTLSizeMake(num, 1, 1); 126 | NSUInteger threadGroupSize = bitReversePSO.maxTotalThreadsPerThreadgroup; 127 | if (threadGroupSize > num) 128 | { 129 | threadGroupSize = num; 130 | } 131 | MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); 132 | 133 | [computeEncoder dispatchThreads:gridSize 134 | threadsPerThreadgroup:threadgroupSize]; 135 | 136 | [computeEncoder endEncoding]; 137 | [commandBuffer commit]; 138 | [commandBuffer waitUntilCompleted]; 139 | } 140 | 141 | // Step 2: FFT butterfly stages 142 | for (int s = 1; s <= log2n; s++) { 143 | *((int *)bufferStage.contents) = s; 144 | 145 | id commandBuffer = [commandQueue commandBuffer]; 146 | assert(commandBuffer != nil); 147 | 148 | id computeEncoder = [commandBuffer computeCommandEncoder]; 149 | assert(computeEncoder != nil); 150 | 151 | [computeEncoder setComputePipelineState:butterflyPSO]; 152 | [computeEncoder setBuffer:bufferData offset:0 atIndex:0]; 153 | [computeEncoder setBuffer:bufferNum offset:0 atIndex:1]; 154 | [computeEncoder setBuffer:bufferStage offset:0 atIndex:2]; 155 | 156 | // We need num/2 threads for butterfly operations 157 | MTLSize gridSize = MTLSizeMake(num / 2, 1, 1); 158 | NSUInteger threadGroupSize = butterflyPSO.maxTotalThreadsPerThreadgroup; 159 | if (threadGroupSize > num / 2) 160 | { 161 | threadGroupSize = num / 2; 162 | } 163 | MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); 164 | 165 | [computeEncoder dispatchThreads:gridSize 166 | threadsPerThreadgroup:threadgroupSize]; 167 | 168 | [computeEncoder endEncoding]; 169 | [commandBuffer commit]; 170 | [commandBuffer waitUntilCompleted]; 171 | } 172 | 173 | // Copy results back 174 | float *outDataPtr = (float *)bufferData.contents; 175 | for (unsigned long index = 0; index < num; index++) 176 | { 177 | outBuffer[index].real(outDataPtr[index * 2]); 178 | outBuffer[index].imag(outDataPtr[index * 2 + 1]); 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /DFT.mm: -------------------------------------------------------------------------------- 1 | // 2 | // DFT.m 3 | // InokiFFT 4 | // 5 | // Created by inoki on 1/28/22. 6 | // 7 | 8 | #import 9 | #import 10 | 11 | #include 12 | 13 | #include "DFT_Metal_private.h" 14 | 15 | id initFunction(NSString *functionName, id device) { 16 | // Load the shader files with a .metal file extension in the project 17 | id defaultLibrary = [device newDefaultLibrary]; 18 | if (defaultLibrary == nil) 19 | { 20 | NSLog(@"Failed to find the default library."); 21 | return nil; 22 | } 23 | id loadedFunction = [defaultLibrary newFunctionWithName:functionName]; 24 | return loadedFunction; 25 | } 26 | 27 | void calculateDFTMetal(std::complex* inBuffer, std::complex* outBuffer, size_t num) { 28 | @autoreleasepool { 29 | id device = GetMetalSystemDevice(); 30 | id func = initFunction(@"computeDFTMetal", device); 31 | if (func == nil) 32 | { 33 | NSLog(@"Failed to find the DFT function."); 34 | return; 35 | } 36 | 37 | NSError* error = nil; 38 | id funcPSO = [device newComputePipelineStateWithFunction: func error:&error]; 39 | 40 | if (funcPSO == nil) 41 | { 42 | // If the Metal API validation is enabled, you can find out more information about what 43 | // went wrong. (Metal API validation is enabled by default when a debug build is run 44 | // from Xcode) 45 | NSLog(@"Failed to created pipeline state object, error %@.", error); 46 | return; 47 | } 48 | 49 | id commandQueue = [device newCommandQueue]; 50 | if (commandQueue == nil) 51 | { 52 | NSLog(@"Failed to find the command queue."); 53 | return; 54 | } 55 | 56 | // Prepare data 57 | id bufferIn = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 58 | float *inDataPtr = (float *)bufferIn.contents; 59 | 60 | for (unsigned long index = 0; index < num; index++) 61 | { 62 | inDataPtr[index * 2] = inBuffer[index].real(); 63 | inDataPtr[index * 2 + 1] = inBuffer[index].imag(); 64 | } 65 | 66 | id bufferOut = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 67 | id bufferNum = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared]; 68 | *((int *)bufferNum.contents) = (int)num; 69 | NSLog(@"%d", *((int *)bufferNum.contents)); 70 | 71 | id commandBuffer = [commandQueue commandBuffer]; 72 | assert(commandBuffer != nil); 73 | 74 | // Start a compute pass. 75 | id computeEncoder = [commandBuffer computeCommandEncoder]; 76 | assert(computeEncoder != nil); 77 | 78 | // Encode the pipeline state object and its parameters. 79 | [computeEncoder setComputePipelineState:funcPSO]; 80 | [computeEncoder setBuffer:bufferIn offset:0 atIndex:0]; 81 | [computeEncoder setBuffer:bufferOut offset:0 atIndex:1]; 82 | [computeEncoder setBuffer:bufferNum offset:0 atIndex:2]; 83 | 84 | MTLSize gridSize = MTLSizeMake(num, 1, 1); 85 | NSUInteger threadGroupSize = funcPSO.maxTotalThreadsPerThreadgroup; 86 | NSLog(@"My GPU has %lu thread groups.", (unsigned long)threadGroupSize); 87 | if (threadGroupSize > num) 88 | { 89 | threadGroupSize = num; 90 | } 91 | MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); 92 | 93 | // Encode the compute command. 94 | [computeEncoder dispatchThreads:gridSize 95 | threadsPerThreadgroup:threadgroupSize]; 96 | 97 | // End the compute pass. 98 | [computeEncoder endEncoding]; 99 | 100 | // Execute the command. 101 | [commandBuffer commit]; 102 | 103 | // Normally, you want to do other work in your app while the GPU is running, 104 | // but in this example, the code simply blocks until the calculation is complete. 105 | [commandBuffer waitUntilCompleted]; 106 | 107 | float *outDataPtr = (float *)bufferOut.contents; 108 | // Write back 109 | for (unsigned long index = 0; index < num; index++) 110 | { 111 | outBuffer[index].real(outDataPtr[index * 2]); 112 | outBuffer[index].imag(outDataPtr[index * 2 + 1]); 113 | } 114 | // TODO: The result is buggy 115 | } 116 | } 117 | 118 | void calculateDFTMetalLargeMem(std::complex* inBuffer, std::complex* outBuffer, size_t num) { 119 | @autoreleasepool { 120 | id device = GetMetalSystemDevice(); 121 | id func = initFunction(@"computeDFTMetalWithPrecomputedRoot", device); 122 | if (func == nil) 123 | { 124 | NSLog(@"Failed to find the DFT function."); 125 | return; 126 | } 127 | 128 | NSError* error = nil; 129 | id funcPSO = [device newComputePipelineStateWithFunction: func error:&error]; 130 | 131 | if (funcPSO == nil) 132 | { 133 | // If the Metal API validation is enabled, you can find out more information about what 134 | // went wrong. (Metal API validation is enabled by default when a debug build is run 135 | // from Xcode) 136 | NSLog(@"Failed to created pipeline state object, error %@.", error); 137 | return; 138 | } 139 | 140 | id commandQueue = [device newCommandQueue]; 141 | if (commandQueue == nil) 142 | { 143 | NSLog(@"Failed to find the command queue."); 144 | return; 145 | } 146 | 147 | // Prepare data 148 | id bufferIn = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 149 | float *inDataPtr = (float *)bufferIn.contents; 150 | 151 | for (unsigned long index = 0; index < num; index++) 152 | { 153 | inDataPtr[index * 2] = inBuffer[index].real(); 154 | inDataPtr[index * 2 + 1] = inBuffer[index].imag(); 155 | } 156 | 157 | // Precompute Roots 158 | id bufferRoots = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 159 | inDataPtr = (float *)bufferRoots.contents; 160 | for (unsigned long index = 0; index < num; index++) 161 | { 162 | std::complex root = std::exp(std::complex(0, -2 * M_PI * index / num)); 163 | inDataPtr[index * 2] = root.real(); 164 | inDataPtr[index * 2 + 1] = root.imag(); 165 | } 166 | 167 | id bufferOut = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared]; 168 | id bufferNum = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared]; 169 | *((int *)bufferNum.contents) = (int)num; 170 | 171 | id commandBuffer = [commandQueue commandBuffer]; 172 | assert(commandBuffer != nil); 173 | 174 | // Start a compute pass. 175 | id computeEncoder = [commandBuffer computeCommandEncoder]; 176 | assert(computeEncoder != nil); 177 | 178 | // Encode the pipeline state object and its parameters. 179 | [computeEncoder setComputePipelineState:funcPSO]; 180 | [computeEncoder setBuffer:bufferIn offset:0 atIndex:0]; 181 | [computeEncoder setBuffer:bufferRoots offset:0 atIndex:1]; 182 | [computeEncoder setBuffer:bufferOut offset:0 atIndex:2]; 183 | [computeEncoder setBuffer:bufferNum offset:0 atIndex:3]; 184 | 185 | MTLSize gridSize = MTLSizeMake(num, 1, 1); 186 | NSUInteger threadGroupSize = funcPSO.maxTotalThreadsPerThreadgroup; 187 | NSLog(@"My GPU has %lu thread groups.", (unsigned long)threadGroupSize); 188 | if (threadGroupSize > num) 189 | { 190 | threadGroupSize = num; 191 | } 192 | MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); 193 | 194 | // Encode the compute command. 195 | [computeEncoder dispatchThreads:gridSize 196 | threadsPerThreadgroup:threadgroupSize]; 197 | 198 | // End the compute pass. 199 | [computeEncoder endEncoding]; 200 | 201 | // Execute the command. 202 | [commandBuffer commit]; 203 | 204 | // Normally, you want to do other work in your app while the GPU is running, 205 | // but in this example, the code simply blocks until the calculation is complete. 206 | [commandBuffer waitUntilCompleted]; 207 | 208 | float *outDataPtr = (float *)bufferOut.contents; 209 | // Write back 210 | for (unsigned long index = 0; index < num; index++) 211 | { 212 | outBuffer[index].real(outDataPtr[index * 2]); 213 | outBuffer[index].imag(outDataPtr[index * 2 + 1]); 214 | } 215 | } 216 | } 217 | --------------------------------------------------------------------------------