├── DFT.cpp
├── DFT.cu
├── DFT.h
├── DFT.metal
├── DFT.mm
├── DFT_Metal_private.h
├── DFT_Metal_private.m
├── FFT.cpp
├── FFT.h
├── LICENSE
├── README.md
└── main.cpp


/DFT.cpp:
--------------------------------------------------------------------------------

#define _USE_MATH_DEFINES // for C++
#include <cmath>
#include <complex>
#include <cstddef>

/// Reference O(num^2) discrete Fourier transform on the CPU.
///
/// Computes out[i] = sum_j in[j] * exp(-2*pi*I * i*j / num) for i in [0, num).
///
/// @param in   num input samples (must not alias `out`).
/// @param out  receives num output bins.
/// @param num  transform length; 0 (or a null pointer) is a no-op.
void calculateDFT(std::complex<float>* in, std::complex<float>* out, size_t num)
{
    if (num == 0 || in == nullptr || out == nullptr) return;

    const double step = -2.0 * M_PI / static_cast<double>(num);
    for (size_t i = 0; i < num; i++)
    {
        // Accumulate in double so rounding error does not grow with num.
        std::complex<double> acc(0.0, 0.0);
        // k tracks (i * j) % num incrementally: the twiddle factor is periodic
        // in num, and the reduced angle keeps full precision for large i * j.
        size_t k = 0;
        for (size_t j = 0; j < num; j++)
        {
            acc += std::complex<double>(in[j]) * std::polar(1.0, step * static_cast<double>(k));
            k += i;
            if (k >= num) k -= num;
        }
        out[i] = std::complex<float>(acc);
    }
}

--------------------------------------------------------------------------------
/DFT.cu:
--------------------------------------------------------------------------------
#include <complex>
#include <cstdio>
#include <vector>

// <cmath> (pulled in by <complex>) may already provide M_PI; do not redefine it.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuComplex.h"

// Threads per block used by every DFT launch in this file.
static const unsigned int kDFTBlockSize = 256;
// Upper bound on the number of blocks; the grid-stride loops in the kernels
// pick up whatever the capped grid does not cover directly.
static const size_t kDFTMaxBlocks = 65535;

/// Complex exponential exp(z) = e^(z.x) * (cos(z.y) + i sin(z.y)) in single precision.
__device__ __forceinline__ cuComplex cuComplexExp(cuComplex z)
{
    cuComplex res;
    float t = expf(z.x);
    sincosf(z.y, &res.y, &res.x);
    res.x *= t;
    res.y *= t;
    return res;
}

/// Direct DFT kernel: each thread computes whole output bins.
///
/// Launch layout: any 1D grid / 1D block. A grid-stride loop makes the result
/// independent of the launch configuration. No shared memory is used.
///
/// @param in   num input samples in device memory.
/// @param out  num output bins in device memory (must not alias `in`).
/// @param num  transform length.
__global__ void calculateDFTCUDAKernel(const cuComplex* __restrict__ in, cuComplex* __restrict__ out, size_t num)
{
    const float step = -6.283185307179586f / (float)num; // -2*pi / num
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < num; i += stride)
    {
        // Accumulate in a register and store once instead of
        // read-modify-writing global memory for every term.
        cuComplex acc = make_cuComplex(0.0f, 0.0f);
        // k == (i * j) % num, maintained without a multiply so it can neither
        // overflow nor push the float angle far outside [-2*pi, 0].
        size_t k = 0;
        for (size_t j = 0; j < num; j++)
        {
            acc = cuCaddf(acc, cuCmulf(in[j], cuComplexExp(make_cuComplex(0.0f, step * (float)k))));
            k += i;
            if (k >= num) k -= num;
        }
        out[i] = acc;
    }
}

/// Prints a diagnostic and returns false when `status` is not cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/// Number of blocks needed to cover `num` elements, capped at kDFTMaxBlocks.
static unsigned int dftGridSize(size_t num)
{
    size_t blocks = (num + kDFTBlockSize - 1) / kDFTBlockSize;
    if (blocks > kDFTMaxBlocks) blocks = kDFTMaxBlocks;
    return (unsigned int)blocks;
}

/// Computes the DFT of `in` on the GPU and writes it to `out`.
///
/// Errors from the CUDA runtime are reported on stderr; in that case `out`
/// is left untouched. Device memory is always released before returning.
///
/// @param in   num host samples.
/// @param out  receives num host output bins.
/// @param num  transform length; 0 is a no-op.
void calculateDFTCUDA(std::complex<float>* in, std::complex<float>* out, size_t num)
{
    if (num == 0) return;

    const size_t bytes = num * sizeof(cuComplex);
    cuComplex* d_in = nullptr;
    cuComplex* d_out = nullptr;

    if (cudaOk(cudaMalloc(&d_in, bytes), "cudaMalloc(d_in)") &&
        cudaOk(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)") &&
        cudaOk(cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
    {
        // A multi-block launch: the original <<<1, num>>> could not run for
        // num larger than the per-block thread limit.
        calculateDFTCUDAKernel<<<dftGridSize(num), kDFTBlockSize>>>(d_in, d_out, num);

        // cudaGetLastError reports launch-configuration errors;
        // cudaDeviceSynchronize reports faults raised while the kernel ran.
        if (cudaOk(cudaGetLastError(), "DFT Kernel launch") &&
            cudaOk(cudaDeviceSynchronize(), "DFT Kernel execution"))
        {
            cudaOk(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
        }
    }

    // cudaFree accepts null pointers, so this is safe on every path.
    cudaFree(d_in);
    cudaFree(d_out);
}

/// Direct DFT kernel that reads twiddle factors from a precomputed table.
///
/// Launch layout: any 1D grid / 1D block (grid-stride loop). No shared memory.
///
/// @param in     num input samples in device memory.
/// @param roots  num entries, roots[k] = exp(-2*pi*I * k / num).
/// @param out    num output bins in device memory (must not alias `in`).
/// @param num    transform length.
__global__ void calculateDFTCUDAKernelWithPrecomputedRoot(const cuComplex* __restrict__ in, const cuComplex* __restrict__ roots, cuComplex* __restrict__ out, size_t num)
{
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < num; i += stride)
    {
        cuComplex acc = make_cuComplex(0.0f, 0.0f);
        // k == (i * j) % num without the multiply (which overflowed int) or
        // the per-iteration modulo.
        size_t k = 0;
        for (size_t j = 0; j < num; j++)
        {
            acc = cuCaddf(acc, cuCmulf(in[j], roots[k]));
            k += i;
            if (k >= num) k -= num;
        }
        out[i] = acc;
    }
}

/// Computes the DFT of `in` on the GPU using a host-precomputed table of the
/// num-th roots of unity (trading device memory for per-term sin/cos calls).
///
/// Errors from the CUDA runtime are reported on stderr; in that case `out`
/// is left untouched. Device memory is always released before returning.
///
/// @param in   num host samples.
/// @param out  receives num host output bins.
/// @param num  transform length; 0 is a no-op.
void calculateDFTCUDALargeMem(std::complex<float>* in, std::complex<float>* out, size_t num)
{
    if (num == 0) return;

    const size_t bytes = num * sizeof(cuComplex);

    // Precompute the roots in double precision, then narrow once.
    std::vector<cuComplex> roots(num);
    for (size_t k = 0; k < num; k++)
    {
        const std::complex<double> root = std::polar(1.0, -2.0 * M_PI * (double)k / (double)num);
        roots[k] = make_cuComplex((float)root.real(), (float)root.imag());
    }

    cuComplex* d_in = nullptr;
    cuComplex* d_out = nullptr;
    cuComplex* d_roots = nullptr;

    if (cudaOk(cudaMalloc(&d_in, bytes), "cudaMalloc(d_in)") &&
        cudaOk(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)") &&
        cudaOk(cudaMalloc(&d_roots, bytes), "cudaMalloc(d_roots)") &&
        cudaOk(cudaMemcpy(d_in, in, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(input to device)") &&
        cudaOk(cudaMemcpy(d_roots, roots.data(), bytes, cudaMemcpyHostToDevice), "cudaMemcpy(roots to device)"))
    {
        calculateDFTCUDAKernelWithPrecomputedRoot<<<dftGridSize(num), kDFTBlockSize>>>(d_in, d_roots, d_out, num);

        if (cudaOk(cudaGetLastError(), "DFT Kernel launch") &&
            cudaOk(cudaDeviceSynchronize(), "DFT Kernel execution"))
        {
            cudaOk(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
        }
    }

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_roots);
}

--------------------------------------------------------------------------------
/DFT.h:
--------------------------------------------------------------------------------
#pragma once

#include <complex>
#include <cstddef>

// CPU reference DFT: out[i] = sum_j in[j] * exp(-2*pi*I * i*j / num).
void calculateDFT(std::complex<float> *in, std::complex<float> *out, size_t num);
#ifdef __HAS_CUDA__
// CUDA DFT; twiddle factors are evaluated on the device.
void calculateDFTCUDA(std::complex<float>* in, std::complex<float>* out, size_t num);
// CUDA DFT; twiddle factors come from a precomputed table of num roots.
void calculateDFTCUDALargeMem(std::complex<float>* in, std::complex<float>* out, size_t num);
#endif

#ifdef HAS_METAL
// Metal DFT; twiddle factors are evaluated on the device.
void calculateDFTMetal(std::complex<float>* inBuffer, std::complex<float>* outBuffer, size_t num);
// Metal DFT; twiddle factors come from a precomputed table of num roots.
void calculateDFTMetalLargeMem(std::complex<float>* inBuffer, std::complex<float>* outBuffer, size_t num);
#endif

--------------------------------------------------------------------------------
/DFT.metal:
--------------------------------------------------------------------------------
//
//  DFT.metal
//  InokiFFT
//
//  Created by inoki on 1/28/22.
//

#include <metal_stdlib>
using namespace metal;

// -2*pi, the numerator of the forward-DFT twiddle angle.
constant float kMinusTwoPi = -6.283185307179586f;

/// Component-wise complex addition; float2 holds (real, imaginary).
float2 complexAdd(const float2 inA, const float2 inB)
{
    float2 out;
    out[0] = inA[0] + inB[0];
    out[1] = inA[1] + inB[1];
    return out;
}

/// Complex multiplication; float2 holds (real, imaginary).
float2 complexMul(const float2 inA, const float2 inB)
{
    float2 out;
    out[0] = inA[0] * inB[0] - inA[1] * inB[1];
    out[1] = inA[0] * inB[1] + inA[1] * inB[0];
    return out;
}

/// Complex exponential: exp(in[0]) * (cos(in[1]) + i sin(in[1])).
float2 complexExp(const float2 in)
{
    float2 out;
    float t = exp(in[0]);
    out[0] = t * cos(in[1]);
    out[1] = t * sin(in[1]);
    return out;
}

/// Direct DFT: the thread at grid position `index` computes output bin `index`.
///
/// in  : 2 * num floats, interleaved (real, imaginary) samples.
/// out : 2 * num floats, interleaved (real, imaginary) bins.
/// num : transform length; threads with index >= *num do nothing.
kernel void computeDFTMetal(device const float *in,
                            device float *out,
                            device const int *num,
                            uint index [[thread_position_in_grid]])
{
    if (*num <= 0) return;
    const uint n = (uint)*num;
    if (index >= n) return;

    const float step = kMinusTwoPi / (float)n;
    // Accumulate in registers and store once at the end.
    float2 acc = float2(0.0f, 0.0f);
    // k == (index * j) % n, maintained incrementally. Feeding the reduced
    // angle to sin/cos keeps the argument within one period, where the GPU
    // trigonometric functions are accurate; the unreduced product is not.
    uint k = 0;
    for (uint j = 0; j < n; j++)
    {
        const float2 sample = float2(in[j * 2], in[j * 2 + 1]);
        const float2 twiddle = complexExp(float2(0.0f, step * (float)k));
        acc = complexAdd(acc, complexMul(sample, twiddle));

        k += index;
        if (k >= n) k -= n;
    }
    out[index * 2] = acc[0];
    out[index * 2 + 1] = acc[1];
}

/// Direct DFT using a precomputed twiddle table.
///
/// in    : 2 * num floats, interleaved (real, imaginary) samples.
/// roots : 2 * num floats, roots[2k], roots[2k+1] = exp(-2*pi*I * k / num).
/// out   : 2 * num floats, interleaved (real, imaginary) bins.
/// num   : transform length; threads with index >= *num do nothing.
kernel void computeDFTMetalWithPrecomputedRoot(device const float *in,
                                               device const float *roots,
                                               device float *out,
                                               device const int *num,
                                               uint index [[thread_position_in_grid]])
{
    if (*num <= 0) return;
    const uint n = (uint)*num;
    if (index >= n) return;

    float2 acc = float2(0.0f, 0.0f);
    // k == (index * j) % n without the multiply (overflow) or the modulo.
    uint k = 0;
    for (uint j = 0; j < n; j++)
    {
        const float2 sample = float2(in[j * 2], in[j * 2 + 1]);
        const float2 twiddle = float2(roots[2 * k], roots[2 * k + 1]);
        acc = complexAdd(acc, complexMul(sample, twiddle));

        k += index;
        if (k >= n) k -= n;
    }
    out[index * 2] = acc[0];
    out[index * 2 + 1] = acc[1];
}

--------------------------------------------------------------------------------
/DFT.mm:
--------------------------------------------------------------------------------
//
//  DFT.m
//  InokiFFT
//
//  Created by inoki on 1/28/22.
//

#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

#include <climits>
#include <cmath>
#include <complex>

#include "DFT_Metal_private.h"

/// Looks up `functionName` in the device's default Metal library.
/// Returns nil (after logging) when the default library cannot be loaded.
id<MTLFunction> initFunction(NSString *functionName, id<MTLDevice> device) {
    // Load the shader files with a .metal file extension in the project
    id<MTLLibrary> defaultLibrary = [device newDefaultLibrary];
    if (defaultLibrary == nil)
    {
        NSLog(@"Failed to find the default library.");
        return nil;
    }
    id<MTLFunction> loadedFunction = [defaultLibrary newFunctionWithName:functionName];
    return loadedFunction;
}

/// Builds the compute pipeline state for the named kernel; logs and returns nil on failure.
static id<MTLComputePipelineState> makeDFTPipeline(NSString *functionName, id<MTLDevice> device) {
    id<MTLFunction> func = initFunction(functionName, device);
    if (func == nil)
    {
        NSLog(@"Failed to find the DFT function.");
        return nil;
    }

    NSError* error = nil;
    id<MTLComputePipelineState> funcPSO = [device newComputePipelineStateWithFunction: func error:&error];
    if (funcPSO == nil)
    {
        // If the Metal API validation is enabled, you can find out more information about what
        // went wrong.  (Metal API validation is enabled by default when a debug build is run
        // from Xcode)
        NSLog(@"Failed to created pipeline state object, error %@.", error);
    }
    return funcPSO;
}

/// Allocates a shared buffer of `num` interleaved (real, imaginary) float pairs.
/// When `values` is non-null its contents are copied in. Returns nil on allocation failure.
static id<MTLBuffer> makeComplexBuffer(id<MTLDevice> device, const std::complex<float> *values, size_t num) {
    id<MTLBuffer> buffer = [device newBufferWithLength:num * sizeof(float) * 2 options:MTLResourceStorageModeShared];
    if (buffer != nil && values != nullptr)
    {
        float *dataPtr = (float *)buffer.contents;
        for (size_t index = 0; index < num; index++)
        {
            dataPtr[index * 2] = values[index].real();
            dataPtr[index * 2 + 1] = values[index].imag();
        }
    }
    return buffer;
}

/// Allocates a shared buffer holding `num` as a single int (the kernels' length argument).
static id<MTLBuffer> makeCountBuffer(id<MTLDevice> device, size_t num) {
    id<MTLBuffer> buffer = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared];
    if (buffer != nil)
    {
        *((int *)buffer.contents) = (int)num;
    }
    return buffer;
}

/// Encodes one compute pass that binds `buffers` at indices 0..count-1 and
/// runs `num` threads, then blocks until the GPU is done.
/// Returns NO (after logging) if any Metal object could not be created or the
/// command buffer finished with an error.
static BOOL runDFTKernel(id<MTLCommandQueue> commandQueue,
                         id<MTLComputePipelineState> funcPSO,
                         NSArray<id<MTLBuffer>> *buffers,
                         size_t num) {
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
    if (commandBuffer == nil)
    {
        NSLog(@"Failed to create the command buffer.");
        return NO;
    }

    // Start a compute pass.
    id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
    if (computeEncoder == nil)
    {
        NSLog(@"Failed to create the compute command encoder.");
        return NO;
    }

    // Encode the pipeline state object and its parameters.
    [computeEncoder setComputePipelineState:funcPSO];
    for (NSUInteger slot = 0; slot < buffers.count; slot++)
    {
        [computeEncoder setBuffer:buffers[slot] offset:0 atIndex:slot];
    }

    MTLSize gridSize = MTLSizeMake(num, 1, 1);
    // Note: this value is the maximum number of threads per threadgroup.
    NSUInteger threadGroupSize = funcPSO.maxTotalThreadsPerThreadgroup;
    NSLog(@"My GPU has %lu thread groups.", (unsigned long)threadGroupSize);
    if (threadGroupSize > num)
    {
        threadGroupSize = num;
    }
    MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1);

    // Encode the compute command.
    [computeEncoder dispatchThreads:gridSize
              threadsPerThreadgroup:threadgroupSize];

    // End the compute pass.
    [computeEncoder endEncoding];

    // Execute the command.
    [commandBuffer commit];

    // Normally, you want to do other work in your app while the GPU is running,
    // but in this example, the code simply blocks until the calculation is complete.
    [commandBuffer waitUntilCompleted];

    if (commandBuffer.status == MTLCommandBufferStatusError)
    {
        NSLog(@"DFT command buffer failed, error %@.", commandBuffer.error);
        return NO;
    }
    return YES;
}

/// Copies `num` interleaved (real, imaginary) pairs from `buffer` into `outBuffer`.
static void readComplexBuffer(id<MTLBuffer> buffer, std::complex<float> *outBuffer, size_t num) {
    const float *outDataPtr = (const float *)buffer.contents;
    for (size_t index = 0; index < num; index++)
    {
        outBuffer[index].real(outDataPtr[index * 2]);
        outBuffer[index].imag(outDataPtr[index * 2 + 1]);
    }
}

/// Computes the DFT of `inBuffer` with the `computeDFTMetal` kernel.
///
/// On any Metal failure a message is logged and `outBuffer` is left untouched.
/// `num` == 0 is a no-op; lengths above INT_MAX are rejected because the
/// kernel receives the length as an int.
void calculateDFTMetal(std::complex<float>* inBuffer, std::complex<float>* outBuffer, size_t num) {
    if (num == 0 || inBuffer == nullptr || outBuffer == nullptr) return;
    if (num > (size_t)INT_MAX)
    {
        NSLog(@"DFT length %zu is too large for the Metal kernel.", num);
        return;
    }

    @autoreleasepool {
        id<MTLDevice> device = GetMetalSystemDevice();
        id<MTLComputePipelineState> funcPSO = makeDFTPipeline(@"computeDFTMetal", device);
        if (funcPSO == nil)
        {
            return;
        }

        id<MTLCommandQueue> commandQueue = [device newCommandQueue];
        if (commandQueue == nil)
        {
            NSLog(@"Failed to find the command queue.");
            return;
        }

        // Prepare data
        id<MTLBuffer> bufferIn = makeComplexBuffer(device, inBuffer, num);
        id<MTLBuffer> bufferOut = makeComplexBuffer(device, nullptr, num);
        id<MTLBuffer> bufferNum = makeCountBuffer(device, num);
        if (bufferIn == nil || bufferOut == nil || bufferNum == nil)
        {
            NSLog(@"Failed to allocate the DFT buffers.");
            return;
        }
        NSLog(@"%d", *((int *)bufferNum.contents));

        // Buffer order matches the kernel's argument indices: in, out, num.
        if (!runDFTKernel(commandQueue, funcPSO, @[bufferIn, bufferOut, bufferNum], num))
        {
            return;
        }

        // Write back
        readComplexBuffer(bufferOut, outBuffer, num);
        // TODO: The result was reported as buggy; the kernel's twiddle angle is
        // now reduced modulo num before sin/cos - re-verify against the CPU DFT.
    }
}

/// Computes the DFT of `inBuffer` with the `computeDFTMetalWithPrecomputedRoot`
/// kernel, supplying a host-computed table of the num-th roots of unity.
///
/// On any Metal failure a message is logged and `outBuffer` is left untouched.
/// `num` == 0 is a no-op; lengths above INT_MAX are rejected because the
/// kernel receives the length as an int.
void calculateDFTMetalLargeMem(std::complex<float>* inBuffer, std::complex<float>* outBuffer, size_t num) {
    if (num == 0 || inBuffer == nullptr || outBuffer == nullptr) return;
    if (num > (size_t)INT_MAX)
    {
        NSLog(@"DFT length %zu is too large for the Metal kernel.", num);
        return;
    }

    @autoreleasepool {
        id<MTLDevice> device = GetMetalSystemDevice();
        id<MTLComputePipelineState> funcPSO = makeDFTPipeline(@"computeDFTMetalWithPrecomputedRoot", device);
        if (funcPSO == nil)
        {
            return;
        }

        id<MTLCommandQueue> commandQueue = [device newCommandQueue];
        if (commandQueue == nil)
        {
            NSLog(@"Failed to find the command queue.");
            return;
        }

        // Prepare data
        id<MTLBuffer> bufferIn = makeComplexBuffer(device, inBuffer, num);
        id<MTLBuffer> bufferRoots = makeComplexBuffer(device, nullptr, num);
        id<MTLBuffer> bufferOut = makeComplexBuffer(device, nullptr, num);
        id<MTLBuffer> bufferNum = makeCountBuffer(device, num);
        if (bufferIn == nil || bufferRoots == nil || bufferOut == nil || bufferNum == nil)
        {
            NSLog(@"Failed to allocate the DFT buffers.");
            return;
        }

        // Precompute Roots (in double precision, narrowed once on store)
        float *rootsPtr = (float *)bufferRoots.contents;
        for (size_t index = 0; index < num; index++)
        {
            const std::complex<double> root = std::polar(1.0, -2.0 * M_PI * (double)index / (double)num);
            rootsPtr[index * 2] = (float)root.real();
            rootsPtr[index * 2 + 1] = (float)root.imag();
        }

        // Buffer order matches the kernel's argument indices: in, roots, out, num.
        if (!runDFTKernel(commandQueue, funcPSO, @[bufferIn, bufferRoots, bufferOut, bufferNum], num))
        {
            return;
        }

        // Write back
        readComplexBuffer(bufferOut, outBuffer, num);
    }
}

--------------------------------------------------------------------------------
/DFT_Metal_private.h:
--------------------------------------------------------------------------------
//
//  DFT_Metal_private.h
//  InokiFFT
//
//  Created by inoki on 1/28/22.
6 | // 7 | 8 | #ifndef DFT_Metal_private_h 9 | #define DFT_Metal_private_h 10 | 11 | #import 12 | #import 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | /* The Metal MTLCreateSystemDefaultDevice has C-style ABI, which cannot be properly linked by C++ linker, use this workaround. */ 18 | id GetMetalSystemDevice(); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | 25 | #endif /* DFT_Metal_private_h */ 26 | -------------------------------------------------------------------------------- /DFT_Metal_private.m: -------------------------------------------------------------------------------- 1 | // 2 | // DFT.m 3 | // InokiFFT 4 | // 5 | // Created by inoki on 1/28/22. 6 | // 7 | 8 | #import 9 | #import 10 | 11 | #include "DFT_Metal_private.h" 12 | 13 | id GetMetalSystemDevice() { 14 | return MTLCreateSystemDefaultDevice(); 15 | } 16 | -------------------------------------------------------------------------------- /FFT.cpp: -------------------------------------------------------------------------------- 1 | 2 | #define _USE_MATH_DEFINES // for C++ 3 | #include 4 | #include 5 | #include 6 | 7 | unsigned int bitReverse(unsigned int x, int log2n) 8 | { 9 | int n = 0; 10 | int mask = 0x1; 11 | for (int i = 0; i < log2n; i++) 12 | { 13 | n <<= 1; 14 | n |= (x & 1); 15 | x >>= 1; 16 | } 17 | return n; 18 | } 19 | 20 | void fft_impl(std::complex *a, std::complex* b, int num) 21 | { 22 | std::complex J(0, 1); 23 | int n = num; 24 | int log2n = 0; 25 | while (num > 1) { 26 | log2n++; 27 | num /= 2; 28 | } 29 | for (unsigned int i = 0; i < n; ++i) { 30 | b[bitReverse(i, log2n)] = a[i]; 31 | } 32 | for (int s = 1; s <= log2n; ++s) { 33 | int m = 1 << s; 34 | int m2 = m >> 1; 35 | std::complex w(1, 0); 36 | std::complex wm = std::exp(-J * (float)(M_PI / m2)); 37 | for (int j = 0; j < m2; ++j) { 38 | for (int k = j; k < n; k += m) { 39 | std::complex t = w * b[k + m2]; 40 | std::complex u = b[k]; 41 | b[k] = u + t; 42 | b[k + m2] = u - t; 43 | } 44 | w *= wm; 
45 | } 46 | } 47 | } 48 | 49 | void calculateFFT(std::complex* in, std::complex* out, size_t num) 50 | { 51 | fft_impl(in, out, num); 52 | } 53 | -------------------------------------------------------------------------------- /FFT.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void calculateFFT(std::complex* in, std::complex* out, size_t num); 6 | #ifdef __HAS_CUDA__ 7 | void calculateFFTCUDA(std::complex* in, std::complex* out, size_t num); 8 | #endif 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Inoki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Discrete Fourier Transform (DFT/FFT) implementations 2 | 3 | This project has experimental implementations of DFT/FFT in CUDA and Apple Metal. Use it as your own risk (remember to check the array boarder if you would like to use them in your own project). 4 | 5 | - `DFT.cu` has DFT implementations (with or without precomputed complex roots) in CUDA 6 | - `DFT.metal` has DFT implementations (with or without precomputed complex roots) in Apple Metal 7 | - `FFT.cpp` includes an FFT CPU implementation 8 | - Parallel FFT is work in progress... 9 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | // InokiFourier.cpp 2 | // 3 | 4 | #include 5 | #include "DFT.h" 6 | #include "FFT.h" 7 | 8 | #define FFT_LEN 128 9 | 10 | int main() 11 | { 12 | std::complex inBuffer[128], outBuffer[128]; 13 | for (int i = 0; i < FFT_LEN; i++) 14 | { 15 | switch (i % 8) 16 | { 17 | case 0: 18 | inBuffer[i] = 1; 19 | break; 20 | case 1: case 7: 21 | inBuffer[i] = std::sqrtf(2) / 2; 22 | break; 23 | case 2: case 6: 24 | inBuffer[i] = 0; 25 | break; 26 | case 3: case 5: 27 | inBuffer[i] = -std::sqrtf(2) / 2; 28 | break; 29 | case 4: 30 | inBuffer[i] = -1; 31 | break; 32 | default: 33 | break; 34 | } 35 | } 36 | #ifndef __HAS_CUDA__ 37 | #ifndef HAS_METAL 38 | std::cout << "Using CPU implementation" << std::endl; 39 | calculateDFT(inBuffer, outBuffer, FFT_LEN); 40 | calculateFFT(inBuffer, outBuffer, FFT_LEN); 41 | #else 42 | std::cout << "Using Metal implementation" << std::endl; 43 | calculateDFTMetal(inBuffer, outBuffer, FFT_LEN); 44 | #endif 45 | #else 46 | std::cout << "Using customized CUDA implementation" << std::endl; 47 | calculateDFTCUDA(inBuffer, outBuffer, FFT_LEN); 48 | #endif 
// !__NVCC__ 49 | float maxOut = -1; 50 | int maxI = 0; 51 | for (int i = 0; i < FFT_LEN; i++) 52 | { 53 | float outAbs = std::abs(outBuffer[i]); 54 | std::cout << outAbs << " "; 55 | if (outAbs > maxOut) 56 | { 57 | maxOut = outAbs; 58 | maxI = i; 59 | } 60 | } 61 | std::cout << std::endl << "Max: " << maxOut << " at " << maxI << std::endl; 62 | return 0; 63 | } 64 | --------------------------------------------------------------------------------