├── CMakeLists.txt
├── README.md
├── src
│   ├── assertions.hpp
│   ├── definitions.cuh
│   ├── grid.cuh
│   ├── grid.hpp
│   ├── poisson.cuh
│   ├── poisson.hpp
│   └── solver.hpp
└── test
    ├── CMakeLists.txt
    ├── test_grid.cu
    └── test_poisson.cu

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)
project(POISSON_MULTIGRID VERSION 1.0 LANGUAGES CUDA)

include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (DEFINED ENV{ARCH})
    set(ARCH $ENV{ARCH})
else()
    set(ARCH sm_70)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O4 -g -use_fast_math -Xcompiler -fopenmp -std=c++11 -arch=${ARCH} -Xptxas=-v -lineinfo")

include_directories(src)
include_directories(test)
include(CTest)
enable_testing()

add_subdirectory(test)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA Multigrid
The code in this repository solves Poisson's equation in 2D subject to Dirichlet boundary
conditions using the multigrid method with a Gauss-Seidel smoother. The PDE is discretized using
second-order finite differences and conservative prolongation and restriction operators. There are
two implementations of the solver: a single-threaded CPU solver and a GPU solver (custom CUDA
kernels). To expose parallelism, the GPU solver uses a red-black Gauss-Seidel smoother.

## Usage
Look at the test program `test/test_poisson.cu` to learn how to use the code.

```CUDA
// Select problem to solve
using CUDAProblem = CUDAPoisson<double>;
// Select smoother
using CUDASmoother = CUDAGaussSeidelRedBlack;
// Select multigrid solver
using CUDAMG = CUDAMultigrid<double, CUDAProblem, CUDASmoother>;

CUDAProblem problem(l, h, modes);
CUDAMG solver(problem);
auto out = solve<double>(solver, problem, opts);
printf("Iterations: %d, Residual: %g \n", out.iterations, out.residual);
```

```
CPU: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
GPU: NVIDIA RTX 2080 Ti

test/test_poisson
Solver: Gauss-Seidel (red-black)
Iteration       Residual
10              6.674289
20              1.369973
30              0.2812023
40              0.05771992
50              0.01184766
60              0.002431866
70              0.0004991677
80              0.0001024598
90              2.103102e-05
100             4.316853e-06
110             8.860825e-07
120             1.818784e-07
130             3.733259e-08
Iterations: 139, Residual: 8.97769e-09
Solver: Multi-Grid
Iteration       Residual
10              4.959894e-09
Iterations: 10, Residual: 4.95989e-09
Solver: CUDA Gauss-Seidel (red-black)
Iteration       Residual
10              6.674289
20              1.369973
30              0.2812023
40              0.05771992
50              0.01184766
60              0.002431866
70              0.0004991677
80              0.0001024598
90              2.103102e-05
100             4.316853e-06
110             8.860825e-07
120             1.818784e-07
130             3.733259e-08
Iterations: 139, Residual: 8.97769e-09
Solver: CUDA Multi-Grid
Iteration       Residual
10              4.959894e-09
Iterations: 10, Residual: 4.95989e-09
MMS convergence test
Solver: Multi-Grid
Grid Size       Iterations      Time (ms)       Residual        Error           Rate
6 x 6           1               0.00406         2.4718e-16      0.9348          -inf
10 x 10         8               0.01162         8.3216e-09      0.30908         1.59669
18 x 18         10              0.03814         4.9599e-09      0.08183         1.91727
34 x 34         11              0.14534         1.6851e-09      0.02074         1.98024
66 x 66         11              0.55350         2.2692e-09      0.0052025       1.99512
130 x 130       11              2.19837         2.4601e-09      0.0013017       1.99878
258 x 258       11              8.77101         2.5173e-09      0.0003255       1.99970
514 x 514       11              40.58800        2.5417e-09      8.1378e-05      1.99993
1026 x 1026     11              179.36189       2.5836e-09      2.0344e-05      2.00001
2050 x 2050     11              746.79437       2.735e-09       5.0858e-06      2.00010
4098 x 4098     11              3020.28027      3.38e-09        1.2711e-06      2.00041
8194 x 8194     11              12569.50879     7.1411e-09      3.174e-07       2.00166
MMS convergence test
Solver: CUDA Multi-Grid
Grid Size       Iterations      Time (ms)       Residual        Error           Rate
6 x 6           1               0.08147         2.4718e-16      0.9348          -inf
10 x 10         8               0.46346         8.3216e-09      0.30908         1.59669
18 x 18         10              0.76461         4.9599e-09      0.08183         1.91727
34 x 34         11              1.04093         1.6851e-09      0.02074         1.98024
66 x 66         11              1.23590         2.2692e-09      0.0052025       1.99512
130 x 130       11              1.52275         2.4601e-09      0.0013017       1.99878
258 x 258       11              2.05760         2.5173e-09      0.0003255       1.99970
514 x 514       11              3.58195         2.5417e-09      8.1378e-05      1.99993
1026 x 1026     11              10.08339        2.5836e-09      2.0344e-05      2.00001
2050 x 2050     11              29.20931        2.735e-09       5.0858e-06      2.00010
4098 x 4098     11              109.10480       3.38e-09        1.2711e-06      2.00041
8194 x 8194     11              397.84283       7.1411e-09      3.174e-07       2.00166
```

## TODO
Currently, the GPU solver is more or less a copy-and-paste version of the CPU solver. It would be
nice to remove much of the code duplication. There is also room for improvement in terms of
optimization. First, I should perform a baseline profiling run to identify the current hotspots.
Nonetheless, I already have some ideas about what can be improved. The red-black Gauss-Seidel
implementation is currently terrible: it takes two trips to DRAM to load the data, once for the
red points and once more for the black points. Each time, only half of the threads are active. It
should be possible to keep all threads active by loading the data into shared memory using a
float2/double2 instruction. Some kernels, like the restriction and residual kernels, can also
probably be merged into a single one to further reduce memory traffic; a sketch of such a fused
kernel is shown below.
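As a starting point for the kernel-fusion idea, here is a minimal sketch (untested, and not part
of the current code base; the names are hypothetical) of a kernel that evaluates the fine-grid
residual on the fly while restricting it to the coarse grid, so the intermediate residual never
round-trips through DRAM. It mirrors the stencils in `cuda_poisson_residual` and
`cuda_grid_restrict` and reuses `inbounds` from `src/grid.cuh`:

```CUDA
template <typename T>
__device__ T residual_at(const T *u, const T *f, const int n, const T hi2,
                         const int j, const int i) {
    // Same 5-point residual as cuda_poisson_residual, evaluated on demand
    return f[j + i * n] - (u[j + 1 + i * n] + u[j - 1 + i * n] -
                           4.0 * u[j + i * n] + u[j + (i + 1) * n] +
                           u[j + (i - 1) * n]) * hi2;
}

template <typename T>
__global__ void cuda_residual_restrict_fused(T *rc, const int nc, const T *u,
                                             const T *f, const int nf, const T h) {
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    if (!inbounds(j, i, nc, nc, 1, 1, 1, 1)) return;

    const T hi2 = 1.0 / (h * h);
    const T c0 = 0.25, c1 = 0.5;
    // Full-weighting restriction of the nine fine residuals around (2j, 2i)
    rc[j + nc * i] =
        c0 * c0 * residual_at(u, f, nf, hi2, 2 * j - 1, 2 * i - 1) +
        c0 * c1 * residual_at(u, f, nf, hi2, 2 * j,     2 * i - 1) +
        c0 * c0 * residual_at(u, f, nf, hi2, 2 * j + 1, 2 * i - 1) +
        c1 * c0 * residual_at(u, f, nf, hi2, 2 * j - 1, 2 * i) +
        c1 * c1 * residual_at(u, f, nf, hi2, 2 * j,     2 * i) +
        c1 * c0 * residual_at(u, f, nf, hi2, 2 * j + 1, 2 * i) +
        c0 * c0 * residual_at(u, f, nf, hi2, 2 * j - 1, 2 * i + 1) +
        c0 * c1 * residual_at(u, f, nf, hi2, 2 * j,     2 * i + 1) +
        c0 * c0 * residual_at(u, f, nf, hi2, 2 * j + 1, 2 * i + 1);
}
```

Each fine residual is recomputed up to four times, but that trades cheap FLOPs for the DRAM
round-trip of the full residual grid, which is the right direction for a bandwidth-bound code.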
--------------------------------------------------------------------------------
/src/assertions.hpp:
--------------------------------------------------------------------------------
#pragma once
#include <stdio.h>
#include <stdlib.h>

#ifndef HALT_ON_ERROR
#define HALT_ON_ERROR 0
#endif

#ifndef FTOL
#define FTOL 1e-6
#endif

#ifndef GTOL
#define GTOL 1e-12
#endif

int num_pass = 0;
int num_fail = 0;
int num_tests = 0;

#define equals(x, y) equals_impl((x), (y), #x, #y, __FILE__, __LINE__, __func__)
#define approx(x, y) approx_impl((x), (y), #x, #y, __FILE__, __LINE__, __func__)

template <typename T>
void equals_impl(T a, T b, const char *astr, const char *bstr, const char *file,
                 const int line, const char *func) {
    if (a != b) {
        printf("%s:%d %s() %s == %s : %d == %d \n", file, line, func, astr, bstr, a, b);
        num_fail++;
        if (HALT_ON_ERROR) exit(-1);
    } else {
        num_pass++;
    }
}

template <typename T>
void approx_impl(T a, T b, const char *astr, const char *bstr, const char *file,
                 const int line, const char *func) {
    int err = 0;

    // Single precision (parentheses matter here: without them the sizeof
    // guard only applies to the first half of the disjunction)
    if (sizeof(T) == 4 && ((a - b > FTOL) || (b - a > FTOL))) {
        printf("%s:%d %s() %s == %s : %f == %f \n", file, line, func, astr, bstr, a, b);
        err = 1;
    }

    // Double precision
    if (sizeof(T) == 8 && ((a - b > GTOL) || (b - a > GTOL))) {
        printf("%s:%d %s() %s == %s : %g == %g \n", file, line, func, astr, bstr, a, b);
        err = 1;
    }

    if (HALT_ON_ERROR && err) exit(-1);
    if (err) num_fail++;
    else num_pass++;
}

int test_report(void) {
    printf("Number of tests passed: %d failed: %d \n", num_pass, num_fail);
    int num_fail_out = num_fail;
    num_pass = num_fail = num_tests = 0;
    return num_fail_out;
}
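For reference, a test built on these macros follows this pattern (hypothetical snippet;
compare `test/test_grid.cu`):

```CUDA
#include "assertions.hpp"

int my_test(void) {
    equals(2 + 2, 4);                      // exact comparison, counts pass/fail
    approx(1.0 / 3.0, 0.333333333333333);  // passes: difference is below GTOL
    return test_report();                  // prints totals, resets counters,
                                           // and returns the number of failures
}
```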
--------------------------------------------------------------------------------
/src/definitions.cuh:
--------------------------------------------------------------------------------
#pragma once
#include <stdio.h>
#include <stdlib.h>

#define CUDACHECK(call) cudacheck_impl(call, __FILE__, __LINE__, __func__)

void cudacheck_impl(cudaError_t err, const char *file, const int line,
                    const char *func) {
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA error in %s:%i %s(): %s.\n", file, line,
                func, cudaGetErrorString(err));
        fflush(stderr);
        exit(EXIT_FAILURE);
    }
}
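Typical usage, following the pattern used throughout the rest of the code (a fragment with
hypothetical names `some_kernel`, `num_bytes`, `blocks`, `threads`): wrap every runtime API
call, and check kernel launches through `cudaGetLastError()`, since a launch itself returns
no status.

```CUDA
double *d_u = nullptr;
CUDACHECK(cudaMalloc((void **)&d_u, num_bytes));
some_kernel<<<blocks, threads>>>(d_u, n);  // launch errors surface on the
CUDACHECK(cudaGetLastError());             // next runtime query
CUDACHECK(cudaFree(d_u));
```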
--------------------------------------------------------------------------------
/src/grid.cuh:
--------------------------------------------------------------------------------
#pragma once
#include <assert.h>
#include "definitions.cuh"

enum norm_type {L1NORM, L2NORM};

__host__ __device__ __inline__ bool inbounds(const int ix, const int iy, const int nx, const int ny,
                                             const int ix0, const int iy0, const int ixn,
                                             const int iyn) {
    if (ix < ix0 || iy < iy0) return false;
    if (ix > nx - 1 - ixn || iy > ny - 1 - iyn) return false;
    return true;
}

template <typename T>
__global__ void cuda_grid_x_kernel(T *x, const int nx, const int ny, const T h) {
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;

    if (ix >= nx || iy >= ny) return;

    x[ix + nx * iy] = ix * h;
}

template <typename T>
void cuda_grid_x(T *x, const int nx, const int ny, const T h) {
    dim3 threads (32, 4, 1);
    dim3 blocks ( (nx - 1) / threads.x + 1, (ny - 1) / threads.y + 1, 1);
    cuda_grid_x_kernel<<<blocks, threads>>>(x, nx, ny, h);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_grid_y_kernel(T *x, const int nx, const int ny, const T h) {
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;

    if (ix >= nx || iy >= ny) return;

    x[ix + nx * iy] = iy * h;
}

template <typename T>
void cuda_grid_y(T *x, const int nx, const int ny, const T h) {
    dim3 threads (32, 4, 1);
    dim3 blocks ( (nx - 1) / threads.x + 1, (ny - 1) / threads.y + 1, 1);
    cuda_grid_y_kernel<<<blocks, threads>>>(x, nx, ny, h);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_grid_restrict_kernel(T *yc, const int nxc, const int nyc, const T *xf,
                                          const int nxf, const int nyf,
                                          const T a = 0.0, const T b = 1.0,
                                          const int ix0 = 1, const int iy0 = 1,
                                          const int ixn = 1, const int iyn = 1) {
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    const T c0 = 0.25;
    const T c1 = 0.5;
    if (!inbounds(j, i, nxc, nyc, ix0, iy0, ixn, iyn)) return;
    yc[j + nxc * i] = a * yc[j + nxc * i] +
                      b * (c0 * c0 * xf[2 * j - 1 + nxf * (2 * i - 1)] +
                           c0 * c1 * xf[2 * j + nxf * (2 * i - 1)] +
                           c0 * c0 * xf[2 * j + 1 + nxf * (2 * i - 1)] +
                           c1 * c0 * xf[2 * j - 1 + nxf * (2 * i + 0)] +
                           c1 * c1 * xf[2 * j + nxf * (2 * i + 0)] +
                           c1 * c0 * xf[2 * j + 1 + nxf * (2 * i + 0)] +
                           c0 * c0 * xf[2 * j - 1 + nxf * (2 * i + 1)] +
                           c0 * c1 * xf[2 * j + nxf * (2 * i + 1)] +
                           c0 * c0 * xf[2 * j + 1 + nxf * (2 * i + 1)]);
}

template <typename T>
void cuda_grid_restrict(T *yc, const int nxc, const int nyc, const T *xf, const int nxf,
                        const int nyf, const T a = 0.0, const T b = 1.0,
                        const int i0 = 1, const int j0 = 1, const int in = 1, const int jn = 1) {
    assert(nxf == 2 * (nxc - 1) + 1);
    dim3 threads (32, 4, 1);
    dim3 blocks ( (nxc - 1) / threads.x + 1, (nyc - 1) / threads.y + 1, 1);
    cuda_grid_restrict_kernel<<<blocks, threads>>>(yc, nxc, nyc, xf, nxf, nyf, a, b, i0, j0, in, jn);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_grid_prolongate_kernel(T *yf, const int nxf, const int nyf,
                                            const T *xc, const int nxc, const int nyc,
                                            const T a = 0.0, const T b = 1.0,
                                            const int bx = 0, const int by = 0,
                                            const int ex = 0, const int ey = 0) {
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    if (!inbounds(j, i, nxc, nyc, bx, by, ex, ey)) return;

    yf[2 * j + nxf * 2 * i] =
        a * yf[2 * j + nxf * 2 * i] + b * xc[j + nxc * i];
    if (j < nxc - 1)
        yf[2 * j + 1 + nxf * 2 * i] =
            a * yf[2 * j + 1 + nxf * 2 * i] +
            0.5 * b * (xc[j + nxc * i] + xc[j + 1 + nxc * i]);
    if (i < nyc - 1)
        yf[2 * j + nxf * (2 * i + 1)] =
            a * yf[2 * j + nxf * (2 * i + 1)] +
            0.5 * b * (xc[j + nxc * i] + xc[j + nxc * (i + 1)]);
    if (i < nyc - 1 && j < nxc - 1)
        yf[2 * j + 1 + nxf * (2 * i + 1)] =
            a * yf[2 * j + 1 + nxf * (2 * i + 1)] +
            0.25 * b * (xc[j + nxc * i] +
                        xc[j + nxc * (i + 1)] +
                        xc[j + 1 + nxc * i] +
                        xc[j + 1 + nxc * (i + 1)]);
}

template <typename T>
void cuda_grid_prolongate(T *yf, const int nxf, const int nyf, const T *xc,
                          const int nxc, const int nyc, const T a = 0.0,
                          const T b = 1.0, const int bx = 0, const int by = 0,
                          const int ex = 0, const int ey = 0) {
    assert(nxf == 2 * (nxc - 1) + 1);
    dim3 threads (32, 4, 1);
    dim3 blocks ( (nxc - 1) / threads.x + 1, (nyc - 1) / threads.y + 1, 1);
    cuda_grid_prolongate_kernel<<<blocks, threads>>>(yf, nxf, nyf, xc, nxc, nyc, a, b, bx, by, ex, ey);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_grid_subtract_kernel(T *z, const T *x, const T *y, const int nx, const int ny,
                                          const int ix0 = 0, const int iy0 = 0,
                                          const int ixn = 0, const int iyn = 0) {
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    if (!inbounds(j, i, nx, ny, ix0, iy0, ixn, iyn)) return;
    z[j + nx * i] = x[j + nx * i] - y[j + nx * i];
}

template <typename T>
void cuda_grid_subtract(T *z, const T *x, const T *y, const int nx,
                        const int ny, const int i0 = 0, const int j0 = 0,
                        const int in = 0, const int jn = 0) {
    dim3 threads (32, 4, 1);
    dim3 blocks ( (nx - 1) / threads.x + 1, (ny - 1) / threads.y + 1, 1);
    cuda_grid_subtract_kernel<<<blocks, threads>>>(z, x, y, nx, ny, i0, j0, in, jn);
    CUDACHECK(cudaGetLastError());
}

template <norm_type norm, typename T>
__global__ void norm_kernel(T *temp, const T *u, const int N) {

    const int warpSize = 32;
    int numThreads = blockDim.x;
    int numValuesPerBlockPerThread = (N - 1) / gridDim.x / numThreads + 1;
    int numValuesPerBlock = numValuesPerBlockPerThread * numThreads;
    int idx = threadIdx.x + blockIdx.x * numValuesPerBlock;
    int warpID = threadIdx.x / warpSize;
    int lane = threadIdx.x % warpSize;
    int numWarps = numThreads / warpSize;

    double partialSum = 0.0;
    __shared__ T sPartialSum[1024];
    int end = idx + numValuesPerBlock;
    int iEnd = end > N ? N : end;

    for (int i = idx; i < iEnd; i += numThreads) {
        switch (norm) {
        case L1NORM:
            partialSum += fabs(u[i]);
            break;
        case L2NORM:
            partialSum += u[i] * u[i];
            break;
        }
    }

    T val = partialSum;
    for (int i = 16; i > 0; i /= 2)
        val += __shfl_down_sync(0xffffffff, val, i);

    if (lane == 0) sPartialSum[warpID] = val;

    __syncthreads();

    if (lane == 0 && warpID == 0) {
        double blockSum = 0.0;
        for (int i = 0; i < numWarps; ++i) {
            blockSum += sPartialSum[i];
        }

        atomicAdd(temp, blockSum);
    }
}

template <norm_type norm, typename T>
T norm_H(T *tmp, const T *u, const int n, const int device = 0) {

    T out[1] = {0.0};

    cudaMemset(tmp, 0, sizeof(T));

    int numSM = 0;
    const int threads = 128;
    cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device);
    norm_kernel<norm><<<numSM, threads>>>(tmp, u, n);
    CUDACHECK(cudaGetLastError());
    cudaMemcpy(out, tmp, sizeof(T), cudaMemcpyDeviceToHost);
    CUDACHECK(cudaGetLastError());

    return out[0];
}

template <typename T, norm_type norm>
class CUDANorm {
private:
    T *sum;
public:
    CUDANorm() {
        CUDACHECK(cudaMalloc((void**)&sum, sizeof(T)));
    }

    //TODO: Add bounds options to norm
    T operator()(const T *u, const int nx, const int ny,
                 const T hx, const T hy, const int device = 0) {
        CUDACHECK(cudaMemset(sum, 0, sizeof(T)));
        return norm_H<norm>(sum, u, nx * ny) * hx * hy;
    }

    ~CUDANorm() {
        CUDACHECK(cudaFree(sum));
    }
};
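A minimal host-side sketch of how these helpers compose (hypothetical function; compare
`cuda_test_gridnorm` in `test/test_grid.cu`). On the unit square the h-scaled L1 norm of the
x-coordinate field approximates the integral of x, i.e. about 0.5:

```CUDA
#include "definitions.cuh"
#include "grid.cuh"

double x_field_l1norm(const int n) {
    const double h = 1.0 / (n - 1);
    double *d_x;
    CUDACHECK(cudaMalloc((void **)&d_x, n * n * sizeof(double)));
    cuda_grid_x(d_x, n, n, h);        // fill with x-coordinates
    CUDANorm<double, L1NORM> l1norm;  // allocates its device accumulator once
    double nrm = l1norm(d_x, n, n, h, h);
    CUDACHECK(cudaFree(d_x));
    return nrm;                       // roughly 0.5 for large n
}
```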
--------------------------------------------------------------------------------
/src/grid.hpp:
--------------------------------------------------------------------------------
#pragma once
#include <assert.h>
#include <math.h>
#include <stdio.h>

template <typename T>
void grid_x(T *x, const int nx, const int ny, const T h) {
    for (int i = 0; i < ny; ++i) {
        for (int j = 0; j < nx; ++j) {
            x[j + nx * i] = j * h;
        }
    }
}

template <typename T>
void grid_y(T *y, const int nx, const int ny, const T h) {
    for (int i = 0; i < ny; ++i) {
        for (int j = 0; j < nx; ++j) {
            y[j + nx * i] = i * h;
        }
    }
}

template <typename T>
void grid_restrict(T *yc, const int nxc, const int nyc, const T *xf,
                   const int nxf, const int nyf, const T a = 0.0,
                   const T b = 1.0) {
    assert(nxf == 2 * (nxc - 1) + 1);
    const T c0 = 0.25;
    const T c1 = 0.5;
    for (int i = 1; i < nyc - 1; ++i) {
        for (int j = 1; j < nxc - 1; ++j) {
            yc[j + nxc * i] =
                a * yc[j + nxc * i] + b *
                (
                    c0 * c0 * xf[2 * j - 1 + nxf * (2 * i - 1)] +
                    c0 * c1 * xf[2 * j + nxf * (2 * i - 1)] +
                    c0 * c0 * xf[2 * j + 1 + nxf * (2 * i - 1)] +
                    c1 * c0 * xf[2 * j - 1 + nxf * (2 * i + 0)] +
                    c1 * c1 * xf[2 * j + nxf * (2 * i + 0)] +
                    c1 * c0 * xf[2 * j + 1 + nxf * (2 * i + 0)] +
                    c0 * c0 * xf[2 * j - 1 + nxf * (2 * i + 1)] +
                    c0 * c1 * xf[2 * j + nxf * (2 * i + 1)] +
                    c0 * c0 * xf[2 * j + 1 + nxf * (2 * i + 1)]
                );
        }
    }
}

template <typename T>
void grid_prolongate(T *yf, const int nxf, const int nyf, const T *xc,
                     const int nxc, const int nyc, const T a = 0.0,
                     const T b = 1.0) {
    assert(nxf == 2 * (nxc - 1) + 1);

    for (int i = 0; i < nyc; ++i) {
        for (int j = 0; j < nxc; ++j) {
            yf[2 * j + nxf * 2 * i] =
                a * yf[2 * j + nxf * 2 * i] + b * xc[j + nxc * i];
            if (j < nxc - 1)
                yf[2 * j + 1 + nxf * 2 * i] =
                    a * yf[2 * j + 1 + nxf * 2 * i] +
                    0.5 * b *
                    (xc[j + nxc * i] + xc[j + 1 + nxc * i]);
            if (i < nyc - 1)
                yf[2 * j + nxf * (2 * i + 1)] =
                    a * yf[2 * j + nxf * (2 * i + 1)] +
                    0.5 * b *
                    (xc[j + nxc * i] +
                     xc[j + nxc * (i + 1)]);
            if (i < nyc - 1 && j < nxc - 1)
                yf[2 * j + 1 + nxf * (2 * i + 1)] =
                    a * yf[2 * j + 1 + nxf * (2 * i + 1)] +
                    0.25 * b *
                    (xc[j + nxc * i] +
                     xc[j + nxc * (i + 1)] +
                     xc[j + 1 + nxc * i] +
                     xc[j + 1 + nxc * (i + 1)]);
        }
    }
}

template <typename T>
void grid_subtract(T *z, const T *x, const T *y, const int nx, const int ny) {
    for (int i = 0; i < nx * ny; ++i)
        z[i] = x[i] - y[i];
}

template <typename T>
double grid_l1norm(const T *x, const int nx, const int ny, const T hx,
                   const T hy, const int bx = 0, const int by = 0,
                   const int ex = 0, const int ey = 0) {
    double out = 0.0;
    for (int i = by; i < ny - ey; ++i)
        for (int j = bx; j < nx - ex; ++j)
            out += fabs(x[j + nx * i]) * hx * hy;
    return out;
}

template <typename T>
double grid_l2norm(const T *x, const int nx, const int ny, const T hx,
                   const T hy, const int bx = 0, const int by = 0,
                   const int ex = 0, const int ey = 0) {
    double out = 0.0;
    for (int i = by; i < ny - ey; ++i)
        for (int j = bx; j < nx - ex; ++j)
            out += x[j + nx * i] * x[j + nx * i];
    return out * hx * hy;
}

template <typename T>
void grid_print(const T *x, const int nx, const int ny) {
    for (int i = 0; i < ny; ++i) {
        for (int j = 0; j < nx; ++j)
            printf("%-5.3g ", x[j + i * nx]);
        printf("\n");
    }
}
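The restriction weights above form the standard 2D full-weighting stencil (1/16, 1/8, 1/16;
1/8, 1/4, 1/8; 1/16, 1/8, 1/16), which sums to one, so constant grid functions pass through
unchanged. A quick standalone check of that property (hypothetical snippet):

```CUDA
#include <stdio.h>
#include "grid.hpp"

int main() {
    double xf[9 * 9], yc[5 * 5] = {0.0};
    for (int i = 0; i < 9 * 9; ++i) xf[i] = 3.0;  // constant fine grid
    grid_restrict(yc, 5, 5, xf, 9, 9);            // nxf == 2 * (nxc - 1) + 1
    printf("%f\n", yc[2 + 5 * 2]);                // interior coarse point: 3.000000
    return 0;
}
```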
--------------------------------------------------------------------------------
/src/poisson.cuh:
--------------------------------------------------------------------------------
#pragma once

#include "grid.cuh"
#include "poisson.hpp"

template <typename T>
__global__ void cuda_gauss_seidel_red_black_kernel(
    T *u, const T *f, const int n, const T h, const int color, const int ix0 = 1,
    const int iy0 = 1, const int ixn = 1, const int iyn = 1) {
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (!inbounds(j, i, n, n, ix0, iy0, ixn, iyn)) return;

    if ((i + j) % 2 != color) return;

    u[j + i * n] = -0.25 * (h * h * f[j + i * n] - u[j + 1 + i * n] -
                            u[j - 1 + i * n] - u[j + (i + 1) * n] -
                            u[j + (i - 1) * n]);
}

template <typename T>
void cuda_gauss_seidel_red_black(T *u, const T *f, const int n, const T h) {

    dim3 threads (32, 4, 1);
    dim3 blocks ( (n - 1) / threads.x + 1, (n - 1) / threads.y + 1, 1);
    //TODO: Rewrite kernels so that we only access memory once
    cuda_gauss_seidel_red_black_kernel<<<blocks, threads>>>(u, f, n, h, 0);
    cuda_gauss_seidel_red_black_kernel<<<blocks, threads>>>(u, f, n, h, 1);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_poisson_residual_kernel(T *r, const T *u, const T *f,
                                             const int n, const T h,
                                             const int ix0 = 1, const int iy0 = 1,
                                             const int ixn = 1, const int iyn = 1) {
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (!inbounds(j, i, n, n, ix0, iy0, ixn, iyn)) return;

    T hi2 = 1.0 / (h * h);
    r[j + i * n] =
        f[j + i * n] -
        (u[j + 1 + i * n] + u[j - 1 + i * n] + -4.0 * u[j + i * n] +
         u[j + (i + 1) * n] + u[j + (i - 1) * n]) *
        hi2;
}

template <typename T>
__global__ void cuda_exact_solution_kernel(T *u, const int n, const T h, const T modes = 1.0,
                                           const int bx = 0, const int by = 0,
                                           const int ex = 0, const int ey = 0) {
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    if (!inbounds(j, i, n, n, bx, by, ex, ey)) return;

    T s = 2.0 * M_PI * modes / (h * (n - 1));
    u[j + n * i] = sin(s * h * j) * sin(s * h * i);
}

template <typename T>
void cuda_exact_solution(T *u, const int n, const T h, const T modes = 1.0) {
    dim3 threads (32, 4, 1);
    dim3 blocks ( (n - 1) / threads.x + 1, (n - 1) / threads.y + 1, 1);
    cuda_exact_solution_kernel<<<blocks, threads>>>(u, n, h, modes);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
void cuda_poisson_residual(T *r, const T *u, const T *f, const int n, const T h) {
    dim3 threads (32, 4, 1);
    dim3 blocks ( (n - 1) / threads.x + 1, (n - 1) / threads.y + 1, 1);
    cuda_poisson_residual_kernel<<<blocks, threads>>>(r, u, f, n, h);
    CUDACHECK(cudaGetLastError());
}

template <typename T>
__global__ void cuda_base_case_kernel(T *u, const T *f, const T h) {
    u[1 + 3 * 1] = -0.5 * f[1 + 3 * 1] * h * h;
}

template <typename T>
void cuda_base_case(T *u, const T *f, const T h) {
    dim3 threads (1, 1, 1);
    dim3 blocks (1, 1, 1);
    cuda_base_case_kernel<<<blocks, threads>>>(u, f, h);
    CUDACHECK(cudaGetLastError());
}

template <typename S, typename T>
void cuda_multigrid_v_cycle(const int l, S& smoother, T *u, T *f, T *r, T *v, T *w, const T h) {

    if (l == 1) {
        cuda_base_case(u, f, h);
        return;
    }

    int nu = (1 << l) + 1;
    int nv = (1 << (l - 1)) + 1;

    T *el = &v[nv * nv];
    T *rl = &w[nv * nv];

    smoother(u, f, nu, h);

    cuda_poisson_residual(r, u, f, nu, h);

    cuda_grid_restrict(rl, nv, nv, r, nu, nu, 0.0, 1.0);

    cuda_multigrid_v_cycle(l - 1, smoother, el, rl, r, v, w, 2 * h);

    cuda_grid_prolongate(u, nu, nu, el, nv, nv, 1.0, 1.0);

    smoother(u, f, nu, h);
}

template <typename T, typename P, typename F>
class CUDAMultigrid {
private:
    T *v = 0, *w = 0, *r = 0;
    int l;
    size_t num_bytes = 0;
    F smoother;
public:

    CUDAMultigrid() { }
    CUDAMultigrid(P& p) : l(p.l) {
        num_bytes = multigrid_size(l) * sizeof(T);
        CUDACHECK(cudaMalloc((void**)&v, num_bytes));
        CUDACHECK(cudaMalloc((void**)&w, num_bytes));
        int n = (1 << p.l) + 1;
        CUDACHECK(cudaMalloc((void**)&r, sizeof(T) * n * n));
    }

    void operator()(P& p) {
        CUDACHECK(cudaMemset(v, 0, num_bytes));
        CUDACHECK(cudaMemset(w, 0, num_bytes));
        cuda_multigrid_v_cycle(l, smoother, p.u, p.f, r, v, w, p.h);
    }

    ~CUDAMultigrid(void) {
        if (v != nullptr) CUDACHECK(cudaFree(v));
        if (w != nullptr) CUDACHECK(cudaFree(w));
        if (r != nullptr) CUDACHECK(cudaFree(r));
    }

    const char *name() {
        static char name[2048];
        sprintf(name, "CUDA Multi-Grid<%s>", smoother.name());
        return name;
    }
};

class CUDAGaussSeidelRedBlack {
public:
    CUDAGaussSeidelRedBlack() { }
    template <typename P>
    CUDAGaussSeidelRedBlack(P& p) { }

    template <typename T>
    void operator()(T *u, const T *f, const int n, const T h) {
        cuda_gauss_seidel_red_black(u, f, n, h);
    }

    template <typename P>
    void operator()(P& p) {
        cuda_gauss_seidel_red_black(p.u, p.f, p.n, p.h);
    }
    const char *name() {
        return "CUDA Gauss-Seidel (red-black)";
    }
};

template <typename T>
class CUDAPoisson {

public:
    int n;
    int l;
    T h;
    T modes;
    T *u, *f, *r;
    size_t num_bytes;
    CUDANorm<T, L1NORM> normfcn;

    CUDAPoisson(int l, T h, T modes) : l(l), h(h), modes(modes) {
        n = (1 << l) + 1;
        num_bytes = sizeof(T) * n * n;
        CUDACHECK(cudaMalloc((void**)&u, num_bytes));
        CUDACHECK(cudaMalloc((void**)&f, num_bytes));
        CUDACHECK(cudaMalloc((void**)&r, num_bytes));
        CUDACHECK(cudaMemset(u, 0, num_bytes));
        CUDACHECK(cudaMemset(f, 0, num_bytes));
        CUDACHECK(cudaMemset(r, 0, num_bytes));

        T *hf = (T*)malloc(num_bytes);
        forcing_function(hf, n, h, modes);
        CUDACHECK(cudaMemcpy(f, hf, num_bytes, cudaMemcpyHostToDevice));

        free(hf);
    }

    void residual(void) {
        cuda_poisson_residual(r, u, f, n, h);
    }

    T error(void) {
        T *v;
        CUDACHECK(cudaMalloc((void**)&v, num_bytes));
        CUDACHECK(cudaMemset(v, 0, num_bytes));
        cuda_exact_solution(v, n, h, modes);
        cuda_grid_subtract(r, u, v, n, n);
        T err = normfcn(r, n, n, h, h);
        CUDACHECK(cudaFree(v));
        return err;
    }

    T norm(void) {
        T r_norm = normfcn(r, n, n, h, h);
        return r_norm;
    }

    ~CUDAPoisson() {
        CUDACHECK(cudaFree(u));
        CUDACHECK(cudaFree(f));
        CUDACHECK(cudaFree(r));
    }
};
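The `(i + j) % 2` mask in `cuda_gauss_seidel_red_black_kernel` means half of the launched
threads exit immediately. A lighter-weight variant (a hypothetical, untested sketch, not used
by this code) enumerates only the points of the active color and reconstructs the column index,
so every launched thread does work, at the cost of strided global accesses:

```CUDA
template <typename T>
__global__ void cuda_gauss_seidel_one_color_kernel(T *u, const T *f,
                                                   const int n, const T h,
                                                   const int color) {
    int i = 1 + threadIdx.y + blockDim.y * blockIdx.y;
    int k = threadIdx.x + blockDim.x * blockIdx.x;  // k-th point of this color in row i
    int j = 1 + ((i + 1 + color) % 2) + 2 * k;      // first matching column, then every other
    if (i > n - 2 || j > n - 2) return;

    u[j + i * n] = -0.25 * (h * h * f[j + i * n] - u[j + 1 + i * n] -
                            u[j - 1 + i * n] - u[j + (i + 1) * n] -
                            u[j + (i - 1) * n]);
}

// Launch with roughly half the x-threads of the masked version, e.g.
// dim3 blocks(((n - 1) / 2 - 1) / threads.x + 1, (n - 1) / threads.y + 1, 1);
```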
--------------------------------------------------------------------------------
/src/poisson.hpp:
--------------------------------------------------------------------------------
#pragma once
#include <math.h>
#include <string.h>
#include "grid.hpp"
// Solves Poisson's equation: Lu = f, Lu = u_xx + u_yy

template <typename T>
void gauss_seidel(T *u, const T *f, const int n, const T h) {

    for (int i = 1; i < n - 1; ++i) {
        for (int j = 1; j < n - 1; ++j) {
            u[j + i * n] =
                - 0.25 * (
                    h * h * f[j + i * n]
                    -
                    u[j + 1 + i * n] - u[j - 1 + i * n]
                    -
                    u[j + (i + 1) * n] - u[j + (i - 1) * n]);
        }
    }
}

template <typename T>
void gauss_seidel_red_black(T *u, const T *f, const int n, const T h) {

    for (int i = 1; i < n - 1; ++i) {
        for (int j = 1; j < n - 1; ++j) {
            if ( (i + j) % 2 == 0) {
                u[j + i * n] =
                    - 0.25 * (
                        h * h * f[j + i * n]
                        -
                        u[j + 1 + i * n] - u[j - 1 + i * n]
                        -
                        u[j + (i + 1) * n] - u[j + (i - 1) * n]);
            }
        }
    }

    for (int i = 1; i < n - 1; ++i) {
        for (int j = 1; j < n - 1; ++j) {
            if ( (i + j) % 2 == 1) {
                u[j + i * n] =
                    - 0.25 * (
                        h * h * f[j + i * n]
                        -
                        u[j + 1 + i * n] - u[j - 1 + i * n]
                        -
                        u[j + (i + 1) * n] - u[j + (i - 1) * n]);
            }
        }
    }
}

template <typename T>
void poisson_residual(T *r, const T *u, const T *f, const int n, const T h) {

    T hi2 = 1.0 / (h * h);
    for (int i = 1; i < n - 1; ++i) {
        for (int j = 1; j < n - 1; ++j) {
            r[j + i * n] =
                f[j + i * n] - (
                    u[j + 1 + i * n] + u[j - 1 + i * n] +
                    - 4.0 * u[j + i * n] + u[j + (i + 1) * n] +
                    u[j + (i - 1) * n]) * hi2;
        }
    }
}

template <typename T>
void forcing_function(T *f, const int n, const T h, const T modes = 1.0) {

    T s = 2.0 * M_PI * modes / (h * (n - 1));
    memset(f, 0, n * n * sizeof(T));
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            f[j + n * i] = -2 * s * s * sin(s * h * i) * sin(s * h * j);
        }
    }
}

template <typename T>
void exact_solution(T *u, const int n, const T h, const T modes = 1.0) {

    T s = 2.0 * M_PI * modes / (h * (n - 1));
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            u[j + n * i] = sin(s * h * j) * sin(s * h * i);
        }
    }
}

template <typename T>
__inline__ void base_case(T *u, const T *f, const T h) {
    u[1 + 3 * 1] = -0.5 * f[1 + 3 * 1] * h * h;
}

template <typename S, typename T>
void multigrid_v_cycle(const int l, S& smoother, T *u, T *f, T *r, T *v, T *w, const T h) {

    if (l == 1) {
        base_case(u, f, h);
        return;
    }

    int nu = (1 << l) + 1;
    int nv = (1 << (l - 1)) + 1;
    // Get e^(l-1) and residual r^(l-1)
    T *el = &v[nv * nv];
    T *rl = &w[nv * nv];

    smoother(u, f, nu, h);

    // r^l := f - Lu^l
    poisson_residual(r, u, f, nu, h);

    // r^(l-1) := R * r
    grid_restrict(rl, nv, nv, r, nu, nu, 0.0, 1.0);

    // Solve: A^(l-1) e^(l-1) = r^(l-1)
    multigrid_v_cycle(l - 1, smoother, el, rl, r, v, w, 2 * h);

    // Prolongate and add correction u^l := u^l + Pe^(l-1)
    grid_prolongate(u, nu, nu, el, nv, nv, 1.0, 1.0);

    smoother(u, f, nu, h);
}

// Size of all of the combined grids
size_t multigrid_size(const int l) {
    size_t size = 0;
    for (int i = 0; i <= l; ++i) {
        int n = (1 << i) + 1;
        size += n * n;
    }
    return size;
}

template <typename T, typename P, typename F>
class Multigrid {
private:
    // v and w are scratch buffers holding one grid per level, v[0] .. v[l];
    // their combined size is multigrid_size(l) entries.
    // v is used for the initial guess and w is used for the restricted residual
    T *v = 0, *w = 0, *r = 0;
    int l;
    size_t num_bytes = 0;
    F smoother;
public:

    Multigrid() { }
    Multigrid(P& p) : l(p.l) {
        num_bytes = multigrid_size(l) * sizeof(T);
        v = (T*)malloc(num_bytes);
        w = (T*)malloc(num_bytes);
        int n = (1 << p.l) + 1;
        r = (T*)malloc(sizeof(T) * n * n);
    }

    void operator()(P& p) {
        memset(v, 0, num_bytes);
        memset(w, 0, num_bytes);
        multigrid_v_cycle(l, smoother, p.u, p.f, r, v, w, p.h);
    }

    ~Multigrid(void) {
        if (v != nullptr) free(v);
        if (w != nullptr) free(w);
        if (r != nullptr) free(r);
    }

    const char *name() {
        static char name[2048];
        sprintf(name, "Multi-Grid<%s>", smoother.name());
        return name;
    }
};

class GaussSeidel {
public:
    GaussSeidel() { }
    template <typename P>
    GaussSeidel(P& p) { }
    template <typename P>
    void operator()(P& p) {
        gauss_seidel(p.u, p.f, p.n, p.h);
    }

    template <typename T>
    void operator()(T *u, T *f, const int n, const T h) {
        gauss_seidel(u, f, n, h);
    }

    const char *name() {
        return "Gauss-Seidel";
    }
};

class GaussSeidelRedBlack {
public:
    GaussSeidelRedBlack() { }
    template <typename P>
    GaussSeidelRedBlack(P& p) { }
    template <typename T>
    void operator()(T *u, const T *f, const int n, const T h) {
        gauss_seidel_red_black(u, f, n, h);
    }

    template <typename P>
    void operator()(P& p) {
        gauss_seidel_red_black(p.u, p.f, p.n, p.h);
    }
    const char *name() {
        return "Gauss-Seidel (red-black)";
    }
};

template <typename T>
class Poisson {
public:
    int n;
    int l;
    T h;
    T modes;
    T *u, *f, *r;
    size_t num_bytes;

    Poisson(int l, T h, T modes) : l(l), h(h), modes(modes) {
        n = (1 << l) + 1;
        num_bytes = sizeof(T) * n * n;
        u = (T*)malloc(num_bytes);
        f = (T*)malloc(num_bytes);
        r = (T*)malloc(num_bytes);
        memset(u, 0, num_bytes);
        memset(f, 0, num_bytes);
        memset(r, 0, num_bytes);
        forcing_function(f, n, h, modes);
    }

    T error() {
        T *v = (T*)malloc(num_bytes);
        memset(v, 0, num_bytes);
        exact_solution(v, n, h, modes);
        grid_subtract(r, u, v, n, n);
        T err = grid_l1norm(r, n, n, h, h);
        free(v);
        return err;
    }

    void residual(void) {
        poisson_residual(r, u, f, n, h);
    }

    T norm(void) {
        return grid_l1norm(r, n, n, h, h);
    }

    ~Poisson() {
        free(u);
        free(f);
        free(r);
    }
};
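To make the scratch sizing concrete, level i holds a (2^i + 1) x (2^i + 1) grid, and the v/w
buffers are sized as the sum over all levels. A small standalone check of `multigrid_size`
(hypothetical snippet):

```CUDA
#include <stdio.h>

int main() {
    const int l = 3;
    size_t total = 0;
    for (int i = 0; i <= l; ++i) {
        int n = (1 << i) + 1;
        printf("level %d: %2d x %-2d = %d entries\n", i, n, n, n * n);
        total += (size_t)n * n;
    }
    printf("multigrid_size(%d) = %zu\n", l, total);  // 4 + 9 + 25 + 81 = 119
    return 0;
}
```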
--------------------------------------------------------------------------------
/src/solver.hpp:
--------------------------------------------------------------------------------
#pragma once
#include <stdio.h>

class SolverOptions {
public:
    int verbose = 0;
    int max_iterations = 1000000;
    double eps = 1e-12;
    int info = 1;
    int mms = 0;
};

class SolverOutput {
public:
    double residual = 0.0;
    int iterations = 0;
    double error = 0.0;
};

template <typename T, typename F, typename P>
SolverOutput solve(F& solver, P& problem, SolverOptions opts) {

    if (opts.verbose) {
        printf("Solver: %s \n", solver.name());
        printf("Iteration \t Residual\n");
    }

    T res = 0.0;
    int iter = 0;
    do {
        solver(problem);
        problem.residual();
        res = problem.norm();
        iter++;
        if (iter % opts.info == 0 && opts.verbose)
            printf("%-7d \t %-7.7g \n", iter, res);

    } while (res > opts.eps && (iter < opts.max_iterations || opts.max_iterations < 0));

    SolverOutput out;
    out.iterations = iter;
    out.residual = res;

    if (opts.mms) {
        out.error = problem.error();
    }

    return out;
}
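A typical host-side configuration, mirroring `test/test_poisson.cu` (a fragment that assumes
the headers above; the option values are illustrative):

```CUDA
SolverOptions opts;
opts.verbose = 1;            // print progress ...
opts.info = 10;              // ... every 10 iterations
opts.eps = 1e-8;             // stop once the residual norm drops below eps
opts.max_iterations = 10000; // or once this many iterations have run
opts.mms = 1;                // also compute the error against the exact solution

Poisson<double> problem(4, 1.0 / 16, 1.0);  // l, h, modes
Multigrid<double, Poisson<double>, GaussSeidelRedBlack> mg(problem);
SolverOutput out = solve<double>(mg, problem, opts);
```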
--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(test_grid test_grid.cu)
add_test(NAME test_grid COMMAND test_grid)

add_executable(test_poisson test_poisson.cu)
add_test(NAME test_poisson COMMAND test_poisson)
--------------------------------------------------------------------------------
/test/test_grid.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "assertions.hpp"
#include "definitions.cuh"
#include "grid.hpp"
#include "grid.cuh"


template <typename T>
int test_gridpoints(const int nx, const int ny, const T h) {

    printf("Testing grid points generation with nx = %d ny = %d and h = %f \n", nx, ny, h);
    size_t num_bytes = nx * ny * sizeof(T);
    T *x = (T*)malloc(num_bytes);

    grid_x(x, nx, ny, h);

    approx(x[0], 0.0);
    approx(x[1], h);
    approx(x[nx-1], h * (nx - 1));
    approx(x[(ny - 1) * nx - 1], h * (nx - 1));
    approx(x[(ny - 1) * nx], 0.0);


    grid_y(x, nx, ny, h);

    approx(x[0], 0.0);
    approx(x[1], 0.0);
    approx(x[nx-1], 0.0);

    approx(x[nx], h);
    approx(x[nx * (ny - 1)], (ny - 1) * h);

    free(x);

    return test_report();
}

template <typename T>
int test_gridnorm(const int nx, const int ny) {
    T hx = 1.0 / (nx - 1);
    T hy = 1.0 / (ny - 1);
    printf(
        "Testing L1 and L2 norm on grid with nx = %d ny = %d and hx = %f hy = %f "
        "\n",
        nx, ny, hx, hy);
    size_t num_bytes = nx * ny * sizeof(T);
    T *x = (T*)malloc(num_bytes);

    for (int i = 0; i < nx * ny; ++i)
        x[i] = 1.0;

    T norm = grid_l1norm(x, nx, ny, hx, hy, 0, 0, 0, 0);
    approx(norm, nx * ny * hx * hy);

    norm = grid_l1norm(x, nx, ny, hx, hy, 1, 0, 0, 0);
    approx(norm, (nx - 1) * ny * hx * hy);

    norm = grid_l1norm(x, nx, ny, hx, hy, 0, 1, 0, 0);
    approx(norm, nx * (ny - 1) * hx * hy);

    norm = grid_l1norm(x, nx, ny, hx, hy, 0, 0, 1, 1);
    approx(norm, (nx - 1) * (ny - 1) * hx * hy);

    norm = grid_l2norm(x, nx, ny, hx, hy, 0, 0, 0, 0);
    approx(norm, nx * ny * hx * hy);

    norm = grid_l2norm(x, nx, ny, hx, hy, 1, 0, 0, 0);
    approx(norm, (nx - 1) * ny * hx * hy);

    norm = grid_l2norm(x, nx, ny, hx, hy, 0, 1, 0, 0);
    approx(norm, nx * (ny - 1) * hx * hy);

    // Extract the second last point in the corner
    grid_x(x, nx, ny, hx);
    norm = grid_l2norm(x, nx, ny, hx, hy, nx - 2, ny - 2, 1, 1);
    approx(norm, (1 - hx) * (1 - hx) * hx * hy);

    free(x);

    return test_report();
}

template <typename T>
int cuda_test_gridnorm(const int nx, const int ny) {
    T hx = 1.0 / (nx - 1);
    T hy = 1.0 / (ny - 1);
    printf(
        "CUDA Testing L1 and L2 norm on grid with nx = %d ny = %d and hx = %f hy = %f "
        "\n",
        nx, ny, hx, hy);
    size_t num_bytes = nx * ny * sizeof(T);
    T *x = (T*)malloc(num_bytes);

    for (int i = 0; i < nx * ny; ++i)
        x[i] = 1.0;

    T *d_x;
    CUDACHECK(cudaMalloc((void**)&d_x, num_bytes));
    CUDACHECK(cudaMemcpy(d_x, x, num_bytes, cudaMemcpyHostToDevice));

    CUDANorm<T, L1NORM> l1norm;
    T norm = l1norm(d_x, nx, ny, hx, hy);
    approx(norm, nx * ny * hx * hy);

    CUDANorm<T, L2NORM> l2norm;
    norm = l2norm(d_x, nx, ny, hx, hy);
    approx(norm, nx * ny * hx * hy);

    CUDACHECK(cudaFree(d_x));
    free(x);

    return test_report();
}



template <typename T>
int cuda_test_gridpoints(const int nx, const int ny, const T h) {

    printf("CUDA Testing grid points generation with nx = %d ny = %d and h = %f \n", nx, ny, h);
    size_t num_bytes = nx * ny * sizeof(T);
    T *d_x, *x;
    CUDACHECK(cudaMalloc((void**)&d_x, num_bytes));
    x = (T*)malloc(num_bytes);

    cuda_grid_x(d_x, nx, ny, h);

    CUDACHECK(cudaMemcpy(x, d_x, num_bytes, cudaMemcpyDeviceToHost));

    approx(x[0], 0.0);
    approx(x[1], h);
    approx(x[nx-1], h * (nx - 1));
    approx(x[(ny - 1) * nx - 1], h * (nx - 1));
    approx(x[(ny - 1) * nx], 0.0);

    cuda_grid_y(d_x, nx, ny, h);

    CUDACHECK(cudaMemcpy(x, d_x, num_bytes, cudaMemcpyDeviceToHost));

    approx(x[0], 0.0);
    approx(x[1], 0.0);
    approx(x[nx-1], 0.0);

    approx(x[nx], h);
    approx(x[nx * (ny - 1)], (ny - 1) * h);

    free(x);
    CUDACHECK(cudaFree(d_x));

    return test_report();
}

template <typename T>
void restriction(const char *axis, const T *xc, T *yc, T *zc, const int nxc,
                 const int nyc, const T hc, const T *xf, const int nxf, const int nyf, const T hf) {
    grid_restrict(yc, nxc, nyc, xf, nxf, nyf);
    grid_subtract(zc, xc, yc, nxc, nyc);


    T l1_err = grid_l1norm(zc, nxc, nyc, hc, hc, 1, 1, 1, 1);
    T l2_err = grid_l2norm(zc, nxc, nyc, hc, hc, 1, 1, 1, 1);

    approx(l1_err, 0.0);
    approx(l2_err, 0.0);

    printf("Restriction in %s l1-error: %g, l2-error: %g \n", axis, l1_err, l2_err);
}


template <typename T>
void prolongation(const char *axis, const T *xf, T *yf, T *zf, const int nxf,
                  const int nyf, const T hf, const T *xc, const int nxc, const int nyc, const T hc) {
    grid_prolongate(yf, nxf, nyf, xc, nxc, nyc);
    grid_subtract(zf, xf, yf, nxf, nyf);

    T l1_err = grid_l1norm(zf, nxf, nyf, hf, hf);
    T l2_err = grid_l2norm(zf, nxf, nyf, hf, hf);

    approx(l1_err, 0.0);
    approx(l2_err, 0.0);

    printf("Prolongation in %s l1-error: %g, l2-error: %g \n", axis, l1_err, l2_err);
}

template <typename T>
void restriction_prolongation_info(const int nxf, const int nyf, const T hf, const int nxc,
                                   const int nyc, const T hc) {
    printf(
        "Testing grid restriction and prolongation with fine grid "
        "[%d %d], hf=%g, and coarse grid [%d %d], hc=%g. \n",
        nxf, nyf, hf, nxc, nyc, hc);
}

template <typename T>
int test_restriction_prolongation(const int nxc, const int nyc, const T hc) {

    T hf = 0.5 * hc;
    int nxf = 2 * (nxc - 1) + 1;
    int nyf = 2 * (nyc - 1) + 1;
    restriction_prolongation_info(nxf, nyf, hf, nxc, nyc, hc);

    size_t num_bytesf = sizeof(T) * nxf * nyf;
    size_t num_bytesc = sizeof(T) * nxc * nyc;

    T *xf = (T*)malloc(num_bytesf);
    T *yf = (T*)malloc(num_bytesf);
    T *zf = (T*)malloc(num_bytesf);
    T *xc = (T*)malloc(num_bytesc);
    T *yc = (T*)malloc(num_bytesc);
    T *zc = (T*)malloc(num_bytesc);

    // Test restriction and prolongation in the x-direction
    grid_x(xf, nxf, nyf, hf);
    grid_x(xc, nxc, nyc, hc);
    restriction("x", xc, yc, zc, nxc, nyc, hc, xf, nxf, nyf, hf);
    prolongation("x", xf, yf, zf, nxf, nyf, hf, xc, nxc, nyc, hc);

    // Test restriction and prolongation in the y-direction
    grid_y(xf, nxf, nyf, hf);
    grid_y(xc, nxc, nyc, hc);
    restriction("y", xc, yc, zc, nxc, nyc, hc, xf, nxf, nyf, hf);
    prolongation("y", xf, yf, zf, nxf, nyf, hf, xc, nxc, nyc, hc);

    free(xf);
    free(yf);
    free(zf);

    free(xc);
    free(yc);
    free(zc);

    return test_report();
}

template <typename T>
void cuda_restriction(const char *axis, const T *xc, T *yc, T *zc, const int nxc,
                      const int nyc, const T hc, const T *xf, const int nxf, const int nyf, const T hf) {

    size_t num_bytes = sizeof(T) * nxc * nyc;
    cuda_grid_restrict(yc, nxc, nyc, xf, nxf, nyf);
    cuda_grid_subtract(zc, xc, yc, nxc, nyc);

    T *hzc = (T*)malloc(num_bytes);
    CUDACHECK(cudaMemcpy(hzc, zc, num_bytes, cudaMemcpyDeviceToHost));

    //TODO: Compute norms on device once there's support for bounds control
    T l1_err = grid_l1norm(hzc, nxc, nyc, hc, hc, 1, 1, 1, 1);
    T l2_err = grid_l2norm(hzc, nxc, nyc, hc, hc, 1, 1, 1, 1);

    approx(l1_err, 0.0);
    approx(l2_err, 0.0);

    free(hzc);

    printf("CUDA Restriction in %s l1-error: %g, l2-error: %g \n", axis, l1_err, l2_err);
}

template <typename T>
void cuda_prolongation(const char *axis, const T *xf, T *yf, T *zf, const int nxf,
                       const int nyf, const T hf, const T *xc, const int nxc, const int nyc, const T hc) {

    size_t num_bytes = sizeof(T) * nxf * nyf;
    cuda_grid_prolongate(yf, nxf, nyf, xc, nxc, nyc);
    cuda_grid_subtract(zf, xf, yf, nxf, nyf);

    T *hzf = (T*)malloc(num_bytes);
    CUDACHECK(cudaMemcpy(hzf, zf, num_bytes, cudaMemcpyDeviceToHost));

    //TODO: Compute norms on device once there's support for bounds control
    T l1_err = grid_l1norm(hzf, nxf, nyf, hf, hf, 1, 1, 1, 1);
    T l2_err = grid_l2norm(hzf, nxf, nyf, hf, hf, 1, 1, 1, 1);

    approx(l1_err, 0.0);
    approx(l2_err, 0.0);

    free(hzf);

    printf("CUDA Prolongation in %s l1-error: %g, l2-error: %g \n", axis, l1_err, l2_err);
}

template <typename T>
int cuda_test_restriction_prolongation(const int nxc, const int nyc, const T hc) {
    T hf = 0.5 * hc;
    int nxf = 2 * (nxc - 1) + 1;
    int nyf = 2 * (nyc - 1) + 1;
    restriction_prolongation_info(nxf, nyf, hf, nxc, nyc, hc);

    size_t num_bytesf = sizeof(T) * nxf * nyf;
    size_t num_bytesc = sizeof(T) * nxc * nyc;

    T *xf, *yf, *zf, *xc, *yc, *zc;
    CUDACHECK(cudaMalloc((void**)&xf, num_bytesf));
    CUDACHECK(cudaMalloc((void**)&yf, num_bytesf));
    CUDACHECK(cudaMalloc((void**)&zf, num_bytesf));
    CUDACHECK(cudaMalloc((void**)&xc, num_bytesc));
    CUDACHECK(cudaMalloc((void**)&yc, num_bytesc));
    CUDACHECK(cudaMalloc((void**)&zc, num_bytesc));

    cuda_grid_x(xf, nxf, nyf, hf);
    cuda_grid_x(xc, nxc, nyc, hc);
    cuda_restriction("x", xc, yc, zc, nxc, nyc, hc, xf, nxf, nyf, hf);
    cuda_prolongation("x", xf, yf, zf, nxf, nyf, hf, xc, nxc, nyc, hc);

    // Fill the reference grids (not the outputs) with y-coordinates
    cuda_grid_y(xf, nxf, nyf, hf);
    cuda_grid_y(xc, nxc, nyc, hc);
    cuda_restriction("y", xc, yc, zc, nxc, nyc, hc, xf, nxf, nyf, hf);
    cuda_prolongation("y", xf, yf, zf, nxf, nyf, hf, xc, nxc, nyc, hc);

    CUDACHECK(cudaFree(xf));
    CUDACHECK(cudaFree(yf));
    CUDACHECK(cudaFree(zf));
    CUDACHECK(cudaFree(xc));
    CUDACHECK(cudaFree(yc));
    CUDACHECK(cudaFree(zc));

    return test_report();
}

int main(int argc, char **argv) {

    int err = 0;
    {
        int nx = 20;
        int ny = 20;
        double h = 1.0;
        err |= test_gridpoints(nx, ny, h);
        err |= cuda_test_gridpoints(nx, ny, h);
    }

    {
        int nx = 21;
        int ny = 20;
        double h = 0.5;
        err |= test_gridpoints(nx, ny, h);
    }

    {
        int nx = 21;
        int ny = 31;
        err |= test_gridnorm<double>(nx, ny);
        err |= cuda_test_gridnorm<double>(nx, ny);
    }

    {
        int nxc = 4;
        int nyc = 4;
        double hc = 0.3;
        err |= test_restriction_prolongation(nxc, nyc, hc);
        err |= cuda_test_restriction_prolongation(nxc, nyc, hc);
    }

    return err;
}
--------------------------------------------------------------------------------
/test/test_poisson.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <math.h>

#include "definitions.cuh"
#include "poisson.hpp"
#include "poisson.cuh"
#include "solver.hpp"


template <typename T, typename P, typename S>
void convergence_test(const int num_grids, SolverOptions opts) {
    T rate = 0.0;
    T err1 = 0.0;
    T modes = 1.0;
    int l = 2;
    T h = 1.0;
    printf("MMS convergence test\n");
    {
        S tmp;
        printf("Solver: %s \n", tmp.name());
    }
    printf("Grid Size \t Iterations \t Time (ms) \t Residual \t Error \t\t Rate \n");
    for (int i = 0; i < num_grids; ++i) {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        P problem(l, h, modes);
        S solver(problem);

        cudaEventRecord(start);
        SolverOutput out = solve<T>(solver, problem, opts);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float elapsed = 0;
        cudaEventElapsedTime(&elapsed, start, stop);

        rate = log2(err1 / out.error);
        int n = (1 << l) + 1;
        printf("%4d x %-4d \t %-7d \t %-5.5f \t %-5.5g \t %-5.5g \t %-5.5f \n",
               n, n,
               out.iterations, elapsed, out.residual, out.error, rate);
        err1 = out.error;
        l++;
        h /= 2;
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
}


int main(int argc, char **argv) {

    using Number = double;
    SolverOptions opts;
    opts.verbose = 1;
    opts.info = 10;
    opts.max_iterations = 1e4;
    opts.eps = 1e-8;
    opts.mms = 1;
    int l = 4;
    int n = (1 << l) + 1;
    double h = 1.0 / (n - 1);
    double modes = 1.0;
    using Problem = Poisson<Number>;


    {
        Problem problem(l, h, modes);
        using Smoother = GaussSeidelRedBlack;
        Smoother solver;
        auto out = solve<Number>(solver, problem, opts);
        printf("Iterations: %d, Residual: %g \n", out.iterations, out.residual);
    }

    {
        Problem problem(l, h, modes);
        using Smoother = GaussSeidelRedBlack;
        using MG = Multigrid<Number, Problem, Smoother>;
        MG mg(problem);
        auto out = solve<Number>(mg, problem, opts);
        printf("Iterations: %d, Residual: %g \n", out.iterations, out.residual);
    }

    {
        using CUDAProblem = CUDAPoisson<Number>;
        CUDAProblem problem(l, h, modes);
        using CUDASmoother = CUDAGaussSeidelRedBlack;

        CUDASmoother solver;
        auto out = solve<Number>(solver, problem, opts);
        printf("Iterations: %d, Residual: %g \n", out.iterations, out.residual);
    }

    {
        using CUDAProblem = CUDAPoisson<Number>;
        using CUDASmoother = CUDAGaussSeidelRedBlack;
        using CUDAMG = CUDAMultigrid<Number, CUDAProblem, CUDASmoother>;

        CUDAProblem problem(l, h, modes);
        CUDAMG solver(problem);
        auto out = solve<Number>(solver, problem, opts);
        printf("Iterations: %d, Residual: %g \n", out.iterations, out.residual);
    }

    {
        using Smoother = GaussSeidelRedBlack;
        using MG = Multigrid<Number, Problem, Smoother>;
        opts.verbose = 0;

        int num_refinements = 12;
        convergence_test<Number, Problem, MG>(num_refinements, opts);
    }
    {
        using CUDAProblem = CUDAPoisson<Number>;
        using CUDASmoother = CUDAGaussSeidelRedBlack;
        using CUDAMG = CUDAMultigrid<Number, CUDAProblem, CUDASmoother>;

        opts.verbose = 0;
        int num_refinements = 12;
        convergence_test<Number, CUDAProblem, CUDAMG>(num_refinements, opts);
    }
}
--------------------------------------------------------------------------------