├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── common ├── complex.hh ├── finalise.hh ├── flags.makefile ├── fmm.hh ├── get-deps.hh ├── gpu-kernels-no-atomics.hh ├── gpu-kernels.hh ├── gpu-spharm.hh ├── gpu-utils.hh ├── init.hh ├── input.hh ├── kernels.hh ├── kokkos-utils.hh ├── node.hh ├── spharm.hh ├── timer.hh ├── traversal.hh ├── tree.hh ├── utils.hh └── verify.hh ├── cuda ├── cuda-utils.hh ├── flags.makefile ├── node.hh └── traversal.hh ├── gpusched ├── flags.makefile └── traversal.hh ├── inputs ├── plummer.in ├── small.in └── uniform.in ├── kokkos-for ├── flags.makefile ├── node.hh └── traversal.hh ├── kokkos-task-locks ├── flags.makefile ├── node.hh └── traversal.hh ├── kokkos-task ├── flags.makefile ├── node.hh └── traversal.hh ├── main.cc ├── omp-for ├── flags.makefile ├── node.hh └── traversal.hh ├── omp-task ├── flags.makefile ├── node.hh └── traversal.hh └── omptarget ├── flags.makefile ├── node.hh └── traversal.hh /.gitignore: -------------------------------------------------------------------------------- 1 | fmm.* 2 | 3 | # Prerequisites 4 | *.d 5 | 6 | # Compiled Object files 7 | *.slo 8 | *.lo 9 | *.o 10 | *.obj 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Patrick Atkinson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPILER ?= GNU 2 | ARCH ?= native 3 | MODEL = omp-task 4 | 5 | EXES=$(addprefix fmm., $(MODEL)) 6 | SINGLE=$(addsuffix .single, $(EXES)) 7 | DOUBLE=$(addsuffix .double, $(EXES)) 8 | 9 | default: fmm.$(MODEL) 10 | 11 | include $(MODEL)/flags.makefile 12 | 13 | .PHONY: default clean 14 | 15 | COMMON_HEADERS=$(wildcard common/*.hh) 16 | COMMON_INC=-I./common 17 | 18 | main.o: main.cc $(MODEL)/*.hh $(COMMON_HEADERS) 19 | $(CC) $(CFLAGS) $(EXTRA_FLAGS) -I./$(MODEL) $(COMMON_INC) main.cc -c 20 | 21 | fmm.$(MODEL): main.o 22 | $(CC) main.o -o $@ $(LIBS) 23 | 24 | #fmm.%.single: main.cc %/*.hh $(COMMON_HEADERS) 25 | # $(CC) $(CFLAGS) -I $* $(COMMON_INC) main.cc -o $@ $(LIBS) 26 | # 27 | #fmm.%.double: main.cc %/*.hh $(COMMON_HEADERS) 28 | # $(CC) $(CFLAGS) -DFMM_DOUBLE -I $* $(COMMON_INC) main.cc -o $@ $(LIBS) 29 | 30 | clean: 31 | -rm -f fmm.* main.o 32 | 33 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | MiniFMM 2 | ======= 3 | 4 | Building 5 | -------- 6 | 7 | ``` 8 | make COMPILER=<compiler> ARCH=<arch> MODEL=<model> 9 | ``` 10 | 11 | The programming model must be the same name as the directory (i.e. `omp-task`). 12 | 13 | Running 14 | ------- 15 | 16 | ``` 17 | ./fmm.<model> 18 | ``` 19 | 20 | Valid arguments 21 | 22 | - `n`: no. of input particles 23 | - `c`: max. no. of particles per tree node 24 | - `t`: no. of multipole terms 25 | - `e`: theta (as in Barnes-Hut) 26 | - `m`: no. samples used to validate 27 | - `p`: use a Plummer input distribution 28 | - `u`: use a uniform input distribution (cube) 29 | - `i`: input file 30 | 31 | Some sample input files are given in `inputs/`. 32 | 33 | Information 34 | ----------- 35 | 36 | Please cite using: 37 | >Atkinson, P., & McIntosh-Smith, S. (2017). On the Performance of Parallel Tasking Runtimes for an Irregular Fast Multipole Method Application. In Scaling OpenMP for Exascale Performance and Portability (pp. 92–106). Springer International Publishing. 
https://doi.org/10.1007/978-3-319-65578-9_7 38 | 39 | For further information, please refer to: https://patrickatkinson.co.uk/phd-thesis.pdf 40 | 41 | -------------------------------------------------------------------------------- /common/complex.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "utils.hh" 6 | 7 | #if !defined(__NVCC__) && defined(__x86_64__) && (__GNUC__ >= 8) 8 | #warning falling back to std complex numbers 9 | template 10 | using complex_t = std::complex; 11 | 12 | template 13 | T complex_real(const std::complex& a) 14 | { 15 | return std::real(a); 16 | } 17 | 18 | template 19 | std::complex complex_conj(const std::complex& a) 20 | { 21 | return std::conj(a); 22 | } 23 | 24 | template 25 | std::complex complex_pow(const std::complex& a, const U& b) 26 | { 27 | return std::pow(a, b); 28 | } 29 | 30 | template 31 | std::complex complex_exp(const std::complex& a) 32 | { 33 | return std::exp(a); 34 | } 35 | 36 | template 37 | std::complex imag_pow(const int n) 38 | { 39 | return std::pow(std::complex(0.0, 1.0), n); 40 | } 41 | #else 42 | 43 | template 44 | struct complex_t { 45 | T re, im; 46 | 47 | HOSTDEVICE complex_t(T arg_re, T arg_im) : re(arg_re), im(arg_im) {} 48 | HOSTDEVICE complex_t() : re(static_cast(0.0)), im(static_cast(0.0)) {} 49 | 50 | std::complex convert() { return std::complex(re, im); } 51 | 52 | HOSTDEVICE complex_t& operator+=(const complex_t& rhs) 53 | { 54 | this->re += rhs.re; 55 | this->im += rhs.im; 56 | return *this; 57 | } 58 | 59 | HOSTDEVICE T real() { return this->re; } 60 | HOSTDEVICE T imag() { return this->im; } 61 | }; 62 | 63 | template 64 | HOSTDEVICE complex_t operator*(const complex_t& a, const complex_t& b) 65 | { 66 | T re = a.re * b.re - a.im * b.im; 67 | T im = a.re * b.im + a.im * b.re; 68 | return complex_t(re, im); 69 | } 70 | 71 | template 72 | HOSTDEVICE complex_t operator*(const complex_t& a, const T& b) 73 | { 74 
| return complex_t(a.re * b, a.im * b); 75 | } 76 | 77 | template 78 | HOSTDEVICE complex_t operator*(const T& a, const complex_t& b) 79 | { 80 | return b * a; 81 | } 82 | 83 | template 84 | HOSTDEVICE complex_t operator/(const complex_t& a, T& b) 85 | { 86 | return complex_t(a.re / b, a.im / b); 87 | } 88 | 89 | template 90 | HOSTDEVICE complex_t complex_exp(const complex_t& a) 91 | { 92 | const T r = std::exp(a.re); 93 | return complex_t(r * std::cos(a.im), r * std::sin(a.im)); 94 | } 95 | 96 | template 97 | HOSTDEVICE T complex_abs(const complex_t& a) 98 | { 99 | return std::hypot(a.re, a.im); 100 | } 101 | 102 | template 103 | HOSTDEVICE complex_t complex_pow(const complex_t& a, const T& b) 104 | { 105 | T r = complex_abs(a); 106 | if (a.re == 0.0) { 107 | printf("divide by zero\n"); 108 | exit(1); 109 | } 110 | T phi = std::atan(a.im / a.re); 111 | return std::pow(r, b) * complex_t(std::cos(phi * b), std::sin(phi * b)); 112 | } 113 | 114 | template 115 | HOSTDEVICE complex_t imag_pow(int n) 116 | { 117 | complex_t i = (n & 1) ? complex_t(0.0, 1.0) : complex_t(1.0, 0.0); 118 | i = (n & 2) ? 
i * static_cast(-1.0) : i; 119 | return i; 120 | } 121 | 122 | template 123 | HOSTDEVICE complex_t complex_conj(const complex_t& a) 124 | { 125 | return complex_t(a.re, -a.im); 126 | } 127 | 128 | template 129 | HOSTDEVICE T complex_real(const complex_t& a) 130 | { 131 | return a.re; 132 | } 133 | 134 | template 135 | HOSTDEVICE T complex_imag(const complex_t& a) 136 | { 137 | return a.im; 138 | } 139 | 140 | template 141 | HOSTDEVICE complex_t convert(std::complex a) 142 | { 143 | return complex_t(((T*)&a)[0], ((T*)&a)[1]); 144 | } 145 | #endif 146 | -------------------------------------------------------------------------------- /common/finalise.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | void finalise(FMM* fmm) 7 | { 8 | free(fmm->x); 9 | free(fmm->y); 10 | free(fmm->z); 11 | free(fmm->w); 12 | free(fmm->ax); 13 | free(fmm->ay); 14 | free(fmm->az); 15 | free(fmm->p); 16 | free(fmm->inner_factors); 17 | free(fmm->outer_factors); 18 | free(fmm->m); 19 | free(fmm->l); 20 | free(fmm->nodes); 21 | } 22 | -------------------------------------------------------------------------------- /common/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 | endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 
24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /common/fmm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #ifndef TASK_CUTOFF 9 | #define TASK_CUTOFF 5000 10 | #endif 11 | #define SOURCE_TASK_SPAWN 0 12 | 13 | template 14 | struct FMM 15 | { 16 | node_t* nodes; 17 | size_t root; 18 | T* x = nullptr; 19 | T* y = nullptr; 20 | T* z = nullptr; 21 | T* w = nullptr; 22 | T* ax = nullptr; 23 | T* ay = nullptr; 24 | T* az = nullptr; 25 | T* p = nullptr; 26 | size_t num_points; 27 | size_t ncrit; 28 | int num_terms; 29 | T theta; 30 | T theta2; 31 | size_t num_samples; 32 | size_t num_multipoles; 33 | size_t num_nodes; 34 | size_t num_spharm_terms; 35 | complex_t* inner_factors = nullptr; 36 | complex_t* outer_factors = nullptr; 37 | complex_t* m = nullptr; 38 | complex_t* l = nullptr; 39 | enum Dist { 40 | Uniform = 0, 41 | Plummer, 42 | NumDist, 43 | }; 44 | Dist dist; 45 | }; 46 | -------------------------------------------------------------------------------- /common/get-deps.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | template 10 | void get_deps_omp_task(FMM* fmm, node_t* target, node_t* source, 11 | std::vector>* p2p_deps, 12 | std::vector>* m2l_deps) 13 | { 14 | T dx = source->cx - target->cx; 15 | T dy = source->cy - target->cy; 16 | T dz = source->cz - target->cz; 17 | T r2 = dx * dx + dy * dy + dz * dz; 18 | T d1 = source->rad * static_cast(2.0); 19 | T d2 = target->rad * static_cast(2.0); 20 | 21 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 22 | omp_set_lock(&target->m2l_lock); 23 | (*m2l_deps)[target->node_idx].push_back(source->node_idx); 24 | omp_unset_lock(&target->m2l_lock); 25 | } 26 | else if (source->is_leaf() && 
target->is_leaf()) { 27 | omp_set_lock(&target->p2p_lock); 28 | (*p2p_deps)[target->node_idx].push_back(source->node_idx); 29 | omp_unset_lock(&target->p2p_lock); 30 | } 31 | else { 32 | T target_sz = target->rad; 33 | T source_sz = source->rad; 34 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 35 | for (size_t i = 0; i < target->num_children; ++i) { 36 | node_t* child = &fmm->nodes[target->child[i]]; 37 | #pragma omp task if (target->num_points > TASK_CUTOFF) 38 | get_deps_omp_task(fmm, child, source, p2p_deps, m2l_deps); 39 | } 40 | } 41 | else { 42 | for (size_t i = 0; i < source->num_children; ++i) { 43 | //#pragma omp task if(source->num_points > TASK_CUTOFF && 44 | // SOURCE_TASK_SPAWN) 45 | node_t* child = &fmm->nodes[source->child[i]]; 46 | get_deps_omp_task(fmm, target, child, p2p_deps, m2l_deps); 47 | } 48 | } 49 | } 50 | } 51 | 52 | template 53 | void get_deps_omp(FMM* fmm, std::vector>* p2p_deps, 54 | std::vector>* m2l_deps) 55 | { 56 | node_t* root_node = fmm->nodes + fmm->root; 57 | #pragma omp parallel 58 | #pragma omp single 59 | get_deps_omp_task(fmm, root_node, root_node, p2p_deps, m2l_deps); 60 | } 61 | 62 | void pack_deps(std::vector>& deps, size_t** ret_nodes, 63 | size_t** ret_deps, size_t** ret_offsets, size_t** ret_sizes, 64 | size_t* ret_count, size_t* ret_num_nodes) 65 | { 66 | size_t* prefixes1d = (size_t*)malloc(sizeof(size_t) * deps.size()); 67 | size_t* prefixes2d = (size_t*)malloc(sizeof(size_t) * deps.size()); 68 | 69 | prefixes1d[0] = 0; 70 | prefixes2d[0] = 0; 71 | 72 | size_t count = 0; 73 | size_t num_nodes = 0; 74 | 75 | count += deps[0].size(); 76 | if (deps[0].size() > 0) num_nodes++; 77 | 78 | for (size_t i = 1; i < deps.size(); ++i) { 79 | size_t flag = (deps[i - 1].size() > 0); 80 | prefixes1d[i] = flag + prefixes1d[i - 1]; 81 | prefixes2d[i] = deps[i - 1].size() + prefixes2d[i - 1]; 82 | count += deps[i].size(); 83 | if (deps[i].size() > 0) num_nodes++; 84 | } 85 | 86 | size_t* deps_array = 
(size_t*)malloc(sizeof(size_t) * count); 87 | size_t* nodes = (size_t*)malloc(sizeof(size_t) * num_nodes); 88 | size_t* sizes = (size_t*)malloc(sizeof(size_t) * num_nodes); 89 | size_t* offsets = (size_t*)malloc(sizeof(size_t) * num_nodes); 90 | 91 | #pragma omp parallel for 92 | for (size_t i = 0; i < deps.size(); ++i) { 93 | if (deps[i].size() > 0) { 94 | size_t beg = prefixes2d[i]; 95 | size_t end = beg + deps[i].size(); 96 | size_t c = 0; 97 | for (size_t j = beg; j < end; ++j) { 98 | deps_array[j] = deps[i][c++]; 99 | } 100 | const size_t idx = prefixes1d[i]; 101 | sizes[idx] = deps[i].size(); 102 | offsets[idx] = beg; 103 | nodes[idx] = i; 104 | } 105 | } 106 | 107 | free(prefixes1d); 108 | free(prefixes2d); 109 | 110 | *ret_nodes = nodes; 111 | *ret_deps = deps_array; 112 | *ret_offsets = offsets; 113 | *ret_sizes = sizes; 114 | *ret_count = count; 115 | *ret_num_nodes = num_nodes; 116 | } 117 | -------------------------------------------------------------------------------- /common/gpu-kernels-no-atomics.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __CUDACC__ 7 | 8 | #include 9 | 10 | #define NTERMS 4 11 | #warning manually setting nterms 12 | 13 | template 14 | __device__ void p2p_gpu(FMM* fmm, node_t* target) 15 | { 16 | #ifdef KOKKOS 17 | using gpu_utils = gpu_utils; 18 | #endif 19 | 20 | const int tp = target->point_idx; 21 | 22 | __shared__ float4 base_shmem[STILE * WPB]; 23 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 24 | 25 | T ax[REG] = {0}; 26 | T ay[REG] = {0}; 27 | T az[REG] = {0}; 28 | T aw[REG] = {0}; 29 | T xi[REG] = {0}; 30 | T yi[REG] = {0}; 31 | T zi[REG] = {0}; 32 | 33 | const int tid = gpu_utils::thread_id(); 34 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 35 | 36 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 37 | #pragma unroll 38 | for (int i = 0; i < REG; ++i) { 39 | ax[i] = 0.0; 40 | ay[i] = 
0.0; 41 | az[i] = 0.0; 42 | aw[i] = 0.0; 43 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 44 | : 0.0; 45 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 46 | : 0.0; 47 | zi[i] = ((i * TBS + ii) < target->num_points) ? fmm->z[tp + i * TBS + ii] 48 | : 0.0; 49 | } 50 | for (int jj = 0; jj < target->num_points; jj += STILE) { 51 | const int jlim = min(STILE, (int)target->num_points - jj); 52 | gpu_utils::sync_worker(); 53 | #pragma unroll 54 | for (int j = tid; j < jlim; j += TBS) { 55 | shmem[j] = make_float4(fmm->x[tp + jj + j], fmm->y[tp + jj + j], 56 | fmm->z[tp + jj + j], fmm->w[tp + jj + j]); 57 | } 58 | gpu_utils::sync_worker(); 59 | #pragma unroll 60 | for (int j = 0; j < jlim; ++j) { 61 | const float4 sj = shmem[j]; 62 | #pragma unroll 63 | for (int i = 0; i < REG; ++i) { 64 | const T dx = sj.x - xi[i]; 65 | const T dy = sj.y - yi[i]; 66 | const T dz = sj.z - zi[i]; 67 | const T sw = sj.w; 68 | const T r = dx * dx + dy * dy + dz * dz; 69 | const T inv_r = (r == 0.0) ? 
0.0 : rsqrtf(r); 70 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 71 | ax[i] += dx * inv_r_3; 72 | ay[i] += dy * inv_r_3; 73 | az[i] += dz * inv_r_3; 74 | aw[i] += sw * inv_r; 75 | } 76 | } 77 | } 78 | #pragma unroll 79 | for (int i = 0; i < REG; ++i) { 80 | if ((i * TBS + ii) < target->num_points) { 81 | fmm->ax[tp + i*TBS+ii] += ax[i]; 82 | fmm->ay[tp + i*TBS+ii] += ay[i]; 83 | fmm->az[tp + i*TBS+ii] += az[i]; 84 | fmm->p[tp + i*TBS+ii] += aw[i]; 85 | //atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 86 | //atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 87 | //atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 88 | //atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 89 | } 90 | } 91 | } 92 | } 93 | 94 | template 95 | __device__ void p2p_gpu(FMM* fmm, node_t* target, node_t* source) 96 | { 97 | #ifdef KOKKOS 98 | using gpu_utils = gpu_utils; 99 | #endif 100 | 101 | const int tp = target->point_idx; 102 | const int sp = source->point_idx; 103 | 104 | __shared__ float4 base_shmem[STILE * WPB]; 105 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 106 | T ax[REG] = {0}; 107 | T ay[REG] = {0}; 108 | T az[REG] = {0}; 109 | T aw[REG] = {0}; 110 | T xi[REG] = {0}; 111 | T yi[REG] = {0}; 112 | T zi[REG] = {0}; 113 | 114 | const int tid = gpu_utils::thread_id(); 115 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 116 | 117 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 118 | #pragma unroll 119 | for (int i = 0; i < REG; ++i) { 120 | ax[i] = 0.0; 121 | ay[i] = 0.0; 122 | az[i] = 0.0; 123 | aw[i] = 0.0; 124 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 125 | : 0.0; 126 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 127 | : 0.0; 128 | zi[i] = ((i * TBS + ii) < target->num_points) ? 
fmm->z[tp + i * TBS + ii] 129 | : 0.0; 130 | } 131 | for (int jj = 0; jj < source->num_points; jj += STILE) { 132 | const int jlim = min(STILE, (int)source->num_points - jj); 133 | gpu_utils::sync_worker(); 134 | #pragma unroll 135 | for (int j = tid; j < jlim; j += TBS) { 136 | shmem[j] = make_float4(fmm->x[sp + jj + j], fmm->y[sp + jj + j], 137 | fmm->z[sp + jj + j], fmm->w[sp + jj + j]); 138 | } 139 | gpu_utils::sync_worker(); 140 | #pragma unroll 141 | for (int j = 0; j < jlim; ++j) { 142 | const float4 sj = shmem[j]; 143 | #pragma unroll 144 | for (int i = 0; i < REG; ++i) { 145 | const T dx = sj.x - xi[i]; 146 | const T dy = sj.y - yi[i]; 147 | const T dz = sj.z - zi[i]; 148 | const T sw = sj.w; 149 | const T r = dx * dx + dy * dy + dz * dz; 150 | const T inv_r = rsqrtf(r); 151 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 152 | ax[i] += dx * inv_r_3; 153 | ay[i] += dy * inv_r_3; 154 | az[i] += dz * inv_r_3; 155 | aw[i] += sw * inv_r; 156 | } 157 | } 158 | } 159 | #pragma unroll 160 | for (int i = 0; i < REG; ++i) { 161 | if ((i * TBS + ii) < target->num_points) { 162 | fmm->ax[tp + i*TBS+ii] += ax[i]; 163 | fmm->ay[tp + i*TBS+ii] += ay[i]; 164 | fmm->az[tp + i*TBS+ii] += az[i]; 165 | fmm->p[tp + i*TBS+ii] += aw[i]; 166 | //atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 167 | //atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 168 | //atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 169 | //atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 170 | } 171 | } 172 | } 173 | } 174 | 175 | template 176 | __device__ void m2l_gpu(FMM* fmm, node_t* target, node_t* source) 177 | { 178 | #ifdef KOKKOS 179 | using gpu_utils = gpu_utils; 180 | #endif 181 | 182 | const int size = NTERMS * NTERMS; 183 | const int shmem_size = size * (sizeof(T) + sizeof(complex_t) * 2); 184 | __shared__ char shmem[shmem_size * WPB]; 185 | char* warp_shmem = shmem + gpu_utils::worker_id() * shmem_size; 186 | 187 | T* legendre = (T*)warp_shmem; 188 | complex_t* outer = (complex_t*)(warp_shmem + 
sizeof(T) * size); 189 | complex_t* shared_m = 190 | (complex_t*)(warp_shmem + (sizeof(complex_t) + sizeof(T)) * size); 191 | 192 | T dx = target->cx - source->cx; 193 | T dy = target->cy - source->cy; 194 | T dz = target->cz - source->cz; 195 | T rho, alpha, beta; 196 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 197 | compute_outer_gpu(fmm, rho, alpha, beta, outer, legendre); 198 | complex_t* Msource = &fmm->m[source->mult_idx]; 199 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 200 | 201 | #pragma unroll 202 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 203 | i += TPB) { 204 | shared_m[i] = Msource[i]; 205 | } 206 | gpu_utils::sync_worker(); 207 | 208 | #pragma unroll 209 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 210 | i += TPB) { 211 | const int j = (int)sqrtf((float)i); 212 | const int k = i - j * j - j; 213 | 214 | complex_t tmp(0.0, 0.0); 215 | for (int n = 0; n < fmm->num_terms - j; ++n) { 216 | for (int m = -n; m <= n; ++m) { 217 | tmp += shared_m[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 218 | } 219 | } 220 | Ltarget[i] += tmp; 221 | //atomicAdd(&(Ltarget[i].re), tmp.re); 222 | //atomicAdd(&(Ltarget[i].im), tmp.im); 223 | } 224 | } 225 | 226 | #endif 227 | -------------------------------------------------------------------------------- /common/gpu-kernels.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __CUDACC__ 7 | 8 | #include 9 | 10 | #define NTERMS 4 11 | #warning manually setting nterms 12 | 13 | template 14 | __device__ inline T pow_one(int m) 15 | { 16 | return static_cast(1 + ((m & 0x01) * -2)); 17 | } 18 | 19 | template 20 | __device__ void p2p_gpu(FMM* fmm, node_t* target) 21 | { 22 | #ifdef KOKKOS 23 | using gpu_utils = gpu_utils; 24 | #endif 25 | 26 | const int tp = target->point_idx; 27 | 28 | __shared__ float4 base_shmem[STILE * WPB]; 29 | float4* shmem = base_shmem + 
gpu_utils::worker_id() * STILE; 30 | 31 | T ax[REG] = {0}; 32 | T ay[REG] = {0}; 33 | T az[REG] = {0}; 34 | T aw[REG] = {0}; 35 | T xi[REG] = {0}; 36 | T yi[REG] = {0}; 37 | T zi[REG] = {0}; 38 | 39 | const int tid = gpu_utils::thread_id(); 40 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 41 | 42 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 43 | #pragma unroll 44 | for (int i = 0; i < REG; ++i) { 45 | ax[i] = 0.0; 46 | ay[i] = 0.0; 47 | az[i] = 0.0; 48 | aw[i] = 0.0; 49 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 50 | : 0.0; 51 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 52 | : 0.0; 53 | zi[i] = ((i * TBS + ii) < target->num_points) ? fmm->z[tp + i * TBS + ii] 54 | : 0.0; 55 | } 56 | for (int jj = 0; jj < target->num_points; jj += STILE) { 57 | const int jlim = min(STILE, (int)target->num_points - jj); 58 | gpu_utils::sync_worker(); 59 | #pragma unroll 60 | for (int j = tid; j < jlim; j += TBS) { 61 | shmem[j] = make_float4(fmm->x[tp + jj + j], fmm->y[tp + jj + j], 62 | fmm->z[tp + jj + j], fmm->w[tp + jj + j]); 63 | } 64 | gpu_utils::sync_worker(); 65 | #pragma unroll 66 | for (int j = 0; j < jlim; ++j) { 67 | const float4 sj = shmem[j]; 68 | #pragma unroll 69 | for (int i = 0; i < REG; ++i) { 70 | const T dx = sj.x - xi[i]; 71 | const T dy = sj.y - yi[i]; 72 | const T dz = sj.z - zi[i]; 73 | const T sw = sj.w; 74 | const T r = dx * dx + dy * dy + dz * dz; 75 | const T inv_r = (r == 0.0) ? 
0.0 : rsqrtf(r); 76 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 77 | ax[i] += dx * inv_r_3; 78 | ay[i] += dy * inv_r_3; 79 | az[i] += dz * inv_r_3; 80 | aw[i] += sw * inv_r; 81 | } 82 | } 83 | } 84 | #pragma unroll 85 | for (int i = 0; i < REG; ++i) { 86 | if ((i * TBS + ii) < target->num_points) { 87 | // fmm->ax[tp + i*TBS+ii] += ax[i]; 88 | // fmm->ay[tp + i*TBS+ii] += ay[i]; 89 | // fmm->az[tp + i*TBS+ii] += az[i]; 90 | // fmm->p[tp + i*TBS+ii] += aw[i]; 91 | atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 92 | atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 93 | atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 94 | atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 95 | } 96 | } 97 | } 98 | } 99 | 100 | template 101 | __device__ void p2p_gpu(FMM* fmm, node_t* target, node_t* source) 102 | { 103 | #ifdef KOKKOS 104 | using gpu_utils = gpu_utils; 105 | #endif 106 | 107 | const int tp = target->point_idx; 108 | const int sp = source->point_idx; 109 | 110 | __shared__ float4 base_shmem[STILE * WPB]; 111 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 112 | T ax[REG] = {0}; 113 | T ay[REG] = {0}; 114 | T az[REG] = {0}; 115 | T aw[REG] = {0}; 116 | T xi[REG] = {0}; 117 | T yi[REG] = {0}; 118 | T zi[REG] = {0}; 119 | 120 | const int tid = gpu_utils::thread_id(); 121 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 122 | 123 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 124 | #pragma unroll 125 | for (int i = 0; i < REG; ++i) { 126 | ax[i] = 0.0; 127 | ay[i] = 0.0; 128 | az[i] = 0.0; 129 | aw[i] = 0.0; 130 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 131 | : 0.0; 132 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 133 | : 0.0; 134 | zi[i] = ((i * TBS + ii) < target->num_points) ? 
fmm->z[tp + i * TBS + ii] 135 | : 0.0; 136 | } 137 | for (int jj = 0; jj < source->num_points; jj += STILE) { 138 | const int jlim = min(STILE, (int)source->num_points - jj); 139 | gpu_utils::sync_worker(); 140 | #pragma unroll 141 | for (int j = tid; j < jlim; j += TBS) { 142 | shmem[j] = make_float4(fmm->x[sp + jj + j], fmm->y[sp + jj + j], 143 | fmm->z[sp + jj + j], fmm->w[sp + jj + j]); 144 | } 145 | gpu_utils::sync_worker(); 146 | #pragma unroll 147 | for (int j = 0; j < jlim; ++j) { 148 | const float4 sj = shmem[j]; 149 | #pragma unroll 150 | for (int i = 0; i < REG; ++i) { 151 | const T dx = sj.x - xi[i]; 152 | const T dy = sj.y - yi[i]; 153 | const T dz = sj.z - zi[i]; 154 | const T sw = sj.w; 155 | const T r = dx * dx + dy * dy + dz * dz; 156 | const T inv_r = rsqrtf(r); 157 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 158 | ax[i] += dx * inv_r_3; 159 | ay[i] += dy * inv_r_3; 160 | az[i] += dz * inv_r_3; 161 | aw[i] += sw * inv_r; 162 | } 163 | } 164 | } 165 | #pragma unroll 166 | for (int i = 0; i < REG; ++i) { 167 | if ((i * TBS + ii) < target->num_points) { 168 | // fmm->ax[tp + i*TBS+ii] += ax[i]; 169 | // fmm->ay[tp + i*TBS+ii] += ay[i]; 170 | // fmm->az[tp + i*TBS+ii] += az[i]; 171 | // fmm->p[tp + i*TBS+ii] += aw[i]; 172 | atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 173 | atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 174 | atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 175 | atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 176 | } 177 | } 178 | } 179 | } 180 | 181 | template 182 | __device__ void m2l_gpu(FMM* fmm, node_t* target, node_t* source) 183 | { 184 | #ifdef KOKKOS 185 | using gpu_utils = gpu_utils; 186 | #endif 187 | 188 | const int size = NTERMS * NTERMS; 189 | const int shmem_size = size * (sizeof(T) + sizeof(complex_t) * 2); 190 | __shared__ char shmem[shmem_size * WPB]; 191 | char* warp_shmem = shmem + gpu_utils::worker_id() * shmem_size; 192 | 193 | T* legendre = (T*)warp_shmem; 194 | complex_t* outer = (complex_t*)(warp_shmem + 
sizeof(T) * size); 195 | complex_t* shared_m = 196 | (complex_t*)(warp_shmem + (sizeof(complex_t) + sizeof(T)) * size); 197 | 198 | T dx = target->cx - source->cx; 199 | T dy = target->cy - source->cy; 200 | T dz = target->cz - source->cz; 201 | T rho, alpha, beta; 202 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 203 | compute_outer_gpu(fmm, rho, alpha, beta, outer, legendre); 204 | complex_t* Msource = &fmm->m[source->mult_idx]; 205 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 206 | 207 | #pragma unroll 208 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 209 | i += TPB) { 210 | shared_m[i] = Msource[i]; 211 | } 212 | gpu_utils::sync_worker(); 213 | 214 | #pragma unroll 215 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 216 | i += TPB) { 217 | const int j = (int)sqrtf((float)i); 218 | const int k = i - j * j - j; 219 | 220 | complex_t tmp(0.0, 0.0); 221 | for (int n = 0; n < fmm->num_terms - j; ++n) { 222 | for (int m = -n; m <= n; ++m) { 223 | tmp += shared_m[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 224 | } 225 | } 226 | // Ltarget[i] += tmp; 227 | atomicAdd(&(Ltarget[i].re), tmp.re); 228 | atomicAdd(&(Ltarget[i].im), tmp.im); 229 | } 230 | } 231 | 232 | template 233 | INLINE void p2m_gpu(FMM* fmm, node_t* node) 234 | { 235 | #ifdef KOKKOS 236 | using gpu_utils = gpu_utils; 237 | #endif 238 | 239 | size_t pt_offset = node->point_idx; 240 | size_t mt_offset = node->mult_idx; 241 | 242 | __shared__ T shmem_all[WPB * NTERMS * NTERMS * 3]; 243 | T* shmem = shmem_all + gpu_utils::worker_id() * NTERMS * NTERMS * 3; 244 | T* legendre = shmem; 245 | complex_t* inner = (complex_t*)(shmem + NTERMS * NTERMS); 246 | 247 | for (size_t i = 0; i < node->num_points; ++i) { 248 | T dx = fmm->x[i + pt_offset] - node->cx; 249 | T dy = fmm->y[i + pt_offset] - node->cy; 250 | T dz = fmm->z[i + pt_offset] - node->cz; 251 | T r, theta, phi; 252 | cart_to_sph(dx, dy, dz, r, theta, phi); 253 | compute_inner_gpu(fmm, r, 
theta, phi, inner, legendre); 254 | gpu_utils::sync_worker(); 255 | #pragma unroll 256 | for (int t = gpu_utils::thread_id(); t < fmm->num_terms * fmm->num_terms; 257 | t += TPB) { 258 | const int n = (int)sqrtf((float)t); 259 | fmm->m[mt_offset + t] += fmm->w[i + pt_offset] * pow_one(n) * inner[t]; 260 | } 261 | } 262 | } 263 | 264 | template 265 | __device__ void m2m_gpu(FMM* fmm, node_t* node) 266 | { 267 | #ifdef KOKKOS 268 | using gpu_utils = gpu_utils; 269 | #endif 270 | 271 | __shared__ T shmem_all[WPB * NTERMS * NTERMS * 3]; 272 | T* shmem = shmem_all + gpu_utils::worker_id() * NTERMS * NTERMS * 3; 273 | T* legendre = shmem; 274 | complex_t* inner = (complex_t*)(shmem + NTERMS * NTERMS); 275 | 276 | for (size_t i = 0; i < node->num_children; ++i) { 277 | node_t* child = &fmm->nodes[node->child[i]]; 278 | T dx = node->cx - child->cx; 279 | T dy = node->cy - child->cy; 280 | T dz = node->cz - child->cz; 281 | T r, theta, phi; 282 | cart_to_sph(dx, dy, dz, r, theta, phi); 283 | 284 | const complex_t* Mchild = &fmm->m[child->mult_idx]; 285 | complex_t* Mnode = &fmm->m[node->mult_idx]; 286 | 287 | compute_inner_gpu(fmm, r, theta, phi, inner, legendre); 288 | gpu_utils::sync_worker(); 289 | #pragma unroll 290 | for (int t = gpu_utils::thread_id(); t < fmm->num_terms * fmm->num_terms; 291 | t += TPB) { 292 | const int j = (int)sqrtf((float)t); 293 | const int k = t - j * j - j; 294 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 295 | for (int n = 0; n <= j; ++n) { 296 | for (int m = -n; m <= n; ++m) { 297 | if (abs(k - m) <= j - n) 298 | tmp += Mchild[mult_idx(n, m)] * inner[mult_idx(j - n, k - m)]; 299 | } 300 | } 301 | Mnode[t] += tmp; 302 | } 303 | } 304 | } 305 | 306 | #endif 307 | -------------------------------------------------------------------------------- /common/gpu-spharm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | INLINE void compute_outer_gpu(FMM* 
fmm, T r, T theta, T phi, 7 | complex_t* outer, T* legendre, 8 | complex_t* outer_deriv = nullptr, 9 | T* legendre_deriv = nullptr) 10 | { 11 | #ifdef KOKKOS 12 | using gpu_utils = gpu_utils; 13 | #endif 14 | if (fmm->num_terms == 0) return; 15 | const int lid = gpu_utils::thread_id(); 16 | const int num_lanes = TPB; 17 | if (lid == 0) { 18 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre, 19 | legendre_deriv); 20 | } 21 | gpu_utils::sync_worker(); 22 | #pragma unroll 23 | for (int i = lid; i < fmm->num_terms * fmm->num_terms; i += num_lanes) { 24 | const int n = (int)sqrtf((float)i); 25 | const int m = i - n * n - n; 26 | outer[i] = fmm->outer_factors[i] * legendre[leg_idx(n, m)] * 27 | complex_exp(complex_t(0.0, 1.0) * static_cast(m) * phi) * 28 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 29 | } 30 | } 31 | 32 | template 33 | INLINE void compute_inner_gpu(FMM* fmm, T r, T theta, T phi, 34 | complex_t* inner, T* legendre, 35 | complex_t* inner_deriv = nullptr, 36 | T* legendre_deriv = nullptr) 37 | { 38 | #ifdef KOKKOS 39 | using gpu_utils = gpu_utils; 40 | #endif 41 | if (fmm->num_terms == 0) return; 42 | const int lid = gpu_utils::thread_id(); 43 | const int num_lanes = TPB; 44 | if (lid == 0) { 45 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre, 46 | legendre_deriv); 47 | } 48 | gpu_utils::sync_worker(); 49 | #pragma unroll 50 | for (int i = lid; i < fmm->num_terms * fmm->num_terms; i += num_lanes) { 51 | const int n = (int)sqrtf((float)i); 52 | const int m = i - n * n - n; 53 | inner[i] = fmm->inner_factors[i] * legendre[leg_idx(n, m)] * 54 | complex_exp(complex_t(0.0, 1.0) * static_cast(m) * phi) * 55 | std::pow(r, static_cast(n)); 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /common/gpu-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | 5 | #define HOSTDEVICE 
__host__ __device__ 6 | #define CUDACHK(ans) \ 7 | { \ 8 | gpu_assert((ans), __FILE__, __LINE__); \ 9 | } 10 | inline void gpu_assert(cudaError_t code, const char* file, int line) 11 | { 12 | if (code != cudaSuccess) { 13 | fprintf(stderr, "CUDACHK: %s %s %d\n", cudaGetErrorString(code), file, 14 | line); 15 | exit(code); 16 | } 17 | } 18 | #else 19 | #define HOSTDEVICE 20 | #endif 21 | 22 | template 23 | struct FMM; 24 | 25 | template 26 | void alloc_and_copy(T** dst, const T* src, size_t nelms) 27 | { 28 | #ifdef __CUDACC__ 29 | CUDACHK(cudaMalloc((void**)dst, sizeof(T) * nelms)); 30 | CUDACHK(cudaMemcpy(*dst, src, sizeof(T) * nelms, cudaMemcpyHostToDevice)); 31 | #else 32 | *dst = (T*)malloc(sizeof(T) * nelms); 33 | memcpy(*dst, src, sizeof(T) * nelms); 34 | #endif 35 | } 36 | 37 | template 38 | void copy_back(T* dst, T* src, size_t nelms) 39 | { 40 | #ifdef __CUDACC__ 41 | CUDACHK(cudaMemcpy(dst, src, sizeof(T) * nelms, cudaMemcpyDeviceToHost)); 42 | #else 43 | memcpy(dst, src, sizeof(T) * nelms); 44 | #endif 45 | } 46 | 47 | template 48 | void init_device_fmm(FMM* fmm, FMM** h_fmm_ret, FMM** d_fmm_ret) 49 | { 50 | FMM* h_fmm = (FMM*)malloc(sizeof(FMM)); 51 | *h_fmm = *fmm; 52 | 53 | alloc_and_copy(&h_fmm->nodes, fmm->nodes, fmm->num_nodes); 54 | alloc_and_copy(&h_fmm->x, fmm->x, fmm->num_points); 55 | alloc_and_copy(&h_fmm->y, fmm->y, fmm->num_points); 56 | alloc_and_copy(&h_fmm->z, fmm->z, fmm->num_points); 57 | alloc_and_copy(&h_fmm->w, fmm->w, fmm->num_points); 58 | alloc_and_copy(&h_fmm->ax, fmm->ax, fmm->num_points); 59 | alloc_and_copy(&h_fmm->ay, fmm->ay, fmm->num_points); 60 | alloc_and_copy(&h_fmm->az, fmm->az, fmm->num_points); 61 | alloc_and_copy(&h_fmm->p, fmm->p, fmm->num_points); 62 | alloc_and_copy(&h_fmm->inner_factors, fmm->inner_factors, 63 | fmm->num_multipoles); 64 | alloc_and_copy(&h_fmm->outer_factors, fmm->outer_factors, 65 | fmm->num_multipoles); 66 | alloc_and_copy(&h_fmm->m, fmm->m, fmm->num_multipoles * fmm->num_nodes); 67 | 
alloc_and_copy(&h_fmm->l, fmm->l, fmm->num_multipoles * fmm->num_nodes); 68 | 69 | FMM* d_fmm; 70 | alloc_and_copy(&d_fmm, h_fmm, 1); 71 | 72 | *h_fmm_ret = h_fmm; 73 | *d_fmm_ret = d_fmm; 74 | } 75 | 76 | template 77 | void device_free(T* p) 78 | { 79 | #ifdef __CUDACC__ 80 | CUDACHK(cudaFree(p)); 81 | #else 82 | free(p); 83 | #endif 84 | } 85 | 86 | template 87 | void fini_device_fmm(FMM* fmm, FMM* h_fmm, FMM* d_fmm) 88 | { 89 | copy_back(fmm->ax, h_fmm->ax, fmm->num_points); 90 | copy_back(fmm->ay, h_fmm->ay, fmm->num_points); 91 | copy_back(fmm->az, h_fmm->az, fmm->num_points); 92 | copy_back(fmm->p, h_fmm->p, fmm->num_points); 93 | 94 | copy_back(fmm->m, h_fmm->m, fmm->num_nodes * fmm->num_multipoles); 95 | copy_back(fmm->l, h_fmm->l, fmm->num_nodes * fmm->num_multipoles); 96 | 97 | device_free(h_fmm->nodes); 98 | device_free(h_fmm->x); 99 | device_free(h_fmm->y); 100 | device_free(h_fmm->z); 101 | device_free(h_fmm->w); 102 | device_free(h_fmm->ax); 103 | device_free(h_fmm->ay); 104 | device_free(h_fmm->az); 105 | device_free(h_fmm->p); 106 | device_free(h_fmm->inner_factors); 107 | device_free(h_fmm->outer_factors); 108 | device_free(h_fmm->m); 109 | device_free(h_fmm->l); 110 | 111 | device_free(d_fmm); 112 | 113 | free(h_fmm); 114 | } 115 | 116 | template 117 | void update_device_array(T* d_array, T* h_array, size_t n) 118 | { 119 | #ifdef __CUDACC__ 120 | CUDACHK(cudaMemcpy(d_array, h_array, sizeof(T) * n, cudaMemcpyHostToDevice)); 121 | #else 122 | memcpy(d_array, h_array, sizeof(T) * n); 123 | #endif 124 | } 125 | 126 | template 127 | void update_host_array(T* h_array, T* d_array, size_t n) 128 | { 129 | #ifdef __CUDACC__ 130 | CUDACHK(cudaMemcpy(h_array, d_array, sizeof(T) * n, cudaMemcpyDeviceToHost)); 131 | #else 132 | memcpy(h_array, d_array, sizeof(T) * n); 133 | #endif 134 | } 135 | -------------------------------------------------------------------------------- /common/init.hh: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | template 8 | T rand_dist(T low, T high) 9 | { 10 | const T randn = (T)rand() / (T)RAND_MAX; 11 | return (low + randn * (high - low)); 12 | } 13 | 14 | template 15 | void init(FMM* fmm) 16 | { 17 | fmm->theta2 = fmm->theta * fmm->theta; 18 | fmm->num_multipoles = fmm->num_terms * fmm->num_terms; 19 | fmm->num_spharm_terms = fmm->num_terms * fmm->num_terms; 20 | 21 | fmm->x = (T*)malloc(sizeof(T) * fmm->num_points); 22 | fmm->y = (T*)malloc(sizeof(T) * fmm->num_points); 23 | fmm->z = (T*)malloc(sizeof(T) * fmm->num_points); 24 | fmm->w = (T*)malloc(sizeof(T) * fmm->num_points); 25 | fmm->ax = (T*)malloc(sizeof(T) * fmm->num_points); 26 | fmm->ay = (T*)malloc(sizeof(T) * fmm->num_points); 27 | fmm->az = (T*)malloc(sizeof(T) * fmm->num_points); 28 | fmm->p = (T*)malloc(sizeof(T) * fmm->num_points); 29 | 30 | srand(42); 31 | 32 | if (fmm->dist == FMM::Dist::Uniform) { 33 | for (size_t i = 0; i < fmm->num_points; ++i) { 34 | fmm->x[i] = static_cast(rand()) / static_cast(RAND_MAX); 35 | fmm->y[i] = static_cast(rand()) / static_cast(RAND_MAX); 36 | fmm->z[i] = static_cast(rand()) / static_cast(RAND_MAX); 37 | fmm->w[i] = static_cast(rand()) / static_cast(RAND_MAX); 38 | } 39 | } 40 | else if (fmm->dist == FMM::Dist::Plummer) { 41 | for (size_t i = 0; i < fmm->num_points; ++i) { 42 | T randn = (static_cast(rand()) / static_cast(RAND_MAX)); 43 | randn += (randn == static_cast(0.0)) 44 | ? std::numeric_limits::epsilon() 45 | : static_cast(0.0); 46 | randn -= (randn == static_cast(1.0)) 47 | ? 
std::numeric_limits::epsilon() 48 | : static_cast(0.0); 49 | const T radius = 50 | static_cast(1.0) / std::sqrt(std::pow(randn, (-2.0 / 3.0)) - 1.0); 51 | const T theta = std::acos(rand_dist(-1.0, 1.0)); 52 | const T phi = rand_dist(0.0, 2.0 * M_PI); 53 | fmm->x[i] = radius * std::sin(theta) * std::cos(phi); 54 | fmm->y[i] = radius * std::sin(theta) * std::sin(phi); 55 | fmm->z[i] = radius * std::cos(theta); 56 | fmm->w[i] = static_cast(1.0) / static_cast(fmm->num_points); 57 | } 58 | } 59 | else { 60 | fprintf(stderr, "error: unknown input distribution type\n"); 61 | exit(1); 62 | } 63 | 64 | std::fill(fmm->ax, fmm->ax + fmm->num_points, 0); 65 | std::fill(fmm->ay, fmm->ay + fmm->num_points, 0); 66 | std::fill(fmm->az, fmm->az + fmm->num_points, 0); 67 | std::fill(fmm->p, fmm->p + fmm->num_points, 0); 68 | 69 | int num_terms = fmm->num_terms; 70 | 71 | fmm->inner_factors = 72 | (complex_t*)malloc(sizeof(complex_t) * num_terms * num_terms); 73 | fmm->outer_factors = 74 | (complex_t*)malloc(sizeof(complex_t) * num_terms * num_terms); 75 | 76 | std::fill(fmm->inner_factors, fmm->inner_factors + (num_terms * num_terms), 77 | complex_t(static_cast(0.0), static_cast(0.0))); 78 | std::fill(fmm->outer_factors, fmm->outer_factors + (num_terms * num_terms), 79 | complex_t(static_cast(0.0), static_cast(0.0))); 80 | 81 | int max = 2 * num_terms - 1; 82 | T factorial[max]; 83 | factorial[0] = 1.0; 84 | for (int i = 1; i < max; ++i) factorial[i] = i * factorial[i - 1]; 85 | 86 | for (int n = 0; n < num_terms; ++n) { 87 | for (int m = -n; m <= n; ++m) { 88 | fmm->inner_factors[mult_idx(n, m)] = 89 | (std::pow(static_cast(-1.0), static_cast(n)) * 90 | imag_pow(std::abs(m))) / 91 | factorial[n + std::abs(m)]; 92 | fmm->outer_factors[mult_idx(n, m)] = 93 | imag_pow(-std::abs(m)) * factorial[n - std::abs(m)]; 94 | } 95 | } 96 | build_tree(fmm); 97 | } 98 | -------------------------------------------------------------------------------- /common/input.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | void print_help() { printf("help\n"); } 13 | 14 | template 15 | void parse_var(const char c, const char* optarg, FMM* fmm) 16 | { 17 | switch (c) { 18 | case 'n': 19 | fmm->num_points = std::atoi(optarg); 20 | break; 21 | case 'c': 22 | fmm->ncrit = std::atoi(optarg); 23 | break; 24 | case 't': 25 | fmm->num_terms = std::atoi(optarg); 26 | break; 27 | case 'e': 28 | fmm->theta = std::atof(optarg); 29 | break; 30 | case 'm': 31 | fmm->num_samples = std::atoi(optarg); 32 | break; 33 | case 'p': 34 | fmm->dist = FMM::Dist::Plummer; 35 | break; 36 | case 'u': 37 | fmm->dist = FMM::Dist::Uniform; 38 | break; 39 | case '?': 40 | fprintf(stderr, "error - %c not recognised or missing value\n", optopt); 41 | break; 42 | } 43 | } 44 | 45 | template 46 | void parse_input_file(const char* ifile, FMM* fmm) 47 | { 48 | std::ifstream ifs(ifile); 49 | if (!ifs.is_open()) { 50 | std::cerr << "error: could not open input file - " << ifile << std::endl; 51 | std::exit(1); 52 | } 53 | 54 | std::vector lines; 55 | 56 | std::string line; 57 | while (std::getline(ifs, line)) { 58 | std::stringstream ss(line); 59 | while (std::getline(ss, line, ' ')) { 60 | lines.push_back(line); 61 | } 62 | } 63 | 64 | int fargc = lines.size() + 1; 65 | char** fargv = (char**)malloc(sizeof(char*) * fargc); 66 | for (size_t i = 0; i < lines.size(); ++i) { 67 | fargv[i + 1] = (char*)malloc(sizeof(char) * lines[i].size()); 68 | strcpy(fargv[i + 1], lines[i].c_str()); 69 | } 70 | parse_args(fargc, fargv, fmm, true); 71 | } 72 | 73 | template 74 | void parse_args(int argc, char** argv, FMM* fmm, bool nested) 75 | { 76 | const static struct option long_params[] = { 77 | {"help", no_argument, NULL, 'h'}, 78 | {"npart", required_argument, NULL, 'n'}, 79 | {"ncrit", required_argument, NULL, 'c'}, 80 | {"nterms", 
required_argument, NULL, 't'}, 81 | {"theta", required_argument, NULL, 'e'}, 82 | {"nsamp", required_argument, NULL, 'm'}, 83 | {"ifile", required_argument, NULL, 'i'}, 84 | {"plummer", no_argument, NULL, 'p'}, 85 | {"uniform", no_argument, NULL, 'u'}, 86 | }; 87 | 88 | int c; 89 | optind = 1; 90 | opterr = 0; 91 | while ((c = getopt_long(argc, argv, "hpun:c:t:e:m:d:i:", long_params, NULL)) != 92 | -1) { 93 | switch (c) { 94 | case 'h': 95 | print_help(); 96 | // TODO check this leads to correct destruction. 97 | std::exit(0); 98 | case 'i': 99 | // stop input file arg being used inside an input file 100 | if (!nested) { 101 | const char* ifile = optarg; 102 | parse_input_file(ifile, fmm); 103 | // TODO fix this 104 | return; 105 | } 106 | break; 107 | default: 108 | parse_var(c, optarg, fmm); 109 | break; 110 | } 111 | } 112 | } 113 | 114 | template 115 | void read_input(int argc, char** argv, FMM* fmm) 116 | { 117 | const char* dist_strings[FMM::Dist::NumDist] = {"Uniform", "Plummer"}; 118 | fmm->num_points = 1000; 119 | fmm->ncrit = 20; 120 | fmm->num_terms = 4; 121 | fmm->theta = 0.5; 122 | fmm->num_samples = 1000; 123 | fmm->dist = FMM::Dist::Plummer; 124 | 125 | parse_args(argc, argv, fmm, false); 126 | 127 | fmm->num_samples = std::min(fmm->num_samples, fmm->num_points); 128 | 129 | std::cout << "FMM args\n" 130 | << "Num Points = " << fmm->num_points << '\n' 131 | << "NCrit = " << fmm->ncrit << '\n' 132 | << "Num Terms = " << fmm->num_terms << '\n' 133 | << "Theta = " << fmm->theta << '\n' 134 | << "Num Samples = " << fmm->num_samples << "\n" 135 | << "Distribution = " << dist_strings[fmm->dist] << "\n\n"; 136 | } 137 | -------------------------------------------------------------------------------- /common/kernels.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | void p2p_tiled(FMM* fmm, node_t* target) 8 | { 9 | const size_t ip = target->point_idx; 10 | 
for (size_t ii = 0; ii < target->num_points; ii += TILE_SIZE) { 11 | T xi[TILE_SIZE] = {0}; 12 | T yi[TILE_SIZE] = {0}; 13 | T zi[TILE_SIZE] = {0}; 14 | T ax[TILE_SIZE] = {0}; 15 | T ay[TILE_SIZE] = {0}; 16 | T az[TILE_SIZE] = {0}; 17 | T aw[TILE_SIZE] = {0}; 18 | const int ilim = std::min((size_t)TILE_SIZE, target->num_points - ii); 19 | for (int i = 0; i < ilim; ++i) { 20 | xi[i] = fmm->x[i + ii + ip]; 21 | yi[i] = fmm->y[i + ii + ip]; 22 | zi[i] = fmm->z[i + ii + ip]; 23 | } 24 | for (size_t j = 0; j < target->num_points; ++j) { 25 | for (int i = 0; i < TILE_SIZE; ++i) { 26 | const T dx = fmm->x[j + ip] - xi[i]; 27 | const T dy = fmm->y[j + ip] - yi[i]; 28 | const T dz = fmm->z[j + ip] - zi[i]; 29 | const T sw = fmm->w[j + ip]; 30 | const T r = dx * dx + dy * dy + dz * dz; 31 | const T inv_r = (r == 0.0) ? 0.0 : 1.0 / std::sqrt(r); 32 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 33 | ax[i] += dx * inv_r_3; 34 | ay[i] += dy * inv_r_3; 35 | az[i] += dz * inv_r_3; 36 | aw[i] += sw * inv_r; 37 | } 38 | } 39 | for (int i = 0; i < ilim; ++i) { 40 | fmm->ax[i + ii + ip] += ax[i]; 41 | fmm->ay[i + ii + ip] += ay[i]; 42 | fmm->az[i + ii + ip] += az[i]; 43 | fmm->p[i + ii + ip] += aw[i]; 44 | } 45 | } 46 | } 47 | 48 | template 49 | void p2p(FMM* fmm, node_t* target) 50 | { 51 | for (size_t i = 0; i < target->num_points; ++i) { 52 | const size_t ip = i + target->point_idx; 53 | const T xi = fmm->x[ip]; 54 | const T yi = fmm->y[ip]; 55 | const T zi = fmm->z[ip]; 56 | T ax = static_cast(0.0); 57 | T ay = static_cast(0.0); 58 | T az = static_cast(0.0); 59 | T p = static_cast(0.0); 60 | for (size_t j = 0; j < target->num_points; ++j) { 61 | const size_t jp = j + target->point_idx; 62 | const T dx = fmm->x[jp] - xi; 63 | const T dy = fmm->y[jp] - yi; 64 | const T dz = fmm->z[jp] - zi; 65 | const T r = dx * dx + dy * dy + dz * dz; 66 | const T inv_r = (r == 0.0) ? 
0.0 : 1.0 / std::sqrt(r); 67 | const T inv_r_3 = inv_r * inv_r * inv_r * fmm->w[jp]; 68 | ax += dx * inv_r_3; 69 | ay += dy * inv_r_3; 70 | az += dz * inv_r_3; 71 | p += fmm->w[jp] * inv_r; 72 | } 73 | fmm->ax[ip] += ax; 74 | fmm->ay[ip] += ay; 75 | fmm->az[ip] += az; 76 | fmm->p[ip] += p; 77 | } 78 | } 79 | 80 | template 81 | void p2p_tiled(FMM* fmm, node_t* target, node_t* source) 82 | { 83 | const size_t jp = source->point_idx; 84 | const size_t ip = target->point_idx; 85 | for (size_t ii = 0; ii < target->num_points; ii += TILE_SIZE) { 86 | T xi[TILE_SIZE] = {0}; 87 | T yi[TILE_SIZE] = {0}; 88 | T zi[TILE_SIZE] = {0}; 89 | T ax[TILE_SIZE] = {0}; 90 | T ay[TILE_SIZE] = {0}; 91 | T az[TILE_SIZE] = {0}; 92 | T aw[TILE_SIZE] = {0}; 93 | const int ilim = std::min((size_t)TILE_SIZE, target->num_points - ii); 94 | for (int i = 0; i < ilim; ++i) { 95 | xi[i] = fmm->x[i + ii + ip]; 96 | yi[i] = fmm->y[i + ii + ip]; 97 | zi[i] = fmm->z[i + ii + ip]; 98 | } 99 | for (size_t j = 0; j < source->num_points; ++j) { 100 | for (int i = 0; i < TILE_SIZE; ++i) { 101 | const T dx = fmm->x[j + jp] - xi[i]; 102 | const T dy = fmm->y[j + jp] - yi[i]; 103 | const T dz = fmm->z[j + jp] - zi[i]; 104 | const T sw = fmm->w[j + jp]; 105 | const T r = dx * dx + dy * dy + dz * dz; 106 | const T inv_r = 1.0 / std::sqrt(r); 107 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 108 | ax[i] += dx * inv_r_3; 109 | ay[i] += dy * inv_r_3; 110 | az[i] += dz * inv_r_3; 111 | aw[i] += sw * inv_r; 112 | } 113 | } 114 | for (int i = 0; i < ilim; ++i) { 115 | fmm->ax[i + ii + ip] += ax[i]; 116 | fmm->ay[i + ii + ip] += ay[i]; 117 | fmm->az[i + ii + ip] += az[i]; 118 | fmm->p[i + ii + ip] += aw[i]; 119 | } 120 | } 121 | } 122 | 123 | template 124 | void p2p(FMM* fmm, node_t* target, node_t* source) 125 | { 126 | for (size_t i = 0; i < target->num_points; ++i) { 127 | const size_t ip = i + target->point_idx; 128 | const T xi = fmm->x[ip]; 129 | const T yi = fmm->y[ip]; 130 | const T zi = fmm->z[ip]; 131 | T 
ax = static_cast(0.0); 132 | T ay = static_cast(0.0); 133 | T az = static_cast(0.0); 134 | T p = static_cast(0.0); 135 | for (size_t j = 0; j < source->num_points; ++j) { 136 | const size_t jp = j + source->point_idx; 137 | const T dx = fmm->x[jp] - xi; 138 | const T dy = fmm->y[jp] - yi; 139 | const T dz = fmm->z[jp] - zi; 140 | const T r = dx * dx + dy * dy + dz * dz; 141 | const T inv_r = 1.0 / std::sqrt(r); 142 | const T inv_r_3 = fmm->w[jp] * inv_r * inv_r * inv_r; 143 | ax += dx * inv_r_3; 144 | ay += dy * inv_r_3; 145 | az += dz * inv_r_3; 146 | p += fmm->w[jp] * inv_r; 147 | } 148 | fmm->ax[ip] += ax; 149 | fmm->ay[ip] += ay; 150 | fmm->az[ip] += az; 151 | fmm->p[ip] += p; 152 | } 153 | } 154 | 155 | template 156 | void m2l(FMM* fmm, node_t* target, node_t* source) 157 | { 158 | int num_terms = fmm->num_terms; 159 | T dx = target->cx - source->cx; 160 | T dy = target->cy - source->cy; 161 | T dz = target->cz - source->cz; 162 | complex_t outer[num_terms * num_terms]; 163 | T rho, alpha, beta; 164 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 165 | compute_outer<1>(fmm, rho, alpha, beta, outer); 166 | complex_t* Msource = &fmm->m[source->mult_idx]; 167 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 168 | for (int j = 0; j < num_terms; ++j) { 169 | for (int k = -j; k <= j; ++k) { 170 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 171 | for (int n = 0; n < num_terms - j; ++n) { 172 | for (int m = -n; m <= n; ++m) { 173 | tmp += Msource[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 174 | // blah 175 | } 176 | } 177 | Ltarget[mult_idx(j, k)] += tmp; 178 | } 179 | } 180 | } 181 | 182 | template 183 | void p2m(FMM* fmm, node_t* node) 184 | { 185 | int num_terms = fmm->num_terms; 186 | size_t pt_offset = node->point_idx; 187 | size_t mt_offset = node->mult_idx; 188 | for (size_t i = 0; i < node->num_points; ++i) { 189 | T dx = fmm->x[i + pt_offset] - node->cx; 190 | T dy = fmm->y[i + pt_offset] - node->cy; 191 | T dz = fmm->z[i + pt_offset] - node->cz; 
192 | complex_t inner[num_terms * num_terms]; 193 | T r, theta, phi; 194 | cart_to_sph(dx, dy, dz, r, theta, phi); 195 | compute_inner<1>(fmm, r, theta, phi, inner); 196 | for (int n = 0; n < num_terms; ++n) { 197 | for (int m = -n; m <= n; ++m) { 198 | fmm->m[mt_offset + mult_idx(n, m)] += 199 | fmm->w[i + pt_offset] * 200 | std::pow(static_cast(-1.0), static_cast(n)) * 201 | inner[mult_idx(n, m)]; 202 | } 203 | } 204 | } 205 | } 206 | 207 | template 208 | void m2m(FMM* fmm, node_t* node) 209 | { 210 | int num_terms = fmm->num_terms; 211 | for (size_t i = 0; i < node->num_children; ++i) { 212 | complex_t inner[num_terms * num_terms]; 213 | node_t* child = &fmm->nodes[node->child[i]]; 214 | T dx = node->cx - child->cx; 215 | T dy = node->cy - child->cy; 216 | T dz = node->cz - child->cz; 217 | T r, theta, phi; 218 | cart_to_sph(dx, dy, dz, r, theta, phi); 219 | compute_inner<1>(fmm, r, theta, phi, inner); 220 | const complex_t* Mchild = &fmm->m[child->mult_idx]; 221 | complex_t* Mnode = &fmm->m[node->mult_idx]; 222 | for (int j = 0; j < num_terms; ++j) { 223 | for (int k = -j; k <= j; ++k) { 224 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 225 | for (int n = 0; n <= j; ++n) { 226 | for (int m = -n; m <= n; ++m) { 227 | if (abs(k - m) <= j - n) 228 | tmp += Mchild[mult_idx(n, m)] * inner[mult_idx(j - n, k - m)]; 229 | } 230 | } 231 | Mnode[mult_idx(j, k)] += tmp; 232 | } 233 | } 234 | } 235 | } 236 | 237 | template 238 | void l2l(FMM* fmm, node_t* node) 239 | { 240 | int num_terms = fmm->num_terms; 241 | complex_t inner[num_terms * num_terms]; 242 | for (size_t i = 0; i < node->num_children; ++i) { 243 | node_t* child = &fmm->nodes[node->child[i]]; 244 | // TODO flip these? 
245 | T dx = child->cx - node->cx; 246 | T dy = child->cy - node->cy; 247 | T dz = child->cz - node->cz; 248 | T rho, alpha, beta; 249 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 250 | compute_inner<1>(fmm, rho, alpha, beta, inner); 251 | complex_t* Lnode = &fmm->l[node->mult_idx]; 252 | complex_t* Lchild = &fmm->l[child->mult_idx]; 253 | for (int j = 0; j < num_terms; ++j) { 254 | for (int k = -j; k <= j; ++k) { 255 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 256 | for (int n = j; n < num_terms; ++n) { 257 | for (int m = -n; m <= n; ++m) { 258 | if (std::abs(m - k) <= n - j) { 259 | tmp += Lnode[mult_idx(n, m)] * inner[mult_idx(n - j, m - k)]; 260 | } 261 | } 262 | } 263 | Lchild[mult_idx(j, k)] += tmp; 264 | } 265 | } 266 | } 267 | } 268 | 269 | template 270 | void l2p(FMM* fmm, node_t* node) 271 | { 272 | int num_terms = fmm->num_terms; 273 | complex_t inner[num_terms * num_terms]; 274 | complex_t inner_deriv[num_terms * num_terms]; 275 | size_t pt_offset = node->point_idx; 276 | complex_t* Lnode = &fmm->l[node->mult_idx]; 277 | for (size_t i = 0; i < node->num_points; ++i) { 278 | T dx = fmm->x[pt_offset + i] - node->cx; 279 | T dy = fmm->y[pt_offset + i] - node->cy; 280 | T dz = fmm->z[pt_offset + i] - node->cz; 281 | T r, theta, phi; 282 | cart_to_sph(dx, dy, dz, r, theta, phi); 283 | compute_inner<2>(fmm, r, theta, phi, inner, inner_deriv); 284 | 285 | T Psum = static_cast(0.0); 286 | T rsum = static_cast(0.0); 287 | T tsum = static_cast(0.0); 288 | T psum = static_cast(0.0); 289 | T two = static_cast(2.0); 290 | complex_t ci(static_cast(0.0), static_cast(1.0)); 291 | for (int n = 0; n < num_terms; ++n) { 292 | int m = 0; 293 | Psum += complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 294 | rsum += static_cast(n) * 295 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 296 | tsum += complex_real(Lnode[mult_idx(n, m)] * inner_deriv[mult_idx(n, m)]); 297 | psum += static_cast(m) * 298 | complex_real(Lnode[mult_idx(n, m)] * 
inner[mult_idx(n, m)] * ci); 299 | for (int m = 1; m <= n; ++m) { 300 | Psum += 301 | two * complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 302 | rsum += two * static_cast(n) * 303 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 304 | tsum += two * complex_real(Lnode[mult_idx(n, m)] * 305 | inner_deriv[mult_idx(n, m)]); 306 | psum += 307 | two * static_cast(m) * 308 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)] * ci); 309 | } 310 | } 311 | T inv_r = (r == static_cast(0.0)) ? 0.0 : static_cast(1.0) / r; 312 | rsum *= inv_r; 313 | tsum *= inv_r; 314 | psum *= inv_r; 315 | psum *= (theta == static_cast(0.0)) 316 | ? 0.0 317 | : static_cast(1.0) / std::sin(theta); 318 | T ax, ay, az; 319 | sph_unit_to_cart_unit(r, theta, phi, rsum, tsum, psum, ax, ay, az); 320 | fmm->p[pt_offset + i] += Psum; 321 | fmm->ax[pt_offset + i] += ax; 322 | fmm->ay[pt_offset + i] += ay; 323 | fmm->az[pt_offset + i] += az; 324 | } 325 | } 326 | 327 | template 328 | void m2p(FMM* fmm, node_t* target, node_t* source) 329 | { 330 | int num_terms = fmm->num_terms; 331 | size_t target_pt_offset = target->point_idx; 332 | size_t source_mt_offset = source->mult_idx; 333 | for (size_t i = 0; i < target->num_points; ++i) { 334 | T dx = fmm->x[target_pt_offset + i] - source->cx; 335 | T dy = fmm->y[target_pt_offset + i] - source->cy; 336 | T dz = fmm->z[target_pt_offset + i] - source->cz; 337 | T r, theta, phi; 338 | cart_to_sph(dx, dy, dz, r, theta, phi); 339 | complex_t outer[num_terms * num_terms]; 340 | complex_t outer_deriv[num_terms * num_terms]; 341 | compute_outer<2>(fmm, r, theta, phi, outer, outer_deriv); 342 | T Psum = static_cast(0.0); 343 | T rsum = static_cast(0.0); 344 | T tsum = static_cast(0.0); 345 | T psum = static_cast(0.0); 346 | T two = static_cast(2.0); 347 | const complex_t* M = &fmm->m[source_mt_offset]; 348 | const complex_t ci(static_cast(0.0), static_cast(1.0)); 349 | for (int n = 0; n < num_terms; ++n) { 350 | int m = 0; 351 | Psum 
+= (outer[mult_idx(n, -m)] * M[mult_idx(n, m)]).real(); 352 | rsum += -static_cast(n + 1) * 353 | complex_real(outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 354 | tsum += complex_real(outer_deriv[mult_idx(n, -m)] * M[mult_idx(n, m)]); 355 | psum += static_cast(m) * 356 | complex_real(ci * outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 357 | for (m = 1; m <= n; ++m) { 358 | Psum += two * complex_real(outer[mult_idx(n, -m)] * 359 | fmm->m[source_mt_offset + mult_idx(n, m)]); 360 | rsum += two * -static_cast(n + 1) * 361 | complex_real(outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 362 | tsum += two * 363 | complex_real(outer_deriv[mult_idx(n, -m)] * M[mult_idx(n, m)]); 364 | psum += two * static_cast(m) * 365 | complex_real(ci * outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 366 | } 367 | } 368 | rsum *= static_cast(1.0) / r; 369 | tsum *= static_cast(1.0) / r; 370 | psum *= static_cast(1.0) / r; 371 | psum /= std::sin(theta); 372 | T ax, ay, az; 373 | sph_unit_to_cart_unit(r, theta, phi, rsum, tsum, psum, ax, ay, az); 374 | fmm->p[target_pt_offset + i] += Psum; 375 | fmm->ax[target_pt_offset + i] += ax; 376 | fmm->ay[target_pt_offset + i] += ay; 377 | fmm->az[target_pt_offset + i] += az; 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /common/kokkos-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define KOKKOS 4 | 5 | template 6 | KOKKOS_INLINE_FUNCTION void lock(T* val) 7 | { 8 | while (!Kokkos::atomic_compare_exchange(val, 0, 1)) 9 | ; 10 | #ifdef __CUDA_ARCH__ 11 | __threadfence(); 12 | #endif 13 | } 14 | 15 | template 16 | KOKKOS_INLINE_FUNCTION void unlock(T* val) 17 | { 18 | #ifdef __CUDA_ARCH__ 19 | __threadfence(); 20 | #endif 21 | while (Kokkos::atomic_compare_exchange(val, 1, 0)) 22 | ; 23 | } 24 | 25 | #ifdef __CUDACC__ 26 | #define INLINE __device__ __inline__ 27 | 28 | template 29 | class gpu_utils { 30 | public: 31 | static const int 
warp_size = 32; 32 | static const int num_threads = NTHREADS; 33 | 34 | INLINE static int thread_id() { return threadIdx.y; } 35 | INLINE static int worker_id() { return threadIdx.z; } 36 | INLINE static int global_worker_id() 37 | { 38 | return blockIdx.x * blockDim.z + worker_id(); 39 | } 40 | 41 | template ::type* = nullptr> 43 | INLINE static void sync_worker() 44 | { 45 | __syncwarp(); 46 | } 47 | 48 | template 32)>::type* = nullptr> 50 | INLINE static void sync_worker() 51 | { 52 | __syncthreads(); 53 | } 54 | }; 55 | #else 56 | #define INLINE inline 57 | #endif 58 | -------------------------------------------------------------------------------- /common/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | size_t level; 34 | 35 | HOSTDEVICE 36 | bool is_leaf() const { return (num_children == 0); } 37 | }; 38 | -------------------------------------------------------------------------------- /common/spharm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | HOSTDEVICE void compute_legendre(size_t nmax, T x, T* P, T* P_deriv = nullptr); 8 | 9 | template 10 | void compute_inner(FMM* fmm, T r, T theta, T phi, complex_t* inner, 
11 | complex_t* inner_deriv = nullptr) 12 | { 13 | if (fmm->num_terms == 0) return; 14 | //// TODO this can be reduced as we only calculate the 'positive' legendre 15 | /// vals 16 | T legendre[fmm->num_terms * fmm->num_terms]; 17 | T legendre_deriv[fmm->num_terms * fmm->num_terms]; 18 | // TODO forward order to compute_legendre 19 | if (order == 1) { 20 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre); 21 | } 22 | if (order == 2) { 23 | compute_legendre<2>(fmm->num_terms - 1, std::cos(theta), legendre, 24 | legendre_deriv); 25 | } 26 | 27 | const complex_t i = complex_t(static_cast(0.0), static_cast(1.0)); 28 | 29 | for (int n = 0; n < fmm->num_terms; ++n) { 30 | inner[mult_idx(n, 0)] = fmm->inner_factors[mult_idx(n, 0)] * 31 | legendre[leg_idx(n, 0)] * 32 | complex_exp(i * static_cast(0) * phi) * 33 | std::pow(r, static_cast(n)); 34 | if (order == 2) 35 | inner_deriv[mult_idx(n, 0)] = fmm->inner_factors[mult_idx(n, 0)] * 36 | legendre_deriv[leg_idx(n, 0)] * 37 | complex_exp(i * static_cast(0) * phi) * 38 | std::pow(r, static_cast(n)); 39 | for (int m = 1; m <= n; ++m) { 40 | inner[mult_idx(n, m)] = fmm->inner_factors[mult_idx(n, m)] * 41 | legendre[leg_idx(n, m)] * 42 | complex_exp(i * static_cast(m) * phi) * 43 | std::pow(r, static_cast(n)); 44 | inner[mult_idx(n, -m)] = 45 | std::pow(static_cast(-1.0), static_cast(m)) * 46 | complex_conj(inner[mult_idx(n, m)]); 47 | if (order == 2) { 48 | inner_deriv[mult_idx(n, m)] = fmm->inner_factors[mult_idx(n, m)] * 49 | legendre_deriv[leg_idx(n, m)] * 50 | complex_exp(i * static_cast(m) * phi) * 51 | std::pow(r, static_cast(n)); 52 | inner_deriv[mult_idx(n, -m)] = 53 | std::pow(static_cast(-1.0), static_cast(m)) * 54 | complex_conj(inner_deriv[mult_idx(n, m)]); 55 | } 56 | } 57 | } 58 | } 59 | 60 | template 61 | void compute_outer(FMM* fmm, T r, T theta, T phi, complex_t* outer, 62 | complex_t* outer_deriv = nullptr) 63 | { 64 | if (fmm->num_terms == 0) return; 65 | // TODO this can be reduced as we 
only calculate the 'positive' legendre vals 66 | T legendre[fmm->num_terms * fmm->num_terms]; 67 | T legendre_deriv[fmm->num_terms * fmm->num_terms]; 68 | 69 | if (order == 1) 70 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre); 71 | if (order == 2) { 72 | compute_legendre<2>(fmm->num_terms - 1, std::cos(theta), legendre, 73 | legendre_deriv); 74 | } 75 | 76 | const complex_t i = complex_t(static_cast(0.0), static_cast(1.0)); 77 | 78 | for (int n = 0; n < fmm->num_terms; ++n) { 79 | for (int m = 0; m <= n; ++m) { 80 | outer[mult_idx(n, m)] = 81 | fmm->outer_factors[mult_idx(n, m)] * legendre[leg_idx(n, m)] * 82 | complex_exp(i * static_cast(m) * phi) * 83 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 84 | outer[mult_idx(n, -m)] = 85 | std::pow(static_cast(-1.0), static_cast(m)) * 86 | complex_conj(outer[mult_idx(n, m)]); 87 | if (order == 2) { 88 | outer_deriv[mult_idx(n, m)] = 89 | fmm->outer_factors[mult_idx(n, m)] * legendre_deriv[leg_idx(n, m)] * 90 | complex_exp(i * static_cast(m) * phi) * 91 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 92 | outer_deriv[mult_idx(n, -m)] = 93 | std::pow(static_cast(-1.0), static_cast(m)) * 94 | complex_conj(outer_deriv[mult_idx(n, m)]); 95 | // TODO negative derivs may need to be calculated 96 | } 97 | } 98 | } 99 | } 100 | 101 | // TODO test with 'if constexpr' with order (C++17 feature) 102 | template 103 | HOSTDEVICE void compute_legendre(size_t nmax, T x, T* P, T* P_deriv) 104 | { 105 | const T csphase = static_cast(-1.0); 106 | const T one = static_cast(1.0); 107 | const T u = (x == 1.0) ? 0.0 : std::sqrt((one - x) * (one + x)); 108 | const T uinv = (u == static_cast(0.0)) ? 
static_cast(0.0) : one / u; 109 | const T xbyu = x * uinv; 110 | size_t n, m; 111 | size_t k, idxmm; 112 | T pnm, pmm, pm1, pm2, twomm1; 113 | pm2 = one; 114 | pm1 = x; 115 | 116 | P[0] = pm2; 117 | if (order >= 2) P_deriv[0] = static_cast(0.0); 118 | if (nmax == 0) return; 119 | P[1] = pm1; 120 | if (order >= 2) P_deriv[1] = -u; 121 | 122 | k = 1; 123 | for (n = 2; n <= nmax; ++n) { 124 | k += n; 125 | pnm = (static_cast(2 * n - 1) * x * pm1 - static_cast(n - 1) * pm2) / 126 | static_cast(n); 127 | P[k] = pnm; 128 | if (order >= 2) P_deriv[k] = -static_cast(n) * (pm1 - x * pnm) * uinv; 129 | pm2 = pm1; 130 | pm1 = pnm; 131 | } 132 | 133 | pmm = one; 134 | twomm1 = -one; 135 | idxmm = 0; 136 | for (m = 1; m <= nmax - 1; ++m) { 137 | idxmm += m + 1; 138 | twomm1 += static_cast(2.0); 139 | pmm *= csphase * u * twomm1; 140 | P[idxmm] = pmm; 141 | if (order >= 2) P_deriv[idxmm] = static_cast(m) * xbyu * pmm; 142 | pm2 = pmm; 143 | k = idxmm + m + 1; 144 | pm1 = x * pmm * static_cast(2 * m + 1); 145 | P[k] = pm1; 146 | if (order >= 2) 147 | P_deriv[k] = -uinv * (static_cast(2 * m + 1) * pmm - 148 | static_cast(m + 1) * x * pm1); 149 | 150 | for (n = m + 2; n <= nmax; ++n) { 151 | k += n; 152 | pnm = (static_cast(2 * n - 1) * x * pm1 - 153 | static_cast(n + m - 1) * pm2) / 154 | static_cast(n - m); 155 | P[k] = pnm; 156 | if (order >= 2) 157 | P_deriv[k] = 158 | -uinv * (static_cast(n + m) * pm1 - static_cast(n) * x * pnm); 159 | pm2 = pm1; 160 | pm1 = pnm; 161 | } 162 | } 163 | 164 | idxmm += m + 1; 165 | twomm1 += static_cast(2.0); 166 | pmm *= csphase * u * twomm1; 167 | P[idxmm] = pmm; 168 | if (order >= 2) P_deriv[idxmm] = static_cast(nmax) * x * pmm * uinv; 169 | } 170 | -------------------------------------------------------------------------------- /common/timer.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | class Timer { 6 | private: 7 | double wtime() 8 | { 9 | struct timeval t; 10 
| gettimeofday(&t, NULL); 11 | return t.tv_sec + t.tv_usec * 1.0E-6; 12 | } 13 | 14 | public: 15 | inline void start() { this->tick = wtime(); } 16 | inline void stop() 17 | { 18 | this->tock = wtime(); 19 | this->elaps = this->tock - this->tick; 20 | } 21 | double elapsed() { return this->elaps; } 22 | 23 | private: 24 | double tick, tock, elaps; 25 | }; 26 | -------------------------------------------------------------------------------- /common/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | template 9 | void upwards_pass(FMM* fmm, node_t* node) 10 | { 11 | for (size_t i = 0; i < node->num_children; ++i) { 12 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 13 | } 14 | 15 | if (node->is_leaf()) 16 | p2m(fmm, node); 17 | else 18 | m2m(fmm, node); 19 | } 20 | 21 | template 22 | void downwards_pass(FMM* fmm, node_t* node) 23 | { 24 | if (node->is_leaf()) 25 | l2p(fmm, node); 26 | else { 27 | l2l(fmm, node); 28 | for (size_t i = 0; i < node->num_children; ++i) { 29 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 30 | } 31 | } 32 | } 33 | 34 | template 35 | void dual_tree(FMM* fmm, node_t* target, node_t* source) 36 | { 37 | T dx = source->cx - target->cx; 38 | T dy = source->cy - target->cy; 39 | T dz = source->cz - target->cz; 40 | T r2 = dx * dx + dy * dy + dz * dz; 41 | T d1 = source->rad * static_cast(2.0); 42 | T d2 = target->rad * static_cast(2.0); 43 | 44 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 45 | m2l(fmm, target, source); 46 | } 47 | else if (source->is_leaf() && target->is_leaf()) { 48 | if (target == source) 49 | p2p_tiled(fmm, target); 50 | else 51 | p2p_tiled(fmm, target, source); 52 | } 53 | else { 54 | T target_sz = target->rad; 55 | T source_sz = source->rad; 56 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 57 | for (size_t i = 0; i < target->num_children; ++i) { 58 | node_t* child = 
&fmm->nodes[target->child[i]]; 59 | dual_tree(fmm, child, source); 60 | } 61 | } 62 | else { 63 | for (size_t i = 0; i < source->num_children; ++i) { 64 | dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 65 | } 66 | } 67 | } 68 | } 69 | 70 | template 71 | void perform_traversals(FMM* fmm) 72 | { 73 | printf("Running in serial\n"); 74 | 75 | Timer timer; 76 | Timer tot_timer; 77 | 78 | timer.start(); 79 | tot_timer.start(); 80 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 81 | timer.stop(); 82 | printf("\n"); 83 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 84 | 85 | timer.start(); 86 | dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 87 | timer.stop(); 88 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 89 | 90 | timer.start(); 91 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 92 | timer.stop(); 93 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 94 | 95 | tot_timer.stop(); 96 | printf("--------------------\n"); 97 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 98 | printf("--------------------\n\n"); 99 | } 100 | -------------------------------------------------------------------------------- /common/tree.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | void get_bound_box(FMM* fmm, size_t start, size_t end, 12 | std::array, 3>& lims) 13 | { 14 | lims[0] = std::minmax_element(&fmm->x[start], &fmm->x[end]); 15 | lims[1] = std::minmax_element(&fmm->y[start], &fmm->y[end]); 16 | lims[2] = std::minmax_element(&fmm->z[start], &fmm->z[end]); 17 | } 18 | 19 | // stable in-place mergesort 20 | template 21 | void reorder(T* x, T* y, T* z, T* w, std::vector indices, size_t start, 22 | size_t end) 23 | { 24 | // for (size_t i = start; i < end; ++i) { 25 | // const T tx = x[i]; 26 | // const T ty = y[i]; 27 | // const T tz = z[i]; 
28 | // size_t j = i; 29 | // while (true) { 30 | // size_t k = indices[j]; 31 | // indices[j] = j; 32 | // if (k == i) break; 33 | // if (k >= end) { 34 | // printf("problem k = %zu\n", k); 35 | // exit(1); 36 | // } 37 | // x[j] = x[k]; 38 | // y[j] = y[k]; 39 | // z[j] = z[k]; 40 | // j = k; 41 | // } 42 | // x[j] = tx; 43 | // y[j] = ty; 44 | // z[j] = tz; 45 | //} 46 | std::vector temp(end - start); 47 | for (size_t i = start; i < end; ++i) temp[i - start] = x[indices[i - start]]; 48 | for (size_t i = start; i < end; ++i) x[i] = temp[i - start]; 49 | 50 | for (size_t i = start; i < end; ++i) temp[i - start] = y[indices[i - start]]; 51 | for (size_t i = start; i < end; ++i) y[i] = temp[i - start]; 52 | 53 | for (size_t i = start; i < end; ++i) temp[i - start] = z[indices[i - start]]; 54 | for (size_t i = start; i < end; ++i) z[i] = temp[i - start]; 55 | 56 | for (size_t i = start; i < end; ++i) temp[i - start] = w[indices[i - start]]; 57 | for (size_t i = start; i < end; ++i) w[i] = temp[i - start]; 58 | } 59 | 60 | template 61 | size_t construct_tree(FMM* fmm, std::vector>& nodes, size_t start, 62 | size_t end, int depth, T cx, T cy, T cz, T rad) 63 | { 64 | const size_t node_idx = nodes.size(); 65 | nodes.push_back( 66 | node_t(cx, cy, cz, rad, end - start, start, 0, node_idx, depth)); 67 | 68 | // const size_t num_points = end - start + 1; 69 | if (end - start <= fmm->ncrit) { 70 | } 71 | else { 72 | std::vector indices(end - start); 73 | std::vector octants(end - start); 74 | 75 | size_t num_oct[8] = {0}; 76 | size_t oct_beg[8] = {0}; 77 | 78 | for (size_t i = start; i < end; ++i) { 79 | const size_t oct = 80 | ((fmm->x[i] > cx) << 2) | ((fmm->y[i] > cy) << 1) | (fmm->z[i] > cz); 81 | octants[i - start] = oct; 82 | num_oct[oct]++; 83 | } 84 | 85 | std::partial_sum(num_oct, num_oct + 7, oct_beg + 1); 86 | 87 | std::iota(indices.begin(), indices.end(), 0); 88 | std::sort(indices.begin(), indices.end(), 89 | [&octants](const size_t i, const size_t j) { 90 | 
return (octants[i] < octants[j]); 91 | }); 92 | std::for_each(indices.begin(), indices.end(), 93 | [start](size_t& i) { i += start; }); 94 | 95 | reorder(fmm->x, fmm->y, fmm->z, fmm->w, indices, start, end); 96 | 97 | size_t child[8] = {0}; 98 | size_t num_children = 0; 99 | 100 | for (size_t i = 0; i < 8; ++i) { 101 | T nrad = rad / 2.0; 102 | T ncx = ((i >> 2) & 1) ? (cx + nrad) : (cx - nrad); 103 | T ncy = ((i >> 1) & 1) ? (cy + nrad) : (cy - nrad); 104 | T ncz = ((i >> 0) & 1) ? (cz + nrad) : (cz - nrad); 105 | 106 | if (num_oct[i]) { 107 | // offset oct ptrs by start of the current points array 108 | child[num_children] = construct_tree(fmm, nodes, start + oct_beg[i], 109 | start + oct_beg[i] + num_oct[i], 110 | depth + 1, ncx, ncy, ncz, nrad); 111 | num_children++; 112 | } 113 | } 114 | nodes[node_idx].num_children = num_children; 115 | for (size_t i = 0; i < num_children; ++i) { 116 | nodes[node_idx].child[i] = child[i]; 117 | } 118 | } 119 | return node_idx; 120 | } 121 | 122 | template 123 | void build_tree(FMM* fmm) 124 | { 125 | Timer timer; 126 | timer.start(); 127 | 128 | std::array, 3> lims; 129 | get_bound_box(fmm, 0, fmm->num_points, lims); 130 | 131 | T cx = (*lims[0].second + *lims[0].first) / static_cast(2.0); 132 | T cy = (*lims[1].second + *lims[1].first) / static_cast(2.0); 133 | T cz = (*lims[2].second + *lims[2].first) / static_cast(2.0); 134 | 135 | std::array radii; 136 | for (int i = 0; i < 3; ++i) { 137 | radii[i] = (*lims[i].second - *lims[i].first) / static_cast(2.0); 138 | } 139 | 140 | T rad = *std::max_element(radii.begin(), radii.end()); 141 | // make sure no points lie on the edge of the node 142 | rad += std::numeric_limits::epsilon(); 143 | 144 | std::vector> nodes; 145 | fmm->root = 146 | construct_tree(fmm, nodes, 0, fmm->num_points, 0, cx, cy, cz, rad); 147 | 148 | fmm->num_nodes = nodes.size(); 149 | fmm->nodes = (node_t*)malloc(sizeof(node_t) * fmm->num_nodes); 150 | for (size_t n = 0; n < fmm->num_nodes; ++n) { 151 | 
fmm->nodes[n] = nodes[n]; 152 | } 153 | timer.stop(); 154 | // printf("built tree in %fs\n", timer.elapsed()); 155 | 156 | printf("num_nodes = %zu\n", fmm->num_nodes); 157 | 158 | printf("root %zu has %zu children\n", fmm->root, 159 | fmm->nodes[fmm->root].num_children); 160 | 161 | // Now we know the number of nodes we can allocate the multipole storage and 162 | // assign to each node 163 | fmm->m = (complex_t*)calloc(fmm->num_multipoles * fmm->num_nodes, 164 | sizeof(complex_t)); 165 | fmm->l = (complex_t*)calloc(fmm->num_multipoles * fmm->num_nodes, 166 | sizeof(complex_t)); 167 | 168 | size_t max_depth = 0; 169 | for (size_t n = 0; n < fmm->num_nodes; ++n) { 170 | fmm->nodes[n].mult_idx = n * fmm->num_multipoles; 171 | max_depth = std::max(max_depth, fmm->nodes[n].level); 172 | } 173 | printf("max tree depth = %zu\n", max_depth); 174 | } 175 | -------------------------------------------------------------------------------- /common/utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #if defined(__x86_64__) 7 | #if defined (__AVX512F__) 8 | #ifdef FMM_DOUBLE 9 | #define TILE_SIZE 8 10 | #else 11 | #define TILE_SIZE 64 12 | #endif 13 | #elif defined (__AVX2__) 14 | #define TILE_SIZE 16 15 | #else 16 | #define TILE_SIZE 8 17 | #endif 18 | #elif defined (__aarch64__) 19 | #define TILE_SIZE 4 20 | #else 21 | #warning architecture not supported 22 | #define TILE_SIZE 32 23 | #endif 24 | 25 | #ifndef M_PI 26 | #define M_PI 3.14159265358979323846 27 | #endif 28 | 29 | //#define XSTR(x) STR(x) 30 | //#define STR(x) #x 31 | //#pragma message "tile size = " XSTR(TILE_SIZE) 32 | 33 | HOSTDEVICE inline int mult_idx(const int n, const int m) 34 | { 35 | return n * n + n + m; 36 | } 37 | 38 | HOSTDEVICE inline int leg_idx(const int n, const int m) 39 | { 40 | return (n * (n + 1)) / 2 + std::abs(m); 41 | } 42 | 43 | HOSTDEVICE inline void inv_mult_idx(const int i, int& n, int& m) 44 | 
{ 45 | n = (int)sqrtf((float)i); 46 | m = i - n * n - n; 47 | } 48 | 49 | template 50 | HOSTDEVICE void sph_unit_to_cart_unit(T r, T theta, T phi, T rsum, T tsum, 51 | T psum, T& ax, T& ay, T& az) 52 | { 53 | ax = std::sin(theta) * std::cos(phi) * rsum + 54 | std::cos(theta) * std::cos(phi) * tsum - std::sin(phi) * psum; 55 | ay = std::sin(theta) * std::sin(phi) * rsum + 56 | std::cos(theta) * std::sin(phi) * tsum + std::cos(phi) * psum; 57 | az = std::cos(theta) * rsum - std::sin(theta) * tsum; 58 | } 59 | 60 | template 61 | HOSTDEVICE T get_eps(); 62 | template <> 63 | HOSTDEVICE float get_eps() { return 1e-6; } 64 | template <> 65 | HOSTDEVICE double get_eps() { return 1e-14; } 66 | 67 | template 68 | HOSTDEVICE inline void cart_to_sph(T x, T y, T z, T& r, T& theta, T& phi) 69 | { 70 | const T eps = get_eps(); 71 | r = std::sqrt(x * x + y * y + z * z) + eps; 72 | theta = std::acos(z / r); 73 | phi = std::atan2(y, x); 74 | } 75 | 76 | -------------------------------------------------------------------------------- /common/verify.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | T calc_error(T* p, T* test_p, size_t n) 5 | { 6 | T diff = static_cast(0.0); 7 | T norm = static_cast(0.0); 8 | for (size_t i = 0; i < n; ++i) { 9 | diff += (p[i] - test_p[i]) * (p[i] - test_p[i]); 10 | norm += test_p[i] * test_p[i]; 11 | } 12 | return std::sqrt(diff / norm); 13 | } 14 | 15 | template 16 | T calc_error(T* x, T* y, T* z, T* test_x, T* test_y, T* test_z, size_t n) 17 | { 18 | T diff = static_cast(0.0); 19 | T norm = static_cast(0.0); 20 | for (size_t i = 0; i < n; ++i) { 21 | T dx = (x[i] - test_x[i]) * (x[i] - test_x[i]); 22 | T dy = (y[i] - test_y[i]) * (y[i] - test_y[i]); 23 | T dz = (z[i] - test_z[i]) * (z[i] - test_z[i]); 24 | diff += dx + dy + dz; 25 | norm += (test_x[i] * test_x[i]) + (test_y[i] * test_y[i]) + 26 | (test_z[i] * test_z[i]); 27 | } 28 | return std::sqrt(diff/norm); 29 | } 30 | 31 | 
template 32 | void verify(FMM* fmm) 33 | { 34 | std::vector test_ax(fmm->num_samples); 35 | std::vector test_ay(fmm->num_samples); 36 | std::vector test_az(fmm->num_samples); 37 | std::vector test_p(fmm->num_samples); 38 | 39 | #pragma omp parallel for 40 | for (size_t i = 0; i < fmm->num_samples; ++i) { 41 | for (size_t j = 0; j < fmm->num_points; ++j) { 42 | if (i == j) continue; 43 | const T dx = fmm->x[j] - fmm->x[i]; 44 | const T dy = fmm->y[j] - fmm->y[i]; 45 | const T dz = fmm->z[j] - fmm->z[i]; 46 | const T r = std::sqrt(dx * dx + dy * dy + dz * dz); 47 | const T inv_r = static_cast(1.0) / r; 48 | const T inv_r_3 = inv_r * inv_r * inv_r; 49 | const T s = inv_r_3 * fmm->w[j]; 50 | test_ax[i] += s * dx; 51 | test_ay[i] += s * dy; 52 | test_az[i] += s * dz; 53 | test_p[i] += inv_r * fmm->w[j]; 54 | } 55 | } 56 | //for (size_t i = 0; i < fmm->num_samples; ++i) { 57 | // printf("%f vs %f\n", fmm->p[i], test_p[i]); 58 | //} 59 | printf("pot err = %.12e\n", calc_error(fmm->p, &test_p[0], fmm->num_samples)); 60 | printf("acc err = %.12e\n", 61 | calc_error(fmm->ax, fmm->ay, fmm->az, &test_ax[0], &test_ay[0], 62 | &test_az[0], fmm->num_samples)); 63 | } 64 | -------------------------------------------------------------------------------- /cuda/cuda-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define KOKKOS 4 | 5 | template 6 | KOKKOS_INLINE_FUNCTION void lock(T* val) 7 | { 8 | while (0 != Kokkos::atomic_compare_exchange(val, 0, 1)) 9 | ; 10 | #ifdef __CUDA_ARCH__ 11 | __threadfence(); 12 | #endif 13 | } 14 | 15 | template 16 | KOKKOS_INLINE_FUNCTION void unlock(T* val) 17 | { 18 | #ifdef __CUDA_ARCH__ 19 | __threadfence(); 20 | #endif 21 | while (1 != Kokkos::atomic_compare_exchange(val, 1, 0)) 22 | ; 23 | } 24 | 25 | #ifdef __CUDACC__ 26 | #define INLINE __device__ __inline__ 27 | 28 | template 29 | class gpu_utils { 30 | public: 31 | static const int warp_size = 32; 32 | static const int 
num_threads = NTHREADS; 33 | 34 | INLINE static int thread_id() { return threadIdx.y; } 35 | INLINE static int worker_id() { return threadIdx.z; } 36 | INLINE static int global_worker_id() 37 | { 38 | return blockIdx.x * blockDim.z + worker_id(); 39 | } 40 | 41 | template ::type* = nullptr> 43 | INLINE static void sync_worker() 44 | { 45 | __syncwarp(); 46 | } 47 | 48 | template 32)>::type* = nullptr> 50 | INLINE static void sync_worker() 51 | { 52 | __syncthreads(); 53 | } 54 | }; 55 | #else 56 | #define INLINE inline 57 | #endif 58 | -------------------------------------------------------------------------------- /cuda/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_NVCC=nvcc 2 | CC=$(CC_$(COMPILER)) 3 | 4 | ifneq ($(COMPILER), NVCC) 5 | $(error Only NVCC support for this version of MiniFMM) 6 | endif 7 | 8 | ARCH=sm_60 9 | 10 | CFLAGS_NVCC=-std=c++11 -O3 -ftz=true --use_fast_math -x cu -Xcompiler -fopenmp -arch=$(ARCH) 11 | CFLAGS=$(CFLAGS_$(COMPILER)) 12 | 13 | LIBS=-Xcompiler -fopenmp 14 | 15 | -------------------------------------------------------------------------------- /cuda/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 
34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /cuda/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #define INLINE __device__ __inline__ 10 | 11 | // for compatability with generic gpu kernels 12 | namespace gpu_utils { 13 | INLINE static int thread_id() { return threadIdx.x; } 14 | INLINE static int worker_id() { return 0; } 15 | INLINE static void sync_worker() { __syncthreads(); } 16 | } // namespace gpu_utils 17 | 18 | #include 19 | 20 | template 21 | void upwards_pass(FMM* fmm, node_t* node) 22 | { 23 | for (size_t i = 0; i < node->num_children; ++i) { 24 | #pragma omp task 25 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 26 | } 27 | #pragma omp taskwait 28 | 29 | if (node->is_leaf()) 30 | p2m(fmm, node); 31 | else 32 | m2m(fmm, node); 33 | } 34 | 35 | template 36 | void downwards_pass(FMM* fmm, node_t* node) 37 | { 38 | if (node->is_leaf()) 39 | l2p(fmm, node); 40 | else { 41 | l2l(fmm, node); 42 | for (size_t i = 0; i < node->num_children; ++i) { 43 | #pragma omp task 44 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 45 | } 46 | } 47 | #pragma omp taskwait 48 | } 49 | 50 | template 51 | __global__ void p2p_kernel(FMM* d_fmm, size_t* d_p2p_nodes, 52 | size_t* d_p2p_deps_array, size_t* d_p2p_deps_offsets, 53 | size_t* d_p2p_deps_sizes) 54 | { 55 | const int i = blockIdx.x; 56 | node_t* target = d_fmm->nodes + d_p2p_nodes[i]; 57 | size_t num_deps = d_p2p_deps_sizes[i]; 58 | size_t offset = d_p2p_deps_offsets[i]; 59 | for (size_t j = 0; j < num_deps; ++j) { 60 | size_t source_idx = d_p2p_deps_array[offset + j]; 61 | node_t* source = d_fmm->nodes + source_idx; 62 | if (target == source) 63 | p2p_gpu<128, 128, 1, 128, 
1>(d_fmm, target); 64 | else 65 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target, source); 66 | } 67 | } 68 | 69 | template 70 | __global__ void m2l_kernel(FMM* d_fmm, size_t* d_m2l_nodes, 71 | size_t* d_m2l_deps_array, size_t* d_m2l_deps_offsets, 72 | size_t* d_m2l_deps_sizes) 73 | { 74 | const int i = blockIdx.x; 75 | node_t* target = d_fmm->nodes + d_m2l_nodes[i]; 76 | size_t num_deps = d_m2l_deps_sizes[i]; 77 | size_t offset = d_m2l_deps_offsets[i]; 78 | for (size_t j = 0; j < num_deps; ++j) { 79 | size_t source_idx = d_m2l_deps_array[offset + j]; 80 | node_t* source = d_fmm->nodes + source_idx; 81 | m2l_gpu<32, 1>(d_fmm, target, source); 82 | } 83 | } 84 | 85 | template 86 | void cuda_dtt(FMM* fmm, FMM* h_fmm, FMM* d_fmm) 87 | { 88 | Timer timer; 89 | 90 | timer.start(); 91 | 92 | std::vector> p2p_deps(fmm->num_nodes); 93 | std::vector> m2l_deps(fmm->num_nodes); 94 | 95 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 96 | timer.stop(); 97 | printf(" %-16s %12.8f\n", "Deps. Time (s) ", timer.elapsed()); 98 | 99 | timer.start(); 100 | 101 | size_t* h_p2p_nodes; 102 | size_t* h_p2p_deps_array; 103 | size_t* h_p2p_deps_offsets; 104 | size_t* h_p2p_deps_sizes; 105 | size_t* d_p2p_nodes; 106 | size_t* d_p2p_deps_array; 107 | size_t* d_p2p_deps_offsets; 108 | size_t* d_p2p_deps_sizes; 109 | size_t p2p_deps_tot; 110 | size_t p2p_num_nodes; 111 | 112 | size_t* h_m2l_nodes; 113 | size_t* h_m2l_deps_array; 114 | size_t* h_m2l_deps_offsets; 115 | size_t* h_m2l_deps_sizes; 116 | size_t* d_m2l_nodes; 117 | size_t* d_m2l_deps_array; 118 | size_t* d_m2l_deps_offsets; 119 | size_t* d_m2l_deps_sizes; 120 | size_t m2l_deps_tot; 121 | size_t m2l_num_nodes; 122 | 123 | pack_deps(p2p_deps, &h_p2p_nodes, &h_p2p_deps_array, &h_p2p_deps_offsets, 124 | &h_p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 125 | pack_deps(m2l_deps, &h_m2l_nodes, &h_m2l_deps_array, &h_m2l_deps_offsets, 126 | &h_m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 127 | timer.stop(); 128 | printf("%-20s %12.8f\n", " Pack 
Time (s) ", timer.elapsed()); 129 | 130 | timer.start(); 131 | alloc_and_copy(&d_p2p_nodes, h_p2p_nodes, p2p_num_nodes); 132 | alloc_and_copy(&d_p2p_deps_array, h_p2p_deps_array, p2p_deps_tot); 133 | alloc_and_copy(&d_p2p_deps_offsets, h_p2p_deps_offsets, p2p_num_nodes); 134 | alloc_and_copy(&d_p2p_deps_sizes, h_p2p_deps_sizes, p2p_num_nodes); 135 | alloc_and_copy(&d_m2l_nodes, h_m2l_nodes, m2l_num_nodes); 136 | alloc_and_copy(&d_m2l_deps_array, h_m2l_deps_array, m2l_deps_tot); 137 | alloc_and_copy(&d_m2l_deps_offsets, h_m2l_deps_offsets, m2l_num_nodes); 138 | alloc_and_copy(&d_m2l_deps_sizes, h_m2l_deps_sizes, m2l_num_nodes); 139 | timer.stop(); 140 | printf("%-20s %12.8f\n", " Transfer Time (s) ", timer.elapsed()); 141 | 142 | timer.start(); 143 | p2p_kernel<<>>(d_fmm, d_p2p_nodes, d_p2p_deps_array, 144 | d_p2p_deps_offsets, d_p2p_deps_sizes); 145 | CUDACHK(cudaGetLastError()); 146 | CUDACHK(cudaDeviceSynchronize()); 147 | timer.stop(); 148 | printf(" %-16s %12.8f\n", "P2P Time (s) ", timer.elapsed()); 149 | 150 | timer.start(); 151 | m2l_kernel<<>>(d_fmm, d_m2l_nodes, d_m2l_deps_array, 152 | d_m2l_deps_offsets, d_m2l_deps_sizes); 153 | CUDACHK(cudaGetLastError()); 154 | CUDACHK(cudaDeviceSynchronize()); 155 | timer.stop(); 156 | printf(" %-16s %12.8f\n", "M2L Time (s) ", timer.elapsed()); 157 | 158 | free(h_p2p_nodes); 159 | free(h_p2p_deps_array); 160 | free(h_p2p_deps_offsets); 161 | free(h_p2p_deps_sizes); 162 | free(h_m2l_nodes); 163 | free(h_m2l_deps_array); 164 | free(h_m2l_deps_offsets); 165 | free(h_m2l_deps_sizes); 166 | 167 | device_free(d_p2p_nodes); 168 | device_free(d_p2p_deps_array); 169 | device_free(d_p2p_deps_offsets); 170 | device_free(d_p2p_deps_sizes); 171 | device_free(d_m2l_nodes); 172 | device_free(d_m2l_deps_array); 173 | device_free(d_m2l_deps_offsets); 174 | device_free(d_m2l_deps_sizes); 175 | } 176 | 177 | template 178 | void perform_traversals(FMM* fmm) 179 | { 180 | #pragma omp parallel 181 | #pragma omp single 182 | printf("Running 
on %d threads\n", omp_get_num_threads()); 183 | 184 | Timer timer; 185 | Timer tot_timer; 186 | 187 | timer.start(); 188 | tot_timer.start(); 189 | #pragma omp parallel 190 | #pragma omp single 191 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 192 | timer.stop(); 193 | printf("\n"); 194 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 195 | 196 | FMM* h_fmm; 197 | FMM* d_fmm; 198 | 199 | init_device_fmm(fmm, &h_fmm, &d_fmm); 200 | 201 | timer.start(); 202 | cuda_dtt(fmm, h_fmm, d_fmm); 203 | timer.stop(); 204 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 205 | 206 | fini_device_fmm(fmm, h_fmm, d_fmm); 207 | 208 | timer.start(); 209 | #pragma omp parallel 210 | #pragma omp single 211 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 212 | timer.stop(); 213 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 214 | 215 | tot_timer.stop(); 216 | printf("--------------------\n"); 217 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 218 | printf("--------------------\n\n"); 219 | } 220 | -------------------------------------------------------------------------------- /gpusched/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_NVCC=nvcc 2 | CC=$(CC_$(COMPILER)) 3 | 4 | ifneq ($(COMPILER), NVCC) 5 | $(error Only NVCC support for this version of MiniFMM) 6 | endif 7 | 8 | ARCH=sm_60 9 | 10 | CFLAGS_NVCC=-DNWORKERS=1 -std=c++11 -O3 -ftz=true --use_fast_math -x cu -Xcompiler -fopenmp -arch=$(ARCH) 11 | CFLAGS=$(CFLAGS_$(COMPILER)) 12 | 13 | LIBS=-Xcompiler -fopenmp 14 | 15 | -------------------------------------------------------------------------------- /gpusched/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | 12 | template 13 | void upwards_pass(FMM* fmm, node_t* node) 14 | { 15 | for (size_t i = 0; i < 
node->num_children; ++i) { 16 | #pragma omp task 17 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 18 | } 19 | #pragma omp taskwait 20 | 21 | if (node->is_leaf()) 22 | p2m(fmm, node); 23 | else 24 | m2m(fmm, node); 25 | } 26 | 27 | template 28 | __device__ void upwards_pass_gpu_task(worker_t* worker, task_t* task) 29 | { 30 | FMM* fmm = (FMM*)get_private(task, 0); 31 | node_t* node = (node_t*)get_private(task, 1); 32 | 33 | for (size_t i = 0; i < node->num_children; ++i) { 34 | void* args[2] = {fmm, fmm->nodes + node->child[i]}; 35 | generate_task(worker, upwards_pass_gpu_task, 2, args); 36 | } 37 | taskwait(worker); 38 | 39 | if (node->is_leaf()) 40 | p2m_gpu<32, 1>(fmm, node); 41 | else 42 | m2m_gpu<32, 1>(fmm, node); 43 | } 44 | 45 | template 46 | void upwards_pass_gpu(team_t* h_team, team_t* d_team, FMM*h_fmm, FMM* d_fmm) 47 | { 48 | const int nargs = 2; 49 | void* args[nargs] = {d_fmm, h_fmm->nodes + h_fmm->root}; 50 | 51 | fork_team>(h_team, d_team, nargs, args); 52 | } 53 | 54 | template 55 | void downwards_pass(FMM* fmm, node_t* node) 56 | { 57 | if (node->is_leaf()) 58 | l2p(fmm, node); 59 | else { 60 | l2l(fmm, node); 61 | for (size_t i = 0; i < node->num_children; ++i) { 62 | #pragma omp task 63 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 64 | } 65 | } 66 | #pragma omp taskwait 67 | } 68 | 69 | template 70 | __device__ void dtt_task(worker_t* worker, task_t* task) 71 | { 72 | FMM* fmm = (FMM*)get_private(task, 0); 73 | node_t* target = (node_t*)get_private(task, 1); 74 | node_t* source = (node_t*)get_private(task, 2); 75 | 76 | T dx = source->cx - target->cx; 77 | T dy = source->cy - target->cy; 78 | T dz = source->cz - target->cz; 79 | T r2 = dx * dx + dy * dy + dz * dz; 80 | T d1 = source->rad * static_cast(2.0); 81 | T d2 = target->rad * static_cast(2.0); 82 | 83 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 84 | m2l_gpu(fmm, target, source); 85 | } 86 | else if (source->is_leaf() && target->is_leaf()) { 87 | if (target == source) 88 | 
p2p_gpu<32, 16, 4, NTHREADS, NWORKERS>(fmm, target); 89 | else 90 | p2p_gpu<32, 16, 4, NTHREADS, NWORKERS>(fmm, target, source); 91 | } 92 | else { 93 | T target_sz = target->rad; 94 | T source_sz = source->rad; 95 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 96 | for (size_t i = 0; i < target->num_children; ++i) { 97 | node_t* child = fmm->nodes + target->child[i]; 98 | if (target->num_points > TASK_CUTOFF) { 99 | void* args[3] = {fmm, child, source}; 100 | generate_task(worker, dtt_task, 3, args); 101 | } 102 | else { 103 | ((void**)task->storage)[1] = child; 104 | dtt_task(worker, task); 105 | ((void**)task->storage)[1] = target; 106 | } 107 | } 108 | } 109 | else { 110 | for (size_t i = 0; i < source->num_children; ++i) { 111 | node_t* child = fmm->nodes + source->child[i]; 112 | // void* args[3] = {fmm, target, child}; 113 | // generate_task_cond(worker, dtt_task, 3, args, 114 | // source->num_points > TASK_CUTOFF); 115 | if (source->num_points > TASK_CUTOFF) { 116 | void* args[3] = {fmm, target, child}; 117 | generate_task(worker, dtt_task, 3, args); 118 | } 119 | else { 120 | ((void**)task->storage)[2] = child; 121 | dtt_task(worker, task); 122 | ((void**)task->storage)[2] = source; 123 | } 124 | } 125 | } 126 | } 127 | } 128 | 129 | template 130 | void perform_traversals(FMM* fmm) 131 | { 132 | #pragma omp parallel 133 | #pragma omp single 134 | printf("Running on %d threads\n", omp_get_num_threads()); 135 | 136 | Timer timer; 137 | Timer tot_timer; 138 | 139 | // timer.start(); 140 | // tot_timer.start(); 141 | //#pragma omp parallel 142 | //#pragma omp single 143 | // upwards_pass(fmm, &fmm->nodes[fmm->root]); 144 | // timer.stop(); 145 | // printf("\n"); 146 | // printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 147 | 148 | //#pragma omp parallel 149 | //#pragma omp single 150 | // dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 151 | 152 | FMM* h_fmm; 153 | FMM* d_fmm; 154 | 
init_device_fmm(fmm, &h_fmm, &d_fmm); 155 | 156 | const char* num_blocks_str = getenv("GPUSCHED_NUM_BLOCKS"); 157 | const int num_blocks = 158 | (num_blocks_str == NULL) ? 56 * 5 : atoi(num_blocks_str); 159 | 160 | team_t* h_team; 161 | team_t* d_team; 162 | create_team(num_blocks, 1024 * 512, &h_team, &d_team); 163 | 164 | timer.start(); 165 | tot_timer.start(); 166 | upwards_pass_gpu(h_team, d_team, h_fmm, d_fmm); 167 | timer.stop(); 168 | printf("\n"); 169 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 170 | 171 | const int nargs = 3; 172 | node_t* d_root_node = h_fmm->nodes + h_fmm->root; 173 | void* args[nargs] = {d_fmm, d_root_node, d_root_node}; 174 | 175 | timer.start(); 176 | fork_team>(h_team, d_team, nargs, args); 177 | timer.stop(); 178 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 179 | 180 | fini_device_fmm(fmm, h_fmm, d_fmm); 181 | 182 | timer.start(); 183 | #pragma omp parallel 184 | #pragma omp single 185 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 186 | timer.stop(); 187 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 188 | 189 | tot_timer.stop(); 190 | printf("--------------------\n"); 191 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 192 | printf("--------------------\n\n"); 193 | } 194 | -------------------------------------------------------------------------------- /inputs/plummer.in: -------------------------------------------------------------------------------- 1 | -n 10000000 2 | -m 1000 3 | -e 0.66 4 | -t 3 5 | -c 256 6 | --plummer 7 | -------------------------------------------------------------------------------- /inputs/small.in: -------------------------------------------------------------------------------- 1 | -n 100000 2 | -m 1000 3 | -e 0.5 4 | -t 4 5 | -c 512 6 | -------------------------------------------------------------------------------- /inputs/uniform.in: -------------------------------------------------------------------------------- 1 | -n 
10000000 2 | -m 1000 3 | -e 0.66 4 | -t 3 5 | -c 256 6 | --uniform 7 | -------------------------------------------------------------------------------- /kokkos-for/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-for/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | struct node_t { 9 | node_t() = default; 10 | ~node_t() = default; 11 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 12 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 13 | size_t arg_level) 14 | : cx{arg_cx}, 15 | cy{arg_cy}, 16 | cz{arg_cz}, 17 | rad{arg_rad}, 18 | num_points{arg_num_points}, 19 | point_idx{arg_point_idx}, 20 | mult_idx{arg_mult_idx}, 21 | node_idx{arg_node_idx}, 22 | level{arg_level} 23 | { 24 | omp_init_lock(&p2p_lock); 25 | omp_init_lock(&m2l_lock); 26 | } 27 | T cx; 28 | T cy; 29 | T cz; 30 | T rad; 31 | size_t 
num_children = 0; 32 | size_t child[8] = {0}; 33 | size_t num_points; 34 | size_t point_idx; 35 | size_t mult_idx; 36 | size_t node_idx; 37 | size_t level; 38 | 39 | omp_lock_t p2p_lock; 40 | omp_lock_t m2l_lock; 41 | 42 | HOSTDEVICE 43 | bool is_leaf() const { return (num_children == 0); } 44 | }; 45 | -------------------------------------------------------------------------------- /kokkos-for/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef __CUDACC__ 12 | // TODO fix this (kokkos utils needs to be included before gpu kernels) 13 | #include 14 | 15 | #include 16 | #endif 17 | 18 | #ifndef KOKKOS_SCHEDULE 19 | #define KOKKOS_SCHEDULE Dynamic 20 | #endif 21 | 22 | template 23 | void upwards_pass(FMM* fmm, node_t* node) 24 | { 25 | for (size_t i = 0; i < node->num_children; ++i) { 26 | #pragma omp task 27 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 28 | } 29 | 30 | #pragma omp taskwait 31 | 32 | if (node->is_leaf()) 33 | p2m(fmm, node); 34 | else 35 | m2m(fmm, node); 36 | } 37 | 38 | template 39 | void downwards_pass(FMM* fmm, node_t* node) 40 | { 41 | if (node->is_leaf()) 42 | l2p(fmm, node); 43 | else { 44 | l2l(fmm, node); 45 | for (size_t i = 0; i < node->num_children; ++i) { 46 | #pragma omp task 47 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 48 | } 49 | } 50 | #pragma omp taskwait 51 | } 52 | 53 | template 54 | void kokkos_dtt(FMM* fmm) 55 | { 56 | FMM* d_fmm; 57 | FMM* h_fmm; 58 | 59 | init_device_fmm(fmm, &h_fmm, &d_fmm); 60 | 61 | std::vector> p2p_deps(fmm->num_nodes); 62 | std::vector> m2l_deps(fmm->num_nodes); 63 | 64 | Timer timer; 65 | timer.start(); 66 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 67 | timer.stop(); 68 | printf("%-20s %12.8f\n", " Deps Time (s) ", timer.elapsed()); 69 | 70 | timer.start(); 71 | size_t* p2p_nodes; 72 | size_t* p2p_deps_array; 73 | size_t* 
p2p_deps_offsets; 74 | size_t* p2p_deps_sizes; 75 | size_t p2p_deps_tot; 76 | size_t p2p_num_nodes; 77 | 78 | size_t* m2l_nodes; 79 | size_t* m2l_deps_array; 80 | size_t* m2l_deps_offsets; 81 | size_t* m2l_deps_sizes; 82 | size_t m2l_deps_tot; 83 | size_t m2l_num_nodes; 84 | 85 | pack_deps(p2p_deps, &p2p_nodes, &p2p_deps_array, &p2p_deps_offsets, 86 | &p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 87 | pack_deps(m2l_deps, &m2l_nodes, &m2l_deps_array, &m2l_deps_offsets, 88 | &m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 89 | timer.stop(); 90 | printf("%-20s %12.8f\n", " Pack Time (s) ", timer.elapsed()); 91 | 92 | timer.start(); 93 | Kokkos::View> 95 | h_p2p_nodes(p2p_nodes, p2p_num_nodes); 96 | Kokkos::View> 98 | h_m2l_nodes(m2l_nodes, m2l_num_nodes); 99 | 100 | Kokkos::View> 102 | h_p2p_deps_array(p2p_deps_array, p2p_deps_tot); 103 | Kokkos::View> 105 | h_m2l_deps_array(m2l_deps_array, m2l_deps_tot); 106 | 107 | Kokkos::View> 109 | h_p2p_deps_offsets(p2p_deps_offsets, p2p_num_nodes); 110 | Kokkos::View> 112 | h_m2l_deps_offsets(m2l_deps_offsets, m2l_num_nodes); 113 | 114 | Kokkos::View> 116 | h_p2p_deps_sizes(p2p_deps_sizes, p2p_num_nodes); 117 | Kokkos::View> 119 | h_m2l_deps_sizes(m2l_deps_sizes, m2l_num_nodes); 120 | 121 | Kokkos::View d_p2p_nodes("d_p2p_nodes", p2p_num_nodes); 122 | Kokkos::View d_m2l_nodes("d_m2l_nodes", m2l_num_nodes); 123 | Kokkos::View d_p2p_deps_array("d_p2p_deps_array", p2p_deps_tot); 124 | Kokkos::View d_m2l_deps_array("d_m2l_deps_array", m2l_deps_tot); 125 | Kokkos::View d_p2p_deps_offsets("d_p2p_deps_offsets", p2p_num_nodes); 126 | Kokkos::View d_m2l_deps_offsets("d_m2l_deps_offsets", m2l_num_nodes); 127 | Kokkos::View d_p2p_deps_sizes("d_p2p_deps_sizes", p2p_num_nodes); 128 | Kokkos::View d_m2l_deps_sizes("d_m2l_deps_sizes", m2l_num_nodes); 129 | 130 | Kokkos::deep_copy(d_p2p_nodes, h_p2p_nodes); 131 | Kokkos::deep_copy(d_m2l_nodes, h_m2l_nodes); 132 | Kokkos::deep_copy(d_p2p_deps_array, h_p2p_deps_array); 133 | 
Kokkos::deep_copy(d_m2l_deps_array, h_m2l_deps_array); 134 | Kokkos::deep_copy(d_p2p_deps_offsets, h_p2p_deps_offsets); 135 | Kokkos::deep_copy(d_m2l_deps_offsets, h_m2l_deps_offsets); 136 | Kokkos::deep_copy(d_p2p_deps_sizes, h_p2p_deps_sizes); 137 | Kokkos::deep_copy(d_m2l_deps_sizes, h_m2l_deps_sizes); 138 | Kokkos::fence(); 139 | timer.stop(); 140 | printf("%-20s %12.8f\n", " Transfer Time (s) ", timer.elapsed()); 141 | 142 | using policy_type = 143 | Kokkos::TeamPolicy>; 145 | using member_type = policy_type::member_type; 146 | 147 | #ifdef __CUDACC__ 148 | policy_type p2p_policy = policy_type(p2p_num_nodes, 128); 149 | policy_type m2l_policy = policy_type(m2l_num_nodes, 32); 150 | #else 151 | policy_type p2p_policy = policy_type(p2p_num_nodes, 1); 152 | policy_type m2l_policy = policy_type(m2l_num_nodes, 1); 153 | #endif 154 | 155 | timer.start(); 156 | Kokkos::parallel_for( 157 | p2p_policy, KOKKOS_LAMBDA(member_type member) { 158 | const int i = member.league_rank(); 159 | node_t* target = d_fmm->nodes + d_p2p_nodes[i]; 160 | size_t p2p_size = d_p2p_deps_sizes[i]; 161 | size_t p2p_offset = d_p2p_deps_offsets[i]; 162 | for (size_t j = 0; j < p2p_size; ++j) { 163 | size_t source_idx = d_p2p_deps_array[p2p_offset + j]; 164 | node_t* source = d_fmm->nodes + source_idx; 165 | if (target == source) { 166 | // p2p_tiled(d_fmm, target); 167 | #ifdef __CUDA_ARCH__ 168 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target); 169 | #else 170 | p2p_tiled(d_fmm, target); 171 | #endif 172 | } 173 | else { 174 | // p2p_tiled(d_fmm, target, source); 175 | #ifdef __CUDA_ARCH__ 176 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target, source); 177 | #else 178 | p2p_tiled(d_fmm, target, source); 179 | #endif 180 | } 181 | } 182 | }); 183 | Kokkos::fence(); 184 | timer.stop(); 185 | printf("%-20s %12.8f\n", " P2P Time (s) ", timer.elapsed()); 186 | 187 | timer.start(); 188 | Kokkos::parallel_for( 189 | m2l_policy, KOKKOS_LAMBDA(member_type member) { 190 | const int i = member.league_rank(); 191 
| node_t* target = d_fmm->nodes + d_m2l_nodes[i]; 192 | size_t m2l_size = d_m2l_deps_sizes[i]; 193 | size_t m2l_offset = d_m2l_deps_offsets[i]; 194 | for (size_t j = 0; j < m2l_size; ++j) { 195 | size_t source_idx = d_m2l_deps_array[m2l_offset + j]; 196 | node_t* source = d_fmm->nodes + source_idx; 197 | #ifdef __CUDA_ARCH__ 198 | m2l_gpu<32, 1>(d_fmm, target, source); 199 | #else 200 | m2l(d_fmm, target, source); 201 | #endif 202 | } 203 | }); 204 | Kokkos::fence(); 205 | timer.stop(); 206 | printf("%-20s %12.8f\n", " M2L Time (s) ", timer.elapsed()); 207 | 208 | free(p2p_nodes); 209 | free(p2p_deps_array); 210 | free(p2p_deps_offsets); 211 | free(p2p_deps_sizes); 212 | 213 | free(m2l_nodes); 214 | free(m2l_deps_array); 215 | free(m2l_deps_offsets); 216 | free(m2l_deps_sizes); 217 | 218 | fini_device_fmm(fmm, h_fmm, d_fmm); 219 | } 220 | 221 | template 222 | void perform_traversals(FMM* fmm) 223 | { 224 | Kokkos::initialize(); 225 | 226 | printf("Running in serial\n"); 227 | 228 | Timer timer; 229 | Timer tot_timer; 230 | 231 | timer.start(); 232 | tot_timer.start(); 233 | #pragma omp parallel 234 | #pragma omp single 235 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 236 | timer.stop(); 237 | printf("\n"); 238 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 239 | 240 | timer.start(); 241 | kokkos_dtt(fmm); 242 | // dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 243 | timer.stop(); 244 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 245 | 246 | timer.start(); 247 | #pragma omp parallel 248 | #pragma omp single 249 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 250 | timer.stop(); 251 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 252 | 253 | tot_timer.stop(); 254 | printf("--------------------\n"); 255 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 256 | printf("--------------------\n\n"); 257 | 258 | Kokkos::finalize(); 259 | } 260 | 
-------------------------------------------------------------------------------- /kokkos-task-locks/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-task-locks/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | 
size_t level; 34 | 35 | int p2p_lock = 0; 36 | int m2l_lock = 0; 37 | 38 | HOSTDEVICE 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /kokkos-task-locks/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include 13 | 14 | #ifndef KOKKOS_SCHEDULER 15 | #define KOKKOS_SCHEDULER TaskSchedulerMultiple 16 | #endif 17 | 18 | template 19 | void upwards_pass(FMM* fmm, node_t* node) 20 | { 21 | for (size_t i = 0; i < node->num_children; ++i) { 22 | #pragma omp task 23 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 24 | } 25 | 26 | #pragma omp taskwait 27 | 28 | if (node->is_leaf()) 29 | p2m(fmm, node); 30 | else 31 | m2m(fmm, node); 32 | } 33 | 34 | template 35 | void downwards_pass(FMM* fmm, node_t* node) 36 | { 37 | if (node->is_leaf()) 38 | l2p(fmm, node); 39 | else { 40 | l2l(fmm, node); 41 | for (size_t i = 0; i < node->num_children; ++i) { 42 | #pragma omp task 43 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 44 | } 45 | } 46 | #pragma omp taskwait 47 | } 48 | 49 | // template 50 | // void dual_tree(FMM* fmm, node_t* target, node_t* source) 51 | //{ 52 | // T dx = source->cx - target->cx; 53 | // T dy = source->cy - target->cy; 54 | // T dz = source->cz - target->cz; 55 | // T r2 = dx * dx + dy * dy + dz * dz; 56 | // T d1 = source->rad * static_cast(2.0); 57 | // T d2 = target->rad * static_cast(2.0); 58 | // 59 | // if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 60 | // m2l(fmm, target, source); 61 | // } 62 | // else if (source->is_leaf() && target->is_leaf()) { 63 | // if (target == source) 64 | // p2p_tiled(fmm, target); 65 | // else 66 | // p2p_tiled(fmm, target, source); 67 | // } 68 | // else { 69 | // T target_sz = target->rad; 70 | // T source_sz = source->rad; 71 | // if 
(source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) 72 | // { 73 | // for (size_t i = 0; i < target->num_children; ++i) { 74 | // node_t* child = &fmm->nodes[target->child[i]]; 75 | // dual_tree(fmm, child, source); 76 | // } 77 | // } 78 | // else { 79 | // for (size_t i = 0; i < source->num_children; ++i) { 80 | // dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 81 | // } 82 | // } 83 | // } 84 | //} 85 | 86 | namespace Kokkos { 87 | class Cuda; 88 | class OpenMP; 89 | } // namespace Kokkos 90 | 91 | template 92 | struct dual_tree_task { 93 | using value_type = void; 94 | using future_type = Kokkos::BasicFuture; 95 | 96 | FMM* fmm; 97 | node_t* target; 98 | node_t* source; 99 | 100 | KOKKOS_INLINE_FUNCTION 101 | dual_tree_task(FMM* arg_fmm, node_t* arg_target, node_t* arg_source) 102 | : fmm{arg_fmm}, target{arg_target}, source{arg_source} 103 | { 104 | } 105 | 106 | template 107 | KOKKOS_INLINE_FUNCTION typename std::enable_if< 108 | std::is_same::value>::type 109 | operator()(typename Scheduler::member_type& member) 110 | { 111 | T dx = source->cx - target->cx; 112 | T dy = source->cy - target->cy; 113 | T dz = source->cz - target->cz; 114 | T r2 = dx * dx + dy * dy + dz * dz; 115 | T d1 = source->rad * static_cast(2.0); 116 | T d2 = target->rad * static_cast(2.0); 117 | 118 | // TODO for some reason the compiler still tries to compile this function 119 | #ifdef __CUDACC__ 120 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 121 | if (member.team_rank() == 0) lock(&target->m2l_lock); 122 | member.team_barrier(); 123 | m2l_gpu<32, 4>(fmm, target, source); 124 | if (member.team_rank() == 0) unlock(&target->m2l_lock); 125 | member.team_barrier(); 126 | } 127 | else if (source->is_leaf() && target->is_leaf()) { 128 | if (member.team_rank() == 0) lock(&target->p2p_lock); 129 | member.team_barrier(); 130 | if (target == source) 131 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target); 132 | else 133 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target, source); 
134 | if (member.team_rank() == 0) unlock(&target->p2p_lock); 135 | member.team_barrier(); 136 | } 137 | else { 138 | T target_sz = target->rad; 139 | T source_sz = source->rad; 140 | if (source->is_leaf() || 141 | ((target_sz >= source_sz) && !target->is_leaf())) { 142 | for (size_t i = 0; i < target->num_children; ++i) { 143 | node_t* child = &fmm->nodes[target->child[i]]; 144 | if (target->num_points > TASK_CUTOFF) { 145 | if (member.team_rank() == 0) 146 | Kokkos::BasicFuture f = 147 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 148 | dual_tree_task(fmm, child, source)); 149 | } 150 | else { 151 | dual_tree_task(fmm, child, source)(member); 152 | } 153 | } 154 | } 155 | else { 156 | for (size_t i = 0; i < source->num_children; ++i) { 157 | node_t* child = &fmm->nodes[source->child[i]]; 158 | if (source->num_points > TASK_CUTOFF) { 159 | if (member.team_rank() == 0) 160 | Kokkos::BasicFuture f = 161 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 162 | dual_tree_task(fmm, target, child)); 163 | } 164 | else { 165 | dual_tree_task(fmm, target, child)(member); 166 | } 167 | } 168 | } 169 | } 170 | #endif 171 | } 172 | 173 | template 174 | KOKKOS_INLINE_FUNCTION 175 | typename std::enable_if::value>::type 177 | operator()(typename Scheduler::member_type& member) 178 | { 179 | T dx = source->cx - target->cx; 180 | T dy = source->cy - target->cy; 181 | T dz = source->cz - target->cz; 182 | T r2 = dx * dx + dy * dy + dz * dz; 183 | T d1 = source->rad * static_cast(2.0); 184 | T d2 = target->rad * static_cast(2.0); 185 | 186 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 187 | lock(&target->m2l_lock); 188 | m2l(fmm, target, source); 189 | unlock(&target->m2l_lock); 190 | } 191 | else if (source->is_leaf() && target->is_leaf()) { 192 | lock(&target->p2p_lock); 193 | if (target == source) { 194 | p2p_tiled(fmm, target); 195 | } 196 | else { 197 | p2p_tiled(fmm, target, source); 198 | } 199 | unlock(&target->p2p_lock); 200 | } 201 | else { 202 | 
T target_sz = target->rad; 203 | T source_sz = source->rad; 204 | if (source->is_leaf() || 205 | ((target_sz >= source_sz) && !target->is_leaf())) { 206 | for (size_t i = 0; i < target->num_children; ++i) { 207 | node_t* child = &fmm->nodes[target->child[i]]; 208 | if (target->num_points > TASK_CUTOFF) { 209 | Kokkos::BasicFuture f = 210 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 211 | dual_tree_task(fmm, child, source)); 212 | } 213 | else { 214 | dual_tree_task(fmm, child, source)(member); 215 | } 216 | } 217 | } 218 | else { 219 | for (size_t i = 0; i < source->num_children; ++i) { 220 | node_t* child = &fmm->nodes[source->child[i]]; 221 | dual_tree_task(fmm, target, child)(member); 222 | } 223 | } 224 | } 225 | } 226 | }; 227 | 228 | template 229 | typename std::enable_if::value>::type 231 | kokkos_dtt(FMM* fmm) 232 | { 233 | printf("openmp\n"); 234 | const size_t min_block_size = 64; 235 | const size_t max_block_size = 1024; 236 | const size_t super_block_size = 4096; 237 | const size_t memory_capacity = 1024 * 1024 * 1024; 238 | 239 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 240 | min_block_size, std::min(max_block_size, memory_capacity), 241 | std::min(super_block_size, memory_capacity)); 242 | node_t* root_node = fmm->nodes + fmm->root; 243 | Kokkos::BasicFuture f = Kokkos::host_spawn( 244 | Kokkos::TaskSingle(sched), 245 | dual_tree_task(fmm, root_node, root_node)); 246 | Kokkos::wait(sched); 247 | } 248 | 249 | template 250 | typename std::enable_if::value>::type 252 | kokkos_dtt(FMM* fmm) 253 | { 254 | printf("cuda\n"); 255 | const size_t min_block_size = 128; 256 | const size_t max_block_size = 1024; 257 | const size_t super_block_size = 4096; 258 | const size_t memory_capacity = 1024 * 1024 * 1024; 259 | 260 | #ifdef __CUDACC__ 261 | const int stack_size = 8192; 262 | CUDACHK(cudaDeviceSetLimit(cudaLimitStackSize, stack_size)); 263 | #endif 264 | 265 | Scheduler sched(typename Scheduler::memory_space(), 
memory_capacity, 266 | min_block_size, std::min(max_block_size, memory_capacity), 267 | std::min(super_block_size, memory_capacity)); 268 | 269 | FMM* d_fmm; 270 | FMM* h_fmm; 271 | 272 | init_device_fmm(fmm, &h_fmm, &d_fmm); 273 | 274 | node_t* root_node = h_fmm->nodes + h_fmm->root; 275 | 276 | Kokkos::BasicFuture f = Kokkos::host_spawn( 277 | Kokkos::TaskTeam(sched), 278 | dual_tree_task(d_fmm, root_node, root_node)); 279 | Kokkos::wait(sched); 280 | 281 | fini_device_fmm(fmm, h_fmm, d_fmm); 282 | } 283 | 284 | template 285 | void perform_traversals(FMM* fmm) 286 | { 287 | using Scheduler = 288 | Kokkos::KOKKOS_SCHEDULER; 289 | 290 | Kokkos::initialize(); 291 | 292 | Timer timer; 293 | Timer tot_timer; 294 | 295 | timer.start(); 296 | tot_timer.start(); 297 | #pragma omp parallel 298 | #pragma omp single 299 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 300 | timer.stop(); 301 | printf("\n"); 302 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 303 | 304 | timer.start(); 305 | kokkos_dtt(fmm); 306 | timer.stop(); 307 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 308 | 309 | timer.start(); 310 | #pragma omp parallel 311 | #pragma omp single 312 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 313 | timer.stop(); 314 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 315 | 316 | tot_timer.stop(); 317 | printf("--------------------\n"); 318 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 319 | printf("--------------------\n\n"); 320 | 321 | Kokkos::finalize(); 322 | } 323 | 324 | 325 | -------------------------------------------------------------------------------- /kokkos-task/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | 
ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-task/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | size_t level; 34 | 35 | int p2p_lock = 0; 36 | int m2l_lock = 0; 37 | 38 | HOSTDEVICE 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /kokkos-task/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | 10 | #include 11 | 12 | #include 13 | 14 | #ifndef KOKKOS_SCHEDULER 15 | #define KOKKOS_SCHEDULER TaskSchedulerMultiple 16 | #endif 17 | 18 | //template 19 | //void upwards_pass(FMM* fmm, node_t* node) 20 | //{ 21 | // for (size_t i = 0; i < node->num_children; ++i) { 22 | //#pragma omp task 23 | // upwards_pass(fmm, &fmm->nodes[node->child[i]]); 24 | // } 25 | // 26 | //#pragma omp taskwait 27 | // 28 | // if (node->is_leaf()) 29 | // p2m(fmm, node); 30 | // else 31 | // m2m(fmm, node); 32 | //} 33 | 34 | template 35 | struct upwards_task { 36 | using value_type = void; 37 | using future_type = Kokkos::BasicFuture; 38 | 39 | FMM* fmm; 40 | node_t* node; 41 | future_type respawn_future; 42 | 43 | KOKKOS_INLINE_FUNCTION 44 | upwards_task(FMM* arg_fmm, node_t* arg_node) 45 | : fmm{arg_fmm}, node{arg_node}, respawn_future() 46 | { 47 | } 48 | 49 | KOKKOS_INLINE_FUNCTION 50 | void operator()(typename Scheduler::member_type& member) 51 | { 52 | if (node->is_leaf()) { 53 | #ifndef __CUDA_ARCH__ 54 | p2m(fmm, node); 55 | #else 56 | //p2m_gpu<32, 4>(fmm, node); 57 | #endif 58 | } 59 | else if (respawn_future.is_null()) { 60 | future_type futures[8]; 61 | for (size_t i = 0; i < node->num_children; ++i) { 62 | node_t* child = fmm->nodes + node->child[i]; 63 | futures[i] = 64 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 65 | upwards_task(fmm, child)); 66 | } 67 | for (size_t i = node->num_children; i < 8; ++i) futures[i] = future_type(); 68 | respawn_future = member.scheduler().when_all(futures, node->num_children); 69 | Kokkos::respawn(this, respawn_future); 70 | } 71 | else { 72 | #ifndef __CUDA_ARCH__ 73 | m2m(fmm, node); 74 | #else 75 | //m2m_gpu<32, 4>(fmm, node); 76 | #endif 77 | } 78 | } 79 | }; 80 | 81 | template 82 | void kokkos_upwards(FMM* fmm) 83 | { 84 | #ifdef __CUDACC__ 85 | FMM* h_fmm; 86 | FMM* d_fmm; 87 | init_device_fmm(fmm, &h_fmm, &d_fmm); 88 | node_t* root_node = h_fmm->nodes + h_fmm->root; 89 | FMM* device_fmm = d_fmm; 90 | 
#else 91 | FMM* device_fmm = fmm; 92 | node_t* root_node = fmm->nodes + fmm->root; 93 | #endif 94 | 95 | const size_t min_block_size = 32; 96 | const size_t max_block_size = 128; 97 | const size_t super_block_size = 10000; 98 | const size_t memory_capacity = 1024 * 1024 * 1024; 99 | 100 | using Scheduler = Kokkos::TaskScheduler; 101 | 102 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 103 | min_block_size, std::min(max_block_size, memory_capacity), 104 | std::min(super_block_size, memory_capacity)); 105 | 106 | Kokkos::BasicFuture f = Kokkos::host_spawn( 107 | Kokkos::TaskSingle(sched), upwards_task(device_fmm, root_node)); 108 | 109 | Kokkos::wait(sched); 110 | 111 | #ifdef __CUDACC__ 112 | fini_device_fmm(fmm, h_fmm, d_fmm); 113 | #endif 114 | } 115 | 116 | template 117 | void downwards_pass(FMM* fmm, node_t* node) 118 | { 119 | if (node->is_leaf()) 120 | l2p(fmm, node); 121 | else { 122 | l2l(fmm, node); 123 | for (size_t i = 0; i < node->num_children; ++i) { 124 | #pragma omp task 125 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 126 | } 127 | } 128 | #pragma omp taskwait 129 | } 130 | 131 | // template 132 | // void dual_tree(FMM* fmm, node_t* target, node_t* source) 133 | //{ 134 | // T dx = source->cx - target->cx; 135 | // T dy = source->cy - target->cy; 136 | // T dz = source->cz - target->cz; 137 | // T r2 = dx * dx + dy * dy + dz * dz; 138 | // T d1 = source->rad * static_cast(2.0); 139 | // T d2 = target->rad * static_cast(2.0); 140 | // 141 | // if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 142 | // m2l(fmm, target, source); 143 | // } 144 | // else if (source->is_leaf() && target->is_leaf()) { 145 | // if (target == source) 146 | // p2p_tiled(fmm, target); 147 | // else 148 | // p2p_tiled(fmm, target, source); 149 | // } 150 | // else { 151 | // T target_sz = target->rad; 152 | // T source_sz = source->rad; 153 | // if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) 154 | // { 155 | // for 
(size_t i = 0; i < target->num_children; ++i) { 156 | // node_t* child = &fmm->nodes[target->child[i]]; 157 | // dual_tree(fmm, child, source); 158 | // } 159 | // } 160 | // else { 161 | // for (size_t i = 0; i < source->num_children; ++i) { 162 | // dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 163 | // } 164 | // } 165 | // } 166 | //} 167 | 168 | namespace Kokkos { 169 | class Cuda; 170 | class OpenMP; 171 | } // namespace Kokkos 172 | 173 | template 174 | struct dual_tree_task { 175 | using value_type = void; 176 | using future_type = Kokkos::BasicFuture; 177 | 178 | FMM* fmm; 179 | node_t* target; 180 | node_t* source; 181 | 182 | KOKKOS_INLINE_FUNCTION 183 | dual_tree_task(FMM* arg_fmm, node_t* arg_target, node_t* arg_source) 184 | : fmm{arg_fmm}, target{arg_target}, source{arg_source} 185 | { 186 | } 187 | 188 | template 189 | KOKKOS_INLINE_FUNCTION typename std::enable_if< 190 | std::is_same::value>::type 191 | operator()(typename Scheduler::member_type& member) 192 | { 193 | T dx = source->cx - target->cx; 194 | T dy = source->cy - target->cy; 195 | T dz = source->cz - target->cz; 196 | T r2 = dx * dx + dy * dy + dz * dz; 197 | T d1 = source->rad * static_cast(2.0); 198 | T d2 = target->rad * static_cast(2.0); 199 | 200 | // TODO for some reason the compiler still tries to compile this function 201 | #ifdef __CUDACC__ 202 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 203 | m2l_gpu<32, 4>(fmm, target, source); 204 | } 205 | else if (source->is_leaf() && target->is_leaf()) { 206 | if (target == source) 207 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target); 208 | else 209 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target, source); 210 | } 211 | else { 212 | T target_sz = target->rad; 213 | T source_sz = source->rad; 214 | if (source->is_leaf() || 215 | ((target_sz >= source_sz) && !target->is_leaf())) { 216 | for (size_t i = 0; i < target->num_children; ++i) { 217 | node_t* child = &fmm->nodes[target->child[i]]; 218 | if (target->num_points > TASK_CUTOFF) { 219 | 
if (member.team_rank() == 0) 220 | Kokkos::BasicFuture f = 221 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 222 | dual_tree_task(fmm, child, source)); 223 | } 224 | else { 225 | dual_tree_task(fmm, child, source)(member); 226 | } 227 | } 228 | } 229 | else { 230 | for (size_t i = 0; i < source->num_children; ++i) { 231 | node_t* child = &fmm->nodes[source->child[i]]; 232 | if (source->num_points > TASK_CUTOFF) { 233 | if (member.team_rank() == 0) 234 | Kokkos::BasicFuture f = 235 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 236 | dual_tree_task(fmm, target, child)); 237 | } 238 | else { 239 | dual_tree_task(fmm, target, child)(member); 240 | } 241 | } 242 | } 243 | } 244 | #endif 245 | } 246 | 247 | template 248 | KOKKOS_INLINE_FUNCTION 249 | typename std::enable_if::value>::type 251 | operator()(typename Scheduler::member_type& member) 252 | { 253 | T dx = source->cx - target->cx; 254 | T dy = source->cy - target->cy; 255 | T dz = source->cz - target->cz; 256 | T r2 = dx * dx + dy * dy + dz * dz; 257 | T d1 = source->rad * static_cast(2.0); 258 | T d2 = target->rad * static_cast(2.0); 259 | 260 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 261 | lock(&target->m2l_lock); 262 | m2l(fmm, target, source); 263 | unlock(&target->m2l_lock); 264 | } 265 | else if (source->is_leaf() && target->is_leaf()) { 266 | lock(&target->p2p_lock); 267 | if (target == source) { 268 | p2p_tiled(fmm, target); 269 | } 270 | else { 271 | p2p_tiled(fmm, target, source); 272 | } 273 | unlock(&target->p2p_lock); 274 | } 275 | else { 276 | T target_sz = target->rad; 277 | T source_sz = source->rad; 278 | if (source->is_leaf() || 279 | ((target_sz >= source_sz) && !target->is_leaf())) { 280 | for (size_t i = 0; i < target->num_children; ++i) { 281 | node_t* child = &fmm->nodes[target->child[i]]; 282 | if (target->num_points > TASK_CUTOFF) { 283 | Kokkos::BasicFuture f = 284 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 285 | dual_tree_task(fmm, 
child, source)); 286 | } 287 | else { 288 | dual_tree_task(fmm, child, source)(member); 289 | } 290 | } 291 | } 292 | else { 293 | for (size_t i = 0; i < source->num_children; ++i) { 294 | node_t* child = &fmm->nodes[source->child[i]]; 295 | dual_tree_task(fmm, target, child)(member); 296 | } 297 | } 298 | } 299 | } 300 | }; 301 | 302 | template 303 | typename std::enable_if::value>::type 305 | kokkos_dtt(FMM* fmm) 306 | { 307 | printf("openmp\n"); 308 | const size_t min_block_size = 64; 309 | const size_t max_block_size = 1024; 310 | const size_t super_block_size = 4096; 311 | const size_t memory_capacity = 1024 * 1024 * 1024; 312 | 313 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 314 | min_block_size, std::min(max_block_size, memory_capacity), 315 | std::min(super_block_size, memory_capacity)); 316 | node_t* root_node = fmm->nodes + fmm->root; 317 | Kokkos::BasicFuture f = Kokkos::host_spawn( 318 | Kokkos::TaskSingle(sched), 319 | dual_tree_task(fmm, root_node, root_node)); 320 | Kokkos::wait(sched); 321 | } 322 | 323 | template 324 | typename std::enable_if::value>::type 326 | kokkos_dtt(FMM* fmm) 327 | { 328 | printf("cuda\n"); 329 | const size_t min_block_size = 128; 330 | const size_t max_block_size = 1024; 331 | const size_t super_block_size = 4096; 332 | const size_t memory_capacity = 1024 * 1024 * 1024; 333 | 334 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 335 | min_block_size, std::min(max_block_size, memory_capacity), 336 | std::min(super_block_size, memory_capacity)); 337 | 338 | FMM* d_fmm; 339 | FMM* h_fmm; 340 | 341 | init_device_fmm(fmm, &h_fmm, &d_fmm); 342 | 343 | node_t* root_node = h_fmm->nodes + h_fmm->root; 344 | 345 | Kokkos::BasicFuture f = Kokkos::host_spawn( 346 | Kokkos::TaskTeam(sched), 347 | dual_tree_task(d_fmm, root_node, root_node)); 348 | Kokkos::wait(sched); 349 | 350 | fini_device_fmm(fmm, h_fmm, d_fmm); 351 | } 352 | 353 | template 354 | void perform_traversals(FMM* fmm) 355 | 
{ 356 | using Scheduler = 357 | Kokkos::KOKKOS_SCHEDULER; 358 | 359 | Kokkos::initialize(); 360 | #ifdef __CUDACC__ 361 | const int stack_size = 8192; 362 | CUDACHK(cudaDeviceSetLimit(cudaLimitStackSize, stack_size)); 363 | #endif 364 | 365 | Timer timer; 366 | Timer tot_timer; 367 | 368 | timer.start(); 369 | tot_timer.start(); 370 | kokkos_upwards(fmm); 371 | timer.stop(); 372 | printf("\n"); 373 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 374 | Kokkos::finalize(); 375 | exit(0); 376 | 377 | timer.start(); 378 | kokkos_dtt(fmm); 379 | timer.stop(); 380 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 381 | 382 | timer.start(); 383 | #pragma omp parallel 384 | #pragma omp single 385 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 386 | timer.stop(); 387 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 388 | 389 | tot_timer.stop(); 390 | printf("--------------------\n"); 391 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 392 | printf("--------------------\n\n"); 393 | 394 | } 395 | -------------------------------------------------------------------------------- /main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | void perform_fmm(int argc, char** argv) 13 | { 14 | FMM* fmm = new FMM(); 15 | read_input(argc, argv, fmm); 16 | 17 | init(fmm); 18 | perform_traversals(fmm); 19 | verify(fmm); 20 | finalise(fmm); 21 | 22 | delete fmm; 23 | } 24 | 25 | int main(int argc, char** argv) 26 | { 27 | #ifdef FMM_DOUBLE 28 | perform_fmm(argc, argv); 29 | #else 30 | perform_fmm(argc, argv); 31 | #endif 32 | }; 33 | -------------------------------------------------------------------------------- /omp-for/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | 
CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 | endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /omp-for/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omp-for/traversal.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void upwards_pass(FMM* fmm, node_t* node) 11 | { 12 | for (size_t i = 0; i < node->num_children; ++i) { 13 | #pragma omp task 14 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 15 | } 16 | #pragma omp taskwait 17 | 18 | if (node->is_leaf()) 19 | p2m(fmm, node); 20 | else 21 | m2m(fmm, node); 22 | } 23 | 24 | template 25 | void downwards_pass(FMM* fmm, node_t* node) 26 | { 27 | if (node->is_leaf()) 28 | l2p(fmm, node); 29 | else { 30 | l2l(fmm, node); 31 | for (size_t i = 0; i < node->num_children; ++i) { 32 | #pragma omp task 33 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 34 | } 35 | } 36 | #pragma omp taskwait 37 | } 38 | 39 | template 40 | void perform_traversals(FMM* fmm) 41 | { 42 | #pragma omp parallel 43 | #pragma omp single 44 | printf("Running on %d threads\n", omp_get_num_threads()); 45 | 46 | Timer timer; 47 | Timer tot_timer; 48 | 49 | timer.start(); 50 | tot_timer.start(); 51 | #pragma omp parallel 52 | #pragma omp single 53 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 54 | timer.stop(); 55 | printf("\n"); 56 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 57 | 58 | Timer deps_timer; 59 | deps_timer.start(); 60 | 61 | std::vector> p2p_deps(fmm->num_nodes); 62 | std::vector> m2l_deps(fmm->num_nodes); 63 | 64 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 65 | 66 | deps_timer.stop(); 67 | printf(" %-16s %12.8f\n", "Deps. 
Time (s) ", deps_timer.elapsed()); 68 | 69 | Timer compute_timer; 70 | compute_timer.start(); 71 | #pragma omp parallel for schedule(guided) 72 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 73 | node_t* target = &fmm->nodes[i]; 74 | for (size_t j = 0; j < p2p_deps[i].size(); ++j) { 75 | node_t* source = fmm->nodes + p2p_deps[i][j]; 76 | if (target == source) { 77 | p2p_tiled(fmm, target); 78 | } 79 | else { 80 | p2p_tiled(fmm, target, source); 81 | } 82 | } 83 | } 84 | compute_timer.stop(); 85 | printf(" %-16s %12.8f\n", "P2P Time (s) ", compute_timer.elapsed()); 86 | 87 | compute_timer.start(); 88 | #pragma omp parallel for schedule(guided) 89 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 90 | node_t* target = &fmm->nodes[i]; 91 | for (size_t j = 0; j < m2l_deps[i].size(); ++j) { 92 | node_t* source = fmm->nodes + m2l_deps[i][j]; 93 | m2l(fmm, target, source); 94 | } 95 | } 96 | compute_timer.stop(); 97 | timer.stop(); 98 | printf(" %-16s %12.8f\n", "M2L Time (s) ", compute_timer.elapsed()); 99 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 100 | 101 | timer.start(); 102 | #pragma omp parallel 103 | #pragma omp single 104 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 105 | timer.stop(); 106 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 107 | 108 | tot_timer.stop(); 109 | printf("--------------------\n"); 110 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 111 | printf("--------------------\n\n"); 112 | } 113 | -------------------------------------------------------------------------------- /omp-task/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 
| endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /omp-task/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omp-task/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | template 9 | void upwards_pass(FMM* fmm, node_t* node) 10 | { 11 | for (size_t i = 0; i < node->num_children; ++i) { 12 | #pragma omp task 13 | upwards_pass(fmm, 
&fmm->nodes[node->child[i]]); 14 | } 15 | #pragma omp taskwait 16 | 17 | if (node->is_leaf()) 18 | p2m(fmm, node); 19 | else 20 | m2m(fmm, node); 21 | } 22 | 23 | template 24 | void downwards_pass(FMM* fmm, node_t* node) 25 | { 26 | if (node->is_leaf()) 27 | l2p(fmm, node); 28 | else { 29 | l2l(fmm, node); 30 | for (size_t i = 0; i < node->num_children; ++i) { 31 | #pragma omp task 32 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 33 | } 34 | } 35 | #pragma omp taskwait 36 | } 37 | 38 | template 39 | void dual_tree(FMM* fmm, node_t* target, node_t* source) 40 | { 41 | T dx = source->cx - target->cx; 42 | T dy = source->cy - target->cy; 43 | T dz = source->cz - target->cz; 44 | T r2 = dx * dx + dy * dy + dz * dz; 45 | T d1 = source->rad * static_cast(2.0); 46 | T d2 = target->rad * static_cast(2.0); 47 | 48 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 49 | omp_set_lock(&target->m2l_lock); 50 | m2l(fmm, target, source); 51 | omp_unset_lock(&target->m2l_lock); 52 | } 53 | else if (source->is_leaf() && target->is_leaf()) { 54 | omp_set_lock(&target->p2p_lock); 55 | if (target == source) 56 | p2p_tiled(fmm, target); 57 | else 58 | p2p_tiled(fmm, target, source); 59 | omp_unset_lock(&target->p2p_lock); 60 | } 61 | else { 62 | T target_sz = target->rad; 63 | T source_sz = source->rad; 64 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 65 | for (size_t i = 0; i < target->num_children; ++i) { 66 | node_t* child = &fmm->nodes[target->child[i]]; 67 | #pragma omp task if(target->num_points > TASK_CUTOFF) 68 | dual_tree(fmm, child, source); 69 | } 70 | } 71 | else { 72 | for (size_t i = 0; i < source->num_children; ++i) { 73 | //#pragma omp task if(source->num_points > TASK_CUTOFF && SOURCE_TASK_SPAWN) 74 | dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template 81 | void perform_traversals(FMM* fmm) 82 | { 83 | #pragma omp parallel 84 | #pragma omp single 85 | printf("Running on %d 
threads\n", omp_get_num_threads()); 86 | 87 | Timer timer; 88 | Timer tot_timer; 89 | 90 | timer.start(); 91 | tot_timer.start(); 92 | #pragma omp parallel 93 | #pragma omp single 94 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 95 | timer.stop(); 96 | printf("\n"); 97 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 98 | 99 | timer.start(); 100 | #pragma omp parallel 101 | #pragma omp single 102 | dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 103 | timer.stop(); 104 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 105 | 106 | timer.start(); 107 | #pragma omp parallel 108 | #pragma omp single 109 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 110 | timer.stop(); 111 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 112 | 113 | tot_timer.stop(); 114 | printf("--------------------\n"); 115 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 116 | printf("--------------------\n\n"); 117 | } 118 | -------------------------------------------------------------------------------- /omptarget/flags.makefile: -------------------------------------------------------------------------------- 1 | CC=clang++ 2 | 3 | ifeq ($(TARGET), GPU) 4 | CFLAGS=-Ofast -mllvm --nvptx-f32ftz -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(ARCH) 5 | LIBS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(ARCH) 6 | else 7 | CFLAGS=-Ofast -fopenmp -fopenmp-targets=x86_64 -march=$(ARCH) 8 | LIBS=-fopenmp -fopenmp-targets=x86_64 -march=$(ARCH) 9 | endif 10 | -------------------------------------------------------------------------------- /omptarget/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, 
size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omptarget/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void upwards_pass(FMM* fmm, node_t* node) 11 | { 12 | for (size_t i = 0; i < node->num_children; ++i) { 13 | #pragma omp task 14 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 15 | } 16 | #pragma omp taskwait 17 | 18 | if (node->is_leaf()) 19 | p2m(fmm, node); 20 | else 21 | m2m(fmm, node); 22 | } 23 | 24 | template 25 | void downwards_pass(FMM* fmm, node_t* node) 26 | { 27 | if (node->is_leaf()) 28 | l2p(fmm, node); 29 | else { 30 | l2l(fmm, node); 31 | for (size_t i = 0; i < node->num_children; ++i) { 32 | #pragma omp task 33 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 34 | } 35 | } 36 | #pragma omp taskwait 37 | } 38 | 39 | template 40 | void dtt(FMM* fmm) 41 | { 42 | node_t* nodes = fmm->nodes; 43 | T* x = fmm->x; 44 | T* y = fmm->y; 45 | T* z = fmm->z; 46 | T* w = fmm->w; 47 | T* ax = fmm->ax; 48 | T* ay = fmm->ay; 49 | T* az = fmm->az; 50 | T* aw = fmm->p; 51 | complex_t* mm = fmm->m; 52 | complex_t* ml = fmm->l; 53 | 54 | size_t nn = fmm->num_nodes; 55 | size_t np = fmm->num_points; 56 | size_t nm = 
fmm->num_multipoles; 57 | 58 | #pragma omp target enter data map(to: nodes[:nn], x[:np], y[:np], z[:np], \ 59 | w[:np], ax[:np], ay[:np], az[:np], \ 60 | aw[:np], mm[:nm * nn], ml[:nm * nn]) 61 | 62 | Timer deps_timer; 63 | deps_timer.start(); 64 | 65 | std::vector> p2p_deps(fmm->num_nodes); 66 | std::vector> m2l_deps(fmm->num_nodes); 67 | 68 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 69 | 70 | deps_timer.stop(); 71 | printf(" %-16s %12.8f\n", "Deps. Time (s) ", deps_timer.elapsed()); 72 | 73 | deps_timer.start(); 74 | size_t* p2p_nodes; 75 | size_t* p2p_deps_array; 76 | size_t* p2p_deps_offsets; 77 | size_t* p2p_deps_sizes; 78 | size_t p2p_deps_tot; 79 | size_t p2p_num_nodes; 80 | 81 | size_t* m2l_nodes; 82 | size_t* m2l_deps_array; 83 | size_t* m2l_deps_offsets; 84 | size_t* m2l_deps_sizes; 85 | size_t m2l_deps_tot; 86 | size_t m2l_num_nodes; 87 | 88 | pack_deps(p2p_deps, &p2p_nodes, &p2p_deps_array, &p2p_deps_offsets, 89 | &p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 90 | pack_deps(m2l_deps, &m2l_nodes, &m2l_deps_array, &m2l_deps_offsets, 91 | &m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 92 | deps_timer.stop(); 93 | printf("%-20s %12.8f\n", " Pack Time (s) ", deps_timer.elapsed()); 94 | 95 | #pragma omp target enter data map(to: p2p_nodes[:p2p_num_nodes],\ 96 | p2p_deps_array[:p2p_deps_tot],\ 97 | p2p_deps_offsets[:p2p_num_nodes],\ 98 | p2p_deps_sizes[:p2p_num_nodes]) 99 | 100 | #pragma omp target enter data map(to: m2l_nodes[:m2l_num_nodes],\ 101 | m2l_deps_array[:m2l_deps_tot],\ 102 | m2l_deps_offsets[:m2l_num_nodes],\ 103 | m2l_deps_sizes[:m2l_num_nodes]) 104 | 105 | Timer compute_timer; 106 | compute_timer.start(); 107 | 108 | #pragma omp target teams distribute 109 | for (size_t ni = 0; ni < p2p_num_nodes; ++ni) { 110 | node_t* target = &nodes[p2p_nodes[ni]]; 111 | size_t p2p_size = p2p_deps_sizes[ni]; 112 | size_t p2p_offset = p2p_deps_offsets[ni]; 113 | 114 | for (size_t nj = 0; nj < p2p_size; ++nj) { 115 | size_t source_idx = 
p2p_deps_array[p2p_offset + nj]; 116 | node_t* source = nodes + source_idx; 117 | // 118 | // if (target == source) p2p_tiled(fmm, target); 119 | // else p2p_tiled(fmm, target, source); 120 | // } 121 | // } 122 | 123 | //static __attribute((address_space(3))) 124 | T shmem[512 * 4]; // NOTE(review): staging buffer for at most 512 interleaved (x,y,z,w) source points — assumes every leaf has source->num_points <= 512, TODO confirm tree leaf capacity 125 | T* source_pos = (T*)shmem; 126 | // Stage the source node's points into the local buffer before the i-loop reads them repeatedly. 127 | #pragma omp parallel for 128 | for (size_t j = 0; j < source->num_points; ++j) { 129 | const size_t jj = j + source->point_idx; 130 | source_pos[j * 4 + 0] = x[jj]; 131 | source_pos[j * 4 + 1] = y[jj]; 132 | source_pos[j * 4 + 2] = z[jj]; 133 | source_pos[j * 4 + 3] = w[jj]; 134 | } 135 | 136 | #pragma omp parallel for 137 | for (size_t i = 0; i < target->num_points; ++i) { 138 | const size_t ip = i + target->point_idx; 139 | const T xi = x[ip]; 140 | const T yi = y[ip]; 141 | const T zi = z[ip]; 142 | T tax = static_cast(0.0); 143 | T tay = static_cast(0.0); 144 | T taz = static_cast(0.0); 145 | T taw = static_cast(0.0); 146 | for (size_t j = 0; j < source->num_points; ++j) { 147 | const size_t jp = j + source->point_idx; // NOTE(review): jp is unused — source data is read from source_pos below 148 | const T dx = source_pos[j * 4 + 0] - xi; 149 | const T dy = source_pos[j * 4 + 1] - yi; 150 | const T dz = source_pos[j * 4 + 2] - zi; 151 | const T wj = source_pos[j * 4 + 3]; 152 | const T r = dx * dx + dy * dy + dz * dz; 153 | const T inv_r = (r == 0.0) ?
0.0 : 1.0/std::sqrt(r); 154 | const T inv_r_3 = inv_r * inv_r * inv_r * wj; 155 | tax += dx * inv_r_3; 156 | tay += dy * inv_r_3; 157 | taz += dz * inv_r_3; 158 | taw += inv_r * wj; 159 | } 160 | ax[ip] += tax; 161 | ay[ip] += tay; 162 | az[ip] += taz; 163 | aw[ip] += taw; 164 | } 165 | } 166 | } 167 | 168 | //#pragma omp parallel for schedule(guided) 169 | // for (size_t i = 0; i < fmm->num_nodes; ++i) { 170 | // node_t* target = &fmm->nodes[i]; 171 | // for (size_t j = 0; j < p2p_deps[i].size(); ++j) { 172 | // node_t* source = fmm->nodes + p2p_deps[i][j]; 173 | // if (target == source) { 174 | // p2p_tiled(fmm, target); 175 | // } 176 | // else { 177 | // p2p_tiled(fmm, target, source); 178 | // } 179 | // } 180 | // } 181 | compute_timer.stop(); 182 | printf(" %-16s %12.8f\n", "P2P Time (s) ", compute_timer.elapsed()); 183 | 184 | compute_timer.start(); 185 | #pragma omp parallel for schedule(guided) 186 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 187 | node_t* target = &fmm->nodes[i]; 188 | for (size_t j = 0; j < m2l_deps[i].size(); ++j) { 189 | node_t* source = fmm->nodes + m2l_deps[i][j]; 190 | m2l(fmm, target, source); 191 | } 192 | } 193 | compute_timer.stop(); 194 | printf(" %-16s %12.8f\n", "M2L Time (s) ", compute_timer.elapsed()); 195 | 196 | #pragma omp target exit data map(from: ax[:np], ay[:np], az[:np], aw[:np]) 197 | } 198 | 199 | 200 | template 201 | void perform_traversals(FMM* fmm) 202 | { 203 | #pragma omp parallel 204 | #pragma omp single 205 | printf("Running on %d threads\n", omp_get_num_threads()); 206 | 207 | Timer timer; 208 | Timer tot_timer; 209 | 210 | timer.start(); 211 | tot_timer.start(); 212 | #pragma omp parallel 213 | #pragma omp single 214 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 215 | timer.stop(); 216 | printf("\n"); 217 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 218 | 219 | dtt(fmm); 220 | 221 | timer.start(); 222 | #pragma omp parallel 223 | #pragma omp single 224 | downwards_pass(fmm, 
&fmm->nodes[fmm->root]); 225 | timer.stop(); 226 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 227 | 228 | tot_timer.stop(); 229 | printf("--------------------\n"); 230 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 231 | printf("--------------------\n\n"); 232 | } 233 | --------------------------------------------------------------------------------