├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── common ├── complex.hh ├── finalise.hh ├── flags.makefile ├── fmm.hh ├── get-deps.hh ├── gpu-kernels-no-atomics.hh ├── gpu-kernels.hh ├── gpu-spharm.hh ├── gpu-utils.hh ├── init.hh ├── input.hh ├── kernels.hh ├── kokkos-utils.hh ├── node.hh ├── spharm.hh ├── timer.hh ├── traversal.hh ├── tree.hh ├── utils.hh └── verify.hh ├── cuda ├── cuda-utils.hh ├── flags.makefile ├── node.hh └── traversal.hh ├── gpusched ├── flags.makefile └── traversal.hh ├── inputs ├── plummer.in ├── small.in └── uniform.in ├── kokkos-for ├── flags.makefile ├── node.hh └── traversal.hh ├── kokkos-task-locks ├── flags.makefile ├── node.hh └── traversal.hh ├── kokkos-task ├── flags.makefile ├── node.hh └── traversal.hh ├── main.cc ├── omp-for ├── flags.makefile ├── node.hh └── traversal.hh ├── omp-task ├── flags.makefile ├── node.hh └── traversal.hh └── omptarget ├── flags.makefile ├── node.hh └── traversal.hh /.gitignore: -------------------------------------------------------------------------------- 1 | fmm.* 2 | 3 | # Prerequisites 4 | *.d 5 | 6 | # Compiled Object files 7 | *.slo 8 | *.lo 9 | *.o 10 | *.obj 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Patrick Atkinson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPILER ?= GNU 2 | ARCH ?= native 3 | MODEL = omp-task 4 | 5 | EXES=$(addprefix fmm., $(MODEL)) 6 | SINGLE=$(addsuffix .single, $(EXES)) 7 | DOUBLE=$(addsuffix .double, $(EXES)) 8 | 9 | default: fmm.$(MODEL) 10 | 11 | include $(MODEL)/flags.makefile 12 | 13 | .PHONY: default clean 14 | 15 | COMMON_HEADERS=$(wildcard common/*.hh) 16 | COMMON_INC=-I./common 17 | 18 | main.o: main.cc $(MODEL)/*.hh $(COMMON_HEADERS) 19 | $(CC) $(CFLAGS) $(EXTRA_FLAGS) -I./$(MODEL) $(COMMON_INC) main.cc -c 20 | 21 | fmm.$(MODEL): main.o 22 | $(CC) main.o -o $@ $(LIBS) 23 | 24 | #fmm.%.single: main.cc %/*.hh $(COMMON_HEADERS) 25 | # $(CC) $(CFLAGS) -I $* $(COMMON_INC) main.cc -o $@ $(LIBS) 26 | # 27 | #fmm.%.double: main.cc %/*.hh $(COMMON_HEADERS) 28 | # $(CC) $(CFLAGS) -DFMM_DOUBLE -I $* $(COMMON_INC) main.cc -o $@ $(LIBS) 29 | 30 | clean: 31 | -rm -f fmm.* main.o 32 | 33 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | MiniFMM 2 | ======= 3 | 4 | Building 5 | -------- 6 | 7 | ``` 8 | make COMPILER=<compiler> ARCH=<arch> MODEL=<model> 9 | ``` 10 | 11 | The programming model must be the same name as the directory (i.e. `omp-task`). 12 | 13 | Running 14 | ------- 15 | 16 | ``` 17 | ./fmm.<model> 18 | ``` 19 | 20 | Valid arguments 21 | 22 | - `n`: no. of input particles 23 | - `c`: max. no. of particles per tree node 24 | - `t`: no. of multipole terms 25 | - `e`: theta (as in Barnes-Hut) 26 | - `m`: no. samples used to validate 27 | - `p`: use a Plummer input distribution 28 | - `u`: use a uniform input distribution (cube) 29 | - `i`: input file 30 | 31 | Some sample input files are given in `inputs/`. 32 | 33 | Information 34 | ----------- 35 | 36 | Please cite using: 37 | >Atkinson, P., & McIntosh-Smith, S. (2017). On the Performance of Parallel Tasking Runtimes for an Irregular Fast Multipole Method Application. In Scaling OpenMP for Exascale Performance and Portability (pp. 92–106). Springer International Publishing. 
https://doi.org/10.1007/978-3-319-65578-9_7 38 | 39 | For further information, please refer to: https://patrickatkinson.co.uk/phd-thesis.pdf 40 | 41 | -------------------------------------------------------------------------------- /common/complex.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "utils.hh" 6 | 7 | #if !defined(__NVCC__) && defined(__x86_64__) && (__GNUC__ >= 8) 8 | #warning falling back to std complex numbers 9 | template 10 | using complex_t = std::complex; 11 | 12 | template 13 | T complex_real(const std::complex& a) 14 | { 15 | return std::real(a); 16 | } 17 | 18 | template 19 | std::complex complex_conj(const std::complex& a) 20 | { 21 | return std::conj(a); 22 | } 23 | 24 | template 25 | std::complex complex_pow(const std::complex& a, const U& b) 26 | { 27 | return std::pow(a, b); 28 | } 29 | 30 | template 31 | std::complex complex_exp(const std::complex& a) 32 | { 33 | return std::exp(a); 34 | } 35 | 36 | template 37 | std::complex imag_pow(const int n) 38 | { 39 | return std::pow(std::complex(0.0, 1.0), n); 40 | } 41 | #else 42 | 43 | template 44 | struct complex_t { 45 | T re, im; 46 | 47 | HOSTDEVICE complex_t(T arg_re, T arg_im) : re(arg_re), im(arg_im) {} 48 | HOSTDEVICE complex_t() : re(static_cast(0.0)), im(static_cast(0.0)) {} 49 | 50 | std::complex convert() { return std::complex(re, im); } 51 | 52 | HOSTDEVICE complex_t& operator+=(const complex_t& rhs) 53 | { 54 | this->re += rhs.re; 55 | this->im += rhs.im; 56 | return *this; 57 | } 58 | 59 | HOSTDEVICE T real() { return this->re; } 60 | HOSTDEVICE T imag() { return this->im; } 61 | }; 62 | 63 | template 64 | HOSTDEVICE complex_t operator*(const complex_t& a, const complex_t& b) 65 | { 66 | T re = a.re * b.re - a.im * b.im; 67 | T im = a.re * b.im + a.im * b.re; 68 | return complex_t(re, im); 69 | } 70 | 71 | template 72 | HOSTDEVICE complex_t operator*(const complex_t& a, const T& b) 73 | { 74 
| return complex_t(a.re * b, a.im * b); 75 | } 76 | 77 | template 78 | HOSTDEVICE complex_t operator*(const T& a, const complex_t& b) 79 | { 80 | return b * a; 81 | } 82 | 83 | template 84 | HOSTDEVICE complex_t operator/(const complex_t& a, T& b) 85 | { 86 | return complex_t(a.re / b, a.im / b); 87 | } 88 | 89 | template 90 | HOSTDEVICE complex_t complex_exp(const complex_t& a) 91 | { 92 | const T r = std::exp(a.re); 93 | return complex_t(r * std::cos(a.im), r * std::sin(a.im)); 94 | } 95 | 96 | template 97 | HOSTDEVICE T complex_abs(const complex_t& a) 98 | { 99 | return std::hypot(a.re, a.im); 100 | } 101 | 102 | template 103 | HOSTDEVICE complex_t complex_pow(const complex_t& a, const T& b) 104 | { 105 | T r = complex_abs(a); 106 | if (a.re == 0.0) { 107 | printf("divide by zero\n"); 108 | exit(1); 109 | } 110 | T phi = std::atan(a.im / a.re); 111 | return std::pow(r, b) * complex_t(std::cos(phi * b), std::sin(phi * b)); 112 | } 113 | 114 | template 115 | HOSTDEVICE complex_t imag_pow(int n) 116 | { 117 | complex_t i = (n & 1) ? complex_t(0.0, 1.0) : complex_t(1.0, 0.0); 118 | i = (n & 2) ? 
i * static_cast(-1.0) : i; 119 | return i; 120 | } 121 | 122 | template 123 | HOSTDEVICE complex_t complex_conj(const complex_t& a) 124 | { 125 | return complex_t(a.re, -a.im); 126 | } 127 | 128 | template 129 | HOSTDEVICE T complex_real(const complex_t& a) 130 | { 131 | return a.re; 132 | } 133 | 134 | template 135 | HOSTDEVICE T complex_imag(const complex_t& a) 136 | { 137 | return a.im; 138 | } 139 | 140 | template 141 | HOSTDEVICE complex_t convert(std::complex a) 142 | { 143 | return complex_t(((T*)&a)[0], ((T*)&a)[1]); 144 | } 145 | #endif 146 | -------------------------------------------------------------------------------- /common/finalise.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | void finalise(FMM* fmm) 7 | { 8 | free(fmm->x); 9 | free(fmm->y); 10 | free(fmm->z); 11 | free(fmm->w); 12 | free(fmm->ax); 13 | free(fmm->ay); 14 | free(fmm->az); 15 | free(fmm->p); 16 | free(fmm->inner_factors); 17 | free(fmm->outer_factors); 18 | free(fmm->m); 19 | free(fmm->l); 20 | free(fmm->nodes); 21 | } 22 | -------------------------------------------------------------------------------- /common/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 | endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 
24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /common/fmm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #ifndef TASK_CUTOFF 9 | #define TASK_CUTOFF 5000 10 | #endif 11 | #define SOURCE_TASK_SPAWN 0 12 | 13 | template 14 | struct FMM 15 | { 16 | node_t* nodes; 17 | size_t root; 18 | T* x = nullptr; 19 | T* y = nullptr; 20 | T* z = nullptr; 21 | T* w = nullptr; 22 | T* ax = nullptr; 23 | T* ay = nullptr; 24 | T* az = nullptr; 25 | T* p = nullptr; 26 | size_t num_points; 27 | size_t ncrit; 28 | int num_terms; 29 | T theta; 30 | T theta2; 31 | size_t num_samples; 32 | size_t num_multipoles; 33 | size_t num_nodes; 34 | size_t num_spharm_terms; 35 | complex_t* inner_factors = nullptr; 36 | complex_t* outer_factors = nullptr; 37 | complex_t* m = nullptr; 38 | complex_t* l = nullptr; 39 | enum Dist { 40 | Uniform = 0, 41 | Plummer, 42 | NumDist, 43 | }; 44 | Dist dist; 45 | }; 46 | -------------------------------------------------------------------------------- /common/get-deps.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | template 10 | void get_deps_omp_task(FMM* fmm, node_t* target, node_t* source, 11 | std::vector>* p2p_deps, 12 | std::vector>* m2l_deps) 13 | { 14 | T dx = source->cx - target->cx; 15 | T dy = source->cy - target->cy; 16 | T dz = source->cz - target->cz; 17 | T r2 = dx * dx + dy * dy + dz * dz; 18 | T d1 = source->rad * static_cast(2.0); 19 | T d2 = target->rad * static_cast(2.0); 20 | 21 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 22 | omp_set_lock(&target->m2l_lock); 23 | (*m2l_deps)[target->node_idx].push_back(source->node_idx); 24 | omp_unset_lock(&target->m2l_lock); 25 | } 26 | else if (source->is_leaf() && 
target->is_leaf()) { 27 | omp_set_lock(&target->p2p_lock); 28 | (*p2p_deps)[target->node_idx].push_back(source->node_idx); 29 | omp_unset_lock(&target->p2p_lock); 30 | } 31 | else { 32 | T target_sz = target->rad; 33 | T source_sz = source->rad; 34 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 35 | for (size_t i = 0; i < target->num_children; ++i) { 36 | node_t* child = &fmm->nodes[target->child[i]]; 37 | #pragma omp task if (target->num_points > TASK_CUTOFF) 38 | get_deps_omp_task(fmm, child, source, p2p_deps, m2l_deps); 39 | } 40 | } 41 | else { 42 | for (size_t i = 0; i < source->num_children; ++i) { 43 | //#pragma omp task if(source->num_points > TASK_CUTOFF && 44 | // SOURCE_TASK_SPAWN) 45 | node_t* child = &fmm->nodes[source->child[i]]; 46 | get_deps_omp_task(fmm, target, child, p2p_deps, m2l_deps); 47 | } 48 | } 49 | } 50 | } 51 | 52 | template 53 | void get_deps_omp(FMM* fmm, std::vector>* p2p_deps, 54 | std::vector>* m2l_deps) 55 | { 56 | node_t* root_node = fmm->nodes + fmm->root; 57 | #pragma omp parallel 58 | #pragma omp single 59 | get_deps_omp_task(fmm, root_node, root_node, p2p_deps, m2l_deps); 60 | } 61 | 62 | void pack_deps(std::vector>& deps, size_t** ret_nodes, 63 | size_t** ret_deps, size_t** ret_offsets, size_t** ret_sizes, 64 | size_t* ret_count, size_t* ret_num_nodes) 65 | { 66 | size_t* prefixes1d = (size_t*)malloc(sizeof(size_t) * deps.size()); 67 | size_t* prefixes2d = (size_t*)malloc(sizeof(size_t) * deps.size()); 68 | 69 | prefixes1d[0] = 0; 70 | prefixes2d[0] = 0; 71 | 72 | size_t count = 0; 73 | size_t num_nodes = 0; 74 | 75 | count += deps[0].size(); 76 | if (deps[0].size() > 0) num_nodes++; 77 | 78 | for (size_t i = 1; i < deps.size(); ++i) { 79 | size_t flag = (deps[i - 1].size() > 0); 80 | prefixes1d[i] = flag + prefixes1d[i - 1]; 81 | prefixes2d[i] = deps[i - 1].size() + prefixes2d[i - 1]; 82 | count += deps[i].size(); 83 | if (deps[i].size() > 0) num_nodes++; 84 | } 85 | 86 | size_t* deps_array = 
(size_t*)malloc(sizeof(size_t) * count); 87 | size_t* nodes = (size_t*)malloc(sizeof(size_t) * num_nodes); 88 | size_t* sizes = (size_t*)malloc(sizeof(size_t) * num_nodes); 89 | size_t* offsets = (size_t*)malloc(sizeof(size_t) * num_nodes); 90 | 91 | #pragma omp parallel for 92 | for (size_t i = 0; i < deps.size(); ++i) { 93 | if (deps[i].size() > 0) { 94 | size_t beg = prefixes2d[i]; 95 | size_t end = beg + deps[i].size(); 96 | size_t c = 0; 97 | for (size_t j = beg; j < end; ++j) { 98 | deps_array[j] = deps[i][c++]; 99 | } 100 | const size_t idx = prefixes1d[i]; 101 | sizes[idx] = deps[i].size(); 102 | offsets[idx] = beg; 103 | nodes[idx] = i; 104 | } 105 | } 106 | 107 | free(prefixes1d); 108 | free(prefixes2d); 109 | 110 | *ret_nodes = nodes; 111 | *ret_deps = deps_array; 112 | *ret_offsets = offsets; 113 | *ret_sizes = sizes; 114 | *ret_count = count; 115 | *ret_num_nodes = num_nodes; 116 | } 117 | -------------------------------------------------------------------------------- /common/gpu-kernels-no-atomics.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __CUDACC__ 7 | 8 | #include 9 | 10 | #define NTERMS 4 11 | #warning manually setting nterms 12 | 13 | template 14 | __device__ void p2p_gpu(FMM* fmm, node_t* target) 15 | { 16 | #ifdef KOKKOS 17 | using gpu_utils = gpu_utils; 18 | #endif 19 | 20 | const int tp = target->point_idx; 21 | 22 | __shared__ float4 base_shmem[STILE * WPB]; 23 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 24 | 25 | T ax[REG] = {0}; 26 | T ay[REG] = {0}; 27 | T az[REG] = {0}; 28 | T aw[REG] = {0}; 29 | T xi[REG] = {0}; 30 | T yi[REG] = {0}; 31 | T zi[REG] = {0}; 32 | 33 | const int tid = gpu_utils::thread_id(); 34 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 35 | 36 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 37 | #pragma unroll 38 | for (int i = 0; i < REG; ++i) { 39 | ax[i] = 0.0; 40 | ay[i] = 
0.0; 41 | az[i] = 0.0; 42 | aw[i] = 0.0; 43 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 44 | : 0.0; 45 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 46 | : 0.0; 47 | zi[i] = ((i * TBS + ii) < target->num_points) ? fmm->z[tp + i * TBS + ii] 48 | : 0.0; 49 | } 50 | for (int jj = 0; jj < target->num_points; jj += STILE) { 51 | const int jlim = min(STILE, (int)target->num_points - jj); 52 | gpu_utils::sync_worker(); 53 | #pragma unroll 54 | for (int j = tid; j < jlim; j += TBS) { 55 | shmem[j] = make_float4(fmm->x[tp + jj + j], fmm->y[tp + jj + j], 56 | fmm->z[tp + jj + j], fmm->w[tp + jj + j]); 57 | } 58 | gpu_utils::sync_worker(); 59 | #pragma unroll 60 | for (int j = 0; j < jlim; ++j) { 61 | const float4 sj = shmem[j]; 62 | #pragma unroll 63 | for (int i = 0; i < REG; ++i) { 64 | const T dx = sj.x - xi[i]; 65 | const T dy = sj.y - yi[i]; 66 | const T dz = sj.z - zi[i]; 67 | const T sw = sj.w; 68 | const T r = dx * dx + dy * dy + dz * dz; 69 | const T inv_r = (r == 0.0) ? 
0.0 : rsqrtf(r); 70 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 71 | ax[i] += dx * inv_r_3; 72 | ay[i] += dy * inv_r_3; 73 | az[i] += dz * inv_r_3; 74 | aw[i] += sw * inv_r; 75 | } 76 | } 77 | } 78 | #pragma unroll 79 | for (int i = 0; i < REG; ++i) { 80 | if ((i * TBS + ii) < target->num_points) { 81 | fmm->ax[tp + i*TBS+ii] += ax[i]; 82 | fmm->ay[tp + i*TBS+ii] += ay[i]; 83 | fmm->az[tp + i*TBS+ii] += az[i]; 84 | fmm->p[tp + i*TBS+ii] += aw[i]; 85 | //atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 86 | //atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 87 | //atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 88 | //atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 89 | } 90 | } 91 | } 92 | } 93 | 94 | template 95 | __device__ void p2p_gpu(FMM* fmm, node_t* target, node_t* source) 96 | { 97 | #ifdef KOKKOS 98 | using gpu_utils = gpu_utils; 99 | #endif 100 | 101 | const int tp = target->point_idx; 102 | const int sp = source->point_idx; 103 | 104 | __shared__ float4 base_shmem[STILE * WPB]; 105 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 106 | T ax[REG] = {0}; 107 | T ay[REG] = {0}; 108 | T az[REG] = {0}; 109 | T aw[REG] = {0}; 110 | T xi[REG] = {0}; 111 | T yi[REG] = {0}; 112 | T zi[REG] = {0}; 113 | 114 | const int tid = gpu_utils::thread_id(); 115 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 116 | 117 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 118 | #pragma unroll 119 | for (int i = 0; i < REG; ++i) { 120 | ax[i] = 0.0; 121 | ay[i] = 0.0; 122 | az[i] = 0.0; 123 | aw[i] = 0.0; 124 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 125 | : 0.0; 126 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 127 | : 0.0; 128 | zi[i] = ((i * TBS + ii) < target->num_points) ? 
fmm->z[tp + i * TBS + ii] 129 | : 0.0; 130 | } 131 | for (int jj = 0; jj < source->num_points; jj += STILE) { 132 | const int jlim = min(STILE, (int)source->num_points - jj); 133 | gpu_utils::sync_worker(); 134 | #pragma unroll 135 | for (int j = tid; j < jlim; j += TBS) { 136 | shmem[j] = make_float4(fmm->x[sp + jj + j], fmm->y[sp + jj + j], 137 | fmm->z[sp + jj + j], fmm->w[sp + jj + j]); 138 | } 139 | gpu_utils::sync_worker(); 140 | #pragma unroll 141 | for (int j = 0; j < jlim; ++j) { 142 | const float4 sj = shmem[j]; 143 | #pragma unroll 144 | for (int i = 0; i < REG; ++i) { 145 | const T dx = sj.x - xi[i]; 146 | const T dy = sj.y - yi[i]; 147 | const T dz = sj.z - zi[i]; 148 | const T sw = sj.w; 149 | const T r = dx * dx + dy * dy + dz * dz; 150 | const T inv_r = rsqrtf(r); 151 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 152 | ax[i] += dx * inv_r_3; 153 | ay[i] += dy * inv_r_3; 154 | az[i] += dz * inv_r_3; 155 | aw[i] += sw * inv_r; 156 | } 157 | } 158 | } 159 | #pragma unroll 160 | for (int i = 0; i < REG; ++i) { 161 | if ((i * TBS + ii) < target->num_points) { 162 | fmm->ax[tp + i*TBS+ii] += ax[i]; 163 | fmm->ay[tp + i*TBS+ii] += ay[i]; 164 | fmm->az[tp + i*TBS+ii] += az[i]; 165 | fmm->p[tp + i*TBS+ii] += aw[i]; 166 | //atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 167 | //atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 168 | //atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 169 | //atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 170 | } 171 | } 172 | } 173 | } 174 | 175 | template 176 | __device__ void m2l_gpu(FMM* fmm, node_t* target, node_t* source) 177 | { 178 | #ifdef KOKKOS 179 | using gpu_utils = gpu_utils; 180 | #endif 181 | 182 | const int size = NTERMS * NTERMS; 183 | const int shmem_size = size * (sizeof(T) + sizeof(complex_t) * 2); 184 | __shared__ char shmem[shmem_size * WPB]; 185 | char* warp_shmem = shmem + gpu_utils::worker_id() * shmem_size; 186 | 187 | T* legendre = (T*)warp_shmem; 188 | complex_t* outer = (complex_t*)(warp_shmem + 
sizeof(T) * size); 189 | complex_t* shared_m = 190 | (complex_t*)(warp_shmem + (sizeof(complex_t) + sizeof(T)) * size); 191 | 192 | T dx = target->cx - source->cx; 193 | T dy = target->cy - source->cy; 194 | T dz = target->cz - source->cz; 195 | T rho, alpha, beta; 196 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 197 | compute_outer_gpu(fmm, rho, alpha, beta, outer, legendre); 198 | complex_t* Msource = &fmm->m[source->mult_idx]; 199 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 200 | 201 | #pragma unroll 202 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 203 | i += TPB) { 204 | shared_m[i] = Msource[i]; 205 | } 206 | gpu_utils::sync_worker(); 207 | 208 | #pragma unroll 209 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 210 | i += TPB) { 211 | const int j = (int)sqrtf((float)i); 212 | const int k = i - j * j - j; 213 | 214 | complex_t tmp(0.0, 0.0); 215 | for (int n = 0; n < fmm->num_terms - j; ++n) { 216 | for (int m = -n; m <= n; ++m) { 217 | tmp += shared_m[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 218 | } 219 | } 220 | Ltarget[i] += tmp; 221 | //atomicAdd(&(Ltarget[i].re), tmp.re); 222 | //atomicAdd(&(Ltarget[i].im), tmp.im); 223 | } 224 | } 225 | 226 | #endif 227 | -------------------------------------------------------------------------------- /common/gpu-kernels.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __CUDACC__ 7 | 8 | #include 9 | 10 | #define NTERMS 4 11 | #warning manually setting nterms 12 | 13 | template 14 | __device__ inline T pow_one(int m) 15 | { 16 | return static_cast(1 + ((m & 0x01) * -2)); 17 | } 18 | 19 | template 20 | __device__ void p2p_gpu(FMM* fmm, node_t* target) 21 | { 22 | #ifdef KOKKOS 23 | using gpu_utils = gpu_utils; 24 | #endif 25 | 26 | const int tp = target->point_idx; 27 | 28 | __shared__ float4 base_shmem[STILE * WPB]; 29 | float4* shmem = base_shmem + 
gpu_utils::worker_id() * STILE; 30 | 31 | T ax[REG] = {0}; 32 | T ay[REG] = {0}; 33 | T az[REG] = {0}; 34 | T aw[REG] = {0}; 35 | T xi[REG] = {0}; 36 | T yi[REG] = {0}; 37 | T zi[REG] = {0}; 38 | 39 | const int tid = gpu_utils::thread_id(); 40 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 41 | 42 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 43 | #pragma unroll 44 | for (int i = 0; i < REG; ++i) { 45 | ax[i] = 0.0; 46 | ay[i] = 0.0; 47 | az[i] = 0.0; 48 | aw[i] = 0.0; 49 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 50 | : 0.0; 51 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 52 | : 0.0; 53 | zi[i] = ((i * TBS + ii) < target->num_points) ? fmm->z[tp + i * TBS + ii] 54 | : 0.0; 55 | } 56 | for (int jj = 0; jj < target->num_points; jj += STILE) { 57 | const int jlim = min(STILE, (int)target->num_points - jj); 58 | gpu_utils::sync_worker(); 59 | #pragma unroll 60 | for (int j = tid; j < jlim; j += TBS) { 61 | shmem[j] = make_float4(fmm->x[tp + jj + j], fmm->y[tp + jj + j], 62 | fmm->z[tp + jj + j], fmm->w[tp + jj + j]); 63 | } 64 | gpu_utils::sync_worker(); 65 | #pragma unroll 66 | for (int j = 0; j < jlim; ++j) { 67 | const float4 sj = shmem[j]; 68 | #pragma unroll 69 | for (int i = 0; i < REG; ++i) { 70 | const T dx = sj.x - xi[i]; 71 | const T dy = sj.y - yi[i]; 72 | const T dz = sj.z - zi[i]; 73 | const T sw = sj.w; 74 | const T r = dx * dx + dy * dy + dz * dz; 75 | const T inv_r = (r == 0.0) ? 
0.0 : rsqrtf(r); 76 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 77 | ax[i] += dx * inv_r_3; 78 | ay[i] += dy * inv_r_3; 79 | az[i] += dz * inv_r_3; 80 | aw[i] += sw * inv_r; 81 | } 82 | } 83 | } 84 | #pragma unroll 85 | for (int i = 0; i < REG; ++i) { 86 | if ((i * TBS + ii) < target->num_points) { 87 | // fmm->ax[tp + i*TBS+ii] += ax[i]; 88 | // fmm->ay[tp + i*TBS+ii] += ay[i]; 89 | // fmm->az[tp + i*TBS+ii] += az[i]; 90 | // fmm->p[tp + i*TBS+ii] += aw[i]; 91 | atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 92 | atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 93 | atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 94 | atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 95 | } 96 | } 97 | } 98 | } 99 | 100 | template 101 | __device__ void p2p_gpu(FMM* fmm, node_t* target, node_t* source) 102 | { 103 | #ifdef KOKKOS 104 | using gpu_utils = gpu_utils; 105 | #endif 106 | 107 | const int tp = target->point_idx; 108 | const int sp = source->point_idx; 109 | 110 | __shared__ float4 base_shmem[STILE * WPB]; 111 | float4* shmem = base_shmem + gpu_utils::worker_id() * STILE; 112 | T ax[REG] = {0}; 113 | T ay[REG] = {0}; 114 | T az[REG] = {0}; 115 | T aw[REG] = {0}; 116 | T xi[REG] = {0}; 117 | T yi[REG] = {0}; 118 | T zi[REG] = {0}; 119 | 120 | const int tid = gpu_utils::thread_id(); 121 | const int ilim = ((target->num_points + TBS - 1) / TBS) * TBS; 122 | 123 | for (int ii = tid; ii < ilim; ii += TBS * REG) { 124 | #pragma unroll 125 | for (int i = 0; i < REG; ++i) { 126 | ax[i] = 0.0; 127 | ay[i] = 0.0; 128 | az[i] = 0.0; 129 | aw[i] = 0.0; 130 | xi[i] = ((i * TBS + ii) < target->num_points) ? fmm->x[tp + i * TBS + ii] 131 | : 0.0; 132 | yi[i] = ((i * TBS + ii) < target->num_points) ? fmm->y[tp + i * TBS + ii] 133 | : 0.0; 134 | zi[i] = ((i * TBS + ii) < target->num_points) ? 
fmm->z[tp + i * TBS + ii] 135 | : 0.0; 136 | } 137 | for (int jj = 0; jj < source->num_points; jj += STILE) { 138 | const int jlim = min(STILE, (int)source->num_points - jj); 139 | gpu_utils::sync_worker(); 140 | #pragma unroll 141 | for (int j = tid; j < jlim; j += TBS) { 142 | shmem[j] = make_float4(fmm->x[sp + jj + j], fmm->y[sp + jj + j], 143 | fmm->z[sp + jj + j], fmm->w[sp + jj + j]); 144 | } 145 | gpu_utils::sync_worker(); 146 | #pragma unroll 147 | for (int j = 0; j < jlim; ++j) { 148 | const float4 sj = shmem[j]; 149 | #pragma unroll 150 | for (int i = 0; i < REG; ++i) { 151 | const T dx = sj.x - xi[i]; 152 | const T dy = sj.y - yi[i]; 153 | const T dz = sj.z - zi[i]; 154 | const T sw = sj.w; 155 | const T r = dx * dx + dy * dy + dz * dz; 156 | const T inv_r = rsqrtf(r); 157 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 158 | ax[i] += dx * inv_r_3; 159 | ay[i] += dy * inv_r_3; 160 | az[i] += dz * inv_r_3; 161 | aw[i] += sw * inv_r; 162 | } 163 | } 164 | } 165 | #pragma unroll 166 | for (int i = 0; i < REG; ++i) { 167 | if ((i * TBS + ii) < target->num_points) { 168 | // fmm->ax[tp + i*TBS+ii] += ax[i]; 169 | // fmm->ay[tp + i*TBS+ii] += ay[i]; 170 | // fmm->az[tp + i*TBS+ii] += az[i]; 171 | // fmm->p[tp + i*TBS+ii] += aw[i]; 172 | atomicAdd(fmm->ax + tp + i * TBS + ii, ax[i]); 173 | atomicAdd(fmm->ay + tp + i * TBS + ii, ay[i]); 174 | atomicAdd(fmm->az + tp + i * TBS + ii, az[i]); 175 | atomicAdd(fmm->p + tp + i * TBS + ii, aw[i]); 176 | } 177 | } 178 | } 179 | } 180 | 181 | template 182 | __device__ void m2l_gpu(FMM* fmm, node_t* target, node_t* source) 183 | { 184 | #ifdef KOKKOS 185 | using gpu_utils = gpu_utils; 186 | #endif 187 | 188 | const int size = NTERMS * NTERMS; 189 | const int shmem_size = size * (sizeof(T) + sizeof(complex_t) * 2); 190 | __shared__ char shmem[shmem_size * WPB]; 191 | char* warp_shmem = shmem + gpu_utils::worker_id() * shmem_size; 192 | 193 | T* legendre = (T*)warp_shmem; 194 | complex_t* outer = (complex_t*)(warp_shmem + 
sizeof(T) * size); 195 | complex_t* shared_m = 196 | (complex_t*)(warp_shmem + (sizeof(complex_t) + sizeof(T)) * size); 197 | 198 | T dx = target->cx - source->cx; 199 | T dy = target->cy - source->cy; 200 | T dz = target->cz - source->cz; 201 | T rho, alpha, beta; 202 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 203 | compute_outer_gpu(fmm, rho, alpha, beta, outer, legendre); 204 | complex_t* Msource = &fmm->m[source->mult_idx]; 205 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 206 | 207 | #pragma unroll 208 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 209 | i += TPB) { 210 | shared_m[i] = Msource[i]; 211 | } 212 | gpu_utils::sync_worker(); 213 | 214 | #pragma unroll 215 | for (int i = gpu_utils::thread_id(); i < fmm->num_terms * fmm->num_terms; 216 | i += TPB) { 217 | const int j = (int)sqrtf((float)i); 218 | const int k = i - j * j - j; 219 | 220 | complex_t tmp(0.0, 0.0); 221 | for (int n = 0; n < fmm->num_terms - j; ++n) { 222 | for (int m = -n; m <= n; ++m) { 223 | tmp += shared_m[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 224 | } 225 | } 226 | // Ltarget[i] += tmp; 227 | atomicAdd(&(Ltarget[i].re), tmp.re); 228 | atomicAdd(&(Ltarget[i].im), tmp.im); 229 | } 230 | } 231 | 232 | template 233 | INLINE void p2m_gpu(FMM* fmm, node_t* node) 234 | { 235 | #ifdef KOKKOS 236 | using gpu_utils = gpu_utils; 237 | #endif 238 | 239 | size_t pt_offset = node->point_idx; 240 | size_t mt_offset = node->mult_idx; 241 | 242 | __shared__ T shmem_all[WPB * NTERMS * NTERMS * 3]; 243 | T* shmem = shmem_all + gpu_utils::worker_id() * NTERMS * NTERMS * 3; 244 | T* legendre = shmem; 245 | complex_t* inner = (complex_t*)(shmem + NTERMS * NTERMS); 246 | 247 | for (size_t i = 0; i < node->num_points; ++i) { 248 | T dx = fmm->x[i + pt_offset] - node->cx; 249 | T dy = fmm->y[i + pt_offset] - node->cy; 250 | T dz = fmm->z[i + pt_offset] - node->cz; 251 | T r, theta, phi; 252 | cart_to_sph(dx, dy, dz, r, theta, phi); 253 | compute_inner_gpu(fmm, r, 
theta, phi, inner, legendre); 254 | gpu_utils::sync_worker(); 255 | #pragma unroll 256 | for (int t = gpu_utils::thread_id(); t < fmm->num_terms * fmm->num_terms; 257 | t += TPB) { 258 | const int n = (int)sqrtf((float)t); 259 | fmm->m[mt_offset + t] += fmm->w[i + pt_offset] * pow_one(n) * inner[t]; 260 | } 261 | } 262 | } 263 | 264 | template 265 | __device__ void m2m_gpu(FMM* fmm, node_t* node) 266 | { 267 | #ifdef KOKKOS 268 | using gpu_utils = gpu_utils; 269 | #endif 270 | 271 | __shared__ T shmem_all[WPB * NTERMS * NTERMS * 3]; 272 | T* shmem = shmem_all + gpu_utils::worker_id() * NTERMS * NTERMS * 3; 273 | T* legendre = shmem; 274 | complex_t* inner = (complex_t*)(shmem + NTERMS * NTERMS); 275 | 276 | for (size_t i = 0; i < node->num_children; ++i) { 277 | node_t* child = &fmm->nodes[node->child[i]]; 278 | T dx = node->cx - child->cx; 279 | T dy = node->cy - child->cy; 280 | T dz = node->cz - child->cz; 281 | T r, theta, phi; 282 | cart_to_sph(dx, dy, dz, r, theta, phi); 283 | 284 | const complex_t* Mchild = &fmm->m[child->mult_idx]; 285 | complex_t* Mnode = &fmm->m[node->mult_idx]; 286 | 287 | compute_inner_gpu(fmm, r, theta, phi, inner, legendre); 288 | gpu_utils::sync_worker(); 289 | #pragma unroll 290 | for (int t = gpu_utils::thread_id(); t < fmm->num_terms * fmm->num_terms; 291 | t += TPB) { 292 | const int j = (int)sqrtf((float)t); 293 | const int k = t - j * j - j; 294 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 295 | for (int n = 0; n <= j; ++n) { 296 | for (int m = -n; m <= n; ++m) { 297 | if (abs(k - m) <= j - n) 298 | tmp += Mchild[mult_idx(n, m)] * inner[mult_idx(j - n, k - m)]; 299 | } 300 | } 301 | Mnode[t] += tmp; 302 | } 303 | } 304 | } 305 | 306 | #endif 307 | -------------------------------------------------------------------------------- /common/gpu-spharm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | INLINE void compute_outer_gpu(FMM* 
fmm, T r, T theta, T phi, 7 | complex_t* outer, T* legendre, 8 | complex_t* outer_deriv = nullptr, 9 | T* legendre_deriv = nullptr) 10 | { 11 | #ifdef KOKKOS 12 | using gpu_utils = gpu_utils; 13 | #endif 14 | if (fmm->num_terms == 0) return; 15 | const int lid = gpu_utils::thread_id(); 16 | const int num_lanes = TPB; 17 | if (lid == 0) { 18 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre, 19 | legendre_deriv); 20 | } 21 | gpu_utils::sync_worker(); 22 | #pragma unroll 23 | for (int i = lid; i < fmm->num_terms * fmm->num_terms; i += num_lanes) { 24 | const int n = (int)sqrtf((float)i); 25 | const int m = i - n * n - n; 26 | outer[i] = fmm->outer_factors[i] * legendre[leg_idx(n, m)] * 27 | complex_exp(complex_t(0.0, 1.0) * static_cast(m) * phi) * 28 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 29 | } 30 | } 31 | 32 | template 33 | INLINE void compute_inner_gpu(FMM* fmm, T r, T theta, T phi, 34 | complex_t* inner, T* legendre, 35 | complex_t* inner_deriv = nullptr, 36 | T* legendre_deriv = nullptr) 37 | { 38 | #ifdef KOKKOS 39 | using gpu_utils = gpu_utils; 40 | #endif 41 | if (fmm->num_terms == 0) return; 42 | const int lid = gpu_utils::thread_id(); 43 | const int num_lanes = TPB; 44 | if (lid == 0) { 45 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre, 46 | legendre_deriv); 47 | } 48 | gpu_utils::sync_worker(); 49 | #pragma unroll 50 | for (int i = lid; i < fmm->num_terms * fmm->num_terms; i += num_lanes) { 51 | const int n = (int)sqrtf((float)i); 52 | const int m = i - n * n - n; 53 | inner[i] = fmm->inner_factors[i] * legendre[leg_idx(n, m)] * 54 | complex_exp(complex_t(0.0, 1.0) * static_cast(m) * phi) * 55 | std::pow(r, static_cast(n)); 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /common/gpu-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | 5 | #define HOSTDEVICE 
__host__ __device__ 6 | #define CUDACHK(ans) \ 7 | { \ 8 | gpu_assert((ans), __FILE__, __LINE__); \ 9 | } 10 | inline void gpu_assert(cudaError_t code, const char* file, int line) 11 | { 12 | if (code != cudaSuccess) { 13 | fprintf(stderr, "CUDACHK: %s %s %d\n", cudaGetErrorString(code), file, 14 | line); 15 | exit(code); 16 | } 17 | } 18 | #else 19 | #define HOSTDEVICE 20 | #endif 21 | 22 | template 23 | struct FMM; 24 | 25 | template 26 | void alloc_and_copy(T** dst, const T* src, size_t nelms) 27 | { 28 | #ifdef __CUDACC__ 29 | CUDACHK(cudaMalloc((void**)dst, sizeof(T) * nelms)); 30 | CUDACHK(cudaMemcpy(*dst, src, sizeof(T) * nelms, cudaMemcpyHostToDevice)); 31 | #else 32 | *dst = (T*)malloc(sizeof(T) * nelms); 33 | memcpy(*dst, src, sizeof(T) * nelms); 34 | #endif 35 | } 36 | 37 | template 38 | void copy_back(T* dst, T* src, size_t nelms) 39 | { 40 | #ifdef __CUDACC__ 41 | CUDACHK(cudaMemcpy(dst, src, sizeof(T) * nelms, cudaMemcpyDeviceToHost)); 42 | #else 43 | memcpy(dst, src, sizeof(T) * nelms); 44 | #endif 45 | } 46 | 47 | template 48 | void init_device_fmm(FMM* fmm, FMM** h_fmm_ret, FMM** d_fmm_ret) 49 | { 50 | FMM* h_fmm = (FMM*)malloc(sizeof(FMM)); 51 | *h_fmm = *fmm; 52 | 53 | alloc_and_copy(&h_fmm->nodes, fmm->nodes, fmm->num_nodes); 54 | alloc_and_copy(&h_fmm->x, fmm->x, fmm->num_points); 55 | alloc_and_copy(&h_fmm->y, fmm->y, fmm->num_points); 56 | alloc_and_copy(&h_fmm->z, fmm->z, fmm->num_points); 57 | alloc_and_copy(&h_fmm->w, fmm->w, fmm->num_points); 58 | alloc_and_copy(&h_fmm->ax, fmm->ax, fmm->num_points); 59 | alloc_and_copy(&h_fmm->ay, fmm->ay, fmm->num_points); 60 | alloc_and_copy(&h_fmm->az, fmm->az, fmm->num_points); 61 | alloc_and_copy(&h_fmm->p, fmm->p, fmm->num_points); 62 | alloc_and_copy(&h_fmm->inner_factors, fmm->inner_factors, 63 | fmm->num_multipoles); 64 | alloc_and_copy(&h_fmm->outer_factors, fmm->outer_factors, 65 | fmm->num_multipoles); 66 | alloc_and_copy(&h_fmm->m, fmm->m, fmm->num_multipoles * fmm->num_nodes); 67 | 
alloc_and_copy(&h_fmm->l, fmm->l, fmm->num_multipoles * fmm->num_nodes); 68 | 69 | FMM* d_fmm; 70 | alloc_and_copy(&d_fmm, h_fmm, 1); 71 | 72 | *h_fmm_ret = h_fmm; 73 | *d_fmm_ret = d_fmm; 74 | } 75 | 76 | template 77 | void device_free(T* p) 78 | { 79 | #ifdef __CUDACC__ 80 | CUDACHK(cudaFree(p)); 81 | #else 82 | free(p); 83 | #endif 84 | } 85 | 86 | template 87 | void fini_device_fmm(FMM* fmm, FMM* h_fmm, FMM* d_fmm) 88 | { 89 | copy_back(fmm->ax, h_fmm->ax, fmm->num_points); 90 | copy_back(fmm->ay, h_fmm->ay, fmm->num_points); 91 | copy_back(fmm->az, h_fmm->az, fmm->num_points); 92 | copy_back(fmm->p, h_fmm->p, fmm->num_points); 93 | 94 | copy_back(fmm->m, h_fmm->m, fmm->num_nodes * fmm->num_multipoles); 95 | copy_back(fmm->l, h_fmm->l, fmm->num_nodes * fmm->num_multipoles); 96 | 97 | device_free(h_fmm->nodes); 98 | device_free(h_fmm->x); 99 | device_free(h_fmm->y); 100 | device_free(h_fmm->z); 101 | device_free(h_fmm->w); 102 | device_free(h_fmm->ax); 103 | device_free(h_fmm->ay); 104 | device_free(h_fmm->az); 105 | device_free(h_fmm->p); 106 | device_free(h_fmm->inner_factors); 107 | device_free(h_fmm->outer_factors); 108 | device_free(h_fmm->m); 109 | device_free(h_fmm->l); 110 | 111 | device_free(d_fmm); 112 | 113 | free(h_fmm); 114 | } 115 | 116 | template 117 | void update_device_array(T* d_array, T* h_array, size_t n) 118 | { 119 | #ifdef __CUDACC__ 120 | CUDACHK(cudaMemcpy(d_array, h_array, sizeof(T) * n, cudaMemcpyHostToDevice)); 121 | #else 122 | memcpy(d_array, h_array, sizeof(T) * n); 123 | #endif 124 | } 125 | 126 | template 127 | void update_host_array(T* h_array, T* d_array, size_t n) 128 | { 129 | #ifdef __CUDACC__ 130 | CUDACHK(cudaMemcpy(h_array, d_array, sizeof(T) * n, cudaMemcpyDeviceToHost)); 131 | #else 132 | memcpy(h_array, d_array, sizeof(T) * n); 133 | #endif 134 | } 135 | -------------------------------------------------------------------------------- /common/init.hh: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | template 8 | T rand_dist(T low, T high) 9 | { 10 | const T randn = (T)rand() / (T)RAND_MAX; 11 | return (low + randn * (high - low)); 12 | } 13 | 14 | template 15 | void init(FMM* fmm) 16 | { 17 | fmm->theta2 = fmm->theta * fmm->theta; 18 | fmm->num_multipoles = fmm->num_terms * fmm->num_terms; 19 | fmm->num_spharm_terms = fmm->num_terms * fmm->num_terms; 20 | 21 | fmm->x = (T*)malloc(sizeof(T) * fmm->num_points); 22 | fmm->y = (T*)malloc(sizeof(T) * fmm->num_points); 23 | fmm->z = (T*)malloc(sizeof(T) * fmm->num_points); 24 | fmm->w = (T*)malloc(sizeof(T) * fmm->num_points); 25 | fmm->ax = (T*)malloc(sizeof(T) * fmm->num_points); 26 | fmm->ay = (T*)malloc(sizeof(T) * fmm->num_points); 27 | fmm->az = (T*)malloc(sizeof(T) * fmm->num_points); 28 | fmm->p = (T*)malloc(sizeof(T) * fmm->num_points); 29 | 30 | srand(42); 31 | 32 | if (fmm->dist == FMM::Dist::Uniform) { 33 | for (size_t i = 0; i < fmm->num_points; ++i) { 34 | fmm->x[i] = static_cast(rand()) / static_cast(RAND_MAX); 35 | fmm->y[i] = static_cast(rand()) / static_cast(RAND_MAX); 36 | fmm->z[i] = static_cast(rand()) / static_cast(RAND_MAX); 37 | fmm->w[i] = static_cast(rand()) / static_cast(RAND_MAX); 38 | } 39 | } 40 | else if (fmm->dist == FMM::Dist::Plummer) { 41 | for (size_t i = 0; i < fmm->num_points; ++i) { 42 | T randn = (static_cast(rand()) / static_cast(RAND_MAX)); 43 | randn += (randn == static_cast(0.0)) 44 | ? std::numeric_limits::epsilon() 45 | : static_cast(0.0); 46 | randn -= (randn == static_cast(1.0)) 47 | ? 
std::numeric_limits::epsilon() 48 | : static_cast(0.0); 49 | const T radius = 50 | static_cast(1.0) / std::sqrt(std::pow(randn, (-2.0 / 3.0)) - 1.0); 51 | const T theta = std::acos(rand_dist(-1.0, 1.0)); 52 | const T phi = rand_dist(0.0, 2.0 * M_PI); 53 | fmm->x[i] = radius * std::sin(theta) * std::cos(phi); 54 | fmm->y[i] = radius * std::sin(theta) * std::sin(phi); 55 | fmm->z[i] = radius * std::cos(theta); 56 | fmm->w[i] = static_cast(1.0) / static_cast(fmm->num_points); 57 | } 58 | } 59 | else { 60 | fprintf(stderr, "error: unknown input distribution type\n"); 61 | exit(1); 62 | } 63 | 64 | std::fill(fmm->ax, fmm->ax + fmm->num_points, 0); 65 | std::fill(fmm->ay, fmm->ay + fmm->num_points, 0); 66 | std::fill(fmm->az, fmm->az + fmm->num_points, 0); 67 | std::fill(fmm->p, fmm->p + fmm->num_points, 0); 68 | 69 | int num_terms = fmm->num_terms; 70 | 71 | fmm->inner_factors = 72 | (complex_t*)malloc(sizeof(complex_t) * num_terms * num_terms); 73 | fmm->outer_factors = 74 | (complex_t*)malloc(sizeof(complex_t) * num_terms * num_terms); 75 | 76 | std::fill(fmm->inner_factors, fmm->inner_factors + (num_terms * num_terms), 77 | complex_t(static_cast(0.0), static_cast(0.0))); 78 | std::fill(fmm->outer_factors, fmm->outer_factors + (num_terms * num_terms), 79 | complex_t(static_cast(0.0), static_cast(0.0))); 80 | 81 | int max = 2 * num_terms - 1; 82 | T factorial[max]; 83 | factorial[0] = 1.0; 84 | for (int i = 1; i < max; ++i) factorial[i] = i * factorial[i - 1]; 85 | 86 | for (int n = 0; n < num_terms; ++n) { 87 | for (int m = -n; m <= n; ++m) { 88 | fmm->inner_factors[mult_idx(n, m)] = 89 | (std::pow(static_cast(-1.0), static_cast(n)) * 90 | imag_pow(std::abs(m))) / 91 | factorial[n + std::abs(m)]; 92 | fmm->outer_factors[mult_idx(n, m)] = 93 | imag_pow(-std::abs(m)) * factorial[n - std::abs(m)]; 94 | } 95 | } 96 | build_tree(fmm); 97 | } 98 | -------------------------------------------------------------------------------- /common/input.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | void print_help() { printf("help\n"); } 13 | 14 | template 15 | void parse_var(const char c, const char* optarg, FMM* fmm) 16 | { 17 | switch (c) { 18 | case 'n': 19 | fmm->num_points = std::atoi(optarg); 20 | break; 21 | case 'c': 22 | fmm->ncrit = std::atoi(optarg); 23 | break; 24 | case 't': 25 | fmm->num_terms = std::atoi(optarg); 26 | break; 27 | case 'e': 28 | fmm->theta = std::atof(optarg); 29 | break; 30 | case 'm': 31 | fmm->num_samples = std::atoi(optarg); 32 | break; 33 | case 'p': 34 | fmm->dist = FMM::Dist::Plummer; 35 | break; 36 | case 'u': 37 | fmm->dist = FMM::Dist::Uniform; 38 | break; 39 | case '?': 40 | fprintf(stderr, "error - %c not recognised or missing value\n", optopt); 41 | break; 42 | } 43 | } 44 | 45 | template 46 | void parse_input_file(const char* ifile, FMM* fmm) 47 | { 48 | std::ifstream ifs(ifile); 49 | if (!ifs.is_open()) { 50 | std::cerr << "error: could not open input file - " << ifile << std::endl; 51 | std::exit(1); 52 | } 53 | 54 | std::vector lines; 55 | 56 | std::string line; 57 | while (std::getline(ifs, line)) { 58 | std::stringstream ss(line); 59 | while (std::getline(ss, line, ' ')) { 60 | lines.push_back(line); 61 | } 62 | } 63 | 64 | int fargc = lines.size() + 1; 65 | char** fargv = (char**)malloc(sizeof(char*) * fargc); 66 | for (size_t i = 0; i < lines.size(); ++i) { 67 | fargv[i + 1] = (char*)malloc(sizeof(char) * lines[i].size()); 68 | strcpy(fargv[i + 1], lines[i].c_str()); 69 | } 70 | parse_args(fargc, fargv, fmm, true); 71 | } 72 | 73 | template 74 | void parse_args(int argc, char** argv, FMM* fmm, bool nested) 75 | { 76 | const static struct option long_params[] = { 77 | {"help", no_argument, NULL, 'h'}, 78 | {"npart", required_argument, NULL, 'n'}, 79 | {"ncrit", required_argument, NULL, 'c'}, 80 | {"nterms", 
required_argument, NULL, 't'}, 81 | {"theta", required_argument, NULL, 'e'}, 82 | {"nsamp", required_argument, NULL, 'm'}, 83 | {"ifile", required_argument, NULL, 'i'}, 84 | {"plummer", no_argument, NULL, 'p'}, 85 | {"uniform", no_argument, NULL, 'u'}, 86 | }; 87 | 88 | int c; 89 | optind = 1; 90 | opterr = 0; 91 | while ((c = getopt_long(argc, argv, "hpun:c:t:e:m:d:i:", long_params, NULL)) != 92 | -1) { 93 | switch (c) { 94 | case 'h': 95 | print_help(); 96 | // TODO check this leads to correct destruction. 97 | std::exit(0); 98 | case 'i': 99 | // stop input file arg being used inside an input file 100 | if (!nested) { 101 | const char* ifile = optarg; 102 | parse_input_file(ifile, fmm); 103 | // TODO fix this 104 | return; 105 | } 106 | break; 107 | default: 108 | parse_var(c, optarg, fmm); 109 | break; 110 | } 111 | } 112 | } 113 | 114 | template 115 | void read_input(int argc, char** argv, FMM* fmm) 116 | { 117 | const char* dist_strings[FMM::Dist::NumDist] = {"Uniform", "Plummer"}; 118 | fmm->num_points = 1000; 119 | fmm->ncrit = 20; 120 | fmm->num_terms = 4; 121 | fmm->theta = 0.5; 122 | fmm->num_samples = 1000; 123 | fmm->dist = FMM::Dist::Plummer; 124 | 125 | parse_args(argc, argv, fmm, false); 126 | 127 | fmm->num_samples = std::min(fmm->num_samples, fmm->num_points); 128 | 129 | std::cout << "FMM args\n" 130 | << "Num Points = " << fmm->num_points << '\n' 131 | << "NCrit = " << fmm->ncrit << '\n' 132 | << "Num Terms = " << fmm->num_terms << '\n' 133 | << "Theta = " << fmm->theta << '\n' 134 | << "Num Samples = " << fmm->num_samples << "\n" 135 | << "Distribution = " << dist_strings[fmm->dist] << "\n\n"; 136 | } 137 | -------------------------------------------------------------------------------- /common/kernels.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | void p2p_tiled(FMM* fmm, node_t* target) 8 | { 9 | const size_t ip = target->point_idx; 10 | 
for (size_t ii = 0; ii < target->num_points; ii += TILE_SIZE) { 11 | T xi[TILE_SIZE] = {0}; 12 | T yi[TILE_SIZE] = {0}; 13 | T zi[TILE_SIZE] = {0}; 14 | T ax[TILE_SIZE] = {0}; 15 | T ay[TILE_SIZE] = {0}; 16 | T az[TILE_SIZE] = {0}; 17 | T aw[TILE_SIZE] = {0}; 18 | const int ilim = std::min((size_t)TILE_SIZE, target->num_points - ii); 19 | for (int i = 0; i < ilim; ++i) { 20 | xi[i] = fmm->x[i + ii + ip]; 21 | yi[i] = fmm->y[i + ii + ip]; 22 | zi[i] = fmm->z[i + ii + ip]; 23 | } 24 | for (size_t j = 0; j < target->num_points; ++j) { 25 | for (int i = 0; i < TILE_SIZE; ++i) { 26 | const T dx = fmm->x[j + ip] - xi[i]; 27 | const T dy = fmm->y[j + ip] - yi[i]; 28 | const T dz = fmm->z[j + ip] - zi[i]; 29 | const T sw = fmm->w[j + ip]; 30 | const T r = dx * dx + dy * dy + dz * dz; 31 | const T inv_r = (r == 0.0) ? 0.0 : 1.0 / std::sqrt(r); 32 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 33 | ax[i] += dx * inv_r_3; 34 | ay[i] += dy * inv_r_3; 35 | az[i] += dz * inv_r_3; 36 | aw[i] += sw * inv_r; 37 | } 38 | } 39 | for (int i = 0; i < ilim; ++i) { 40 | fmm->ax[i + ii + ip] += ax[i]; 41 | fmm->ay[i + ii + ip] += ay[i]; 42 | fmm->az[i + ii + ip] += az[i]; 43 | fmm->p[i + ii + ip] += aw[i]; 44 | } 45 | } 46 | } 47 | 48 | template 49 | void p2p(FMM* fmm, node_t* target) 50 | { 51 | for (size_t i = 0; i < target->num_points; ++i) { 52 | const size_t ip = i + target->point_idx; 53 | const T xi = fmm->x[ip]; 54 | const T yi = fmm->y[ip]; 55 | const T zi = fmm->z[ip]; 56 | T ax = static_cast(0.0); 57 | T ay = static_cast(0.0); 58 | T az = static_cast(0.0); 59 | T p = static_cast(0.0); 60 | for (size_t j = 0; j < target->num_points; ++j) { 61 | const size_t jp = j + target->point_idx; 62 | const T dx = fmm->x[jp] - xi; 63 | const T dy = fmm->y[jp] - yi; 64 | const T dz = fmm->z[jp] - zi; 65 | const T r = dx * dx + dy * dy + dz * dz; 66 | const T inv_r = (r == 0.0) ? 
0.0 : 1.0 / std::sqrt(r); 67 | const T inv_r_3 = inv_r * inv_r * inv_r * fmm->w[jp]; 68 | ax += dx * inv_r_3; 69 | ay += dy * inv_r_3; 70 | az += dz * inv_r_3; 71 | p += fmm->w[jp] * inv_r; 72 | } 73 | fmm->ax[ip] += ax; 74 | fmm->ay[ip] += ay; 75 | fmm->az[ip] += az; 76 | fmm->p[ip] += p; 77 | } 78 | } 79 | 80 | template 81 | void p2p_tiled(FMM* fmm, node_t* target, node_t* source) 82 | { 83 | const size_t jp = source->point_idx; 84 | const size_t ip = target->point_idx; 85 | for (size_t ii = 0; ii < target->num_points; ii += TILE_SIZE) { 86 | T xi[TILE_SIZE] = {0}; 87 | T yi[TILE_SIZE] = {0}; 88 | T zi[TILE_SIZE] = {0}; 89 | T ax[TILE_SIZE] = {0}; 90 | T ay[TILE_SIZE] = {0}; 91 | T az[TILE_SIZE] = {0}; 92 | T aw[TILE_SIZE] = {0}; 93 | const int ilim = std::min((size_t)TILE_SIZE, target->num_points - ii); 94 | for (int i = 0; i < ilim; ++i) { 95 | xi[i] = fmm->x[i + ii + ip]; 96 | yi[i] = fmm->y[i + ii + ip]; 97 | zi[i] = fmm->z[i + ii + ip]; 98 | } 99 | for (size_t j = 0; j < source->num_points; ++j) { 100 | for (int i = 0; i < TILE_SIZE; ++i) { 101 | const T dx = fmm->x[j + jp] - xi[i]; 102 | const T dy = fmm->y[j + jp] - yi[i]; 103 | const T dz = fmm->z[j + jp] - zi[i]; 104 | const T sw = fmm->w[j + jp]; 105 | const T r = dx * dx + dy * dy + dz * dz; 106 | const T inv_r = 1.0 / std::sqrt(r); 107 | const T inv_r_3 = sw * inv_r * inv_r * inv_r; 108 | ax[i] += dx * inv_r_3; 109 | ay[i] += dy * inv_r_3; 110 | az[i] += dz * inv_r_3; 111 | aw[i] += sw * inv_r; 112 | } 113 | } 114 | for (int i = 0; i < ilim; ++i) { 115 | fmm->ax[i + ii + ip] += ax[i]; 116 | fmm->ay[i + ii + ip] += ay[i]; 117 | fmm->az[i + ii + ip] += az[i]; 118 | fmm->p[i + ii + ip] += aw[i]; 119 | } 120 | } 121 | } 122 | 123 | template 124 | void p2p(FMM* fmm, node_t* target, node_t* source) 125 | { 126 | for (size_t i = 0; i < target->num_points; ++i) { 127 | const size_t ip = i + target->point_idx; 128 | const T xi = fmm->x[ip]; 129 | const T yi = fmm->y[ip]; 130 | const T zi = fmm->z[ip]; 131 | T 
ax = static_cast(0.0); 132 | T ay = static_cast(0.0); 133 | T az = static_cast(0.0); 134 | T p = static_cast(0.0); 135 | for (size_t j = 0; j < source->num_points; ++j) { 136 | const size_t jp = j + source->point_idx; 137 | const T dx = fmm->x[jp] - xi; 138 | const T dy = fmm->y[jp] - yi; 139 | const T dz = fmm->z[jp] - zi; 140 | const T r = dx * dx + dy * dy + dz * dz; 141 | const T inv_r = 1.0 / std::sqrt(r); 142 | const T inv_r_3 = fmm->w[jp] * inv_r * inv_r * inv_r; 143 | ax += dx * inv_r_3; 144 | ay += dy * inv_r_3; 145 | az += dz * inv_r_3; 146 | p += fmm->w[jp] * inv_r; 147 | } 148 | fmm->ax[ip] += ax; 149 | fmm->ay[ip] += ay; 150 | fmm->az[ip] += az; 151 | fmm->p[ip] += p; 152 | } 153 | } 154 | 155 | template 156 | void m2l(FMM* fmm, node_t* target, node_t* source) 157 | { 158 | int num_terms = fmm->num_terms; 159 | T dx = target->cx - source->cx; 160 | T dy = target->cy - source->cy; 161 | T dz = target->cz - source->cz; 162 | complex_t outer[num_terms * num_terms]; 163 | T rho, alpha, beta; 164 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 165 | compute_outer<1>(fmm, rho, alpha, beta, outer); 166 | complex_t* Msource = &fmm->m[source->mult_idx]; 167 | complex_t* Ltarget = &fmm->l[target->mult_idx]; 168 | for (int j = 0; j < num_terms; ++j) { 169 | for (int k = -j; k <= j; ++k) { 170 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 171 | for (int n = 0; n < num_terms - j; ++n) { 172 | for (int m = -n; m <= n; ++m) { 173 | tmp += Msource[mult_idx(n, m)] * outer[mult_idx(j + n, -k - m)]; 174 | // blah 175 | } 176 | } 177 | Ltarget[mult_idx(j, k)] += tmp; 178 | } 179 | } 180 | } 181 | 182 | template 183 | void p2m(FMM* fmm, node_t* node) 184 | { 185 | int num_terms = fmm->num_terms; 186 | size_t pt_offset = node->point_idx; 187 | size_t mt_offset = node->mult_idx; 188 | for (size_t i = 0; i < node->num_points; ++i) { 189 | T dx = fmm->x[i + pt_offset] - node->cx; 190 | T dy = fmm->y[i + pt_offset] - node->cy; 191 | T dz = fmm->z[i + pt_offset] - node->cz; 
192 | complex_t inner[num_terms * num_terms]; 193 | T r, theta, phi; 194 | cart_to_sph(dx, dy, dz, r, theta, phi); 195 | compute_inner<1>(fmm, r, theta, phi, inner); 196 | for (int n = 0; n < num_terms; ++n) { 197 | for (int m = -n; m <= n; ++m) { 198 | fmm->m[mt_offset + mult_idx(n, m)] += 199 | fmm->w[i + pt_offset] * 200 | std::pow(static_cast(-1.0), static_cast(n)) * 201 | inner[mult_idx(n, m)]; 202 | } 203 | } 204 | } 205 | } 206 | 207 | template 208 | void m2m(FMM* fmm, node_t* node) 209 | { 210 | int num_terms = fmm->num_terms; 211 | for (size_t i = 0; i < node->num_children; ++i) { 212 | complex_t inner[num_terms * num_terms]; 213 | node_t* child = &fmm->nodes[node->child[i]]; 214 | T dx = node->cx - child->cx; 215 | T dy = node->cy - child->cy; 216 | T dz = node->cz - child->cz; 217 | T r, theta, phi; 218 | cart_to_sph(dx, dy, dz, r, theta, phi); 219 | compute_inner<1>(fmm, r, theta, phi, inner); 220 | const complex_t* Mchild = &fmm->m[child->mult_idx]; 221 | complex_t* Mnode = &fmm->m[node->mult_idx]; 222 | for (int j = 0; j < num_terms; ++j) { 223 | for (int k = -j; k <= j; ++k) { 224 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 225 | for (int n = 0; n <= j; ++n) { 226 | for (int m = -n; m <= n; ++m) { 227 | if (abs(k - m) <= j - n) 228 | tmp += Mchild[mult_idx(n, m)] * inner[mult_idx(j - n, k - m)]; 229 | } 230 | } 231 | Mnode[mult_idx(j, k)] += tmp; 232 | } 233 | } 234 | } 235 | } 236 | 237 | template 238 | void l2l(FMM* fmm, node_t* node) 239 | { 240 | int num_terms = fmm->num_terms; 241 | complex_t inner[num_terms * num_terms]; 242 | for (size_t i = 0; i < node->num_children; ++i) { 243 | node_t* child = &fmm->nodes[node->child[i]]; 244 | // TODO flip these? 
245 | T dx = child->cx - node->cx; 246 | T dy = child->cy - node->cy; 247 | T dz = child->cz - node->cz; 248 | T rho, alpha, beta; 249 | cart_to_sph(dx, dy, dz, rho, alpha, beta); 250 | compute_inner<1>(fmm, rho, alpha, beta, inner); 251 | complex_t* Lnode = &fmm->l[node->mult_idx]; 252 | complex_t* Lchild = &fmm->l[child->mult_idx]; 253 | for (int j = 0; j < num_terms; ++j) { 254 | for (int k = -j; k <= j; ++k) { 255 | complex_t tmp(static_cast(0.0), static_cast(0.0)); 256 | for (int n = j; n < num_terms; ++n) { 257 | for (int m = -n; m <= n; ++m) { 258 | if (std::abs(m - k) <= n - j) { 259 | tmp += Lnode[mult_idx(n, m)] * inner[mult_idx(n - j, m - k)]; 260 | } 261 | } 262 | } 263 | Lchild[mult_idx(j, k)] += tmp; 264 | } 265 | } 266 | } 267 | } 268 | 269 | template 270 | void l2p(FMM* fmm, node_t* node) 271 | { 272 | int num_terms = fmm->num_terms; 273 | complex_t inner[num_terms * num_terms]; 274 | complex_t inner_deriv[num_terms * num_terms]; 275 | size_t pt_offset = node->point_idx; 276 | complex_t* Lnode = &fmm->l[node->mult_idx]; 277 | for (size_t i = 0; i < node->num_points; ++i) { 278 | T dx = fmm->x[pt_offset + i] - node->cx; 279 | T dy = fmm->y[pt_offset + i] - node->cy; 280 | T dz = fmm->z[pt_offset + i] - node->cz; 281 | T r, theta, phi; 282 | cart_to_sph(dx, dy, dz, r, theta, phi); 283 | compute_inner<2>(fmm, r, theta, phi, inner, inner_deriv); 284 | 285 | T Psum = static_cast(0.0); 286 | T rsum = static_cast(0.0); 287 | T tsum = static_cast(0.0); 288 | T psum = static_cast(0.0); 289 | T two = static_cast(2.0); 290 | complex_t ci(static_cast(0.0), static_cast(1.0)); 291 | for (int n = 0; n < num_terms; ++n) { 292 | int m = 0; 293 | Psum += complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 294 | rsum += static_cast(n) * 295 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 296 | tsum += complex_real(Lnode[mult_idx(n, m)] * inner_deriv[mult_idx(n, m)]); 297 | psum += static_cast(m) * 298 | complex_real(Lnode[mult_idx(n, m)] * 
inner[mult_idx(n, m)] * ci); 299 | for (int m = 1; m <= n; ++m) { 300 | Psum += 301 | two * complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 302 | rsum += two * static_cast(n) * 303 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)]); 304 | tsum += two * complex_real(Lnode[mult_idx(n, m)] * 305 | inner_deriv[mult_idx(n, m)]); 306 | psum += 307 | two * static_cast(m) * 308 | complex_real(Lnode[mult_idx(n, m)] * inner[mult_idx(n, m)] * ci); 309 | } 310 | } 311 | T inv_r = (r == static_cast(0.0)) ? 0.0 : static_cast(1.0) / r; 312 | rsum *= inv_r; 313 | tsum *= inv_r; 314 | psum *= inv_r; 315 | psum *= (theta == static_cast(0.0)) 316 | ? 0.0 317 | : static_cast(1.0) / std::sin(theta); 318 | T ax, ay, az; 319 | sph_unit_to_cart_unit(r, theta, phi, rsum, tsum, psum, ax, ay, az); 320 | fmm->p[pt_offset + i] += Psum; 321 | fmm->ax[pt_offset + i] += ax; 322 | fmm->ay[pt_offset + i] += ay; 323 | fmm->az[pt_offset + i] += az; 324 | } 325 | } 326 | 327 | template 328 | void m2p(FMM* fmm, node_t* target, node_t* source) 329 | { 330 | int num_terms = fmm->num_terms; 331 | size_t target_pt_offset = target->point_idx; 332 | size_t source_mt_offset = source->mult_idx; 333 | for (size_t i = 0; i < target->num_points; ++i) { 334 | T dx = fmm->x[target_pt_offset + i] - source->cx; 335 | T dy = fmm->y[target_pt_offset + i] - source->cy; 336 | T dz = fmm->z[target_pt_offset + i] - source->cz; 337 | T r, theta, phi; 338 | cart_to_sph(dx, dy, dz, r, theta, phi); 339 | complex_t outer[num_terms * num_terms]; 340 | complex_t outer_deriv[num_terms * num_terms]; 341 | compute_outer<2>(fmm, r, theta, phi, outer, outer_deriv); 342 | T Psum = static_cast(0.0); 343 | T rsum = static_cast(0.0); 344 | T tsum = static_cast(0.0); 345 | T psum = static_cast(0.0); 346 | T two = static_cast(2.0); 347 | const complex_t* M = &fmm->m[source_mt_offset]; 348 | const complex_t ci(static_cast(0.0), static_cast(1.0)); 349 | for (int n = 0; n < num_terms; ++n) { 350 | int m = 0; 351 | Psum 
+= (outer[mult_idx(n, -m)] * M[mult_idx(n, m)]).real(); 352 | rsum += -static_cast(n + 1) * 353 | complex_real(outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 354 | tsum += complex_real(outer_deriv[mult_idx(n, -m)] * M[mult_idx(n, m)]); 355 | psum += static_cast(m) * 356 | complex_real(ci * outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 357 | for (m = 1; m <= n; ++m) { 358 | Psum += two * complex_real(outer[mult_idx(n, -m)] * 359 | fmm->m[source_mt_offset + mult_idx(n, m)]); 360 | rsum += two * -static_cast(n + 1) * 361 | complex_real(outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 362 | tsum += two * 363 | complex_real(outer_deriv[mult_idx(n, -m)] * M[mult_idx(n, m)]); 364 | psum += two * static_cast(m) * 365 | complex_real(ci * outer[mult_idx(n, -m)] * M[mult_idx(n, m)]); 366 | } 367 | } 368 | rsum *= static_cast(1.0) / r; 369 | tsum *= static_cast(1.0) / r; 370 | psum *= static_cast(1.0) / r; 371 | psum /= std::sin(theta); 372 | T ax, ay, az; 373 | sph_unit_to_cart_unit(r, theta, phi, rsum, tsum, psum, ax, ay, az); 374 | fmm->p[target_pt_offset + i] += Psum; 375 | fmm->ax[target_pt_offset + i] += ax; 376 | fmm->ay[target_pt_offset + i] += ay; 377 | fmm->az[target_pt_offset + i] += az; 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /common/kokkos-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define KOKKOS 4 | 5 | template 6 | KOKKOS_INLINE_FUNCTION void lock(T* val) 7 | { 8 | while (!Kokkos::atomic_compare_exchange(val, 0, 1)) 9 | ; 10 | #ifdef __CUDA_ARCH__ 11 | __threadfence(); 12 | #endif 13 | } 14 | 15 | template 16 | KOKKOS_INLINE_FUNCTION void unlock(T* val) 17 | { 18 | #ifdef __CUDA_ARCH__ 19 | __threadfence(); 20 | #endif 21 | while (Kokkos::atomic_compare_exchange(val, 1, 0)) 22 | ; 23 | } 24 | 25 | #ifdef __CUDACC__ 26 | #define INLINE __device__ __inline__ 27 | 28 | template 29 | class gpu_utils { 30 | public: 31 | static const int 
warp_size = 32; 32 | static const int num_threads = NTHREADS; 33 | 34 | INLINE static int thread_id() { return threadIdx.y; } 35 | INLINE static int worker_id() { return threadIdx.z; } 36 | INLINE static int global_worker_id() 37 | { 38 | return blockIdx.x * blockDim.z + worker_id(); 39 | } 40 | 41 | template ::type* = nullptr> 43 | INLINE static void sync_worker() 44 | { 45 | __syncwarp(); 46 | } 47 | 48 | template 32)>::type* = nullptr> 50 | INLINE static void sync_worker() 51 | { 52 | __syncthreads(); 53 | } 54 | }; 55 | #else 56 | #define INLINE inline 57 | #endif 58 | -------------------------------------------------------------------------------- /common/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | size_t level; 34 | 35 | HOSTDEVICE 36 | bool is_leaf() const { return (num_children == 0); } 37 | }; 38 | -------------------------------------------------------------------------------- /common/spharm.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | HOSTDEVICE void compute_legendre(size_t nmax, T x, T* P, T* P_deriv = nullptr); 8 | 9 | template 10 | void compute_inner(FMM* fmm, T r, T theta, T phi, complex_t* inner, 
11 | complex_t* inner_deriv = nullptr) 12 | { 13 | if (fmm->num_terms == 0) return; 14 | //// TODO this can be reduced as we only calculate the 'positive' legendre 15 | /// vals 16 | T legendre[fmm->num_terms * fmm->num_terms]; 17 | T legendre_deriv[fmm->num_terms * fmm->num_terms]; 18 | // TODO forward order to compute_legendre 19 | if (order == 1) { 20 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre); 21 | } 22 | if (order == 2) { 23 | compute_legendre<2>(fmm->num_terms - 1, std::cos(theta), legendre, 24 | legendre_deriv); 25 | } 26 | 27 | const complex_t i = complex_t(static_cast(0.0), static_cast(1.0)); 28 | 29 | for (int n = 0; n < fmm->num_terms; ++n) { 30 | inner[mult_idx(n, 0)] = fmm->inner_factors[mult_idx(n, 0)] * 31 | legendre[leg_idx(n, 0)] * 32 | complex_exp(i * static_cast(0) * phi) * 33 | std::pow(r, static_cast(n)); 34 | if (order == 2) 35 | inner_deriv[mult_idx(n, 0)] = fmm->inner_factors[mult_idx(n, 0)] * 36 | legendre_deriv[leg_idx(n, 0)] * 37 | complex_exp(i * static_cast(0) * phi) * 38 | std::pow(r, static_cast(n)); 39 | for (int m = 1; m <= n; ++m) { 40 | inner[mult_idx(n, m)] = fmm->inner_factors[mult_idx(n, m)] * 41 | legendre[leg_idx(n, m)] * 42 | complex_exp(i * static_cast(m) * phi) * 43 | std::pow(r, static_cast(n)); 44 | inner[mult_idx(n, -m)] = 45 | std::pow(static_cast(-1.0), static_cast(m)) * 46 | complex_conj(inner[mult_idx(n, m)]); 47 | if (order == 2) { 48 | inner_deriv[mult_idx(n, m)] = fmm->inner_factors[mult_idx(n, m)] * 49 | legendre_deriv[leg_idx(n, m)] * 50 | complex_exp(i * static_cast(m) * phi) * 51 | std::pow(r, static_cast(n)); 52 | inner_deriv[mult_idx(n, -m)] = 53 | std::pow(static_cast(-1.0), static_cast(m)) * 54 | complex_conj(inner_deriv[mult_idx(n, m)]); 55 | } 56 | } 57 | } 58 | } 59 | 60 | template 61 | void compute_outer(FMM* fmm, T r, T theta, T phi, complex_t* outer, 62 | complex_t* outer_deriv = nullptr) 63 | { 64 | if (fmm->num_terms == 0) return; 65 | // TODO this can be reduced as we 
only calculate the 'positive' legendre vals 66 | T legendre[fmm->num_terms * fmm->num_terms]; 67 | T legendre_deriv[fmm->num_terms * fmm->num_terms]; 68 | 69 | if (order == 1) 70 | compute_legendre<1>(fmm->num_terms - 1, std::cos(theta), legendre); 71 | if (order == 2) { 72 | compute_legendre<2>(fmm->num_terms - 1, std::cos(theta), legendre, 73 | legendre_deriv); 74 | } 75 | 76 | const complex_t i = complex_t(static_cast(0.0), static_cast(1.0)); 77 | 78 | for (int n = 0; n < fmm->num_terms; ++n) { 79 | for (int m = 0; m <= n; ++m) { 80 | outer[mult_idx(n, m)] = 81 | fmm->outer_factors[mult_idx(n, m)] * legendre[leg_idx(n, m)] * 82 | complex_exp(i * static_cast(m) * phi) * 83 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 84 | outer[mult_idx(n, -m)] = 85 | std::pow(static_cast(-1.0), static_cast(m)) * 86 | complex_conj(outer[mult_idx(n, m)]); 87 | if (order == 2) { 88 | outer_deriv[mult_idx(n, m)] = 89 | fmm->outer_factors[mult_idx(n, m)] * legendre_deriv[leg_idx(n, m)] * 90 | complex_exp(i * static_cast(m) * phi) * 91 | (static_cast(1.0) / std::pow(r, static_cast(n + 1))); 92 | outer_deriv[mult_idx(n, -m)] = 93 | std::pow(static_cast(-1.0), static_cast(m)) * 94 | complex_conj(outer_deriv[mult_idx(n, m)]); 95 | // TODO negative derivs may need to be calculated 96 | } 97 | } 98 | } 99 | } 100 | 101 | // TODO test with 'if constexpr' with order (C++17 feature) 102 | template 103 | HOSTDEVICE void compute_legendre(size_t nmax, T x, T* P, T* P_deriv) 104 | { 105 | const T csphase = static_cast(-1.0); 106 | const T one = static_cast(1.0); 107 | const T u = (x == 1.0) ? 0.0 : std::sqrt((one - x) * (one + x)); 108 | const T uinv = (u == static_cast(0.0)) ? 
static_cast(0.0) : one / u; 109 | const T xbyu = x * uinv; 110 | size_t n, m; 111 | size_t k, idxmm; 112 | T pnm, pmm, pm1, pm2, twomm1; 113 | pm2 = one; 114 | pm1 = x; 115 | 116 | P[0] = pm2; 117 | if (order >= 2) P_deriv[0] = static_cast(0.0); 118 | if (nmax == 0) return; 119 | P[1] = pm1; 120 | if (order >= 2) P_deriv[1] = -u; 121 | 122 | k = 1; 123 | for (n = 2; n <= nmax; ++n) { 124 | k += n; 125 | pnm = (static_cast(2 * n - 1) * x * pm1 - static_cast(n - 1) * pm2) / 126 | static_cast(n); 127 | P[k] = pnm; 128 | if (order >= 2) P_deriv[k] = -static_cast(n) * (pm1 - x * pnm) * uinv; 129 | pm2 = pm1; 130 | pm1 = pnm; 131 | } 132 | 133 | pmm = one; 134 | twomm1 = -one; 135 | idxmm = 0; 136 | for (m = 1; m <= nmax - 1; ++m) { 137 | idxmm += m + 1; 138 | twomm1 += static_cast(2.0); 139 | pmm *= csphase * u * twomm1; 140 | P[idxmm] = pmm; 141 | if (order >= 2) P_deriv[idxmm] = static_cast(m) * xbyu * pmm; 142 | pm2 = pmm; 143 | k = idxmm + m + 1; 144 | pm1 = x * pmm * static_cast(2 * m + 1); 145 | P[k] = pm1; 146 | if (order >= 2) 147 | P_deriv[k] = -uinv * (static_cast(2 * m + 1) * pmm - 148 | static_cast(m + 1) * x * pm1); 149 | 150 | for (n = m + 2; n <= nmax; ++n) { 151 | k += n; 152 | pnm = (static_cast(2 * n - 1) * x * pm1 - 153 | static_cast(n + m - 1) * pm2) / 154 | static_cast(n - m); 155 | P[k] = pnm; 156 | if (order >= 2) 157 | P_deriv[k] = 158 | -uinv * (static_cast(n + m) * pm1 - static_cast(n) * x * pnm); 159 | pm2 = pm1; 160 | pm1 = pnm; 161 | } 162 | } 163 | 164 | idxmm += m + 1; 165 | twomm1 += static_cast(2.0); 166 | pmm *= csphase * u * twomm1; 167 | P[idxmm] = pmm; 168 | if (order >= 2) P_deriv[idxmm] = static_cast(nmax) * x * pmm * uinv; 169 | } 170 | -------------------------------------------------------------------------------- /common/timer.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | class Timer { 6 | private: 7 | double wtime() 8 | { 9 | struct timeval t; 10 
| gettimeofday(&t, NULL); 11 | return t.tv_sec + t.tv_usec * 1.0E-6; 12 | } 13 | 14 | public: 15 | inline void start() { this->tick = wtime(); } 16 | inline void stop() 17 | { 18 | this->tock = wtime(); 19 | this->elaps = this->tock - this->tick; 20 | } 21 | double elapsed() { return this->elaps; } 22 | 23 | private: 24 | double tick, tock, elaps; 25 | }; 26 | -------------------------------------------------------------------------------- /common/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | template 9 | void upwards_pass(FMM* fmm, node_t* node) 10 | { 11 | for (size_t i = 0; i < node->num_children; ++i) { 12 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 13 | } 14 | 15 | if (node->is_leaf()) 16 | p2m(fmm, node); 17 | else 18 | m2m(fmm, node); 19 | } 20 | 21 | template 22 | void downwards_pass(FMM* fmm, node_t* node) 23 | { 24 | if (node->is_leaf()) 25 | l2p(fmm, node); 26 | else { 27 | l2l(fmm, node); 28 | for (size_t i = 0; i < node->num_children; ++i) { 29 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 30 | } 31 | } 32 | } 33 | 34 | template 35 | void dual_tree(FMM* fmm, node_t* target, node_t* source) 36 | { 37 | T dx = source->cx - target->cx; 38 | T dy = source->cy - target->cy; 39 | T dz = source->cz - target->cz; 40 | T r2 = dx * dx + dy * dy + dz * dz; 41 | T d1 = source->rad * static_cast(2.0); 42 | T d2 = target->rad * static_cast(2.0); 43 | 44 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 45 | m2l(fmm, target, source); 46 | } 47 | else if (source->is_leaf() && target->is_leaf()) { 48 | if (target == source) 49 | p2p_tiled(fmm, target); 50 | else 51 | p2p_tiled(fmm, target, source); 52 | } 53 | else { 54 | T target_sz = target->rad; 55 | T source_sz = source->rad; 56 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 57 | for (size_t i = 0; i < target->num_children; ++i) { 58 | node_t* child = 
&fmm->nodes[target->child[i]]; 59 | dual_tree(fmm, child, source); 60 | } 61 | } 62 | else { 63 | for (size_t i = 0; i < source->num_children; ++i) { 64 | dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 65 | } 66 | } 67 | } 68 | } 69 | 70 | template 71 | void perform_traversals(FMM* fmm) 72 | { 73 | printf("Running in serial\n"); 74 | 75 | Timer timer; 76 | Timer tot_timer; 77 | 78 | timer.start(); 79 | tot_timer.start(); 80 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 81 | timer.stop(); 82 | printf("\n"); 83 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 84 | 85 | timer.start(); 86 | dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 87 | timer.stop(); 88 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 89 | 90 | timer.start(); 91 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 92 | timer.stop(); 93 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 94 | 95 | tot_timer.stop(); 96 | printf("--------------------\n"); 97 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 98 | printf("--------------------\n\n"); 99 | } 100 | -------------------------------------------------------------------------------- /common/tree.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | void get_bound_box(FMM* fmm, size_t start, size_t end, 12 | std::array, 3>& lims) 13 | { 14 | lims[0] = std::minmax_element(&fmm->x[start], &fmm->x[end]); 15 | lims[1] = std::minmax_element(&fmm->y[start], &fmm->y[end]); 16 | lims[2] = std::minmax_element(&fmm->z[start], &fmm->z[end]); 17 | } 18 | 19 | // stable in-place mergesort 20 | template 21 | void reorder(T* x, T* y, T* z, T* w, std::vector indices, size_t start, 22 | size_t end) 23 | { 24 | // for (size_t i = start; i < end; ++i) { 25 | // const T tx = x[i]; 26 | // const T ty = y[i]; 27 | // const T tz = z[i]; 
28 | // size_t j = i; 29 | // while (true) { 30 | // size_t k = indices[j]; 31 | // indices[j] = j; 32 | // if (k == i) break; 33 | // if (k >= end) { 34 | // printf("problem k = %zu\n", k); 35 | // exit(1); 36 | // } 37 | // x[j] = x[k]; 38 | // y[j] = y[k]; 39 | // z[j] = z[k]; 40 | // j = k; 41 | // } 42 | // x[j] = tx; 43 | // y[j] = ty; 44 | // z[j] = tz; 45 | //} 46 | std::vector temp(end - start); 47 | for (size_t i = start; i < end; ++i) temp[i - start] = x[indices[i - start]]; 48 | for (size_t i = start; i < end; ++i) x[i] = temp[i - start]; 49 | 50 | for (size_t i = start; i < end; ++i) temp[i - start] = y[indices[i - start]]; 51 | for (size_t i = start; i < end; ++i) y[i] = temp[i - start]; 52 | 53 | for (size_t i = start; i < end; ++i) temp[i - start] = z[indices[i - start]]; 54 | for (size_t i = start; i < end; ++i) z[i] = temp[i - start]; 55 | 56 | for (size_t i = start; i < end; ++i) temp[i - start] = w[indices[i - start]]; 57 | for (size_t i = start; i < end; ++i) w[i] = temp[i - start]; 58 | } 59 | 60 | template 61 | size_t construct_tree(FMM* fmm, std::vector>& nodes, size_t start, 62 | size_t end, int depth, T cx, T cy, T cz, T rad) 63 | { 64 | const size_t node_idx = nodes.size(); 65 | nodes.push_back( 66 | node_t(cx, cy, cz, rad, end - start, start, 0, node_idx, depth)); 67 | 68 | // const size_t num_points = end - start + 1; 69 | if (end - start <= fmm->ncrit) { 70 | } 71 | else { 72 | std::vector indices(end - start); 73 | std::vector octants(end - start); 74 | 75 | size_t num_oct[8] = {0}; 76 | size_t oct_beg[8] = {0}; 77 | 78 | for (size_t i = start; i < end; ++i) { 79 | const size_t oct = 80 | ((fmm->x[i] > cx) << 2) | ((fmm->y[i] > cy) << 1) | (fmm->z[i] > cz); 81 | octants[i - start] = oct; 82 | num_oct[oct]++; 83 | } 84 | 85 | std::partial_sum(num_oct, num_oct + 7, oct_beg + 1); 86 | 87 | std::iota(indices.begin(), indices.end(), 0); 88 | std::sort(indices.begin(), indices.end(), 89 | [&octants](const size_t i, const size_t j) { 90 | 
return (octants[i] < octants[j]); 91 | }); 92 | std::for_each(indices.begin(), indices.end(), 93 | [start](size_t& i) { i += start; }); 94 | 95 | reorder(fmm->x, fmm->y, fmm->z, fmm->w, indices, start, end); 96 | 97 | size_t child[8] = {0}; 98 | size_t num_children = 0; 99 | 100 | for (size_t i = 0; i < 8; ++i) { 101 | T nrad = rad / 2.0; 102 | T ncx = ((i >> 2) & 1) ? (cx + nrad) : (cx - nrad); 103 | T ncy = ((i >> 1) & 1) ? (cy + nrad) : (cy - nrad); 104 | T ncz = ((i >> 0) & 1) ? (cz + nrad) : (cz - nrad); 105 | 106 | if (num_oct[i]) { 107 | // offset oct ptrs by start of the current points array 108 | child[num_children] = construct_tree(fmm, nodes, start + oct_beg[i], 109 | start + oct_beg[i] + num_oct[i], 110 | depth + 1, ncx, ncy, ncz, nrad); 111 | num_children++; 112 | } 113 | } 114 | nodes[node_idx].num_children = num_children; 115 | for (size_t i = 0; i < num_children; ++i) { 116 | nodes[node_idx].child[i] = child[i]; 117 | } 118 | } 119 | return node_idx; 120 | } 121 | 122 | template 123 | void build_tree(FMM* fmm) 124 | { 125 | Timer timer; 126 | timer.start(); 127 | 128 | std::array, 3> lims; 129 | get_bound_box(fmm, 0, fmm->num_points, lims); 130 | 131 | T cx = (*lims[0].second + *lims[0].first) / static_cast(2.0); 132 | T cy = (*lims[1].second + *lims[1].first) / static_cast(2.0); 133 | T cz = (*lims[2].second + *lims[2].first) / static_cast(2.0); 134 | 135 | std::array radii; 136 | for (int i = 0; i < 3; ++i) { 137 | radii[i] = (*lims[i].second - *lims[i].first) / static_cast(2.0); 138 | } 139 | 140 | T rad = *std::max_element(radii.begin(), radii.end()); 141 | // make sure no points lie on the edge of the node 142 | rad += std::numeric_limits::epsilon(); 143 | 144 | std::vector> nodes; 145 | fmm->root = 146 | construct_tree(fmm, nodes, 0, fmm->num_points, 0, cx, cy, cz, rad); 147 | 148 | fmm->num_nodes = nodes.size(); 149 | fmm->nodes = (node_t*)malloc(sizeof(node_t) * fmm->num_nodes); 150 | for (size_t n = 0; n < fmm->num_nodes; ++n) { 151 | 
fmm->nodes[n] = nodes[n]; 152 | } 153 | timer.stop(); 154 | // printf("built tree in %fs\n", timer.elapsed()); 155 | 156 | printf("num_nodes = %zu\n", fmm->num_nodes); 157 | 158 | printf("root %zu has %zu children\n", fmm->root, 159 | fmm->nodes[fmm->root].num_children); 160 | 161 | // Now we know the number of nodes we can allocate the multipole storage and 162 | // assign to each node 163 | fmm->m = (complex_t*)calloc(fmm->num_multipoles * fmm->num_nodes, 164 | sizeof(complex_t)); 165 | fmm->l = (complex_t*)calloc(fmm->num_multipoles * fmm->num_nodes, 166 | sizeof(complex_t)); 167 | 168 | size_t max_depth = 0; 169 | for (size_t n = 0; n < fmm->num_nodes; ++n) { 170 | fmm->nodes[n].mult_idx = n * fmm->num_multipoles; 171 | max_depth = std::max(max_depth, fmm->nodes[n].level); 172 | } 173 | printf("max tree depth = %zu\n", max_depth); 174 | } 175 | -------------------------------------------------------------------------------- /common/utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #if defined(__x86_64__) 7 | #if defined (__AVX512F__) 8 | #ifdef FMM_DOUBLE 9 | #define TILE_SIZE 8 10 | #else 11 | #define TILE_SIZE 64 12 | #endif 13 | #elif defined (__AVX2__) 14 | #define TILE_SIZE 16 15 | #else 16 | #define TILE_SIZE 8 17 | #endif 18 | #elif defined (__aarch64__) 19 | #define TILE_SIZE 4 20 | #else 21 | #warning architecture not supported 22 | #define TILE_SIZE 32 23 | #endif 24 | 25 | #ifndef M_PI 26 | #define M_PI 3.14159265358979323846 27 | #endif 28 | 29 | //#define XSTR(x) STR(x) 30 | //#define STR(x) #x 31 | //#pragma message "tile size = " XSTR(TILE_SIZE) 32 | 33 | HOSTDEVICE inline int mult_idx(const int n, const int m) 34 | { 35 | return n * n + n + m; 36 | } 37 | 38 | HOSTDEVICE inline int leg_idx(const int n, const int m) 39 | { 40 | return (n * (n + 1)) / 2 + std::abs(m); 41 | } 42 | 43 | HOSTDEVICE inline void inv_mult_idx(const int i, int& n, int& m) 44 | 
{ 45 | n = (int)sqrtf((float)i); 46 | m = i - n * n - n; 47 | } 48 | 49 | template 50 | HOSTDEVICE void sph_unit_to_cart_unit(T r, T theta, T phi, T rsum, T tsum, 51 | T psum, T& ax, T& ay, T& az) 52 | { 53 | ax = std::sin(theta) * std::cos(phi) * rsum + 54 | std::cos(theta) * std::cos(phi) * tsum - std::sin(phi) * psum; 55 | ay = std::sin(theta) * std::sin(phi) * rsum + 56 | std::cos(theta) * std::sin(phi) * tsum + std::cos(phi) * psum; 57 | az = std::cos(theta) * rsum - std::sin(theta) * tsum; 58 | } 59 | 60 | template 61 | HOSTDEVICE T get_eps(); 62 | template <> 63 | HOSTDEVICE float get_eps() { return 1e-6; } 64 | template <> 65 | HOSTDEVICE double get_eps() { return 1e-14; } 66 | 67 | template 68 | HOSTDEVICE inline void cart_to_sph(T x, T y, T z, T& r, T& theta, T& phi) 69 | { 70 | const T eps = get_eps(); 71 | r = std::sqrt(x * x + y * y + z * z) + eps; 72 | theta = std::acos(z / r); 73 | phi = std::atan2(y, x); 74 | } 75 | 76 | -------------------------------------------------------------------------------- /common/verify.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | T calc_error(T* p, T* test_p, size_t n) 5 | { 6 | T diff = static_cast(0.0); 7 | T norm = static_cast(0.0); 8 | for (size_t i = 0; i < n; ++i) { 9 | diff += (p[i] - test_p[i]) * (p[i] - test_p[i]); 10 | norm += test_p[i] * test_p[i]; 11 | } 12 | return std::sqrt(diff / norm); 13 | } 14 | 15 | template 16 | T calc_error(T* x, T* y, T* z, T* test_x, T* test_y, T* test_z, size_t n) 17 | { 18 | T diff = static_cast(0.0); 19 | T norm = static_cast(0.0); 20 | for (size_t i = 0; i < n; ++i) { 21 | T dx = (x[i] - test_x[i]) * (x[i] - test_x[i]); 22 | T dy = (y[i] - test_y[i]) * (y[i] - test_y[i]); 23 | T dz = (z[i] - test_z[i]) * (z[i] - test_z[i]); 24 | diff += dx + dy + dz; 25 | norm += (test_x[i] * test_x[i]) + (test_y[i] * test_y[i]) + 26 | (test_z[i] * test_z[i]); 27 | } 28 | return std::sqrt(diff/norm); 29 | } 30 | 31 | 
template 32 | void verify(FMM* fmm) 33 | { 34 | std::vector test_ax(fmm->num_samples); 35 | std::vector test_ay(fmm->num_samples); 36 | std::vector test_az(fmm->num_samples); 37 | std::vector test_p(fmm->num_samples); 38 | 39 | #pragma omp parallel for 40 | for (size_t i = 0; i < fmm->num_samples; ++i) { 41 | for (size_t j = 0; j < fmm->num_points; ++j) { 42 | if (i == j) continue; 43 | const T dx = fmm->x[j] - fmm->x[i]; 44 | const T dy = fmm->y[j] - fmm->y[i]; 45 | const T dz = fmm->z[j] - fmm->z[i]; 46 | const T r = std::sqrt(dx * dx + dy * dy + dz * dz); 47 | const T inv_r = static_cast(1.0) / r; 48 | const T inv_r_3 = inv_r * inv_r * inv_r; 49 | const T s = inv_r_3 * fmm->w[j]; 50 | test_ax[i] += s * dx; 51 | test_ay[i] += s * dy; 52 | test_az[i] += s * dz; 53 | test_p[i] += inv_r * fmm->w[j]; 54 | } 55 | } 56 | //for (size_t i = 0; i < fmm->num_samples; ++i) { 57 | // printf("%f vs %f\n", fmm->p[i], test_p[i]); 58 | //} 59 | printf("pot err = %.12e\n", calc_error(fmm->p, &test_p[0], fmm->num_samples)); 60 | printf("acc err = %.12e\n", 61 | calc_error(fmm->ax, fmm->ay, fmm->az, &test_ax[0], &test_ay[0], 62 | &test_az[0], fmm->num_samples)); 63 | } 64 | -------------------------------------------------------------------------------- /cuda/cuda-utils.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define KOKKOS 4 | 5 | template 6 | KOKKOS_INLINE_FUNCTION void lock(T* val) 7 | { 8 | while (0 != Kokkos::atomic_compare_exchange(val, 0, 1)) 9 | ; 10 | #ifdef __CUDA_ARCH__ 11 | __threadfence(); 12 | #endif 13 | } 14 | 15 | template 16 | KOKKOS_INLINE_FUNCTION void unlock(T* val) 17 | { 18 | #ifdef __CUDA_ARCH__ 19 | __threadfence(); 20 | #endif 21 | while (1 != Kokkos::atomic_compare_exchange(val, 1, 0)) 22 | ; 23 | } 24 | 25 | #ifdef __CUDACC__ 26 | #define INLINE __device__ __inline__ 27 | 28 | template 29 | class gpu_utils { 30 | public: 31 | static const int warp_size = 32; 32 | static const int 
num_threads = NTHREADS; 33 | 34 | INLINE static int thread_id() { return threadIdx.y; } 35 | INLINE static int worker_id() { return threadIdx.z; } 36 | INLINE static int global_worker_id() 37 | { 38 | return blockIdx.x * blockDim.z + worker_id(); 39 | } 40 | 41 | template ::type* = nullptr> 43 | INLINE static void sync_worker() 44 | { 45 | __syncwarp(); 46 | } 47 | 48 | template 32)>::type* = nullptr> 50 | INLINE static void sync_worker() 51 | { 52 | __syncthreads(); 53 | } 54 | }; 55 | #else 56 | #define INLINE inline 57 | #endif 58 | -------------------------------------------------------------------------------- /cuda/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_NVCC=nvcc 2 | CC=$(CC_$(COMPILER)) 3 | 4 | ifneq ($(COMPILER), NVCC) 5 | $(error Only NVCC support for this version of MiniFMM) 6 | endif 7 | 8 | ARCH=sm_60 9 | 10 | CFLAGS_NVCC=-std=c++11 -O3 -ftz=true --use_fast_math -x cu -Xcompiler -fopenmp -arch=$(ARCH) 11 | CFLAGS=$(CFLAGS_$(COMPILER)) 12 | 13 | LIBS=-Xcompiler -fopenmp 14 | 15 | -------------------------------------------------------------------------------- /cuda/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 
34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /cuda/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #define INLINE __device__ __inline__ 10 | 11 | // for compatability with generic gpu kernels 12 | namespace gpu_utils { 13 | INLINE static int thread_id() { return threadIdx.x; } 14 | INLINE static int worker_id() { return 0; } 15 | INLINE static void sync_worker() { __syncthreads(); } 16 | } // namespace gpu_utils 17 | 18 | #include 19 | 20 | template 21 | void upwards_pass(FMM* fmm, node_t* node) 22 | { 23 | for (size_t i = 0; i < node->num_children; ++i) { 24 | #pragma omp task 25 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 26 | } 27 | #pragma omp taskwait 28 | 29 | if (node->is_leaf()) 30 | p2m(fmm, node); 31 | else 32 | m2m(fmm, node); 33 | } 34 | 35 | template 36 | void downwards_pass(FMM* fmm, node_t* node) 37 | { 38 | if (node->is_leaf()) 39 | l2p(fmm, node); 40 | else { 41 | l2l(fmm, node); 42 | for (size_t i = 0; i < node->num_children; ++i) { 43 | #pragma omp task 44 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 45 | } 46 | } 47 | #pragma omp taskwait 48 | } 49 | 50 | template 51 | __global__ void p2p_kernel(FMM* d_fmm, size_t* d_p2p_nodes, 52 | size_t* d_p2p_deps_array, size_t* d_p2p_deps_offsets, 53 | size_t* d_p2p_deps_sizes) 54 | { 55 | const int i = blockIdx.x; 56 | node_t* target = d_fmm->nodes + d_p2p_nodes[i]; 57 | size_t num_deps = d_p2p_deps_sizes[i]; 58 | size_t offset = d_p2p_deps_offsets[i]; 59 | for (size_t j = 0; j < num_deps; ++j) { 60 | size_t source_idx = d_p2p_deps_array[offset + j]; 61 | node_t* source = d_fmm->nodes + source_idx; 62 | if (target == source) 63 | p2p_gpu<128, 128, 1, 128, 
1>(d_fmm, target); 64 | else 65 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target, source); 66 | } 67 | } 68 | 69 | template 70 | __global__ void m2l_kernel(FMM* d_fmm, size_t* d_m2l_nodes, 71 | size_t* d_m2l_deps_array, size_t* d_m2l_deps_offsets, 72 | size_t* d_m2l_deps_sizes) 73 | { 74 | const int i = blockIdx.x; 75 | node_t* target = d_fmm->nodes + d_m2l_nodes[i]; 76 | size_t num_deps = d_m2l_deps_sizes[i]; 77 | size_t offset = d_m2l_deps_offsets[i]; 78 | for (size_t j = 0; j < num_deps; ++j) { 79 | size_t source_idx = d_m2l_deps_array[offset + j]; 80 | node_t* source = d_fmm->nodes + source_idx; 81 | m2l_gpu<32, 1>(d_fmm, target, source); 82 | } 83 | } 84 | 85 | template 86 | void cuda_dtt(FMM* fmm, FMM* h_fmm, FMM* d_fmm) 87 | { 88 | Timer timer; 89 | 90 | timer.start(); 91 | 92 | std::vector> p2p_deps(fmm->num_nodes); 93 | std::vector> m2l_deps(fmm->num_nodes); 94 | 95 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 96 | timer.stop(); 97 | printf(" %-16s %12.8f\n", "Deps. Time (s) ", timer.elapsed()); 98 | 99 | timer.start(); 100 | 101 | size_t* h_p2p_nodes; 102 | size_t* h_p2p_deps_array; 103 | size_t* h_p2p_deps_offsets; 104 | size_t* h_p2p_deps_sizes; 105 | size_t* d_p2p_nodes; 106 | size_t* d_p2p_deps_array; 107 | size_t* d_p2p_deps_offsets; 108 | size_t* d_p2p_deps_sizes; 109 | size_t p2p_deps_tot; 110 | size_t p2p_num_nodes; 111 | 112 | size_t* h_m2l_nodes; 113 | size_t* h_m2l_deps_array; 114 | size_t* h_m2l_deps_offsets; 115 | size_t* h_m2l_deps_sizes; 116 | size_t* d_m2l_nodes; 117 | size_t* d_m2l_deps_array; 118 | size_t* d_m2l_deps_offsets; 119 | size_t* d_m2l_deps_sizes; 120 | size_t m2l_deps_tot; 121 | size_t m2l_num_nodes; 122 | 123 | pack_deps(p2p_deps, &h_p2p_nodes, &h_p2p_deps_array, &h_p2p_deps_offsets, 124 | &h_p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 125 | pack_deps(m2l_deps, &h_m2l_nodes, &h_m2l_deps_array, &h_m2l_deps_offsets, 126 | &h_m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 127 | timer.stop(); 128 | printf("%-20s %12.8f\n", " Pack 
Time (s) ", timer.elapsed()); 129 | 130 | timer.start(); 131 | alloc_and_copy(&d_p2p_nodes, h_p2p_nodes, p2p_num_nodes); 132 | alloc_and_copy(&d_p2p_deps_array, h_p2p_deps_array, p2p_deps_tot); 133 | alloc_and_copy(&d_p2p_deps_offsets, h_p2p_deps_offsets, p2p_num_nodes); 134 | alloc_and_copy(&d_p2p_deps_sizes, h_p2p_deps_sizes, p2p_num_nodes); 135 | alloc_and_copy(&d_m2l_nodes, h_m2l_nodes, m2l_num_nodes); 136 | alloc_and_copy(&d_m2l_deps_array, h_m2l_deps_array, m2l_deps_tot); 137 | alloc_and_copy(&d_m2l_deps_offsets, h_m2l_deps_offsets, m2l_num_nodes); 138 | alloc_and_copy(&d_m2l_deps_sizes, h_m2l_deps_sizes, m2l_num_nodes); 139 | timer.stop(); 140 | printf("%-20s %12.8f\n", " Transfer Time (s) ", timer.elapsed()); 141 | 142 | timer.start(); 143 | p2p_kernel<<>>(d_fmm, d_p2p_nodes, d_p2p_deps_array, 144 | d_p2p_deps_offsets, d_p2p_deps_sizes); 145 | CUDACHK(cudaGetLastError()); 146 | CUDACHK(cudaDeviceSynchronize()); 147 | timer.stop(); 148 | printf(" %-16s %12.8f\n", "P2P Time (s) ", timer.elapsed()); 149 | 150 | timer.start(); 151 | m2l_kernel<<>>(d_fmm, d_m2l_nodes, d_m2l_deps_array, 152 | d_m2l_deps_offsets, d_m2l_deps_sizes); 153 | CUDACHK(cudaGetLastError()); 154 | CUDACHK(cudaDeviceSynchronize()); 155 | timer.stop(); 156 | printf(" %-16s %12.8f\n", "M2L Time (s) ", timer.elapsed()); 157 | 158 | free(h_p2p_nodes); 159 | free(h_p2p_deps_array); 160 | free(h_p2p_deps_offsets); 161 | free(h_p2p_deps_sizes); 162 | free(h_m2l_nodes); 163 | free(h_m2l_deps_array); 164 | free(h_m2l_deps_offsets); 165 | free(h_m2l_deps_sizes); 166 | 167 | device_free(d_p2p_nodes); 168 | device_free(d_p2p_deps_array); 169 | device_free(d_p2p_deps_offsets); 170 | device_free(d_p2p_deps_sizes); 171 | device_free(d_m2l_nodes); 172 | device_free(d_m2l_deps_array); 173 | device_free(d_m2l_deps_offsets); 174 | device_free(d_m2l_deps_sizes); 175 | } 176 | 177 | template 178 | void perform_traversals(FMM* fmm) 179 | { 180 | #pragma omp parallel 181 | #pragma omp single 182 | printf("Running 
on %d threads\n", omp_get_num_threads()); 183 | 184 | Timer timer; 185 | Timer tot_timer; 186 | 187 | timer.start(); 188 | tot_timer.start(); 189 | #pragma omp parallel 190 | #pragma omp single 191 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 192 | timer.stop(); 193 | printf("\n"); 194 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 195 | 196 | FMM* h_fmm; 197 | FMM* d_fmm; 198 | 199 | init_device_fmm(fmm, &h_fmm, &d_fmm); 200 | 201 | timer.start(); 202 | cuda_dtt(fmm, h_fmm, d_fmm); 203 | timer.stop(); 204 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 205 | 206 | fini_device_fmm(fmm, h_fmm, d_fmm); 207 | 208 | timer.start(); 209 | #pragma omp parallel 210 | #pragma omp single 211 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 212 | timer.stop(); 213 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 214 | 215 | tot_timer.stop(); 216 | printf("--------------------\n"); 217 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 218 | printf("--------------------\n\n"); 219 | } 220 | -------------------------------------------------------------------------------- /gpusched/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_NVCC=nvcc 2 | CC=$(CC_$(COMPILER)) 3 | 4 | ifneq ($(COMPILER), NVCC) 5 | $(error Only NVCC support for this version of MiniFMM) 6 | endif 7 | 8 | ARCH=sm_60 9 | 10 | CFLAGS_NVCC=-DNWORKERS=1 -std=c++11 -O3 -ftz=true --use_fast_math -x cu -Xcompiler -fopenmp -arch=$(ARCH) 11 | CFLAGS=$(CFLAGS_$(COMPILER)) 12 | 13 | LIBS=-Xcompiler -fopenmp 14 | 15 | -------------------------------------------------------------------------------- /gpusched/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | 12 | template 13 | void upwards_pass(FMM* fmm, node_t* node) 14 | { 15 | for (size_t i = 0; i < 
node->num_children; ++i) { 16 | #pragma omp task 17 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 18 | } 19 | #pragma omp taskwait 20 | 21 | if (node->is_leaf()) 22 | p2m(fmm, node); 23 | else 24 | m2m(fmm, node); 25 | } 26 | 27 | template 28 | __device__ void upwards_pass_gpu_task(worker_t* worker, task_t* task) 29 | { 30 | FMM* fmm = (FMM*)get_private(task, 0); 31 | node_t* node = (node_t*)get_private(task, 1); 32 | 33 | for (size_t i = 0; i < node->num_children; ++i) { 34 | void* args[2] = {fmm, fmm->nodes + node->child[i]}; 35 | generate_task(worker, upwards_pass_gpu_task, 2, args); 36 | } 37 | taskwait(worker); 38 | 39 | if (node->is_leaf()) 40 | p2m_gpu<32, 1>(fmm, node); 41 | else 42 | m2m_gpu<32, 1>(fmm, node); 43 | } 44 | 45 | template 46 | void upwards_pass_gpu(team_t* h_team, team_t* d_team, FMM*h_fmm, FMM* d_fmm) 47 | { 48 | const int nargs = 2; 49 | void* args[nargs] = {d_fmm, h_fmm->nodes + h_fmm->root}; 50 | 51 | fork_team>(h_team, d_team, nargs, args); 52 | } 53 | 54 | template 55 | void downwards_pass(FMM* fmm, node_t* node) 56 | { 57 | if (node->is_leaf()) 58 | l2p(fmm, node); 59 | else { 60 | l2l(fmm, node); 61 | for (size_t i = 0; i < node->num_children; ++i) { 62 | #pragma omp task 63 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 64 | } 65 | } 66 | #pragma omp taskwait 67 | } 68 | 69 | template 70 | __device__ void dtt_task(worker_t* worker, task_t* task) 71 | { 72 | FMM* fmm = (FMM*)get_private(task, 0); 73 | node_t* target = (node_t*)get_private(task, 1); 74 | node_t* source = (node_t*)get_private(task, 2); 75 | 76 | T dx = source->cx - target->cx; 77 | T dy = source->cy - target->cy; 78 | T dz = source->cz - target->cz; 79 | T r2 = dx * dx + dy * dy + dz * dz; 80 | T d1 = source->rad * static_cast(2.0); 81 | T d2 = target->rad * static_cast(2.0); 82 | 83 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 84 | m2l_gpu(fmm, target, source); 85 | } 86 | else if (source->is_leaf() && target->is_leaf()) { 87 | if (target == source) 88 | 
p2p_gpu<32, 16, 4, NTHREADS, NWORKERS>(fmm, target); 89 | else 90 | p2p_gpu<32, 16, 4, NTHREADS, NWORKERS>(fmm, target, source); 91 | } 92 | else { 93 | T target_sz = target->rad; 94 | T source_sz = source->rad; 95 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 96 | for (size_t i = 0; i < target->num_children; ++i) { 97 | node_t* child = fmm->nodes + target->child[i]; 98 | if (target->num_points > TASK_CUTOFF) { 99 | void* args[3] = {fmm, child, source}; 100 | generate_task(worker, dtt_task, 3, args); 101 | } 102 | else { 103 | ((void**)task->storage)[1] = child; 104 | dtt_task(worker, task); 105 | ((void**)task->storage)[1] = target; 106 | } 107 | } 108 | } 109 | else { 110 | for (size_t i = 0; i < source->num_children; ++i) { 111 | node_t* child = fmm->nodes + source->child[i]; 112 | // void* args[3] = {fmm, target, child}; 113 | // generate_task_cond(worker, dtt_task, 3, args, 114 | // source->num_points > TASK_CUTOFF); 115 | if (source->num_points > TASK_CUTOFF) { 116 | void* args[3] = {fmm, target, child}; 117 | generate_task(worker, dtt_task, 3, args); 118 | } 119 | else { 120 | ((void**)task->storage)[2] = child; 121 | dtt_task(worker, task); 122 | ((void**)task->storage)[2] = source; 123 | } 124 | } 125 | } 126 | } 127 | } 128 | 129 | template 130 | void perform_traversals(FMM* fmm) 131 | { 132 | #pragma omp parallel 133 | #pragma omp single 134 | printf("Running on %d threads\n", omp_get_num_threads()); 135 | 136 | Timer timer; 137 | Timer tot_timer; 138 | 139 | // timer.start(); 140 | // tot_timer.start(); 141 | //#pragma omp parallel 142 | //#pragma omp single 143 | // upwards_pass(fmm, &fmm->nodes[fmm->root]); 144 | // timer.stop(); 145 | // printf("\n"); 146 | // printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 147 | 148 | //#pragma omp parallel 149 | //#pragma omp single 150 | // dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 151 | 152 | FMM* h_fmm; 153 | FMM* d_fmm; 154 | 
init_device_fmm(fmm, &h_fmm, &d_fmm); 155 | 156 | const char* num_blocks_str = getenv("GPUSCHED_NUM_BLOCKS"); 157 | const int num_blocks = 158 | (num_blocks_str == NULL) ? 56 * 5 : atoi(num_blocks_str); 159 | 160 | team_t* h_team; 161 | team_t* d_team; 162 | create_team(num_blocks, 1024 * 512, &h_team, &d_team); 163 | 164 | timer.start(); 165 | tot_timer.start(); 166 | upwards_pass_gpu(h_team, d_team, h_fmm, d_fmm); 167 | timer.stop(); 168 | printf("\n"); 169 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 170 | 171 | const int nargs = 3; 172 | node_t* d_root_node = h_fmm->nodes + h_fmm->root; 173 | void* args[nargs] = {d_fmm, d_root_node, d_root_node}; 174 | 175 | timer.start(); 176 | fork_team>(h_team, d_team, nargs, args); 177 | timer.stop(); 178 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 179 | 180 | fini_device_fmm(fmm, h_fmm, d_fmm); 181 | 182 | timer.start(); 183 | #pragma omp parallel 184 | #pragma omp single 185 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 186 | timer.stop(); 187 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 188 | 189 | tot_timer.stop(); 190 | printf("--------------------\n"); 191 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 192 | printf("--------------------\n\n"); 193 | } 194 | -------------------------------------------------------------------------------- /inputs/plummer.in: -------------------------------------------------------------------------------- 1 | -n 10000000 2 | -m 1000 3 | -e 0.66 4 | -t 3 5 | -c 256 6 | --plummer 7 | -------------------------------------------------------------------------------- /inputs/small.in: -------------------------------------------------------------------------------- 1 | -n 100000 2 | -m 1000 3 | -e 0.5 4 | -t 4 5 | -c 512 6 | -------------------------------------------------------------------------------- /inputs/uniform.in: -------------------------------------------------------------------------------- 1 | -n 
10000000 2 | -m 1000 3 | -e 0.66 4 | -t 3 5 | -c 256 6 | --uniform 7 | -------------------------------------------------------------------------------- /kokkos-for/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-for/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | template 8 | struct node_t { 9 | node_t() = default; 10 | ~node_t() = default; 11 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 12 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 13 | size_t arg_level) 14 | : cx{arg_cx}, 15 | cy{arg_cy}, 16 | cz{arg_cz}, 17 | rad{arg_rad}, 18 | num_points{arg_num_points}, 19 | point_idx{arg_point_idx}, 20 | mult_idx{arg_mult_idx}, 21 | node_idx{arg_node_idx}, 22 | level{arg_level} 23 | { 24 | omp_init_lock(&p2p_lock); 25 | omp_init_lock(&m2l_lock); 26 | } 27 | T cx; 28 | T cy; 29 | T cz; 30 | T rad; 31 | size_t 
num_children = 0; 32 | size_t child[8] = {0}; 33 | size_t num_points; 34 | size_t point_idx; 35 | size_t mult_idx; 36 | size_t node_idx; 37 | size_t level; 38 | 39 | omp_lock_t p2p_lock; 40 | omp_lock_t m2l_lock; 41 | 42 | HOSTDEVICE 43 | bool is_leaf() const { return (num_children == 0); } 44 | }; 45 | -------------------------------------------------------------------------------- /kokkos-for/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef __CUDACC__ 12 | // TODO fix this (kokkos utils needs to be included before gpu kernels) 13 | #include 14 | 15 | #include 16 | #endif 17 | 18 | #ifndef KOKKOS_SCHEDULE 19 | #define KOKKOS_SCHEDULE Dynamic 20 | #endif 21 | 22 | template 23 | void upwards_pass(FMM* fmm, node_t* node) 24 | { 25 | for (size_t i = 0; i < node->num_children; ++i) { 26 | #pragma omp task 27 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 28 | } 29 | 30 | #pragma omp taskwait 31 | 32 | if (node->is_leaf()) 33 | p2m(fmm, node); 34 | else 35 | m2m(fmm, node); 36 | } 37 | 38 | template 39 | void downwards_pass(FMM* fmm, node_t* node) 40 | { 41 | if (node->is_leaf()) 42 | l2p(fmm, node); 43 | else { 44 | l2l(fmm, node); 45 | for (size_t i = 0; i < node->num_children; ++i) { 46 | #pragma omp task 47 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 48 | } 49 | } 50 | #pragma omp taskwait 51 | } 52 | 53 | template 54 | void kokkos_dtt(FMM* fmm) 55 | { 56 | FMM* d_fmm; 57 | FMM* h_fmm; 58 | 59 | init_device_fmm(fmm, &h_fmm, &d_fmm); 60 | 61 | std::vector> p2p_deps(fmm->num_nodes); 62 | std::vector> m2l_deps(fmm->num_nodes); 63 | 64 | Timer timer; 65 | timer.start(); 66 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 67 | timer.stop(); 68 | printf("%-20s %12.8f\n", " Deps Time (s) ", timer.elapsed()); 69 | 70 | timer.start(); 71 | size_t* p2p_nodes; 72 | size_t* p2p_deps_array; 73 | size_t* 
p2p_deps_offsets; 74 | size_t* p2p_deps_sizes; 75 | size_t p2p_deps_tot; 76 | size_t p2p_num_nodes; 77 | 78 | size_t* m2l_nodes; 79 | size_t* m2l_deps_array; 80 | size_t* m2l_deps_offsets; 81 | size_t* m2l_deps_sizes; 82 | size_t m2l_deps_tot; 83 | size_t m2l_num_nodes; 84 | 85 | pack_deps(p2p_deps, &p2p_nodes, &p2p_deps_array, &p2p_deps_offsets, 86 | &p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 87 | pack_deps(m2l_deps, &m2l_nodes, &m2l_deps_array, &m2l_deps_offsets, 88 | &m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 89 | timer.stop(); 90 | printf("%-20s %12.8f\n", " Pack Time (s) ", timer.elapsed()); 91 | 92 | timer.start(); 93 | Kokkos::View> 95 | h_p2p_nodes(p2p_nodes, p2p_num_nodes); 96 | Kokkos::View> 98 | h_m2l_nodes(m2l_nodes, m2l_num_nodes); 99 | 100 | Kokkos::View> 102 | h_p2p_deps_array(p2p_deps_array, p2p_deps_tot); 103 | Kokkos::View> 105 | h_m2l_deps_array(m2l_deps_array, m2l_deps_tot); 106 | 107 | Kokkos::View> 109 | h_p2p_deps_offsets(p2p_deps_offsets, p2p_num_nodes); 110 | Kokkos::View> 112 | h_m2l_deps_offsets(m2l_deps_offsets, m2l_num_nodes); 113 | 114 | Kokkos::View> 116 | h_p2p_deps_sizes(p2p_deps_sizes, p2p_num_nodes); 117 | Kokkos::View> 119 | h_m2l_deps_sizes(m2l_deps_sizes, m2l_num_nodes); 120 | 121 | Kokkos::View d_p2p_nodes("d_p2p_nodes", p2p_num_nodes); 122 | Kokkos::View d_m2l_nodes("d_m2l_nodes", m2l_num_nodes); 123 | Kokkos::View d_p2p_deps_array("d_p2p_deps_array", p2p_deps_tot); 124 | Kokkos::View d_m2l_deps_array("d_m2l_deps_array", m2l_deps_tot); 125 | Kokkos::View d_p2p_deps_offsets("d_p2p_deps_offsets", p2p_num_nodes); 126 | Kokkos::View d_m2l_deps_offsets("d_m2l_deps_offsets", m2l_num_nodes); 127 | Kokkos::View d_p2p_deps_sizes("d_p2p_deps_sizes", p2p_num_nodes); 128 | Kokkos::View d_m2l_deps_sizes("d_m2l_deps_sizes", m2l_num_nodes); 129 | 130 | Kokkos::deep_copy(d_p2p_nodes, h_p2p_nodes); 131 | Kokkos::deep_copy(d_m2l_nodes, h_m2l_nodes); 132 | Kokkos::deep_copy(d_p2p_deps_array, h_p2p_deps_array); 133 | 
Kokkos::deep_copy(d_m2l_deps_array, h_m2l_deps_array); 134 | Kokkos::deep_copy(d_p2p_deps_offsets, h_p2p_deps_offsets); 135 | Kokkos::deep_copy(d_m2l_deps_offsets, h_m2l_deps_offsets); 136 | Kokkos::deep_copy(d_p2p_deps_sizes, h_p2p_deps_sizes); 137 | Kokkos::deep_copy(d_m2l_deps_sizes, h_m2l_deps_sizes); 138 | Kokkos::fence(); 139 | timer.stop(); 140 | printf("%-20s %12.8f\n", " Transfer Time (s) ", timer.elapsed()); 141 | 142 | using policy_type = 143 | Kokkos::TeamPolicy>; 145 | using member_type = policy_type::member_type; 146 | 147 | #ifdef __CUDACC__ 148 | policy_type p2p_policy = policy_type(p2p_num_nodes, 128); 149 | policy_type m2l_policy = policy_type(m2l_num_nodes, 32); 150 | #else 151 | policy_type p2p_policy = policy_type(p2p_num_nodes, 1); 152 | policy_type m2l_policy = policy_type(m2l_num_nodes, 1); 153 | #endif 154 | 155 | timer.start(); 156 | Kokkos::parallel_for( 157 | p2p_policy, KOKKOS_LAMBDA(member_type member) { 158 | const int i = member.league_rank(); 159 | node_t* target = d_fmm->nodes + d_p2p_nodes[i]; 160 | size_t p2p_size = d_p2p_deps_sizes[i]; 161 | size_t p2p_offset = d_p2p_deps_offsets[i]; 162 | for (size_t j = 0; j < p2p_size; ++j) { 163 | size_t source_idx = d_p2p_deps_array[p2p_offset + j]; 164 | node_t* source = d_fmm->nodes + source_idx; 165 | if (target == source) { 166 | // p2p_tiled(d_fmm, target); 167 | #ifdef __CUDA_ARCH__ 168 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target); 169 | #else 170 | p2p_tiled(d_fmm, target); 171 | #endif 172 | } 173 | else { 174 | // p2p_tiled(d_fmm, target, source); 175 | #ifdef __CUDA_ARCH__ 176 | p2p_gpu<128, 128, 1, 128, 1>(d_fmm, target, source); 177 | #else 178 | p2p_tiled(d_fmm, target, source); 179 | #endif 180 | } 181 | } 182 | }); 183 | Kokkos::fence(); 184 | timer.stop(); 185 | printf("%-20s %12.8f\n", " P2P Time (s) ", timer.elapsed()); 186 | 187 | timer.start(); 188 | Kokkos::parallel_for( 189 | m2l_policy, KOKKOS_LAMBDA(member_type member) { 190 | const int i = member.league_rank(); 191 
| node_t* target = d_fmm->nodes + d_m2l_nodes[i]; 192 | size_t m2l_size = d_m2l_deps_sizes[i]; 193 | size_t m2l_offset = d_m2l_deps_offsets[i]; 194 | for (size_t j = 0; j < m2l_size; ++j) { 195 | size_t source_idx = d_m2l_deps_array[m2l_offset + j]; 196 | node_t* source = d_fmm->nodes + source_idx; 197 | #ifdef __CUDA_ARCH__ 198 | m2l_gpu<32, 1>(d_fmm, target, source); 199 | #else 200 | m2l(d_fmm, target, source); 201 | #endif 202 | } 203 | }); 204 | Kokkos::fence(); 205 | timer.stop(); 206 | printf("%-20s %12.8f\n", " M2L Time (s) ", timer.elapsed()); 207 | 208 | free(p2p_nodes); 209 | free(p2p_deps_array); 210 | free(p2p_deps_offsets); 211 | free(p2p_deps_sizes); 212 | 213 | free(m2l_nodes); 214 | free(m2l_deps_array); 215 | free(m2l_deps_offsets); 216 | free(m2l_deps_sizes); 217 | 218 | fini_device_fmm(fmm, h_fmm, d_fmm); 219 | } 220 | 221 | template 222 | void perform_traversals(FMM* fmm) 223 | { 224 | Kokkos::initialize(); 225 | 226 | printf("Running in serial\n"); 227 | 228 | Timer timer; 229 | Timer tot_timer; 230 | 231 | timer.start(); 232 | tot_timer.start(); 233 | #pragma omp parallel 234 | #pragma omp single 235 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 236 | timer.stop(); 237 | printf("\n"); 238 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 239 | 240 | timer.start(); 241 | kokkos_dtt(fmm); 242 | // dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 243 | timer.stop(); 244 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 245 | 246 | timer.start(); 247 | #pragma omp parallel 248 | #pragma omp single 249 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 250 | timer.stop(); 251 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 252 | 253 | tot_timer.stop(); 254 | printf("--------------------\n"); 255 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 256 | printf("--------------------\n\n"); 257 | 258 | Kokkos::finalize(); 259 | } 260 | 
-------------------------------------------------------------------------------- /kokkos-task-locks/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-task-locks/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | 
size_t level; 34 | 35 | int p2p_lock = 0; 36 | int m2l_lock = 0; 37 | 38 | HOSTDEVICE 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /kokkos-task-locks/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include 13 | 14 | #ifndef KOKKOS_SCHEDULER 15 | #define KOKKOS_SCHEDULER TaskSchedulerMultiple 16 | #endif 17 | 18 | template 19 | void upwards_pass(FMM* fmm, node_t* node) 20 | { 21 | for (size_t i = 0; i < node->num_children; ++i) { 22 | #pragma omp task 23 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 24 | } 25 | 26 | #pragma omp taskwait 27 | 28 | if (node->is_leaf()) 29 | p2m(fmm, node); 30 | else 31 | m2m(fmm, node); 32 | } 33 | 34 | template 35 | void downwards_pass(FMM* fmm, node_t* node) 36 | { 37 | if (node->is_leaf()) 38 | l2p(fmm, node); 39 | else { 40 | l2l(fmm, node); 41 | for (size_t i = 0; i < node->num_children; ++i) { 42 | #pragma omp task 43 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 44 | } 45 | } 46 | #pragma omp taskwait 47 | } 48 | 49 | // template 50 | // void dual_tree(FMM* fmm, node_t* target, node_t* source) 51 | //{ 52 | // T dx = source->cx - target->cx; 53 | // T dy = source->cy - target->cy; 54 | // T dz = source->cz - target->cz; 55 | // T r2 = dx * dx + dy * dy + dz * dz; 56 | // T d1 = source->rad * static_cast(2.0); 57 | // T d2 = target->rad * static_cast(2.0); 58 | // 59 | // if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 60 | // m2l(fmm, target, source); 61 | // } 62 | // else if (source->is_leaf() && target->is_leaf()) { 63 | // if (target == source) 64 | // p2p_tiled(fmm, target); 65 | // else 66 | // p2p_tiled(fmm, target, source); 67 | // } 68 | // else { 69 | // T target_sz = target->rad; 70 | // T source_sz = source->rad; 71 | // if 
(source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) 72 | // { 73 | // for (size_t i = 0; i < target->num_children; ++i) { 74 | // node_t* child = &fmm->nodes[target->child[i]]; 75 | // dual_tree(fmm, child, source); 76 | // } 77 | // } 78 | // else { 79 | // for (size_t i = 0; i < source->num_children; ++i) { 80 | // dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 81 | // } 82 | // } 83 | // } 84 | //} 85 | 86 | namespace Kokkos { 87 | class Cuda; 88 | class OpenMP; 89 | } // namespace Kokkos 90 | 91 | template 92 | struct dual_tree_task { 93 | using value_type = void; 94 | using future_type = Kokkos::BasicFuture; 95 | 96 | FMM* fmm; 97 | node_t* target; 98 | node_t* source; 99 | 100 | KOKKOS_INLINE_FUNCTION 101 | dual_tree_task(FMM* arg_fmm, node_t* arg_target, node_t* arg_source) 102 | : fmm{arg_fmm}, target{arg_target}, source{arg_source} 103 | { 104 | } 105 | 106 | template 107 | KOKKOS_INLINE_FUNCTION typename std::enable_if< 108 | std::is_same::value>::type 109 | operator()(typename Scheduler::member_type& member) 110 | { 111 | T dx = source->cx - target->cx; 112 | T dy = source->cy - target->cy; 113 | T dz = source->cz - target->cz; 114 | T r2 = dx * dx + dy * dy + dz * dz; 115 | T d1 = source->rad * static_cast(2.0); 116 | T d2 = target->rad * static_cast(2.0); 117 | 118 | // TODO for some reason the compiler still tries to compile this function 119 | #ifdef __CUDACC__ 120 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 121 | if (member.team_rank() == 0) lock(&target->m2l_lock); 122 | member.team_barrier(); 123 | m2l_gpu<32, 4>(fmm, target, source); 124 | if (member.team_rank() == 0) unlock(&target->m2l_lock); 125 | member.team_barrier(); 126 | } 127 | else if (source->is_leaf() && target->is_leaf()) { 128 | if (member.team_rank() == 0) lock(&target->p2p_lock); 129 | member.team_barrier(); 130 | if (target == source) 131 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target); 132 | else 133 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target, source); 
134 | if (member.team_rank() == 0) unlock(&target->p2p_lock); 135 | member.team_barrier(); 136 | } 137 | else { 138 | T target_sz = target->rad; 139 | T source_sz = source->rad; 140 | if (source->is_leaf() || 141 | ((target_sz >= source_sz) && !target->is_leaf())) { 142 | for (size_t i = 0; i < target->num_children; ++i) { 143 | node_t* child = &fmm->nodes[target->child[i]]; 144 | if (target->num_points > TASK_CUTOFF) { 145 | if (member.team_rank() == 0) 146 | Kokkos::BasicFuture f = 147 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 148 | dual_tree_task(fmm, child, source)); 149 | } 150 | else { 151 | dual_tree_task(fmm, child, source)(member); 152 | } 153 | } 154 | } 155 | else { 156 | for (size_t i = 0; i < source->num_children; ++i) { 157 | node_t* child = &fmm->nodes[source->child[i]]; 158 | if (source->num_points > TASK_CUTOFF) { 159 | if (member.team_rank() == 0) 160 | Kokkos::BasicFuture f = 161 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 162 | dual_tree_task(fmm, target, child)); 163 | } 164 | else { 165 | dual_tree_task(fmm, target, child)(member); 166 | } 167 | } 168 | } 169 | } 170 | #endif 171 | } 172 | 173 | template 174 | KOKKOS_INLINE_FUNCTION 175 | typename std::enable_if::value>::type 177 | operator()(typename Scheduler::member_type& member) 178 | { 179 | T dx = source->cx - target->cx; 180 | T dy = source->cy - target->cy; 181 | T dz = source->cz - target->cz; 182 | T r2 = dx * dx + dy * dy + dz * dz; 183 | T d1 = source->rad * static_cast(2.0); 184 | T d2 = target->rad * static_cast(2.0); 185 | 186 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 187 | lock(&target->m2l_lock); 188 | m2l(fmm, target, source); 189 | unlock(&target->m2l_lock); 190 | } 191 | else if (source->is_leaf() && target->is_leaf()) { 192 | lock(&target->p2p_lock); 193 | if (target == source) { 194 | p2p_tiled(fmm, target); 195 | } 196 | else { 197 | p2p_tiled(fmm, target, source); 198 | } 199 | unlock(&target->p2p_lock); 200 | } 201 | else { 202 | 
T target_sz = target->rad; 203 | T source_sz = source->rad; 204 | if (source->is_leaf() || 205 | ((target_sz >= source_sz) && !target->is_leaf())) { 206 | for (size_t i = 0; i < target->num_children; ++i) { 207 | node_t* child = &fmm->nodes[target->child[i]]; 208 | if (target->num_points > TASK_CUTOFF) { 209 | Kokkos::BasicFuture f = 210 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 211 | dual_tree_task(fmm, child, source)); 212 | } 213 | else { 214 | dual_tree_task(fmm, child, source)(member); 215 | } 216 | } 217 | } 218 | else { 219 | for (size_t i = 0; i < source->num_children; ++i) { 220 | node_t* child = &fmm->nodes[source->child[i]]; 221 | dual_tree_task(fmm, target, child)(member); 222 | } 223 | } 224 | } 225 | } 226 | }; 227 | 228 | template 229 | typename std::enable_if::value>::type 231 | kokkos_dtt(FMM* fmm) 232 | { 233 | printf("openmp\n"); 234 | const size_t min_block_size = 64; 235 | const size_t max_block_size = 1024; 236 | const size_t super_block_size = 4096; 237 | const size_t memory_capacity = 1024 * 1024 * 1024; 238 | 239 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 240 | min_block_size, std::min(max_block_size, memory_capacity), 241 | std::min(super_block_size, memory_capacity)); 242 | node_t* root_node = fmm->nodes + fmm->root; 243 | Kokkos::BasicFuture f = Kokkos::host_spawn( 244 | Kokkos::TaskSingle(sched), 245 | dual_tree_task(fmm, root_node, root_node)); 246 | Kokkos::wait(sched); 247 | } 248 | 249 | template 250 | typename std::enable_if::value>::type 252 | kokkos_dtt(FMM* fmm) 253 | { 254 | printf("cuda\n"); 255 | const size_t min_block_size = 128; 256 | const size_t max_block_size = 1024; 257 | const size_t super_block_size = 4096; 258 | const size_t memory_capacity = 1024 * 1024 * 1024; 259 | 260 | #ifdef __CUDACC__ 261 | const int stack_size = 8192; 262 | CUDACHK(cudaDeviceSetLimit(cudaLimitStackSize, stack_size)); 263 | #endif 264 | 265 | Scheduler sched(typename Scheduler::memory_space(), 
memory_capacity, 266 | min_block_size, std::min(max_block_size, memory_capacity), 267 | std::min(super_block_size, memory_capacity)); 268 | 269 | FMM* d_fmm; 270 | FMM* h_fmm; 271 | 272 | init_device_fmm(fmm, &h_fmm, &d_fmm); 273 | 274 | node_t* root_node = h_fmm->nodes + h_fmm->root; 275 | 276 | Kokkos::BasicFuture f = Kokkos::host_spawn( 277 | Kokkos::TaskTeam(sched), 278 | dual_tree_task(d_fmm, root_node, root_node)); 279 | Kokkos::wait(sched); 280 | 281 | fini_device_fmm(fmm, h_fmm, d_fmm); 282 | } 283 | 284 | template 285 | void perform_traversals(FMM* fmm) 286 | { 287 | using Scheduler = 288 | Kokkos::KOKKOS_SCHEDULER; 289 | 290 | Kokkos::initialize(); 291 | 292 | Timer timer; 293 | Timer tot_timer; 294 | 295 | timer.start(); 296 | tot_timer.start(); 297 | #pragma omp parallel 298 | #pragma omp single 299 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 300 | timer.stop(); 301 | printf("\n"); 302 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 303 | 304 | timer.start(); 305 | kokkos_dtt(fmm); 306 | timer.stop(); 307 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 308 | 309 | timer.start(); 310 | #pragma omp parallel 311 | #pragma omp single 312 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 313 | timer.stop(); 314 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 315 | 316 | tot_timer.stop(); 317 | printf("--------------------\n"); 318 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 319 | printf("--------------------\n\n"); 320 | 321 | Kokkos::finalize(); 322 | } 323 | 324 | 325 | -------------------------------------------------------------------------------- /kokkos-task/flags.makefile: -------------------------------------------------------------------------------- 1 | include $(KOKKOS_PATH)/Makefile.kokkos 2 | 3 | CC_GNU=g++ 4 | CC_INTEL=icpc 5 | CC_CLANG=clang++ 6 | CC_ARM=armclang++ 7 | CC_CRAY=CC 8 | CC_NVCC=nvcc_wrapper 9 | CC=$(CC_$(COMPILER)) 10 | 11 | UNAME=$(shell uname -m) 12 | 
ifeq ($(UNAME), aarch64) 13 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 14 | ifeq ($(COMPILER), GNU) 15 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 16 | endif 17 | endif 18 | ifeq ($(UNAME), x86_64) 19 | ARCH_CFLAGS = -march=$(ARCH) 20 | endif 21 | 22 | CFLAGS_CLANG=-Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_GNU=-Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 24 | CFLAGS_INTEL=-Ofast -x$(ARCH_CFLAGS) -qopenmp 25 | CFLAGS_ARM=-Ofast $(ARCH_CFLAGS) -fopenmp 26 | CFLAGS_CRAY=-fopenmp 27 | CFLAGS_NVCC=-O3 -ftz=true --use_fast_math 28 | CFLAGS=$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CFLAGS_$(COMPILER)) 29 | 30 | LIBS=$(KOKKOS_LDFLAGS) $(KOKKOS_LIBS) 31 | 32 | -------------------------------------------------------------------------------- /kokkos-task/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | } 23 | T cx; 24 | T cy; 25 | T cz; 26 | T rad; 27 | size_t num_children = 0; 28 | size_t child[8] = {0}; 29 | size_t num_points; 30 | size_t point_idx; 31 | size_t mult_idx; 32 | size_t node_idx; 33 | size_t level; 34 | 35 | int p2p_lock = 0; 36 | int m2l_lock = 0; 37 | 38 | HOSTDEVICE 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /kokkos-task/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | 10 | #include 11 | 12 | #include 13 | 14 | #ifndef KOKKOS_SCHEDULER 15 | #define KOKKOS_SCHEDULER TaskSchedulerMultiple 16 | #endif 17 | 18 | //template 19 | //void upwards_pass(FMM* fmm, node_t* node) 20 | //{ 21 | // for (size_t i = 0; i < node->num_children; ++i) { 22 | //#pragma omp task 23 | // upwards_pass(fmm, &fmm->nodes[node->child[i]]); 24 | // } 25 | // 26 | //#pragma omp taskwait 27 | // 28 | // if (node->is_leaf()) 29 | // p2m(fmm, node); 30 | // else 31 | // m2m(fmm, node); 32 | //} 33 | 34 | template 35 | struct upwards_task { 36 | using value_type = void; 37 | using future_type = Kokkos::BasicFuture; 38 | 39 | FMM* fmm; 40 | node_t* node; 41 | future_type respawn_future; 42 | 43 | KOKKOS_INLINE_FUNCTION 44 | upwards_task(FMM* arg_fmm, node_t* arg_node) 45 | : fmm{arg_fmm}, node{arg_node}, respawn_future() 46 | { 47 | } 48 | 49 | KOKKOS_INLINE_FUNCTION 50 | void operator()(typename Scheduler::member_type& member) 51 | { 52 | if (node->is_leaf()) { 53 | #ifndef __CUDA_ARCH__ 54 | p2m(fmm, node); 55 | #else 56 | //p2m_gpu<32, 4>(fmm, node); 57 | #endif 58 | } 59 | else if (respawn_future.is_null()) { 60 | future_type futures[8]; 61 | for (size_t i = 0; i < node->num_children; ++i) { 62 | node_t* child = fmm->nodes + node->child[i]; 63 | futures[i] = 64 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 65 | upwards_task(fmm, child)); 66 | } 67 | for (size_t i = node->num_children; i < 8; ++i) futures[i] = future_type(); 68 | respawn_future = member.scheduler().when_all(futures, node->num_children); 69 | Kokkos::respawn(this, respawn_future); 70 | } 71 | else { 72 | #ifndef __CUDA_ARCH__ 73 | m2m(fmm, node); 74 | #else 75 | //m2m_gpu<32, 4>(fmm, node); 76 | #endif 77 | } 78 | } 79 | }; 80 | 81 | template 82 | void kokkos_upwards(FMM* fmm) 83 | { 84 | #ifdef __CUDACC__ 85 | FMM* h_fmm; 86 | FMM* d_fmm; 87 | init_device_fmm(fmm, &h_fmm, &d_fmm); 88 | node_t* root_node = h_fmm->nodes + h_fmm->root; 89 | FMM* device_fmm = d_fmm; 90 | 
#else 91 | FMM* device_fmm = fmm; 92 | node_t* root_node = fmm->nodes + fmm->root; 93 | #endif 94 | 95 | const size_t min_block_size = 32; 96 | const size_t max_block_size = 128; 97 | const size_t super_block_size = 10000; 98 | const size_t memory_capacity = 1024 * 1024 * 1024; 99 | 100 | using Scheduler = Kokkos::TaskScheduler; 101 | 102 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 103 | min_block_size, std::min(max_block_size, memory_capacity), 104 | std::min(super_block_size, memory_capacity)); 105 | 106 | Kokkos::BasicFuture f = Kokkos::host_spawn( 107 | Kokkos::TaskSingle(sched), upwards_task(device_fmm, root_node)); 108 | 109 | Kokkos::wait(sched); 110 | 111 | #ifdef __CUDACC__ 112 | fini_device_fmm(fmm, h_fmm, d_fmm); 113 | #endif 114 | } 115 | 116 | template 117 | void downwards_pass(FMM* fmm, node_t* node) 118 | { 119 | if (node->is_leaf()) 120 | l2p(fmm, node); 121 | else { 122 | l2l(fmm, node); 123 | for (size_t i = 0; i < node->num_children; ++i) { 124 | #pragma omp task 125 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 126 | } 127 | } 128 | #pragma omp taskwait 129 | } 130 | 131 | // template 132 | // void dual_tree(FMM* fmm, node_t* target, node_t* source) 133 | //{ 134 | // T dx = source->cx - target->cx; 135 | // T dy = source->cy - target->cy; 136 | // T dz = source->cz - target->cz; 137 | // T r2 = dx * dx + dy * dy + dz * dz; 138 | // T d1 = source->rad * static_cast(2.0); 139 | // T d2 = target->rad * static_cast(2.0); 140 | // 141 | // if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 142 | // m2l(fmm, target, source); 143 | // } 144 | // else if (source->is_leaf() && target->is_leaf()) { 145 | // if (target == source) 146 | // p2p_tiled(fmm, target); 147 | // else 148 | // p2p_tiled(fmm, target, source); 149 | // } 150 | // else { 151 | // T target_sz = target->rad; 152 | // T source_sz = source->rad; 153 | // if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) 154 | // { 155 | // for 
(size_t i = 0; i < target->num_children; ++i) { 156 | // node_t* child = &fmm->nodes[target->child[i]]; 157 | // dual_tree(fmm, child, source); 158 | // } 159 | // } 160 | // else { 161 | // for (size_t i = 0; i < source->num_children; ++i) { 162 | // dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 163 | // } 164 | // } 165 | // } 166 | //} 167 | 168 | namespace Kokkos { 169 | class Cuda; 170 | class OpenMP; 171 | } // namespace Kokkos 172 | 173 | template 174 | struct dual_tree_task { 175 | using value_type = void; 176 | using future_type = Kokkos::BasicFuture; 177 | 178 | FMM* fmm; 179 | node_t* target; 180 | node_t* source; 181 | 182 | KOKKOS_INLINE_FUNCTION 183 | dual_tree_task(FMM* arg_fmm, node_t* arg_target, node_t* arg_source) 184 | : fmm{arg_fmm}, target{arg_target}, source{arg_source} 185 | { 186 | } 187 | 188 | template 189 | KOKKOS_INLINE_FUNCTION typename std::enable_if< 190 | std::is_same::value>::type 191 | operator()(typename Scheduler::member_type& member) 192 | { 193 | T dx = source->cx - target->cx; 194 | T dy = source->cy - target->cy; 195 | T dz = source->cz - target->cz; 196 | T r2 = dx * dx + dy * dy + dz * dz; 197 | T d1 = source->rad * static_cast(2.0); 198 | T d2 = target->rad * static_cast(2.0); 199 | 200 | // TODO for some reason the compiler still tries to compile this function 201 | #ifdef __CUDACC__ 202 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 203 | m2l_gpu<32, 4>(fmm, target, source); 204 | } 205 | else if (source->is_leaf() && target->is_leaf()) { 206 | if (target == source) 207 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target); 208 | else 209 | p2p_gpu<32, 16, 4, 32, 4>(fmm, target, source); 210 | } 211 | else { 212 | T target_sz = target->rad; 213 | T source_sz = source->rad; 214 | if (source->is_leaf() || 215 | ((target_sz >= source_sz) && !target->is_leaf())) { 216 | for (size_t i = 0; i < target->num_children; ++i) { 217 | node_t* child = &fmm->nodes[target->child[i]]; 218 | if (target->num_points > TASK_CUTOFF) { 219 | 
if (member.team_rank() == 0) 220 | Kokkos::BasicFuture f = 221 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 222 | dual_tree_task(fmm, child, source)); 223 | } 224 | else { 225 | dual_tree_task(fmm, child, source)(member); 226 | } 227 | } 228 | } 229 | else { 230 | for (size_t i = 0; i < source->num_children; ++i) { 231 | node_t* child = &fmm->nodes[source->child[i]]; 232 | if (source->num_points > TASK_CUTOFF) { 233 | if (member.team_rank() == 0) 234 | Kokkos::BasicFuture f = 235 | Kokkos::task_spawn(Kokkos::TaskTeam(member.scheduler()), 236 | dual_tree_task(fmm, target, child)); 237 | } 238 | else { 239 | dual_tree_task(fmm, target, child)(member); 240 | } 241 | } 242 | } 243 | } 244 | #endif 245 | } 246 | 247 | template 248 | KOKKOS_INLINE_FUNCTION 249 | typename std::enable_if::value>::type 251 | operator()(typename Scheduler::member_type& member) 252 | { 253 | T dx = source->cx - target->cx; 254 | T dy = source->cy - target->cy; 255 | T dz = source->cz - target->cz; 256 | T r2 = dx * dx + dy * dy + dz * dz; 257 | T d1 = source->rad * static_cast(2.0); 258 | T d2 = target->rad * static_cast(2.0); 259 | 260 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 261 | lock(&target->m2l_lock); 262 | m2l(fmm, target, source); 263 | unlock(&target->m2l_lock); 264 | } 265 | else if (source->is_leaf() && target->is_leaf()) { 266 | lock(&target->p2p_lock); 267 | if (target == source) { 268 | p2p_tiled(fmm, target); 269 | } 270 | else { 271 | p2p_tiled(fmm, target, source); 272 | } 273 | unlock(&target->p2p_lock); 274 | } 275 | else { 276 | T target_sz = target->rad; 277 | T source_sz = source->rad; 278 | if (source->is_leaf() || 279 | ((target_sz >= source_sz) && !target->is_leaf())) { 280 | for (size_t i = 0; i < target->num_children; ++i) { 281 | node_t* child = &fmm->nodes[target->child[i]]; 282 | if (target->num_points > TASK_CUTOFF) { 283 | Kokkos::BasicFuture f = 284 | Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), 285 | dual_tree_task(fmm, 
child, source)); 286 | } 287 | else { 288 | dual_tree_task(fmm, child, source)(member); 289 | } 290 | } 291 | } 292 | else { 293 | for (size_t i = 0; i < source->num_children; ++i) { 294 | node_t* child = &fmm->nodes[source->child[i]]; 295 | dual_tree_task(fmm, target, child)(member); 296 | } 297 | } 298 | } 299 | } 300 | }; 301 | 302 | template 303 | typename std::enable_if::value>::type 305 | kokkos_dtt(FMM* fmm) 306 | { 307 | printf("openmp\n"); 308 | const size_t min_block_size = 64; 309 | const size_t max_block_size = 1024; 310 | const size_t super_block_size = 4096; 311 | const size_t memory_capacity = 1024 * 1024 * 1024; 312 | 313 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 314 | min_block_size, std::min(max_block_size, memory_capacity), 315 | std::min(super_block_size, memory_capacity)); 316 | node_t* root_node = fmm->nodes + fmm->root; 317 | Kokkos::BasicFuture f = Kokkos::host_spawn( 318 | Kokkos::TaskSingle(sched), 319 | dual_tree_task(fmm, root_node, root_node)); 320 | Kokkos::wait(sched); 321 | } 322 | 323 | template 324 | typename std::enable_if::value>::type 326 | kokkos_dtt(FMM* fmm) 327 | { 328 | printf("cuda\n"); 329 | const size_t min_block_size = 128; 330 | const size_t max_block_size = 1024; 331 | const size_t super_block_size = 4096; 332 | const size_t memory_capacity = 1024 * 1024 * 1024; 333 | 334 | Scheduler sched(typename Scheduler::memory_space(), memory_capacity, 335 | min_block_size, std::min(max_block_size, memory_capacity), 336 | std::min(super_block_size, memory_capacity)); 337 | 338 | FMM* d_fmm; 339 | FMM* h_fmm; 340 | 341 | init_device_fmm(fmm, &h_fmm, &d_fmm); 342 | 343 | node_t* root_node = h_fmm->nodes + h_fmm->root; 344 | 345 | Kokkos::BasicFuture f = Kokkos::host_spawn( 346 | Kokkos::TaskTeam(sched), 347 | dual_tree_task(d_fmm, root_node, root_node)); 348 | Kokkos::wait(sched); 349 | 350 | fini_device_fmm(fmm, h_fmm, d_fmm); 351 | } 352 | 353 | template 354 | void perform_traversals(FMM* fmm) 355 | 
{ 356 | using Scheduler = 357 | Kokkos::KOKKOS_SCHEDULER; 358 | 359 | Kokkos::initialize(); 360 | #ifdef __CUDACC__ 361 | const int stack_size = 8192; 362 | CUDACHK(cudaDeviceSetLimit(cudaLimitStackSize, stack_size)); 363 | #endif 364 | 365 | Timer timer; 366 | Timer tot_timer; 367 | 368 | timer.start(); 369 | tot_timer.start(); 370 | kokkos_upwards(fmm); 371 | timer.stop(); 372 | printf("\n"); 373 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 374 | Kokkos::finalize(); 375 | exit(0); 376 | 377 | timer.start(); 378 | kokkos_dtt(fmm); 379 | timer.stop(); 380 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 381 | 382 | timer.start(); 383 | #pragma omp parallel 384 | #pragma omp single 385 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 386 | timer.stop(); 387 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 388 | 389 | tot_timer.stop(); 390 | printf("--------------------\n"); 391 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 392 | printf("--------------------\n\n"); 393 | 394 | } 395 | -------------------------------------------------------------------------------- /main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | void perform_fmm(int argc, char** argv) 13 | { 14 | FMM* fmm = new FMM(); 15 | read_input(argc, argv, fmm); 16 | 17 | init(fmm); 18 | perform_traversals(fmm); 19 | verify(fmm); 20 | finalise(fmm); 21 | 22 | delete fmm; 23 | } 24 | 25 | int main(int argc, char** argv) 26 | { 27 | #ifdef FMM_DOUBLE 28 | perform_fmm(argc, argv); 29 | #else 30 | perform_fmm(argc, argv); 31 | #endif 32 | }; 33 | -------------------------------------------------------------------------------- /omp-for/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | 
CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 | endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /omp-for/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omp-for/traversal.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void upwards_pass(FMM* fmm, node_t* node) 11 | { 12 | for (size_t i = 0; i < node->num_children; ++i) { 13 | #pragma omp task 14 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 15 | } 16 | #pragma omp taskwait 17 | 18 | if (node->is_leaf()) 19 | p2m(fmm, node); 20 | else 21 | m2m(fmm, node); 22 | } 23 | 24 | template 25 | void downwards_pass(FMM* fmm, node_t* node) 26 | { 27 | if (node->is_leaf()) 28 | l2p(fmm, node); 29 | else { 30 | l2l(fmm, node); 31 | for (size_t i = 0; i < node->num_children; ++i) { 32 | #pragma omp task 33 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 34 | } 35 | } 36 | #pragma omp taskwait 37 | } 38 | 39 | template 40 | void perform_traversals(FMM* fmm) 41 | { 42 | #pragma omp parallel 43 | #pragma omp single 44 | printf("Running on %d threads\n", omp_get_num_threads()); 45 | 46 | Timer timer; 47 | Timer tot_timer; 48 | 49 | timer.start(); 50 | tot_timer.start(); 51 | #pragma omp parallel 52 | #pragma omp single 53 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 54 | timer.stop(); 55 | printf("\n"); 56 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 57 | 58 | Timer deps_timer; 59 | deps_timer.start(); 60 | 61 | std::vector> p2p_deps(fmm->num_nodes); 62 | std::vector> m2l_deps(fmm->num_nodes); 63 | 64 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 65 | 66 | deps_timer.stop(); 67 | printf(" %-16s %12.8f\n", "Deps. 
Time (s) ", deps_timer.elapsed()); 68 | 69 | Timer compute_timer; 70 | compute_timer.start(); 71 | #pragma omp parallel for schedule(guided) 72 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 73 | node_t* target = &fmm->nodes[i]; 74 | for (size_t j = 0; j < p2p_deps[i].size(); ++j) { 75 | node_t* source = fmm->nodes + p2p_deps[i][j]; 76 | if (target == source) { 77 | p2p_tiled(fmm, target); 78 | } 79 | else { 80 | p2p_tiled(fmm, target, source); 81 | } 82 | } 83 | } 84 | compute_timer.stop(); 85 | printf(" %-16s %12.8f\n", "P2P Time (s) ", compute_timer.elapsed()); 86 | 87 | compute_timer.start(); 88 | #pragma omp parallel for schedule(guided) 89 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 90 | node_t* target = &fmm->nodes[i]; 91 | for (size_t j = 0; j < m2l_deps[i].size(); ++j) { 92 | node_t* source = fmm->nodes + m2l_deps[i][j]; 93 | m2l(fmm, target, source); 94 | } 95 | } 96 | compute_timer.stop(); 97 | timer.stop(); 98 | printf(" %-16s %12.8f\n", "M2L Time (s) ", compute_timer.elapsed()); 99 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 100 | 101 | timer.start(); 102 | #pragma omp parallel 103 | #pragma omp single 104 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 105 | timer.stop(); 106 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 107 | 108 | tot_timer.stop(); 109 | printf("--------------------\n"); 110 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 111 | printf("--------------------\n\n"); 112 | } 113 | -------------------------------------------------------------------------------- /omp-task/flags.makefile: -------------------------------------------------------------------------------- 1 | CC_GNU=g++ 2 | CC_INTEL=icpc 3 | CC_CLANG=clang++ 4 | CC_ARM=armclang++ 5 | CC_CRAY=CC 6 | CC=$(CC_$(COMPILER)) 7 | 8 | UNAME=$(shell uname -m) 9 | ifeq ($(UNAME), aarch64) 10 | ARCH_CFLAGS = -mcpu=$(ARCH) -mtune=$(ARCH) 11 | ifeq ($(COMPILER), GNU) 12 | ARCH_CFLAGS += -mlow-precision-recip-sqrt 13 | endif 14 
| endif 15 | ifeq ($(UNAME), x86_64) 16 | ARCH_CFLAGS = -march=$(ARCH) 17 | endif 18 | 19 | CFLAGS_CLANG=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 20 | CFLAGS_GNU=-std=c++11 -Ofast -fno-cx-limited-range $(ARCH_CFLAGS) -fopenmp 21 | CFLAGS_INTEL=-std=c++11 -Ofast -x$(ARCH) -qopenmp 22 | CFLAGS_ARM=-std=c++11 -Ofast $(ARCH_CFLAGS) -fopenmp 23 | CFLAGS_CRAY=-std=c++11 -Ofast -fopenmp 24 | CFLAGS=$(CFLAGS_$(COMPILER)) -Wall -g 25 | 26 | LIBS=-fopenmp 27 | -------------------------------------------------------------------------------- /omp-task/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omp-task/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | template 9 | void upwards_pass(FMM* fmm, node_t* node) 10 | { 11 | for (size_t i = 0; i < node->num_children; ++i) { 12 | #pragma omp task 13 | upwards_pass(fmm, 
&fmm->nodes[node->child[i]]); 14 | } 15 | #pragma omp taskwait 16 | 17 | if (node->is_leaf()) 18 | p2m(fmm, node); 19 | else 20 | m2m(fmm, node); 21 | } 22 | 23 | template 24 | void downwards_pass(FMM* fmm, node_t* node) 25 | { 26 | if (node->is_leaf()) 27 | l2p(fmm, node); 28 | else { 29 | l2l(fmm, node); 30 | for (size_t i = 0; i < node->num_children; ++i) { 31 | #pragma omp task 32 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 33 | } 34 | } 35 | #pragma omp taskwait 36 | } 37 | 38 | template 39 | void dual_tree(FMM* fmm, node_t* target, node_t* source) 40 | { 41 | T dx = source->cx - target->cx; 42 | T dy = source->cy - target->cy; 43 | T dz = source->cz - target->cz; 44 | T r2 = dx * dx + dy * dy + dz * dz; 45 | T d1 = source->rad * static_cast(2.0); 46 | T d2 = target->rad * static_cast(2.0); 47 | 48 | if ((d1 + d2) * (d1 + d2) < fmm->theta2 * r2) { 49 | omp_set_lock(&target->m2l_lock); 50 | m2l(fmm, target, source); 51 | omp_unset_lock(&target->m2l_lock); 52 | } 53 | else if (source->is_leaf() && target->is_leaf()) { 54 | omp_set_lock(&target->p2p_lock); 55 | if (target == source) 56 | p2p_tiled(fmm, target); 57 | else 58 | p2p_tiled(fmm, target, source); 59 | omp_unset_lock(&target->p2p_lock); 60 | } 61 | else { 62 | T target_sz = target->rad; 63 | T source_sz = source->rad; 64 | if (source->is_leaf() || ((target_sz >= source_sz) && !target->is_leaf())) { 65 | for (size_t i = 0; i < target->num_children; ++i) { 66 | node_t* child = &fmm->nodes[target->child[i]]; 67 | #pragma omp task if(target->num_points > TASK_CUTOFF) 68 | dual_tree(fmm, child, source); 69 | } 70 | } 71 | else { 72 | for (size_t i = 0; i < source->num_children; ++i) { 73 | //#pragma omp task if(source->num_points > TASK_CUTOFF && SOURCE_TASK_SPAWN) 74 | dual_tree(fmm, target, &fmm->nodes[source->child[i]]); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template 81 | void perform_traversals(FMM* fmm) 82 | { 83 | #pragma omp parallel 84 | #pragma omp single 85 | printf("Running on %d 
threads\n", omp_get_num_threads()); 86 | 87 | Timer timer; 88 | Timer tot_timer; 89 | 90 | timer.start(); 91 | tot_timer.start(); 92 | #pragma omp parallel 93 | #pragma omp single 94 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 95 | timer.stop(); 96 | printf("\n"); 97 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 98 | 99 | timer.start(); 100 | #pragma omp parallel 101 | #pragma omp single 102 | dual_tree(fmm, &fmm->nodes[fmm->root], &fmm->nodes[fmm->root]); 103 | timer.stop(); 104 | printf("%-20s %12.8f\n", "DTT Time (s) ", timer.elapsed()); 105 | 106 | timer.start(); 107 | #pragma omp parallel 108 | #pragma omp single 109 | downwards_pass(fmm, &fmm->nodes[fmm->root]); 110 | timer.stop(); 111 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 112 | 113 | tot_timer.stop(); 114 | printf("--------------------\n"); 115 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 116 | printf("--------------------\n\n"); 117 | } 118 | -------------------------------------------------------------------------------- /omptarget/flags.makefile: -------------------------------------------------------------------------------- 1 | CC=clang++ 2 | 3 | ifeq ($(TARGET), GPU) 4 | CFLAGS=-Ofast -mllvm --nvptx-f32ftz -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(ARCH) 5 | LIBS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=$(ARCH) 6 | else 7 | CFLAGS=-Ofast -fopenmp -fopenmp-targets=x86_64 -march=$(ARCH) 8 | LIBS=-fopenmp -fopenmp-targets=x86_64 -march=$(ARCH) 9 | endif 10 | -------------------------------------------------------------------------------- /omptarget/node.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | struct node_t { 7 | node_t() = default; 8 | ~node_t() = default; 9 | node_t(T arg_cx, T arg_cy, T arg_cz, T arg_rad, size_t arg_num_points, 10 | size_t arg_point_idx, size_t arg_mult_idx, 
size_t arg_node_idx, 11 | size_t arg_level) 12 | : cx{arg_cx}, 13 | cy{arg_cy}, 14 | cz{arg_cz}, 15 | rad{arg_rad}, 16 | num_points{arg_num_points}, 17 | point_idx{arg_point_idx}, 18 | mult_idx{arg_mult_idx}, 19 | node_idx{arg_node_idx}, 20 | level{arg_level} 21 | { 22 | omp_init_lock(&p2p_lock); 23 | omp_init_lock(&m2l_lock); 24 | } 25 | T cx; 26 | T cy; 27 | T cz; 28 | T rad; 29 | size_t num_children = 0; 30 | size_t child[8] = {0}; 31 | size_t num_points; 32 | size_t point_idx; 33 | size_t mult_idx; 34 | size_t node_idx; 35 | size_t level; 36 | omp_lock_t p2p_lock; 37 | omp_lock_t m2l_lock; 38 | 39 | bool is_leaf() const { return (num_children == 0); } 40 | }; 41 | -------------------------------------------------------------------------------- /omptarget/traversal.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void upwards_pass(FMM* fmm, node_t* node) 11 | { 12 | for (size_t i = 0; i < node->num_children; ++i) { 13 | #pragma omp task 14 | upwards_pass(fmm, &fmm->nodes[node->child[i]]); 15 | } 16 | #pragma omp taskwait 17 | 18 | if (node->is_leaf()) 19 | p2m(fmm, node); 20 | else 21 | m2m(fmm, node); 22 | } 23 | 24 | template 25 | void downwards_pass(FMM* fmm, node_t* node) 26 | { 27 | if (node->is_leaf()) 28 | l2p(fmm, node); 29 | else { 30 | l2l(fmm, node); 31 | for (size_t i = 0; i < node->num_children; ++i) { 32 | #pragma omp task 33 | downwards_pass(fmm, &fmm->nodes[node->child[i]]); 34 | } 35 | } 36 | #pragma omp taskwait 37 | } 38 | 39 | template 40 | void dtt(FMM* fmm) 41 | { 42 | node_t* nodes = fmm->nodes; 43 | T* x = fmm->x; 44 | T* y = fmm->y; 45 | T* z = fmm->z; 46 | T* w = fmm->w; 47 | T* ax = fmm->ax; 48 | T* ay = fmm->ay; 49 | T* az = fmm->az; 50 | T* aw = fmm->p; 51 | complex_t* mm = fmm->m; 52 | complex_t* ml = fmm->l; 53 | 54 | size_t nn = fmm->num_nodes; 55 | size_t np = fmm->num_points; 56 | size_t nm = 
fmm->num_multipoles; 57 | 58 | #pragma omp target enter data map(to: nodes[:nn], x[:np], y[:np], z[:np], \ 59 | w[:np], ax[:np], ay[:np], az[:np], \ 60 | aw[:np], mm[:nm * nn], ml[:nm * nn]) 61 | 62 | Timer deps_timer; 63 | deps_timer.start(); 64 | 65 | std::vector> p2p_deps(fmm->num_nodes); 66 | std::vector> m2l_deps(fmm->num_nodes); 67 | 68 | get_deps_omp(fmm, &p2p_deps, &m2l_deps); 69 | 70 | deps_timer.stop(); 71 | printf(" %-16s %12.8f\n", "Deps. Time (s) ", deps_timer.elapsed()); 72 | 73 | deps_timer.start(); 74 | size_t* p2p_nodes; 75 | size_t* p2p_deps_array; 76 | size_t* p2p_deps_offsets; 77 | size_t* p2p_deps_sizes; 78 | size_t p2p_deps_tot; 79 | size_t p2p_num_nodes; 80 | 81 | size_t* m2l_nodes; 82 | size_t* m2l_deps_array; 83 | size_t* m2l_deps_offsets; 84 | size_t* m2l_deps_sizes; 85 | size_t m2l_deps_tot; 86 | size_t m2l_num_nodes; 87 | 88 | pack_deps(p2p_deps, &p2p_nodes, &p2p_deps_array, &p2p_deps_offsets, 89 | &p2p_deps_sizes, &p2p_deps_tot, &p2p_num_nodes); 90 | pack_deps(m2l_deps, &m2l_nodes, &m2l_deps_array, &m2l_deps_offsets, 91 | &m2l_deps_sizes, &m2l_deps_tot, &m2l_num_nodes); 92 | deps_timer.stop(); 93 | printf("%-20s %12.8f\n", " Pack Time (s) ", deps_timer.elapsed()); 94 | 95 | #pragma omp target enter data map(to: p2p_nodes[:p2p_num_nodes],\ 96 | p2p_deps_array[:p2p_deps_tot],\ 97 | p2p_deps_offsets[:p2p_num_nodes],\ 98 | p2p_deps_sizes[:p2p_num_nodes]) 99 | 100 | #pragma omp target enter data map(to: m2l_nodes[:m2l_num_nodes],\ 101 | m2l_deps_array[:m2l_deps_tot],\ 102 | m2l_deps_offsets[:m2l_num_nodes],\ 103 | m2l_deps_sizes[:m2l_num_nodes]) 104 | 105 | Timer compute_timer; 106 | compute_timer.start(); 107 | 108 | #pragma omp target teams distribute 109 | for (size_t ni = 0; ni < p2p_num_nodes; ++ni) { 110 | node_t* target = &nodes[p2p_nodes[ni]]; 111 | size_t p2p_size = p2p_deps_sizes[ni]; 112 | size_t p2p_offset = p2p_deps_offsets[ni]; 113 | 114 | for (size_t nj = 0; nj < p2p_size; ++nj) { 115 | size_t source_idx = 
p2p_deps_array[p2p_offset + nj]; 116 | node_t* source = nodes + source_idx; 117 | // 118 | // if (target == source) p2p_tiled(fmm, target); 119 | // else p2p_tiled(fmm, target, source); 120 | // } 121 | // } 122 | 123 | //static __attribute((address_space(3))) 124 | T shmem[512 * 4]; // NOTE(review): staging buffer for at most 512 interleaved (x,y,z,w) source points — assumes every leaf has source->num_points <= 512, TODO confirm tree leaf capacity 125 | T* source_pos = (T*)shmem; 126 | // Stage the source node's points into the local buffer before the i-loop reads them repeatedly. 127 | #pragma omp parallel for 128 | for (size_t j = 0; j < source->num_points; ++j) { 129 | const size_t jj = j + source->point_idx; 130 | source_pos[j * 4 + 0] = x[jj]; 131 | source_pos[j * 4 + 1] = y[jj]; 132 | source_pos[j * 4 + 2] = z[jj]; 133 | source_pos[j * 4 + 3] = w[jj]; 134 | } 135 | 136 | #pragma omp parallel for 137 | for (size_t i = 0; i < target->num_points; ++i) { 138 | const size_t ip = i + target->point_idx; 139 | const T xi = x[ip]; 140 | const T yi = y[ip]; 141 | const T zi = z[ip]; 142 | T tax = static_cast(0.0); 143 | T tay = static_cast(0.0); 144 | T taz = static_cast(0.0); 145 | T taw = static_cast(0.0); 146 | for (size_t j = 0; j < source->num_points; ++j) { 147 | const size_t jp = j + source->point_idx; // NOTE(review): jp is unused — source data is read from source_pos below 148 | const T dx = source_pos[j * 4 + 0] - xi; 149 | const T dy = source_pos[j * 4 + 1] - yi; 150 | const T dz = source_pos[j * 4 + 2] - zi; 151 | const T wj = source_pos[j * 4 + 3]; 152 | const T r = dx * dx + dy * dy + dz * dz; 153 | const T inv_r = (r == 0.0) ?
0.0 : 1.0/std::sqrt(r); 154 | const T inv_r_3 = inv_r * inv_r * inv_r * wj; 155 | tax += dx * inv_r_3; 156 | tay += dy * inv_r_3; 157 | taz += dz * inv_r_3; 158 | taw += inv_r * wj; 159 | } 160 | ax[ip] += tax; 161 | ay[ip] += tay; 162 | az[ip] += taz; 163 | aw[ip] += taw; 164 | } 165 | } 166 | } 167 | 168 | //#pragma omp parallel for schedule(guided) 169 | // for (size_t i = 0; i < fmm->num_nodes; ++i) { 170 | // node_t* target = &fmm->nodes[i]; 171 | // for (size_t j = 0; j < p2p_deps[i].size(); ++j) { 172 | // node_t* source = fmm->nodes + p2p_deps[i][j]; 173 | // if (target == source) { 174 | // p2p_tiled(fmm, target); 175 | // } 176 | // else { 177 | // p2p_tiled(fmm, target, source); 178 | // } 179 | // } 180 | // } 181 | compute_timer.stop(); 182 | printf(" %-16s %12.8f\n", "P2P Time (s) ", compute_timer.elapsed()); 183 | 184 | compute_timer.start(); 185 | #pragma omp parallel for schedule(guided) 186 | for (size_t i = 0; i < fmm->num_nodes; ++i) { 187 | node_t* target = &fmm->nodes[i]; 188 | for (size_t j = 0; j < m2l_deps[i].size(); ++j) { 189 | node_t* source = fmm->nodes + m2l_deps[i][j]; 190 | m2l(fmm, target, source); 191 | } 192 | } 193 | compute_timer.stop(); 194 | printf(" %-16s %12.8f\n", "M2L Time (s) ", compute_timer.elapsed()); 195 | 196 | #pragma omp target exit data map(from: ax[:np], ay[:np], az[:np], aw[:np]) 197 | } 198 | 199 | 200 | template 201 | void perform_traversals(FMM* fmm) 202 | { 203 | #pragma omp parallel 204 | #pragma omp single 205 | printf("Running on %d threads\n", omp_get_num_threads()); 206 | 207 | Timer timer; 208 | Timer tot_timer; 209 | 210 | timer.start(); 211 | tot_timer.start(); 212 | #pragma omp parallel 213 | #pragma omp single 214 | upwards_pass(fmm, &fmm->nodes[fmm->root]); 215 | timer.stop(); 216 | printf("\n"); 217 | printf("%-20s %12.8f\n", "Upwards Time (s) ", timer.elapsed()); 218 | 219 | dtt(fmm); 220 | 221 | timer.start(); 222 | #pragma omp parallel 223 | #pragma omp single 224 | downwards_pass(fmm, 
&fmm->nodes[fmm->root]); 225 | timer.stop(); 226 | printf("%-20s %12.8f\n", "Downwards Time (s) ", timer.elapsed()); 227 | 228 | tot_timer.stop(); 229 | printf("--------------------\n"); 230 | printf("%-20s %12.8f\n", "Total Time (s) ", tot_timer.elapsed()); 231 | printf("--------------------\n\n"); 232 | } 233 | --------------------------------------------------------------------------------