├── LICENSE ├── eval_hermite_final.py ├── FMM_multipole2local.py └── m2l_gpu_test.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Barba group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /eval_hermite_final.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate hermite polynomials using cuda 3 | * use modified version of previously written polynomial evaluator 4 | """ 5 | import warnings 6 | warnings.simplefilter('ignore',Warning) 7 | 8 | import numpy 9 | import time 10 | from math import sqrt, log10, exp, ceil 11 | import sys 12 | 13 | from optparse import OptionParser 14 | 15 | from scipy.special import hermite 16 | 17 | import pycuda.driver as cuda 18 | #from pycuda.compiler import SourceModule 19 | 20 | cuda.init() 21 | dev = cuda.Device(0) 22 | ctx = dev.make_context() 23 | 24 | 25 | # pad an array a with zeros until it is of desired 26 | # length len 27 | # need to pad at the beginning, as highest power of 28 | # x is held in a[0] 29 | def pad(a,l): 30 | a_len = len(a) 31 | pad_length = l-a_len 32 | 33 | return numpy.append(numpy.zeros(pad_length),a) 34 | 35 | # generate an array of hermite polynomials of degree < n 36 | # all padded to length of maximum poly. 37 | def gen_hermite_array(n): 38 | pad_len = n 39 | # initial polynomial 40 | h = pad(hermite(0).coeffs,pad_len) 41 | for i in range(1,n): 42 | h_temp = pad(hermite(i).coeffs,pad_len) 43 | h = numpy.append(h,h_temp) 44 | 45 | return h 46 | 47 | # pre-compute the hermite polynomials to be used throughout 48 | # the calculations (for speed) 49 | # input - p - number of terms kept in hermite / taylor series 50 | def gen_hermite_polys(p): 51 | H = [] 52 | for i in range(p): 53 | H.append(hermite(i)) 54 | 55 | return H 56 | 57 | def gen_alpha_cuda(p): 58 | alpha = [] 59 | for i in range(0,p): 60 | for j in range(0,p-i): 61 | alpha.append(i) 62 | alpha.append(j) 63 | 64 | return alpha 65 | 66 | def build_fact_cache(q): 67 | f = [] 68 | for i in range(q+1): 69 | f.append(factorial(i)) 70 | 71 | return f 72 | 73 | def factorial(n): 74 | if n <= 0: 75 | return 1 76 | 77 | return n*factorial(n-1) 78 | 79 | def gen_alpha(p): 80 | alpha = [] 81 | for i in range(0,p): 82 | for j in range(0,p-i): 83 | alpha.append([i,j]) 84 | 85 | return alpha 86 | 87 | # alpha! 
= product(alpha[i]) 88 | def a_fact(alpha): 89 | a_fact = factorial(alpha[0])*factorial(alpha[1]) 90 | return a_fact 91 | 92 | # get nCp needed for number of coefficients for taylor series 93 | # input - n 94 | # p 95 | def nCp(n,p): 96 | top = factorial(n) 97 | bottom = factorial(p)*factorial(n-p) 98 | 99 | return 1.*(top/bottom) 100 | 101 | delta = 0.25 102 | p = 9 103 | 104 | parser = OptionParser() 105 | parser.add_option("-p",type="int",dest="p",help="Number of terms") 106 | parser.add_option("-n",type="int",dest="n_points",help="number of blocks (n_points / 512)") 107 | 108 | (options, args) = parser.parse_args() 109 | 110 | p = options.p 111 | 112 | alpha = numpy.array(gen_alpha_cuda(p)).astype(numpy.float32) 113 | H = gen_hermite_array(p).astype(numpy.float32) 114 | 115 | # general parameters 116 | H_orig = gen_hermite_polys(p) 117 | num_terms = nCp(p+1,2) 118 | num_clusters = 36 119 | A_gen = numpy.random.randn(num_clusters*num_terms).astype(numpy.float32) 120 | 121 | f_cache = build_fact_cache(p) 122 | 123 | # start off with python generated A_alpha values for one center 124 | blocksize = 512 125 | n_points = options.n_points 126 | n_blocks = options.n_points / blocksize 127 | 128 | # computing evaluations from many source clusters 129 | # use same center for all calculations 130 | # uniform target points for speed of generation 131 | sb = numpy.array([0.25,0.25]*num_clusters).astype(numpy.float32) 132 | tx = numpy.ones(n_points).astype(numpy.float32) 133 | ty = numpy.ones(n_points).astype(numpy.float32) 134 | 135 | results = numpy.zeros(n_points).astype(numpy.float32) 136 | r_orig = numpy.empty_like(results) 137 | 138 | alpha = numpy.array(gen_alpha_cuda(p)).astype(numpy.float32) 139 | alpha_orig = gen_alpha(p) 140 | 141 | # setup cuda 142 | r = numpy.zeros(n_points).astype(numpy.float32) 143 | r_out = numpy.empty_like(r) 144 | 145 | # assign memory on gpu 146 | r_gpu = cuda.mem_alloc(r.size * r.dtype.itemsize) 147 | tx_gpu = cuda.mem_alloc(tx.size * tx.dtype.itemsize) 148 | ty_gpu = cuda.mem_alloc(ty.size * ty.dtype.itemsize) 149 | alpha_gpu = cuda.mem_alloc(alpha.size * alpha.dtype.itemsize) 150 | H_gpu = cuda.mem_alloc(H.size * H.dtype.itemsize) 151 | 152 | # copy memory from host to gpu 153 | cuda.memcpy_htod(r_gpu,r) 154 | cuda.memcpy_htod(tx_gpu,tx) 155 | cuda.memcpy_htod(ty_gpu,ty) 156 | cuda.memcpy_htod(alpha_gpu,alpha) 157 | cuda.memcpy_htod(H_gpu,H) 158 | 159 | # decide how many clusters are evaluated per instance 160 | avail_threads = blocksize - len(H) - len(alpha) 161 | threads_per_call = len(alpha)/2 + 2 162 | clusters_per_call = int(avail_threads*1. / threads_per_call) 163 | num_calls_needed = int(ceil(num_clusters*1. / clusters_per_call)) 164 | 165 | if num_calls_needed == 0: 166 | num_calls_needed += 1 167 | 168 | clusters_called = 0 169 | total_kernel_time = 0. 170 | loop_time = 0. 
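# ---------------------------------------------------------------------------
# Hedged sketch (not part of the timed run): a direct CPU evaluation of the
# same truncated Hermite expansion, for spot-checking a few entries of r_out
# after the GPU loop. It assumes A_gen holds len(alpha)/2 coefficients per
# cluster in the order the kernel indexes them; note the kernel below only
# hard-codes the first three Horner terms, so close agreement is only
# expected for small p.
def cpu_reference(num_check=4):
    r_ref = numpy.zeros(num_check)
    terms_per_cluster = len(alpha_orig)
    for t in range(num_check):
        for c in range(num_clusters):
            # same scaled distance as the kernel: (target - center) / (sqrt(2)*sigma)
            x = (tx[t] - sb[2*c]) / (sqrt(2.0) * delta)
            y = (ty[t] - sb[2*c+1]) / (sqrt(2.0) * delta)
            pre_mult = exp(-x*x) * exp(-y*y)
            for i, (a1, a2) in enumerate(alpha_orig):
                coeff = A_gen[c*terms_per_cluster + i]
                r_ref[t] += coeff * pre_mult * H_orig[a1](x) * H_orig[a2](y)
    return r_ref
# e.g. after the memcpy_dtoh below: print r_out[:4], cpu_reference(4)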
171 | 172 | tic = time.time() 173 | 174 | for k in range(num_calls_needed): 175 | # split the data 176 | len_A = len(alpha)/2 177 | A_curr = A_gen[k*clusters_per_call*len_A:k*clusters_per_call*len_A+clusters_per_call*len_A] 178 | sb_curr = sb[k*2*clusters_per_call:k*2*clusters_per_call+2*clusters_per_call] 179 | 180 | # keep track of how much of this call we have evaluated 181 | clusters_called += clusters_per_call 182 | if clusters_called <= num_clusters: 183 | clusters_this_call = clusters_per_call 184 | else: 185 | clusters_this_call = num_clusters-clusters_called+clusters_per_call 186 | 187 | # assign cuda memory for current set of evaluations 188 | A_gpu = cuda.mem_alloc(A_curr.size * A_curr.dtype.itemsize) 189 | sb_gpu = cuda.mem_alloc(sb_curr.size * sb_curr.dtype.itemsize) 190 | 191 | # copy to device 192 | cuda.memcpy_htod(A_gpu,A_curr) 193 | cuda.memcpy_htod(sb_gpu,sb_curr) 194 | 195 | # cuda source 196 | hermite_eval = cuda.SourceModule(""" 197 | #define A_TERMS %(lenA)d 198 | #define TERMS %(termsAlpha)d 199 | #define CLUSTERS %(clusters)d 200 | #define POLY_TERMS %(polyTerms)d 201 | #define BLOCKSIZE %(blocksize)d 202 | #define SQRT_2 1.4142135623730951f 203 | 204 | #define SIGMA %(sigma)f 205 | #define LEN_ALPHA %(len_alpha)d 206 | #define NUM_TERMS %(num_terms)d 207 | #define OPTS3 %(opts3)d 208 | #define NUM_CLUSTERS %(num_clusters)d 209 | 210 | #define DEST_PER_THREAD 2 211 | 212 | 213 | // slightly optimised evaluation -- do all calculations for source clusters 214 | // at once -- save on memory bandwidth 215 | __global__ void eval_hermite2(float *r, float *A, float *tx, float *ty, float *sb, 216 | float *alpha, float *H) 217 | { 218 | float result, x, y; 219 | int alpha1; 220 | float h1, h2; 221 | int i, k; 222 | 223 | float pre_mult, t_x, t_y; 224 | 225 | // shared memory 226 | __shared__ float shared_alpha[TERMS]; 227 | __shared__ float shared_A[A_TERMS]; 228 | __shared__ float shared_sb[CLUSTERS]; 229 | __shared__ float shared_H[POLY_TERMS]; 230 | 231 | //////////////////////////////// 232 | // Read vars into shared memory 233 | // WARNING: Each block needs more threads than (TERMS + A_TERMS + POLY_TERMS + CLUSTERS) 234 | // otherwise it won't work. 
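// (Illustrative sketch, numbers not checked at compile time): e.g. for p = 5
// the host builds TERMS = p*(p+1) = 30 and POLY_TERMS = p*p = 25, and the
// host-side clusters_per_call computation keeps A_TERMS + CLUSTERS within the
// remaining threads, so the condition above holds for BLOCKSIZE = 512.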
235 | //////////////////////////////// 236 | // select what each thread reads 237 | if (threadIdx.x < TERMS){ 238 | // shared_alpha case 239 | i = 0; 240 | k = 0; 241 | } else if (threadIdx.x < TERMS + A_TERMS) { 242 | // shared_A case 243 | i = 1; 244 | k = - TERMS; 245 | } else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS) { 246 | // shared_H case 247 | i = 2; 248 | k = - TERMS - A_TERMS; 249 | } else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS + CLUSTERS) { 250 | // shared_sb case 251 | i = 3; 252 | k = - TERMS - A_TERMS - POLY_TERMS; 253 | } else { 254 | // No read case 255 | i = 4; 256 | k = 0; 257 | } 258 | // diverge the threads to have independent reads 259 | switch (i){ 260 | case 0: 261 | shared_alpha[threadIdx.x + k] = alpha[threadIdx.x + k]; 262 | break; 263 | case 1: 264 | shared_A[threadIdx.x + k] = A[threadIdx.x + k]; 265 | break; 266 | case 2: 267 | shared_H[threadIdx.x + k] = H[threadIdx.x + k]; 268 | break; 269 | case 3: 270 | shared_sb[threadIdx.x + k] = sb[threadIdx.x + k]; 271 | break; 272 | default: 273 | break; 274 | } 275 | 276 | //__threadfence_block(); 277 | __syncthreads(); 278 | 279 | if (OPTS3 < threadIdx.x + BLOCKSIZE*blockIdx.x) 280 | { 281 | return; 282 | } 283 | 284 | t_x = tx[threadIdx.x + BLOCKSIZE*blockIdx.x]; 285 | t_y = ty[threadIdx.x + BLOCKSIZE*blockIdx.x]; 286 | result = 0.0; 287 | 288 | /////////////////////////////// 289 | // Main loop, flops: (NumClusters * (19 + LenAlpha/2 * (14 + 4 * NumTerms)) + 2) 290 | /////////////////////////////// 291 | 292 | // run through this code for each cluster center 293 | for (k=0; k < NUM_CLUSTERS; k++) { 294 | 295 | // distance operator 296 | x = (t_x - shared_sb[k*2+0]) / SQRT_2 / SIGMA; 297 | //x = (t_x - sb[k*2+0]) / SQRT_2 / SIGMA; 298 | y = (t_y - shared_sb[k*2+1]) / SQRT_2 / SIGMA; 299 | //y = (t_y - sb[k*2+1]) / SQRT_2 / SIGMA; 300 | 301 | pre_mult = exp(-(x*x))*exp(-(y*y)); 302 | 303 | // look at shared memory - all variables called in 304 | // poly_eval should be in shared memory 305 | 306 | for (i=0; i < LEN_ALPHA/2; i++) 307 | { 308 | alpha1 = shared_alpha[i*2]; 309 | 310 | // I avoid the inner loop and get a superb speedup, but it needs to be hardcoded 311 | // is it possible to do the same using MACROS? or generating this from python? 
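// One possible answer (sketch, untested here): generate the unrolled lines on
// the Python side, e.g. join p copies of the pattern
//     h1 = shared_H[NUM_TERMS*alpha1 + j] + x*h1;
// for j = 0..p-1 into a string and splice it into this source together with
// the other substituted parameters, so the unrolling tracks p instead of
// being hand-edited for each truncation level.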
312 | // ONLY USE p=5 here 313 | h1 = h2 = 0.0f; 314 | h1 = shared_H[NUM_TERMS*alpha1 + 0] + x*h1; 315 | h1 = shared_H[NUM_TERMS*alpha1 + 1] + x*h1; 316 | h1 = shared_H[NUM_TERMS*alpha1 + 2] + x*h1; 317 | /* 318 | h1 = shared_H[NUM_TERMS*alpha1 + 3] + x*h1; 319 | h1 = shared_H[NUM_TERMS*alpha1 + 4] + x*h1; 320 | 321 | h1 = shared_H[NUM_TERMS*alpha1 + 5] + x*h1; 322 | h1 = shared_H[NUM_TERMS*alpha1 + 6] + x*h1; 323 | h1 = shared_H[NUM_TERMS*alpha1 + 7] + x*h1; 324 | h1 = shared_H[NUM_TERMS*alpha1 + 8] + x*h1; 325 | 326 | h1 = shared_H[NUM_TERMS*alpha1 + 9] + x*h1; 327 | h1 = shared_H[NUM_TERMS*alpha1 + 10] + x*h1; 328 | h1 = shared_H[NUM_TERMS*alpha1 + 11] + x*h1; 329 | */ 330 | 331 | //result += alpha1; 332 | 333 | alpha1 = shared_alpha[i*2 + 1]; 334 | h2 = shared_H[NUM_TERMS*alpha1 + 0] + y*h2; 335 | h2 = shared_H[NUM_TERMS*alpha1 + 1] + y*h2; 336 | h2 = shared_H[NUM_TERMS*alpha1 + 2] + y*h2; 337 | 338 | /* 339 | h2 = shared_H[NUM_TERMS*alpha1 + 3] + y*h2; 340 | h2 = shared_H[NUM_TERMS*alpha1 + 4] + y*h2; 341 | 342 | h2 = shared_H[NUM_TERMS*alpha1 + 5] + y*h2; 343 | h2 = shared_H[NUM_TERMS*alpha1 + 6] + y*h2; 344 | h2 = shared_H[NUM_TERMS*alpha1 + 7] + y*h2; 345 | h2 = shared_H[NUM_TERMS*alpha1 + 8] + y*h2; 346 | 347 | h2 = shared_H[NUM_TERMS*alpha1 + 9] + y*h2; 348 | h2 = shared_H[NUM_TERMS*alpha1 + 10] + y*h2; 349 | h2 = shared_H[NUM_TERMS*alpha1 + 11] + y*h2; 350 | */ 351 | result += shared_A[k*LEN_ALPHA/2+i]*pre_mult*h1*h2; 352 | } 353 | } 354 | r[threadIdx.x + BLOCKSIZE*blockIdx.x] += result; 355 | } 356 | """ % {'lenA':len(A_curr),'termsAlpha':len(alpha),'clusters':len(sb_curr),'polyTerms':len(H), 'blocksize':blocksize, 357 | 'sigma': delta, 'len_alpha': len(alpha), 'num_terms': p, 'opts3': len(tx), 'num_clusters':clusters_this_call}, 358 | nvcc="nvcc",options=['-use_fast_math'], keep=False, no_extern_c=False) 359 | 360 | start_time = time.time() 361 | func = hermite_eval.get_function("eval_hermite2") 362 | kernel_time = func(r_gpu,A_gpu,tx_gpu,ty_gpu,sb_gpu,alpha_gpu,H_gpu,block=(blocksize,1,1),grid=(n_blocks,1),time_kernel=True) 363 | total_kernel_time += kernel_time 364 | 365 | cuda_time = time.time() - start_time 366 | A_gpu.free() 367 | sb_gpu.free() 368 | 369 | cuda.memcpy_dtoh(r_out,r_gpu) 370 | 371 | toc = time.time() 372 | loop_time = toc-tic 373 | 374 | # FLOPS 375 | # p = truncation level, alpha = f(p) 376 | # n_points, num_clusters defined in program 377 | flops = n_points*(5+num_clusters*(17+len(alpha)/2*(10+8*p))) / 10**9 378 | 379 | print '# points\tKernel time (s)\tParticles/s\tGFLOPS' 380 | print '%d\t%f\t%f\t%f' %(n_points, total_kernel_time, n_points / total_kernel_time, flops / total_kernel_time) 381 | 382 | 383 | ctx.pop() 384 | -------------------------------------------------------------------------------- /FMM_multipole2local.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Test suite for the two-dimensional 1/r^2 multipole to local GPU kernel. 
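Runs the me2me translation kernel followed by the localExpansionReduction
kernel over a regular tree decomposition, and prints one LaTeX-style table
row of timings per (p, max_level) combination.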
3 | ''' 4 | import pycuda.driver as cuda 5 | from pycuda.driver import SourceModule 6 | import numpy 7 | import time 8 | from support import findInteractionList 9 | from numpy import zeros, array, ones, arange, log10, alltrue, isfinite, sqrt 10 | 11 | class cudaKernel: 12 | precomputed_division = """ 13 | // One block translates one ME to a new location 14 | #define BLOCKSIZE %(blocksize)d 15 | #define TERMS %(terms)d 16 | #define TERMS_C %(terms_c)d 17 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 18 | #define NID threadIdx.x 19 | #define SIZEIL 27 20 | 21 | /////////////////////////////////////////////////////////////////////////////////// 22 | // One block per source-ME. The block translates the source ME to a group of destinations. 23 | // 24 | // Input: 25 | // float *MEterms_ref Container of ME terms 26 | // float *MEpoints_ref Container of ME centers 27 | // float *DestPoint_lst Container of destination points 28 | // float *LEterms_ref LE data containers 29 | /////////////////////////////////////////////////////////////////////////////////// 30 | __global__ void me2me(float *MEterms_ref, float *MEpoints_ref, float *DestPoint_lst, float *LEterms_ref) 31 | { 32 | int m, ME_dest, blockLoop, Nterm; 33 | float ac, bd; 34 | float tnm1_r, tnm1_i; 35 | float comb, MATmn_r, MATmn_i; 36 | float LE_r, LE_i; 37 | float tX, tY; 38 | 39 | /////////////////////////////////////////// 40 | // Memory allocation part 41 | /////////////////////////////////////////// 42 | __shared__ float MEterms[TERMS_C]; // Common source point 43 | float srcX = MEpoints_ref[SOURCE*2]; 44 | float srcY = MEpoints_ref[SOURCE*2 +1]; 45 | 46 | // Multipole expansions 47 | if (NID < 2*TERMS) MEterms[NID] = MEterms_ref[SOURCE*TERMS_C + NID]; 48 | 49 | // Destination points 50 | __shared__ float dest_point_local[2*SIZEIL]; 51 | if (NID < 2*SIZEIL){ 52 | dest_point_local[NID] = DestPoint_lst[2*SOURCE*SIZEIL + NID]; 53 | } 54 | __syncthreads(); 55 | 56 | /////////////////////////////////////////// 57 | // Computing part: M2L translation 58 | /////////////////////////////////////////// 59 | 60 | // Loop over the translations, assign threads for LE coefficients to be computed 61 | blockLoop = 0; 62 | while (NID + blockLoop * BLOCKSIZE < SIZEIL * TERMS){ 63 | 64 | // Choose "destination point and term of the LE" to work on (this avoid the use of module op) 65 | #pragma unroll 66 | for(m = 0; m < SIZEIL; m++){ 67 | if (NID + blockLoop * BLOCKSIZE >= m * TERMS){ 68 | ME_dest = m; // Destination point 69 | Nterm = (NID + blockLoop * BLOCKSIZE) - m * TERMS; // LE term 70 | } 71 | } 72 | 73 | // translation distance 74 | tX = dest_point_local[ME_dest * 2] - srcX; 75 | tY = dest_point_local[ME_dest * 2 + 1] - srcY; 76 | 77 | // Precompute t^(n+1) 78 | tnm1_r = tX; 79 | tnm1_i = tY; 80 | #pragma unroll 81 | for (m = 1; m < TERMS; m++){ 82 | if (Nterm >= m){ // tnm1 = tnm1 * t 83 | ac = tnm1_r; 84 | bd = tnm1_i; 85 | tnm1_r = ac * tX - bd * tY; 86 | tnm1_i = ac * tY + bd * tX; 87 | } 88 | } 89 | 90 | if (Nterm & 1 == 1) { // if n is even number, change of sign 91 | tnm1_r = -tnm1_r; 92 | tnm1_i = -tnm1_i; 93 | } 94 | 95 | // Initialization for comb(n+m, m) 96 | comb = 1.0f; 97 | 98 | float tx_inv, ty_inv; 99 | tx_inv = tX / (tX*tX + tY*tY); 100 | ty_inv = - tY / (tX*tX + tY*tY); 101 | 102 | float tnm1_inv_r, tnm1_inv_i; 103 | tnm1_inv_r = tnm1_r / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 104 | tnm1_inv_i = - tnm1_i / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 105 | 106 | // update_complex = MEm_complex * comb * tnm1_inv 107 | LE_r = 
MEterms[0] * tnm1_inv_r - MEterms[1] * tnm1_inv_i; 108 | LE_i = MEterms[0] * tnm1_inv_i + MEterms[1] * tnm1_inv_r; 109 | 110 | // Do the dot product (mat_row, ME terms) for m >= 1 111 | #pragma unroll 112 | for (m = 1; m < TERMS; m++){ 113 | float float_m = (float) m; 114 | comb = (Nterm == 0) ? 1.0f : comb * (Nterm + float_m) / float_m; // comb (m+n, m) for next term 115 | 116 | // update tnm1 with contribution of next term. tnm1 = tnm1 * t 117 | ac = tnm1_inv_r; 118 | bd = tnm1_inv_i; 119 | tnm1_inv_r = ac * tx_inv - bd * ty_inv; 120 | tnm1_inv_i = ac * ty_inv + bd * tx_inv; 121 | 122 | // mat_nm * tnm1_inv 123 | MATmn_r = comb * tnm1_inv_r; 124 | MATmn_i = comb * tnm1_inv_i; 125 | 126 | // update_complex = MEm_complex * mat_mn 127 | int tmp_2m = 2 * m; 128 | LE_r += MEterms[tmp_2m] * MATmn_r - MEterms[tmp_2m +1] * MATmn_i; 129 | LE_i += MEterms[tmp_2m] * MATmn_i + MEterms[tmp_2m +1] * MATmn_r; 130 | } 131 | 132 | float2 tmp_f2; 133 | tmp_f2.x = LE_r; 134 | tmp_f2.y = LE_i; 135 | int out_offset = (SOURCE*SIZEIL*TERMS_C) + 2*(Nterm + ME_dest * TERMS); 136 | *((float2*) &LEterms_ref[out_offset]) = tmp_f2; 137 | 138 | blockLoop += 1; // increase loop counter 139 | } 140 | } 141 | """ 142 | 143 | local_expansion_reduction = """ 144 | // One block translates one ME to a new location 145 | #define BLOCKSIZE %(blocksize)d 146 | #define TERMS %(terms)d 147 | #define TERMS_C %(terms_c)d 148 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 149 | #define NID threadIdx.x 150 | #define SIZEIL 27 151 | 152 | __global__ void localExpansionReduction(float* sourceLE, int* reductionList, float* destinationLE){ 153 | float term; 154 | int i; 155 | 156 | // read inverse interaction list into shared memory 157 | __shared__ int inverse_interaction_list[SIZEIL]; 158 | if (NID < SIZEIL) inverse_interaction_list[NID] = reductionList[SIZEIL*SOURCE + NID]; 159 | // initialize the threadblock local expansion 160 | __shared__ float local_expansion[TERMS_C]; 161 | if (NID < TERMS_C) local_expansion[NID] = 0.0f; 162 | __syncthreads(); 163 | 164 | // Reduction loop; one local expansion at each loop. 165 | if (NID < TERMS_C) { 166 | for (i = 0; i < SIZEIL; i++) { 167 | if (inverse_interaction_list[i] < 0) break; 168 | // each thread gets one term 169 | term = sourceLE[inverse_interaction_list[i]*SIZEIL + NID]; 170 | local_expansion[NID] += term; 171 | } 172 | __threadfence_block(); 173 | // copy back to global 174 | destinationLE[TERMS_C*SOURCE + NID] = local_expansion[NID]; 175 | } 176 | } 177 | 178 | """ 179 | 180 | 181 | class dataset: 182 | p = 0 183 | dim = 0 184 | max_level = 0 185 | sizeIL = 0 186 | num_ME = 0 187 | num_sources = 0 188 | num_translations = 0 189 | MEterms_ref = [] 190 | MEpoints_ref = [] 191 | Trans_lst = [] 192 | DestPoint_lst = [] 193 | DestP_offset = [] 194 | DestP_length = [] 195 | LEterms_ref = [] 196 | LEout_offset = [] 197 | LEout_length = [] 198 | LEterms_cpu = [] 199 | LEreduction = [] 200 | start_cpu = 0 201 | end_cpu = 0 202 | n_blocks = 0 203 | blocksize = 0 204 | time_gpu = 0 205 | time_transfer = 0 206 | 207 | 208 | def generateTreeTestData(data): 209 | ''' This function generates a simple case of an FMM dataset for a regular tree decomposition. 
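All multipole coefficients are set to one and all expansion centers to the
origin; LEreduction records, for every destination box, which sources in its
interaction list contribute to its local expansion (unused slots stay -1).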
210 | ''' 211 | data.num_ME = 0 212 | for i in range(2, data.max_level+1): 213 | data.num_ME += 4**i 214 | 215 | data.MEterms_ref = ones(data.num_ME * 2*data.p) # Container of ME terms 216 | data.MEpoints_ref = zeros(data.num_ME * 2) # Container of ME centers 217 | data.LEreduction = -ones(data.num_ME * 27, int) # Contain the destination of the LE 218 | data.LEcounter = zeros(data.num_ME, int) # Counts the number of sources per LE 219 | for i in range(data.num_ME): 220 | data.MEpoints_ref[data.dim * i] = 0.0 221 | data.MEpoints_ref[data.dim * i +1] = 0.0 222 | 223 | # Data for the reduction step 224 | le_counter = 0 225 | for level_counter in range(2, data.max_level+1): 226 | for source_counter in range(4**level_counter): 227 | interaction_list = findInteractionList(source_counter, level_counter) 228 | for ilist_counter in range(len(interaction_list)): 229 | box_destination = le_counter + interaction_list[ilist_counter] 230 | box_counter = data.LEcounter[box_destination] 231 | data.LEreduction[27 * box_destination + box_counter] = source_counter 232 | data.LEcounter[box_destination] = box_counter + 1 233 | le_counter += 4**level_counter 234 | 235 | data.Trans_lst = arange(data.num_ME) # Number of 'the ME' to be translated 236 | data.num_sources = len(data.Trans_lst) 237 | data.num_translations = data.sizeIL * data.num_ME # Number of translations to be performed 238 | data.DestPoint_lst = zeros(data.sizeIL * data.num_ME * data.dim) # Destination points 239 | dst_arr1 = array([-1.60, 2.80, 0.1]) 240 | dst_arr2 = array([-1.60, -0.5, -1.40, 0.1, 1.80, 1.60]) 241 | dst_arr3 = array([-1.80, -1.40, 1.40]) 242 | dst_arr4 = array([-1.60, -1.80, 1.60]) 243 | 244 | for idME in range(data.num_ME): 245 | offset = idME * data.sizeIL 246 | point_index = 0 247 | # combine point sets (1&2) to form coordinates 248 | for x in dst_arr1: 249 | for y in dst_arr2: 250 | data.DestPoint_lst[data.dim*(offset + point_index)] = x 251 | data.DestPoint_lst[data.dim*(offset + point_index) +1] = y 252 | point_index += 1 253 | # combine point sets (3&4) to form coordinates 254 | for x in dst_arr3: 255 | for y in dst_arr4: 256 | data.DestPoint_lst[data.dim*(offset + point_index)] = x 257 | data.DestPoint_lst[data.dim*(offset + point_index) +1] = y 258 | point_index += 1 259 | 260 | # Translation destination Offset & Size 261 | offset_lst = zeros(3*data.num_ME, dtype=int) 262 | data.DestP_offset = zeros(data.num_ME, dtype=int) # Translation destination start at offset 263 | data.DestP_length = zeros(data.num_ME, dtype=int) # Number of translation destinations per ME 264 | for i in range(data.num_ME): 265 | data.DestP_offset[i] = i*data.sizeIL 266 | data.DestP_length[i] = data.sizeIL 267 | 268 | # Destination Output Offset & Size 269 | data.LEout_offset = zeros(data.num_ME, dtype=int) # Output of translation starts at offset 270 | data.LEout_length = zeros(data.num_ME, dtype=int) # Length of translation output 271 | for i in range(data.num_ME): 272 | data.LEout_offset[i] = i*data.sizeIL 273 | data.LEout_length[i] = data.sizeIL 274 | data.LEterms_ref = zeros(data.num_translations * data.dim*data.p) # LE data containers 275 | 276 | 277 | def generateSingleTestData(data): 278 | ''' One source - One destination case ''' 279 | data.p = 20 280 | data.num_ME = 1 281 | data.dim = 2 282 | 283 | data.MEterms_ref = 0.3*ones(data.num_ME * 2*data.p) # Container of ME terms 284 | data.MEpoints_ref = array([0.51, 0.52]) # Container of ME centers 285 | data.Trans_lst = array([0]) # Number of 'the ME' to be translated 286 | data.num_sources 
= len(data.Trans_lst) 287 | data.num_translations = 1 # Number of translations to be performed 288 | data.DestPoint_lst = array([0.72, 0.74]) 289 | data.DestP_offset = array([0]) # Translation destination start at offset 290 | data.DestP_length = array([1]) # Number of translation destinations per ME 291 | data.LEout_offset = array([0]) # Output of translation starts at offset 292 | data.LEout_length = array([1]) # Length of translation output 293 | data.LEterms_ref = zeros(data.num_translations * 2*data.p) # LE data containers 294 | 295 | 296 | def flops(ilz, bs, p, num_blocks): 297 | ''' 298 | Function with estimates for the number of floating point operations for a m2l call. 299 | 300 | ilz Interaction List size 301 | bs Blocksize (num threads in a block) 302 | p Terms in the expansion 303 | num_blocks Total number of blocks executed 304 | ''' 305 | return 1.0*num_blocks * (ilz * p * (35 + ilz*7 + (p-1)*28)) 306 | 307 | 308 | def bandwidth(ilz, p, num_blocks): 309 | ''' 310 | Computes the number of effective bytes moved by the kernel call 311 | ''' 312 | return num_blocks * (2*p + 2*ilz + 2*ilz*p + 4) * 4.0 313 | code 314 | def printTable(data): 315 | print data.p, ' & ', 316 | print data.num_translations, ' & ', 317 | print '%.2e & ' % data.time_gpu, 318 | print '%.2e & ' % data.time_reduction, 319 | print '%.2e & ' % data.time_transfer_in, 320 | print '%.2e & ' % data.time_transfer_out, 321 | print '%.2f & ' % (flops(27, data.blocksize, data.p, data.n_blocks) / 10**9 / data.time_gpu), 322 | print '%.2f & ' % (bandwidth(27, data.p, data.n_blocks) / 1024**3 / data.time_gpu), 323 | print '%.2f ' % ((1.0 * data.num_translations) / 10**6 / data.time_gpu), '\\\\' 324 | 325 | 326 | def printRun(data): 327 | print '\nNum coefficient: ', data.p, 328 | print '\tNum Sources: ', data.num_sources, 329 | print '\tNum translations: ', data.num_translations, 330 | print '\tNum Threads: ', data.blocksize, 331 | print '\tAll finite: ', alltrue(isfinite(data.LEterms_ref)) 332 | print 'GPU time: %(gpu_time)e' % {'gpu_time' : data.time_gpu}, 333 | print '\tTransfer time: %(transfer_time)e' % {'transfer_time' : data.time_transfer}, 334 | print 'GIGAOP: ', flops(27, data.blocksize, data.p, data.n_blocks) / 10**9, 335 | print '\tGIGAOP/S: ', flops(27, data.blocksize, data.p, data.n_blocks) / 10**9 / data.time_gpu, 336 | print '\tEffective Bandwidth [GB/s]: ', bandwidth(27, data.p, data.n_blocks) / 1024**3 / data.time_gpu 337 | print 'Translations per second (in millions) [MTPS]: ', (1.0 * data.num_translations) / 10**6 / data.time_gpu 338 | 339 | 340 | def compareCoefficients(data): 341 | l2_numerator = 0 342 | l2_denominator = 0 343 | for i in range(data.num_translations): 344 | first_coefficient = True 345 | print_coefficient = False 346 | for j in range(2 * data.p): 347 | coeff_gpu = data.LEterms_ref[i*2*data.p + j] 348 | coeff_cpu = data.LEterms_cpu[i*2*data.p + j] 349 | coeff_error = abs((coeff_gpu - coeff_cpu) / coeff_cpu) 350 | l2_numerator += coeff_error**2 351 | l2_denominator += coeff_cpu**2 352 | no_print_term = True 353 | if no_print_term: 354 | continue 355 | max_error = max(abs((data.LEterms_ref - data.LEterms_cpu) / data.LEterms_cpu)) 356 | print 'Max relative error: ', max_error 357 | print 'L2 relative error norm: ', sqrt(l2_numerator / l2_denominator) 358 | 359 | 360 | def cpuComputeM2L(data): 361 | ''' M2L translation using python. 
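For each destination it accumulates, for n = 0..p-1,
    L_n = (-1)^n / t^(n+1) * sum_{m=0}^{p-1} C(n+m, m) * M_m / t^m,
with t the complex vector from the source center to the destination point;
the result is kept as the CPU reference (LEterms_cpu) for compareCoefficients.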
362 | ''' 363 | data.start_cpu= time.time() 364 | 365 | m2l = zeros((data.p,data.p), complex) 366 | 367 | for source in range(len(data.Trans_lst)): 368 | ME_src = data.Trans_lst[source] 369 | offset = data.DestP_offset[source] 370 | length = data.DestP_length[source] 371 | 372 | # output data 373 | LE_out = zeros(length * 2*data.p) 374 | LE_offset = data.LEout_offset[source] 375 | LE_length = data.LEout_length[source] 376 | 377 | # get ME data 378 | MEterms = data.MEterms_ref[ME_src*2*data.p:ME_src*2*data.p+2*data.p] 379 | MEpoints = data.MEpoints_ref[ME_src*2:ME_src*2+2] 380 | 381 | # local output offset (translate into local memory) 382 | out_offset = 0 383 | for dest in range(length): 384 | # destination point 385 | ME_dest = offset + dest 386 | destPoint = data.DestPoint_lst[ME_dest*2:ME_dest*2+2] 387 | 388 | # translate source 389 | transDist = destPoint - MEpoints 390 | t_complex = complex(transDist[0], transDist[1]) 391 | 392 | # loop over the terms 393 | for n in range(data.p): 394 | # precompute t**(n+1) 395 | taux = t_complex**(n+1) 396 | vaux = 1. # variable for comb(n+m, m) 397 | 398 | # (-1)**n 399 | if ((n & 1) == 0): 400 | taux = vaux*taux 401 | else: 402 | taux = -vaux*taux 403 | 404 | # Do dot product for m = 0 405 | mat_mn = vaux / taux 406 | m2l[n][0] = mat_mn 407 | 408 | MEm_complex = MEterms[0] + MEterms[1] * 1j 409 | update_complex = MEm_complex * mat_mn 410 | LE_out[out_offset + 2*n + 0] = update_complex.real 411 | LE_out[out_offset + 2*n + 1] = update_complex.imag 412 | 413 | # Do the dot product (mat_row, ME) for all m > 0 414 | for m in range(1,data.p): 415 | # comb(m+n, m) 416 | if n == 0: 417 | vaux = 1. 418 | else: 419 | vaux = vaux * (n + m) / m 420 | 421 | # update t_aux with contribution 422 | taux = taux * t_complex 423 | 424 | # compute element operation 425 | mat_mn = vaux / taux 426 | MEm_complex = MEterms[2*m] + MEterms[2*m + 1] * 1j 427 | update_complex = MEm_complex * mat_mn 428 | m2l[n][m] = mat_mn 429 | 430 | # update LE 431 | LE_out[out_offset + 2*n] = LE_out[out_offset + 2*n] + update_complex.real 432 | LE_out[out_offset + 2*n +1] = LE_out[out_offset + 2*n +1] + update_complex.imag 433 | 434 | # update local output offset for the next translation 435 | out_offset += 2*data.p 436 | 437 | # save output to global 438 | LElocal_offset = LE_offset * 2*data.p 439 | data.LEterms_ref[LE_offset * 2*data.p:(LE_offset + LE_length) * 2*data.p] = LE_out 440 | #print 'LE terms: ', data.LEterms_ref 441 | data.LEterms_cpu = data.LEterms_ref.copy() 442 | data.end_cpu= time.time() 443 | 444 | 445 | def gpuComputeM2L(data, cuda_kernel_string): 446 | # Cuda module 447 | cuda.init() 448 | 449 | assert cuda.Device.count() >= 1 # check that we can run 450 | dev = cuda.Device(0) # Get device 451 | ctx = dev.make_context() # create context 452 | 453 | data.LE_inter_ref = zeros(data.num_translations * data.dim*data.p) # LE data containers 454 | data.LEterms_ref = zeros(data.num_ME * data.dim*data.p) # LE data containers 455 | 456 | # Convert data for the GPU 457 | data.MEterms_ref = data.MEterms_ref.astype(numpy.float32) # Container of ME terms 458 | data.MEpoints_ref = data.MEpoints_ref.astype(numpy.float32) # Container of ME centers 459 | data.DestPoint_lst = data.DestPoint_lst.astype(numpy.float32) # Container of destination points 460 | data.LE_inter_ref = data.LE_inter_ref.astype(numpy.float32) # LE intermediate data containers 461 | data.LEterms_ref = data.LEterms_ref.astype(numpy.float32) # LE data containers 462 | data.LEreduc_ref = 
data.LEreduction.astype(numpy.int32) 463 | 464 | # Allocate memory in the GPU 465 | gpu_MEterms_ref = cuda.mem_alloc(data.MEterms_ref.size * data.MEterms_ref.dtype.itemsize) 466 | gpu_MEpoints_ref = cuda.mem_alloc(data.MEpoints_ref.size * data.MEpoints_ref.dtype.itemsize) 467 | gpu_DestPoint_lst = cuda.mem_alloc(data.DestPoint_lst.size * data.DestPoint_lst.dtype.itemsize) 468 | gpu_LE_inter_ref = cuda.mem_alloc(data.LE_inter_ref.size * data.LE_inter_ref.dtype.itemsize) 469 | gpu_LEterms_ref = cuda.mem_alloc(data.LEterms_ref.size * data.LEterms_ref.dtype.itemsize) 470 | gpu_LEreduc_ref = cuda.mem_alloc(data.LEreduc_ref.size * data.LEreduc_ref.dtype.itemsize) 471 | 472 | # Transfer memory to device 473 | start_transfer = time.time() 474 | cuda.memcpy_htod(gpu_MEterms_ref, data.MEterms_ref) 475 | cuda.memcpy_htod(gpu_MEpoints_ref, data.MEpoints_ref) 476 | cuda.memcpy_htod(gpu_DestPoint_lst, data.DestPoint_lst) 477 | cuda.memcpy_htod(gpu_LEreduc_ref, data.LEreduc_ref) 478 | end_transfer = time.time() 479 | data.time_transfer_in = end_transfer - start_transfer 480 | 481 | data.blocksize = 64 # one to start with 482 | data.n_blocks = data.num_sources # one source per block 483 | 484 | mod = SourceModule(cuda_kernel_string % {'blocksize': data.blocksize,'terms':data.p,'terms_c':2*data.p}, 485 | nvcc="nvcc",options=['-use_fast_math'], keep=False, no_extern_c=False) 486 | 487 | module_reduction = SourceModule(cudaKernel.local_expansion_reduction % {'blocksize': data.blocksize,'terms':data.p,'terms_c':2*data.p}, 488 | nvcc="nvcc",options=['-use_fast_math'], keep=False, no_extern_c=False) 489 | 490 | if data.n_blocks > 512: 491 | n_blocks_x = data.n_blocks / 16 492 | n_blocks_y = 16 493 | else: 494 | n_blocks_x = data.n_blocks 495 | n_blocks_y = 1 496 | 497 | # Run multipole to local 498 | func = mod.get_function("me2me") 499 | data.time_gpu = func(gpu_MEterms_ref, gpu_MEpoints_ref, gpu_DestPoint_lst, gpu_LE_inter_ref, block=(data.blocksize,1,1), grid=(n_blocks_x, n_blocks_y), time_kernel=True) 500 | 501 | # Run reduction 502 | data.blocksize = 32 503 | func = module_reduction.get_function("localExpansionReduction") 504 | data.time_reduction = func(gpu_LE_inter_ref, gpu_LEreduc_ref, gpu_LEterms_ref, block=(data.blocksize,1,1), grid=(n_blocks_x, n_blocks_y), time_kernel=True) 505 | 506 | start_transfer = time.time() 507 | cuda.memcpy_dtoh(data.LEterms_ref, gpu_LEterms_ref) 508 | end_transfer = time.time() 509 | data.time_transfer_out = end_transfer - start_transfer 510 | 511 | ctx.pop() # context pop 512 | 513 | 514 | # Performs multiple runs and outputs performance data 515 | data = dataset() 516 | data.dim = 2 517 | data.sizeIL = 27 518 | 519 | for num_terms in [8, 12, 16]: 520 | for num_level in [3, 4, 5, 6, 7, 8]: 521 | data.p = num_terms 522 | data.max_level = num_level 523 | generateTreeTestData(data) 524 | gpuComputeM2L(data, cudaKernel.precomputed_division) 525 | printTable(data) 526 | 527 | #EOF 528 | 529 | -------------------------------------------------------------------------------- /m2l_gpu_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Multipole to Local translations. 3 | 4 | Translations of a single source are stored contiguously. 
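The cudaKernel class collects several variants of the M2L kernel (precomputed
division, precomputed with safe division, division in place, safe division);
the driver at the bottom benchmarks the precomputed-division variant for
several p and tree levels and prints one table row per run.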
5 | 6 | ''' 7 | import pycuda.driver as cuda 8 | from pycuda.driver import SourceModule 9 | import numpy 10 | import time 11 | from numpy import zeros, array, ones, arange, log10, alltrue, isfinite, sqrt 12 | 13 | 14 | class cudaKernel: 15 | precomputed_division = """ 16 | // One block translates one ME to a new location 17 | #define BLOCKSIZE %(blocksize)d 18 | #define TERMS %(terms)d 19 | #define TERMS_C %(terms_c)d 20 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 21 | #define NID threadIdx.x 22 | #define SIZEIL 27 23 | 24 | /////////////////////////////////////////////////////////////////////////////////// 25 | // One block per source-ME. The block translates the source ME to a group of destinations. 26 | // 27 | // Input: 28 | // float *MEterms_ref Container of ME terms 29 | // float *MEpoints_ref Container of ME centers 30 | // float *DestPoint_lst Container of destination points 31 | // float *LEterms_ref LE data containers 32 | /////////////////////////////////////////////////////////////////////////////////// 33 | __global__ void me2me(float *MEterms_ref, float *MEpoints_ref, float *DestPoint_lst, float *LEterms_ref) 34 | { 35 | int m, ME_dest, blockLoop, Nterm; 36 | float ac, bd; 37 | float tnm1_r, tnm1_i; 38 | float comb, MATmn_r, MATmn_i; 39 | float LE_r, LE_i; 40 | float tX, tY; 41 | 42 | /////////////////////////////////////////// 43 | // Memory allocation part 44 | /////////////////////////////////////////// 45 | __shared__ float MEterms[TERMS_C]; // Common source point 46 | float srcX = MEpoints_ref[SOURCE*2]; 47 | float srcY = MEpoints_ref[SOURCE*2 +1]; 48 | 49 | // Multipole expansions 50 | if (NID < 2*TERMS) MEterms[NID] = MEterms_ref[SOURCE*TERMS_C + NID]; 51 | 52 | // Destination points 53 | __shared__ float dest_point_local[2*SIZEIL]; 54 | if (NID < 2*SIZEIL){ 55 | dest_point_local[NID] = DestPoint_lst[2*SOURCE*SIZEIL + NID]; 56 | } 57 | __syncthreads(); 58 | 59 | /////////////////////////////////////////// 60 | // Computing part: M2L translation 61 | /////////////////////////////////////////// 62 | 63 | // Loop over the translations, assign threads for LE coefficients to be computed 64 | blockLoop = 0; 65 | while (NID + blockLoop * BLOCKSIZE < SIZEIL * TERMS){ 66 | 67 | // Choose "destination point and term of the LE" to work on (this avoid the use of module op) 68 | #pragma unroll 69 | for(m = 0; m < SIZEIL; m++){ 70 | if (NID + blockLoop * BLOCKSIZE >= m * TERMS){ 71 | ME_dest = m; // Destination point 72 | Nterm = (NID + blockLoop * BLOCKSIZE) - m * TERMS; // LE term 73 | } 74 | } 75 | 76 | // translation distance 77 | tX = dest_point_local[ME_dest * 2] - srcX; 78 | tY = dest_point_local[ME_dest * 2 + 1] - srcY; 79 | 80 | // Precompute t^(n+1) 81 | tnm1_r = tX; 82 | tnm1_i = tY; 83 | #pragma unroll 84 | for (m = 1; m < TERMS; m++){ 85 | if (Nterm >= m){ // tnm1 = tnm1 * t 86 | ac = tnm1_r; 87 | bd = tnm1_i; 88 | tnm1_r = ac * tX - bd * tY; 89 | tnm1_i = ac * tY + bd * tX; 90 | } 91 | } 92 | 93 | if (Nterm & 1 == 1) { // if n is even number, change of sign 94 | tnm1_r = -tnm1_r; 95 | tnm1_i = -tnm1_i; 96 | } 97 | 98 | // Initialization for comb(n+m, m) 99 | comb = 1.0f; 100 | 101 | float tx_inv, ty_inv; 102 | tx_inv = tX / (tX*tX + tY*tY); 103 | ty_inv = - tY / (tX*tX + tY*tY); 104 | 105 | float tnm1_inv_r, tnm1_inv_i; 106 | tnm1_inv_r = tnm1_r / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 107 | tnm1_inv_i = - tnm1_i / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 108 | 109 | // update_complex = MEm_complex * comb * tnm1_inv 110 | LE_r = MEterms[0] * tnm1_inv_r - 
MEterms[1] * tnm1_inv_i; 111 | LE_i = MEterms[0] * tnm1_inv_i + MEterms[1] * tnm1_inv_r; 112 | 113 | // Do the dot product (mat_row, ME terms) for m >= 1 114 | #pragma unroll 115 | for (m = 1; m < TERMS; m++){ 116 | float float_m = (float) m; 117 | comb = (Nterm == 0) ? 1.0f : comb * (Nterm + float_m) / float_m; // comb (m+n, m) for next term 118 | 119 | // update tnm1 with contribution of next term. tnm1 = tnm1 * t 120 | ac = tnm1_inv_r; 121 | bd = tnm1_inv_i; 122 | tnm1_inv_r = ac * tx_inv - bd * ty_inv; 123 | tnm1_inv_i = ac * ty_inv + bd * tx_inv; 124 | 125 | // mat_nm * tnm1_inv 126 | MATmn_r = comb * tnm1_inv_r; 127 | MATmn_i = comb * tnm1_inv_i; 128 | 129 | // update_complex = MEm_complex * mat_mn 130 | int tmp_2m = 2 * m; 131 | LE_r += MEterms[tmp_2m] * MATmn_r - MEterms[tmp_2m +1] * MATmn_i; 132 | LE_i += MEterms[tmp_2m] * MATmn_i + MEterms[tmp_2m +1] * MATmn_r; 133 | } 134 | 135 | float2 tmp_f2; 136 | tmp_f2.x = LE_r; 137 | tmp_f2.y = LE_i; 138 | int out_offset = (SOURCE*SIZEIL*TERMS_C) + 2*(Nterm + ME_dest * TERMS); 139 | *((float2*) &LEterms_ref[out_offset]) = tmp_f2; 140 | 141 | blockLoop += 1; // increase loop counter 142 | } 143 | } 144 | """ 145 | 146 | precomputed_with_safe_division = """ 147 | // One block translates one ME to a new location 148 | #define BLOCKSIZE %(blocksize)d 149 | #define TERMS %(terms)d 150 | #define TERMS_C %(terms_c)d 151 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 152 | #define NID threadIdx.x 153 | #define SIZEIL 27 154 | 155 | /////////////////////////////////////////////////////////////////////////////////// 156 | // One block per source-ME. The block translates the source ME to a group of destinations. 157 | // 158 | // Input: 159 | // float *MEterms_ref Container of ME terms 160 | // float *MEpoints_ref Container of ME centers 161 | // float *DestPoint_lst Container of destination points 162 | // float *LEterms_ref LE data containers 163 | /////////////////////////////////////////////////////////////////////////////////// 164 | __global__ void me2me(float *MEterms_ref, float *MEpoints_ref, float *DestPoint_lst, float *LEterms_ref) 165 | { 166 | int m, ME_dest, blockLoop, Nterm; 167 | float ac, bd; 168 | float tnm1_r, tnm1_i; 169 | float comb, MATmn_r, MATmn_i; 170 | float LE_r, LE_i; 171 | float tX, tY; 172 | 173 | /////////////////////////////////////////// 174 | // Memory allocation part 175 | /////////////////////////////////////////// 176 | __shared__ float MEterms[TERMS_C]; // Common source point 177 | float srcX = MEpoints_ref[SOURCE*2]; 178 | float srcY = MEpoints_ref[SOURCE*2 +1]; 179 | 180 | // Multipole expansions 181 | if (NID < 2*TERMS) MEterms[NID] = MEterms_ref[SOURCE*TERMS_C + NID]; 182 | 183 | // Destination points 184 | __shared__ float dest_point_local[2*SIZEIL]; 185 | if (NID < 2*SIZEIL){ 186 | dest_point_local[NID] = DestPoint_lst[2*SOURCE*SIZEIL + NID]; 187 | } 188 | __syncthreads(); 189 | 190 | /////////////////////////////////////////// 191 | // Computing part: M2L translation 192 | /////////////////////////////////////////// 193 | 194 | // Loop over the translations, assign threads for LE coefficients to be computed 195 | blockLoop = 0; 196 | while (NID + blockLoop * BLOCKSIZE < SIZEIL * TERMS){ 197 | 198 | // Choose "destination point and term of the LE" to work on (this avoid the use of module op) 199 | #pragma unroll 200 | for(m = 0; m < SIZEIL; m++){ 201 | if (NID + blockLoop * BLOCKSIZE >= m * TERMS){ 202 | ME_dest = m; // Destination point 203 | Nterm = (NID + blockLoop * BLOCKSIZE) - m * TERMS; // LE 
term 204 | } 205 | } 206 | 207 | // translation distance 208 | tX = dest_point_local[ME_dest * 2] - srcX; 209 | tY = dest_point_local[ME_dest * 2 + 1] - srcY; 210 | 211 | // Precompute t^(n+1) 212 | tnm1_r = tX; 213 | tnm1_i = tY; 214 | #pragma unroll 215 | for (m = 1; m < TERMS; m++){ 216 | if (Nterm >= m){ // tnm1 = tnm1 * t 217 | ac = tnm1_r; 218 | bd = tnm1_i; 219 | tnm1_r = ac * tX - bd * tY; 220 | tnm1_i = ac * tY + bd * tX; 221 | } 222 | } 223 | 224 | if (Nterm & 1 == 1) { // if n is even number, change of sign 225 | tnm1_r = -tnm1_r; 226 | tnm1_i = -tnm1_i; 227 | } 228 | 229 | // Initialization for comb(n+m, m) 230 | comb = 1.0f; 231 | 232 | float tx_inv, ty_inv; 233 | tx_inv = tX / (tX*tX + tY*tY); 234 | ty_inv = - tY / (tX*tX + tY*tY); 235 | 236 | float tnm1_inv_r, tnm1_inv_i; 237 | tnm1_inv_r = tnm1_r / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 238 | tnm1_inv_i = - tnm1_i / (tnm1_r * tnm1_r + tnm1_i * tnm1_i); 239 | 240 | // update_complex = MEm_complex * comb * tnm1_inv 241 | LE_r = MEterms[0] * tnm1_inv_r - MEterms[1] * tnm1_inv_i; 242 | LE_i = MEterms[0] * tnm1_inv_i + MEterms[1] * tnm1_inv_r; 243 | 244 | // Do the dot product (mat_row, ME terms) for m >= 1 245 | #pragma unroll 246 | for (m = 1; m < TERMS; m++){ 247 | float float_m = (float) m; 248 | comb = (Nterm == 0) ? 1.0f : comb * (Nterm + float_m) / float_m; // comb (m+n, m) for next term 249 | 250 | // update tnm1 with contribution of next term. tnm1 = tnm1 * t 251 | ac = tnm1_inv_r; 252 | bd = tnm1_inv_i; 253 | tnm1_inv_r = ac * tx_inv - bd * ty_inv; 254 | tnm1_inv_i = ac * ty_inv + bd * tx_inv; 255 | 256 | // mat_nm * tnm1_inv 257 | MATmn_r = comb * tnm1_inv_r; 258 | MATmn_i = comb * tnm1_inv_i; 259 | 260 | // update_complex = MEm_complex * mat_mn 261 | int tmp_2m = 2 * m; 262 | LE_r += MEterms[tmp_2m] * MATmn_r - MEterms[tmp_2m +1] * MATmn_i; 263 | LE_i += MEterms[tmp_2m] * MATmn_i + MEterms[tmp_2m +1] * MATmn_r; 264 | } 265 | 266 | float2 tmp_f2; 267 | tmp_f2.x = LE_r; 268 | tmp_f2.y = LE_i; 269 | int out_offset = (SOURCE*SIZEIL*TERMS_C) + 2*(Nterm + ME_dest * TERMS); 270 | *((float2*) &LEterms_ref[out_offset]) = tmp_f2; 271 | 272 | blockLoop += 1; // increase loop counter 273 | } 274 | } 275 | """ 276 | 277 | division_inplace = """ 278 | // One block translates one ME to a new location 279 | #define BLOCKSIZE %(blocksize)d 280 | #define TERMS %(terms)d 281 | #define TERMS_C %(terms_c)d 282 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 283 | #define NID threadIdx.x 284 | #define SIZEIL 27 285 | 286 | /////////////////////////////////////////////////////////////////////////////////// 287 | // One block per source-ME. The block translates the source ME to a group of destinations. 
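// Variant note: the complex reciprocals below are computed with a scaled
// division that branches on the larger of |real| and |imag|, rather than
// dividing by (re*re + im*im) directly as in the precomputed_division kernel.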
288 | // 289 | // Input: 290 | // float *MEterms_ref Container of ME terms 291 | // float *MEpoints_ref Container of ME centers 292 | // float *DestPoint_lst Container of destination points 293 | // float *LEterms_ref LE data containers 294 | /////////////////////////////////////////////////////////////////////////////////// 295 | __global__ void me2me(float *MEterms_ref, float *MEpoints_ref, float *DestPoint_lst, float *LEterms_ref) 296 | { 297 | int m, ME_dest, blockLoop, Nterm; 298 | float ac, bd; 299 | float tnm1_r, tnm1_i; 300 | float comb, MATmn_r, MATmn_i; 301 | float LE_r, LE_i; 302 | float tX, tY; 303 | 304 | /////////////////////////////////////////// 305 | // Memory allocation part 306 | /////////////////////////////////////////// 307 | __shared__ float MEterms[TERMS_C]; // Common source point 308 | float srcX = MEpoints_ref[SOURCE*2]; 309 | float srcY = MEpoints_ref[SOURCE*2 +1]; 310 | 311 | // Multipole expansions 312 | if (NID < 2*TERMS) MEterms[NID] = MEterms_ref[SOURCE*TERMS_C + NID]; 313 | 314 | // Destination points 315 | __shared__ float dest_point_local[2*SIZEIL]; 316 | if (NID < 2*SIZEIL){ 317 | dest_point_local[NID] = DestPoint_lst[2*SOURCE*SIZEIL + NID]; 318 | } 319 | __syncthreads(); 320 | 321 | /////////////////////////////////////////// 322 | // Computing part: M2L translation 323 | /////////////////////////////////////////// 324 | 325 | // Loop over the translations, assign threads for LE coefficients to be computed 326 | blockLoop = 0; 327 | while (NID + blockLoop * BLOCKSIZE < SIZEIL * TERMS){ 328 | 329 | // Choose "destination point and term of the LE" to work on (this avoid the use of module op) 330 | #pragma unroll 331 | for(m = 0; m < SIZEIL; m++){ 332 | if (NID + blockLoop * BLOCKSIZE >= m * TERMS){ 333 | ME_dest = m; // Destination point 334 | Nterm = (NID + blockLoop * BLOCKSIZE) - m * TERMS; // LE term 335 | } 336 | } 337 | 338 | // translation distance 339 | tX = dest_point_local[ME_dest * 2] - srcX; 340 | tY = dest_point_local[ME_dest * 2 + 1] - srcY; 341 | 342 | // Precompute t^(n+1) 343 | tnm1_r = tX; 344 | tnm1_i = tY; 345 | #pragma unroll 346 | for (m = 1; m < TERMS; m++){ 347 | if (Nterm >= m){ // tnm1 = tnm1 * t 348 | ac = tnm1_r; 349 | bd = tnm1_i; 350 | tnm1_r = ac * tX - bd * tY; 351 | tnm1_i = ac * tY + bd * tX; 352 | } 353 | } 354 | 355 | if (Nterm & 1 == 1) { // if n is even number, change of sign 356 | tnm1_r = -tnm1_r; 357 | tnm1_i = -tnm1_i; 358 | } 359 | 360 | // Initialization for comb(n+m, m) 361 | comb = 1.0f; //this is used as a float for speed 362 | 363 | float tx_inv, ty_inv; 364 | if (fabs(tX) >= fabs(tY)) { 365 | ac = tY / tX; 366 | bd = 1.0f / (tX + tY * ac); 367 | tx_inv = bd; 368 | ty_inv = - ac * bd; 369 | } else { 370 | ac = tX / tY; 371 | bd = 1.0f / (tX * ac + tY); 372 | tx_inv = ac * bd; 373 | ty_inv = - bd; 374 | } 375 | 376 | float tnm1_inv_r, tnm1_inv_i; 377 | if (fabs(tnm1_r) >= fabs(tnm1_i)) { 378 | ac = tnm1_i / tnm1_r; 379 | bd = 1.0f / (tnm1_r + tnm1_i * ac); 380 | tnm1_inv_r = bd; 381 | tnm1_inv_i = -ac * bd; 382 | } else { 383 | ac = tnm1_r / tnm1_i; 384 | bd = 1.0f / (tnm1_i + tnm1_r * ac); 385 | tnm1_inv_r = ac * bd; 386 | tnm1_inv_i = -bd; 387 | } 388 | 389 | // update_complex = MEm_complex * comb * tnm1_inv 390 | LE_r = MEterms[0] * tnm1_inv_r - MEterms[1] * tnm1_inv_i; 391 | LE_i = MEterms[0] * tnm1_inv_i + MEterms[1] * tnm1_inv_r; 392 | 393 | // Do the dot product (mat_row, ME terms) for m >= 1 394 | #pragma unroll 395 | for (m = 1; m < TERMS; m++){ 396 | // comb (m+n, m) for next term 397 | float 
float_m = (float) m; 398 | comb = (Nterm == 0) ? 1.0f : comb * (Nterm + float_m) / float_m; // comb (m+n, m) for next term 399 | 400 | // update tnm1 with contribution of next term. tnm1 = tnm1 * t 401 | ac = tnm1_inv_r; 402 | bd = tnm1_inv_i; 403 | tnm1_inv_r = ac * tx_inv - bd * ty_inv; 404 | tnm1_inv_i = ac * ty_inv + bd * tx_inv; 405 | 406 | // mat_nm * tnm1_inv 407 | MATmn_r = comb * tnm1_inv_r; 408 | MATmn_i = comb * tnm1_inv_i; 409 | 410 | // update_complex = MEm_complex * mat_mn 411 | int tmp_2m = 2*m; 412 | LE_r += MEterms[tmp_2m] * MATmn_r - MEterms[tmp_2m +1] * MATmn_i; 413 | LE_i += MEterms[tmp_2m] * MATmn_i + MEterms[tmp_2m +1] * MATmn_r; 414 | } 415 | 416 | float2 tmp_f2; 417 | tmp_f2.x = LE_r; 418 | tmp_f2.y = LE_i; 419 | int out_offset = (SOURCE*SIZEIL*TERMS_C) + 2*(Nterm + ME_dest * TERMS); 420 | *((float2*) &LEterms_ref[out_offset]) = tmp_f2; 421 | 422 | blockLoop += 1; // increase loop counter 423 | } 424 | } 425 | """ 426 | 427 | safe_division = """ 428 | // One block translates one ME to a new location 429 | #define BLOCKSIZE %(blocksize)d 430 | #define TERMS %(terms)d 431 | #define TERMS_C %(terms_c)d 432 | #define SOURCE (blockIdx.x + gridDim.x * blockIdx.y) 433 | #define NID threadIdx.x 434 | #define SIZEIL 27 435 | 436 | /////////////////////////////////////////////////////////////////////////////////// 437 | // One block per source-ME. The block translates the source ME to a group of destinations. 438 | // 439 | // Input: 440 | // float *MEterms_ref Container of ME terms 441 | // float *MEpoints_ref Container of ME centers 442 | // float *DestPoint_lst Container of destination points 443 | // float *LEterms_ref LE data containers 444 | /////////////////////////////////////////////////////////////////////////////////// 445 | __global__ void me2me(float *MEterms_ref, float *MEpoints_ref, float *DestPoint_lst, float *LEterms_ref) 446 | { 447 | int m, ME_dest, blockLoop, Nterm; 448 | float ac, bd; 449 | float tnm1_r, tnm1_i; 450 | float comb, MATmn_r, MATmn_i; 451 | float LE_r, LE_i; 452 | float tX, tY; 453 | 454 | /////////////////////////////////////////// 455 | // Memory allocation part 456 | /////////////////////////////////////////// 457 | __shared__ float MEterms[TERMS_C]; // Common source point 458 | float srcX = MEpoints_ref[SOURCE*2]; 459 | float srcY = MEpoints_ref[SOURCE*2 +1]; 460 | 461 | // Multipole expansions 462 | if (NID < 2*TERMS) MEterms[NID] = MEterms_ref[SOURCE*TERMS_C + NID]; 463 | 464 | // Destination points 465 | __shared__ float dest_point_local[2*SIZEIL]; 466 | if (NID < 2*SIZEIL){ 467 | dest_point_local[NID] = DestPoint_lst[2*SOURCE*SIZEIL + NID]; 468 | } 469 | __syncthreads(); 470 | 471 | /////////////////////////////////////////// 472 | // Computing part: M2L translation 473 | /////////////////////////////////////////// 474 | 475 | // Loop over the translations, assign threads for LE coefficients to be computed 476 | blockLoop = 0; 477 | while (NID + blockLoop * BLOCKSIZE < SIZEIL * TERMS){ 478 | 479 | // Choose "destination point and term of the LE" to work on (this avoid the use of module op) 480 | #pragma unroll 481 | for(m = 0; m < SIZEIL; m++){ 482 | if (NID + blockLoop * BLOCKSIZE >= m * TERMS){ 483 | ME_dest = m; // Destination point 484 | Nterm = (NID + blockLoop * BLOCKSIZE) - m * TERMS; // LE term 485 | } 486 | } 487 | 488 | // translation distance 489 | tX = dest_point_local[ME_dest * 2] - srcX; 490 | tY = dest_point_local[ME_dest * 2 + 1] - srcY; 491 | 492 | // Precompute t^(n+1) 493 | tnm1_r = tX; 494 | tnm1_i = tY; 495 | 
#pragma unroll 496 | for (m = 1; m < TERMS; m++){ 497 | if (Nterm >= m){ // tnm1 = tnm1 * t 498 | ac = tnm1_r; 499 | bd = tnm1_i; 500 | tnm1_r = ac * tX - bd * tY; 501 | tnm1_i = ac * tY + bd * tX; 502 | } 503 | } 504 | 505 | if (Nterm & 1 == 1) { // if n is even number, change of sign 506 | tnm1_r = -tnm1_r; 507 | tnm1_i = -tnm1_i; 508 | } 509 | 510 | // Initialization for comb(n+m, m) 511 | comb = 1.0f; //this is used as a float for speed 512 | 513 | // Do the dot product (mat_row, ME terms) for m = 0 514 | // update_complex = MEm_complex * comb / tnm1 (Note that comb=1, so it is just ME/tnm1) 515 | ac = fabs(tnm1_r); 516 | bd = fabs(tnm1_i); 517 | float temp1, temp2; 518 | bool division_path = (ac >= bd); 519 | temp1 = division_path ? tnm1_i : tnm1_r; 520 | temp2 = division_path ? tnm1_r : tnm1_i; 521 | ac = temp1 / temp2; 522 | bd = 1.0f / (temp2 + temp1 * ac); 523 | LE_r = division_path ? (MEterms[0] + MEterms[1] * ac) * bd : (MEterms[0] * ac + MEterms[1]) * bd; 524 | LE_i = division_path ? (MEterms[1] - MEterms[0] * ac) * bd : (MEterms[1] * ac - MEterms[0]) * bd; 525 | 526 | // Do the dot product (mat_row, ME terms) for m >= 1 527 | #pragma unroll 528 | for (m = 1; m < TERMS; m++){ 529 | // comb (m+n, m) for next term 530 | comb = (Nterm == 0) ? 1.0f : comb * (Nterm + m) / m; 531 | 532 | // update tnm1 with contribution of next term. tnm1 = tnm1 * t 533 | ac = tnm1_r; 534 | bd = tnm1_i; 535 | tnm1_r = ac * tX - bd * tY; 536 | tnm1_i = ac * tY + bd * tX; 537 | 538 | // compute element operation. mat_mn = mat_mn / tnm1 539 | ac = fabs(tnm1_r); 540 | bd = fabs(tnm1_i); 541 | division_path = (ac >= bd); 542 | temp1 = division_path ? tnm1_i : tnm1_r; 543 | temp2 = division_path ? tnm1_r : tnm1_i; 544 | ac = temp1 / temp2; 545 | bd = 1.0f / (temp2 + temp1 * ac); 546 | temp1 = comb * bd; 547 | temp2 = comb * ac * bd; 548 | MATmn_r = division_path ? temp1 : temp2; 549 | MATmn_i = division_path ? 
- temp2 : - temp1; 550 | 551 | // update_complex = MEm_complex * mat_mn 552 | int tmp_2m = 2*m; 553 | LE_r += MEterms[tmp_2m] * MATmn_r - MEterms[tmp_2m +1] * MATmn_i; 554 | LE_i += MEterms[tmp_2m] * MATmn_i + MEterms[tmp_2m +1] * MATmn_r; 555 | } 556 | 557 | float2 tmp_f2; 558 | tmp_f2.x = LE_r; 559 | tmp_f2.y = LE_i; 560 | int out_offset = (SOURCE*SIZEIL*TERMS_C) + 2*(Nterm + ME_dest * TERMS); 561 | *((float2*) &LEterms_ref[out_offset]) = tmp_f2; 562 | 563 | blockLoop += 1; // increase loop counter 564 | } 565 | } 566 | """ 567 | 568 | 569 | class dataset: 570 | p = 0 571 | dim = 0 572 | max_level = 0 573 | sizeIL = 0 574 | num_ME = 0 575 | num_sources = 0 576 | num_translations = 0 577 | MEterms_ref = [] 578 | MEpoints_ref = [] 579 | Trans_lst = [] 580 | DestPoint_lst = [] 581 | DestP_offset = [] 582 | DestP_length = [] 583 | LEterms_ref = [] 584 | LEout_offset = [] 585 | LEout_length = [] 586 | LEterms_cpu = [] 587 | start_cpu = 0 588 | end_cpu = 0 589 | n_blocks = 0 590 | blocksize = 0 591 | time_gpu = 0 592 | time_transfer = 0 593 | 594 | 595 | def generateTreeTestData(data): 596 | ''' This function generates a simple data case for an FMM run 597 | ''' 598 | data.num_ME = 0 599 | for i in range(2, data.max_level+1): 600 | data.num_ME += 4**i 601 | 602 | data.MEterms_ref = ones(data.num_ME * 2*data.p) # Container of ME terms 603 | data.MEpoints_ref = zeros(data.num_ME * 2) # Container of ME centers 604 | for i in range(data.num_ME): 605 | data.MEpoints_ref[data.dim * i] = 0.0 606 | data.MEpoints_ref[data.dim * i +1] = 0.0 607 | 608 | data.Trans_lst = arange(data.num_ME) # Number of 'the ME' to be translated 609 | data.num_sources = len(data.Trans_lst) 610 | data.num_translations = data.sizeIL * data.num_ME # Number of translations to be performed 611 | data.DestPoint_lst = zeros(data.sizeIL * data.num_ME * data.dim) # Destination points 612 | dst_arr1 = array([-1.60, 2.80, 0.1]) 613 | dst_arr2 = array([-1.60, -0.5, -1.40, 0.1, 1.80, 1.60]) 614 | dst_arr3 = array([-1.80, -1.40, 1.40]) 615 | dst_arr4 = array([-1.60, -1.80, 1.60]) 616 | for idME in range(data.num_ME): 617 | offset = idME * data.sizeIL 618 | point_index = 0 619 | # combine point sets (1&2) to form coordinates 620 | for x in dst_arr1: 621 | for y in dst_arr2: 622 | data.DestPoint_lst[data.dim*(offset + point_index)] = x 623 | data.DestPoint_lst[data.dim*(offset + point_index) +1] = y 624 | point_index += 1 625 | # combine point sets (3&4) to form coordinates 626 | for x in dst_arr3: 627 | for y in dst_arr4: 628 | data.DestPoint_lst[data.dim*(offset + point_index)] = x 629 | data.DestPoint_lst[data.dim*(offset + point_index) +1] = y 630 | point_index += 1 631 | 632 | # Translation destination Offset & Size 633 | offset_lst = zeros(3*data.num_ME, dtype=int) 634 | data.DestP_offset = zeros(data.num_ME, dtype=int) # Translation destination start at offset 635 | data.DestP_length = zeros(data.num_ME, dtype=int) # Number of translation destinations per ME 636 | for i in range(data.num_ME): 637 | data.DestP_offset[i] = i*data.sizeIL 638 | data.DestP_length[i] = data.sizeIL 639 | 640 | # Destination Output Offset & Size 641 | data.LEout_offset = zeros(data.num_ME, dtype=int) # Output of translation starts at offset 642 | data.LEout_length = zeros(data.num_ME, dtype=int) # Length of translation output 643 | for i in range(data.num_ME): 644 | data.LEout_offset[i] = i*data.sizeIL 645 | data.LEout_length[i] = data.sizeIL 646 | data.LEterms_ref = zeros(data.num_translations * data.dim*data.p) # LE data containers 647 | 648 | 649 | def 
generateSingleTestData(data): 650 | ''' One source - One destination case ''' 651 | data.p = 20 652 | data.num_ME = 1 653 | data.dim = 2 654 | 655 | data.MEterms_ref = 0.3*ones(data.num_ME * 2*data.p) # Container of ME terms 656 | data.MEpoints_ref = array([0.51, 0.52]) # Container of ME centers 657 | data.Trans_lst = array([0]) # Number of 'the ME' to be translated 658 | data.num_sources = len(data.Trans_lst) 659 | data.num_translations = 1 # Number of translations to be performed 660 | data.DestPoint_lst = array([0.72, 0.74]) 661 | data.DestP_offset = array([0]) # Translation destination start at offset 662 | data.DestP_length = array([1]) # Number of translation destinations per ME 663 | data.LEout_offset = array([0]) # Output of translation starts at offset 664 | data.LEout_length = array([1]) # Length of translation output 665 | data.LEterms_ref = zeros(data.num_translations * 2*data.p) # LE data containers 666 | 667 | 668 | def flops(ilz, bs, p, num_blocks): 669 | ''' 670 | Function that estimates the number of flops for a m2l call. 671 | 672 | ilz Interaction List size 673 | bs Blocksize (num threads in a block) 674 | p Terms in the expansion 675 | num_blocks Total number of blocks executed 676 | ''' 677 | return 1.0*num_blocks * (ilz * p * (35 + ilz*7 + (p-1)*28)) 678 | 679 | 680 | def bandwidth(ilz, p, num_blocks): 681 | ''' 682 | Computes the number of bytes moved by the kernel call 683 | ''' 684 | return num_blocks * (2*p + 2*ilz + 2*ilz*p + 4) * 4.0 685 | 686 | 687 | def printTable(data): 688 | print data.p, ' & ', 689 | print data.num_translations, ' & ', 690 | print '%.3e & ' % data.time_gpu, 691 | print '%.3e & ' % data.time_transfer, 692 | print '%.3f & ' % (flops(27, data.blocksize, data.p, data.n_blocks) / 10**9 / data.time_gpu), 693 | print '%.3f & ' % (bandwidth(27, data.p, data.n_blocks) / 1024**3 / data.time_gpu), 694 | print '%.3f ' % ((1.0 * data.num_translations) / 10**6 / data.time_gpu) 695 | 696 | 697 | def printRun(data): 698 | print '\nNum coefficient: ', data.p, 699 | print '\tNum Sources: ', data.num_sources, 700 | print '\tNum translations: ', data.num_translations, 701 | print '\tNum Threads: ', data.blocksize, 702 | print '\tAll finite: ', alltrue(isfinite(data.LEterms_ref)) 703 | print 'GPU time: %(gpu_time)e' % {'gpu_time' : data.time_gpu}, 704 | print '\tTransfer time: %(transfer_time)e' % {'transfer_time' : data.time_transfer}, 705 | print 'GIGAOP: ', flops(27, data.blocksize, data.p, data.n_blocks) / 10**9, 706 | print '\tGIGAOP/S: ', flops(27, data.blocksize, data.p, data.n_blocks) / 10**9 / data.time_gpu, 707 | print '\tEffective Bandwidth [GB/s]: ', bandwidth(27, data.p, data.n_blocks) / 1024**3 / data.time_gpu 708 | print 'Translations per second (in millions) [MTPS]: ', (1.0 * data.num_translations) / 10**6 / data.time_gpu 709 | 710 | 711 | def compareCoefficients(data): 712 | l2_numerator = 0 713 | l2_denominator = 0 714 | for i in range(data.num_translations): 715 | first_coefficient = True 716 | print_coefficient = False 717 | for j in range(2 * data.p): 718 | coeff_gpu = data.LEterms_ref[i*2*data.p + j] 719 | coeff_cpu = data.LEterms_cpu[i*2*data.p + j] 720 | coeff_error = abs((coeff_gpu - coeff_cpu) / coeff_cpu) 721 | l2_numerator += coeff_error**2 722 | l2_denominator += coeff_cpu**2 723 | no_print_term = True 724 | if no_print_term: 725 | continue 726 | max_error = max(abs((data.LEterms_ref - data.LEterms_cpu) / data.LEterms_cpu)) 727 | print 'Max relative error: ', max_error 728 | print 'L2 relative error norm: ', sqrt(l2_numerator / 
729 | 
730 | 
731 | def cpuComputeM2L(data):
732 |     ''' M2L translation using Python (reference implementation).
733 |     '''
734 |     data.start_cpu = time.time()
735 | 
736 |     m2l = zeros((data.p,data.p), complex)
737 | 
738 |     for source in range(len(data.Trans_lst)):
739 |         ME_src = data.Trans_lst[source]
740 |         offset = data.DestP_offset[source]
741 |         length = data.DestP_length[source]
742 | 
743 |         # output data
744 |         LE_out = zeros(length * 2*data.p)
745 |         LE_offset = data.LEout_offset[source]
746 |         LE_length = data.LEout_length[source]
747 | 
748 |         # get ME data
749 |         MEterms = data.MEterms_ref[ME_src*2*data.p:ME_src*2*data.p+2*data.p]
750 |         MEpoints = data.MEpoints_ref[ME_src*2:ME_src*2+2]
751 | 
752 |         # local output offset (translate into local memory)
753 |         out_offset = 0
754 |         for dest in range(length):
755 |             # destination point
756 |             ME_dest = offset + dest
757 |             destPoint = data.DestPoint_lst[ME_dest*2:ME_dest*2+2]
758 | 
759 |             # translate source
760 |             transDist = destPoint - MEpoints
761 |             t_complex = complex(transDist[0], transDist[1])
762 | 
763 |             # loop over the terms
764 |             for n in range(data.p):
765 |                 # precompute t**(n+1)
766 |                 taux = t_complex**(n+1)
767 |                 vaux = 1. # variable for comb(n+m, m)
768 | 
769 |                 # (-1)**n
770 |                 if ((n & 1) == 0):
771 |                     taux = vaux*taux
772 |                 else:
773 |                     taux = -vaux*taux
774 | 
775 |                 # Do dot product for m = 0
776 |                 mat_mn = vaux / taux
777 |                 m2l[n][0] = mat_mn
778 | 
779 |                 MEm_complex = MEterms[0] + MEterms[1] * 1j
780 |                 update_complex = MEm_complex * mat_mn
781 |                 LE_out[out_offset + 2*n + 0] = update_complex.real
782 |                 LE_out[out_offset + 2*n + 1] = update_complex.imag
783 | 
784 |                 # Do the dot product (mat_row, ME) for all m > 0
785 |                 for m in range(1,data.p):
786 |                     # comb(m+n, m)
787 |                     if n == 0:
788 |                         vaux = 1.
789 |                     else:
790 |                         vaux = vaux * (n + m) / m
791 | 
792 |                     # update taux with the next power of t
793 |                     taux = taux * t_complex
794 | 
795 |                     # compute element operation
796 |                     mat_mn = vaux / taux
797 |                     MEm_complex = MEterms[2*m] + MEterms[2*m + 1] * 1j
798 |                     update_complex = MEm_complex * mat_mn
799 |                     m2l[n][m] = mat_mn
800 | 
801 |                     # update LE
802 |                     LE_out[out_offset + 2*n] = LE_out[out_offset + 2*n] + update_complex.real
803 |                     LE_out[out_offset + 2*n +1] = LE_out[out_offset + 2*n +1] + update_complex.imag
804 | 
805 |             # update local output offset for the next translation
806 |             out_offset += 2*data.p
807 | 
808 |         # save output to global
809 |         LElocal_offset = LE_offset * 2*data.p
810 |         data.LEterms_ref[LE_offset * 2*data.p:(LE_offset + LE_length) * 2*data.p] = LE_out
811 |         #print 'LE terms: ', data.LEterms_ref
812 |     data.LEterms_cpu = data.LEterms_ref.copy()
813 |     data.end_cpu = time.time()
814 | 
815 | 
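# Note on the GPU path below: gpuComputeM2L() copies the ME terms, ME centers, destination
# points (and the LEout offsets) to the device, launches the 'me2me' kernel with one thread
# block of 64 threads per source ME (data.n_blocks = data.num_sources), and copies the LE
# output back; the remaining allocations are not used by this kernel call. When more than
# 512 blocks are needed the grid is folded into a 2D shape. A minimal sketch of that
# folding rule (illustrative helper, written with // to match the integer division the
# Python 2 code below performs; for the tree cases used here n_blocks is always a
# multiple of 16):
def _grid_shape(n_blocks):
    if n_blocks > 512:
        return n_blocks // 16, 16
    return n_blocks, 1
# e.g. _grid_shape(336) == (336, 1) and _grid_shape(5456) == (341, 16)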
816 | def gpuComputeM2L(data, cuda_kernel_string):
817 |     # Cuda module
818 |     cuda.init()
819 | 
820 |     assert cuda.Device.count() >= 1 # check that we can run
821 |     dev = cuda.Device(0) # Get device
822 |     ctx = dev.make_context() # create context
823 | 
824 |     data.LEterms_ref = zeros(data.num_translations * data.dim*data.p) # LE data containers
825 |     # Convert data for the GPU
826 |     data.MEterms_ref = data.MEterms_ref.astype(numpy.float32) # Container of ME terms
827 |     data.MEpoints_ref = data.MEpoints_ref.astype(numpy.float32) # Container of ME centers
828 |     data.Trans_lst = data.Trans_lst.astype(numpy.int32) # Indices of the MEs to be translated
829 |     data.DestPoint_lst = data.DestPoint_lst.astype(numpy.float32) # Container of destination points
830 |     data.DestP_offset = data.DestP_offset.astype(numpy.int32) # Translation destinations start at this offset
831 |     data.DestP_length = data.DestP_length.astype(numpy.int32) # Number of translation destinations per ME
832 |     data.LEout_offset = data.LEout_offset.astype(numpy.int32) # Output of translation starts at offset
833 |     data.LEout_length = data.LEout_length.astype(numpy.int32) # Length of translation output
834 |     data.LEterms_ref = data.LEterms_ref.astype(numpy.float32) # LE data containers
835 | 
836 |     # Allocate memory in the GPU
837 |     gpu_MEterms_ref = cuda.mem_alloc(data.MEterms_ref.size * data.MEterms_ref.dtype.itemsize)
838 |     gpu_MEpoints_ref = cuda.mem_alloc(data.MEpoints_ref.size * data.MEpoints_ref.dtype.itemsize)
839 |     gpu_Trans_lst = cuda.mem_alloc(data.Trans_lst.size * data.Trans_lst.dtype.itemsize)
840 |     gpu_DestPoint_lst = cuda.mem_alloc(data.DestPoint_lst.size * data.DestPoint_lst.dtype.itemsize)
841 |     gpu_DestP_offset = cuda.mem_alloc(data.DestP_offset.size * data.DestP_offset.dtype.itemsize)
842 |     gpu_DestP_length = cuda.mem_alloc(data.DestP_length.size * data.DestP_length.dtype.itemsize)
843 |     gpu_LEout_offset = cuda.mem_alloc(data.LEout_offset.size * data.LEout_offset.dtype.itemsize)
844 |     gpu_LEout_length = cuda.mem_alloc(data.LEout_length.size * data.LEout_length.dtype.itemsize)
845 |     gpu_LEterms_ref = cuda.mem_alloc(data.LEterms_ref.size * data.LEterms_ref.dtype.itemsize)
846 | 
847 |     # Transfer memory to device (only the arrays used by this kernel call)
848 |     start_transfer = time.time()
849 |     cuda.memcpy_htod(gpu_MEterms_ref, data.MEterms_ref)
850 |     cuda.memcpy_htod(gpu_MEpoints_ref, data.MEpoints_ref)
851 |     cuda.memcpy_htod(gpu_DestPoint_lst, data.DestPoint_lst)
852 |     cuda.memcpy_htod(gpu_LEout_offset, data.LEout_offset)
853 |     end_transfer = time.time()
854 |     data.time_transfer = end_transfer - start_transfer
855 | 
856 |     data.blocksize = 64 # one to start with
857 |     data.n_blocks = data.num_sources # one source per block
858 | 
859 |     mod = SourceModule(cuda_kernel_string % {'blocksize': data.blocksize,'terms':data.p,'terms_c':2*data.p},
860 |                        nvcc="nvcc",options=['-use_fast_math'], keep=False, no_extern_c=False)
861 | 
862 |     if data.n_blocks > 512:
863 |         n_blocks_x = data.n_blocks / 16
864 |         n_blocks_y = 16
865 |     else:
866 |         n_blocks_x = data.n_blocks
867 |         n_blocks_y = 1
868 | 
869 |     # Run in GPU
870 |     func = mod.get_function("me2me")
871 |     data.time_gpu = func(gpu_MEterms_ref, gpu_MEpoints_ref, gpu_DestPoint_lst, gpu_LEterms_ref, block=(data.blocksize,1,1), grid=(n_blocks_x, n_blocks_y), time_kernel=True)
872 | 
873 |     start_transfer = time.time()
874 |     cuda.memcpy_dtoh(data.LEterms_ref, gpu_LEterms_ref)
875 |     end_transfer = time.time()
876 |     data.time_transfer += end_transfer - start_transfer
877 | 
878 |     ctx.pop() # context pop
879 | 
880 | 
881 | data = dataset()
882 | data.dim = 2
883 | data.sizeIL = 27
884 | 
885 | for num_terms in [8, 12, 16]:
886 |     for num_level in [4, 5, 6, 7, 8]:
887 |         data.p = num_terms
888 |         data.max_level = num_level
889 |         generateTreeTestData(data)
890 |         gpuComputeM2L(data, cudaKernel.precomputed_division)
891 |         printTable(data)
892 | 
893 | #EOF
894 | 
895 | 
--------------------------------------------------------------------------------
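# A minimal sketch of how the single-translation correctness check could be wired up with
# the functions defined in m2l_gpu_test.py (not part of the benchmark loop above; it assumes
# the same cudaKernel.precomputed_division kernel string used there):
#
#     data_single = dataset()
#     generateSingleTestData(data_single)
#     cpuComputeM2L(data_single)        # reference result, stored in data_single.LEterms_cpu
#     gpuComputeM2L(data_single, cudaKernel.precomputed_division)
#     printRun(data_single)
#     compareCoefficients(data_single)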