├── LICENSE ├── README.md ├── fdtd_tutorial_asa.m ├── kernel_2d.cu └── kernel_2d.ptx /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 bsxfun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code for FDTD Tutorial, 180th ASA Meeting (Acoustics in Focus) 2 | ============================================================== 3 | 4 | Matlab code for the Acoustics in Focus Invited Talk: 5 | 6 | *2pCA2. Tutorial on finite-difference time-domain (FDTD) methods for room acoustics simulation.* 7 | 8 | A recording of the tutorial can be viewed at [this link](https://www.youtube.com/watch?v=xgJJwmrX568). 9 | 10 | ## Usage 11 | 12 | Run the script 'fdtd_tutorial_asa.m' in Matlab. 
13 | 14 | ### Dependencies 15 | 16 | * This code was tested in Matlab 2019b, but the code should run in various Matlab versions with CUDA/GPU acceleration disabled. 17 | 18 | * For CUDA acceleration you will need a CUDA-supported Nvidia GPU with an up-to-date Nvidia driver, along with Matlab's Parallel Computing Toolbox. 19 | 20 | * CUDA support will be limited by your GPU architecture and Matlab version; see: [https://www.mathworks.com/help/parallel-computing/gpu-support-by-release.html](https://www.mathworks.com/help/parallel-computing/gpu-support-by-release.html). 21 | 22 | * The supplied PTX file was compiled with CUDA toolkit version 10.2. 23 | 24 | * PTX code can be recompiled with NVCC from the CUDA toolkit (install appropriate version for your GPU and Matlab version). 25 | -------------------------------------------------------------------------------- /fdtd_tutorial_asa.m: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | % Copyright 2021 Brian Hamilton % 3 | % % 4 | % Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated % 5 | % documentation files (the "Software"), to deal in the Software without restriction, including without % 6 | % limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of % 7 | % the Software, and to permit persons to whom the Software is furnished to do so, subject to the following % 8 | % conditions: % 9 | % % 10 | % The above copyright notice and this permission notice shall be included in all copies or substantial % 11 | % portions of the Software. % 12 | % % 13 | % THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT % 14 | % LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO % 15 | % EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN % 16 | % AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE % 17 | % OR OTHER DEALINGS IN THE SOFTWARE. % 18 | % % 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | % % 21 | % FDTD tutorial for 180th ASA meeting - Matlab code % 22 | % % 23 | % Simulates 2D wave equation with wall absorption in a rectangular room with circular dome % 24 | % % 25 | % Depending on your version of Matlab and Nvidia GPU, you may need to install the CUDA toolkit and recompile the .cu file % 26 | % % 27 | % For GPU execution you will need an Nvidia GPU (Kepler architecture+) and the Parallel Computing Toolbox % 28 | % See: https://uk.mathworks.com/help/parallel-computing/gpu-support-by-release.html % 29 | % % 30 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 31 | clear all; 32 | 33 | c = 343; %speed of sound m/s (20degC) 34 | fmax = 1000; %Hz 35 | PPW = 6; %points per wavelength at fmax 36 | duration = 0.1; %seconds 37 | refl_coeff = 0.9; %reflection coefficient 38 | 39 | Bx = 10; By = 4; %box dims (with lower corner at origin) 40 | x_in = Bx*0.5; y_in = By*0.5; %source input position 41 | R_dome = By*0.5; %height of dome (to be centered on roof of box) 42 | 43 | draw = true; %to plot or not 44 | add_dome = true; %add dome to scene 45 | apply_rigid = true; %apply rigid boundaries 46 | apply_loss = true; %apply loss (requires apply_rigid; see assert below) 47 | use_gpu = false; %for GPU processing (with a supported Nvidia GPU) 48 | recompile_ptx = false; %force recompile of CUDA kernels 49 | 50 | assert(R_dome<=0.5*By); 51 | %calculate scene bounds 52 | if add_dome 53 | Lx = Bx; 54 | Ly = By+R_dome; 55 | else 56 | Lx = Bx; 57 | Ly = By; 58 | end 59 | 60 | if (apply_loss) 61 | assert(apply_rigid); 62 | end 63 | 64 | 
%calculate grid spacing, time step, sample rate 65 | dx = c/fmax/PPW; %grid spacing 66 | dt = sqrt(0.5)*dx/c; 67 | SR = 1/dt; 68 | fprintf('sample rate = %.3f Hz\n',SR) 69 | fprintf('Δx = %.5f m \n',dx) 70 | 71 | Nx = ceil(Lx/dx)+2; %number of points in x-dir 72 | Ny = ceil(Ly/dx)+2; %number of points in y-dir 73 | Nt = ceil(duration/dt); %number of time-steps to compute 74 | 75 | xv = [0:Nx-1]*dx-0.5*dx; %x-sampling points 76 | yv = [0:Ny-1]*dx-0.5*dx; %y-sampling points 77 | [X,Y] = ndgrid(xv,yv); 78 | 79 | in_mask = false(Nx,Ny); %mask for 'interior' points 80 | in_mask(X(:)>=0 & Y(:)>=0 & X(:)0 & K_map(:)<4); 95 | Kib = K_map(ib); 96 | clear K_map; 97 | end 98 | 99 | %initialise state arrays 100 | u0 = zeros(Nx,Ny); 101 | u1 = zeros(Nx,Ny); 102 | u2 = zeros(Nx,Ny); 103 | 104 | %set up an excitation signal 105 | u_in = zeros(Nt,1); 106 | Nh = ceil(5*SR/fmax); 107 | u_in(1:Nh) = 0.5-0.5*cos(2*pi*(0:Nh-1)'./Nh); 108 | u_in(1:Nh) = u_in(1:Nh).*sin(2*pi*(0:Nh-1)'./Nh); 109 | 110 | %grid forcing points 111 | inx = round(x_in/dx+1.5)+1; 112 | iny = round(y_in/dx+1.5)+1; 113 | assert(in_mask(inx,iny)); 114 | 115 | if (draw) 116 | %a mask convenient for plotting 117 | draw_mask = NaN*in_mask; 118 | draw_mask(in_mask) = 1; 119 | end 120 | 121 | if (apply_loss) 122 | %calculate specific admittance γ (g) 123 | assert(abs(refl_coeff)<=1.0); 124 | g = (1-refl_coeff)/(1+refl_coeff); 125 | lf = 0.5*sqrt(0.5)*g; %a loss factor 126 | end 127 | 128 | %GPU processing: move data to GPU and compile CUDA kernels 129 | if (use_gpu) 130 | gpuDevice 131 | %move arrays to GPU 132 | u0 = gpuArray(u0); 133 | u1 = gpuArray(u1); 134 | u2 = gpuArray(u2); 135 | in_mask = gpuArray(in_mask); 136 | if (apply_rigid) 137 | ib = gpuArray(int32(ib)); 138 | Kib = gpuArray(int32(Kib)); 139 | end 140 | 141 | if isempty(dir('./kernel_2d.ptx')) || (recompile_ptx) 142 | err = system(['nvcc -ptx -arch=sm_35 -O3 ','kernel_2d.cu']); 143 | if (err==0) 144 | fprintf('compiled kernel successfully\n') 145 | else 
146 | fprintf('error compiling kernel: code = %d \n',err) 147 | return 148 | end 149 | end 150 | 151 | %thread block and thread-block grid dims 152 | cuBx=32; 153 | cuBy=8; 154 | cuGy = floor((Ny-1)/cuBy)+1; 155 | cuGx = floor((Nx-1)/cuBx)+1; 156 | 157 | k1 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'air_update'); 158 | k1.ThreadBlockSize = [cuBx cuBy]; 159 | k1.GridSize = [cuGx cuGy]; 160 | 161 | if (apply_rigid) 162 | cuBb=128; 163 | Nb = length(ib); 164 | cuGb = floor((Nb-1)/cuBb)+1; 165 | k2 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'rigid_update'); 166 | k2.ThreadBlockSize = [cuBb]; 167 | k2.GridSize = [cuGb]; 168 | if (apply_loss) 169 | k3 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'apply_loss'); 170 | k3.ThreadBlockSize = [cuBb]; 171 | k3.GridSize = [cuGb]; 172 | end 173 | end 174 | end 175 | 176 | tt = tic; 177 | bb = 0; 178 | for nt=0:Nt-1 179 | %fdtd update 180 | if (use_gpu) 181 | %matlab calls CUDA kernels 182 | u0=feval(k1, u0, u1, u2, Nx, Ny, in_mask); 183 | if (apply_rigid) 184 | u0=feval(k2, u0, u1, u2, Nx, Nb, ib, Kib); 185 | if (apply_loss) 186 | u0=feval(k3, u0, u2, Nx, Nb, ib, Kib, lf); 187 | end 188 | end 189 | else 190 | %regular matlab 191 | u0(2:Nx-1,2:Ny-1) = in_mask(2:Nx-1,2:Ny-1).*(0.5*(u1(3:Nx,2:Ny-1) + u1(1:Nx-2,2:Ny-1) + u1(2:Nx-1,3:Ny) + u1(2:Nx-1,1:Ny-2)) - u2(2:Nx-1,2:Ny-1)); 192 | if (apply_rigid) 193 | u0(ib) = (2-0.5*Kib).*u1(ib) + 0.5*(u1(ib+1) + u1(ib-1) + u1(ib+Nx) + u1(ib-Nx)) - u2(ib); 194 | if (apply_loss) 195 | u0(ib) = (u0(ib) + lf*(4-Kib).*u2(ib))./(1+lf.*(4-Kib)); 196 | end 197 | end 198 | end 199 | 200 | %inject source 201 | u0(inx,iny) = u0(inx,iny) + u_in(nt+1); 202 | 203 | %plotting 204 | if (draw) 205 | if (use_gpu) 206 | u1g = gather(u1); 207 | else 208 | u1g = u1; 209 | end 210 | if nt==0 211 | figure('name','float'); 212 | u_draw = (u1g.*(draw_mask)).'; 213 | hh = imagesc(xv,yv,u_draw,'cdatamapping','scaled'); 214 | set(gca,'ydir','normal'); 215 | axis 
equal; 216 | xlabel('x'); 217 | ylabel('y'); 218 | colorbar; 219 | colormap(bone); 220 | xlim([min(xv) max(xv)]); 221 | ylim([min(yv) max(yv)]); 222 | else 223 | umax = max(abs(u1g(:)))+eps; 224 | u_draw = (u1g.*(draw_mask)).'; 225 | set(hh,'cdata',u_draw); 226 | caxis([-umax umax]); 227 | drawnow; 228 | end 229 | end 230 | 231 | %step forward in time 232 | u2 = u1; 233 | u1 = u0; 234 | 235 | %print time elapsed and performance in megavoxels/s 236 | tr=toc(tt); 237 | pstr = sprintf('Progress: nt=%d out of %d time-steps, %.1f megavox/s, %.2f s elapsed',nt+1,Nt,(nt*Nx*Ny)/tr/1e6,tr); 238 | fprintf([repmat('\b',[1 bb]),pstr]) %erase current line with backspaces 239 | bb = length(pstr); 240 | end 241 | %print last samples at source point 242 | fprintf('\nlast samples=\n%s',sprintf('%.15f\n',[u0(inx,iny);u1(inx,iny);u2(inx,iny)])) 243 | -------------------------------------------------------------------------------- /kernel_2d.cu: -------------------------------------------------------------------------------- 1 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2021 Brian Hamilton // 3 | // // 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated // 5 | // documentation files (the "Software"), to deal in the Software without restriction, including without // 6 | // limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of // 7 | // the Software, and to permit persons to whom the Software is furnished to do so, subject to the following // 8 | // conditions: // 9 | // // 10 | // The above copyright notice and this permission notice shall be included in all copies or substantial // 11 | // portions of the Software. 
// 12 | // // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT // 14 | // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO // 15 | // EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN // 16 | // AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE // 17 | // OR OTHER DEALINGS IN THE SOFTWARE. // 18 | // // 19 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 20 | // // 21 | // FDTD tutorial for 180th ASA meeting - CUDA Kernels to accompany Matlab code // 22 | // // 23 | // Compiles to PTX from Matlab, but can be compiled to PTX with 'nvcc --ptx kernel_2d.cu' // 24 | // // 25 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 26 | // 27 | //air update 28 | __global__ void air_update(double *u0, const double * __restrict__ u1, const double * __restrict__ u2, int Nx, int Ny, bool * in_mask) 29 | { 30 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 31 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 32 | if ((ix>0) && (ix0) && (iy; 25 | .reg .b16 %rs<2>; 26 | .reg .b32 %r<17>; 27 | .reg .f64 %fd<13>; 28 | .reg .b64 %rd<20>; 29 | 30 | 31 | ld.param.u64 %rd1, [_Z10air_updatePdPKdS1_iiPb_param_0]; 32 | ld.param.u64 %rd2, [_Z10air_updatePdPKdS1_iiPb_param_1]; 33 | ld.param.u64 %rd3, [_Z10air_updatePdPKdS1_iiPb_param_2]; 34 | ld.param.u32 %r3, [_Z10air_updatePdPKdS1_iiPb_param_3]; 35 | ld.param.u32 %r4, [_Z10air_updatePdPKdS1_iiPb_param_4]; 36 | ld.param.u64 %rd4, [_Z10air_updatePdPKdS1_iiPb_param_5]; 37 | mov.u32 %r5, %ntid.x; 38 | mov.u32 %r6, %ctaid.x; 39 | mov.u32 %r7, %tid.x; 40 | mad.lo.s32 %r1, %r5, %r6, %r7; 41 | mov.u32 %r8, %ntid.y; 42 | mov.u32 %r9, %ctaid.y; 43 | mov.u32 %r10, %tid.y; 44 | 
mad.lo.s32 %r2, %r8, %r9, %r10; 45 | setp.lt.s32 %p1, %r1, 1; 46 | add.s32 %r11, %r3, -1; 47 | setp.ge.s32 %p2, %r1, %r11; 48 | or.pred %p3, %p1, %p2; 49 | setp.lt.s32 %p4, %r2, 1; 50 | or.pred %p5, %p3, %p4; 51 | add.s32 %r12, %r4, -1; 52 | setp.ge.s32 %p6, %r2, %r12; 53 | or.pred %p7, %p5, %p6; 54 | @%p7 bra BB0_2; 55 | 56 | cvta.to.global.u64 %rd5, %rd2; 57 | cvta.to.global.u64 %rd6, %rd3; 58 | mad.lo.s32 %r13, %r2, %r3, %r1; 59 | add.s32 %r14, %r13, -1; 60 | mul.wide.s32 %rd7, %r14, 8; 61 | add.s64 %rd8, %rd5, %rd7; 62 | ld.global.nc.f64 %fd1, [%rd8+16]; 63 | ld.global.nc.f64 %fd2, [%rd8]; 64 | add.f64 %fd3, %fd2, %fd1; 65 | sub.s32 %r15, %r13, %r3; 66 | mul.wide.s32 %rd9, %r15, 8; 67 | add.s64 %rd10, %rd5, %rd9; 68 | ld.global.nc.f64 %fd4, [%rd10]; 69 | add.f64 %fd5, %fd3, %fd4; 70 | add.s32 %r16, %r13, %r3; 71 | mul.wide.s32 %rd11, %r16, 8; 72 | add.s64 %rd12, %rd5, %rd11; 73 | ld.global.nc.f64 %fd6, [%rd12]; 74 | add.f64 %fd7, %fd5, %fd6; 75 | mul.f64 %fd8, %fd7, 0d3FE0000000000000; 76 | cvt.s64.s32 %rd13, %r13; 77 | mul.wide.s32 %rd14, %r13, 8; 78 | add.s64 %rd15, %rd6, %rd14; 79 | ld.global.nc.f64 %fd9, [%rd15]; 80 | sub.f64 %fd10, %fd8, %fd9; 81 | cvta.to.global.u64 %rd16, %rd4; 82 | add.s64 %rd17, %rd16, %rd13; 83 | ld.global.s8 %rs1, [%rd17]; 84 | cvt.rn.f64.s16 %fd11, %rs1; 85 | mul.f64 %fd12, %fd10, %fd11; 86 | cvta.to.global.u64 %rd18, %rd1; 87 | add.s64 %rd19, %rd18, %rd14; 88 | st.global.f64 [%rd19], %fd12; 89 | 90 | BB0_2: 91 | ret; 92 | } 93 | 94 | // .globl _Z12rigid_updatePdPKdS1_iiPiS2_ 95 | .visible .entry _Z12rigid_updatePdPKdS1_iiPiS2_( 96 | .param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_0, 97 | .param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_1, 98 | .param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_2, 99 | .param .u32 _Z12rigid_updatePdPKdS1_iiPiS2__param_3, 100 | .param .u32 _Z12rigid_updatePdPKdS1_iiPiS2__param_4, 101 | .param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_5, 102 | .param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_6 103 
| ) 104 | { 105 | .reg .pred %p<2>; 106 | .reg .b32 %r<12>; 107 | .reg .f64 %fd<17>; 108 | .reg .b64 %rd<23>; 109 | 110 | 111 | ld.param.u64 %rd1, [_Z12rigid_updatePdPKdS1_iiPiS2__param_0]; 112 | ld.param.u64 %rd2, [_Z12rigid_updatePdPKdS1_iiPiS2__param_1]; 113 | ld.param.u64 %rd3, [_Z12rigid_updatePdPKdS1_iiPiS2__param_2]; 114 | ld.param.u32 %r2, [_Z12rigid_updatePdPKdS1_iiPiS2__param_3]; 115 | ld.param.u32 %r3, [_Z12rigid_updatePdPKdS1_iiPiS2__param_4]; 116 | ld.param.u64 %rd4, [_Z12rigid_updatePdPKdS1_iiPiS2__param_5]; 117 | ld.param.u64 %rd5, [_Z12rigid_updatePdPKdS1_iiPiS2__param_6]; 118 | mov.u32 %r4, %ntid.x; 119 | mov.u32 %r5, %ctaid.x; 120 | mov.u32 %r6, %tid.x; 121 | mad.lo.s32 %r1, %r4, %r5, %r6; 122 | setp.ge.s32 %p1, %r1, %r3; 123 | @%p1 bra BB1_2; 124 | 125 | cvta.to.global.u64 %rd6, %rd2; 126 | cvta.to.global.u64 %rd7, %rd4; 127 | mul.wide.s32 %rd8, %r1, 4; 128 | add.s64 %rd9, %rd7, %rd8; 129 | ld.global.u32 %r7, [%rd9]; 130 | add.s32 %r8, %r7, -1; 131 | cvta.to.global.u64 %rd10, %rd5; 132 | add.s64 %rd11, %rd10, %rd8; 133 | ld.global.u32 %r9, [%rd11]; 134 | cvt.rn.f64.s32 %fd1, %r9; 135 | mul.f64 %fd2, %fd1, 0d3FE0000000000000; 136 | mov.f64 %fd3, 0d4000000000000000; 137 | sub.f64 %fd4, %fd3, %fd2; 138 | mul.wide.s32 %rd12, %r7, 8; 139 | add.s64 %rd13, %rd6, %rd12; 140 | ld.global.nc.f64 %fd5, [%rd13+-8]; 141 | ld.global.nc.f64 %fd6, [%rd13]; 142 | ld.global.nc.f64 %fd7, [%rd13+-16]; 143 | add.f64 %fd8, %fd7, %fd6; 144 | sub.s32 %r10, %r8, %r2; 145 | mul.wide.s32 %rd14, %r10, 8; 146 | add.s64 %rd15, %rd6, %rd14; 147 | ld.global.nc.f64 %fd9, [%rd15]; 148 | add.f64 %fd10, %fd8, %fd9; 149 | add.s32 %r11, %r8, %r2; 150 | mul.wide.s32 %rd16, %r11, 8; 151 | add.s64 %rd17, %rd6, %rd16; 152 | ld.global.nc.f64 %fd11, [%rd17]; 153 | add.f64 %fd12, %fd10, %fd11; 154 | mul.f64 %fd13, %fd12, 0d3FE0000000000000; 155 | fma.rn.f64 %fd14, %fd5, %fd4, %fd13; 156 | cvta.to.global.u64 %rd18, %rd3; 157 | mul.wide.s32 %rd19, %r8, 8; 158 | add.s64 %rd20, %rd18, %rd19; 159 
| ld.global.nc.f64 %fd15, [%rd20]; 160 | sub.f64 %fd16, %fd14, %fd15; 161 | cvta.to.global.u64 %rd21, %rd1; 162 | add.s64 %rd22, %rd21, %rd19; 163 | st.global.f64 [%rd22], %fd16; 164 | 165 | BB1_2: 166 | ret; 167 | } 168 | 169 | // .globl _Z10apply_lossPdPKdiiPiS2_d 170 | .visible .entry _Z10apply_lossPdPKdiiPiS2_d( 171 | .param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_0, 172 | .param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_1, 173 | .param .u32 _Z10apply_lossPdPKdiiPiS2_d_param_2, 174 | .param .u32 _Z10apply_lossPdPKdiiPiS2_d_param_3, 175 | .param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_4, 176 | .param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_5, 177 | .param .f64 _Z10apply_lossPdPKdiiPiS2_d_param_6 178 | ) 179 | { 180 | .reg .pred %p<2>; 181 | .reg .b32 %r<11>; 182 | .reg .f64 %fd<9>; 183 | .reg .b64 %rd<15>; 184 | 185 | 186 | ld.param.u64 %rd1, [_Z10apply_lossPdPKdiiPiS2_d_param_0]; 187 | ld.param.u64 %rd2, [_Z10apply_lossPdPKdiiPiS2_d_param_1]; 188 | ld.param.u32 %r2, [_Z10apply_lossPdPKdiiPiS2_d_param_3]; 189 | ld.param.u64 %rd3, [_Z10apply_lossPdPKdiiPiS2_d_param_4]; 190 | ld.param.u64 %rd4, [_Z10apply_lossPdPKdiiPiS2_d_param_5]; 191 | ld.param.f64 %fd1, [_Z10apply_lossPdPKdiiPiS2_d_param_6]; 192 | mov.u32 %r3, %ctaid.x; 193 | mov.u32 %r4, %ntid.x; 194 | mov.u32 %r5, %tid.x; 195 | mad.lo.s32 %r1, %r4, %r3, %r5; 196 | setp.ge.s32 %p1, %r1, %r2; 197 | @%p1 bra BB2_2; 198 | 199 | cvta.to.global.u64 %rd5, %rd1; 200 | cvta.to.global.u64 %rd6, %rd3; 201 | mul.wide.s32 %rd7, %r1, 4; 202 | add.s64 %rd8, %rd6, %rd7; 203 | ld.global.u32 %r6, [%rd8]; 204 | add.s32 %r7, %r6, -1; 205 | mul.wide.s32 %rd9, %r7, 8; 206 | add.s64 %rd10, %rd5, %rd9; 207 | cvta.to.global.u64 %rd11, %rd4; 208 | add.s64 %rd12, %rd11, %rd7; 209 | ld.global.u32 %r8, [%rd12]; 210 | mov.u32 %r9, 4; 211 | sub.s32 %r10, %r9, %r8; 212 | cvt.rn.f64.s32 %fd2, %r10; 213 | mul.f64 %fd3, %fd2, %fd1; 214 | cvta.to.global.u64 %rd13, %rd2; 215 | add.s64 %rd14, %rd13, %rd9; 216 | ld.global.nc.f64 %fd4, [%rd14]; 217 | 
ld.global.f64 %fd5, [%rd10]; 218 | fma.rn.f64 %fd6, %fd4, %fd3, %fd5; 219 | add.f64 %fd7, %fd3, 0d3FF0000000000000; 220 | div.rn.f64 %fd8, %fd6, %fd7; 221 | st.global.f64 [%rd10], %fd8; 222 | 223 | BB2_2: 224 | ret; 225 | } 226 | 227 | 228 | --------------------------------------------------------------------------------