├── LICENSE
├── README.md
├── fdtd_tutorial_asa.m
├── kernel_2d.cu
└── kernel_2d.ptx


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 bsxfun
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Code for FDTD Tutorial, 180th ASA Meeting (Acoustics in Focus)
 2 | ==============================================================
 3 | 
 4 | Matlab code for the Acoustics in Focus Invited Talk: 
 5 | 
 6 | *2pCA2. Tutorial on finite-difference time-domain (FDTD) methods for room acoustics simulation.*
 7 | 
 8 | A recording of the tutorial can be viewed at [this link](https://www.youtube.com/watch?v=xgJJwmrX568).
 9 | 
10 | ## Usage
11 | 
12 | Run the script 'fdtd_tutorial_asa.m' in Matlab.
13 | 
14 | ### Dependencies
15 | 
16 | * This code was tested in Matlab 2019b, but it should run in various Matlab versions with CUDA/GPU acceleration disabled.
17 | 
18 | * For CUDA acceleration you will need a CUDA-supported Nvidia GPU with an up-to-date Nvidia driver, along with Matlab's Parallel Computing Toolbox.   
19 | 
20 | * CUDA support will be limited by your GPU architecture and Matlab version; see: [https://www.mathworks.com/help/parallel-computing/gpu-support-by-release.html](https://www.mathworks.com/help/parallel-computing/gpu-support-by-release.html).
21 | 
22 | * The supplied PTX file was compiled with CUDA toolkit version 10.2.
23 | 
24 | * PTX code can be recompiled with NVCC from the CUDA toolkit (install appropriate version for your GPU and Matlab version).
25 | 


--------------------------------------------------------------------------------
/fdtd_tutorial_asa.m:
--------------------------------------------------------------------------------
  1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  2 | %  Copyright 2021 Brian Hamilton                                                                                   %
  3 | %                                                                                                                  %
  4 | %  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated    %
  5 | %  documentation files (the "Software"), to deal in the Software without restriction, including without            %
  6 | %  limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of       %
  7 | %  the Software, and to permit persons to whom the Software is furnished to do so, subject to the following        %
  8 | %  conditions:                                                                                                     %
  9 | %                                                                                                                  %
 10 | %  The above copyright notice and this permission notice shall be included in all copies or substantial            %
 11 | %  portions of the Software.                                                                                       %
 12 | %                                                                                                                  %
 13 | %  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT           %
 14 | %  LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO       %
 15 | %  EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN    %
 16 | %  AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE    %
 17 | %  OR OTHER DEALINGS IN THE SOFTWARE.                                                                              %
 18 | %                                                                                                                  %
 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 20 | %                                                                                                                  %
 21 | %  FDTD tutorial for 180th ASA meeting - Matlab code                                                               %
 22 | %                                                                                                                  %
 23 | %  Simulates 2D wave equation with wall absorption in a rectangular room with circular dome                        %
 24 | %                                                                                                                  %
 25 | %  Depending on your Matlab version and Nvidia GPU you may need to install the CUDA toolkit and recompile .cu file %
 26 | %                                                                                                                  %
 27 | %  For GPU execution you will need a Nvidia GPU (Kepler architecture+) and the Parallel Computing Toolbox          %
 28 | %  See: https://uk.mathworks.com/help/parallel-computing/gpu-support-by-release.html                               %
 29 | %                                                                                                                  %
 30 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 31 | clear all; %start from an empty workspace
 32 | %physical and signal parameters
 33 | c = 343; %speed of sound m/s (20degC)
 34 | fmax = 1000; %Hz, frequency at which the grid has PPW points per wavelength
 35 | PPW = 6; %points per wavelength at fmax
 36 | duration = 0.1; %seconds of simulated time
 37 | refl_coeff = 0.9; %reflection coefficient (turned into an admittance when apply_loss is true)
 38 | %scene geometry
 39 | Bx = 10; By = 4; %box dims (with lower corner at origin)
 40 | x_in = Bx*0.5; y_in = By*0.5; %source input position (centre of the box)
 41 | R_dome = By*0.5; %radius (height) of dome, which is centered on the roof of the box
 42 | %run options
 43 | draw = true; %to plot or not
 44 | add_dome = true; %add dome to scene
 45 | apply_rigid = true; %apply rigid boundaries (must be true when apply_loss is true)
 46 | apply_loss = true; %apply loss at the boundary nodes
 47 | use_gpu = false;  %for GPU processing (with a supported Nvidia GPU)
 48 | recompile_ptx = false; %force recompile of CUDA kernels
 49 | 
 50 | assert(R_dome<=0.5*By); %NOTE(review): dome is centred at x=0.5*Bx, so R_dome<=0.5*Bx may also be needed - confirm
 51 | %calculate scene bounds (Lx,Ly): the dome adds R_dome above the roof of the box
 52 | if add_dome
 53 |    Lx = Bx;
 54 |    Ly = By+R_dome;
 55 | else
 56 |    Lx = Bx;
 57 |    Ly = By;
 58 | end
 59 | %the lossy update is only applied inside the rigid-boundary branch, so rigid boundaries must be enabled
 60 | if (apply_loss)
 61 |    assert(apply_rigid);
 62 | end
 63 | 
 64 | %calculate grid spacing, time step, sample rate
 65 | dx = c/fmax/PPW; %grid spacing: PPW points per wavelength at fmax
 66 | dt = sqrt(0.5)*dx/c; %time step, i.e. c*dt/dx = sqrt(0.5)
 67 | SR = 1/dt; %sample rate in Hz
 68 | fprintf('sample rate = %.3f Hz\n',SR) 
 69 | fprintf('Δx = %.5f m \n',dx) 
 70 | 
 71 | Nx = ceil(Lx/dx)+2; %number of points in x-dir
 72 | Ny = ceil(Ly/dx)+2; %number of points in y-dir
 73 | Nt = ceil(duration/dt); %number of time-steps to compute
 74 | %sampling points are offset by half a cell: the first point sits at -0.5*dx
 75 | xv = [0:Nx-1]*dx-0.5*dx; %x-sampling points
 76 | yv = [0:Ny-1]*dx-0.5*dx; %y-sampling points
 77 | [X,Y] = ndgrid(xv,yv); %Nx-by-Ny coordinate arrays (x varies along the first dimension)
 78 | 
 79 | in_mask = false(Nx,Ny); %mask for 'interior' points
 80 | in_mask(X(:)>=0 & Y(:)>=0 & X(:)<Bx & Y(:)<By) = true; %points in the box [0,Bx) x [0,By)
 81 | if add_dome
 82 |    in_mask((X(:)-0.5*Bx).^2+ (Y(:)-By).^2<R_dome^2) = true; %points inside the circle of radius R_dome centred at (0.5*Bx,By)
 83 | end
 84 | clear X Y; %coordinate arrays are not used after this point
 85 | 
 86 | if (apply_rigid)
 87 |    %calculate number of interior neighbours (for interior points only)
 88 |    K_map = zeros(Nx,Ny);
 89 |    K_map(2:Nx-1,2:Ny-1) = K_map(2:Nx-1,2:Ny-1) + in_mask(3:Nx,2:Ny-1); %neighbour at +x
 90 |    K_map(2:Nx-1,2:Ny-1) = K_map(2:Nx-1,2:Ny-1) + in_mask(1:Nx-2,2:Ny-1); %neighbour at -x
 91 |    K_map(2:Nx-1,2:Ny-1) = K_map(2:Nx-1,2:Ny-1) + in_mask(2:Nx-1,3:Ny); %neighbour at +y
 92 |    K_map(2:Nx-1,2:Ny-1) = K_map(2:Nx-1,2:Ny-1) + in_mask(2:Nx-1,1:Ny-2); %neighbour at -y
 93 |    K_map(~in_mask) = 0; %exterior points carry no count
 94 |    ib = find(K_map(:)>0 & K_map(:)<4); %linear indices of boundary nodes: interior points with 1 to 3 interior neighbours
 95 |    Kib = K_map(ib); %neighbour count of each boundary node
 96 |    clear K_map;
 97 | end
 98 | 
 99 | %initialise state arrays: u0 = next, u1 = current, u2 = previous time step
100 | u0 = zeros(Nx,Ny);
101 | u1 = zeros(Nx,Ny);
102 | u2 = zeros(Nx,Ny);
103 | 
104 | %set up an excitation signal: one sine cycle shaped by a raised-cosine window, Nh samples long
105 | u_in = zeros(Nt,1);
106 | Nh = ceil(5*SR/fmax); %pulse length in samples (five periods of fmax)
107 | u_in(1:Nh) = 0.5-0.5*cos(2*pi*(0:Nh-1)'./Nh); %raised-cosine window
108 | u_in(1:Nh) = u_in(1:Nh).*sin(2*pi*(0:Nh-1)'./Nh); %window times one sine cycle
109 | 
110 | %grid forcing points: xv(i) = (i-1.5)*dx for 1-based i, so the index nearest to x is round(x/dx+0.5)+1
111 | inx = round(x_in/dx+0.5)+1;
112 | iny = round(y_in/dx+0.5)+1;
113 | assert(in_mask(inx,iny)); %the source must sit on an interior point
114 | 
115 | if (draw)
116 |    %a mask convenient for plotting
117 |    draw_mask = NaN*in_mask; %all-NaN array with the size of in_mask
118 |    draw_mask(in_mask) = 1; %1 on interior points, NaN everywhere else
119 | end
120 | 
121 | if (apply_loss)
122 |    %calculate specific admittance γ (g) from the reflection coefficient
123 |    assert(abs(refl_coeff)<=1.0); %NOTE(review): refl_coeff = -1 passes this check but makes g infinite on the next line
124 |    g = (1-refl_coeff)/(1+refl_coeff);
125 |    lf = 0.5*sqrt(0.5)*g; %a loss factor (used by the lossy boundary update)
126 | end
127 | 
128 | %GPU processing: display the selected device, move data to GPU and compile CUDA kernels
129 | if (use_gpu)
130 |    gpuDevice
131 |    %move arrays to GPU
132 |    u0 = gpuArray(u0);
133 |    u1 = gpuArray(u1);
134 |    u2 = gpuArray(u2);
135 |    in_mask = gpuArray(in_mask);
136 |    if (apply_rigid)
137 |       ib = gpuArray(int32(ib)); %int32 to match the 'int *' kernel arguments
138 |       Kib = gpuArray(int32(Kib));
139 |    end
140 |    %compile the kernels to PTX when the PTX file is missing or a recompile is forced
141 |    if isempty(dir('./kernel_2d.ptx')) || (recompile_ptx)
142 |       err = system(['nvcc -ptx -arch=sm_35 -O3 ','kernel_2d.cu']);
143 |       if (err==0)
144 |          fprintf('compiled kernel successfully\n')
145 |       else
146 |          fprintf('error compiling kernel: code = %d \n',err)
147 |          return
148 |       end
149 |    end
150 | 
151 |    %thread block and thread-block grid dims (enough blocks to cover the Nx-by-Ny grid)
152 |    cuBx=32;
153 |    cuBy=8;
154 |    cuGy = floor((Ny-1)/cuBy)+1; %equals ceil(Ny/cuBy)
155 |    cuGx = floor((Nx-1)/cuBx)+1; %equals ceil(Nx/cuBx)
156 |    %kernel for the air update, launched on a 2D grid of 2D blocks
157 |    k1 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'air_update');
158 |    k1.ThreadBlockSize = [cuBx cuBy];
159 |    k1.GridSize = [cuGx cuGy];
160 |    %kernels for the boundary nodes, launched with one thread per entry of ib
161 |    if (apply_rigid)
162 |       cuBb=128; %threads per block
163 |       Nb = length(ib); %number of boundary nodes
164 |       cuGb = floor((Nb-1)/cuBb)+1; %equals ceil(Nb/cuBb)
165 |       k2 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'rigid_update');
166 |       k2.ThreadBlockSize = [cuBb];
167 |       k2.GridSize = [cuGb];
168 |       if (apply_loss)
169 |          k3 = parallel.gpu.CUDAKernel(['kernel_2d.ptx'], ['kernel_2d.cu'], 'apply_loss');
170 |          k3.ThreadBlockSize = [cuBb];
171 |          k3.GridSize = [cuGb];
172 |       end
173 |    end
174 | end
175 | 
176 | tt = tic; %wall-clock timer for the progress report
177 | bb = 0; %length of the previously printed progress string
178 | for nt=0:Nt-1
179 |    %fdtd update: compute u0 (next) from u1 (current) and u2 (previous)
180 |    if (use_gpu)
181 |       %matlab calls CUDA kernels (air update first, then the boundary nodes are overwritten)
182 |       u0=feval(k1, u0, u1, u2, Nx, Ny, in_mask);
183 |       if (apply_rigid)
184 |          u0=feval(k2, u0, u1, u2, Nx, Nb, ib, Kib);
185 |          if (apply_loss)
186 |             u0=feval(k3, u0, u2, Nx, Nb, ib, Kib, lf);
187 |          end
188 |       end
189 |    else
190 |       %regular matlab: the same three stages as the CUDA kernels
191 |       u0(2:Nx-1,2:Ny-1) = in_mask(2:Nx-1,2:Ny-1).*(0.5*(u1(3:Nx,2:Ny-1) + u1(1:Nx-2,2:Ny-1) + u1(2:Nx-1,3:Ny) + u1(2:Nx-1,1:Ny-2)) - u2(2:Nx-1,2:Ny-1));
192 |       if (apply_rigid)
193 |          u0(ib) = (2-0.5*Kib).*u1(ib) + 0.5*(u1(ib+1) + u1(ib-1) + u1(ib+Nx) + u1(ib-Nx)) - u2(ib); %ib+/-1 and ib+/-Nx are the x and y neighbours in linear indexing
194 |          if (apply_loss)
195 |             u0(ib) = (u0(ib) + lf*(4-Kib).*u2(ib))./(1+lf.*(4-Kib)); %4-Kib = number of missing neighbours
196 |          end
197 |       end
198 |    end
199 | 
200 |    %inject source
201 |    u0(inx,iny) = u0(inx,iny) + u_in(nt+1);
202 | 
203 |    %plotting (shows u1, i.e. the state before this step's update)
204 |    if (draw)
205 |       if (use_gpu)
206 |          u1g = gather(u1);
207 |       else
208 |          u1g = u1;
209 |       end
210 |       if nt==0 %first pass: create the figure and the image handle
211 |          figure('name','float');
212 |          u_draw = (u1g.*(draw_mask)).'; %transpose so that x runs along the horizontal axis
213 |          hh = imagesc(xv,yv,u_draw,'cdatamapping','scaled');
214 |          set(gca,'ydir','normal');
215 |          axis equal;
216 |          xlabel('x');
217 |          ylabel('y');
218 |          colorbar;
219 |          colormap(bone);
220 |          xlim([min(xv) max(xv)]);
221 |          ylim([min(yv) max(yv)]);
222 |       else %later passes: replace the image data and rescale the colour axis
223 |          umax = max(abs(u1g(:)))+eps; %eps keeps the colour range non-degenerate when u1g is all zero
224 |          u_draw = (u1g.*(draw_mask)).';
225 |          set(hh,'cdata',u_draw);
226 |          caxis([-umax umax]);
227 |          drawnow;
228 |       end
229 |    end
230 | 
231 |    %step forward in time
232 |    u2 = u1;
233 |    u1 = u0;
234 | 
235 |    %print time elapsed and performance in megavoxels/s (nt+1 steps are complete at this point)
236 |    tr=toc(tt);
237 |    pstr = sprintf('Progress: nt=%d out of %d time-steps, %.1f megavox/s, %.2f s elapsed',nt+1,Nt,((nt+1)*Nx*Ny)/tr/1e6,tr);
238 |    fprintf([repmat('\b',[1 bb]),pstr]) %erase current line with backspaces
239 |    bb = length(pstr);
240 | end
241 | %print last samples at source point
242 | fprintf('\nlast samples=\n%s',sprintf('%.15f\n',[u0(inx,iny);u1(inx,iny);u2(inx,iny)]))
243 | 


--------------------------------------------------------------------------------
/kernel_2d.cu:
--------------------------------------------------------------------------------
 1 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 2 | // Copyright 2021 Brian Hamilton                                                                                    //
 3 | //                                                                                                                  //
 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated     //
 5 | // documentation files (the "Software"), to deal in the Software without restriction, including without             //
 6 | // limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of        //
 7 | // the Software, and to permit persons to whom the Software is furnished to do so, subject to the following         //
 8 | // conditions:                                                                                                      //
 9 | //                                                                                                                  //
10 | // The above copyright notice and this permission notice shall be included in all copies or substantial             //
11 | // portions of the Software.                                                                                        //
12 | //                                                                                                                  //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT            //
14 | // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO        //
15 | // EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN     //
16 | // AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE     //
17 | // OR OTHER DEALINGS IN THE SOFTWARE.                                                                               //
18 | //                                                                                                                  //
19 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
20 | //                                                                                                                  //
21 | // FDTD tutorial for 180th ASA meeting - CUDA Kernels to accompany Matlab code                                      //
22 | //                                                                                                                  //
23 | // Compiles to PTX from Matlab, but can be compiled to PTX with 'nvcc --ptx kernel_2d.cu'                           //
24 | //                                                                                                                  //
25 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
26 | //
27 | //air update: writes the next time step into u0 at every non-edge grid point, multiplied by in_mask
28 | __global__ void air_update(double *u0, const double * __restrict__ u1, const double * __restrict__ u2, int Nx, int  Ny, bool * in_mask)
29 | {
30 |    const int col = blockIdx.x*blockDim.x + threadIdx.x; //index along the Nx (fastest-varying) dimension
31 |    const int row = blockIdx.y*blockDim.y + threadIdx.y; //index along the Ny dimension
32 |    //threads on the outermost layer of the grid, or beyond it, have nothing to do
33 |    if ((col<=0) || (col>=Nx-1) || (row<=0) || (row>=Ny-1)) return;
34 |    const int idx = row*Nx+col;
35 |    u0[idx] = (0.5*(u1[idx-1]+u1[idx+1]+u1[idx-Nx]+u1[idx+Nx]) - u2[idx])*in_mask[idx];
36 | }
37 | //rigid boundary update: one thread per boundary node, writing u0 at that node
38 | __global__ void rigid_update(double *u0, const double * __restrict__ u1, const double * __restrict__ u2, int Nx, int Nb, int * ib, int * Kib)
39 | {
40 |    const int node = blockIdx.x*blockDim.x + threadIdx.x; //position in the boundary-node list
41 |    if (node>=Nb) return; //surplus threads past the end of the list
42 |    const int idx = ib[node]-1; //ib holds 1-based (matlab) linear indices
43 |    //Kib[node] weights the centre value; the neighbour sum is the same as in the air update
44 |    u0[idx] = (2-0.5*Kib[node])*u1[idx] + 0.5*(u1[idx-1]+u1[idx+1]+u1[idx-Nx]+u1[idx+Nx]) - u2[idx];
45 | }
46 | //add loss to boundary nodes: in-place correction of u0 using lf*(4-Kib) as the per-node loss weight
47 | __global__ void apply_loss(double *u0, const double * __restrict__ u2, int Nx, int Nb, int * ib, int * Kib, double lf)
48 | {
49 |    const int node = blockIdx.x*blockDim.x + threadIdx.x; //position in the boundary-node list
50 |    if (node>=Nb) return; //surplus threads past the end of the list
51 |    const int idx = ib[node]-1; //ib holds 1-based (matlab) linear indices
52 |    //Nx is part of the signature but is not read here
53 |    u0[idx] = (u0[idx] + lf*(4-Kib[node])*u2[idx])/(1.0+lf*(4-Kib[node]));
54 | }
55 | 


--------------------------------------------------------------------------------
/kernel_2d.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-27506705
  5 | // Cuda compilation tools, release 10.2, V10.2.89
  6 | // Based on LLVM 3.4svn
  7 | //
  8 | 
  9 | .version 6.5
 10 | .target sm_35
 11 | .address_size 64
 12 | 
 13 | 	// .globl	_Z10air_updatePdPKdS1_iiPb
 14 | // entry for air_update(u0, u1, u2, Nx, Ny, in_mask) of kernel_2d.cu; param_0..param_5 are in that order
 15 | .visible .entry _Z10air_updatePdPKdS1_iiPb(
 16 | 	.param .u64 _Z10air_updatePdPKdS1_iiPb_param_0,
 17 | 	.param .u64 _Z10air_updatePdPKdS1_iiPb_param_1,
 18 | 	.param .u64 _Z10air_updatePdPKdS1_iiPb_param_2,
 19 | 	.param .u32 _Z10air_updatePdPKdS1_iiPb_param_3,
 20 | 	.param .u32 _Z10air_updatePdPKdS1_iiPb_param_4,
 21 | 	.param .u64 _Z10air_updatePdPKdS1_iiPb_param_5
 22 | )
 23 | {
 24 | 	.reg .pred 	%p<8>;
 25 | 	.reg .b16 	%rs<2>;
 26 | 	.reg .b32 	%r<17>;
 27 | 	.reg .f64 	%fd<13>;
 28 | 	.reg .b64 	%rd<20>;
 29 | 
 30 | 
 31 | 	ld.param.u64 	%rd1, [_Z10air_updatePdPKdS1_iiPb_param_0]; // u0
 32 | 	ld.param.u64 	%rd2, [_Z10air_updatePdPKdS1_iiPb_param_1]; // u1
 33 | 	ld.param.u64 	%rd3, [_Z10air_updatePdPKdS1_iiPb_param_2]; // u2
 34 | 	ld.param.u32 	%r3, [_Z10air_updatePdPKdS1_iiPb_param_3]; // Nx
 35 | 	ld.param.u32 	%r4, [_Z10air_updatePdPKdS1_iiPb_param_4]; // Ny
 36 | 	ld.param.u64 	%rd4, [_Z10air_updatePdPKdS1_iiPb_param_5]; // in_mask
 37 | 	mov.u32 	%r5, %ntid.x;
 38 | 	mov.u32 	%r6, %ctaid.x;
 39 | 	mov.u32 	%r7, %tid.x;
 40 | 	mad.lo.s32 	%r1, %r5, %r6, %r7; // ix = ntid.x*ctaid.x + tid.x
 41 | 	mov.u32 	%r8, %ntid.y;
 42 | 	mov.u32 	%r9, %ctaid.y;
 43 | 	mov.u32 	%r10, %tid.y;
 44 | 	mad.lo.s32 	%r2, %r8, %r9, %r10; // iy = ntid.y*ctaid.y + tid.y
 45 | 	setp.lt.s32	%p1, %r1, 1; // ix < 1
 46 | 	add.s32 	%r11, %r3, -1;
 47 | 	setp.ge.s32	%p2, %r1, %r11; // ix >= Nx-1
 48 | 	or.pred  	%p3, %p1, %p2;
 49 | 	setp.lt.s32	%p4, %r2, 1; // iy < 1
 50 | 	or.pred  	%p5, %p3, %p4;
 51 | 	add.s32 	%r12, %r4, -1;
 52 | 	setp.ge.s32	%p6, %r2, %r12; // iy >= Ny-1
 53 | 	or.pred  	%p7, %p5, %p6;
 54 | 	@%p7 bra 	BB0_2; // any of the four tests true: this thread updates nothing
 55 | 
 56 | 	cvta.to.global.u64 	%rd5, %rd2;
 57 | 	cvta.to.global.u64 	%rd6, %rd3;
 58 | 	mad.lo.s32 	%r13, %r2, %r3, %r1; // ii = iy*Nx + ix
 59 | 	add.s32 	%r14, %r13, -1;
 60 | 	mul.wide.s32 	%rd7, %r14, 8;
 61 | 	add.s64 	%rd8, %rd5, %rd7; // address of u1[ii-1]
 62 | 	ld.global.nc.f64 	%fd1, [%rd8+16]; // u1[ii+1]
 63 | 	ld.global.nc.f64 	%fd2, [%rd8]; // u1[ii-1]
 64 | 	add.f64 	%fd3, %fd2, %fd1;
 65 | 	sub.s32 	%r15, %r13, %r3;
 66 | 	mul.wide.s32 	%rd9, %r15, 8;
 67 | 	add.s64 	%rd10, %rd5, %rd9;
 68 | 	ld.global.nc.f64 	%fd4, [%rd10]; // u1[ii-Nx]
 69 | 	add.f64 	%fd5, %fd3, %fd4;
 70 | 	add.s32 	%r16, %r13, %r3;
 71 | 	mul.wide.s32 	%rd11, %r16, 8;
 72 | 	add.s64 	%rd12, %rd5, %rd11;
 73 | 	ld.global.nc.f64 	%fd6, [%rd12]; // u1[ii+Nx]
 74 | 	add.f64 	%fd7, %fd5, %fd6;
 75 | 	mul.f64 	%fd8, %fd7, 0d3FE0000000000000; // 0.5 * neighbour sum
 76 | 	cvt.s64.s32	%rd13, %r13;
 77 | 	mul.wide.s32 	%rd14, %r13, 8;
 78 | 	add.s64 	%rd15, %rd6, %rd14;
 79 | 	ld.global.nc.f64 	%fd9, [%rd15]; // u2[ii]
 80 | 	sub.f64 	%fd10, %fd8, %fd9;
 81 | 	cvta.to.global.u64 	%rd16, %rd4;
 82 | 	add.s64 	%rd17, %rd16, %rd13; // in_mask is indexed in bytes (no scaling of ii)
 83 | 	ld.global.s8 	%rs1, [%rd17]; // in_mask[ii]
 84 | 	cvt.rn.f64.s16	%fd11, %rs1; // mask value as a double
 85 | 	mul.f64 	%fd12, %fd10, %fd11;
 86 | 	cvta.to.global.u64 	%rd18, %rd1;
 87 | 	add.s64 	%rd19, %rd18, %rd14;
 88 | 	st.global.f64 	[%rd19], %fd12; // u0[ii] = result
 89 | 
 90 | BB0_2:
 91 | 	ret;
 92 | }
 93 | 
 94 | 	// .globl	_Z12rigid_updatePdPKdS1_iiPiS2_  -- rigid_update(u0, u1, u2, Nx, Nb, ib, Kib) of kernel_2d.cu
 95 | .visible .entry _Z12rigid_updatePdPKdS1_iiPiS2_(
 96 | 	.param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_0,
 97 | 	.param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_1,
 98 | 	.param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_2,
 99 | 	.param .u32 _Z12rigid_updatePdPKdS1_iiPiS2__param_3,
100 | 	.param .u32 _Z12rigid_updatePdPKdS1_iiPiS2__param_4,
101 | 	.param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_5,
102 | 	.param .u64 _Z12rigid_updatePdPKdS1_iiPiS2__param_6
103 | )
104 | {
105 | 	.reg .pred 	%p<2>;
106 | 	.reg .b32 	%r<12>;
107 | 	.reg .f64 	%fd<17>;
108 | 	.reg .b64 	%rd<23>;
109 | 
110 | 
111 | 	ld.param.u64 	%rd1, [_Z12rigid_updatePdPKdS1_iiPiS2__param_0]; // u0
112 | 	ld.param.u64 	%rd2, [_Z12rigid_updatePdPKdS1_iiPiS2__param_1]; // u1
113 | 	ld.param.u64 	%rd3, [_Z12rigid_updatePdPKdS1_iiPiS2__param_2]; // u2
114 | 	ld.param.u32 	%r2, [_Z12rigid_updatePdPKdS1_iiPiS2__param_3]; // Nx
115 | 	ld.param.u32 	%r3, [_Z12rigid_updatePdPKdS1_iiPiS2__param_4]; // Nb
116 | 	ld.param.u64 	%rd4, [_Z12rigid_updatePdPKdS1_iiPiS2__param_5]; // ib
117 | 	ld.param.u64 	%rd5, [_Z12rigid_updatePdPKdS1_iiPiS2__param_6]; // Kib
118 | 	mov.u32 	%r4, %ntid.x;
119 | 	mov.u32 	%r5, %ctaid.x;
120 | 	mov.u32 	%r6, %tid.x;
121 | 	mad.lo.s32 	%r1, %r4, %r5, %r6; // ix = ntid.x*ctaid.x + tid.x
122 | 	setp.ge.s32	%p1, %r1, %r3; // ix >= Nb
123 | 	@%p1 bra 	BB1_2; // no boundary node for this thread
124 | 
125 | 	cvta.to.global.u64 	%rd6, %rd2;
126 | 	cvta.to.global.u64 	%rd7, %rd4;
127 | 	mul.wide.s32 	%rd8, %r1, 4;
128 | 	add.s64 	%rd9, %rd7, %rd8;
129 | 	ld.global.u32 	%r7, [%rd9]; // ib[ix] (1-based index)
130 | 	add.s32 	%r8, %r7, -1; // ii = ib[ix]-1
131 | 	cvta.to.global.u64 	%rd10, %rd5;
132 | 	add.s64 	%rd11, %rd10, %rd8;
133 | 	ld.global.u32 	%r9, [%rd11]; // Kib[ix]
134 | 	cvt.rn.f64.s32	%fd1, %r9;
135 | 	mul.f64 	%fd2, %fd1, 0d3FE0000000000000; // 0.5*Kib[ix]
136 | 	mov.f64 	%fd3, 0d4000000000000000; // 2.0
137 | 	sub.f64 	%fd4, %fd3, %fd2; // 2 - 0.5*Kib[ix]
138 | 	mul.wide.s32 	%rd12, %r7, 8;
139 | 	add.s64 	%rd13, %rd6, %rd12; // address of u1[ii+1], since ib[ix] = ii+1
140 | 	ld.global.nc.f64 	%fd5, [%rd13+-8]; // u1[ii]
141 | 	ld.global.nc.f64 	%fd6, [%rd13]; // u1[ii+1]
142 | 	ld.global.nc.f64 	%fd7, [%rd13+-16]; // u1[ii-1]
143 | 	add.f64 	%fd8, %fd7, %fd6;
144 | 	sub.s32 	%r10, %r8, %r2;
145 | 	mul.wide.s32 	%rd14, %r10, 8;
146 | 	add.s64 	%rd15, %rd6, %rd14;
147 | 	ld.global.nc.f64 	%fd9, [%rd15]; // u1[ii-Nx]
148 | 	add.f64 	%fd10, %fd8, %fd9;
149 | 	add.s32 	%r11, %r8, %r2;
150 | 	mul.wide.s32 	%rd16, %r11, 8;
151 | 	add.s64 	%rd17, %rd6, %rd16;
152 | 	ld.global.nc.f64 	%fd11, [%rd17]; // u1[ii+Nx]
153 | 	add.f64 	%fd12, %fd10, %fd11;
154 | 	mul.f64 	%fd13, %fd12, 0d3FE0000000000000; // 0.5 * neighbour sum
155 | 	fma.rn.f64 	%fd14, %fd5, %fd4, %fd13; // u1[ii]*(2-0.5*Kib[ix]) + 0.5*sum
156 | 	cvta.to.global.u64 	%rd18, %rd3;
157 | 	mul.wide.s32 	%rd19, %r8, 8;
158 | 	add.s64 	%rd20, %rd18, %rd19;
159 | 	ld.global.nc.f64 	%fd15, [%rd20]; // u2[ii]
160 | 	sub.f64 	%fd16, %fd14, %fd15;
161 | 	cvta.to.global.u64 	%rd21, %rd1;
162 | 	add.s64 	%rd22, %rd21, %rd19;
163 | 	st.global.f64 	[%rd22], %fd16; // u0[ii] = result
164 | 
165 | BB1_2:
166 | 	ret;
167 | }
168 | 
169 | 	// .globl	_Z10apply_lossPdPKdiiPiS2_d  -- apply_loss(u0, u2, Nx, Nb, ib, Kib, lf) of kernel_2d.cu
170 | .visible .entry _Z10apply_lossPdPKdiiPiS2_d(
171 | 	.param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_0,
172 | 	.param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_1,
173 | 	.param .u32 _Z10apply_lossPdPKdiiPiS2_d_param_2,
174 | 	.param .u32 _Z10apply_lossPdPKdiiPiS2_d_param_3,
175 | 	.param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_4,
176 | 	.param .u64 _Z10apply_lossPdPKdiiPiS2_d_param_5,
177 | 	.param .f64 _Z10apply_lossPdPKdiiPiS2_d_param_6
178 | )
179 | {
180 | 	.reg .pred 	%p<2>;
181 | 	.reg .b32 	%r<11>;
182 | 	.reg .f64 	%fd<9>;
183 | 	.reg .b64 	%rd<15>;
184 | 
185 | // param_2 (Nx) is never loaded in this entry
186 | 	ld.param.u64 	%rd1, [_Z10apply_lossPdPKdiiPiS2_d_param_0]; // u0
187 | 	ld.param.u64 	%rd2, [_Z10apply_lossPdPKdiiPiS2_d_param_1]; // u2
188 | 	ld.param.u32 	%r2, [_Z10apply_lossPdPKdiiPiS2_d_param_3]; // Nb
189 | 	ld.param.u64 	%rd3, [_Z10apply_lossPdPKdiiPiS2_d_param_4]; // ib
190 | 	ld.param.u64 	%rd4, [_Z10apply_lossPdPKdiiPiS2_d_param_5]; // Kib
191 | 	ld.param.f64 	%fd1, [_Z10apply_lossPdPKdiiPiS2_d_param_6]; // lf
192 | 	mov.u32 	%r3, %ctaid.x;
193 | 	mov.u32 	%r4, %ntid.x;
194 | 	mov.u32 	%r5, %tid.x;
195 | 	mad.lo.s32 	%r1, %r4, %r3, %r5; // ix = ntid.x*ctaid.x + tid.x
196 | 	setp.ge.s32	%p1, %r1, %r2; // ix >= Nb
197 | 	@%p1 bra 	BB2_2; // no boundary node for this thread
198 | 
199 | 	cvta.to.global.u64 	%rd5, %rd1;
200 | 	cvta.to.global.u64 	%rd6, %rd3;
201 | 	mul.wide.s32 	%rd7, %r1, 4;
202 | 	add.s64 	%rd8, %rd6, %rd7;
203 | 	ld.global.u32 	%r6, [%rd8]; // ib[ix] (1-based index)
204 | 	add.s32 	%r7, %r6, -1; // ii = ib[ix]-1
205 | 	mul.wide.s32 	%rd9, %r7, 8;
206 | 	add.s64 	%rd10, %rd5, %rd9; // address of u0[ii]
207 | 	cvta.to.global.u64 	%rd11, %rd4;
208 | 	add.s64 	%rd12, %rd11, %rd7;
209 | 	ld.global.u32 	%r8, [%rd12]; // Kib[ix]
210 | 	mov.u32 	%r9, 4;
211 | 	sub.s32 	%r10, %r9, %r8; // 4 - Kib[ix]
212 | 	cvt.rn.f64.s32	%fd2, %r10;
213 | 	mul.f64 	%fd3, %fd2, %fd1; // lf*(4-Kib[ix])
214 | 	cvta.to.global.u64 	%rd13, %rd2;
215 | 	add.s64 	%rd14, %rd13, %rd9;
216 | 	ld.global.nc.f64 	%fd4, [%rd14]; // u2[ii]
217 | 	ld.global.f64 	%fd5, [%rd10]; // u0[ii]
218 | 	fma.rn.f64 	%fd6, %fd4, %fd3, %fd5; // u0[ii] + lf*(4-Kib[ix])*u2[ii]
219 | 	add.f64 	%fd7, %fd3, 0d3FF0000000000000; // 1.0 + lf*(4-Kib[ix])
220 | 	div.rn.f64 	%fd8, %fd6, %fd7;
221 | 	st.global.f64 	[%rd10], %fd8; // u0[ii] = quotient
222 | 
223 | BB2_2:
224 | 	ret;
225 | }
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------