├── code ├── ch01 │ ├── precision_m.cuf │ ├── precision2_m.cuf │ ├── version.cuf │ ├── managedCUF.cuf │ ├── multidimCUF.cuf │ ├── managedCUF2.f90 │ ├── increment.f90 │ ├── increment.cuf │ ├── multiblock.cuf │ ├── pciBusID.cuf │ ├── multidim.cuf │ ├── managed.cuf │ ├── managedImplicit.cuf │ ├── asyncError.cuf │ ├── syncError.cuf │ ├── errorHandling.cuf │ ├── explicitInterface.cuf │ ├── Makefile │ └── deviceQuery.cuf ├── ch10 │ ├── precision_m.cuf │ └── Makefile ├── ch02 │ ├── fma.cuf │ ├── Makefile │ ├── accuracy.cuf │ ├── debug.cuf │ ├── initcheck.cuf │ ├── memcheck.cuf │ └── print.cuf ├── ch05 │ ├── fma.cuf │ ├── sliceTransfer.cuf │ ├── cufILP.cuf │ ├── constantAttribute.cuf │ ├── constant.cuf │ ├── assumedShapeSize.cuf │ ├── spill.cuf │ ├── loads.cuf │ ├── maxSharedMemory.cuf │ ├── local.cuf │ ├── Makefile │ ├── offset.cuf │ ├── stride.cuf │ ├── async.cuf │ └── HDtransfer.cuf ├── ch07 │ ├── c.cu │ ├── testSort.cuf │ ├── cudaforEx.cuf │ ├── thrust.cu │ ├── thrust.cuf │ ├── matmulTranspose.cuf │ ├── sgemmNew.cuf │ ├── sgemmLegacy.cuf │ ├── callingC.cuf │ ├── Makefile │ ├── getrfBatched.cuf │ ├── matmulTC.cuf │ ├── cusparseMV_Ex.cuf │ ├── potr.cuf │ ├── gemmPerf.cuf │ ├── cusparseMV.cuf │ ├── cusparseEx.cuf │ └── cutensorContraction.cuf ├── ch11 │ ├── precision_m.F90 │ ├── Makefile │ ├── fft_test_c2c.cuf │ ├── fft_test_r2c.cuf │ └── fft_derivative.cuf ├── ch08 │ ├── timing.f90 │ ├── wallclock.c │ ├── mpiDeviceUtil.cuf │ ├── minimal.cuf │ ├── Makefile │ ├── Makefile~ │ ├── p2pAccess.cuf │ ├── mpiDevices.cuf │ ├── assignDevice.cuf │ ├── p2pBandwidth.cuf │ └── directTransfer.cuf ├── ch06 │ ├── portingBase.f90 │ ├── portingManaged.cuf │ ├── portingDevice.cuf │ ├── portingManagedSent.f90 │ ├── portingAssociate.f90 │ ├── portingManaged_CUDA.F90 │ ├── portingDeviceSent.F90 │ ├── portingDevice_CUDA.F90 │ ├── union_m.cuf │ ├── testCompact.cuf │ ├── test_union.cuf │ ├── Makefile │ ├── laplace2D.f90 │ ├── laplace2DAssoc.f90 │ └── laplace2DUse.F90 ├── ch03 │ ├── nvtxAuto.cuf │ 
├── nvtxBasic.cuf │ ├── nvtxAdv2.cuf │ ├── nvtxAdv.cuf │ ├── peakBandwidth.cuf │ ├── Makefile │ ├── multidim.cuf │ ├── events.cuf │ ├── limitingFactor.cuf │ └── effectiveBandwidth.cuf ├── ch09 │ ├── precision_m.F90 │ ├── ieee_accuracy.f90 │ ├── shflExample.cuf │ ├── pi_lock.cuf │ ├── generate_randomnumbers.cuf │ ├── Makefile │ ├── pi_shared.cuf │ ├── accuracy_sum.f90 │ ├── compute_pi.cuf │ ├── testPiGridGroup.cuf │ ├── pi_shfl.cuf │ ├── pi_gridGroup.cuf │ └── montecarlo_european_option.cuf ├── ch04 │ ├── raceAndAtomic.cuf │ ├── ballot.cuf │ ├── concurrentKernels.cuf │ ├── shfl.cuf │ ├── differentStreamTypes.cuf │ ├── syncthreads.cuf │ ├── pipeline.cuf │ ├── eventSync.cuf │ ├── defaultStream.cuf │ ├── cgReverse.cuf │ ├── streamSync.cuf │ ├── defaultStreamVar.cuf │ ├── raceAndAtomicShared.cuf │ ├── defaultStreamVarExplicit.cuf │ ├── smooth.cuf │ ├── twoKernels.cuf │ ├── sharedExample.cuf │ ├── sharedMultiple.cuf │ ├── Makefile │ ├── threadfence.cuf │ └── swap.cuf └── ch12 │ ├── ppmExample.f90 │ ├── Makefile │ ├── ray.F90 │ ├── sphere.F90 │ ├── normal.F90 │ └── rgb_m.F90 └── LICENSE.txt /code/ch01/precision_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind = real64 4 | end module precision_m 5 | -------------------------------------------------------------------------------- /code/ch10/precision_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind = real32 4 | end module precision_m 5 | -------------------------------------------------------------------------------- /code/ch02/fma.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a, b, c) 4 | real :: a, b, c 5 | a = a+b*c 6 | end subroutine s 7 | end module m 8 | 
-------------------------------------------------------------------------------- /code/ch05/fma.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine k(a, b, c) 4 | real :: a, b, c 5 | c = a*b+c 6 | end subroutine k 7 | end module m 8 | -------------------------------------------------------------------------------- /code/ch07/c.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void Ckernel(float *a, float b) 2 | { 3 | a[threadIdx.x] = b; 4 | } 5 | 6 | extern "C" __device__ float Cdevicefun(float a) 7 | { 8 | return 2*a; 9 | } 10 | -------------------------------------------------------------------------------- /code/ch01/precision2_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | 4 | #ifdef DOUBLE 5 | integer, parameter :: fp_kind = real64 6 | #else 7 | integer, parameter :: fp_kind = real32 8 | #endif 9 | end module precision_m 10 | -------------------------------------------------------------------------------- /code/ch11/precision_m.F90: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | 4 | #ifdef DOUBLE 5 | integer, parameter :: fp_kind = real64 6 | #else 7 | integer, parameter :: fp_kind = real32 8 | #endif 9 | end module precision_m 10 | 11 | -------------------------------------------------------------------------------- /code/ch05/sliceTransfer.cuf: -------------------------------------------------------------------------------- 1 | program sliceTransfer 2 | implicit none 3 | integer, parameter :: nx=128, ny=128, nz=128 4 | real :: a(nx,ny,nz) 5 | real, device :: a_d(nx,ny,nz) 6 | 7 | a = 0.0 8 | a_d(2:nx-1,:,:) = a(2:nx-1,:,:) 9 | 10 | end program sliceTransfer 11 | 
-------------------------------------------------------------------------------- /code/ch08/timing.f90: -------------------------------------------------------------------------------- 1 | module timing 2 | interface wallclock 3 | function wallclock() result(res) bind(C, name='wallclock') 4 | use iso_c_binding 5 | real (c_double) :: res 6 | end function wallclock 7 | end interface wallclock 8 | end module timing 9 | 10 | -------------------------------------------------------------------------------- /code/ch01/version.cuf: -------------------------------------------------------------------------------- 1 | program version 2 | use cudafor 3 | implicit none 4 | integer :: istat, ver 5 | istat = cudaDriverGetVersion(ver) 6 | print *, 'Driver version: ', ver 7 | istat = cudaRuntimeGetVersion(ver) 8 | print *, 'Runtime version: ', ver 9 | end program version 10 | -------------------------------------------------------------------------------- /code/ch06/portingBase.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | integer :: i 6 | 7 | do i = 1, n 8 | a(i) = i+1 9 | enddo 10 | 11 | do i = 1, n 12 | b(i) = a(i)+1 13 | enddo 14 | 15 | print *, a 16 | print *, b 17 | end program main 18 | 19 | 20 | -------------------------------------------------------------------------------- /code/ch08/wallclock.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <time.h> 4 | #include <sys/time.h> 5 | 6 | double wallclock() 7 | { 8 | struct timeval tv; 9 | struct timezone tz; 10 | double t; 11 | 12 | gettimeofday(&tv, &tz); 13 | 14 | t = (double)tv.tv_sec; 15 | t += ((double)tv.tv_usec)/1000000.0; 16 | 17 | return t; 18 | } 19 | -------------------------------------------------------------------------------- /code/ch03/nvtxAuto.cuf: --------------------------------------------------------------------------------
1 | module m 2 | contains 3 | subroutine s1 4 | call s2 5 | call s2 6 | end subroutine s1 7 | 8 | subroutine s2 9 | call sleep(1) 10 | end subroutine s2 11 | end module m 12 | 13 | program main 14 | use m 15 | implicit none 16 | integer :: n 17 | 18 | do n = 1, 4 19 | call s1 20 | enddo 21 | 22 | end program main 23 | -------------------------------------------------------------------------------- /code/ch03/nvtxBasic.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | character(len=4) :: nchar 5 | integer :: n 6 | 7 | call nvtxStartRange("Outer Label") 8 | 9 | do n = 1, 4 10 | write(nchar, '(i4)') n 11 | call nvtxStartRange('Label '//nchar,n) 12 | call sleep(1) 13 | call nvtxEndRange() 14 | enddo 15 | 16 | call nvtxEndRange() 17 | end program main 18 | -------------------------------------------------------------------------------- /code/ch09/precision_m.F90: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | use iso_fortran_env, only: real32, real64 4 | integer, parameter :: singlePrecision = real32 5 | integer, parameter :: doublePrecision = real64 6 | 7 | #ifdef DOUBLE 8 | integer, parameter :: fp_kind = doublePrecision 9 | #else 10 | integer, parameter :: fp_kind = singlePrecision 11 | #endif 12 | 13 | end module precision_m 14 | 15 | -------------------------------------------------------------------------------- /code/ch03/nvtxAdv2.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | integer, parameter :: n=4 5 | type(nvtxRangeId) :: id(n) 6 | character(len=4) :: ichar 7 | integer :: i 8 | 9 | do i=1, n 10 | write(ichar,'(i4)') i 11 | id(i) = nvtxRangeStart('Label '//ichar) 12 | call sleep(1) 13 | if (i>1) call nvtxRangeEnd(id(i-1)) 14 | enddo 15 | 16 | call nvtxRangeEnd(id(n)) 17 | end program 
main 18 | -------------------------------------------------------------------------------- /code/ch03/nvtxAdv.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | type(nvtxRangeId) :: id1, id2 5 | character(len=4) :: nchar 6 | integer :: n 7 | 8 | id1 = nvtxRangeStart("Outer label") 9 | 10 | do n = 1, 4 11 | write(nchar, '(i4)') n 12 | id2 = nvtxRangeStart('Label '//nchar) 13 | call sleep(1) 14 | call nvtxRangeEnd(id2) 15 | enddo 16 | 17 | call nvtxRangeEnd(id1) 18 | end program main 19 | -------------------------------------------------------------------------------- /code/ch07/testSort.cuf: -------------------------------------------------------------------------------- 1 | program testsort 2 | use thrust 3 | implicit none 4 | integer, parameter :: n = 10 5 | real :: cpuData(n) 6 | real, device :: gpuData(n) 7 | 8 | call random_number(cpuData) 9 | cpuData(5)=100. 10 | 11 | print *, "Before sorting", cpuData 12 | 13 | gpuData=cpuData 14 | call thrustsort(gpuData, size(gpuData)) 15 | cpuData=gpuData 16 | 17 | print *, "After sorting", cpuData 18 | end program testsort 19 | -------------------------------------------------------------------------------- /code/ch06/portingManaged.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=8 5 | real, managed :: a(n), b(n) 6 | integer :: i 7 | 8 | !$cuf kernel do <<<*,*>>> 9 | do i = 1, n 10 | a(i) = i+1 11 | enddo 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | b(i) = a(i)+1 16 | enddo 17 | 18 | i = cudaDeviceSynchronize() 19 | 20 | print *, a 21 | print *, b 22 | end program main 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/ch06/portingDevice.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 
| integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | real, device :: a_d(n), b_d(n) 6 | integer :: i 7 | 8 | !$cuf kernel do <<<*,*>>> 9 | do i = 1, n 10 | a_d(i) = i+1 11 | enddo 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | b_d(i) = a_d(i)+1 16 | enddo 17 | 18 | a = a_d; b = b_d 19 | 20 | print *, a 21 | print *, b 22 | end program main 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/ch05/cufILP.cuf: -------------------------------------------------------------------------------- 1 | program ilp 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | !$cuf kernel do <<<1024,tPB>>> 14 | do i = 1, n 15 | a_d(i) = a_d(i) + b 16 | enddo 17 | 18 | a = a_d 19 | 20 | if (any(a /= 4)) then 21 | write(*,*) '**** Program Failed ****' 22 | else 23 | write(*,*) 'Program Passed' 24 | endif 25 | end program ilp 26 | 27 | -------------------------------------------------------------------------------- /code/ch06/portingManagedSent.f90: -------------------------------------------------------------------------------- 1 | program main 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: n=8 5 | real :: a(n), b(n) 6 | !@cuf attributes(managed) :: a, b 7 | integer :: i 8 | 9 | !$cuf kernel do <<<*,*>>> 10 | do i = 1, n 11 | a(i) = i+1 12 | enddo 13 | 14 | !$cuf kernel do <<<*,*>>> 15 | do i = 1, n 16 | b(i) = a(i)+1 17 | enddo 18 | 19 | !@cuf i = cudaDeviceSynchronize() 20 | !@cuf print *, 'GPU version' 21 | 22 | print *, a 23 | print *, b 24 | end program main 25 | 26 | 27 | -------------------------------------------------------------------------------- /code/ch01/managedCUF.cuf: -------------------------------------------------------------------------------- 1 | program managedCUF 2 | use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 
| integer, managed :: a(nx,ny) 6 | integer :: b, i, j 7 | 8 | a = 1 9 | b = 3 10 | 11 | !$cuf kernel do (2) <<<*,*>>> 12 | do j = 1, ny 13 | do i = 1, nx 14 | a(i,j) = a(i,j) + b 15 | enddo 16 | enddo 17 | i = cudaDeviceSynchronize() 18 | 19 | if (any(a /= 4)) then 20 | print *, '**** Program Failed ****' 21 | else 22 | print *, 'Program Passed' 23 | endif 24 | end program managedCUF 25 | -------------------------------------------------------------------------------- /code/ch01/multidimCUF.cuf: -------------------------------------------------------------------------------- 1 | program multidimCUF 2 | use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 | integer :: a(nx,ny), b, i, j 6 | integer, device :: a_d(nx,ny) 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | !$cuf kernel do (2) <<<*,*>>> 13 | do j = 1, ny 14 | do i = 1, nx 15 | a_d(i,j) = a_d(i,j) + b 16 | enddo 17 | enddo 18 | a = a_d 19 | 20 | if (any(a /= 4)) then 21 | print *, '**** Program Failed ****' 22 | else 23 | print *, 'Program Passed' 24 | endif 25 | end program multidimCUF 26 | -------------------------------------------------------------------------------- /code/ch06/portingAssociate.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | 5 | real :: a(n), b(n) 6 | !@cuf real, device :: a_d(n), b_d(n) 7 | integer :: i 8 | 9 | !@cuf associate(a => a_d) 10 | !$cuf kernel do 11 | do i = 1, n 12 | a(i) = i+1 13 | enddo 14 | !@cuf associate(b => b_d) 15 | !$cuf kernel do 16 | do i = 1, n 17 | b(i) = a(i)+1 18 | enddo 19 | !@cuf end associate ! b ... 20 | !@cuf end associate ! a ... 
21 | !@cuf a = a_d; b = b_d 22 | 23 | !@cuf print *, 'GPU run' 24 | print *, a 25 | print *, b 26 | end program main 27 | 28 | 29 | -------------------------------------------------------------------------------- /code/ch06/portingManaged_CUDA.F90: -------------------------------------------------------------------------------- 1 | program main 2 | #ifdef _CUDA 3 | use cudafor 4 | #endif 5 | implicit none 6 | integer, parameter :: n=8 7 | real :: a(n), b(n) 8 | #ifdef _CUDA 9 | attributes(managed) :: a, b 10 | #endif 11 | integer :: i 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | a(i) = i+1 16 | enddo 17 | 18 | !$cuf kernel do <<<*,*>>> 19 | do i = 1, n 20 | b(i) = a(i)+1 21 | enddo 22 | 23 | #ifdef _CUDA 24 | i = cudaDeviceSynchronize() 25 | print *, 'GPU version' 26 | #endif 27 | 28 | print *, a 29 | print *, b 30 | end program main 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/ch02/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = accuracy fma print debug memcheck initcheck 2 | 3 | # section 2.1.1 4 | 5 | accuracy: accuracy.cuf 6 | nvfortran -o $@ $< 7 | 8 | # section 2.1.2 9 | 10 | fma: fma.cuf 11 | nvfortran -c -gpu=keep $< 12 | 13 | # section 2.2.1 14 | 15 | print: print.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 2.2.2 19 | 20 | debug: debug.cuf 21 | nvfortran -o $@ -g -gpu=nordc $< 22 | 23 | # section 2.2.3 24 | 25 | memcheck: memcheck.cuf 26 | nvfortran -o $@ $< 27 | 28 | initcheck: initcheck.cuf 29 | nvfortran -o $@ $< 30 | 31 | clean: 32 | rm -rf $(OBJS) *.o *.mod *~ *.ptx *.gpu *.bin *.fat 33 | -------------------------------------------------------------------------------- /code/ch06/portingDeviceSent.F90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | !@cuf attributes(device) :: a, b 6 | !@cuf real :: 
a_h(n), b_h(n) 7 | integer :: i 8 | 9 | !$cuf kernel do <<<*,*>>> 10 | do i = 1, n 11 | a(i) = i+1 12 | enddo 13 | 14 | !$cuf kernel do <<<*,*>>> 15 | do i = 1, n 16 | b(i) = a(i)+1 17 | enddo 18 | 19 | !@cuf a_h = a; b_h = b 20 | !@cuf print *, 'GPU version' 21 | 22 | #ifdef _CUDA 23 | print *, a_h 24 | print *, b_h 25 | #else 26 | print *, a 27 | print *, b 28 | #endif 29 | 30 | end program main 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/ch06/portingDevice_CUDA.F90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | #ifdef _CUDA 6 | attributes(device) :: a, b 7 | real :: a_h(n), b_h(n) 8 | #endif 9 | integer :: i 10 | 11 | !$cuf kernel do <<<*,*>>> 12 | do i = 1, n 13 | a(i) = i+1 14 | enddo 15 | 16 | !$cuf kernel do <<<*,*>>> 17 | do i = 1, n 18 | b(i) = a(i)+1 19 | enddo 20 | 21 | #ifdef _CUDA 22 | a_h = a; b_h = b 23 | print *, 'GPU version' 24 | print *, a_h 25 | print *, b_h 26 | #else 27 | print *, a 28 | print *, b 29 | #endif 30 | 31 | end program main 32 | 33 | 34 | -------------------------------------------------------------------------------- /code/ch01/managedCUF2.f90: -------------------------------------------------------------------------------- 1 | program managedCUF 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 | integer :: a(nx,ny) 6 | !@cuf attributes(managed):: a 7 | integer :: b, i ,j 8 | 9 | a = 1 10 | b = 3 11 | !$cuf kernel do (2) <<<*,*>>> 12 | do j=1,ny 13 | do i=1,nx 14 | a(i,j)=a(i,j)+b 15 | end do 16 | end do 17 | !@cuf i=cudaDeviceSynchronize() 18 | 19 | !@cuf print *, "Running CUDA version ..." 
20 | if(any(a /= 4)) then 21 | print *, "**** Program Failed ****" 22 | else 23 | print *, "Program Passed" 24 | end if 25 | end program managedCUF 26 | -------------------------------------------------------------------------------- /code/ch05/constantAttribute.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, constant :: b 3 | contains 4 | attributes(global) subroutine increment(a) 5 | implicit none 6 | integer, intent(inout) :: a(*) 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program constantAttribute 17 | use cudafor 18 | use m 19 | implicit none 20 | type(cudaFuncAttributes) :: attr 21 | integer :: istat 22 | 23 | istat = cudaFuncGetAttributes(attr, increment) 24 | print "('Constant memory used (bytes): ', i0)", attr%constSizeBytes 25 | end program constantAttribute 26 | -------------------------------------------------------------------------------- /code/ch10/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = finiteDifference finiteDifferenceStr limitingFactor laplace2D 2 | 3 | # section 10.1.* 4 | 5 | finiteDifference: precision_m.cuf derivative_m.cuf finiteDifference.cuf 6 | nvfortran -o $@ -O3 -gpu=ptxinfo $^ 7 | 8 | # section 10.1.2 9 | 10 | limitingFactor: precision_m.cuf limitingFactor.cuf 11 | nvfortran -o $@ -O3 $^ 12 | 13 | # section 10.1.4 14 | 15 | finiteDifferenceStr: precision_m.cuf derivativeStr_m.cuf finiteDifferenceStr.cuf 16 | nvfortran -o $@ -O3 $^ 17 | 18 | # section 10.2 19 | 20 | laplace2D: laplace2D.cuf 21 | nvfortran -o $@ -O3 $^ 22 | 23 | 24 | clean: 25 | rm -rf $(OBJS) *.o *.mod *~ 26 | -------------------------------------------------------------------------------- /code/ch08/mpiDeviceUtil.cuf: -------------------------------------------------------------------------------- 1 | module mpiDeviceUtil 2 | contains 3 | ! 
assign a different GPU to each MPI rank 4 | ! note: all the memory allocations should be dynamic, 5 | ! otherwise all the arrays will be allocated on device 0 6 | subroutine assignDevice(dev) 7 | use mpi 8 | use cudafor 9 | implicit none 10 | integer :: dev 11 | integer :: local_comm, ierr 12 | 13 | dev=0 14 | call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, & 15 | MPI_INFO_NULL, local_comm, ierr) 16 | call MPI_Comm_rank(local_comm, dev, ierr) 17 | ierr = cudaSetDevice(dev) 18 | end subroutine assignDevice 19 | end module mpiDeviceUtil 20 | 21 | -------------------------------------------------------------------------------- /code/ch01/increment.f90: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(:) 6 | integer, intent(in) :: b 7 | integer :: i, n 8 | 9 | n = size(a) 10 | do i = 1, n 11 | a(i) = a(i)+b 12 | enddo 13 | 14 | end subroutine increment 15 | end module m 16 | 17 | 18 | program incrementCPU 19 | use m 20 | implicit none 21 | integer, parameter :: n = 256 22 | integer :: a(n), b 23 | 24 | a = 1 25 | b = 3 26 | call increment(a, b) 27 | 28 | if (any(a /= 4)) then 29 | print *, '**** Program Failed ****' 30 | else 31 | print *, 'Program Passed' 32 | endif 33 | end program incrementCPU 34 | -------------------------------------------------------------------------------- /code/ch09/ieee_accuracy.f90: -------------------------------------------------------------------------------- 1 | program test_accuracy 2 | real :: x, y, dist 3 | double precision:: x_dp, y_dp, dist_dp 4 | 5 | x=Z'3F1DC57A' 6 | y=Z'3F499AA3' 7 | dist= x**2 +y**2 8 | 9 | x_dp=real(x,8) 10 | y_dp=real(y,8) 11 | dist_dp= x_dp**2 +y_dp**2 12 | 13 | print *, 'Result with operands in single precision:' 14 | print '((2x,z8)) ', dist 15 | 16 | print *, 'Result in double precision with operands' 17 | print *, 'promoted to double precision:' 18 | 
print '((2x,z16))', dist_dp 19 | 20 | print *, 'Result in single precision with operands' 21 | print *, 'promoted to double precision:' 22 | print '((2x,z8))', real(dist_dp,4) 23 | 24 | end program test_accuracy 25 | -------------------------------------------------------------------------------- /code/ch02/accuracy.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=1000000 5 | integer(8), parameter :: n8 = n 6 | 7 | real :: suminc = 0.0, sumdec = 0.0, a(n) 8 | real, device :: a_d(n) 9 | integer :: i 10 | 11 | do i = 1, n 12 | a(i) = i 13 | suminc = suminc + i 14 | enddo 15 | 16 | do i = n, 1, -1 17 | sumdec = sumdec + i 18 | end do 19 | 20 | print *, 'n: ', n 21 | print *,'n*(n+1)/2: ', n8/2*(n8+1) 22 | 23 | print *, 'from sum(a): ', sum(a) 24 | print *, 'incr accumulation: ', suminc 25 | print *, 'decr accumulation: ', sumdec 26 | a_d = a 27 | print *, 'from sum(a_d): ', sum(a_d) 28 | end program main 29 | -------------------------------------------------------------------------------- /code/ch07/cudaforEx.cuf: -------------------------------------------------------------------------------- 1 | module cudaforEx 2 | use cudafor 3 | 4 | contains 5 | 6 | function cudaforGetDataType(x) result(res) 7 | use cudafor 8 | !dir$ ignore_tkr (rd) x 9 | class(*) :: x 10 | integer :: res 11 | select type (x) 12 | type is (real(2)) 13 | res = CUDA_R_16F 14 | type is (real(4)) 15 | res = CUDA_R_32F 16 | type is (real(8)) 17 | res = CUDA_R_64F 18 | type is (integer(4)) 19 | res = CUDA_R_32I 20 | type is (complex(4)) 21 | res = CUDA_C_32F 22 | type is (complex(8)) 23 | res = CUDA_C_64F 24 | class default 25 | res = -1 26 | end select 27 | end function cudaforGetDataType 28 | 29 | end module cudaforEx 30 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | SPDX-License-Identifier: Apache-2.0 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use the files in this directory except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | -------------------------------------------------------------------------------- /code/ch05/constant.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, constant :: b 3 | contains 4 | attributes(global) subroutine increment(a) 5 | implicit none 6 | integer, intent(inout) :: a(*) 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program incrementGPU 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n) 22 | integer, device :: a_d(n) 23 | 24 | a = 1 25 | b = 3 26 | 27 | a_d = a 28 | call increment<<<1,n>>>(a_d) 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | print *, '**** Program Failed ****' 33 | else 34 | print *, 'Program Passed' 35 | endif 36 | end program incrementGPU 37 | -------------------------------------------------------------------------------- /code/ch01/increment.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) 
:: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program incrementGPU 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | 24 | a = 1 25 | b = 3 26 | 27 | a_d = a 28 | call increment<<<1,n>>>(a_d, b) 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | print *, '**** Program Failed ****' 33 | else 34 | print *, 'Program Passed' 35 | endif 36 | end program incrementGPU 37 | -------------------------------------------------------------------------------- /code/ch07/thrust.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/device_vector.h> 2 | #include <thrust/sort.h> 3 | 4 | 5 | extern "C" { 6 | void sort_int_wrapper(int *data, int N) 7 | { 8 | // C ptr to device_vector container 9 | thrust::device_ptr<int> dev_ptr(data); 10 | // Use device_ptr in Thrust sort algorithm 11 | thrust::sort(dev_ptr, dev_ptr+N); 12 | } 13 | 14 | //Sort for float arrays 15 | void sort_float_wrapper(float *data, int N) 16 | { 17 | thrust::device_ptr<float> dev_ptr(data); 18 | thrust::sort(dev_ptr, dev_ptr+N); 19 | } 20 | 21 | //Sort for double arrays 22 | void sort_double_wrapper(double *data, int N) 23 | { 24 | thrust::device_ptr<double> dev_ptr(data); 25 | thrust::sort(dev_ptr, dev_ptr+N); 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /code/ch04/raceAndAtomic.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(raceCount, atomicCount) 4 | implicit none 5 | integer :: raceCount, atomicCount 6 | integer :: tmp 7 | raceCount = raceCount + 1 8 | tmp = atomicAdd(atomicCount, 1) 9 | end subroutine s1 10 | end module m 11 | 12 | program raceAndAtomic 13 | use m 14 | implicit none 15 | integer, parameter :: nBlocks = 256, nThreads = 256 16 | integer, device
:: raceCount_d, atomicCount_d 17 | integer :: raceCount, atomicCount 18 | 19 | raceCount_d = 0 20 | atomicCount_d = 0 21 | call s1<<<nBlocks,nThreads>>>(raceCount_d, atomicCount_d) 22 | raceCount = raceCount_d 23 | atomicCount = atomicCount_d 24 | print *, nBlocks*nThreads, raceCount, atomicCount 25 | end program raceAndAtomic 26 | -------------------------------------------------------------------------------- /code/ch07/thrust.cuf: -------------------------------------------------------------------------------- 1 | module thrust 2 | 3 | interface thrustsort 4 | subroutine sort_int(input, N) bind(C, name="sort_int_wrapper") 5 | use iso_c_binding 6 | integer(c_int), device :: input(*) 7 | integer(c_int), value :: N 8 | end subroutine sort_int 9 | 10 | subroutine sort_float(input, N) bind(C, name="sort_float_wrapper") 11 | use iso_c_binding 12 | real(c_float), device :: input(*) 13 | integer(c_int), value :: N 14 | end subroutine sort_float 15 | 16 | subroutine sort_double(input, N) bind(C, name="sort_double_wrapper") 17 | use iso_c_binding 18 | real(c_double), device :: input(*) 19 | integer(c_int), value :: N 20 | end subroutine sort_double 21 | end interface thrustsort 22 | 23 | end module thrust 24 | -------------------------------------------------------------------------------- /code/ch03/peakBandwidth.cuf: -------------------------------------------------------------------------------- 1 | program peakBandwidth 2 | use cudafor 3 | implicit none 4 | 5 | integer :: i, istat, nDevices=0 6 | type (cudaDeviceProp) :: prop 7 | 8 | istat = cudaGetDeviceCount(nDevices) 9 | do i = 0, nDevices-1 10 | istat = cudaGetDeviceProperties(prop, i) 11 | print "(' Device Number: ',i0)", i 12 | print "(' Device name: ',a)", trim(prop%name) 13 | print "(' Memory Clock Rate (KHz): ', i0)", & 14 | prop%memoryClockRate 15 | print "(' Memory Bus Width (bits): ', i0)", & 16 | prop%memoryBusWidth 17 | print "(' Peak Memory Bandwidth (GB/s): ', f9.2)", & 18 | 2.0 * prop%memoryClockRate * & 19 | 
(prop%memoryBusWidth / 8) * 1.e-6 20 | print * 21 | enddo 22 | end program peakBandwidth 23 | -------------------------------------------------------------------------------- /code/ch04/ballot.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a, nWarps) 4 | implicit none 5 | integer :: a(nWarps) 6 | integer, value :: nWarps 7 | integer :: mask, laneID, warpID 8 | 9 | warpID = ishft(threadIdx%x-1,-5)+1 10 | laneID = iand(threadIdx%x-1,31)+1 11 | 12 | mask = ballot(threadIdx%x > 40) 13 | mask = ballot_sync(mask, mod(threadIdx%x,2) == 0) 14 | if (laneID == 1) a(warpID) = mask 15 | end subroutine s 16 | end module m 17 | 18 | program ballot 19 | use m 20 | implicit none 21 | integer, parameter :: nWarps = 3 22 | integer :: a(nWarps), i 23 | integer, device :: a_d(nWarps) 24 | 25 | call s<<<1,nWarps*32>>>(a_d, nWarps) 26 | a = a_d 27 | do i = 1, nWarps 28 | print "(i0, 1x, B32.32)", i, a(i) 29 | enddo 30 | end program ballot 31 | -------------------------------------------------------------------------------- /code/ch04/concurrentKernels.cuf: -------------------------------------------------------------------------------- 1 | program concurrentKernels 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | integer, parameter :: nStreams=6 6 | real, device :: a_d(n,nStreams) 7 | real :: a(n,nStreams) 8 | integer(kind=cuda_stream_kind) :: streams(nStreams) 9 | integer :: istat, i, j, flag 10 | 11 | flag = cudaStreamDefault 12 | do j = 1, nStreams 13 | istat = cudaStreamCreateWithFlags(streams(j), flag) 14 | enddo 15 | streams(3) = 0 16 | 17 | do j = 1, nStreams 18 | !$cuf kernel do <<<1,1,stream=streams(j)>>> 19 | do i = 1, n 20 | a_d(i,j) = i 21 | enddo 22 | enddo 23 | 24 | if (flag == cudaStreamNonBlocking) & 25 | istat = cudaDeviceSynchronize() 26 | 27 | a = a_d 28 | 29 | do j = 1, nStreams 30 | istat = cudaStreamDestroy(streams(j)) 31 | enddo 32 | 33 | 
end program concurrentKernels 34 | -------------------------------------------------------------------------------- /code/ch04/shfl.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a) 4 | implicit none 5 | integer :: a(warpsize, 6) 6 | integer :: laneMask, stage 7 | integer :: var 8 | 9 | var = threadIdx%x 10 | a(threadIdx%x,1) = var 11 | 12 | laneMask = 1 13 | stage = 2 14 | do 15 | var = var + __shfl_xor(var, laneMask) 16 | laneMask = laneMask*2 17 | 18 | a(threadIdx%x, stage) = var; stage = stage+1 19 | if (laneMask > 16) exit 20 | enddo 21 | end subroutine s 22 | end module m 23 | 24 | program shfl 25 | use m 26 | implicit none 27 | integer, device :: a_d(32, 6) 28 | integer :: a(32, 6), i 29 | 30 | call s<<<1,32>>>(a_d) 31 | a = a_d 32 | 33 | do i = 1, 6 34 | print "(16I4)", a(1:16,i) 35 | enddo 36 | print * 37 | do i = 1, 6 38 | print "(16I4)", a(17:32,i) 39 | enddo 40 | 41 | end program shfl 42 | -------------------------------------------------------------------------------- /code/ch02/debug.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | endif 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | 
endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch05/assumedShapeSize.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine assumedSizeArrays(a, b, c, nx, ny) 4 | implicit none 5 | real :: a(nx,ny,*), b(nx,ny,*), c(nx,ny,*) 6 | integer, value :: nx, ny 7 | integer :: i, j, k 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | k = (blockIdx%z-1)*blockDim%z + threadIdx%z 12 | 13 | c(i,j,k) = a(i,j,k) + b(i,j,k) 14 | end subroutine assumedSizeArrays 15 | 16 | attributes(global) subroutine assumedShapeArrays(a, b, c) 17 | implicit none 18 | real :: a(:,:,:), b(:,:,:), c(:,:,:) 19 | integer :: i, j, k 20 | 21 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 22 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 23 | k = (blockIdx%z-1)*blockDim%z + threadIdx%z 24 | 25 | c(i,j,k) = a(i,j,k) + b(i,j,k) 26 | end subroutine assumedShapeArrays 27 | end module m 28 | -------------------------------------------------------------------------------- /code/ch05/spill.cuf: -------------------------------------------------------------------------------- 1 | module spill 2 | contains 3 | attributes(global) subroutine k(a, n) 4 | integer :: a(*) 5 | integer, value :: n 6 | integer, parameter :: nb = 32 7 | integer :: b(nb), tid, i 8 | tid = blockDim%x*(blockIdx%x-1) + threadIdx%x 9 | do i = 1, nb 10 | b(i) = a(mod(tid-1+i,n)+1) 11 | enddo 12 | do i = 2, nb 13 | b(1) = b(1) + b(i) 14 | enddo 15 | a(tid) = b(1) 16 | end subroutine k 17 | 18 | 19 | attributes(global) launch_bounds(1024,2) subroutine klb(a, n) 20 | integer :: a(*) 21 | integer, value :: n 22 | integer, parameter :: nb = 32 23 | integer :: b(nb), tid, i 24 | tid = blockDim%x*(blockIdx%x-1) + threadIdx%x 25 | do i = 1, nb 26 | b(i) = a(mod(tid-1+i,n)+1) 27 | enddo 28 | do i = 2, nb 29 | b(1) = b(1) + b(i) 30 | enddo 31 
| a(tid) = b(1) 32 | end subroutine klb 33 | end module spill 34 | 35 | -------------------------------------------------------------------------------- /code/ch04/differentStreamTypes.cuf: -------------------------------------------------------------------------------- 1 | program differentStreamTypes 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n) 6 | real :: a(n) 7 | integer(kind=cuda_stream_kind) :: stream1, stream2 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreateWithFlags(stream2, cudaStreamNonBlocking) 12 | a = 1.0 13 | 14 | !$cuf kernel do <<<1,1>>> 15 | do i = 1, n 16 | b_d(i) = i 17 | enddo 18 | a_d = a 19 | 20 | !$cuf kernel do <<<1,1,stream=stream1>>> 21 | do i = 1, n 22 | b_d(i) = i 23 | enddo 24 | a_d = a 25 | 26 | istat = cudaDeviceSynchronize() 27 | 28 | !$cuf kernel do <<<1,1,stream=stream2>>> 29 | do i = 1, n 30 | b_d(i) = i 31 | enddo 32 | a_d = a 33 | 34 | istat = cudaStreamDestroy(stream1) 35 | istat = cudaStreamDestroy(stream2) 36 | 37 | end program differentStreamTypes 38 | -------------------------------------------------------------------------------- /code/ch02/initcheck.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | end if 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | ! 
a_d = a 30 | call increment<<>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | write(*,*) '**** Program Failed ****' 35 | else 36 | write(*,*) 'Program Passed' 37 | endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch01/multiblock.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program multiblock 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 1024*1024 21 | integer, allocatable :: a(:) 22 | integer, device, allocatable :: a_d(:) 23 | integer :: b, tPB = 256 24 | 25 | allocate(a(n), a_d(n)) 26 | a = 1 27 | b = 3 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, n, b) 31 | a = a_d 32 | 33 | if (any(a /= 4)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | deallocate(a, a_d) 39 | end program multiblock 40 | -------------------------------------------------------------------------------- /code/ch04/syncthreads.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(offset) 4 | implicit none 5 | integer, value :: offset 6 | integer :: tid, res 7 | tid = threadIdx%x 8 | 9 | res = syncthreads_and(tid > offset) 10 | if (tid == 1) print *, 'syncthreads_and(tid > offset): ', res 11 | res = syncthreads_or(tid > offset) 12 | if (tid == 1) print *, 'syncthreads_or(tid > offset): ', res 13 | res = syncthreads_count(tid > offset) 14 | if (tid == 1) print *, 'syncthreads_count(tid > offset): ', res 15 | end subroutine s1 16 | end module m 17 | 18 | 
program syncthreads 19 | use m 20 | use cudafor 21 | implicit none 22 | integer :: istat 23 | 24 | print *, 'offset = 0' 25 | call s1<<<1, 256>>>(0) 26 | istat = cudaDeviceSynchronize() 27 | 28 | print *, 'offset = 4' 29 | call s1<<<1, 256>>>(4) 30 | istat = cudaDeviceSynchronize() 31 | end program syncthreads 32 | -------------------------------------------------------------------------------- /code/ch02/memcheck.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x-1 ! *incorrect* 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | end if 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | a_d = a 30 | call increment<<>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch06/union_m.cuf: -------------------------------------------------------------------------------- 1 | module union 2 | 3 | interface union 4 | module procedure union_r4c4, union_r4dc4d 5 | end interface union 6 | 7 | contains 8 | 9 | subroutine union_r4c4(s, d_ptr) 10 | use iso_c_binding 11 | implicit none 12 | real(4) :: s(:) 13 | complex(4), pointer :: d_ptr(:) 14 | type(c_ptr) :: s_cptr 15 | integer :: d_shape(1) 16 | 17 | d_shape(1) = size(s)/2 18 | s_cptr = transfer(loc(s), s_cptr) 19 | call c_f_pointer(s_cptr, d_ptr, d_shape) 20 | end subroutine union_r4c4 21 | 
22 | subroutine union_r4dc4d(s, d_ptr) 23 | use cudafor 24 | implicit none 25 | real(4), device :: s(:) 26 | complex(4), device, pointer :: d_ptr(:) 27 | type(c_devptr) :: s_cptr 28 | integer :: d_shape(1) 29 | 30 | d_shape(1) = size(s)/2 31 | s_cptr = c_devloc(s) 32 | call c_f_pointer(s_cptr, d_ptr, d_shape) 33 | end subroutine union_r4dc4d 34 | 35 | end module union 36 | -------------------------------------------------------------------------------- /code/ch07/matmulTranspose.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensorEx 4 | implicit none 5 | integer, parameter :: m=3200, n=m, k=3200 6 | real(8) :: a(m,k), b(k,n) 7 | real(8) :: c(m,n), cref(m,n) 8 | real(8), device :: a_d(m,k), b_d(k,n) 9 | real(8), device :: c_d(m,n) 10 | integer :: istat 11 | 12 | type(cudaDeviceProp) :: prop 13 | 14 | istat = cudaGetDeviceProperties(prop, 0) 15 | print "(' Device: ', a)", trim(prop%name) 16 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 17 | 18 | call random_number(a) 19 | call random_number(b) 20 | 21 | ! transpose(A*B) = tranpose(B)*transpose(A) 22 | ! LHS on host 23 | cref = matmul(a,b) 24 | cref = transpose(cref) 25 | 26 | ! 
RHS on device 27 | a_d = a; b_d = b 28 | c_d = matmul(transpose(b_d), transpose(a_d)) 29 | c = c_d 30 | print *, 'maxval(abs(tr(AB)-tr(B)tr(A))): ', & 31 | maxval(abs(cref-c)) 32 | end program main 33 | 34 | -------------------------------------------------------------------------------- /code/ch04/pipeline.cuf: -------------------------------------------------------------------------------- 1 | program pipeline 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | integer, parameter :: nStreams=6 6 | real, device :: a_d(n,nStreams) 7 | real, pinned, allocatable :: a(:,:) 8 | integer(kind=cuda_stream_kind) :: streams(nStreams) 9 | integer :: istat, i, j 10 | 11 | do j = 1, nStreams 12 | istat = cudaStreamCreate(streams(j)) 13 | enddo 14 | 15 | allocate(a(n,nStreams)) 16 | a = 1.0 17 | 18 | do j = 1, nStreams 19 | istat = cudaMemcpyAsync(a_d(1,j), a(1,j), n, streams(j)) 20 | !$cuf kernel do <<<1,1,stream=streams(j)>>> 21 | do i = 1, n 22 | a_d(i,j) = a_d(i,j) + 1.0 23 | enddo 24 | istat = cudaMemcpyAsync(a(1,j), a_d(1,j), n, streams(j)) 25 | enddo 26 | 27 | istat = cudaDeviceSynchronize() 28 | if (all(a == 2.0)) print *, 'OK' 29 | 30 | do j = 1, nStreams 31 | istat = cudaStreamDestroy(streams(j)) 32 | enddo 33 | end program pipeline 34 | -------------------------------------------------------------------------------- /code/ch01/pciBusID.cuf: -------------------------------------------------------------------------------- 1 | program pciBusID 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | print "(/,'No CUDA devices found',/)" 14 | stop 15 | else if (nDevices == 1) then 16 | print "(/,'One CUDA device found',/)" 17 | else 18 | print "(/,i0,' CUDA devices found',/)", nDevices 19 | end if 20 | 21 | ! 
Loop over devices 22 | 23 | do i = 0, nDevices-1 24 | 25 | print "('Device Number: ',i0)", i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | 29 | ! General device info 30 | 31 | print "(' Device Name: ', a)", trim(prop%name) 32 | print "(' Compute Capability: ',i0,'.',i0)", & 33 | prop%major, prop%minor 34 | print "(' PCI Bus ID: ',i0)", prop%pciBusID 35 | 36 | enddo 37 | 38 | end program pciBusID 39 | -------------------------------------------------------------------------------- /code/ch04/eventSync.cuf: -------------------------------------------------------------------------------- 1 | program eventSync 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n), c_d(n) 6 | integer(kind=cuda_stream_kind) :: stream1, stream2 7 | type(cudaEvent) :: event1 8 | integer :: istat, i, j 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreate(stream2) 12 | istat = cudaEventCreate(event1) 13 | 14 | !$cuf kernel do <<<1,1,stream=stream1>>> 15 | do i = 1, n 16 | a_d(i) = i 17 | enddo 18 | 19 | istat = cudaEventRecord(event1, stream1) 20 | 21 | !$cuf kernel do <<<1,1,stream=stream1>>> 22 | do i = 1, n 23 | b_d(i) = a_d(i) + 1.0 24 | enddo 25 | 26 | istat = cudaEventSynchronize(event1) 27 | 28 | !$cuf kernel do <<<1,1,stream=stream2>>> 29 | do i = 1, n 30 | c_d(i) = a_d(i) + 2.0 31 | enddo 32 | 33 | istat = cudaStreamDestroy(stream1) 34 | istat = cudaStreamDestroy(stream2) 35 | istat = cudaEventDestroy(event1) 36 | end program eventSync 37 | -------------------------------------------------------------------------------- /code/ch04/defaultStream.cuf: -------------------------------------------------------------------------------- 1 | program defaultStream 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device, allocatable :: a_d(:), b_d(:) 6 | real :: a(n), b(n) 7 | integer(kind=cuda_stream_kind) :: stream1 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 
| 12 | allocate(a_d(n)) 13 | a_d = 1.0 14 | !$cuf kernel do <<<1,1>>> 15 | do i = 1, n 16 | a_d(i) = a_d(i) + i 17 | enddo 18 | a = a_d 19 | print *, maxval(a_d) 20 | 21 | istat = cudaforSetDefaultStream(stream1) 22 | allocate(b_d(n)) 23 | 24 | a_d = 2.0 25 | !$cuf kernel do <<<1,1>>> 26 | do i = 1, n 27 | a_d(i) = a_d(i) + i 28 | enddo 29 | a = a_d 30 | print *, maxval(a_d) 31 | 32 | b_d = 3.0 33 | !$cuf kernel do <<<1,1>>> 34 | do i = 1, n 35 | b_d(i) = b_d(i) + i 36 | enddo 37 | b = b_d 38 | print *, maxval(b_d) 39 | 40 | istat = cudaStreamDestroy(stream1) 41 | deallocate(a_d, b_d) 42 | end program defaultStream 43 | -------------------------------------------------------------------------------- /code/ch08/minimal.cuf: -------------------------------------------------------------------------------- 1 | module kernel 2 | contains 3 | attributes(global) subroutine assign(a, v) 4 | implicit none 5 | real :: a(*) 6 | real, value :: v 7 | a(threadIdx%x) = v 8 | end subroutine assign 9 | end module kernel 10 | 11 | program minimal 12 | use cudafor 13 | use kernel 14 | implicit none 15 | integer, parameter :: n=32 16 | real :: a(n) 17 | real, device, allocatable :: a0_d(:), a1_d(:) 18 | integer :: nDevices, istat 19 | 20 | istat = cudaGetDeviceCount(nDevices) 21 | if (nDevices < 2) then 22 | print *, 'This program requires at least two GPUs' 23 | stop 24 | end if 25 | 26 | istat = cudaSetDevice(0) 27 | allocate(a0_d(n)) 28 | call assign<<<1,n>>>(a0_d, 3.0) 29 | a = a0_d 30 | deallocate(a0_d) 31 | print *, 'Device 0: ', a(1) 32 | 33 | istat = cudaSetDevice(1) 34 | allocate(a1_d(n)) 35 | call assign<<<1,n>>>(a1_d, 4.0) 36 | a = a1_d 37 | deallocate(a1_d) 38 | print *, 'Device 1: ', a(1) 39 | end program minimal 40 | -------------------------------------------------------------------------------- /code/ch02/print.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine 
increment(a, b, n) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b, n 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) a(i) = a(i)+b 11 | if (i > 30 .and. i < 34) print *, 'i, a(i):', i, a(i) 12 | end subroutine increment 13 | end module simpleOps_m 14 | 15 | 16 | program main 17 | use cudafor 18 | use simpleOps_m 19 | implicit none 20 | integer, parameter :: n = 1024*1024 21 | integer, allocatable :: a(:) 22 | integer, device, allocatable :: a_d(:) 23 | integer :: b, tPB = 256 24 | 25 | allocate(a(n), a_d(n)) 26 | a = 1 27 | b = 3 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, b, n) 31 | a = a_d 32 | 33 | if (any(a /= 4)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | deallocate(a, a_d) 39 | end program main 40 | -------------------------------------------------------------------------------- /code/ch04/cgReverse.cuf: -------------------------------------------------------------------------------- 1 | module reverse 2 | contains 3 | attributes(global) subroutine blockReverse(d, n) 4 | use cooperative_groups 5 | implicit none 6 | real :: d(n) 7 | integer, value :: n 8 | real, shared :: s(n) 9 | integer :: t, tr 10 | type(thread_group) :: tg 11 | 12 | tg = this_thread_block() 13 | 14 | t = threadIdx%x 15 | tr = blockDim%x-t+1 16 | 17 | s(t) = d(t) 18 | call syncthreads(tg) 19 | d(t) = s(tr) 20 | end subroutine blockReverse 21 | end module reverse 22 | 23 | 24 | program cgReverse 25 | use cudafor 26 | use reverse 27 | 28 | implicit none 29 | 30 | integer, parameter :: n = 64 31 | real :: a(n), r(n), d(n) 32 | real, device :: d_d(n) 33 | integer :: i 34 | 35 | do i = 1, n 36 | a(i) = i 37 | r(i) = n-i+1 38 | enddo 39 | 40 | d_d = a 41 | call blockReverse<<<1, n, 4*n>>>(d_d, n) 42 | d = d_d 43 | print *, 'blockReverse max error:', maxval(abs(r-d)) 44 | end program cgReverse 45 | 46 | 47 | 
-------------------------------------------------------------------------------- /code/ch12/ppmExample.f90: -------------------------------------------------------------------------------- 1 | program main 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: nx = 400, ny = 200 5 | integer :: i, j 6 | type rgb 7 | real :: v(3) 8 | end type rgb 9 | type(rgb) :: fb(nx, ny) 10 | !@cuf type(rgb), device :: fb_d(nx, ny) 11 | 12 | !@cuf associate (fb => fb_d) 13 | !$cuf kernel do (2) <<<*,*>>> 14 | do j = 1, ny 15 | do i = 1, nx 16 | fb(i,j)%v(1) = real(i)/nx 17 | fb(i,j)%v(2) = real(j)/ny 18 | fb(i,j)%v(3) = 0.2 19 | end do 20 | end do 21 | !@cuf end associate 22 | 23 | !@cuf fb = fb_d 24 | 25 | ! ppm output 26 | 27 | print "(a2)", 'P3' ! indicates RGB colors in ASCII, must be flush left 28 | print *, nx, ny ! width and height of image 29 | print *, 255 ! maximum value for each color 30 | 31 | do j = ny, 1, -1 32 | do i = 1, nx 33 | print "(3(1x,i3))", int(255*fb(i,j)%v) 34 | end do 35 | end do 36 | 37 | end program main 38 | -------------------------------------------------------------------------------- /code/ch03/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = events multidim nvtxBasic nvtxAdv nvtxAdv2 nvtxAuto \ 2 | limitingFactor peakBandwidth effectiveBandwidth 3 | 4 | # section 3.1.2 5 | 6 | events: events.cuf 7 | nvfortran -o $@ $< 8 | 9 | # section 3.1.3 10 | 11 | multidim: multidim.cuf 12 | nvfortran -o $@ $< 13 | 14 | # section 3.1.4.1 15 | 16 | nvtxBasic: nvtxBasic.cuf 17 | nvfortran -o $@ $< -cudalib=nvtx 18 | 19 | # section 3.1.4.2 20 | 21 | nvtxAdv: nvtxAdv.cuf 22 | nvfortran -o $@ $< -cudalib=nvtx 23 | 24 | nvtxAdv2: nvtxAdv2.cuf 25 | nvfortran -o $@ $< -cudalib=nvtx 26 | 27 | # section 3.1.4.3 28 | 29 | nvtxAuto: nvtxAuto.cuf 30 | nvfortran -o $@ $< -Minstrument -traceback -cudalib=nvtx 31 | 32 | # section 3.2 33 | 34 | limitingFactor: limitingFactor.cuf 35 | nvfortran -o $@ $< 36 | 37 | 
# section 3.3.1 38 | 39 | peakBandwidth: peakBandwidth.cuf 40 | nvfortran -o $@ $< 41 | 42 | # section 3.3.2 43 | 44 | effectiveBandwidth: effectiveBandwidth.cuf 45 | nvfortran -o $@ $< 46 | 47 | clean: 48 | rm -rf $(OBJS) *.o *.mod *~ 49 | -------------------------------------------------------------------------------- /code/ch04/streamSync.cuf: -------------------------------------------------------------------------------- 1 | program streamSync 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n), c_d(n) 6 | integer(kind=cuda_stream_kind) :: stream1, stream2, stream3 7 | integer :: istat, i, j 8 | 9 | istat = cudaStreamCreate(stream1) 10 | istat = cudaStreamCreate(stream2) 11 | istat = cudaStreamCreate(stream3) 12 | 13 | !$cuf kernel do <<<1,1,stream=stream1>>> 14 | do i = 1, n 15 | a_d(i) = i 16 | enddo 17 | 18 | !$cuf kernel do <<<1,1,stream=stream2>>> 19 | do i = 1, n 20 | b_d(i) = i 21 | enddo 22 | 23 | !$cuf kernel do <<<1,1,stream=stream3>>> 24 | do i = 1, n 25 | c_d(i) = 1.0/i 26 | enddo 27 | 28 | istat = cudaStreamSynchronize(stream1) 29 | 30 | !$cuf kernel do <<<1,1,stream=stream2>>> 31 | do i = 1, n 32 | a_d(i) = a_d(i) + b_d(i) 33 | enddo 34 | 35 | istat = cudaStreamDestroy(stream1) 36 | istat = cudaStreamDestroy(stream2) 37 | istat = cudaStreamDestroy(stream3) 38 | end program streamSync 39 | -------------------------------------------------------------------------------- /code/ch04/defaultStreamVar.cuf: -------------------------------------------------------------------------------- 1 | program defaultStreamVar 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real :: a(n), b(n) 6 | real, device :: a_d(n) 7 | real, device, allocatable :: b_d(:) 8 | integer(cuda_stream_kind) :: s1 9 | integer :: istat, i 10 | 11 | istat = cudaStreamCreate(s1) 12 | 13 | a = 1.0 14 | allocate(b_d(n)) 15 | 16 | istat = cudaforSetDefaultStream(a_d, s1) 17 | if (istat /= cudaSuccess) print *, 'a_d 
stream association error: ', & 18 | cudaGetErrorString(istat) 19 | istat = cudaforSetDefaultStream(b_d, s1) 20 | if (istat /= cudaSuccess) print *, 'b_d stream association error: ', & 21 | cudaGetErrorString(istat) 22 | 23 | a_d = 1.0 24 | b_d = 1.0 25 | 26 | !$cuf kernel do <<<*,*>>> 27 | do i = 1, n 28 | a_d(i) = a_d(i) + b_d(i) + i 29 | end do 30 | 31 | print *, minval(a_d) 32 | print *, maxval(b_d) 33 | 34 | a = a_d 35 | b = b_d 36 | 37 | istat = cudaStreamDestroy(s1) 38 | deallocate(b_d) 39 | end program defaultStreamVar 40 | -------------------------------------------------------------------------------- /code/ch01/multidim.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program multidim 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer :: a(nx,ny), b 23 | integer, device :: a_d(nx,ny) 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3((nx-1)/tBlock%x+1, & 31 | (ny-1)/tBlock%y+1, 1) 32 | a_d = a 33 | call increment<<>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program multidim 42 | -------------------------------------------------------------------------------- /code/ch03/multidim.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program multidim 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer :: a(nx,ny), b 23 | integer, device :: a_d(nx,ny) 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3((nx-1)/tBlock%x+1, & 31 | (ny-1)/tBlock%y+1, 1) 32 | a_d = a 33 | call increment<<>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program multidim 42 | -------------------------------------------------------------------------------- /code/ch04/raceAndAtomicShared.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(raceCount, atomicCount) 4 | implicit none 5 | integer :: raceCount, atomicCount 6 | integer :: tmp 7 | integer, shared :: sharedCount 8 | 9 | if (threadIdx%x == 1) sharedCount = 0 10 | call syncthreads() 11 | 12 | raceCount = raceCount + 1 13 | tmp = atomicAdd(sharedCount, 1) 14 | call syncthreads() 15 | 16 | if (threadIdx%x == 1) tmp = atomicAdd(atomicCount, sharedCount) 17 | end subroutine s1 18 | end module m 19 | 20 | program raceAndAtomicShared 21 | use m 22 | implicit none 23 | integer, parameter :: nBlocks = 256, nThreads = 256 24 | integer, device :: raceCount_d, atomicCount_d 25 | integer :: raceCount, atomicCount 26 | 27 | raceCount_d = 0 28 | atomicCount_d = 0 29 | call s1<<>>(raceCount_d, atomicCount_d) 30 | raceCount = raceCount_d 31 | atomicCount = atomicCount_d 32 | print *, nBlocks*nThreads, raceCount, atomicCount 33 | end program raceAndAtomicShared 34 | -------------------------------------------------------------------------------- /code/ch08/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = minimal p2pAccess 
directTransfer p2pBandwidth \ 2 | transposeP2P mpiDevices assignDevice transposeMPI transposeCAMPI 3 | 4 | # section 8.1 5 | 6 | minimal: minimal.cuf 7 | nvfortran -o $@ $< 8 | 9 | # section 8.1.1.1 10 | 11 | p2pAccess: p2pAccess.cuf 12 | nvfortran -o $@ $< 13 | 14 | # section 8.1.2 15 | 16 | directTransfer: directTransfer.cuf 17 | nvfortran -o $@ $< 18 | 19 | p2pBandwidth: p2pBandwidth.cuf 20 | nvfortran -o $@ $< 21 | 22 | # section 8.1.3 23 | 24 | wallclock.o: wallclock.c 25 | nvc -c $< 26 | 27 | transposeP2P: timing.f90 transposeP2P.cuf wallclock.o 28 | nvfortran -o $@ $^ 29 | 30 | # section 8.2.1 31 | 32 | mpiDevices: mpiDevices.cuf 33 | mpif90 -o $@ $< 34 | 35 | assignDevice: mpiDeviceUtil.cuf assignDevice.cuf 36 | mpif90 -o $@ $^ 37 | 38 | # section 8.2.2 39 | 40 | transposeMPI: mpiDeviceUtil.cuf transposeMPI.cuf 41 | mpif90 -o $@ $^ 42 | 43 | transposeCAMPI: mpiDeviceUtil.cuf transposeCAMPI.cuf 44 | mpif90 -o $@ $^ 45 | 46 | 47 | 48 | clean: 49 | rm -rf $(OBJS) *.o *.mod *~ 50 | 51 | -------------------------------------------------------------------------------- /code/ch01/managed.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program main 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer, managed :: a(nx,ny) 23 | integer :: b, istat 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3(ceiling(real(nx)/tBlock%x), & 31 | ceiling(real(ny)/tBlock%y), 1) 32 | 33 | call increment<<>>(a, nx, ny, b) 34 | istat = cudaDeviceSynchronize() 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program main 42 | -------------------------------------------------------------------------------- /code/ch04/defaultStreamVarExplicit.cuf: -------------------------------------------------------------------------------- 1 | program defaultStreamVarExplicit 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real :: a(n), b(n) 6 | real, device :: a_d(n) 7 | real, device, allocatable :: b_d(:) 8 | integer(cuda_stream_kind) :: s1 9 | integer :: istat, i 10 | 11 | istat = cudaStreamCreate(s1) 12 | 13 | a = 1.0 14 | allocate(b_d(n)) 15 | 16 | istat = cudaforSetDefaultStream(a_d, s1) 17 | if (istat /= cudaSuccess) print *, 'a_d stream association error: ', & 18 | cudaGetErrorString(istat) 19 | istat = cudaforSetDefaultStream(b_d, s1) 20 | if (istat /= cudaSuccess) print *, 'b_d stream association error: ', & 21 | cudaGetErrorString(istat) 22 | 23 | a_d = 1.0 24 | b_d = 1.0 25 | 26 | !$cuf kernel do <<<*,*,stream=s1>>> 27 | do i = 1, n 28 | a_d(i) = a_d(i) + b_d(i) + i 29 | end do 30 | 31 | print *, minval(a_d, stream=s1) 32 | print *, maxval(b_d, stream=s1) 33 | 34 | a = a_d 35 | b = b_d 36 | 37 | istat = cudaStreamDestroy(s1) 38 | deallocate(b_d) 39 | end program defaultStreamVarExplicit 40 | -------------------------------------------------------------------------------- /code/ch01/managedImplicit.cuf: 
-------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program main 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer, allocatable :: a(:,:) 23 | integer :: b, istat 24 | type(dim3) :: grid, tBlock 25 | 26 | allocate(a(nx,ny)) 27 | 28 | a = 1 29 | b = 3 30 | 31 | tBlock = dim3(32,8,1) 32 | grid = dim3(ceiling(real(nx)/tBlock%x), & 33 | ceiling(real(ny)/tBlock%y), 1) 34 | 35 | call increment<<>>(a, nx, ny, b) 36 | istat = cudaDeviceSynchronize() 37 | 38 | if (any(a /= 4)) then 39 | print *, '**** Program Failed ****' 40 | else 41 | print *, 'Program Passed' 42 | endif 43 | 44 | deallocate(a) 45 | end program main 46 | -------------------------------------------------------------------------------- /code/ch01/asyncError.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device, pointer :: a_d(:) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | call increment<<<1,n>>>(a_d, b) 29 | ierrSync = cudaGetLastError() 30 | ierrAsync = cudaDeviceSynchronize() 31 | a = a_d 32 | 33 | if (ierrSync /= cudaSuccess) & 34 | print *, 'Sync kernel 
error: ', & 35 | trim(cudaGetErrorString(ierrSync)) 36 | if (ierrAsync /= cudaSuccess) & 37 | print *, 'Async kernel error: ', & 38 | trim(cudaGetErrorString(ierrAsync)) 39 | 40 | if (any(a /= 4)) then 41 | print *, '**** Program Failed ****' 42 | else 43 | print *, 'Program Passed' 44 | endif 45 | end program asyncError 46 | -------------------------------------------------------------------------------- /code/ch01/syncError.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 5000 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | a_d = a 29 | call increment<<<1,n>>>(a_d, b) 30 | ierrSync = cudaGetLastError() 31 | ierrAsync = cudaDeviceSynchronize() 32 | a = a_d 33 | 34 | if (ierrSync /= cudaSuccess) & 35 | print *, 'Sync kernel error: ', & 36 | trim(cudaGetErrorString(ierrSync)) 37 | if (ierrAsync /= cudaSuccess) & 38 | print *, 'Async kernel error: ', & 39 | trim(cudaGetErrorString(ierrAsync)) 40 | 41 | if (any(a /= 4)) then 42 | print *, '**** Program Failed ****' 43 | else 44 | print *, 'Program Passed' 45 | endif 46 | end program asyncError 47 | -------------------------------------------------------------------------------- /code/ch01/errorHandling.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 
| end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | a_d = a 29 | call increment<<<1,n>>>(a_d, b) 30 | ierrSync = cudaGetLastError() 31 | ierrAsync = cudaDeviceSynchronize() 32 | a = a_d 33 | 34 | if (ierrSync /= cudaSuccess) & 35 | print *, 'Sync kernel error: ', & 36 | trim(cudaGetErrorString(ierrSync)) 37 | if (ierrAsync /= cudaSuccess) & 38 | print *, 'Async kernel error: ', & 39 | trim(cudaGetErrorString(ierrAsync)) 40 | 41 | if (any(a /= 4)) then 42 | print *, '**** Program Failed ****' 43 | else 44 | print *, 'Program Passed' 45 | endif 46 | end program asyncError 47 | -------------------------------------------------------------------------------- /code/ch08/Makefile~: -------------------------------------------------------------------------------- 1 | OBJS = minimal p2pAccess directTransfer p2pBandwidth \ 2 | transposeP2P mpiDevices assignDevice transposeMPI transposeCAMPI 3 | 4 | MPI_DIR=$(NVHPC_ROOT)/comm_libs/mpi/bin 5 | 6 | # section 8.1 7 | 8 | minimal: minimal.cuf 9 | nvfortran -o $@ $< 10 | 11 | # section 8.1.1.1 12 | 13 | p2pAccess: p2pAccess.cuf 14 | nvfortran -o $@ $< 15 | 16 | # section 8.1.2 17 | 18 | directTransfer: directTransfer.cuf 19 | nvfortran -o $@ $< 20 | 21 | p2pBandwidth: p2pBandwidth.cuf 22 | nvfortran -o $@ $< 23 | 24 | # section 8.1.3 25 | 26 | wallclock.o: wallclock.c 27 | nvc -c $< 28 | 29 | transposeP2P: timing.f90 transposeP2P.cuf wallclock.o 30 | nvfortran -o $@ $^ 31 | 32 | # section 8.2.1 33 | 34 | mpiDevices: mpiDevices.cuf 35 | $(MPI_DIR)/mpif90 -o $@ $< 36 | 37 | assignDevice: mpiDeviceUtil.cuf assignDevice.cuf 38 | $(MPI_DIR)/mpif90 -o $@ $^ 39 | 40 | # section 8.2.2 41 | 42 | transposeMPI: mpiDeviceUtil.cuf transposeMPI.cuf 43 | $(MPI_DIR)/mpif90 -o $@ $^ 44 | 45 | transposeCAMPI: mpiDeviceUtil.cuf transposeCAMPI.cuf 
46 | $(MPI_DIR)/mpif90 -o $@ $^ 47 | 48 | 49 | 50 | clean: 51 | rm -rf $(OBJS) *.o *.mod *~ 52 | 53 | -------------------------------------------------------------------------------- /code/ch05/loads.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine k(b,a) 4 | implicit none 5 | real :: b(*), a(*) 6 | integer :: i 7 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 8 | b(i) = a(i) 9 | end subroutine k 10 | 11 | attributes(global) subroutine k_ii(b,a) 12 | implicit none 13 | real :: b(*) 14 | real, intent(in) :: a(*) 15 | integer :: i 16 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 17 | b(i) = a(i) 18 | end subroutine k_ii 19 | 20 | attributes(global) subroutine k_ca(b,a) 21 | implicit none 22 | real :: b(*), a(*) 23 | integer :: i 24 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 25 | b(i) = __ldca(a(i)) 26 | end subroutine k_ca 27 | 28 | attributes(global) subroutine k_cg(b,a) 29 | implicit none 30 | real :: b(*), a(*) 31 | integer :: i 32 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 33 | b(i) = __ldcg(a(i)) 34 | end subroutine k_cg 35 | 36 | attributes(global) subroutine k_cv(b,a) 37 | implicit none 38 | real :: b(*), a(*) 39 | integer :: i 40 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 41 | b(i) = __ldcv(a(i)) 42 | end subroutine k_cv 43 | end module m 44 | -------------------------------------------------------------------------------- /code/ch01/explicitInterface.cuf: -------------------------------------------------------------------------------- 1 | attributes(global) subroutine increment(a, n1, n2, b) 2 | implicit none 3 | integer :: a(n1, n2) 4 | integer, value :: n1, n2, b 5 | integer :: i, j 6 | 7 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 8 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 9 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 10 | end subroutine increment 11 | 12 | program explicitInterface 13 | use cudafor 14 | implicit none 15 | integer, parameter :: nx=1024, ny=512 16 | integer :: a(nx,ny), b 17 | integer, device :: a_d(nx,ny) 18 | type(dim3) :: grid, tBlock 19 | interface 20 | attributes(global) subroutine increment(a, n1, n2, b) 21 | integer :: a(n1, n2) 22 | integer, value :: n1, n2, b 23 | end subroutine increment 24 | end interface 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3(ceiling(real(nx)/tBlock%x), & 31 | ceiling(real(ny)/tBlock%y), 1) 32 | a_d = a 33 | call increment<<<grid, tBlock>>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program explicitInterface 42 | -------------------------------------------------------------------------------- /code/ch09/shflExample.cuf: -------------------------------------------------------------------------------- 1 | module shuffle_m 2 | contains 3 | attributes(global) subroutine shuffle_xor(len) 4 | integer:: i,n,depth 5 | integer,value:: len 6 | 7 | i=threadIdx%x 8 | depth=bit_size(len)-leadz(len)-1 !log2(len) 9 | do n=depth,1,-1 10 | i=i+__shfl_xor(i,ishft(len,-n),len) 11 | print*,threadIdx%x,ishft(len,-n),i 12 | end do 13 | end subroutine shuffle_xor 14 | 15 | attributes(global) subroutine shuffle_down(len) 16 | integer:: i,n,depth 17 | integer,value:: len 18 | 19 | i=threadIdx%x 20 | depth=bit_size(len)-leadz(len)-1 !log2(len) 21 | do n=depth,1,-1 22 | i=i+__shfl_down(i,ishft(len,-n),len) 23 | print*,threadIdx%x,ishft(len,-n),i 24 | end do 25 | end subroutine shuffle_down 26 | end module shuffle_m 27 | 28 | program test_shuffle 29 | use cudafor 30 | use shuffle_m 31 | integer:: depth 32 | print *," Thread id"," laneMask "," __shfl_xor" 33 | len=8 34 | call shuffle_xor<<<1,len>>>(len) 35 | istat=cudaDeviceSynchronize() 36 | print *," Thread id"," delta "," __shfl_down" 37 |
call shuffle_down<<<1,len>>>(len) 38 | istat=cudaDeviceSynchronize() 39 | end program test_shuffle 40 | -------------------------------------------------------------------------------- /code/ch07/sgemmNew.cuf: -------------------------------------------------------------------------------- 1 | program sgemmNew 2 | use cublas 3 | use cudafor 4 | implicit none 5 | integer, parameter :: m = 100, n = 100, k = 100 6 | real :: a(m,k), b(k,n), c(m,n) 7 | real, device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real, parameter :: alpha = 1.0, beta = 0.0 9 | real, device :: alpha_d, beta_d 10 | integer :: lda = m, ldb = k, ldc = m 11 | type(cublasHandle) :: h 12 | integer :: istat 13 | 14 | a = 1.0; b = 2.0; c = 0.0 15 | a_d = a; b_d = b; c_d = 0.0 16 | 17 | istat = cublasCreate(h) 18 | if (istat /= CUBLAS_STATUS_SUCCESS) & 19 | print *, 'Error initializing CUBLAS' 20 | 21 | 22 | istat = cublasSgemm_v2(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, & 23 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 24 | c = c_d 25 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 26 | 27 | alpha_d = alpha; beta_d = beta 28 | 29 | istat = cublasSgemm_v2(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, & 30 | alpha_d, a_d, lda, b_d, ldb, beta_d, c_d, ldc) 31 | c = c_d 32 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 33 | 34 | 35 | istat = cublasDestroy(h) 36 | if (istat /= CUBLAS_STATUS_SUCCESS) & 37 | print *, 'Error shutting down CUBLAS' 38 | end program sgemmNew 39 | -------------------------------------------------------------------------------- /code/ch04/smooth.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(grid_global) subroutine smooth(a,b,n,radius) 4 | use cooperative_groups 5 | implicit none 6 | real :: a(n), b(n) 7 | integer, value :: n, radius 8 | integer :: i, j, jj 9 | type(grid_group) :: gg 10 | real :: bt 11 | 12 | gg = this_grid() 13 | do i = gg%rank, n, gg%size 14 | a(i) = i 15 | end do 16 | 17 | call syncthreads(gg) 18 
| 19 | do i = gg%rank, n, gg%size 20 | bt = 0.0 21 | do j = i-radius, i+radius 22 | jj = j 23 | if (j < 1) jj = jj + n 24 | if (j > n) jj = jj - n 25 | bt = bt + a(jj) 26 | enddo 27 | b(i) = bt/(2*radius+1) 28 | enddo 29 | 30 | end subroutine smooth 31 | end module m 32 | 33 | program main 34 | use cudafor 35 | use m 36 | implicit none 37 | integer, parameter :: n = 1024*1024 38 | real :: a(n), b(n) 39 | real, device :: a_d(n), b_d(n) 40 | integer :: i, radius 41 | radius = 2 42 | call smooth<<<*,256>>>(a_d, b_d, n, radius) 43 | a = a_d 44 | b = b_d 45 | print *, 'Filter radius: ', radius 46 | do i = 1, n 47 | if (abs(b(i)-a(i)) > 0.00010) print *, i, a(i), b(i) 48 | enddo 49 | end program main 50 | -------------------------------------------------------------------------------- /code/ch09/pi_lock.cuf: -------------------------------------------------------------------------------- 1 | module pi_lock_m 2 | integer, device:: lock=0 3 | contains 4 | attributes(global) subroutine pi_lock(input, partial, twoN) 5 | use precision_m 6 | implicit none 7 | real(fp_kind) :: input(twoN) 8 | integer :: partial(*) 9 | integer, value :: twoN 10 | 11 | integer, shared :: p_s(*) 12 | integer :: N, i, index, inext, interior 13 | 14 | N = twoN/2 15 | 16 | index=threadIdx%x+(BlockIdx%x-1)*BlockDim%x 17 | 18 | interior=0 19 | do i=index, N, BlockDim%x*GridDim%x 20 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 21 | interior=interior+1 22 | end do 23 | 24 | ! 
Local reduction per block 25 | index=threadIdx%x 26 | 27 | p_s(index)=interior 28 | call syncthreads() 29 | 30 | inext=blockDim%x/2 31 | do while ( inext >=1 ) 32 | if (index <= inext) p_s(index)=p_s(index)+p_s(index+inext) 33 | inext = inext/2 34 | call syncthreads() 35 | end do 36 | 37 | if (index == 1) then 38 | do while (atomiccas(lock,0,1) == 1) 39 | end do 40 | partial(1)=partial(1)+p_s(1) 41 | call threadfence() 42 | lock = 0 43 | end if 44 | end subroutine pi_lock 45 | end module pi_lock_m 46 | -------------------------------------------------------------------------------- /code/ch03/events.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program events 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | type(cudaEvent) :: startEvent, stopEvent 24 | real :: time 25 | integer :: istat 26 | 27 | a = 1 28 | b = 3 29 | a_d = a 30 | 31 | istat = cudaEventCreate(startEvent) 32 | istat = cudaEventCreate(stopEvent) 33 | 34 | istat = cudaEventRecord(startEvent, 0) 35 | call increment<<<1,n>>>(a_d, b) 36 | istat = cudaEventRecord(stopEvent, 0) 37 | istat = cudaEventSynchronize(stopEvent) 38 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 39 | 40 | a = a_d 41 | 42 | if (any(a /= 4)) then 43 | print *, '**** Program Failed ****' 44 | else 45 | print *, ' Time for kernel execution (ms): ', time 46 | endif 47 | 48 | istat = cudaEventDestroy(startEvent) 49 | istat = cudaEventDestroy(stopEvent) 50 | 51 | end program events 52 | -------------------------------------------------------------------------------- /code/ch07/sgemmLegacy.cuf: 
-------------------------------------------------------------------------------- 1 | program sgemmLegacy 2 | use cublas 3 | use cudafor 4 | implicit none 5 | integer, parameter :: m = 100, n = 100, k = 100 6 | real :: a(m,k), b(k,n), c(m,n) 7 | real, device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real, parameter :: alpha = 1.0, beta = 0.0 9 | integer :: lda = m, ldb = k, ldc = m 10 | integer :: istat 11 | 12 | a = 1.0; b = 2.0; c = 0.0 13 | 14 | call sgemm('n', 'n', m, n, k, & 15 | alpha, a, lda, b, ldb, beta, c, ldc) 16 | print *, 'sgemm(host data) error =', & 17 | maxval(c-k*2.0) 18 | 19 | istat = cublasInit() 20 | if (istat /= CUBLAS_STATUS_SUCCESS) & 21 | print *, 'Error initializing CUBLAS' 22 | 23 | a_d = a; b_d = b; c_d = 0.0 24 | 25 | call cublasSgemm('n', 'n', m, n, k, & 26 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 27 | c = c_d 28 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 29 | 30 | c_d = 0.0 31 | call sgemm('n', 'n', m, n, k, & 32 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 33 | c = c_d 34 | print *, 'sgemm(device data) error =', & 35 | maxval(c-k*2.0) 36 | 37 | istat = cublasShutdown() 38 | if (istat /= CUBLAS_STATUS_SUCCESS) & 39 | print *, 'Error shutting down CUBLAS' 40 | end program sgemmLegacy 41 | -------------------------------------------------------------------------------- /code/ch11/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = fft_test_c2c_sp fft_test_c2c_dp fft_test_r2c_sp fft_test_r2c_dp \ 2 | spectral_sp spectral_dp exampleOverlapFFT ns2d_sp ns2d_dp 3 | 4 | # section 11.1 5 | 6 | fft_test_c2c_sp: precision_m.F90 fft_test_c2c.cuf 7 | nvfortran -o $@ $^ -cudalib=cufft 8 | 9 | fft_test_c2c_dp: precision_m.F90 fft_test_c2c.cuf 10 | nvfortran -DDOUBLE -o $@ $^ -cudalib=cufft 11 | 12 | fft_test_r2c_sp: precision_m.F90 fft_test_r2c.cuf 13 | nvfortran -o $@ $^ -cudalib=cufft 14 | 15 | fft_test_r2c_dp: precision_m.F90 fft_test_r2c.cuf 16 | nvfortran -DDOUBLE -o $@ $^ -cudalib=cufft 17 | 
18 | # section 11.2 19 | 20 | spectral_sp: precision_m.F90 fft_derivative.cuf 21 | nvfortran -O2 -o $@ $^ -cudalib=cufft 22 | 23 | spectral_dp: precision_m.F90 fft_derivative.cuf 24 | nvfortran -DDOUBLE -O2 -o $@ $^ -cudalib=cufft 25 | 26 | # section 11.3 27 | 28 | exampleOverlapFFT: precision_m.F90 exampleOverlapFFT_nvtx.cuf 29 | nvfortran -O3 -Minfo -Mpreprocess -o $@ $^ -cudalib=nvtx,cufft 30 | 31 | # section 11.4 32 | 33 | ns2d_sp: ns2d.cuf 34 | nvfortran -DSINGLE -O3 -Mpreprocess -o $@ $< -cudalib=cufft 35 | 36 | ns2d_dp: ns2d.cuf 37 | nvfortran -O3 -Mpreprocess -o $@ $< -cudalib=cufft 38 | 39 | 40 | 41 | clean: 42 | rm -rf $(OBJS) *.o *.mod *~ *.hdf 43 | 44 | -------------------------------------------------------------------------------- /code/ch08/p2pAccess.cuf: -------------------------------------------------------------------------------- 1 | program checkP2pAccess 2 | use cudafor 3 | implicit none 4 | integer, allocatable :: p2pOK(:,:) 5 | integer :: nDevices, i, j, istat 6 | type (cudaDeviceProp) :: prop 7 | 8 | istat = cudaGetDeviceCount(nDevices) 9 | print "('Number of CUDA-capable devices: ', i0,/)", & 10 | nDevices 11 | 12 | do i = 0, nDevices-1 13 | istat = cudaGetDeviceProperties(prop, i) 14 | print "('Device ', i0, ': ', a)", i, trim(prop%name) 15 | enddo 16 | print * 17 | 18 | allocate(p2pOK(0:nDevices-1, 0:nDevices-1)) 19 | p2pOK = 0 20 | 21 | do j = 0, nDevices-1 22 | do i = j+1, nDevices-1 23 | istat = cudaDeviceCanAccessPeer(p2pOK(i,j), i, j) 24 | p2pOK(j,i) = p2pOK(i,j) 25 | end do 26 | end do 27 | 28 | do i = 0, nDevices-1 29 | write(*, "(3x,i3)", advance='no') i 30 | enddo 31 | print * 32 | 33 | do j = 0, nDevices-1 34 | write(*,"(i3)", advance='no') j 35 | do i = 0, nDevices-1 36 | if (i == j) then 37 | write(*,"(2x,'-',3x)", advance='no') 38 | else if (p2pOK(i,j) == 1) then 39 | write(*,"(2x, 'Y',3x)",advance='no') 40 | else 41 | write(*,"(6x)",advance='no') 42 | end if 43 | end do 44 | print * 45 | end do 46 | end program 
checkP2pAccess 47 | -------------------------------------------------------------------------------- /code/ch04/twoKernels.cuf: -------------------------------------------------------------------------------- 1 | program twoKernels 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n,2) 6 | real :: a(n,2) 7 | integer(kind=cuda_stream_kind) :: stream1, stream2 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreate(stream2) 12 | a = 1.0 13 | 14 | ! two kernels in the null stream 15 | 16 | a_d = a 17 | !$cuf kernel do <<<1,1>>> 18 | do i = 1, n 19 | a_d(i,1) = a_d(i,1) + i 20 | enddo 21 | !$cuf kernel do <<<1,1>>> 22 | do i = 1, n 23 | a_d(i,2) = a_d(i,2) - i 24 | enddo 25 | a = a_d 26 | 27 | ! one kernel in blocking stream, one in null stream 28 | 29 | a_d = a 30 | !$cuf kernel do <<<1,1,stream=stream1>>> 31 | do i = 1, n 32 | a_d(i,1) = a_d(i,1) + i 33 | enddo 34 | !$cuf kernel do <<<1,1>>> 35 | do i = 1, n 36 | a_d(i,2) = a_d(i,2) - i 37 | enddo 38 | a = a_d 39 | 40 | ! 
two kernels in different, blocking streams 41 | 42 | a_d = a 43 | !$cuf kernel do <<<1,1,stream=stream1>>> 44 | do i = 1, n 45 | a_d(i,1) = a_d(i,1) + i 46 | enddo 47 | !$cuf kernel do <<<1,1,stream=stream2>>> 48 | do i = 1, n 49 | a_d(i,2) = a_d(i,2) - i 50 | enddo 51 | a = a_d 52 | 53 | istat = cudaStreamDestroy(stream1) 54 | istat = cudaStreamDestroy(stream2) 55 | end program twoKernels 56 | -------------------------------------------------------------------------------- /code/ch07/callingC.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | 3 | interface 4 | attributes(global) subroutine kernel(x, v) & 5 | bind(C,name='Ckernel') 6 | use iso_c_binding 7 | real(c_float), device :: x(*) 8 | real(c_float), value :: v 9 | end subroutine kernel 10 | end interface 11 | 12 | interface 13 | attributes(device) function devicefun(x) result(res) & 14 | bind(C,name='Cdevicefun') 15 | use iso_c_binding 16 | real(c_float) :: res 17 | real(c_float), value :: x 18 | end function devicefun 19 | end interface 20 | 21 | contains 22 | 23 | attributes(global) subroutine Fkernel(x, v) 24 | implicit none 25 | real, device :: x(*) 26 | real, value :: v 27 | x(threadIdx%x) = devicefun(v) 28 | end subroutine Fkernel 29 | 30 | end module m 31 | 32 | program main 33 | use m 34 | implicit none 35 | real :: x(1), f(1) 36 | real, device :: x_d(1), f_d(1) 37 | integer :: i 38 | 39 | call kernel<<<1,1>>>(x_d, 1.0) 40 | x = x_d 41 | if (x(1) == 1.0) print *, 'Calling C Kernel -- OK' 42 | 43 | !$cuf kernel do <<<*,*>>> 44 | do i = 1, 1 45 | f_d(i) = devicefun(x_d(i)) 46 | end do 47 | f = f_d 48 | if (f(1) == 2.0) print *, 'CUF Kernel Calling C Function -- OK' 49 | 50 | call Fkernel<<<1,1>>>(x_d, 4.0) 51 | x = x_d 52 | if (x(1) == 8.0) print *, 'Fortran Kernel Calling C Function -- OK' 53 | end program main 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- 
/code/ch09/generate_randomnumbers.cuf: -------------------------------------------------------------------------------- 1 | ! Generate N random numbers on GPU, copy them back to CPU 2 | ! and print the first 4 3 | 4 | program curand_example 5 | use precision_m 6 | use curand 7 | implicit none 8 | real(fp_kind), allocatable:: hostData(:) 9 | real(fp_kind), allocatable, device:: deviceData(:) 10 | type(curandGenerator) :: gen 11 | integer :: N, istat 12 | integer(kind=8) :: seed 13 | 14 | ! Define how many numbers we want to generate 15 | N=20 16 | 17 | ! Allocate array on CPU 18 | allocate(hostData(N)) 19 | 20 | ! Allocate array on GPU 21 | allocate(deviceData(N)) 22 | 23 | if (fp_kind == singlePrecision) then 24 | write(*,"('Generating random numbers in single precision')") 25 | else 26 | write(*,"('Generating random numbers in double precision')") 27 | end if 28 | 29 | ! Create pseudonumber generator 30 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 31 | 32 | ! Set seed 33 | seed=1234 34 | istat= curandSetPseudoRandomGeneratorSeed( gen, seed) 35 | 36 | ! Generate N floats or double on device 37 | istat= curandGenerate(gen, deviceData, N) 38 | 39 | ! Copy the data back to CPU 40 | hostData=deviceData 41 | 42 | ! print the first 4 of the sequence 43 | write(*,*) hostData(1:4) 44 | 45 | ! Deallocate data on CPU and GPU 46 | deallocate(hostData) 47 | deallocate(deviceData) 48 | 49 | ! 
Destroy the generator 50 | istat = curandDestroyGenerator(gen) 51 | 52 | end program curand_example 53 | -------------------------------------------------------------------------------- /code/ch03/limitingFactor.cuf: -------------------------------------------------------------------------------- 1 | module kernel_m 2 | contains 3 | attributes(global) subroutine base(a, b) 4 | real :: a(*), b(*) 5 | integer :: i 6 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 7 | a(i) = sin(b(i)) + cos(b(i)) & 8 | + sin(2.0*b(i)) + cos(2.0*b(i)) & 9 | + sin(3.0*b(i)) + cos(3.0*b(i)) & 10 | + sin(4.0*b(i)) + cos(4.0*b(i)) 11 | end subroutine base 12 | 13 | attributes(global) subroutine memory(a, b) 14 | real :: a(*), b(*) 15 | integer :: i 16 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 17 | a(i) = b(i) 18 | end subroutine memory 19 | 20 | attributes(global) subroutine math(a, b, flag) 21 | real :: a(*) 22 | real, value :: b 23 | integer, value :: flag 24 | real :: v 25 | integer :: i 26 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 27 | v = sin(b) + cos(b) & 28 | + sin(2.0*b) + cos(2.0*b) & 29 | + sin(3.0*b) + cos(3.0*b) & 30 | + sin(4.0*b) + cos(4.0*b) 31 | if (v*flag == 1) a(i) = v 32 | end subroutine math 33 | end module kernel_m 34 | 35 | program limitingFactor 36 | use cudafor 37 | use kernel_m 38 | 39 | implicit none 40 | 41 | integer, parameter :: blockSize = 256 42 | integer, parameter :: n = 64*1024*4*blockSize 43 | real :: a(n) 44 | real, device :: a_d(n), b_d(n) 45 | b_d = 1.0 46 | call base<<<n/blockSize,blockSize>>>(a_d, b_d) 47 | call memory<<<n/blockSize,blockSize>>>(a_d, b_d) 48 | call math<<<n/blockSize,blockSize>>>(a_d, 1.0, 0) 49 | a = a_d 50 | print *, a(1) 51 | end program limitingFactor 52 | -------------------------------------------------------------------------------- /code/ch05/maxSharedMemory.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer,
shared :: s(*) 7 | integer, value :: b 8 | integer :: i, il 9 | 10 | il = threadIdx%x 11 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 12 | s(il) = a(i) 13 | call syncthreads() 14 | a(i) = s(il)+b 15 | end subroutine increment 16 | end module m 17 | 18 | 19 | program maxSharedMemory 20 | use cudafor 21 | use m 22 | implicit none 23 | 24 | integer, parameter :: n = 1024*1024 25 | integer :: a(n) 26 | integer, device :: a_d(n) 27 | type(cudaFuncAttributes) :: attr 28 | type(cudaDeviceProp) :: prop 29 | integer :: istat, smBytes 30 | 31 | istat = cudaGetDeviceProperties(prop, 0) 32 | print "(/,'Device Name: ',a)", trim(prop%name) 33 | print "('Compute Capability: ',i0,'.',i0)", & 34 | prop%major, prop%minor 35 | 36 | print "(/,'sharedMemPerBlock: ', i0)", prop%sharedMemPerBlock 37 | print "('sharedMemPerBlockOptIn: ', i0)", & 38 | prop%sharedMemPerBlockOptIn 39 | print "('sharedMemPerMultiprocessor: ', i0)", & 40 | prop%sharedMemPerMultiprocessor 41 | 42 | smBytes = prop%sharedMemPerBlockOptIn 43 | istat = cudaFuncSetAttribute(increment, & 44 | cudaFuncAttributeMaxDynamicSharedMemorySize, & 45 | smBytes) 46 | 47 | a_d = 1 48 | call increment<<>>(a_d, 2) 49 | a = a_d 50 | if (all(a==3)) then 51 | print "(/,'Passed')" 52 | else 53 | print "(/,'*** Failed ***')" 54 | end if 55 | end program maxSharedMemory 56 | -------------------------------------------------------------------------------- /code/ch09/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = rng_gpu_sp rng_gpu_dp pi_sp pi_dp ieee_accuracy \ 2 | pi_performance_sp shflExample testPiGridGroup \ 3 | accuracy_sum mc_european_single mc_european_double 4 | 5 | 6 | # section 9.1 7 | 8 | rng_gpu_sp: precision_m.F90 generate_randomnumbers.cuf 9 | nvfortran -O3 -o $@ $^ -cudalib=curand 10 | 11 | rng_gpu_dp: precision_m.F90 generate_randomnumbers.cuf 12 | nvfortran -O3 -DDOUBLE -o $@ $^ -cudalib=curand 13 | 14 | # section 9.2 15 | 16 | pi_sp: precision_m.F90 compute_pi.cuf 
17 | nvfortran -O3 -o $@ $^ -cudalib=curand 18 | 19 | pi_dp: precision_m.F90 compute_pi.cuf 20 | nvfortran -DDOUBLE -O3 -o $@ $^ -cudalib=curand 21 | 22 | # section 9.2.1 23 | 24 | ieee_accuracy: ieee_accuracy.f90 25 | nvfortran -o $@ $^ 26 | 27 | # section 9.3.* 28 | 29 | pi_performance_sp: precision_m.F90 pi_shared.cuf pi_shfl.cuf \ 30 | pi_lock.cuf pi_gridGroup.cuf compute_pi_performance.CUF 31 | nvfortran -O3 -DLOOP -o $@ $^ -cudalib=curand 32 | 33 | # section 9.3.1 34 | 35 | shflExample: shflExample.cuf 36 | nvfortran -o $@ $< 37 | 38 | # section 9.3.3 39 | 40 | testPiGridGroup: precision_m.F90 pi_gridGroup.cuf testPiGridGroup.cuf 41 | nvfortran -o $@ $^ -cudalib=curand 42 | 43 | # section 9.4 44 | 45 | accuracy_sum: accuracy_sum.f90 46 | nvfortran -o $@ $^ 47 | 48 | # section 9.5 49 | 50 | mc_european_single: precision_m.F90 montecarlo_european_option.cuf 51 | nvfortran -O3 -Minfo -o $@ $^ -cudalib=curand 52 | 53 | mc_european_double: precision_m.F90 montecarlo_european_option.cuf 54 | nvfortran -O3 -Minfo -DDOUBLE -o $@ $^ -cudalib=curand 55 | 56 | clean: 57 | rm -rf $(OBJS) *.o *~ *.mod *~ 58 | 59 | -------------------------------------------------------------------------------- /code/ch01/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = incrementf90 incrementcuf multiblock multidim explicitInterface \ 2 | managed managedImplicit multidimCUF managedCUF \ 3 | managedCUF2f90 managedCUF2cuf deviceQuery pciBusID \ 4 | errorHandling syncError asyncError version 5 | 6 | all: $(OBJS) 7 | 8 | # section 1.3.1 9 | 10 | incrementf90: increment.f90 11 | nvfortran -o $@ $< 12 | 13 | incrementcuf: increment.cuf 14 | nvfortran -o $@ $< 15 | 16 | # section 1.3.2 17 | 18 | multiblock: multiblock.cuf 19 | nvfortran -o $@ $< 20 | 21 | # section 1.3.3 22 | 23 | multidim: multidim.cuf 24 | nvfortran -o $@ $< 25 | 26 | # section 1.3.4 27 | 28 | explicitInterface: explicitInterface.cuf 29 | nvfortran -o $@ $< 30 | 31 | # 
section 1.3.5 32 | 33 | managed: managed.cuf 34 | nvfortran -o $@ $< 35 | 36 | managedImplicit: managedImplicit.cuf 37 | nvfortran -o $@ -gpu=managed $< 38 | 39 | # section 1.3.6 40 | 41 | multidimCUF: multidimCUF.cuf 42 | nvfortran -o $@ $< 43 | 44 | managedCUF: managedCUF.cuf 45 | nvfortran -o $@ $< 46 | 47 | managedCUF2f90: managedCUF2.f90 48 | nvfortran -o $@ $< 49 | 50 | managedCUF2cuf: managedCUF2.f90 51 | nvfortran -o $@ -cuda $< 52 | 53 | # section 1.4 54 | 55 | deviceQuery: deviceQuery.cuf 56 | nvfortran -o $@ $< 57 | 58 | # section 1.4.1 59 | 60 | pciBusID: pciBusID.cuf 61 | nvfortran -o $@ $< 62 | 63 | # section 1.5 64 | 65 | errorHandling: errorHandling.cuf 66 | nvfortran -o $@ $< 67 | 68 | syncError: syncError.cuf 69 | nvfortran -o $@ $< 70 | 71 | asyncError: asyncError.cuf 72 | nvfortran -o $@ $< 73 | 74 | # section 1.7 75 | 76 | version: version.cuf 77 | nvfortran -o $@ $< 78 | 79 | clean: 80 | rm -rf $(OBJS) *.o *.mod *~ 81 | -------------------------------------------------------------------------------- /code/ch05/local.cuf: -------------------------------------------------------------------------------- 1 | module localmem 2 | implicit none 3 | contains 4 | attributes(global) subroutine k1(a) 5 | integer :: a(*) 6 | integer :: b(2), i 7 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 8 | b = 1 9 | a(i) = b(2) 10 | end subroutine k1 11 | 12 | attributes(global) subroutine k2(a, j) 13 | integer :: a(*) 14 | integer, value :: j 15 | integer :: b(2), i 16 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 17 | b = 1 18 | a(i) = b(j) 19 | end subroutine k2 20 | 21 | attributes(global) subroutine k3(a, j) 22 | integer :: a(*) 23 | integer, value :: j 24 | integer :: b(256), i 25 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 26 | b = 1 27 | a(i) = b(j) 28 | end subroutine k3 29 | 30 | attributes(global) subroutine k4(a, j) 31 | integer :: a(*), j(*) 32 | integer :: b(2), i 33 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 34 | b = 1 35 | a(i) = b(j(i)) 36 | end 
subroutine k4 37 | end module localmem 38 | 39 | 40 | program localAttribute 41 | use localmem 42 | use cudafor 43 | implicit none 44 | type(cudaFuncAttributes) :: attr 45 | integer :: istat 46 | 47 | istat = cudaFuncGetAttributes(attr, k1) 48 | print "('k1 local memory (bytes/thread): ', i0)", attr%localSizeBytes 49 | istat = cudaFuncGetAttributes(attr, k2) 50 | print "('k2 local memory (bytes/thread): ', i0)", attr%localSizeBytes 51 | istat = cudaFuncGetAttributes(attr, k3) 52 | print "('k3 local memory (bytes/thread): ', i0)", attr%localSizeBytes 53 | istat = cudaFuncGetAttributes(attr, k4) 54 | print "('k4 local memory (bytes/thread): ', i0)", attr%localSizeBytes 55 | end program localAttribute 56 | 57 | -------------------------------------------------------------------------------- /code/ch03/effectiveBandwidth.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fpKind = real64 4 | contains 5 | attributes(global) subroutine copy(lhs, rhs, n) 6 | implicit none 7 | real(fpKind) :: lhs(*) 8 | real(fpKind) :: rhs(*) 9 | integer, value :: n 10 | integer :: i 11 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x 12 | if (i <= n) lhs(i) = rhs(i) 13 | end subroutine copy 14 | end module m 15 | 16 | 17 | program stream 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: N = 32*1024*1024 22 | real(fpKind), device :: a_d(N), b_d(N) 23 | integer :: nBlocks, blockSize 24 | real :: time 25 | integer(8) :: nBytes 26 | type(cudaEvent) :: startEvent, stopEvent 27 | integer :: istat 28 | 29 | istat = cudaEventCreate(startEvent) 30 | istat = cudaEventCreate(stopEvent) 31 | 32 | blockSize = 256 33 | nBlocks = (N-1)/blockSize+1 34 | nBytes = 2 * sizeof(a_d) 35 | 36 | b_d = 1.0_fpKind 37 | istat = cudaDeviceSynchronize() 38 | istat = cudaEventRecord(startEvent, 0) 39 | call copy<<<nBlocks, blockSize>>>(a_d, b_d, N) 40 | istat = cudaEventRecord(stopEvent, 0) 41 | istat =
cudaEventSynchronize(stopEvent) 42 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 43 | 44 | block 45 | type(cudaDeviceProp) :: prop 46 | istat = cudaGetDeviceProperties(prop, 0) 47 | print "(A)", trim(prop%name) 48 | end block 49 | if (fpKind == real64) print "('real64')" 50 | if (fpKind == real32) print "('real32')" 51 | print "('Array size (bytes): ', i0)", sizeof(a_d) 52 | print "('Copy effective bandwidth: ', f10.1)", & 53 | nBytes/time/1.0E+6 54 | 55 | end program stream 56 | -------------------------------------------------------------------------------- /code/ch05/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = HDtransfer sliceTransfer async \ 2 | offset stride local constant constantAttribute \ 3 | maxSharedMemory transpose \ 4 | parallelism parallelismPipeline cufILP fma 5 | 6 | 7 | # section 5.1.1 8 | 9 | HDtransfer: HDtransfer.cuf 10 | nvfortran -o $@ $< 11 | 12 | sliceTransfer: sliceTransfer.cuf 13 | nvfortran -o $@ $< 14 | 15 | async: async.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 5.2.2.1 19 | 20 | assumedShapeSize: assumedShapeSize.cuf 21 | nvfortran -c -gpu=ptxinfo $< 22 | 23 | # section 5.2.2.2 24 | 25 | offset: offset.cuf 26 | nvfortran -o $@ $< 27 | 28 | stride: stride.cuf 29 | nvfortran -o $@ $< 30 | 31 | # section 5.2.3 32 | 33 | local: local.cuf 34 | nvfortran -o $@ -gpu=keep $< 35 | 36 | # section 5.2.4 37 | 38 | constant: constant.cuf 39 | nvfortran -o $@ $< 40 | 41 | constantAttribute: constantAttribute.cuf 42 | nvfortran -o $@ $< 43 | 44 | # section 5.2.5 45 | 46 | loads: loads.cuf 47 | nvfortran -c -gpu=keep $< 48 | 49 | # section 5.2.6.1 50 | 51 | maxSharedMemory: maxSharedMemory.cuf 52 | nvfortran -o $@ $< 53 | 54 | # section 5.2.6.2 55 | 56 | transpose: transpose.cuf 57 | nvfortran -o $@ $< 58 | 59 | # section 5.2.7 60 | 61 | spill: spill.cuf 62 | nvfortran -c -gpu=ptxinfo $< 63 | 64 | # section 5.3.1 65 | 66 | parallelism: parallelism.cuf 67 | nvfortran -o $@ $< 
68 | 69 | # section 5.3.2.1 70 | 71 | parallelismPipeline: parallelismPipeline.cuf 72 | nvfortran -gpu=cc80 -o $@ $< 73 | 74 | # section 5.3.2.2 75 | 76 | cufILP: cufILP.cuf 77 | nvfortran -o $@ $< 78 | 79 | # section 5.4.1.4 80 | 81 | fma: fma.cuf 82 | nvfortran -c -gpu=fma,keep $< 83 | 84 | 85 | clean: 86 | rm -rf $(OBJS) *.o *.mod *~ *.bin *.gpu *.ptx *.fat 87 | -------------------------------------------------------------------------------- /code/ch11/fft_test_c2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_c2c 2 | use precision_m 3 | use cufft 4 | implicit none 5 | integer, allocatable :: kx(:) 6 | complex(fp_kind), allocatable :: cinput(:),coutput(:) 7 | complex(fp_kind), allocatable, device :: cinput_d(:),coutput_d(:) 8 | 9 | interface cufftExec 10 | module procedure cufftExecC2C,cufftExecZ2Z 11 | end interface cufftExec 12 | 13 | integer :: i,n,plan,istat 14 | real(fp_kind) :: pi=4._fp_kind*atan(1._fp_kind), h, theta 15 | 16 | n=16 17 | h=2._fp_kind*pi/real(n,fp_kind) 18 | 19 | ! allocate arrays on the host 20 | allocate(cinput(n),coutput(n),kx(n)) 21 | 22 | ! allocate arrays on the device 23 | allocate(cinput_d(n),coutput_d(n)) 24 | 25 | ! initialize arrays on host 26 | kx = [(i-1, i=1,n/2), (-n+i-1, i=n/2+1,n)] 27 | 28 | do i=1,n 29 | cinput(i)=(cos(2*real(i-1,fp_kind)*h)+sin(3*real(i-1,fp_kind)*h)) 30 | end do 31 | 32 | ! copy arrays to device 33 | cinput_d=cinput 34 | 35 | 36 | ! initialize the plan for complex to complex transform 37 | if (fp_kind == real32) istat = cufftPlan1D(plan,n,CUFFT_C2C,1) 38 | if (fp_kind == real64) istat = cufftPlan1D(plan,n,CUFFT_Z2Z,1) 39 | 40 | ! forward transform out of place 41 | istat = cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 42 | 43 | ! 
copy results back to host 44 | coutput=coutput_d 45 | 46 | print *," Transform from complex array" 47 | do i=1,n 48 | write(*,'(i2,1x,2(f8.4),2x,i2,2(f8.4))') & 49 | i,cinput(i),kx(i),coutput(i)/n 50 | end do 51 | 52 | ! release memory on the host and on the device 53 | deallocate(cinput,coutput,kx,cinput_d,coutput_d) 54 | 55 | ! destroy the plan 56 | istat = cufftDestroy(plan) 57 | 58 | end program fft_test_c2c 59 | -------------------------------------------------------------------------------- /code/ch11/fft_test_r2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_r2c 2 | use cudafor 3 | use precision_m 4 | use cufft 5 | implicit none 6 | integer, allocatable :: kx(:) 7 | real(fp_kind), allocatable :: rinput(:) 8 | real(fp_kind), allocatable, device :: rinput_d(:) 9 | complex(fp_kind), allocatable :: coutput(:) 10 | 11 | integer :: i,n,istat,plan 12 | real(fp_kind) :: twopi=8._fp_kind*atan(1._fp_kind),h 13 | 14 | interface cufftExec 15 | module procedure cufftExecR2C,cufftExecD2Z 16 | end interface cufftExec 17 | 18 | n=16 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate(rinput(n),coutput(n/2+1),kx(n/2+1)) 23 | 24 | ! allocate arrays on the device 25 | allocate(rinput_d(n+2)) 26 | 27 | !initialize arrays on host 28 | kx = [(i-1, i=1,n/2+1)] 29 | 30 | do i=1,n 31 | rinput(i)=(cos(2*real(i-1,fp_kind)*h)+ & 32 | sin(3*real(i-1,fp_kind)*h)) 33 | end do 34 | 35 | !copy arrays to device 36 | rinput_d=rinput 37 | 38 | ! Initialize the plan for real to complex transform 39 | if (fp_kind == real32) istat=cufftPlan1D(plan,n,CUFFT_R2C,1) 40 | if (fp_kind == real64) istat=cufftPlan1D(plan,n,CUFFT_D2Z,1) 41 | 42 | ! Execute Forward transform in place 43 | istat=cufftExec(plan,rinput_d,rinput_d) 44 | 45 | ! 
Copy results back to host 46 | istat=cudaMemcpy(coutput,rinput_d,n/2+1,cudaMemcpyDeviceToHost) 47 | 48 | print *," Transform from real array" 49 | do i=1,n/2+1 50 | write(*,'(i2,1x,i2,2(f8.4))') i,kx(i),coutput(i)/n 51 | end do 52 | 53 | !release memory on the host and on the device 54 | deallocate (rinput,coutput,kx,rinput_d) 55 | 56 | ! Destroy the plans 57 | istat=cufftDestroy(plan) 58 | 59 | end program fft_test_r2c 60 | 61 | -------------------------------------------------------------------------------- /code/ch08/mpiDevices.cuf: -------------------------------------------------------------------------------- 1 | program mpiDevices 2 | use cudafor 3 | use mpi 4 | implicit none 5 | 6 | ! global array size 7 | integer, parameter :: n = 1024*1024 8 | ! MPI variables 9 | integer :: myrank, nprocs, ierr 10 | ! device 11 | type(cudaDeviceProp) :: prop 12 | integer(int_ptr_kind()) :: freeB, totalB, freeA, totalA 13 | real, device, allocatable :: d(:) 14 | integer :: i, j, istat 15 | 16 | ! MPI initialization 17 | call MPI_Init(ierr) 18 | call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr) 19 | call MPI_Comm_size(MPI_COMM_WORLD, nProcs, ierr) 20 | 21 | ! print compute mode for device 22 | istat = cudaGetDevice(j) 23 | istat = cudaGetDeviceProperties(prop, j) 24 | do i = 0, nprocs-1 25 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 26 | if (myrank == i) print "('[',i0,'] using device: ', & 27 | i0, ' in compute mode: ', i0)", & 28 | myrank, j, prop%computeMode 29 | enddo 30 | 31 | ! get memory use before large allocations, 32 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 33 | istat = cudaMemGetInfo(freeB, totalB) 34 | 35 | ! now allocate arrays, one rank at a time 36 | do j = 0, nProcs-1 37 | 38 | ! allocate on device associated with rank j 39 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 40 | if (myrank == j) allocate(d(n)) 41 | 42 | ! 
Get free memory after allocation 43 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 44 | istat = cudaMemGetInfo(freeA, totalA) 45 | 46 | if (myrank == j) print "(' [',i0,'] after allocation on rank: ', & 47 | i0, ', device arrays allocated: ', i0)", & 48 | myrank, j, (freeB-freeA)/n/4 49 | 50 | end do 51 | 52 | deallocate(d) 53 | call MPI_Finalize(ierr) 54 | end program mpiDevices 55 | -------------------------------------------------------------------------------- /code/ch05/offset.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind=real32 4 | contains 5 | attributes(global) subroutine increment(a, offset) 6 | real(fp_kind) :: a(*) 7 | integer, value :: offset 8 | integer :: i 9 | i = blockDim%x*(blockIdx%x-1)+threadIdx%x + offset 10 | a(i) = a(i)+1 11 | end subroutine increment 12 | end module m 13 | 14 | program offset 15 | use cudafor 16 | use m 17 | 18 | implicit none 19 | 20 | integer, parameter :: nMB = 128 ! 
transfer size in MB 21 | integer, parameter :: n = nMB*1024*1024/fp_kind 22 | integer, parameter :: blockSize = 256 23 | real(fp_kind), device :: a_d(n+32) 24 | type(cudaEvent) :: startEvent, stopEvent 25 | type(cudaDeviceProp) :: prop 26 | integer :: i, istat 27 | real(4) :: time 28 | 29 | 30 | istat = cudaGetDeviceProperties(prop, 0) 31 | print "(/,'Device: ',a)", trim(prop%name) 32 | print "('Transfer size (MB): ',i0)", nMB 33 | 34 | if (kind(a_d) == real32) then 35 | print "('Single Precision',/)" 36 | else 37 | print "('Double Precision',/)" 38 | endif 39 | 40 | istat = cudaEventCreate(startEvent) 41 | istat = cudaEventCreate(stopEvent) 42 | 43 | print *, 'Offset, Bandwidth (GB/s):' 44 | do i = 0, 32 45 | a_d = 0.0 46 | istat = cudaEventRecord(startEvent,0) 47 | call increment<<>>(a_d, i) 48 | istat = cudaEventRecord(stopEvent,0) 49 | istat = cudaEventSynchronize(stopEvent) 50 | 51 | istat = cudaEventElapsedTime(time, startEvent, & 52 | stopEvent) 53 | print *, i, 2*n*fp_kind/time*1.e-6 54 | enddo 55 | 56 | istat = cudaEventDestroy(startEvent) 57 | istat = cudaEventDestroy(stopEvent) 58 | end program offset 59 | -------------------------------------------------------------------------------- /code/ch05/stride.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind=real32 4 | contains 5 | attributes(global) subroutine increment(a, stride) 6 | implicit none 7 | real(fp_kind) :: a(stride,*) 8 | integer, value :: stride 9 | integer :: i 10 | i = blockDim%x*(blockIdx%x-1)+threadIdx%x 11 | a(1,i) = a(1,i)+1 12 | end subroutine increment 13 | end module m 14 | 15 | program stride 16 | use cudafor 17 | use m 18 | 19 | implicit none 20 | 21 | integer, parameter :: nMB = 128 ! 
transfer size in MB 22 | integer, parameter :: n = nMB*1024*1024/fp_kind 23 | integer, parameter :: blockSize = 256 24 | real(fp_kind), device :: a_d(32,n) 25 | type(cudaEvent) :: startEvent, stopEvent 26 | type(cudaDeviceProp) :: prop 27 | integer :: i, istat 28 | real(4) :: time 29 | 30 | 31 | istat = cudaGetDeviceProperties(prop, 0) 32 | print "(/,'Device: ',a)", trim(prop%name) 33 | print "('Transfer size (MB): ',i0)", nMB 34 | 35 | if (kind(a_d) == real32) then 36 | print "('Single Precision',/)" 37 | else 38 | print "('Double Precision',/)" 39 | endif 40 | 41 | istat = cudaEventCreate(startEvent) 42 | istat = cudaEventCreate(stopEvent) 43 | 44 | print *, 'Stride, Bandwidth (GB/s):' 45 | do i = 1, 32 46 | a_d = 0.0 47 | istat = cudaEventRecord(startEvent,0) 48 | call increment<<>>(a_d, i) 49 | istat = cudaEventRecord(stopEvent,0) 50 | istat = cudaEventSynchronize(stopEvent) 51 | 52 | istat = cudaEventElapsedTime(time, startEvent, & 53 | stopEvent) 54 | print *, i, 2*n*fp_kind/time*1.e-6 55 | enddo 56 | 57 | istat = cudaEventDestroy(startEvent) 58 | istat = cudaEventDestroy(stopEvent) 59 | end program stride 60 | -------------------------------------------------------------------------------- /code/ch07/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = callingC sgemmLegacy sgemmNew getrfBatched gemmPerf \ 2 | cusparseMV cusparseMV_Ex potr matmulTC matmulTranspose \ 3 | cutensorContraction testSort 4 | 5 | ARCH=sm_75 6 | GPU=cc75 7 | 8 | # section 7.1 9 | # (Ubuntu 20.04 sytems require the -c++libs flag when linking) 10 | 11 | c.o: c.cu 12 | nvcc -c -arch=$(ARCH) -rdc=true c.cu 13 | 14 | callingC: callingC.cuf c.o 15 | nvfortran -gpu=$(GPU) -o $@ $^ -c++libs 16 | 17 | # section 7.2.1 18 | 19 | sgemmLegacy: sgemmLegacy.cuf 20 | nvfortran -o $@ $< -cudalib=cublas -lblas 21 | 22 | # section 7.2.2 23 | 24 | sgemmNew: sgemmNew.cuf 25 | nvfortran -o $@ $< -cudalib=cublas 26 | 27 | # section 7.2.3 28 | 29 | 
getrfBatched: getrfBatched.cuf 30 | nvfortran -o $@ $< -cudalib=cublas 31 | 32 | # section 7.2.4 33 | 34 | gemmPerf: gemmPerf.cuf 35 | nvfortran -O3 -o $@ $< -cudalib=cublas 36 | 37 | # section 7.3 38 | 39 | cusparseMV: cusparseMV.cuf 40 | nvfortran -o $@ $< -cudalib=cusparse 41 | 42 | 43 | cusparseMV_Ex: cudaforEx.cuf cusparseEx.cuf cusparseMV_Ex.cuf 44 | nvfortran -o $@ $^ -cudalib=cusparse 45 | 46 | # section 7.4 47 | 48 | potr: cudaforEx.cuf potr.cuf 49 | nvfortran -o $@ $^ -cudalib=cusolver 50 | 51 | # section 7.5 52 | 53 | matmulTC: matmulTC.cuf 54 | nvfortran -o $@ $^ -cudalib=cublas,cutensor 55 | 56 | matmulTranspose: matmulTranspose.cuf 57 | nvfortran -o $@ $^ -cudalib=cutensor 58 | 59 | # section 7.5.1 60 | 61 | cutensorContraction: cutensorContraction.cuf 62 | nvfortran -o $@ $< -cudalib=cutensor 63 | 64 | # section 7.6 65 | 66 | thrust.C.o: thrust.cu 67 | nvcc -arch=$(ARCH) -c -o $@ $< 68 | 69 | testSort: thrust.cuf testSort.cuf thrust.C.o 70 | nvfortran -gpu=$(GPU) -o $@ $^ -c++libs 71 | 72 | 73 | 74 | 75 | clean: 76 | rm -rf $(OBJS) *.o *.mod *~ 77 | 78 | -------------------------------------------------------------------------------- /code/ch07/getrfBatched.cuf: -------------------------------------------------------------------------------- 1 | program testgetrfBatched 2 | use cudafor 3 | use cublas 4 | implicit none 5 | 6 | integer, parameter :: n=2, nbatch=3, lda=n 7 | real :: a(n,n,nbatch) 8 | real, device :: a_d(n,n,nbatch) 9 | type(c_devptr) :: devPtrA(nbatch) 10 | type(c_devptr), device :: devPtrA_d(nbatch) 11 | type(cublasHandle) :: h1 12 | integer :: ipvt(n*nbatch), info(nbatch) 13 | integer, device :: ipvt_d(n*nbatch), info_d(nbatch) 14 | integer :: i, k, istat 15 | 16 | ! 
initialize arrays and transfer to device
use precision_m 28 | implicit none 29 | real(fp_kind) :: input(twoN) 30 | integer :: partial(*) 31 | integer, value :: twoN 32 | 33 | integer, shared :: p_s(*) 34 | integer :: N 35 | integer :: i, index, inext,interior 36 | 37 | N = twoN/2 ! x=input(1:N), y=input(N+1:twoN) 38 | 39 | ! grid-stride loop over data 40 | index=threadIdx%x+(blockIdx%x-1)*blockDim%x 41 | interior=0 42 | do i=index, N, BlockDim%x*GridDim%x 43 | if( (input(i)**2 + input(i+N)**2) <= 1._fp_kind ) & 44 | interior=interior+1 45 | end do 46 | 47 | ! Local reduction within block 48 | index=threadIdx%x 49 | p_s(index)=interior 50 | call syncthreads() 51 | 52 | inext=blockDim%x/2 53 | do while ( inext >=1 ) 54 | if (index <=inext) & 55 | p_s(index)=p_s(index)+p_s(index+inext) 56 | inext = inext/2 57 | call syncthreads() 58 | end do 59 | if (index == 1) partial(blockIdx%x)=p_s(1) 60 | end subroutine partial_pi_shared 61 | end module pi_shared_m 62 | 63 | -------------------------------------------------------------------------------- /code/ch06/testCompact.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use compact 3 | use cudafor 4 | implicit none 5 | integer, parameter :: n=1024*1024*8 6 | real(8) :: a(n), threshold 7 | real(8), device :: a_d(n) 8 | integer, device :: mask_d(n) 9 | real(8), device, allocatable :: ac_d(:) 10 | integer, device, allocatable :: ic_d(:) 11 | real(8), allocatable :: ac(:), ach(:) 12 | integer, allocatable :: ic(:), ich(:) 13 | integer :: nc, nch, i, j, nerri, nerra, iter 14 | real :: t1, t2 15 | 16 | 17 | print *, 'Array size: ', n 18 | print *, 'Block size: ', compact_tpb 19 | 20 | threshold = 0.5d0 21 | call random_number(a) 22 | a_d = a 23 | 24 | mask_d = 0 25 | !$cuf kernel do <<<*,*>>> 26 | do i = 1, n 27 | if (a_d(i) <= threshold) mask_d(i) = 1 28 | enddo 29 | 30 | call cpu_time(t1) 31 | call compact(a_d, mask_d, n, ac_d, ic_d, nc) 32 | call cpu_time(t2) 33 | 34 | print *, 'elapsed time gpu:', t2-t1 35 | 
36 | allocate(ac(nc), ic(nc)) 37 | ac = ac_d 38 | ic = ic_d 39 | 40 | ! do on host 41 | 42 | call cpu_time(t1) 43 | nch = count(a <= threshold) 44 | allocate(ach(nch), ich(nch)) 45 | j = 0 46 | do i = 1, n 47 | if (a(i) <= threshold) then 48 | j = j+1 49 | ach(j) = a(i) 50 | ich(j) = i 51 | end if 52 | if (j == nch) exit 53 | enddo 54 | call cpu_time(t2) 55 | 56 | print *, 'elapsed time host:', t2-t1 57 | 58 | if (nc /= nch) then 59 | print *, 'Error: nc from host and device: ', nch, nc 60 | else 61 | print *, 'nc: ', nc 62 | end if 63 | 64 | nerri = 0 65 | nerra = 0 66 | do i = 1, min(nc,nch) 67 | if (ic(i) /= ich(i)) nerri = nerri+1 68 | if (ac(i) /= ach(i)) nerra = nerra+1 69 | end do 70 | print *, 'Errors in index array: ', nerri 71 | print *, 'Errors in data array: ', nerra 72 | 73 | end program main 74 | -------------------------------------------------------------------------------- /code/ch08/assignDevice.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use mpi 3 | use mpiDeviceUtil 4 | use cudafor 5 | implicit none 6 | 7 | ! global array size 8 | integer, parameter :: n = 1024*1024 9 | ! mpi 10 | character (len=MPI_MAX_PROCESSOR_NAME) :: hostname 11 | integer :: myrank, nprocs, ierr, namelength 12 | ! device 13 | type(cudaDeviceProp) :: prop 14 | integer(cuda_count_kind) :: freeB, totalB, freeA, totalA 15 | real, device, allocatable :: d(:) 16 | integer :: deviceID, i, istat 17 | 18 | call MPI_Init(ierr) 19 | call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr) 20 | call MPI_Comm_size(MPI_COMM_WORLD, nProcs, ierr) 21 | 22 | ! get and set unique device 23 | call assignDevice(deviceID) 24 | 25 | ! print hostname and device ID for each rank 26 | call MPI_Get_processor_name(hostname, namelength, ierr) 27 | do i = 0, nProcs-1 28 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 29 | if (i == myrank) & 30 | print "('[',i0,'] host: ', a, ', device: ', i0)", & 31 | myrank, trim(hostname), deviceID 32 | enddo 33 | 34 | ! 
get memory use before large allocations, 35 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 36 | istat = cudaMemGetInfo(freeB, totalB) 37 | 38 | ! allocate memory on each device 39 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 40 | allocate(d(n)) 41 | 42 | ! Get free memory after allocation 43 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 44 | istat = cudaMemGetInfo(freeA, totalA) 45 | 46 | do i = 0, nProcs-1 47 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 48 | if (i == myrank) & 49 | print "(' [', i0, '] ', & 50 | 'device arrays allocated: ', i0)", & 51 | myrank, (freeB-freeA)/n/4 52 | end do 53 | 54 | deallocate(d) 55 | call MPI_Finalize(ierr) 56 | end program main 57 | 58 | 59 | -------------------------------------------------------------------------------- /code/ch04/sharedExample.cuf: -------------------------------------------------------------------------------- 1 | module reverse 2 | contains 3 | attributes(global) subroutine staticReverse(d) 4 | implicit none 5 | real :: d(*) 6 | real, shared :: s(64) 7 | integer :: t, tr 8 | 9 | t = threadIdx%x 10 | tr = blockDim%x-t+1 11 | 12 | s(t) = d(t) 13 | call syncthreads() 14 | d(t) = s(tr) 15 | end subroutine staticReverse 16 | 17 | attributes(global) subroutine dynamicReverse(d) 18 | implicit none 19 | real :: d(*) 20 | real, shared :: s(*) 21 | integer :: t, tr 22 | 23 | t = threadIdx%x 24 | tr = blockDim%x-t+1 25 | 26 | s(t) = d(t) 27 | call syncthreads() 28 | d(t) = s(tr) 29 | end subroutine dynamicReverse 30 | 31 | attributes(global) subroutine dynamicReverseAuto(d, n) 32 | implicit none 33 | real :: d(n) 34 | integer, value :: n 35 | real, shared :: s(n) 36 | integer :: t, tr 37 | 38 | t = threadIdx%x 39 | tr = n-t+1 40 | 41 | s(t) = d(t) 42 | call syncthreads() 43 | d(t) = s(tr) 44 | end subroutine dynamicReverseAuto 45 | end module reverse 46 | 47 | 48 | program sharedExample 49 | use cudafor 50 | use reverse 51 | 52 | implicit none 53 | 54 | integer, parameter :: n = 64 55 | real :: a(n), r(n), d(n) 56 | real, device :: 
d_d(n) 57 | integer :: i 58 | 59 | do i = 1, n 60 | a(i) = i 61 | r(i) = n-i+1 62 | enddo 63 | 64 | d_d = a 65 | call staticReverse<<<1, n>>>(d_d) 66 | d = d_d 67 | print *, 'staticReverse max error:', maxval(abs(r-d)) 68 | 69 | d_d = a 70 | call dynamicReverse<<<1, n, 4*n>>>(d_d) 71 | d = d_d 72 | print *, 'dynamicReverse max error:', maxval(abs(r-d)) 73 | 74 | d_d = a 75 | call dynamicReverseAuto<<<1, n, 4*n>>>(d_d, n) 76 | d = d_d 77 | print *, 'dynamicReverseAuto max error:', maxval(abs(r-d)) 78 | end program sharedExample 79 | 80 | 81 | -------------------------------------------------------------------------------- /code/ch04/sharedMultiple.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine automaticDSM(a, b, n) 4 | implicit none 5 | real(8) :: a(n) 6 | integer :: b(n) 7 | integer, value :: n 8 | 9 | real(8), shared :: sa(n) 10 | integer, shared :: sb(n) 11 | integer :: t, tr 12 | 13 | t = threadIdx%x 14 | tr = n-t+1 15 | 16 | sa(t) = a(t) 17 | sb(t) = b(t) 18 | call syncthreads() 19 | a(t) = sa(tr) 20 | b(t) = sb(tr) 21 | end subroutine automaticDSM 22 | 23 | attributes(global) subroutine assumedSizeDSM(a, b) 24 | implicit none 25 | real(8) :: a(*) 26 | integer :: b(*) 27 | 28 | real(8), shared :: sa(*) 29 | integer, shared :: sb(*) 30 | integer :: t, tr, offset 31 | 32 | t = threadIdx%x 33 | tr = blockDim%x-t+1 34 | offset = blockDim%x*sizeof(sa(1))/sizeof(sb(1)) 35 | 36 | sa(tr) = a(t) 37 | sb(offset+tr) = b(t) 38 | call syncthreads() 39 | a(t) = sa(t) 40 | b(t) = sb(offset+t) 41 | end subroutine assumedSizeDSM 42 | end module m 43 | 44 | program main 45 | use m 46 | implicit none 47 | integer, parameter :: n = 512 48 | real(8) :: a(n) 49 | real(8), device :: a_d(n) 50 | integer :: b(n) 51 | integer, device :: b_d(n) 52 | integer :: i, nerr 53 | 54 | do i = 1, n 55 | a(i) = i 56 | b(i) = 10*i 57 | enddo 58 | 59 | a_d = a; b_d = b 60 | call 
automaticDSM<<<1,n,n*12>>>(a_d, b_d, n) 61 | a = a_d; b = b_d 62 | 63 | nerr = 0 64 | do i = 1, n 65 | if (a(i) /= n-i+1) nerr = nerr+1 66 | if (b(i) /= 10*(n-i+1)) nerr = nerr+1 67 | end do 68 | 69 | print *, 'automaticDSM errors: ', nerr 70 | 71 | call assumedSizeDSM<<<1,n,n*12>>>(a_d, b_d) 72 | a = a_d; b = b_d 73 | 74 | nerr = 0 75 | do i = 1, n 76 | if (a(i) /= i) nerr = nerr+1 77 | if (b(i) /= 10*i) nerr = nerr+1 78 | end do 79 | 80 | print *, 'assumeSizeDSM errors: ', nerr 81 | end program main 82 | 83 | -------------------------------------------------------------------------------- /code/ch09/accuracy_sum.f90: -------------------------------------------------------------------------------- 1 | program sum_accuracy 2 | implicit none 3 | real, allocatable :: x(:) 4 | real :: sum_intrinsic,sum_cpu, sum_kahan, sum_pairwise, & 5 | comp, y, tmp 6 | double precision :: sum_cpu_dp 7 | integer :: i,inext,icurrent, N=10000000 8 | 9 | allocate (x(N)) 10 | x=7. 11 | 12 | ! Summation using intrinsic 13 | sum_intrinsic=sum(x) 14 | 15 | ! Recursive summation 16 | sum_cpu=0. 17 | sum_cpu_dp=0.d0 18 | do i=1,N 19 | ! accumulator in single precision 20 | sum_cpu=sum_cpu+x(i) 21 | ! accumulator in double precision 22 | sum_cpu_dp=sum_cpu_dp+x(i) 23 | end do 24 | 25 | ! Kahan summation 26 | sum_kahan=0. 27 | comp=0. ! running compensation to recover lost low-order bits 28 | 29 | do i=1,N 30 | y = comp +x(i) 31 | tmp = sum_kahan + y ! low-order bits may be lost 32 | comp = (sum_kahan-tmp)+y ! (sum-tmp) recover low-order bits 33 | sum_kahan = tmp 34 | end do 35 | sum_kahan=sum_kahan +comp 36 | 37 | ! Pairwise summation 38 | icurrent=N 39 | inext=ceiling(real(N)/2) 40 | do while (inext >1) 41 | do i=1,inext 42 | if ( 2*i <= icurrent) x(i)=x(i)+x(i+inext) 43 | end do 44 | icurrent=inext 45 | inext=ceiling(real(inext)/2) 46 | end do 47 | sum_pairwise=x(1)+x(2) 48 | 49 | write(*, "('Summming ',i10, & 50 | ' elements of magnitude ',f3.1)") N,7. 
51 | write(*, "('Sum with intrinsic function =',f12.1, & 52 | ' Error=', f12.1)") & 53 | sum_intrinsic, 7.*N-sum_intrinsic 54 | write(*, "('Recursive sum with SP accumulator =',f12.1, & 55 | ' Error=', f12.1)") sum_cpu, 7.*N-sum_cpu 56 | write(*, "('Recursive sum with DP accumulator =',f12.1, & 57 | ' Error=', f12.1)") sum_cpu_dp, 7.*N-sum_cpu_dp 58 | write(*, "('Pairwise sum in SP =',f12.1, & 59 | ' Error=', f12.1)") sum_pairwise, 7.*N-sum_pairwise 60 | write(*, "('Compensated sum in SP =',f12.1, & 61 | ' Error=', f12.1)") sum_kahan, 7.*N-sum_kahan 62 | 63 | deallocate(x) 64 | end program sum_accuracy 65 | -------------------------------------------------------------------------------- /code/ch07/matmulTC.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensorEx 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | real(8) :: a(m,k), b(k,n) 7 | real(8) :: c(m,n), cref(m,n) 8 | real(8), device :: a_d(m,k), b_d(k,n) 9 | real(8), device :: c_d(m,n) 10 | integer :: istat 11 | 12 | type(cudaDeviceProp) :: prop 13 | type(cudaEvent) :: startEvent, stopEvent 14 | real :: err, time 15 | 16 | istat = cudaGetDeviceProperties(prop, 0) 17 | print "(' Device: ', a)", trim(prop%name) 18 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 19 | 20 | istat = cudaEventCreate(startEvent) 21 | istat = cudaEventCreate(stopEvent) 22 | 23 | call random_number(a) 24 | call random_number(b) 25 | 26 | ! on host 27 | cref = matmul(a,b) 28 | 29 | a_d = a; b_d = b 30 | ! 
for overhead 31 | c_d = matmul(a_d, b_d) 32 | 33 | c_d = 0.0 34 | istat = cudaDeviceSynchronize() 35 | istat = cudaEventRecord(startEvent, 0) 36 | c_d = matmul(a_d, b_d) 37 | istat = cudaEventRecord(stopEvent, 0) 38 | istat = cudaEventSynchronize(stopEvent) 39 | c = c_d 40 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 41 | 42 | print *, 'cuTensor matmul maxval(abs(cref-c)): ', maxval(abs(cref-c)) 43 | print *, 'cuTensor matmul TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 44 | 45 | block 46 | use cublas_v2 47 | type(cublasHandle) :: handle 48 | 49 | istat = cublasCreate(handle) 50 | c_d = 0.0 51 | istat = cudaDeviceSynchronize() 52 | istat = cudaEventRecord(startEvent, 0) 53 | istat = cublasDGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 54 | m, n, k, 1.0_8, a_d, m, b_d, k, 0.0_8, c_d, n) 55 | istat = cudaEventRecord(stopEvent, 0) 56 | istat = cudaEventSynchronize(stopEvent) 57 | c = c_d 58 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 59 | 60 | print *, 'cublasDGemm maxval(abs(cref-c)): ', maxval(abs(cref-c)) 61 | print *, 'cublasDGemm TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 62 | 63 | istat = cublasDestroy(handle) 64 | end block 65 | 66 | end program main 67 | 68 | -------------------------------------------------------------------------------- /code/ch04/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = twoKernels pipeline streamSync eventSync \ 2 | defaultStream defaultStreamVar defaultStreamVarExplicit \ 3 | differentStreamTypes concurrentKernels \ 4 | sharedExample sharedMultiple syncthreads ballot shfl \ 5 | raceAndAtomic raceAndAtomicShared threadfence cgReverse \ 6 | smooth swap 7 | 8 | # section 4.1.2.1 9 | 10 | twoKernels: twoKernels.cuf 11 | nvfortran -o $@ $< 12 | 13 | # section 4.1.3 14 | 15 | pipeline: pipeline.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 4.1.4.2 19 | 20 | streamSync: streamSync.cuf 21 | nvfortran -o $@ $< 22 | 23 | # section 4.1.4.3 24 | 25 | eventSync: 
eventSync.cuf 26 | nvfortran -o $@ $< 27 | 28 | # section 4.1.5.1 29 | 30 | defaultStream: defaultStream.cuf 31 | nvfortran -o $@ $< 32 | 33 | defaultStreamVar: defaultStreamVar.cuf 34 | nvfortran -o $@ $< 35 | 36 | defaultStreamVarExplicit: defaultStreamVarExplicit.cuf 37 | nvfortran -o $@ $< 38 | 39 | # section 4.1.5.2 40 | 41 | differentStreamTypes: differentStreamTypes.cuf 42 | nvfortran -o $@ $< 43 | 44 | concurrentKernels: concurrentKernels.cuf 45 | nvfortran -o $@ $< 46 | 47 | # section 4.2.1 48 | 49 | sharedExample: sharedExample.cuf 50 | nvfortran -o $@ $< 51 | 52 | sharedMultiple: sharedMultiple.cuf 53 | nvfortran -o $@ $< 54 | 55 | # section 4.2.2 56 | 57 | syncthreads: syncthreads.cuf 58 | nvfortran -o $@ $< 59 | 60 | # section 4.2.3 61 | 62 | ballot: ballot.cuf 63 | nvfortran -o $@ $< 64 | 65 | # section 4.2.3.1 66 | 67 | shfl: shfl.cuf 68 | nvfortran -o $@ $< 69 | 70 | # section 4.2.4 71 | 72 | raceAndAtomic: raceAndAtomic.cuf 73 | nvfortran -o $@ $< 74 | 75 | raceAndAtomicShared: raceAndAtomicShared.cuf 76 | nvfortran -o $@ $< 77 | 78 | # section 4.2.5 79 | 80 | threadfence: threadfence.cuf 81 | nvfortran -o $@ $< 82 | 83 | # section 4.2.6 84 | 85 | cgReverse: cgReverse.cuf 86 | nvfortran -o $@ $< 87 | 88 | # section 4.2.6.1 89 | 90 | smooth: smooth.cuf 91 | nvfortran -o $@ $< 92 | 93 | # section 4.2.6.2 94 | 95 | swap: swap.cuf 96 | nvfortran -o $@ -gpu=cc90 $< 97 | 98 | 99 | clean: 100 | rm -rf $(OBJS) *.o *.mod *~ 101 | -------------------------------------------------------------------------------- /code/ch06/test_union.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | 4 | attributes(global) subroutine r1(x, n) 5 | implicit none 6 | real :: x(*) 7 | integer, value :: n 8 | integer :: i 9 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 10 | if (i <= n) x(i) = i 11 | end subroutine r1 12 | 13 | ! 
can use ignore_tkr if you have access to the routine 14 | attributes(global) subroutine c1(x, n) 15 | implicit none 16 | !dir$ ignore_tkr x 17 | complex :: x(*) 18 | integer, value :: n 19 | integer :: i 20 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 21 | if (i <= n) x(i) = cmplx(-i, -1.0) 22 | end subroutine c1 23 | 24 | attributes(global) subroutine c2(x, n) 25 | implicit none 26 | complex :: x(*) 27 | integer, value :: n 28 | integer :: i 29 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 30 | if (i <= n) x(i) = cmplx(-i*2, -2.0) 31 | end subroutine c2 32 | 33 | ! use a wrapper with ignore_tkr 34 | subroutine wrap_c2(x, n, g, b) 35 | implicit none 36 | !dir$ ignore_tkr x 37 | complex, device :: x(*) 38 | integer, value :: n, g, b 39 | call c2<<>>(x, n) 40 | end subroutine wrap_c2 41 | 42 | end module m 43 | 44 | 45 | program main 46 | use m 47 | use union 48 | use cudafor 49 | implicit none 50 | integer, parameter :: blks=2, tpb = 16, n = blks*tpb 51 | real :: r(n) 52 | real, device :: r_d(n) 53 | complex, device, pointer :: ptr(:) 54 | integer :: i 55 | 56 | ! use r_d as real 57 | call r1<<>>(r_d, n) 58 | r = r_d 59 | print *, r(1:4) 60 | 61 | ! OK since ignore_tkr is in the kernel 62 | call c1<<>>(r_d, n/2) 63 | r = r_d 64 | print *, r(1:4) 65 | 66 | ! OK since ignore_tkr is in the wrapper to c2 67 | call wrap_c2(r_d, n/2, blks/2, tpb) 68 | r = r_d 69 | print *, r(1:4) 70 | 71 | r_d = 0.0 72 | 73 | call union(r_d, ptr) 74 | ! OK since ptr is complex 75 | call c2<<>>(ptr, n/2) 76 | r = r_d 77 | print *, r(1:4) 78 | 79 | !for use in CUF kernel as a complex, use pointer 80 | !$cuf kernel do 81 | do i = 1, n/2 82 | ptr(i) = conjg(ptr(i)) 83 | end do 84 | r = r_d 85 | print *, r(1:4) 86 | 87 | end program main 88 | -------------------------------------------------------------------------------- /code/ch09/compute_pi.cuf: -------------------------------------------------------------------------------- 1 | ! 
Compute pi using a Monte Carlo method 2 | 3 | program compute_pi 4 | use precision_m 5 | use curand 6 | implicit none 7 | real(fp_kind), allocatable:: hostData(:) 8 | real(fp_kind), allocatable, device:: deviceData(:) 9 | real(fp_kind) :: pival 10 | type(curandGenerator):: gen 11 | integer :: inside_gpu, inside_cpu, i, istat 12 | integer(kind=8) :: twoN, seed, N 13 | 14 | ! Define how many numbers we want to generate 15 | N=100000 16 | twoN=N*2 17 | 18 | ! Allocate array on CPU 19 | allocate(hostData(twoN)) 20 | 21 | ! Allocate array on GPU 22 | allocate(deviceData(twoN)) 23 | 24 | if (fp_kind == singlePrecision) then 25 | write(*,"('Compute pi in single precision')") 26 | else 27 | write(*,"('Compute pi in double precision')") 28 | end if 29 | 30 | ! Create pseudonumber generator 31 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 32 | 33 | ! Set seed 34 | seed=1234 35 | istat = curandSetPseudoRandomGeneratorSeed( gen, seed) 36 | 37 | ! Generate N floats or double on device 38 | istat = curandGenerate(gen, deviceData, twoN) 39 | 40 | ! Copy the data back to CPU to check result later 41 | hostData=deviceData 42 | 43 | ! Perform the test on GPU using CUF kernel 44 | inside_gpu=0 45 | !$cuf kernel do <<<*,*>>> 46 | do i=1,N 47 | if( (deviceData(i)**2+deviceData(i+N)**2) <= 1._fp_kind ) & 48 | inside_gpu=inside_gpu+1 49 | end do 50 | 51 | ! Perform the test on CPU 52 | inside_cpu=0 53 | do i=1,N 54 | if( (hostData(i)**2+hostData(i+N)**2) <= 1._fp_kind ) & 55 | inside_cpu=inside_cpu+1 56 | end do 57 | 58 | ! Check the results 59 | if (inside_cpu .ne. inside_gpu) & 60 | write(*,*) "Mismatch between CPU/GPU" 61 | 62 | ! Print the value of pi and the error 63 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 64 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, & 65 | " Pi=", pival, & 66 | " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 67 | 68 | ! 
Deallocate data on CPU and GPU 69 | deallocate(hostData) 70 | deallocate(deviceData) 71 | 72 | ! Destroy the generator 73 | istat = curandDestroyGenerator(gen) 74 | end program compute_pi 75 | -------------------------------------------------------------------------------- /code/ch06/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = portingBase portingManaged portingDevice \ 2 | portingManaged_HOST portingManaged_CUDA \ 3 | portingDevice_HOST portingDevice_CUDA \ 4 | portingManagedSent_HOST portingManagedSent_CUDA \ 5 | portingDeviceSent_HOST portingDeviceSent_CUDA \ 6 | laplace2D laplace2DUse_HOST laplace2DUse_CUDA \ 7 | portingAssociate_HOST portingAssociate_CUDA \ 8 | laplace2DAssoc_HOST laplace2DAssoc_CUDA test_union \ 9 | test_union testCompact testCompactOpt 10 | 11 | # section 6.1 12 | 13 | portingBase: portingBase.f90 14 | nvfortran -o $@ $< 15 | 16 | portingManaged: portingManaged.cuf 17 | nvfortran -o $@ $< 18 | 19 | portingDevice: portingDevice.cuf 20 | nvfortran -o $@ $< 21 | 22 | # section 6.2 23 | 24 | portingManaged_HOST: portingManaged_CUDA.F90 25 | nvfortran -o $@ $< 26 | 27 | portingManaged_CUDA: portingManaged_CUDA.F90 28 | nvfortran -cuda -o $@ $< 29 | 30 | portingDevice_HOST: portingDevice_CUDA.F90 31 | nvfortran -o $@ $< 32 | 33 | portingDevice_CUDA: portingDevice_CUDA.F90 34 | nvfortran -cuda -o $@ $< 35 | 36 | 37 | portingManagedSent_HOST: portingManagedSent.f90 38 | nvfortran -o $@ $< 39 | 40 | portingManagedSent_CUDA: portingManagedSent.f90 41 | nvfortran -cuda -o $@ $< 42 | 43 | portingDeviceSent_HOST: portingDeviceSent.F90 44 | nvfortran -o $@ $< 45 | 46 | portingDeviceSent_CUDA: portingDeviceSent.F90 47 | nvfortran -cuda -o $@ $< 48 | 49 | # section 6.3.1 50 | 51 | laplace2D: laplace2D.f90 52 | nvfortran -o $@ $< 53 | 54 | laplace2DUse_HOST: laplace2DUse.F90 55 | nvfortran -O3 -o $@ $< 56 | 57 | laplace2DUse_CUDA: laplace2DUse.F90 58 | nvfortran -O3 -cuda -o $@ $< 59 | 60 | # section 
6.3.2 61 | 62 | portingAssociate_HOST: portingAssociate.f90 63 | nvfortran -o $@ $< 64 | 65 | portingAssociate_CUDA: portingAssociate.f90 66 | nvfortran -cuda -o $@ $< 67 | 68 | laplace2DAssoc_HOST: laplace2DAssoc.f90 69 | nvfortran -O3 -o $@ $< 70 | 71 | laplace2DAssoc_CUDA: laplace2DAssoc.f90 72 | nvfortran -O3 -cuda -o $@ $< 73 | 74 | # section 6.4 75 | 76 | test_union: union_m.cuf test_union.cuf 77 | nvfortran -o $@ $^ 78 | 79 | # section 6.5 80 | 81 | testCompact: compact_m.cuf testCompact.cuf 82 | nvfortran -o $@ $^ 83 | 84 | testCompactOpt: compactOpt_m.cuf testCompact.cuf 85 | nvfortran -o $@ $^ 86 | 87 | 88 | clean: 89 | rm -rf $(OBJS) *.o *.mod *~ 90 | -------------------------------------------------------------------------------- /code/ch04/threadfence.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(a, b, n, useThreadfence) 4 | implicit none 5 | real :: a(n), b(n) 6 | integer, value :: n 7 | logical, value :: useThreadfence 8 | integer :: tid, i 9 | 10 | tid = (blockIdx%x-1)*blockDim%x + threadIdx%x 11 | 12 | do i = tid, n, blockDim%x*gridDim%x 13 | a(i) = i 14 | if (useThreadfence) call threadfence() 15 | b(i) = i 16 | enddo 17 | end subroutine s1 18 | 19 | attributes(global) subroutine s2(a, b, n, flag, useThreadfence) 20 | implicit none 21 | real :: a(n), b(n), aval, bval 22 | integer, value :: n 23 | integer :: flag 24 | logical, value :: useThreadfence 25 | integer :: tid, i, tmp 26 | 27 | tid = (blockIdx%x-1)*blockDim%x + threadIdx%x 28 | do i = tid, n, blockDim%x*gridDim%x 29 | bval = b(i) 30 | if (useThreadfence) call threadfence() 31 | aval = a(i) 32 | if (aval == 0 .and. 
bval == i) tmp = atomicExch(flag, 1) 33 | enddo 34 | end subroutine s2 35 | 36 | end module m 37 | 38 | program threadfence 39 | use cudafor 40 | use m 41 | implicit none 42 | integer, parameter :: n=10000*1024 43 | integer, parameter :: niter = 2000 44 | real, device :: a_d(n), b_d(n) 45 | integer, device :: flag_d 46 | integer :: flag 47 | logical :: useThreadfence 48 | integer(cuda_stream_kind) :: stream1, stream2 49 | integer :: istat, i, j, icount 50 | 51 | istat = cudaStreamCreate(stream1) 52 | istat = cudaStreamCreate(stream2) 53 | 54 | do j = 1, 2 55 | if (j == 1) then 56 | useThreadfence = .false. 57 | print *, 'Runs without threadfence(): ' 58 | else 59 | useThreadfence = .true. 60 | print *, 'Runs with threadfence(): ' 61 | endif 62 | 63 | icount = 0 64 | do i = 1, niter 65 | a_d = 0.0 66 | b_d = 0.0 67 | flag_d = 0 68 | 69 | call s1<<<10,512,0,stream1>>>(a_d, b_d, n, useThreadfence) 70 | call s2<<<10,512,0,stream2>>>(a_d, b_d, n, flag_d, useThreadfence) 71 | flag = flag_d 72 | if (flag == 1) icount = icount+1 73 | enddo 74 | print "(a, i0,'/',i0)", & 75 | ' iterations where out-of-order access observed: ', & 76 | icount, niter 77 | end do 78 | 79 | istat = cudaStreamDestroy(stream1) 80 | istat = cudaStreamDestroy(stream2) 81 | end program threadfence 82 | -------------------------------------------------------------------------------- /code/ch04/swap.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, parameter :: nx = 64 3 | integer, parameter :: readRemoteSharedMemory = 1 4 | integer, parameter :: writeRemoteSharedMemory = 2 5 | contains 6 | attributes(global) cluster_dims(2,1,1) subroutine swap(a, b, mode) 7 | use cooperative_groups 8 | implicit none 9 | integer, device :: a(nx), b(nx) 10 | integer, value :: mode 11 | 12 | type(cluster_group) :: cluster 13 | integer, shared :: s(nx/2) 14 | integer, shared :: ds(nx/2); pointer(dsPtr, ds) 15 | integer :: i, tibIdx, tibDim, ticIdx, bicIdx 16 | 
17 | cluster = this_cluster() 18 | 19 | tibIdx = threadIdx%x ! thread index in block 20 | tibDim = blockDim%x ! number of threads in block 21 | bicIdx = cluster%rank ! block index in cluster 22 | ticIdx = (bicIdx-1)*tibDim + tibIdx ! thread index in cluster 23 | 24 | s(tibIdx) = 0 25 | call syncthreads(cluster) 26 | 27 | ! get pointer to other block's shared memory 28 | if (cluster%rank == 1) then 29 | dsPtr = cluster_map_shared_rank(s, 2) 30 | else 31 | dsPtr = cluster_map_shared_rank(s, 1) 32 | end if 33 | 34 | if (mode == readRemoteSharedMemory) then 35 | s(tibIdx) = a(ticIdx) ! write to local shared memory 36 | call syncthreads(cluster) 37 | b(ticIdx) = ds(tibIdx) ! read from remote shared memory 38 | call syncthreads(cluster) ! ensure remote read finishes before exit 39 | else ! writeRemoteSharedMemory 40 | ds(tibIdx) = a(ticIdx) ! write to remote shared memory 41 | call syncthreads(cluster) 42 | b(ticIdx) = s(tibIdx) ! read from local shared memory 43 | end if 44 | end subroutine swap 45 | end module m 46 | 47 | program main 48 | use m 49 | use cudafor 50 | implicit none 51 | integer :: a(nx), b(nx), gold(nx) 52 | integer, device :: a_d(nx), b_d(nx) 53 | integer :: i 54 | 55 | do i = 1, nx 56 | a(i) = i 57 | enddo 58 | 59 | gold(1:nx/2) = a(nx/2+1:nx) 60 | gold(nx/2+1:nx) = a(1:nx/2) 61 | 62 | a_d = a 63 | b_d = -1 64 | 65 | call swap<<<2,nx/2>>>(a_d, b_d, readRemoteSharedMemory) 66 | b = b_d 67 | print *, 'Remote read maxval(abs(gold-b)): ', maxval(abs(gold-b)) 68 | 69 | b_d = -1 70 | call swap<<<2,nx/2>>>(a_d, b_d, writeRemoteSharedMemory) 71 | b = b_d 72 | print *, 'Remote write maxval(abs(gold-b)): ', maxval(abs(gold-b)) 73 | end program main 74 | -------------------------------------------------------------------------------- /code/ch11/fft_derivative.cuf: -------------------------------------------------------------------------------- 1 | program fft_derivative 2 | use precision_m 3 | use cufft 4 | implicit none 5 | real(fp_kind), allocatable :: kx(:), 
derivative(:) 6 | real(fp_kind), allocatable, device :: kx_d(:) 7 | 8 | complex(fp_kind), allocatable :: cinput(:),coutput(:) 9 | complex(fp_kind), allocatable, device :: cinput_d(:),coutput_d(:) 10 | 11 | integer :: i,n,plan, istat 12 | real(fp_kind) :: twopi=8._fp_kind*atan(1._fp_kind), h 13 | 14 | interface cufftExec 15 | module procedure cufftExecC2C, cufftExecZ2Z 16 | end interface cufftExec 17 | 18 | n=8 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate(cinput(n),coutput(n),derivative(n),kx(n)) 23 | 24 | ! allocate arrays on the device 25 | allocate(cinput_d(n),coutput_d(n),kx_d(n)) 26 | 27 | ! initialize arrays on host 28 | kx = [((i-1),i=1,n/2), ((-n+i-1),i=n/2+1,n)] 29 | 30 | ! Set the wave number for the Nyquist frequency to zero 31 | kx(n/2+1) = 0._fp_kind 32 | 33 | ! Copy the wave number vector to the device 34 | kx_d = kx 35 | 36 | do i=1,n 37 | cinput(i) = (cos(2*real(i-1,fp_kind)*h) & 38 | +sin(3*real(i-1,fp_kind)*h)) 39 | derivative(i) = (-2*sin(2*real(i-1,fp_kind)*h) & 40 | +3*cos(3*real(i-1,fp_kind)*h)) 41 | end do 42 | 43 | ! copy input to device 44 | cinput_d = cinput 45 | 46 | ! Initialize the plan for complex to complex transform 47 | if (fp_kind == real32) istat=cufftPlan1D(plan,n,CUFFT_C2C,1) 48 | if (fp_kind == real64) istat=cufftPlan1D(plan,n,CUFFT_Z2Z,1) 49 | 50 | ! Forward transform out of place 51 | istat = cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 52 | 53 | ! Compute the derivative in spectral space and normalize the FFT 54 | !$cuf kernel do <<<*,*>>> 55 | do i=1,n 56 | coutput_d(i) = cmplx(0.,kx_d(i),fp_kind)*coutput_d(i)/n 57 | end do 58 | 59 | ! Inverse transform in place 60 | istat = cufftExec(plan,coutput_d,coutput_d,CUFFT_INVERSE) 61 | 62 | ! 
Copy results back to host 63 | coutput = coutput_d 64 | 65 | print *," First Derivative from complex array" 66 | do i=1,n 67 | write(*,'(i2,2(1x,f8.4),2x,e13.7)') i, real(coutput(i)), & 68 | derivative(i), real(coutput(i))-derivative(i) 69 | end do 70 | 71 | !release memory on the host and on the device 72 | deallocate(cinput,coutput,kx,derivative,cinput_d,coutput_d,kx_d) 73 | 74 | ! Destroy the plan 75 | istat = cufftDestroy(plan) 76 | 77 | end program fft_derivative 78 | -------------------------------------------------------------------------------- /code/ch07/cusparseMV_Ex.cuf: -------------------------------------------------------------------------------- 1 | program sparseMatVec 2 | use cudafor 3 | use cusparseEx 4 | use, intrinsic :: iso_fortran_env 5 | 6 | implicit none 7 | 8 | integer, parameter :: n = 5 ! # rows/cols in matrix 9 | integer, parameter :: nnz = 5 ! # nonzeros in matrix 10 | integer, parameter :: fpKind = real32 11 | 12 | type(cusparseHandle) :: h 13 | type(cusparseSpMatDescr) :: descrA 14 | type(cusparseDnVecDescr) :: descrX, descrY 15 | 16 | ! CSR matrix 17 | real(fpKind), device :: csrValues_d(nnz) 18 | integer(4), device :: csrRowOffsets_d(n+1), csrColInd_d(nnz) 19 | 20 | ! dense vectors 21 | real(fpKind), device :: x_d(n), y_d(n) 22 | real(fpKind) :: y(n) 23 | 24 | ! parameters 25 | real(fpKind) :: alpha, beta 26 | 27 | integer :: status, i 28 | 29 | if (fpKind == real32) then 30 | print *, 'Using single precision' 31 | else 32 | print *, 'Using double precision' 33 | endif 34 | 35 | 36 | ! initalize cusparse 37 | status = cusparseCreate(h) 38 | 39 | ! CSR representation for upper circular shift matrix 40 | csrValues_d = 1.0 41 | csrColInd_d = [2, 3, 4, 5, 1] 42 | csrRowOffsets_d = [1, 2, 3, 4, 5, 6] 43 | 44 | ! vectors 45 | x_d = [11.0, 12.0, 13.0, 14.0, 15.0] 46 | y_d = 0.0 47 | 48 | y = x_d 49 | print *, 'Original vector' 50 | print "(5(1x,f7.2))", y 51 | 52 | ! 
initialize sparse matrix descriptor A in CSR format 53 | status = cusparseCreateCsr(descrA, & 54 | n, csrRowOffsets_d, csrColInd_d, csrValues_d) 55 | 56 | ! initialize the dense vector descriptors for X and Y 57 | status = cusparseCreateDnVec(descrX, x_d) 58 | status = cusparseCreateDnVec(descrY, y_d) 59 | 60 | ! y = alpha*A*x + beta*y 61 | 62 | alpha = 1.0 63 | beta = 0.0 64 | status = cusparseSpMV(h, CUSPARSE_OPERATION_NON_TRANSPOSE, & 65 | alpha, descrA, descrX, beta, descrY) 66 | 67 | y = y_d 68 | print *, 'Shifted vector' 69 | print "(5(1x,f7.2))", y 70 | 71 | ! shift down and subtract original 72 | ! x = alpha*(A')*y + beta*x 73 | 74 | alpha = 1.0 75 | beta = -1.0 76 | status = cusparseSpMV(h, CUSPARSE_OPERATION_TRANSPOSE, & 77 | alpha, descrA, descrY, beta, descrX) 78 | 79 | y = x_d 80 | print *, 'Max error: ', maxval(abs(y)) 81 | 82 | ! cleanup 83 | 84 | status = cusparseDestroySpMat(descrA) 85 | status = cusparseDestroyDnVec(descrX) 86 | status = cusparseDestroyDnVec(descrY) 87 | status = cusparseDestroy(h) 88 | 89 | end program sparseMatVec 90 | -------------------------------------------------------------------------------- /code/ch01/deviceQuery.cuf: -------------------------------------------------------------------------------- 1 | program deviceQuery 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | print "(/,'No CUDA devices found',/)" 14 | stop 15 | else if (nDevices == 1) then 16 | print "(/,'One CUDA device found',/)" 17 | else 18 | print "(/,i0,' CUDA devices found',/)", nDevices 19 | end if 20 | 21 | ! Loop over devices (N.B. 0-based enumeration) 22 | 23 | do i = 0, nDevices-1 24 | 25 | print "('Device Number: ',i0)", i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | 29 | ! 
General device info 30 | 31 | print "(' Device Name: ', a)", trim(prop%name) 32 | print "(' Compute Capability: ',i0,'.',i0)", & 33 | prop%major, prop%minor 34 | print "(' Number of Multiprocessors: ',i0)", & 35 | prop%multiProcessorCount 36 | print "(' Single- to Double-Precision Perf Ratio: & 37 | &', i0)", & 38 | prop%singleToDoublePrecisionPerfRatio 39 | print "(' Max Threads per Multiprocessor: ',i0)", & 40 | prop%maxThreadsPerMultiprocessor 41 | if (prop%cooperativeLaunch == 0) then 42 | print "(' Supports Cooperative Kernels: No',/)" 43 | else 44 | print "(' Supports Cooperative Kernels: Yes',/)" 45 | end if 46 | print "(' Global Memory (GB): ',f9.3,/)", & 47 | prop%totalGlobalMem/1024.0**3 48 | 49 | 50 | ! Execution Configuration 51 | 52 | print "(' Execution Configuration Limits')" 53 | print "(' Max Grid Dims: ',2(i0,' x '),i0)", & 54 | prop%maxGridSize 55 | print "(' Max Block Dims: ',2(i0,' x '),i0)", & 56 | prop%maxThreadsDim 57 | print "(' Max Threads per Block: ',i0,/)", & 58 | prop%maxThreadsPerBlock 59 | 60 | ! 
Has managed memory 61 | 62 | print "(' Managed Memory')" 63 | if (prop%managedMemory == 0) then 64 | print "(' Can Allocate Managed Memory: No')" 65 | else 66 | print "(' Can Allocate Managed Memory: Yes')" 67 | endif 68 | if (prop%concurrentManagedAccess == 0) then 69 | print "(' Device/CPU Concurrent Access & 70 | &to Managed Memory: No',/)" 71 | else 72 | print "(' Device/CPU Concurrent Access & 73 | &to Managed Memory: Yes',/)" 74 | endif 75 | 76 | enddo 77 | 78 | end program deviceQuery 79 | -------------------------------------------------------------------------------- /code/ch09/testPiGridGroup.cuf: -------------------------------------------------------------------------------- 1 | program testPiGridGroup 2 | use pi_gridGroup_m 3 | use precision_m 4 | use curand 5 | use cudafor 6 | implicit none 7 | 8 | integer, parameter :: blockSize = 256 9 | real(fp_kind), allocatable:: data_h(:) 10 | real(fp_kind), allocatable, device:: data_d(:) 11 | real(fp_kind) :: pival 12 | 13 | integer, device, allocatable :: partial_d(:) 14 | type(curandGenerator):: gen 15 | integer :: inside_gpu, inside_cpu, i, istat 16 | integer :: twoN, N, nBlocks 17 | integer(8) :: seed 18 | 19 | N=200000 20 | twoN=N*2 21 | 22 | allocate(data_h(twoN), data_d(twoN)) 23 | 24 | if (fp_kind == singlePrecision) then 25 | write(*,"('Compute pi in single precision')") 26 | else 27 | write(*,"('Compute pi in double precision')") 28 | end if 29 | 30 | ! Create pseudonumber generator 31 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 32 | 33 | ! Set seed 34 | seed=1234 35 | istat = curandSetPseudoRandomGeneratorSeed(gen, seed) 36 | 37 | ! Generate N floats or doubles on device 38 | istat = curandGenerate(gen, data_d, twoN) 39 | 40 | data_h = data_d 41 | 42 | ! 
determine the # of blocks to launch for grid_global kernel 43 | block 44 | type(cudaDeviceProp) :: prop 45 | istat = cudaGetDeviceProperties(prop, 0) 46 | istat = cudaOccupancyMaxActiveBlocksPerMultiprocessor( & 47 | nBlocks, pi_gg, blockSize, 0) 48 | nBlocks = nBlocks * prop%multiProcessorCount 49 | end block 50 | 51 | ! allocate the partial() array 52 | allocate(partial_d(nBlocks)) 53 | 54 | ! Perform the test on GPU using grid_global kernel 55 | inside_gpu=0 56 | call pi_gg<<<nBlocks,blockSize>>>(data_d, partial_d, twoN) 57 | inside_gpu = partial_d(1) 58 | 59 | ! Perform the test on CPU 60 | inside_cpu=0 61 | do i=1,N 62 | if( (data_h(i)**2 + data_h(i+N)**2) <= 1._fp_kind ) & 63 | inside_cpu=inside_cpu+1 64 | end do 65 | 66 | ! Check the results 67 | if (inside_cpu .ne. inside_gpu) & 68 | write(*,*) "Mismatch between CPU/GPU" 69 | 70 | ! Print the value of pi and the error 71 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 72 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, & 73 | " Pi=", pival, & 74 | " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 75 | 76 | ! Deallocate data on CPU and GPU 77 | deallocate(data_h, data_d, partial_d) 78 | 79 | !
Destroy the generator 80 | istat = curandDestroyGenerator(gen) 81 | 82 | end program testPiGridGroup 83 | -------------------------------------------------------------------------------- /code/ch07/potr.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudaforEx 3 | use cublas 4 | use cusolverDN 5 | implicit none 6 | integer, parameter :: n=3 7 | real(8) :: a(n,n), b(n) 8 | real(8), device :: a_d(n,n), b_d(n) 9 | integer, device :: devInfo_d 10 | integer(8) :: wsSizeH, wsSizeD 11 | integer(1), allocatable :: buffer(:) 12 | integer(1), device, allocatable :: buffer_d(:) 13 | integer, device :: info_d 14 | integer :: istat, i 15 | 16 | type(cusolverDnHandle) :: h 17 | type(cusolverDnParams) :: params 18 | 19 | a = reshape([25, 15, -5, 15, 18, 0, -5, 0, 11], shape=[3,3]) 20 | print *, 'A:' 21 | do i = 1, n 22 | print *, a(i,:) 23 | enddo 24 | a_d = a 25 | b = [40, 51, 28] 26 | b_d = b 27 | print *, 'b:' 28 | print *, b 29 | 30 | istat = cusolverDnCreate(h) 31 | istat = cusolverDnCreateParams(params) 32 | istat = cusolverDnXpotrf_buffersize(handle = h, & 33 | params = params, & 34 | uplo = CUBLAS_FILL_MODE_UPPER, & 35 | n = n, & 36 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 37 | A = a_d, & 38 | lda = n, & 39 | computeType = cudaDataType(cudaforGetDataType(a_d)), & 40 | workspaceinBytesOnDevice = wsSizeD, & 41 | workspaceinBytesOnHost = wsSizeH) 42 | 43 | allocate(buffer(wsSizeH), buffer_d(wsSizeD)) 44 | 45 | istat = cusolverDnXpotrf(handle = h, & 46 | params = params, & 47 | uplo = CUBLAS_FILL_MODE_UPPER, & 48 | n = n, & 49 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 50 | A = a_d, & 51 | lda = n, & 52 | computeType = cudaDataType(cudaforGetDataType(a_d)), & 53 | bufferOnDevice = buffer_d, & 54 | workspaceinBytesOnDevice = wsSizeD, & 55 | bufferOnHost = buffer, & 56 | workspaceinBytesOnHost = wsSizeH, & 57 | devinfo = info_d) 58 | 59 | a = a_d 60 | 61 | print *, 'Cholesky factorization:' 
62 | do i = 1, n 63 | print *, a(i,:) 64 | enddo 65 | 66 | istat = cusolverDnXpotrs(handle = h, & 67 | params = params, & 68 | uplo = CUBLAS_FILL_MODE_UPPER, & 69 | n = n, & 70 | nrhs = 1, & 71 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 72 | A = a_d, & 73 | lda = n, & 74 | dataTypeB = cudaDataType(cudaforGetDataType(b_d)), & 75 | B = b_d, & 76 | ldb = n, & 77 | devinfo = info_d) 78 | 79 | b = b_d 80 | 81 | print *, 'x:' 82 | print *, b 83 | 84 | ! cleanup 85 | 86 | deallocate(buffer, buffer_d) 87 | istat = cusolverDnDestroyParams(params) 88 | istat = cusolverDnDestroy(h) 89 | end program main 90 | -------------------------------------------------------------------------------- /code/ch06/laplace2D.f90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | end module arrays 14 | 15 | module laplaceRoutines 16 | contains 17 | subroutine initialize() 18 | use parameters 19 | use arrays 20 | implicit none 21 | real(fp_kind), parameter :: & 22 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 23 | real(fp_kind) :: y0(nx) 24 | integer :: i 25 | 26 | do i = 1, nx 27 | y0(i) = sin(pi*(i-1)/(nx-1)) 28 | enddo 29 | a = 0.0_fp_kind 30 | a(:,1) = y0 31 | a(:,ny) = y0*exp(-pi) 32 | aNew = a 33 | end subroutine initialize 34 | 35 | 36 | subroutine laplaceSolution() 37 | use parameters 38 | use arrays 39 | implicit none 40 | real(fp_kind) :: maxResidual = 2*tol 41 | integer :: iter 42 | 43 | iter=0 44 | do while ( maxResidual > tol .and. 
iter <= iterMax ) 45 | iter = iter + 1 46 | call jacobiIteration() 47 | maxResidual = maxval(absResidual) 48 | if(mod(iter,reportInterval) == 0) & 49 | print '(i8,3x,f10.6)', iter, maxResidual 50 | a = aNew 51 | end do 52 | end subroutine laplaceSolution 53 | 54 | 55 | subroutine jacobiIteration() 56 | use parameters 57 | use arrays 58 | implicit none 59 | integer :: i, j 60 | 61 | do j=2,ny-1 62 | do i=2,nx-1 63 | aNew(i,j) = 0.2_fp_kind * & 64 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 65 | 0.05_fp_kind * & 66 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 67 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 68 | end do 69 | end do 70 | end subroutine jacobiIteration 71 | 72 | end module laplaceRoutines 73 | 74 | 75 | program main 76 | use parameters 77 | use arrays 78 | use laplaceRoutines 79 | implicit none 80 | real :: startTime, stopTime 81 | 82 | print '(/,a,i0,a,i0,a)', & 83 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 84 | print *, 'Iteration Max Residual' 85 | 86 | call initialize() 87 | 88 | call cpu_time(startTime) 89 | call laplaceSolution() 90 | 91 | call cpu_time(stopTime) 92 | print '(a,f10.3,a)', ' Completed in ', & 93 | stopTime-startTime, ' seconds' 94 | end program main 95 | -------------------------------------------------------------------------------- /code/ch09/pi_shfl.cuf: -------------------------------------------------------------------------------- 1 | module pi_shfl_m 2 | contains 3 | attributes(global) subroutine partial_pi_shfl(input, partial, twoN) 4 | use precision_m 5 | implicit none 6 | real(fp_kind) :: input(twoN) 7 | integer :: partial(*) 8 | integer, value :: twoN 9 | 10 | integer, shared :: p_s(32) 11 | integer :: N, interior 12 | integer :: i, tid, width, warpID, laneID 13 | 14 | N = twoN/2 15 | tid = threadIdx%x+(BlockIdx%x-1)*BlockDim%x 16 | 17 | interior=0 18 | do i = tid, N, BlockDim%x*GridDim%x 19 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 20 | interior=interior+1 21 | end do 22 | 23 | ! 
Local reduction per warp 24 | i = 1 25 | do while (i < warpsize) 26 | interior = interior + __shfl_xor(interior,i) 27 | i = i*2 28 | end do 29 | 30 | ! first element of a warp writes to shared memory 31 | warpID = (threadIdx%x-1)/warpsize+1 ! warp ID within block 32 | laneID = threadIdx%x-(warpID-1)*warpsize ! thread ID within warp 33 | if (laneID == 1) p_s(warpID)=interior 34 | call syncthreads() 35 | 36 | ! reduction of shared memory values by first warp 37 | if (warpID == 1) then 38 | interior = p_s(laneID) 39 | width = blockDim%x/warpsize 40 | i = 1 41 | do while (i < width) 42 | interior = interior + __shfl_xor(interior, i, width) 43 | i = i*2 44 | end do 45 | if (laneID == 1) partial(blockIdx%x) = interior 46 | end if 47 | end subroutine partial_pi_shfl 48 | 49 | attributes(global) subroutine final_pi_shfl(partial) 50 | implicit none 51 | integer :: partial(*) 52 | 53 | integer, shared :: p_s(32) 54 | integer :: val 55 | integer :: i, warpID, laneID, width 56 | 57 | warpID = (threadIdx%x-1)/warpsize+1 58 | laneID = threadIdx%x - (warpID-1)*warpsize 59 | 60 | val = partial(threadIdx%x) 61 | i = 1 62 | do while (i < warpsize) 63 | val = val + __shfl_xor(val, i) 64 | i = i*2 65 | enddo 66 | 67 | ! if more than one warp, reduce amongst warps 68 | if (blockDim%x > warpsize) then 69 | if (laneID == 1) p_s(warpID) = val 70 | call syncthreads() 71 | 72 | if (warpID == 1) then 73 | val = p_s(laneID) 74 | width = blockDim%x/warpsize 75 | i = 1 76 | do while (i < width) 77 | val = val + __shfl_xor(val, i, width) 78 | i=i*2 79 | enddo 80 | end if 81 | endif 82 | 83 | if (warpID == 1 .and. 
laneID == 1) partial(1) = val 84 | end subroutine final_pi_shfl 85 | 86 | end module pi_shfl_m 87 | 88 | 89 | -------------------------------------------------------------------------------- /code/ch09/pi_gridGroup.cuf: -------------------------------------------------------------------------------- 1 | module pi_gridGroup_m 2 | contains 3 | attributes(grid_global) subroutine pi_gg(input, partial, twoN) 4 | use cooperative_groups 5 | use precision_m 6 | implicit none 7 | real(fp_kind) :: input(twoN) 8 | integer :: partial(*) 9 | integer, value :: twoN 10 | 11 | integer, shared :: p_s(32) 12 | type(grid_group) :: gg 13 | integer :: N, interior 14 | integer :: i, warpID, laneID, width 15 | 16 | warpID = ishft(threadIdx%x-1,-5)+1 17 | laneID = iand(threadIdx%x-1,31)+1 18 | N = twoN/2 19 | 20 | gg = this_grid() 21 | 22 | interior=0 23 | do i = gg%rank, N, gg%size 24 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 25 | interior=interior+1 26 | end do 27 | 28 | ! Local reduction per warp 29 | i = 1 30 | do while (i < warpsize) 31 | interior = interior + __shfl_xor(interior,i) 32 | i = i*2 33 | end do 34 | 35 | ! first element of a warp writes to shared memory 36 | if (laneID == 1) p_s(warpID)=interior 37 | 38 | call syncthreads(this_thread_block()) 39 | 40 | ! reduction of shared memory values by first warp 41 | if (warpID == 1) then 42 | interior = p_s(laneID) 43 | width = blockDim%x/warpsize 44 | i = 1 45 | do while (i < width) 46 | interior = interior + __shfl_xor(interior, i, width) 47 | i = i*2 48 | end do 49 | if (laneID == 1) partial(blockIdx%x) = interior 50 | end if 51 | 52 | call syncthreads(gg) 53 | 54 | if (blockIdx%x == 1) then 55 | 56 | ! block-stride loop (if gridDim%x > blockDim%x) 57 | interior = 0 58 | do i = threadIdx%x, gridDim%x, blockDim%x 59 | interior = interior + partial(i) 60 | enddo 61 | 62 | i = 1 63 | do while (i < warpsize) 64 | interior = interior + __shfl_xor(interior, i) 65 | i = i*2 66 | enddo 67 | 68 | ! 
if more than one warp, reduce amongst warps 69 | if (gridDim%x > warpsize) then 70 | if (warpID == 1) p_s(laneID) = 0 71 | call syncthreads(this_thread_block()) 72 | if (laneID == 1) p_s(warpID) = interior 73 | call syncthreads(this_thread_block()) 74 | 75 | if (warpID == 1) then 76 | interior = p_s(laneID) 77 | i = 1 78 | do while (i < warpsize) 79 | interior = interior + __shfl_xor(interior, i) 80 | i=i*2 81 | enddo 82 | end if 83 | endif 84 | 85 | if (warpID == 1 .and. laneID == 1) partial(1) = interior 86 | end if 87 | 88 | end subroutine pi_gg 89 | 90 | end module pi_gridGroup_m 91 | 92 | 93 | -------------------------------------------------------------------------------- /code/ch08/p2pBandwidth.cuf: -------------------------------------------------------------------------------- 1 | program p2pBandwidth 2 | use cudafor 3 | implicit none 4 | integer, parameter :: N = 4*1024*1024 5 | type distributedArray 6 | real, device, allocatable :: a_d(:) 7 | end type distributedArray 8 | type (distributedArray), allocatable :: distArray(:) 9 | 10 | real, allocatable :: bandwidth(:,:) 11 | real :: array(N), time 12 | integer :: nDevices, access, i, j, istat 13 | type (cudaDeviceProp) :: prop 14 | type (cudaEvent) :: startEvent, stopEvent 15 | 16 | istat = cudaGetDeviceCount(nDevices) 17 | print "('Number of CUDA-capable devices: ', i0,/)", & 18 | nDevices 19 | 20 | do i = 0, nDevices-1 21 | istat = cudaGetDeviceProperties(prop, i) 22 | print "('Device ', i0, ': ', a)", i, trim(prop%name) 23 | enddo 24 | print * 25 | 26 | allocate(distArray(0:nDevices-1)) 27 | 28 | do j = 0, nDevices-1 29 | istat = cudaSetDevice(j) 30 | allocate(distArray(j)%a_d(N)) 31 | distArray(j)%a_d = j 32 | do i = j+1, nDevices-1 33 | istat = cudaDeviceCanAccessPeer(access, j, i) 34 | if (access == 1) then 35 | istat = cudaSetDevice(j) 36 | istat = cudaDeviceEnablePeerAccess(i, 0) 37 | istat = cudaSetDevice(i) 38 | istat = cudaDeviceEnablePeerAccess(j, 0) 39 | endif 40 | enddo 41 | end do 42 | 43 | 
allocate(bandwidth(0:nDevices-1, 0:nDevices-1)) 44 | bandwidth = 0.0 45 | 46 | do j = 0, nDevices-1 47 | istat = cudaSetDevice(j) 48 | istat = cudaEventCreate(startEvent) 49 | istat = cudaEventCreate(stopEvent) 50 | do i = 0, nDevices-1 51 | if (i == j) cycle 52 | istat = cudaMemcpyPeer(distArray(j)%a_d, j, & 53 | distArray(i)%a_d, i, N) 54 | istat = cudaEventRecord(startEvent,0) 55 | istat = cudaMemcpyPeer(distArray(j)%a_d, j, & 56 | distArray(i)%a_d, i, N) 57 | istat = cudaEventRecord(stopEvent,0) 58 | istat = cudaEventSynchronize(stopEvent) 59 | istat = cudaEventElapsedTime(time, & 60 | startEvent, stopEvent) 61 | 62 | array = distArray(j)%a_d 63 | if (all(array == i)) bandwidth(j,i) = N*4/time/1.0E+6 64 | end do 65 | distArray(j)%a_d = j 66 | istat = cudaEventDestroy(startEvent) 67 | istat = cudaEventDestroy(stopEvent) 68 | enddo 69 | 70 | print "('Bandwidth (GB/s) for transfer size (MB): ', & 71 | f9.3,/)", N*4.0/1024**2 72 | write (*,"(' S\\R 0')", advance='no') 73 | do i = 1, nDevices-1 74 | write(*,"(5x,i3)", advance='no') i 75 | enddo 76 | print * 77 | 78 | do j = 0, nDevices-1 79 | write(*,"(i3)", advance='no') j 80 | do i = 0, nDevices-1 81 | if (i == j) then 82 | write(*,"(4x,'-',3x)", advance='no') 83 | else 84 | write(*,"(f8.2)",advance='no') bandwidth(j,i) 85 | end if 86 | end do 87 | write(*,*) 88 | end do 89 | 90 | ! 
cleanup 91 | do j = 0, nDevices-1 92 | deallocate(distArray(j)%a_d) 93 | end do 94 | deallocate(distArray,bandwidth) 95 | 96 | end program p2pBandwidth 97 | -------------------------------------------------------------------------------- /code/ch06/laplace2DAssoc.f90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | !@cuf real(fp_kind), device :: a_d(nx,ny), aNew_d(nx,ny) 14 | !@cuf attributes(device) :: absResidual 15 | end module arrays 16 | 17 | module laplaceRoutines 18 | contains 19 | subroutine initialize() 20 | use parameters 21 | use arrays 22 | implicit none 23 | real(fp_kind), parameter :: & 24 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 25 | real(fp_kind) :: y0(nx) 26 | integer :: i 27 | 28 | do i = 1, nx 29 | y0(i) = sin(pi*(i-1)/(nx-1)) 30 | enddo 31 | a = 0.0_fp_kind 32 | a(:,1) = y0 33 | a(:,ny) = y0*exp(-pi) 34 | aNew = a 35 | !@cuf aNew_d = aNew 36 | !@cuf a_d = a 37 | end subroutine initialize 38 | 39 | 40 | subroutine laplaceSolution() 41 | use parameters 42 | !@cuf use cudafor 43 | use arrays 44 | implicit none 45 | real(fp_kind) :: maxResidual = 2*tol 46 | integer :: iter 47 | 48 | iter=0 49 | !@cuf associate(a=>a_d, aNew=>aNew_d) 50 | do while ( maxResidual > tol .and. 
iter <= iterMax ) 51 | iter = iter + 1 52 | call jacobiIteration() 53 | maxResidual = maxval(absResidual) 54 | if(mod(iter,reportInterval) == 0) & 55 | print '(i8,3x,f10.6)', iter, maxResidual 56 | a = aNew 57 | end do 58 | !@cuf end associate 59 | end subroutine laplaceSolution 60 | 61 | 62 | subroutine jacobiIteration() 63 | use parameters 64 | use arrays 65 | implicit none 66 | integer :: i, j 67 | 68 | !@cuf associate(a=>a_d, aNew=> aNew_d) 69 | !$cuf kernel do(2) <<<*,*>>> 70 | do j=2,ny-1 71 | do i=2,nx-1 72 | aNew(i,j) = 0.2_fp_kind * & 73 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 74 | 0.05_fp_kind * & 75 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 76 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 77 | end do 78 | end do 79 | !@cuf end associate 80 | end subroutine jacobiIteration 81 | 82 | end module laplaceRoutines 83 | 84 | 85 | program main 86 | use parameters 87 | use arrays 88 | use laplaceRoutines 89 | implicit none 90 | 91 | real :: startTime, stopTime 92 | 93 | !@cuf print '(/,a,/)', 'GPU associate version' 94 | print '(/,a,i0,a,i0,a)', & 95 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 96 | 97 | print *, 'Iteration Max Residual' 98 | 99 | call initialize() 100 | 101 | call cpu_time(startTime) 102 | call laplaceSolution() 103 | 104 | call cpu_time(stopTime) 105 | print '(a,f10.3,a)', ' Completed in ', & 106 | stopTime-startTime, ' seconds' 107 | end program main 108 | -------------------------------------------------------------------------------- /code/ch06/laplace2DUse.F90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), 
aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | !@cuf real(fp_kind), device :: a_d(nx,ny), aNew_d(nx,ny) 14 | !@cuf attributes(device) :: absResidual 15 | end module arrays 16 | 17 | module laplaceRoutines 18 | contains 19 | subroutine initialize() 20 | use parameters 21 | use arrays 22 | implicit none 23 | real(fp_kind), parameter :: & 24 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 25 | real(fp_kind) :: y0(nx) 26 | integer :: i 27 | 28 | do i = 1, nx 29 | y0(i) = sin(pi*(i-1)/(nx-1)) 30 | enddo 31 | a = 0.0_fp_kind 32 | a(:,1) = y0 33 | a(:,ny) = y0*exp(-pi) 34 | aNew = a 35 | !@cuf aNew_d = aNew 36 | !@cuf a_d = a 37 | end subroutine initialize 38 | 39 | 40 | subroutine laplaceSolution() 41 | use parameters 42 | #ifdef _CUDA 43 | use arrays, only: a => a_d, aNew => aNew_d, absResidual 44 | #else 45 | use arrays 46 | #endif 47 | !@cuf use cudafor 48 | implicit none 49 | real(fp_kind) :: maxResidual = 2*tol 50 | integer :: iter 51 | 52 | iter=0 53 | do while ( maxResidual > tol .and. iter <= iterMax ) 54 | iter = iter + 1 55 | call jacobiIteration() 56 | maxResidual = maxval(absResidual) 57 | if(mod(iter,reportInterval) == 0) & 58 | print '(i8,3x,f10.6)', iter, maxResidual 59 | a = aNew 60 | end do 61 | end subroutine laplaceSolution 62 | 63 | 64 | subroutine jacobiIteration() 65 | use parameters 66 | #ifdef _CUDA 67 | use arrays, only: a => a_d, aNew => aNew_d, absResidual 68 | #else 69 | use arrays 70 | #endif 71 | implicit none 72 | integer :: i, j 73 | 74 | !$cuf kernel do(2) <<<*,*>>> 75 | do j=2,ny-1 76 | do i=2,nx-1 77 | aNew(i,j) = 0.2_fp_kind * & 78 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 79 | 0.05_fp_kind * & 80 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 81 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 82 | end do 83 | end do 84 | end subroutine jacobiIteration 85 | 86 | end module laplaceRoutines 87 | 88 | 89 | program main 90 | use parameters 91 | use arrays 92 | use laplaceRoutines 93 | implicit none 94 | 95 | real :: startTime, stopTime 96 | 97 | !@cuf 
print '(/,a,/)', 'GPU version' 98 | print '(/,a,i0,a,i0,a)', & 99 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 100 | 101 | print *, 'Iteration Max Residual' 102 | 103 | call initialize() 104 | 105 | call cpu_time(startTime) 106 | call laplaceSolution() 107 | 108 | call cpu_time(stopTime) 109 | print '(a,f10.3,a)', ' Completed in ', & 110 | stopTime-startTime, ' seconds' 111 | end program main 112 | -------------------------------------------------------------------------------- /code/ch05/async.cuf: -------------------------------------------------------------------------------- 1 | program async 2 | use cudafor 3 | implicit none 4 | integer, parameter :: maxStreams = 64 5 | integer, parameter :: n = maxStreams*1024*1024 6 | 7 | real(8), pinned, allocatable :: a(:), b(:) 8 | real(8), device :: a_d(n), b_d(n) 9 | real(8) :: gold(n) 10 | 11 | integer(kind=cuda_stream_kind) :: stream(maxStreams) 12 | type (cudaEvent) :: startEvent, stopEvent 13 | 14 | real :: time 15 | integer :: nStreams, i, j, offset, istat 16 | 17 | block 18 | type (cudaDeviceProp) :: prop 19 | istat = cudaGetDeviceProperties(prop, 0) 20 | print "(' Device: ', a)", trim(prop%name) 21 | print "(' Array size (MB): ', f8.2)", real(n)*8/1000/1000 22 | print "(/,' Streams time (ms)' )" 23 | end block 24 | 25 | block 26 | logical :: pinnedFlag 27 | ! allocate pinned host memory 28 | allocate(a(n), STAT=istat, PINNED=pinnedFlag) 29 | if (istat /= 0) then 30 | print *, 'Allocation of a failed' 31 | stop 32 | else 33 | if (.not. pinnedFlag) & 34 | print *, 'Pinned a allocation failed' 35 | end if 36 | allocate(b(n), STAT=istat, PINNED=pinnedFlag) 37 | if (istat /= 0) then 38 | print *, 'Allocation of b failed' 39 | stop 40 | else 41 | if (.not. pinnedFlag) & 42 | print *, 'Pinned b allocation failed' 43 | end if 44 | end block 45 | 46 | ! 
create events and streams 47 | istat = cudaEventCreate(startEvent) 48 | istat = cudaEventCreate(stopEvent) 49 | 50 | do i = 1, maxStreams 51 | istat = cudaStreamCreate(stream(i)) 52 | enddo 53 | 54 | call random_number(a) 55 | gold = 0.0 56 | 57 | ! warm up 58 | a_d = a 59 | !$cuf kernel do <<<*,*>>> 60 | do i = 1, n 61 | b_d(i) = sin(a_d(i)) + sin(2*a_d(i)) & 62 | + sin(3*a_d(i)) + sin(4*a_d(i)) 63 | enddo 64 | gold = b_d 65 | 66 | nStreams = 1 67 | do 68 | istat = cudaEventRecord(startEvent,0) 69 | do j = 1, nStreams 70 | offset = (j-1)*(n/nStreams) 71 | istat = cudaMemcpyAsync(a_d(offset+1), a(offset+1), & 72 | n/nStreams, stream(j)) 73 | 74 | !$cuf kernel do <<<*,*,0,stream(j)>>> 75 | do i = offset+1, offset+n/nStreams 76 | b_d(i) = sin(a_d(i)) + sin(2*a_d(i)) & 77 | + sin(3*a_d(i)) + sin(4*a_d(i)) 78 | enddo 79 | 80 | istat = cudaMemcpyAsync(b(offset+1), b_d(offset+1), & 81 | n/nStreams, stream(j)) 82 | enddo 83 | istat = cudaEventRecord(stopEvent, 0) 84 | istat = cudaEventSynchronize(stopEvent) 85 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 86 | if (maxval(abs(gold-b)) > 0.0) then 87 | print *, n, ' *** Error ***' 88 | else 89 | print *, nStreams, time 90 | end if 91 | nStreams = nStreams*2 92 | if (nStreams > maxStreams) exit 93 | enddo 94 | 95 | ! 
cleanup 96 | istat = cudaEventDestroy(startEvent) 97 | istat = cudaEventDestroy(stopEvent) 98 | do j = 1, maxStreams 99 | istat = cudaStreamDestroy(stream(j)) 100 | enddo 101 | deallocate(a, b) 102 | 103 | end program async 104 | 105 | -------------------------------------------------------------------------------- /code/ch07/gemmPerf.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cublas_v2 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | 7 | type(cublasHandle) :: handle 8 | type(cudaDeviceProp) :: prop 9 | type(cudaEvent) :: startEvent, stopEvent 10 | real :: err, time 11 | integer :: istat 12 | 13 | istat = cudaGetDeviceProperties(prop, 0) 14 | print "(' Device: ', a)", trim(prop%name) 15 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 16 | 17 | istat = cublasCreate(handle) 18 | istat = cudaEventCreate(startEvent) 19 | istat = cudaEventCreate(stopEvent) 20 | 21 | block 22 | real(8) :: a(m,k), b(k,n), c(m,n), cref(m,n) 23 | real(8), device :: a_d(m,k), b_d(k,n), c_d(m,n) 24 | 25 | call random_number(a) 26 | call random_number(b) 27 | cref = matmul(a,b) 28 | 29 | a_d = a; b_d = b; c_d = 0.0 30 | istat = cudaDeviceSynchronize() 31 | istat = cudaEventRecord(startEvent, 0) 32 | istat = cublasDGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 33 | m, n, k, 1.0_8, a_d, m, b_d, k, 0.0_8, c_d, n) 34 | istat = cudaEventRecord(stopEvent, 0) 35 | istat = cudaEventSynchronize(stopEvent) 36 | c = c_d 37 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 38 | 39 | print *, 'DGEMM maxval(abs(cref-c)): ', maxval(abs(cref-c)) 40 | print *, 'DGEMM TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 41 | print * 42 | end block 43 | 44 | block 45 | real(4) :: a(m,k), b(k,n), c(m,n), cref(m,n) 46 | real(4), device :: a_d(m,k), b_d(k,n), c_d(m,n) 47 | 48 | call random_number(a) 49 | call random_number(b) 50 | cref = matmul(a,b) 51 | 52 | a_d = a; b_d = b; c_d = 0.0 53 | istat = 
cudaDeviceSynchronize() 54 | istat = cudaEventRecord(startEvent, 0) 55 | istat = cublasSGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 56 | m, n, k, 1.0, a_d, m, b_d, k, 0.0, c_d, n) 57 | istat = cudaEventRecord(stopEvent, 0) 58 | istat = cudaEventSynchronize(stopEvent) 59 | c = c_d 60 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 61 | 62 | print *, 'SGEMM (FP32) maxval(abs(cref-c)): ', maxval(abs(cref-c)) 63 | print *, 'SGEMM (FP32) TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 64 | print * 65 | end block 66 | 67 | block 68 | real(4) :: a(m,k), b(k,n), c(m,n), cref(m,n) 69 | real(4), device :: a_d(m,k), b_d(k,n), c_d(m,n) 70 | 71 | istat = cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) 72 | 73 | call random_number(a) 74 | call random_number(b) 75 | cref = matmul(a,b) 76 | 77 | a_d = a; b_d = b; c_d = 0.0 78 | istat = cudaDeviceSynchronize() 79 | istat = cudaEventRecord(startEvent, 0) 80 | istat = cublasSGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 81 | m, n, k, 1.0, a_d, m, b_d, k, 0.0, c_d, n) 82 | istat = cudaEventRecord(stopEvent, 0) 83 | istat = cudaEventSynchronize(stopEvent) 84 | c = c_d 85 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 86 | 87 | print *, 'SGEMM (TF32) maxval(abs(cref-c)): ', maxval(abs(cref-c)) 88 | print *, 'SGEMM (TF32) TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 89 | end block 90 | 91 | istat = cublasDestroy(handle) 92 | istat = cudaEventDestroy(startEvent) 93 | istat = cudaEventDestroy(stopEvent) 94 | end program main 95 | -------------------------------------------------------------------------------- /code/ch12/Makefile: -------------------------------------------------------------------------------- 1 | MCUDA = -cuda 2 | MCUDA_CURAND = $(MCUDA) -cudalib=curand 3 | 4 | OBJS = ppmExampleHost ppmExampleCUDA \ 5 | rayHost rayCUDA \ 6 | sphereHost sphereCUDA \ 7 | normalHost normalCUDA \ 8 | twoSpheresHost twoSpheresCUDA \ 9 | antialiasHost antialiasCUDA \ 10 | diffuseHost diffuseCUDA \ 11 | metalHost metalCUDA \ 12 | 
dielectricHost dielectricCUDA \ 13 | cameraHost cameraCUDA \ 14 | defocusBlurHost defocusBlurCUDA \ 15 | coverHost coverCUDA \ 16 | triangleHost triangleCUDA \ 17 | lightsHost lightsCUDA \ 18 | textureHost textureCUDA 19 | 20 | # section 12.1 21 | 22 | ppmExampleHost: ppmExample.f90 23 | nvfortran -o $@ $^ 24 | 25 | ppmExampleCUDA: ppmExample.f90 26 | nvfortran -o $@ $(MCUDA) $^ 27 | 28 | # section 12.2 29 | 30 | rgb_m.host.o: rgb_m.F90 31 | nvfortran -c -o $@ $^ 32 | 33 | rgb_m.cuda.o: rgb_m.F90 34 | nvfortran -c -o $@ $(MCUDA) $^ 35 | 36 | # section 12.3 37 | 38 | rayHost: ray.F90 rgb_m.host.o 39 | nvfortran -o $@ $^ 40 | 41 | rayCUDA: ray.F90 rgb_m.cuda.o 42 | nvfortran -o $@ $(MCUDA) $^ 43 | 44 | # section 12.4 45 | 46 | sphereHost: sphere.F90 rgb_m.host.o 47 | nvfortran -o $@ $^ 48 | 49 | sphereCUDA: sphere.F90 rgb_m.cuda.o 50 | nvfortran -o $@ $(MCUDA) $^ 51 | 52 | # section 12.5 53 | 54 | normalHost: normal.F90 rgb_m.host.o 55 | nvfortran -o $@ $^ 56 | 57 | normalCUDA: normal.F90 rgb_m.cuda.o 58 | nvfortran -o $@ $(MCUDA) $^ 59 | 60 | 61 | twoSpheresHost: twoSpheres.F90 rgb_m.host.o 62 | nvfortran -o $@ $^ 63 | 64 | twoSpheresCUDA: twoSpheres.F90 rgb_m.cuda.o 65 | nvfortran -o $@ $(MCUDA) $^ 66 | 67 | # section 12.6 68 | 69 | antialiasHost: antialias.F90 rgb_m.host.o 70 | nvfortran -o $@ $^ 71 | 72 | antialiasCUDA: antialias.F90 rgb_m.cuda.o 73 | nvfortran -o $@ $(MCUDA_CURAND) $^ 74 | 75 | # section 12.7.1 76 | 77 | diffuseHost: diffuse.F90 rgb_m.host.o 78 | nvfortran -o $@ $^ 79 | 80 | diffuseCUDA: diffuse.F90 rgb_m.cuda.o 81 | nvfortran -o $@ $(MCUDA_CURAND) $^ 82 | 83 | # section 12.7.2 84 | 85 | metalHost: metal.F90 rgb_m.host.o 86 | nvfortran -o $@ $^ 87 | 88 | metalCUDA: metal.F90 rgb_m.cuda.o 89 | nvfortran -o $@ $(MCUDA_CURAND) $^ 90 | 91 | # section 12.7.3 92 | 93 | dielectricHost: dielectric.F90 rgb_m.host.o 94 | nvfortran -o $@ $^ 95 | 96 | dielectricCUDA: dielectric.F90 rgb_m.cuda.o 97 | nvfortran -o $@ $(MCUDA_CURAND) $^ 98 | 99 | # section 
12.8 100 | 101 | cameraHost: camera.F90 rgb_m.host.o 102 | nvfortran -o $@ $^ 103 | 104 | cameraCUDA: camera.F90 rgb_m.cuda.o 105 | nvfortran -o $@ $(MCUDA_CURAND) $^ 106 | 107 | # section 12.9 108 | 109 | defocusBlurHost: defocusBlur.F90 rgb_m.host.o 110 | nvfortran -o $@ $^ 111 | 112 | defocusBlurCUDA: defocusBlur.F90 rgb_m.cuda.o 113 | nvfortran -o $@ $(MCUDA_CURAND) $^ 114 | 115 | # section 12.10 116 | 117 | coverHost: cover.F90 rgb_m.host.o 118 | nvfortran -o $@ $^ 119 | 120 | coverCUDA: cover.F90 rgb_m.cuda.o 121 | nvfortran -o $@ $(MCUDA_CURAND) $^ 122 | 123 | # section 12.11 124 | 125 | triangleHost: triangle.F90 rgb_m.host.o 126 | nvfortran -o $@ $^ 127 | 128 | triangleCUDA: triangle.F90 rgb_m.cuda.o 129 | nvfortran -o $@ $(MCUDA_CURAND) $^ 130 | 131 | # section 12.12 132 | 133 | lightsHost: lights.F90 rgb_m.host.o 134 | nvfortran -o $@ $^ 135 | 136 | lightsCUDA: lights.F90 rgb_m.cuda.o 137 | nvfortran -o $@ $(MCUDA_CURAND) $^ 138 | 139 | # section 12.13 140 | 141 | textureHost: texture.F90 rgb_m.host.o 142 | nvfortran -o $@ $^ 143 | 144 | textureCUDA: texture.F90 rgb_m.cuda.o 145 | nvfortran -o $@ $(MCUDA_CURAND) $^ 146 | 147 | 148 | 149 | clean: 150 | rm -rf a.out *.mod *.o *~ 151 | 152 | clobber: 153 | rm -rf $(OBJS) *.ppm a.out *.mod *.o *~ 154 | -------------------------------------------------------------------------------- /code/ch12/ray.F90: -------------------------------------------------------------------------------- 1 | module rayTracing 2 | #ifdef _CUDA 3 | use rgbCUDA 4 | #else 5 | use rgbHost 6 | #endif 7 | 8 | real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0] 9 | real, parameter :: horizontal(3) = [4.0, 0.0, 0.0] 10 | real, parameter :: vertical(3) = [0.0, 2.0, 0.0] 11 | real, parameter :: origin(3) = [0.0, 0.0, 0.0] 12 | 13 | type ray 14 | real :: origin(3) 15 | real :: dir(3) 16 | end type ray 17 | 18 | interface ray 19 | module procedure rayConstructor 20 | end interface ray 21 | 22 | contains 23 | 24 | !@cuf attributes(device) 
& 25 | function normalize(a) result(res) 26 | implicit none 27 | real :: a(3), res(3) 28 | res = a/sqrt(sum(a**2)) 29 | end function normalize 30 | 31 | !@cuf attributes(device) & 32 | function rayConstructor(origin, dir) result(r) 33 | implicit none 34 | !dir$ ignore_tkr (d) origin, (d) dir 35 | real :: origin(3), dir(3) 36 | type(ray) :: r 37 | r%origin = origin 38 | r%dir = normalize(dir) 39 | end function rayConstructor 40 | 41 | !@cuf attributes(device) & 42 | function color(r) result(res) 43 | implicit none 44 | type(ray) :: r 45 | type(rgb) :: res 46 | real :: t 47 | t = 0.5*(r%dir(2) + 1.0) 48 | res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0]) 49 | end function color 50 | 51 | #ifdef _CUDA 52 | attributes(global) subroutine renderKernel(fb, nx, ny) 53 | implicit none 54 | type(rgb) :: fb(nx,ny) 55 | integer, value :: nx, ny 56 | type(ray) :: r 57 | real :: dir(3) 58 | real :: u, v 59 | integer :: i, j 60 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 61 | j = threadIdx%y + (blockIdx%y-1)*blockDim%y 62 | if (i <= nx .and. 
j <= ny) then 63 | u = real(i)/nx 64 | v = real(j)/ny 65 | dir = lowerLeftCorner + u*horizontal + v*vertical - origin 66 | r = ray(origin, dir) 67 | fb(i,j) = color(r) 68 | end if 69 | end subroutine renderKernel 70 | #endif 71 | 72 | subroutine render(fb) 73 | !@cuf use cudafor 74 | implicit none 75 | type(rgb) :: fb(:,:) 76 | type(ray) :: r 77 | real :: u, v 78 | integer :: nx, ny, i, j 79 | 80 | nx = size(fb,1) 81 | ny = size(fb,2) 82 | 83 | #ifdef _CUDA 84 | block 85 | type(rgb), device, allocatable :: fb_d(:,:) 86 | type(dim3) :: tBlock, grid 87 | 88 | allocate(fb_d(nx,ny)) 89 | tBlock = dim3(32,8,1) 90 | grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1) 91 | call renderKernel<<>>(fb_d, nx, ny) 92 | fb = fb_d 93 | deallocate(fb_d) 94 | end block 95 | #else 96 | do j = 1, ny 97 | do i = 1, nx 98 | u = real(i)/nx 99 | v = real(j)/ny 100 | r = ray(origin, & 101 | lowerLeftCorner + u*horizontal + v*vertical - origin) 102 | fb(i,j) = color(r) 103 | end do 104 | end do 105 | #endif 106 | end subroutine render 107 | 108 | end module rayTracing 109 | 110 | program main 111 | use rayTracing 112 | implicit none 113 | integer, parameter :: nx = 400, ny = 200 114 | integer :: i, j 115 | type(rgb) :: fb(nx,ny) 116 | 117 | call render(fb) 118 | 119 | ! ppm output 120 | 121 | print "(a2)", 'P3' ! indicates RGB colors in ASCII, must be flush left 122 | print *, nx, ny ! width and height of image 123 | print *, 255 ! maximum value for each color 124 | do j = ny, 1, -1 125 | do i = 1, nx 126 | print "(3(1x,i3))", int(255*fb(i,j)%v) 127 | end do 128 | end do 129 | 130 | end program main 131 | -------------------------------------------------------------------------------- /code/ch07/cusparseMV.cuf: -------------------------------------------------------------------------------- 1 | program sparseMatVec 2 | use cudafor 3 | use cusparse 4 | 5 | implicit none 6 | 7 | integer, parameter :: n = 5 ! # rows/cols in matrix 8 | integer, parameter :: nnz = 5 ! 
# nonzeros in matrix 9 | 10 | type(cusparseHandle) :: h 11 | 12 | ! CSR matrix 13 | type(cusparseSpMatDescr) :: descrA 14 | real(4), device :: csrValues_d(nnz) 15 | integer(4), device :: csrRowOffsets_d(n+1), csrColInd_d(nnz) 16 | 17 | ! dense vectors 18 | type(cusparseDnVecDescr) :: descrX, descrY 19 | real, device :: x_d(n), y_d(n) 20 | real :: y(n) 21 | 22 | ! parameters 23 | real(4) :: alpha = 1.0, beta = 0.0 24 | 25 | integer :: status, i 26 | 27 | ! initalize cusparse 28 | status = cusparseCreate(h) 29 | 30 | ! CSR representation for upper circular shift matrix 31 | csrValues_d = 1.0 32 | csrColInd_d = [2, 3, 4, 5, 1] 33 | csrRowOffsets_d = [1, 2, 3, 4, 5, 6] 34 | 35 | ! vectors 36 | x_d = [11.0, 12.0, 13.0, 14.0, 15.0] 37 | y_d = 0.0 38 | 39 | y = x_d 40 | print *, 'Original vector' 41 | print "(5(1x,f7.2))", y 42 | 43 | ! initialize sparse matrix descriptor A in CSR format 44 | status = cusparseCreateCsr(descr = descrA, & 45 | rows = n, & 46 | cols = n, & 47 | nnz = nnz, & 48 | csrRowOffsets = csrRowOffsets_d, & 49 | csrColInd = csrColInd_d, & 50 | csrValues = csrValues_d, & 51 | csrRowOffsetsType = CUSPARSE_INDEX_32I, & 52 | csrColIndType = CUSPARSE_INDEX_32I, & 53 | idxBase = CUSPARSE_INDEX_BASE_ONE, & 54 | valueType = CUDA_R_32F) 55 | 56 | ! initialize the dense vector descriptors for X and Y 57 | status = cusparseCreateDnVec(descrX, n, x_d, valueType = CUDA_R_32F) 58 | status = cusparseCreateDnVec(descrY, n, y_d, valueType = CUDA_R_32F) 59 | 60 | ! 
y = alpha*A*x + beta*y 61 | 62 | block 63 | integer(8) :: bufferSize 64 | integer(1), allocatable, device :: buffer_d(:) 65 | 66 | status = cusparseSpMV_buffersize(h, & 67 | CUSPARSE_OPERATION_NON_TRANSPOSE, & 68 | alpha, descrA, descrX, beta, descrY, & 69 | CUDA_R_32F, & 70 | CUSPARSE_SPMV_ALG_DEFAULT, & 71 | buffersize) 72 | 73 | allocate(buffer_d(buffersize)) 74 | 75 | status = cusparseSpMV(h, & 76 | CUSPARSE_OPERATION_NON_TRANSPOSE, & 77 | alpha, descrA, descrX, beta, descrY, & 78 | CUDA_R_32F, & 79 | CUSPARSE_SPMV_ALG_DEFAULT, & 80 | buffer_d) 81 | 82 | deallocate(buffer_d) 83 | end block 84 | 85 | y = y_d 86 | print *, 'Shifted vector' 87 | print "(5(1x,f7.2))", y 88 | 89 | ! shift down and subtract original 90 | ! x = alpha*(A')*y - x 91 | 92 | beta = -1.0 93 | 94 | block 95 | integer(8) :: bufferSize 96 | integer(1), allocatable, device :: buffer_d(:) 97 | 98 | status = cusparseSpMV_buffersize(h, & 99 | CUSPARSE_OPERATION_TRANSPOSE, & 100 | alpha, descrA, descrY, beta, descrX, & 101 | CUDA_R_32F, & 102 | CUSPARSE_SPMV_ALG_DEFAULT, & 103 | buffersize) 104 | 105 | allocate(buffer_d(buffersize)) 106 | 107 | status = cusparseSpMV(h, & 108 | CUSPARSE_OPERATION_TRANSPOSE, & 109 | alpha, descrA, descrY, beta, descrX, & 110 | CUDA_R_32F, & 111 | CUSPARSE_SPMV_ALG_DEFAULT, & 112 | buffer_d) 113 | 114 | deallocate(buffer_d) 115 | end block 116 | 117 | y = x_d 118 | print *, 'Max error: ', maxval(abs(y)) 119 | 120 | ! cleanup 121 | 122 | status = cusparseDestroySpMat(descrA) 123 | status = cusparseDestroyDnVec(descrX) 124 | status = cusparseDestroyDnVec(descrY) 125 | status = cusparseDestroy(h) 126 | 127 | end program sparseMatVec 128 | -------------------------------------------------------------------------------- /code/ch05/HDtransfer.cuf: -------------------------------------------------------------------------------- 1 | program HDtransfer 2 | 3 | use cudafor 4 | implicit none 5 | 6 | integer, parameter :: nElements = 128*1024*1024 7 | 8 | ! 
host arrays 9 | real(4) :: a_pageable(nElements), b_pageable(nElements) 10 | real(4), allocatable, pinned :: a_pinned(:), b_pinned(:) 11 | 12 | ! device arrays 13 | real(4), device :: a_d(nElements) 14 | 15 | ! events for timing 16 | type (cudaEvent) :: startEvent, stopEvent 17 | 18 | ! misc 19 | type (cudaDeviceProp) :: prop 20 | real(4) :: time 21 | integer :: istat, n, i 22 | logical :: pinnedFlag 23 | 24 | ! allocate and initialize 25 | do i = 1, nElements 26 | a_pageable(i) = i 27 | end do 28 | b_pageable = 0.0 29 | 30 | allocate(a_pinned(nElements), b_pinned(nElements), & 31 | STAT=istat, PINNED=pinnedFlag) 32 | if (istat /= 0) then 33 | print *, 'Allocation of a_pinned/b_pinned failed' 34 | pinnedFlag = .false. 35 | else 36 | if (.not. pinnedFlag) print *, 'Pinned allocation failed' 37 | end if 38 | 39 | if (pinnedFlag) then 40 | a_pinned = a_pageable 41 | b_pinned = 0.0 42 | endif 43 | 44 | istat = cudaEventCreate(startEvent) 45 | istat = cudaEventCreate(stopEvent) 46 | 47 | ! output device info and transfer size 48 | istat = cudaGetDeviceProperties(prop, 0) 49 | 50 | print "(/, 'Device: ', a)", trim(prop%name) 51 | 52 | ! 
pageable data transfers 53 | print "(/, 'Pageable transfers')" 54 | print *, 'size (KB) H2D (GB/s) D2H (GB/s)' 55 | 56 | n = 1024 57 | do 58 | if (n > nElements) exit 59 | write(*,'(i8)', advance='no') n*4/1024 60 | 61 | istat = cudaEventRecord(startEvent, 0) 62 | a_d(1:n) = a_pageable(1:n) 63 | istat = cudaEventRecord(stopEvent, 0) 64 | istat = cudaEventSynchronize(stopEvent) 65 | 66 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 67 | write(*,"(f15.2)", advance='no') n*4/time/1.e+6 68 | 69 | istat = cudaEventRecord(startEvent, 0) 70 | b_pageable(1:n) = a_d(1:n) 71 | istat = cudaEventRecord(stopEvent, 0) 72 | istat = cudaEventSynchronize(stopEvent) 73 | 74 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 75 | print "(3x,f15.2)", n*4/time/1.e+6 76 | 77 | if (any(a_pageable(1:n) /= b_pageable(1:n))) & 78 | print *, '*** Pageable transfers failed ***' 79 | 80 | n = n*2 81 | enddo 82 | 83 | ! pinned data transfers 84 | 85 | if (pinnedFlag) then 86 | print "(/, 'Pinned transfers')" 87 | print *, 'size (KB) H2D (GB/s) D2H (GB/s)' 88 | 89 | n = 1024 90 | do 91 | if (n > nElements) exit 92 | write(*,"(i8)", advance='no') n*4/1024 93 | 94 | 95 | istat = cudaEventRecord(startEvent, 0) 96 | a_d(1:n) = a_pinned(1:n) 97 | istat = cudaEventRecord(stopEvent, 0) 98 | istat = cudaEventSynchronize(stopEvent) 99 | 100 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 101 | write(*,"(f15.2)", advance='no') n*4/time/1.e+6 102 | 103 | istat = cudaEventRecord(startEvent, 0) 104 | b_pinned(1:n) = a_d(1:n) 105 | istat = cudaEventRecord(stopEvent, 0) 106 | istat = cudaEventSynchronize(stopEvent) 107 | 108 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 109 | print "(3x,f15.2)", n*4/time/1.e+6 110 | 111 | if (any(a_pinned(1:n) /= b_pinned(1:n))) & 112 | print *, '*** Pinned transfers failed ***' 113 | 114 | n = n*2 115 | end do 116 | 117 | end if 118 | 119 | ! 
cleanup 120 | if (allocated(a_pinned)) deallocate(a_pinned) 121 | if (allocated(b_pinned)) deallocate(b_pinned) 122 | istat = cudaEventDestroy(startEvent) 123 | istat = cudaEventDestroy(stopEvent) 124 | 125 | end program HDtransfer 126 | -------------------------------------------------------------------------------- /code/ch07/cusparseEx.cuf: -------------------------------------------------------------------------------- 1 | module cusparseEx 2 | use cusparse 3 | 4 | interface cusparseCreateCsr 5 | module procedure :: & 6 | cusparseCreateCsr_abr 7 | end interface cusparseCreateCsr 8 | 9 | interface cusparseCreateDnVec 10 | module procedure :: & 11 | cusparseCreateDnVec_abr 12 | end interface cusparseCreateDnVec 13 | 14 | interface cusparseSpMV 15 | module procedure :: & 16 | cusparseSpMV_abr 17 | end interface cusparseSpMV 18 | 19 | contains 20 | 21 | function cusparseGetIndexType(x) result(res) 22 | use cusparse 23 | implicit none 24 | !dir$ ignore_tkr (rd) x 25 | class(*) :: x 26 | integer :: res 27 | select type(x) 28 | type is (integer(4)) 29 | res = CUSPARSE_INDEX_32I 30 | type is (integer(8)) 31 | res = CUSPARSE_INDEX_64I 32 | class default 33 | res = -1 34 | end select 35 | end function cusparseGetIndexType 36 | 37 | ! 
cusparseCreateScr abridged interface 38 | 39 | function cusparseCreateCsr_abr(descr, cols, & 40 | csrRowOffsets_d, csrColInd_d, csrValues_d, idxBase) & 41 | result(status) 42 | use cudaforEx 43 | use cusparse 44 | implicit none 45 | type(cusparseSpMatDescr) :: descr 46 | integer(4) :: cols 47 | class(*), device :: csrRowOffsets_d(:), csrColInd_d(:) 48 | class(*), device :: csrValues_d(:) 49 | integer, optional :: idxBase 50 | integer :: status 51 | 52 | integer :: rows, nnz 53 | integer :: idxBaseLocal = CUSPARSE_INDEX_BASE_ONE 54 | 55 | if (present(idxBase)) idxBaseLocal = idxBase 56 | 57 | rows = size(csrRowOffsets_d)-1 58 | nnz = size(csrValues_d) 59 | 60 | status = cuSparseCreateCsr(descr, & 61 | rows, cols, nnz, & 62 | csrRowOffsets_d, csrColInd_d, csrValues_d, & 63 | cusparseGetIndexType(csrRowOffsets_d), & 64 | cusparseGetIndexType(csrColInd_d), & 65 | idxBaseLocal, & 66 | cudaforGetDataType(csrValues_d)) 67 | end function cusparseCreateCsr_abr 68 | 69 | ! cusparseCreateDnVec abridged call 70 | 71 | function cusparseCreateDnVec_abr(descr, x_d) result(status) 72 | use cudaforEx 73 | use cusparse 74 | implicit none 75 | type(cusparseDnVecDescr) :: descr 76 | class(*), device :: x_d(:) 77 | integer :: status 78 | 79 | status = cusparseCreateDnVec(descr, & 80 | size(x_d), x_d, cudaforGetDataType(x_d)) 81 | end function cusparseCreateDnVec_abr 82 | 83 | ! cusparseSpMV abridged call 84 | 85 | function cusparseSpMV_abr(h, opA, alpha, A, x, beta, y, alg, buffer) & 86 | result(status) 87 | use cudaforEx 88 | use cusparse 89 | implicit none 90 | type(cusparseHandle) :: h 91 | type(cusparseSpMatDescr) :: A 92 | integer :: opA 93 | type(cusparseDnVecDescr) :: x, y 94 | class(*) :: alpha, beta 95 | integer, optional :: alg 96 | integer(1), device, optional :: buffer(*) 97 | integer :: status 98 | 99 | ! 
valueType determined by alpha/beta 100 | integer :: algLocal = CUSPARSE_SPMV_ALG_DEFAULT 101 | 102 | if (present(alg)) algLocal = alg 103 | 104 | if (present(buffer)) then 105 | status = cusparseSpMV(h, & 106 | opA, alpha, A, X, beta, Y, & 107 | cudaforGetDataType(alpha), & 108 | algLocal, buffer) 109 | else 110 | block 111 | integer(8) :: bufferSize 112 | integer(1), allocatable, device :: buffer_d(:) 113 | 114 | status = cusparseSpMV_buffersize(h, & 115 | opA, alpha, A, x, beta, y, & 116 | cudaforGetDataType(alpha), & 117 | algLocal, & 118 | buffersize) 119 | if (status /= CUSPARSE_STATUS_SUCCESS) return 120 | 121 | allocate(buffer_d(buffersize)) 122 | 123 | status = cusparseSpMV(h, & 124 | opA, alpha, A, X, beta, Y, & 125 | cudaforGetDataType(alpha), & 126 | algLocal, & 127 | buffer_d) 128 | 129 | deallocate(buffer_d) 130 | end block 131 | endif 132 | end function cusparseSpMV_abr 133 | 134 | end module cusparseEx 135 | -------------------------------------------------------------------------------- /code/ch07/cutensorContraction.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensor_v2 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | real(8) :: a(m,k), b(k,n), c(m,n), cref(m,n) 7 | real(8), device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real(8) :: alpha = 1.0, beta = 0.0 9 | 10 | type(cudaDeviceProp) :: prop 11 | 12 | type(cutensorHandle) :: handle 13 | type(cutensorTensorDescriptor) :: Adesc, Bdesc, Cdesc 14 | type(cutensorOperationDescriptor) :: opDesc 15 | type(cutensorPlan) :: plan 16 | type(cutensorPlanPreference) :: pref 17 | integer(4) :: aMode(2), bMode(2), cMode(2) 18 | 19 | integer :: istat 20 | type(cutensorStatus) :: ctStat 21 | 22 | istat = cudaGetDeviceProperties(prop, 0) 23 | if (istat /= cudaSuccess) & 24 | print *, cudaGetErrorString(istat) 25 | print "(' Device : ', a)", trim(prop%name) 26 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m , 
n , k 27 | 28 | call random_number(a); call random_number(b) 29 | a_d = a; b_d = b; c_d = 0.0 30 | 31 | ! host reference 32 | cref = matmul(a,b) 33 | 34 | ! Initialize cutensor library 35 | ctStat = cutensorCreate(handle) 36 | if (ctStat /= CUTENSOR_STATUS_SUCCESS) & 37 | print *, cutensorGetErrorString(ctStat) 38 | 39 | ! Create tensor descriptors 40 | block 41 | integer(8) :: extent(2), stride(2) 42 | integer(4) :: nModes 43 | integer(4) :: ialign 44 | 45 | ! A 46 | 47 | nModes=2 48 | extent = shape(a) 49 | stride = [1, m] 50 | Amode = [ichar('m'), ichar('k')] 51 | ialign = 128 52 | 53 | ctStat = cutensorCreateTensorDescriptor(handle, Adesc, & 54 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 55 | 56 | ! B 57 | 58 | nModes=2 59 | extent = shape(b) 60 | stride = [1, k] 61 | Bmode = [ichar('k'), ichar('n')] 62 | ialign = 128 63 | 64 | ctStat = cutensorCreateTensorDescriptor(handle, Bdesc, & 65 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 66 | 67 | ! C 68 | 69 | nModes=2 70 | extent = shape(c) 71 | stride = [1, m] 72 | Cmode = [ichar('m'), ichar('n')] 73 | ialign = 128 74 | 75 | ctStat = cutensorCreateTensorDescriptor(handle, Cdesc, & 76 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 77 | end block 78 | 79 | ctStat = cutensorCreateContraction(handle, opDesc, & 80 | Adesc, Amode, CUTENSOR_OP_IDENTITY, & 81 | Bdesc, Bmode, CUTENSOR_OP_IDENTITY, & 82 | Cdesc, Cmode, CUTENSOR_OP_IDENTITY, & 83 | Cdesc, Cmode, CUTENSOR_COMPUTE_DESC_64F) 84 | 85 | ctStat = cutensorCreatePlanPreference(& 86 | handle, pref, CUTENSOR_ALGO_DEFAULT, CUTENSOR_JIT_MODE_NONE) 87 | 88 | ! 
create work buffer 89 | block 90 | integer(8) :: worksize 91 | integer(1), device, allocatable :: workspace_d(:) 92 | 93 | ctStat = cutensorEstimateWorkspaceSize(handle, opDesc, pref, & 94 | CUTENSOR_WORKSPACE_DEFAULT, worksize) 95 | print *,"Estimated workspace size (B): ", worksize 96 | 97 | ctStat = cutensorCreatePlan(handle, plan, opDesc, pref, worksize) 98 | allocate(workspace_d(worksize)) 99 | 100 | ! now do contraction 101 | block 102 | type(cudaEvent) :: startEvent, stopEvent 103 | real(4) :: time 104 | 105 | istat = cudaEventCreate(startEvent) 106 | istat = cudaEventCreate(stopEvent) 107 | 108 | istat = cudaDeviceSynchronize() 109 | istat = cudaEventRecord(startEvent, 0) 110 | 111 | ctStat = cutensorContract(handle, plan, alpha, a_d, b_d, & 112 | beta, c_d, c_d, workspace_d, worksize, 0) 113 | 114 | istat = cudaEventRecord(stopEvent, 0) 115 | istat = cudaEventSynchronize(stopEvent) 116 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 117 | istat = cudaEventDestroy(startEvent) 118 | istat = cudaEventDestroy(stopEvent) 119 | 120 | c = c_d 121 | print *, 'maxval(abs(c-cref))', maxval(abs(c-cref)) 122 | print *, 'TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 123 | end block 124 | deallocate(workspace_d) 125 | end block 126 | 127 | ! 
ctStat = cutensorDestroy(handle)
ctStat = cutensorDestroyPlan(plan)
ctStat = cutensorDestroyOperationDescriptor(opDesc)
ctStat = cutensorDestroyTensorDescriptor(Adesc)
ctStat = cutensorDestroyTensorDescriptor(Bdesc)
ctStat = cutensorDestroyTensorDescriptor(Cdesc)

end program main
--------------------------------------------------------------------------------
/code/ch12/sphere.F90:
--------------------------------------------------------------------------------
! Ray-tracing example: shades a single sphere flat red against a gradient
! background.  Compiles for host or GPU from the same source: with _CUDA
! defined, the !@cuf sentinels enable device attributes and the kernel path.
module rayTracing
#ifdef _CUDA
  use rgbCUDA
#else
  use rgbHost
#endif

  ! viewport: a 4x2 window at z = -1 viewed from the origin
  real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0]
  real, parameter :: horizontal(3) = [4.0, 0.0, 0.0]
  real, parameter :: vertical(3) = [0.0, 2.0, 0.0]
  real, parameter :: origin(3) = [0.0, 0.0, 0.0]

  type ray
     real :: origin(3)
     real :: dir(3)   ! unit direction (normalized by the constructor)
  end type ray

  interface ray
     module procedure rayConstructor
  end interface ray

  type sphere
     real :: center(3), radius
  end type sphere

contains

  ! Return a/|a| (Euclidean normalization of a 3-vector).
  !@cuf attributes(device) &
  function normalize(a) result(res)
    implicit none
    real :: a(3), res(3)
    res = a/sqrt(sum(a**2))
  end function normalize

  ! Structure constructor for ray; stores the origin and the
  ! normalized direction.
  !@cuf attributes(device) &
  function rayConstructor(origin, dir) result(r)
    implicit none
    !dir$ ignore_tkr (d) origin, (d) dir
    real :: origin(3), dir(3)
    type(ray) :: r
    r%origin = origin
    r%dir = normalize(dir)
  end function rayConstructor

  ! True if ray r intersects sphere s (positive discriminant of the
  ! quadratic |o + t*d - c|^2 = R^2).
  !@cuf attributes(device) &
  function hitSphere(s, r) result(res)
    implicit none
    type(sphere) :: s
    type(ray) :: r
    real :: oc(3), a, b, c, disc
    logical :: res
    oc = r%origin - s%center
    a = dot_product(r%dir, r%dir)
    b = 2.0*dot_product(r%dir, oc)
    c = dot_product(oc, oc) - s%radius**2
    disc = b**2 - 4.0*a*c
    res = (disc > 0.0)
  end function hitSphere

  ! Color for ray r: solid red if it hits the sphere, otherwise a
  ! white-to-blue gradient based on the ray's y direction.
  !@cuf attributes(device) &
  function color(r, s) result(res)
    implicit none
    type(ray) :: r
    type(sphere) :: s
    type(rgb) :: res
    real :: t
    if (hitSphere(s, r)) then
       res = rgb([1.0, 0.0, 0.0])
    else
       t = 0.5*(r%dir(2) + 1.0)
       res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0])
    endif
  end function color

#ifdef _CUDA
  ! One thread per pixel; launched with a 2D grid covering (nx, ny).
  attributes(global) subroutine renderKernel(fb, nx, ny, s)
    implicit none
    type(rgb) :: fb(nx,ny)
    integer, value :: nx, ny
    type(sphere) :: s
    type(ray) :: r
    real :: dir(3)
    real :: u, v
    integer :: i, j

    i = threadIdx%x + (blockIdx%x-1)*blockDim%x
    j = threadIdx%y + (blockIdx%y-1)*blockDim%y
    if (i <= nx .and. j <= ny) then   ! guard the grid tail
       u = real(i)/nx
       v = real(j)/ny
       dir = lowerLeftCorner + u*horizontal + v*vertical - origin
       r = ray(origin, dir)
       fb(i,j) = color(r, s)
    end if
  end subroutine renderKernel
#endif

  ! Fill framebuffer fb with the rendered image of sphere s.
  ! GPU path allocates a device framebuffer, launches the kernel, and
  ! copies back; host path loops over pixels.
  subroutine render(fb, s)
    !@cuf use cudafor
    implicit none
    type(rgb) :: fb(:,:)
    type(sphere) :: s
    type(ray) :: r
    real :: u, v
    integer :: nx, ny, i, j

    nx = size(fb,1)
    ny = size(fb,2)

#ifdef _CUDA
    block
      type(rgb), device, allocatable :: fb_d(:,:)
      type(sphere), device :: s_d
      type(dim3) :: tBlock, grid

      allocate(fb_d(nx,ny))
      s_d = s
      tBlock = dim3(32,8,1)
      grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1)
      ! fix: the launch-configuration chevrons were lost in extraction
      ! (appeared as "<<>>"); restored from grid/tBlock computed above
      call renderKernel<<<grid, tBlock>>>(fb_d, nx, ny, s_d)
      fb = fb_d
      deallocate(fb_d)
    end block
#else
    do j = 1, ny
       do i = 1, nx
          u = real(i)/nx
          v = real(j)/ny
          r = ray(origin, &
               lowerLeftCorner + u*horizontal + v*vertical - origin)
          fb(i,j) = color(r, s)
       end do
    end do
#endif
  end subroutine render

end module rayTracing

program main
  use rayTracing
  implicit none
  integer, parameter :: nx = 400, ny = 200
  integer :: i, j
  type(rgb) :: fb(nx,ny)
  type(sphere) :: s

  ! fix: array constructor elements must all be default real;
  ! original mixed integer -1 with reals (normal.F90 uses -1.0)
  s = sphere([0.0, 0.0, -1.0], 0.5)

  call render(fb, s)

  ! ppm output

  print "(a2)", 'P3'  ! indicates RGB colors in ASCII, must be flush left
  print *, nx, ny     ! width and height of image
  print *, 255        ! maximum value for each color
  do j = ny, 1, -1
     do i = 1, nx
        print "(3(1x,i3))", int(255*fb(i,j)%v)
     end do
  end do

end program main
--------------------------------------------------------------------------------
/code/ch12/normal.F90:
--------------------------------------------------------------------------------
! Ray-tracing example: shades the sphere by its surface normal (RGB from the
! normal components) against a gradient background.  Compiles for host or GPU
! from the same source via the !@cuf sentinels and _CUDA preprocessing.
module rayTracing
#ifdef _CUDA
  use rgbCUDA
#else
  use rgbHost
#endif

  ! viewport: a 4x2 window at z = -1 viewed from the origin
  real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0]
  real, parameter :: horizontal(3) = [4.0, 0.0, 0.0]
  real, parameter :: vertical(3) = [0.0, 2.0, 0.0]
  real, parameter :: origin(3) = [0.0, 0.0, 0.0]

  type ray
     real :: origin(3)
     real :: dir(3)   ! unit direction (normalized by the constructor)
  end type ray

  interface ray
     module procedure rayConstructor
  end interface ray

  type sphere
     real :: center(3), radius
  end type sphere

contains

  ! Return a/|a| (Euclidean normalization of a 3-vector).
  !@cuf attributes(device) &
  function normalize(a) result(res)
    implicit none
    real :: a(3), res(3)

    res = a/sqrt(sum(a**2))
  end function normalize

  ! Structure constructor for ray; stores the origin and the
  ! normalized direction.
  !@cuf attributes(device) &
  function rayConstructor(origin, dir) result(r)
    implicit none
    !dir$ ignore_tkr (d) origin, (d) dir
    real :: origin(3), dir(3)
    type(ray) :: r
    r%origin = origin
    r%dir = normalize(dir)
  end function rayConstructor

  ! Ray-parameter t of the nearest intersection of r with s
  ! (smaller root of the quadratic), or -1.0 if there is no hit.
  !@cuf attributes(device) &
  function hitSphere(s, r) result(res)
    implicit none
    type(sphere) :: s
    type(ray) :: r
    real :: oc(3), a, b, c, disc
    real :: res

    oc = r%origin - s%center
    a = dot_product(r%dir, r%dir)
    b = 2.0*dot_product(r%dir, oc)
    c = dot_product(oc, oc) - s%radius**2
    disc = b**2 - 4.0*a*c
    if (disc > 0.0) then
       res = (-b -sqrt(disc))/(2.0*a)
    else
       res = -1.0
    endif
  end function hitSphere

  ! Color for ray r: on a hit, map the unit surface normal n from
  ! [-1,1] to RGB [0,1]; otherwise a white-to-blue gradient.
  !@cuf attributes(device) &
  function color(r, s) result(res)
    implicit none
    type(ray) :: r
    type(sphere) :: s
    type(rgb) :: res
    real :: t, n(3)

    t = hitSphere(s, r)
    if (t > 0) then
       n = r%dir*t + r%origin - s%center
       n = normalize(n)
       res = rgb(0.5*(n+1.0))
    else
       t = 0.5*(r%dir(2) + 1.0)
       res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0])
    endif
  end function color

#ifdef _CUDA
  ! One thread per pixel; launched with a 2D grid covering (nx, ny).
  attributes(global) subroutine renderKernel(fb, nx, ny, s)
    implicit none
    type(rgb) :: fb(nx,ny)
    integer, value :: nx, ny
    type(sphere) :: s
    type(ray) :: r
    real :: dir(3)
    real :: u, v
    integer :: i, j

    i = threadIdx%x + (blockIdx%x-1)*blockDim%x
    j = threadIdx%y + (blockIdx%y-1)*blockDim%y
    if (i <= nx .and. j <= ny) then   ! guard the grid tail
       u = real(i)/nx
       v = real(j)/ny
       dir = lowerLeftCorner + u*horizontal + v*vertical - origin
       r = ray(origin, dir)
       fb(i,j) = color(r, s)
    end if
  end subroutine renderKernel
#endif

  ! Fill framebuffer fb with the rendered image of sphere s.
  ! GPU path allocates a device framebuffer, launches the kernel, and
  ! copies back; host path loops over pixels.
  subroutine render(fb, s)
    !@cuf use cudafor
    implicit none
    type(rgb) :: fb(:,:)
    type(sphere) :: s
    type(ray) :: r
    real :: u, v
    integer :: nx, ny, i, j

    nx = size(fb,1)
    ny = size(fb,2)

#ifdef _CUDA
    block
      type(rgb), device, allocatable :: fb_d(:,:)
      type(sphere), device :: s_d
      type(dim3) :: tBlock, grid

      allocate(fb_d(nx,ny))
      s_d = s
      tBlock = dim3(32,8,1)
      grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1)
      ! fix: the launch-configuration chevrons were lost in extraction
      ! (appeared as "<<>>"); restored from grid/tBlock computed above
      call renderKernel<<<grid, tBlock>>>(fb_d, nx, ny, s_d)
      fb = fb_d
      deallocate(fb_d)
    end block
#else
    do j = 1, ny
       do i = 1, nx
          u = real(i)/nx
          v = real(j)/ny
          r = ray(origin, &
               lowerLeftCorner + u*horizontal + v*vertical - origin)
          fb(i,j) = color(r, s)
       end do
    end do
#endif
  end subroutine render

end module rayTracing

program main
  use rayTracing
  implicit none
  integer, parameter :: nx = 400, ny = 200
  integer :: i, j
  type(rgb) :: fb(nx,ny)
  type(sphere) :: s

  s = sphere([0.0, 0.0, -1.0], 0.5)
  call render(fb, s)

  ! ppm output

  print "(a2)", 'P3'  ! indicates RGB colors in ASCII, must be flush left
  print *, nx, ny     ! width and height of image
  print *, 255        ! maximum value for each color
  do j = ny, 1, -1
     do i = 1, nx
        print "(3(1x,i3))", int(255*fb(i,j)%v)
     end do
  end do

end program main
--------------------------------------------------------------------------------
/code/ch12/rgb_m.F90:
--------------------------------------------------------------------------------
! RGB color type with overloaded assignment and elementwise arithmetic
! against real(3) arrays, scalars, and other rgb values.  Compiled as
! rgbCUDA (device procedures, via the !@cuf sentinels) or rgbHost.
module &
#ifdef _CUDA
     rgbCUDA
#else
     rgbHost
#endif

  type rgb
     real :: v(3)   ! red, green, blue components
  end type rgb

  interface assignment (=)
     module procedure rgbEqR3, r3EqRgb
  end interface assignment (=)

  interface operator(*)
     module procedure rgbTimesR3, r3TimesRgb, rgbTimesRgb, &
          rgbTimesR, rTimesRgb
  end interface operator(*)

  interface operator(/)
     module procedure rgbDivR3, r3DivRgb, rgbDivRgb, rgbDivI
  end interface operator(/)

  interface operator(+)
     module procedure rgbPlusR3, r3PlusRgb, rgbPlusRgb
  end interface operator(+)

  interface operator(-)
     module procedure rgbMinusR3, r3MinusRgb, rgbMinusRgb
  end interface operator(-)

contains

  ! rgb = real(3)
  !@cuf attributes(device) &
  subroutine rgbEqR3(rgbout, rin)
    type(rgb), intent(out) :: rgbout
    real(4), intent(in) :: rin(3)
    rgbout%v = rin
  end subroutine rgbEqR3

  ! real(3) = rgb
  !@cuf attributes(device) &
  subroutine r3EqRgb(rout, rgbin)
    real(4), intent(out) :: rout(3)
    type(rgb), intent(in) :: rgbin
    rout = rgbin%v
  end subroutine r3EqRgb

  ! rgb * real(3)
  !@cuf attributes(device) &
  function rgbTimesR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rgbTimesR3

  ! real(3) * rgb (commutative, so operand order is immaterial)
  !@cuf attributes(device) &
  function r3TimesRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function r3TimesRgb

  ! rgb * rgb (elementwise)
  !@cuf attributes(device) &
  function rgbTimesRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v * rgb2%v
  end function rgbTimesRgb

  ! rgb * scalar
  !@cuf attributes(device) &
  function rgbTimesR(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rgbTimesR

  ! scalar * rgb
  !@cuf attributes(device) &
  function rTimesRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rTimesRgb

  ! rgb / real(3)
  !@cuf attributes(device) &
  function rgbDivR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v / rin
  end function rgbDivR3

  ! rgb / integer scalar
  !@cuf attributes(device) &
  function rgbDivI(rgbin, iin) result(res)
    type(rgb), intent(in) :: rgbin
    integer(4), intent(in) :: iin
    real(4) :: res(3)
    res = rgbin%v / iin
  end function rgbDivI

  ! real(3) / rgb
  !@cuf attributes(device) &
  function r3DivRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    ! fix: result is rin / rgbin%v; the original had the operands
    ! reversed, making "real3 / rgb" compute "rgb / real3"
    res = rin / rgbin%v
  end function r3DivRgb

  ! rgb / rgb (elementwise)
  !@cuf attributes(device) &
  function rgbDivRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v / rgb2%v
  end function rgbDivRgb

  ! rgb + real(3)
  !@cuf attributes(device) &
  function rgbPlusR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v + rin
  end function rgbPlusR3

  ! real(3) + rgb (commutative, so operand order is immaterial)
  !@cuf attributes(device) &
  function r3PlusRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v + rin
  end function r3PlusRgb

  ! rgb + rgb (elementwise)
  !@cuf attributes(device) &
  function rgbPlusRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v + rgb2%v
  end function rgbPlusRgb

  ! rgb - real(3)
  !@cuf attributes(device) &
  function rgbMinusR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v - rin
  end function rgbMinusR3

  ! real(3) - rgb
  !@cuf attributes(device) &
  function r3MinusRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    ! fix: result is rin - rgbin%v; the original had the operands
    ! reversed, making "real3 - rgb" compute "rgb - real3"
    res = rin - rgbin%v
  end function r3MinusRgb

  ! rgb - rgb (elementwise)
  !@cuf attributes(device) &
  function rgbMinusRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v - rgb2%v
  end function rgbMinusRgb

end module
--------------------------------------------------------------------------------
/code/ch09/montecarlo_european_option.cuf:
--------------------------------------------------------------------------------
! Monte Carlo pricing of a European option on the GPU (CUF kernels),
! checked against the analytic Black-Scholes formula on the host.
module blackscholes_m
  use precision_m
contains

  real(fp_kind) function CND( d )
    ! Cumulative Normal Distribution function
    ! using Hasting's formula
    implicit none
    real(fp_kind), parameter :: A1 = 0.31938153_fp_kind
    real(fp_kind), parameter :: A2 = -0.356563782_fp_kind
    real(fp_kind), parameter :: A3 = 1.781477937_fp_kind
    real(fp_kind), parameter :: A4 = -1.821255978_fp_kind
    real(fp_kind), parameter :: A5 = 1.330274429_fp_kind
    ! abs/exp are intrinsics; no need to declare them as locals
    real(fp_kind) :: d, K, RSQRT2PI

    K = 1.0_fp_kind/(1.0_fp_kind + 0.2316419_fp_kind * abs(d))
    RSQRT2PI = 1._fp_kind/sqrt(8._fp_kind*atan(1._fp_kind))
    CND = RSQRT2PI * exp( -0.5_fp_kind * d * d) * &
         (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    ! Hasting's approximation is for the lower tail; reflect for d > 0
    if( d .gt. 0._fp_kind ) CND = 1.0_fp_kind - CND
    return
  end function CND

  subroutine blackscholes(callResult, putResult, &
       S, E, R, sigma, T)
    ! Black-Scholes formula for call and put
    ! S = asset price at time t
    ! E = exercise (strike) price
    ! sigma = volatility
    ! R = interest rate
    ! T = time to expiration
    implicit none
    real(fp_kind) :: callResult, putResult
    real(fp_kind) :: S, E, R, sigma, T
    real(fp_kind) :: sqrtT, d1, d2, expRT

    if ( T > 0 ) then
       sqrtT = sqrt(T)
       d1 = (log(S/E)+(R+0.5_fp_kind*sigma*sigma)*T) &
            /(sigma*sqrtT)
       d2 = d1 -sigma*sqrtT
       expRT = exp( -R * T)
       callResult = ( S * CND(d1) - E * expRT * CND(d2))
       ! put from put-call parity
       putResult = callResult + E * expRT - S
    else
       ! at expiry the option value is its intrinsic value
       callResult = max(S-E,0._fp_kind)
       putResult = max(E-S,0._fp_kind)
    end if
  end subroutine blackscholes
end module blackscholes_m

program mc
  use blackscholes_m
  use curand
  use cudafor
  implicit none
  real(fp_kind), allocatable, device :: deviceData(:), &
       putValue(:),callValue(:)
  real(fp_kind) :: S, E, R, sigma, T,Sfinal, &
       call_price, put_price
  real(fp_kind) :: meanPut,meanCall, &
       stddevPut, stddevCall, confidence
  type(curandGenerator) :: gen
  integer(kind=8) :: seed
  integer :: i, nargs, istat, N
  type(cudaEvent) :: startEvent,stopEvent
  real :: time
  character*12 arg

  istat=cudaEventCreate(startEvent)
  istat=cudaEventCreate(stopEvent)

  ! Number of samples
  nargs=command_argument_count()
  if ( nargs == 0 ) then
     N = 1000000
  else
     call get_command_argument(1,arg)
     ! fix: '(i)' is a nonstandard edit descriptor (no field width);
     ! list-directed read is portable
     read(arg,*) N
  endif

  S = 5._fp_kind; E = 4._fp_kind
  sigma = 0.3_fp_kind; R = 0.05_fp_kind
  T = 1._fp_kind

  istat=cudaEventRecord(startEvent,0)  !start timing

  !Allocate arrays on GPU
  allocate (deviceData(N),putValue(N),callValue(N))

  if (fp_kind == singlePrecision) then
     print *, " European option with random numbers"
     print *, " in single precision using ",N," samples"
  else
     print *, " European option with random numbers"
     print *, " in double precision using ",N," samples"
  end if

  ! Create pseudonumber generator
  istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)

  ! Set seed
  seed=1234
  istat= curandSetPseudoRandomGeneratorSeed( gen, seed)

  ! Generate N floats/doubles on device w/ normal distribution.
  ! fix: the lognormal asset-price update below requires standard normal
  ! deviates, and the mean/stddev arguments belong to the normal
  ! generator; the original mistakenly called curandGenerate (uniform)
  ! with those extra arguments.
  istat= curandGenerateNormal(gen, deviceData, N, &
       0._fp_kind, 1._fp_kind)

  meanPut=0._fp_kind; meanCall=0._fp_kind
  ! discounted payoffs per sample, accumulating the means on the fly
  !$cuf kernel do <<<*,*>>>
  do i=1,N
     Sfinal= S*exp((R-0.5_fp_kind*sigma*sigma)*T &
          +sigma*sqrt(T)*deviceData(i))
     putValue(i) =exp (-R *T) * max (E-Sfinal,0._fp_kind)
     callValue(i)=exp (-R *T) * max (Sfinal-E,0._fp_kind)
     meanPut=meanPut+putValue(i)
     meanCall=meanCall+callValue(i)
  end do
  meanPut=meanPut/N
  meanCall=meanCall/N

  stddevPut=0._fp_kind; stddevCall=0._fp_kind
  !$cuf kernel do <<<*,*>>>
  do i=1,N
     stddevPut= stddevPut + (putValue(i)-meanPut) **2
     stddevCall= stddevCall + (callValue(i)-meanCall) **2
  end do
  stddevPut=sqrt(stddevPut/(N-1) )
  stddevCall=sqrt(stddevCall/(N-1) )

  ! compute a reference solution using Black Scholes formula
  call blackscholes(call_price,put_price,S,E,R,sigma,T)

  print *, "Montecarlo value of put option  =", meanPut
  print *, "BlackScholes value of put option =", put_price
  print *, "Confidence interval of put option = [", &
       meanPut -1.96*stddevPut/sqrt(real(N)),",",&
       meanPut +1.96*stddevPut/sqrt(real(N)),"]"
  print *, "Montecarlo value of call option =", meanCall
  print *, "BlackScholes value of call option=", call_price
  print *, "Confidence interval of call option = [", &
       meanCall -1.96*stddevCall/sqrt(real(N)),",",&
       meanCall +1.96*stddevCall/sqrt(real(N)),"]"

  istat=cudaEventRecord(stopEvent,0)
  istat=cudaEventSynchronize(stopEvent)
  istat=cudaEventElapsedTime(time,startEvent,stopEvent)

  print *,"Elapsed time (ms) :",time

  deallocate (deviceData,putValue,callValue)

  ! Destroy the generator
  istat= curandDestroyGenerator(gen)

end program mc
--------------------------------------------------------------------------------
/code/ch08/directTransfer.cuf:
--------------------------------------------------------------------------------
! Measures device-to-device transfer bandwidth between GPUs 0 and 1,
! with and without peer-to-peer (P2P) access enabled, timing from both
! the sending and the receiving device.
program directTransfer
  use cudafor
  implicit none
  integer, parameter :: N = 4*1024*1024
  real, pinned, allocatable :: a(:), b(:)
  real, device, allocatable :: a_d(:), b_d(:)

  ! these hold free and total memory before and after
  ! allocation, used to verify allocation is happening
  ! on proper devices
  integer(cuda_count_kind), allocatable :: &
       freeBefore(:), totalBefore(:), &
       freeAfter(:), totalAfter(:)

  integer :: istat, nDevices, i, accessPeer, timingDev
  type (cudaDeviceProp) :: prop
  type (cudaEvent) :: startEvent, stopEvent
  real :: time

  istat = cudaGetDeviceCount(nDevices)
  if (nDevices < 2) then
     print *, 'Need at least two CUDA capable devices'
     stop
  endif
  print "('Number of CUDA-capable devices: ', i0,/)", &
       nDevices

  ! allocate host arrays
  allocate(a(N), b(N))
  allocate(freeBefore(0:nDevices-1), &
       totalBefore(0:nDevices-1))
  allocate(freeAfter(0:nDevices-1), &
       totalAfter(0:nDevices-1))

  ! get device info (including total and free memory)
  ! before allocating a_d and b_d on devices 0 and 1
  do i = 0, nDevices-1
     istat = cudaGetDeviceProperties(prop, i)
     istat = cudaSetDevice(i)
     istat = cudaMemGetInfo(freeBefore(i), totalBefore(i))
  enddo
  ! device allocations land on the current device, hence the
  ! cudaSetDevice calls: a_d on device 0, b_d on device 1
  istat = cudaSetDevice(0)
  allocate(a_d(N))
  istat = cudaSetDevice(1)
  allocate(b_d(N))

  ! print out free memory before and after allocation
  print "('Allocation summary')"
  do i = 0, nDevices-1
     istat = cudaGetDeviceProperties(prop, i)
     print "('  Device ', i0, ': ', a)", &
          i, trim(prop%name)
     istat = cudaSetDevice(i)
     istat = cudaMemGetInfo(freeAfter(i), totalAfter(i))
     print "('    Free memory before: ', i0, &
          ', after: ', i0, ', difference: ',i0,/)", &
          freeBefore(i), freeAfter(i), &
          freeBefore(i)-freeAfter(i)
  enddo

  ! check whether devices 0 and 1 can use P2P
  ! (nDevices > 1 is guaranteed by the early stop above; accessPeer
  ! is therefore always set before it is read in the timing loop)
  if (nDevices > 1) then
     istat = cudaDeviceCanAccessPeer(accessPeer, 0, 1)
     if (accessPeer == 1) then
        print *, 'Peer access available between 0 and 1'
     else
        print *, 'Peer access not available between 0 and 1'
     endif
  endif

  ! initialize
  a = 1.0
  istat = cudaSetDevice(0)
  a_d = a

  ! perform test twice, timing on both sending GPU
  ! and receiving GPU
  do timingDev = 0, 1
     write(*,"(/,'Timing on device ', i0, /)") timingDev

     ! create events on the timing device
     ! (events time work on the device that was current at creation)
     istat = cudaSetDevice(timingDev)
     istat = cudaEventCreate(startEvent)
     istat = cudaEventCreate(stopEvent)

     if (accessPeer == 1) then
        ! enable P2P communication
        istat = cudaSetDevice(0)
        istat = cudaDeviceEnablePeerAccess(1, 0)
        istat = cudaSetDevice(1)
        istat = cudaDeviceEnablePeerAccess(0, 0)

        ! transfer (implicitly) across devices
        b_d = -1.0
        istat = cudaSetDevice(timingDev)
        istat = cudaEventRecord(startEvent,0)
        b_d = a_d
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, &
             startEvent, stopEvent)
        b = b_d
        if (any(b /= a)) then
           print "('Transfer failed')"
        else
           print "('b_d=a_d transfer (GB/s): ', f)", &
                N*4/time/1.0E+6
        endif
     end if

     ! transfer via cudaMemcpyPeer()
     ! without P2P, assignment to b_d must run on its device (1)
     if (accessPeer == 0) istat = cudaSetDevice(1)
     b_d = -1.0

     istat = cudaSetDevice(timingDev)
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, &
          stopEvent)
     if (accessPeer == 0) istat = cudaSetDevice(1)
     b = b_d
     if (any(b /= a)) then
        print "('Transfer failed')"
     else
        print "('cudaMemcpyPeer transfer (GB/s): ', f)", &
             N*4/time/1.0E+6
     endif

     ! cudaMemcpyPeer with P2P disabled
     if (accessPeer == 1) then
        istat = cudaSetDevice(0)
        istat = cudaDeviceDisablePeerAccess(1)
        istat = cudaSetDevice(1)
        istat = cudaDeviceDisablePeerAccess(0)
        b_d = -1.0

        istat = cudaSetDevice(timingDev)
        istat = cudaEventRecord(startEvent,0)
        istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, startEvent, &
             stopEvent)

        istat = cudaSetDevice(1)
        b = b_d
        if (any(b /= a)) then
           print "('Transfer failed')"
        else
           print "('cudaMemcpyPeer transfer w/ P2P', &
                ' disabled (GB/s): ', f)", N*4/time/1.0E+6
        endif
     end if

     ! destroy events associated with timingDev
     istat = cudaEventDestroy(startEvent)
     istat = cudaEventDestroy(stopEvent)
  end do

  ! clean up
  deallocate(freeBefore, totalBefore, freeAfter, totalAfter)
  deallocate(a, b, a_d, b_d)
end program directTransfer
--------------------------------------------------------------------------------