├── code ├── ch01 │ ├── precision_m.cuf │ ├── precision2_m.cuf │ ├── version.cuf │ ├── managedCUF.cuf │ ├── multidimCUF.cuf │ ├── managedCUF2.f90 │ ├── increment.f90 │ ├── increment.cuf │ ├── multiblock.cuf │ ├── pciBusID.cuf │ ├── multidim.cuf │ ├── managed.cuf │ ├── managedImplicit.cuf │ ├── asyncError.cuf │ ├── syncError.cuf │ ├── errorHandling.cuf │ ├── explicitInterface.cuf │ ├── Makefile │ └── deviceQuery.cuf ├── ch10 │ ├── precision_m.cuf │ └── Makefile ├── ch02 │ ├── fma.cuf │ ├── Makefile │ ├── accuracy.cuf │ ├── debug.cuf │ ├── initcheck.cuf │ ├── memcheck.cuf │ └── print.cuf ├── ch05 │ ├── fma.cuf │ ├── sliceTransfer.cuf │ ├── cufILP.cuf │ ├── constantAttribute.cuf │ ├── constant.cuf │ ├── assumedShapeSize.cuf │ ├── spill.cuf │ ├── loads.cuf │ ├── maxSharedMemory.cuf │ ├── local.cuf │ ├── Makefile │ ├── offset.cuf │ ├── stride.cuf │ ├── async.cuf │ └── HDtransfer.cuf ├── ch07 │ ├── c.cu │ ├── testSort.cuf │ ├── cudaforEx.cuf │ ├── thrust.cu │ ├── thrust.cuf │ ├── matmulTranspose.cuf │ ├── sgemmNew.cuf │ ├── sgemmLegacy.cuf │ ├── callingC.cuf │ ├── Makefile │ ├── getrfBatched.cuf │ ├── matmulTC.cuf │ ├── cusparseMV_Ex.cuf │ ├── potr.cuf │ ├── gemmPerf.cuf │ ├── cusparseMV.cuf │ ├── cusparseEx.cuf │ └── cutensorContraction.cuf ├── ch11 │ ├── precision_m.F90 │ ├── Makefile │ ├── fft_test_c2c.cuf │ ├── fft_test_r2c.cuf │ └── fft_derivative.cuf ├── ch08 │ ├── timing.f90 │ ├── wallclock.c │ ├── mpiDeviceUtil.cuf │ ├── minimal.cuf │ ├── Makefile │ ├── Makefile~ │ ├── p2pAccess.cuf │ ├── mpiDevices.cuf │ ├── assignDevice.cuf │ ├── p2pBandwidth.cuf │ └── directTransfer.cuf ├── ch06 │ ├── portingBase.f90 │ ├── portingManaged.cuf │ ├── portingDevice.cuf │ ├── portingManagedSent.f90 │ ├── portingAssociate.f90 │ ├── portingManaged_CUDA.F90 │ ├── portingDeviceSent.F90 │ ├── portingDevice_CUDA.F90 │ ├── union_m.cuf │ ├── testCompact.cuf │ ├── test_union.cuf │ ├── Makefile │ ├── laplace2D.f90 │ ├── laplace2DAssoc.f90 │ └── laplace2DUse.F90 ├── ch03 │ ├── nvtxAuto.cuf │ 
├── nvtxBasic.cuf │ ├── nvtxAdv2.cuf │ ├── nvtxAdv.cuf │ ├── peakBandwidth.cuf │ ├── Makefile │ ├── multidim.cuf │ ├── events.cuf │ ├── limitingFactor.cuf │ └── effectiveBandwidth.cuf ├── ch09 │ ├── precision_m.F90 │ ├── ieee_accuracy.f90 │ ├── shflExample.cuf │ ├── pi_lock.cuf │ ├── generate_randomnumbers.cuf │ ├── Makefile │ ├── pi_shared.cuf │ ├── accuracy_sum.f90 │ ├── compute_pi.cuf │ ├── testPiGridGroup.cuf │ ├── pi_shfl.cuf │ ├── pi_gridGroup.cuf │ └── montecarlo_european_option.cuf ├── ch04 │ ├── raceAndAtomic.cuf │ ├── ballot.cuf │ ├── concurrentKernels.cuf │ ├── shfl.cuf │ ├── differentStreamTypes.cuf │ ├── syncthreads.cuf │ ├── pipeline.cuf │ ├── eventSync.cuf │ ├── defaultStream.cuf │ ├── cgReverse.cuf │ ├── streamSync.cuf │ ├── defaultStreamVar.cuf │ ├── raceAndAtomicShared.cuf │ ├── defaultStreamVarExplicit.cuf │ ├── smooth.cuf │ ├── twoKernels.cuf │ ├── sharedExample.cuf │ ├── sharedMultiple.cuf │ ├── Makefile │ ├── threadfence.cuf │ └── swap.cuf └── ch12 │ ├── ppmExample.f90 │ ├── Makefile │ ├── ray.F90 │ ├── sphere.F90 │ ├── normal.F90 │ └── rgb_m.F90 └── LICENSE.txt /code/ch01/precision_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind = real64 4 | end module precision_m 5 | -------------------------------------------------------------------------------- /code/ch10/precision_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind = real32 4 | end module precision_m 5 | -------------------------------------------------------------------------------- /code/ch02/fma.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a, b, c) 4 | real :: a, b, c 5 | a = a+b*c 6 | end subroutine s 7 | end module m 8 | 
-------------------------------------------------------------------------------- /code/ch05/fma.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine k(a, b, c) 4 | real :: a, b, c 5 | c = a*b+c 6 | end subroutine k 7 | end module m 8 | -------------------------------------------------------------------------------- /code/ch07/c.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void Ckernel(float *a, float b) 2 | { 3 | a[threadIdx.x] = b; 4 | } 5 | 6 | extern "C" __device__ float Cdevicefun(float a) 7 | { 8 | return 2*a; 9 | } 10 | -------------------------------------------------------------------------------- /code/ch01/precision2_m.cuf: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | 4 | #ifdef DOUBLE 5 | integer, parameter :: fp_kind = real64 6 | #else 7 | integer, parameter :: fp_kind = real32 8 | #endif 9 | end module precision_m 10 | -------------------------------------------------------------------------------- /code/ch11/precision_m.F90: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | 4 | #ifdef DOUBLE 5 | integer, parameter :: fp_kind = real64 6 | #else 7 | integer, parameter :: fp_kind = real32 8 | #endif 9 | end module precision_m 10 | 11 | -------------------------------------------------------------------------------- /code/ch05/sliceTransfer.cuf: -------------------------------------------------------------------------------- 1 | program sliceTransfer 2 | implicit none 3 | integer, parameter :: nx=128, ny=128, nz=128 4 | real :: a(nx,ny,nz) 5 | real, device :: a_d(nx,ny,nz) 6 | 7 | a = 0.0 8 | a_d(2:nx-1,:,:) = a(2:nx-1,:,:) 9 | 10 | end program sliceTransfer 11 | 
-------------------------------------------------------------------------------- /code/ch08/timing.f90: -------------------------------------------------------------------------------- 1 | module timing 2 | interface wallclock 3 | function wallclock() result(res) bind(C, name='wallclock') 4 | use iso_c_binding 5 | real (c_double) :: res 6 | end function wallclock 7 | end interface wallclock 8 | end module timing 9 | 10 | -------------------------------------------------------------------------------- /code/ch01/version.cuf: -------------------------------------------------------------------------------- 1 | program version 2 | use cudafor 3 | implicit none 4 | integer :: istat, ver 5 | istat = cudaDriverGetVersion(ver) 6 | print *, 'Driver version: ', ver 7 | istat = cudaRuntimeGetVersion(ver) 8 | print *, 'Runtime version: ', ver 9 | end program version 10 | -------------------------------------------------------------------------------- /code/ch06/portingBase.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | integer :: i 6 | 7 | do i = 1, n 8 | a(i) = i+1 9 | enddo 10 | 11 | do i = 1, n 12 | b(i) = a(i)+1 13 | enddo 14 | 15 | print *, a 16 | print *, b 17 | end program main 18 | 19 | 20 | -------------------------------------------------------------------------------- /code/ch08/wallclock.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <time.h> 4 | #include <sys/time.h> 5 | 6 | double wallclock() 7 | { 8 | struct timeval tv; 9 | struct timezone tz; 10 | double t; 11 | 12 | gettimeofday(&tv, &tz); 13 | 14 | t = (double)tv.tv_sec; 15 | t += ((double)tv.tv_usec)/1000000.0; 16 | 17 | return t; 18 | } 19 | -------------------------------------------------------------------------------- /code/ch03/nvtxAuto.cuf: --------------------------------------------------------------------------------
1 | module m 2 | contains 3 | subroutine s1 4 | call s2 5 | call s2 6 | end subroutine s1 7 | 8 | subroutine s2 9 | call sleep(1) 10 | end subroutine s2 11 | end module m 12 | 13 | program main 14 | use m 15 | implicit none 16 | integer :: n 17 | 18 | do n = 1, 4 19 | call s1 20 | enddo 21 | 22 | end program main 23 | -------------------------------------------------------------------------------- /code/ch03/nvtxBasic.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | character(len=4) :: nchar 5 | integer :: n 6 | 7 | call nvtxStartRange("Outer Label") 8 | 9 | do n = 1, 4 10 | write(nchar, '(i4)') n 11 | call nvtxStartRange('Label '//nchar,n) 12 | call sleep(1) 13 | call nvtxEndRange() 14 | enddo 15 | 16 | call nvtxEndRange() 17 | end program main 18 | -------------------------------------------------------------------------------- /code/ch09/precision_m.F90: -------------------------------------------------------------------------------- 1 | module precision_m 2 | use, intrinsic :: iso_fortran_env 3 | use iso_fortran_env, only: real32, real64 4 | integer, parameter :: singlePrecision = real32 5 | integer, parameter :: doublePrecision = real64 6 | 7 | #ifdef DOUBLE 8 | integer, parameter :: fp_kind = doublePrecision 9 | #else 10 | integer, parameter :: fp_kind = singlePrecision 11 | #endif 12 | 13 | end module precision_m 14 | 15 | -------------------------------------------------------------------------------- /code/ch03/nvtxAdv2.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | integer, parameter :: n=4 5 | type(nvtxRangeId) :: id(n) 6 | character(len=4) :: ichar 7 | integer :: i 8 | 9 | do i=1, n 10 | write(ichar,'(i4)') i 11 | id(i) = nvtxRangeStart('Label '//ichar) 12 | call sleep(1) 13 | if (i>1) call nvtxRangeEnd(id(i-1)) 14 | enddo 15 | 16 | call nvtxRangeEnd(id(n)) 17 | end program 
main 18 | -------------------------------------------------------------------------------- /code/ch03/nvtxAdv.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use nvtx 3 | implicit none 4 | type(nvtxRangeId) :: id1, id2 5 | character(len=4) :: nchar 6 | integer :: n 7 | 8 | id1 = nvtxRangeStart("Outer label") 9 | 10 | do n = 1, 4 11 | write(nchar, '(i4)') n 12 | id2 = nvtxRangeStart('Label '//nchar) 13 | call sleep(1) 14 | call nvtxRangeEnd(id2) 15 | enddo 16 | 17 | call nvtxRangeEnd(id1) 18 | end program main 19 | -------------------------------------------------------------------------------- /code/ch07/testSort.cuf: -------------------------------------------------------------------------------- 1 | program testsort 2 | use thrust 3 | implicit none 4 | integer, parameter :: n = 10 5 | real :: cpuData(n) 6 | real, device :: gpuData(n) 7 | 8 | call random_number(cpuData) 9 | cpuData(5)=100. 10 | 11 | print *, "Before sorting", cpuData 12 | 13 | gpuData=cpuData 14 | call thrustsort(gpuData, size(gpuData)) 15 | cpuData=gpuData 16 | 17 | print *, "After sorting", cpuData 18 | end program testsort 19 | -------------------------------------------------------------------------------- /code/ch06/portingManaged.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=8 5 | real, managed :: a(n), b(n) 6 | integer :: i 7 | 8 | !$cuf kernel do <<<*,*>>> 9 | do i = 1, n 10 | a(i) = i+1 11 | enddo 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | b(i) = a(i)+1 16 | enddo 17 | 18 | i = cudaDeviceSynchronize() 19 | 20 | print *, a 21 | print *, b 22 | end program main 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/ch06/portingDevice.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 
| integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | real, device :: a_d(n), b_d(n) 6 | integer :: i 7 | 8 | !$cuf kernel do <<<*,*>>> 9 | do i = 1, n 10 | a_d(i) = i+1 11 | enddo 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | b_d(i) = a_d(i)+1 16 | enddo 17 | 18 | a = a_d; b = b_d 19 | 20 | print *, a 21 | print *, b 22 | end program main 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/ch05/cufILP.cuf: -------------------------------------------------------------------------------- 1 | program ilp 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | !$cuf kernel do <<<1024,tPB>>> 14 | do i = 1, n 15 | a_d(i) = a_d(i) + b 16 | enddo 17 | 18 | a = a_d 19 | 20 | if (any(a /= 4)) then 21 | write(*,*) '**** Program Failed ****' 22 | else 23 | write(*,*) 'Program Passed' 24 | endif 25 | end program ilp 26 | 27 | -------------------------------------------------------------------------------- /code/ch06/portingManagedSent.f90: -------------------------------------------------------------------------------- 1 | program main 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: n=8 5 | real :: a(n), b(n) 6 | !@cuf attributes(managed) :: a, b 7 | integer :: i 8 | 9 | !$cuf kernel do <<<*,*>>> 10 | do i = 1, n 11 | a(i) = i+1 12 | enddo 13 | 14 | !$cuf kernel do <<<*,*>>> 15 | do i = 1, n 16 | b(i) = a(i)+1 17 | enddo 18 | 19 | !@cuf i = cudaDeviceSynchronize() 20 | !@cuf print *, 'GPU version' 21 | 22 | print *, a 23 | print *, b 24 | end program main 25 | 26 | 27 | -------------------------------------------------------------------------------- /code/ch01/managedCUF.cuf: -------------------------------------------------------------------------------- 1 | program managedCUF 2 | use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 
| integer, managed :: a(nx,ny) 6 | integer :: b, i, j 7 | 8 | a = 1 9 | b = 3 10 | 11 | !$cuf kernel do (2) <<<*,*>>> 12 | do j = 1, ny 13 | do i = 1, nx 14 | a(i,j) = a(i,j) + b 15 | enddo 16 | enddo 17 | i = cudaDeviceSynchronize() 18 | 19 | if (any(a /= 4)) then 20 | print *, '**** Program Failed ****' 21 | else 22 | print *, 'Program Passed' 23 | endif 24 | end program managedCUF 25 | -------------------------------------------------------------------------------- /code/ch01/multidimCUF.cuf: -------------------------------------------------------------------------------- 1 | program multidimCUF 2 | use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 | integer :: a(nx,ny), b, i, j 6 | integer, device :: a_d(nx,ny) 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | !$cuf kernel do (2) <<<*,*>>> 13 | do j = 1, ny 14 | do i = 1, nx 15 | a_d(i,j) = a_d(i,j) + b 16 | enddo 17 | enddo 18 | a = a_d 19 | 20 | if (any(a /= 4)) then 21 | print *, '**** Program Failed ****' 22 | else 23 | print *, 'Program Passed' 24 | endif 25 | end program multidimCUF 26 | -------------------------------------------------------------------------------- /code/ch06/portingAssociate.f90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | 5 | real :: a(n), b(n) 6 | !@cuf real, device :: a_d(n), b_d(n) 7 | integer :: i 8 | 9 | !@cuf associate(a => a_d) 10 | !$cuf kernel do 11 | do i = 1, n 12 | a(i) = i+1 13 | enddo 14 | !@cuf associate(b => b_d) 15 | !$cuf kernel do 16 | do i = 1, n 17 | b(i) = a(i)+1 18 | enddo 19 | !@cuf end associate ! b ... 20 | !@cuf end associate ! a ... 
21 | !@cuf a = a_d; b = b_d 22 | 23 | !@cuf print *, 'GPU run' 24 | print *, a 25 | print *, b 26 | end program main 27 | 28 | 29 | -------------------------------------------------------------------------------- /code/ch06/portingManaged_CUDA.F90: -------------------------------------------------------------------------------- 1 | program main 2 | #ifdef _CUDA 3 | use cudafor 4 | #endif 5 | implicit none 6 | integer, parameter :: n=8 7 | real :: a(n), b(n) 8 | #ifdef _CUDA 9 | attributes(managed) :: a, b 10 | #endif 11 | integer :: i 12 | 13 | !$cuf kernel do <<<*,*>>> 14 | do i = 1, n 15 | a(i) = i+1 16 | enddo 17 | 18 | !$cuf kernel do <<<*,*>>> 19 | do i = 1, n 20 | b(i) = a(i)+1 21 | enddo 22 | 23 | #ifdef _CUDA 24 | i = cudaDeviceSynchronize() 25 | print *, 'GPU version' 26 | #endif 27 | 28 | print *, a 29 | print *, b 30 | end program main 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/ch02/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = accuracy fma print debug memcheck initcheck 2 | 3 | # section 2.1.1 4 | 5 | accuracy: accuracy.cuf 6 | nvfortran -o $@ $< 7 | 8 | # section 2.1.2 9 | 10 | fma: fma.cuf 11 | nvfortran -c -gpu=keep $< 12 | 13 | # section 2.2.1 14 | 15 | print: print.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 2.2.2 19 | 20 | debug: debug.cuf 21 | nvfortran -o $@ -g -gpu=nordc $< 22 | 23 | # section 2.2.3 24 | 25 | memcheck: memcheck.cuf 26 | nvfortran -o $@ $< 27 | 28 | initcheck: initcheck.cuf 29 | nvfortran -o $@ $< 30 | 31 | clean: 32 | rm -rf $(OBJS) *.o *.mod *~ *.ptx *.gpu *.bin *.fat 33 | -------------------------------------------------------------------------------- /code/ch06/portingDeviceSent.F90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | !@cuf attributes(device) :: a, b 6 | !@cuf real :: 
a_h(n), b_h(n) 7 | integer :: i 8 | 9 | !$cuf kernel do <<<*,*>>> 10 | do i = 1, n 11 | a(i) = i+1 12 | enddo 13 | 14 | !$cuf kernel do <<<*,*>>> 15 | do i = 1, n 16 | b(i) = a(i)+1 17 | enddo 18 | 19 | !@cuf a_h = a; b_h = b 20 | !@cuf print *, 'GPU version' 21 | 22 | #ifdef _CUDA 23 | print *, a_h 24 | print *, b_h 25 | #else 26 | print *, a 27 | print *, b 28 | #endif 29 | 30 | end program main 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/ch06/portingDevice_CUDA.F90: -------------------------------------------------------------------------------- 1 | program main 2 | implicit none 3 | integer, parameter :: n=8 4 | real :: a(n), b(n) 5 | #ifdef _CUDA 6 | attributes(device) :: a, b 7 | real :: a_h(n), b_h(n) 8 | #endif 9 | integer :: i 10 | 11 | !$cuf kernel do <<<*,*>>> 12 | do i = 1, n 13 | a(i) = i+1 14 | enddo 15 | 16 | !$cuf kernel do <<<*,*>>> 17 | do i = 1, n 18 | b(i) = a(i)+1 19 | enddo 20 | 21 | #ifdef _CUDA 22 | a_h = a; b_h = b 23 | print *, 'GPU version' 24 | print *, a_h 25 | print *, b_h 26 | #else 27 | print *, a 28 | print *, b 29 | #endif 30 | 31 | end program main 32 | 33 | 34 | -------------------------------------------------------------------------------- /code/ch01/managedCUF2.f90: -------------------------------------------------------------------------------- 1 | program managedCUF 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: nx=1024, ny=512 5 | integer :: a(nx,ny) 6 | !@cuf attributes(managed):: a 7 | integer :: b, i ,j 8 | 9 | a = 1 10 | b = 3 11 | !$cuf kernel do (2) <<<*,*>>> 12 | do j=1,ny 13 | do i=1,nx 14 | a(i,j)=a(i,j)+b 15 | end do 16 | end do 17 | !@cuf i=cudaDeviceSynchronize() 18 | 19 | !@cuf print *, "Running CUDA version ..." 
20 | if(any(a /= 4)) then 21 | print *, "**** Program Failed ****" 22 | else 23 | print *, "Program Passed" 24 | end if 25 | end program managedCUF 26 | -------------------------------------------------------------------------------- /code/ch05/constantAttribute.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, constant :: b 3 | contains 4 | attributes(global) subroutine increment(a) 5 | implicit none 6 | integer, intent(inout) :: a(*) 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program constantAttribute 17 | use cudafor 18 | use m 19 | implicit none 20 | type(cudaFuncAttributes) :: attr 21 | integer :: istat 22 | 23 | istat = cudaFuncGetAttributes(attr, increment) 24 | print "('Constant memory used (bytes): ', i0)", attr%constSizeBytes 25 | end program constantAttribute 26 | -------------------------------------------------------------------------------- /code/ch10/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = finiteDifference finiteDifferenceStr limitingFactor laplace2D 2 | 3 | # section 10.1.* 4 | 5 | finiteDifference: precision_m.cuf derivative_m.cuf finiteDifference.cuf 6 | nvfortran -o $@ -O3 -gpu=ptxinfo $^ 7 | 8 | # section 10.1.2 9 | 10 | limitingFactor: precision_m.cuf limitingFactor.cuf 11 | nvfortran -o $@ -O3 $^ 12 | 13 | # section 10.1.4 14 | 15 | finiteDifferenceStr: precision_m.cuf derivativeStr_m.cuf finiteDifferenceStr.cuf 16 | nvfortran -o $@ -O3 $^ 17 | 18 | # section 10.2 19 | 20 | laplace2D: laplace2D.cuf 21 | nvfortran -o $@ -O3 $^ 22 | 23 | 24 | clean: 25 | rm -rf $(OBJS) *.o *.mod *~ 26 | -------------------------------------------------------------------------------- /code/ch08/mpiDeviceUtil.cuf: -------------------------------------------------------------------------------- 1 | module mpiDeviceUtil 2 | contains 3 | ! 
assign a different GPU to each MPI rank 4 | ! note: all the memory allocations should be dynamic, 5 | ! otherwise all the arrays will be allocated on device 0 6 | subroutine assignDevice(dev) 7 | use mpi 8 | use cudafor 9 | implicit none 10 | integer :: dev 11 | integer :: local_comm, ierr 12 | 13 | dev=0 14 | call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, & 15 | MPI_INFO_NULL, local_comm, ierr) 16 | call MPI_Comm_rank(local_comm, dev, ierr) 17 | ierr = cudaSetDevice(dev) 18 | end subroutine assignDevice 19 | end module mpiDeviceUtil 20 | 21 | -------------------------------------------------------------------------------- /code/ch01/increment.f90: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(:) 6 | integer, intent(in) :: b 7 | integer :: i, n 8 | 9 | n = size(a) 10 | do i = 1, n 11 | a(i) = a(i)+b 12 | enddo 13 | 14 | end subroutine increment 15 | end module m 16 | 17 | 18 | program incrementCPU 19 | use m 20 | implicit none 21 | integer, parameter :: n = 256 22 | integer :: a(n), b 23 | 24 | a = 1 25 | b = 3 26 | call increment(a, b) 27 | 28 | if (any(a /= 4)) then 29 | print *, '**** Program Failed ****' 30 | else 31 | print *, 'Program Passed' 32 | endif 33 | end program incrementCPU 34 | -------------------------------------------------------------------------------- /code/ch09/ieee_accuracy.f90: -------------------------------------------------------------------------------- 1 | program test_accuracy 2 | real :: x, y, dist 3 | double precision:: x_dp, y_dp, dist_dp 4 | 5 | x=Z'3F1DC57A' 6 | y=Z'3F499AA3' 7 | dist= x**2 +y**2 8 | 9 | x_dp=real(x,8) 10 | y_dp=real(y,8) 11 | dist_dp= x_dp**2 +y_dp**2 12 | 13 | print *, 'Result with operands in single precision:' 14 | print '((2x,z8)) ', dist 15 | 16 | print *, 'Result in double precision with operands' 17 | print *, 'promoted to double precision:' 18 | 
print '((2x,z16))', dist_dp 19 | 20 | print *, 'Result in single precision with operands' 21 | print *, 'promoted to double precision:' 22 | print '((2x,z8))', real(dist_dp,4) 23 | 24 | end program test_accuracy 25 | -------------------------------------------------------------------------------- /code/ch02/accuracy.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=1000000 5 | integer(8), parameter :: n8 = n 6 | 7 | real :: suminc = 0.0, sumdec = 0.0, a(n) 8 | real, device :: a_d(n) 9 | integer :: i 10 | 11 | do i = 1, n 12 | a(i) = i 13 | suminc = suminc + i 14 | enddo 15 | 16 | do i = n, 1, -1 17 | sumdec = sumdec + i 18 | end do 19 | 20 | print *, 'n: ', n 21 | print *,'n*(n+1)/2: ', n8/2*(n8+1) 22 | 23 | print *, 'from sum(a): ', sum(a) 24 | print *, 'incr accumulation: ', suminc 25 | print *, 'decr accumulation: ', sumdec 26 | a_d = a 27 | print *, 'from sum(a_d): ', sum(a_d) 28 | end program main 29 | -------------------------------------------------------------------------------- /code/ch07/cudaforEx.cuf: -------------------------------------------------------------------------------- 1 | module cudaforEx 2 | use cudafor 3 | 4 | contains 5 | 6 | function cudaforGetDataType(x) result(res) 7 | use cudafor 8 | !dir$ ignore_tkr (rd) x 9 | class(*) :: x 10 | integer :: res 11 | select type (x) 12 | type is (real(2)) 13 | res = CUDA_R_16F 14 | type is (real(4)) 15 | res = CUDA_R_32F 16 | type is (real(8)) 17 | res = CUDA_R_64F 18 | type is (integer(4)) 19 | res = CUDA_R_32I 20 | type is (complex(4)) 21 | res = CUDA_C_32F 22 | type is (complex(8)) 23 | res = CUDA_C_64F 24 | class default 25 | res = -1 26 | end select 27 | end function cudaforGetDataType 28 | 29 | end module cudaforEx 30 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | SPDX-License-Identifier: Apache-2.0 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use the files in this directory except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | -------------------------------------------------------------------------------- /code/ch05/constant.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, constant :: b 3 | contains 4 | attributes(global) subroutine increment(a) 5 | implicit none 6 | integer, intent(inout) :: a(*) 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program incrementGPU 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n) 22 | integer, device :: a_d(n) 23 | 24 | a = 1 25 | b = 3 26 | 27 | a_d = a 28 | call increment<<<1,n>>>(a_d) 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | print *, '**** Program Failed ****' 33 | else 34 | print *, 'Program Passed' 35 | endif 36 | end program incrementGPU 37 | -------------------------------------------------------------------------------- /code/ch01/increment.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) 
:: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program incrementGPU 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | 24 | a = 1 25 | b = 3 26 | 27 | a_d = a 28 | call increment<<<1,n>>>(a_d, b) 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | print *, '**** Program Failed ****' 33 | else 34 | print *, 'Program Passed' 35 | endif 36 | end program incrementGPU 37 | -------------------------------------------------------------------------------- /code/ch07/thrust.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/device_vector.h> 2 | #include <thrust/sort.h> 3 | 4 | 5 | extern "C" { 6 | void sort_int_wrapper(int *data, int N) 7 | { 8 | // C ptr to device_vector container 9 | thrust::device_ptr<int> dev_ptr(data); 10 | // Use device_ptr in Thrust sort algorithm 11 | thrust::sort(dev_ptr, dev_ptr+N); 12 | } 13 | 14 | //Sort for float arrays 15 | void sort_float_wrapper(float *data, int N) 16 | { 17 | thrust::device_ptr<float> dev_ptr(data); 18 | thrust::sort(dev_ptr, dev_ptr+N); 19 | } 20 | 21 | //Sort for double arrays 22 | void sort_double_wrapper(double *data, int N) 23 | { 24 | thrust::device_ptr<double> dev_ptr(data); 25 | thrust::sort(dev_ptr, dev_ptr+N); 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /code/ch04/raceAndAtomic.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(raceCount, atomicCount) 4 | implicit none 5 | integer :: raceCount, atomicCount 6 | integer :: tmp 7 | raceCount = raceCount + 1 8 | tmp = atomicAdd(atomicCount, 1) 9 | end subroutine s1 10 | end module m 11 | 12 | program raceAndAtomic 13 | use m 14 | implicit none 15 | integer, parameter :: nBlocks = 256, nThreads = 256 16 | integer, device
:: raceCount_d, atomicCount_d 17 | integer :: raceCount, atomicCount 18 | 19 | raceCount_d = 0 20 | atomicCount_d = 0 21 | call s1<<<nBlocks,nThreads>>>(raceCount_d, atomicCount_d) 22 | raceCount = raceCount_d 23 | atomicCount = atomicCount_d 24 | print *, nBlocks*nThreads, raceCount, atomicCount 25 | end program raceAndAtomic 26 | -------------------------------------------------------------------------------- /code/ch07/thrust.cuf: -------------------------------------------------------------------------------- 1 | module thrust 2 | 3 | interface thrustsort 4 | subroutine sort_int(input, N) bind(C, name="sort_int_wrapper") 5 | use iso_c_binding 6 | integer(c_int), device :: input(*) 7 | integer(c_int), value :: N 8 | end subroutine sort_int 9 | 10 | subroutine sort_float(input, N) bind(C, name="sort_float_wrapper") 11 | use iso_c_binding 12 | real(c_float), device :: input(*) 13 | integer(c_int), value :: N 14 | end subroutine sort_float 15 | 16 | subroutine sort_double(input, N) bind(C, name="sort_double_wrapper") 17 | use iso_c_binding 18 | real(c_double), device :: input(*) 19 | integer(c_int), value :: N 20 | end subroutine sort_double 21 | end interface thrustsort 22 | 23 | end module thrust 24 | -------------------------------------------------------------------------------- /code/ch03/peakBandwidth.cuf: -------------------------------------------------------------------------------- 1 | program peakBandwidth 2 | use cudafor 3 | implicit none 4 | 5 | integer :: i, istat, nDevices=0 6 | type (cudaDeviceProp) :: prop 7 | 8 | istat = cudaGetDeviceCount(nDevices) 9 | do i = 0, nDevices-1 10 | istat = cudaGetDeviceProperties(prop, i) 11 | print "(' Device Number: ',i0)", i 12 | print "(' Device name: ',a)", trim(prop%name) 13 | print "(' Memory Clock Rate (KHz): ', i0)", & 14 | prop%memoryClockRate 15 | print "(' Memory Bus Width (bits): ', i0)", & 16 | prop%memoryBusWidth 17 | print "(' Peak Memory Bandwidth (GB/s): ', f9.2)", & 18 | 2.0 * prop%memoryClockRate * & 19 | 
(prop%memoryBusWidth / 8) * 1.e-6 20 | print * 21 | enddo 22 | end program peakBandwidth 23 | -------------------------------------------------------------------------------- /code/ch04/ballot.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a, nWarps) 4 | implicit none 5 | integer :: a(nWarps) 6 | integer, value :: nWarps 7 | integer :: mask, laneID, warpID 8 | 9 | warpID = ishft(threadIdx%x-1,-5)+1 10 | laneID = iand(threadIdx%x-1,31)+1 11 | 12 | mask = ballot(threadIdx%x > 40) 13 | mask = ballot_sync(mask, mod(threadIdx%x,2) == 0) 14 | if (laneID == 1) a(warpID) = mask 15 | end subroutine s 16 | end module m 17 | 18 | program ballot 19 | use m 20 | implicit none 21 | integer, parameter :: nWarps = 3 22 | integer :: a(nWarps), i 23 | integer, device :: a_d(nWarps) 24 | 25 | call s<<<1,nWarps*32>>>(a_d, nWarps) 26 | a = a_d 27 | do i = 1, nWarps 28 | print "(i0, 1x, B32.32)", i, a(i) 29 | enddo 30 | end program ballot 31 | -------------------------------------------------------------------------------- /code/ch04/concurrentKernels.cuf: -------------------------------------------------------------------------------- 1 | program concurrentKernels 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | integer, parameter :: nStreams=6 6 | real, device :: a_d(n,nStreams) 7 | real :: a(n,nStreams) 8 | integer(kind=cuda_stream_kind) :: streams(nStreams) 9 | integer :: istat, i, j, flag 10 | 11 | flag = cudaStreamDefault 12 | do j = 1, nStreams 13 | istat = cudaStreamCreateWithFlags(streams(j), flag) 14 | enddo 15 | streams(3) = 0 16 | 17 | do j = 1, nStreams 18 | !$cuf kernel do <<<1,1,stream=streams(j)>>> 19 | do i = 1, n 20 | a_d(i,j) = i 21 | enddo 22 | enddo 23 | 24 | if (flag == cudaStreamNonBlocking) & 25 | istat = cudaDeviceSynchronize() 26 | 27 | a = a_d 28 | 29 | do j = 1, nStreams 30 | istat = cudaStreamDestroy(streams(j)) 31 | enddo 32 | 33 | 
end program concurrentKernels 34 | -------------------------------------------------------------------------------- /code/ch04/shfl.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s(a) 4 | implicit none 5 | integer :: a(warpsize, 6) 6 | integer :: laneMask, stage 7 | integer :: var 8 | 9 | var = threadIdx%x 10 | a(threadIdx%x,1) = var 11 | 12 | laneMask = 1 13 | stage = 2 14 | do 15 | var = var + __shfl_xor(var, laneMask) 16 | laneMask = laneMask*2 17 | 18 | a(threadIdx%x, stage) = var; stage = stage+1 19 | if (laneMask > 16) exit 20 | enddo 21 | end subroutine s 22 | end module m 23 | 24 | program shfl 25 | use m 26 | implicit none 27 | integer, device :: a_d(32, 6) 28 | integer :: a(32, 6), i 29 | 30 | call s<<<1,32>>>(a_d) 31 | a = a_d 32 | 33 | do i = 1, 6 34 | print "(16I4)", a(1:16,i) 35 | enddo 36 | print * 37 | do i = 1, 6 38 | print "(16I4)", a(17:32,i) 39 | enddo 40 | 41 | end program shfl 42 | -------------------------------------------------------------------------------- /code/ch02/debug.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | endif 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | 
endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch05/assumedShapeSize.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine assumedSizeArrays(a, b, c, nx, ny) 4 | implicit none 5 | real :: a(nx,ny,*), b(nx,ny,*), c(nx,ny,*) 6 | integer, value :: nx, ny 7 | integer :: i, j, k 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | k = (blockIdx%z-1)*blockDim%z + threadIdx%z 12 | 13 | c(i,j,k) = a(i,j,k) + b(i,j,k) 14 | end subroutine assumedSizeArrays 15 | 16 | attributes(global) subroutine assumedShapeArrays(a, b, c) 17 | implicit none 18 | real :: a(:,:,:), b(:,:,:), c(:,:,:) 19 | integer :: i, j, k 20 | 21 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 22 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 23 | k = (blockIdx%z-1)*blockDim%z + threadIdx%z 24 | 25 | c(i,j,k) = a(i,j,k) + b(i,j,k) 26 | end subroutine assumedShapeArrays 27 | end module m 28 | -------------------------------------------------------------------------------- /code/ch05/spill.cuf: -------------------------------------------------------------------------------- 1 | module spill 2 | contains 3 | attributes(global) subroutine k(a, n) 4 | integer :: a(*) 5 | integer, value :: n 6 | integer, parameter :: nb = 32 7 | integer :: b(nb), tid, i 8 | tid = blockDim%x*(blockIdx%x-1) + threadIdx%x 9 | do i = 1, nb 10 | b(i) = a(mod(tid-1+i,n)+1) 11 | enddo 12 | do i = 2, nb 13 | b(1) = b(1) + b(i) 14 | enddo 15 | a(tid) = b(1) 16 | end subroutine k 17 | 18 | 19 | attributes(global) launch_bounds(1024,2) subroutine klb(a, n) 20 | integer :: a(*) 21 | integer, value :: n 22 | integer, parameter :: nb = 32 23 | integer :: b(nb), tid, i 24 | tid = blockDim%x*(blockIdx%x-1) + threadIdx%x 25 | do i = 1, nb 26 | b(i) = a(mod(tid-1+i,n)+1) 27 | enddo 28 | do i = 2, nb 29 | b(1) = b(1) + b(i) 30 | enddo 31 
| a(tid) = b(1) 32 | end subroutine klb 33 | end module spill 34 | 35 | -------------------------------------------------------------------------------- /code/ch04/differentStreamTypes.cuf: -------------------------------------------------------------------------------- 1 | program differentStreamTypes 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n) 6 | real :: a(n) 7 | integer(kind=cuda_stream_kind) :: stream1, stream2 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreateWithFlags(stream2, cudaStreamNonBlocking) 12 | a = 1.0 13 | 14 | !$cuf kernel do <<<1,1>>> 15 | do i = 1, n 16 | b_d(i) = i 17 | enddo 18 | a_d = a 19 | 20 | !$cuf kernel do <<<1,1,stream=stream1>>> 21 | do i = 1, n 22 | b_d(i) = i 23 | enddo 24 | a_d = a 25 | 26 | istat = cudaDeviceSynchronize() 27 | 28 | !$cuf kernel do <<<1,1,stream=stream2>>> 29 | do i = 1, n 30 | b_d(i) = i 31 | enddo 32 | a_d = a 33 | 34 | istat = cudaStreamDestroy(stream1) 35 | istat = cudaStreamDestroy(stream2) 36 | 37 | end program differentStreamTypes 38 | -------------------------------------------------------------------------------- /code/ch02/initcheck.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | end if 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | ! 
a_d = a 30 | call increment<<>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | write(*,*) '**** Program Failed ****' 35 | else 36 | write(*,*) 'Program Passed' 37 | endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch01/multiblock.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program multiblock 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 1024*1024 21 | integer, allocatable :: a(:) 22 | integer, device, allocatable :: a_d(:) 23 | integer :: b, tPB = 256 24 | 25 | allocate(a(n), a_d(n)) 26 | a = 1 27 | b = 3 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, n, b) 31 | a = a_d 32 | 33 | if (any(a /= 4)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | deallocate(a, a_d) 39 | end program multiblock 40 | -------------------------------------------------------------------------------- /code/ch04/syncthreads.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(offset) 4 | implicit none 5 | integer, value :: offset 6 | integer :: tid, res 7 | tid = threadIdx%x 8 | 9 | res = syncthreads_and(tid > offset) 10 | if (tid == 1) print *, 'syncthreads_and(tid > offset): ', res 11 | res = syncthreads_or(tid > offset) 12 | if (tid == 1) print *, 'syncthreads_or(tid > offset): ', res 13 | res = syncthreads_count(tid > offset) 14 | if (tid == 1) print *, 'syncthreads_count(tid > offset): ', res 15 | end subroutine s1 16 | end module m 17 | 18 | 
program syncthreads 19 | use m 20 | use cudafor 21 | implicit none 22 | integer :: istat 23 | 24 | print *, 'offset = 0' 25 | call s1<<<1, 256>>>(0) 26 | istat = cudaDeviceSynchronize() 27 | 28 | print *, 'offset = 4' 29 | call s1<<<1, 256>>>(4) 30 | istat = cudaDeviceSynchronize() 31 | end program syncthreads 32 | -------------------------------------------------------------------------------- /code/ch02/memcheck.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine increment(a, n, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: n, b 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x-1 ! *incorrect* 10 | if (i <= n) then 11 | a(i) = a(i)+b 12 | end if 13 | end subroutine increment 14 | end module simpleOps_m 15 | 16 | 17 | program main 18 | use cudafor 19 | use simpleOps_m 20 | implicit none 21 | integer, parameter :: tPB=32*5, n=tPB*5+1, b = 3 22 | integer :: a(n), r(n), i 23 | integer, device :: a_d(n) 24 | 25 | do i = 1, n 26 | a(i) = i 27 | end do 28 | 29 | a_d = a 30 | call increment<<>>(a_d, n, b) 31 | r = a_d 32 | 33 | if (any(r /= a+b)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | end program main 39 | -------------------------------------------------------------------------------- /code/ch06/union_m.cuf: -------------------------------------------------------------------------------- 1 | module union 2 | 3 | interface union 4 | module procedure union_r4c4, union_r4dc4d 5 | end interface union 6 | 7 | contains 8 | 9 | subroutine union_r4c4(s, d_ptr) 10 | use iso_c_binding 11 | implicit none 12 | real(4) :: s(:) 13 | complex(4), pointer :: d_ptr(:) 14 | type(c_ptr) :: s_cptr 15 | integer :: d_shape(1) 16 | 17 | d_shape(1) = size(s)/2 18 | s_cptr = transfer(loc(s), s_cptr) 19 | call c_f_pointer(s_cptr, d_ptr, d_shape) 20 | end subroutine union_r4c4 21 | 
22 | subroutine union_r4dc4d(s, d_ptr) 23 | use cudafor 24 | implicit none 25 | real(4), device :: s(:) 26 | complex(4), device, pointer :: d_ptr(:) 27 | type(c_devptr) :: s_cptr 28 | integer :: d_shape(1) 29 | 30 | d_shape(1) = size(s)/2 31 | s_cptr = c_devloc(s) 32 | call c_f_pointer(s_cptr, d_ptr, d_shape) 33 | end subroutine union_r4dc4d 34 | 35 | end module union 36 | -------------------------------------------------------------------------------- /code/ch07/matmulTranspose.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensorEx 4 | implicit none 5 | integer, parameter :: m=3200, n=m, k=3200 6 | real(8) :: a(m,k), b(k,n) 7 | real(8) :: c(m,n), cref(m,n) 8 | real(8), device :: a_d(m,k), b_d(k,n) 9 | real(8), device :: c_d(m,n) 10 | integer :: istat 11 | 12 | type(cudaDeviceProp) :: prop 13 | 14 | istat = cudaGetDeviceProperties(prop, 0) 15 | print "(' Device: ', a)", trim(prop%name) 16 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 17 | 18 | call random_number(a) 19 | call random_number(b) 20 | 21 | ! transpose(A*B) = tranpose(B)*transpose(A) 22 | ! LHS on host 23 | cref = matmul(a,b) 24 | cref = transpose(cref) 25 | 26 | ! 
RHS on device 27 | a_d = a; b_d = b 28 | c_d = matmul(transpose(b_d), transpose(a_d)) 29 | c = c_d 30 | print *, 'maxval(abs(tr(AB)-tr(B)tr(A))): ', & 31 | maxval(abs(cref-c)) 32 | end program main 33 | 34 | -------------------------------------------------------------------------------- /code/ch04/pipeline.cuf: -------------------------------------------------------------------------------- 1 | program pipeline 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | integer, parameter :: nStreams=6 6 | real, device :: a_d(n,nStreams) 7 | real, pinned, allocatable :: a(:,:) 8 | integer(kind=cuda_stream_kind) :: streams(nStreams) 9 | integer :: istat, i, j 10 | 11 | do j = 1, nStreams 12 | istat = cudaStreamCreate(streams(j)) 13 | enddo 14 | 15 | allocate(a(n,nStreams)) 16 | a = 1.0 17 | 18 | do j = 1, nStreams 19 | istat = cudaMemcpyAsync(a_d(1,j), a(1,j), n, streams(j)) 20 | !$cuf kernel do <<<1,1,stream=streams(j)>>> 21 | do i = 1, n 22 | a_d(i,j) = a_d(i,j) + 1.0 23 | enddo 24 | istat = cudaMemcpyAsync(a(1,j), a_d(1,j), n, streams(j)) 25 | enddo 26 | 27 | istat = cudaDeviceSynchronize() 28 | if (all(a == 2.0)) print *, 'OK' 29 | 30 | do j = 1, nStreams 31 | istat = cudaStreamDestroy(streams(j)) 32 | enddo 33 | end program pipeline 34 | -------------------------------------------------------------------------------- /code/ch01/pciBusID.cuf: -------------------------------------------------------------------------------- 1 | program pciBusID 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | print "(/,'No CUDA devices found',/)" 14 | stop 15 | else if (nDevices == 1) then 16 | print "(/,'One CUDA device found',/)" 17 | else 18 | print "(/,i0,' CUDA devices found',/)", nDevices 19 | end if 20 | 21 | ! 
Loop over devices 22 | 23 | do i = 0, nDevices-1 24 | 25 | print "('Device Number: ',i0)", i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | 29 | ! General device info 30 | 31 | print "(' Device Name: ', a)", trim(prop%name) 32 | print "(' Compute Capability: ',i0,'.',i0)", & 33 | prop%major, prop%minor 34 | print "(' PCI Bus ID: ',i0)", prop%pciBusID 35 | 36 | enddo 37 | 38 | end program pciBusID 39 | -------------------------------------------------------------------------------- /code/ch04/eventSync.cuf: -------------------------------------------------------------------------------- 1 | program eventSync 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n), c_d(n) 6 | integer(kind=cuda_stream_kind) :: stream1, stream2 7 | type(cudaEvent) :: event1 8 | integer :: istat, i, j 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreate(stream2) 12 | istat = cudaEventCreate(event1) 13 | 14 | !$cuf kernel do <<<1,1,stream=stream1>>> 15 | do i = 1, n 16 | a_d(i) = i 17 | enddo 18 | 19 | istat = cudaEventRecord(event1, stream1) 20 | 21 | !$cuf kernel do <<<1,1,stream=stream1>>> 22 | do i = 1, n 23 | b_d(i) = a_d(i) + 1.0 24 | enddo 25 | 26 | istat = cudaEventSynchronize(event1) 27 | 28 | !$cuf kernel do <<<1,1,stream=stream2>>> 29 | do i = 1, n 30 | c_d(i) = a_d(i) + 2.0 31 | enddo 32 | 33 | istat = cudaStreamDestroy(stream1) 34 | istat = cudaStreamDestroy(stream2) 35 | istat = cudaEventDestroy(event1) 36 | end program eventSync 37 | -------------------------------------------------------------------------------- /code/ch04/defaultStream.cuf: -------------------------------------------------------------------------------- 1 | program defaultStream 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device, allocatable :: a_d(:), b_d(:) 6 | real :: a(n), b(n) 7 | integer(kind=cuda_stream_kind) :: stream1 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 
| 12 | allocate(a_d(n)) 13 | a_d = 1.0 14 | !$cuf kernel do <<<1,1>>> 15 | do i = 1, n 16 | a_d(i) = a_d(i) + i 17 | enddo 18 | a = a_d 19 | print *, maxval(a_d) 20 | 21 | istat = cudaforSetDefaultStream(stream1) 22 | allocate(b_d(n)) 23 | 24 | a_d = 2.0 25 | !$cuf kernel do <<<1,1>>> 26 | do i = 1, n 27 | a_d(i) = a_d(i) + i 28 | enddo 29 | a = a_d 30 | print *, maxval(a_d) 31 | 32 | b_d = 3.0 33 | !$cuf kernel do <<<1,1>>> 34 | do i = 1, n 35 | b_d(i) = b_d(i) + i 36 | enddo 37 | b = b_d 38 | print *, maxval(b_d) 39 | 40 | istat = cudaStreamDestroy(stream1) 41 | deallocate(a_d, b_d) 42 | end program defaultStream 43 | -------------------------------------------------------------------------------- /code/ch08/minimal.cuf: -------------------------------------------------------------------------------- 1 | module kernel 2 | contains 3 | attributes(global) subroutine assign(a, v) 4 | implicit none 5 | real :: a(*) 6 | real, value :: v 7 | a(threadIdx%x) = v 8 | end subroutine assign 9 | end module kernel 10 | 11 | program minimal 12 | use cudafor 13 | use kernel 14 | implicit none 15 | integer, parameter :: n=32 16 | real :: a(n) 17 | real, device, allocatable :: a0_d(:), a1_d(:) 18 | integer :: nDevices, istat 19 | 20 | istat = cudaGetDeviceCount(nDevices) 21 | if (nDevices < 2) then 22 | print *, 'This program requires at least two GPUs' 23 | stop 24 | end if 25 | 26 | istat = cudaSetDevice(0) 27 | allocate(a0_d(n)) 28 | call assign<<<1,n>>>(a0_d, 3.0) 29 | a = a0_d 30 | deallocate(a0_d) 31 | print *, 'Device 0: ', a(1) 32 | 33 | istat = cudaSetDevice(1) 34 | allocate(a1_d(n)) 35 | call assign<<<1,n>>>(a1_d, 4.0) 36 | a = a1_d 37 | deallocate(a1_d) 38 | print *, 'Device 1: ', a(1) 39 | end program minimal 40 | -------------------------------------------------------------------------------- /code/ch02/print.cuf: -------------------------------------------------------------------------------- 1 | module simpleOps_m 2 | contains 3 | attributes(global) subroutine 
increment(a, b, n) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b, n 7 | integer :: i 8 | 9 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 10 | if (i <= n) a(i) = a(i)+b 11 | if (i > 30 .and. i < 34) print *, 'i, a(i):', i, a(i) 12 | end subroutine increment 13 | end module simpleOps_m 14 | 15 | 16 | program main 17 | use cudafor 18 | use simpleOps_m 19 | implicit none 20 | integer, parameter :: n = 1024*1024 21 | integer, allocatable :: a(:) 22 | integer, device, allocatable :: a_d(:) 23 | integer :: b, tPB = 256 24 | 25 | allocate(a(n), a_d(n)) 26 | a = 1 27 | b = 3 28 | 29 | a_d = a 30 | call increment<<<(n-1)/tPB+1,tPB>>>(a_d, b, n) 31 | a = a_d 32 | 33 | if (any(a /= 4)) then 34 | print *, '**** Program Failed ****' 35 | else 36 | print *, 'Program Passed' 37 | endif 38 | deallocate(a, a_d) 39 | end program main 40 | -------------------------------------------------------------------------------- /code/ch04/cgReverse.cuf: -------------------------------------------------------------------------------- 1 | module reverse 2 | contains 3 | attributes(global) subroutine blockReverse(d, n) 4 | use cooperative_groups 5 | implicit none 6 | real :: d(n) 7 | integer, value :: n 8 | real, shared :: s(n) 9 | integer :: t, tr 10 | type(thread_group) :: tg 11 | 12 | tg = this_thread_block() 13 | 14 | t = threadIdx%x 15 | tr = blockDim%x-t+1 16 | 17 | s(t) = d(t) 18 | call syncthreads(tg) 19 | d(t) = s(tr) 20 | end subroutine blockReverse 21 | end module reverse 22 | 23 | 24 | program cgReverse 25 | use cudafor 26 | use reverse 27 | 28 | implicit none 29 | 30 | integer, parameter :: n = 64 31 | real :: a(n), r(n), d(n) 32 | real, device :: d_d(n) 33 | integer :: i 34 | 35 | do i = 1, n 36 | a(i) = i 37 | r(i) = n-i+1 38 | enddo 39 | 40 | d_d = a 41 | call blockReverse<<<1, n, 4*n>>>(d_d, n) 42 | d = d_d 43 | print *, 'blockReverse max error:', maxval(abs(r-d)) 44 | end program cgReverse 45 | 46 | 47 | 
-------------------------------------------------------------------------------- /code/ch12/ppmExample.f90: -------------------------------------------------------------------------------- 1 | program main 2 | !@cuf use cudafor 3 | implicit none 4 | integer, parameter :: nx = 400, ny = 200 5 | integer :: i, j 6 | type rgb 7 | real :: v(3) 8 | end type rgb 9 | type(rgb) :: fb(nx, ny) 10 | !@cuf type(rgb), device :: fb_d(nx, ny) 11 | 12 | !@cuf associate (fb => fb_d) 13 | !$cuf kernel do (2) <<<*,*>>> 14 | do j = 1, ny 15 | do i = 1, nx 16 | fb(i,j)%v(1) = real(i)/nx 17 | fb(i,j)%v(2) = real(j)/ny 18 | fb(i,j)%v(3) = 0.2 19 | end do 20 | end do 21 | !@cuf end associate 22 | 23 | !@cuf fb = fb_d 24 | 25 | ! ppm output 26 | 27 | print "(a2)", 'P3' ! indicates RGB colors in ASCII, must be flush left 28 | print *, nx, ny ! width and height of image 29 | print *, 255 ! maximum value for each color 30 | 31 | do j = ny, 1, -1 32 | do i = 1, nx 33 | print "(3(1x,i3))", int(255*fb(i,j)%v) 34 | end do 35 | end do 36 | 37 | end program main 38 | -------------------------------------------------------------------------------- /code/ch03/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = events multidim nvtxBasic nvtxAdv nvtxAdv2 nvtxAuto \ 2 | limitingFactor peakBandwidth effectiveBandwidth 3 | 4 | # section 3.1.2 5 | 6 | events: events.cuf 7 | nvfortran -o $@ $< 8 | 9 | # section 3.1.3 10 | 11 | multidim: multidim.cuf 12 | nvfortran -o $@ $< 13 | 14 | # section 3.1.4.1 15 | 16 | nvtxBasic: nvtxBasic.cuf 17 | nvfortran -o $@ $< -cudalib=nvtx 18 | 19 | # section 3.1.4.2 20 | 21 | nvtxAdv: nvtxAdv.cuf 22 | nvfortran -o $@ $< -cudalib=nvtx 23 | 24 | nvtxAdv2: nvtxAdv2.cuf 25 | nvfortran -o $@ $< -cudalib=nvtx 26 | 27 | # section 3.1.4.3 28 | 29 | nvtxAuto: nvtxAuto.cuf 30 | nvfortran -o $@ $< -Minstrument -traceback -cudalib=nvtx 31 | 32 | # section 3.2 33 | 34 | limitingFactor: limitingFactor.cuf 35 | nvfortran -o $@ $< 36 | 37 | 
# section 3.3.1 38 | 39 | peakBandwidth: peakBandwidth.cuf 40 | nvfortran -o $@ $< 41 | 42 | # section 3.3.2 43 | 44 | effectiveBandwidth: effectiveBandwidth.cuf 45 | nvfortran -o $@ $< 46 | 47 | clean: 48 | rm -rf $(OBJS) *.o *.mod *~ 49 | -------------------------------------------------------------------------------- /code/ch04/streamSync.cuf: -------------------------------------------------------------------------------- 1 | program streamSync 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n), b_d(n), c_d(n) 6 | integer(kind=cuda_stream_kind) :: stream1, stream2, stream3 7 | integer :: istat, i, j 8 | 9 | istat = cudaStreamCreate(stream1) 10 | istat = cudaStreamCreate(stream2) 11 | istat = cudaStreamCreate(stream3) 12 | 13 | !$cuf kernel do <<<1,1,stream=stream1>>> 14 | do i = 1, n 15 | a_d(i) = i 16 | enddo 17 | 18 | !$cuf kernel do <<<1,1,stream=stream2>>> 19 | do i = 1, n 20 | b_d(i) = i 21 | enddo 22 | 23 | !$cuf kernel do <<<1,1,stream=stream3>>> 24 | do i = 1, n 25 | c_d(i) = 1.0/i 26 | enddo 27 | 28 | istat = cudaStreamSynchronize(stream1) 29 | 30 | !$cuf kernel do <<<1,1,stream=stream2>>> 31 | do i = 1, n 32 | a_d(i) = a_d(i) + b_d(i) 33 | enddo 34 | 35 | istat = cudaStreamDestroy(stream1) 36 | istat = cudaStreamDestroy(stream2) 37 | istat = cudaStreamDestroy(stream3) 38 | end program streamSync 39 | -------------------------------------------------------------------------------- /code/ch04/defaultStreamVar.cuf: -------------------------------------------------------------------------------- 1 | program defaultStreamVar 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real :: a(n), b(n) 6 | real, device :: a_d(n) 7 | real, device, allocatable :: b_d(:) 8 | integer(cuda_stream_kind) :: s1 9 | integer :: istat, i 10 | 11 | istat = cudaStreamCreate(s1) 12 | 13 | a = 1.0 14 | allocate(b_d(n)) 15 | 16 | istat = cudaforSetDefaultStream(a_d, s1) 17 | if (istat /= cudaSuccess) print *, 'a_d 
stream association error: ', & 18 | cudaGetErrorString(istat) 19 | istat = cudaforSetDefaultStream(b_d, s1) 20 | if (istat /= cudaSuccess) print *, 'b_d stream association error: ', & 21 | cudaGetErrorString(istat) 22 | 23 | a_d = 1.0 24 | b_d = 1.0 25 | 26 | !$cuf kernel do <<<*,*>>> 27 | do i = 1, n 28 | a_d(i) = a_d(i) + b_d(i) + i 29 | end do 30 | 31 | print *, minval(a_d) 32 | print *, maxval(b_d) 33 | 34 | a = a_d 35 | b = b_d 36 | 37 | istat = cudaStreamDestroy(s1) 38 | deallocate(b_d) 39 | end program defaultStreamVar 40 | -------------------------------------------------------------------------------- /code/ch01/multidim.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program multidim 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer :: a(nx,ny), b 23 | integer, device :: a_d(nx,ny) 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3((nx-1)/tBlock%x+1, & 31 | (ny-1)/tBlock%y+1, 1) 32 | a_d = a 33 | call increment<<>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program multidim 42 | -------------------------------------------------------------------------------- /code/ch03/multidim.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program multidim 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer :: a(nx,ny), b 23 | integer, device :: a_d(nx,ny) 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3((nx-1)/tBlock%x+1, & 31 | (ny-1)/tBlock%y+1, 1) 32 | a_d = a 33 | call increment<<>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program multidim 42 | -------------------------------------------------------------------------------- /code/ch04/raceAndAtomicShared.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(raceCount, atomicCount) 4 | implicit none 5 | integer :: raceCount, atomicCount 6 | integer :: tmp 7 | integer, shared :: sharedCount 8 | 9 | if (threadIdx%x == 1) sharedCount = 0 10 | call syncthreads() 11 | 12 | raceCount = raceCount + 1 13 | tmp = atomicAdd(sharedCount, 1) 14 | call syncthreads() 15 | 16 | if (threadIdx%x == 1) tmp = atomicAdd(atomicCount, sharedCount) 17 | end subroutine s1 18 | end module m 19 | 20 | program raceAndAtomicShared 21 | use m 22 | implicit none 23 | integer, parameter :: nBlocks = 256, nThreads = 256 24 | integer, device :: raceCount_d, atomicCount_d 25 | integer :: raceCount, atomicCount 26 | 27 | raceCount_d = 0 28 | atomicCount_d = 0 29 | call s1<<>>(raceCount_d, atomicCount_d) 30 | raceCount = raceCount_d 31 | atomicCount = atomicCount_d 32 | print *, nBlocks*nThreads, raceCount, atomicCount 33 | end program raceAndAtomicShared 34 | -------------------------------------------------------------------------------- /code/ch08/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = minimal p2pAccess 
directTransfer p2pBandwidth \ 2 | transposeP2P mpiDevices assignDevice transposeMPI transposeCAMPI 3 | 4 | # section 8.1 5 | 6 | minimal: minimal.cuf 7 | nvfortran -o $@ $< 8 | 9 | # section 8.1.1.1 10 | 11 | p2pAccess: p2pAccess.cuf 12 | nvfortran -o $@ $< 13 | 14 | # section 8.1.2 15 | 16 | directTransfer: directTransfer.cuf 17 | nvfortran -o $@ $< 18 | 19 | p2pBandwidth: p2pBandwidth.cuf 20 | nvfortran -o $@ $< 21 | 22 | # section 8.1.3 23 | 24 | wallclock.o: wallclock.c 25 | nvc -c $< 26 | 27 | transposeP2P: timing.f90 transposeP2P.cuf wallclock.o 28 | nvfortran -o $@ $^ 29 | 30 | # section 8.2.1 31 | 32 | mpiDevices: mpiDevices.cuf 33 | mpif90 -o $@ $< 34 | 35 | assignDevice: mpiDeviceUtil.cuf assignDevice.cuf 36 | mpif90 -o $@ $^ 37 | 38 | # section 8.2.2 39 | 40 | transposeMPI: mpiDeviceUtil.cuf transposeMPI.cuf 41 | mpif90 -o $@ $^ 42 | 43 | transposeCAMPI: mpiDeviceUtil.cuf transposeCAMPI.cuf 44 | mpif90 -o $@ $^ 45 | 46 | 47 | 48 | clean: 49 | rm -rf $(OBJS) *.o *.mod *~ 50 | 51 | -------------------------------------------------------------------------------- /code/ch01/managed.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program main 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer, managed :: a(nx,ny) 23 | integer :: b, istat 24 | type(dim3) :: grid, tBlock 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3(ceiling(real(nx)/tBlock%x), & 31 | ceiling(real(ny)/tBlock%y), 1) 32 | 33 | call increment<<>>(a, nx, ny, b) 34 | istat = cudaDeviceSynchronize() 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program main 42 | -------------------------------------------------------------------------------- /code/ch04/defaultStreamVarExplicit.cuf: -------------------------------------------------------------------------------- 1 | program defaultStreamVarExplicit 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real :: a(n), b(n) 6 | real, device :: a_d(n) 7 | real, device, allocatable :: b_d(:) 8 | integer(cuda_stream_kind) :: s1 9 | integer :: istat, i 10 | 11 | istat = cudaStreamCreate(s1) 12 | 13 | a = 1.0 14 | allocate(b_d(n)) 15 | 16 | istat = cudaforSetDefaultStream(a_d, s1) 17 | if (istat /= cudaSuccess) print *, 'a_d stream association error: ', & 18 | cudaGetErrorString(istat) 19 | istat = cudaforSetDefaultStream(b_d, s1) 20 | if (istat /= cudaSuccess) print *, 'b_d stream association error: ', & 21 | cudaGetErrorString(istat) 22 | 23 | a_d = 1.0 24 | b_d = 1.0 25 | 26 | !$cuf kernel do <<<*,*,stream=s1>>> 27 | do i = 1, n 28 | a_d(i) = a_d(i) + b_d(i) + i 29 | end do 30 | 31 | print *, minval(a_d, stream=s1) 32 | print *, maxval(b_d, stream=s1) 33 | 34 | a = a_d 35 | b = b_d 36 | 37 | istat = cudaStreamDestroy(s1) 38 | deallocate(b_d) 39 | end program defaultStreamVarExplicit 40 | -------------------------------------------------------------------------------- /code/ch01/managedImplicit.cuf: 
-------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, n1, n2, b) 4 | implicit none 5 | integer :: a(n1,n2) 6 | integer, value :: n1, n2, b 7 | integer :: i, j 8 | 9 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 10 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 11 | if (i<=n1 .and. j<=n2) a(i,j) = a(i,j) + b 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | 17 | program main 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: nx=1024, ny=512 22 | integer, allocatable :: a(:,:) 23 | integer :: b, istat 24 | type(dim3) :: grid, tBlock 25 | 26 | allocate(a(nx,ny)) 27 | 28 | a = 1 29 | b = 3 30 | 31 | tBlock = dim3(32,8,1) 32 | grid = dim3(ceiling(real(nx)/tBlock%x), & 33 | ceiling(real(ny)/tBlock%y), 1) 34 | 35 | call increment<<>>(a, nx, ny, b) 36 | istat = cudaDeviceSynchronize() 37 | 38 | if (any(a /= 4)) then 39 | print *, '**** Program Failed ****' 40 | else 41 | print *, 'Program Passed' 42 | endif 43 | 44 | deallocate(a) 45 | end program main 46 | -------------------------------------------------------------------------------- /code/ch01/asyncError.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device, pointer :: a_d(:) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | call increment<<<1,n>>>(a_d, b) 29 | ierrSync = cudaGetLastError() 30 | ierrAsync = cudaDeviceSynchronize() 31 | a = a_d 32 | 33 | if (ierrSync /= cudaSuccess) & 34 | print *, 'Sync kernel 
error: ', & 35 | trim(cudaGetErrorString(ierrSync)) 36 | if (ierrAsync /= cudaSuccess) & 37 | print *, 'Async kernel error: ', & 38 | trim(cudaGetErrorString(ierrAsync)) 39 | 40 | if (any(a /= 4)) then 41 | print *, '**** Program Failed ****' 42 | else 43 | print *, 'Program Passed' 44 | endif 45 | end program asyncError 46 | -------------------------------------------------------------------------------- /code/ch01/syncError.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 5000 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | a_d = a 29 | call increment<<<1,n>>>(a_d, b) 30 | ierrSync = cudaGetLastError() 31 | ierrAsync = cudaDeviceSynchronize() 32 | a = a_d 33 | 34 | if (ierrSync /= cudaSuccess) & 35 | print *, 'Sync kernel error: ', & 36 | trim(cudaGetErrorString(ierrSync)) 37 | if (ierrAsync /= cudaSuccess) & 38 | print *, 'Async kernel error: ', & 39 | trim(cudaGetErrorString(ierrAsync)) 40 | 41 | if (any(a /= 4)) then 42 | print *, '**** Program Failed ****' 43 | else 44 | print *, 'Program Passed' 45 | endif 46 | end program asyncError 47 | -------------------------------------------------------------------------------- /code/ch01/errorHandling.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 
| end module m 14 | 15 | 16 | program asyncError 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | integer :: ierrSync, ierrAsync 24 | 25 | a = 1 26 | b = 3 27 | 28 | a_d = a 29 | call increment<<<1,n>>>(a_d, b) 30 | ierrSync = cudaGetLastError() 31 | ierrAsync = cudaDeviceSynchronize() 32 | a = a_d 33 | 34 | if (ierrSync /= cudaSuccess) & 35 | print *, 'Sync kernel error: ', & 36 | trim(cudaGetErrorString(ierrSync)) 37 | if (ierrAsync /= cudaSuccess) & 38 | print *, 'Async kernel error: ', & 39 | trim(cudaGetErrorString(ierrAsync)) 40 | 41 | if (any(a /= 4)) then 42 | print *, '**** Program Failed ****' 43 | else 44 | print *, 'Program Passed' 45 | endif 46 | end program asyncError 47 | -------------------------------------------------------------------------------- /code/ch08/Makefile~: -------------------------------------------------------------------------------- 1 | OBJS = minimal p2pAccess directTransfer p2pBandwidth \ 2 | transposeP2P mpiDevices assignDevice transposeMPI transposeCAMPI 3 | 4 | MPI_DIR=$(NVHPC_ROOT)/comm_libs/mpi/bin 5 | 6 | # section 8.1 7 | 8 | minimal: minimal.cuf 9 | nvfortran -o $@ $< 10 | 11 | # section 8.1.1.1 12 | 13 | p2pAccess: p2pAccess.cuf 14 | nvfortran -o $@ $< 15 | 16 | # section 8.1.2 17 | 18 | directTransfer: directTransfer.cuf 19 | nvfortran -o $@ $< 20 | 21 | p2pBandwidth: p2pBandwidth.cuf 22 | nvfortran -o $@ $< 23 | 24 | # section 8.1.3 25 | 26 | wallclock.o: wallclock.c 27 | nvc -c $< 28 | 29 | transposeP2P: timing.f90 transposeP2P.cuf wallclock.o 30 | nvfortran -o $@ $^ 31 | 32 | # section 8.2.1 33 | 34 | mpiDevices: mpiDevices.cuf 35 | $(MPI_DIR)/mpif90 -o $@ $< 36 | 37 | assignDevice: mpiDeviceUtil.cuf assignDevice.cuf 38 | $(MPI_DIR)/mpif90 -o $@ $^ 39 | 40 | # section 8.2.2 41 | 42 | transposeMPI: mpiDeviceUtil.cuf transposeMPI.cuf 43 | $(MPI_DIR)/mpif90 -o $@ $^ 44 | 45 | transposeCAMPI: mpiDeviceUtil.cuf transposeCAMPI.cuf 
46 | $(MPI_DIR)/mpif90 -o $@ $^ 47 | 48 | 49 | 50 | clean: 51 | rm -rf $(OBJS) *.o *.mod *~ 52 | 53 | -------------------------------------------------------------------------------- /code/ch05/loads.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine k(b,a) 4 | implicit none 5 | real :: b(*), a(*) 6 | integer :: i 7 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 8 | b(i) = a(i) 9 | end subroutine k 10 | 11 | attributes(global) subroutine k_ii(b,a) 12 | implicit none 13 | real :: b(*) 14 | real, intent(in) :: a(*) 15 | integer :: i 16 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 17 | b(i) = a(i) 18 | end subroutine k_ii 19 | 20 | attributes(global) subroutine k_ca(b,a) 21 | implicit none 22 | real :: b(*), a(*) 23 | integer :: i 24 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 25 | b(i) = __ldca(a(i)) 26 | end subroutine k_ca 27 | 28 | attributes(global) subroutine k_cg(b,a) 29 | implicit none 30 | real :: b(*), a(*) 31 | integer :: i 32 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 33 | b(i) = __ldcg(a(i)) 34 | end subroutine k_cg 35 | 36 | attributes(global) subroutine k_cv(b,a) 37 | implicit none 38 | real :: b(*), a(*) 39 | integer :: i 40 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 41 | b(i) = __ldcv(a(i)) 42 | end subroutine k_cv 43 | end module m 44 | -------------------------------------------------------------------------------- /code/ch01/explicitInterface.cuf: -------------------------------------------------------------------------------- 1 | attributes(global) subroutine increment(a, n1, n2, b) 2 | implicit none 3 | integer :: a(n1, n2) 4 | integer, value :: n1, n2, b 5 | integer :: i, j 6 | 7 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 8 | j = (blockIdx%y-1)*blockDim%y + threadIdx%y 9 | if (i<=n1 .and. 
j<=n2) a(i,j) = a(i,j) + b 10 | end subroutine increment 11 | 12 | program explicitInterface 13 | use cudafor 14 | implicit none 15 | integer, parameter :: nx=1024, ny=512 16 | integer :: a(nx,ny), b 17 | integer, device :: a_d(nx,ny) 18 | type(dim3) :: grid, tBlock 19 | interface 20 | attributes(global) subroutine increment(a, n1, n2, b) 21 | integer :: a(n1, n2) 22 | integer, value :: n1, n2, b 23 | end subroutine increment 24 | end interface 25 | 26 | a = 1 27 | b = 3 28 | 29 | tBlock = dim3(32,8,1) 30 | grid = dim3(ceiling(real(nx)/tBlock%x), & 31 | ceiling(real(ny)/tBlock%y), 1) 32 | a_d = a 33 | call increment<<<grid, tBlock>>>(a_d, nx, ny, b) 34 | a = a_d 35 | 36 | if (any(a /= 4)) then 37 | print *, '**** Program Failed ****' 38 | else 39 | print *, 'Program Passed' 40 | endif 41 | end program explicitInterface 42 | -------------------------------------------------------------------------------- /code/ch09/shflExample.cuf: -------------------------------------------------------------------------------- 1 | module shuffle_m 2 | contains 3 | attributes(global) subroutine shuffle_xor(len) 4 | integer:: i,n,depth 5 | integer,value:: len 6 | 7 | i=threadIdx%x 8 | depth=bit_size(len)-leadz(len)-1 !log2(len) 9 | do n=depth,1,-1 10 | i=i+__shfl_xor(i,ishft(len,-n),len) 11 | print*,threadIdx%x,ishft(len,-n),i 12 | end do 13 | end subroutine shuffle_xor 14 | 15 | attributes(global) subroutine shuffle_down(len) 16 | integer:: i,n,depth 17 | integer,value:: len 18 | 19 | i=threadIdx%x 20 | depth=bit_size(len)-leadz(len)-1 !log2(len) 21 | do n=depth,1,-1 22 | i=i+__shfl_down(i,ishft(len,-n),len) 23 | print*,threadIdx%x,ishft(len,-n),i 24 | end do 25 | end subroutine shuffle_down 26 | end module shuffle_m 27 | 28 | program test_shuffle 29 | use cudafor 30 | use shuffle_m 31 | integer:: depth 32 | print *," Thread id"," laneMask "," __shfl_xor" 33 | len=8 34 | call shuffle_xor<<<1,len>>>(len) 35 | istat=cudaDeviceSynchronize() 36 | print *," Thread id"," delta "," __shfl_down" 37 |
call shuffle_down<<<1,len>>>(len) 38 | istat=cudaDeviceSynchronize() 39 | end program test_shuffle 40 | -------------------------------------------------------------------------------- /code/ch07/sgemmNew.cuf: -------------------------------------------------------------------------------- 1 | program sgemmNew 2 | use cublas 3 | use cudafor 4 | implicit none 5 | integer, parameter :: m = 100, n = 100, k = 100 6 | real :: a(m,k), b(k,n), c(m,n) 7 | real, device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real, parameter :: alpha = 1.0, beta = 0.0 9 | real, device :: alpha_d, beta_d 10 | integer :: lda = m, ldb = k, ldc = m 11 | type(cublasHandle) :: h 12 | integer :: istat 13 | 14 | a = 1.0; b = 2.0; c = 0.0 15 | a_d = a; b_d = b; c_d = 0.0 16 | 17 | istat = cublasCreate(h) 18 | if (istat /= CUBLAS_STATUS_SUCCESS) & 19 | print *, 'Error initializing CUBLAS' 20 | 21 | 22 | istat = cublasSgemm_v2(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, & 23 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 24 | c = c_d 25 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 26 | 27 | alpha_d = alpha; beta_d = beta 28 | 29 | istat = cublasSgemm_v2(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, & 30 | alpha_d, a_d, lda, b_d, ldb, beta_d, c_d, ldc) 31 | c = c_d 32 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 33 | 34 | 35 | istat = cublasDestroy(h) 36 | if (istat /= CUBLAS_STATUS_SUCCESS) & 37 | print *, 'Error shutting down CUBLAS' 38 | end program sgemmNew 39 | -------------------------------------------------------------------------------- /code/ch04/smooth.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(grid_global) subroutine smooth(a,b,n,radius) 4 | use cooperative_groups 5 | implicit none 6 | real :: a(n), b(n) 7 | integer, value :: n, radius 8 | integer :: i, j, jj 9 | type(grid_group) :: gg 10 | real :: bt 11 | 12 | gg = this_grid() 13 | do i = gg%rank, n, gg%size 14 | a(i) = i 15 | end do 16 | 17 | call syncthreads(gg) 18 
| 19 | do i = gg%rank, n, gg%size 20 | bt = 0.0 21 | do j = i-radius, i+radius 22 | jj = j 23 | if (j < 1) jj = jj + n 24 | if (j > n) jj = jj - n 25 | bt = bt + a(jj) 26 | enddo 27 | b(i) = bt/(2*radius+1) 28 | enddo 29 | 30 | end subroutine smooth 31 | end module m 32 | 33 | program main 34 | use cudafor 35 | use m 36 | implicit none 37 | integer, parameter :: n = 1024*1024 38 | real :: a(n), b(n) 39 | real, device :: a_d(n), b_d(n) 40 | integer :: i, radius 41 | radius = 2 42 | call smooth<<<*,256>>>(a_d, b_d, n, radius) 43 | a = a_d 44 | b = b_d 45 | print *, 'Filter radius: ', radius 46 | do i = 1, n 47 | if (abs(b(i)-a(i)) > 0.00010) print *, i, a(i), b(i) 48 | enddo 49 | end program main 50 | -------------------------------------------------------------------------------- /code/ch09/pi_lock.cuf: -------------------------------------------------------------------------------- 1 | module pi_lock_m 2 | integer, device:: lock=0 3 | contains 4 | attributes(global) subroutine pi_lock(input, partial, twoN) 5 | use precision_m 6 | implicit none 7 | real(fp_kind) :: input(twoN) 8 | integer :: partial(*) 9 | integer, value :: twoN 10 | 11 | integer, shared :: p_s(*) 12 | integer :: N, i, index, inext, interior 13 | 14 | N = twoN/2 15 | 16 | index=threadIdx%x+(BlockIdx%x-1)*BlockDim%x 17 | 18 | interior=0 19 | do i=index, N, BlockDim%x*GridDim%x 20 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 21 | interior=interior+1 22 | end do 23 | 24 | ! 
Local reduction per block 25 | index=threadIdx%x 26 | 27 | p_s(index)=interior 28 | call syncthreads() 29 | 30 | inext=blockDim%x/2 31 | do while ( inext >=1 ) 32 | if (index <= inext) p_s(index)=p_s(index)+p_s(index+inext) 33 | inext = inext/2 34 | call syncthreads() 35 | end do 36 | 37 | if (index == 1) then 38 | do while (atomiccas(lock,0,1) == 1) 39 | end do 40 | partial(1)=partial(1)+p_s(1) 41 | call threadfence() 42 | lock = 0 43 | end if 44 | end subroutine pi_lock 45 | end module pi_lock_m 46 | -------------------------------------------------------------------------------- /code/ch03/events.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer, value :: b 7 | integer :: i 8 | 9 | i = threadIdx%x 10 | a(i) = a(i)+b 11 | 12 | end subroutine increment 13 | end module m 14 | 15 | 16 | program events 17 | use cudafor 18 | use m 19 | implicit none 20 | integer, parameter :: n = 256 21 | integer :: a(n), b 22 | integer, device :: a_d(n) 23 | type(cudaEvent) :: startEvent, stopEvent 24 | real :: time 25 | integer :: istat 26 | 27 | a = 1 28 | b = 3 29 | a_d = a 30 | 31 | istat = cudaEventCreate(startEvent) 32 | istat = cudaEventCreate(stopEvent) 33 | 34 | istat = cudaEventRecord(startEvent, 0) 35 | call increment<<<1,n>>>(a_d, b) 36 | istat = cudaEventRecord(stopEvent, 0) 37 | istat = cudaEventSynchronize(stopEvent) 38 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 39 | 40 | a = a_d 41 | 42 | if (any(a /= 4)) then 43 | print *, '**** Program Failed ****' 44 | else 45 | print *, ' Time for kernel execution (ms): ', time 46 | endif 47 | 48 | istat = cudaEventDestroy(startEvent) 49 | istat = cudaEventDestroy(stopEvent) 50 | 51 | end program events 52 | -------------------------------------------------------------------------------- /code/ch07/sgemmLegacy.cuf: 
-------------------------------------------------------------------------------- 1 | program sgemmLegacy 2 | use cublas 3 | use cudafor 4 | implicit none 5 | integer, parameter :: m = 100, n = 100, k = 100 6 | real :: a(m,k), b(k,n), c(m,n) 7 | real, device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real, parameter :: alpha = 1.0, beta = 0.0 9 | integer :: lda = m, ldb = k, ldc = m 10 | integer :: istat 11 | 12 | a = 1.0; b = 2.0; c = 0.0 13 | 14 | call sgemm('n', 'n', m, n, k, & 15 | alpha, a, lda, b, ldb, beta, c, ldc) 16 | print *, 'sgemm(host data) error =', & 17 | maxval(c-k*2.0) 18 | 19 | istat = cublasInit() 20 | if (istat /= CUBLAS_STATUS_SUCCESS) & 21 | print *, 'Error initializing CUBLAS' 22 | 23 | a_d = a; b_d = b; c_d = 0.0 24 | 25 | call cublasSgemm('n', 'n', m, n, k, & 26 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 27 | c = c_d 28 | print *, 'cublasSgemm error =', maxval(c-k*2.0) 29 | 30 | c_d = 0.0 31 | call sgemm('n', 'n', m, n, k, & 32 | alpha, a_d, lda, b_d, ldb, beta, c_d, ldc) 33 | c = c_d 34 | print *, 'sgemm(device data) error =', & 35 | maxval(c-k*2.0) 36 | 37 | istat = cublasShutdown() 38 | if (istat /= CUBLAS_STATUS_SUCCESS) & 39 | print *, 'Error shutting down CUBLAS' 40 | end program sgemmLegacy 41 | -------------------------------------------------------------------------------- /code/ch11/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = fft_test_c2c_sp fft_test_c2c_dp fft_test_r2c_sp fft_test_r2c_dp \ 2 | spectral_sp spectral_dp exampleOverlapFFT ns2d_sp ns2d_dp 3 | 4 | # section 11.1 5 | 6 | fft_test_c2c_sp: precision_m.F90 fft_test_c2c.cuf 7 | nvfortran -o $@ $^ -cudalib=cufft 8 | 9 | fft_test_c2c_dp: precision_m.F90 fft_test_c2c.cuf 10 | nvfortran -DDOUBLE -o $@ $^ -cudalib=cufft 11 | 12 | fft_test_r2c_sp: precision_m.F90 fft_test_r2c.cuf 13 | nvfortran -o $@ $^ -cudalib=cufft 14 | 15 | fft_test_r2c_dp: precision_m.F90 fft_test_r2c.cuf 16 | nvfortran -DDOUBLE -o $@ $^ -cudalib=cufft 17 | 
18 | # section 11.2 19 | 20 | spectral_sp: precision_m.F90 fft_derivative.cuf 21 | nvfortran -O2 -o $@ $^ -cudalib=cufft 22 | 23 | spectral_dp: precision_m.F90 fft_derivative.cuf 24 | nvfortran -DDOUBLE -O2 -o $@ $^ -cudalib=cufft 25 | 26 | # section 11.3 27 | 28 | exampleOverlapFFT: precision_m.F90 exampleOverlapFFT_nvtx.cuf 29 | nvfortran -O3 -Minfo -Mpreprocess -o $@ $^ -cudalib=nvtx,cufft 30 | 31 | # section 11.4 32 | 33 | ns2d_sp: ns2d.cuf 34 | nvfortran -DSINGLE -O3 -Mpreprocess -o $@ $< -cudalib=cufft 35 | 36 | ns2d_dp: ns2d.cuf 37 | nvfortran -O3 -Mpreprocess -o $@ $< -cudalib=cufft 38 | 39 | 40 | 41 | clean: 42 | rm -rf $(OBJS) *.o *.mod *~ *.hdf 43 | 44 | -------------------------------------------------------------------------------- /code/ch08/p2pAccess.cuf: -------------------------------------------------------------------------------- 1 | program checkP2pAccess 2 | use cudafor 3 | implicit none 4 | integer, allocatable :: p2pOK(:,:) 5 | integer :: nDevices, i, j, istat 6 | type (cudaDeviceProp) :: prop 7 | 8 | istat = cudaGetDeviceCount(nDevices) 9 | print "('Number of CUDA-capable devices: ', i0,/)", & 10 | nDevices 11 | 12 | do i = 0, nDevices-1 13 | istat = cudaGetDeviceProperties(prop, i) 14 | print "('Device ', i0, ': ', a)", i, trim(prop%name) 15 | enddo 16 | print * 17 | 18 | allocate(p2pOK(0:nDevices-1, 0:nDevices-1)) 19 | p2pOK = 0 20 | 21 | do j = 0, nDevices-1 22 | do i = j+1, nDevices-1 23 | istat = cudaDeviceCanAccessPeer(p2pOK(i,j), i, j) 24 | p2pOK(j,i) = p2pOK(i,j) 25 | end do 26 | end do 27 | 28 | do i = 0, nDevices-1 29 | write(*, "(3x,i3)", advance='no') i 30 | enddo 31 | print * 32 | 33 | do j = 0, nDevices-1 34 | write(*,"(i3)", advance='no') j 35 | do i = 0, nDevices-1 36 | if (i == j) then 37 | write(*,"(2x,'-',3x)", advance='no') 38 | else if (p2pOK(i,j) == 1) then 39 | write(*,"(2x, 'Y',3x)",advance='no') 40 | else 41 | write(*,"(6x)",advance='no') 42 | end if 43 | end do 44 | print * 45 | end do 46 | end program 
checkP2pAccess 47 | -------------------------------------------------------------------------------- /code/ch04/twoKernels.cuf: -------------------------------------------------------------------------------- 1 | program twoKernels 2 | use cudafor 3 | implicit none 4 | integer, parameter :: n=100000 5 | real, device :: a_d(n,2) 6 | real :: a(n,2) 7 | integer(kind=cuda_stream_kind) :: stream1, stream2 8 | integer :: istat, i 9 | 10 | istat = cudaStreamCreate(stream1) 11 | istat = cudaStreamCreate(stream2) 12 | a = 1.0 13 | 14 | ! two kernels in the null stream 15 | 16 | a_d = a 17 | !$cuf kernel do <<<1,1>>> 18 | do i = 1, n 19 | a_d(i,1) = a_d(i,1) + i 20 | enddo 21 | !$cuf kernel do <<<1,1>>> 22 | do i = 1, n 23 | a_d(i,2) = a_d(i,2) - i 24 | enddo 25 | a = a_d 26 | 27 | ! one kernel in blocking stream, one in null stream 28 | 29 | a_d = a 30 | !$cuf kernel do <<<1,1,stream=stream1>>> 31 | do i = 1, n 32 | a_d(i,1) = a_d(i,1) + i 33 | enddo 34 | !$cuf kernel do <<<1,1>>> 35 | do i = 1, n 36 | a_d(i,2) = a_d(i,2) - i 37 | enddo 38 | a = a_d 39 | 40 | ! 
two kernels in different, blocking streams 41 | 42 | a_d = a 43 | !$cuf kernel do <<<1,1,stream=stream1>>> 44 | do i = 1, n 45 | a_d(i,1) = a_d(i,1) + i 46 | enddo 47 | !$cuf kernel do <<<1,1,stream=stream2>>> 48 | do i = 1, n 49 | a_d(i,2) = a_d(i,2) - i 50 | enddo 51 | a = a_d 52 | 53 | istat = cudaStreamDestroy(stream1) 54 | istat = cudaStreamDestroy(stream2) 55 | end program twoKernels 56 | -------------------------------------------------------------------------------- /code/ch07/callingC.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | 3 | interface 4 | attributes(global) subroutine kernel(x, v) & 5 | bind(C,name='Ckernel') 6 | use iso_c_binding 7 | real(c_float), device :: x(*) 8 | real(c_float), value :: v 9 | end subroutine kernel 10 | end interface 11 | 12 | interface 13 | attributes(device) function devicefun(x) result(res) & 14 | bind(C,name='Cdevicefun') 15 | use iso_c_binding 16 | real(c_float) :: res 17 | real(c_float), value :: x 18 | end function devicefun 19 | end interface 20 | 21 | contains 22 | 23 | attributes(global) subroutine Fkernel(x, v) 24 | implicit none 25 | real, device :: x(*) 26 | real, value :: v 27 | x(threadIdx%x) = devicefun(v) 28 | end subroutine Fkernel 29 | 30 | end module m 31 | 32 | program main 33 | use m 34 | implicit none 35 | real :: x(1), f(1) 36 | real, device :: x_d(1), f_d(1) 37 | integer :: i 38 | 39 | call kernel<<<1,1>>>(x_d, 1.0) 40 | x = x_d 41 | if (x(1) == 1.0) print *, 'Calling C Kernel -- OK' 42 | 43 | !$cuf kernel do <<<*,*>>> 44 | do i = 1, 1 45 | f_d(i) = devicefun(x_d(i)) 46 | end do 47 | f = f_d 48 | if (f(1) == 2.0) print *, 'CUF Kernel Calling C Function -- OK' 49 | 50 | call Fkernel<<<1,1>>>(x_d, 4.0) 51 | x = x_d 52 | if (x(1) == 8.0) print *, 'Fortran Kernel Calling C Function -- OK' 53 | end program main 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- 
/code/ch09/generate_randomnumbers.cuf: -------------------------------------------------------------------------------- 1 | ! Generate N random numbers on GPU, copy them back to CPU 2 | ! and print the first 4 3 | 4 | program curand_example 5 | use precision_m 6 | use curand 7 | implicit none 8 | real(fp_kind), allocatable:: hostData(:) 9 | real(fp_kind), allocatable, device:: deviceData(:) 10 | type(curandGenerator) :: gen 11 | integer :: N, istat 12 | integer(kind=8) :: seed 13 | 14 | ! Define how many numbers we want to generate 15 | N=20 16 | 17 | ! Allocate array on CPU 18 | allocate(hostData(N)) 19 | 20 | ! Allocate array on GPU 21 | allocate(deviceData(N)) 22 | 23 | if (fp_kind == singlePrecision) then 24 | write(*,"('Generating random numbers in single precision')") 25 | else 26 | write(*,"('Generating random numbers in double precision')") 27 | end if 28 | 29 | ! Create pseudonumber generator 30 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 31 | 32 | ! Set seed 33 | seed=1234 34 | istat= curandSetPseudoRandomGeneratorSeed( gen, seed) 35 | 36 | ! Generate N floats or double on device 37 | istat= curandGenerate(gen, deviceData, N) 38 | 39 | ! Copy the data back to CPU 40 | hostData=deviceData 41 | 42 | ! print the first 4 of the sequence 43 | write(*,*) hostData(1:4) 44 | 45 | ! Deallocate data on CPU and GPU 46 | deallocate(hostData) 47 | deallocate(deviceData) 48 | 49 | ! 
Destroy the generator 50 | istat = curandDestroyGenerator(gen) 51 | 52 | end program curand_example 53 | -------------------------------------------------------------------------------- /code/ch03/limitingFactor.cuf: -------------------------------------------------------------------------------- 1 | module kernel_m 2 | contains 3 | attributes(global) subroutine base(a, b) 4 | real :: a(*), b(*) 5 | integer :: i 6 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 7 | a(i) = sin(b(i)) + cos(b(i)) & 8 | + sin(2.0*b(i)) + cos(2.0*b(i)) & 9 | + sin(3.0*b(i)) + cos(3.0*b(i)) & 10 | + sin(4.0*b(i)) + cos(4.0*b(i)) 11 | end subroutine base 12 | 13 | attributes(global) subroutine memory(a, b) 14 | real :: a(*), b(*) 15 | integer :: i 16 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 17 | a(i) = b(i) 18 | end subroutine memory 19 | 20 | attributes(global) subroutine math(a, b, flag) 21 | real :: a(*) 22 | real, value :: b 23 | integer, value :: flag 24 | real :: v 25 | integer :: i 26 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 27 | v = sin(b) + cos(b) & 28 | + sin(2.0*b) + cos(2.0*b) & 29 | + sin(3.0*b) + cos(3.0*b) & 30 | + sin(4.0*b) + cos(4.0*b) 31 | if (v*flag == 1) a(i) = v 32 | end subroutine math 33 | end module kernel_m 34 | 35 | program limitingFactor 36 | use cudafor 37 | use kernel_m 38 | 39 | implicit none 40 | 41 | integer, parameter :: blockSize = 256 42 | integer, parameter :: n = 64*1024*4*blockSize 43 | real :: a(n) 44 | real, device :: a_d(n), b_d(n) 45 | b_d = 1.0 46 | call base<<<n/blockSize,blockSize>>>(a_d, b_d) 47 | call memory<<<n/blockSize,blockSize>>>(a_d, b_d) 48 | call math<<<n/blockSize,blockSize>>>(a_d, 1.0, 0) 49 | a = a_d 50 | print *, a(1) 51 | end program limitingFactor 52 | -------------------------------------------------------------------------------- /code/ch05/maxSharedMemory.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine increment(a, b) 4 | implicit none 5 | integer, intent(inout) :: a(*) 6 | integer,
shared :: s(*) 7 | integer, value :: b 8 | integer :: i, il 9 | 10 | il = threadIdx%x 11 | i = (blockIdx%x-1)*blockDim%x + threadIdx%x 12 | s(il) = a(i) 13 | call syncthreads() 14 | a(i) = s(il)+b 15 | end subroutine increment 16 | end module m 17 | 18 | 19 | program maxSharedMemory 20 | use cudafor 21 | use m 22 | implicit none 23 | 24 | integer, parameter :: n = 1024*1024 25 | integer :: a(n) 26 | integer, device :: a_d(n) 27 | type(cudaFuncAttributes) :: attr 28 | type(cudaDeviceProp) :: prop 29 | integer :: istat, smBytes 30 | 31 | istat = cudaGetDeviceProperties(prop, 0) 32 | print "(/,'Device Name: ',a)", trim(prop%name) 33 | print "('Compute Capability: ',i0,'.',i0)", & 34 | prop%major, prop%minor 35 | 36 | print "(/,'sharedMemPerBlock: ', i0)", prop%sharedMemPerBlock 37 | print "('sharedMemPerBlockOptIn: ', i0)", & 38 | prop%sharedMemPerBlockOptIn 39 | print "('sharedMemPerMultiprocessor: ', i0)", & 40 | prop%sharedMemPerMultiprocessor 41 | 42 | smBytes = prop%sharedMemPerBlockOptIn 43 | istat = cudaFuncSetAttribute(increment, & 44 | cudaFuncAttributeMaxDynamicSharedMemorySize, & 45 | smBytes) 46 | 47 | a_d = 1 48 | call increment<<>>(a_d, 2) 49 | a = a_d 50 | if (all(a==3)) then 51 | print "(/,'Passed')" 52 | else 53 | print "(/,'*** Failed ***')" 54 | end if 55 | end program maxSharedMemory 56 | -------------------------------------------------------------------------------- /code/ch09/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = rng_gpu_sp rng_gpu_dp pi_sp pi_dp ieee_accuracy \ 2 | pi_performance_sp shflExample testPiGridGroup \ 3 | accuracy_sum mc_european_single mc_european_double 4 | 5 | 6 | # section 9.1 7 | 8 | rng_gpu_sp: precision_m.F90 generate_randomnumbers.cuf 9 | nvfortran -O3 -o $@ $^ -cudalib=curand 10 | 11 | rng_gpu_dp: precision_m.F90 generate_randomnumbers.cuf 12 | nvfortran -O3 -DDOUBLE -o $@ $^ -cudalib=curand 13 | 14 | # section 9.2 15 | 16 | pi_sp: precision_m.F90 compute_pi.cuf 
17 | nvfortran -O3 -o $@ $^ -cudalib=curand 18 | 19 | pi_dp: precision_m.F90 compute_pi.cuf 20 | nvfortran -DDOUBLE -O3 -o $@ $^ -cudalib=curand 21 | 22 | # section 9.2.1 23 | 24 | ieee_accuracy: ieee_accuracy.f90 25 | nvfortran -o $@ $^ 26 | 27 | # section 9.3.* 28 | 29 | pi_performance_sp: precision_m.F90 pi_shared.cuf pi_shfl.cuf \ 30 | pi_lock.cuf pi_gridGroup.cuf compute_pi_performance.CUF 31 | nvfortran -O3 -DLOOP -o $@ $^ -cudalib=curand 32 | 33 | # section 9.3.1 34 | 35 | shflExample: shflExample.cuf 36 | nvfortran -o $@ $< 37 | 38 | # section 9.3.3 39 | 40 | testPiGridGroup: precision_m.F90 pi_gridGroup.cuf testPiGridGroup.cuf 41 | nvfortran -o $@ $^ -cudalib=curand 42 | 43 | # section 9.4 44 | 45 | accuracy_sum: accuracy_sum.f90 46 | nvfortran -o $@ $^ 47 | 48 | # section 9.5 49 | 50 | mc_european_single: precision_m.F90 montecarlo_european_option.cuf 51 | nvfortran -O3 -Minfo -o $@ $^ -cudalib=curand 52 | 53 | mc_european_double: precision_m.F90 montecarlo_european_option.cuf 54 | nvfortran -O3 -Minfo -DDOUBLE -o $@ $^ -cudalib=curand 55 | 56 | clean: 57 | rm -rf $(OBJS) *.o *~ *.mod *~ 58 | 59 | -------------------------------------------------------------------------------- /code/ch01/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = incrementf90 incrementcuf multiblock multidim explicitInterface \ 2 | managed managedImplicit multidimCUF managedCUF \ 3 | managedCUF2f90 managedCUF2cuf deviceQuery pciBusID \ 4 | errorHandling syncError asyncError version 5 | 6 | all: $(OBJS) 7 | 8 | # section 1.3.1 9 | 10 | incrementf90: increment.f90 11 | nvfortran -o $@ $< 12 | 13 | incrementcuf: increment.cuf 14 | nvfortran -o $@ $< 15 | 16 | # section 1.3.2 17 | 18 | multiblock: multiblock.cuf 19 | nvfortran -o $@ $< 20 | 21 | # section 1.3.3 22 | 23 | multidim: multidim.cuf 24 | nvfortran -o $@ $< 25 | 26 | # section 1.3.4 27 | 28 | explicitInterface: explicitInterface.cuf 29 | nvfortran -o $@ $< 30 | 31 | # 
section 1.3.5 32 | 33 | managed: managed.cuf 34 | nvfortran -o $@ $< 35 | 36 | managedImplicit: managedImplicit.cuf 37 | nvfortran -o $@ -gpu=managed $< 38 | 39 | # section 1.3.6 40 | 41 | multidimCUF: multidimCUF.cuf 42 | nvfortran -o $@ $< 43 | 44 | managedCUF: managedCUF.cuf 45 | nvfortran -o $@ $< 46 | 47 | managedCUF2f90: managedCUF2.f90 48 | nvfortran -o $@ $< 49 | 50 | managedCUF2cuf: managedCUF2.f90 51 | nvfortran -o $@ -cuda $< 52 | 53 | # section 1.4 54 | 55 | deviceQuery: deviceQuery.cuf 56 | nvfortran -o $@ $< 57 | 58 | # section 1.4.1 59 | 60 | pciBusID: pciBusID.cuf 61 | nvfortran -o $@ $< 62 | 63 | # section 1.5 64 | 65 | errorHandling: errorHandling.cuf 66 | nvfortran -o $@ $< 67 | 68 | syncError: syncError.cuf 69 | nvfortran -o $@ $< 70 | 71 | asyncError: asyncError.cuf 72 | nvfortran -o $@ $< 73 | 74 | # section 1.7 75 | 76 | version: version.cuf 77 | nvfortran -o $@ $< 78 | 79 | clean: 80 | rm -rf $(OBJS) *.o *.mod *~ 81 | -------------------------------------------------------------------------------- /code/ch05/local.cuf: -------------------------------------------------------------------------------- 1 | module localmem 2 | implicit none 3 | contains 4 | attributes(global) subroutine k1(a) 5 | integer :: a(*) 6 | integer :: b(2), i 7 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 8 | b = 1 9 | a(i) = b(2) 10 | end subroutine k1 11 | 12 | attributes(global) subroutine k2(a, j) 13 | integer :: a(*) 14 | integer, value :: j 15 | integer :: b(2), i 16 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 17 | b = 1 18 | a(i) = b(j) 19 | end subroutine k2 20 | 21 | attributes(global) subroutine k3(a, j) 22 | integer :: a(*) 23 | integer, value :: j 24 | integer :: b(256), i 25 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 26 | b = 1 27 | a(i) = b(j) 28 | end subroutine k3 29 | 30 | attributes(global) subroutine k4(a, j) 31 | integer :: a(*), j(*) 32 | integer :: b(2), i 33 | i = blockDim%x*(blockIdx%x-1) + threadIdx%x 34 | b = 1 35 | a(i) = b(j(i)) 36 | end 
subroutine k4 37 | end module localmem 38 | 39 | 40 | program localAttribute 41 | use localmem 42 | use cudafor 43 | implicit none 44 | type(cudaFuncAttributes) :: attr 45 | integer :: istat 46 | 47 | istat = cudaFuncGetAttributes(attr, k1) 48 | print "('k1 local memory (bytes/thread): ', i0)", attr%localSizeBytes 49 | istat = cudaFuncGetAttributes(attr, k2) 50 | print "('k2 local memory (bytes/thread): ', i0)", attr%localSizeBytes 51 | istat = cudaFuncGetAttributes(attr, k3) 52 | print "('k3 local memory (bytes/thread): ', i0)", attr%localSizeBytes 53 | istat = cudaFuncGetAttributes(attr, k4) 54 | print "('k4 local memory (bytes/thread): ', i0)", attr%localSizeBytes 55 | end program localAttribute 56 | 57 | -------------------------------------------------------------------------------- /code/ch03/effectiveBandwidth.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fpKind = real64 4 | contains 5 | attributes(global) subroutine copy(lhs, rhs, n) 6 | implicit none 7 | real(fpKind) :: lhs(*) 8 | real(fpKind) :: rhs(*) 9 | integer, value :: n 10 | integer :: i 11 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x 12 | if (i <= n) lhs(i) = rhs(i) 13 | end subroutine copy 14 | end module m 15 | 16 | 17 | program stream 18 | use cudafor 19 | use m 20 | implicit none 21 | integer, parameter :: N = 32*1024*1024 22 | real(fpKind), device :: a_d(N), b_d(N) 23 | integer :: nBlocks, blockSize 24 | real :: time 25 | integer(8) :: nBytes 26 | type(cudaEvent) :: startEvent, stopEvent 27 | integer :: istat 28 | 29 | istat = cudaEventCreate(startEvent) 30 | istat = cudaEventCreate(stopEvent) 31 | 32 | blockSize = 256 33 | nBlocks = (N-1)/blockSize+1 34 | nBytes = 2 * sizeof(a_d) 35 | 36 | b_d = 1.0_fpKind 37 | istat = cudaDeviceSynchronize() 38 | istat = cudaEventRecord(startEvent, 0) 39 | call copy<<<nBlocks, blockSize>>>(a_d, b_d, N) 40 | istat = cudaEventRecord(stopEvent, 0) 41 | istat =
cudaEventSynchronize(stopEvent) 42 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 43 | 44 | block 45 | type(cudaDeviceProp) :: prop 46 | istat = cudaGetDeviceProperties(prop, 0) 47 | print "(A)", trim(prop%name) 48 | end block 49 | if (fpKind == real64) print "('real64')" 50 | if (fpKind == real32) print "('real32')" 51 | print "('Array size (bytes): ', i0)", sizeof(a_d) 52 | print "('Copy effective bandwidth: ', f10.1)", & 53 | nBytes/time/1.0E+6 54 | 55 | end program stream 56 | -------------------------------------------------------------------------------- /code/ch05/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = HDtransfer sliceTransfer async \ 2 | offset stride local constant constantAttribute \ 3 | maxSharedMemory transpose \ 4 | parallelism parallelismPipeline cufILP fma 5 | 6 | 7 | # section 5.1.1 8 | 9 | HDtransfer: HDtransfer.cuf 10 | nvfortran -o $@ $< 11 | 12 | sliceTransfer: sliceTransfer.cuf 13 | nvfortran -o $@ $< 14 | 15 | async: async.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 5.2.2.1 19 | 20 | assumedShapeSize: assumedShapeSize.cuf 21 | nvfortran -c -gpu=ptxinfo $< 22 | 23 | # section 5.2.2.2 24 | 25 | offset: offset.cuf 26 | nvfortran -o $@ $< 27 | 28 | stride: stride.cuf 29 | nvfortran -o $@ $< 30 | 31 | # section 5.2.3 32 | 33 | local: local.cuf 34 | nvfortran -o $@ -gpu=keep $< 35 | 36 | # section 5.2.4 37 | 38 | constant: constant.cuf 39 | nvfortran -o $@ $< 40 | 41 | constantAttribute: constantAttribute.cuf 42 | nvfortran -o $@ $< 43 | 44 | # section 5.2.5 45 | 46 | loads: loads.cuf 47 | nvfortran -c -gpu=keep $< 48 | 49 | # section 5.2.6.1 50 | 51 | maxSharedMemory: maxSharedMemory.cuf 52 | nvfortran -o $@ $< 53 | 54 | # section 5.2.6.2 55 | 56 | transpose: transpose.cuf 57 | nvfortran -o $@ $< 58 | 59 | # section 5.2.7 60 | 61 | spill: spill.cuf 62 | nvfortran -c -gpu=ptxinfo $< 63 | 64 | # section 5.3.1 65 | 66 | parallelism: parallelism.cuf 67 | nvfortran -o $@ $< 
68 | 69 | # section 5.3.2.1 70 | 71 | parallelismPipeline: parallelismPipeline.cuf 72 | nvfortran -gpu=cc80 -o $@ $< 73 | 74 | # section 5.3.2.2 75 | 76 | cufILP: cufILP.cuf 77 | nvfortran -o $@ $< 78 | 79 | # section 5.4.1.4 80 | 81 | fma: fma.cuf 82 | nvfortran -c -gpu=fma,keep $< 83 | 84 | 85 | clean: 86 | rm -rf $(OBJS) *.o *.mod *~ *.bin *.gpu *.ptx *.fat 87 | -------------------------------------------------------------------------------- /code/ch11/fft_test_c2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_c2c 2 | use precision_m 3 | use cufft 4 | implicit none 5 | integer, allocatable :: kx(:) 6 | complex(fp_kind), allocatable :: cinput(:),coutput(:) 7 | complex(fp_kind), allocatable, device :: cinput_d(:),coutput_d(:) 8 | 9 | interface cufftExec 10 | module procedure cufftExecC2C,cufftExecZ2Z 11 | end interface cufftExec 12 | 13 | integer :: i,n,plan,istat 14 | real(fp_kind) :: pi=4._fp_kind*atan(1._fp_kind), h, theta 15 | 16 | n=16 17 | h=2._fp_kind*pi/real(n,fp_kind) 18 | 19 | ! allocate arrays on the host 20 | allocate(cinput(n),coutput(n),kx(n)) 21 | 22 | ! allocate arrays on the device 23 | allocate(cinput_d(n),coutput_d(n)) 24 | 25 | ! initialize arrays on host 26 | kx = [(i-1, i=1,n/2), (-n+i-1, i=n/2+1,n)] 27 | 28 | do i=1,n 29 | cinput(i)=(cos(2*real(i-1,fp_kind)*h)+sin(3*real(i-1,fp_kind)*h)) 30 | end do 31 | 32 | ! copy arrays to device 33 | cinput_d=cinput 34 | 35 | 36 | ! initialize the plan for complex to complex transform 37 | if (fp_kind == real32) istat = cufftPlan1D(plan,n,CUFFT_C2C,1) 38 | if (fp_kind == real64) istat = cufftPlan1D(plan,n,CUFFT_Z2Z,1) 39 | 40 | ! forward transform out of place 41 | istat = cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 42 | 43 | ! 
copy results back to host 44 | coutput=coutput_d 45 | 46 | print *," Transform from complex array" 47 | do i=1,n 48 | write(*,'(i2,1x,2(f8.4),2x,i2,2(f8.4))') & 49 | i,cinput(i),kx(i),coutput(i)/n 50 | end do 51 | 52 | ! release memory on the host and on the device 53 | deallocate(cinput,coutput,kx,cinput_d,coutput_d) 54 | 55 | ! destroy the plan 56 | istat = cufftDestroy(plan) 57 | 58 | end program fft_test_c2c 59 | -------------------------------------------------------------------------------- /code/ch11/fft_test_r2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_r2c 2 | use cudafor 3 | use precision_m 4 | use cufft 5 | implicit none 6 | integer, allocatable :: kx(:) 7 | real(fp_kind), allocatable :: rinput(:) 8 | real(fp_kind), allocatable, device :: rinput_d(:) 9 | complex(fp_kind), allocatable :: coutput(:) 10 | 11 | integer :: i,n,istat,plan 12 | real(fp_kind) :: twopi=8._fp_kind*atan(1._fp_kind),h 13 | 14 | interface cufftExec 15 | module procedure cufftExecR2C,cufftExecD2Z 16 | end interface cufftExec 17 | 18 | n=16 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate(rinput(n),coutput(n/2+1),kx(n/2+1)) 23 | 24 | ! allocate arrays on the device 25 | allocate(rinput_d(n+2)) 26 | 27 | !initialize arrays on host 28 | kx = [(i-1, i=1,n/2+1)] 29 | 30 | do i=1,n 31 | rinput(i)=(cos(2*real(i-1,fp_kind)*h)+ & 32 | sin(3*real(i-1,fp_kind)*h)) 33 | end do 34 | 35 | !copy arrays to device 36 | rinput_d=rinput 37 | 38 | ! Initialize the plan for real to complex transform 39 | if (fp_kind == real32) istat=cufftPlan1D(plan,n,CUFFT_R2C,1) 40 | if (fp_kind == real64) istat=cufftPlan1D(plan,n,CUFFT_D2Z,1) 41 | 42 | ! Execute Forward transform in place 43 | istat=cufftExec(plan,rinput_d,rinput_d) 44 | 45 | ! 
Copy results back to host 46 | istat=cudaMemcpy(coutput,rinput_d,n/2+1,cudaMemcpyDeviceToHost) 47 | 48 | print *," Transform from real array" 49 | do i=1,n/2+1 50 | write(*,'(i2,1x,i2,2(f8.4))') i,kx(i),coutput(i)/n 51 | end do 52 | 53 | !release memory on the host and on the device 54 | deallocate (rinput,coutput,kx,rinput_d) 55 | 56 | ! Destroy the plans 57 | istat=cufftDestroy(plan) 58 | 59 | end program fft_test_r2c 60 | 61 | -------------------------------------------------------------------------------- /code/ch08/mpiDevices.cuf: -------------------------------------------------------------------------------- 1 | program mpiDevices 2 | use cudafor 3 | use mpi 4 | implicit none 5 | 6 | ! global array size 7 | integer, parameter :: n = 1024*1024 8 | ! MPI variables 9 | integer :: myrank, nprocs, ierr 10 | ! device 11 | type(cudaDeviceProp) :: prop 12 | integer(int_ptr_kind()) :: freeB, totalB, freeA, totalA 13 | real, device, allocatable :: d(:) 14 | integer :: i, j, istat 15 | 16 | ! MPI initialization 17 | call MPI_Init(ierr) 18 | call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr) 19 | call MPI_Comm_size(MPI_COMM_WORLD, nProcs, ierr) 20 | 21 | ! print compute mode for device 22 | istat = cudaGetDevice(j) 23 | istat = cudaGetDeviceProperties(prop, j) 24 | do i = 0, nprocs-1 25 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 26 | if (myrank == i) print "('[',i0,'] using device: ', & 27 | i0, ' in compute mode: ', i0)", & 28 | myrank, j, prop%computeMode 29 | enddo 30 | 31 | ! get memory use before large allocations, 32 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 33 | istat = cudaMemGetInfo(freeB, totalB) 34 | 35 | ! now allocate arrays, one rank at a time 36 | do j = 0, nProcs-1 37 | 38 | ! allocate on device associated with rank j 39 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 40 | if (myrank == j) allocate(d(n)) 41 | 42 | ! 
Get free memory after allocation 43 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 44 | istat = cudaMemGetInfo(freeA, totalA) 45 | 46 | if (myrank == j) print "(' [',i0,'] after allocation on rank: ', & 47 | i0, ', device arrays allocated: ', i0)", & 48 | myrank, j, (freeB-freeA)/n/4 49 | 50 | end do 51 | 52 | deallocate(d) 53 | call MPI_Finalize(ierr) 54 | end program mpiDevices 55 | -------------------------------------------------------------------------------- /code/ch05/offset.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind=real32 4 | contains 5 | attributes(global) subroutine increment(a, offset) 6 | real(fp_kind) :: a(*) 7 | integer, value :: offset 8 | integer :: i 9 | i = blockDim%x*(blockIdx%x-1)+threadIdx%x + offset 10 | a(i) = a(i)+1 11 | end subroutine increment 12 | end module m 13 | 14 | program offset 15 | use cudafor 16 | use m 17 | 18 | implicit none 19 | 20 | integer, parameter :: nMB = 128 ! 
transfer size in MB 21 | integer, parameter :: n = nMB*1024*1024/fp_kind 22 | integer, parameter :: blockSize = 256 23 | real(fp_kind), device :: a_d(n+32) 24 | type(cudaEvent) :: startEvent, stopEvent 25 | type(cudaDeviceProp) :: prop 26 | integer :: i, istat 27 | real(4) :: time 28 | 29 | 30 | istat = cudaGetDeviceProperties(prop, 0) 31 | print "(/,'Device: ',a)", trim(prop%name) 32 | print "('Transfer size (MB): ',i0)", nMB 33 | 34 | if (kind(a_d) == real32) then 35 | print "('Single Precision',/)" 36 | else 37 | print "('Double Precision',/)" 38 | endif 39 | 40 | istat = cudaEventCreate(startEvent) 41 | istat = cudaEventCreate(stopEvent) 42 | 43 | print *, 'Offset, Bandwidth (GB/s):' 44 | do i = 0, 32 45 | a_d = 0.0 46 | istat = cudaEventRecord(startEvent,0) 47 | call increment<<>>(a_d, i) 48 | istat = cudaEventRecord(stopEvent,0) 49 | istat = cudaEventSynchronize(stopEvent) 50 | 51 | istat = cudaEventElapsedTime(time, startEvent, & 52 | stopEvent) 53 | print *, i, 2*n*fp_kind/time*1.e-6 54 | enddo 55 | 56 | istat = cudaEventDestroy(startEvent) 57 | istat = cudaEventDestroy(stopEvent) 58 | end program offset 59 | -------------------------------------------------------------------------------- /code/ch05/stride.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: fp_kind=real32 4 | contains 5 | attributes(global) subroutine increment(a, stride) 6 | implicit none 7 | real(fp_kind) :: a(stride,*) 8 | integer, value :: stride 9 | integer :: i 10 | i = blockDim%x*(blockIdx%x-1)+threadIdx%x 11 | a(1,i) = a(1,i)+1 12 | end subroutine increment 13 | end module m 14 | 15 | program stride 16 | use cudafor 17 | use m 18 | 19 | implicit none 20 | 21 | integer, parameter :: nMB = 128 ! 
transfer size in MB 22 | integer, parameter :: n = nMB*1024*1024/fp_kind 23 | integer, parameter :: blockSize = 256 24 | real(fp_kind), device :: a_d(32,n) 25 | type(cudaEvent) :: startEvent, stopEvent 26 | type(cudaDeviceProp) :: prop 27 | integer :: i, istat 28 | real(4) :: time 29 | 30 | 31 | istat = cudaGetDeviceProperties(prop, 0) 32 | print "(/,'Device: ',a)", trim(prop%name) 33 | print "('Transfer size (MB): ',i0)", nMB 34 | 35 | if (kind(a_d) == real32) then 36 | print "('Single Precision',/)" 37 | else 38 | print "('Double Precision',/)" 39 | endif 40 | 41 | istat = cudaEventCreate(startEvent) 42 | istat = cudaEventCreate(stopEvent) 43 | 44 | print *, 'Stride, Bandwidth (GB/s):' 45 | do i = 1, 32 46 | a_d = 0.0 47 | istat = cudaEventRecord(startEvent,0) 48 | call increment<<>>(a_d, i) 49 | istat = cudaEventRecord(stopEvent,0) 50 | istat = cudaEventSynchronize(stopEvent) 51 | 52 | istat = cudaEventElapsedTime(time, startEvent, & 53 | stopEvent) 54 | print *, i, 2*n*fp_kind/time*1.e-6 55 | enddo 56 | 57 | istat = cudaEventDestroy(startEvent) 58 | istat = cudaEventDestroy(stopEvent) 59 | end program stride 60 | -------------------------------------------------------------------------------- /code/ch07/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = callingC sgemmLegacy sgemmNew getrfBatched gemmPerf \ 2 | cusparseMV cusparseMV_Ex potr matmulTC matmulTranspose \ 3 | cutensorContraction testSort 4 | 5 | ARCH=sm_75 6 | GPU=cc75 7 | 8 | # section 7.1 9 | # (Ubuntu 20.04 sytems require the -c++libs flag when linking) 10 | 11 | c.o: c.cu 12 | nvcc -c -arch=$(ARCH) -rdc=true c.cu 13 | 14 | callingC: callingC.cuf c.o 15 | nvfortran -gpu=$(GPU) -o $@ $^ -c++libs 16 | 17 | # section 7.2.1 18 | 19 | sgemmLegacy: sgemmLegacy.cuf 20 | nvfortran -o $@ $< -cudalib=cublas -lblas 21 | 22 | # section 7.2.2 23 | 24 | sgemmNew: sgemmNew.cuf 25 | nvfortran -o $@ $< -cudalib=cublas 26 | 27 | # section 7.2.3 28 | 29 | 
getrfBatched: getrfBatched.cuf 30 | nvfortran -o $@ $< -cudalib=cublas 31 | 32 | # section 7.2.4 33 | 34 | gemmPerf: gemmPerf.cuf 35 | nvfortran -O3 -o $@ $< -cudalib=cublas 36 | 37 | # section 7.3 38 | 39 | cusparseMV: cusparseMV.cuf 40 | nvfortran -o $@ $< -cudalib=cusparse 41 | 42 | 43 | cusparseMV_Ex: cudaforEx.cuf cusparseEx.cuf cusparseMV_Ex.cuf 44 | nvfortran -o $@ $^ -cudalib=cusparse 45 | 46 | # section 7.4 47 | 48 | potr: cudaforEx.cuf potr.cuf 49 | nvfortran -o $@ $^ -cudalib=cusolver 50 | 51 | # section 7.5 52 | 53 | matmulTC: matmulTC.cuf 54 | nvfortran -o $@ $^ -cudalib=cublas,cutensor 55 | 56 | matmulTranspose: matmulTranspose.cuf 57 | nvfortran -o $@ $^ -cudalib=cutensor 58 | 59 | # section 7.5.1 60 | 61 | cutensorContraction: cutensorContraction.cuf 62 | nvfortran -o $@ $< -cudalib=cutensor 63 | 64 | # section 7.6 65 | 66 | thrust.C.o: thrust.cu 67 | nvcc -arch=$(ARCH) -c -o $@ $< 68 | 69 | testSort: thrust.cuf testSort.cuf thrust.C.o 70 | nvfortran -gpu=$(GPU) -o $@ $^ -c++libs 71 | 72 | 73 | 74 | 75 | clean: 76 | rm -rf $(OBJS) *.o *.mod *~ 77 | 78 | -------------------------------------------------------------------------------- /code/ch07/getrfBatched.cuf: -------------------------------------------------------------------------------- 1 | program testgetrfBatched 2 | use cudafor 3 | use cublas 4 | implicit none 5 | 6 | integer, parameter :: n=2, nbatch=3, lda=n 7 | real :: a(n,n,nbatch) 8 | real, device :: a_d(n,n,nbatch) 9 | type(c_devptr) :: devPtrA(nbatch) 10 | type(c_devptr), device :: devPtrA_d(nbatch) 11 | type(cublasHandle) :: h1 12 | integer :: ipvt(n*nbatch), info(nbatch) 13 | integer, device :: ipvt_d(n*nbatch), info_d(nbatch) 14 | integer :: i, k, istat 15 | 16 | ! 
initialize arrays and transfer to device
use precision_m 28 | implicit none 29 | real(fp_kind) :: input(twoN) 30 | integer :: partial(*) 31 | integer, value :: twoN 32 | 33 | integer, shared :: p_s(*) 34 | integer :: N 35 | integer :: i, index, inext,interior 36 | 37 | N = twoN/2 ! x=input(1:N), y=input(N+1:twoN) 38 | 39 | ! grid-stride loop over data 40 | index=threadIdx%x+(blockIdx%x-1)*blockDim%x 41 | interior=0 42 | do i=index, N, BlockDim%x*GridDim%x 43 | if( (input(i)**2 + input(i+N)**2) <= 1._fp_kind ) & 44 | interior=interior+1 45 | end do 46 | 47 | ! Local reduction within block 48 | index=threadIdx%x 49 | p_s(index)=interior 50 | call syncthreads() 51 | 52 | inext=blockDim%x/2 53 | do while ( inext >=1 ) 54 | if (index <=inext) & 55 | p_s(index)=p_s(index)+p_s(index+inext) 56 | inext = inext/2 57 | call syncthreads() 58 | end do 59 | if (index == 1) partial(blockIdx%x)=p_s(1) 60 | end subroutine partial_pi_shared 61 | end module pi_shared_m 62 | 63 | -------------------------------------------------------------------------------- /code/ch06/testCompact.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use compact 3 | use cudafor 4 | implicit none 5 | integer, parameter :: n=1024*1024*8 6 | real(8) :: a(n), threshold 7 | real(8), device :: a_d(n) 8 | integer, device :: mask_d(n) 9 | real(8), device, allocatable :: ac_d(:) 10 | integer, device, allocatable :: ic_d(:) 11 | real(8), allocatable :: ac(:), ach(:) 12 | integer, allocatable :: ic(:), ich(:) 13 | integer :: nc, nch, i, j, nerri, nerra, iter 14 | real :: t1, t2 15 | 16 | 17 | print *, 'Array size: ', n 18 | print *, 'Block size: ', compact_tpb 19 | 20 | threshold = 0.5d0 21 | call random_number(a) 22 | a_d = a 23 | 24 | mask_d = 0 25 | !$cuf kernel do <<<*,*>>> 26 | do i = 1, n 27 | if (a_d(i) <= threshold) mask_d(i) = 1 28 | enddo 29 | 30 | call cpu_time(t1) 31 | call compact(a_d, mask_d, n, ac_d, ic_d, nc) 32 | call cpu_time(t2) 33 | 34 | print *, 'elapsed time gpu:', t2-t1 35 | 
36 | allocate(ac(nc), ic(nc)) 37 | ac = ac_d 38 | ic = ic_d 39 | 40 | ! do on host 41 | 42 | call cpu_time(t1) 43 | nch = count(a <= threshold) 44 | allocate(ach(nch), ich(nch)) 45 | j = 0 46 | do i = 1, n 47 | if (a(i) <= threshold) then 48 | j = j+1 49 | ach(j) = a(i) 50 | ich(j) = i 51 | end if 52 | if (j == nch) exit 53 | enddo 54 | call cpu_time(t2) 55 | 56 | print *, 'elapsed time host:', t2-t1 57 | 58 | if (nc /= nch) then 59 | print *, 'Error: nc from host and device: ', nch, nc 60 | else 61 | print *, 'nc: ', nc 62 | end if 63 | 64 | nerri = 0 65 | nerra = 0 66 | do i = 1, min(nc,nch) 67 | if (ic(i) /= ich(i)) nerri = nerri+1 68 | if (ac(i) /= ach(i)) nerra = nerra+1 69 | end do 70 | print *, 'Errors in index array: ', nerri 71 | print *, 'Errors in data array: ', nerra 72 | 73 | end program main 74 | -------------------------------------------------------------------------------- /code/ch08/assignDevice.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use mpi 3 | use mpiDeviceUtil 4 | use cudafor 5 | implicit none 6 | 7 | ! global array size 8 | integer, parameter :: n = 1024*1024 9 | ! mpi 10 | character (len=MPI_MAX_PROCESSOR_NAME) :: hostname 11 | integer :: myrank, nprocs, ierr, namelength 12 | ! device 13 | type(cudaDeviceProp) :: prop 14 | integer(cuda_count_kind) :: freeB, totalB, freeA, totalA 15 | real, device, allocatable :: d(:) 16 | integer :: deviceID, i, istat 17 | 18 | call MPI_Init(ierr) 19 | call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr) 20 | call MPI_Comm_size(MPI_COMM_WORLD, nProcs, ierr) 21 | 22 | ! get and set unique device 23 | call assignDevice(deviceID) 24 | 25 | ! print hostname and device ID for each rank 26 | call MPI_Get_processor_name(hostname, namelength, ierr) 27 | do i = 0, nProcs-1 28 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 29 | if (i == myrank) & 30 | print "('[',i0,'] host: ', a, ', device: ', i0)", & 31 | myrank, trim(hostname), deviceID 32 | enddo 33 | 34 | ! 
get memory use before large allocations, 35 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 36 | istat = cudaMemGetInfo(freeB, totalB) 37 | 38 | ! allocate memory on each device 39 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 40 | allocate(d(n)) 41 | 42 | ! Get free memory after allocation 43 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 44 | istat = cudaMemGetInfo(freeA, totalA) 45 | 46 | do i = 0, nProcs-1 47 | call MPI_Barrier(MPI_COMM_WORLD, ierr) 48 | if (i == myrank) & 49 | print "(' [', i0, '] ', & 50 | 'device arrays allocated: ', i0)", & 51 | myrank, (freeB-freeA)/n/4 52 | end do 53 | 54 | deallocate(d) 55 | call MPI_Finalize(ierr) 56 | end program main 57 | 58 | 59 | -------------------------------------------------------------------------------- /code/ch04/sharedExample.cuf: -------------------------------------------------------------------------------- 1 | module reverse 2 | contains 3 | attributes(global) subroutine staticReverse(d) 4 | implicit none 5 | real :: d(*) 6 | real, shared :: s(64) 7 | integer :: t, tr 8 | 9 | t = threadIdx%x 10 | tr = blockDim%x-t+1 11 | 12 | s(t) = d(t) 13 | call syncthreads() 14 | d(t) = s(tr) 15 | end subroutine staticReverse 16 | 17 | attributes(global) subroutine dynamicReverse(d) 18 | implicit none 19 | real :: d(*) 20 | real, shared :: s(*) 21 | integer :: t, tr 22 | 23 | t = threadIdx%x 24 | tr = blockDim%x-t+1 25 | 26 | s(t) = d(t) 27 | call syncthreads() 28 | d(t) = s(tr) 29 | end subroutine dynamicReverse 30 | 31 | attributes(global) subroutine dynamicReverseAuto(d, n) 32 | implicit none 33 | real :: d(n) 34 | integer, value :: n 35 | real, shared :: s(n) 36 | integer :: t, tr 37 | 38 | t = threadIdx%x 39 | tr = n-t+1 40 | 41 | s(t) = d(t) 42 | call syncthreads() 43 | d(t) = s(tr) 44 | end subroutine dynamicReverseAuto 45 | end module reverse 46 | 47 | 48 | program sharedExample 49 | use cudafor 50 | use reverse 51 | 52 | implicit none 53 | 54 | integer, parameter :: n = 64 55 | real :: a(n), r(n), d(n) 56 | real, device :: 
d_d(n) 57 | integer :: i 58 | 59 | do i = 1, n 60 | a(i) = i 61 | r(i) = n-i+1 62 | enddo 63 | 64 | d_d = a 65 | call staticReverse<<<1, n>>>(d_d) 66 | d = d_d 67 | print *, 'staticReverse max error:', maxval(abs(r-d)) 68 | 69 | d_d = a 70 | call dynamicReverse<<<1, n, 4*n>>>(d_d) 71 | d = d_d 72 | print *, 'dynamicReverse max error:', maxval(abs(r-d)) 73 | 74 | d_d = a 75 | call dynamicReverseAuto<<<1, n, 4*n>>>(d_d, n) 76 | d = d_d 77 | print *, 'dynamicReverseAuto max error:', maxval(abs(r-d)) 78 | end program sharedExample 79 | 80 | 81 | -------------------------------------------------------------------------------- /code/ch04/sharedMultiple.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine automaticDSM(a, b, n) 4 | implicit none 5 | real(8) :: a(n) 6 | integer :: b(n) 7 | integer, value :: n 8 | 9 | real(8), shared :: sa(n) 10 | integer, shared :: sb(n) 11 | integer :: t, tr 12 | 13 | t = threadIdx%x 14 | tr = n-t+1 15 | 16 | sa(t) = a(t) 17 | sb(t) = b(t) 18 | call syncthreads() 19 | a(t) = sa(tr) 20 | b(t) = sb(tr) 21 | end subroutine automaticDSM 22 | 23 | attributes(global) subroutine assumedSizeDSM(a, b) 24 | implicit none 25 | real(8) :: a(*) 26 | integer :: b(*) 27 | 28 | real(8), shared :: sa(*) 29 | integer, shared :: sb(*) 30 | integer :: t, tr, offset 31 | 32 | t = threadIdx%x 33 | tr = blockDim%x-t+1 34 | offset = blockDim%x*sizeof(sa(1))/sizeof(sb(1)) 35 | 36 | sa(tr) = a(t) 37 | sb(offset+tr) = b(t) 38 | call syncthreads() 39 | a(t) = sa(t) 40 | b(t) = sb(offset+t) 41 | end subroutine assumedSizeDSM 42 | end module m 43 | 44 | program main 45 | use m 46 | implicit none 47 | integer, parameter :: n = 512 48 | real(8) :: a(n) 49 | real(8), device :: a_d(n) 50 | integer :: b(n) 51 | integer, device :: b_d(n) 52 | integer :: i, nerr 53 | 54 | do i = 1, n 55 | a(i) = i 56 | b(i) = 10*i 57 | enddo 58 | 59 | a_d = a; b_d = b 60 | call 
automaticDSM<<<1,n,n*12>>>(a_d, b_d, n) 61 | a = a_d; b = b_d 62 | 63 | nerr = 0 64 | do i = 1, n 65 | if (a(i) /= n-i+1) nerr = nerr+1 66 | if (b(i) /= 10*(n-i+1)) nerr = nerr+1 67 | end do 68 | 69 | print *, 'automaticDSM errors: ', nerr 70 | 71 | call assumedSizeDSM<<<1,n,n*12>>>(a_d, b_d) 72 | a = a_d; b = b_d 73 | 74 | nerr = 0 75 | do i = 1, n 76 | if (a(i) /= i) nerr = nerr+1 77 | if (b(i) /= 10*i) nerr = nerr+1 78 | end do 79 | 80 | print *, 'assumeSizeDSM errors: ', nerr 81 | end program main 82 | 83 | -------------------------------------------------------------------------------- /code/ch09/accuracy_sum.f90: -------------------------------------------------------------------------------- 1 | program sum_accuracy 2 | implicit none 3 | real, allocatable :: x(:) 4 | real :: sum_intrinsic,sum_cpu, sum_kahan, sum_pairwise, & 5 | comp, y, tmp 6 | double precision :: sum_cpu_dp 7 | integer :: i,inext,icurrent, N=10000000 8 | 9 | allocate (x(N)) 10 | x=7. 11 | 12 | ! Summation using intrinsic 13 | sum_intrinsic=sum(x) 14 | 15 | ! Recursive summation 16 | sum_cpu=0. 17 | sum_cpu_dp=0.d0 18 | do i=1,N 19 | ! accumulator in single precision 20 | sum_cpu=sum_cpu+x(i) 21 | ! accumulator in double precision 22 | sum_cpu_dp=sum_cpu_dp+x(i) 23 | end do 24 | 25 | ! Kahan summation 26 | sum_kahan=0. 27 | comp=0. ! running compensation to recover lost low-order bits 28 | 29 | do i=1,N 30 | y = comp +x(i) 31 | tmp = sum_kahan + y ! low-order bits may be lost 32 | comp = (sum_kahan-tmp)+y ! (sum-tmp) recover low-order bits 33 | sum_kahan = tmp 34 | end do 35 | sum_kahan=sum_kahan +comp 36 | 37 | ! Pairwise summation 38 | icurrent=N 39 | inext=ceiling(real(N)/2) 40 | do while (inext >1) 41 | do i=1,inext 42 | if ( 2*i <= icurrent) x(i)=x(i)+x(i+inext) 43 | end do 44 | icurrent=inext 45 | inext=ceiling(real(inext)/2) 46 | end do 47 | sum_pairwise=x(1)+x(2) 48 | 49 | write(*, "('Summming ',i10, & 50 | ' elements of magnitude ',f3.1)") N,7. 
51 | write(*, "('Sum with intrinsic function =',f12.1, & 52 | ' Error=', f12.1)") & 53 | sum_intrinsic, 7.*N-sum_intrinsic 54 | write(*, "('Recursive sum with SP accumulator =',f12.1, & 55 | ' Error=', f12.1)") sum_cpu, 7.*N-sum_cpu 56 | write(*, "('Recursive sum with DP accumulator =',f12.1, & 57 | ' Error=', f12.1)") sum_cpu_dp, 7.*N-sum_cpu_dp 58 | write(*, "('Pairwise sum in SP =',f12.1, & 59 | ' Error=', f12.1)") sum_pairwise, 7.*N-sum_pairwise 60 | write(*, "('Compensated sum in SP =',f12.1, & 61 | ' Error=', f12.1)") sum_kahan, 7.*N-sum_kahan 62 | 63 | deallocate(x) 64 | end program sum_accuracy 65 | -------------------------------------------------------------------------------- /code/ch07/matmulTC.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensorEx 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | real(8) :: a(m,k), b(k,n) 7 | real(8) :: c(m,n), cref(m,n) 8 | real(8), device :: a_d(m,k), b_d(k,n) 9 | real(8), device :: c_d(m,n) 10 | integer :: istat 11 | 12 | type(cudaDeviceProp) :: prop 13 | type(cudaEvent) :: startEvent, stopEvent 14 | real :: err, time 15 | 16 | istat = cudaGetDeviceProperties(prop, 0) 17 | print "(' Device: ', a)", trim(prop%name) 18 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 19 | 20 | istat = cudaEventCreate(startEvent) 21 | istat = cudaEventCreate(stopEvent) 22 | 23 | call random_number(a) 24 | call random_number(b) 25 | 26 | ! on host 27 | cref = matmul(a,b) 28 | 29 | a_d = a; b_d = b 30 | ! 
for overhead 31 | c_d = matmul(a_d, b_d) 32 | 33 | c_d = 0.0 34 | istat = cudaDeviceSynchronize() 35 | istat = cudaEventRecord(startEvent, 0) 36 | c_d = matmul(a_d, b_d) 37 | istat = cudaEventRecord(stopEvent, 0) 38 | istat = cudaEventSynchronize(stopEvent) 39 | c = c_d 40 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 41 | 42 | print *, 'cuTensor matmul maxval(abs(cref-c)): ', maxval(abs(cref-c)) 43 | print *, 'cuTensor matmul TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 44 | 45 | block 46 | use cublas_v2 47 | type(cublasHandle) :: handle 48 | 49 | istat = cublasCreate(handle) 50 | c_d = 0.0 51 | istat = cudaDeviceSynchronize() 52 | istat = cudaEventRecord(startEvent, 0) 53 | istat = cublasDGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 54 | m, n, k, 1.0_8, a_d, m, b_d, k, 0.0_8, c_d, n) 55 | istat = cudaEventRecord(stopEvent, 0) 56 | istat = cudaEventSynchronize(stopEvent) 57 | c = c_d 58 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 59 | 60 | print *, 'cublasDGemm maxval(abs(cref-c)): ', maxval(abs(cref-c)) 61 | print *, 'cublasDGemm TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 62 | 63 | istat = cublasDestroy(handle) 64 | end block 65 | 66 | end program main 67 | 68 | -------------------------------------------------------------------------------- /code/ch04/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = twoKernels pipeline streamSync eventSync \ 2 | defaultStream defaultStreamVar defaultStreamVarExplicit \ 3 | differentStreamTypes concurrentKernels \ 4 | sharedExample sharedMultiple syncthreads ballot shfl \ 5 | raceAndAtomic raceAndAtomicShared threadfence cgReverse \ 6 | smooth swap 7 | 8 | # section 4.1.2.1 9 | 10 | twoKernels: twoKernels.cuf 11 | nvfortran -o $@ $< 12 | 13 | # section 4.1.3 14 | 15 | pipeline: pipeline.cuf 16 | nvfortran -o $@ $< 17 | 18 | # section 4.1.4.2 19 | 20 | streamSync: streamSync.cuf 21 | nvfortran -o $@ $< 22 | 23 | # section 4.1.4.3 24 | 25 | eventSync: 
eventSync.cuf 26 | nvfortran -o $@ $< 27 | 28 | # section 4.1.5.1 29 | 30 | defaultStream: defaultStream.cuf 31 | nvfortran -o $@ $< 32 | 33 | defaultStreamVar: defaultStreamVar.cuf 34 | nvfortran -o $@ $< 35 | 36 | defaultStreamVarExplicit: defaultStreamVarExplicit.cuf 37 | nvfortran -o $@ $< 38 | 39 | # section 4.1.5.2 40 | 41 | differentStreamTypes: differentStreamTypes.cuf 42 | nvfortran -o $@ $< 43 | 44 | concurrentKernels: concurrentKernels.cuf 45 | nvfortran -o $@ $< 46 | 47 | # section 4.2.1 48 | 49 | sharedExample: sharedExample.cuf 50 | nvfortran -o $@ $< 51 | 52 | sharedMultiple: sharedMultiple.cuf 53 | nvfortran -o $@ $< 54 | 55 | # section 4.2.2 56 | 57 | syncthreads: syncthreads.cuf 58 | nvfortran -o $@ $< 59 | 60 | # section 4.2.3 61 | 62 | ballot: ballot.cuf 63 | nvfortran -o $@ $< 64 | 65 | # section 4.2.3.1 66 | 67 | shfl: shfl.cuf 68 | nvfortran -o $@ $< 69 | 70 | # section 4.2.4 71 | 72 | raceAndAtomic: raceAndAtomic.cuf 73 | nvfortran -o $@ $< 74 | 75 | raceAndAtomicShared: raceAndAtomicShared.cuf 76 | nvfortran -o $@ $< 77 | 78 | # section 4.2.5 79 | 80 | threadfence: threadfence.cuf 81 | nvfortran -o $@ $< 82 | 83 | # section 4.2.6 84 | 85 | cgReverse: cgReverse.cuf 86 | nvfortran -o $@ $< 87 | 88 | # section 4.2.6.1 89 | 90 | smooth: smooth.cuf 91 | nvfortran -o $@ $< 92 | 93 | # section 4.2.6.2 94 | 95 | swap: swap.cuf 96 | nvfortran -o $@ -gpu=cc90 $< 97 | 98 | 99 | clean: 100 | rm -rf $(OBJS) *.o *.mod *~ 101 | -------------------------------------------------------------------------------- /code/ch06/test_union.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | 4 | attributes(global) subroutine r1(x, n) 5 | implicit none 6 | real :: x(*) 7 | integer, value :: n 8 | integer :: i 9 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 10 | if (i <= n) x(i) = i 11 | end subroutine r1 12 | 13 | ! 
can use ignore_tkr if you have access to the routine 14 | attributes(global) subroutine c1(x, n) 15 | implicit none 16 | !dir$ ignore_tkr x 17 | complex :: x(*) 18 | integer, value :: n 19 | integer :: i 20 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 21 | if (i <= n) x(i) = cmplx(-i, -1.0) 22 | end subroutine c1 23 | 24 | attributes(global) subroutine c2(x, n) 25 | implicit none 26 | complex :: x(*) 27 | integer, value :: n 28 | integer :: i 29 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 30 | if (i <= n) x(i) = cmplx(-i*2, -2.0) 31 | end subroutine c2 32 | 33 | ! use a wrapper with ignore_tkr 34 | subroutine wrap_c2(x, n, g, b) 35 | implicit none 36 | !dir$ ignore_tkr x 37 | complex, device :: x(*) 38 | integer, value :: n, g, b 39 | call c2<<>>(x, n) 40 | end subroutine wrap_c2 41 | 42 | end module m 43 | 44 | 45 | program main 46 | use m 47 | use union 48 | use cudafor 49 | implicit none 50 | integer, parameter :: blks=2, tpb = 16, n = blks*tpb 51 | real :: r(n) 52 | real, device :: r_d(n) 53 | complex, device, pointer :: ptr(:) 54 | integer :: i 55 | 56 | ! use r_d as real 57 | call r1<<>>(r_d, n) 58 | r = r_d 59 | print *, r(1:4) 60 | 61 | ! OK since ignore_tkr is in the kernel 62 | call c1<<>>(r_d, n/2) 63 | r = r_d 64 | print *, r(1:4) 65 | 66 | ! OK since ignore_tkr is in the wrapper to c2 67 | call wrap_c2(r_d, n/2, blks/2, tpb) 68 | r = r_d 69 | print *, r(1:4) 70 | 71 | r_d = 0.0 72 | 73 | call union(r_d, ptr) 74 | ! OK since ptr is complex 75 | call c2<<>>(ptr, n/2) 76 | r = r_d 77 | print *, r(1:4) 78 | 79 | !for use in CUF kernel as a complex, use pointer 80 | !$cuf kernel do 81 | do i = 1, n/2 82 | ptr(i) = conjg(ptr(i)) 83 | end do 84 | r = r_d 85 | print *, r(1:4) 86 | 87 | end program main 88 | -------------------------------------------------------------------------------- /code/ch09/compute_pi.cuf: -------------------------------------------------------------------------------- 1 | ! 
Compute pi using a Monte Carlo method 2 | 3 | program compute_pi 4 | use precision_m 5 | use curand 6 | implicit none 7 | real(fp_kind), allocatable:: hostData(:) 8 | real(fp_kind), allocatable, device:: deviceData(:) 9 | real(fp_kind) :: pival 10 | type(curandGenerator):: gen 11 | integer :: inside_gpu, inside_cpu, i, istat 12 | integer(kind=8) :: twoN, seed, N 13 | 14 | ! Define how many numbers we want to generate 15 | N=100000 16 | twoN=N*2 17 | 18 | ! Allocate array on CPU 19 | allocate(hostData(twoN)) 20 | 21 | ! Allocate array on GPU 22 | allocate(deviceData(twoN)) 23 | 24 | if (fp_kind == singlePrecision) then 25 | write(*,"('Compute pi in single precision')") 26 | else 27 | write(*,"('Compute pi in double precision')") 28 | end if 29 | 30 | ! Create pseudonumber generator 31 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 32 | 33 | ! Set seed 34 | seed=1234 35 | istat = curandSetPseudoRandomGeneratorSeed( gen, seed) 36 | 37 | ! Generate N floats or double on device 38 | istat = curandGenerate(gen, deviceData, twoN) 39 | 40 | ! Copy the data back to CPU to check result later 41 | hostData=deviceData 42 | 43 | ! Perform the test on GPU using CUF kernel 44 | inside_gpu=0 45 | !$cuf kernel do <<<*,*>>> 46 | do i=1,N 47 | if( (deviceData(i)**2+deviceData(i+N)**2) <= 1._fp_kind ) & 48 | inside_gpu=inside_gpu+1 49 | end do 50 | 51 | ! Perform the test on CPU 52 | inside_cpu=0 53 | do i=1,N 54 | if( (hostData(i)**2+hostData(i+N)**2) <= 1._fp_kind ) & 55 | inside_cpu=inside_cpu+1 56 | end do 57 | 58 | ! Check the results 59 | if (inside_cpu .ne. inside_gpu) & 60 | write(*,*) "Mismatch between CPU/GPU" 61 | 62 | ! Print the value of pi and the error 63 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 64 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, & 65 | " Pi=", pival, & 66 | " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 67 | 68 | ! 
Deallocate data on CPU and GPU 69 | deallocate(hostData) 70 | deallocate(deviceData) 71 | 72 | ! Destroy the generator 73 | istat = curandDestroyGenerator(gen) 74 | end program compute_pi 75 | -------------------------------------------------------------------------------- /code/ch06/Makefile: -------------------------------------------------------------------------------- 1 | OBJS = portingBase portingManaged portingDevice \ 2 | portingManaged_HOST portingManaged_CUDA \ 3 | portingDevice_HOST portingDevice_CUDA \ 4 | portingManagedSent_HOST portingManagedSent_CUDA \ 5 | portingDeviceSent_HOST portingDeviceSent_CUDA \ 6 | laplace2D laplace2DUse_HOST laplace2DUse_CUDA \ 7 | portingAssociate_HOST portingAssociate_CUDA \ 8 | laplace2DAssoc_HOST laplace2DAssoc_CUDA test_union \ 9 | test_union testCompact testCompactOpt 10 | 11 | # section 6.1 12 | 13 | portingBase: portingBase.f90 14 | nvfortran -o $@ $< 15 | 16 | portingManaged: portingManaged.cuf 17 | nvfortran -o $@ $< 18 | 19 | portingDevice: portingDevice.cuf 20 | nvfortran -o $@ $< 21 | 22 | # section 6.2 23 | 24 | portingManaged_HOST: portingManaged_CUDA.F90 25 | nvfortran -o $@ $< 26 | 27 | portingManaged_CUDA: portingManaged_CUDA.F90 28 | nvfortran -cuda -o $@ $< 29 | 30 | portingDevice_HOST: portingDevice_CUDA.F90 31 | nvfortran -o $@ $< 32 | 33 | portingDevice_CUDA: portingDevice_CUDA.F90 34 | nvfortran -cuda -o $@ $< 35 | 36 | 37 | portingManagedSent_HOST: portingManagedSent.f90 38 | nvfortran -o $@ $< 39 | 40 | portingManagedSent_CUDA: portingManagedSent.f90 41 | nvfortran -cuda -o $@ $< 42 | 43 | portingDeviceSent_HOST: portingDeviceSent.F90 44 | nvfortran -o $@ $< 45 | 46 | portingDeviceSent_CUDA: portingDeviceSent.F90 47 | nvfortran -cuda -o $@ $< 48 | 49 | # section 6.3.1 50 | 51 | laplace2D: laplace2D.f90 52 | nvfortran -o $@ $< 53 | 54 | laplace2DUse_HOST: laplace2DUse.F90 55 | nvfortran -O3 -o $@ $< 56 | 57 | laplace2DUse_CUDA: laplace2DUse.F90 58 | nvfortran -O3 -cuda -o $@ $< 59 | 60 | # section 
6.3.2 61 | 62 | portingAssociate_HOST: portingAssociate.f90 63 | nvfortran -o $@ $< 64 | 65 | portingAssociate_CUDA: portingAssociate.f90 66 | nvfortran -cuda -o $@ $< 67 | 68 | laplace2DAssoc_HOST: laplace2DAssoc.f90 69 | nvfortran -O3 -o $@ $< 70 | 71 | laplace2DAssoc_CUDA: laplace2DAssoc.f90 72 | nvfortran -O3 -cuda -o $@ $< 73 | 74 | # section 6.4 75 | 76 | test_union: union_m.cuf test_union.cuf 77 | nvfortran -o $@ $^ 78 | 79 | # section 6.5 80 | 81 | testCompact: compact_m.cuf testCompact.cuf 82 | nvfortran -o $@ $^ 83 | 84 | testCompactOpt: compactOpt_m.cuf testCompact.cuf 85 | nvfortran -o $@ $^ 86 | 87 | 88 | clean: 89 | rm -rf $(OBJS) *.o *.mod *~ 90 | -------------------------------------------------------------------------------- /code/ch04/threadfence.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | contains 3 | attributes(global) subroutine s1(a, b, n, useThreadfence) 4 | implicit none 5 | real :: a(n), b(n) 6 | integer, value :: n 7 | logical, value :: useThreadfence 8 | integer :: tid, i 9 | 10 | tid = (blockIdx%x-1)*blockDim%x + threadIdx%x 11 | 12 | do i = tid, n, blockDim%x*gridDim%x 13 | a(i) = i 14 | if (useThreadfence) call threadfence() 15 | b(i) = i 16 | enddo 17 | end subroutine s1 18 | 19 | attributes(global) subroutine s2(a, b, n, flag, useThreadfence) 20 | implicit none 21 | real :: a(n), b(n), aval, bval 22 | integer, value :: n 23 | integer :: flag 24 | logical, value :: useThreadfence 25 | integer :: tid, i, tmp 26 | 27 | tid = (blockIdx%x-1)*blockDim%x + threadIdx%x 28 | do i = tid, n, blockDim%x*gridDim%x 29 | bval = b(i) 30 | if (useThreadfence) call threadfence() 31 | aval = a(i) 32 | if (aval == 0 .and. 
bval == i) tmp = atomicExch(flag, 1) 33 | enddo 34 | end subroutine s2 35 | 36 | end module m 37 | 38 | program threadfence 39 | use cudafor 40 | use m 41 | implicit none 42 | integer, parameter :: n=10000*1024 43 | integer, parameter :: niter = 2000 44 | real, device :: a_d(n), b_d(n) 45 | integer, device :: flag_d 46 | integer :: flag 47 | logical :: useThreadfence 48 | integer(cuda_stream_kind) :: stream1, stream2 49 | integer :: istat, i, j, icount 50 | 51 | istat = cudaStreamCreate(stream1) 52 | istat = cudaStreamCreate(stream2) 53 | 54 | do j = 1, 2 55 | if (j == 1) then 56 | useThreadfence = .false. 57 | print *, 'Runs without threadfence(): ' 58 | else 59 | useThreadfence = .true. 60 | print *, 'Runs with threadfence(): ' 61 | endif 62 | 63 | icount = 0 64 | do i = 1, niter 65 | a_d = 0.0 66 | b_d = 0.0 67 | flag_d = 0 68 | 69 | call s1<<<10,512,0,stream1>>>(a_d, b_d, n, useThreadfence) 70 | call s2<<<10,512,0,stream2>>>(a_d, b_d, n, flag_d, useThreadfence) 71 | flag = flag_d 72 | if (flag == 1) icount = icount+1 73 | enddo 74 | print "(a, i0,'/',i0)", & 75 | ' iterations where out-of-order access observed: ', & 76 | icount, niter 77 | end do 78 | 79 | istat = cudaStreamDestroy(stream1) 80 | istat = cudaStreamDestroy(stream2) 81 | end program threadfence 82 | -------------------------------------------------------------------------------- /code/ch04/swap.cuf: -------------------------------------------------------------------------------- 1 | module m 2 | integer, parameter :: nx = 64 3 | integer, parameter :: readRemoteSharedMemory = 1 4 | integer, parameter :: writeRemoteSharedMemory = 2 5 | contains 6 | attributes(global) cluster_dims(2,1,1) subroutine swap(a, b, mode) 7 | use cooperative_groups 8 | implicit none 9 | integer, device :: a(nx), b(nx) 10 | integer, value :: mode 11 | 12 | type(cluster_group) :: cluster 13 | integer, shared :: s(nx/2) 14 | integer, shared :: ds(nx/2); pointer(dsPtr, ds) 15 | integer :: i, tibIdx, tibDim, ticIdx, bicIdx 16 | 
17 | cluster = this_cluster() 18 | 19 | tibIdx = threadIdx%x ! thread index in block 20 | tibDim = blockDim%x ! number of threads in block 21 | bicIdx = cluster%rank ! block index in cluster 22 | ticIdx = (bicIdx-1)*tibDim + tibIdx ! thread index in cluster 23 | 24 | s(tibIdx) = 0 25 | call syncthreads(cluster) 26 | 27 | ! get pointer to other block's shared memory 28 | if (cluster%rank == 1) then 29 | dsPtr = cluster_map_shared_rank(s, 2) 30 | else 31 | dsPtr = cluster_map_shared_rank(s, 1) 32 | end if 33 | 34 | if (mode == readRemoteSharedMemory) then 35 | s(tibIdx) = a(ticIdx) ! write to local shared memory 36 | call syncthreads(cluster) 37 | b(ticIdx) = ds(tibIdx) ! read from remote shared memory 38 | call syncthreads(cluster) ! ensure remote read finishes before exit 39 | else ! writeRemoteSharedMemory 40 | ds(tibIdx) = a(ticIdx) ! write to remote shared memory 41 | call syncthreads(cluster) 42 | b(ticIdx) = s(tibIdx) ! read from local shared memory 43 | end if 44 | end subroutine swap 45 | end module m 46 | 47 | program main 48 | use m 49 | use cudafor 50 | implicit none 51 | integer :: a(nx), b(nx), gold(nx) 52 | integer, device :: a_d(nx), b_d(nx) 53 | integer :: i 54 | 55 | do i = 1, nx 56 | a(i) = i 57 | enddo 58 | 59 | gold(1:nx/2) = a(nx/2+1:nx) 60 | gold(nx/2+1:nx) = a(1:nx/2) 61 | 62 | a_d = a 63 | b_d = -1 64 | 65 | call swap<<<2,nx/2>>>(a_d, b_d, readRemoteSharedMemory) 66 | b = b_d 67 | print *, 'Remote read maxval(abs(gold-b)): ', maxval(abs(gold-b)) 68 | 69 | b_d = -1 70 | call swap<<<2,nx/2>>>(a_d, b_d, writeRemoteSharedMemory) 71 | b = b_d 72 | print *, 'Remote write maxval(abs(gold-b)): ', maxval(abs(gold-b)) 73 | end program main 74 | -------------------------------------------------------------------------------- /code/ch11/fft_derivative.cuf: -------------------------------------------------------------------------------- 1 | program fft_derivative 2 | use precision_m 3 | use cufft 4 | implicit none 5 | real(fp_kind), allocatable :: kx(:), 
derivative(:) 6 | real(fp_kind), allocatable, device :: kx_d(:) 7 | 8 | complex(fp_kind), allocatable :: cinput(:),coutput(:) 9 | complex(fp_kind), allocatable, device :: cinput_d(:),coutput_d(:) 10 | 11 | integer :: i,n,plan, istat 12 | real(fp_kind) :: twopi=8._fp_kind*atan(1._fp_kind), h 13 | 14 | interface cufftExec 15 | module procedure cufftExecC2C, cufftExecZ2Z 16 | end interface cufftExec 17 | 18 | n=8 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate(cinput(n),coutput(n),derivative(n),kx(n)) 23 | 24 | ! allocate arrays on the device 25 | allocate(cinput_d(n),coutput_d(n),kx_d(n)) 26 | 27 | ! initialize arrays on host 28 | kx = [((i-1),i=1,n/2), ((-n+i-1),i=n/2+1,n)] 29 | 30 | ! Set the wave number for the Nyquist frequency to zero 31 | kx(n/2+1) = 0._fp_kind 32 | 33 | ! Copy the wave number vector to the device 34 | kx_d = kx 35 | 36 | do i=1,n 37 | cinput(i) = (cos(2*real(i-1,fp_kind)*h) & 38 | +sin(3*real(i-1,fp_kind)*h)) 39 | derivative(i) = (-2*sin(2*real(i-1,fp_kind)*h) & 40 | +3*cos(3*real(i-1,fp_kind)*h)) 41 | end do 42 | 43 | ! copy input to device 44 | cinput_d = cinput 45 | 46 | ! Initialize the plan for complex to complex transform 47 | if (fp_kind == real32) istat=cufftPlan1D(plan,n,CUFFT_C2C,1) 48 | if (fp_kind == real64) istat=cufftPlan1D(plan,n,CUFFT_Z2Z,1) 49 | 50 | ! Forward transform out of place 51 | istat = cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 52 | 53 | ! Compute the derivative in spectral space and normalize the FFT 54 | !$cuf kernel do <<<*,*>>> 55 | do i=1,n 56 | coutput_d(i) = cmplx(0.,kx_d(i),fp_kind)*coutput_d(i)/n 57 | end do 58 | 59 | ! Inverse transform in place 60 | istat = cufftExec(plan,coutput_d,coutput_d,CUFFT_INVERSE) 61 | 62 | ! 
Copy results back to host 63 | coutput = coutput_d 64 | 65 | print *," First Derivative from complex array" 66 | do i=1,n 67 | write(*,'(i2,2(1x,f8.4),2x,e13.7)') i, real(coutput(i)), & 68 | derivative(i), real(coutput(i))-derivative(i) 69 | end do 70 | 71 | !release memory on the host and on the device 72 | deallocate(cinput,coutput,kx,derivative,cinput_d,coutput_d,kx_d) 73 | 74 | ! Destroy the plan 75 | istat = cufftDestroy(plan) 76 | 77 | end program fft_derivative 78 | -------------------------------------------------------------------------------- /code/ch07/cusparseMV_Ex.cuf: -------------------------------------------------------------------------------- 1 | program sparseMatVec 2 | use cudafor 3 | use cusparseEx 4 | use, intrinsic :: iso_fortran_env 5 | 6 | implicit none 7 | 8 | integer, parameter :: n = 5 ! # rows/cols in matrix 9 | integer, parameter :: nnz = 5 ! # nonzeros in matrix 10 | integer, parameter :: fpKind = real32 11 | 12 | type(cusparseHandle) :: h 13 | type(cusparseSpMatDescr) :: descrA 14 | type(cusparseDnVecDescr) :: descrX, descrY 15 | 16 | ! CSR matrix 17 | real(fpKind), device :: csrValues_d(nnz) 18 | integer(4), device :: csrRowOffsets_d(n+1), csrColInd_d(nnz) 19 | 20 | ! dense vectors 21 | real(fpKind), device :: x_d(n), y_d(n) 22 | real(fpKind) :: y(n) 23 | 24 | ! parameters 25 | real(fpKind) :: alpha, beta 26 | 27 | integer :: status, i 28 | 29 | if (fpKind == real32) then 30 | print *, 'Using single precision' 31 | else 32 | print *, 'Using double precision' 33 | endif 34 | 35 | 36 | ! initalize cusparse 37 | status = cusparseCreate(h) 38 | 39 | ! CSR representation for upper circular shift matrix 40 | csrValues_d = 1.0 41 | csrColInd_d = [2, 3, 4, 5, 1] 42 | csrRowOffsets_d = [1, 2, 3, 4, 5, 6] 43 | 44 | ! vectors 45 | x_d = [11.0, 12.0, 13.0, 14.0, 15.0] 46 | y_d = 0.0 47 | 48 | y = x_d 49 | print *, 'Original vector' 50 | print "(5(1x,f7.2))", y 51 | 52 | ! 
initialize sparse matrix descriptor A in CSR format 53 | status = cusparseCreateCsr(descrA, & 54 | n, csrRowOffsets_d, csrColInd_d, csrValues_d) 55 | 56 | ! initialize the dense vector descriptors for X and Y 57 | status = cusparseCreateDnVec(descrX, x_d) 58 | status = cusparseCreateDnVec(descrY, y_d) 59 | 60 | ! y = alpha*A*x + beta*y 61 | 62 | alpha = 1.0 63 | beta = 0.0 64 | status = cusparseSpMV(h, CUSPARSE_OPERATION_NON_TRANSPOSE, & 65 | alpha, descrA, descrX, beta, descrY) 66 | 67 | y = y_d 68 | print *, 'Shifted vector' 69 | print "(5(1x,f7.2))", y 70 | 71 | ! shift down and subtract original 72 | ! x = alpha*(A')*y + beta*x 73 | 74 | alpha = 1.0 75 | beta = -1.0 76 | status = cusparseSpMV(h, CUSPARSE_OPERATION_TRANSPOSE, & 77 | alpha, descrA, descrY, beta, descrX) 78 | 79 | y = x_d 80 | print *, 'Max error: ', maxval(abs(y)) 81 | 82 | ! cleanup 83 | 84 | status = cusparseDestroySpMat(descrA) 85 | status = cusparseDestroyDnVec(descrX) 86 | status = cusparseDestroyDnVec(descrY) 87 | status = cusparseDestroy(h) 88 | 89 | end program sparseMatVec 90 | -------------------------------------------------------------------------------- /code/ch01/deviceQuery.cuf: -------------------------------------------------------------------------------- 1 | program deviceQuery 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | print "(/,'No CUDA devices found',/)" 14 | stop 15 | else if (nDevices == 1) then 16 | print "(/,'One CUDA device found',/)" 17 | else 18 | print "(/,i0,' CUDA devices found',/)", nDevices 19 | end if 20 | 21 | ! Loop over devices (N.B. 0-based enumeration) 22 | 23 | do i = 0, nDevices-1 24 | 25 | print "('Device Number: ',i0)", i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | 29 | ! 
General device info 30 | 31 | print "(' Device Name: ', a)", trim(prop%name) 32 | print "(' Compute Capability: ',i0,'.',i0)", & 33 | prop%major, prop%minor 34 | print "(' Number of Multiprocessors: ',i0)", & 35 | prop%multiProcessorCount 36 | print "(' Single- to Double-Precision Perf Ratio: & 37 | &', i0)", & 38 | prop%singleToDoublePrecisionPerfRatio 39 | print "(' Max Threads per Multiprocessor: ',i0)", & 40 | prop%maxThreadsPerMultiprocessor 41 | if (prop%cooperativeLaunch == 0) then 42 | print "(' Supports Cooperative Kernels: No',/)" 43 | else 44 | print "(' Supports Cooperative Kernels: Yes',/)" 45 | end if 46 | print "(' Global Memory (GB): ',f9.3,/)", & 47 | prop%totalGlobalMem/1024.0**3 48 | 49 | 50 | ! Execution Configuration 51 | 52 | print "(' Execution Configuration Limits')" 53 | print "(' Max Grid Dims: ',2(i0,' x '),i0)", & 54 | prop%maxGridSize 55 | print "(' Max Block Dims: ',2(i0,' x '),i0)", & 56 | prop%maxThreadsDim 57 | print "(' Max Threads per Block: ',i0,/)", & 58 | prop%maxThreadsPerBlock 59 | 60 | ! 
Has managed memory 61 | 62 | print "(' Managed Memory')" 63 | if (prop%managedMemory == 0) then 64 | print "(' Can Allocate Managed Memory: No')" 65 | else 66 | print "(' Can Allocate Managed Memory: Yes')" 67 | endif 68 | if (prop%concurrentManagedAccess == 0) then 69 | print "(' Device/CPU Concurrent Access & 70 | &to Managed Memory: No',/)" 71 | else 72 | print "(' Device/CPU Concurrent Access & 73 | &to Managed Memory: Yes',/)" 74 | endif 75 | 76 | enddo 77 | 78 | end program deviceQuery 79 | -------------------------------------------------------------------------------- /code/ch09/testPiGridGroup.cuf: -------------------------------------------------------------------------------- 1 | program testPiGridGroup 2 | use pi_gridGroup_m 3 | use precision_m 4 | use curand 5 | use cudafor 6 | implicit none 7 | 8 | integer, parameter :: blockSize = 256 9 | real(fp_kind), allocatable:: data_h(:) 10 | real(fp_kind), allocatable, device:: data_d(:) 11 | real(fp_kind) :: pival 12 | 13 | integer, device, allocatable :: partial_d(:) 14 | type(curandGenerator):: gen 15 | integer :: inside_gpu, inside_cpu, i, istat 16 | integer :: twoN, N, nBlocks 17 | integer(8) :: seed 18 | 19 | N=200000 20 | twoN=N*2 21 | 22 | allocate(data_h(twoN), data_d(twoN)) 23 | 24 | if (fp_kind == singlePrecision) then 25 | write(*,"('Compute pi in single precision')") 26 | else 27 | write(*,"('Compute pi in double precision')") 28 | end if 29 | 30 | ! Create pseudonumber generator 31 | istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 32 | 33 | ! Set seed 34 | seed=1234 35 | istat = curandSetPseudoRandomGeneratorSeed(gen, seed) 36 | 37 | ! Generate N floats or doubles on device 38 | istat = curandGenerate(gen, data_d, twoN) 39 | 40 | data_h = data_d 41 | 42 | ! 
determine the # of blocks to launch for grid_global kernel 43 | block 44 | type(cudaDeviceProp) :: prop 45 | istat = cudaGetDeviceProperties(prop, 0) 46 | istat = cudaOccupancyMaxActiveBlocksPerMultiprocessor( & 47 | nBlocks, pi_gg, blockSize, 0) 48 | nBlocks = nBlocks * prop%multiProcessorCount 49 | end block 50 | 51 | ! allocate the partial() array 52 | allocate(partial_d(nBlocks)) 53 | 54 | ! Perform the test on GPU using grid_global kernel 55 | inside_gpu=0 56 | call pi_gg<<<nBlocks,blockSize>>>(data_d, partial_d, twoN) 57 | inside_gpu = partial_d(1) 58 | 59 | ! Perform the test on CPU 60 | inside_cpu=0 61 | do i=1,N 62 | if( (data_h(i)**2 + data_h(i+N)**2) <= 1._fp_kind ) & 63 | inside_cpu=inside_cpu+1 64 | end do 65 | 66 | ! Check the results 67 | if (inside_cpu .ne. inside_gpu) & 68 | write(*,*) "Mismatch between CPU/GPU" 69 | 70 | ! Print the value of pi and the error 71 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 72 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, & 73 | " Pi=", pival, & 74 | " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 75 | 76 | ! Deallocate data on CPU and GPU 77 | deallocate(data_h, data_d, partial_d) 78 | 79 | !
Destroy the generator 80 | istat = curandDestroyGenerator(gen) 81 | 82 | end program testPiGridGroup 83 | -------------------------------------------------------------------------------- /code/ch07/potr.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudaforEx 3 | use cublas 4 | use cusolverDN 5 | implicit none 6 | integer, parameter :: n=3 7 | real(8) :: a(n,n), b(n) 8 | real(8), device :: a_d(n,n), b_d(n) 9 | integer, device :: devInfo_d 10 | integer(8) :: wsSizeH, wsSizeD 11 | integer(1), allocatable :: buffer(:) 12 | integer(1), device, allocatable :: buffer_d(:) 13 | integer, device :: info_d 14 | integer :: istat, i 15 | 16 | type(cusolverDnHandle) :: h 17 | type(cusolverDnParams) :: params 18 | 19 | a = reshape([25, 15, -5, 15, 18, 0, -5, 0, 11], shape=[3,3]) 20 | print *, 'A:' 21 | do i = 1, n 22 | print *, a(i,:) 23 | enddo 24 | a_d = a 25 | b = [40, 51, 28] 26 | b_d = b 27 | print *, 'b:' 28 | print *, b 29 | 30 | istat = cusolverDnCreate(h) 31 | istat = cusolverDnCreateParams(params) 32 | istat = cusolverDnXpotrf_buffersize(handle = h, & 33 | params = params, & 34 | uplo = CUBLAS_FILL_MODE_UPPER, & 35 | n = n, & 36 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 37 | A = a_d, & 38 | lda = n, & 39 | computeType = cudaDataType(cudaforGetDataType(a_d)), & 40 | workspaceinBytesOnDevice = wsSizeD, & 41 | workspaceinBytesOnHost = wsSizeH) 42 | 43 | allocate(buffer(wsSizeH), buffer_d(wsSizeD)) 44 | 45 | istat = cusolverDnXpotrf(handle = h, & 46 | params = params, & 47 | uplo = CUBLAS_FILL_MODE_UPPER, & 48 | n = n, & 49 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 50 | A = a_d, & 51 | lda = n, & 52 | computeType = cudaDataType(cudaforGetDataType(a_d)), & 53 | bufferOnDevice = buffer_d, & 54 | workspaceinBytesOnDevice = wsSizeD, & 55 | bufferOnHost = buffer, & 56 | workspaceinBytesOnHost = wsSizeH, & 57 | devinfo = info_d) 58 | 59 | a = a_d 60 | 61 | print *, 'Cholesky factorization:' 
62 | do i = 1, n 63 | print *, a(i,:) 64 | enddo 65 | 66 | istat = cusolverDnXpotrs(handle = h, & 67 | params = params, & 68 | uplo = CUBLAS_FILL_MODE_UPPER, & 69 | n = n, & 70 | nrhs = 1, & 71 | dataTypeA = cudaDataType(cudaforGetDataType(a_d)), & 72 | A = a_d, & 73 | lda = n, & 74 | dataTypeB = cudaDataType(cudaforGetDataType(b_d)), & 75 | B = b_d, & 76 | ldb = n, & 77 | devinfo = info_d) 78 | 79 | b = b_d 80 | 81 | print *, 'x:' 82 | print *, b 83 | 84 | ! cleanup 85 | 86 | deallocate(buffer, buffer_d) 87 | istat = cusolverDnDestroyParams(params) 88 | istat = cusolverDnDestroy(h) 89 | end program main 90 | -------------------------------------------------------------------------------- /code/ch06/laplace2D.f90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | end module arrays 14 | 15 | module laplaceRoutines 16 | contains 17 | subroutine initialize() 18 | use parameters 19 | use arrays 20 | implicit none 21 | real(fp_kind), parameter :: & 22 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 23 | real(fp_kind) :: y0(nx) 24 | integer :: i 25 | 26 | do i = 1, nx 27 | y0(i) = sin(pi*(i-1)/(nx-1)) 28 | enddo 29 | a = 0.0_fp_kind 30 | a(:,1) = y0 31 | a(:,ny) = y0*exp(-pi) 32 | aNew = a 33 | end subroutine initialize 34 | 35 | 36 | subroutine laplaceSolution() 37 | use parameters 38 | use arrays 39 | implicit none 40 | real(fp_kind) :: maxResidual = 2*tol 41 | integer :: iter 42 | 43 | iter=0 44 | do while ( maxResidual > tol .and. 
iter <= iterMax ) 45 | iter = iter + 1 46 | call jacobiIteration() 47 | maxResidual = maxval(absResidual) 48 | if(mod(iter,reportInterval) == 0) & 49 | print '(i8,3x,f10.6)', iter, maxResidual 50 | a = aNew 51 | end do 52 | end subroutine laplaceSolution 53 | 54 | 55 | subroutine jacobiIteration() 56 | use parameters 57 | use arrays 58 | implicit none 59 | integer :: i, j 60 | 61 | do j=2,ny-1 62 | do i=2,nx-1 63 | aNew(i,j) = 0.2_fp_kind * & 64 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 65 | 0.05_fp_kind * & 66 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 67 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 68 | end do 69 | end do 70 | end subroutine jacobiIteration 71 | 72 | end module laplaceRoutines 73 | 74 | 75 | program main 76 | use parameters 77 | use arrays 78 | use laplaceRoutines 79 | implicit none 80 | real :: startTime, stopTime 81 | 82 | print '(/,a,i0,a,i0,a)', & 83 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 84 | print *, 'Iteration Max Residual' 85 | 86 | call initialize() 87 | 88 | call cpu_time(startTime) 89 | call laplaceSolution() 90 | 91 | call cpu_time(stopTime) 92 | print '(a,f10.3,a)', ' Completed in ', & 93 | stopTime-startTime, ' seconds' 94 | end program main 95 | -------------------------------------------------------------------------------- /code/ch09/pi_shfl.cuf: -------------------------------------------------------------------------------- 1 | module pi_shfl_m 2 | contains 3 | attributes(global) subroutine partial_pi_shfl(input, partial, twoN) 4 | use precision_m 5 | implicit none 6 | real(fp_kind) :: input(twoN) 7 | integer :: partial(*) 8 | integer, value :: twoN 9 | 10 | integer, shared :: p_s(32) 11 | integer :: N, interior 12 | integer :: i, tid, width, warpID, laneID 13 | 14 | N = twoN/2 15 | tid = threadIdx%x+(BlockIdx%x-1)*BlockDim%x 16 | 17 | interior=0 18 | do i = tid, N, BlockDim%x*GridDim%x 19 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 20 | interior=interior+1 21 | end do 22 | 23 | ! 
Local reduction per warp 24 | i = 1 25 | do while (i < warpsize) 26 | interior = interior + __shfl_xor(interior,i) 27 | i = i*2 28 | end do 29 | 30 | ! first element of a warp writes to shared memory 31 | warpID = (threadIdx%x-1)/warpsize+1 ! warp ID within block 32 | laneID = threadIdx%x-(warpID-1)*warpsize ! thread ID within warp 33 | if (laneID == 1) p_s(warpID)=interior 34 | call syncthreads() 35 | 36 | ! reduction of shared memory values by first warp 37 | if (warpID == 1) then 38 | interior = p_s(laneID) 39 | width = blockDim%x/warpsize 40 | i = 1 41 | do while (i < width) 42 | interior = interior + __shfl_xor(interior, i, width) 43 | i = i*2 44 | end do 45 | if (laneID == 1) partial(blockIdx%x) = interior 46 | end if 47 | end subroutine partial_pi_shfl 48 | 49 | attributes(global) subroutine final_pi_shfl(partial) 50 | implicit none 51 | integer :: partial(*) 52 | 53 | integer, shared :: p_s(32) 54 | integer :: val 55 | integer :: i, warpID, laneID, width 56 | 57 | warpID = (threadIdx%x-1)/warpsize+1 58 | laneID = threadIdx%x - (warpID-1)*warpsize 59 | 60 | val = partial(threadIdx%x) 61 | i = 1 62 | do while (i < warpsize) 63 | val = val + __shfl_xor(val, i) 64 | i = i*2 65 | enddo 66 | 67 | ! if more than one warp, reduce amongst warps 68 | if (blockDim%x > warpsize) then 69 | if (laneID == 1) p_s(warpID) = val 70 | call syncthreads() 71 | 72 | if (warpID == 1) then 73 | val = p_s(laneID) 74 | width = blockDim%x/warpsize 75 | i = 1 76 | do while (i < width) 77 | val = val + __shfl_xor(val, i, width) 78 | i=i*2 79 | enddo 80 | end if 81 | endif 82 | 83 | if (warpID == 1 .and. 
laneID == 1) partial(1) = val 84 | end subroutine final_pi_shfl 85 | 86 | end module pi_shfl_m 87 | 88 | 89 | -------------------------------------------------------------------------------- /code/ch09/pi_gridGroup.cuf: -------------------------------------------------------------------------------- 1 | module pi_gridGroup_m 2 | contains 3 | attributes(grid_global) subroutine pi_gg(input, partial, twoN) 4 | use cooperative_groups 5 | use precision_m 6 | implicit none 7 | real(fp_kind) :: input(twoN) 8 | integer :: partial(*) 9 | integer, value :: twoN 10 | 11 | integer, shared :: p_s(32) 12 | type(grid_group) :: gg 13 | integer :: N, interior 14 | integer :: i, warpID, laneID, width 15 | 16 | warpID = ishft(threadIdx%x-1,-5)+1 17 | laneID = iand(threadIdx%x-1,31)+1 18 | N = twoN/2 19 | 20 | gg = this_grid() 21 | 22 | interior=0 23 | do i = gg%rank, N, gg%size 24 | if( (input(i)**2+input(i+N)**2) <= 1._fp_kind ) & 25 | interior=interior+1 26 | end do 27 | 28 | ! Local reduction per warp 29 | i = 1 30 | do while (i < warpsize) 31 | interior = interior + __shfl_xor(interior,i) 32 | i = i*2 33 | end do 34 | 35 | ! first element of a warp writes to shared memory 36 | if (laneID == 1) p_s(warpID)=interior 37 | 38 | call syncthreads(this_thread_block()) 39 | 40 | ! reduction of shared memory values by first warp 41 | if (warpID == 1) then 42 | interior = p_s(laneID) 43 | width = blockDim%x/warpsize 44 | i = 1 45 | do while (i < width) 46 | interior = interior + __shfl_xor(interior, i, width) 47 | i = i*2 48 | end do 49 | if (laneID == 1) partial(blockIdx%x) = interior 50 | end if 51 | 52 | call syncthreads(gg) 53 | 54 | if (blockIdx%x == 1) then 55 | 56 | ! block-stride loop (if gridDim%x > blockDim%x) 57 | interior = 0 58 | do i = threadIdx%x, gridDim%x, blockDim%x 59 | interior = interior + partial(i) 60 | enddo 61 | 62 | i = 1 63 | do while (i < warpsize) 64 | interior = interior + __shfl_xor(interior, i) 65 | i = i*2 66 | enddo 67 | 68 | ! 
if more than one warp, reduce amongst warps 69 | if (gridDim%x > warpsize) then 70 | if (warpID == 1) p_s(laneID) = 0 71 | call syncthreads(this_thread_block()) 72 | if (laneID == 1) p_s(warpID) = interior 73 | call syncthreads(this_thread_block()) 74 | 75 | if (warpID == 1) then 76 | interior = p_s(laneID) 77 | i = 1 78 | do while (i < warpsize) 79 | interior = interior + __shfl_xor(interior, i) 80 | i=i*2 81 | enddo 82 | end if 83 | endif 84 | 85 | if (warpID == 1 .and. laneID == 1) partial(1) = interior 86 | end if 87 | 88 | end subroutine pi_gg 89 | 90 | end module pi_gridGroup_m 91 | 92 | 93 | -------------------------------------------------------------------------------- /code/ch08/p2pBandwidth.cuf: -------------------------------------------------------------------------------- 1 | program p2pBandwidth 2 | use cudafor 3 | implicit none 4 | integer, parameter :: N = 4*1024*1024 5 | type distributedArray 6 | real, device, allocatable :: a_d(:) 7 | end type distributedArray 8 | type (distributedArray), allocatable :: distArray(:) 9 | 10 | real, allocatable :: bandwidth(:,:) 11 | real :: array(N), time 12 | integer :: nDevices, access, i, j, istat 13 | type (cudaDeviceProp) :: prop 14 | type (cudaEvent) :: startEvent, stopEvent 15 | 16 | istat = cudaGetDeviceCount(nDevices) 17 | print "('Number of CUDA-capable devices: ', i0,/)", & 18 | nDevices 19 | 20 | do i = 0, nDevices-1 21 | istat = cudaGetDeviceProperties(prop, i) 22 | print "('Device ', i0, ': ', a)", i, trim(prop%name) 23 | enddo 24 | print * 25 | 26 | allocate(distArray(0:nDevices-1)) 27 | 28 | do j = 0, nDevices-1 29 | istat = cudaSetDevice(j) 30 | allocate(distArray(j)%a_d(N)) 31 | distArray(j)%a_d = j 32 | do i = j+1, nDevices-1 33 | istat = cudaDeviceCanAccessPeer(access, j, i) 34 | if (access == 1) then 35 | istat = cudaSetDevice(j) 36 | istat = cudaDeviceEnablePeerAccess(i, 0) 37 | istat = cudaSetDevice(i) 38 | istat = cudaDeviceEnablePeerAccess(j, 0) 39 | endif 40 | enddo 41 | end do 42 | 43 | 
allocate(bandwidth(0:nDevices-1, 0:nDevices-1)) 44 | bandwidth = 0.0 45 | 46 | do j = 0, nDevices-1 47 | istat = cudaSetDevice(j) 48 | istat = cudaEventCreate(startEvent) 49 | istat = cudaEventCreate(stopEvent) 50 | do i = 0, nDevices-1 51 | if (i == j) cycle 52 | istat = cudaMemcpyPeer(distArray(j)%a_d, j, & 53 | distArray(i)%a_d, i, N) 54 | istat = cudaEventRecord(startEvent,0) 55 | istat = cudaMemcpyPeer(distArray(j)%a_d, j, & 56 | distArray(i)%a_d, i, N) 57 | istat = cudaEventRecord(stopEvent,0) 58 | istat = cudaEventSynchronize(stopEvent) 59 | istat = cudaEventElapsedTime(time, & 60 | startEvent, stopEvent) 61 | 62 | array = distArray(j)%a_d 63 | if (all(array == i)) bandwidth(j,i) = N*4/time/1.0E+6 64 | end do 65 | distArray(j)%a_d = j 66 | istat = cudaEventDestroy(startEvent) 67 | istat = cudaEventDestroy(stopEvent) 68 | enddo 69 | 70 | print "('Bandwidth (GB/s) for transfer size (MB): ', & 71 | f9.3,/)", N*4.0/1024**2 72 | write (*,"(' S\\R 0')", advance='no') 73 | do i = 1, nDevices-1 74 | write(*,"(5x,i3)", advance='no') i 75 | enddo 76 | print * 77 | 78 | do j = 0, nDevices-1 79 | write(*,"(i3)", advance='no') j 80 | do i = 0, nDevices-1 81 | if (i == j) then 82 | write(*,"(4x,'-',3x)", advance='no') 83 | else 84 | write(*,"(f8.2)",advance='no') bandwidth(j,i) 85 | end if 86 | end do 87 | write(*,*) 88 | end do 89 | 90 | ! 
cleanup 91 | do j = 0, nDevices-1 92 | deallocate(distArray(j)%a_d) 93 | end do 94 | deallocate(distArray,bandwidth) 95 | 96 | end program p2pBandwidth 97 | -------------------------------------------------------------------------------- /code/ch06/laplace2DAssoc.f90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | !@cuf real(fp_kind), device :: a_d(nx,ny), aNew_d(nx,ny) 14 | !@cuf attributes(device) :: absResidual 15 | end module arrays 16 | 17 | module laplaceRoutines 18 | contains 19 | subroutine initialize() 20 | use parameters 21 | use arrays 22 | implicit none 23 | real(fp_kind), parameter :: & 24 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 25 | real(fp_kind) :: y0(nx) 26 | integer :: i 27 | 28 | do i = 1, nx 29 | y0(i) = sin(pi*(i-1)/(nx-1)) 30 | enddo 31 | a = 0.0_fp_kind 32 | a(:,1) = y0 33 | a(:,ny) = y0*exp(-pi) 34 | aNew = a 35 | !@cuf aNew_d = aNew 36 | !@cuf a_d = a 37 | end subroutine initialize 38 | 39 | 40 | subroutine laplaceSolution() 41 | use parameters 42 | !@cuf use cudafor 43 | use arrays 44 | implicit none 45 | real(fp_kind) :: maxResidual = 2*tol 46 | integer :: iter 47 | 48 | iter=0 49 | !@cuf associate(a=>a_d, aNew=>aNew_d) 50 | do while ( maxResidual > tol .and. 
iter <= iterMax ) 51 | iter = iter + 1 52 | call jacobiIteration() 53 | maxResidual = maxval(absResidual) 54 | if(mod(iter,reportInterval) == 0) & 55 | print '(i8,3x,f10.6)', iter, maxResidual 56 | a = aNew 57 | end do 58 | !@cuf end associate 59 | end subroutine laplaceSolution 60 | 61 | 62 | subroutine jacobiIteration() 63 | use parameters 64 | use arrays 65 | implicit none 66 | integer :: i, j 67 | 68 | !@cuf associate(a=>a_d, aNew=> aNew_d) 69 | !$cuf kernel do(2) <<<*,*>>> 70 | do j=2,ny-1 71 | do i=2,nx-1 72 | aNew(i,j) = 0.2_fp_kind * & 73 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 74 | 0.05_fp_kind * & 75 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 76 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 77 | end do 78 | end do 79 | !@cuf end associate 80 | end subroutine jacobiIteration 81 | 82 | end module laplaceRoutines 83 | 84 | 85 | program main 86 | use parameters 87 | use arrays 88 | use laplaceRoutines 89 | implicit none 90 | 91 | real :: startTime, stopTime 92 | 93 | !@cuf print '(/,a,/)', 'GPU associate version' 94 | print '(/,a,i0,a,i0,a)', & 95 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 96 | 97 | print *, 'Iteration Max Residual' 98 | 99 | call initialize() 100 | 101 | call cpu_time(startTime) 102 | call laplaceSolution() 103 | 104 | call cpu_time(stopTime) 105 | print '(a,f10.3,a)', ' Completed in ', & 106 | stopTime-startTime, ' seconds' 107 | end program main 108 | -------------------------------------------------------------------------------- /code/ch06/laplace2DUse.F90: -------------------------------------------------------------------------------- 1 | module parameters 2 | use, intrinsic :: iso_fortran_env 3 | integer, parameter :: nx = 4096, ny = 4096 4 | integer, parameter :: iterMax = 100 5 | integer, parameter :: reportInterval = 10 6 | integer, parameter :: fp_kind = real32 7 | real(fp_kind), parameter :: tol = 1.0e-5_fp_kind 8 | end module parameters 9 | 10 | module arrays 11 | use parameters 12 | real(fp_kind) :: a(nx,ny), 
aNew(nx,ny), absResidual(2:nx-1,2:ny-1) 13 | !@cuf real(fp_kind), device :: a_d(nx,ny), aNew_d(nx,ny) 14 | !@cuf attributes(device) :: absResidual 15 | end module arrays 16 | 17 | module laplaceRoutines 18 | contains 19 | subroutine initialize() 20 | use parameters 21 | use arrays 22 | implicit none 23 | real(fp_kind), parameter :: & 24 | pi = 2.0_fp_kind*asin(1.0_fp_kind) 25 | real(fp_kind) :: y0(nx) 26 | integer :: i 27 | 28 | do i = 1, nx 29 | y0(i) = sin(pi*(i-1)/(nx-1)) 30 | enddo 31 | a = 0.0_fp_kind 32 | a(:,1) = y0 33 | a(:,ny) = y0*exp(-pi) 34 | aNew = a 35 | !@cuf aNew_d = aNew 36 | !@cuf a_d = a 37 | end subroutine initialize 38 | 39 | 40 | subroutine laplaceSolution() 41 | use parameters 42 | #ifdef _CUDA 43 | use arrays, only: a => a_d, aNew => aNew_d, absResidual 44 | #else 45 | use arrays 46 | #endif 47 | !@cuf use cudafor 48 | implicit none 49 | real(fp_kind) :: maxResidual = 2*tol 50 | integer :: iter 51 | 52 | iter=0 53 | do while ( maxResidual > tol .and. iter <= iterMax ) 54 | iter = iter + 1 55 | call jacobiIteration() 56 | maxResidual = maxval(absResidual) 57 | if(mod(iter,reportInterval) == 0) & 58 | print '(i8,3x,f10.6)', iter, maxResidual 59 | a = aNew 60 | end do 61 | end subroutine laplaceSolution 62 | 63 | 64 | subroutine jacobiIteration() 65 | use parameters 66 | #ifdef _CUDA 67 | use arrays, only: a => a_d, aNew => aNew_d, absResidual 68 | #else 69 | use arrays 70 | #endif 71 | implicit none 72 | integer :: i, j 73 | 74 | !$cuf kernel do(2) <<<*,*>>> 75 | do j=2,ny-1 76 | do i=2,nx-1 77 | aNew(i,j) = 0.2_fp_kind * & 78 | (a(i,j-1)+a(i-1,j)+a(i+1,j)+a(i,j+1)) + & 79 | 0.05_fp_kind * & 80 | (a(i-1,j-1)+a(i+1,j-1)+a(i-1,j+1)+a(i+1,j+1)) 81 | absResidual(i,j) = abs(aNew(i,j)-a(i,j)) 82 | end do 83 | end do 84 | end subroutine jacobiIteration 85 | 86 | end module laplaceRoutines 87 | 88 | 89 | program main 90 | use parameters 91 | use arrays 92 | use laplaceRoutines 93 | implicit none 94 | 95 | real :: startTime, stopTime 96 | 97 | !@cuf 
print '(/,a,/)', 'GPU version' 98 | print '(/,a,i0,a,i0,a)', & 99 | 'Relaxation calculation on ', nx, ' x ', ny, ' mesh' 100 | 101 | print *, 'Iteration Max Residual' 102 | 103 | call initialize() 104 | 105 | call cpu_time(startTime) 106 | call laplaceSolution() 107 | 108 | call cpu_time(stopTime) 109 | print '(a,f10.3,a)', ' Completed in ', & 110 | stopTime-startTime, ' seconds' 111 | end program main 112 | -------------------------------------------------------------------------------- /code/ch05/async.cuf: -------------------------------------------------------------------------------- 1 | program async 2 | use cudafor 3 | implicit none 4 | integer, parameter :: maxStreams = 64 5 | integer, parameter :: n = maxStreams*1024*1024 6 | 7 | real(8), pinned, allocatable :: a(:), b(:) 8 | real(8), device :: a_d(n), b_d(n) 9 | real(8) :: gold(n) 10 | 11 | integer(kind=cuda_stream_kind) :: stream(maxStreams) 12 | type (cudaEvent) :: startEvent, stopEvent 13 | 14 | real :: time 15 | integer :: nStreams, i, j, offset, istat 16 | 17 | block 18 | type (cudaDeviceProp) :: prop 19 | istat = cudaGetDeviceProperties(prop, 0) 20 | print "(' Device: ', a)", trim(prop%name) 21 | print "(' Array size (MB): ', f8.2)", real(n)*8/1000/1000 22 | print "(/,' Streams time (ms)' )" 23 | end block 24 | 25 | block 26 | logical :: pinnedFlag 27 | ! allocate pinned host memory 28 | allocate(a(n), STAT=istat, PINNED=pinnedFlag) 29 | if (istat /= 0) then 30 | print *, 'Allocation of a failed' 31 | stop 32 | else 33 | if (.not. pinnedFlag) & 34 | print *, 'Pinned a allocation failed' 35 | end if 36 | allocate(b(n), STAT=istat, PINNED=pinnedFlag) 37 | if (istat /= 0) then 38 | print *, 'Allocation of b failed' 39 | stop 40 | else 41 | if (.not. pinnedFlag) & 42 | print *, 'Pinned b allocation failed' 43 | end if 44 | end block 45 | 46 | ! 
create events and streams 47 | istat = cudaEventCreate(startEvent) 48 | istat = cudaEventCreate(stopEvent) 49 | 50 | do i = 1, maxStreams 51 | istat = cudaStreamCreate(stream(i)) 52 | enddo 53 | 54 | call random_number(a) 55 | gold = 0.0 56 | 57 | ! warm up 58 | a_d = a 59 | !$cuf kernel do <<<*,*>>> 60 | do i = 1, n 61 | b_d(i) = sin(a_d(i)) + sin(2*a_d(i)) & 62 | + sin(3*a_d(i)) + sin(4*a_d(i)) 63 | enddo 64 | gold = b_d 65 | 66 | nStreams = 1 67 | do 68 | istat = cudaEventRecord(startEvent,0) 69 | do j = 1, nStreams 70 | offset = (j-1)*(n/nStreams) 71 | istat = cudaMemcpyAsync(a_d(offset+1), a(offset+1), & 72 | n/nStreams, stream(j)) 73 | 74 | !$cuf kernel do <<<*,*,0,stream(j)>>> 75 | do i = offset+1, offset+n/nStreams 76 | b_d(i) = sin(a_d(i)) + sin(2*a_d(i)) & 77 | + sin(3*a_d(i)) + sin(4*a_d(i)) 78 | enddo 79 | 80 | istat = cudaMemcpyAsync(b(offset+1), b_d(offset+1), & 81 | n/nStreams, stream(j)) 82 | enddo 83 | istat = cudaEventRecord(stopEvent, 0) 84 | istat = cudaEventSynchronize(stopEvent) 85 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 86 | if (maxval(abs(gold-b)) > 0.0) then 87 | print *, n, ' *** Error ***' 88 | else 89 | print *, nStreams, time 90 | end if 91 | nStreams = nStreams*2 92 | if (nStreams > maxStreams) exit 93 | enddo 94 | 95 | ! 
cleanup 96 | istat = cudaEventDestroy(startEvent) 97 | istat = cudaEventDestroy(stopEvent) 98 | do j = 1, maxStreams 99 | istat = cudaStreamDestroy(stream(j)) 100 | enddo 101 | deallocate(a, b) 102 | 103 | end program async 104 | 105 | -------------------------------------------------------------------------------- /code/ch07/gemmPerf.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cublas_v2 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | 7 | type(cublasHandle) :: handle 8 | type(cudaDeviceProp) :: prop 9 | type(cudaEvent) :: startEvent, stopEvent 10 | real :: err, time 11 | integer :: istat 12 | 13 | istat = cudaGetDeviceProperties(prop, 0) 14 | print "(' Device: ', a)", trim(prop%name) 15 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m, n, k 16 | 17 | istat = cublasCreate(handle) 18 | istat = cudaEventCreate(startEvent) 19 | istat = cudaEventCreate(stopEvent) 20 | 21 | block 22 | real(8) :: a(m,k), b(k,n), c(m,n), cref(m,n) 23 | real(8), device :: a_d(m,k), b_d(k,n), c_d(m,n) 24 | 25 | call random_number(a) 26 | call random_number(b) 27 | cref = matmul(a,b) 28 | 29 | a_d = a; b_d = b; c_d = 0.0 30 | istat = cudaDeviceSynchronize() 31 | istat = cudaEventRecord(startEvent, 0) 32 | istat = cublasDGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 33 | m, n, k, 1.0_8, a_d, m, b_d, k, 0.0_8, c_d, n) 34 | istat = cudaEventRecord(stopEvent, 0) 35 | istat = cudaEventSynchronize(stopEvent) 36 | c = c_d 37 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 38 | 39 | print *, 'DGEMM maxval(abs(cref-c)): ', maxval(abs(cref-c)) 40 | print *, 'DGEMM TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 41 | print * 42 | end block 43 | 44 | block 45 | real(4) :: a(m,k), b(k,n), c(m,n), cref(m,n) 46 | real(4), device :: a_d(m,k), b_d(k,n), c_d(m,n) 47 | 48 | call random_number(a) 49 | call random_number(b) 50 | cref = matmul(a,b) 51 | 52 | a_d = a; b_d = b; c_d = 0.0 53 | istat = 
cudaDeviceSynchronize() 54 | istat = cudaEventRecord(startEvent, 0) 55 | istat = cublasSGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 56 | m, n, k, 1.0, a_d, m, b_d, k, 0.0, c_d, n) 57 | istat = cudaEventRecord(stopEvent, 0) 58 | istat = cudaEventSynchronize(stopEvent) 59 | c = c_d 60 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 61 | 62 | print *, 'SGEMM (FP32) maxval(abs(cref-c)): ', maxval(abs(cref-c)) 63 | print *, 'SGEMM (FP32) TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 64 | print * 65 | end block 66 | 67 | block 68 | real(4) :: a(m,k), b(k,n), c(m,n), cref(m,n) 69 | real(4), device :: a_d(m,k), b_d(k,n), c_d(m,n) 70 | 71 | istat = cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) 72 | 73 | call random_number(a) 74 | call random_number(b) 75 | cref = matmul(a,b) 76 | 77 | a_d = a; b_d = b; c_d = 0.0 78 | istat = cudaDeviceSynchronize() 79 | istat = cudaEventRecord(startEvent, 0) 80 | istat = cublasSGemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, & 81 | m, n, k, 1.0, a_d, m, b_d, k, 0.0, c_d, n) 82 | istat = cudaEventRecord(stopEvent, 0) 83 | istat = cudaEventSynchronize(stopEvent) 84 | c = c_d 85 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 86 | 87 | print *, 'SGEMM (TF32) maxval(abs(cref-c)): ', maxval(abs(cref-c)) 88 | print *, 'SGEMM (TF32) TFlops: ', 2.*k*m*n/(time/1000.)/1.0E+12 89 | end block 90 | 91 | istat = cublasDestroy(handle) 92 | istat = cudaEventDestroy(startEvent) 93 | istat = cudaEventDestroy(stopEvent) 94 | end program main 95 | -------------------------------------------------------------------------------- /code/ch12/Makefile: -------------------------------------------------------------------------------- 1 | MCUDA = -cuda 2 | MCUDA_CURAND = $(MCUDA) -cudalib=curand 3 | 4 | OBJS = ppmExampleHost ppmExampleCUDA \ 5 | rayHost rayCUDA \ 6 | sphereHost sphereCUDA \ 7 | normalHost normalCUDA \ 8 | twoSpheresHost twoSpheresCUDA \ 9 | antialiasHost antialiasCUDA \ 10 | diffuseHost diffuseCUDA \ 11 | metalHost metalCUDA \ 12 | 
dielectricHost dielectricCUDA \ 13 | cameraHost cameraCUDA \ 14 | defocusBlurHost defocusBlurCUDA \ 15 | coverHost coverCUDA \ 16 | triangleHost triangleCUDA \ 17 | lightsHost lightsCUDA \ 18 | textureHost textureCUDA 19 | 20 | # section 12.1 21 | 22 | ppmExampleHost: ppmExample.f90 23 | nvfortran -o $@ $^ 24 | 25 | ppmExampleCUDA: ppmExample.f90 26 | nvfortran -o $@ $(MCUDA) $^ 27 | 28 | # section 12.2 29 | 30 | rgb_m.host.o: rgb_m.F90 31 | nvfortran -c -o $@ $^ 32 | 33 | rgb_m.cuda.o: rgb_m.F90 34 | nvfortran -c -o $@ $(MCUDA) $^ 35 | 36 | # section 12.3 37 | 38 | rayHost: ray.F90 rgb_m.host.o 39 | nvfortran -o $@ $^ 40 | 41 | rayCUDA: ray.F90 rgb_m.cuda.o 42 | nvfortran -o $@ $(MCUDA) $^ 43 | 44 | # section 12.4 45 | 46 | sphereHost: sphere.F90 rgb_m.host.o 47 | nvfortran -o $@ $^ 48 | 49 | sphereCUDA: sphere.F90 rgb_m.cuda.o 50 | nvfortran -o $@ $(MCUDA) $^ 51 | 52 | # section 12.5 53 | 54 | normalHost: normal.F90 rgb_m.host.o 55 | nvfortran -o $@ $^ 56 | 57 | normalCUDA: normal.F90 rgb_m.cuda.o 58 | nvfortran -o $@ $(MCUDA) $^ 59 | 60 | 61 | twoSpheresHost: twoSpheres.F90 rgb_m.host.o 62 | nvfortran -o $@ $^ 63 | 64 | twoSpheresCUDA: twoSpheres.F90 rgb_m.cuda.o 65 | nvfortran -o $@ $(MCUDA) $^ 66 | 67 | # section 12.6 68 | 69 | antialiasHost: antialias.F90 rgb_m.host.o 70 | nvfortran -o $@ $^ 71 | 72 | antialiasCUDA: antialias.F90 rgb_m.cuda.o 73 | nvfortran -o $@ $(MCUDA_CURAND) $^ 74 | 75 | # section 12.7.1 76 | 77 | diffuseHost: diffuse.F90 rgb_m.host.o 78 | nvfortran -o $@ $^ 79 | 80 | diffuseCUDA: diffuse.F90 rgb_m.cuda.o 81 | nvfortran -o $@ $(MCUDA_CURAND) $^ 82 | 83 | # section 12.7.2 84 | 85 | metalHost: metal.F90 rgb_m.host.o 86 | nvfortran -o $@ $^ 87 | 88 | metalCUDA: metal.F90 rgb_m.cuda.o 89 | nvfortran -o $@ $(MCUDA_CURAND) $^ 90 | 91 | # section 12.7.3 92 | 93 | dielectricHost: dielectric.F90 rgb_m.host.o 94 | nvfortran -o $@ $^ 95 | 96 | dielectricCUDA: dielectric.F90 rgb_m.cuda.o 97 | nvfortran -o $@ $(MCUDA_CURAND) $^ 98 | 99 | # section 
12.8 100 | 101 | cameraHost: camera.F90 rgb_m.host.o 102 | nvfortran -o $@ $^ 103 | 104 | cameraCUDA: camera.F90 rgb_m.cuda.o 105 | nvfortran -o $@ $(MCUDA_CURAND) $^ 106 | 107 | # section 12.9 108 | 109 | defocusBlurHost: defocusBlur.F90 rgb_m.host.o 110 | nvfortran -o $@ $^ 111 | 112 | defocusBlurCUDA: defocusBlur.F90 rgb_m.cuda.o 113 | nvfortran -o $@ $(MCUDA_CURAND) $^ 114 | 115 | # section 12.10 116 | 117 | coverHost: cover.F90 rgb_m.host.o 118 | nvfortran -o $@ $^ 119 | 120 | coverCUDA: cover.F90 rgb_m.cuda.o 121 | nvfortran -o $@ $(MCUDA_CURAND) $^ 122 | 123 | # section 12.11 124 | 125 | triangleHost: triangle.F90 rgb_m.host.o 126 | nvfortran -o $@ $^ 127 | 128 | triangleCUDA: triangle.F90 rgb_m.cuda.o 129 | nvfortran -o $@ $(MCUDA_CURAND) $^ 130 | 131 | # section 12.12 132 | 133 | lightsHost: lights.F90 rgb_m.host.o 134 | nvfortran -o $@ $^ 135 | 136 | lightsCUDA: lights.F90 rgb_m.cuda.o 137 | nvfortran -o $@ $(MCUDA_CURAND) $^ 138 | 139 | # section 12.13 140 | 141 | textureHost: texture.F90 rgb_m.host.o 142 | nvfortran -o $@ $^ 143 | 144 | textureCUDA: texture.F90 rgb_m.cuda.o 145 | nvfortran -o $@ $(MCUDA_CURAND) $^ 146 | 147 | 148 | 149 | clean: 150 | rm -rf a.out *.mod *.o *~ 151 | 152 | clobber: 153 | rm -rf $(OBJS) *.ppm a.out *.mod *.o *~ 154 | -------------------------------------------------------------------------------- /code/ch12/ray.F90: -------------------------------------------------------------------------------- 1 | module rayTracing 2 | #ifdef _CUDA 3 | use rgbCUDA 4 | #else 5 | use rgbHost 6 | #endif 7 | 8 | real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0] 9 | real, parameter :: horizontal(3) = [4.0, 0.0, 0.0] 10 | real, parameter :: vertical(3) = [0.0, 2.0, 0.0] 11 | real, parameter :: origin(3) = [0.0, 0.0, 0.0] 12 | 13 | type ray 14 | real :: origin(3) 15 | real :: dir(3) 16 | end type ray 17 | 18 | interface ray 19 | module procedure rayConstructor 20 | end interface ray 21 | 22 | contains 23 | 24 | !@cuf attributes(device) 
& 25 | function normalize(a) result(res) 26 | implicit none 27 | real :: a(3), res(3) 28 | res = a/sqrt(sum(a**2)) 29 | end function normalize 30 | 31 | !@cuf attributes(device) & 32 | function rayConstructor(origin, dir) result(r) 33 | implicit none 34 | !dir$ ignore_tkr (d) origin, (d) dir 35 | real :: origin(3), dir(3) 36 | type(ray) :: r 37 | r%origin = origin 38 | r%dir = normalize(dir) 39 | end function rayConstructor 40 | 41 | !@cuf attributes(device) & 42 | function color(r) result(res) 43 | implicit none 44 | type(ray) :: r 45 | type(rgb) :: res 46 | real :: t 47 | t = 0.5*(r%dir(2) + 1.0) 48 | res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0]) 49 | end function color 50 | 51 | #ifdef _CUDA 52 | attributes(global) subroutine renderKernel(fb, nx, ny) 53 | implicit none 54 | type(rgb) :: fb(nx,ny) 55 | integer, value :: nx, ny 56 | type(ray) :: r 57 | real :: dir(3) 58 | real :: u, v 59 | integer :: i, j 60 | i = threadIdx%x + (blockIdx%x-1)*blockDim%x 61 | j = threadIdx%y + (blockIdx%y-1)*blockDim%y 62 | if (i <= nx .and. 
j <= ny) then 63 | u = real(i)/nx 64 | v = real(j)/ny 65 | dir = lowerLeftCorner + u*horizontal + v*vertical - origin 66 | r = ray(origin, dir) 67 | fb(i,j) = color(r) 68 | end if 69 | end subroutine renderKernel 70 | #endif 71 | 72 | subroutine render(fb) 73 | !@cuf use cudafor 74 | implicit none 75 | type(rgb) :: fb(:,:) 76 | type(ray) :: r 77 | real :: u, v 78 | integer :: nx, ny, i, j 79 | 80 | nx = size(fb,1) 81 | ny = size(fb,2) 82 | 83 | #ifdef _CUDA 84 | block 85 | type(rgb), device, allocatable :: fb_d(:,:) 86 | type(dim3) :: tBlock, grid 87 | 88 | allocate(fb_d(nx,ny)) 89 | tBlock = dim3(32,8,1) 90 | grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1) 91 | call renderKernel<<>>(fb_d, nx, ny) 92 | fb = fb_d 93 | deallocate(fb_d) 94 | end block 95 | #else 96 | do j = 1, ny 97 | do i = 1, nx 98 | u = real(i)/nx 99 | v = real(j)/ny 100 | r = ray(origin, & 101 | lowerLeftCorner + u*horizontal + v*vertical - origin) 102 | fb(i,j) = color(r) 103 | end do 104 | end do 105 | #endif 106 | end subroutine render 107 | 108 | end module rayTracing 109 | 110 | program main 111 | use rayTracing 112 | implicit none 113 | integer, parameter :: nx = 400, ny = 200 114 | integer :: i, j 115 | type(rgb) :: fb(nx,ny) 116 | 117 | call render(fb) 118 | 119 | ! ppm output 120 | 121 | print "(a2)", 'P3' ! indicates RGB colors in ASCII, must be flush left 122 | print *, nx, ny ! width and height of image 123 | print *, 255 ! maximum value for each color 124 | do j = ny, 1, -1 125 | do i = 1, nx 126 | print "(3(1x,i3))", int(255*fb(i,j)%v) 127 | end do 128 | end do 129 | 130 | end program main 131 | -------------------------------------------------------------------------------- /code/ch07/cusparseMV.cuf: -------------------------------------------------------------------------------- 1 | program sparseMatVec 2 | use cudafor 3 | use cusparse 4 | 5 | implicit none 6 | 7 | integer, parameter :: n = 5 ! # rows/cols in matrix 8 | integer, parameter :: nnz = 5 ! 
# nonzeros in matrix 9 | 10 | type(cusparseHandle) :: h 11 | 12 | ! CSR matrix 13 | type(cusparseSpMatDescr) :: descrA 14 | real(4), device :: csrValues_d(nnz) 15 | integer(4), device :: csrRowOffsets_d(n+1), csrColInd_d(nnz) 16 | 17 | ! dense vectors 18 | type(cusparseDnVecDescr) :: descrX, descrY 19 | real, device :: x_d(n), y_d(n) 20 | real :: y(n) 21 | 22 | ! parameters 23 | real(4) :: alpha = 1.0, beta = 0.0 24 | 25 | integer :: status, i 26 | 27 | ! initalize cusparse 28 | status = cusparseCreate(h) 29 | 30 | ! CSR representation for upper circular shift matrix 31 | csrValues_d = 1.0 32 | csrColInd_d = [2, 3, 4, 5, 1] 33 | csrRowOffsets_d = [1, 2, 3, 4, 5, 6] 34 | 35 | ! vectors 36 | x_d = [11.0, 12.0, 13.0, 14.0, 15.0] 37 | y_d = 0.0 38 | 39 | y = x_d 40 | print *, 'Original vector' 41 | print "(5(1x,f7.2))", y 42 | 43 | ! initialize sparse matrix descriptor A in CSR format 44 | status = cusparseCreateCsr(descr = descrA, & 45 | rows = n, & 46 | cols = n, & 47 | nnz = nnz, & 48 | csrRowOffsets = csrRowOffsets_d, & 49 | csrColInd = csrColInd_d, & 50 | csrValues = csrValues_d, & 51 | csrRowOffsetsType = CUSPARSE_INDEX_32I, & 52 | csrColIndType = CUSPARSE_INDEX_32I, & 53 | idxBase = CUSPARSE_INDEX_BASE_ONE, & 54 | valueType = CUDA_R_32F) 55 | 56 | ! initialize the dense vector descriptors for X and Y 57 | status = cusparseCreateDnVec(descrX, n, x_d, valueType = CUDA_R_32F) 58 | status = cusparseCreateDnVec(descrY, n, y_d, valueType = CUDA_R_32F) 59 | 60 | ! 
y = alpha*A*x + beta*y 61 | 62 | block 63 | integer(8) :: bufferSize 64 | integer(1), allocatable, device :: buffer_d(:) 65 | 66 | status = cusparseSpMV_buffersize(h, & 67 | CUSPARSE_OPERATION_NON_TRANSPOSE, & 68 | alpha, descrA, descrX, beta, descrY, & 69 | CUDA_R_32F, & 70 | CUSPARSE_SPMV_ALG_DEFAULT, & 71 | buffersize) 72 | 73 | allocate(buffer_d(buffersize)) 74 | 75 | status = cusparseSpMV(h, & 76 | CUSPARSE_OPERATION_NON_TRANSPOSE, & 77 | alpha, descrA, descrX, beta, descrY, & 78 | CUDA_R_32F, & 79 | CUSPARSE_SPMV_ALG_DEFAULT, & 80 | buffer_d) 81 | 82 | deallocate(buffer_d) 83 | end block 84 | 85 | y = y_d 86 | print *, 'Shifted vector' 87 | print "(5(1x,f7.2))", y 88 | 89 | ! shift down and subtract original 90 | ! x = alpha*(A')*y - x 91 | 92 | beta = -1.0 93 | 94 | block 95 | integer(8) :: bufferSize 96 | integer(1), allocatable, device :: buffer_d(:) 97 | 98 | status = cusparseSpMV_buffersize(h, & 99 | CUSPARSE_OPERATION_TRANSPOSE, & 100 | alpha, descrA, descrY, beta, descrX, & 101 | CUDA_R_32F, & 102 | CUSPARSE_SPMV_ALG_DEFAULT, & 103 | buffersize) 104 | 105 | allocate(buffer_d(buffersize)) 106 | 107 | status = cusparseSpMV(h, & 108 | CUSPARSE_OPERATION_TRANSPOSE, & 109 | alpha, descrA, descrY, beta, descrX, & 110 | CUDA_R_32F, & 111 | CUSPARSE_SPMV_ALG_DEFAULT, & 112 | buffer_d) 113 | 114 | deallocate(buffer_d) 115 | end block 116 | 117 | y = x_d 118 | print *, 'Max error: ', maxval(abs(y)) 119 | 120 | ! cleanup 121 | 122 | status = cusparseDestroySpMat(descrA) 123 | status = cusparseDestroyDnVec(descrX) 124 | status = cusparseDestroyDnVec(descrY) 125 | status = cusparseDestroy(h) 126 | 127 | end program sparseMatVec 128 | -------------------------------------------------------------------------------- /code/ch05/HDtransfer.cuf: -------------------------------------------------------------------------------- 1 | program HDtransfer 2 | 3 | use cudafor 4 | implicit none 5 | 6 | integer, parameter :: nElements = 128*1024*1024 7 | 8 | ! 
host arrays 9 | real(4) :: a_pageable(nElements), b_pageable(nElements) 10 | real(4), allocatable, pinned :: a_pinned(:), b_pinned(:) 11 | 12 | ! device arrays 13 | real(4), device :: a_d(nElements) 14 | 15 | ! events for timing 16 | type (cudaEvent) :: startEvent, stopEvent 17 | 18 | ! misc 19 | type (cudaDeviceProp) :: prop 20 | real(4) :: time 21 | integer :: istat, n, i 22 | logical :: pinnedFlag 23 | 24 | ! allocate and initialize 25 | do i = 1, nElements 26 | a_pageable(i) = i 27 | end do 28 | b_pageable = 0.0 29 | 30 | allocate(a_pinned(nElements), b_pinned(nElements), & 31 | STAT=istat, PINNED=pinnedFlag) 32 | if (istat /= 0) then 33 | print *, 'Allocation of a_pinned/b_pinned failed' 34 | pinnedFlag = .false. 35 | else 36 | if (.not. pinnedFlag) print *, 'Pinned allocation failed' 37 | end if 38 | 39 | if (pinnedFlag) then 40 | a_pinned = a_pageable 41 | b_pinned = 0.0 42 | endif 43 | 44 | istat = cudaEventCreate(startEvent) 45 | istat = cudaEventCreate(stopEvent) 46 | 47 | ! output device info and transfer size 48 | istat = cudaGetDeviceProperties(prop, 0) 49 | 50 | print "(/, 'Device: ', a)", trim(prop%name) 51 | 52 | ! 
pageable data transfers 53 | print "(/, 'Pageable transfers')" 54 | print *, 'size (KB) H2D (GB/s) D2H (GB/s)' 55 | 56 | n = 1024 57 | do 58 | if (n > nElements) exit 59 | write(*,'(i8)', advance='no') n*4/1024 60 | 61 | istat = cudaEventRecord(startEvent, 0) 62 | a_d(1:n) = a_pageable(1:n) 63 | istat = cudaEventRecord(stopEvent, 0) 64 | istat = cudaEventSynchronize(stopEvent) 65 | 66 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 67 | write(*,"(f15.2)", advance='no') n*4/time/1.e+6 68 | 69 | istat = cudaEventRecord(startEvent, 0) 70 | b_pageable(1:n) = a_d(1:n) 71 | istat = cudaEventRecord(stopEvent, 0) 72 | istat = cudaEventSynchronize(stopEvent) 73 | 74 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 75 | print "(3x,f15.2)", n*4/time/1.e+6 76 | 77 | if (any(a_pageable(1:n) /= b_pageable(1:n))) & 78 | print *, '*** Pageable transfers failed ***' 79 | 80 | n = n*2 81 | enddo 82 | 83 | ! pinned data transfers 84 | 85 | if (pinnedFlag) then 86 | print "(/, 'Pinned transfers')" 87 | print *, 'size (KB) H2D (GB/s) D2H (GB/s)' 88 | 89 | n = 1024 90 | do 91 | if (n > nElements) exit 92 | write(*,"(i8)", advance='no') n*4/1024 93 | 94 | 95 | istat = cudaEventRecord(startEvent, 0) 96 | a_d(1:n) = a_pinned(1:n) 97 | istat = cudaEventRecord(stopEvent, 0) 98 | istat = cudaEventSynchronize(stopEvent) 99 | 100 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 101 | write(*,"(f15.2)", advance='no') n*4/time/1.e+6 102 | 103 | istat = cudaEventRecord(startEvent, 0) 104 | b_pinned(1:n) = a_d(1:n) 105 | istat = cudaEventRecord(stopEvent, 0) 106 | istat = cudaEventSynchronize(stopEvent) 107 | 108 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 109 | print "(3x,f15.2)", n*4/time/1.e+6 110 | 111 | if (any(a_pinned(1:n) /= b_pinned(1:n))) & 112 | print *, '*** Pinned transfers failed ***' 113 | 114 | n = n*2 115 | end do 116 | 117 | end if 118 | 119 | ! 
cleanup 120 | if (allocated(a_pinned)) deallocate(a_pinned) 121 | if (allocated(b_pinned)) deallocate(b_pinned) 122 | istat = cudaEventDestroy(startEvent) 123 | istat = cudaEventDestroy(stopEvent) 124 | 125 | end program HDtransfer 126 | -------------------------------------------------------------------------------- /code/ch07/cusparseEx.cuf: -------------------------------------------------------------------------------- 1 | module cusparseEx 2 | use cusparse 3 | 4 | interface cusparseCreateCsr 5 | module procedure :: & 6 | cusparseCreateCsr_abr 7 | end interface cusparseCreateCsr 8 | 9 | interface cusparseCreateDnVec 10 | module procedure :: & 11 | cusparseCreateDnVec_abr 12 | end interface cusparseCreateDnVec 13 | 14 | interface cusparseSpMV 15 | module procedure :: & 16 | cusparseSpMV_abr 17 | end interface cusparseSpMV 18 | 19 | contains 20 | 21 | function cusparseGetIndexType(x) result(res) 22 | use cusparse 23 | implicit none 24 | !dir$ ignore_tkr (rd) x 25 | class(*) :: x 26 | integer :: res 27 | select type(x) 28 | type is (integer(4)) 29 | res = CUSPARSE_INDEX_32I 30 | type is (integer(8)) 31 | res = CUSPARSE_INDEX_64I 32 | class default 33 | res = -1 34 | end select 35 | end function cusparseGetIndexType 36 | 37 | ! 
cusparseCreateScr abridged interface 38 | 39 | function cusparseCreateCsr_abr(descr, cols, & 40 | csrRowOffsets_d, csrColInd_d, csrValues_d, idxBase) & 41 | result(status) 42 | use cudaforEx 43 | use cusparse 44 | implicit none 45 | type(cusparseSpMatDescr) :: descr 46 | integer(4) :: cols 47 | class(*), device :: csrRowOffsets_d(:), csrColInd_d(:) 48 | class(*), device :: csrValues_d(:) 49 | integer, optional :: idxBase 50 | integer :: status 51 | 52 | integer :: rows, nnz 53 | integer :: idxBaseLocal = CUSPARSE_INDEX_BASE_ONE 54 | 55 | if (present(idxBase)) idxBaseLocal = idxBase 56 | 57 | rows = size(csrRowOffsets_d)-1 58 | nnz = size(csrValues_d) 59 | 60 | status = cuSparseCreateCsr(descr, & 61 | rows, cols, nnz, & 62 | csrRowOffsets_d, csrColInd_d, csrValues_d, & 63 | cusparseGetIndexType(csrRowOffsets_d), & 64 | cusparseGetIndexType(csrColInd_d), & 65 | idxBaseLocal, & 66 | cudaforGetDataType(csrValues_d)) 67 | end function cusparseCreateCsr_abr 68 | 69 | ! cusparseCreateDnVec abridged call 70 | 71 | function cusparseCreateDnVec_abr(descr, x_d) result(status) 72 | use cudaforEx 73 | use cusparse 74 | implicit none 75 | type(cusparseDnVecDescr) :: descr 76 | class(*), device :: x_d(:) 77 | integer :: status 78 | 79 | status = cusparseCreateDnVec(descr, & 80 | size(x_d), x_d, cudaforGetDataType(x_d)) 81 | end function cusparseCreateDnVec_abr 82 | 83 | ! cusparseSpMV abridged call 84 | 85 | function cusparseSpMV_abr(h, opA, alpha, A, x, beta, y, alg, buffer) & 86 | result(status) 87 | use cudaforEx 88 | use cusparse 89 | implicit none 90 | type(cusparseHandle) :: h 91 | type(cusparseSpMatDescr) :: A 92 | integer :: opA 93 | type(cusparseDnVecDescr) :: x, y 94 | class(*) :: alpha, beta 95 | integer, optional :: alg 96 | integer(1), device, optional :: buffer(*) 97 | integer :: status 98 | 99 | ! 
valueType determined by alpha/beta 100 | integer :: algLocal = CUSPARSE_SPMV_ALG_DEFAULT 101 | 102 | if (present(alg)) algLocal = alg 103 | 104 | if (present(buffer)) then 105 | status = cusparseSpMV(h, & 106 | opA, alpha, A, X, beta, Y, & 107 | cudaforGetDataType(alpha), & 108 | algLocal, buffer) 109 | else 110 | block 111 | integer(8) :: bufferSize 112 | integer(1), allocatable, device :: buffer_d(:) 113 | 114 | status = cusparseSpMV_buffersize(h, & 115 | opA, alpha, A, x, beta, y, & 116 | cudaforGetDataType(alpha), & 117 | algLocal, & 118 | buffersize) 119 | if (status /= CUSPARSE_STATUS_SUCCESS) return 120 | 121 | allocate(buffer_d(buffersize)) 122 | 123 | status = cusparseSpMV(h, & 124 | opA, alpha, A, X, beta, Y, & 125 | cudaforGetDataType(alpha), & 126 | algLocal, & 127 | buffer_d) 128 | 129 | deallocate(buffer_d) 130 | end block 131 | endif 132 | end function cusparseSpMV_abr 133 | 134 | end module cusparseEx 135 | -------------------------------------------------------------------------------- /code/ch07/cutensorContraction.cuf: -------------------------------------------------------------------------------- 1 | program main 2 | use cudafor 3 | use cutensor_v2 4 | implicit none 5 | integer, parameter :: m=3200, n=3200, k=3200 6 | real(8) :: a(m,k), b(k,n), c(m,n), cref(m,n) 7 | real(8), device :: a_d(m,k), b_d(k,n), c_d(m,n) 8 | real(8) :: alpha = 1.0, beta = 0.0 9 | 10 | type(cudaDeviceProp) :: prop 11 | 12 | type(cutensorHandle) :: handle 13 | type(cutensorTensorDescriptor) :: Adesc, Bdesc, Cdesc 14 | type(cutensorOperationDescriptor) :: opDesc 15 | type(cutensorPlan) :: plan 16 | type(cutensorPlanPreference) :: pref 17 | integer(4) :: aMode(2), bMode(2), cMode(2) 18 | 19 | integer :: istat 20 | type(cutensorStatus) :: ctStat 21 | 22 | istat = cudaGetDeviceProperties(prop, 0) 23 | if (istat /= cudaSuccess) & 24 | print *, cudaGetErrorString(istat) 25 | print "(' Device : ', a)", trim(prop%name) 26 | print "(' m = ', i0, ', n = ', i0, ', k = ', i0)", m , 
n , k 27 | 28 | call random_number(a); call random_number(b) 29 | a_d = a; b_d = b; c_d = 0.0 30 | 31 | ! host reference 32 | cref = matmul(a,b) 33 | 34 | ! Initialize cutensor library 35 | ctStat = cutensorCreate(handle) 36 | if (ctStat /= CUTENSOR_STATUS_SUCCESS) & 37 | print *, cutensorGetErrorString(ctStat) 38 | 39 | ! Create tensor descriptors 40 | block 41 | integer(8) :: extent(2), stride(2) 42 | integer(4) :: nModes 43 | integer(4) :: ialign 44 | 45 | ! A 46 | 47 | nModes=2 48 | extent = shape(a) 49 | stride = [1, m] 50 | Amode = [ichar('m'), ichar('k')] 51 | ialign = 128 52 | 53 | ctStat = cutensorCreateTensorDescriptor(handle, Adesc, & 54 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 55 | 56 | ! B 57 | 58 | nModes=2 59 | extent = shape(b) 60 | stride = [1, k] 61 | Bmode = [ichar('k'), ichar('n')] 62 | ialign = 128 63 | 64 | ctStat = cutensorCreateTensorDescriptor(handle, Bdesc, & 65 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 66 | 67 | ! C 68 | 69 | nModes=2 70 | extent = shape(c) 71 | stride = [1, m] 72 | Cmode = [ichar('m'), ichar('n')] 73 | ialign = 128 74 | 75 | ctStat = cutensorCreateTensorDescriptor(handle, Cdesc, & 76 | nModes, extent, stride, CUTENSOR_R_64F, ialign) 77 | end block 78 | 79 | ctStat = cutensorCreateContraction(handle, opDesc, & 80 | Adesc, Amode, CUTENSOR_OP_IDENTITY, & 81 | Bdesc, Bmode, CUTENSOR_OP_IDENTITY, & 82 | Cdesc, Cmode, CUTENSOR_OP_IDENTITY, & 83 | Cdesc, Cmode, CUTENSOR_COMPUTE_DESC_64F) 84 | 85 | ctStat = cutensorCreatePlanPreference(& 86 | handle, pref, CUTENSOR_ALGO_DEFAULT, CUTENSOR_JIT_MODE_NONE) 87 | 88 | ! 
create work buffer 89 | block 90 | integer(8) :: worksize 91 | integer(1), device, allocatable :: workspace_d(:) 92 | 93 | ctStat = cutensorEstimateWorkspaceSize(handle, opDesc, pref, & 94 | CUTENSOR_WORKSPACE_DEFAULT, worksize) 95 | print *,"Estimated workspace size (B): ", worksize 96 | 97 | ctStat = cutensorCreatePlan(handle, plan, opDesc, pref, worksize) 98 | allocate(workspace_d(worksize)) 99 | 100 | ! now do contraction 101 | block 102 | type(cudaEvent) :: startEvent, stopEvent 103 | real(4) :: time 104 | 105 | istat = cudaEventCreate(startEvent) 106 | istat = cudaEventCreate(stopEvent) 107 | 108 | istat = cudaDeviceSynchronize() 109 | istat = cudaEventRecord(startEvent, 0) 110 | 111 | ctStat = cutensorContract(handle, plan, alpha, a_d, b_d, & 112 | beta, c_d, c_d, workspace_d, worksize, 0) 113 | 114 | istat = cudaEventRecord(stopEvent, 0) 115 | istat = cudaEventSynchronize(stopEvent) 116 | istat = cudaEventElapsedTime(time, startEvent, stopEvent) 117 | istat = cudaEventDestroy(startEvent) 118 | istat = cudaEventDestroy(stopEvent) 119 | 120 | c = c_d 121 | print *, 'maxval(abs(c-cref))', maxval(abs(c-cref)) 122 | print *, 'TFLOPS: ', 2.*k*m*n/(time/1000.)/1.0E+12 123 | end block 124 | deallocate(workspace_d) 125 | end block 126 | 127 | ! 
ctStat = cutensorDestroy(handle)
ctStat = cutensorDestroyPlan(plan)
ctStat = cutensorDestroyOperationDescriptor(opDesc)
ctStat = cutensorDestroyTensorDescriptor(Adesc)
ctStat = cutensorDestroyTensorDescriptor(Bdesc)
ctStat = cutensorDestroyTensorDescriptor(Cdesc)

end program main
--------------------------------------------------------------------------------
/code/ch12/sphere.F90:
--------------------------------------------------------------------------------
! Ray-tracing example: shades a single sphere flat red against a gradient
! background.  Compiles for host or GPU from the same source: with _CUDA
! defined, the !@cuf sentinels enable device attributes and the kernel path.
module rayTracing
#ifdef _CUDA
  use rgbCUDA
#else
  use rgbHost
#endif

  ! viewport: a 4x2 window at z = -1 viewed from the origin
  real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0]
  real, parameter :: horizontal(3) = [4.0, 0.0, 0.0]
  real, parameter :: vertical(3) = [0.0, 2.0, 0.0]
  real, parameter :: origin(3) = [0.0, 0.0, 0.0]

  type ray
     real :: origin(3)
     real :: dir(3)   ! unit direction (normalized by the constructor)
  end type ray

  interface ray
     module procedure rayConstructor
  end interface ray

  type sphere
     real :: center(3), radius
  end type sphere

contains

  ! Return a/|a| (Euclidean normalization of a 3-vector).
  !@cuf attributes(device) &
  function normalize(a) result(res)
    implicit none
    real :: a(3), res(3)
    res = a/sqrt(sum(a**2))
  end function normalize

  ! Structure constructor for ray; stores the origin and the
  ! normalized direction.
  !@cuf attributes(device) &
  function rayConstructor(origin, dir) result(r)
    implicit none
    !dir$ ignore_tkr (d) origin, (d) dir
    real :: origin(3), dir(3)
    type(ray) :: r
    r%origin = origin
    r%dir = normalize(dir)
  end function rayConstructor

  ! True if ray r intersects sphere s (positive discriminant of the
  ! quadratic |o + t*d - c|^2 = R^2).
  !@cuf attributes(device) &
  function hitSphere(s, r) result(res)
    implicit none
    type(sphere) :: s
    type(ray) :: r
    real :: oc(3), a, b, c, disc
    logical :: res
    oc = r%origin - s%center
    a = dot_product(r%dir, r%dir)
    b = 2.0*dot_product(r%dir, oc)
    c = dot_product(oc, oc) - s%radius**2
    disc = b**2 - 4.0*a*c
    res = (disc > 0.0)
  end function hitSphere

  ! Color for ray r: solid red if it hits the sphere, otherwise a
  ! white-to-blue gradient based on the ray's y direction.
  !@cuf attributes(device) &
  function color(r, s) result(res)
    implicit none
    type(ray) :: r
    type(sphere) :: s
    type(rgb) :: res
    real :: t
    if (hitSphere(s, r)) then
       res = rgb([1.0, 0.0, 0.0])
    else
       t = 0.5*(r%dir(2) + 1.0)
       res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0])
    endif
  end function color

#ifdef _CUDA
  ! One thread per pixel; launched with a 2D grid covering (nx, ny).
  attributes(global) subroutine renderKernel(fb, nx, ny, s)
    implicit none
    type(rgb) :: fb(nx,ny)
    integer, value :: nx, ny
    type(sphere) :: s
    type(ray) :: r
    real :: dir(3)
    real :: u, v
    integer :: i, j

    i = threadIdx%x + (blockIdx%x-1)*blockDim%x
    j = threadIdx%y + (blockIdx%y-1)*blockDim%y
    if (i <= nx .and. j <= ny) then   ! guard the grid tail
       u = real(i)/nx
       v = real(j)/ny
       dir = lowerLeftCorner + u*horizontal + v*vertical - origin
       r = ray(origin, dir)
       fb(i,j) = color(r, s)
    end if
  end subroutine renderKernel
#endif

  ! Fill framebuffer fb with the rendered image of sphere s.
  ! GPU path allocates a device framebuffer, launches the kernel, and
  ! copies back; host path loops over pixels.
  subroutine render(fb, s)
    !@cuf use cudafor
    implicit none
    type(rgb) :: fb(:,:)
    type(sphere) :: s
    type(ray) :: r
    real :: u, v
    integer :: nx, ny, i, j

    nx = size(fb,1)
    ny = size(fb,2)

#ifdef _CUDA
    block
      type(rgb), device, allocatable :: fb_d(:,:)
      type(sphere), device :: s_d
      type(dim3) :: tBlock, grid

      allocate(fb_d(nx,ny))
      s_d = s
      tBlock = dim3(32,8,1)
      grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1)
      ! fix: the launch-configuration chevrons were lost in extraction
      ! (appeared as "<<>>"); restored from grid/tBlock computed above
      call renderKernel<<<grid, tBlock>>>(fb_d, nx, ny, s_d)
      fb = fb_d
      deallocate(fb_d)
    end block
#else
    do j = 1, ny
       do i = 1, nx
          u = real(i)/nx
          v = real(j)/ny
          r = ray(origin, &
               lowerLeftCorner + u*horizontal + v*vertical - origin)
          fb(i,j) = color(r, s)
       end do
    end do
#endif
  end subroutine render

end module rayTracing

program main
  use rayTracing
  implicit none
  integer, parameter :: nx = 400, ny = 200
  integer :: i, j
  type(rgb) :: fb(nx,ny)
  type(sphere) :: s

  ! fix: array constructor elements must all be default real;
  ! original mixed integer -1 with reals (normal.F90 uses -1.0)
  s = sphere([0.0, 0.0, -1.0], 0.5)

  call render(fb, s)

  ! ppm output

  print "(a2)", 'P3'  ! indicates RGB colors in ASCII, must be flush left
  print *, nx, ny     ! width and height of image
  print *, 255        ! maximum value for each color
  do j = ny, 1, -1
     do i = 1, nx
        print "(3(1x,i3))", int(255*fb(i,j)%v)
     end do
  end do

end program main
--------------------------------------------------------------------------------
/code/ch12/normal.F90:
--------------------------------------------------------------------------------
! Ray-tracing example: shades the sphere by its surface normal (RGB from the
! normal components) against a gradient background.  Compiles for host or GPU
! from the same source via the !@cuf sentinels and _CUDA preprocessing.
module rayTracing
#ifdef _CUDA
  use rgbCUDA
#else
  use rgbHost
#endif

  ! viewport: a 4x2 window at z = -1 viewed from the origin
  real, parameter :: lowerLeftCorner(3) = [-2.0, -1.0, -1.0]
  real, parameter :: horizontal(3) = [4.0, 0.0, 0.0]
  real, parameter :: vertical(3) = [0.0, 2.0, 0.0]
  real, parameter :: origin(3) = [0.0, 0.0, 0.0]

  type ray
     real :: origin(3)
     real :: dir(3)   ! unit direction (normalized by the constructor)
  end type ray

  interface ray
     module procedure rayConstructor
  end interface ray

  type sphere
     real :: center(3), radius
  end type sphere

contains

  ! Return a/|a| (Euclidean normalization of a 3-vector).
  !@cuf attributes(device) &
  function normalize(a) result(res)
    implicit none
    real :: a(3), res(3)

    res = a/sqrt(sum(a**2))
  end function normalize

  ! Structure constructor for ray; stores the origin and the
  ! normalized direction.
  !@cuf attributes(device) &
  function rayConstructor(origin, dir) result(r)
    implicit none
    !dir$ ignore_tkr (d) origin, (d) dir
    real :: origin(3), dir(3)
    type(ray) :: r
    r%origin = origin
    r%dir = normalize(dir)
  end function rayConstructor

  ! Ray-parameter t of the nearest intersection of r with s
  ! (smaller root of the quadratic), or -1.0 if there is no hit.
  !@cuf attributes(device) &
  function hitSphere(s, r) result(res)
    implicit none
    type(sphere) :: s
    type(ray) :: r
    real :: oc(3), a, b, c, disc
    real :: res

    oc = r%origin - s%center
    a = dot_product(r%dir, r%dir)
    b = 2.0*dot_product(r%dir, oc)
    c = dot_product(oc, oc) - s%radius**2
    disc = b**2 - 4.0*a*c
    if (disc > 0.0) then
       res = (-b -sqrt(disc))/(2.0*a)
    else
       res = -1.0
    endif
  end function hitSphere

  ! Color for ray r: on a hit, map the unit surface normal n from
  ! [-1,1] to RGB [0,1]; otherwise a white-to-blue gradient.
  !@cuf attributes(device) &
  function color(r, s) result(res)
    implicit none
    type(ray) :: r
    type(sphere) :: s
    type(rgb) :: res
    real :: t, n(3)

    t = hitSphere(s, r)
    if (t > 0) then
       n = r%dir*t + r%origin - s%center
       n = normalize(n)
       res = rgb(0.5*(n+1.0))
    else
       t = 0.5*(r%dir(2) + 1.0)
       res = rgb((1.0-t)*[1.0, 1.0, 1.0] + t*[0.5, 0.7, 1.0])
    endif
  end function color

#ifdef _CUDA
  ! One thread per pixel; launched with a 2D grid covering (nx, ny).
  attributes(global) subroutine renderKernel(fb, nx, ny, s)
    implicit none
    type(rgb) :: fb(nx,ny)
    integer, value :: nx, ny
    type(sphere) :: s
    type(ray) :: r
    real :: dir(3)
    real :: u, v
    integer :: i, j

    i = threadIdx%x + (blockIdx%x-1)*blockDim%x
    j = threadIdx%y + (blockIdx%y-1)*blockDim%y
    if (i <= nx .and. j <= ny) then   ! guard the grid tail
       u = real(i)/nx
       v = real(j)/ny
       dir = lowerLeftCorner + u*horizontal + v*vertical - origin
       r = ray(origin, dir)
       fb(i,j) = color(r, s)
    end if
  end subroutine renderKernel
#endif

  ! Fill framebuffer fb with the rendered image of sphere s.
  ! GPU path allocates a device framebuffer, launches the kernel, and
  ! copies back; host path loops over pixels.
  subroutine render(fb, s)
    !@cuf use cudafor
    implicit none
    type(rgb) :: fb(:,:)
    type(sphere) :: s
    type(ray) :: r
    real :: u, v
    integer :: nx, ny, i, j

    nx = size(fb,1)
    ny = size(fb,2)

#ifdef _CUDA
    block
      type(rgb), device, allocatable :: fb_d(:,:)
      type(sphere), device :: s_d
      type(dim3) :: tBlock, grid

      allocate(fb_d(nx,ny))
      s_d = s
      tBlock = dim3(32,8,1)
      grid = dim3((nx-1)/tBlock%x+1, (ny-1)/tBlock%y+1, 1)
      ! fix: the launch-configuration chevrons were lost in extraction
      ! (appeared as "<<>>"); restored from grid/tBlock computed above
      call renderKernel<<<grid, tBlock>>>(fb_d, nx, ny, s_d)
      fb = fb_d
      deallocate(fb_d)
    end block
#else
    do j = 1, ny
       do i = 1, nx
          u = real(i)/nx
          v = real(j)/ny
          r = ray(origin, &
               lowerLeftCorner + u*horizontal + v*vertical - origin)
          fb(i,j) = color(r, s)
       end do
    end do
#endif
  end subroutine render

end module rayTracing

program main
  use rayTracing
  implicit none
  integer, parameter :: nx = 400, ny = 200
  integer :: i, j
  type(rgb) :: fb(nx,ny)
  type(sphere) :: s

  s = sphere([0.0, 0.0, -1.0], 0.5)
  call render(fb, s)

  ! ppm output

  print "(a2)", 'P3'  ! indicates RGB colors in ASCII, must be flush left
  print *, nx, ny     ! width and height of image
  print *, 255        ! maximum value for each color
  do j = ny, 1, -1
     do i = 1, nx
        print "(3(1x,i3))", int(255*fb(i,j)%v)
     end do
  end do

end program main
--------------------------------------------------------------------------------
/code/ch12/rgb_m.F90:
--------------------------------------------------------------------------------
! RGB color type with overloaded assignment and elementwise arithmetic
! against real(3) arrays, scalars, and other rgb values.  Compiled as
! rgbCUDA (device procedures, via the !@cuf sentinels) or rgbHost.
module &
#ifdef _CUDA
     rgbCUDA
#else
     rgbHost
#endif

  type rgb
     real :: v(3)   ! red, green, blue components
  end type rgb

  interface assignment (=)
     module procedure rgbEqR3, r3EqRgb
  end interface assignment (=)

  interface operator(*)
     module procedure rgbTimesR3, r3TimesRgb, rgbTimesRgb, &
          rgbTimesR, rTimesRgb
  end interface operator(*)

  interface operator(/)
     module procedure rgbDivR3, r3DivRgb, rgbDivRgb, rgbDivI
  end interface operator(/)

  interface operator(+)
     module procedure rgbPlusR3, r3PlusRgb, rgbPlusRgb
  end interface operator(+)

  interface operator(-)
     module procedure rgbMinusR3, r3MinusRgb, rgbMinusRgb
  end interface operator(-)

contains

  ! rgb = real(3)
  !@cuf attributes(device) &
  subroutine rgbEqR3(rgbout, rin)
    type(rgb), intent(out) :: rgbout
    real(4), intent(in) :: rin(3)
    rgbout%v = rin
  end subroutine rgbEqR3

  ! real(3) = rgb
  !@cuf attributes(device) &
  subroutine r3EqRgb(rout, rgbin)
    real(4), intent(out) :: rout(3)
    type(rgb), intent(in) :: rgbin
    rout = rgbin%v
  end subroutine r3EqRgb

  ! rgb * real(3)
  !@cuf attributes(device) &
  function rgbTimesR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rgbTimesR3

  ! real(3) * rgb (commutative, so operand order is immaterial)
  !@cuf attributes(device) &
  function r3TimesRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function r3TimesRgb

  ! rgb * rgb (elementwise)
  !@cuf attributes(device) &
  function rgbTimesRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v * rgb2%v
  end function rgbTimesRgb

  ! rgb * scalar
  !@cuf attributes(device) &
  function rgbTimesR(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rgbTimesR

  ! scalar * rgb
  !@cuf attributes(device) &
  function rTimesRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v * rin
  end function rTimesRgb

  ! rgb / real(3)
  !@cuf attributes(device) &
  function rgbDivR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v / rin
  end function rgbDivR3

  ! rgb / integer scalar
  !@cuf attributes(device) &
  function rgbDivI(rgbin, iin) result(res)
    type(rgb), intent(in) :: rgbin
    integer(4), intent(in) :: iin
    real(4) :: res(3)
    res = rgbin%v / iin
  end function rgbDivI

  ! real(3) / rgb
  !@cuf attributes(device) &
  function r3DivRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    ! fix: result is rin / rgbin%v; the original had the operands
    ! reversed, making "real3 / rgb" compute "rgb / real3"
    res = rin / rgbin%v
  end function r3DivRgb

  ! rgb / rgb (elementwise)
  !@cuf attributes(device) &
  function rgbDivRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v / rgb2%v
  end function rgbDivRgb

  ! rgb + real(3)
  !@cuf attributes(device) &
  function rgbPlusR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v + rin
  end function rgbPlusR3

  ! real(3) + rgb (commutative, so operand order is immaterial)
  !@cuf attributes(device) &
  function r3PlusRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    res = rgbin%v + rin
  end function r3PlusRgb

  ! rgb + rgb (elementwise)
  !@cuf attributes(device) &
  function rgbPlusRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v + rgb2%v
  end function rgbPlusRgb

  ! rgb - real(3)
  !@cuf attributes(device) &
  function rgbMinusR3(rgbin, rin) result(res)
    type(rgb), intent(in) :: rgbin
    real(4), intent(in) :: rin(3)
    real(4) :: res(3)
    res = rgbin%v - rin
  end function rgbMinusR3

  ! real(3) - rgb
  !@cuf attributes(device) &
  function r3MinusRgb(rin, rgbin) result(res)
    real(4), intent(in) :: rin(3)
    type(rgb), intent(in) :: rgbin
    real(4) :: res(3)
    ! fix: result is rin - rgbin%v; the original had the operands
    ! reversed, making "real3 - rgb" compute "rgb - real3"
    res = rin - rgbin%v
  end function r3MinusRgb

  ! rgb - rgb (elementwise)
  !@cuf attributes(device) &
  function rgbMinusRgb(rgb1, rgb2) result(res)
    type(rgb), intent(in) :: rgb1, rgb2
    real(4) :: res(3)
    res = rgb1%v - rgb2%v
  end function rgbMinusRgb

end module
--------------------------------------------------------------------------------
/code/ch09/montecarlo_european_option.cuf:
--------------------------------------------------------------------------------
! Monte Carlo pricing of a European option on the GPU (CUF kernels),
! checked against the analytic Black-Scholes formula on the host.
module blackscholes_m
  use precision_m
contains

  real(fp_kind) function CND( d )
    ! Cumulative Normal Distribution function
    ! using Hasting's formula
    implicit none
    real(fp_kind), parameter :: A1 = 0.31938153_fp_kind
    real(fp_kind), parameter :: A2 = -0.356563782_fp_kind
    real(fp_kind), parameter :: A3 = 1.781477937_fp_kind
    real(fp_kind), parameter :: A4 = -1.821255978_fp_kind
    real(fp_kind), parameter :: A5 = 1.330274429_fp_kind
    ! abs/exp are intrinsics; no need to declare them as locals
    real(fp_kind) :: d, K, RSQRT2PI

    K = 1.0_fp_kind/(1.0_fp_kind + 0.2316419_fp_kind * abs(d))
    RSQRT2PI = 1._fp_kind/sqrt(8._fp_kind*atan(1._fp_kind))
    CND = RSQRT2PI * exp( -0.5_fp_kind * d * d) * &
         (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    ! Hasting's approximation is for the lower tail; reflect for d > 0
    if( d .gt. 0._fp_kind ) CND = 1.0_fp_kind - CND
    return
  end function CND

  subroutine blackscholes(callResult, putResult, &
       S, E, R, sigma, T)
    ! Black-Scholes formula for call and put
    ! S = asset price at time t
    ! E = exercise (strike) price
    ! sigma = volatility
    ! R = interest rate
    ! T = time to expiration
    implicit none
    real(fp_kind) :: callResult, putResult
    real(fp_kind) :: S, E, R, sigma, T
    real(fp_kind) :: sqrtT, d1, d2, expRT

    if ( T > 0 ) then
       sqrtT = sqrt(T)
       d1 = (log(S/E)+(R+0.5_fp_kind*sigma*sigma)*T) &
            /(sigma*sqrtT)
       d2 = d1 -sigma*sqrtT
       expRT = exp( -R * T)
       callResult = ( S * CND(d1) - E * expRT * CND(d2))
       ! put from put-call parity
       putResult = callResult + E * expRT - S
    else
       ! at expiry the option value is its intrinsic value
       callResult = max(S-E,0._fp_kind)
       putResult = max(E-S,0._fp_kind)
    end if
  end subroutine blackscholes
end module blackscholes_m

program mc
  use blackscholes_m
  use curand
  use cudafor
  implicit none
  real(fp_kind), allocatable, device :: deviceData(:), &
       putValue(:),callValue(:)
  real(fp_kind) :: S, E, R, sigma, T,Sfinal, &
       call_price, put_price
  real(fp_kind) :: meanPut,meanCall, &
       stddevPut, stddevCall, confidence
  type(curandGenerator) :: gen
  integer(kind=8) :: seed
  integer :: i, nargs, istat, N
  type(cudaEvent) :: startEvent,stopEvent
  real :: time
  character*12 arg

  istat=cudaEventCreate(startEvent)
  istat=cudaEventCreate(stopEvent)

  ! Number of samples
  nargs=command_argument_count()
  if ( nargs == 0 ) then
     N = 1000000
  else
     call get_command_argument(1,arg)
     ! fix: '(i)' is a nonstandard edit descriptor (no field width);
     ! list-directed read is portable
     read(arg,*) N
  endif

  S = 5._fp_kind; E = 4._fp_kind
  sigma = 0.3_fp_kind; R = 0.05_fp_kind
  T = 1._fp_kind

  istat=cudaEventRecord(startEvent,0)  !start timing

  !Allocate arrays on GPU
  allocate (deviceData(N),putValue(N),callValue(N))

  if (fp_kind == singlePrecision) then
     print *, " European option with random numbers"
     print *, " in single precision using ",N," samples"
  else
     print *, " European option with random numbers"
     print *, " in double precision using ",N," samples"
  end if

  ! Create pseudonumber generator
  istat = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)

  ! Set seed
  seed=1234
  istat= curandSetPseudoRandomGeneratorSeed( gen, seed)

  ! Generate N floats/doubles on device w/ normal distribution.
  ! fix: the lognormal asset-price update below requires standard normal
  ! deviates, and the mean/stddev arguments belong to the normal
  ! generator; the original mistakenly called curandGenerate (uniform)
  ! with those extra arguments.
  istat= curandGenerateNormal(gen, deviceData, N, &
       0._fp_kind, 1._fp_kind)

  meanPut=0._fp_kind; meanCall=0._fp_kind
  ! discounted payoffs per sample, accumulating the means on the fly
  !$cuf kernel do <<<*,*>>>
  do i=1,N
     Sfinal= S*exp((R-0.5_fp_kind*sigma*sigma)*T &
          +sigma*sqrt(T)*deviceData(i))
     putValue(i) =exp (-R *T) * max (E-Sfinal,0._fp_kind)
     callValue(i)=exp (-R *T) * max (Sfinal-E,0._fp_kind)
     meanPut=meanPut+putValue(i)
     meanCall=meanCall+callValue(i)
  end do
  meanPut=meanPut/N
  meanCall=meanCall/N

  stddevPut=0._fp_kind; stddevCall=0._fp_kind
  !$cuf kernel do <<<*,*>>>
  do i=1,N
     stddevPut= stddevPut + (putValue(i)-meanPut) **2
     stddevCall= stddevCall + (callValue(i)-meanCall) **2
  end do
  stddevPut=sqrt(stddevPut/(N-1) )
  stddevCall=sqrt(stddevCall/(N-1) )

  ! compute a reference solution using Black Scholes formula
  call blackscholes(call_price,put_price,S,E,R,sigma,T)

  print *, "Montecarlo value of put option  =", meanPut
  print *, "BlackScholes value of put option =", put_price
  print *, "Confidence interval of put option = [", &
       meanPut -1.96*stddevPut/sqrt(real(N)),",",&
       meanPut +1.96*stddevPut/sqrt(real(N)),"]"
  print *, "Montecarlo value of call option =", meanCall
  print *, "BlackScholes value of call option=", call_price
  print *, "Confidence interval of call option = [", &
       meanCall -1.96*stddevCall/sqrt(real(N)),",",&
       meanCall +1.96*stddevCall/sqrt(real(N)),"]"

  istat=cudaEventRecord(stopEvent,0)
  istat=cudaEventSynchronize(stopEvent)
  istat=cudaEventElapsedTime(time,startEvent,stopEvent)

  print *,"Elapsed time (ms) :",time

  deallocate (deviceData,putValue,callValue)

  ! Destroy the generator
  istat= curandDestroyGenerator(gen)

end program mc
--------------------------------------------------------------------------------
/code/ch08/directTransfer.cuf:
--------------------------------------------------------------------------------
! Measures device-to-device transfer bandwidth between GPUs 0 and 1,
! with and without peer-to-peer (P2P) access enabled, timing from both
! the sending and the receiving device.
program directTransfer
  use cudafor
  implicit none
  integer, parameter :: N = 4*1024*1024
  real, pinned, allocatable :: a(:), b(:)
  real, device, allocatable :: a_d(:), b_d(:)

  ! these hold free and total memory before and after
  ! allocation, used to verify allocation is happening
  ! on proper devices
  integer(cuda_count_kind), allocatable :: &
       freeBefore(:), totalBefore(:), &
       freeAfter(:), totalAfter(:)

  integer :: istat, nDevices, i, accessPeer, timingDev
  type (cudaDeviceProp) :: prop
  type (cudaEvent) :: startEvent, stopEvent
  real :: time

  istat = cudaGetDeviceCount(nDevices)
  if (nDevices < 2) then
     print *, 'Need at least two CUDA capable devices'
     stop
  endif
  print "('Number of CUDA-capable devices: ', i0,/)", &
       nDevices

  ! allocate host arrays
  allocate(a(N), b(N))
  allocate(freeBefore(0:nDevices-1), &
       totalBefore(0:nDevices-1))
  allocate(freeAfter(0:nDevices-1), &
       totalAfter(0:nDevices-1))

  ! get device info (including total and free memory)
  ! before allocating a_d and b_d on devices 0 and 1
  do i = 0, nDevices-1
     istat = cudaGetDeviceProperties(prop, i)
     istat = cudaSetDevice(i)
     istat = cudaMemGetInfo(freeBefore(i), totalBefore(i))
  enddo
  ! device allocations land on the current device, hence the
  ! cudaSetDevice calls: a_d on device 0, b_d on device 1
  istat = cudaSetDevice(0)
  allocate(a_d(N))
  istat = cudaSetDevice(1)
  allocate(b_d(N))

  ! print out free memory before and after allocation
  print "('Allocation summary')"
  do i = 0, nDevices-1
     istat = cudaGetDeviceProperties(prop, i)
     print "('  Device ', i0, ': ', a)", &
          i, trim(prop%name)
     istat = cudaSetDevice(i)
     istat = cudaMemGetInfo(freeAfter(i), totalAfter(i))
     print "('    Free memory before: ', i0, &
          ', after: ', i0, ', difference: ',i0,/)", &
          freeBefore(i), freeAfter(i), &
          freeBefore(i)-freeAfter(i)
  enddo

  ! check whether devices 0 and 1 can use P2P
  ! (nDevices > 1 is guaranteed by the early stop above; accessPeer
  ! is therefore always set before it is read in the timing loop)
  if (nDevices > 1) then
     istat = cudaDeviceCanAccessPeer(accessPeer, 0, 1)
     if (accessPeer == 1) then
        print *, 'Peer access available between 0 and 1'
     else
        print *, 'Peer access not available between 0 and 1'
     endif
  endif

  ! initialize
  a = 1.0
  istat = cudaSetDevice(0)
  a_d = a

  ! perform test twice, timing on both sending GPU
  ! and receiving GPU
  do timingDev = 0, 1
     write(*,"(/,'Timing on device ', i0, /)") timingDev

     ! create events on the timing device
     ! (events time work on the device that was current at creation)
     istat = cudaSetDevice(timingDev)
     istat = cudaEventCreate(startEvent)
     istat = cudaEventCreate(stopEvent)

     if (accessPeer == 1) then
        ! enable P2P communication
        istat = cudaSetDevice(0)
        istat = cudaDeviceEnablePeerAccess(1, 0)
        istat = cudaSetDevice(1)
        istat = cudaDeviceEnablePeerAccess(0, 0)

        ! transfer (implicitly) across devices
        b_d = -1.0
        istat = cudaSetDevice(timingDev)
        istat = cudaEventRecord(startEvent,0)
        b_d = a_d
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, &
             startEvent, stopEvent)
        b = b_d
        if (any(b /= a)) then
           print "('Transfer failed')"
        else
           print "('b_d=a_d transfer (GB/s): ', f)", &
                N*4/time/1.0E+6
        endif
     end if

     ! transfer via cudaMemcpyPeer()
     ! without P2P, assignment to b_d must run on its device (1)
     if (accessPeer == 0) istat = cudaSetDevice(1)
     b_d = -1.0

     istat = cudaSetDevice(timingDev)
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, &
          stopEvent)
     if (accessPeer == 0) istat = cudaSetDevice(1)
     b = b_d
     if (any(b /= a)) then
        print "('Transfer failed')"
     else
        print "('cudaMemcpyPeer transfer (GB/s): ', f)", &
             N*4/time/1.0E+6
     endif

     ! cudaMemcpyPeer with P2P disabled
     if (accessPeer == 1) then
        istat = cudaSetDevice(0)
        istat = cudaDeviceDisablePeerAccess(1)
        istat = cudaSetDevice(1)
        istat = cudaDeviceDisablePeerAccess(0)
        b_d = -1.0

        istat = cudaSetDevice(timingDev)
        istat = cudaEventRecord(startEvent,0)
        istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, startEvent, &
             stopEvent)

        istat = cudaSetDevice(1)
        b = b_d
        if (any(b /= a)) then
           print "('Transfer failed')"
        else
           print "('cudaMemcpyPeer transfer w/ P2P', &
                ' disabled (GB/s): ', f)", N*4/time/1.0E+6
        endif
     end if

     ! destroy events associated with timingDev
     istat = cudaEventDestroy(startEvent)
     istat = cudaEventDestroy(stopEvent)
  end do

  ! clean up
  deallocate(freeBefore, totalBefore, freeAfter, totalAfter)
  deallocate(a, b, a_d, b_d)
end program directTransfer
--------------------------------------------------------------------------------