├── gpu_cpu.png ├── src ├── ComputePI │ ├── Readme.md │ ├── compute_pi.cuf │ └── curand_m.cuf ├── FFT │ ├── Readme.md │ ├── fft_test_c2c.cuf │ ├── fft_derivative.cuf │ └── cufft_m.cuf ├── cufReduction.cuf ├── cufILP.cuf ├── cufKernel.cuf ├── cufKernel2D.cuf ├── deviceQuery.cuf └── saxpy.cuf └── Readme.md /gpu_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Koushikphy/Intro-to-CUDA-Fortran/HEAD/gpu_cpu.png -------------------------------------------------------------------------------- /src/ComputePI/Readme.md: -------------------------------------------------------------------------------- 1 | #### Computing Pi with Monte Carlo method 2 | 3 | Compile it with 4 | ```bash 5 | nvfortran curand_m.cuf compute_pi.cuf -cudalib=curand 6 | ``` 7 | The CUDA random number generator (`curand`) needs to be linked for this to work, using the flag `-cudalib` -------------------------------------------------------------------------------- /src/FFT/Readme.md: -------------------------------------------------------------------------------- 1 | #### Fast Fourier Transform with CUDA Fortran 2 | 3 | Compile it with 4 | ```bash 5 | nvfortran cufft_m.cuf fft_test_c2c.cuf -cudalib=cufft 6 | nvfortran cufft_m.cuf fft_derivative.cuf -cudalib=cufft 7 | ``` 8 | The CUDA random number generator (`curand`) needs to be linked for this to work, using the flag `-cudalib` 9 | -------------------------------------------------------------------------------- /src/cufReduction.cuf: -------------------------------------------------------------------------------- 1 | program reduce 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: i, aSum = 0 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a_d = 1 9 | 10 | ! CUF kernels do the reduction operation automatically 11 | 12 | !$cuf kernel do <<<*,tPB>>> 13 | do i = 1, n 14 | aSum = aSum + a_d(i) 15 | enddo 16 | 17 | if (aSum /= n) then 18 | write(*,*) '**** Program Failed ****' 19 | else 20 | write(*,*) 'Program Passed' 21 | endif 22 | end program reduce 23 | 24 | -------------------------------------------------------------------------------- /src/cufILP.cuf: -------------------------------------------------------------------------------- 1 | program ilp 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | 14 | ! Here the 1024 blocks of 256 threads cannot processes the 1024 2 elements if each thread processes 15 | ! a single element, so the compiler generates a loop that results in each thread processing four array 16 | ! elements. 17 | !----------------------------------------------------------------------------------------------------- 18 | 19 | !$cuf kernel do <<<1024,tPB>>> 20 | do i = 1, n 21 | a_d(i) = a_d(i) + b 22 | enddo 23 | 24 | a = a_d 25 | 26 | if (any(a /= 4)) then 27 | write(*,*) '**** Program Failed ****' 28 | else 29 | write(*,*) 'Program Passed' 30 | endif 31 | end program ilp 32 | 33 | -------------------------------------------------------------------------------- /src/cufKernel.cuf: -------------------------------------------------------------------------------- 1 | program incrementTest 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | 14 | ! 
A thread block of tPB threads is used when launching the kernel. The * specified for the first execution configuration 15 | ! parameter leaves the compiler free to calculate the number of thread blocks to launch in order to carry out the 16 | ! operation in the loop. The execution configuration could have been specified as <<<*,*>>>, in which case the 17 | ! compiler would choose the thread block size as well as the number of thread blocks to launch. 18 | !------------------------------------------------------------------------------------------------------------------------ 19 | 20 | !$cuf kernel do <<<*,tPB>>> 21 | do i = 1, n 22 | a_d(i) = a_d(i) + b 23 | enddo 24 | 25 | a = a_d 26 | 27 | if (any(a /= 4)) then 28 | write(*,*) '**** Program Failed ****' 29 | else 30 | write(*,*) 'Program Passed' 31 | endif 32 | end program incrementTest 33 | 34 | -------------------------------------------------------------------------------- /src/cufKernel2D.cuf: -------------------------------------------------------------------------------- 1 | 2 | program incrementTest 3 | implicit none 4 | integer, parameter :: n = 4*1024 5 | integer :: a(n,n), i, j, b 6 | integer, device :: a_d(n,n) 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | ! In this case the do (2) specified on the directive indicates that the generated kernel will map to the two 14 | ! following loops. Multidimensional thread blocks and grids specified by the execution configuration in 15 | ! the directive map to the nested loops in an innermost to outermost fashion. For example, for the thread 16 | ! block of 32 × 8, the predefined kernel variable threadIdx%x will run from 1 to 32 and map to the 17 | ! i index, and threadIdx%y will run from 1 to 8 and map to the j index. Rather than specifying the 18 | ! thread block size, we could have also used <<<(*,*),(*,*)>>> or even <<<*,*>>> 19 | ! and have the compiler choose the thread block and grid size. 20 | !----------------------------------------------------------------------------------------------------- 21 | 22 | !$cuf kernel do (2) <<< (*,*), (32,8) >>> 23 | do j = 1, n 24 | do i = 1, n 25 | a_d(i,j) = a_d(i,j) + b 26 | enddo 27 | enddo 28 | 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | write(*,*) '**** Program Failed ****' 33 | else 34 | write(*,*) 'Program Passed' 35 | endif 36 | end program incrementTest 37 | 38 | -------------------------------------------------------------------------------- /src/FFT/fft_test_c2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_c2c 2 | use iso_c_binding 3 | ! use precision_m 4 | use cufft_m 5 | implicit none 6 | integer, allocatable:: kx(:) 7 | complex(fp_kind), allocatable:: cinput(:),coutput(:) 8 | complex(fp_kind), allocatable, device:: cinput_d(:),coutput_d(:) 9 | 10 | integer:: i,j,k,n 11 | type(c_ptr):: plan 12 | real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h 13 | 14 | character*1:: a 15 | real(fp_kind):: w,x,y,z 16 | integer:: nerrors 17 | 18 | n=16 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate (cinput(n),coutput(n),kx(n)) 23 | 24 | ! allocate arrays on the device 25 | allocate (cinput_d(n),coutput_d(n)) 26 | 27 | !initialize arrays on host 28 | kx =(/ (i-1, i=1,n/2), (-n+i-1, i=n/2+1,n) /) 29 | 30 | do i=1,n 31 | cinput(i)=(cos(2*real(i-1,fp_kind)*h)+sin(3*real(i-1,fp_kind)*h)) 32 | end do 33 | 34 | !copy arrays to device 35 | cinput_d=cinput 36 | 37 | ! 
Initialize the plan for complex to complex transform 38 | if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1) 39 | if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1) 40 | 41 | ! Forward transform out of place 42 | call cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 43 | 44 | ! Copy results back to host 45 | coutput=coutput_d 46 | 47 | print *," Transform from complex array" 48 | do i=1,n 49 | write(*,'(i2,1x,2(f8.4),2x,i2,2(f8.4))') i,cinput(i),kx(i),coutput(i)/n 50 | end do 51 | 52 | 53 | !release memory on the host and on the device 54 | deallocate (cinput,coutput,kx,cinput_d,coutput_d) 55 | 56 | ! Destroy the plans 57 | call cufftDestroy(plan) 58 | 59 | end program fft_test_c2c 60 | 61 | -------------------------------------------------------------------------------- /src/ComputePI/compute_pi.cuf: -------------------------------------------------------------------------------- 1 | 2 | ! Compute pi using a Monte Carlo method 3 | 4 | program compute_pi 5 | use curand_m 6 | implicit none 7 | real(fp_kind), allocatable:: hostData(:) 8 | real(fp_kind), allocatable, device:: deviceData(:) 9 | real(fp_kind) :: pival 10 | integer :: inside_gpu, inside_cpu, N, i 11 | integer(kind=int_ptr_kind()) :: gen, twoN 12 | integer(kind=8) :: seed 13 | 14 | ! Define how many numbers we want to generate 15 | N=10000000 16 | twoN=N*2 17 | 18 | ! Allocate array on CPU 19 | allocate(hostData(twoN)) 20 | 21 | ! Allocate array on GPU 22 | allocate(deviceData(twoN)) 23 | 24 | 25 | ! Create pseudonumber generator 26 | call curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 27 | 28 | ! Set seed 29 | seed=1234 30 | call curandSetPseudoRandomGeneratorSeed( gen, seed) 31 | 32 | ! Generate N floats or double on device 33 | call curandGenerateUniform(gen, deviceData, twoN) 34 | 35 | ! Copy the data back to CPU to check result later 36 | hostData=deviceData 37 | 38 | ! Perform the test on GPU using CUF kernel 39 | inside_gpu=0 40 | !$cuf kernel do <<<*,*>>> 41 | do i=1,N 42 | if( (deviceData(i)**2+deviceData(i+N)**2) <= 1._fp_kind ) inside_gpu=inside_gpu+1 43 | end do 44 | 45 | ! Perform the test on CPU 46 | inside_cpu=0 47 | do i=1,N 48 | if( (hostData(i)**2+hostData(i+N)**2) <= 1._fp_kind ) inside_cpu=inside_cpu+1 49 | end do 50 | 51 | ! Check the results 52 | if (inside_cpu .ne. inside_gpu) then 53 | write(*,*) "Mismatch between CPU/GPU" 54 | write(*,*) "Test Failed" 55 | else 56 | write(*,*) "Test Passed" 57 | endif 58 | 59 | ! Print the value of pi and the error 60 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 61 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, " Pi=", pival, " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 62 | 63 | ! Deallocate data on CPU and GPU 64 | deallocate(hostData) 65 | deallocate(deviceData) 66 | 67 | ! 
Destroy the generator 68 | call curandDestroyGenerator(gen) 69 | end program compute_pi 70 | -------------------------------------------------------------------------------- /src/FFT/fft_derivative.cuf: -------------------------------------------------------------------------------- 1 | program fft_derivative 2 | use iso_c_binding 3 | use cufft_m 4 | implicit none 5 | real(fp_kind), allocatable:: kx(:), derivative(:) 6 | real(fp_kind), allocatable, device:: kx_d(:) 7 | 8 | complex(fp_kind), allocatable:: cinput(:),coutput(:) 9 | complex(fp_kind), allocatable, device:: cinput_d(:),coutput_d(:) 10 | 11 | integer:: i,j,n 12 | type(c_ptr):: plan 13 | real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h 14 | 15 | character*1:: a 16 | real(fp_kind):: x,y,z 17 | integer:: nerrors 18 | 19 | n=8 20 | h=twopi/real(n,fp_kind) 21 | 22 | ! allocate arrays on the host 23 | allocate (cinput(n),coutput(n),derivative(n),kx(n)) 24 | 25 | ! allocate arrays on the device 26 | allocate (cinput_d(n),coutput_d(n),kx_d(n)) 27 | 28 | ! initialize arrays on host 29 | kx =(/ ((i-1), i=1,n/2), ((-n+i-1), i=n/2+1,n) /) 30 | 31 | ! Set the wave number for the Nyquist frequency to zero 32 | kx(n/2+1)=0._fp_kind 33 | 34 | ! Copy the wave number vector to the device 35 | kx_d=kx 36 | 37 | do i=1,n 38 | cinput(i)=(cos(2*real(i-1,fp_kind)*h) +sin(3*real(i-1,fp_kind)*h)) 39 | derivative(i)=(-2*sin(2*real(i-1,fp_kind)*h) +3*cos(3*real(i-1,fp_kind)*h)) 40 | end do 41 | 42 | ! copy input to device 43 | cinput_d=cinput 44 | 45 | ! Initialize the plan for complex to complex transform 46 | if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1) 47 | if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1) 48 | 49 | ! Forward transform out of place 50 | call cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 51 | 52 | ! Compute the derivative in spectral space and normalize the FFT 53 | !$cuf kernel do <<<*,*>>> 54 | do i=1,n 55 | coutput_d(i)=cmplx(0.,kx_d(i),fp_kind)*coutput_d(i)/n 56 | end do 57 | 58 | ! Inverse transform in place 59 | call cufftExec(plan,coutput_d,coutput_d,CUFFT_INVERSE) 60 | 61 | ! Copy results back to host 62 | coutput=coutput_d 63 | 64 | print *," First Derivative from complex array" 65 | do i=1,n 66 | write(*,'(i2,2(1x,f8.4),2x,e13.7)') i,real(coutput(i)),derivative(i),real(coutput(i))-derivative(i) 67 | end do 68 | 69 | !release memory on the host and on the device 70 | deallocate (cinput,coutput,kx,derivative,cinput_d,coutput_d,kx_d) 71 | 72 | ! Destroy the plans 73 | call cufftDestroy(plan) 74 | 75 | end program fft_derivative 76 | 77 | -------------------------------------------------------------------------------- /src/deviceQuery.cuf: -------------------------------------------------------------------------------- 1 | program deviceQuery 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | write(*,"(/,'No CUDA devices found',/)") 14 | stop 15 | else if (nDevices == 1) then 16 | write(*,"(/,'One CUDA device found',/)") 17 | else 18 | write(*,"(/,i0,' CUDA devices found',/)") nDevices 19 | end if 20 | 21 | ! Loop over devices 22 | 23 | do i = 0, nDevices-1 24 | 25 | write(*,"('Device Number: ',i0)") i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | if (ierr .eq. 
0) then 29 | write(*,"(' GetDeviceProperties for device ',i0,': Passed')") i 30 | else 31 | write(*,"(' GetDeviceProperties for device ',i0,': Failed')") i 32 | endif 33 | 34 | ! General device info 35 | 36 | write(*,"(' Device Name: ',a)") trim(prop%name) 37 | write(*,"(' Compute Capability: ',i0,'.',i0)") & 38 | prop%major, prop%minor 39 | write(*,"(' Number of Multiprocessors: ',i0)") & 40 | prop%multiProcessorCount 41 | write(*,"(' Max Threads per Multiprocessor: ',i0)") & 42 | prop%maxThreadsPerMultiprocessor 43 | write(*,"(' Global Memory (GB): ',f9.3,/)") & 44 | prop%totalGlobalMem/1024.0**3 45 | 46 | ! Execution Configuration 47 | 48 | write(*,"(' Execution Configuration Limits')") 49 | write(*,"(' Max Grid Dims: ',2(i0,' x '),i0)") & 50 | prop%maxGridSize 51 | write(*,"(' Max Block Dims: ',2(i0,' x '),i0)") & 52 | prop%maxThreadsDim 53 | write(*,"(' Max Threads per Block: ',i0,/)") & 54 | prop%maxThreadsPerBlock 55 | 56 | enddo 57 | 58 | end program deviceQuery 59 | 60 | 61 | !!! Sample output 62 | ! --------------------------------------------------------------- 63 | ! One CUDA device found 64 | 65 | ! Device Number: 0 66 | ! GetDeviceProperties for device 0: Passed 67 | ! Device Name: NVIDIA GeForce GT 1030 68 | ! Compute Capability: 6.1 69 | ! Number of Multiprocessors: 3 70 | ! Max Threads per Multiprocessor: 2048 71 | ! Global Memory (GB): 1.951 72 | 73 | ! Execution Configuration Limits 74 | ! Max Grid Dims: 2147483647 x 65535 x 65535 75 | ! Max Block Dims: 1024 x 1024 x 64 76 | ! Max Threads per Block: 1024 77 | ! --------------------------------------------------------------- -------------------------------------------------------------------------------- /src/ComputePI/curand_m.cuf: -------------------------------------------------------------------------------- 1 | 2 | module curand_m 3 | integer, public :: CURAND_RNG_PSEUDO_DEFAULT = 100 4 | integer, public :: CURAND_RNG_PSEUDO_XORWOW = 101 5 | integer, public :: CURAND_RNG_QUASI_DEFAULT = 200 6 | integer, public :: CURAND_RNG_QUASI_SOBOL32 = 201 7 | 8 | 9 | integer, parameter :: singlePrecision = kind(0.0) 10 | integer, parameter :: doublePrecision = kind(0.0d0) 11 | 12 | integer, parameter :: fp_kind = doublePrecision 13 | ! 
integer, parameter :: fp_kind = singlePrecision 14 | 15 | 16 | interface curandCreateGenerator 17 | subroutine curandCreateGenerator( generator,rng_type) bind(C,name='curandCreateGenerator') 18 | use iso_c_binding 19 | integer(c_size_t):: generator 20 | integer(c_int),value:: rng_type 21 | end subroutine curandCreateGenerator 22 | end interface curandCreateGenerator 23 | 24 | interface curandSetPseudoRandomGeneratorSeed 25 | subroutine curandSetPseudoRandomGeneratorSeed( generator,seed) bind(C,name='curandSetPseudoRandomGeneratorSeed') 26 | use iso_c_binding 27 | integer(c_size_t), value:: generator 28 | integer(c_long_long),value:: seed 29 | end subroutine curandSetPseudoRandomGeneratorSeed 30 | end interface curandSetPseudoRandomGeneratorSeed 31 | 32 | interface curandGenerateUniform 33 | subroutine curandGenerateUniform( generator, odata, numele) bind(C,name='curandGenerateUniform') 34 | use iso_c_binding 35 | integer(c_size_t),value:: generator 36 | !pgi$ ignore_tr odata 37 | real(c_float), device:: odata(*) 38 | integer(c_size_t),value:: numele 39 | end subroutine curandGenerateUniform 40 | 41 | subroutine curandGenerateUniformDouble(generator, odata, numele) bind(C,name='curandGenerateUniformDouble') 42 | use iso_c_binding 43 | integer(c_size_t),value:: generator 44 | !pgi$ ignore_tr odata 45 | real(c_double), device:: odata(*) 46 | integer(c_size_t),value:: numele 47 | end subroutine curandGenerateUniformDouble 48 | end interface curandGenerateUniform 49 | 50 | interface curandGenerateNormal 51 | subroutine curandGenerateNormal( generator, odata, numele, mean,stddev) bind(C,name='curandGenerateNormal') 52 | use iso_c_binding 53 | integer(c_size_t),value:: generator 54 | !pgi$ ignore_tr odata 55 | real(c_float), device:: odata(*) 56 | integer(c_size_t),value:: numele 57 | real(c_float), value:: mean,stddev 58 | end subroutine curandGenerateNormal 59 | 60 | subroutine curandGenerateNormalDouble( generator, odata, numele,mean, stddev) bind(C,name='curandGenerateNormalDouble') 61 | use iso_c_binding 62 | integer(c_size_t),value:: generator 63 | !pgi$ ignore_tr odata 64 | real(c_double), device:: odata(*) 65 | integer(c_size_t),value:: numele 66 | real(c_double), value:: mean,stddev 67 | end subroutine curandGenerateNormalDouble 68 | end interface curandGenerateNormal 69 | 70 | interface curandDestroyGenerator 71 | subroutine curandDestroyGenerator(generator) bind(C,name='curandDestroyGenerator') 72 | use iso_c_binding 73 | integer(c_size_t),value:: generator 74 | end subroutine curandDestroyGenerator 75 | end interface curandDestroyGenerator 76 | 77 | end module curand_m 78 | -------------------------------------------------------------------------------- /src/saxpy.cuf: -------------------------------------------------------------------------------- 1 | module mathOps 2 | contains 3 | ! The kernel i.e a function that runs on the device 4 | ! `attributes` describes the scope of the routine. `global` means its visible both from the host and device 5 | ! This indicates the subroutine is run on the device but called from the host 6 | attributes(global) subroutine saxpy(x, y, a) 7 | implicit none 8 | real :: x(:), y(:) 9 | real, value :: a 10 | integer :: i, n 11 | n = size(x) 12 | 13 | ! Remember the host launches "**grid** of block with each block having **tBlock** threads" 14 | ! and each thread works on a single element of the array 15 | ! These `blockDim`, `blockIdx` and `threadIdx` are provided defined by CUDA are similar to `dim3` type 16 | ! 
As only the `x` component was used to launch the kernel, only the `x` component is used here
 17 | 
 18 |   ! Think of this as there are groups (=`grid` in host) of threads and those groups are numbered with `blockIdx`
 19 |   ! Each group has `blockDim` (=`tBlock` in host) number of threads and each thread inside a particular block
 20 |   ! is numbered with `threadIdx`.
 21 |   ! Using the following formula we can calculate the index of the array element to be computed
 22 | 
 23 |   i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
 24 |   !^ Note: This is an example of fine-grained parallelism
 25 | 
 26 |   ! As we have launched more threads than there are array elements, a conditional check is required
 27 |   if (i <= n) y(i) = y(i) + a*x(i)
 28 | 
 29 |   end subroutine saxpy
 30 | end module mathOps
 31 | 
 32 | program testSaxpy
 33 |   use mathOps
 34 |   ! Fortran module that contains all the CUDA Fortran definitions
 35 |   use cudafor
 36 |   implicit none
 37 |   integer, parameter :: N = 40000
 38 |   ! host arrays
 39 |   real :: x(N), y(N), a
 40 |   ! device arrays, they are declared with the `device` attribute
 41 |   real, device :: x_d(N), y_d(N)
 42 | 
 43 |   ! Thread configuration to launch the kernel
 44 |   ! Threads and blocks can be arranged in a multidimensional manner.
 45 |   ! Here they are declared as `dim3`, so they have three components `x`, `y` and `z`.
 46 |   type(dim3) :: grid, tBlock
 47 | 
 48 | 
 49 |   ! In this example `tBlock` has 256 threads in the `x` component and
 50 |   ! `grid` is defined in such a way as to accommodate all `N = 40000` computations in blocks each having 256 threads.
 51 |   tBlock = dim3(256,1,1)
 52 |   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
 53 | 
 54 |   x = 1.0; y = 2.0; a = 2.0
 55 | 
 56 |   ! Copies the arrays from host to device and vice versa.
 57 |   ! The `cudafor` module overloads the assignment operator with `cudaMemcpy` calls
 58 |   ! so that the memory transfer can be done with a simple assignment operation.
 59 |   ! Note: This step moves data between two physical devices and can be time consuming.
 60 |   ! P.S. CUDA memory copies can also be done asynchronously
 61 |   x_d = x
 62 |   y_d = y
 63 | 
 64 |   ! Launches the kernel on the device.
 65 |   ! The information between the triple chevrons is the execution configuration;
 66 |   ! it says to launch **grid** blocks with each block having **tBlock** threads.
 67 |   ! This call is asynchronous, so the host can just call this and
 68 |   ! proceed to the next line without waiting for the computation on the device to complete
 69 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
 70 | 
 71 | 
 72 |   ! Copies data back to the host.
 73 |   ! This process is synchronous and waits for the device to complete the calculation
 74 |   y = y_d
 75 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 76 | end program testSaxpy
--------------------------------------------------------------------------------
/src/FFT/cufft_m.cuf:
--------------------------------------------------------------------------------
 1 | 
 2 | module cufft_m
 3 | 
 4 |   integer, public :: CUFFT_FORWARD = -1
 5 |   integer, public :: CUFFT_INVERSE = 1
 6 |   integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
 7 |   integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
 8 |   integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
 9 |   integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
 10 |   integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
 11 |   integer, public :: CUFFT_Z2Z = Z'69' !
Double-Complex to Double-Complex 12 | 13 | integer, parameter, public :: singlePrecision = kind(0.0) 14 | integer, parameter, public :: doublePrecision = kind(0.0d0) 15 | 16 | integer, parameter, public :: fp_kind = doublePrecision 17 | ! integer, parameter, public :: fp_kind = singlePrecision 18 | 19 | 20 | interface cufftDestroy 21 | subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') 22 | use iso_c_binding 23 | type(c_ptr),value:: plan 24 | end subroutine cufftDestroy 25 | end interface cufftDestroy 26 | 27 | interface cufftSetStream 28 | subroutine cufftSetStream(plan, stream) bind(C,name='cufftSetStream') 29 | use iso_c_binding 30 | use cudafor 31 | type(c_ptr),value:: plan 32 | integer(kind=cuda_stream_kind),value:: stream 33 | end subroutine cufftSetStream 34 | end interface cufftSetStream 35 | 36 | interface cufftExec 37 | 38 | subroutine cufftExecC2C(plan, idata, odata, direction) bind(C,name='cufftExecC2C') 39 | use iso_c_binding 40 | import singlePrecision, doublePrecision 41 | type(c_ptr),value:: plan 42 | integer(c_int),value:: direction 43 | !pgi$ ignore_tr idata,odata 44 | complex(singlePrecision),device:: idata(*),odata(*) 45 | end subroutine cufftExecC2C 46 | 47 | subroutine cufftExecZ2Z(plan, idata, odata, direction) bind(C,name='cufftExecZ2Z') 48 | use iso_c_binding 49 | import singlePrecision, doublePrecision 50 | type(c_ptr),value:: plan 51 | integer(c_int),value:: direction 52 | !pgi$ ignore_tr idata,odata 53 | complex(doublePrecision),device:: idata(*),odata(*) 54 | end subroutine cufftExecZ2Z 55 | 56 | subroutine cufftExecR2C(plan, idata, odata) bind(C,name='cufftExecR2C') 57 | use iso_c_binding 58 | import singlePrecision, doublePrecision 59 | type(c_ptr),value:: plan 60 | integer(c_int),value:: direction 61 | !pgi$ ignore_tr idata,odata 62 | real(singlePrecision),device:: idata(*) 63 | complex(singlePrecision),device:: odata(*) 64 | end subroutine cufftExecR2C 65 | 66 | subroutine cufftExecD2Z(plan, idata, odata) bind(C,name='cufftExecD2Z') 67 | use iso_c_binding 68 | import singlePrecision, doublePrecision 69 | type(c_ptr),value:: plan 70 | integer(c_int),value:: direction 71 | !pgi$ ignore_tr idata,odata 72 | real(doublePrecision),device:: idata(*) 73 | complex(doublePrecision),device:: odata(*) 74 | end subroutine cufftExecD2Z 75 | 76 | subroutine cufftExecR2Cinplace(plan, idata, odata) bind(C,name='cufftExecR2C') 77 | use iso_c_binding 78 | import singlePrecision, doublePrecision 79 | type(c_ptr),value:: plan 80 | integer(c_int),value:: direction 81 | !pgi$ ignore_tr idata,odata 82 | real(singlePrecision),device:: idata(*) 83 | real(singlePrecision),device:: odata(*) 84 | end subroutine cufftExecR2Cinplace 85 | 86 | subroutine cufftExecD2Zinplace(plan, idata, odata) bind(C,name='cufftExecD2Z') 87 | use iso_c_binding 88 | import singlePrecision, doublePrecision 89 | type(c_ptr),value:: plan 90 | !pgi$ ignore_tr idata,odata 91 | real(doublePrecision),device:: idata(*) 92 | real(doublePrecision),device:: odata(*) 93 | end subroutine cufftExecD2Zinplace 94 | 95 | end interface cufftExec 96 | 97 | interface cufftPlan1d 98 | subroutine cufftPlan1d(plan, nx, type, batch) bind(C,name='cufftPlan1d') 99 | use iso_c_binding 100 | type(c_ptr):: plan 101 | integer(c_int),value:: nx, batch,type 102 | end subroutine cufftPlan1d 103 | end interface cufftPlan1d 104 | 105 | 106 | interface cufftPlanMany 107 | subroutine cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist,type, batch) bind(C,name='cufftPlanMany') 108 | use iso_c_binding 109 | 
implicit none
 110 |   !pgi$ ignore_tkr n, inembed, onembed
 111 |   type(c_ptr) :: plan
 112 |   integer(c_int) :: n, inembed, onembed
 113 |   integer(c_int), value:: rank, istride, ostride, idist, odist, type, batch
 114 |   end subroutine cufftPlanMany
 115 | end interface cufftPlanMany
 116 | 
 117 | interface cufftPlan2d
 118 |   module procedure cufftPlan2Dswap
 119 | end interface cufftPlan2d
 120 | 
 121 | interface cufftPlan2dC
 122 |   subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
 123 |   use iso_c_binding
 124 |   type(c_ptr):: plan
 125 |   integer(c_int),value:: nx, ny, type
 126 |   end subroutine cufftPlan2d
 127 | end interface cufftPlan2dC
 128 | 
 129 | contains
 130 | 
 131 |   subroutine cufftPlan2Dswap(plan,nx,ny, type)
 132 |   use iso_c_binding
 133 |   type(c_ptr):: plan
 134 |   integer(c_int),value:: nx, ny, type
 135 |   call cufftPlan2dC(plan,ny,nx,type)
 136 |   end subroutine cufftPlan2Dswap
 137 | 
 138 | end module cufft_m
 139 | 
 140 | 
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # CUDA Fortran: Fortran programming on GPU
 2 | 
 3 | 
 4 | Disclaimer: It is not possible to learn CUDA Fortran completely from this one-page tutorial/cheat sheet. It is only meant as a quick reference to get started with GPGPU programming in CUDA Fortran.
 5 | 
 6 | ---
 7 | 
 8 | 
 9 | ## GPU & CUDA Programming Model
 10 |  
 11 | ![GPU vs CPU](./gpu_cpu.png)
 12 | - __The Host & Device:__ The CPU and its memory are called the host, while the GPU and its memory are called the device. They are usually connected by a PCI bus, which has a much lower data bandwidth than each processing unit has to its own memory, so moving data between them is time consuming. Frequent exchange of data between the two memories is therefore highly discouraged.
 13 | 
 14 | - __Kernels:__ Functions that are executed on the GPU.
 15 | - __Thread Hierarchy__
 16 |     - __Thread:__ At the lowest level of the CUDA thread hierarchy are the individual threads. Each thread executes the kernel on a single piece of data and is mapped to a single CUDA core.
 17 |     - __Blocks:__ A group of threads.
 18 |     - __Grid:__ The collection of blocks that gets mapped onto the entire GPU.
 19 |     - Blocks and grids can be 1D, 2D or 3D, and the program has to be written in such a way as to control these multidimensional blocks/grids.
 20 | 
 21 | - __Flow of Program:__ The main code execution starts on the CPU, a.k.a. the host. Separate memory is allocated on the host and the device to hold the data for their respective computations. When needed, the data is copied from the host to the device and back. The host can launch a group of kernels on the device. When the kernels are launched, the host does not wait for the kernel execution to finish and can proceed with its own flow. The memory copy between the host and device can be synchronous or asynchronous; usually it is done in a synchronous manner. The assignment operator (`=`) in CUDA Fortran is overloaded with a synchronous memory copy, i.e. the copy operation will wait for the kernels to finish their execution.
 22 | 
 23 | ---
 24 | 
 25 | ### CUDA Fortran Installation:
 26 | 1. Install the appropriate Nvidia drivers for your system.
 27 | 2. Install the Nvidia CUDA toolkit.
 28 | 3. Install the Nvidia HPC SDK from https://developer.nvidia.com/nvidia-hpc-sdk-downloads. The installation path is usually `/opt/nvidia/hpc_sdk/Linux_x86_64/*/compilers/bin`; add it to your PATH.
 29 | 
 30 | P.S.
You may have to restart your system before using the compilers.
 31 | 
 32 | ---
 33 | 
 34 | ### Compilation and Execution
 35 | 
 36 | The CUDA Fortran compiler was originally developed by PGI. Since 2020 the PGI compiler tools have been replaced by the Nvidia HPC SDK. You can use the compilers `nvc`, `nvc++` and `nvfortran` to compile `C`, `C++` and `Fortran` respectively.
 37 | 
 38 | - CUDA Fortran source files have the suffix `.cuf`
 39 | 
 40 | - Compile CUDA Fortran with `nvfortran` and just run the executable
 41 | 
 42 | ```bash
 43 | nvfortran test_code.cuf -o test_exe
 44 | ./test_exe
 45 | ```
 46 | ---
 47 | 
 48 | ### CUDA Fortran Code:
 49 | We will follow the SAXPY (Scalar A*X Plus Y) problem, a.k.a. the "Hello World" of CUDA programming, to show how to go from CPU to GPU code.
 50 | 
 51 | The serial CPU code:
 52 | ```Fortran
 53 | module mathOps
 54 | contains
 55 |   subroutine saxpy(x, y, a)
 56 |     implicit none
 57 |     real :: x(:), y(:), a
 58 |     ! Just a simple array scalar multiplication and addition
 59 |     y = a*x + y
 60 |   end subroutine saxpy
 61 | end module mathOps
 62 | 
 63 | program testSaxpy
 64 |   use mathOps
 65 |   implicit none
 66 |   integer, parameter :: N = 40000
 67 |   real :: x(N), y(N), a
 68 | 
 69 |   x = 1.0; y = 2.0; a = 2.0
 70 |   call saxpy(x, y, a)
 71 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 72 | end program testSaxpy
 73 | ```
 74 | 
 75 | The above CPU code is ported to CUDA Fortran as follows; brief explanations are given within the code:
 76 | 
 77 | ```fortran
 78 | module mathOps
 79 | contains
 80 |   ! The kernel, i.e. a function that runs on the device
 81 |   ! `attributes` describes the scope of the routine. `global` means it is visible both from the host
 82 |   ! and the device. This indicates the subroutine is run on the device but called from the host
 83 |   attributes(global) subroutine saxpy(x, y, a)
 84 |     implicit none
 85 |     real :: x(:), y(:)
 86 |     real, value :: a
 87 |     integer :: i, n
 88 |     n = size(x)
 89 | 
 90 |     ! Remember the host launches a "**grid** of blocks with each block having **tBlock** threads"
 91 |     ! and each thread works on a single element of the array
 92 |     ! The variables `blockDim`, `blockIdx` and `threadIdx` are predefined by CUDA and are similar to
 93 |     ! the `dim3` type. As only the `x` component was used to launch the kernel, only the `x` component is used here
 94 | 
 95 |     ! Think of this as there are groups (=`grid` in host) of threads and those groups are numbered
 96 |     ! with `blockIdx`. Each group has `blockDim` (=`tBlock` in host) number of threads and each
 97 |     ! thread inside a particular block is numbered with `threadIdx`.
 98 |     ! Using the following formula we can calculate the index of the array element to be computed
 99 | 
 100 |     i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
 101 |     !^ Note: This is an example of fine-grained parallelism
 102 | 
 103 |     ! As we have launched more threads than there are array elements,
 104 |     ! a conditional check is required
 105 |     if (i <= n) y(i) = y(i) + a*x(i)
 106 | 
 107 |   end subroutine saxpy
 108 | end module mathOps
 109 | 
 110 | program testSaxpy
 111 |   use mathOps
 112 |   ! Fortran module that contains all the CUDA Fortran definitions
 113 |   use cudafor
 114 |   implicit none
 115 |   integer, parameter :: N = 40000
 116 |   ! host arrays
 117 |   real :: x(N), y(N), a
 118 |   ! device arrays, they are declared with the `device` attribute
 119 |   real, device :: x_d(N), y_d(N)
 120 | 
 121 |   ! Thread configuration to launch the kernel
 122 |   ! Threads and blocks can be arranged in a multidimensional manner.
 123 |   ! Here they are declared as `dim3`, so they have three components `x`, `y` and `z`.
 124 |   type(dim3) :: grid, tBlock
 125 | 
 126 | 
 127 |   ! In this example `tBlock` has 256 threads in the `x` component and
 128 |   ! `grid` is defined in such a way as to accommodate all `N = 40000` computations in blocks
 129 |   ! each having 256 threads.
 130 |   tBlock = dim3(256,1,1)
 131 |   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
 132 | 
 133 |   x = 1.0; y = 2.0; a = 2.0
 134 | 
 135 |   ! Copies the arrays from host to device and vice versa.
 136 |   ! The `cudafor` module overloads the assignment operator with `cudaMemcpy` calls
 137 |   ! so that the memory transfer can be done with a simple assignment operation.
 138 |   ! Note: This step moves data between two physical devices and can be time consuming.
 139 |   ! P.S. CUDA memory copies can also be done asynchronously
 140 |   x_d = x
 141 |   y_d = y
 142 | 
 143 |   ! Launches the kernel on the device.
 144 |   ! The information between the triple chevrons is the execution configuration;
 145 |   ! it says to launch **grid** blocks with each block having **tBlock** threads.
 146 |   ! This call is asynchronous, so the host can just call this and
 147 |   ! proceed to the next line without waiting for the computation on the device to complete
 148 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
 149 | 
 150 | 
 151 |   ! Copies data back to the host.
 152 |   ! This process is synchronous and waits for the device to complete the calculation
 153 |   y = y_d
 154 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 155 | end program testSaxpy
 156 | ```
 157 | 
 158 | 
 159 | 
 160 | 
 161 | 
 162 | 
 163 | ### Profiling:
 164 | Profiling can be done with the `nvprof` utility.
 165 | 
 166 |
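Kernel execution time can also be measured from inside the program with CUDA events, without an external profiler. The following is only a minimal sketch: the event API (`cudaEventCreate`, `cudaEventRecord`, `cudaEventSynchronize`, `cudaEventElapsedTime`) comes from the `cudafor` module, and the names `time_ms` and `istat` are illustrative additions to `testSaxpy`.

```fortran
! Minimal timing sketch for the saxpy kernel using CUDA events.
! The declarations go with the other declarations in testSaxpy.
type(cudaEvent) :: startEvent, stopEvent
real :: time_ms
integer :: istat

istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)

istat = cudaEventRecord(startEvent, 0)       ! record the start event on the default stream
call saxpy<<<grid, tBlock>>>(x_d, y_d, a)    ! asynchronous kernel launch
istat = cudaEventRecord(stopEvent, 0)
istat = cudaEventSynchronize(stopEvent)      ! wait until the kernel and the stop event have completed

istat = cudaEventElapsedTime(time_ms, startEvent, stopEvent)
write(*,*) 'Kernel time (ms): ', time_ms

istat = cudaEventDestroy(startEvent)
istat = cudaEventDestroy(stopEvent)
```

The same pattern can be placed around the memory-copy assignments to see how transfer time compares with kernel time.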
167 | 168 | A profiling on the `saxpy.cuf` code 169 | 170 | ```bash 171 | $ nvfortran saxpy.cuf 172 | $ sudo nvprof ./a.out 173 | ==2688609== NVPROF is profiling process 2688609, command: ./a.out 174 | Max error: 0.000000 175 | ==2688609== Profiling application: ./a.out 176 | ==2688609== Profiling result: 177 | Type Time(%) Time Calls Avg Min Max Name 178 | GPU activities: 64.44% 108.26us 4 27.063us 608ns 53.727us [CUDA memcpy HtoD] 179 | 28.55% 47.968us 1 47.968us 47.968us 47.968us [CUDA memcpy DtoH] 180 | 7.01% 11.776us 1 11.776us 11.776us 11.776us mathops_saxpy_ 181 | API calls: 99.78% 188.02ms 4 47.004ms 2.6290us 188.01ms cudaMalloc 182 | 0.11% 216.32us 5 43.264us 2.5470us 74.525us cudaMemcpy 183 | 0.05% 103.29us 4 25.822us 2.2740us 80.366us cudaFree 184 | 0.03% 63.516us 101 628ns 81ns 27.537us cuDeviceGetAttribute 185 | 0.01% 17.552us 1 17.552us 17.552us 17.552us cudaLaunchKernel 186 | 0.01% 11.915us 1 11.915us 11.915us 11.915us cuDeviceGetName 187 | 0.00% 4.8980us 1 4.8980us 4.8980us 4.8980us cuDeviceGetPCIBusId 188 | 0.00% 884ns 3 294ns 75ns 682ns cuDeviceGetCount 189 | 0.00% 507ns 2 253ns 83ns 424ns cuDeviceGet 190 | 0.00% 241ns 1 241ns 241ns 241ns cuDeviceTotalMem 191 | 0.00% 138ns 1 138ns 138ns 138ns cuDeviceGetUuid 192 | ``` 193 | 194 |
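One thing the profile does not show is whether the kernel actually launched successfully. Because kernel launches are asynchronous, a failed launch (for example, an invalid execution configuration) does not stop the host program by itself. Below is a minimal sketch of explicit error checking after a launch; it uses the error-query routines from the `cudafor` module (`cudaGetLastError`, `cudaDeviceSynchronize`, `cudaGetErrorString`), and its placement inside `testSaxpy` is assumed.

```fortran
! Minimal error-checking sketch after an asynchronous kernel launch.
integer :: istat

call saxpy<<<grid, tBlock>>>(x_d, y_d, a)

istat = cudaGetLastError()              ! errors from the launch itself (e.g. a bad configuration)
if (istat /= cudaSuccess) write(*,*) 'Launch error: ', cudaGetErrorString(istat)

istat = cudaDeviceSynchronize()         ! waits for the kernel; reports errors raised during execution
if (istat /= cudaSuccess) write(*,*) 'Execution error: ', cudaGetErrorString(istat)
```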
 195 | 
 196 | ---
 197 | 
 198 | 
 199 | ## Source Code Examples
 200 | #### Loop parallelization:
 201 | The `!$cuf kernel do` directive can be used to simplify parallelizing loops. These directives instruct the compiler to generate kernels from a region of host code consisting of tightly nested loops. Essentially, kernel loop directives allow us to inline kernels in host code.
 202 | 1. [Loop parallelization](./src/cufKernel.cuf)
 203 | 2. [Loop parallelization 2](./src/cufILP.cuf)
 204 | 3. [Nested Loop parallelization](./src/cufKernel2D.cuf)
 205 | 4. [Reduction operation](./src/cufReduction.cuf)
 206 | 
 207 | 
 208 | #### A few other sample codes
 209 | 5. [Compute pi using a Monte Carlo method](./src/ComputePI/)
 210 | 6. [Fast Fourier Transform](./src/FFT/)
 211 | 7. [Derivative with FFT](./src/FFT/)
 212 | 
 213 | 
 214 | ---
 215 | 
 216 |  
 217 | 
 218 | ### References
 219 | 1. __CUDA Fortran for Scientists and Engineers__ by Gregory Ruetsch & Massimiliano Fatica
 220 | 2. https://developer.nvidia.com/blog/easy-introduction-cuda-fortran/
 221 | 3. https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/
 222 | 
--------------------------------------------------------------------------------