├── gpu_cpu.png ├── src ├── ComputePI │ ├── Readme.md │ ├── compute_pi.cuf │ └── curand_m.cuf ├── FFT │ ├── Readme.md │ ├── fft_test_c2c.cuf │ ├── fft_derivative.cuf │ └── cufft_m.cuf ├── cufReduction.cuf ├── cufILP.cuf ├── cufKernel.cuf ├── cufKernel2D.cuf ├── deviceQuery.cuf └── saxpy.cuf └── Readme.md /gpu_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Koushikphy/Intro-to-CUDA-Fortran/HEAD/gpu_cpu.png -------------------------------------------------------------------------------- /src/ComputePI/Readme.md: -------------------------------------------------------------------------------- 1 | #### Computing Pi with Monte Carlo method 2 | 3 | Compile it with 4 | ```bash 5 | nvfortran curand_m.cuf compute_pi.cuf -cudalib=curand 6 | ``` 7 | The CUDA random number generator (`curand`) needs to be linked for this to work, using the flag `-cudalib` -------------------------------------------------------------------------------- /src/FFT/Readme.md: -------------------------------------------------------------------------------- 1 | #### Fast Fourier Transform with CUDA Fortran 2 | 3 | Compile it with 4 | ```bash 5 | nvfortran cufft_m.cuf fft_test_c2c.cuf -cudalib=cufft 6 | nvfortran cufft_m.cuf fft_derivative.cuf -cudalib=cufft 7 | ``` 8 | The CUDA random number generator (`curand`) needs to be linked for this to work, using the flag `-cudalib` 9 | -------------------------------------------------------------------------------- /src/cufReduction.cuf: -------------------------------------------------------------------------------- 1 | program reduce 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: i, aSum = 0 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a_d = 1 9 | 10 | ! CUF kernels do the reduction operation automatically 11 | 12 | !$cuf kernel do <<<*,tPB>>> 13 | do i = 1, n 14 | aSum = aSum + a_d(i) 15 | enddo 16 | 17 | if (aSum /= n) then 18 | write(*,*) '**** Program Failed ****' 19 | else 20 | write(*,*) 'Program Passed' 21 | endif 22 | end program reduce 23 | 24 | -------------------------------------------------------------------------------- /src/cufILP.cuf: -------------------------------------------------------------------------------- 1 | program ilp 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | 14 | ! Here the 1024 blocks of 256 threads cannot processes the 1024 2 elements if each thread processes 15 | ! a single element, so the compiler generates a loop that results in each thread processing four array 16 | ! elements. 17 | !----------------------------------------------------------------------------------------------------- 18 | 19 | !$cuf kernel do <<<1024,tPB>>> 20 | do i = 1, n 21 | a_d(i) = a_d(i) + b 22 | enddo 23 | 24 | a = a_d 25 | 26 | if (any(a /= 4)) then 27 | write(*,*) '**** Program Failed ****' 28 | else 29 | write(*,*) 'Program Passed' 30 | endif 31 | end program ilp 32 | 33 | -------------------------------------------------------------------------------- /src/cufKernel.cuf: -------------------------------------------------------------------------------- 1 | program incrementTest 2 | implicit none 3 | integer, parameter :: n = 1024*1024 4 | integer :: a(n), i, b 5 | integer, device :: a_d(n) 6 | integer, parameter :: tPB = 256 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | 14 | ! 
A thread block of tPB threads is used when launching the kernel. The * specified for the first execution configuration 15 | ! parameter leaves the compiler free to calculate the number of thread blocks to launch in order to carry out the 16 | ! operation in the loop. The execution configuration could have been specified as <<<*,*>>>, in which case the 17 | ! compiler would choose the thread block size as well as the number of thread blocks to launch. 18 | !------------------------------------------------------------------------------------------------------------------------ 19 | 20 | !$cuf kernel do <<<*,tPB>>> 21 | do i = 1, n 22 | a_d(i) = a_d(i) + b 23 | enddo 24 | 25 | a = a_d 26 | 27 | if (any(a /= 4)) then 28 | write(*,*) '**** Program Failed ****' 29 | else 30 | write(*,*) 'Program Passed' 31 | endif 32 | end program incrementTest 33 | 34 | -------------------------------------------------------------------------------- /src/cufKernel2D.cuf: -------------------------------------------------------------------------------- 1 | 2 | program incrementTest 3 | implicit none 4 | integer, parameter :: n = 4*1024 5 | integer :: a(n,n), i, j, b 6 | integer, device :: a_d(n,n) 7 | 8 | a = 1 9 | b = 3 10 | 11 | a_d = a 12 | 13 | ! In this case the do (2) specified on the directive indicates that the generated kernel will map to the two 14 | ! following loops. Multidimensional thread blocks and grids specified by the execution configuration in 15 | ! the directive map to the nested loops in an innermost to outermost fashion. For example, for the thread 16 | ! block of 32 × 8, the predefined kernel variable threadIdx%x will run from 1 to 32 and map to the 17 | ! i index, and threadIdx%y will run from 1 to 8 and map to the j index. Rather than specifying the 18 | ! thread block size, we could have also used <<<(*,*),(*,*)>>> or even <<<*,*>>> 19 | ! and have the compiler choose the thread block and grid size. 20 | !----------------------------------------------------------------------------------------------------- 21 | 22 | !$cuf kernel do (2) <<< (*,*), (32,8) >>> 23 | do j = 1, n 24 | do i = 1, n 25 | a_d(i,j) = a_d(i,j) + b 26 | enddo 27 | enddo 28 | 29 | a = a_d 30 | 31 | if (any(a /= 4)) then 32 | write(*,*) '**** Program Failed ****' 33 | else 34 | write(*,*) 'Program Passed' 35 | endif 36 | end program incrementTest 37 | 38 | -------------------------------------------------------------------------------- /src/FFT/fft_test_c2c.cuf: -------------------------------------------------------------------------------- 1 | program fft_test_c2c 2 | use iso_c_binding 3 | ! use precision_m 4 | use cufft_m 5 | implicit none 6 | integer, allocatable:: kx(:) 7 | complex(fp_kind), allocatable:: cinput(:),coutput(:) 8 | complex(fp_kind), allocatable, device:: cinput_d(:),coutput_d(:) 9 | 10 | integer:: i,j,k,n 11 | type(c_ptr):: plan 12 | real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h 13 | 14 | character*1:: a 15 | real(fp_kind):: w,x,y,z 16 | integer:: nerrors 17 | 18 | n=16 19 | h=twopi/real(n,fp_kind) 20 | 21 | ! allocate arrays on the host 22 | allocate (cinput(n),coutput(n),kx(n)) 23 | 24 | ! allocate arrays on the device 25 | allocate (cinput_d(n),coutput_d(n)) 26 | 27 | !initialize arrays on host 28 | kx =(/ (i-1, i=1,n/2), (-n+i-1, i=n/2+1,n) /) 29 | 30 | do i=1,n 31 | cinput(i)=(cos(2*real(i-1,fp_kind)*h)+sin(3*real(i-1,fp_kind)*h)) 32 | end do 33 | 34 | !copy arrays to device 35 | cinput_d=cinput 36 | 37 | ! 
Initialize the plan for complex to complex transform 38 | if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1) 39 | if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1) 40 | 41 | ! Forward transform out of place 42 | call cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 43 | 44 | ! Copy results back to host 45 | coutput=coutput_d 46 | 47 | print *," Transform from complex array" 48 | do i=1,n 49 | write(*,'(i2,1x,2(f8.4),2x,i2,2(f8.4))') i,cinput(i),kx(i),coutput(i)/n 50 | end do 51 | 52 | 53 | !release memory on the host and on the device 54 | deallocate (cinput,coutput,kx,cinput_d,coutput_d) 55 | 56 | ! Destroy the plans 57 | call cufftDestroy(plan) 58 | 59 | end program fft_test_c2c 60 | 61 | -------------------------------------------------------------------------------- /src/ComputePI/compute_pi.cuf: -------------------------------------------------------------------------------- 1 | 2 | ! Compute pi using a Monte Carlo method 3 | 4 | program compute_pi 5 | use curand_m 6 | implicit none 7 | real(fp_kind), allocatable:: hostData(:) 8 | real(fp_kind), allocatable, device:: deviceData(:) 9 | real(fp_kind) :: pival 10 | integer :: inside_gpu, inside_cpu, N, i 11 | integer(kind=int_ptr_kind()) :: gen, twoN 12 | integer(kind=8) :: seed 13 | 14 | ! Define how many numbers we want to generate 15 | N=10000000 16 | twoN=N*2 17 | 18 | ! Allocate array on CPU 19 | allocate(hostData(twoN)) 20 | 21 | ! Allocate array on GPU 22 | allocate(deviceData(twoN)) 23 | 24 | 25 | ! Create pseudonumber generator 26 | call curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT) 27 | 28 | ! Set seed 29 | seed=1234 30 | call curandSetPseudoRandomGeneratorSeed( gen, seed) 31 | 32 | ! Generate N floats or double on device 33 | call curandGenerateUniform(gen, deviceData, twoN) 34 | 35 | ! Copy the data back to CPU to check result later 36 | hostData=deviceData 37 | 38 | ! Perform the test on GPU using CUF kernel 39 | inside_gpu=0 40 | !$cuf kernel do <<<*,*>>> 41 | do i=1,N 42 | if( (deviceData(i)**2+deviceData(i+N)**2) <= 1._fp_kind ) inside_gpu=inside_gpu+1 43 | end do 44 | 45 | ! Perform the test on CPU 46 | inside_cpu=0 47 | do i=1,N 48 | if( (hostData(i)**2+hostData(i+N)**2) <= 1._fp_kind ) inside_cpu=inside_cpu+1 49 | end do 50 | 51 | ! Check the results 52 | if (inside_cpu .ne. inside_gpu) then 53 | write(*,*) "Mismatch between CPU/GPU" 54 | write(*,*) "Test Failed" 55 | else 56 | write(*,*) "Test Passed" 57 | endif 58 | 59 | ! Print the value of pi and the error 60 | pival= 4._fp_kind*real(inside_gpu,fp_kind)/real(N,fp_kind) 61 | write(*,"(t3,a,i10,a,f10.8,a,e11.4)") "Samples=", N, " Pi=", pival, " Error=", abs(pival-2.0_fp_kind*asin(1.0_fp_kind)) 62 | 63 | ! Deallocate data on CPU and GPU 64 | deallocate(hostData) 65 | deallocate(deviceData) 66 | 67 | ! 
Destroy the generator 68 | call curandDestroyGenerator(gen) 69 | end program compute_pi 70 | -------------------------------------------------------------------------------- /src/FFT/fft_derivative.cuf: -------------------------------------------------------------------------------- 1 | program fft_derivative 2 | use iso_c_binding 3 | use cufft_m 4 | implicit none 5 | real(fp_kind), allocatable:: kx(:), derivative(:) 6 | real(fp_kind), allocatable, device:: kx_d(:) 7 | 8 | complex(fp_kind), allocatable:: cinput(:),coutput(:) 9 | complex(fp_kind), allocatable, device:: cinput_d(:),coutput_d(:) 10 | 11 | integer:: i,j,n 12 | type(c_ptr):: plan 13 | real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h 14 | 15 | character*1:: a 16 | real(fp_kind):: x,y,z 17 | integer:: nerrors 18 | 19 | n=8 20 | h=twopi/real(n,fp_kind) 21 | 22 | ! allocate arrays on the host 23 | allocate (cinput(n),coutput(n),derivative(n),kx(n)) 24 | 25 | ! allocate arrays on the device 26 | allocate (cinput_d(n),coutput_d(n),kx_d(n)) 27 | 28 | ! initialize arrays on host 29 | kx =(/ ((i-1), i=1,n/2), ((-n+i-1), i=n/2+1,n) /) 30 | 31 | ! Set the wave number for the Nyquist frequency to zero 32 | kx(n/2+1)=0._fp_kind 33 | 34 | ! Copy the wave number vector to the device 35 | kx_d=kx 36 | 37 | do i=1,n 38 | cinput(i)=(cos(2*real(i-1,fp_kind)*h) +sin(3*real(i-1,fp_kind)*h)) 39 | derivative(i)=(-2*sin(2*real(i-1,fp_kind)*h) +3*cos(3*real(i-1,fp_kind)*h)) 40 | end do 41 | 42 | ! copy input to device 43 | cinput_d=cinput 44 | 45 | ! Initialize the plan for complex to complex transform 46 | if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1) 47 | if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1) 48 | 49 | ! Forward transform out of place 50 | call cufftExec(plan,cinput_d,coutput_d,CUFFT_FORWARD) 51 | 52 | ! Compute the derivative in spectral space and normalize the FFT 53 | !$cuf kernel do <<<*,*>>> 54 | do i=1,n 55 | coutput_d(i)=cmplx(0.,kx_d(i),fp_kind)*coutput_d(i)/n 56 | end do 57 | 58 | ! Inverse transform in place 59 | call cufftExec(plan,coutput_d,coutput_d,CUFFT_INVERSE) 60 | 61 | ! Copy results back to host 62 | coutput=coutput_d 63 | 64 | print *," First Derivative from complex array" 65 | do i=1,n 66 | write(*,'(i2,2(1x,f8.4),2x,e13.7)') i,real(coutput(i)),derivative(i),real(coutput(i))-derivative(i) 67 | end do 68 | 69 | !release memory on the host and on the device 70 | deallocate (cinput,coutput,kx,derivative,cinput_d,coutput_d,kx_d) 71 | 72 | ! Destroy the plans 73 | call cufftDestroy(plan) 74 | 75 | end program fft_derivative 76 | 77 | -------------------------------------------------------------------------------- /src/deviceQuery.cuf: -------------------------------------------------------------------------------- 1 | program deviceQuery 2 | use cudafor 3 | implicit none 4 | 5 | type (cudaDeviceProp) :: prop 6 | integer :: nDevices=0, i, ierr 7 | 8 | ! Number of CUDA-capable devices 9 | 10 | ierr = cudaGetDeviceCount(nDevices) 11 | 12 | if (nDevices == 0) then 13 | write(*,"(/,'No CUDA devices found',/)") 14 | stop 15 | else if (nDevices == 1) then 16 | write(*,"(/,'One CUDA device found',/)") 17 | else 18 | write(*,"(/,i0,' CUDA devices found',/)") nDevices 19 | end if 20 | 21 | ! Loop over devices 22 | 23 | do i = 0, nDevices-1 24 | 25 | write(*,"('Device Number: ',i0)") i 26 | 27 | ierr = cudaGetDeviceProperties(prop, i) 28 | if (ierr .eq. 
0) then 29 | write(*,"(' GetDeviceProperties for device ',i0,': Passed')") i 30 | else 31 | write(*,"(' GetDeviceProperties for device ',i0,': Failed')") i 32 | endif 33 | 34 | ! General device info 35 | 36 | write(*,"(' Device Name: ',a)") trim(prop%name) 37 | write(*,"(' Compute Capability: ',i0,'.',i0)") & 38 | prop%major, prop%minor 39 | write(*,"(' Number of Multiprocessors: ',i0)") & 40 | prop%multiProcessorCount 41 | write(*,"(' Max Threads per Multiprocessor: ',i0)") & 42 | prop%maxThreadsPerMultiprocessor 43 | write(*,"(' Global Memory (GB): ',f9.3,/)") & 44 | prop%totalGlobalMem/1024.0**3 45 | 46 | ! Execution Configuration 47 | 48 | write(*,"(' Execution Configuration Limits')") 49 | write(*,"(' Max Grid Dims: ',2(i0,' x '),i0)") & 50 | prop%maxGridSize 51 | write(*,"(' Max Block Dims: ',2(i0,' x '),i0)") & 52 | prop%maxThreadsDim 53 | write(*,"(' Max Threads per Block: ',i0,/)") & 54 | prop%maxThreadsPerBlock 55 | 56 | enddo 57 | 58 | end program deviceQuery 59 | 60 | 61 | !!! Sample output 62 | ! --------------------------------------------------------------- 63 | ! One CUDA device found 64 | 65 | ! Device Number: 0 66 | ! GetDeviceProperties for device 0: Passed 67 | ! Device Name: NVIDIA GeForce GT 1030 68 | ! Compute Capability: 6.1 69 | ! Number of Multiprocessors: 3 70 | ! Max Threads per Multiprocessor: 2048 71 | ! Global Memory (GB): 1.951 72 | 73 | ! Execution Configuration Limits 74 | ! Max Grid Dims: 2147483647 x 65535 x 65535 75 | ! Max Block Dims: 1024 x 1024 x 64 76 | ! Max Threads per Block: 1024 77 | ! --------------------------------------------------------------- -------------------------------------------------------------------------------- /src/ComputePI/curand_m.cuf: -------------------------------------------------------------------------------- 1 | 2 | module curand_m 3 | integer, public :: CURAND_RNG_PSEUDO_DEFAULT = 100 4 | integer, public :: CURAND_RNG_PSEUDO_XORWOW = 101 5 | integer, public :: CURAND_RNG_QUASI_DEFAULT = 200 6 | integer, public :: CURAND_RNG_QUASI_SOBOL32 = 201 7 | 8 | 9 | integer, parameter :: singlePrecision = kind(0.0) 10 | integer, parameter :: doublePrecision = kind(0.0d0) 11 | 12 | integer, parameter :: fp_kind = doublePrecision 13 | ! 
integer, parameter :: fp_kind = singlePrecision 14 | 15 | 16 | interface curandCreateGenerator 17 | subroutine curandCreateGenerator( generator,rng_type) bind(C,name='curandCreateGenerator') 18 | use iso_c_binding 19 | integer(c_size_t):: generator 20 | integer(c_int),value:: rng_type 21 | end subroutine curandCreateGenerator 22 | end interface curandCreateGenerator 23 | 24 | interface curandSetPseudoRandomGeneratorSeed 25 | subroutine curandSetPseudoRandomGeneratorSeed( generator,seed) bind(C,name='curandSetPseudoRandomGeneratorSeed') 26 | use iso_c_binding 27 | integer(c_size_t), value:: generator 28 | integer(c_long_long),value:: seed 29 | end subroutine curandSetPseudoRandomGeneratorSeed 30 | end interface curandSetPseudoRandomGeneratorSeed 31 | 32 | interface curandGenerateUniform 33 | subroutine curandGenerateUniform( generator, odata, numele) bind(C,name='curandGenerateUniform') 34 | use iso_c_binding 35 | integer(c_size_t),value:: generator 36 | !pgi$ ignore_tr odata 37 | real(c_float), device:: odata(*) 38 | integer(c_size_t),value:: numele 39 | end subroutine curandGenerateUniform 40 | 41 | subroutine curandGenerateUniformDouble(generator, odata, numele) bind(C,name='curandGenerateUniformDouble') 42 | use iso_c_binding 43 | integer(c_size_t),value:: generator 44 | !pgi$ ignore_tr odata 45 | real(c_double), device:: odata(*) 46 | integer(c_size_t),value:: numele 47 | end subroutine curandGenerateUniformDouble 48 | end interface curandGenerateUniform 49 | 50 | interface curandGenerateNormal 51 | subroutine curandGenerateNormal( generator, odata, numele, mean,stddev) bind(C,name='curandGenerateNormal') 52 | use iso_c_binding 53 | integer(c_size_t),value:: generator 54 | !pgi$ ignore_tr odata 55 | real(c_float), device:: odata(*) 56 | integer(c_size_t),value:: numele 57 | real(c_float), value:: mean,stddev 58 | end subroutine curandGenerateNormal 59 | 60 | subroutine curandGenerateNormalDouble( generator, odata, numele,mean, stddev) bind(C,name='curandGenerateNormalDouble') 61 | use iso_c_binding 62 | integer(c_size_t),value:: generator 63 | !pgi$ ignore_tr odata 64 | real(c_double), device:: odata(*) 65 | integer(c_size_t),value:: numele 66 | real(c_double), value:: mean,stddev 67 | end subroutine curandGenerateNormalDouble 68 | end interface curandGenerateNormal 69 | 70 | interface curandDestroyGenerator 71 | subroutine curandDestroyGenerator(generator) bind(C,name='curandDestroyGenerator') 72 | use iso_c_binding 73 | integer(c_size_t),value:: generator 74 | end subroutine curandDestroyGenerator 75 | end interface curandDestroyGenerator 76 | 77 | end module curand_m 78 | -------------------------------------------------------------------------------- /src/saxpy.cuf: -------------------------------------------------------------------------------- 1 | module mathOps 2 | contains 3 | ! The kernel i.e a function that runs on the device 4 | ! `attributes` describes the scope of the routine. `global` means its visible both from the host and device 5 | ! This indicates the subroutine is run on the device but called from the host 6 | attributes(global) subroutine saxpy(x, y, a) 7 | implicit none 8 | real :: x(:), y(:) 9 | real, value :: a 10 | integer :: i, n 11 | n = size(x) 12 | 13 | ! Remember the host launches "**grid** of block with each block having **tBlock** threads" 14 | ! and each thread works on a single element of the array 15 | ! These `blockDim`, `blockIdx` and `threadIdx` are provided defined by CUDA are similar to `dim3` type 16 | ! 
As only the `x` component was used to launch the kernel, only the `x` component is used here
 17 | 
 18 |   ! Think of this as there are groups (=`grid` in host) of threads and those groups are numbered with `blockIdx`
 19 |   ! Each group has `blockDim` (=`tBlock` in host) number of threads and each thread inside a particular block
 20 |   ! is numbered with `threadIdx`.
 21 |   ! Using the following formula we can calculate the index of the array element to be computed
 22 | 
 23 |   i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
 24 |   !^ Note: This is an example of fine-grained parallelism
 25 | 
 26 |   ! As we have launched more threads than there are array elements, a conditional check is required
 27 |   if (i <= n) y(i) = y(i) + a*x(i)
 28 | 
 29 |   end subroutine saxpy
 30 | end module mathOps
 31 | 
 32 | program testSaxpy
 33 |   use mathOps
 34 |   ! Fortran module that contains all the CUDA Fortran definitions
 35 |   use cudafor
 36 |   implicit none
 37 |   integer, parameter :: N = 40000
 38 |   ! host arrays
 39 |   real :: x(N), y(N), a
 40 |   ! device arrays, they are declared with the `device` attribute
 41 |   real, device :: x_d(N), y_d(N)
 42 | 
 43 |   ! Thread configuration to launch the kernel
 44 |   ! Threads and blocks can be arranged in a multidimensional manner.
 45 |   ! Here they are declared as `dim3`, so they have three components `x`, `y` and `z`.
 46 |   type(dim3) :: grid, tBlock
 47 | 
 48 | 
 49 |   ! In this example `tBlock` has 256 threads in the `x` component and
 50 |   ! `grid` is defined in such a way as to accommodate all `N = 40000` computations in blocks each having 256 threads.
 51 |   tBlock = dim3(256,1,1)
 52 |   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
 53 | 
 54 |   x = 1.0; y = 2.0; a = 2.0
 55 | 
 56 |   ! Copies the arrays from host to device and vice versa.
 57 |   ! The `cudafor` module overloads the assignment operator with `cudaMemcpy` calls
 58 |   ! so that the memory transfer can be done with a simple assignment operation.
 59 |   ! Note: This step moves data between two physical devices and can be time consuming.
 60 |   ! P.S. CUDA memory copies can also be done asynchronously
 61 |   x_d = x
 62 |   y_d = y
 63 | 
 64 |   ! Launches the kernel on the device.
 65 |   ! The information between the triple chevrons is the execution configuration;
 66 |   ! it says to launch **grid** blocks with each block having **tBlock** threads.
 67 |   ! This call is asynchronous, so the host can just call this and
 68 |   ! proceed to the next line without waiting for the computation on the device to complete
 69 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
 70 | 
 71 | 
 72 |   ! Copies data back to the host.
 73 |   ! This process is synchronous and waits for the device to complete the calculation
 74 |   y = y_d
 75 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 76 | end program testSaxpy
--------------------------------------------------------------------------------
/src/FFT/cufft_m.cuf:
--------------------------------------------------------------------------------
 1 | 
 2 | module cufft_m
 3 | 
 4 |   integer, public :: CUFFT_FORWARD = -1
 5 |   integer, public :: CUFFT_INVERSE = 1
 6 |   integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
 7 |   integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
 8 |   integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
 9 |   integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
 10 |   integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
 11 |   integer, public :: CUFFT_Z2Z = Z'69' !
Double-Complex to Double-Complex 12 | 13 | integer, parameter, public :: singlePrecision = kind(0.0) 14 | integer, parameter, public :: doublePrecision = kind(0.0d0) 15 | 16 | integer, parameter, public :: fp_kind = doublePrecision 17 | ! integer, parameter, public :: fp_kind = singlePrecision 18 | 19 | 20 | interface cufftDestroy 21 | subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') 22 | use iso_c_binding 23 | type(c_ptr),value:: plan 24 | end subroutine cufftDestroy 25 | end interface cufftDestroy 26 | 27 | interface cufftSetStream 28 | subroutine cufftSetStream(plan, stream) bind(C,name='cufftSetStream') 29 | use iso_c_binding 30 | use cudafor 31 | type(c_ptr),value:: plan 32 | integer(kind=cuda_stream_kind),value:: stream 33 | end subroutine cufftSetStream 34 | end interface cufftSetStream 35 | 36 | interface cufftExec 37 | 38 | subroutine cufftExecC2C(plan, idata, odata, direction) bind(C,name='cufftExecC2C') 39 | use iso_c_binding 40 | import singlePrecision, doublePrecision 41 | type(c_ptr),value:: plan 42 | integer(c_int),value:: direction 43 | !pgi$ ignore_tr idata,odata 44 | complex(singlePrecision),device:: idata(*),odata(*) 45 | end subroutine cufftExecC2C 46 | 47 | subroutine cufftExecZ2Z(plan, idata, odata, direction) bind(C,name='cufftExecZ2Z') 48 | use iso_c_binding 49 | import singlePrecision, doublePrecision 50 | type(c_ptr),value:: plan 51 | integer(c_int),value:: direction 52 | !pgi$ ignore_tr idata,odata 53 | complex(doublePrecision),device:: idata(*),odata(*) 54 | end subroutine cufftExecZ2Z 55 | 56 | subroutine cufftExecR2C(plan, idata, odata) bind(C,name='cufftExecR2C') 57 | use iso_c_binding 58 | import singlePrecision, doublePrecision 59 | type(c_ptr),value:: plan 60 | integer(c_int),value:: direction 61 | !pgi$ ignore_tr idata,odata 62 | real(singlePrecision),device:: idata(*) 63 | complex(singlePrecision),device:: odata(*) 64 | end subroutine cufftExecR2C 65 | 66 | subroutine cufftExecD2Z(plan, idata, odata) bind(C,name='cufftExecD2Z') 67 | use iso_c_binding 68 | import singlePrecision, doublePrecision 69 | type(c_ptr),value:: plan 70 | integer(c_int),value:: direction 71 | !pgi$ ignore_tr idata,odata 72 | real(doublePrecision),device:: idata(*) 73 | complex(doublePrecision),device:: odata(*) 74 | end subroutine cufftExecD2Z 75 | 76 | subroutine cufftExecR2Cinplace(plan, idata, odata) bind(C,name='cufftExecR2C') 77 | use iso_c_binding 78 | import singlePrecision, doublePrecision 79 | type(c_ptr),value:: plan 80 | integer(c_int),value:: direction 81 | !pgi$ ignore_tr idata,odata 82 | real(singlePrecision),device:: idata(*) 83 | real(singlePrecision),device:: odata(*) 84 | end subroutine cufftExecR2Cinplace 85 | 86 | subroutine cufftExecD2Zinplace(plan, idata, odata) bind(C,name='cufftExecD2Z') 87 | use iso_c_binding 88 | import singlePrecision, doublePrecision 89 | type(c_ptr),value:: plan 90 | !pgi$ ignore_tr idata,odata 91 | real(doublePrecision),device:: idata(*) 92 | real(doublePrecision),device:: odata(*) 93 | end subroutine cufftExecD2Zinplace 94 | 95 | end interface cufftExec 96 | 97 | interface cufftPlan1d 98 | subroutine cufftPlan1d(plan, nx, type, batch) bind(C,name='cufftPlan1d') 99 | use iso_c_binding 100 | type(c_ptr):: plan 101 | integer(c_int),value:: nx, batch,type 102 | end subroutine cufftPlan1d 103 | end interface cufftPlan1d 104 | 105 | 106 | interface cufftPlanMany 107 | subroutine cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist,type, batch) bind(C,name='cufftPlanMany') 108 | use iso_c_binding 109 | 
implicit none
 110 |   !pgi$ ignore_tkr n, inembed, onembed
 111 |   type(c_ptr) :: plan
 112 |   integer(c_int) :: n, inembed, onembed
 113 |   integer(c_int), value:: rank, istride, ostride, idist, odist, type, batch
 114 |   end subroutine cufftPlanMany
 115 | end interface cufftPlanMany
 116 | 
 117 | interface cufftPlan2d
 118 |   module procedure cufftPlan2Dswap
 119 | end interface cufftPlan2d
 120 | 
 121 | interface cufftPlan2dC
 122 |   subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
 123 |   use iso_c_binding
 124 |   type(c_ptr):: plan
 125 |   integer(c_int),value:: nx, ny, type
 126 |   end subroutine cufftPlan2d
 127 | end interface cufftPlan2dC
 128 | 
 129 | contains
 130 | 
 131 |   subroutine cufftPlan2Dswap(plan,nx,ny, type)
 132 |   use iso_c_binding
 133 |   type(c_ptr):: plan
 134 |   integer(c_int),value:: nx, ny, type
 135 |   call cufftPlan2dC(plan,ny,nx,type)
 136 |   end subroutine cufftPlan2Dswap
 137 | 
 138 | end module cufft_m
 139 | 
 140 | 
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # CUDA Fortran: Fortran programming on GPU
 2 | 
 3 | 
 4 | Disclaimer: It is not possible to learn CUDA Fortran completely from this one-page tutorial/cheat sheet. It is only meant as a quick reference to get started with GPGPU programming in CUDA Fortran.
 5 | 
 6 | ---
 7 | 
 8 | 
 9 | ## GPU & CUDA Programming Model
 10 |  
 11 | ![GPU vs CPU](./gpu_cpu.png)
 12 | - __The Host & Device:__ The CPU and its memory are called the host, while the GPU and its memory are called the device. They are usually connected by a PCI bus, which has a much lower data bandwidth than each processing unit has to its own memory, so moving data between them is time consuming. Frequent exchange of data between the two memories is therefore highly discouraged.
 13 | 
 14 | - __Kernels:__ Functions that are executed on the GPU.
 15 | - __Thread Hierarchy__
 16 |     - __Thread:__ At the lowest level of the CUDA thread hierarchy are the individual threads. Each thread executes the kernel on a single piece of data and is mapped to a single CUDA core.
 17 |     - __Blocks:__ A group of threads.
 18 |     - __Grid:__ The collection of blocks that gets mapped onto the entire GPU.
 19 |     - Blocks and grids can be 1D, 2D or 3D, and the program has to be written in such a way as to control these multidimensional blocks/grids.
 20 | 
 21 | - __Flow of Program:__ The main code execution starts on the CPU, a.k.a. the host. Separate memory is allocated on the host and the device to hold the data for their respective computations. When needed, the data is copied from the host to the device and back. The host can launch a group of kernels on the device. When the kernels are launched, the host does not wait for the kernel execution to finish and can proceed with its own flow. The memory copy between the host and device can be synchronous or asynchronous; usually it is done in a synchronous manner. The assignment operator (`=`) in CUDA Fortran is overloaded with a synchronous memory copy, i.e. the copy operation will wait for the kernels to finish their execution.
 22 | 
 23 | ---
 24 | 
 25 | ### CUDA Fortran Installation:
 26 | 1. Install the appropriate Nvidia drivers for your system.
 27 | 2. Install the Nvidia CUDA toolkit.
 28 | 3. Install the Nvidia HPC SDK from https://developer.nvidia.com/nvidia-hpc-sdk-downloads. The installation path is usually `/opt/nvidia/hpc_sdk/Linux_x86_64/*/compilers/bin`; add it to your PATH.
 29 | 
 30 | P.S.
You may have to restart your system before using the compilers.
 31 | 
 32 | ---
 33 | 
 34 | ### Compilation and Execution
 35 | 
 36 | The CUDA Fortran compiler was originally developed by PGI. Since 2020 the PGI compiler tools have been replaced by the Nvidia HPC SDK. You can use the compilers `nvc`, `nvc++` and `nvfortran` to compile `C`, `C++` and `Fortran` respectively.
 37 | 
 38 | - CUDA Fortran source files have the suffix `.cuf`
 39 | 
 40 | - Compile CUDA Fortran with `nvfortran` and just run the executable
 41 | 
 42 | ```bash
 43 | nvfortran test_code.cuf -o test_exe
 44 | ./test_exe
 45 | ```
 46 | ---
 47 | 
 48 | ### CUDA Fortran Code:
 49 | We will follow the SAXPY (Scalar A*X Plus Y) problem, a.k.a. the "Hello World" of CUDA programming, to show how to go from CPU to GPU code.
 50 | 
 51 | The serial CPU code:
 52 | ```Fortran
 53 | module mathOps
 54 | contains
 55 |   subroutine saxpy(x, y, a)
 56 |     implicit none
 57 |     real :: x(:), y(:), a
 58 |     ! Just a simple array scalar multiplication and addition
 59 |     y = a*x + y
 60 |   end subroutine saxpy
 61 | end module mathOps
 62 | 
 63 | program testSaxpy
 64 |   use mathOps
 65 |   implicit none
 66 |   integer, parameter :: N = 40000
 67 |   real :: x(N), y(N), a
 68 | 
 69 |   x = 1.0; y = 2.0; a = 2.0
 70 |   call saxpy(x, y, a)
 71 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 72 | end program testSaxpy
 73 | ```
 74 | 
 75 | The above CPU code is ported to CUDA Fortran as follows; brief explanations are given within the code:
 76 | 
 77 | ```fortran
 78 | module mathOps
 79 | contains
 80 |   ! The kernel, i.e. a function that runs on the device
 81 |   ! `attributes` describes the scope of the routine. `global` means it is visible both from the host
 82 |   ! and the device. This indicates the subroutine is run on the device but called from the host
 83 |   attributes(global) subroutine saxpy(x, y, a)
 84 |     implicit none
 85 |     real :: x(:), y(:)
 86 |     real, value :: a
 87 |     integer :: i, n
 88 |     n = size(x)
 89 | 
 90 |     ! Remember the host launches a "**grid** of blocks with each block having **tBlock** threads"
 91 |     ! and each thread works on a single element of the array
 92 |     ! The variables `blockDim`, `blockIdx` and `threadIdx` are predefined by CUDA and are similar to
 93 |     ! the `dim3` type. As only the `x` component was used to launch the kernel, only the `x` component is used here
 94 | 
 95 |     ! Think of this as there are groups (=`grid` in host) of threads and those groups are numbered
 96 |     ! with `blockIdx`. Each group has `blockDim` (=`tBlock` in host) number of threads and each
 97 |     ! thread inside a particular block is numbered with `threadIdx`.
 98 |     ! Using the following formula we can calculate the index of the array element to be computed
 99 | 
 100 |     i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
 101 |     !^ Note: This is an example of fine-grained parallelism
 102 | 
 103 |     ! As we have launched more threads than there are array elements,
 104 |     ! a conditional check is required
 105 |     if (i <= n) y(i) = y(i) + a*x(i)
 106 | 
 107 |   end subroutine saxpy
 108 | end module mathOps
 109 | 
 110 | program testSaxpy
 111 |   use mathOps
 112 |   ! Fortran module that contains all the CUDA Fortran definitions
 113 |   use cudafor
 114 |   implicit none
 115 |   integer, parameter :: N = 40000
 116 |   ! host arrays
 117 |   real :: x(N), y(N), a
 118 |   ! device arrays, they are declared with the `device` attribute
 119 |   real, device :: x_d(N), y_d(N)
 120 | 
 121 |   ! Thread configuration to launch the kernel
 122 |   ! Threads and blocks can be arranged in a multidimensional manner.
 123 |   ! Here they are declared as `dim3`, so they have three components `x`, `y` and `z`.
 124 |   type(dim3) :: grid, tBlock
 125 | 
 126 | 
 127 |   ! In this example `tBlock` has 256 threads in the `x` component and
 128 |   ! `grid` is defined in such a way as to accommodate all `N = 40000` computations in blocks
 129 |   ! each having 256 threads.
 130 |   tBlock = dim3(256,1,1)
 131 |   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
 132 | 
 133 |   x = 1.0; y = 2.0; a = 2.0
 134 | 
 135 |   ! Copies the arrays from host to device and vice versa.
 136 |   ! The `cudafor` module overloads the assignment operator with `cudaMemcpy` calls
 137 |   ! so that the memory transfer can be done with a simple assignment operation.
 138 |   ! Note: This step moves data between two physical devices and can be time consuming.
 139 |   ! P.S. CUDA memory copies can also be done asynchronously
 140 |   x_d = x
 141 |   y_d = y
 142 | 
 143 |   ! Launches the kernel on the device.
 144 |   ! The information between the triple chevrons is the execution configuration;
 145 |   ! it says to launch **grid** blocks with each block having **tBlock** threads.
 146 |   ! This call is asynchronous, so the host can just call this and
 147 |   ! proceed to the next line without waiting for the computation on the device to complete
 148 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
 149 | 
 150 | 
 151 |   ! Copies data back to the host.
 152 |   ! This process is synchronous and waits for the device to complete the calculation
 153 |   y = y_d
 154 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
 155 | end program testSaxpy
 156 | ```
 157 | 
 158 | 
 159 | 
 160 | 
 161 | 
 162 | 
 163 | ### Profiling:
 164 | Profiling can be done with the `nvprof` utility.
 165 | 
 166 |
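Kernel execution time can also be measured from inside the program with CUDA events, without an external profiler. The following is only a minimal sketch: the event API (`cudaEventCreate`, `cudaEventRecord`, `cudaEventSynchronize`, `cudaEventElapsedTime`) comes from the `cudafor` module, and the names `time_ms` and `istat` are illustrative additions to `testSaxpy`.

```fortran
! Minimal timing sketch for the saxpy kernel using CUDA events.
! The declarations go with the other declarations in testSaxpy.
type(cudaEvent) :: startEvent, stopEvent
real :: time_ms
integer :: istat

istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)

istat = cudaEventRecord(startEvent, 0)       ! record the start event on the default stream
call saxpy<<<grid, tBlock>>>(x_d, y_d, a)    ! asynchronous kernel launch
istat = cudaEventRecord(stopEvent, 0)
istat = cudaEventSynchronize(stopEvent)      ! wait until the kernel and the stop event have completed

istat = cudaEventElapsedTime(time_ms, startEvent, stopEvent)
write(*,*) 'Kernel time (ms): ', time_ms

istat = cudaEventDestroy(startEvent)
istat = cudaEventDestroy(stopEvent)
```

The same pattern can be placed around the memory-copy assignments to see how transfer time compares with kernel time.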
167 | 168 | A profiling on the `saxpy.cuf` code 169 | 170 | ```bash 171 | $ nvfortran saxpy.cuf 172 | $ sudo nvprof ./a.out 173 | ==2688609== NVPROF is profiling process 2688609, command: ./a.out 174 | Max error: 0.000000 175 | ==2688609== Profiling application: ./a.out 176 | ==2688609== Profiling result: 177 | Type Time(%) Time Calls Avg Min Max Name 178 | GPU activities: 64.44% 108.26us 4 27.063us 608ns 53.727us [CUDA memcpy HtoD] 179 | 28.55% 47.968us 1 47.968us 47.968us 47.968us [CUDA memcpy DtoH] 180 | 7.01% 11.776us 1 11.776us 11.776us 11.776us mathops_saxpy_ 181 | API calls: 99.78% 188.02ms 4 47.004ms 2.6290us 188.01ms cudaMalloc 182 | 0.11% 216.32us 5 43.264us 2.5470us 74.525us cudaMemcpy 183 | 0.05% 103.29us 4 25.822us 2.2740us 80.366us cudaFree 184 | 0.03% 63.516us 101 628ns 81ns 27.537us cuDeviceGetAttribute 185 | 0.01% 17.552us 1 17.552us 17.552us 17.552us cudaLaunchKernel 186 | 0.01% 11.915us 1 11.915us 11.915us 11.915us cuDeviceGetName 187 | 0.00% 4.8980us 1 4.8980us 4.8980us 4.8980us cuDeviceGetPCIBusId 188 | 0.00% 884ns 3 294ns 75ns 682ns cuDeviceGetCount 189 | 0.00% 507ns 2 253ns 83ns 424ns cuDeviceGet 190 | 0.00% 241ns 1 241ns 241ns 241ns cuDeviceTotalMem 191 | 0.00% 138ns 1 138ns 138ns 138ns cuDeviceGetUuid 192 | ``` 193 | 194 |
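One thing the profile does not show is whether the kernel actually launched successfully. Because kernel launches are asynchronous, a failed launch (for example, an invalid execution configuration) does not stop the host program by itself. Below is a minimal sketch of explicit error checking after a launch; it uses the error-query routines from the `cudafor` module (`cudaGetLastError`, `cudaDeviceSynchronize`, `cudaGetErrorString`), and its placement inside `testSaxpy` is assumed.

```fortran
! Minimal error-checking sketch after an asynchronous kernel launch.
integer :: istat

call saxpy<<<grid, tBlock>>>(x_d, y_d, a)

istat = cudaGetLastError()              ! errors from the launch itself (e.g. a bad configuration)
if (istat /= cudaSuccess) write(*,*) 'Launch error: ', cudaGetErrorString(istat)

istat = cudaDeviceSynchronize()         ! waits for the kernel; reports errors raised during execution
if (istat /= cudaSuccess) write(*,*) 'Execution error: ', cudaGetErrorString(istat)
```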
 195 | 
 196 | ---
 197 | 
 198 | 
 199 | ## Source Code Examples
 200 | #### Loop parallelization:
 201 | The `!$cuf kernel do` directive can be used to simplify parallelizing loops. These directives instruct the compiler to generate kernels from a region of host code consisting of tightly nested loops. Essentially, kernel loop directives allow us to inline kernels in host code.
 202 | 1. [Loop parallelization](./src/cufKernel.cuf)
 203 | 2. [Loop parallelization 2](./src/cufILP.cuf)
 204 | 3. [Nested Loop parallelization](./src/cufKernel2D.cuf)
 205 | 4. [Reduction operation](./src/cufReduction.cuf)
 206 | 
 207 | 
 208 | #### A few other sample codes
 209 | 5. [Compute pi using a Monte Carlo method](./src/ComputePI/)
 210 | 6. [Fast Fourier Transform](./src/FFT/)
 211 | 7. [Derivative with FFT](./src/FFT/)
 212 | 
 213 | 
 214 | ---
 215 | 
 216 |  
 217 | 
 218 | ### References
 219 | 1. __CUDA Fortran for Scientists and Engineers__ by Gregory Ruetsch & Massimiliano Fatica
 220 | 2. https://developer.nvidia.com/blog/easy-introduction-cuda-fortran/
 221 | 3. https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/
 222 | 
--------------------------------------------------------------------------------