├── .gitignore ├── slides ├── isambard.jpeg ├── pdfs │ ├── 02-pi.pdf │ ├── 05-gpu.pdf │ ├── 00-prelim.pdf │ ├── 07-wrapup.pdf │ ├── 03-simd-numa.pdf │ ├── 99-isambard.pdf │ ├── old │ │ ├── 01-intro.pdf │ │ ├── 03-opt.pdf │ │ ├── 06-tasks.pdf │ │ └── 04-hybrid.pdf │ └── 01-paralleldo.pdf ├── cache_bandwidth.pdf ├── logo-full-colour.png ├── .gitignore ├── Makefile ├── README.md ├── preamble.tex ├── 99-isambard.tex ├── 07-wrapup.tex ├── 00-prelim.tex ├── 04-hybrid.tex ├── 03-simd-numa.tex ├── 02-pi.tex ├── 01-paralleldo.tex └── 03-opt.tex ├── code ├── submit_stencil ├── wtime.c ├── .gitignore ├── timer.f90 ├── Makefile ├── vadd.f90 ├── vadd_paralleldo.f90 ├── tasks.f90 ├── fibonacci.f90 ├── pi.f90 ├── pi_reduction.f90 ├── pi_atomic.f90 ├── pi_critical.f90 ├── vadd_spmd.f90 ├── pi_private.f90 ├── stencil.f90 ├── private.f90 ├── stencil_paralleldo.f90 ├── pi_array.f90 ├── stencil_reduction.f90 ├── stencil_target.f90 ├── stencil_optimised.f90 ├── stencil_numa.f90 ├── README.md └── jacobi.f90 ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | # Misc. 2 | .*.swp 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /slides/isambard.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/isambard.jpeg -------------------------------------------------------------------------------- /slides/pdfs/02-pi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/02-pi.pdf -------------------------------------------------------------------------------- /slides/pdfs/05-gpu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/05-gpu.pdf -------------------------------------------------------------------------------- /slides/pdfs/00-prelim.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/00-prelim.pdf -------------------------------------------------------------------------------- /slides/pdfs/07-wrapup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/07-wrapup.pdf -------------------------------------------------------------------------------- /slides/cache_bandwidth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/cache_bandwidth.pdf -------------------------------------------------------------------------------- /slides/logo-full-colour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/logo-full-colour.png -------------------------------------------------------------------------------- /slides/pdfs/03-simd-numa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/03-simd-numa.pdf -------------------------------------------------------------------------------- /slides/pdfs/99-isambard.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/99-isambard.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/01-intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/01-intro.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/03-opt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/03-opt.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/06-tasks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/06-tasks.pdf -------------------------------------------------------------------------------- /slides/pdfs/01-paralleldo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/01-paralleldo.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/04-hybrid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/04-hybrid.pdf -------------------------------------------------------------------------------- /slides/.gitignore: -------------------------------------------------------------------------------- 1 | # PDFs 2 | *.pdf 3 | 4 | # Latex temp files 5 | *.aux 6 | *.fdb_latexmk 7 | *.fls 8 | *.log 9 | *.nav 10 | *.out 11 | *.snm 12 | *.toc 13 | *.vrb 14 | _minted-*/ 15 | -------------------------------------------------------------------------------- /code/submit_stencil: -------------------------------------------------------------------------------- 1 | #PBS -q R35330 2 | #PBS -V 3 | #PBS -joe 4 | #PBS -lselect=1:ncpus=28,place=excl 5 | #PBS -lwalltime=00:02:00 6 | #PBS -N stencil 7 | 8 | cd $PBS_O_WORKDIR 9 | 10 | export OMP_NUM_THREADS=28 11 | ./stencil 12 | 13 | -------------------------------------------------------------------------------- /code/wtime.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* Get the current time in seconds since the Epoch */ 5 | void wtime(double *time) 6 | { 7 | struct timeval tv; 8 | gettimeofday(&tv, NULL); 9 | *time = tv.tv_sec + tv.tv_usec*1e-6; 10 | } 11 | -------------------------------------------------------------------------------- /slides/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 00-prelim.pdf 01-paralleldo.pdf 02-pi.pdf 03-simd-numa.pdf 05-gpu.pdf 07-wrapup.pdf 99-isambard.pdf 3 | 4 | %.pdf:%.tex preamble.tex 5 | latexmk -pdf -shell-escape $< 6 | 7 | .PHONY: clean 8 | clean: 9 | latexmk -C 10 | rm -f *.nav *.snm *.vrb 11 | rm -rf _minted*/ 12 | 13 | -------------------------------------------------------------------------------- /code/.gitignore: -------------------------------------------------------------------------------- 1 | # Build output 2 | *.o 3 | *.mod 4 | 5 | # Binary names 6 | pi 7 | pi_array 8 | pi_atomic 9 | pi_critical 10 | pi_private 11 | pi_reduction 12 | private 
13 | jacobi 14 | vadd 15 | vadd_paralleldo 16 | vadd_spmd 17 | stencil 18 | stencil_paralleldo 19 | stencil_reduction 20 | stencil_optimised 21 | stencil_numa 22 | stencil_target 23 | fibonacci 24 | tasks 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenMP for Computational Scientists 2 | 3 | This is a 6-part course introducing the OpenMP programming model. 4 | It is designed for Fortran programmers. 5 | 6 | The example code (and solutions) are found in the `code/` directory. 7 | LaTeX sources for the lecture slides are found in the `slides/` directory. 8 | Generated PDFs of the lecture slides are found in the `slides/pdfs` directory. 9 | 10 | -------------------------------------------------------------------------------- /code/timer.f90: -------------------------------------------------------------------------------- 1 | ! Timing module, used to call the C timer 2 | module timer 3 | 4 | use ISO_C_BINDING 5 | 6 | implicit none 7 | 8 | interface 9 | 10 | subroutine wtime_c(time) bind(C, name='wtime') 11 | use ISO_C_BINDING 12 | real(C_DOUBLE) :: time 13 | end subroutine 14 | end interface 15 | 16 | contains 17 | 18 | subroutine wtime(time) 19 | 20 | real(kind=8) :: time 21 | 22 | call wtime_c(time) 23 | 24 | end subroutine wtime 25 | 26 | end module timer 27 | -------------------------------------------------------------------------------- /code/Makefile: -------------------------------------------------------------------------------- 1 | 2 | FTN=ftn 3 | CC=cc 4 | FFLAGS=-O3 5 | LIBS=-homp -rm 6 | 7 | TIMEOBJ=timer.o wtime.o 8 | 9 | default: all 10 | 11 | BINS=jacobi pi pi_critical pi_atomic pi_array pi_private pi_reduction private vadd vadd_paralleldo vadd_spmd stencil stencil_paralleldo stencil_reduction stencil_optimised stencil_numa stencil_target fibonacci tasks 12 | 13 | all: $(BINS) 14 | 15 | %:%.f90 $(TIMEOBJ) 16 | $(FTN) $(FFLAGS) $^ $(LIBS) -o $@ 17 | 18 | %.o:%.f90 19 | $(FTN) -O3 $< -c 20 | 21 | %.o: %.c 22 | $(CC) -O3 $< -c 23 | 24 | .PHONY: clean 25 | clean: 26 | rm -f *.o *.mod $(BINS) 27 | -------------------------------------------------------------------------------- /slides/README.md: -------------------------------------------------------------------------------- 1 | # Slides 2 | 3 | Source code for the teaching material (slides) that teach the OpenMP for Computational Scientists course. 4 | The course material is presented using Fortran. 5 | 6 | ## Course structure 7 | 8 | 1. OpenMP overview: shared memory and parallel do. 9 | 2. Data sharing clauses and reductions. 10 | 3. Vectorisation and code optimisaion. 11 | 4. NUMA and Hybrid MPI+OpenMP. 12 | 5. OpenMP for GPUs. 13 | 6. Tasks and Tools. 14 | 15 | ## Compilation 16 | The slides are written in Latex. 17 | You should be able to build all the slides simply by typing ```make```. 18 | 19 | ### Dependancies 20 | The LaTeX uses the following packages: 21 | - beamer 22 | - amsmath 23 | - pgfplots 24 | - minted 25 | - fontenc 26 | - multicol 27 | - booktabs 28 | - adjustbox 29 | 30 | -------------------------------------------------------------------------------- /code/vadd.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: N=50000000 10 | real(kind=8), allocatable :: A(:), B(:), C(:) 11 | integer :: i 12 | real(kind=8) :: start, end 13 | 14 | ! 
Allocate memory 15 | allocate(A(N)) 16 | allocate(B(N)) 17 | allocate(C(N)) 18 | 19 | ! Initilise data 20 | do i = 1, N 21 | A(i) = 1.0_8 22 | B(i) = 2.0_8 23 | C(i) = 0.0_8 24 | end do 25 | 26 | ! Start timer 27 | call wtime(start) 28 | 29 | ! Vector addition 30 | do i = 1, N 31 | C(i) = A(i) + B(i) 32 | end do 33 | 34 | ! Stop timer 35 | call wtime(end) 36 | 37 | ! Print result 38 | write(*,"(A)") "------------------------------------" 39 | write(*,"(A,F10.3)") "runtime: ", end-start 40 | if (any(C .ne. 3.0_8)) then 41 | write(*,"(A)") "WARNING: results incorrect" 42 | end if 43 | write(*,"(A)") "------------------------------------" 44 | 45 | ! Free memory 46 | deallocate(A,B) 47 | 48 | end program vadd 49 | -------------------------------------------------------------------------------- /slides/preamble.tex: -------------------------------------------------------------------------------- 1 | 2 | % Beamer settings 3 | \usecolortheme{rose} 4 | \beamertemplatenavigationsymbolsempty 5 | \setbeamertemplate{footline}[frame number] 6 | 7 | \titlegraphic{% 8 | \includegraphics[height=1cm]{logo-full-colour.png}} 9 | 10 | \addtobeamertemplate{frametitle}{}{% 11 | \begin{tikzpicture}[remember picture,overlay] 12 | \node[anchor=north east,yshift=2pt] at (current page.north east) {\includegraphics[height=1cm]{logo-full-colour.png}}; 13 | \end{tikzpicture}} 14 | 15 | % Packages 16 | \usepackage{amsmath} 17 | 18 | \usepackage{tikz} 19 | \usetikzlibrary{positioning} 20 | \usetikzlibrary{fit} 21 | 22 | \usepackage{pgfplots} 23 | \pgfplotsset{compat=1.16} 24 | \usepgfplotslibrary{fillbetween} 25 | 26 | 27 | \usepackage{minted} 28 | \usepackage[T1]{fontenc} % Required by minted to ensure dollar signs are produced instead of pound (sterling) signs 29 | 30 | \usepackage{multicol} 31 | 32 | \usepackage{booktabs} 33 | 34 | \usepackage{adjustbox} 35 | 36 | % Author 37 | \author{Dr Tom Deakin\\University of Bristol} 38 | 39 | \date{Tuesday 1 December, 2020} 40 | 41 | -------------------------------------------------------------------------------- /code/vadd_paralleldo.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: N=50000000 10 | real(kind=8), allocatable :: A(:), B(:), C(:) 11 | integer :: i 12 | real(kind=8) :: start, end 13 | 14 | ! Allocate memory 15 | allocate(A(N)) 16 | allocate(B(N)) 17 | allocate(C(N)) 18 | 19 | ! Initilise data 20 | do i = 1, N 21 | A(i) = 1.0_8 22 | B(i) = 2.0_8 23 | C(i) = 0.0_8 24 | end do 25 | 26 | ! Start timer 27 | call wtime(start) 28 | 29 | ! Vector addition 30 | !$omp parallel do 31 | do i = 1, N 32 | C(i) = A(i) + B(i) 33 | end do 34 | !$omp end parallel do 35 | 36 | ! Stop timer 37 | call wtime(end) 38 | 39 | ! Print result 40 | write(*,"(A)") "------------------------------------" 41 | write(*,"(A,F10.3)") "runtime: ", end-start 42 | if (any(C .ne. 3.0_8)) then 43 | write(*,"(A)") "WARNING: results incorrect" 44 | end if 45 | write(*,"(A)") "------------------------------------" 46 | 47 | ! 
Free memory 48 | deallocate(A,B) 49 | 50 | end program vadd 51 | -------------------------------------------------------------------------------- /code/tasks.f90: -------------------------------------------------------------------------------- 1 | 2 | subroutine do_c 3 | print *, "Task C starting" 4 | call sleep(1) 5 | print *, "Task C finished" 6 | end subroutine 7 | 8 | subroutine do_d 9 | print *, "Task D starting" 10 | call sleep(1) 11 | print *, "Task D finished" 12 | end subroutine 13 | 14 | subroutine do_e 15 | print *, "Task E starting" 16 | call sleep(1) 17 | print *, "Task E finished" 18 | end subroutine 19 | 20 | subroutine do_b 21 | 22 | print *, "Task B starting" 23 | call sleep(1) 24 | 25 | !$omp task 26 | call do_d 27 | !$omp end task 28 | 29 | !$omp task 30 | call do_e 31 | !$omp end task 32 | 33 | print *, "Task B finished" 34 | 35 | end subroutine 36 | 37 | subroutine do_a 38 | 39 | print *, "Task A starting" 40 | call sleep(1) 41 | 42 | !$omp task 43 | call do_b 44 | !$omp end task 45 | 46 | !$omp task 47 | call do_c 48 | !$omp end task 49 | 50 | print *, "Task A finished" 51 | 52 | end subroutine 53 | 54 | program tasks 55 | 56 | implicit none 57 | 58 | !$omp parallel 59 | !$omp master 60 | call do_a 61 | !$omp end master 62 | !$omp end parallel 63 | 64 | end program 65 | 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tom Deakin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /code/fibonacci.f90: -------------------------------------------------------------------------------- 1 | 2 | recursive integer function fib(n) result(res) 3 | 4 | implicit none 5 | 6 | integer :: n, i, j 7 | 8 | if (n .lt. 2) then 9 | res = n 10 | else 11 | !$omp task shared(i) 12 | i = fib(n-1) 13 | !$omp end task 14 | 15 | !$omp task shared(j) 16 | j = fib(n-2) 17 | !$omp end task 18 | 19 | !$omp taskwait 20 | res = i+j 21 | end if 22 | end function 23 | 24 | program fibonacci 25 | 26 | use timer 27 | 28 | implicit none 29 | 30 | integer :: fib ! Declare function 31 | integer :: num = 40 32 | integer :: res 33 | real(kind=8) :: tic, toc 34 | 35 | ! Start timer 36 | call wtime(tic) 37 | 38 | !$omp parallel 39 | !$omp master 40 | res = fib(num) 41 | !$omp end master 42 | !$omp end parallel 43 | 44 | ! 
Stop timer 45 | call wtime(toc) 46 | 47 | 48 | ! Print result 49 | write(*,"(A)") "------------------------------------" 50 | write(*,"(I0,A,I0)") num, "th Fibonacci is ", res 51 | write(*,"(A,F10.3)") "runtime: ", toc-tic 52 | write(*,"(A)") "------------------------------------" 53 | 54 | end program 55 | 56 | -------------------------------------------------------------------------------- /code/pi.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | do ii = 1, num_steps 25 | x = (ii-0.5_8)*step 26 | sum = sum + (4.0_8/(1.0_8+x*x)) 27 | end do 28 | pi = step * sum 29 | 30 | ! Stop timer 31 | call wtime(end) 32 | 33 | ! Print result 34 | write(*,"(A)") "------------------------------------" 35 | write(*,"(A,F19.16)") "pi is: ", pi 36 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 37 | write(*,"(A,F10.3)") "runtime: ", end-start 38 | write(*,"(A)") "------------------------------------" 39 | 40 | end program pi_main 41 | -------------------------------------------------------------------------------- /slides/99-isambard.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Using Isambard} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | \begin{frame} 15 | \frametitle{Using Isambard (1)} 16 | \begin{enumerate} 17 | \item Go to this webpage to get your account ID (01, 02, ...): \url{https://tinyurl.com/openmp-2020} 18 | \item Log into the Isambard \emph{bastion} node (the gateway to the system)\newline 19 | \mintinline{bash}|ssh br-trainXX@isambard.gw4.ac.uk| 20 | \item Password: \mintinline{bash}|openmpUG20| 21 | \item From the bastion node, log in to Isambard Phase 1\newline 22 | \mintinline{bash}|ssh phase1| 23 | \item Change to the directory containing the exercises\newline 24 | \mintinline{bash}|cd openmp-for-cs| 25 | \end{enumerate} 26 | 27 | \end{frame} 28 | 29 | \begin{frame} 30 | \frametitle{Using Isambard (2)} 31 | \begin{enumerate} 32 | \setcounter{enumi}{5} 33 | \item Build the exercises\newline 34 | \mintinline{bash}|make| 35 | \item Submit a job\newline 36 | \mintinline{bash}|qsub submit_stencil| 37 | \item Check job status\newline 38 | \mintinline{bash}|qstat -u $USER| 39 | \item Check job output\newline 40 | \mintinline{bash}|cat stencil.o9748| 41 | 42 | \end{enumerate} 43 | 44 | \end{frame} 45 | 46 | \end{document} 47 | -------------------------------------------------------------------------------- /code/pi_reduction.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! 
Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel do private(x) reduction(+:sum) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | sum = sum + (4.0_8/(1.0_8+x*x)) 28 | end do 29 | !$omp end parallel do 30 | pi = step * sum 31 | 32 | ! Stop timer 33 | call wtime(end) 34 | 35 | ! Print result 36 | write(*,"(A)") "------------------------------------" 37 | write(*,"(A,F19.16)") "pi is: ", pi 38 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 39 | write(*,"(A,F10.3)") "runtime: ", end-start 40 | write(*,"(A)") "------------------------------------" 41 | 42 | end program pi_main 43 | -------------------------------------------------------------------------------- /code/pi_atomic.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x, x2 ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel do private(x,x2) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | x2 = 4.0_8/(1.0_8+x*x) 28 | !$omp atomic 29 | sum = sum + x2 30 | end do 31 | !$omp end parallel do 32 | 33 | pi = step * sum 34 | 35 | ! Stop timer 36 | call wtime(end) 37 | 38 | ! Print result 39 | write(*,"(A)") "------------------------------------" 40 | write(*,"(A,F19.16)") "pi is: ", pi 41 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 42 | write(*,"(A,F10.3)") "runtime: ", end-start 43 | write(*,"(A)") "------------------------------------" 44 | 45 | end program pi_main 46 | -------------------------------------------------------------------------------- /code/pi_critical.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x, x2 ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! 
main loop 24 | !$omp parallel do private(x,x2) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | x2 = 4.0_8/(1.0_8+x*x) 28 | !$omp critical 29 | sum = sum + x2 30 | !$omp end critical 31 | end do 32 | !$omp end parallel do 33 | 34 | pi = step * sum 35 | 36 | ! Stop timer 37 | call wtime(end) 38 | 39 | ! Print result 40 | write(*,"(A)") "------------------------------------" 41 | write(*,"(A,F19.16)") "pi is: ", pi 42 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 43 | write(*,"(A,F10.3)") "runtime: ", end-start 44 | write(*,"(A)") "------------------------------------" 45 | 46 | end program pi_main 47 | -------------------------------------------------------------------------------- /code/vadd_spmd.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | use omp_lib 7 | 8 | implicit none 9 | 10 | integer :: N=50000000 11 | real(kind=8), allocatable :: A(:), B(:), C(:) 12 | integer :: i 13 | integer :: tid, nthreads 14 | real(kind=8) :: start, end 15 | 16 | ! Allocate memory 17 | allocate(A(N)) 18 | allocate(B(N)) 19 | allocate(C(N)) 20 | 21 | ! Initilise data 22 | do i = 1, N 23 | A(i) = 1.0_8 24 | B(i) = 2.0_8 25 | C(i) = 0.0_8 26 | end do 27 | 28 | ! Start timer 29 | call wtime(start) 30 | 31 | ! Open parallel region 32 | ! tid variable must be private to each thread 33 | !$omp parallel private(tid) 34 | 35 | ! Get thread number 36 | tid = omp_get_thread_num() 37 | 38 | ! Get total number of threads 39 | nthreads = omp_get_num_threads() 40 | 41 | ! Vector addition 42 | ! Share iteration space based on thread ID 43 | do i = 1+(tid*N/nthreads), (tid+1)*N/nthreads 44 | C(i) = A(i) + B(i) 45 | end do 46 | 47 | ! End parallel region 48 | !$omp end parallel 49 | 50 | ! Stop timer 51 | call wtime(end) 52 | 53 | ! Print result 54 | write(*,"(A)") "------------------------------------" 55 | write(*,"(A,F10.3)") "runtime: ", end-start 56 | if (any(C .ne. 3.0_8)) then 57 | write(*,"(A)") "WARNING: results incorrect" 58 | end if 59 | write(*,"(A)") "------------------------------------" 60 | 61 | ! Free memory 62 | deallocate(A,B) 63 | 64 | end program vadd 65 | -------------------------------------------------------------------------------- /code/pi_private.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel private(x) firstprivate(sum) 25 | !$omp do 26 | do ii = 1, num_steps 27 | x = (ii-0.5_8)*step 28 | sum = sum + 4.0_8/(1.0_8+x*x) 29 | end do 30 | !$omp end do 31 | !$omp critical 32 | pi = pi + sum 33 | !$omp end critical 34 | !$omp end parallel 35 | 36 | pi = pi * step 37 | 38 | ! Stop timer 39 | call wtime(end) 40 | 41 | ! 
Print result 42 | write(*,"(A)") "------------------------------------" 43 | write(*,"(A,F19.16)") "pi is: ", pi 44 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 45 | write(*,"(A,F10.3)") "runtime: ", end-start 46 | write(*,"(A)") "------------------------------------" 47 | 48 | end program pi_main 49 | -------------------------------------------------------------------------------- /code/stencil.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | do i = 1, nx 46 | do j = 1, ny 47 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 48 | end do 49 | end do 50 | 51 | ! Swap pointers 52 | Aptr => A 53 | A => Atmp 54 | Atmp => Aptr 55 | 56 | end do 57 | 58 | ! Stop timer 59 | call wtime(toc) 60 | 61 | ! Sum up grid values for rudimentary correctness check 62 | total_end = sum(A(:,:)) 63 | 64 | ! Print result 65 | write(*,"(A)") "------------------------------------" 66 | write(*,"(A,F10.3)") "runtime: ", toc-tic 67 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 68 | write(*,"(A)") "result: Failed" 69 | else 70 | write(*,"(A)") "result: Passed" 71 | end if 72 | write(*,"(A)") "------------------------------------" 73 | 74 | 75 | deallocate(A, Atmp) 76 | 77 | end program stencil 78 | 79 | -------------------------------------------------------------------------------- /code/private.f90: -------------------------------------------------------------------------------- 1 | 2 | program private 3 | 4 | USE omp_lib 5 | 6 | implicit none 7 | 8 | integer :: i ! Loop index 9 | integer :: nthreads ! Number of threads 10 | integer :: N=10 ! Number of iterations 11 | integer :: x=-1 ! Original variable 12 | 13 | write(*,"(A)") "------------------------------------" 14 | 15 | !$omp parallel 16 | nthreads = omp_get_num_threads() 17 | !$omp end parallel 18 | write (*,"(A,I0)") "num threads: ", nthreads 19 | write (*,*) 20 | 21 | write (*,"(A,I0)") "original: x=", x 22 | write (*,*) 23 | 24 | ! Private clause 25 | x=-1 26 | write (*,"(A,I0)") "private:" 27 | write (*,"(1X,A,I0)") "before: x=", x 28 | !$omp parallel do private(x) 29 | do i = 1, N 30 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 31 | x = i 32 | end do 33 | !$omp end parallel do 34 | write (*,"(1X,A,I0)") "after: x=", x 35 | write (*,*) 36 | 37 | ! 
First private clause 38 | x=-1 39 | write (*,"(A,I0)") "firstprivate:" 40 | write (*,"(1X,A,I0)") "before: x=", x 41 | !$omp parallel do firstprivate(x) 42 | do i = 1, N 43 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 44 | x = i 45 | end do 46 | !$omp end parallel do 47 | write (*,"(1X,A,I0)") "after: x=", x 48 | write (*,*) 49 | 50 | ! Last private clause 51 | x=-1 52 | write (*,"(A,I0)") "lastprivate:" 53 | write (*,"(1X,A,I0)") "before: x=", x 54 | !$omp parallel do lastprivate(x) 55 | do i = 1, N 56 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 57 | x = i 58 | end do 59 | !$omp end parallel do 60 | write (*,"(1X,A,I0)") "after: x=", x 61 | 62 | write(*,"(A)") "------------------------------------" 63 | 64 | end program private 65 | -------------------------------------------------------------------------------- /code/stencil_paralleldo.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | !$omp parallel do collapse(2) 46 | do i = 1, nx 47 | do j = 1, ny 48 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 49 | end do 50 | end do 51 | !$omp end parallel do 52 | 53 | ! Swap pointers 54 | Aptr => A 55 | A => Atmp 56 | Atmp => Aptr 57 | 58 | end do 59 | 60 | ! Stop timer 61 | call wtime(toc) 62 | 63 | ! Sum up grid values for rudimentary correctness check 64 | total_end = sum(A(:,:)) 65 | 66 | ! Print result 67 | write(*,"(A)") "------------------------------------" 68 | write(*,"(A,F10.3)") "runtime: ", toc-tic 69 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 70 | write(*,"(A)") "result: Failed" 71 | else 72 | write(*,"(A)") "result: Passed" 73 | end if 74 | write(*,"(A)") "------------------------------------" 75 | 76 | 77 | deallocate(A, Atmp) 78 | 79 | end program stencil 80 | 81 | -------------------------------------------------------------------------------- /code/pi_array.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | use omp_lib 6 | 7 | ! Local variables 8 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 9 | real(kind=8) :: step ! the step size 10 | integer :: ii ! genereric counter 11 | real(kind=8) :: x ! intermediate value 12 | real(kind=8) :: pi = 0.0_8 ! overall estimate 13 | real(kind=8), allocatable :: sum(:) ! variable to store partial sum 14 | real(kind=8) :: start, end ! timers 15 | integer :: nthreads ! number of OpenMP threads 16 | integer :: tid ! 
thread id 17 | 18 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 19 | 20 | ! Get number of OpenMP threads 21 | !$omp parallel 22 | nthreads = omp_get_num_threads() 23 | !$omp end parallel 24 | 25 | allocate(sum(nthreads)) 26 | 27 | ! step size is dependent upon the number of steps 28 | step = 1.0_8/num_steps 29 | 30 | ! Start timer 31 | call wtime(start) 32 | 33 | ! main loop 34 | !$omp parallel private(x,tid) 35 | tid = omp_get_thread_num() 36 | sum(tid+1) = 0.0_8 37 | !$omp do 38 | do ii = 1, num_steps 39 | x = (ii-0.5_8)*step 40 | sum(tid+1) = sum(tid+1) + (4.0_8/(1.0_8+x*x)) 41 | !$omp flush(sum) 42 | end do 43 | !$omp end do 44 | !$omp end parallel 45 | 46 | ! Total partial sums serially 47 | do ii = 1, nthreads 48 | pi = pi + sum(ii) 49 | end do 50 | pi = pi * step 51 | 52 | ! Stop timer 53 | call wtime(end) 54 | 55 | ! Print result 56 | write(*,"(A)") "------------------------------------" 57 | write(*,"(A,F19.16)") "pi is: ", pi 58 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 59 | write(*,"(A,F10.3)") "runtime: ", end-start 60 | write(*,"(A)") "------------------------------------" 61 | 62 | deallocate(sum) 63 | 64 | end program pi_main 65 | -------------------------------------------------------------------------------- /code/stencil_reduction.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end, total 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | total = 0.0_8 46 | !$omp parallel do collapse(2) reduction(+:total) 47 | do i = 1, nx 48 | do j = 1, ny 49 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 50 | total = total + Atmp(i,j) 51 | end do 52 | end do 53 | !$omp end parallel do 54 | 55 | ! Print out total 56 | write(*,"(I0,A,F15.5)") t, ": total=", total 57 | 58 | ! Swap pointers 59 | Aptr => A 60 | A => Atmp 61 | Atmp => Aptr 62 | 63 | end do 64 | 65 | ! Stop timer 66 | call wtime(toc) 67 | 68 | ! Sum up grid values for rudimentary correctness check 69 | total_end = sum(A(:,:)) 70 | 71 | ! Print result 72 | write(*,"(A)") "------------------------------------" 73 | write(*,"(A,F10.3)") "runtime: ", toc-tic 74 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 75 | write(*,"(A)") "result: Failed" 76 | else 77 | write(*,"(A)") "result: Passed" 78 | end if 79 | write(*,"(A)") "------------------------------------" 80 | 81 | 82 | deallocate(A, Atmp) 83 | 84 | end program stencil 85 | 86 | -------------------------------------------------------------------------------- /code/stencil_target.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 
5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end, total 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do j = 0, ny+1 23 | do i = 0, nx+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do j = ny/4, 3*ny/4 31 | do i = nx/4, 3*nx/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Copy data to device 39 | !$omp target enter data map(to: A, Atmp) 40 | 41 | ! Start timer 42 | call wtime(tic) 43 | 44 | ! Loop a number of times 45 | do t = 1, ntimes 46 | 47 | ! Update the stencil 48 | total = 0.0_8 49 | !$omp target map(tofrom:total) 50 | !$omp teams distribute parallel do reduction(+:total) collapse(2) 51 | do j = 1, ny 52 | do i = 1, nx 53 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 54 | total = total + Atmp(i,j) 55 | end do 56 | end do 57 | !$omp end teams distribute parallel do 58 | !$omp end target 59 | 60 | ! Print out total 61 | write(*,"(I0,A,F15.5)") t, ": total=", total 62 | 63 | ! Swap pointers 64 | Aptr => A 65 | A => Atmp 66 | Atmp => Aptr 67 | 68 | end do 69 | 70 | ! Stop timer 71 | call wtime(toc) 72 | 73 | ! Copy data back 74 | !$omp target exit data map(from: A, Atmp) 75 | 76 | ! Sum up grid values for rudimentary correctness check 77 | total_end = sum(A(:,:)) 78 | 79 | ! Print result 80 | write(*,"(A)") "------------------------------------" 81 | write(*,"(A,F10.3)") "runtime: ", toc-tic 82 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 83 | write(*,"(A)") "result: Failed" 84 | else 85 | write(*,"(A)") "result: Passed" 86 | end if 87 | write(*,"(A)") "------------------------------------" 88 | 89 | 90 | deallocate(A, Atmp) 91 | 92 | end program stencil 93 | 94 | -------------------------------------------------------------------------------- /code/stencil_optimised.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Update the stencil 3 | subroutine kernel(nx, ny, A, Atmp, total) 4 | 5 | implicit none 6 | 7 | integer :: nx, ny 8 | real(kind=8) :: A(0:nx+1, 0:ny+1) 9 | real(kind=8) :: Atmp(0:nx+1, 0:ny+1) 10 | real(kind=8) :: total 11 | 12 | integer :: i, j 13 | 14 | total = 0.0_8 15 | !$omp parallel do reduction(+:total) 16 | do j = 1, ny 17 | !$omp simd 18 | do i = 1, nx 19 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 20 | total = total + Atmp(i,j) 21 | end do 22 | !$omp end simd 23 | end do 24 | !$omp end parallel do 25 | 26 | end subroutine kernel 27 | 28 | ! 5 point stencil 29 | program stencil 30 | 31 | use timer 32 | 33 | implicit none 34 | 35 | integer :: nx = 4000 36 | integer :: ny = 4000 37 | integer :: ntimes = 30 38 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 39 | integer :: i, j, t 40 | real(kind=8) :: total_start, total_end, total 41 | real(kind=8) :: tic, toc 42 | 43 | ! Allocate memory 44 | allocate(A(0:nx+1,0:ny+1)) 45 | allocate(Atmp(0:nx+1,0:ny+1)) 46 | 47 | ! Initialise data to zero 48 | do j = 0, ny+1 49 | do i = 0, nx+1 50 | A(i,j) = 0.0_8 51 | Atmp(i,j) = 0.0_8 52 | end do 53 | end do 54 | 55 | ! 
Insert values in centre of grid 56 | do i = nx/4, 3*nx/4 57 | do j = ny/4, 3*ny/4 58 | A(i,j) = 1.0_8 59 | end do 60 | end do 61 | 62 | total_start = sum(A(:,:)) 63 | 64 | ! Start timer 65 | call wtime(tic) 66 | 67 | ! Loop a number of times 68 | do t = 1, ntimes 69 | 70 | ! Update the stencil 71 | call kernel(nx, ny, A, Atmp, total) 72 | 73 | ! Print out total 74 | write(*,"(I0,A,F15.5)") t, ": total=", total 75 | 76 | ! Swap pointers 77 | Aptr => A 78 | A => Atmp 79 | Atmp => Aptr 80 | 81 | end do 82 | 83 | ! Stop timer 84 | call wtime(toc) 85 | 86 | ! Sum up grid values for rudimentary correctness check 87 | total_end = sum(A(:,:)) 88 | 89 | ! Print result 90 | write(*,"(A)") "------------------------------------" 91 | write(*,"(A,F10.3)") "runtime: ", toc-tic 92 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 93 | write(*,"(A)") "result: Failed" 94 | else 95 | write(*,"(A)") "result: Passed" 96 | end if 97 | write(*,"(A)") "------------------------------------" 98 | 99 | 100 | deallocate(A, Atmp) 101 | 102 | end program stencil 103 | 104 | -------------------------------------------------------------------------------- /code/stencil_numa.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Update the stencil 3 | subroutine kernel(nx, ny, A, Atmp, total) 4 | 5 | implicit none 6 | 7 | integer :: nx, ny 8 | real(kind=8) :: A(0:nx+1, 0:ny+1) 9 | real(kind=8) :: Atmp(0:nx+1, 0:ny+1) 10 | real(kind=8) :: total 11 | 12 | integer :: i, j 13 | 14 | total = 0.0_8 15 | !$omp parallel do reduction(+:total) 16 | do j = 1, ny 17 | !$omp simd 18 | do i = 1, nx 19 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 20 | total = total + Atmp(i,j) 21 | end do 22 | !$omp end simd 23 | end do 24 | !$omp end parallel do 25 | 26 | end subroutine kernel 27 | 28 | ! 5 point stencil 29 | program stencil 30 | 31 | use timer 32 | 33 | implicit none 34 | 35 | integer :: nx = 4000 36 | integer :: ny = 4000 37 | integer :: ntimes = 30 38 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 39 | integer :: i, j, t 40 | real(kind=8) :: total_start, total_end, total 41 | real(kind=8) :: tic, toc 42 | 43 | ! Allocate memory 44 | allocate(A(0:nx+1,0:ny+1)) 45 | allocate(Atmp(0:nx+1,0:ny+1)) 46 | 47 | ! Initialise data to zero 48 | !$omp parallel do 49 | do j = 0, ny+1 50 | do i = 0, nx+1 51 | A(i,j) = 0.0_8 52 | Atmp(i,j) = 0.0_8 53 | end do 54 | end do 55 | !$omp end parallel do 56 | 57 | ! Insert values in centre of grid 58 | do i = nx/4, 3*nx/4 59 | do j = ny/4, 3*ny/4 60 | A(i,j) = 1.0_8 61 | end do 62 | end do 63 | 64 | total_start = sum(A(:,:)) 65 | 66 | ! Start timer 67 | call wtime(tic) 68 | 69 | ! Loop a number of times 70 | do t = 1, ntimes 71 | 72 | ! Update the stencil 73 | call kernel(nx, ny, A, Atmp, total) 74 | 75 | ! Print out total 76 | write(*,"(I0,A,F15.5)") t, ": total=", total 77 | 78 | ! Swap pointers 79 | Aptr => A 80 | A => Atmp 81 | Atmp => Aptr 82 | 83 | end do 84 | 85 | ! Stop timer 86 | call wtime(toc) 87 | 88 | ! Sum up grid values for rudimentary correctness check 89 | total_end = sum(A(:,:)) 90 | 91 | ! 
Print result 92 | write(*,"(A)") "------------------------------------" 93 | write(*,"(A,F10.3)") "runtime: ", toc-tic 94 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 95 | write(*,"(A)") "result: Failed" 96 | else 97 | write(*,"(A)") "result: Passed" 98 | end if 99 | write(*,"(A)") "------------------------------------" 100 | 101 | 102 | deallocate(A, Atmp) 103 | 104 | end program stencil 105 | 106 | -------------------------------------------------------------------------------- /slides/07-wrapup.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Wrap up} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | \begin{frame} 15 | \frametitle{OpenMP 5.0} 16 | OpenMP 5 adds features to make writing performance portable programs simpler. 17 | 18 | Highlighting some applicable to target: 19 | \begin{itemize} 20 | \item Loop construct 21 | \item Mappers 22 | \item Unified Shared Memory (USM) 23 | \item Function variants 24 | \item Reverse offload 25 | \item \mintinline{fortran}|OMP_TARGET_OFFLOAD| 26 | \item Reduction variables now implicitly \mintinline{fortran}|map(tofrom)| 27 | \end{itemize} 28 | 29 | \end{frame} 30 | 31 | %------------------------------------------------------------------------------- 32 | \begin{frame}[fragile] 33 | \frametitle{Loop} 34 | \begin{itemize} 35 | \item Assert that the iterations in a loop nest may execute in any order, including concurrently 36 | \item Let the compiler figure our how to best utilize parallel resources 37 | \end{itemize} 38 | 39 | \begin{minted}[]{fortran} 40 | !$omp target 41 | !$omp loop 42 | do i = 1, N 43 | a(i) = b(i) 44 | end do 45 | !$omp end loop 46 | !$omp end target 47 | \end{minted} 48 | 49 | \end{frame} 50 | %------------------------------------------------------------------------------- 51 | 52 | \begin{frame}[fragile] 53 | \frametitle{Unified shared memory} 54 | Code requires specific features, e.g. shared memory between host and devices. 55 | 56 | \begin{minted}[]{fortran} 57 | 58 | real(kind=8), dimension(:), allocatable :: A 59 | allocate(A(1024)) 60 | 61 | !$omp requires unified_shared_memory 62 | 63 | !$omp target 64 | call do_something_with_A(A) 65 | !$omp end target 66 | \end{minted} 67 | 68 | No map clauses. Data is shared between the host and device. 69 | 70 | \end{frame} 71 | 72 | %------------------------------------------------------------------------------- 73 | \begin{frame} 74 | \frametitle{OpenMP resources} 75 | \begin{itemize} 76 | \item Two brilliant books from MIT Press: 77 | \begin{itemize} 78 | \item The OpenMP Common Core: Making OpenMP Simple Again --- Tim Mattson, Yun (Helen) Ye and Alice Koniges. 79 | \item Using OpenMP - The Next Steps --- Ruud van de Pas, Eric Stotzer and Christian Terboven. 80 | \end{itemize} 81 | \item OpenMP website: \url{https://www.openmp.org} 82 | \begin{itemize} 83 | \item The specification (not for the faint hearted). 84 | \item Download summary cards. 85 | \item List of compiler support. 86 | \item Example code for all the directives. 
87 | \item List of books: \url{https://www.openmp.org/resources/openmp-books/} 88 | \end{itemize} 89 | 90 | \end{itemize} 91 | \end{frame} 92 | %------------------------------------------------------------------------------- 93 | \end{document} 94 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | # OpenMP codes 2 | 3 | This project contains a number of OpenMP examples. 4 | 5 | A Fortran timing module (itself an interface to a C time call) is also provided as a utility to aid in getting wall clock time for serial Fortran programs. 6 | 7 | ## Contents 8 | - [Compiling the code](#compiling-the-code) 9 | - [Vector addition](#vector-addition) 10 | - [5-point stencil](#5-point-stencil) 11 | - [Pi](#pi) 12 | - [Private](#private) 13 | - [Fibonacci](#fibonacci) 14 | - [Jacobi](#jacobi) 15 | - [Utility timing routines](#utility-timing-routines) 16 | 17 | ## Compiling the code 18 | The provided `Makefile` will build all of the provided code. 19 | The default compiler is `gfortran`. 20 | 21 | To use your own compiler, edit the `FTN` variable in the `Makefile`. 22 | For example, set `FTN=ifort` to use the Intel Fortran compiler. 23 | 24 | Additional compiler flags can be set using the `FFLAGS` variable in the `Makefile`. 25 | 26 | The OpenMP library is set using the `LIBS` variable in the `Makefile`. 27 | 28 | Run `make clean` to clear away the built binaries and partial build files. 29 | 30 | ## Vector Addition 31 | 32 | Serial and parallel versions of the simple vector add program: `C=A+B`. 33 | Both a SPMD and a `parallel do` parallel version are provided (as solutions). 34 | 35 | ## 5-point stencil 36 | 37 | Serial and parallel versions of a simple 5-point stencil on a rectangular grid. 38 | The value in each cell is computed as the average (mean) of itself and north, south, east and west neighbours. 39 | The stencil is applied to the grid a number of times. 40 | 41 | ## Pi 42 | 43 | This code implements the integration of `4/(1+x*x)` using the trapezoidal rule to estimate pi. 44 | 45 | A number of implementations are given, and should be viewed in order: 46 | 47 | 1. pi: the serial version 48 | 2. critical: an initial parallel version, using a critical region to safeguard sum 49 | 3. atomic: parallel version, using an atomic to safeguard sum 50 | 4. array: parallel version, using an array of partial sums, one per thread 51 | 5. private: parallel version, using a private sum to each thread, totalled with a critical 52 | 6. reduction: parallel version using OpenMP reduction 53 | 54 | ## Private 55 | 56 | This code is a simple example to show how different private data sharing clauses change the data environment of each thread. 57 | 58 | 59 | ## Fibonacci 60 | An implementation of a recursive algorithm to calculate Fibonacci numbers using OpenMP tasks. 61 | 62 | 63 | ## Jacobi 64 | 65 | This code implements the iterative Jacobi method to solve a system of linear equations. 66 | See the [Wikipedia page](https://en.wikipedia.org/wiki/Jacobi_method) for a full description of the Jacobi method. 67 | 68 | The program can be run without any arguments to solve a default problem. 69 | The `-n` and `-i` arguments can be used to control the matrix size and maximum number of iterations. 70 | For example, to solve for a 500x500 matrix, use the following command: 71 | 72 | ./jacobi -n 500 73 | 74 | Use `--help` to see a full description for all of the command-line arguments. 
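For example, a fuller invocation might look like the following (an illustrative sketch: `-n`, `-i` and `-c` are the short forms of `--norder`, `--iterations` and `--convergence` handled by `parse_arguments` in `jacobi.f90`, and the values shown are arbitrary). The standard `OMP_NUM_THREADS` environment variable only takes effect once you have added OpenMP directives to the solver:

    OMP_NUM_THREADS=4 ./jacobi -n 1000 -i 5000 -c 0.00001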
75 | 76 | ### Sample runtimes 77 | 78 | Here are the runtimes that we achieve with the starting code for a few different matrix sizes. 79 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz). 80 | 81 | | Matrix size | Solver runtime | Iterations | Solution error | 82 | | ----------- | --------------- | ---------- | ---------------- | 83 | | 500 | 0.331 seconds | 1511 | 0.0248609 | 84 | | 1000 | 4.858 seconds | 2883 | 0.0499393 | 85 | | 2000 | 170 seconds | 5445 | 0.0999166 | 86 | | 4000 | 1671 seconds | 10233 | 0.1998391 | 87 | 88 | ## Utility timing routines 89 | The `timer.f90` and `wtime.c` files provide a simple timing routine to use for all examples. 90 | The time is recorded in C using `gettimeofday()`, and a Fortran interface is provided. 91 | This was provided so that the serial codes can use a simple timing library. 92 | Users should use the OpenMP `omp_get_wtime()` API call for their parallel codes. 93 | 94 | -------------------------------------------------------------------------------- /slides/00-prelim.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Preliminaries} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | % \begin{frame} 14 | % \frametitle{Audience} 15 | 16 | % \begin{itemize} 17 | % \item Teaches OpenMP 4.5 and 5.0 in a seminar style. 18 | % \item 6 lecture topics, with exercises and solutions. 19 | % \item Designed for Computational Scientists familiar with Fortran and MPI programming. 20 | 21 | % \end{itemize} 22 | 23 | % Download code (and slides) from: 24 | % \url{https://github.com/UoB-HPC/openmp-for-cs} 25 | 26 | % \end{frame} 27 | %------------------------------------------------------------------------------- 28 | 29 | \begin{frame} 30 | \frametitle{Introduction} 31 | 32 | \begin{itemize} 33 | \item Today: Learn OpenMP 4.5 (and maybe some 5.0). 34 | \item We will cover a lot of material! 35 | \item This is a hands-on tutorial! 36 | \item Mixture of lectures and exercises. 37 | % \item Exercises designed to try programming OpenMP. 38 | \item Experiment and have fun with them! 39 | \item Solutions provided, but only look as last resort. 40 | \item Assume knowledge of basic Fortran; parallel programming with MPI useful. 41 | \end{itemize} 42 | \end{frame} 43 | 44 | %------------------------------------------------------------------------------- 45 | 46 | \begin{frame} 47 | \frametitle{Materials} 48 | \begin{block}{Materials} 49 | Download code (and slides) from: 50 | \url{https://github.com/UoB-HPC/openmp-for-cs} 51 | \end{block} 52 | \end{frame} 53 | %------------------------------------------------------------------------------- 54 | 55 | \begin{frame} 56 | \frametitle{GW4 Isambard} 57 | \begin{columns} 58 | \begin{column}{0.7\framewidth} 59 | \begin{itemize} 60 | \item UK Tier-2 Supercomputer. 61 | \item Collaboration between GW4 Alliance, UK Met Office, Cray, Arm and EPSRC. 62 | \item 21,000+ Armv8 cores. 63 | \item Collection of CPUs/GPUs from different vendors. 64 | \item \textbf{Today:} using the Intel Xeon 2x18-core Broadwell and NVIDIA P100 nodes. 
65 | \end{itemize} 66 | \end{column} 67 | \begin{column}{0.3\framewidth} 68 | \includegraphics[width=\textwidth]{isambard.jpeg} 69 | \end{column} 70 | \end{columns} 71 | 72 | Thanks to Simon McIntosh-Smith and Bristol for supporting today's tutorial with time on Isambard. 73 | 74 | \end{frame} 75 | 76 | %------------------------------------------------------------------------------- 77 | 78 | 79 | \begin{frame} 80 | \frametitle{Agenda} 81 | 82 | \textbf{Part One: CPUs} 83 | \begin{description} 84 | \item[09:30--09:40] Introduction. 85 | \item[09:40--10:10] Parallel worksharing. 86 | \item[10:10--10:35] Exercise 1: Parallel stencil (two-ways). 87 | \item[10:35--11:00] Data sharing. 88 | \item[11:00--11:15] Coffee Break. 89 | \item[11:15--11:35] Exercise 2: Parallel convergence. 90 | \item[11:35--12:10] Vectorisation and NUMA. 91 | \item[12:10--12:30] Exercise 3: Optimising stencil. 92 | \end{description} 93 | 94 | \textbf{Lunch break (12:30--13:30)} 95 | \end{frame} 96 | 97 | \begin{frame} 98 | \frametitle{Agenda} 99 | \textbf{Lunch break (12:30--13:30)} 100 | The Zoom session is open: feel free to continue on the morning exercises and ask questions in the Q and A. 101 | 102 | \textbf{Part Two: GPUs} 103 | \begin{description} 104 | \item[13:30--13:35] Welcome back. 105 | \item[13:35--14:10] Transferring execution and data movement. 106 | \item[14:10--14:35] Exercise 4: Stencil on a GPU. 107 | \item[14:35--15:00] Target Parallelism. 108 | \item[15:00--15:15] Coffee Break. 109 | \item[15:15--15:40] Optimising data movement. 110 | \item[15:40--16:25] Exercise 5: Optimising stencil on a GPU. 111 | \item[16:25--16:30] Wrap up. 112 | \end{description} 113 | \end{frame} 114 | 115 | %------------------------------------------------------------------------------- 116 | 117 | % \begin{frame} 118 | % \frametitle{Exercises} 119 | % \begin{itemize} 120 | % \item This is a hands-on course! 121 | % \item Exercises will be set for you to try programming OpenMP yourselves. 122 | % \item Sample solutions also provided. 123 | % \item All the exercises will be in Fortran. 124 | % \end{itemize} 125 | 126 | % \end{frame} 127 | 128 | %------------------------------------------------------------------------------- 129 | % \section{Outline} 130 | % \begin{frame} 131 | % \frametitle{Course Outline} 132 | % Organised as 6 sessions teaching OpenMP plus top-tips for getting good performance. 133 | % \begin{enumerate} 134 | % \item OpenMP overview 135 | % \item Data sharing and reductions 136 | % \item Vectorisation and code optimisations 137 | % \item NUMA and MPI interoperability 138 | % \item GPU programming with OpenMP 139 | % \item Tasks and Tools 140 | % \end{enumerate} 141 | % \end{frame} 142 | 143 | %------------------------------------------------------------------------------- 144 | \begin{frame} 145 | \frametitle{Thanks} 146 | Thanks go to the following authors, whose own OpenMP tutorials have inspired this one: 147 | \begin{itemize} 148 | \item Tim Mattson (Intel) 149 | \item Alice Koniges (Berkeley Lab/NERSC) 150 | \item Simon McIntosh-Smith and the HPC team (UoBristol) 151 | \item Gethin Williams (UoBristol) 152 | \item and many others 153 | \end{itemize} 154 | \end{frame} 155 | %------------------------------------------------------------------------------- 156 | 157 | \end{document} 158 | -------------------------------------------------------------------------------- /code/jacobi.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! 
Implementation of the iterative Jacobi method. 3 | ! 4 | ! Given a known, diagonally dominant matrix A and a known vector b, we aim to 5 | ! to find the vector x that satisfies the following equation: 6 | ! 7 | ! Ax = b 8 | ! 9 | ! We first split the matrix A into the diagonal D and the remainder R: 10 | ! 11 | ! (D + R)x = b 12 | ! 13 | ! We then rearrange to form an iterative solution: 14 | ! 15 | ! x' = (b - Rx) / D 16 | ! 17 | ! More information: 18 | ! -> https://en.wikipedia.org/wiki/Jacobi_method 19 | ! 20 | 21 | ! Module which contains the Jacobi solver subrountine 22 | module solve_mod 23 | 24 | contains 25 | 26 | ! Solve Ax=b according to the Jacobi method 27 | subroutine solve(N, A, b, x, xtmp, itr, MAX_ITERATIONS, CONVERGENCE_THRESHOLD) 28 | 29 | implicit none 30 | 31 | ! Input variables 32 | integer :: N ! Matrix order 33 | real(kind=8) :: A(N,N) ! The matrix 34 | real(kind=8) :: b(N) ! The right hand side vector 35 | real(kind=8), pointer :: x(:) ! Initial solution 36 | real(kind=8), pointer :: xtmp(:) ! Next solution 37 | integer :: itr ! Iterations to solve 38 | integer :: MAX_ITERATIONS ! Iteration limit 39 | real(kind=8) :: CONVERGENCE_THRESHOLD ! Convergence criteria 40 | 41 | ! Local variables 42 | real(kind=8), pointer :: ptrtmp(:) ! Used for pointer swapping 43 | integer :: row, col ! Matrix index 44 | real(kind=8) :: dot 45 | real(kind=8) :: diff, sqdiff=huge(0.0_8) 46 | 47 | ! Loop until converged or maximum iterations reached 48 | itr = 0 49 | do while (itr .lt. MAX_ITERATIONS .and. sqrt(sqdiff) .gt. CONVERGENCE_THRESHOLD) 50 | ! Perfom Jacobi iteration 51 | do row = 1, N 52 | dot = 0.0_8 53 | do col = 1, N 54 | if (row .ne. col) then 55 | dot = dot + (A(row,col) * x(col)) 56 | end if 57 | end do 58 | xtmp(row) = (b(row) - dot) / A(row,row) 59 | end do 60 | 61 | ! Swap pointers 62 | ptrtmp => x 63 | x => xtmp 64 | xtmp => ptrtmp 65 | 66 | ! Check for convergence 67 | sqdiff = 0.0_8 68 | do row = 1, N 69 | diff = xtmp(row) - x(row) 70 | sqdiff = sqdiff + (diff * diff) 71 | end do 72 | 73 | itr = itr + 1 74 | end do 75 | 76 | end subroutine solve 77 | end module solve_mod 78 | 79 | ! Main program 80 | program jacobi 81 | 82 | use timer 83 | use solve_mod ! Include solver (above) 84 | 85 | implicit none 86 | 87 | ! Solver settings 88 | integer :: MAX_ITERATIONS=20000 89 | real(kind=8) :: CONVERGENCE_THRESHOLD=0.0001 90 | 91 | ! Timers 92 | real(kind=8) :: total_start, total_end 93 | real(kind=8) :: solve_start, solve_end 94 | 95 | ! Matrix size 96 | integer :: N=1000 97 | 98 | ! Data arrays 99 | real(kind=8), allocatable :: A(:,:) ! The matrix 100 | real(kind=8), allocatable :: b(:) ! The right hand size vector 101 | real(kind=8), pointer :: x(:) ! Initial solution 102 | real(kind=8), pointer :: xtmp(:) ! Temporary solution storage 103 | integer :: itr ! Iteration count 104 | 105 | ! Local variables 106 | integer :: row, col 107 | real(kind=8) :: rowsum, value 108 | real(kind=8) :: err, tmp 109 | 110 | ! Read in any command line arguments which set problem variables 111 | call parse_arguments(MAX_ITERATIONS, CONVERGENCE_THRESHOLD, N) 112 | 113 | ! Allocate memory 114 | allocate(A(N,N)) 115 | allocate(b(N)) 116 | allocate(x(N)) 117 | allocate(xtmp(N)) 118 | 119 | ! 
Print header 120 | write(*,"(A)") "------------------------------------" 121 | write(*,"(A,I0,A,I0)") "Matrix size: ", N, " x ", N 122 | write(*,"(A,I0)") "Maximum iterations: ", MAX_ITERATIONS 123 | write(*,"(A,F7.5)") "Convergence threshold: ", CONVERGENCE_THRESHOLD 124 | write(*,"(A)") "------------------------------------" 125 | write(*,*) 126 | 127 | ! Start the program timer 128 | call wtime(total_start) 129 | 130 | ! Initialize data randomly 131 | ! A needs to be a diagonally dominant square matrix, so diagonal entries are biased 132 | do row = 1, N 133 | rowsum = 0.0_8 134 | do col = 1, N 135 | call random_number(value) 136 | A(row,col) = value 137 | rowsum = rowsum + value 138 | end do 139 | A(row,row) = A(row,row) + rowsum 140 | call random_number(b(row)) 141 | x(row) = 0.0_8 142 | end do 143 | 144 | ! Run Jacobi solver 145 | call wtime(solve_start) 146 | call solve(N, A, b, x, xtmp, itr, MAX_ITERATIONS, CONVERGENCE_THRESHOLD) 147 | call wtime(solve_end) 148 | 149 | ! Check error of final solution 150 | err = 0.0_8 151 | do row = 1, N 152 | tmp = 0.0_8 153 | do col = 1, N 154 | tmp = tmp + (A(row,col) * x(col)) 155 | end do 156 | tmp = b(row) - tmp 157 | err = err + (tmp*tmp) 158 | end do 159 | err = sqrt(err) 160 | 161 | ! Stop the program timer 162 | call wtime(total_end) 163 | 164 | ! Print results 165 | write(*,"(A,F13.7)") "Solution error = ", err 166 | write(*,"(A,I0)") "Iterations = ", itr 167 | write(*,"(A,F10.3)") "Total runtime = ", total_end-total_start 168 | write(*,"(A,F10.3)") "Solver runtime = ", solve_end-solve_start 169 | if (itr .eq. MAX_ITERATIONS) write(*,"(A)") "WARNING: solution did not converge" 170 | write(*,"(A)") "------------------------------------" 171 | 172 | ! Free memory 173 | deallocate(A, b, x, xtmp) 174 | 175 | end program jacobi 176 | 177 | ! Parse the command line arguments, setting the problem size, etc. 178 | subroutine parse_arguments(MAX_ITERATIONS, CONVERGENCE_THRESHOLD, N) 179 | 180 | implicit none 181 | 182 | integer :: MAX_ITERATIONS 183 | real(kind=8) :: CONVERGENCE_THRESHOLD 184 | integer :: N 185 | 186 | character(len=32) :: arg 187 | 188 | integer :: i=1 189 | integer :: err 190 | 191 | do while (i .le. command_argument_count()) 192 | call get_command_argument(i, arg) 193 | arg = trim(arg) 194 | 195 | if ("--convergence" .eq. arg .or. & 196 | "-c" .eq. arg) then 197 | i = i + 1 198 | call get_command_argument(i, arg, status=err) 199 | if (err .ne. 0) then 200 | write (*,*) "Error: no convergence threshold given" 201 | stop 202 | end if 203 | read(arg,*) CONVERGENCE_THRESHOLD 204 | 205 | else if ("--iterations" .eq. arg .or. & 206 | "-i" .eq. arg) then 207 | i = i + 1 208 | call get_command_argument(i, arg, status=err) 209 | if (err .ne. 0) then 210 | write (*,*) "Error: no max iterations given" 211 | stop 212 | end if 213 | read(arg,*) MAX_ITERATIONS 214 | 215 | else if ("--norder" .eq. arg .or. & 216 | "-n" .eq. arg) then 217 | i = i + 1 218 | call get_command_argument(i, arg, status=err) 219 | if (err .ne. 0) then 220 | write (*,*) "Error: no matrix order given" 221 | stop 222 | end if 223 | read(arg,*) N 224 | 225 | else if ("--help" .eq. 
arg) then 226 | write(*,"(A)") "Usage: ./jacobi [OPTIONS]" 227 | write(*,*) 228 | write(*,"(A)") "Options:" 229 | write(*,"(2X,A)") "-h --help Print this message" 230 | write(*,"(2X,A)") "-c --convergence C Set convergence threshold" 231 | write(*,"(2X,A)") "-i --iterations I Set maximum number of iterations" 232 | write(*,"(2X,A)") "-n --norder N Set maxtrix order" 233 | write(*,*) 234 | stop 235 | 236 | else 237 | write (*,"(A,A)") "Unrecognized argument (try '--help'): ", arg 238 | stop 239 | end if 240 | 241 | i = i + 1 242 | end do 243 | end subroutine parse_arguments 244 | -------------------------------------------------------------------------------- /slides/04-hybrid.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{4: Combining MPI and OpenMP} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \section{Outline} 14 | \begin{frame} 15 | \frametitle{Outline} 16 | 17 | \begin{itemize} 18 | \item Quick recap 19 | \item Calculating memory bandwidth for the 5-point stencil code 20 | \end{itemize} 21 | 22 | \vfill 23 | 24 | Programming beyond a single multi-core CPU: 25 | \begin{itemize} 26 | \item Non-uniform Memory Access 27 | \item Thread affinity in OpenMP 28 | \item Combining MPI with OpenMP 29 | \end{itemize} 30 | \end{frame} 31 | 32 | %------------------------------------------------------------------------------- 33 | \section{Recap} 34 | \begin{frame} 35 | \frametitle{Recap} 36 | 37 | We've already come a long way! 38 | 39 | \begin{itemize} 40 | \item Parallelise loops with OpenMP: \mintinline{fortran}|!$omp parallel do|. 41 | \item Data sharing clauses. 42 | \item Synchronisation with barriers, atomics and \mintinline{fortran}|critical| regions. 43 | \item Reductions with the \mintinline{fortran}|reduction| clause. 44 | \item The cache hierarchy. 45 | \item Performance analysis and the Roofline model. 46 | \item Vectorisation along with the OpenMP \mintinline{fortran}|simd| construct. 47 | \item Optimisations for memory access. 48 | \end{itemize} 49 | 50 | \end{frame} 51 | 52 | %------------------------------------------------------------------------------- 53 | \begin{frame}[fragile] 54 | \frametitle{Previous exercise} 55 | 56 | Vectorise and optimise memory access patterns of your parallel 5-point stencil code: 57 | \begin{minted}[frame=single,breaklines,fontsize=\scriptsize]{fortran} 58 | !$omp parallel do reduction(+:total) 59 | do j = 1, ny 60 | !$omp simd 61 | do i = 1, nx 62 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2 63 | total = total + Atmp(i,j) 64 | end do 65 | !$omp end simd 66 | end do 67 | !$omp end parallel do 68 | \end{minted} 69 | 70 | \begin{itemize} 71 | \item Swapped loops to ensure stride-1 access pattern. 72 | \item Removed division! 73 | \item Use \mintinline{fortran}|simd| construct on inner loop (removing \mintinline{fortran}|collapse| clause). 74 | \item Checked vectorisation report: assume sizes arrays cause issue, so move kernel into \mintinline{fortran}|subroutine|. 75 | \end{itemize} 76 | 77 | \end{frame} 78 | 79 | %------------------------------------------------------------------------------- 80 | \begin{frame} 81 | \frametitle{Calculating memory bandwidth} 82 | Is your 5-point stencil code \emph{fast}? 
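%------------------------------------------------------------------------------
% A commented (not displayed) sketch of how the bandwidth figure derived below
% might be computed in the exercise driver. The variable names (ntimes, nx, ny,
% tic, toc, bytes_moved, bandwidth) are assumptions for illustration, not taken
% from the course code; wtime (or omp_get_wtime) supplies the timestamps.
%
% call wtime(tic)
% do t = 1, ntimes
%   ... ! 5-point stencil kernel
% end do
% call wtime(toc)
% ! Perfect-cache model: A read once and Atmp written once per iteration,
% ! 8 bytes per double-precision element.
% bytes_moved = real(ntimes,8) * 2.0_8 * real(nx,8) * real(ny,8) * 8.0_8
% bandwidth   = bytes_moved / (toc - tic)   ! bytes/second
%------------------------------------------------------------------------------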
83 | 84 | \pause 85 | 86 | Calculate memory bandwidth of the \emph{kernel} as a whole: 87 | \begin{itemize}[<+->] 88 | \item Assume a ``perfect cache'' model: once you read a memory location, it's been cached and further reads are ``free'' within the kernel. 89 | \item All of \mintinline{fortran}|A| array is read: $nx \times ny$ reads. 90 | \item All of \mintinline{fortran}{Atmp} array is written: $nx \times ny$ reads. 91 | \item Total memory moved: $2 \times nx \times ny \times 8$ bytes data moved (double precision) \emph{per iteration}. 92 | \item Memory bandwidth: $\frac{ntimes \times 2 \times nx \times ny \times 8}{runtime}$ bytes/second. 93 | \end{itemize} 94 | 95 | 96 | \end{frame} 97 | 98 | %------------------------------------------------------------------------------- 99 | \begin{frame} 100 | \frametitle{Achieved memory bandwidth} 101 | 102 | Results on dual-socket Intel Xeon E5-2680 v4 @ 2.40GHz, 14 cores/socket. 103 | Compiled with Intel 2018 compiler, {\tt -O3 -xHost}. 104 | 105 | \vfill 106 | 107 | Set $nx=ny=20,000$ so arrays are 3.2~GB. Set $ntimes=30$. Removed \mintinline{fortran}|write| statement. Taken best of 5 runs. 108 | 109 | \vfill 110 | 111 | \pause 112 | Theoretical peak bandwidth\footnote{\url{https://ark.intel.com/products/91754/Intel-Xeon-Processor-E5-2680-v4-35M-Cache-2_40-GHz}}: $2 \times 76.8 \text{GB/s} = 153.6 \text{GB/s}$. \\ 113 | STREAM Triad: 129.0~GB/s (84\% theoretical peak). 114 | 115 | \pause 116 | \begin{table} 117 | \begin{tabular}{ccc} 118 | \toprule 119 | Version & Runtime (s) & Memory bandwidth (GB/s)\\ 120 | \midrule 121 | Initial parallel reduction & 25.667 & 7.48 \\ 122 | Swap loops + vectorise & 4.876 & 39.38 \\ 123 | \bottomrule 124 | \end{tabular} 125 | \end{table} 126 | 127 | Achieving 30.5\% of STREAM memory bandwidth. 128 | 129 | \end{frame} 130 | 131 | %------------------------------------------------------------------------------- 132 | \section{NUMA} 133 | \begin{frame} 134 | \frametitle{NUMA Architecture} 135 | 136 | Recall this cartoon of a dual-socket, shared memory system: 137 | \begin{center} 138 | \begin{tikzpicture} 139 | % Draw 4 cores for socket 0 140 | \draw (0,0) rectangle (1,1); 141 | \draw (1,0) rectangle (2,1); 142 | \draw (0,1) rectangle (1,2); 143 | \draw (1,1) rectangle (2,2); 144 | 145 | % Draw 4 cores for socket 1 146 | \draw (3,0) rectangle (4,1); 147 | \draw (4,0) rectangle (5,1); 148 | \draw (3,1) rectangle (4,2); 149 | \draw (4,1) rectangle (5,2); 150 | 151 | % Draw large memory 152 | \draw (-0.5,3) rectangle (5.5,4); 153 | \draw (2.5,3.5) node {Memory}; 154 | 155 | % Connect sockets to memory 156 | \draw (1,2) -- (1,3); 157 | \draw (4,2) -- (4,3); 158 | \draw[dashed] (2,1) -- (3,1); % QPI 159 | 160 | \end{tikzpicture} 161 | \end{center} 162 | 163 | \emph{All} threads (each running on a core) can access the same memory. 164 | 165 | \end{frame} 166 | %------------------------------------------------------------------------------- 167 | 168 | \begin{frame} 169 | \frametitle{NUMA Architecture} 170 | \begin{itemize} 171 | \item In reality on a dual-socket system each \emph{socket} is physically connected to half of the memory. 172 | \item Still shared memory: all cores can access all the memory. 173 | \item A core in the first socket wanting memory attached to the other socket must: 174 | \begin{itemize} 175 | \item Go via the socket-to-socket interconnect. 176 | \item Access memory via the other socket's memory controllers. 
177 | \end{itemize} 178 | \item Accessing memory from other socket is slower than access from own socket. 179 | \end{itemize} 180 | \begin{center} 181 | \resizebox{!}{3.5cm}{ 182 | \begin{tikzpicture} 183 | % Draw 4 cores for socket 0 184 | \foreach \i in {0,1,3,4} { 185 | \foreach \j in {0, 1} { 186 | \draw (\i,\j) rectangle (\i+1,\j+1); 187 | } 188 | } 189 | 190 | % Draw sockets around cores 191 | \draw (-0.2, -0.2) rectangle (2.2, 2.2); 192 | \draw (2.8, -0.2) rectangle (5.2, 2.2); 193 | 194 | % Draw large memory 195 | \draw (-0.5,3) rectangle (2.3,4); 196 | \draw (2.7,3) rectangle (5.5,4); 197 | \draw[dashed] (-0.7,2.8) rectangle (5.7,4.2); 198 | 199 | % Connect sockets to memory 200 | \draw (1,2.2) -- (1,3); 201 | \draw (4,2.2) -- (4,3); 202 | \draw[dashed] (2.2,1) -- (2.8,1); % QPI 203 | 204 | % Show memory shared 205 | \pause 206 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 207 | \draw (3.5,1.5) node {Read}; 208 | \pause 209 | \draw[->,red,thick] (0.7,3.2) -- (0.7,2.1) -- (2.1,2.1) -- (2.1,1.1) -- (2.9,1.1) -- (3.5,1.2); 210 | 211 | \end{tikzpicture} 212 | } 213 | \end{center} 214 | \end{frame} 215 | 216 | %------------------------------------------------------------------------------- 217 | \begin{frame} 218 | \frametitle{Memory allocation} 219 | \begin{itemize} 220 | \item What happens when you run \mintinline{fortran}|allocate(A(1:N))|? 221 | \pause 222 | \item Allocating memory does not necessarily allocate memory! 223 | \item Memory is allocated when it's first used (i.e. \mintinline{fortran}|A(i) = 1.0|), one \emph{page} at a time. 224 | \item OS tends to use a \emph{first touch policy}. 225 | \item Memory is allocated in the closest NUMA region to the thread that first touches the data. 226 | \item Ideally want threads to use data in local NUMA region to reduce socket-to-socket interconnect transfers. 227 | \end{itemize} 228 | \end{frame} 229 | 230 | %------------------------------------------------------------------------------- 231 | \subsection{First touch} 232 | \begin{frame}[fragile] 233 | \frametitle{Taking advantage of first touch} 234 | Parallelising your data initialisation routine might mean your main loops go faster! 235 | 236 | 237 | \begin{minted}[fontsize=\small,linenos,frame=single]{fortran} 238 | ! Allocate and initialise vectors 239 | allocate(A(N), B(N), C(N)) 240 | !$omp parallel do 241 | do i = 1, N 242 | A(i) = 1.0 243 | B(i) = 2.0 244 | C(i) = 0.0 245 | end do 246 | !$omp end parallel do 247 | 248 | ! Vector add 249 | !$omp parallel do 250 | do i = 1, N 251 | C(i) = A(i) + B(i) 252 | end do 253 | !$omp end parallel do 254 | \end{minted} 255 | 256 | \end{frame} 257 | 258 | %------------------------------------------------------------------------------- 259 | \begin{frame} 260 | \frametitle{NUMA-aware} 261 | \begin{itemize} 262 | \item Parallelise your initialisation routines the same way you parallelise the main loops. 263 | \item This means each thread touches the same data in initialisation and compute. 264 | \item Should reduce the number of remote memory accesses needed and improve run times. 265 | \item But, OS is allowed to move threads around cores, and between sockets. 266 | \item This will mess up your NUMA aware code! 267 | \end{itemize} 268 | \end{frame} 269 | 270 | %------------------------------------------------------------------------------- 271 | \section{Thread affinity} 272 | \begin{frame} 273 | \frametitle{Pinning threads} 274 | \begin{itemize} 275 | \item OpenMP gives you the controls to pin threads to specific cores. 
276 | \item Exposed as \emph{places} and \emph{thread pinning policy} to those places. 277 | \item By default there is one place consisting of all the cores. 278 | \item Use the \mintinline{bash}|OMP_PROC_BIND| environment variable to set pinning for all \mintinline{fortran}|parallel| regions. 279 | \item Can use the \mintinline{bash}|proc_bind| clause for control of specific regions, but advise against this. 280 | \end{itemize} 281 | \end{frame} 282 | 283 | %------------------------------------------------------------------------------- 284 | \begin{frame} 285 | \frametitle{OMP\_PROC\_BIND} 286 | \begin{itemize} 287 | \item \mintinline{bash}|OMP_PROC_BIND=false|: Often the default; threads may move! \mintinline{fortran}|proc_bind| clauses ignored. 288 | \item \mintinline{bash}|OMP_PROC_BIND=true|: Threads won't move, and follow \mintinline{fortran}|proc_bind| clauses or else the implementation default pinning. 289 | \item \mintinline{bash}|OMP_PROC_BIND=master|: Threads pinned to same place as master thread. 290 | \item \mintinline{bash}|OMP_PROC_BIND=close|: Threads are assigned to places close to the master thread. 291 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to core 0; thread 1 will pin to core 1; etc 292 | \item \mintinline{bash}|OMP_PROC_BIND=spread|: Threads are assigned to places ``sparsely''. 293 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to socket 0 core 0; thread 1 will pin to socket 1 core 0; thread 2 will pin to socket 0 core 1; etc. 294 | \end{itemize} 295 | \end{frame} 296 | 297 | %------------------------------------------------------------------------------- 298 | \begin{frame} 299 | \frametitle{Places} 300 | \begin{itemize} 301 | \item The affinity (policy) defines how threads are assigned to places. 302 | \item Places allow you to divide up the hardware resource, so that threads can be assigned to them. 303 | \item Default: one place with all cores. 304 | \item Use \mintinline{bash}|OMP_PLACES| environment variable to control. 305 | \item \mintinline{bash}|OMP_PLACES=thread|: each place is a single hardware thread. 306 | \item \mintinline{bash}|OMP_PLACES=cores|: each place is a single core (containing one or more hardware threads). 307 | \item \mintinline{bash}|OMP_PLACES=sockets|: each place contains the cores of a single socket. 308 | \item Can also use list notation: \mintinline{bash}|OMP_PLACES="{0:4},{4:4},{8:4},{12:4}"| 309 | \end{itemize} 310 | \end{frame} 311 | 312 | %------------------------------------------------------------------------------- 313 | \begin{frame} 314 | \frametitle{Thread pinning summary} 315 | \begin{itemize} 316 | \item In general, going to want to just use \mintinline{bash}|OMP_PROC_BIND=true|. 317 | \item Sometimes \mintinline{bash}|spread| or \mintinline{bash}|close| gets better performance. 318 | \item Pinning rules can get complicated when there are multiple places, so prefer to use the predefined values. 319 | \item Most effective with a NUMA-aware implementation. 320 | \item Also helps reduce run-to-run timing variability. 
321 | \item But must be careful with MPI+OpenMP pinning: more on this later\dots 322 | \end{itemize} 323 | \end{frame} 324 | 325 | %------------------------------------------------------------------------------- 326 | \section{Hybrid MPI and OpenMP} 327 | \begin{frame} 328 | \frametitle{Why combine MPI+OpenMP} 329 | \begin{itemize} 330 | \item Supercomputers are often constructed with a hierarchical structure: 331 | \begin{itemize} 332 | \item Shared memory nodes connected with a network. 333 | \end{itemize} 334 | \item Need MPI (or similar) to communicate between distributed nodes. 335 | \item With multi-core, could just run MPI everywhere (flat MPI). 336 | \item But there are advantages to running \emph{hybrid} MPI and OpenMP: 337 | \begin{itemize} 338 | \item Larger fewer messages to take advantage of network bandwidth. 339 | \item Fewer MPI ranks to manage (fewer to synchronise and for collectives). 340 | \item Can avoid memory copies for intra-node communication. 341 | \item Reduced memory footprint. 342 | \item Parallelise other problem dimensions not decomposed with MPI. 343 | \end{itemize} 344 | \end{itemize} 345 | \end{frame} 346 | 347 | %------------------------------------------------------------------------------- 348 | \begin{frame} 349 | \frametitle{Scaling} 350 | \begin{itemize} 351 | \item Strong scaling: 352 | \begin{itemize} 353 | \item Take a fixed problem and add more compute resource. 354 | \item Would hope runtime reduces with more resource. 355 | \end{itemize} 356 | \item Weak scaling: 357 | \begin{itemize} 358 | \item Take a fixed problem \emph{per compute resource}, and add more resource. 359 | \item Problem gets bigger with more resources. 360 | \item Would hope runtime stays constant. 361 | \end{itemize} 362 | \item In both cases, typically see scaling of MPI-only codes tail off at high node counts. 363 | \item Hybrid MPI+OpenMP codes often continue scaling. 364 | \end{itemize} 365 | \end{frame} 366 | 367 | 368 | %------------------------------------------------------------------------------- 369 | \begin{frame}[fragile] 370 | \frametitle{MPI programs} 371 | What happens when you run an MPI program? 372 | \begin{minted}{bash} 373 | mpirun -np 16 ./a.out 374 | \end{minted} 375 | 376 | \begin{itemize} 377 | \item 16 processes are spawned on one (or more) nodes according to the hostname list file given by the queuing system. 378 | \begin{itemize} 379 | \item E.g. with PBS (\mintinline{bash}|qsub|, etc.) set by \mintinline{bash}|$PBS_NODEFILE|. 380 | \end{itemize} 381 | \item There is no reason why these processes have to be serial: 382 | \begin{itemize} 383 | \item Each MPI rank could spawn OpenMP threads and run in parallel. 384 | \item Each MPI rank could use a GPU. 385 | \end{itemize} 386 | \end{itemize} 387 | 388 | \end{frame} 389 | 390 | %------------------------------------------------------------------------------- 391 | \begin{frame}[fragile] 392 | \frametitle{Compiling OpenMP and MPI code} 393 | \begin{itemize} 394 | \item Remember building MPI code just uses the wrapper commands. 395 | \item Just pass in the OpenMP flag as usual: 396 | \begin{itemize} 397 | \item GNU: \mintinline{bash}|mpif90| -fopenmp 398 | \item Intel: \mintinline{bash}|mpiifort| -qopenmp 399 | \item Cray: \mintinline{bash}|ftn| 400 | \end{itemize} 401 | \item Set the number of OpenMP threads \emph{per rank}. 
402 | \item E.g 2 MPI ranks, 8 threads per rank: 403 | \begin{minted}{bash} 404 | OMP_NUM_THREADS=8 mpirun -np 2 ./a.out 405 | \end{minted} 406 | \end{itemize} 407 | \end{frame} 408 | 409 | %------------------------------------------------------------------------------- 410 | \begin{frame}[fragile] 411 | \frametitle{Combining OpenMP and MPI} 412 | \begin{itemize} 413 | \item MPI assumes that each MPI process does not spawn anything else. 414 | \item Must initialise MPI differently if using threads! 415 | \begin{minted}{fortran} 416 | call MPI_Init_thread(required, provided, ierr) 417 | \end{minted} 418 | 419 | \item You specify a required thread support level, and it returns the level it could support. 420 | \item A good idea to check \mintinline{fortran}|provided .ge. required|. 421 | \end{itemize} 422 | \end{frame} 423 | 424 | %------------------------------------------------------------------------------- 425 | \begin{frame} 426 | \frametitle{Thread support levels} 427 | \begin{itemize} 428 | \item \mintinline{fortran}|MPI_THREAD_SINGLE| \\ 429 | Only one thread will execute (no threads allowed). 430 | 431 | \item \mintinline{fortran}|MPI_THREAD_FUNNELED| \\ 432 | May spawn threads, but only the original process may call MPI routines: the one that called \mintinline{fortran}|MPI_Init|. 433 | 434 | \item \mintinline{fortran}|MPI_THREAD_SERIALIZED| \\ 435 | May spawn threads and any thread can make MPI calls, but only one at a time. \emph{Your} responsibility to synchronise. 436 | 437 | \item \mintinline{fortran}|MPI_THREAD_MULTIPLE| \\ 438 | May spawn threads and any thread can make MPI calls. The MPI library has to deal with being called in parallel. 439 | \end{itemize} 440 | 441 | Remember to make sure ranks still match the MPI communications to avoid deadlock. 442 | 443 | \end{frame} 444 | 445 | %------------------------------------------------------------------------------- 446 | \begin{frame}[fragile] 447 | \frametitle{Example: MPI\_THREAD\_FUNNELED} 448 | Only the original process is allowed to call MPI routines. 449 | \begin{minted}[frame=single]{fortran} 450 | !$omp parallel 451 | ... ! Parallel work 452 | !$omp end parallel 453 | call MPI_Sendrecv() 454 | \end{minted} 455 | \end{frame} 456 | 457 | %------------------------------------------------------------------------------- 458 | \begin{frame}[fragile] 459 | \frametitle{MPI\_THREAD\_SERIALIZED} 460 | The threads are allowed to call MPI, but you must program in synchronisation to ensure only one thread calls MPI at a time. 461 | \begin{minted}[frame=single]{fortran} 462 | !$omp parallel 463 | ... ! Parallel work 464 | !$omp critical 465 | call MPI_Sendrecv() 466 | !$omp end critical 467 | !$omp end parallel 468 | \end{minted} 469 | \end{frame} 470 | 471 | %------------------------------------------------------------------------------- 472 | \begin{frame}[fragile] 473 | \frametitle{MPI\_THREAD\_MULTIPLE} 474 | Any thread can call MPI whenever it likes. The \mintinline{fortran}|MPI_THREAD_MULTIPLE| guarantees the MPI library will be OK with this. 475 | \begin{minted}[frame=single]{fortran} 476 | !$omp parallel 477 | ... ! 
Parallel work 478 | call MPI_Sendrecv() 479 | !$omp end parallel 480 | \end{minted} 481 | \end{frame} 482 | 483 | %------------------------------------------------------------------------------- 484 | \subsection{Hybrid thread pinning} 485 | \begin{frame} 486 | \frametitle{Thread pinning} 487 | \begin{itemize} 488 | \item Need to be very careful how MPI ranks and OpenMP threads are mapped to the physical hardware. 489 | \item Imagine 2 dual-socket nodes: 4 sockets with (say) 16 cores per socket. 490 | \item Launch 64 MPI ranks: 1 per core. 491 | \begin{itemize} 492 | \item This is flat MPI. 493 | \item Launching OpenMP threads will over-allocate threads compared to hardware resource. 494 | \item Warning: things will slow down. 495 | \end{itemize} 496 | \item Launch 4 MPI ranks (one per socket). 497 | \begin{itemize} 498 | \item Leaves 16 cores per MPI rank for OpenMP threads to run on. 499 | \item But need to make sure processes \emph{and} threads go to the right places! 500 | \item Often close interaction with the queuing system --- system dependant behaviour. 501 | \end{itemize} 502 | \end{itemize} 503 | \end{frame} 504 | 505 | %------------------------------------------------------------------------------- 506 | \begin{frame}[fragile] 507 | \frametitle{Example: default placement} 508 | Example MPI rank placement with standard PBS setup. 509 | 510 | Job requested 2 nodes. 511 | 512 | \begin{minted}{bash} 513 | mpirun -np 4 ./a.out 514 | \end{minted} 515 | 516 | \begin{center} 517 | \begin{adjustbox}{max width={\textwidth}} 518 | \begin{tikzpicture} 519 | 520 | \foreach \loc in {0, 3, 7, 10} { 521 | \foreach \i in {0,...,1} { 522 | \foreach \j in {0,...,1} { 523 | \draw (\loc+\i,\j) rectangle (\loc+\i+1,\j+1); 524 | } 525 | } 526 | } 527 | 528 | \draw[dashed] (-0.5,-0.5) rectangle (5.5,2.5); 529 | \draw[dashed] (6.5,-0.5) rectangle (12.5,2.5); 530 | 531 | \foreach \i in {0,...,1} { 532 | \foreach \j in {0,...,1} { 533 | \draw<2->[fill=red] (3+\i+.5,\j+.5) circle (0.4cm); 534 | } 535 | } 536 | \end{tikzpicture} 537 | \end{adjustbox} 538 | \end{center} 539 | \onslide<2->{ 540 | All ranks placed on the second socket of the first node. 541 | } 542 | \end{frame} 543 | 544 | %------------------------------------------------------------------------------- 545 | 546 | \begin{frame}[fragile] 547 | \frametitle{Example: pin MPI to one core per socket} 548 | \begin{itemize} 549 | \item Tell the OS and MPI runtime to pin each MPI to the first core in each socket. 550 | \item Then want to launch 4 OpenMP threads per process. 551 | \item For OpenMPI: 552 | \begin{minted}{bash} 553 | export OMP_NUM_THREADS=4 554 | mpirun -np 4 --npersocket 1 ./a.out 555 | \end{minted} 556 | \item Where do the threads go? 557 | \end{itemize} 558 | 559 | 560 | \begin{center} 561 | \begin{adjustbox}{max width={\textwidth}} 562 | \begin{tikzpicture} 563 | 564 | \foreach \loc in {0, 3, 7, 10} { 565 | \foreach \i in {0,...,1} { 566 | \foreach \j in {0,...,1} { 567 | \draw (\loc+\i,\j) rectangle (\loc+\i+1,\j+1); 568 | } 569 | } 570 | } 571 | 572 | \draw[dashed] (-0.5,-0.5) rectangle (5.5,2.5); 573 | \draw[dashed] (6.5,-0.5) rectangle (12.5,2.5); 574 | 575 | \foreach \i in {0, 3, 7, 10} { 576 | \draw[fill=red] (\i+0.5,1.5) circle (0.4cm); 577 | \foreach \j in {0.2, 0.4, 0.6, 0.8} { 578 | \draw<2->[->,line width=.5mm] (\i+\j,1.8) -- (\i+\j, 1.3); 579 | } 580 | } 581 | \end{tikzpicture} 582 | \end{adjustbox} 583 | \end{center} 584 | 585 | \onslide<2->{ 586 | Threads spawned inherit their parent's binding, which was one core. 
587 | 588 | Use \mintinline{bash}|--report-bindings| flag to see what's being pinned where. 589 | 590 | } 591 | 592 | \end{frame} 593 | 594 | %------------------------------------------------------------------------------- 595 | \begin{frame} 596 | \frametitle{Example: pin MPI to socket} 597 | \begin{itemize} 598 | \item Pin each MPI process to the cores of a socket. 599 | \item MPI process \emph{could} move around those cores. 600 | \item OpenMP threads can spawn across the socket. 601 | \item OpenMPI gives three ways to do this: 602 | \begin{itemize} 603 | \item \mintinline{bash}|--bind-to-socket| 604 | \item \mintinline{bash}|--bind-to-core --cpus-per-proc 8| 605 | \item \mintinline{bash}|--map-by socket:PE=8| (v1.10 and up) 606 | \end{itemize} 607 | \end{itemize} 608 | \end{frame} 609 | 610 | %------------------------------------------------------------------------------- 611 | \begin{frame} 612 | \frametitle{Pinning with Intel and Cray} 613 | \begin{itemize} 614 | \item Intel MPI will need different flags and environment variables, but tends to do the right thing by default. 615 | \item Cray MPI (MVAPICH) can be controlled using \mintinline{bash}|aprun|. 616 | \begin{itemize} 617 | \item Use the \mintinline{bash}|-d| flag to specify the threads per process. 618 | \item Pinning usually happens correctly. 619 | \end{itemize} 620 | \item Cray MPI with the Intel compiler needs a different set of \mintinline{bash}|aprun| flags. 621 | \begin{itemize} 622 | \item Default pinning is usually not what you expected. 623 | \item Use the \mintinline{bash}|-cc| flag to specify correct thread pinning. 624 | \end{itemize} 625 | \item The \mintinline{bash}|amask| tool from TACC is very useful for discovering the pinning\footnote{\url{https://github.com/TACC/amask}}. 626 | \end{itemize} 627 | \end{frame} 628 | 629 | %------------------------------------------------------------------------------- 630 | \section{Exercise} 631 | \begin{frame} 632 | \frametitle{Exercise} 633 | \begin{itemize} 634 | \item Make your parallel 5-point stencil code NUMA aware. 635 | \begin{itemize} 636 | \item Parallelise the initialisation routine. 637 | \end{itemize} 638 | \item Calculate improvements memory bandwidth. 639 | \begin{itemize} 640 | \item Use a profiler to measure remote memory accesses before/after optimisation. 641 | \end{itemize} 642 | \item Experiment with thread affinity. 643 | \item Extension: Add MPI to your OpenMP 5-point stencil to run it hybrid across multiple nodes. 644 | \end{itemize} 645 | \end{frame} 646 | 647 | %------------------------------------------------------------------------------- 648 | \section{Summary} 649 | \begin{frame} 650 | \frametitle{Summary} 651 | 652 | \begin{itemize} 653 | \item Walked through memory bandwidth model calculation of 5-point stencil. 654 | \item NUMA issues and taking advantage of first touch policy. 655 | \item Controlling OpenMP thread affinity with \mintinline{bash}|OMP_PROC_BIND| and \mintinline{bash}|OMP_PLACES| environment variables. 656 | \item Programming a hybrid MPI+OpenMP code. 657 | \item Thread affinity of hybrid programs. 658 | \end{itemize} 659 | 660 | \vfill 661 | 662 | \begin{itemize} 663 | \item Next sessions: 664 | \begin{enumerate} 665 | \setcounter{enumi}{4} 666 | \item GPU programming with OpenMP. 667 | \item Tasks and Tools. 
668 | \end{enumerate} 669 | \end{itemize} 670 | 671 | \end{frame} 672 | 673 | %------------------------------------------------------------------------------- 674 | 675 | \end{document} 676 | 677 | -------------------------------------------------------------------------------- /slides/03-simd-numa.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{3: Vectorisation and NUMA} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \begin{frame}[fragile] 14 | \frametitle{Previous exercise} 15 | 16 | Take your parallel 5-point stencil, and implement a reduction: 17 | \begin{minted}[frame=single,breaklines,fontsize=\small]{fortran} 18 | total = 0.0 19 | !$omp parallel do collapse(2) reduction(+:total) 20 | do i = 1, nx 21 | do j = 1, ny 22 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 23 | total = total + Atmp(i,j) 24 | end do 25 | end do 26 | !$omp end parallel do 27 | \end{minted} 28 | 29 | \begin{itemize} 30 | \item Well done if you managed this! 31 | \item 5-point stencil is simple, but captures the \emph{essence} of more complicated codes. 32 | \item Extension: did anyone try the parallelising the Jacobi solver? 33 | \end{itemize} 34 | 35 | \end{frame} 36 | 37 | %------------------------------------------------------------------------------- 38 | %------------------------------------------------------------------------------- 39 | \section{Vectorisation} 40 | \begin{frame} 41 | \frametitle{Vectorisation} 42 | $$C=A+B$$ 43 | \begin{columns} 44 | \begin{column}{0.5\textwidth} 45 | Scalar operations \\ 46 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 47 | 48 | \begin{tikzpicture} 49 | \draw (-0.5,2) rectangle (0.5,3); 50 | \draw (1,2) rectangle (2,3); 51 | \draw[->] (0,2) -- (.74,1.2); 52 | \draw[->] (1.5,2) -- (.76,1.2); 53 | \draw (.75,.75) circle (.4cm); 54 | \draw (.75,.75) node {$+$}; 55 | \draw[->] (.75,0.3) -- (.75,-0.5); 56 | \draw (.25,-1.5) rectangle (1.25,-0.5); 57 | \end{tikzpicture} 58 | \end{adjustbox} 59 | \end{column} 60 | 61 | \begin{column}{0.5\textwidth} 62 | Vector operations \\ 63 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 64 | \begin{tikzpicture} 65 | \draw[step=1cm] (0,2) grid (4,3); 66 | \draw[step=1cm] (0,0) grid (4,1); 67 | \draw[->] (2,0) -- (2,-0.6); 68 | \draw[->] (0,2.5) -- (-0.5,2.5) -- (-0.5,-1) -- (1.6,-1); 69 | \draw (2,-1) circle (.4cm); 70 | \draw (2,-1) node {$+$}; 71 | \draw[->] (2,-1.4) -- (2,-1.9); 72 | \draw[step=1cm] (0,-3) grid (4,-2); 73 | \end{tikzpicture} 74 | \end{adjustbox} 75 | \end{column} 76 | \end{columns} 77 | 78 | \end{frame} 79 | 80 | %------------------------------------------------------------------------------- 81 | \begin{frame} 82 | \frametitle{Why vectorise?} 83 | \begin{itemize} 84 | \item Vectorisation gives you more compute per cycle. 85 | \item Hence may increase the FLOP/s rate of the processor. 86 | \item Also results in fewer instructions to process (less pressure on instruction decode units). 87 | \item Vectors help make good use of the memory hierarchy (often the main benefit). 88 | \item Vectorisation helps you write code which has good access patterns to maximise bandwidth. 
89 | \end{itemize} 90 | \end{frame} 91 | 92 | %------------------------------------------------------------------------------- 93 | \begin{frame} 94 | \frametitle{Auto-vectorisation} 95 | \begin{itemize} 96 | \item Modern compilers are very good at automatically vectorising your loops. 97 | \item Fortran helps as arrays can not alias (overlap), unlike C. 98 | \item But compiler needs to be sure it's safe to vectorise. 99 | \item Read compiler reports to see if it's already vectorising. 100 | \begin{itemize} 101 | \item Intel: \mintinline{bash}|-qopt-report=5| 102 | \item Cray: \mintinline{bash}|-hlist=a| 103 | \item GNU (old): \mintinline{bash}|-ftree-vectorizer-verbose=2| 104 | \item GNU (new): \mintinline{bash}|-fopt-info-vec| 105 | \item Clang: \mintinline{bash}|-Rpass=loop-vectorize| \mintinline{bash}|-Rpass-missed=loop-vectorize| \mintinline{bash}|-Rpass-analysis=loop-vectorize| 106 | \end{itemize} 107 | \item Often the memory access pattern prevents (efficient) auto-vectorisation. 108 | \end{itemize} 109 | \end{frame} 110 | 111 | %------------------------------------------------------------------------------- 112 | \subsection{OpenMP SIMD} 113 | \begin{frame}[fragile] 114 | \frametitle{OpenMP SIMD} 115 | \begin{itemize} 116 | \item Sometimes the compiler needs help in confirming loops are vectorisable. 117 | \item OpenMP \mintinline{fortran}|simd| constructs give this information. 118 | \item Can combine with \mintinline{fortran}|parallel do| construct to ensure a parallel vector loop: \mintinline{fortran}|omp parallel do simd| 119 | \item Generally want to vectorise inner loops and parallelise outer loops. 120 | \end{itemize} 121 | 122 | \begin{minted}[frame=single]{fortran} 123 | !$omp simd 124 | do i = 1, N 125 | C(i) = A(i)+B(i) 126 | end do 127 | !$omp end simd 128 | \end{minted} 129 | \end{frame} 130 | 131 | %------------------------------------------------------------------------------- 132 | \begin{frame}[fragile] 133 | \frametitle{SIMD functions} 134 | Say you've written an update function to update values in the loop: 135 | \begin{minted}[frame=single]{fortran} 136 | do i = 1, N 137 | A(i) = magic_maths(A(i)) 138 | end do 139 | \end{minted} 140 | 141 | \begin{itemize} 142 | \item The situation gets complicated. 143 | \item If the function is small, then likely inlined and loop will auto-vectorise. 144 | \item Otherwise need to use the \mintinline{fortran}|simd| construct, but need compiler to create a vector version of the function. 145 | \end{itemize} 146 | 147 | \begin{minted}[frame=single]{fortran} 148 | function magic_maths(value) result(r) 149 | !$omp declare simd(magic_maths) 150 | implicit none 151 | real(kind=8) :: value, r 152 | r = value * value 153 | end function 154 | \end{minted} 155 | 156 | \end{frame} 157 | 158 | %------------------------------------------------------------------------------- 159 | \begin{frame}[fragile] 160 | \frametitle{SIMD clauses} 161 | \begin{itemize} 162 | \item All the usual data-sharing and reduction clauses can be applied. 163 | \item \mintinline{fortran}|safelen(4)|: distance between iterations where its safe to vectorise. 164 | \begin{minted}[frame=single]{fortran} 165 | !$omp simd safelen(4) 166 | do i = 1, N-4 167 | A(i) = A(i) + A(i+4) 168 | end do 169 | !$omp end simd 170 | \end{minted} 171 | \item \mintinline{fortran}|simdlen(4)|: preferred iterations to be performed concurrently as a vector. 
172 | Specifying explicit vector lengths builds in obsolescence to the code as hardware vector lenghts continually change --- don't recommend using this clause. 173 | \end{itemize} 174 | \end{frame} 175 | 176 | %------------------------------------------------------------------------------- 177 | \begin{frame}[fragile] 178 | \frametitle{SIMD clauses} 179 | \begin{itemize} 180 | \item \mintinline{fortran}|linear(var)|: variable is private and linear to the loop iterator. 181 | \begin{minted}[frame=single]{fortran} 182 | !$omp simd linear(j) 183 | do i = 1, N 184 | j = j + 1 185 | A(j) = B(i) 186 | end do 187 | !$omp end simd 188 | \end{minted} 189 | \item \mintinline{fortran}|aligned(var)|: says the array is aligned. 190 | \item \mintinline{fortran}|uniform(var)|: for \mintinline{fortran}|declare simd| construct, the variable is the same in all vector lanes. 191 | \end{itemize} 192 | \end{frame} 193 | 194 | %------------------------------------------------------------------------------- 195 | \begin{frame} 196 | \frametitle{SIMD summary} 197 | 198 | \begin{itemize} 199 | \item Sometimes need to force the compiler to auto-vectorise (the correct) loop with the \mintinline{fortran}|simd| construct. 200 | \item As with \mintinline{fortran}|parallel|, you are telling the compiler it is safe to vectorise and to ignore its data dependancy analysis. 201 | \item Check the compiler report before and after the check it did the right thing! 202 | \item Use \mintinline{fortran}|declare simd| and appropriate clauses if you need to create vectorised versions of functions. 203 | \begin{itemize} 204 | \item The clauses can give more information to the compiler so it does a better job. 205 | \end{itemize} 206 | \end{itemize} 207 | 208 | \end{frame} 209 | 210 | %------------------------------------------------------------------------------- 211 | \section{Derived types} 212 | \begin{frame}[fragile] 213 | \frametitle{Derived types} 214 | 2D grid of cells, each cell containing 4 different values. 215 | \begin{minted}[frame=single,linenos,fontsize=\footnotesize]{fortran} 216 | type cell 217 | real(kind=8) :: property1 218 | real(kind=8) :: property2 219 | real(kind=8) :: property3 220 | real(kind=8) :: property4 221 | end type 222 | 223 | type(cell), allocatable :: grid(:,:) 224 | 225 | do j = 1, ny 226 | do i = 1, nx 227 | grid(i,j)%property1 = update_1() 228 | grid(i,j)%property2 = update_2() 229 | grid(i,j)%property3 = update_3() 230 | grid(i,j)%property4 = update_4() 231 | end do 232 | end do 233 | \end{minted} 234 | \end{frame} 235 | 236 | %------------------------------------------------------------------------------- 237 | \begin{frame} 238 | \frametitle{Derived types} 239 | \begin{itemize} 240 | \item What do Fortran derived types look like in memory? 241 | \item Organised as an array of structures. 242 | \item<2-> What happens when we vectorise our loop over cells? 243 | \end{itemize} 244 | 245 | \begin{adjustbox}{max width={\textwidth}} 246 | \begin{tikzpicture} 247 | \draw[step=1cm] (0,0) grid (13,1); 248 | \foreach \i in {0,4,8,12} { 249 | \draw (\i+.5,.5) node {P1}; 250 | } 251 | \foreach \i in {0,4,8} { 252 | \draw (\i+1.5,.5) node {P2}; 253 | \draw (\i+2.5,.5) node {P3}; 254 | \draw (\i+3.5,.5) node {P4}; 255 | } 256 | 257 | \foreach \i in {0,4,8,12} { 258 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 259 | } 260 | \end{tikzpicture} 261 | \end{adjustbox} 262 | 263 | \begin{itemize} 264 | \item<4-> The \mintinline{fortran}|property1| values are gathered into a vector register. 
265 | \item<5-> After the computation, the results are scattered back into memory. 266 | \item<6-> A cache line is 64 bytes, so only the first two values are on the first cache line. 267 | \item<6-> Must read two cache lines to fill the vector up. 268 | \end{itemize} 269 | \end{frame} 270 | 271 | %------------------------------------------------------------------------------- 272 | \begin{frame}[fragile] 273 | \frametitle{Structure of arrays} 274 | Switch type around to have an array per property. 275 | \begin{minted}[frame=single,linenos,fontsize=\small]{fortran} 276 | type grid 277 | real(kind=8), allocatable :: property1(:,:) 278 | real(kind=8), allocatable :: property2(:,:) 279 | real(kind=8), allocatable :: property3(:,:) 280 | real(kind=8), allocatable :: property4(:,:) 281 | end type 282 | 283 | do j = 1, ny 284 | do i = 1, nx 285 | grid%property1(i,j) = update_1() 286 | grid%property2(i,j) = update_2() 287 | grid%property3(i,j) = update_3() 288 | grid%property4(i,j) = update_4() 289 | end do 290 | end do 291 | \end{minted} 292 | \end{frame} 293 | 294 | %------------------------------------------------------------------------------- 295 | \begin{frame} 296 | \frametitle{Structure of arrays} 297 | \begin{itemize} 298 | \item Order of data in memory has changed. 299 | \item<2-> What happens when we vectorise? 300 | \end{itemize} 301 | 302 | \begin{adjustbox}{max width={\textwidth}} 303 | \begin{tikzpicture} 304 | \draw[step=1cm] (0,0) grid (13,1); 305 | \foreach \i in {0,...,4} { 306 | \draw (\i+.5,.5) node {P1}; 307 | } 308 | \draw (5.5,.5) node {\dots}; 309 | 310 | \foreach \i in {5,...,9} { 311 | \draw (\i+1.5,.5) node {P2}; 312 | } 313 | \draw (11.5,.5) node {\dots}; 314 | 315 | \foreach \i in {10} { 316 | \draw (\i+2.5,.5) node {P3}; 317 | } 318 | 319 | \foreach \i in {0,...,3} { 320 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 321 | } 322 | \end{tikzpicture} 323 | \end{adjustbox} 324 | 325 | \onslide<4->{ 326 | \begin{itemize} 327 | \item Coalesced memory accesses are key for high performance code. 328 | \item Adjacent vector lanes read adjacent memory locations. 329 | \item A cache line is 64 bytes, so can fill the vector from a single cache line. 330 | \item More efficient vectorisation. 331 | \end{itemize} 332 | } 333 | \end{frame} 334 | 335 | %------------------------------------------------------------------------------- 336 | \section{Memory access patterns} 337 | \begin{frame}[fragile] 338 | \frametitle{Memory access patterns} 339 | \begin{minted}{fortran} 340 | do i = 1, N 341 | val = A(i) 342 | end do 343 | \end{minted} 344 | \begin{adjustbox}{max width={\textwidth}} 345 | \begin{tikzpicture} 346 | \draw[step=1cm] (-3,0) grid (11,1); 347 | \draw[dashed] (0,-.5) -- (0,1.5); 348 | \draw[dashed] (8,-.5) -- (8,1.5); 349 | \draw (0,-1) node {64 byte boundary}; 350 | \foreach \i in {0,...,7} { 351 | \draw[->] (\i+.5,2) -- (\i+.5,1.2); 352 | } 353 | \end{tikzpicture} 354 | \end{adjustbox} 355 | \begin{itemize} 356 | \item Ideal memory access pattern. 357 | \item All access is coalesced. 358 | \item Vectors are aligned to cache line boundary. 
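%------------------------------------------------------------------------------
% A commented sketch (not displayed on the slide): if the compiler cannot prove
% the cache-line alignment illustrated above, the simd aligned clause mentioned
% earlier is one way to assert it. This assumes the arrays really are 64-byte
% aligned -- the clause is a promise to the compiler, not something it checks.
%
% !$omp simd aligned(A,B,C:64)
% do i = 1, N
%   C(i) = A(i) + B(i)
% end do
% !$omp end simd
%------------------------------------------------------------------------------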
359 | \end{itemize} 360 | \end{frame} 361 | 362 | %------------------------------------------------------------------------------- 363 | \begin{frame}[fragile] 364 | \frametitle{Memory access patterns} 365 | \begin{minted}{fortran} 366 | do i = 1, N 367 | val = A(i+3) 368 | end do 369 | \end{minted} 370 | \begin{adjustbox}{max width={\textwidth}} 371 | \begin{tikzpicture} 372 | \draw[step=1cm] (-3,0) grid (11,1); 373 | \draw[dashed] (0,-.5) -- (0,1.5); 374 | \draw[dashed] (8,-.5) -- (8,1.5); 375 | \draw (0,-1) node {64 byte boundary}; 376 | \foreach \i in {0,...,7} { 377 | \draw[->] (\i+.5,2) -- (3+\i+.5,1.2); 378 | } 379 | \end{tikzpicture} 380 | \end{adjustbox} 381 | \begin{itemize} 382 | \item OK memory access pattern. 383 | \item All access is coalesced, but split across cache lines. 384 | \item Still get good use of cache lines, but not as efficient as aligned version. 385 | \end{itemize} 386 | \end{frame} 387 | 388 | %------------------------------------------------------------------------------- 389 | \begin{frame}[fragile] 390 | \frametitle{Memory access patterns} 391 | \begin{minted}{fortran} 392 | do i = 1, N 393 | val = A(j,i) ! equiv. A(j+3*i) 394 | end do 395 | \end{minted} 396 | \begin{adjustbox}{max width={\textwidth}} 397 | \begin{tikzpicture} 398 | \draw[step=1cm] (-3,0) grid (11,1); 399 | \draw[dashed] (0,-.5) -- (0,1.5); 400 | \draw[dashed] (8,-.5) -- (8,1.5); 401 | \draw (0,-1) node {64 byte boundary}; 402 | \foreach \i in {0,...,3} { 403 | \draw[->] (\i+.5,2) -- (3*\i+.5,1.2); 404 | } 405 | \end{tikzpicture} 406 | \end{adjustbox} 407 | \begin{itemize} 408 | \item Strided access results in multiple memory transactions. 409 | \item Kills throughput due to poor reuse of cached data. 410 | \item Very easy to fall into this trap with multi-dimensional arrays. 411 | \item Check your strides! 412 | \end{itemize} 413 | \end{frame} 414 | 415 | %------------------------------------------------------------------------------- 416 | \begin{frame}[fragile] 417 | \frametitle{Memory access patterns} 418 | \begin{minted}{fortran} 419 | do i = 1, N 420 | val = A(B(i)) 421 | end do 422 | \end{minted} 423 | \begin{adjustbox}{max width={\textwidth}} 424 | \begin{tikzpicture} 425 | \draw[step=1cm] (-3,0) grid (11,1); 426 | \draw[dashed] (0,-.5) -- (0,1.5); 427 | \draw[dashed] (8,-.5) -- (8,1.5); 428 | \draw (0,-1) node {64 byte boundary}; 429 | \draw[->] (0.5,2) -- (-3.5,1.2); 430 | \draw[->] (1.5,2) -- (3.5,1.2); 431 | \draw[->] (2.5,2) -- (0.5,1.2); 432 | \draw[->] (3.5,2) -- (8.5,1.2); 433 | \draw[->] (4.5,2) -- (-1.5,1.2); 434 | \draw[->] (5.5,2) -- (7.5,1.2); 435 | \draw[->] (6.5,2) -- (1.5,1.2); 436 | \draw[->] (7.5,2) -- (-2.5,1.2); 437 | \end{tikzpicture} 438 | \end{adjustbox} 439 | \begin{itemize} 440 | \item Essentially random access to memory. 441 | \item Little reuse of cache lines. 442 | \item Unpredictable pattern, so hardware prefetchers won't work efficiently. 443 | \item Very challenging! 
444 | \end{itemize} 445 | \end{frame} 446 | 447 | %------------------------------------------------------------------------------- 448 | \section{NUMA} 449 | \begin{frame} 450 | \frametitle{NUMA Architecture} 451 | 452 | Recall this cartoon of a dual-socket, shared memory system: 453 | \begin{center} 454 | \begin{tikzpicture} 455 | % Draw 4 cores for socket 0 456 | \draw (0,0) rectangle (1,1); 457 | \draw (1,0) rectangle (2,1); 458 | \draw (0,1) rectangle (1,2); 459 | \draw (1,1) rectangle (2,2); 460 | 461 | % Draw 4 cores for socket 1 462 | \draw (3,0) rectangle (4,1); 463 | \draw (4,0) rectangle (5,1); 464 | \draw (3,1) rectangle (4,2); 465 | \draw (4,1) rectangle (5,2); 466 | 467 | % Draw large memory 468 | \draw (-0.5,3) rectangle (5.5,4); 469 | \draw (2.5,3.5) node {Memory}; 470 | 471 | % Connect sockets to memory 472 | \draw (1,2) -- (1,3); 473 | \draw (4,2) -- (4,3); 474 | \draw[dashed] (2,1) -- (3,1); % QPI 475 | 476 | \end{tikzpicture} 477 | \end{center} 478 | 479 | \emph{All} threads (each running on a core) can access the same memory. 480 | 481 | \end{frame} 482 | %------------------------------------------------------------------------------- 483 | 484 | \begin{frame} 485 | \frametitle{NUMA Architecture} 486 | \begin{itemize} 487 | \item In reality on a dual-socket system each \emph{socket} is physically connected to half of the memory. 488 | \item Still shared memory: all cores can access all the memory. 489 | \item A core in the first socket wanting memory attached to the other socket must: 490 | \begin{itemize} 491 | \item Go via the socket-to-socket interconnect. 492 | \item Access memory via the other socket's memory controllers. 493 | \end{itemize} 494 | \item Accessing memory from other socket is slower than access from own socket. 495 | \end{itemize} 496 | \begin{center} 497 | \resizebox{!}{3.5cm}{ 498 | \begin{tikzpicture} 499 | % Draw 4 cores for socket 0 500 | \foreach \i in {0,1,3,4} { 501 | \foreach \j in {0, 1} { 502 | \draw (\i,\j) rectangle (\i+1,\j+1); 503 | } 504 | } 505 | 506 | % Draw sockets around cores 507 | \draw (-0.2, -0.2) rectangle (2.2, 2.2); 508 | \draw (2.8, -0.2) rectangle (5.2, 2.2); 509 | 510 | % Draw large memory 511 | \draw (-0.5,3) rectangle (2.3,4); 512 | \draw (2.7,3) rectangle (5.5,4); 513 | \draw[dashed] (-0.7,2.8) rectangle (5.7,4.2); 514 | 515 | % Connect sockets to memory 516 | \draw (1,2.2) -- (1,3); 517 | \draw (4,2.2) -- (4,3); 518 | \draw[dashed] (2.2,1) -- (2.8,1); % QPI 519 | 520 | % Show memory shared 521 | \pause 522 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 523 | \draw (3.5,1.5) node {Read}; 524 | \pause 525 | \draw[->,red,thick] (0.7,3.2) -- (0.7,2.1) -- (2.1,2.1) -- (2.1,1.1) -- (2.9,1.1) -- (3.5,1.2); 526 | 527 | \end{tikzpicture} 528 | } 529 | \end{center} 530 | \end{frame} 531 | 532 | %------------------------------------------------------------------------------- 533 | \begin{frame} 534 | \frametitle{Memory allocation} 535 | \begin{itemize} 536 | \item What happens when you run \mintinline{fortran}|allocate(A(1:N))|? 537 | \pause 538 | \item Allocating memory does not necessarily allocate memory! 539 | \item Memory is allocated when it's first used (i.e. \mintinline{fortran}|A(i) = 1.0|), one \emph{page} at a time. 540 | \item OS tends to use a \emph{first touch policy}. 541 | \item Memory is allocated in the closest NUMA region to the thread that first touches the data. 542 | \item Ideally want threads to use data in local NUMA region to reduce socket-to-socket interconnect transfers. 
543 | \end{itemize} 544 | \end{frame} 545 | 546 | %------------------------------------------------------------------------------- 547 | \subsection{First touch} 548 | \begin{frame}[fragile] 549 | \frametitle{Taking advantage of first touch} 550 | Parallelising your data initialisation routine might mean your main loops go faster! 551 | 552 | 553 | \begin{minted}[fontsize=\small,linenos,frame=single]{fortran} 554 | ! Allocate and initialise vectors 555 | allocate(A(N), B(N), C(N)) 556 | !$omp parallel do 557 | do i = 1, N 558 | A(i) = 1.0 559 | B(i) = 2.0 560 | C(i) = 0.0 561 | end do 562 | !$omp end parallel do 563 | 564 | ! Vector add 565 | !$omp parallel do 566 | do i = 1, N 567 | C(i) = A(i) + B(i) 568 | end do 569 | !$omp end parallel do 570 | \end{minted} 571 | 572 | \end{frame} 573 | 574 | %------------------------------------------------------------------------------- 575 | \begin{frame} 576 | \frametitle{NUMA-aware} 577 | \begin{itemize} 578 | \item Parallelise your initialisation routines the same way you parallelise the main loops. 579 | \item This means each thread touches the same data in initialisation and compute. 580 | \item Should reduce the number of remote memory accesses needed and improve run times. 581 | \item But, OS is allowed to move threads around cores, and between sockets. 582 | \item This will mess up your NUMA aware code! 583 | \end{itemize} 584 | \end{frame} 585 | 586 | %------------------------------------------------------------------------------- 587 | \section{Thread affinity} 588 | \begin{frame} 589 | \frametitle{Pinning threads} 590 | \begin{itemize} 591 | \item OpenMP gives you the controls to pin threads to specific cores. 592 | \item Exposed as \emph{places} and \emph{thread pinning policy} to those places. 593 | \item By default there is one place consisting of all the cores. 594 | \item Use the \mintinline{bash}|OMP_PROC_BIND| environment variable to set pinning for all \mintinline{fortran}|parallel| regions. 595 | \item Can use the \mintinline{bash}|proc_bind| clause for control of specific regions, but advise against this. 596 | \end{itemize} 597 | \end{frame} 598 | 599 | %------------------------------------------------------------------------------- 600 | \begin{frame} 601 | \frametitle{OMP\_PROC\_BIND} 602 | \begin{itemize} 603 | \item \mintinline{bash}|OMP_PROC_BIND=false|: Often the default; threads may move! \mintinline{fortran}|proc_bind| clauses ignored. 604 | \item \mintinline{bash}|OMP_PROC_BIND=true|: Threads won't move, and follow \mintinline{fortran}|proc_bind| clauses or else the implementation default pinning. 605 | \item \mintinline{bash}|OMP_PROC_BIND=master|: Threads pinned to same place as master thread. 606 | \item \mintinline{bash}|OMP_PROC_BIND=close|: Threads are assigned to places close to the master thread. 607 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to core 0; thread 1 will pin to core 1; etc 608 | \item \mintinline{bash}|OMP_PROC_BIND=spread|: Threads are assigned to places ``sparsely''. 609 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to socket 0 core 0; thread 1 will pin to socket 1 core 0; thread 2 will pin to socket 0 core 1; etc. 610 | \end{itemize} 611 | \end{frame} 612 | 613 | %------------------------------------------------------------------------------- 614 | \begin{frame} 615 | \frametitle{Places} 616 | \begin{itemize} 617 | \item The affinity (policy) defines how threads are assigned to places. 
618 | \item Places allow you to divide up the hardware resource, so that threads can be assigned to them. 619 | \item Default: one place with all cores. 620 | \item Use \mintinline{bash}|OMP_PLACES| environment variable to control. 621 | \item \mintinline{bash}|OMP_PLACES=threads|: each place is a single hardware thread. 622 | \item \mintinline{bash}|OMP_PLACES=cores|: each place is a single core (containing one or more hardware threads). 623 | \item \mintinline{bash}|OMP_PLACES=sockets|: each place contains the cores of a single socket. 624 | \item Can also use list notation: \mintinline{bash}|OMP_PLACES="{0:4},{4:4},{8:4},{12:4}"| 625 | \end{itemize} 626 | \end{frame} 627 | 628 | %------------------------------------------------------------------------------- 629 | \begin{frame} 630 | \frametitle{Thread pinning summary} 631 | \begin{itemize} 632 | \item In general, going to want to just use \mintinline{bash}|OMP_PROC_BIND=true|. 633 | \item Sometimes \mintinline{bash}|spread| or \mintinline{bash}|close| gets better performance. 634 | \item Pinning rules can get complicated when there are multiple places, so prefer to use the predefined values. 635 | \item Most effective with a NUMA-aware implementation. 636 | \item Also helps reduce run-to-run timing variability. 637 | \item But must be careful with MPI+OpenMP pinning. 638 | \end{itemize} 639 | \end{frame} 640 | 641 | %------------------------------------------------------------------------------- 642 | \section{Hybrid MPI and OpenMP} 643 | \begin{frame} 644 | \frametitle{Why combine MPI+OpenMP} 645 | \begin{itemize} 646 | \item Supercomputers are often constructed with a hierarchical structure: 647 | \begin{itemize} 648 | \item Shared memory nodes connected with a network. 649 | \end{itemize} 650 | \item Need MPI (or similar) to communicate between distributed nodes. 651 | \item With multi-core, could just run MPI everywhere (flat MPI). 652 | \item But there are advantages to running \emph{hybrid} MPI and OpenMP: 653 | \begin{itemize} 654 | \item Larger fewer messages to take advantage of network bandwidth. 655 | \item Fewer MPI ranks to manage (fewer to synchronise and for collectives). 656 | \item Can avoid memory copies for intra-node communication. 657 | \item Reduced memory footprint. 658 | \item Parallelise other problem dimensions not decomposed with MPI. 659 | \end{itemize} 660 | \end{itemize} 661 | \end{frame} 662 | 663 | %------------------------------------------------------------------------------- 664 | \begin{frame} 665 | \frametitle{Thread support levels} 666 | \begin{itemize} 667 | \item \mintinline{fortran}|MPI_THREAD_SINGLE| \\ 668 | Only one thread will execute (no threads allowed). 669 | 670 | \item \mintinline{fortran}|MPI_THREAD_FUNNELED| \\ 671 | May spawn threads, but only the original process may call MPI routines: the one that called \mintinline{fortran}|MPI_Init|. 672 | 673 | \item \mintinline{fortran}|MPI_THREAD_SERIALIZED| \\ 674 | May spawn threads and any thread can make MPI calls, but only one at a time. \emph{Your} responsibility to synchronise. 675 | 676 | \item \mintinline{fortran}|MPI_THREAD_MULTIPLE| \\ 677 | May spawn threads and any thread can make MPI calls. The MPI library has to deal with being called in parallel. 678 | \end{itemize} 679 | 680 | Remember to make sure ranks still match the MPI communications to avoid deadlock. 
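%------------------------------------------------------------------------------
% A commented sketch of requesting a support level and checking what the MPI
% library actually provided; the error handling shown (MPI_Abort) is an
% assumption about how a real code might react, not part of the course code.
%
% required = MPI_THREAD_FUNNELED
% call MPI_Init_thread(required, provided, ierr)
% if (provided .lt. required) then
%   write(*,*) "Error: insufficient MPI thread support"
%   call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
% end if
%------------------------------------------------------------------------------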
681 | 682 | \end{frame} 683 | 684 | %------------------------------------------------------------------------------- 685 | \section{Exercise} 686 | \begin{frame} 687 | \frametitle{Exercise} 688 | \begin{itemize} 689 | \item Take your parallel 5-point stencil code and optimise it. 690 | \item Think about: 691 | \begin{itemize} 692 | \item Memory access patterns 693 | \item Vectorisation 694 | \item NUMA 695 | \end{itemize} 696 | \item Note down the performance differences your optimisations make. 697 | \item Calculate the achieved memory bandwidth of your stencil code. 698 | \item Extension: consider these optimisaions for the Jacobi solver. 699 | \end{itemize} 700 | \end{frame} 701 | 702 | \end{document} 703 | -------------------------------------------------------------------------------- /slides/02-pi.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{2: Data sharing and Reductions} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | % \section{Outline} 15 | % \begin{frame} 16 | % \frametitle{Outline} 17 | % \begin{itemize} 18 | % \item Recap 19 | % \item Data sharing clauses 20 | % \item The Pi program 21 | % \item Critical regions 22 | % \item Atomics 23 | % \item False sharing issues 24 | % \item Reductions 25 | % \end{itemize} 26 | % \end{frame} 27 | %------------------------------------------------------------------------------- 28 | % \section{Recap} 29 | % \begin{frame}[fragile] 30 | % \frametitle{Recap} 31 | % \begin{itemize} 32 | % \item Fork/join execution model. 33 | 34 | % \item Shared memory model: 35 | % \begin{itemize} 36 | % \item All threads can read/write the \emph{same} memory. 37 | % \end{itemize} 38 | 39 | % \item Set number of threads with \mintinline{bash}|OMP_NUM_THREADS| environment variable. 40 | 41 | % \item Parallelise simple loops with worksharing clauses: 42 | % \begin{minted}[frame=single]{fortran} 43 | % !$omp parallel do 44 | % do i = 1, N 45 | % A(i) = ... 46 | % end do 47 | % !$omp end parallel do 48 | % \end{minted} 49 | 50 | % \item Talked about \mintinline{fortran}|collapse|, \mintinline{fortran}|nowait| and \mintinline{fortran}|schedule| clauses. 51 | 52 | % \end{itemize} 53 | % \end{frame} 54 | %------------------------------------------------------------------------------- 55 | \begin{frame}[fragile] 56 | \frametitle{The first exercise} 57 | 58 | \begin{itemize} 59 | \item Parallelise a serial 5-point stencil code using OpenMP. 60 | \item Solution is adding an OpenMP worksharing construct: 61 | \end{itemize} 62 | 63 | \begin{minted}[frame=single,breaklines]{fortran} 64 | !$omp parallel do collapse(2) 65 | do i = 1, nx 66 | do j = 1, ny 67 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 68 | end do 69 | end do 70 | !$omp end parallel do 71 | \end{minted} 72 | 73 | \begin{itemize} 74 | \item OpenMP threads are created. 75 | \item Loops are collapsed and iterations shared between threads. 76 | \item Each thread computes its assigned portion of iteration space. 77 | \item Threads synchronise and join. 
78 | \end{itemize}
79 |
80 | \end{frame}
81 |
82 | %-------------------------------------------------------------------------------
83 |
84 | \section{Data sharing}
85 | \begin{frame}
86 | \frametitle{Data sharing}
87 | Remember: OpenMP is a \emph{shared memory} programming model.
88 | \begin{itemize}
89 | \item By default, all data is available to all threads.
90 | \item There is a single copy of \emph{shared} data.
91 | \end{itemize}
92 |
93 | \vfill
94 |
95 | You must specify which data should be \emph{private} to each thread.
96 | \begin{itemize}
97 | \item Each thread then has local (stack) space for each private variable.
98 | \item Each copy is only visible to its associated thread.
99 | \end{itemize}
100 |
101 | \begin{block}{Notice}
102 | Because Fortran variables are declared at the top of the routine, you must think carefully about which of them should be private.
103 | \end{block}
104 |
105 | \end{frame}
106 |
107 |
108 | %-------------------------------------------------------------------------------
109 | \begin{frame}
110 | \frametitle{Variables on the heap}
111 | \begin{itemize}
112 | \item All data on the heap is shared.
113 | \item Therefore all the Fortran \mintinline{fortran}|allocatable| data is shared.
114 | \item You must ensure that different threads do not write to the same element of these arrays.
115 | \end{itemize}
116 |
117 | \begin{alertblock}{Caution}
118 | Setting a data sharing clause on a heap variable only affects the metadata of the variable.
119 | The pointer could be private, but the target will still be shared.
120 | \end{alertblock}
121 | \end{frame}
122 |
123 | %-------------------------------------------------------------------------------
124 | \section{Data clauses}
125 | \begin{frame}
126 | \frametitle{Data clauses}
127 | \begin{itemize}
128 | \item \mintinline{fortran}|shared(x)|
129 | There is one copy of the \mintinline{fortran}|x| variable. The programmer must ensure synchronisation.
130 | \item \mintinline{fortran}|private(x)|
131 | Each thread gets its own local \mintinline{fortran}|x| variable. It is not initialised. The value of the original \mintinline{fortran}|x| variable is undefined on region exit.
132 | \item \mintinline{fortran}|firstprivate(x)|
133 | Each thread gets its own \mintinline{fortran}|x| variable, and it is initialised to the value of the original variable on entering the region.
134 | \item \mintinline{fortran}|lastprivate(x)|
135 | Used for loops. Each thread gets its own \mintinline{fortran}|x| variable, and on exiting the region the original variable is updated taking the value from the sequentially last iteration.
136 | \end{itemize}
137 |
138 | These are the most common clauses that are needed.
139 | \end{frame}
140 |
141 | %-------------------------------------------------------------------------------
142 | \begin{frame}
143 | \frametitle{Data clauses}
144 | There is also the \mintinline{fortran}|threadprivate(x)| directive (not a clause).
145 | \begin{itemize}
146 | \item This says to take a copy of the data in \emph{thread local storage} which is persistent across parallel regions.
147 | \item The \mintinline{fortran}|copyin| clause is a means to initialise \mintinline{fortran}|threadprivate| data, copying from the master thread.
148 | \end{itemize}
149 |
150 | You are unlikely to need this directive.
151 | Might be useful if using \mintinline{fortran}|common| blocks (or \mintinline{c}|static| variables in C).
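\vfill

As a sketch of the syntax (for some saved module or \texttt{common} variable \texttt{x}): place \texttt{!\$omp threadprivate(x)} in the declaration part after \texttt{x} is declared, then \texttt{!\$omp parallel copyin(x)} initialises each thread's copy from the master thread's value.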
152 | \end{frame}
153 |
154 | %-------------------------------------------------------------------------------
155 | \subsection{Private example}
156 | \begin{frame}[fragile]
157 | \frametitle{Private example}
158 | Simple \mintinline{fortran}|do| loop, which just sets a variable to the iteration number.
159 | Each iteration prints out the current and next value of \mintinline{fortran}|x|, along with the thread number.
160 | We will see what happens with different data sharing clauses.
161 |
162 | \begin{minted}[linenos,breaklines,frame=single, fontsize=\small]{fortran}
163 | !$omp parallel do private(x) / firstprivate(x) / lastprivate(x)
164 | do i = 1, N
165 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i
166 | x = i
167 | end do
168 | !$omp end parallel do
169 | \end{minted}
170 | N is set to 10.
171 | Ran using 4 threads.
172 | Full implementation: \mintinline{bash}|private.f90|.
173 | \end{frame}
174 |
175 | %-------------------------------------------------------------------------------
176 | \begin{frame}[fragile]
177 | \frametitle{Private example}
178 | \begin{minted}{bash}
179 | private:
180 | before: x=-1
181 | Thread 1 setting x=0 to 4
182 | Thread 2 setting x=0 to 7
183 | Thread 3 setting x=0 to 9
184 | Thread 0 setting x=0 to 1
185 | Thread 1 setting x=4 to 5
186 | Thread 2 setting x=7 to 8
187 | Thread 3 setting x=9 to 10
188 | Thread 0 setting x=1 to 2
189 | Thread 1 setting x=5 to 6
190 | Thread 0 setting x=2 to 3
191 | after: x=-1
192 | \end{minted}
193 | Each thread starts with its own \mintinline{fortran}|x|.
194 | No guarantees of initial value, but happened to be zero this time.
195 | \end{frame}
196 |
197 | %-------------------------------------------------------------------------------
198 | \begin{frame}[fragile]
199 | \frametitle{Private example}
200 | \begin{minted}{bash}
201 | firstprivate:
202 | before: x=-1
203 | Thread 3 setting x=-1 to 9
204 | Thread 2 setting x=-1 to 7
205 | Thread 1 setting x=-1 to 4
206 | Thread 0 setting x=-1 to 1
207 | Thread 3 setting x=9 to 10
208 | Thread 2 setting x=7 to 8
209 | Thread 1 setting x=4 to 5
210 | Thread 0 setting x=1 to 2
211 | Thread 1 setting x=5 to 6
212 | Thread 0 setting x=2 to 3
213 | after: x=-1
214 | \end{minted}
215 | Each thread starts with its own \mintinline{fortran}|x|, which is set to the value of \mintinline{fortran}|x| before entering the \mintinline{fortran}|parallel| region, -1.
216 | \end{frame}
217 |
218 | %-------------------------------------------------------------------------------
219 | \begin{frame}[fragile]
220 | \frametitle{Private example}
221 | \begin{minted}{bash}
222 | lastprivate:
223 | before: x=-1
224 | Thread 3 setting x=3 to 9
225 | Thread 2 setting x=2 to 7
226 | Thread 1 setting x=1 to 4
227 | Thread 3 setting x=9 to 10
228 | Thread 0 setting x=0 to 1
229 | Thread 2 setting x=7 to 8
230 | Thread 1 setting x=4 to 5
231 | Thread 0 setting x=1 to 2
232 | Thread 1 setting x=5 to 6
233 | Thread 0 setting x=2 to 3
234 | after: x=10
235 | \end{minted}
236 | Each thread starts with its own \mintinline{fortran}|x|, which is set to a garbage value.
237 | On exiting the region, the original \mintinline{fortran}|x| is set to the value of the last iteration of the loop, 10.
238 | \end{frame} 239 | 240 | %------------------------------------------------------------------------------- 241 | \section{Default data sharing} 242 | \begin{frame} 243 | \frametitle{Choosing default data sharing} 244 | \begin{alertblock}{Note} 245 | It is especially important to list private variables in Fortran. 246 | All variables have \emph{global} scope within each \mintinline{fortran}|subroutine| so \emph{everything} is shared by default. 247 | In C, local scoping rules makes this easier. 248 | \end{alertblock} 249 | 250 | \begin{itemize} 251 | \item You can force yourself to specify everything manually by using the \mintinline{fortran}|default(none)| attribute. This is good practice. 252 | \item You can also \mintinline{fortran}|default(private)| or \mintinline{fortran}|default(firstprivate)| to make everything private by default --- this might save a lot of typing in an old code with many temporary variables. 253 | \end{itemize} 254 | 255 | \end{frame} 256 | 257 | %------------------------------------------------------------------------------- 258 | \section{Calculating Pi} 259 | \begin{frame} 260 | \frametitle{Calculating Pi} 261 | Use a simple program to numerically approximate $\pi$ to explore: 262 | \begin{itemize} 263 | \item Use of data sharing clauses. 264 | \item Updating a shared variable in parallel. 265 | \item Reductions. 266 | \end{itemize} 267 | \end{frame} 268 | 269 | %------------------------------------------------------------------------------- 270 | \begin{frame} 271 | \frametitle{Integration to calculate Pi} 272 | 273 | $$\int_{0}^{1} \frac{4}{1+x^2} dx = \pi$$ 274 | 275 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 276 | \begin{tikzpicture} 277 | \begin{axis}[xlabel={$x$},ylabel={$f(x)$},ymin=0] 278 | \addplot [name path=A, domain=0:1] {4/(1+x*x)}; 279 | \addplot[dashed] coordinates {(0,0) (0,4)}; 280 | \addplot[dashed] coordinates {(1,0) (1,2)}; 281 | \path [name path=axis] (axis cs:0,0) -- (axis cs:1,0); 282 | \addplot[blue!30] fill between [of=A and axis, domain=0:1]; 283 | \end{axis} 284 | \end{tikzpicture} 285 | \end{adjustbox} 286 | \end{frame} 287 | 288 | %------------------------------------------------------------------------------- 289 | \begin{frame} 290 | \frametitle{Trapezoidal rule} 291 | Sum the area of the boxes. Choose a small \emph{step} size to generate lots of boxes, and increase accuracy. 292 | 293 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 294 | \begin{tikzpicture} 295 | \begin{axis}[xlabel={$x$},ylabel={$f(x)$},ymin=0] 296 | \addplot [name path=A, domain=0:1] {4/(1+x*x)}; 297 | \addplot[dashed] coordinates {(0,0) (0,4)}; 298 | \addplot[dashed] coordinates {(1,0) (1,2)}; 299 | \path [name path=axis] (axis cs:0,0) -- (axis cs:1,0); 300 | \addplot[blue!30] fill between [of=A and axis, soft clip={domain=0:0.2}]; 301 | \addplot[red!30] fill between [of=A and axis, soft clip={domain=0.2:0.4}]; 302 | \addplot[green!30] fill between [of=A and axis, soft clip={domain=0.4:0.6}]; 303 | \addplot[gray!30] fill between [of=A and axis, soft clip={domain=0.6:0.8}]; 304 | \addplot[yellow!30] fill between [of=A and axis, soft clip={domain=0.8:1}]; 305 | \end{axis} 306 | \end{tikzpicture} 307 | \end{adjustbox} 308 | \end{frame} 309 | 310 | %------------------------------------------------------------------------------- 311 | \begin{frame}[fragile] 312 | \frametitle{Code} 313 | We will use this code which calculates the value of $\pi$ as an example for the remainder of this session. 
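In symbols, the loop accumulates $\mathrm{sum} = \sum_{i=1}^{N} \frac{4}{1+x_i^2}$ with $x_i = (i - \tfrac{1}{2})\,\mathrm{step}$, where $N$ is \texttt{num\_steps}, so that $\pi \approx \mathrm{step} \times \mathrm{sum}$.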
314 | 315 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 316 | step = 1.0/num_steps 317 | do ii = 1, num_steps 318 | x = (ii-0.5)*step 319 | sum = sum + (4.0/(1.0+x*x)) 320 | end do 321 | pi = step * sum 322 | \end{minted} 323 | 324 | With 100,000,000 steps, this takes 0.368s on my laptop. 325 | 326 | Full implementation: \mintinline{bash}|pi.f90|. 327 | \end{frame} 328 | 329 | %------------------------------------------------------------------------------- 330 | \begin{frame}[fragile] 331 | \frametitle{Parallelising the loop} 332 | 333 | Use a worksharing directive to parallelise the loop. 334 | 335 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 336 | step = 1.0/num_steps 337 | !$omp parallel do private(x) 338 | do ii = 1, num_steps 339 | x = (ii-0.5)*step 340 | sum = sum + (4.0/(1.0+x*x)) 341 | end do 342 | !$omp end parallel do 343 | pi = step * sum 344 | \end{minted} 345 | 346 | \vfill 347 | 348 | What about data sharing? 349 | \begin{itemize} 350 | \item \mintinline{fortran}|x| needs to be used independently by each thread, so mark as \mintinline{fortran}|private|. 351 | \item \mintinline{fortran}|sum| needs to be updated by \emph{all} threads, so leave as \mintinline{fortran}|shared|. 352 | \end{itemize} 353 | 354 | \end{frame} 355 | 356 | %------------------------------------------------------------------------------- 357 | \section{Critical regions} 358 | \begin{frame}[fragile] 359 | \frametitle{Parallelising with critical} 360 | \begin{itemize} 361 | \item But need to be careful changing the \mintinline{fortran}|shared| variable, \mintinline{fortran}|sum|. 362 | \item All threads can update this value directly! 363 | \item A \mintinline{fortran}|critical| region only allows one thread to execute at any one time. No guarantees of ordering. 364 | \end{itemize} 365 | 366 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 367 | step = 1.0/num_steps 368 | !$omp parallel do private(x) 369 | do ii = 1, num_steps 370 | x = (ii-0.5)*step 371 | !$omp critical 372 | sum = sum + (4.0/(1.0+x*x)) 373 | !$omp end critical 374 | end do 375 | !$omp end parallel do 376 | pi = step * sum 377 | \end{minted} 378 | 379 | \end{frame} 380 | 381 | %------------------------------------------------------------------------------- 382 | \begin{frame} 383 | \frametitle{Runtimes} 384 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 385 | 386 | \vfill 387 | 388 | \begin{table} 389 | \begin{tabular}{cc} 390 | \toprule 391 | Implementation & Runtime (s) \\ 392 | \midrule 393 | Serial & 0.368 \\ 394 | Critical & 426.1 \\ 395 | \bottomrule 396 | \end{tabular} 397 | \end{table} 398 | 399 | Full implementation: \mintinline{bash}|pi_critical.f90|. 400 | 401 | \begin{center} 402 | \large Really slow! 403 | \end{center} 404 | 405 | \end{frame} 406 | 407 | %------------------------------------------------------------------------------- 408 | \section{Atomics} 409 | \begin{frame}[fragile] 410 | \frametitle{Atomics} 411 | A \mintinline{fortran}|critical| region protects a whole block of code. For a single operation, can use \mintinline{fortran}|atomic| instead. 412 | 413 | Atomic operations are with respect to the memory access of a scalar variable {\tt x}. 
414 | 415 | \begin{itemize} 416 | \item \mintinline{fortran}|read| for \mintinline{fortran}|v = x| 417 | \item \mintinline{fortran}|write| for \mintinline{fortran}|x = expr| 418 | \item \mintinline{fortran}|update| for \mintinline{fortran}|x = x op expr| 419 | \item \mintinline{fortran}|capture| for read and write/update. The result is retained: \mintinline{fortran}|x = x op expr; v = x| 420 | \end{itemize} 421 | 422 | Not specifying an atomic clause defaults to \mintinline{fortran}|update|. 423 | \end{frame} 424 | 425 | %------------------------------------------------------------------------------- 426 | \begin{frame}[fragile] 427 | \frametitle{Atomic pi} 428 | \begin{minted}[linenos,breaklines]{fortran} 429 | step = 1.0/num_steps 430 | !$omp parallel do private(x) 431 | do ii = 1, num_steps 432 | x = (ii-0.5)*step 433 | !$omp atomic 434 | sum = sum + (4.0/(1.0+x*x)) 435 | end do 436 | !$omp end parallel do 437 | pi = step * sum 438 | \end{minted} 439 | \end{frame} 440 | 441 | %------------------------------------------------------------------------------- 442 | \begin{frame} 443 | \frametitle{Runtimes} 444 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 445 | 446 | \vfill 447 | 448 | \begin{table} 449 | \begin{tabular}{cc} 450 | \toprule 451 | Implementation & Runtime (s) \\ 452 | \midrule 453 | Serial & 0.368 \\ 454 | Critical & 426.1 \\ 455 | Atomic & 8.3 \\ 456 | \bottomrule 457 | \end{tabular} 458 | \end{table} 459 | 460 | Full implementation: \mintinline{bash}|pi_atomic.f90|. 461 | 462 | \begin{center} 463 | \large Faster, but still slower than serial. 464 | \end{center} 465 | 466 | \end{frame} 467 | 468 | %------------------------------------------------------------------------------- 469 | \section{Avoiding critical regions} 470 | \begin{frame} 471 | \frametitle{Independent summation} 472 | \begin{itemize} 473 | \item Both methods cause threads to synchronise for every update to \mintinline{fortran}|sum|. 474 | \item But each thread could compute a partial sum independently, synchronising once to total at the end. 475 | \end{itemize} 476 | 477 | \vfill 478 | 479 | Make \mintinline{fortran}|sum| an array of length equal to the number of threads. 480 | \begin{itemize} 481 | \item Each thread stores its partial sum, and the array is totalled by the master thread serially at the end. 482 | \item As it's \emph{shared memory}, the \mintinline{fortran}|sum| array can be read just fine on the master rank. 483 | \end{itemize} 484 | \end{frame} 485 | 486 | %------------------------------------------------------------------------------- 487 | \begin{frame}[fragile] 488 | \frametitle{Independent summation} 489 | \begin{minted}[fontsize=\small,linenos,breaklines,frame=single]{fortran} 490 | step = 1.0/num_steps 491 | !$omp parallel private(x,tid) 492 | tid = omp_get_thread_num() 493 | sum(tid+1) = 0.0 494 | !$omp do 495 | do ii = 1, num_steps 496 | x = (ii-0.5)*step 497 | sum(tid+1) = sum(tid+1) + (4.0/(1.0+x*x)) 498 | !$omp flush(sum) 499 | end do 500 | !$omp end do 501 | !$omp end parallel 502 | do ii = 1, nthreads 503 | pi = pi + sum(ii) 504 | end do 505 | pi = pi * step 506 | \end{minted} 507 | \end{frame} 508 | 509 | %------------------------------------------------------------------------------- 510 | \begin{frame} 511 | \frametitle{Runtimes} 512 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 
513 | 514 | \vfill 515 | 516 | \begin{table} 517 | \begin{tabular}{cc} 518 | \toprule 519 | Implementation & Runtime (s) \\ 520 | \midrule 521 | Serial & 0.368 \\ 522 | Critical & 426.1 \\ 523 | Atomic & 8.3 \\ 524 | Array & 2.8 \\ 525 | \bottomrule 526 | \end{tabular} 527 | \end{table} 528 | 529 | Full implementation: \mintinline{bash}|pi_array.f90|. 530 | 531 | \begin{center} 532 | \large Fastest parallel version so far, but still slow. 533 | \end{center} 534 | 535 | \end{frame} 536 | 537 | %------------------------------------------------------------------------------- 538 | \section{False sharing} 539 | \begin{frame} 540 | \frametitle{False sharing} 541 | This code is susceptible to \emph{false sharing}. 542 | \begin{itemize} 543 | \item False sharing occurs when different threads update data on the same cache line. 544 | \item Cache system is coherent between cores, so data consistency must be maintained. 545 | \item The cache line is no longer up to date because another thread changed it (in their local cache). 546 | \item Therefore, cache line must be flushed to memory and reread into the other thread every time. 547 | \item This is an example of \emph{cache thrashing}. 548 | \item The performance is reduced as threads must wait for the cache lines to refresh. 549 | \end{itemize} 550 | \end{frame} 551 | 552 | %------------------------------------------------------------------------------- 553 | \begin{frame} 554 | \frametitle{Flush} 555 | \begin{itemize} 556 | \item The \mintinline{fortran}|flush()| construct ensures that the variables are consistent between the thread's memory and main memory. 557 | \item Don't want to go into complicated parts of the OpenMP memory model. 558 | \item In general, don't need to worry about this stuff. 559 | \item Without the flush, the write to memory will be lowered to after the loop, so false sharing only occurs once at the end. 560 | \item Here we use it to \emph{ensure} that false sharing occurs every time to highlight the performance hit. 561 | \end{itemize} 562 | \end{frame} 563 | 564 | %------------------------------------------------------------------------------- 565 | \begin{frame}[fragile] 566 | \frametitle{Firstprivate pi} 567 | Can use data sharing clauses to our advantage here: 568 | 569 | Give each thread a \emph{scalar} copy of \mintinline{fortran}|sum| to compute their partial sum, and reduce with only one critical (or atomic) region at the end. 570 | No false sharing, as value is just a single number (i.e.\ a register). 571 | \begin{minted}[linenos,breaklines,frame=single,fontsize=\footnotesize]{fortran} 572 | step = 1.0/num_steps 573 | !$omp parallel private(x) firstprivate(sum) 574 | !$omp do 575 | do ii = 1, num_steps 576 | x = (ii-0.5)*step 577 | sum = sum + (4.0/(1.0+x*x)) 578 | end do 579 | !$omp end do 580 | !$omp critical 581 | pi = pi + sum 582 | !$omp end critical 583 | !$omp end parallel 584 | pi = pi * step 585 | \end{minted} 586 | \end{frame} 587 | 588 | %------------------------------------------------------------------------------- 589 | \begin{frame} 590 | \frametitle{Runtimes} 591 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 
592 |
593 | \vfill
594 |
595 | \begin{table}
596 | \begin{tabular}{cc}
597 | \toprule
598 | Implementation & Runtime (s) \\
599 | \midrule
600 | Serial & 0.368 \\
601 | Critical & 426.1 \\
602 | Atomic & 8.3 \\
603 | Array & 2.8 \\
604 | First private & 0.104 \\
605 | \bottomrule
606 | \end{tabular}
607 | \end{table}
608 |
609 | Full implementation: \mintinline{bash}|pi_private.f90|.
610 |
611 | \begin{center}
612 | \large Finally faster than serial! Around 3.5X faster on 4 threads.
613 | \end{center}
614 |
615 | \end{frame}
616 |
617 | %-------------------------------------------------------------------------------
618 | \section{Reductions}
619 | \begin{frame}[fragile]
620 | \frametitle{Reductions}
621 | Much simpler to use the OpenMP \mintinline{fortran}|reduction| clause on a worksharing loop.
622 | Specify the operation and the variable.
623 | \begin{multicols}{2}
624 | \begin{itemize}
625 | \item \mintinline{fortran}|reduction(+:var)|
626 | \item \mintinline{fortran}|reduction(-:var)|
627 | \item \mintinline{fortran}|reduction(*:var)|
628 | \item \mintinline{fortran}|reduction(.and.:var)|
629 | \item \mintinline{fortran}|reduction(.or.:var)|
630 | \item \mintinline{fortran}|reduction(.eqv.:var)|
631 | \item \mintinline{fortran}|reduction(.neqv.:var)|
632 | \item \mintinline{fortran}|reduction(max:var)|
633 | \item \mintinline{fortran}|reduction(min:var)|
634 | \item \mintinline{fortran}|reduction(iand:var)|
635 | \item \mintinline{fortran}|reduction(ior:var)|
636 | \item \mintinline{fortran}|reduction(ieor:var)|
637 | \end{itemize}
638 | \end{multicols}
639 |
640 | Can also do array reductions. Each element of the array is treated as its own, separate, reduction.
641 | Similar to:
642 | \begin{minted}[breaklines]{fortran}
643 | MPI_Allreduce(MPI_IN_PLACE, arr, N, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
644 | \end{minted}
645 |
646 | \end{frame}
647 |
648 | %-------------------------------------------------------------------------------
649 | \begin{frame}[fragile]
650 | \frametitle{Pi reduction}
651 | Much simpler to write using the \mintinline{fortran}|reduction| clause --- just need a single directive:
652 | \begin{minted}[linenos,breaklines,frame=single]{fortran}
653 | step = 1.0/num_steps
654 | !$omp parallel do private(x) reduction(+:sum)
655 | do ii = 1, num_steps
656 | x = (ii-0.5)*step
657 | sum = sum + (4.0/(1.0+x*x))
658 | end do
659 | !$omp end parallel do
660 | pi = step * sum
661 | \end{minted}
662 |
663 | Full implementation: \mintinline{bash}|pi_reduction.f90|.
664 | \end{frame}
665 |
666 | %-------------------------------------------------------------------------------
667 | \begin{frame}
668 | \frametitle{Runtimes}
669 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads.
670 |
671 | \vfill
672 |
673 | \begin{table}
674 | \begin{tabular}{cc}
675 | \toprule
676 | Implementation & Runtime (s) \\
677 | \midrule
678 | Serial & 0.368 \\
679 | Critical & 426.1 \\
680 | Atomic & 8.3 \\
681 | Array & 2.8 \\
682 | First private & 0.104 \\
683 | Reduction & 0.095 \\
684 | \bottomrule
685 | \end{tabular}
686 | \end{table}
687 |
688 | \vfill
689 |
690 | Around 3.9X faster on 4 threads!
691 |
692 | \vfill
693 |
694 |
695 | \begin{block}{Recommendation}
696 | Use the \mintinline{fortran}|reduction| clause for reductions.
697 | \end{block} 698 | 699 | \end{frame} 700 | 701 | %------------------------------------------------------------------------------- 702 | \section{Exercise} 703 | \begin{frame}[fragile] 704 | \frametitle{Exercise} 705 | \begin{itemize} 706 | \item Start with your parallel 5-point stencil code from last time. 707 | \item Change the code to print out the total of the cells (excluding halo) every timestep. 708 | \item You'll need to implement a parallel reduction to do this. 709 | \item Try the different techniques shown to implement reductions: 710 | \begin{itemize} 711 | \item Critical sections. 712 | \item Atomics. 713 | \item Reduction clause. 714 | \end{itemize} 715 | \item Extension: there is also a Jacobi code to parallelise --- it needs a reduction too. 716 | \end{itemize} 717 | \begin{minted}[frame=single,breaklines]{fortran} 718 | do i = 1, nx 719 | do j = 1, ny 720 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 721 | total = total + Anew(i,j) 722 | end do 723 | end do 724 | \end{minted} 725 | \end{frame} 726 | 727 | %------------------------------------------------------------------------------- 728 | \begin{frame} 729 | \frametitle{Summary} 730 | \begin{itemize} 731 | \item Have now covered the most common parts of OpenMP. 732 | \item 80/20 rule: Most programs will only use what you know so far. 733 | \item OpenMP is deceptively simple! 734 | \item In the remaining sessions you'll learn to program OpenMP on NUMA and GPU architectures. 735 | \end{itemize} 736 | \end{frame} 737 | 738 | %------------------------------------------------------------------------------- 739 | \end{document} 740 | -------------------------------------------------------------------------------- /slides/01-paralleldo.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{1: Parallel worksharing} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | % \begin{frame} 14 | % \frametitle{The first exercise} 15 | % \begin{itemize} 16 | % \item At the end of this session, you will be able to parallelise a (simple) 5-point stencil code using OpenMP! 17 | % \item The other sessions provide you with details you might need for real world codes. 18 | % \end{itemize} 19 | % \end{frame} 20 | 21 | %------------------------------------------------------------------------------- 22 | \section{OpenMP introduction} 23 | \begin{frame} 24 | \frametitle{What is OpenMP?} 25 | 26 | A collection of compiler directives, library routines, and environment variables for parallelism for shared memory parallel programs. 27 | 28 | \begin{itemize} 29 | \item Create and manage parallel programs while permitting portability. 30 | \item User-directed parallelization. 31 | \end{itemize} 32 | 33 | A \emph{specification} of annotations you can make to your program in order to make it parallel. 34 | 35 | \end{frame} 36 | 37 | %------------------------------------------------------------------------------- 38 | \begin{frame}[fragile] 39 | \frametitle{Syntax} 40 | \begin{itemize} 41 | \item OpenMP mostly formed of \emph{compiler directives}\\ 42 | \begin{minted}{fortran} 43 | !$omp construct [clause [clause]...] 44 | \end{minted} 45 | These tell the compiler to insert some extra code on your behalf. 
46 | 47 | \item Compiler directives usually apply to a \emph{structured block} of statements. 48 | Limited scoping in Fortran means we often need to use \emph{end} directives. 49 | \begin{minted}{fortran} 50 | !$omp construct 51 | ... ! lines of Fortran code 52 | !$omp end construct 53 | \end{minted} 54 | 55 | \item Library API calls 56 | \begin{minted}{fortran} 57 | use omp_lib 58 | call omp_...() 59 | \end{minted} 60 | 61 | \end{itemize} 62 | \end{frame} 63 | 64 | %------------------------------------------------------------------------------- 65 | \subsection{Compiler flags} 66 | \begin{frame}[fragile] 67 | \frametitle{Building with OpenMP} 68 | 69 | Turn on OpenMP in the compiler: 70 | \begin{minted}{bash} 71 | gfortran *.f90 -fopenmp # GNU 72 | ifort *.f90 -qopenmp # Intel 73 | ftn *.f90 -homp # Cray (now off by default) 74 | pgf90 *.f90 -mp # PGI 75 | \end{minted} 76 | 77 | To also use the API calls within the code, use the library: 78 | \begin{minted}{fortran} 79 | USE omp_lib 80 | \end{minted} 81 | 82 | \begin{alertblock}{Note} 83 | No need to include the library if only using the compiler directives. 84 | The library only gets you the API calls. 85 | \end{alertblock} 86 | \end{frame} 87 | 88 | %------------------------------------------------------------------------------- 89 | \section{Memory and execution model} 90 | \begin{frame} 91 | \frametitle{Shared memory} 92 | OpenMP is for shared memory programming: all threads have access to a shared address space. 93 | 94 | A typical HPC node consisting of 2 multi-core CPUs. 95 | \begin{center} 96 | \begin{tikzpicture} 97 | % Draw 4 cores for socket 0 98 | \draw (0,0) rectangle (1,1); 99 | \draw (0.5,0.5) node {C0}; 100 | \draw (1,0) rectangle (2,1); 101 | \draw (1.5,0.5) node {C1}; 102 | \draw (0,1) rectangle (1,2); 103 | \draw (0.5,1.5) node {C2}; 104 | \draw (1,1) rectangle (2,2); 105 | \draw (1.5,1.5) node {C3}; 106 | \draw (1,-0.5) node {Socket 0}; 107 | 108 | % Draw 4 cores for socket 1 109 | \draw (3,0) rectangle (4,1); 110 | \draw (3.5,0.5) node {C0}; 111 | \draw (4,0) rectangle (5,1); 112 | \draw (4.5,0.5) node {C1}; 113 | \draw (3,1) rectangle (4,2); 114 | \draw (3.5,1.5) node {C2}; 115 | \draw (4,1) rectangle (5,2); 116 | \draw (4.5,1.5) node {C3}; 117 | \draw (4,-0.5) node {Socket 1}; 118 | 119 | % Draw large memory 120 | \draw (-0.5,3) rectangle (5.5,4); 121 | \draw (2.5,3.5) node {Memory}; 122 | 123 | % Connect sockets to memory 124 | \draw (1,2) -- (1,3); 125 | \draw (4,2) -- (4,3); 126 | \draw[dashed] (2,1) -- (3,1); % QPI 127 | 128 | % Show memory shared 129 | \pause 130 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 131 | \draw[->] (0.5,1.8) -- (0.7,3.2); 132 | \draw[->] (0.7,3.2) -- (4.5,0.8); 133 | 134 | \end{tikzpicture} 135 | \end{center} 136 | \emph{All} threads (each running on a core) can access the same memory. 137 | 138 | Different to MPI, where one process cannot see the memory of another without explicit communication. 139 | 140 | \end{frame} 141 | 142 | %------------------------------------------------------------------------------- 143 | \begin{frame} 144 | \frametitle{Fork-join model} 145 | Serial/sequential execution: 146 | \begin{center} 147 | \begin{tikzpicture} 148 | \draw[->] (0,0) -- (8,0); 149 | \end{tikzpicture} 150 | \end{center} 151 | 152 | \pause 153 | 154 | In a \emph{fork-join} model, code starts serial, \emph{forks} a \emph{team} of threads then \emph{joins} them back to serial execution. 
155 | \begin{center} 156 | \begin{tikzpicture} 157 | \draw (0,0) -- (1,0); 158 | 159 | % Fork 160 | \draw (1,0) -- (2,1.5); 161 | \draw (1,0) -- (2,0.5); 162 | \draw (1,0) -- (2,-0.5); 163 | \draw (1,0) -- (2,-1.5); 164 | \draw (1,-1) node {Fork}; 165 | 166 | % Run in parallel 167 | \draw (2,1.5) -- (5,1.5); 168 | \draw (2,0.5) -- (5,0.5); 169 | \draw (2,-0.5) -- (5,-0.5); 170 | \draw (2,-1.5) -- (5,-1.5); 171 | \draw (3.5,0) node {Parallel execution}; 172 | 173 | % Join 174 | \draw (5,1.5) -- (6,0); 175 | \draw (5,0.5) -- (6,0); 176 | \draw (5,-0.5) -- (6,0); 177 | \draw (5,-1.5) -- (6,0); 178 | \draw (6,-1) node {Join}; 179 | 180 | % Serial end 181 | \draw[->] (6,0) -- (8,0); 182 | \end{tikzpicture} 183 | \end{center} 184 | 185 | Nested threads are allowed, where a thread forks its own team of threads. 186 | \end{frame} 187 | 188 | %------------------------------------------------------------------------------- 189 | \section{Going parallel} 190 | \begin{frame}[fragile] 191 | \frametitle{Creating OpenMP threads} 192 | \begin{minted}[frame=single, linenos]{fortran} 193 | program hello 194 | 195 | !$omp parallel 196 | print *, "Hello" 197 | !$omp end parallel 198 | 199 | end program hello 200 | \end{minted} 201 | 202 | Threads \emph{redundantly} execute code in the block. 203 | 204 | Each thread will output \mintinline{bash}|Hello|. 205 | 206 | Threads are synchronised at the end of the parallel region. 207 | 208 | \end{frame} 209 | 210 | %------------------------------------------------------------------------------- 211 | % \begin{frame}[fragile] 212 | % \frametitle{Pthreads} 213 | 214 | % \begin{minted}[fontsize=\small, linenos, frame=single]{fortran} 215 | % program hello 216 | % use fpthread 217 | % integer :: i, err 218 | % integer :: N = 4 219 | % type(fpthread_t) :: Tide(N) 220 | 221 | % do i = 1, N 222 | % call fpthread_create(tid(i), NULL, run, NULL, err) 223 | % end do 224 | % do i = 1, N 225 | % call fpthread_join(tid(i), NULL, err) 226 | % end do 227 | 228 | % subroutine run 229 | % print *, "Hello" 230 | % end subroutine run 231 | % end program hello 232 | % \end{minted} 233 | 234 | % \end{frame} 235 | 236 | % %------------------------------------------------------------------------------- 237 | % \begin{frame} 238 | % \frametitle{OpenMP and Pthreads} 239 | % \begin{itemize} 240 | % \item Pthreads is very error prone and verbose. 241 | % \item The OpenMP \mintinline{fortran}|!$omp parallel| abstracts this away. 242 | % \item The compiler directive inserts this extra code on your behalf. 243 | % \item Pthreads requires wrapping up your parallel work in subroutines. 244 | % \begin{itemize} 245 | % \item Kernels are a useful abstraction used in many programming models. 246 | % \end{itemize} 247 | % \item OpenMP much more convenient for \emph{incrementally} adding parallelism to your code. 248 | % \end{itemize} 249 | % \end{frame} 250 | 251 | %------------------------------------------------------------------------------- 252 | \begin{frame}[fragile] 253 | \frametitle{Setting number of threads} 254 | You might need to set the number of threads to launch (though typically you'll leave OpenMP to set the number of threads for you at run-time). 
255 | 256 | OpenMP has 3 ways to do this: 257 | \begin{itemize} 258 | \item Environment variables 259 | \begin{minted}{bash} 260 | OMP_NUM_THREADS=16 261 | \end{minted} 262 | 263 | \item API calls 264 | \begin{minted}{fortran} 265 | call omp_set_num_threads(16) 266 | \end{minted} 267 | 268 | \item Clauses 269 | \begin{minted}{fortran} 270 | !$omp parallel num_threads(16) 271 | !$omp end parallel 272 | \end{minted} 273 | \end{itemize} 274 | 275 | In general it's better to use environment variables if you need to do this, as this approach gives you more flexibility at runtime. 276 | \end{frame} 277 | 278 | %------------------------------------------------------------------------------- 279 | \begin{frame}[fragile] 280 | \frametitle{Thread API calls} 281 | Parallel programs often written in a SPMD style: \newline 282 | {\bf S}ingle {\bf P}rogram, {\bf M}ultiple {\bf D}ata. 283 | \begin{itemize} 284 | \item MPI has a SPMD model. 285 | \item Threads run the same code, and use their ID to work out which data to operate on. 286 | \end{itemize} 287 | 288 | The OpenMP API gives you calls to determine thread information when \emph{inside} a parallel region: 289 | \begin{itemize} 290 | \item Get number of threads 291 | \begin{minted}{fortran} 292 | nthreads = omp_get_num_threads() 293 | \end{minted} 294 | 295 | \item Get thread ID 296 | \begin{minted}{fortran} 297 | tid = omp_get_thread_num() 298 | \end{minted} 299 | 300 | \end{itemize} 301 | \end{frame} 302 | 303 | %------------------------------------------------------------------------------- 304 | \section{Example: vector addition} 305 | \begin{frame}[fragile] 306 | \frametitle{Vector add} 307 | Walkthrough parallelising vector addition using OpenMP. 308 | 309 | \begin{minted}[fontsize=\footnotesize,linenos,frame=single]{fortran} 310 | program vecadd 311 | integer :: N = 1024 ! Length of array 312 | ! Arrays 313 | real(kind=8), allocatable, dimension(:) :: A, B, C 314 | integer :: i ! Loop counter 315 | 316 | ! Allocate and initialise vectors 317 | allocate(A(N), B(N), C(N)) 318 | A = 1.0; B = 2.0; C = 0.0 319 | 320 | ! Vector add 321 | do i = 1, N 322 | C(i) = A(i) + B(i) 323 | end do 324 | 325 | deallocate(A,B,C) 326 | end program vecadd 327 | \end{minted} 328 | \end{frame} 329 | 330 | %------------------------------------------------------------------------------- 331 | \begin{frame}[fragile] 332 | \frametitle{Vector add: Step 1} 333 | Add parallel region around work 334 | \begin{minted}[frame=single]{fortran} 335 | !$omp parallel 336 | do i = 1, N 337 | C(i) = A(i) + B(i) 338 | end do 339 | !$omp end parallel 340 | \end{minted} 341 | Every thread will now do the entire vector addition --- redundantly! 342 | \end{frame} 343 | 344 | %------------------------------------------------------------------------------- 345 | \begin{frame}[fragile] 346 | \frametitle{Vector add: Step 2} 347 | Get thread IDs 348 | \begin{minted}[fontsize=\small,frame=single]{fortran} 349 | integer :: tid, nthreads 350 | 351 | !$omp parallel 352 | tid = omp_get_thread_num() 353 | nthreads = omp_get_num_threads() 354 | 355 | do i = 1, N 356 | C(i) = A(i) + B(i) 357 | end do 358 | !$omp end parallel 359 | \end{minted} 360 | 361 | \pause 362 | \begin{alertblock}{Incorrect behaviour at runtime} 363 | What's the problem here? 
364 | \end{alertblock} 365 | \end{frame} 366 | 367 | %------------------------------------------------------------------------------- 368 | \begin{frame}[fragile] 369 | \frametitle{Vector add: Step 2, take 2} 370 | 371 | \begin{itemize} 372 | \item In OpenMP, all variables are \emph{shared} between threads. 373 | \item But each thread needs its own copy of \mintinline{fortran}|tid|. 374 | \item Solution: use the \mintinline{fortran}|private| clause on the \mintinline{fortran}|parallel| region. 375 | \item This gives each thread its own unique copy in memory for the variable. 376 | \end{itemize} 377 | 378 | \begin{minted}[fontsize=\small,frame=single]{fortran} 379 | integer :: tid, nthreads 380 | 381 | !$omp parallel private(tid) 382 | tid = omp_get_thread_num() 383 | nthreads = omp_get_num_threads() 384 | 385 | do i = 1, N 386 | C(i) = A(i) + B(i) 387 | end do 388 | !$omp end parallel 389 | \end{minted} 390 | Much more information about data sharing clauses in next session. 391 | \end{frame} 392 | 393 | %------------------------------------------------------------------------------- 394 | \begin{frame}[fragile] 395 | \frametitle{Vector add: Step 3} 396 | Finally, distribute the iteration space across the threads. 397 | \begin{minted}[frame=single]{fortran} 398 | integer :: tid, nthreads 399 | 400 | !$omp parallel private(tid) 401 | tid = omp_get_thread_num() 402 | nthreads = omp_get_num_threads() 403 | 404 | do i = 1+(tid*N/nthreads), (tid+1)*N/nthreads 405 | C(i) = A(i) + B(i) 406 | end do 407 | !$omp end parallel 408 | \end{minted} 409 | \begin{block}{Remember} 410 | Thread IDs are numbered from 0 in OpenMP. 411 | Be careful with your index calculation. 412 | \end{block} 413 | \end{frame} 414 | 415 | %------------------------------------------------------------------------------- 416 | \begin{frame}[fragile] 417 | \frametitle{Barriers} 418 | A barrier simply synchronises threads in a parallel region. 419 | 420 | \begin{minted}[frame=single,linenos]{fortran} 421 | !$omp parallel private(tid) 422 | 423 | tid = omp_get_thread_num() 424 | A(tid) = big_work1(tid) 425 | 426 | !$omp barrier 427 | 428 | B(tid) = big_work2(A, tid) 429 | 430 | !$omp end parallel 431 | \end{minted} 432 | 433 | \begin{itemize} 434 | \item Running in parallel, need to compute \mintinline{fortran}|A(:)| before computing \mintinline{fortran}|B(:)|. 435 | \item The barrier ensures all threads wait between these statements. 436 | \item Must ensure all threads encounter the barrier. 437 | \end{itemize} 438 | 439 | \end{frame} 440 | 441 | 442 | %------------------------------------------------------------------------------- 443 | \section{Worksharing} 444 | \begin{frame}[fragile] 445 | \frametitle{Worksharing} 446 | 447 | \begin{itemize} 448 | \item The SPMD approach requires lots of bookkeeping. 449 | \item Common pattern of splitting loop iterations between threads. 450 | \item OpenMP has worksharing constructs to help with this. 451 | \item Used within a parallel region. 452 | \item The loop iterator is made \mintinline{fortran}|private| by default: no need for data sharing clause. 453 | \end{itemize} 454 | 455 | \begin{minted}[frame=single]{fortran} 456 | !$omp parallel 457 | !$omp do 458 | do i = 1, N 459 | C(i) = A(i) + B(i) 460 | end do 461 | !$omp end do 462 | !$omp end parallel 463 | \end{minted} 464 | 465 | Implicit synchronisation point at the \mintinline{fortran}|!$omp end do|. 
466 | 467 | \end{frame} 468 | 469 | %------------------------------------------------------------------------------- 470 | \begin{frame}[fragile] 471 | \frametitle{Combined worksharing directives} 472 | Generally it's convenient to combine the directives: 473 | \begin{minted}[frame=single]{fortran} 474 | !$omp parallel do 475 | do i = 1, N 476 | ... ! loop body 477 | end do 478 | !$omp end parallel do 479 | \end{minted} 480 | 481 | \begin{itemize} 482 | \item This starts a parallel region, forking some threads. 483 | \item Each thread then gets a portion of the iteration space and computes the loop body in parallel. 484 | \item Implicit synchronisation point at the \mintinline{fortran}|end do|. 485 | \item Threads finally join again; later code executes sequentially. 486 | \end{itemize} 487 | \end{frame} 488 | 489 | %------------------------------------------------------------------------------- 490 | \begin{frame} 491 | \frametitle{Vector add code} 492 | The vector add codes are available in the repository for you to look at: 493 | \begin{itemize} 494 | \item Serial: \mintinline{bash}|vadd.f90| 495 | \item SPMD: \mintinline{bash}|vadd_spmd.f90| 496 | \item Worksharing: \mintinline{bash}|vadd_paralleldo.f90| 497 | \end{itemize} 498 | \end{frame} 499 | 500 | %------------------------------------------------------------------------------- 501 | \section{Loops} 502 | \begin{frame}[fragile] 503 | \frametitle{Nested loops} 504 | \begin{itemize} 505 | \item Often have tightly nested loops in your code. 506 | \item E.g. 2D grid code, every cell is independent. 507 | \item OpenMP worksharing would only parallelise over first loop with each thread performing inner loop serially. 508 | \item Use the \mintinline{fortran}|collapse(...)| clause to combine iteration spaces. 509 | \item OpenMP then workshares the combined iteration space. 510 | \end{itemize} 511 | 512 | \begin{minted}[frame=single]{fortran} 513 | !$omp parallel do collapse(2) 514 | do i = 1, N 515 | do j = 1, N 516 | ... ! loop body 517 | end do 518 | end do 519 | !$omp end parallel do 520 | \end{minted} 521 | All $N^2$ iterations are distributed across threads, rather than just the $N$ of the outer loop. 522 | 523 | \end{frame} 524 | 525 | 526 | %------------------------------------------------------------------------------- 527 | % \begin{frame} 528 | % \frametitle{Nested loops} 529 | % \begin{block}{Performance note} 530 | % Collapsing loops may subtly effect the compiler's knowledge about alignment and could affect vectorisation. 531 | % More on this when we talk about vectorisation in a later session. 532 | % \end{block} 533 | 534 | % \end{frame} 535 | 536 | %------------------------------------------------------------------------------- 537 | % \begin{frame} 538 | % \frametitle{Taking stock} 539 | % \begin{itemize} 540 | % \item We've seen how to parallelise a simple program using OpenMP. 541 | % \item Shown the MPI-style SPMD approach for dividing work. 542 | % \item OpenMP worksharing constructs make this easier. 543 | % \end{itemize} 544 | 545 | % The rest of this session: 546 | % \begin{itemize} 547 | % \item Expands on the worksharing constructs. 548 | % \item The first example for you to try. 549 | % \end{itemize} 550 | 551 | % Then, onto the rest of the OpenMP common core. 
552 | % \end{frame} 553 | 554 | %------------------------------------------------------------------------------- 555 | \section{Exercise} 556 | \begin{frame} 557 | \frametitle{5-point stencil exercise} 558 | First exercise: parallelise a simple 5-point stencil code using OpenMP. 559 | 560 | \begin{center} 561 | \begin{tikzpicture} 562 | \draw[step=1cm,gray,very thin] (-1.9,-1.9) grid (2.9,2.9); 563 | \draw[fill=black] (0.5,0.5) circle (0.1cm); 564 | \draw[fill=black] (-0.5,0.5) circle (0.1cm); 565 | \draw[fill=black] (1.5,0.5) circle (0.1cm); 566 | \draw[fill=black] (0.5,1.5) circle (0.1cm); 567 | \draw[fill=black] (0.5,-0.5) circle (0.1cm); 568 | \draw (-0.5,0.5) -- (1.5,0.5); 569 | \draw (0.5,-0.5) -- (0.5,1.5); 570 | \end{tikzpicture} 571 | \end{center} 572 | 573 | Value in every cell is set to the average of its neighbours. 574 | \end{frame} 575 | 576 | %------------------------------------------------------------------------------- 577 | \begin{frame}[fragile] 578 | \frametitle{5-point stencil exercise} 579 | Take \mintinline{bash}|stencil.f90| and parallelise it using OpenMP: 580 | \begin{enumerate} 581 | \item Using a SPMD style. 582 | \item Using the OpenMP worksharing clauses. 583 | \item Vary the number of threads using \mintinline{bash}|OMP_NUM_THREADS|. 584 | \end{enumerate} 585 | 586 | Focus on parallelising the main loop(s): 587 | \begin{minted}[frame=single,breaklines]{fortran} 588 | do i = 1, nx 589 | do j = 1, ny 590 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 591 | end do 592 | end do 593 | \end{minted} 594 | 595 | Sample solutions are provided, but do try it yourself first. 596 | 597 | \end{frame} 598 | 599 | 600 | %------------------------------------------------------------------------------- 601 | \section{Scheduling} 602 | \begin{frame}[fragile] 603 | \frametitle{The Schedule clause} 604 | \begin{itemize} 605 | \item The worksharing clauses use default rules for assigning iterations to threads. 606 | \item Can use the \mintinline{fortran}|schedule| clause to specify the distribution. 607 | \item General format: 608 | \begin{minted}{fortran} 609 | !$omp parallel do schedule(...) 610 | \end{minted} 611 | \end{itemize} 612 | Next slides go through the options, using the following loop as an example: 613 | \begin{minted}[frame=single]{fortran} 614 | !$omp parallel do num_threads(4) 615 | do i = 1, 100 616 | ... ! loop body 617 | end do 618 | !$omp end parallel do 619 | \end{minted} 620 | 621 | \end{frame} 622 | 623 | %------------------------------------------------------------------------------- 624 | \begin{frame}[fragile] 625 | \frametitle{Static schedule} 626 | \begin{minted}{fortran} 627 | schedule(static) 628 | schedule(static,16) 629 | \end{minted} 630 | 631 | \begin{itemize} 632 | \item Static schedule divides iterations into chunks and assigns chunks to threads in round-robin. 633 | \item If no chunk size specified, iteration space divided roughly equally. 
634 | \end{itemize}
635 | For our example loop:
636 | \begin{columns}
637 | \begin{column}{0.5\textwidth}
638 | \mintinline{fortran}|schedule(static)|
639 | \begin{tabular}{cc}
640 | \toprule
641 | Thread ID & Iterations \\
642 | \midrule
643 | 0 & 1--25 \\
644 | 1 & 26--50 \\
645 | 2 & 51--75 \\
646 | 3 & 76--100 \\
647 | \bottomrule
648 | \end{tabular}
649 | \end{column}
650 |
651 | \begin{column}{0.5\textwidth}
652 | \mintinline{fortran}|schedule(static,16)|
653 | \begin{tabular}{cc}
654 | \toprule
655 | Thread ID & Iterations \\
656 | \midrule
657 | 0 & 1--16, 65--80 \\
658 | 1 & 17--32, 81--96 \\
659 | 2 & 33--48, 97--100 \\
660 | 3 & 49--64 \\
661 | \bottomrule
662 | \end{tabular}
663 | \end{column}
664 | \end{columns}
665 |
666 | \end{frame}
667 |
668 | %-------------------------------------------------------------------------------
669 | \begin{frame}[fragile]
670 | \frametitle{Dynamic schedule}
671 | \begin{minted}{fortran}
672 | schedule(dynamic)
673 | schedule(dynamic,16)
674 | \end{minted}
675 |
676 | \begin{itemize}
677 | \item Iteration space is divided into chunks according to chunk size.
678 | \item If no chunk size specified, default size is one.
679 | \item Each thread requests and executes a chunk, until no more chunks remain.
680 | \item Useful for unbalanced workloads if some threads complete work faster.
681 | \end{itemize}
682 |
683 | For our example with a chunk size of 16:
684 | \begin{itemize}
685 | \item The iteration space is split into chunks of 16 (the last chunk may be smaller).
686 | \item Each thread gets one chunk, then requests a new chunk to work on.
687 | \end{itemize}
688 |
689 | \end{frame}
690 |
691 | %-------------------------------------------------------------------------------
692 | \begin{frame}[fragile]
693 | \frametitle{Guided schedule}
694 | \begin{minted}{fortran}
695 | schedule(guided)
696 | schedule(guided,16)
697 | \end{minted}
698 |
699 | \begin{itemize}
700 | \item Similar to the \mintinline{fortran}|dynamic| schedule, except the chunk size decreases over time.
701 | \item Granularity of work chunks gets finer over time.
702 | \item If no chunk size is specified, the default size is one.
703 | \item Useful to try to mitigate overheads of a \mintinline{fortran}|dynamic| schedule by starting with large chunks of work.
704 | \end{itemize}
705 |
706 | For our example with a chunk size of 16:
707 | \begin{itemize}
708 | \item Each thread gets a chunk of 16 to work on.
709 | \item Each thread requests a new chunk, which might be smaller than 16.
710 | \end{itemize}
711 |
712 | \end{frame}
713 |
714 | %-------------------------------------------------------------------------------
715 | \begin{frame}[fragile]
716 | \frametitle{Other schedules}
717 | \begin{minted}{fortran}
718 | schedule(auto)
719 | \end{minted}
720 | \begin{itemize}
721 | \item Let the compiler or runtime choose the schedule.
722 | \end{itemize}
723 |
724 | \vfill
725 |
726 | \begin{minted}{fortran}
727 | schedule(runtime)
728 | \end{minted}
729 | \begin{itemize}
730 | \item Get the schedule from the \mintinline{bash}|OMP_SCHEDULE| environment variable.
731 | \end{itemize}
732 |
733 | \begin{block}{Recommendation}
734 | Just use a \mintinline{fortran}|static| schedule unless there is a good reason not to!
735 | \mintinline{fortran}|static| is usually the fastest of all the options.
736 | The choice of schedules is an advanced tuning option.
737 | \end{block} 738 | 739 | \end{frame} 740 | 741 | %------------------------------------------------------------------------------- 742 | \section{Synchronisation} 743 | \begin{frame}[fragile] 744 | \frametitle{The nowait clause} 745 | \begin{itemize} 746 | \item May have series of loops in your code which are independent. 747 | \item Threads must wait/synchronise at the end of the loop.% \mintinline{fortran}|!$omp end do|. 748 | \item But it might be possible to delay this synchronisation using the \mintinline{fortran}|nowait| clause. 749 | \item When a thread finishes the first loop, it starts on the next loop. 750 | \end{itemize} 751 | 752 | \begin{minted}[fontsize=\small, linenos, frame=single]{fortran} 753 | !$omp parallel 754 | !$omp do nowait 755 | do i = 1, N 756 | A(i) = i 757 | end do 758 | !$omp end do ! No barrier! 759 | !$omp do 760 | do i = 1, N 761 | B(i) = i 762 | end do 763 | !$omp end do ! Implicit barrier 764 | !$omp end parallel ! Implicit barrier 765 | \end{minted} 766 | \end{frame} 767 | 768 | %------------------------------------------------------------------------------- 769 | % \begin{frame} 770 | % \frametitle{Synchronisation} 771 | % A number of ways to synchronise the threads in OpenMP: 772 | % \begin{multicols}{2} 773 | % \begin{itemize} 774 | % \item Barriers 775 | % \item Critical 776 | % \item Atomics 777 | % \item Locks 778 | % \item Ordered 779 | % \item Single 780 | % \item Master 781 | % \item Flush 782 | % \end{itemize} 783 | % \end{multicols} 784 | 785 | % \vfill 786 | 787 | % \begin{itemize} 788 | % \item Will look at Critical and Atomic in Session 2. 789 | % \item Ordered, Single and Master in Session 6. 790 | % \item Won't formally cover Flush and Locks --- advanced stuff with esoteric use cases. 791 | % \end{itemize} 792 | 793 | % % Quickly cover barriers now. 794 | 795 | % \end{frame} 796 | 797 | %------------------------------------------------------------------------------- 798 | % \section{Miscellaneous} 799 | % \begin{frame}[fragile] 800 | % \frametitle{Nested threads} 801 | % \begin{itemize} 802 | % \item Turn on support with by setting the environment variable \mintinline{fortran}|OMP_NESTED=true|, otherwise inner region is default serial. 803 | % \item Every thread in the (outer) parallel region then spawns threads. 804 | % \item Control the number of threads with clauses or environment variable: \mintinline{bash}|OMP_NUM_THREADS=4,2|. 805 | % \end{itemize} 806 | 807 | % \begin{minted}[frame=single]{fortran} 808 | % !$omp parallel num_threads(4) 809 | % ... ! A parallel region 810 | % !$omp parallel num_threads(4) 811 | % ... ! Inner parallel region 812 | % !$omp end parallel 813 | % !$omp end parallel 814 | % \end{minted} 815 | 816 | % \end{frame} 817 | 818 | 819 | % %------------------------------------------------------------------------------- 820 | % \begin{frame} 821 | % \frametitle{Nested threads} 822 | % \begin{alertblock}{Warning!} 823 | % Be careful how you use nesting threads. 824 | % It's very easy to oversubscribe threads. 825 | % Thread affinity can be tricky. 826 | % You probably don't need to use nested threads! 827 | % \end{alertblock} 828 | % \end{frame} 829 | 830 | %------------------------------------------------------------------------------- 831 | % \begin{frame}[fragile] 832 | % \frametitle{Multi-line directives} 833 | % \begin{itemize} 834 | % \item Sometimes OpenMP directives can be quite long. 
835 | % \item Nicer to split up the directive across lines in the source file using line continuation character \mintinline{fortran}|&|: 836 | % \end{itemize} 837 | 838 | % \begin{minted}{fortran} 839 | % !$omp construct & 840 | % !$omp& clause 841 | % !$omp& clause 842 | % \end{minted} 843 | 844 | % \end{frame} 845 | 846 | %------------------------------------------------------------------------------- 847 | % \begin{frame} 848 | % \frametitle{Summary} 849 | % This section introduced the OpenMP programming model: 850 | % \begin{itemize} 851 | % \item Creating parallel regions: \mintinline{fortran}|!$omp parallel|/\mintinline{fortran}|!$omp end parallel| 852 | % \item Getting thread IDs: \mintinline{fortran}|omp_get_thread_num()|/\mintinline{fortran}|omp_get_num_threads()| 853 | % \item Worksharing constructs: \mintinline{fortran}|!$omp do|/\mintinline{fortran}|!$omp end do| 854 | % \item The \mintinline{fortran}|schedule| and \mintinline{fortran}|nowait| clauses 855 | % \item Synchronising threads with barriers: \mintinline{fortran}|!$omp barrier| 856 | % \end{itemize} 857 | % \end{frame} 858 | 859 | %------------------------------------------------------------------------------- 860 | % \begin{frame} 861 | % \frametitle{Resources} 862 | % \begin{itemize} 863 | % \item OpenMP website: \url{https://www.openmp.org} 864 | % \begin{itemize} 865 | % \item The specification (not for the faint hearted). 866 | % \item Download summary cards. 867 | % \item List of compiler support. 868 | % \item Example code for all the directives. 869 | % \item List of books: \url{https://www.openmp.org/resources/openmp-books/} 870 | % \end{itemize} 871 | % 872 | % \item cOMPunity 873 | % \begin{itemize} 874 | % \item \url{http://www.compunity.org} 875 | % \end{itemize} 876 | % 877 | % \item Online tutorials: 878 | % \begin{itemize} 879 | % \item Tim Mattson's YouTube tutorial: \url{https://youtu.be/nE-xN4Bf8XI} 880 | % \item SC'08 tutorial from Tim Mattson and Larry Meadows: \url{https://openmp.org/mp-documents/omp-hands-on-SC08.pdf} 881 | % \item From Lawrence Livermore National Lab: \url{https://computing.llnl.gov/tutorials/openMP/} 882 | % \end{itemize} 883 | % 884 | % \end{itemize} 885 | % 886 | % \end{frame} 887 | 888 | %------------------------------------------------------------------------------- 889 | 890 | \end{document} 891 | -------------------------------------------------------------------------------- /slides/03-opt.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{3: Vectorisation and optimisations} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \section{Outline} 14 | \begin{frame} 15 | \frametitle{Outline} 16 | Now you know how to parallelise programs using OpenMP, how do you write fast programs in OpenMP? 
17 | 
18 | \begin{itemize}
19 | \item The cache hierarchy
20 | \item Performance analysis
21 | \item Vectorisation
22 | \item Array of structures vs Structure of arrays
23 | \item Memory access patterns
24 | \item Memory alignment
25 | \end{itemize}
26 | \end{frame}
27 | 
28 | %-------------------------------------------------------------------------------
29 | \section{Recap}
30 | \begin{frame}
31 | \frametitle{Recap}
32 | 
33 | \begin{itemize}
34 | \item Data sharing clauses:
35 | \begin{itemize}
36 | \item \mintinline{fortran}|shared|, \mintinline{fortran}|private|, \mintinline{fortran}|firstprivate|, \mintinline{fortran}|lastprivate|
37 | \end{itemize}
38 | 
39 | \item Atomics and \mintinline{fortran}|critical| regions
40 | 
41 | \item False sharing and cache thrashing
42 | 
43 | \item Reductions with the \mintinline{fortran}|reduction| clause
44 | \end{itemize}
45 | 
46 | Combined with the \mintinline{fortran}|parallel| and worksharing constructs from before, we've covered the OpenMP ``common core''.
47 | 
48 | \end{frame}
49 | 
50 | 
51 | %-------------------------------------------------------------------------------
52 | \begin{frame}[fragile]
53 | \frametitle{Previous exercise}
54 | 
55 | Take your parallel 5-point stencil, and implement a reduction:
56 | \begin{minted}[frame=single,breaklines,fontsize=\small]{fortran}
57 | total = 0.0
58 | !$omp parallel do collapse(2) reduction(+:total)
59 | do i = 1, nx
60 |   do j = 1, ny
61 |     Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0
62 |     total = total + Atmp(i,j)
63 |   end do
64 | end do
65 | !$omp end parallel do
66 | \end{minted}
67 | 
68 | \begin{itemize}
69 | \item Well done if you managed this!
70 | \item 5-point stencil is simple, but captures the \emph{essence} of more complicated codes.
71 | \item Extension: did anyone try parallelising the Jacobi solver?
72 | \end{itemize}
73 | 
74 | \end{frame}
75 | 
76 | %-------------------------------------------------------------------------------
77 | \section{The cache hierarchy}
78 | \begin{frame}
79 | \frametitle{Cache hierarchy}
80 | \begin{center}
81 | \begin{adjustbox}{max width={.8\textwidth}}
82 | \begin{tikzpicture}
83 | % Triangle
84 | \draw (-3,0) -- (0,4.5) -- (3,0) -- (-3,0);
85 | 
86 | \node at (4,4) {For a Skylake processor};
87 | \node at (0,3.5) {L1};
88 | \node at (4,3.5) {4~cycles};
89 | 
90 | \draw[dashed] (-1,3) -- (1,3);
91 | \node at (0,2.5) {L2};
92 | \node at (4,2.5) {12~cycles};
93 | 
94 | \draw[dashed] (-1.67,2) -- (1.67,2);
95 | \node at (0,1.5) {L3};
96 | \node at (4,1.5) {$\sim$ 44~cycles};
97 | 
98 | \draw[dashed] (-2.337,1) -- (2.337,1);
99 | \node at (0,0.5) {DRAM};
100 | \node at (6,0.5) {$\sim$ 90~ns ($\sim$ 200 cycles @ 2.2~GHz)};
101 | \end{tikzpicture}
102 | \end{adjustbox}
103 | \end{center}
104 | 
105 | \begin{itemize}
106 | \item Most integer and floating point operations are single cycle.
107 | \item Memory access is relatively slow.
108 | \item Moving memory between nodes is hugely expensive: $\sim 3~\mu s$.
109 | \item How long is a nanosecond? 11.8 inches --- Grace Hopper: \url{https://youtu.be/JEpsKnWZrJ8}.
110 | \item Therefore very easy to become bound by memory movement.
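\item (Putting those numbers together: in the $\sim$200 cycles of a single DRAM access, a core could have issued $\sim$200 single-cycle arithmetic operations.)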
111 | 
112 | \end{itemize}
113 | \end{frame}
114 | 
115 | %-------------------------------------------------------------------------------
116 | \begin{frame}
117 | \frametitle{Cache bandwidth}
118 | 
119 | Graph of aggregate cache bandwidth on different architectures:
120 | \begin{center}
121 | \includegraphics[width=0.8\textwidth]{cache_bandwidth}
122 | \end{center}
123 | 
124 | \begin{itemize}
125 | \item Clear cliff edges at the cache capacity sizes (the working set is 3 times the x-axis value).
126 | \item As with latency: the levels closer to the core deliver more bandwidth.
127 | \end{itemize}
128 | 
129 | 
130 | \footnotetext[1]{{\Tiny Deakin, T., Price, J., and McIntosh-Smith, S. (2017). Portable Methods for Measuring Cache Hierarchy Performance (poster).\\In Supercomputing. Denver, CO.}}
131 | 
132 | \end{frame}
133 | 
134 | %-------------------------------------------------------------------------------
135 | \begin{frame}[fragile]
136 | \frametitle{Streaming data}
137 | 
138 | STREAM Triad kernel:
139 | 
140 | \begin{minted}[frame=single]{fortran}
141 | !$omp parallel do
142 | do i = 1, N
143 |   a(i) = b(i) + scalar * c(i)
144 | end do
145 | !$omp end parallel do
146 | \end{minted}
147 | 
148 | \begin{itemize}
149 | \item Where \mintinline{fortran}|N| is large, the arrays exceed cache capacity.
150 | \item This kernel has \emph{no} data reuse: data items are read or written once, then never used again.
151 | \item Example of a \emph{streaming} data access pattern.
152 | \item Performance is then bound by main memory bandwidth.
153 | \end{itemize}
154 | 
155 | \end{frame}
156 | 
157 | %-------------------------------------------------------------------------------
158 | \section{Performance analysis}
159 | \begin{frame}
160 | \frametitle{Performance analysis}
161 | \begin{itemize}
162 | \item Optimisations can help code go faster, but how do you know when it's performing \emph{well}?
163 | \item Helpful to think about characteristics of the algorithm:
164 | \begin{itemize}
165 | \item Algorithmic complexity for compute.
166 | \item Algorithmic complexity for data movement.
167 | \end{itemize}
168 | \item Examples:
169 | \begin{itemize}
170 | \item Vector-vector operations are $O(n)$ and matrix-vector is $O(n^2)$, for both compute and data movement.
171 | \item Matrix-matrix multiply is $O(n^3)$ for compute and $O(n^2)$ for data movement.
172 | \item Matrix multiplication becomes \emph{compute bound} at large enough $n$, but the other examples remain memory bandwidth bound.
173 | \end{itemize}
174 | \end{itemize}
175 | \end{frame}
176 | 
177 | %-------------------------------------------------------------------------------
178 | \begin{frame}
179 | \frametitle{Rate limiting factors}
180 | \begin{itemize}
181 | \item Most HPC codes are \emph{memory bandwidth bound}.
182 | \item A few are \emph{compute bound}.
183 | \item Other possibilities:
184 | \begin{itemize}
185 | \item Network bound (e.g. MPI communication).
186 | \item I/O bound (e.g. writing to the filesystem).
187 | \item Memory latency bound.
188 | \item Memory capacity bound.
189 | \item \dots
190 | \end{itemize}
191 | \item Worth thinking about the bound for your own code.
192 | \item Arithmetic (integer and floating point) is very cheap: typically a single cycle.
193 | \item Division, transcendentals, exponentials/logs are relatively slow.
194 | \item Load/store is 2--3 times slower than an arithmetic operation, even if it's an L1 cache hit (the best case).
195 | \item Consider the ratio of bytes moved vs. floating point operations.
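\item Worked example with the STREAM Triad kernel from earlier: each iteration moves 24 bytes (two FP64 loads and one store) but performs only 2 floating point operations --- a strong hint that it is memory bandwidth bound.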
196 | \end{itemize}
197 | \end{frame}
198 | %-------------------------------------------------------------------------------
199 | \begin{frame}
200 | \frametitle{Computational intensity}
201 | \begin{itemize}
202 | \item The ratio of floating point operations (FLOPs) to bytes moved is known as \emph{computational intensity}, or CI.
203 | \item Originally only DRAM traffic was counted, but this causes problems: the figure then depends on how well the caches happen to behave.
204 | \item Bytes moved is best calculated from the kernel's perspective.
205 | \item Take this example:
206 | \begin{itemize}
207 | \item \mintinline{fortran}|a(i) = a(i) + b(i) * c(i)|
208 | \item Assume FP64 arrays.
209 | \item Count the data movement and floating point operations for each \mintinline{fortran}|i|.
210 | \item 24 bytes loaded, 8 bytes stored.
211 | \item Two floating point operations: one \mintinline{fortran}|+| and one \mintinline{fortran}|*|.
212 | \item CI of $2/32 = 1/16$ FLOPs per byte.
213 | \end{itemize}
214 | \end{itemize}
215 | \end{frame}
216 | 
217 | %-------------------------------------------------------------------------------
218 | \begin{frame}
219 | \frametitle{Roofline model}
220 | Useful conceptual tool to establish whether a kernel is compute or memory bandwidth bound.
221 | 
222 | \begin{center}
223 | \begin{adjustbox}{max width={.6\textwidth}}
224 | \begin{tikzpicture}
225 | \draw[->] (0,0) -- (0,3);
226 | \draw[->] (0,0) -- (7,0);
227 | \draw (0,1) -- (2,2.5);
228 | \draw (2,2.5) -- (7,2.5);
229 | \node at (3.5,-0.5) {Computational intensity (FLOPs/byte)};
230 | \node[rotate=90] at (-0.5,1.5) {FLOP/s};
231 | \node at (7.5,2.4) {$F(I)$};
232 | \node at (4,1) {A};
233 | \node at (0.75,1) {B};
234 | \end{tikzpicture}
235 | \end{adjustbox}
236 | \end{center}
237 | 
238 | \begin{itemize}
239 | \item The roof $F(I)$ is found from tech sheet data and/or micro-benchmarks.
240 | \item Measured runtime gives the kernel's FLOP/s; counting FLOPs and bytes moved (previous slide) gives its CI.
241 | \item Kernel A is compute bound; Kernel B is memory bandwidth bound.
242 | \item Both kernels require optimisation!
243 | \item A kernel sitting on the roof is achieving its rate limiting factor.
244 | \end{itemize}
245 | \end{frame}
246 | 
247 | %-------------------------------------------------------------------------------
248 | \begin{frame}
249 | \frametitle{Intel Advisor: Roofline}
250 | \begin{itemize}
251 | \item Can use Intel Advisor to run a Roofline analysis on your code.
252 | \item First, it runs some micro-benchmarks to generate the Roofline model.
253 | \item Then, it runs your code, calculating the CI and performance.
254 | \item Helpful to visualise how close your code is to the hardware limits.
255 | \item More information: \url{https://software.intel.com/en-us/articles/intel-advisor-roofline}.
256 | \item But beware: the CI is calculated from executed instructions, so it is not always the whole picture.
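\item A rough hand check is also possible (illustrative numbers only, not a specific machine): a node with a peak of 1000 GFLOP/s and 100 GB/s of memory bandwidth has its ridge point at $1000/100 = 10$ FLOPs/byte, so any kernel with a lower CI sits under the sloped part of the roof.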
257 | \end{itemize} 258 | \end{frame} 259 | 260 | %------------------------------------------------------------------------------- 261 | \section{Vectorisation} 262 | \begin{frame} 263 | \frametitle{Vectorisation} 264 | $$C=A+B$$ 265 | \begin{columns} 266 | \begin{column}{0.5\textwidth} 267 | Scalar operations \\ 268 | \begin{tikzpicture} 269 | \draw (-0.5,2) rectangle (0.5,3); 270 | \draw (1,2) rectangle (2,3); 271 | \draw[->] (0,2) -- (.74,1.2); 272 | \draw[->] (1.5,2) -- (.76,1.2); 273 | \draw (.75,.75) circle (.4cm); 274 | \draw (.75,.75) node {$+$}; 275 | \draw[->] (.75,0.3) -- (.75,-0.5); 276 | \draw (.25,-1.5) rectangle (1.25,-0.5); 277 | \end{tikzpicture} 278 | \end{column} 279 | 280 | \begin{column}{0.5\textwidth} 281 | Vector operations \\ 282 | \begin{tikzpicture} 283 | \draw[step=1cm] (0,2) grid (4,3); 284 | \draw[step=1cm] (0,0) grid (4,1); 285 | \draw[->] (2,0) -- (2,-0.6); 286 | \draw[->] (0,2.5) -- (-0.5,2.5) -- (-0.5,-1) -- (1.6,-1); 287 | \draw (2,-1) circle (.4cm); 288 | \draw (2,-1) node {$+$}; 289 | \draw[->] (2,-1.4) -- (2,-1.9); 290 | \draw[step=1cm] (0,-3) grid (4,-2); 291 | \end{tikzpicture} 292 | \end{column} 293 | \end{columns} 294 | 295 | \end{frame} 296 | 297 | %------------------------------------------------------------------------------- 298 | \begin{frame} 299 | \frametitle{Why vectorise?} 300 | \begin{itemize} 301 | \item Vectorisation gives you more compute per cycle. 302 | \item Hence may increase the FLOP/s rate of the processor. 303 | \item Also results in fewer instructions to process (less pressure on instruction decode units). 304 | \item Vectors help make good use of the memory hierarchy (often the main benefit). 305 | \item Vectorisation helps you write code which has good access patterns to maximise bandwidth. 306 | \end{itemize} 307 | \end{frame} 308 | 309 | %------------------------------------------------------------------------------- 310 | \begin{frame} 311 | \frametitle{Auto-vectorisation} 312 | \begin{itemize} 313 | \item Modern compilers are very good at automatically vectorising your loops. 314 | \item Fortran helps as arrays can not alias (overlap), unlike C. 315 | \item But compiler needs to be sure it's safe to vectorise. 316 | \item Read compiler reports to see if it's already vectorising. 317 | \begin{itemize} 318 | \item Intel: \mintinline{bash}|-qopt-report=5| 319 | \item Cray: \mintinline{bash}|-hlist=a| 320 | \item GNU (old): \mintinline{bash}|-ftree-vectorizer-verbose=2| 321 | \item GNU (new): \mintinline{bash}|-fopt-info-vec| 322 | \item Clang: \mintinline{bash}|-Rpass=loop-vectorize| \mintinline{bash}|-Rpass-missed=loop-vectorize| \mintinline{bash}|-Rpass-analysis=loop-vectorize| 323 | \end{itemize} 324 | \item Often the memory access pattern prevents (efficient) auto-vectorisation. 325 | \end{itemize} 326 | \end{frame} 327 | 328 | %------------------------------------------------------------------------------- 329 | \subsection{OpenMP SIMD} 330 | \begin{frame}[fragile] 331 | \frametitle{OpenMP SIMD} 332 | \begin{itemize} 333 | \item Sometimes the compiler needs help in confirming loops are vectorisable. 334 | \item OpenMP \mintinline{fortran}|simd| constructs give this information. 335 | \item Can combine with \mintinline{fortran}|parallel do| construct to ensure a parallel vector loop: \mintinline{fortran}|omp parallel do simd| 336 | \item Generally want to vectorise inner loops and parallelise outer loops. 
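\item For a nested loop this typically means \mintinline{fortran}|!$omp parallel do| on the outer \mintinline{fortran}|j| loop and \mintinline{fortran}|!$omp simd| on the inner \mintinline{fortran}|i| loop (the example below shows the plain \mintinline{fortran}|simd| form on a single loop).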
337 | \end{itemize}
338 | 
339 | \begin{minted}[frame=single]{fortran}
340 | !$omp simd
341 | do i = 1, N
342 |   C(i) = A(i)+B(i)
343 | end do
344 | !$omp end simd
345 | \end{minted}
346 | \end{frame}
347 | 
348 | %-------------------------------------------------------------------------------
349 | \begin{frame}[fragile]
350 | \frametitle{SIMD functions}
351 | Say you've written a function to update values in the loop:
352 | \begin{minted}[frame=single]{fortran}
353 | do i = 1, N
354 |   A(i) = magic_maths(A(i))
355 | end do
356 | \end{minted}
357 | 
358 | \begin{itemize}
359 | \item The situation gets complicated.
360 | \item If the function is small, it will likely be inlined and the loop will auto-vectorise.
361 | \item Otherwise you need to use the \mintinline{fortran}|simd| construct, and the compiler must create a vector version of the function.
362 | \end{itemize}
363 | 
364 | \begin{minted}[frame=single]{fortran}
365 | function magic_maths(value) result(r)
366 |   !$omp declare simd(magic_maths)
367 |   implicit none
368 |   real(kind=8) :: value, r
369 |   r = value * value
370 | end function
371 | \end{minted}
372 | 
373 | \end{frame}
374 | 
375 | %-------------------------------------------------------------------------------
376 | \begin{frame}[fragile]
377 | \frametitle{SIMD clauses}
378 | \begin{itemize}
379 | \item All the usual data-sharing and reduction clauses can be applied.
380 | \item \mintinline{fortran}|safelen(4)|: the maximum distance between iterations over which it is safe to vectorise.
381 | \begin{minted}[frame=single]{fortran}
382 | !$omp simd safelen(4)
383 | do i = 1, N-4
384 |   A(i) = A(i) + A(i+4)
385 | end do
386 | !$omp end simd
387 | \end{minted}
388 | \item \mintinline{fortran}|simdlen(4)|: the preferred number of iterations to perform concurrently as a vector.
389 | Specifying explicit vector lengths builds obsolescence into the code, as hardware vector lengths continually change --- we don't recommend using this clause.
390 | \end{itemize}
391 | \end{frame}
392 | 
393 | %-------------------------------------------------------------------------------
394 | \begin{frame}[fragile]
395 | \frametitle{SIMD clauses}
396 | \begin{itemize}
397 | \item \mintinline{fortran}|linear(var)|: the variable is private and increases linearly with the loop iterator.
398 | \begin{minted}[frame=single]{fortran}
399 | !$omp simd linear(j)
400 | do i = 1, N
401 |   j = j + 1
402 |   A(j) = B(i)
403 | end do
404 | !$omp end simd
405 | \end{minted}
406 | \item \mintinline{fortran}|aligned(var)|: says the array is aligned (more on this shortly).
407 | \item \mintinline{fortran}|uniform(var)|: for the \mintinline{fortran}|declare simd| construct, the variable is the same in all vector lanes.
408 | \end{itemize}
409 | \end{frame}
410 | 
411 | %-------------------------------------------------------------------------------
412 | \begin{frame}
413 | \frametitle{SIMD summary}
414 | 
415 | \begin{itemize}
416 | \item Sometimes you need to force the compiler to vectorise (the correct) loop with the \mintinline{fortran}|simd| construct.
417 | \item As with \mintinline{fortran}|parallel|, you are telling the compiler it is safe to vectorise and to ignore its data dependence analysis.
418 | \item Check the compiler report before and after to check it did the right thing!
419 | \item Use \mintinline{fortran}|declare simd| and appropriate clauses if you need to create vectorised versions of functions.
420 | \begin{itemize}
421 | \item The clauses can give more information to the compiler so it does a better job.
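\item For example, \mintinline{fortran}|uniform| for arguments that are the same in every vector lane and \mintinline{fortran}|linear| for arguments that step with the loop iterator (see the clauses slides).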
422 | \end{itemize} 423 | \end{itemize} 424 | 425 | \end{frame} 426 | 427 | %------------------------------------------------------------------------------- 428 | \section{Derived types} 429 | \begin{frame}[fragile] 430 | \frametitle{Derived types} 431 | 2D grid of cells, each cell containing 4 different values. 432 | \begin{minted}[frame=single,linenos,fontsize=\small]{fortran} 433 | type cell 434 | real(kind=8) :: property1 435 | real(kind=8) :: property2 436 | real(kind=8) :: property3 437 | real(kind=8) :: property4 438 | end type 439 | 440 | type(cell), allocatable :: grid(:,:) 441 | 442 | do j = 1, ny 443 | do i = 1, nx 444 | grid(i,j)%property1 = update_1() 445 | grid(i,j)%property2 = update_2() 446 | grid(i,j)%property3 = update_3() 447 | grid(i,j)%property4 = update_4() 448 | end do 449 | end do 450 | \end{minted} 451 | \end{frame} 452 | 453 | %------------------------------------------------------------------------------- 454 | \begin{frame} 455 | \frametitle{Derived types} 456 | \begin{itemize} 457 | \item What do Fortran derived types look like in memory? 458 | \item Organised as an array of structures. 459 | \item<2-> What happens when we vectorise our loop over cells? 460 | \end{itemize} 461 | 462 | \begin{adjustbox}{max width={\textwidth}} 463 | \begin{tikzpicture} 464 | \draw[step=1cm] (0,0) grid (13,1); 465 | \foreach \i in {0,4,8,12} { 466 | \draw (\i+.5,.5) node {P1}; 467 | } 468 | \foreach \i in {0,4,8} { 469 | \draw (\i+1.5,.5) node {P2}; 470 | \draw (\i+2.5,.5) node {P3}; 471 | \draw (\i+3.5,.5) node {P4}; 472 | } 473 | 474 | \foreach \i in {0,4,8,12} { 475 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 476 | } 477 | \end{tikzpicture} 478 | \end{adjustbox} 479 | 480 | \begin{itemize} 481 | \item<4-> The \mintinline{fortran}|property1| values are gathered into a vector register. 482 | \item<5-> After the computation, the results are scattered back into memory. 483 | \item<6-> A cache line is 64 bytes, so only the first two values are on the first cache line. 484 | \item<6-> Must read two cache lines to fill the vector up. 485 | \end{itemize} 486 | \end{frame} 487 | 488 | %------------------------------------------------------------------------------- 489 | \begin{frame}[fragile] 490 | \frametitle{Structure of arrays} 491 | Switch type around to have an array per property. 492 | \begin{minted}[frame=single,linenos]{fortran} 493 | type grid 494 | real(kind=8), allocatable :: property1(:,:) 495 | real(kind=8), allocatable :: property2(:,:) 496 | real(kind=8), allocatable :: property3(:,:) 497 | real(kind=8), allocatable :: property4(:,:) 498 | end type 499 | 500 | do j = 1, ny 501 | do i = 1, nx 502 | grid%property1(i,j) = update_1() 503 | grid%property2(i,j) = update_2() 504 | grid%property3(i,j) = update_3() 505 | grid%property4(i,j) = update_4() 506 | end do 507 | end do 508 | \end{minted} 509 | \end{frame} 510 | 511 | %------------------------------------------------------------------------------- 512 | \begin{frame} 513 | \frametitle{Structure of arrays} 514 | \begin{itemize} 515 | \item Order of data in memory has changed. 516 | \item<2-> What happens when we vectorise? 
517 | \end{itemize} 518 | 519 | \begin{adjustbox}{max width={\textwidth}} 520 | \begin{tikzpicture} 521 | \draw[step=1cm] (0,0) grid (13,1); 522 | \foreach \i in {0,...,4} { 523 | \draw (\i+.5,.5) node {P1}; 524 | } 525 | \draw (5.5,.5) node {\dots}; 526 | 527 | \foreach \i in {5,...,9} { 528 | \draw (\i+1.5,.5) node {P2}; 529 | } 530 | \draw (11.5,.5) node {\dots}; 531 | 532 | \foreach \i in {10} { 533 | \draw (\i+2.5,.5) node {P3}; 534 | } 535 | 536 | \foreach \i in {0,...,3} { 537 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 538 | } 539 | \end{tikzpicture} 540 | \end{adjustbox} 541 | 542 | \onslide<4->{ 543 | \begin{itemize} 544 | \item Coalesced memory accesses are key for high performance code. 545 | \item Adjacent vector lanes read adjacent memory locations. 546 | \item A cache line is 64 bytes, so can fill the vector from a single cache line. 547 | \item More efficient vectorisation. 548 | \end{itemize} 549 | } 550 | \end{frame} 551 | 552 | %------------------------------------------------------------------------------- 553 | \section{Memory access patterns} 554 | \begin{frame}[fragile] 555 | \frametitle{Memory access patterns} 556 | \begin{minted}{fortran} 557 | do i = 1, N 558 | val = A(i) 559 | end do 560 | \end{minted} 561 | \begin{adjustbox}{max width={\textwidth}} 562 | \begin{tikzpicture} 563 | \draw[step=1cm] (-3,0) grid (11,1); 564 | \draw[dashed] (0,-.5) -- (0,1.5); 565 | \draw[dashed] (8,-.5) -- (8,1.5); 566 | \draw (0,-1) node {64 byte boundary}; 567 | \foreach \i in {0,...,7} { 568 | \draw[->] (\i+.5,2) -- (\i+.5,1.2); 569 | } 570 | \end{tikzpicture} 571 | \end{adjustbox} 572 | \begin{itemize} 573 | \item Ideal memory access pattern. 574 | \item All access is coalesced. 575 | \item Vectors are aligned to cache line boundary. 576 | \end{itemize} 577 | \end{frame} 578 | 579 | %------------------------------------------------------------------------------- 580 | \begin{frame}[fragile] 581 | \frametitle{Memory access patterns} 582 | \begin{minted}{fortran} 583 | do i = 1, N 584 | val = A(i+3) 585 | end do 586 | \end{minted} 587 | \begin{adjustbox}{max width={\textwidth}} 588 | \begin{tikzpicture} 589 | \draw[step=1cm] (-3,0) grid (11,1); 590 | \draw[dashed] (0,-.5) -- (0,1.5); 591 | \draw[dashed] (8,-.5) -- (8,1.5); 592 | \draw (0,-1) node {64 byte boundary}; 593 | \foreach \i in {0,...,7} { 594 | \draw[->] (\i+.5,2) -- (3+\i+.5,1.2); 595 | } 596 | \end{tikzpicture} 597 | \end{adjustbox} 598 | \begin{itemize} 599 | \item OK memory access pattern. 600 | \item All access is coalesced, but split across cache lines. 601 | \item Still get good use of cache lines, but not as efficient as aligned version. 602 | \end{itemize} 603 | \end{frame} 604 | 605 | %------------------------------------------------------------------------------- 606 | \begin{frame}[fragile] 607 | \frametitle{Memory access patterns} 608 | \begin{minted}{fortran} 609 | do i = 1, N 610 | val = A(j,i) ! equiv. A(j+3*i) 611 | end do 612 | \end{minted} 613 | \begin{adjustbox}{max width={\textwidth}} 614 | \begin{tikzpicture} 615 | \draw[step=1cm] (-3,0) grid (11,1); 616 | \draw[dashed] (0,-.5) -- (0,1.5); 617 | \draw[dashed] (8,-.5) -- (8,1.5); 618 | \draw (0,-1) node {64 byte boundary}; 619 | \foreach \i in {0,...,3} { 620 | \draw[->] (\i+.5,2) -- (3*\i+.5,1.2); 621 | } 622 | \end{tikzpicture} 623 | \end{adjustbox} 624 | \begin{itemize} 625 | \item Strided access results in multiple memory transactions. 626 | \item Kills throughput due to poor reuse of cached data. 
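\item Remember Fortran arrays are column major: the \emph{first} index is contiguous in memory, so it should normally be the innermost loop index.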
627 | \item Very easy to fall into this trap with multi-dimensional arrays. 628 | \item Check your strides! 629 | \end{itemize} 630 | \end{frame} 631 | 632 | %------------------------------------------------------------------------------- 633 | \begin{frame}[fragile] 634 | \frametitle{Memory access patterns} 635 | \begin{minted}{fortran} 636 | do i = 1, N 637 | val = A(B(i)) 638 | end do 639 | \end{minted} 640 | \begin{adjustbox}{max width={\textwidth}} 641 | \begin{tikzpicture} 642 | \draw[step=1cm] (-3,0) grid (11,1); 643 | \draw[dashed] (0,-.5) -- (0,1.5); 644 | \draw[dashed] (8,-.5) -- (8,1.5); 645 | \draw (0,-1) node {64 byte boundary}; 646 | \draw[->] (0.5,2) -- (-3.5,1.2); 647 | \draw[->] (1.5,2) -- (3.5,1.2); 648 | \draw[->] (2.5,2) -- (0.5,1.2); 649 | \draw[->] (3.5,2) -- (8.5,1.2); 650 | \draw[->] (4.5,2) -- (-1.5,1.2); 651 | \draw[->] (5.5,2) -- (7.5,1.2); 652 | \draw[->] (6.5,2) -- (1.5,1.2); 653 | \draw[->] (7.5,2) -- (-2.5,1.2); 654 | \end{tikzpicture} 655 | \end{adjustbox} 656 | \begin{itemize} 657 | \item Essentially random access to memory. 658 | \item Little reuse of cache lines. 659 | \item Unpredictable pattern, so hardware prefetchers won't work efficiently. 660 | \item Very challenging! 661 | \end{itemize} 662 | \end{frame} 663 | 664 | %------------------------------------------------------------------------------- 665 | \section{Alignment} 666 | \begin{frame} 667 | \frametitle{Alignment} 668 | \begin{itemize} 669 | \item If we can align arrays, we get better vectorisation; specifically load/stores are faster. 670 | \begin{itemize} 671 | \item Guarantee only one cache line needs updating and not split between two cache lines. 672 | \end{itemize} 673 | \item Taking advantage of alignment is a two stage process: 674 | \begin{enumerate} 675 | \item Align the memory on allocation. 676 | \item Tell the compiler the access is aligned. 677 | \end{enumerate} 678 | \item Aligned allocations in Fortran are (currently) unfortunately vendor specific. 679 | \item OpenMP can help with telling the compiler the data is aligned. 680 | \item Aligned allocations due in OpenMP 5.0. 681 | \end{itemize} 682 | \end{frame} 683 | 684 | %------------------------------------------------------------------------------- 685 | \begin{frame}[fragile] 686 | \frametitle{Step 1: Aligning allocations} 687 | Generally focus on the Intel compiler. 688 | Only need to use one of these methods, whichever is most convenient. 
689 | \begin{itemize} 690 | \item Align all allocations of arrays (not in derived types) with compiler flag: \mintinline{bash}|-align array64byte| 691 | \item Use an Intel compiler directive on array definition: 692 | \begin{minted}[frame=single,fontsize=\small]{fortran} 693 | real(kind=8), allocatable :: A(:,:) 694 | !dir$ attributes align:64 :: A 695 | \end{minted} 696 | \item Allocate memory in C, and convert to Fortran \mintinline{fortran}|pointer|: 697 | \begin{minted}[frame=single,breaklines,fontsize=\small]{c} 698 | double * alloc(int *len) { 699 | return (double *)aligned_alloc(64, sizeof(double)*(*len)); 700 | } 701 | \end{minted} 702 | \begin{minted}[frame=single,fontsize=\small]{fortran} 703 | real(kind=8), pointer :: A(:,:) 704 | type(c_ptr) :: A_ptr 705 | A_ptr = alloc(nx*ny) 706 | call c_f_pointer(A_ptr, A, (/ nx, ny/)) 707 | \end{minted} 708 | \end{itemize} 709 | \end{frame} 710 | 711 | %------------------------------------------------------------------------------- 712 | \begin{frame}[fragile] 713 | \frametitle{Step 2: Telling the compiler} 714 | \begin{itemize} 715 | \item Use OpenMP \mintinline{fortran}|simd aligned| clause: 716 | \begin{minted}[frame=single,fontsize=\small]{fortran} 717 | !$omp simd aligned(A:64) 718 | do i = 1, nx 719 | A(i,j) = A(i,j) + 1.0 720 | end do 721 | !$omp end simd 722 | \end{minted} 723 | \pause 724 | \item Unfortunately often not sufficient. 725 | \item Often need to use Intel specific directives to say loop extent is divisible by vector length. 726 | \begin{minted}[frame=single,fontsize=\small]{fortran} 727 | ! 64 byte aligned / 8 byte data type means mod 8 728 | !dir$ assume(mod(nx,8) .eq. 0) 729 | !$omp simd aligned(A:64) 730 | do i = 1, nx 731 | A(i,j) = A(i,j) + 1.0 732 | end do 733 | !$omp end simd 734 | \end{minted} 735 | \item Check the compiler report for aligned and unaligned access. 736 | \end{itemize} 737 | \end{frame} 738 | 739 | %------------------------------------------------------------------------------- 740 | \begin{frame}[fragile] 741 | \frametitle{Aligning 2D arrays} 742 | \begin{itemize} 743 | \item Aligning the memory only aligns the first entry. 744 | \item Multiples of the alignment factor will also be aligned. 745 | \item With 2D arrays you need to double check that access can be aligned. 746 | \item Example: 10-by-10 grid of FP64 numbers, aligned to 64 byte cache line: 747 | \end{itemize} 748 | 749 | \begin{adjustbox}{max width={\textwidth}} 750 | \begin{tikzpicture} 751 | \draw[step=1cm] (0,0) grid (17,1); 752 | \foreach \i in {0,8,16} { 753 | \draw[dashed] (\i,-.5) -- (\i,1.5); 754 | } 755 | \foreach \i in {1,...,10} { 756 | \draw (\i-0.5, 0.5) node {(\i,1)}; 757 | } 758 | \foreach \i in {1,...,7} { 759 | \draw (10+\i-0.5, 0.5) node {(\i,2)}; 760 | } 761 | \end{tikzpicture} 762 | \end{adjustbox} 763 | 764 | \begin{minted}[frame=single]{fortran} 765 | do j = 1, 10 766 | !$omp simd aligned(A:64) 767 | do i = 1, 10 768 | A(i,j) = A(i,j) + ... 
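  ! Note: with 10 FP64 elements per column, A(1,j) is only 64-byte
  ! aligned when (j-1)*80 bytes is a multiple of 64, i.e. j = 1, 5, 9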
769 | end do 770 | !$omp end simd 771 | end do 772 | \end{minted} 773 | 774 | \end{frame} 775 | %------------------------------------------------------------------------------- 776 | 777 | \begin{frame} 778 | \frametitle{Aligning 2D arrays} 779 | 780 | \begin{adjustbox}{max width={\textwidth}} 781 | \begin{tikzpicture} 782 | \draw[step=1cm] (0,0) grid (17,1); 783 | \foreach \i in {0,8,16} { 784 | \draw[dashed] (\i,-.5) -- (\i,1.5); 785 | } 786 | \foreach \i in {1,...,10} { 787 | \draw (\i-0.5, 0.5) node {(\i,1)}; 788 | } 789 | \foreach \i in {1,...,7} { 790 | \draw (10+\i-0.5, 0.5) node {(\i,2)}; 791 | } 792 | 793 | \draw<2>[red, very thick] (0.1,0.1) rectangle (3.9, 0.9); 794 | \draw<3>[red, very thick] (10.1,0.1) rectangle (13.9, 0.9); 795 | \end{tikzpicture} 796 | \end{adjustbox} 797 | 798 | \begin{itemize} 799 | \item The array is aligned to a 64-byte cache line. 800 | \item<2-> Accessing the vector \mintinline{fortran}|A(1:4,1)| is aligned. 801 | \item<3-> Accessing the vector \mintinline{fortran}|A(1:4,2)| is \emph{not} aligned. 802 | \vfill 803 | \item<4-> Need the inner stride to be a multiple of the alignment, and need to tell the compiler this is true (previous slide). 804 | \item<4-> Solution: pad the array, but beware of memory footprint. 805 | \item<4-> Example of why the \mintinline{fortran}|aligned| clause doesn't always ensure aligned load/stores. 806 | \end{itemize} 807 | 808 | \end{frame} 809 | %------------------------------------------------------------------------------- 810 | 811 | \section{Branches} 812 | \begin{frame} 813 | \frametitle{Branches} 814 | \begin{itemize} 815 | \item CPUs support speculative execution, GPUs tend not to. 816 | \item Branch instructions have high latency. 817 | \item GPUs hide this latency by fast context switching, CPUs by good branch predictors. 818 | \item In both cases, divergent execution within the vector unit reduces performance. 819 | \item Can use predication, selection and masking to convert conditional control flow into straight line code. 820 | \end{itemize} 821 | \end{frame} 822 | 823 | %------------------------------------------------------------------------------- 824 | \begin{frame}[fragile] 825 | \frametitle{Removing branches} 826 | \begin{columns} 827 | 828 | \begin{column}{0.5\textwidth} 829 | Conditional execution 830 | \begin{itemize} 831 | \item Only evaluate expression if condition is met 832 | \end{itemize} 833 | \begin{minted}[frame=single]{fortran} 834 | if (a .gt. b) then 835 | acc = acc + (a - b*c) 836 | end if 837 | \end{minted} 838 | 839 | \begin{minted}[frame=single]{C} 840 | if (a > b) 841 | acc += a - b*c; 842 | \end{minted} 843 | \end{column} 844 | 845 | \begin{column}{0.5\textwidth} 846 | Selection and masking 847 | \begin{itemize} 848 | \item Always evaluate expression and mask result 849 | \end{itemize} 850 | \begin{minted}[frame=single,breaklines]{fortran} 851 | temp = a - b*c 852 | mask = merge(1.0, 0.0, a .gt. b) 853 | acc = acc + (mask * temp) 854 | \end{minted} 855 | 856 | \begin{minted}[frame=single]{C} 857 | temp = a - b*c; 858 | mask = a > b ? 1.0 : 0.0; 859 | acc += mask * temp; 860 | \end{minted} 861 | \end{column} 862 | 863 | \end{columns} 864 | In practice, you may or may not see an improvement: the compiler may be doing something smart already. 
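Check the vectorisation report (e.g. \mintinline{bash}|-qopt-report=5| with Intel or \mintinline{bash}|-Rpass=loop-vectorize| with Clang, as listed earlier) to see whether the branch has already been converted into masked vector code.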
865 | \end{frame}
866 | 
867 | %-------------------------------------------------------------------------------
868 | \section{Exercise}
869 | \begin{frame}
870 | \frametitle{Exercise}
871 | \begin{itemize}
872 | \item Take your parallel 5-point stencil code and optimise it.
873 | \item Think about:
874 | \begin{itemize}
875 | \item Memory access patterns
876 | \item Vectorisation
877 | \end{itemize}
878 | \item Note down the performance differences your optimisations make.
879 | \item Calculate the achieved memory bandwidth of your stencil code (bytes read and written per iteration $\times$ number of iterations $\div$ runtime).
880 | \item Extension: consider these optimisations for the Jacobi solver.
881 | \end{itemize}
882 | \end{frame}
883 | 
884 | %-------------------------------------------------------------------------------
885 | \section{Summary}
886 | \begin{frame}
887 | \frametitle{Summary}
888 | 
889 | \begin{itemize}
890 | \item Performance of the cache hierarchy.
891 | \item Performance analysis with the Roofline model.
892 | \item Vectorisation:
893 | \begin{itemize}
894 | \item Compiler auto-vectorisation.
895 | \item OpenMP \mintinline{fortran}|simd| construct.
896 | \item Memory access patterns.
897 | \item Data alignment.
898 | \end{itemize}
899 | 
900 | \vfill
901 | 
902 | \item Next sessions:
903 | \begin{enumerate}
904 | \setcounter{enumi}{3}
905 | \item NUMA and MPI interoperability.
906 | \item GPU programming with OpenMP.
907 | \item Tasks and Tools.
908 | \end{enumerate}
909 | \end{itemize}
910 | 
911 | 
912 | \end{frame}
913 | 
914 | %-------------------------------------------------------------------------------
915 | 
916 | \end{document}
917 | 
--------------------------------------------------------------------------------