├── .gitignore ├── slides ├── isambard.jpeg ├── pdfs │ ├── 02-pi.pdf │ ├── 05-gpu.pdf │ ├── 00-prelim.pdf │ ├── 07-wrapup.pdf │ ├── 03-simd-numa.pdf │ ├── 99-isambard.pdf │ ├── old │ │ ├── 01-intro.pdf │ │ ├── 03-opt.pdf │ │ ├── 06-tasks.pdf │ │ └── 04-hybrid.pdf │ └── 01-paralleldo.pdf ├── cache_bandwidth.pdf ├── logo-full-colour.png ├── .gitignore ├── Makefile ├── README.md ├── preamble.tex ├── 99-isambard.tex ├── 07-wrapup.tex ├── 00-prelim.tex ├── 04-hybrid.tex ├── 03-simd-numa.tex ├── 02-pi.tex ├── 01-paralleldo.tex └── 03-opt.tex ├── code ├── submit_stencil ├── wtime.c ├── .gitignore ├── timer.f90 ├── Makefile ├── vadd.f90 ├── vadd_paralleldo.f90 ├── tasks.f90 ├── fibonacci.f90 ├── pi.f90 ├── pi_reduction.f90 ├── pi_atomic.f90 ├── pi_critical.f90 ├── vadd_spmd.f90 ├── pi_private.f90 ├── stencil.f90 ├── private.f90 ├── stencil_paralleldo.f90 ├── pi_array.f90 ├── stencil_reduction.f90 ├── stencil_target.f90 ├── stencil_optimised.f90 ├── stencil_numa.f90 ├── README.md └── jacobi.f90 ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | # Misc. 2 | .*.swp 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /slides/isambard.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/isambard.jpeg -------------------------------------------------------------------------------- /slides/pdfs/02-pi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/02-pi.pdf -------------------------------------------------------------------------------- /slides/pdfs/05-gpu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/05-gpu.pdf -------------------------------------------------------------------------------- /slides/pdfs/00-prelim.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/00-prelim.pdf -------------------------------------------------------------------------------- /slides/pdfs/07-wrapup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/07-wrapup.pdf -------------------------------------------------------------------------------- /slides/cache_bandwidth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/cache_bandwidth.pdf -------------------------------------------------------------------------------- /slides/logo-full-colour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/logo-full-colour.png -------------------------------------------------------------------------------- /slides/pdfs/03-simd-numa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/03-simd-numa.pdf -------------------------------------------------------------------------------- /slides/pdfs/99-isambard.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/99-isambard.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/01-intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/01-intro.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/03-opt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/03-opt.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/06-tasks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/06-tasks.pdf -------------------------------------------------------------------------------- /slides/pdfs/01-paralleldo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/01-paralleldo.pdf -------------------------------------------------------------------------------- /slides/pdfs/old/04-hybrid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/openmp-for-cs/HEAD/slides/pdfs/old/04-hybrid.pdf -------------------------------------------------------------------------------- /slides/.gitignore: -------------------------------------------------------------------------------- 1 | # PDFs 2 | *.pdf 3 | 4 | # Latex temp files 5 | *.aux 6 | *.fdb_latexmk 7 | *.fls 8 | *.log 9 | *.nav 10 | *.out 11 | *.snm 12 | *.toc 13 | *.vrb 14 | _minted-*/ 15 | -------------------------------------------------------------------------------- /code/submit_stencil: -------------------------------------------------------------------------------- 1 | #PBS -q R35330 2 | #PBS -V 3 | #PBS -joe 4 | #PBS -lselect=1:ncpus=28,place=excl 5 | #PBS -lwalltime=00:02:00 6 | #PBS -N stencil 7 | 8 | cd $PBS_O_WORKDIR 9 | 10 | export OMP_NUM_THREADS=28 11 | ./stencil 12 | 13 | -------------------------------------------------------------------------------- /code/wtime.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* Get the current time in seconds since the Epoch */ 5 | void wtime(double *time) 6 | { 7 | struct timeval tv; 8 | gettimeofday(&tv, NULL); 9 | *time = tv.tv_sec + tv.tv_usec*1e-6; 10 | } 11 | -------------------------------------------------------------------------------- /slides/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 00-prelim.pdf 01-paralleldo.pdf 02-pi.pdf 03-simd-numa.pdf 05-gpu.pdf 07-wrapup.pdf 99-isambard.pdf 3 | 4 | %.pdf:%.tex preamble.tex 5 | latexmk -pdf -shell-escape $< 6 | 7 | .PHONY: clean 8 | clean: 9 | latexmk -C 10 | rm -f *.nav *.snm *.vrb 11 | rm -rf _minted*/ 12 | 13 | -------------------------------------------------------------------------------- /code/.gitignore: -------------------------------------------------------------------------------- 1 | # Build output 2 | *.o 3 | *.mod 4 | 5 | # Binary names 6 | pi 7 | pi_array 8 | pi_atomic 9 | pi_critical 10 | pi_private 11 | pi_reduction 12 | private 
13 | jacobi 14 | vadd 15 | vadd_paralleldo 16 | vadd_spmd 17 | stencil 18 | stencil_paralleldo 19 | stencil_reduction 20 | stencil_optimised 21 | stencil_numa 22 | stencil_target 23 | fibonacci 24 | tasks 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenMP for Computational Scientists 2 | 3 | This is a 6-part course introducing the OpenMP programming model. 4 | It is designed for Fortran programmers. 5 | 6 | The example code (and solutions) are found in the `code/` directory. 7 | LaTeX sources for the lecture slides are found in the `slides/` directory. 8 | Generated PDFs of the lecture slides are found in the `slides/pdfs` directory. 9 | 10 | -------------------------------------------------------------------------------- /code/timer.f90: -------------------------------------------------------------------------------- 1 | ! Timing module, used to call the C timer 2 | module timer 3 | 4 | use ISO_C_BINDING 5 | 6 | implicit none 7 | 8 | interface 9 | 10 | subroutine wtime_c(time) bind(C, name='wtime') 11 | use ISO_C_BINDING 12 | real(C_DOUBLE) :: time 13 | end subroutine 14 | end interface 15 | 16 | contains 17 | 18 | subroutine wtime(time) 19 | 20 | real(kind=8) :: time 21 | 22 | call wtime_c(time) 23 | 24 | end subroutine wtime 25 | 26 | end module timer 27 | -------------------------------------------------------------------------------- /code/Makefile: -------------------------------------------------------------------------------- 1 | 2 | FTN=ftn 3 | CC=cc 4 | FFLAGS=-O3 5 | LIBS=-homp -rm 6 | 7 | TIMEOBJ=timer.o wtime.o 8 | 9 | default: all 10 | 11 | BINS=jacobi pi pi_critical pi_atomic pi_array pi_private pi_reduction private vadd vadd_paralleldo vadd_spmd stencil stencil_paralleldo stencil_reduction stencil_optimised stencil_numa stencil_target fibonacci tasks 12 | 13 | all: $(BINS) 14 | 15 | %:%.f90 $(TIMEOBJ) 16 | $(FTN) $(FFLAGS) $^ $(LIBS) -o $@ 17 | 18 | %.o:%.f90 19 | $(FTN) -O3 $< -c 20 | 21 | %.o: %.c 22 | $(CC) -O3 $< -c 23 | 24 | .PHONY: clean 25 | clean: 26 | rm -f *.o *.mod $(BINS) 27 | -------------------------------------------------------------------------------- /slides/README.md: -------------------------------------------------------------------------------- 1 | # Slides 2 | 3 | Source code for the teaching material (slides) that teach the OpenMP for Computational Scientists course. 4 | The course material is presented using Fortran. 5 | 6 | ## Course structure 7 | 8 | 1. OpenMP overview: shared memory and parallel do. 9 | 2. Data sharing clauses and reductions. 10 | 3. Vectorisation and code optimisaion. 11 | 4. NUMA and Hybrid MPI+OpenMP. 12 | 5. OpenMP for GPUs. 13 | 6. Tasks and Tools. 14 | 15 | ## Compilation 16 | The slides are written in Latex. 17 | You should be able to build all the slides simply by typing ```make```. 18 | 19 | ### Dependancies 20 | The LaTeX uses the following packages: 21 | - beamer 22 | - amsmath 23 | - pgfplots 24 | - minted 25 | - fontenc 26 | - multicol 27 | - booktabs 28 | - adjustbox 29 | 30 | -------------------------------------------------------------------------------- /code/vadd.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: N=50000000 10 | real(kind=8), allocatable :: A(:), B(:), C(:) 11 | integer :: i 12 | real(kind=8) :: start, end 13 | 14 | ! 
Allocate memory 15 | allocate(A(N)) 16 | allocate(B(N)) 17 | allocate(C(N)) 18 | 19 | ! Initilise data 20 | do i = 1, N 21 | A(i) = 1.0_8 22 | B(i) = 2.0_8 23 | C(i) = 0.0_8 24 | end do 25 | 26 | ! Start timer 27 | call wtime(start) 28 | 29 | ! Vector addition 30 | do i = 1, N 31 | C(i) = A(i) + B(i) 32 | end do 33 | 34 | ! Stop timer 35 | call wtime(end) 36 | 37 | ! Print result 38 | write(*,"(A)") "------------------------------------" 39 | write(*,"(A,F10.3)") "runtime: ", end-start 40 | if (any(C .ne. 3.0_8)) then 41 | write(*,"(A)") "WARNING: results incorrect" 42 | end if 43 | write(*,"(A)") "------------------------------------" 44 | 45 | ! Free memory 46 | deallocate(A,B) 47 | 48 | end program vadd 49 | -------------------------------------------------------------------------------- /slides/preamble.tex: -------------------------------------------------------------------------------- 1 | 2 | % Beamer settings 3 | \usecolortheme{rose} 4 | \beamertemplatenavigationsymbolsempty 5 | \setbeamertemplate{footline}[frame number] 6 | 7 | \titlegraphic{% 8 | \includegraphics[height=1cm]{logo-full-colour.png}} 9 | 10 | \addtobeamertemplate{frametitle}{}{% 11 | \begin{tikzpicture}[remember picture,overlay] 12 | \node[anchor=north east,yshift=2pt] at (current page.north east) {\includegraphics[height=1cm]{logo-full-colour.png}}; 13 | \end{tikzpicture}} 14 | 15 | % Packages 16 | \usepackage{amsmath} 17 | 18 | \usepackage{tikz} 19 | \usetikzlibrary{positioning} 20 | \usetikzlibrary{fit} 21 | 22 | \usepackage{pgfplots} 23 | \pgfplotsset{compat=1.16} 24 | \usepgfplotslibrary{fillbetween} 25 | 26 | 27 | \usepackage{minted} 28 | \usepackage[T1]{fontenc} % Required by minted to ensure dollar signs are produced instead of pound (sterling) signs 29 | 30 | \usepackage{multicol} 31 | 32 | \usepackage{booktabs} 33 | 34 | \usepackage{adjustbox} 35 | 36 | % Author 37 | \author{Dr Tom Deakin\\University of Bristol} 38 | 39 | \date{Tuesday 1 December, 2020} 40 | 41 | -------------------------------------------------------------------------------- /code/vadd_paralleldo.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: N=50000000 10 | real(kind=8), allocatable :: A(:), B(:), C(:) 11 | integer :: i 12 | real(kind=8) :: start, end 13 | 14 | ! Allocate memory 15 | allocate(A(N)) 16 | allocate(B(N)) 17 | allocate(C(N)) 18 | 19 | ! Initilise data 20 | do i = 1, N 21 | A(i) = 1.0_8 22 | B(i) = 2.0_8 23 | C(i) = 0.0_8 24 | end do 25 | 26 | ! Start timer 27 | call wtime(start) 28 | 29 | ! Vector addition 30 | !$omp parallel do 31 | do i = 1, N 32 | C(i) = A(i) + B(i) 33 | end do 34 | !$omp end parallel do 35 | 36 | ! Stop timer 37 | call wtime(end) 38 | 39 | ! Print result 40 | write(*,"(A)") "------------------------------------" 41 | write(*,"(A,F10.3)") "runtime: ", end-start 42 | if (any(C .ne. 3.0_8)) then 43 | write(*,"(A)") "WARNING: results incorrect" 44 | end if 45 | write(*,"(A)") "------------------------------------" 46 | 47 | ! 
Free memory 48 | deallocate(A,B) 49 | 50 | end program vadd 51 | -------------------------------------------------------------------------------- /code/tasks.f90: -------------------------------------------------------------------------------- 1 | 2 | subroutine do_c 3 | print *, "Task C starting" 4 | call sleep(1) 5 | print *, "Task C finished" 6 | end subroutine 7 | 8 | subroutine do_d 9 | print *, "Task D starting" 10 | call sleep(1) 11 | print *, "Task D finished" 12 | end subroutine 13 | 14 | subroutine do_e 15 | print *, "Task E starting" 16 | call sleep(1) 17 | print *, "Task E finished" 18 | end subroutine 19 | 20 | subroutine do_b 21 | 22 | print *, "Task B starting" 23 | call sleep(1) 24 | 25 | !$omp task 26 | call do_d 27 | !$omp end task 28 | 29 | !$omp task 30 | call do_e 31 | !$omp end task 32 | 33 | print *, "Task B finished" 34 | 35 | end subroutine 36 | 37 | subroutine do_a 38 | 39 | print *, "Task A starting" 40 | call sleep(1) 41 | 42 | !$omp task 43 | call do_b 44 | !$omp end task 45 | 46 | !$omp task 47 | call do_c 48 | !$omp end task 49 | 50 | print *, "Task A finished" 51 | 52 | end subroutine 53 | 54 | program tasks 55 | 56 | implicit none 57 | 58 | !$omp parallel 59 | !$omp master 60 | call do_a 61 | !$omp end master 62 | !$omp end parallel 63 | 64 | end program 65 | 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tom Deakin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /code/fibonacci.f90: -------------------------------------------------------------------------------- 1 | 2 | recursive integer function fib(n) result(res) 3 | 4 | implicit none 5 | 6 | integer :: n, i, j 7 | 8 | if (n .lt. 2) then 9 | res = n 10 | else 11 | !$omp task shared(i) 12 | i = fib(n-1) 13 | !$omp end task 14 | 15 | !$omp task shared(j) 16 | j = fib(n-2) 17 | !$omp end task 18 | 19 | !$omp taskwait 20 | res = i+j 21 | end if 22 | end function 23 | 24 | program fibonacci 25 | 26 | use timer 27 | 28 | implicit none 29 | 30 | integer :: fib ! Declare function 31 | integer :: num = 40 32 | integer :: res 33 | real(kind=8) :: tic, toc 34 | 35 | ! Start timer 36 | call wtime(tic) 37 | 38 | !$omp parallel 39 | !$omp master 40 | res = fib(num) 41 | !$omp end master 42 | !$omp end parallel 43 | 44 | ! 
Stop timer 45 | call wtime(toc) 46 | 47 | 48 | ! Print result 49 | write(*,"(A)") "------------------------------------" 50 | write(*,"(I0,A,I0)") num, "th Fibonacci is ", res 51 | write(*,"(A,F10.3)") "runtime: ", toc-tic 52 | write(*,"(A)") "------------------------------------" 53 | 54 | end program 55 | 56 | -------------------------------------------------------------------------------- /code/pi.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | do ii = 1, num_steps 25 | x = (ii-0.5_8)*step 26 | sum = sum + (4.0_8/(1.0_8+x*x)) 27 | end do 28 | pi = step * sum 29 | 30 | ! Stop timer 31 | call wtime(end) 32 | 33 | ! Print result 34 | write(*,"(A)") "------------------------------------" 35 | write(*,"(A,F19.16)") "pi is: ", pi 36 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 37 | write(*,"(A,F10.3)") "runtime: ", end-start 38 | write(*,"(A)") "------------------------------------" 39 | 40 | end program pi_main 41 | -------------------------------------------------------------------------------- /slides/99-isambard.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Using Isambard} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | \begin{frame} 15 | \frametitle{Using Isambard (1)} 16 | \begin{enumerate} 17 | \item Go to this webpage to get your account ID (01, 02, ...): \url{https://tinyurl.com/openmp-2020} 18 | \item Log into the Isambard \emph{bastion} node (the gateway to the system)\newline 19 | \mintinline{bash}|ssh br-trainXX@isambard.gw4.ac.uk| 20 | \item Password: \mintinline{bash}|openmpUG20| 21 | \item From the bastion node, log in to Isambard Phase 1\newline 22 | \mintinline{bash}|ssh phase1| 23 | \item Change to the directory containing the exercises\newline 24 | \mintinline{bash}|cd openmp-for-cs| 25 | \end{enumerate} 26 | 27 | \end{frame} 28 | 29 | \begin{frame} 30 | \frametitle{Using Isambard (2)} 31 | \begin{enumerate} 32 | \setcounter{enumi}{5} 33 | \item Build the exercises\newline 34 | \mintinline{bash}|make| 35 | \item Submit a job\newline 36 | \mintinline{bash}|qsub submit_stencil| 37 | \item Check job status\newline 38 | \mintinline{bash}|qstat -u $USER| 39 | \item Check job output\newline 40 | \mintinline{bash}|cat stencil.o9748| 41 | 42 | \end{enumerate} 43 | 44 | \end{frame} 45 | 46 | \end{document} 47 | -------------------------------------------------------------------------------- /code/pi_reduction.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! 
Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel do private(x) reduction(+:sum) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | sum = sum + (4.0_8/(1.0_8+x*x)) 28 | end do 29 | !$omp end parallel do 30 | pi = step * sum 31 | 32 | ! Stop timer 33 | call wtime(end) 34 | 35 | ! Print result 36 | write(*,"(A)") "------------------------------------" 37 | write(*,"(A,F19.16)") "pi is: ", pi 38 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 39 | write(*,"(A,F10.3)") "runtime: ", end-start 40 | write(*,"(A)") "------------------------------------" 41 | 42 | end program pi_main 43 | -------------------------------------------------------------------------------- /code/pi_atomic.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x, x2 ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel do private(x,x2) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | x2 = 4.0_8/(1.0_8+x*x) 28 | !$omp atomic 29 | sum = sum + x2 30 | end do 31 | !$omp end parallel do 32 | 33 | pi = step * sum 34 | 35 | ! Stop timer 36 | call wtime(end) 37 | 38 | ! Print result 39 | write(*,"(A)") "------------------------------------" 40 | write(*,"(A,F19.16)") "pi is: ", pi 41 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 42 | write(*,"(A,F10.3)") "runtime: ", end-start 43 | write(*,"(A)") "------------------------------------" 44 | 45 | end program pi_main 46 | -------------------------------------------------------------------------------- /code/pi_critical.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x, x2 ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! 
main loop 24 | !$omp parallel do private(x,x2) 25 | do ii = 1, num_steps 26 | x = (ii-0.5_8)*step 27 | x2 = 4.0_8/(1.0_8+x*x) 28 | !$omp critical 29 | sum = sum + x2 30 | !$omp end critical 31 | end do 32 | !$omp end parallel do 33 | 34 | pi = step * sum 35 | 36 | ! Stop timer 37 | call wtime(end) 38 | 39 | ! Print result 40 | write(*,"(A)") "------------------------------------" 41 | write(*,"(A,F19.16)") "pi is: ", pi 42 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 43 | write(*,"(A,F10.3)") "runtime: ", end-start 44 | write(*,"(A)") "------------------------------------" 45 | 46 | end program pi_main 47 | -------------------------------------------------------------------------------- /code/vadd_spmd.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Vector addition 3 | program vadd 4 | 5 | use timer 6 | use omp_lib 7 | 8 | implicit none 9 | 10 | integer :: N=50000000 11 | real(kind=8), allocatable :: A(:), B(:), C(:) 12 | integer :: i 13 | integer :: tid, nthreads 14 | real(kind=8) :: start, end 15 | 16 | ! Allocate memory 17 | allocate(A(N)) 18 | allocate(B(N)) 19 | allocate(C(N)) 20 | 21 | ! Initilise data 22 | do i = 1, N 23 | A(i) = 1.0_8 24 | B(i) = 2.0_8 25 | C(i) = 0.0_8 26 | end do 27 | 28 | ! Start timer 29 | call wtime(start) 30 | 31 | ! Open parallel region 32 | ! tid variable must be private to each thread 33 | !$omp parallel private(tid) 34 | 35 | ! Get thread number 36 | tid = omp_get_thread_num() 37 | 38 | ! Get total number of threads 39 | nthreads = omp_get_num_threads() 40 | 41 | ! Vector addition 42 | ! Share iteration space based on thread ID 43 | do i = 1+(tid*N/nthreads), (tid+1)*N/nthreads 44 | C(i) = A(i) + B(i) 45 | end do 46 | 47 | ! End parallel region 48 | !$omp end parallel 49 | 50 | ! Stop timer 51 | call wtime(end) 52 | 53 | ! Print result 54 | write(*,"(A)") "------------------------------------" 55 | write(*,"(A,F10.3)") "runtime: ", end-start 56 | if (any(C .ne. 3.0_8)) then 57 | write(*,"(A)") "WARNING: results incorrect" 58 | end if 59 | write(*,"(A)") "------------------------------------" 60 | 61 | ! Free memory 62 | deallocate(A,B) 63 | 64 | end program vadd 65 | -------------------------------------------------------------------------------- /code/pi_private.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | 6 | ! Local variables 7 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 8 | real(kind=8) :: step ! the step size 9 | integer :: ii ! genereric counter 10 | real(kind=8) :: x ! intermediate value 11 | real(kind=8) :: pi = 0.0_8 ! overall estimate 12 | real(kind=8) :: sum = 0.0_8 ! variable to store partial sum 13 | real(kind=8) :: start, end ! timers 14 | 15 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 16 | 17 | ! step size is dependent upon the number of steps 18 | step = 1.0_8/num_steps 19 | 20 | ! Start timer 21 | call wtime(start) 22 | 23 | ! main loop 24 | !$omp parallel private(x) firstprivate(sum) 25 | !$omp do 26 | do ii = 1, num_steps 27 | x = (ii-0.5_8)*step 28 | sum = sum + 4.0_8/(1.0_8+x*x) 29 | end do 30 | !$omp end do 31 | !$omp critical 32 | pi = pi + sum 33 | !$omp end critical 34 | !$omp end parallel 35 | 36 | pi = pi * step 37 | 38 | ! Stop timer 39 | call wtime(end) 40 | 41 | ! 
Print result 42 | write(*,"(A)") "------------------------------------" 43 | write(*,"(A,F19.16)") "pi is: ", pi 44 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 45 | write(*,"(A,F10.3)") "runtime: ", end-start 46 | write(*,"(A)") "------------------------------------" 47 | 48 | end program pi_main 49 | -------------------------------------------------------------------------------- /code/stencil.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | do i = 1, nx 46 | do j = 1, ny 47 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 48 | end do 49 | end do 50 | 51 | ! Swap pointers 52 | Aptr => A 53 | A => Atmp 54 | Atmp => Aptr 55 | 56 | end do 57 | 58 | ! Stop timer 59 | call wtime(toc) 60 | 61 | ! Sum up grid values for rudimentary correctness check 62 | total_end = sum(A(:,:)) 63 | 64 | ! Print result 65 | write(*,"(A)") "------------------------------------" 66 | write(*,"(A,F10.3)") "runtime: ", toc-tic 67 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 68 | write(*,"(A)") "result: Failed" 69 | else 70 | write(*,"(A)") "result: Passed" 71 | end if 72 | write(*,"(A)") "------------------------------------" 73 | 74 | 75 | deallocate(A, Atmp) 76 | 77 | end program stencil 78 | 79 | -------------------------------------------------------------------------------- /code/private.f90: -------------------------------------------------------------------------------- 1 | 2 | program private 3 | 4 | USE omp_lib 5 | 6 | implicit none 7 | 8 | integer :: i ! Loop index 9 | integer :: nthreads ! Number of threads 10 | integer :: N=10 ! Number of iterations 11 | integer :: x=-1 ! Original variable 12 | 13 | write(*,"(A)") "------------------------------------" 14 | 15 | !$omp parallel 16 | nthreads = omp_get_num_threads() 17 | !$omp end parallel 18 | write (*,"(A,I0)") "num threads: ", nthreads 19 | write (*,*) 20 | 21 | write (*,"(A,I0)") "original: x=", x 22 | write (*,*) 23 | 24 | ! Private clause 25 | x=-1 26 | write (*,"(A,I0)") "private:" 27 | write (*,"(1X,A,I0)") "before: x=", x 28 | !$omp parallel do private(x) 29 | do i = 1, N 30 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 31 | x = i 32 | end do 33 | !$omp end parallel do 34 | write (*,"(1X,A,I0)") "after: x=", x 35 | write (*,*) 36 | 37 | ! 
First private clause 38 | x=-1 39 | write (*,"(A,I0)") "firstprivate:" 40 | write (*,"(1X,A,I0)") "before: x=", x 41 | !$omp parallel do firstprivate(x) 42 | do i = 1, N 43 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 44 | x = i 45 | end do 46 | !$omp end parallel do 47 | write (*,"(1X,A,I0)") "after: x=", x 48 | write (*,*) 49 | 50 | ! Last private clause 51 | x=-1 52 | write (*,"(A,I0)") "lastprivate:" 53 | write (*,"(1X,A,I0)") "before: x=", x 54 | !$omp parallel do lastprivate(x) 55 | do i = 1, N 56 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i 57 | x = i 58 | end do 59 | !$omp end parallel do 60 | write (*,"(1X,A,I0)") "after: x=", x 61 | 62 | write(*,"(A)") "------------------------------------" 63 | 64 | end program private 65 | -------------------------------------------------------------------------------- /code/stencil_paralleldo.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | !$omp parallel do collapse(2) 46 | do i = 1, nx 47 | do j = 1, ny 48 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 49 | end do 50 | end do 51 | !$omp end parallel do 52 | 53 | ! Swap pointers 54 | Aptr => A 55 | A => Atmp 56 | Atmp => Aptr 57 | 58 | end do 59 | 60 | ! Stop timer 61 | call wtime(toc) 62 | 63 | ! Sum up grid values for rudimentary correctness check 64 | total_end = sum(A(:,:)) 65 | 66 | ! Print result 67 | write(*,"(A)") "------------------------------------" 68 | write(*,"(A,F10.3)") "runtime: ", toc-tic 69 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 70 | write(*,"(A)") "result: Failed" 71 | else 72 | write(*,"(A)") "result: Passed" 73 | end if 74 | write(*,"(A)") "------------------------------------" 75 | 76 | 77 | deallocate(A, Atmp) 78 | 79 | end program stencil 80 | 81 | -------------------------------------------------------------------------------- /code/pi_array.f90: -------------------------------------------------------------------------------- 1 | 2 | program pi_main 3 | 4 | use timer 5 | use omp_lib 6 | 7 | ! Local variables 8 | integer, parameter :: num_steps = 100000000 ! number of steps over which to estimate pi 9 | real(kind=8) :: step ! the step size 10 | integer :: ii ! genereric counter 11 | real(kind=8) :: x ! intermediate value 12 | real(kind=8) :: pi = 0.0_8 ! overall estimate 13 | real(kind=8), allocatable :: sum(:) ! variable to store partial sum 14 | real(kind=8) :: start, end ! timers 15 | integer :: nthreads ! number of OpenMP threads 16 | integer :: tid ! 
thread id 17 | 18 | real(kind=8), parameter :: PI_8 = 4.0_8 * atan(1.0_8) 19 | 20 | ! Get number of OpenMP threads 21 | !$omp parallel 22 | nthreads = omp_get_num_threads() 23 | !$omp end parallel 24 | 25 | allocate(sum(nthreads)) 26 | 27 | ! step size is dependent upon the number of steps 28 | step = 1.0_8/num_steps 29 | 30 | ! Start timer 31 | call wtime(start) 32 | 33 | ! main loop 34 | !$omp parallel private(x,tid) 35 | tid = omp_get_thread_num() 36 | sum(tid+1) = 0.0_8 37 | !$omp do 38 | do ii = 1, num_steps 39 | x = (ii-0.5_8)*step 40 | sum(tid+1) = sum(tid+1) + (4.0_8/(1.0_8+x*x)) 41 | !$omp flush(sum) 42 | end do 43 | !$omp end do 44 | !$omp end parallel 45 | 46 | ! Total partial sums serially 47 | do ii = 1, nthreads 48 | pi = pi + sum(ii) 49 | end do 50 | pi = pi * step 51 | 52 | ! Stop timer 53 | call wtime(end) 54 | 55 | ! Print result 56 | write(*,"(A)") "------------------------------------" 57 | write(*,"(A,F19.16)") "pi is: ", pi 58 | write(*,"(A,F19.16)") "error is: ", abs(pi - PI_8) 59 | write(*,"(A,F10.3)") "runtime: ", end-start 60 | write(*,"(A)") "------------------------------------" 61 | 62 | deallocate(sum) 63 | 64 | end program pi_main 65 | -------------------------------------------------------------------------------- /code/stencil_reduction.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end, total 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do i = 0, nx+1 23 | do j = 0, ny+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do i = nx/4, 3*nx/4 31 | do j = ny/4, 3*ny/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Start timer 39 | call wtime(tic) 40 | 41 | ! Loop a number of times 42 | do t = 1, ntimes 43 | 44 | ! Update the stencil 45 | total = 0.0_8 46 | !$omp parallel do collapse(2) reduction(+:total) 47 | do i = 1, nx 48 | do j = 1, ny 49 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0_8 50 | total = total + Atmp(i,j) 51 | end do 52 | end do 53 | !$omp end parallel do 54 | 55 | ! Print out total 56 | write(*,"(I0,A,F15.5)") t, ": total=", total 57 | 58 | ! Swap pointers 59 | Aptr => A 60 | A => Atmp 61 | Atmp => Aptr 62 | 63 | end do 64 | 65 | ! Stop timer 66 | call wtime(toc) 67 | 68 | ! Sum up grid values for rudimentary correctness check 69 | total_end = sum(A(:,:)) 70 | 71 | ! Print result 72 | write(*,"(A)") "------------------------------------" 73 | write(*,"(A,F10.3)") "runtime: ", toc-tic 74 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 75 | write(*,"(A)") "result: Failed" 76 | else 77 | write(*,"(A)") "result: Passed" 78 | end if 79 | write(*,"(A)") "------------------------------------" 80 | 81 | 82 | deallocate(A, Atmp) 83 | 84 | end program stencil 85 | 86 | -------------------------------------------------------------------------------- /code/stencil_target.f90: -------------------------------------------------------------------------------- 1 | 2 | ! 
5 point stencil 3 | program stencil 4 | 5 | use timer 6 | 7 | implicit none 8 | 9 | integer :: nx = 4000 10 | integer :: ny = 4000 11 | integer :: ntimes = 30 12 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 13 | integer :: i, j, t 14 | real(kind=8) :: total_start, total_end, total 15 | real(kind=8) :: tic, toc 16 | 17 | ! Allocate memory 18 | allocate(A(0:nx+1,0:ny+1)) 19 | allocate(Atmp(0:nx+1,0:ny+1)) 20 | 21 | ! Initialise data to zero 22 | do j = 0, ny+1 23 | do i = 0, nx+1 24 | A(i,j) = 0.0_8 25 | Atmp(i,j) = 0.0_8 26 | end do 27 | end do 28 | 29 | ! Insert values in centre of grid 30 | do j = ny/4, 3*ny/4 31 | do i = nx/4, 3*nx/4 32 | A(i,j) = 1.0_8 33 | end do 34 | end do 35 | 36 | total_start = sum(A(:,:)) 37 | 38 | ! Copy data to device 39 | !$omp target enter data map(to: A, Atmp) 40 | 41 | ! Start timer 42 | call wtime(tic) 43 | 44 | ! Loop a number of times 45 | do t = 1, ntimes 46 | 47 | ! Update the stencil 48 | total = 0.0_8 49 | !$omp target map(tofrom:total) 50 | !$omp teams distribute parallel do reduction(+:total) collapse(2) 51 | do j = 1, ny 52 | do i = 1, nx 53 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 54 | total = total + Atmp(i,j) 55 | end do 56 | end do 57 | !$omp end teams distribute parallel do 58 | !$omp end target 59 | 60 | ! Print out total 61 | write(*,"(I0,A,F15.5)") t, ": total=", total 62 | 63 | ! Swap pointers 64 | Aptr => A 65 | A => Atmp 66 | Atmp => Aptr 67 | 68 | end do 69 | 70 | ! Stop timer 71 | call wtime(toc) 72 | 73 | ! Copy data back 74 | !$omp target exit data map(from: A, Atmp) 75 | 76 | ! Sum up grid values for rudimentary correctness check 77 | total_end = sum(A(:,:)) 78 | 79 | ! Print result 80 | write(*,"(A)") "------------------------------------" 81 | write(*,"(A,F10.3)") "runtime: ", toc-tic 82 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 83 | write(*,"(A)") "result: Failed" 84 | else 85 | write(*,"(A)") "result: Passed" 86 | end if 87 | write(*,"(A)") "------------------------------------" 88 | 89 | 90 | deallocate(A, Atmp) 91 | 92 | end program stencil 93 | 94 | -------------------------------------------------------------------------------- /code/stencil_optimised.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Update the stencil 3 | subroutine kernel(nx, ny, A, Atmp, total) 4 | 5 | implicit none 6 | 7 | integer :: nx, ny 8 | real(kind=8) :: A(0:nx+1, 0:ny+1) 9 | real(kind=8) :: Atmp(0:nx+1, 0:ny+1) 10 | real(kind=8) :: total 11 | 12 | integer :: i, j 13 | 14 | total = 0.0_8 15 | !$omp parallel do reduction(+:total) 16 | do j = 1, ny 17 | !$omp simd 18 | do i = 1, nx 19 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 20 | total = total + Atmp(i,j) 21 | end do 22 | !$omp end simd 23 | end do 24 | !$omp end parallel do 25 | 26 | end subroutine kernel 27 | 28 | ! 5 point stencil 29 | program stencil 30 | 31 | use timer 32 | 33 | implicit none 34 | 35 | integer :: nx = 4000 36 | integer :: ny = 4000 37 | integer :: ntimes = 30 38 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 39 | integer :: i, j, t 40 | real(kind=8) :: total_start, total_end, total 41 | real(kind=8) :: tic, toc 42 | 43 | ! Allocate memory 44 | allocate(A(0:nx+1,0:ny+1)) 45 | allocate(Atmp(0:nx+1,0:ny+1)) 46 | 47 | ! Initialise data to zero 48 | do j = 0, ny+1 49 | do i = 0, nx+1 50 | A(i,j) = 0.0_8 51 | Atmp(i,j) = 0.0_8 52 | end do 53 | end do 54 | 55 | ! 
Insert values in centre of grid 56 | do i = nx/4, 3*nx/4 57 | do j = ny/4, 3*ny/4 58 | A(i,j) = 1.0_8 59 | end do 60 | end do 61 | 62 | total_start = sum(A(:,:)) 63 | 64 | ! Start timer 65 | call wtime(tic) 66 | 67 | ! Loop a number of times 68 | do t = 1, ntimes 69 | 70 | ! Update the stencil 71 | call kernel(nx, ny, A, Atmp, total) 72 | 73 | ! Print out total 74 | write(*,"(I0,A,F15.5)") t, ": total=", total 75 | 76 | ! Swap pointers 77 | Aptr => A 78 | A => Atmp 79 | Atmp => Aptr 80 | 81 | end do 82 | 83 | ! Stop timer 84 | call wtime(toc) 85 | 86 | ! Sum up grid values for rudimentary correctness check 87 | total_end = sum(A(:,:)) 88 | 89 | ! Print result 90 | write(*,"(A)") "------------------------------------" 91 | write(*,"(A,F10.3)") "runtime: ", toc-tic 92 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 93 | write(*,"(A)") "result: Failed" 94 | else 95 | write(*,"(A)") "result: Passed" 96 | end if 97 | write(*,"(A)") "------------------------------------" 98 | 99 | 100 | deallocate(A, Atmp) 101 | 102 | end program stencil 103 | 104 | -------------------------------------------------------------------------------- /code/stencil_numa.f90: -------------------------------------------------------------------------------- 1 | 2 | ! Update the stencil 3 | subroutine kernel(nx, ny, A, Atmp, total) 4 | 5 | implicit none 6 | 7 | integer :: nx, ny 8 | real(kind=8) :: A(0:nx+1, 0:ny+1) 9 | real(kind=8) :: Atmp(0:nx+1, 0:ny+1) 10 | real(kind=8) :: total 11 | 12 | integer :: i, j 13 | 14 | total = 0.0_8 15 | !$omp parallel do reduction(+:total) 16 | do j = 1, ny 17 | !$omp simd 18 | do i = 1, nx 19 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2_8 20 | total = total + Atmp(i,j) 21 | end do 22 | !$omp end simd 23 | end do 24 | !$omp end parallel do 25 | 26 | end subroutine kernel 27 | 28 | ! 5 point stencil 29 | program stencil 30 | 31 | use timer 32 | 33 | implicit none 34 | 35 | integer :: nx = 4000 36 | integer :: ny = 4000 37 | integer :: ntimes = 30 38 | real(kind=8), dimension(:,:), pointer :: A, Atmp, Aptr 39 | integer :: i, j, t 40 | real(kind=8) :: total_start, total_end, total 41 | real(kind=8) :: tic, toc 42 | 43 | ! Allocate memory 44 | allocate(A(0:nx+1,0:ny+1)) 45 | allocate(Atmp(0:nx+1,0:ny+1)) 46 | 47 | ! Initialise data to zero 48 | !$omp parallel do 49 | do j = 0, ny+1 50 | do i = 0, nx+1 51 | A(i,j) = 0.0_8 52 | Atmp(i,j) = 0.0_8 53 | end do 54 | end do 55 | !$omp end parallel do 56 | 57 | ! Insert values in centre of grid 58 | do i = nx/4, 3*nx/4 59 | do j = ny/4, 3*ny/4 60 | A(i,j) = 1.0_8 61 | end do 62 | end do 63 | 64 | total_start = sum(A(:,:)) 65 | 66 | ! Start timer 67 | call wtime(tic) 68 | 69 | ! Loop a number of times 70 | do t = 1, ntimes 71 | 72 | ! Update the stencil 73 | call kernel(nx, ny, A, Atmp, total) 74 | 75 | ! Print out total 76 | write(*,"(I0,A,F15.5)") t, ": total=", total 77 | 78 | ! Swap pointers 79 | Aptr => A 80 | A => Atmp 81 | Atmp => Aptr 82 | 83 | end do 84 | 85 | ! Stop timer 86 | call wtime(toc) 87 | 88 | ! Sum up grid values for rudimentary correctness check 89 | total_end = sum(A(:,:)) 90 | 91 | ! 
Print result 92 | write(*,"(A)") "------------------------------------" 93 | write(*,"(A,F10.3)") "runtime: ", toc-tic 94 | if (abs(total_end-total_start)/total_start > 1.0E-8) then 95 | write(*,"(A)") "result: Failed" 96 | else 97 | write(*,"(A)") "result: Passed" 98 | end if 99 | write(*,"(A)") "------------------------------------" 100 | 101 | 102 | deallocate(A, Atmp) 103 | 104 | end program stencil 105 | 106 | -------------------------------------------------------------------------------- /slides/07-wrapup.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Wrap up} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | \begin{frame} 15 | \frametitle{OpenMP 5.0} 16 | OpenMP 5 adds features to make writing performance portable programs simpler. 17 | 18 | Highlighting some applicable to target: 19 | \begin{itemize} 20 | \item Loop construct 21 | \item Mappers 22 | \item Unified Shared Memory (USM) 23 | \item Function variants 24 | \item Reverse offload 25 | \item \mintinline{fortran}|OMP_TARGET_OFFLOAD| 26 | \item Reduction variables now implicitly \mintinline{fortran}|map(tofrom)| 27 | \end{itemize} 28 | 29 | \end{frame} 30 | 31 | %------------------------------------------------------------------------------- 32 | \begin{frame}[fragile] 33 | \frametitle{Loop} 34 | \begin{itemize} 35 | \item Assert that the iterations in a loop nest may execute in any order, including concurrently 36 | \item Let the compiler figure our how to best utilize parallel resources 37 | \end{itemize} 38 | 39 | \begin{minted}[]{fortran} 40 | !$omp target 41 | !$omp loop 42 | do i = 1, N 43 | a(i) = b(i) 44 | end do 45 | !$omp end loop 46 | !$omp end target 47 | \end{minted} 48 | 49 | \end{frame} 50 | %------------------------------------------------------------------------------- 51 | 52 | \begin{frame}[fragile] 53 | \frametitle{Unified shared memory} 54 | Code requires specific features, e.g. shared memory between host and devices. 55 | 56 | \begin{minted}[]{fortran} 57 | 58 | real(kind=8), dimension(:), allocatable :: A 59 | allocate(A(1024)) 60 | 61 | !$omp requires unified_shared_memory 62 | 63 | !$omp target 64 | call do_something_with_A(A) 65 | !$omp end target 66 | \end{minted} 67 | 68 | No map clauses. Data is shared between the host and device. 69 | 70 | \end{frame} 71 | 72 | %------------------------------------------------------------------------------- 73 | \begin{frame} 74 | \frametitle{OpenMP resources} 75 | \begin{itemize} 76 | \item Two brilliant books from MIT Press: 77 | \begin{itemize} 78 | \item The OpenMP Common Core: Making OpenMP Simple Again --- Tim Mattson, Yun (Helen) Ye and Alice Koniges. 79 | \item Using OpenMP - The Next Steps --- Ruud van de Pas, Eric Stotzer and Christian Terboven. 80 | \end{itemize} 81 | \item OpenMP website: \url{https://www.openmp.org} 82 | \begin{itemize} 83 | \item The specification (not for the faint hearted). 84 | \item Download summary cards. 85 | \item List of compiler support. 86 | \item Example code for all the directives. 
87 | \item List of books: \url{https://www.openmp.org/resources/openmp-books/} 88 | \end{itemize} 89 | 90 | \end{itemize} 91 | \end{frame} 92 | %------------------------------------------------------------------------------- 93 | \end{document} 94 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | # OpenMP codes 2 | 3 | This project contains a number of OpenMP examples. 4 | 5 | A Fortran timing module (itself an interface to a C time call) is also provided as a utility to aid in getting wall clock time for serial Fortran programs. 6 | 7 | ## Contents 8 | - [Compiling the code](#compiling-the-code) 9 | - [Vector addition](#vector-addition) 10 | - [5-point stencil](#5-point-stencil) 11 | - [Pi](#pi) 12 | - [Private](#private) 13 | - [Fibonacci](#fibonacci) 14 | - [Jacobi](#jacobi) 15 | - [Utility timing routines](#utility-timing-routines) 16 | 17 | ## Compiling the code 18 | The provided `Makefile` will build all of the provided code. 19 | The default compiler is `gfortran`. 20 | 21 | To use your own compiler, edit the `FTN` variable in the `Makefile`. 22 | For example, set `FTN=ifort` to use the Intel Fortran compiler. 23 | 24 | Additional compiler flags can be set using the `FFLAGS` variable in the `Makefile`. 25 | 26 | The OpenMP library is set using the `LIBS` variable in the `Makefile`. 27 | 28 | Run `make clean` to clear away the built binaries and partial build files. 29 | 30 | ## Vector Addition 31 | 32 | Serial and parallel versions of the simple vector add program: `C=A+B`. 33 | Both a SPMD and a `parallel do` parallel version are provided (as solutions). 34 | 35 | ## 5-point stencil 36 | 37 | Serial and parallel versions of a simple 5-point stencil on a rectangular grid. 38 | The value in each cell is computed as the average (mean) of itself and north, south, east and west neighbours. 39 | The stencil is applied to the grid a number of times. 40 | 41 | ## Pi 42 | 43 | This code implements the integration of `4/(1+x*x)` using the trapezoidal rule to estimate pi. 44 | 45 | A number of implementations are given, and should be viewed in order: 46 | 47 | 1. pi: the serial version 48 | 2. critical: an initial parallel version, using a critical region to safeguard sum 49 | 3. atomic: parallel version, using an atomic to safeguard sum 50 | 4. array: parallel version, using an array of partial sums, one per thread 51 | 5. private: parallel version, using a private sum to each thread, totalled with a critical 52 | 6. reduction: parallel version using OpenMP reduction 53 | 54 | ## Private 55 | 56 | This code is a simple example to show how different private data sharing clauses change the data environment of each thread. 57 | 58 | 59 | ## Fibonacci 60 | An implementation of a recursive algorithm to calculate Fibonacci numbers using OpenMP tasks. 61 | 62 | 63 | ## Jacobi 64 | 65 | This code implements the iterative Jacobi method to solve a system of linear equations. 66 | See the [Wikipedia page](https://en.wikipedia.org/wiki/Jacobi_method) for a full description of the Jacobi method. 67 | 68 | The program can be run without any arguments to solve a default problem. 69 | The `-n` and `-i` arguments can be used to control the matrix size and maximum number of iterations. 70 | For example, to solve for a 500x500 matrix, use the following command: 71 | 72 | ./jacobi -n 500 73 | 74 | Use `--help` to see a full description for all of the command-line arguments. 
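For example, a fuller invocation might look like the following (an illustrative sketch: `-n`, `-i` and `-c` are the short forms of `--norder`, `--iterations` and `--convergence` handled by `parse_arguments` in `jacobi.f90`, and the values shown are arbitrary). The standard `OMP_NUM_THREADS` environment variable only takes effect once you have added OpenMP directives to the solver:

    OMP_NUM_THREADS=4 ./jacobi -n 1000 -i 5000 -c 0.00001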
75 | 76 | ### Sample runtimes 77 | 78 | Here are the runtimes that we achieve with the starting code for a few different matrix sizes. 79 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz). 80 | 81 | | Matrix size | Solver runtime | Iterations | Solution error | 82 | | ----------- | --------------- | ---------- | ---------------- | 83 | | 500 | 0.331 seconds | 1511 | 0.0248609 | 84 | | 1000 | 4.858 seconds | 2883 | 0.0499393 | 85 | | 2000 | 170 seconds | 5445 | 0.0999166 | 86 | | 4000 | 1671 seconds | 10233 | 0.1998391 | 87 | 88 | ## Utility timing routines 89 | The `timer.f90` and `wtime.c` files provide a simple timing routine to use for all examples. 90 | The time is recorded in C using `gettimeofday()`, and a Fortran interface is provided. 91 | This was provided so that the serial codes can use a simple timing library. 92 | Users should use the OpenMP `omp_get_wtime()` API call for their parallel codes. 93 | 94 | -------------------------------------------------------------------------------- /slides/00-prelim.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{Preliminaries} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | % \begin{frame} 14 | % \frametitle{Audience} 15 | 16 | % \begin{itemize} 17 | % \item Teaches OpenMP 4.5 and 5.0 in a seminar style. 18 | % \item 6 lecture topics, with exercises and solutions. 19 | % \item Designed for Computational Scientists familiar with Fortran and MPI programming. 20 | 21 | % \end{itemize} 22 | 23 | % Download code (and slides) from: 24 | % \url{https://github.com/UoB-HPC/openmp-for-cs} 25 | 26 | % \end{frame} 27 | %------------------------------------------------------------------------------- 28 | 29 | \begin{frame} 30 | \frametitle{Introduction} 31 | 32 | \begin{itemize} 33 | \item Today: Learn OpenMP 4.5 (and maybe some 5.0). 34 | \item We will cover a lot of material! 35 | \item This is a hands-on tutorial! 36 | \item Mixture of lectures and exercises. 37 | % \item Exercises designed to try programming OpenMP. 38 | \item Experiment and have fun with them! 39 | \item Solutions provided, but only look as last resort. 40 | \item Assume knowledge of basic Fortran; parallel programming with MPI useful. 41 | \end{itemize} 42 | \end{frame} 43 | 44 | %------------------------------------------------------------------------------- 45 | 46 | \begin{frame} 47 | \frametitle{Materials} 48 | \begin{block}{Materials} 49 | Download code (and slides) from: 50 | \url{https://github.com/UoB-HPC/openmp-for-cs} 51 | \end{block} 52 | \end{frame} 53 | %------------------------------------------------------------------------------- 54 | 55 | \begin{frame} 56 | \frametitle{GW4 Isambard} 57 | \begin{columns} 58 | \begin{column}{0.7\framewidth} 59 | \begin{itemize} 60 | \item UK Tier-2 Supercomputer. 61 | \item Collaboration between GW4 Alliance, UK Met Office, Cray, Arm and EPSRC. 62 | \item 21,000+ Armv8 cores. 63 | \item Collection of CPUs/GPUs from different vendors. 64 | \item \textbf{Today:} using the Intel Xeon 2x18-core Broadwell and NVIDIA P100 nodes. 
65 | \end{itemize} 66 | \end{column} 67 | \begin{column}{0.3\framewidth} 68 | \includegraphics[width=\textwidth]{isambard.jpeg} 69 | \end{column} 70 | \end{columns} 71 | 72 | Thanks to Simon McIntosh-Smith and Bristol for supporting today's tutorial with time on Isambard. 73 | 74 | \end{frame} 75 | 76 | %------------------------------------------------------------------------------- 77 | 78 | 79 | \begin{frame} 80 | \frametitle{Agenda} 81 | 82 | \textbf{Part One: CPUs} 83 | \begin{description} 84 | \item[09:30--09:40] Introduction. 85 | \item[09:40--10:10] Parallel worksharing. 86 | \item[10:10--10:35] Exercise 1: Parallel stencil (two-ways). 87 | \item[10:35--11:00] Data sharing. 88 | \item[11:00--11:15] Coffee Break. 89 | \item[11:15--11:35] Exercise 2: Parallel convergence. 90 | \item[11:35--12:10] Vectorisation and NUMA. 91 | \item[12:10--12:30] Exercise 3: Optimising stencil. 92 | \end{description} 93 | 94 | \textbf{Lunch break (12:30--13:30)} 95 | \end{frame} 96 | 97 | \begin{frame} 98 | \frametitle{Agenda} 99 | \textbf{Lunch break (12:30--13:30)} 100 | The Zoom session is open: feel free to continue on the morning exercises and ask questions in the Q and A. 101 | 102 | \textbf{Part Two: GPUs} 103 | \begin{description} 104 | \item[13:30--13:35] Welcome back. 105 | \item[13:35--14:10] Transferring execution and data movement. 106 | \item[14:10--14:35] Exercise 4: Stencil on a GPU. 107 | \item[14:35--15:00] Target Parallelism. 108 | \item[15:00--15:15] Coffee Break. 109 | \item[15:15--15:40] Optimising data movement. 110 | \item[15:40--16:25] Exercise 5: Optimising stencil on a GPU. 111 | \item[16:25--16:30] Wrap up. 112 | \end{description} 113 | \end{frame} 114 | 115 | %------------------------------------------------------------------------------- 116 | 117 | % \begin{frame} 118 | % \frametitle{Exercises} 119 | % \begin{itemize} 120 | % \item This is a hands-on course! 121 | % \item Exercises will be set for you to try programming OpenMP yourselves. 122 | % \item Sample solutions also provided. 123 | % \item All the exercises will be in Fortran. 124 | % \end{itemize} 125 | 126 | % \end{frame} 127 | 128 | %------------------------------------------------------------------------------- 129 | % \section{Outline} 130 | % \begin{frame} 131 | % \frametitle{Course Outline} 132 | % Organised as 6 sessions teaching OpenMP plus top-tips for getting good performance. 133 | % \begin{enumerate} 134 | % \item OpenMP overview 135 | % \item Data sharing and reductions 136 | % \item Vectorisation and code optimisations 137 | % \item NUMA and MPI interoperability 138 | % \item GPU programming with OpenMP 139 | % \item Tasks and Tools 140 | % \end{enumerate} 141 | % \end{frame} 142 | 143 | %------------------------------------------------------------------------------- 144 | \begin{frame} 145 | \frametitle{Thanks} 146 | Thanks go to the following authors, whose own OpenMP tutorials have inspired this one: 147 | \begin{itemize} 148 | \item Tim Mattson (Intel) 149 | \item Alice Koniges (Berkeley Lab/NERSC) 150 | \item Simon McIntosh-Smith and the HPC team (UoBristol) 151 | \item Gethin Williams (UoBristol) 152 | \item and many others 153 | \end{itemize} 154 | \end{frame} 155 | %------------------------------------------------------------------------------- 156 | 157 | \end{document} 158 | -------------------------------------------------------------------------------- /code/jacobi.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! 
Implementation of the iterative Jacobi method. 3 | ! 4 | ! Given a known, diagonally dominant matrix A and a known vector b, we aim to 5 | ! to find the vector x that satisfies the following equation: 6 | ! 7 | ! Ax = b 8 | ! 9 | ! We first split the matrix A into the diagonal D and the remainder R: 10 | ! 11 | ! (D + R)x = b 12 | ! 13 | ! We then rearrange to form an iterative solution: 14 | ! 15 | ! x' = (b - Rx) / D 16 | ! 17 | ! More information: 18 | ! -> https://en.wikipedia.org/wiki/Jacobi_method 19 | ! 20 | 21 | ! Module which contains the Jacobi solver subrountine 22 | module solve_mod 23 | 24 | contains 25 | 26 | ! Solve Ax=b according to the Jacobi method 27 | subroutine solve(N, A, b, x, xtmp, itr, MAX_ITERATIONS, CONVERGENCE_THRESHOLD) 28 | 29 | implicit none 30 | 31 | ! Input variables 32 | integer :: N ! Matrix order 33 | real(kind=8) :: A(N,N) ! The matrix 34 | real(kind=8) :: b(N) ! The right hand side vector 35 | real(kind=8), pointer :: x(:) ! Initial solution 36 | real(kind=8), pointer :: xtmp(:) ! Next solution 37 | integer :: itr ! Iterations to solve 38 | integer :: MAX_ITERATIONS ! Iteration limit 39 | real(kind=8) :: CONVERGENCE_THRESHOLD ! Convergence criteria 40 | 41 | ! Local variables 42 | real(kind=8), pointer :: ptrtmp(:) ! Used for pointer swapping 43 | integer :: row, col ! Matrix index 44 | real(kind=8) :: dot 45 | real(kind=8) :: diff, sqdiff=huge(0.0_8) 46 | 47 | ! Loop until converged or maximum iterations reached 48 | itr = 0 49 | do while (itr .lt. MAX_ITERATIONS .and. sqrt(sqdiff) .gt. CONVERGENCE_THRESHOLD) 50 | ! Perfom Jacobi iteration 51 | do row = 1, N 52 | dot = 0.0_8 53 | do col = 1, N 54 | if (row .ne. col) then 55 | dot = dot + (A(row,col) * x(col)) 56 | end if 57 | end do 58 | xtmp(row) = (b(row) - dot) / A(row,row) 59 | end do 60 | 61 | ! Swap pointers 62 | ptrtmp => x 63 | x => xtmp 64 | xtmp => ptrtmp 65 | 66 | ! Check for convergence 67 | sqdiff = 0.0_8 68 | do row = 1, N 69 | diff = xtmp(row) - x(row) 70 | sqdiff = sqdiff + (diff * diff) 71 | end do 72 | 73 | itr = itr + 1 74 | end do 75 | 76 | end subroutine solve 77 | end module solve_mod 78 | 79 | ! Main program 80 | program jacobi 81 | 82 | use timer 83 | use solve_mod ! Include solver (above) 84 | 85 | implicit none 86 | 87 | ! Solver settings 88 | integer :: MAX_ITERATIONS=20000 89 | real(kind=8) :: CONVERGENCE_THRESHOLD=0.0001 90 | 91 | ! Timers 92 | real(kind=8) :: total_start, total_end 93 | real(kind=8) :: solve_start, solve_end 94 | 95 | ! Matrix size 96 | integer :: N=1000 97 | 98 | ! Data arrays 99 | real(kind=8), allocatable :: A(:,:) ! The matrix 100 | real(kind=8), allocatable :: b(:) ! The right hand size vector 101 | real(kind=8), pointer :: x(:) ! Initial solution 102 | real(kind=8), pointer :: xtmp(:) ! Temporary solution storage 103 | integer :: itr ! Iteration count 104 | 105 | ! Local variables 106 | integer :: row, col 107 | real(kind=8) :: rowsum, value 108 | real(kind=8) :: err, tmp 109 | 110 | ! Read in any command line arguments which set problem variables 111 | call parse_arguments(MAX_ITERATIONS, CONVERGENCE_THRESHOLD, N) 112 | 113 | ! Allocate memory 114 | allocate(A(N,N)) 115 | allocate(b(N)) 116 | allocate(x(N)) 117 | allocate(xtmp(N)) 118 | 119 | ! 
Print header 120 | write(*,"(A)") "------------------------------------" 121 | write(*,"(A,I0,A,I0)") "Matrix size: ", N, " x ", N 122 | write(*,"(A,I0)") "Maximum iterations: ", MAX_ITERATIONS 123 | write(*,"(A,F7.5)") "Convergence threshold: ", CONVERGENCE_THRESHOLD 124 | write(*,"(A)") "------------------------------------" 125 | write(*,*) 126 | 127 | ! Start the program timer 128 | call wtime(total_start) 129 | 130 | ! Initialize data randomly 131 | ! A needs to be a diagonally dominant square matrix, so diagonal entries are biased 132 | do row = 1, N 133 | rowsum = 0.0_8 134 | do col = 1, N 135 | call random_number(value) 136 | A(row,col) = value 137 | rowsum = rowsum + value 138 | end do 139 | A(row,row) = A(row,row) + rowsum 140 | call random_number(b(row)) 141 | x(row) = 0.0_8 142 | end do 143 | 144 | ! Run Jacobi solver 145 | call wtime(solve_start) 146 | call solve(N, A, b, x, xtmp, itr, MAX_ITERATIONS, CONVERGENCE_THRESHOLD) 147 | call wtime(solve_end) 148 | 149 | ! Check error of final solution 150 | err = 0.0_8 151 | do row = 1, N 152 | tmp = 0.0_8 153 | do col = 1, N 154 | tmp = tmp + (A(row,col) * x(col)) 155 | end do 156 | tmp = b(row) - tmp 157 | err = err + (tmp*tmp) 158 | end do 159 | err = sqrt(err) 160 | 161 | ! Stop the program timer 162 | call wtime(total_end) 163 | 164 | ! Print results 165 | write(*,"(A,F13.7)") "Solution error = ", err 166 | write(*,"(A,I0)") "Iterations = ", itr 167 | write(*,"(A,F10.3)") "Total runtime = ", total_end-total_start 168 | write(*,"(A,F10.3)") "Solver runtime = ", solve_end-solve_start 169 | if (itr .eq. MAX_ITERATIONS) write(*,"(A)") "WARNING: solution did not converge" 170 | write(*,"(A)") "------------------------------------" 171 | 172 | ! Free memory 173 | deallocate(A, b, x, xtmp) 174 | 175 | end program jacobi 176 | 177 | ! Parse the command line arguments, setting the problem size, etc. 178 | subroutine parse_arguments(MAX_ITERATIONS, CONVERGENCE_THRESHOLD, N) 179 | 180 | implicit none 181 | 182 | integer :: MAX_ITERATIONS 183 | real(kind=8) :: CONVERGENCE_THRESHOLD 184 | integer :: N 185 | 186 | character(len=32) :: arg 187 | 188 | integer :: i=1 189 | integer :: err 190 | 191 | do while (i .le. command_argument_count()) 192 | call get_command_argument(i, arg) 193 | arg = trim(arg) 194 | 195 | if ("--convergence" .eq. arg .or. & 196 | "-c" .eq. arg) then 197 | i = i + 1 198 | call get_command_argument(i, arg, status=err) 199 | if (err .ne. 0) then 200 | write (*,*) "Error: no convergence threshold given" 201 | stop 202 | end if 203 | read(arg,*) CONVERGENCE_THRESHOLD 204 | 205 | else if ("--iterations" .eq. arg .or. & 206 | "-i" .eq. arg) then 207 | i = i + 1 208 | call get_command_argument(i, arg, status=err) 209 | if (err .ne. 0) then 210 | write (*,*) "Error: no max iterations given" 211 | stop 212 | end if 213 | read(arg,*) MAX_ITERATIONS 214 | 215 | else if ("--norder" .eq. arg .or. & 216 | "-n" .eq. arg) then 217 | i = i + 1 218 | call get_command_argument(i, arg, status=err) 219 | if (err .ne. 0) then 220 | write (*,*) "Error: no matrix order given" 221 | stop 222 | end if 223 | read(arg,*) N 224 | 225 | else if ("--help" .eq. 
arg) then 226 | write(*,"(A)") "Usage: ./jacobi [OPTIONS]" 227 | write(*,*) 228 | write(*,"(A)") "Options:" 229 | write(*,"(2X,A)") "-h --help Print this message" 230 | write(*,"(2X,A)") "-c --convergence C Set convergence threshold" 231 | write(*,"(2X,A)") "-i --iterations I Set maximum number of iterations" 232 | write(*,"(2X,A)") "-n --norder N Set maxtrix order" 233 | write(*,*) 234 | stop 235 | 236 | else 237 | write (*,"(A,A)") "Unrecognized argument (try '--help'): ", arg 238 | stop 239 | end if 240 | 241 | i = i + 1 242 | end do 243 | end subroutine parse_arguments 244 | -------------------------------------------------------------------------------- /slides/04-hybrid.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{4: Combining MPI and OpenMP} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \section{Outline} 14 | \begin{frame} 15 | \frametitle{Outline} 16 | 17 | \begin{itemize} 18 | \item Quick recap 19 | \item Calculating memory bandwidth for the 5-point stencil code 20 | \end{itemize} 21 | 22 | \vfill 23 | 24 | Programming beyond a single multi-core CPU: 25 | \begin{itemize} 26 | \item Non-uniform Memory Access 27 | \item Thread affinity in OpenMP 28 | \item Combining MPI with OpenMP 29 | \end{itemize} 30 | \end{frame} 31 | 32 | %------------------------------------------------------------------------------- 33 | \section{Recap} 34 | \begin{frame} 35 | \frametitle{Recap} 36 | 37 | We've already come a long way! 38 | 39 | \begin{itemize} 40 | \item Parallelise loops with OpenMP: \mintinline{fortran}|!$omp parallel do|. 41 | \item Data sharing clauses. 42 | \item Synchronisation with barriers, atomics and \mintinline{fortran}|critical| regions. 43 | \item Reductions with the \mintinline{fortran}|reduction| clause. 44 | \item The cache hierarchy. 45 | \item Performance analysis and the Roofline model. 46 | \item Vectorisation along with the OpenMP \mintinline{fortran}|simd| construct. 47 | \item Optimisations for memory access. 48 | \end{itemize} 49 | 50 | \end{frame} 51 | 52 | %------------------------------------------------------------------------------- 53 | \begin{frame}[fragile] 54 | \frametitle{Previous exercise} 55 | 56 | Vectorise and optimise memory access patterns of your parallel 5-point stencil code: 57 | \begin{minted}[frame=single,breaklines,fontsize=\scriptsize]{fortran} 58 | !$omp parallel do reduction(+:total) 59 | do j = 1, ny 60 | !$omp simd 61 | do i = 1, nx 62 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) * 0.2 63 | total = total + Atmp(i,j) 64 | end do 65 | !$omp end simd 66 | end do 67 | !$omp end parallel do 68 | \end{minted} 69 | 70 | \begin{itemize} 71 | \item Swapped loops to ensure stride-1 access pattern. 72 | \item Removed division! 73 | \item Use \mintinline{fortran}|simd| construct on inner loop (removing \mintinline{fortran}|collapse| clause). 74 | \item Checked vectorisation report: assume sizes arrays cause issue, so move kernel into \mintinline{fortran}|subroutine|. 75 | \end{itemize} 76 | 77 | \end{frame} 78 | 79 | %------------------------------------------------------------------------------- 80 | \begin{frame} 81 | \frametitle{Calculating memory bandwidth} 82 | Is your 5-point stencil code \emph{fast}? 
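%------------------------------------------------------------------------------
% A commented (not displayed) sketch of how the bandwidth figure derived below
% might be computed in the exercise driver. The variable names (ntimes, nx, ny,
% tic, toc, bytes_moved, bandwidth) are assumptions for illustration, not taken
% from the course code; wtime (or omp_get_wtime) supplies the timestamps.
%
% call wtime(tic)
% do t = 1, ntimes
%   ... ! 5-point stencil kernel
% end do
% call wtime(toc)
% ! Perfect-cache model: A read once and Atmp written once per iteration,
% ! 8 bytes per double-precision element.
% bytes_moved = real(ntimes,8) * 2.0_8 * real(nx,8) * real(ny,8) * 8.0_8
% bandwidth   = bytes_moved / (toc - tic)   ! bytes/second
%------------------------------------------------------------------------------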
83 | 84 | \pause 85 | 86 | Calculate memory bandwidth of the \emph{kernel} as a whole: 87 | \begin{itemize}[<+->] 88 | \item Assume a ``perfect cache'' model: once you read a memory location, it's been cached and further reads are ``free'' within the kernel. 89 | \item All of \mintinline{fortran}|A| array is read: $nx \times ny$ reads. 90 | \item All of \mintinline{fortran}{Atmp} array is written: $nx \times ny$ reads. 91 | \item Total memory moved: $2 \times nx \times ny \times 8$ bytes data moved (double precision) \emph{per iteration}. 92 | \item Memory bandwidth: $\frac{ntimes \times 2 \times nx \times ny \times 8}{runtime}$ bytes/second. 93 | \end{itemize} 94 | 95 | 96 | \end{frame} 97 | 98 | %------------------------------------------------------------------------------- 99 | \begin{frame} 100 | \frametitle{Achieved memory bandwidth} 101 | 102 | Results on dual-socket Intel Xeon E5-2680 v4 @ 2.40GHz, 14 cores/socket. 103 | Compiled with Intel 2018 compiler, {\tt -O3 -xHost}. 104 | 105 | \vfill 106 | 107 | Set $nx=ny=20,000$ so arrays are 3.2~GB. Set $ntimes=30$. Removed \mintinline{fortran}|write| statement. Taken best of 5 runs. 108 | 109 | \vfill 110 | 111 | \pause 112 | Theoretical peak bandwidth\footnote{\url{https://ark.intel.com/products/91754/Intel-Xeon-Processor-E5-2680-v4-35M-Cache-2_40-GHz}}: $2 \times 76.8 \text{GB/s} = 153.6 \text{GB/s}$. \\ 113 | STREAM Triad: 129.0~GB/s (84\% theoretical peak). 114 | 115 | \pause 116 | \begin{table} 117 | \begin{tabular}{ccc} 118 | \toprule 119 | Version & Runtime (s) & Memory bandwidth (GB/s)\\ 120 | \midrule 121 | Initial parallel reduction & 25.667 & 7.48 \\ 122 | Swap loops + vectorise & 4.876 & 39.38 \\ 123 | \bottomrule 124 | \end{tabular} 125 | \end{table} 126 | 127 | Achieving 30.5\% of STREAM memory bandwidth. 128 | 129 | \end{frame} 130 | 131 | %------------------------------------------------------------------------------- 132 | \section{NUMA} 133 | \begin{frame} 134 | \frametitle{NUMA Architecture} 135 | 136 | Recall this cartoon of a dual-socket, shared memory system: 137 | \begin{center} 138 | \begin{tikzpicture} 139 | % Draw 4 cores for socket 0 140 | \draw (0,0) rectangle (1,1); 141 | \draw (1,0) rectangle (2,1); 142 | \draw (0,1) rectangle (1,2); 143 | \draw (1,1) rectangle (2,2); 144 | 145 | % Draw 4 cores for socket 1 146 | \draw (3,0) rectangle (4,1); 147 | \draw (4,0) rectangle (5,1); 148 | \draw (3,1) rectangle (4,2); 149 | \draw (4,1) rectangle (5,2); 150 | 151 | % Draw large memory 152 | \draw (-0.5,3) rectangle (5.5,4); 153 | \draw (2.5,3.5) node {Memory}; 154 | 155 | % Connect sockets to memory 156 | \draw (1,2) -- (1,3); 157 | \draw (4,2) -- (4,3); 158 | \draw[dashed] (2,1) -- (3,1); % QPI 159 | 160 | \end{tikzpicture} 161 | \end{center} 162 | 163 | \emph{All} threads (each running on a core) can access the same memory. 164 | 165 | \end{frame} 166 | %------------------------------------------------------------------------------- 167 | 168 | \begin{frame} 169 | \frametitle{NUMA Architecture} 170 | \begin{itemize} 171 | \item In reality on a dual-socket system each \emph{socket} is physically connected to half of the memory. 172 | \item Still shared memory: all cores can access all the memory. 173 | \item A core in the first socket wanting memory attached to the other socket must: 174 | \begin{itemize} 175 | \item Go via the socket-to-socket interconnect. 176 | \item Access memory via the other socket's memory controllers. 
177 | \end{itemize} 178 | \item Accessing memory from other socket is slower than access from own socket. 179 | \end{itemize} 180 | \begin{center} 181 | \resizebox{!}{3.5cm}{ 182 | \begin{tikzpicture} 183 | % Draw 4 cores for socket 0 184 | \foreach \i in {0,1,3,4} { 185 | \foreach \j in {0, 1} { 186 | \draw (\i,\j) rectangle (\i+1,\j+1); 187 | } 188 | } 189 | 190 | % Draw sockets around cores 191 | \draw (-0.2, -0.2) rectangle (2.2, 2.2); 192 | \draw (2.8, -0.2) rectangle (5.2, 2.2); 193 | 194 | % Draw large memory 195 | \draw (-0.5,3) rectangle (2.3,4); 196 | \draw (2.7,3) rectangle (5.5,4); 197 | \draw[dashed] (-0.7,2.8) rectangle (5.7,4.2); 198 | 199 | % Connect sockets to memory 200 | \draw (1,2.2) -- (1,3); 201 | \draw (4,2.2) -- (4,3); 202 | \draw[dashed] (2.2,1) -- (2.8,1); % QPI 203 | 204 | % Show memory shared 205 | \pause 206 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 207 | \draw (3.5,1.5) node {Read}; 208 | \pause 209 | \draw[->,red,thick] (0.7,3.2) -- (0.7,2.1) -- (2.1,2.1) -- (2.1,1.1) -- (2.9,1.1) -- (3.5,1.2); 210 | 211 | \end{tikzpicture} 212 | } 213 | \end{center} 214 | \end{frame} 215 | 216 | %------------------------------------------------------------------------------- 217 | \begin{frame} 218 | \frametitle{Memory allocation} 219 | \begin{itemize} 220 | \item What happens when you run \mintinline{fortran}|allocate(A(1:N))|? 221 | \pause 222 | \item Allocating memory does not necessarily allocate memory! 223 | \item Memory is allocated when it's first used (i.e. \mintinline{fortran}|A(i) = 1.0|), one \emph{page} at a time. 224 | \item OS tends to use a \emph{first touch policy}. 225 | \item Memory is allocated in the closest NUMA region to the thread that first touches the data. 226 | \item Ideally want threads to use data in local NUMA region to reduce socket-to-socket interconnect transfers. 227 | \end{itemize} 228 | \end{frame} 229 | 230 | %------------------------------------------------------------------------------- 231 | \subsection{First touch} 232 | \begin{frame}[fragile] 233 | \frametitle{Taking advantage of first touch} 234 | Parallelising your data initialisation routine might mean your main loops go faster! 235 | 236 | 237 | \begin{minted}[fontsize=\small,linenos,frame=single]{fortran} 238 | ! Allocate and initialise vectors 239 | allocate(A(N), B(N), C(N)) 240 | !$omp parallel do 241 | do i = 1, N 242 | A(i) = 1.0 243 | B(i) = 2.0 244 | C(i) = 0.0 245 | end do 246 | !$omp end parallel do 247 | 248 | ! Vector add 249 | !$omp parallel do 250 | do i = 1, N 251 | C(i) = A(i) + B(i) 252 | end do 253 | !$omp end parallel do 254 | \end{minted} 255 | 256 | \end{frame} 257 | 258 | %------------------------------------------------------------------------------- 259 | \begin{frame} 260 | \frametitle{NUMA-aware} 261 | \begin{itemize} 262 | \item Parallelise your initialisation routines the same way you parallelise the main loops. 263 | \item This means each thread touches the same data in initialisation and compute. 264 | \item Should reduce the number of remote memory accesses needed and improve run times. 265 | \item But, OS is allowed to move threads around cores, and between sockets. 266 | \item This will mess up your NUMA aware code! 267 | \end{itemize} 268 | \end{frame} 269 | 270 | %------------------------------------------------------------------------------- 271 | \section{Thread affinity} 272 | \begin{frame} 273 | \frametitle{Pinning threads} 274 | \begin{itemize} 275 | \item OpenMP gives you the controls to pin threads to specific cores. 
276 | \item Exposed as \emph{places} and \emph{thread pinning policy} to those places. 277 | \item By default there is one place consisting of all the cores. 278 | \item Use the \mintinline{bash}|OMP_PROC_BIND| environment variable to set pinning for all \mintinline{fortran}|parallel| regions. 279 | \item Can use the \mintinline{bash}|proc_bind| clause for control of specific regions, but advise against this. 280 | \end{itemize} 281 | \end{frame} 282 | 283 | %------------------------------------------------------------------------------- 284 | \begin{frame} 285 | \frametitle{OMP\_PROC\_BIND} 286 | \begin{itemize} 287 | \item \mintinline{bash}|OMP_PROC_BIND=false|: Often the default; threads may move! \mintinline{fortran}|proc_bind| clauses ignored. 288 | \item \mintinline{bash}|OMP_PROC_BIND=true|: Threads won't move, and follow \mintinline{fortran}|proc_bind| clauses or else the implementation default pinning. 289 | \item \mintinline{bash}|OMP_PROC_BIND=master|: Threads pinned to same place as master thread. 290 | \item \mintinline{bash}|OMP_PROC_BIND=close|: Threads are assigned to places close to the master thread. 291 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to core 0; thread 1 will pin to core 1; etc 292 | \item \mintinline{bash}|OMP_PROC_BIND=spread|: Threads are assigned to places ``sparsely''. 293 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to socket 0 core 0; thread 1 will pin to socket 1 core 0; thread 2 will pin to socket 0 core 1; etc. 294 | \end{itemize} 295 | \end{frame} 296 | 297 | %------------------------------------------------------------------------------- 298 | \begin{frame} 299 | \frametitle{Places} 300 | \begin{itemize} 301 | \item The affinity (policy) defines how threads are assigned to places. 302 | \item Places allow you to divide up the hardware resource, so that threads can be assigned to them. 303 | \item Default: one place with all cores. 304 | \item Use \mintinline{bash}|OMP_PLACES| environment variable to control. 305 | \item \mintinline{bash}|OMP_PLACES=thread|: each place is a single hardware thread. 306 | \item \mintinline{bash}|OMP_PLACES=cores|: each place is a single core (containing one or more hardware threads). 307 | \item \mintinline{bash}|OMP_PLACES=sockets|: each place contains the cores of a single socket. 308 | \item Can also use list notation: \mintinline{bash}|OMP_PLACES="{0:4},{4:4},{8:4},{12:4}"| 309 | \end{itemize} 310 | \end{frame} 311 | 312 | %------------------------------------------------------------------------------- 313 | \begin{frame} 314 | \frametitle{Thread pinning summary} 315 | \begin{itemize} 316 | \item In general, going to want to just use \mintinline{bash}|OMP_PROC_BIND=true|. 317 | \item Sometimes \mintinline{bash}|spread| or \mintinline{bash}|close| gets better performance. 318 | \item Pinning rules can get complicated when there are multiple places, so prefer to use the predefined values. 319 | \item Most effective with a NUMA-aware implementation. 320 | \item Also helps reduce run-to-run timing variability. 
321 | \item But must be careful with MPI+OpenMP pinning: more on this later\dots 322 | \end{itemize} 323 | \end{frame} 324 | 325 | %------------------------------------------------------------------------------- 326 | \section{Hybrid MPI and OpenMP} 327 | \begin{frame} 328 | \frametitle{Why combine MPI+OpenMP} 329 | \begin{itemize} 330 | \item Supercomputers are often constructed with a hierarchical structure: 331 | \begin{itemize} 332 | \item Shared memory nodes connected with a network. 333 | \end{itemize} 334 | \item Need MPI (or similar) to communicate between distributed nodes. 335 | \item With multi-core, could just run MPI everywhere (flat MPI). 336 | \item But there are advantages to running \emph{hybrid} MPI and OpenMP: 337 | \begin{itemize} 338 | \item Larger fewer messages to take advantage of network bandwidth. 339 | \item Fewer MPI ranks to manage (fewer to synchronise and for collectives). 340 | \item Can avoid memory copies for intra-node communication. 341 | \item Reduced memory footprint. 342 | \item Parallelise other problem dimensions not decomposed with MPI. 343 | \end{itemize} 344 | \end{itemize} 345 | \end{frame} 346 | 347 | %------------------------------------------------------------------------------- 348 | \begin{frame} 349 | \frametitle{Scaling} 350 | \begin{itemize} 351 | \item Strong scaling: 352 | \begin{itemize} 353 | \item Take a fixed problem and add more compute resource. 354 | \item Would hope runtime reduces with more resource. 355 | \end{itemize} 356 | \item Weak scaling: 357 | \begin{itemize} 358 | \item Take a fixed problem \emph{per compute resource}, and add more resource. 359 | \item Problem gets bigger with more resources. 360 | \item Would hope runtime stays constant. 361 | \end{itemize} 362 | \item In both cases, typically see scaling of MPI-only codes tail off at high node counts. 363 | \item Hybrid MPI+OpenMP codes often continue scaling. 364 | \end{itemize} 365 | \end{frame} 366 | 367 | 368 | %------------------------------------------------------------------------------- 369 | \begin{frame}[fragile] 370 | \frametitle{MPI programs} 371 | What happens when you run an MPI program? 372 | \begin{minted}{bash} 373 | mpirun -np 16 ./a.out 374 | \end{minted} 375 | 376 | \begin{itemize} 377 | \item 16 processes are spawned on one (or more) nodes according to the hostname list file given by the queuing system. 378 | \begin{itemize} 379 | \item E.g. with PBS (\mintinline{bash}|qsub|, etc.) set by \mintinline{bash}|$PBS_NODEFILE|. 380 | \end{itemize} 381 | \item There is no reason why these processes have to be serial: 382 | \begin{itemize} 383 | \item Each MPI rank could spawn OpenMP threads and run in parallel. 384 | \item Each MPI rank could use a GPU. 385 | \end{itemize} 386 | \end{itemize} 387 | 388 | \end{frame} 389 | 390 | %------------------------------------------------------------------------------- 391 | \begin{frame}[fragile] 392 | \frametitle{Compiling OpenMP and MPI code} 393 | \begin{itemize} 394 | \item Remember building MPI code just uses the wrapper commands. 395 | \item Just pass in the OpenMP flag as usual: 396 | \begin{itemize} 397 | \item GNU: \mintinline{bash}|mpif90| -fopenmp 398 | \item Intel: \mintinline{bash}|mpiifort| -qopenmp 399 | \item Cray: \mintinline{bash}|ftn| 400 | \end{itemize} 401 | \item Set the number of OpenMP threads \emph{per rank}. 
402 | \item E.g 2 MPI ranks, 8 threads per rank: 403 | \begin{minted}{bash} 404 | OMP_NUM_THREADS=8 mpirun -np 2 ./a.out 405 | \end{minted} 406 | \end{itemize} 407 | \end{frame} 408 | 409 | %------------------------------------------------------------------------------- 410 | \begin{frame}[fragile] 411 | \frametitle{Combining OpenMP and MPI} 412 | \begin{itemize} 413 | \item MPI assumes that each MPI process does not spawn anything else. 414 | \item Must initialise MPI differently if using threads! 415 | \begin{minted}{fortran} 416 | call MPI_Init_thread(required, provided, ierr) 417 | \end{minted} 418 | 419 | \item You specify a required thread support level, and it returns the level it could support. 420 | \item A good idea to check \mintinline{fortran}|provided .ge. required|. 421 | \end{itemize} 422 | \end{frame} 423 | 424 | %------------------------------------------------------------------------------- 425 | \begin{frame} 426 | \frametitle{Thread support levels} 427 | \begin{itemize} 428 | \item \mintinline{fortran}|MPI_THREAD_SINGLE| \\ 429 | Only one thread will execute (no threads allowed). 430 | 431 | \item \mintinline{fortran}|MPI_THREAD_FUNNELED| \\ 432 | May spawn threads, but only the original process may call MPI routines: the one that called \mintinline{fortran}|MPI_Init|. 433 | 434 | \item \mintinline{fortran}|MPI_THREAD_SERIALIZED| \\ 435 | May spawn threads and any thread can make MPI calls, but only one at a time. \emph{Your} responsibility to synchronise. 436 | 437 | \item \mintinline{fortran}|MPI_THREAD_MULTIPLE| \\ 438 | May spawn threads and any thread can make MPI calls. The MPI library has to deal with being called in parallel. 439 | \end{itemize} 440 | 441 | Remember to make sure ranks still match the MPI communications to avoid deadlock. 442 | 443 | \end{frame} 444 | 445 | %------------------------------------------------------------------------------- 446 | \begin{frame}[fragile] 447 | \frametitle{Example: MPI\_THREAD\_FUNNELED} 448 | Only the original process is allowed to call MPI routines. 449 | \begin{minted}[frame=single]{fortran} 450 | !$omp parallel 451 | ... ! Parallel work 452 | !$omp end parallel 453 | call MPI_Sendrecv() 454 | \end{minted} 455 | \end{frame} 456 | 457 | %------------------------------------------------------------------------------- 458 | \begin{frame}[fragile] 459 | \frametitle{MPI\_THREAD\_SERIALIZED} 460 | The threads are allowed to call MPI, but you must program in synchronisation to ensure only one thread calls MPI at a time. 461 | \begin{minted}[frame=single]{fortran} 462 | !$omp parallel 463 | ... ! Parallel work 464 | !$omp critical 465 | call MPI_Sendrecv() 466 | !$omp end critical 467 | !$omp end parallel 468 | \end{minted} 469 | \end{frame} 470 | 471 | %------------------------------------------------------------------------------- 472 | \begin{frame}[fragile] 473 | \frametitle{MPI\_THREAD\_MULTIPLE} 474 | Any thread can call MPI whenever it likes. The \mintinline{fortran}|MPI_THREAD_MULTIPLE| guarantees the MPI library will be OK with this. 475 | \begin{minted}[frame=single]{fortran} 476 | !$omp parallel 477 | ... ! 
Parallel work 478 | call MPI_Sendrecv() 479 | !$omp end parallel 480 | \end{minted} 481 | \end{frame} 482 | 483 | %------------------------------------------------------------------------------- 484 | \subsection{Hybrid thread pinning} 485 | \begin{frame} 486 | \frametitle{Thread pinning} 487 | \begin{itemize} 488 | \item Need to be very careful how MPI ranks and OpenMP threads are mapped to the physical hardware. 489 | \item Imagine 2 dual-socket nodes: 4 sockets with (say) 16 cores per socket. 490 | \item Launch 64 MPI ranks: 1 per core. 491 | \begin{itemize} 492 | \item This is flat MPI. 493 | \item Launching OpenMP threads will over-allocate threads compared to hardware resource. 494 | \item Warning: things will slow down. 495 | \end{itemize} 496 | \item Launch 4 MPI ranks (one per socket). 497 | \begin{itemize} 498 | \item Leaves 16 cores per MPI rank for OpenMP threads to run on. 499 | \item But need to make sure processes \emph{and} threads go to the right places! 500 | \item Often close interaction with the queuing system --- system dependant behaviour. 501 | \end{itemize} 502 | \end{itemize} 503 | \end{frame} 504 | 505 | %------------------------------------------------------------------------------- 506 | \begin{frame}[fragile] 507 | \frametitle{Example: default placement} 508 | Example MPI rank placement with standard PBS setup. 509 | 510 | Job requested 2 nodes. 511 | 512 | \begin{minted}{bash} 513 | mpirun -np 4 ./a.out 514 | \end{minted} 515 | 516 | \begin{center} 517 | \begin{adjustbox}{max width={\textwidth}} 518 | \begin{tikzpicture} 519 | 520 | \foreach \loc in {0, 3, 7, 10} { 521 | \foreach \i in {0,...,1} { 522 | \foreach \j in {0,...,1} { 523 | \draw (\loc+\i,\j) rectangle (\loc+\i+1,\j+1); 524 | } 525 | } 526 | } 527 | 528 | \draw[dashed] (-0.5,-0.5) rectangle (5.5,2.5); 529 | \draw[dashed] (6.5,-0.5) rectangle (12.5,2.5); 530 | 531 | \foreach \i in {0,...,1} { 532 | \foreach \j in {0,...,1} { 533 | \draw<2->[fill=red] (3+\i+.5,\j+.5) circle (0.4cm); 534 | } 535 | } 536 | \end{tikzpicture} 537 | \end{adjustbox} 538 | \end{center} 539 | \onslide<2->{ 540 | All ranks placed on the second socket of the first node. 541 | } 542 | \end{frame} 543 | 544 | %------------------------------------------------------------------------------- 545 | 546 | \begin{frame}[fragile] 547 | \frametitle{Example: pin MPI to one core per socket} 548 | \begin{itemize} 549 | \item Tell the OS and MPI runtime to pin each MPI to the first core in each socket. 550 | \item Then want to launch 4 OpenMP threads per process. 551 | \item For OpenMPI: 552 | \begin{minted}{bash} 553 | export OMP_NUM_THREADS=4 554 | mpirun -np 4 --npersocket 1 ./a.out 555 | \end{minted} 556 | \item Where do the threads go? 557 | \end{itemize} 558 | 559 | 560 | \begin{center} 561 | \begin{adjustbox}{max width={\textwidth}} 562 | \begin{tikzpicture} 563 | 564 | \foreach \loc in {0, 3, 7, 10} { 565 | \foreach \i in {0,...,1} { 566 | \foreach \j in {0,...,1} { 567 | \draw (\loc+\i,\j) rectangle (\loc+\i+1,\j+1); 568 | } 569 | } 570 | } 571 | 572 | \draw[dashed] (-0.5,-0.5) rectangle (5.5,2.5); 573 | \draw[dashed] (6.5,-0.5) rectangle (12.5,2.5); 574 | 575 | \foreach \i in {0, 3, 7, 10} { 576 | \draw[fill=red] (\i+0.5,1.5) circle (0.4cm); 577 | \foreach \j in {0.2, 0.4, 0.6, 0.8} { 578 | \draw<2->[->,line width=.5mm] (\i+\j,1.8) -- (\i+\j, 1.3); 579 | } 580 | } 581 | \end{tikzpicture} 582 | \end{adjustbox} 583 | \end{center} 584 | 585 | \onslide<2->{ 586 | Threads spawned inherit their parent's binding, which was one core. 
587 | 588 | Use \mintinline{bash}|--report-bindings| flag to see what's being pinned where. 589 | 590 | } 591 | 592 | \end{frame} 593 | 594 | %------------------------------------------------------------------------------- 595 | \begin{frame} 596 | \frametitle{Example: pin MPI to socket} 597 | \begin{itemize} 598 | \item Pin each MPI process to the cores of a socket. 599 | \item MPI process \emph{could} move around those cores. 600 | \item OpenMP threads can spawn across the socket. 601 | \item OpenMPI gives three ways to do this: 602 | \begin{itemize} 603 | \item \mintinline{bash}|--bind-to-socket| 604 | \item \mintinline{bash}|--bind-to-core --cpus-per-proc 8| 605 | \item \mintinline{bash}|--map-by socket:PE=8| (v1.10 and up) 606 | \end{itemize} 607 | \end{itemize} 608 | \end{frame} 609 | 610 | %------------------------------------------------------------------------------- 611 | \begin{frame} 612 | \frametitle{Pinning with Intel and Cray} 613 | \begin{itemize} 614 | \item Intel MPI will need different flags and environment variables, but tends to do the right thing by default. 615 | \item Cray MPI (MVAPICH) can be controlled using \mintinline{bash}|aprun|. 616 | \begin{itemize} 617 | \item Use the \mintinline{bash}|-d| flag to specify the threads per process. 618 | \item Pinning usually happens correctly. 619 | \end{itemize} 620 | \item Cray MPI with the Intel compiler needs a different set of \mintinline{bash}|aprun| flags. 621 | \begin{itemize} 622 | \item Default pinning is usually not what you expected. 623 | \item Use the \mintinline{bash}|-cc| flag to specify correct thread pinning. 624 | \end{itemize} 625 | \item The \mintinline{bash}|amask| tool from TACC is very useful for discovering the pinning\footnote{\url{https://github.com/TACC/amask}}. 626 | \end{itemize} 627 | \end{frame} 628 | 629 | %------------------------------------------------------------------------------- 630 | \section{Exercise} 631 | \begin{frame} 632 | \frametitle{Exercise} 633 | \begin{itemize} 634 | \item Make your parallel 5-point stencil code NUMA aware. 635 | \begin{itemize} 636 | \item Parallelise the initialisation routine. 637 | \end{itemize} 638 | \item Calculate improvements memory bandwidth. 639 | \begin{itemize} 640 | \item Use a profiler to measure remote memory accesses before/after optimisation. 641 | \end{itemize} 642 | \item Experiment with thread affinity. 643 | \item Extension: Add MPI to your OpenMP 5-point stencil to run it hybrid across multiple nodes. 644 | \end{itemize} 645 | \end{frame} 646 | 647 | %------------------------------------------------------------------------------- 648 | \section{Summary} 649 | \begin{frame} 650 | \frametitle{Summary} 651 | 652 | \begin{itemize} 653 | \item Walked through memory bandwidth model calculation of 5-point stencil. 654 | \item NUMA issues and taking advantage of first touch policy. 655 | \item Controlling OpenMP thread affinity with \mintinline{bash}|OMP_PROC_BIND| and \mintinline{bash}|OMP_PLACES| environment variables. 656 | \item Programming a hybrid MPI+OpenMP code. 657 | \item Thread affinity of hybrid programs. 658 | \end{itemize} 659 | 660 | \vfill 661 | 662 | \begin{itemize} 663 | \item Next sessions: 664 | \begin{enumerate} 665 | \setcounter{enumi}{4} 666 | \item GPU programming with OpenMP. 667 | \item Tasks and Tools. 
668 | \end{enumerate} 669 | \end{itemize} 670 | 671 | \end{frame} 672 | 673 | %------------------------------------------------------------------------------- 674 | 675 | \end{document} 676 | 677 | -------------------------------------------------------------------------------- /slides/03-simd-numa.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{3: Vectorisation and NUMA} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \begin{frame}[fragile] 14 | \frametitle{Previous exercise} 15 | 16 | Take your parallel 5-point stencil, and implement a reduction: 17 | \begin{minted}[frame=single,breaklines,fontsize=\small]{fortran} 18 | total = 0.0 19 | !$omp parallel do collapse(2) reduction(+:total) 20 | do i = 1, nx 21 | do j = 1, ny 22 | Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 23 | total = total + Atmp(i,j) 24 | end do 25 | end do 26 | !$omp end parallel do 27 | \end{minted} 28 | 29 | \begin{itemize} 30 | \item Well done if you managed this! 31 | \item 5-point stencil is simple, but captures the \emph{essence} of more complicated codes. 32 | \item Extension: did anyone try the parallelising the Jacobi solver? 33 | \end{itemize} 34 | 35 | \end{frame} 36 | 37 | %------------------------------------------------------------------------------- 38 | %------------------------------------------------------------------------------- 39 | \section{Vectorisation} 40 | \begin{frame} 41 | \frametitle{Vectorisation} 42 | $$C=A+B$$ 43 | \begin{columns} 44 | \begin{column}{0.5\textwidth} 45 | Scalar operations \\ 46 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 47 | 48 | \begin{tikzpicture} 49 | \draw (-0.5,2) rectangle (0.5,3); 50 | \draw (1,2) rectangle (2,3); 51 | \draw[->] (0,2) -- (.74,1.2); 52 | \draw[->] (1.5,2) -- (.76,1.2); 53 | \draw (.75,.75) circle (.4cm); 54 | \draw (.75,.75) node {$+$}; 55 | \draw[->] (.75,0.3) -- (.75,-0.5); 56 | \draw (.25,-1.5) rectangle (1.25,-0.5); 57 | \end{tikzpicture} 58 | \end{adjustbox} 59 | \end{column} 60 | 61 | \begin{column}{0.5\textwidth} 62 | Vector operations \\ 63 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 64 | \begin{tikzpicture} 65 | \draw[step=1cm] (0,2) grid (4,3); 66 | \draw[step=1cm] (0,0) grid (4,1); 67 | \draw[->] (2,0) -- (2,-0.6); 68 | \draw[->] (0,2.5) -- (-0.5,2.5) -- (-0.5,-1) -- (1.6,-1); 69 | \draw (2,-1) circle (.4cm); 70 | \draw (2,-1) node {$+$}; 71 | \draw[->] (2,-1.4) -- (2,-1.9); 72 | \draw[step=1cm] (0,-3) grid (4,-2); 73 | \end{tikzpicture} 74 | \end{adjustbox} 75 | \end{column} 76 | \end{columns} 77 | 78 | \end{frame} 79 | 80 | %------------------------------------------------------------------------------- 81 | \begin{frame} 82 | \frametitle{Why vectorise?} 83 | \begin{itemize} 84 | \item Vectorisation gives you more compute per cycle. 85 | \item Hence may increase the FLOP/s rate of the processor. 86 | \item Also results in fewer instructions to process (less pressure on instruction decode units). 87 | \item Vectors help make good use of the memory hierarchy (often the main benefit). 88 | \item Vectorisation helps you write code which has good access patterns to maximise bandwidth. 
89 | \end{itemize} 90 | \end{frame} 91 | 92 | %------------------------------------------------------------------------------- 93 | \begin{frame} 94 | \frametitle{Auto-vectorisation} 95 | \begin{itemize} 96 | \item Modern compilers are very good at automatically vectorising your loops. 97 | \item Fortran helps as arrays can not alias (overlap), unlike C. 98 | \item But compiler needs to be sure it's safe to vectorise. 99 | \item Read compiler reports to see if it's already vectorising. 100 | \begin{itemize} 101 | \item Intel: \mintinline{bash}|-qopt-report=5| 102 | \item Cray: \mintinline{bash}|-hlist=a| 103 | \item GNU (old): \mintinline{bash}|-ftree-vectorizer-verbose=2| 104 | \item GNU (new): \mintinline{bash}|-fopt-info-vec| 105 | \item Clang: \mintinline{bash}|-Rpass=loop-vectorize| \mintinline{bash}|-Rpass-missed=loop-vectorize| \mintinline{bash}|-Rpass-analysis=loop-vectorize| 106 | \end{itemize} 107 | \item Often the memory access pattern prevents (efficient) auto-vectorisation. 108 | \end{itemize} 109 | \end{frame} 110 | 111 | %------------------------------------------------------------------------------- 112 | \subsection{OpenMP SIMD} 113 | \begin{frame}[fragile] 114 | \frametitle{OpenMP SIMD} 115 | \begin{itemize} 116 | \item Sometimes the compiler needs help in confirming loops are vectorisable. 117 | \item OpenMP \mintinline{fortran}|simd| constructs give this information. 118 | \item Can combine with \mintinline{fortran}|parallel do| construct to ensure a parallel vector loop: \mintinline{fortran}|omp parallel do simd| 119 | \item Generally want to vectorise inner loops and parallelise outer loops. 120 | \end{itemize} 121 | 122 | \begin{minted}[frame=single]{fortran} 123 | !$omp simd 124 | do i = 1, N 125 | C(i) = A(i)+B(i) 126 | end do 127 | !$omp end simd 128 | \end{minted} 129 | \end{frame} 130 | 131 | %------------------------------------------------------------------------------- 132 | \begin{frame}[fragile] 133 | \frametitle{SIMD functions} 134 | Say you've written an update function to update values in the loop: 135 | \begin{minted}[frame=single]{fortran} 136 | do i = 1, N 137 | A(i) = magic_maths(A(i)) 138 | end do 139 | \end{minted} 140 | 141 | \begin{itemize} 142 | \item The situation gets complicated. 143 | \item If the function is small, then likely inlined and loop will auto-vectorise. 144 | \item Otherwise need to use the \mintinline{fortran}|simd| construct, but need compiler to create a vector version of the function. 145 | \end{itemize} 146 | 147 | \begin{minted}[frame=single]{fortran} 148 | function magic_maths(value) result(r) 149 | !$omp declare simd(magic_maths) 150 | implicit none 151 | real(kind=8) :: value, r 152 | r = value * value 153 | end function 154 | \end{minted} 155 | 156 | \end{frame} 157 | 158 | %------------------------------------------------------------------------------- 159 | \begin{frame}[fragile] 160 | \frametitle{SIMD clauses} 161 | \begin{itemize} 162 | \item All the usual data-sharing and reduction clauses can be applied. 163 | \item \mintinline{fortran}|safelen(4)|: distance between iterations where its safe to vectorise. 164 | \begin{minted}[frame=single]{fortran} 165 | !$omp simd safelen(4) 166 | do i = 1, N-4 167 | A(i) = A(i) + A(i+4) 168 | end do 169 | !$omp end simd 170 | \end{minted} 171 | \item \mintinline{fortran}|simdlen(4)|: preferred iterations to be performed concurrently as a vector. 
172 | Specifying explicit vector lengths builds in obsolescence to the code as hardware vector lenghts continually change --- don't recommend using this clause. 173 | \end{itemize} 174 | \end{frame} 175 | 176 | %------------------------------------------------------------------------------- 177 | \begin{frame}[fragile] 178 | \frametitle{SIMD clauses} 179 | \begin{itemize} 180 | \item \mintinline{fortran}|linear(var)|: variable is private and linear to the loop iterator. 181 | \begin{minted}[frame=single]{fortran} 182 | !$omp simd linear(j) 183 | do i = 1, N 184 | j = j + 1 185 | A(j) = B(i) 186 | end do 187 | !$omp end simd 188 | \end{minted} 189 | \item \mintinline{fortran}|aligned(var)|: says the array is aligned. 190 | \item \mintinline{fortran}|uniform(var)|: for \mintinline{fortran}|declare simd| construct, the variable is the same in all vector lanes. 191 | \end{itemize} 192 | \end{frame} 193 | 194 | %------------------------------------------------------------------------------- 195 | \begin{frame} 196 | \frametitle{SIMD summary} 197 | 198 | \begin{itemize} 199 | \item Sometimes need to force the compiler to auto-vectorise (the correct) loop with the \mintinline{fortran}|simd| construct. 200 | \item As with \mintinline{fortran}|parallel|, you are telling the compiler it is safe to vectorise and to ignore its data dependancy analysis. 201 | \item Check the compiler report before and after the check it did the right thing! 202 | \item Use \mintinline{fortran}|declare simd| and appropriate clauses if you need to create vectorised versions of functions. 203 | \begin{itemize} 204 | \item The clauses can give more information to the compiler so it does a better job. 205 | \end{itemize} 206 | \end{itemize} 207 | 208 | \end{frame} 209 | 210 | %------------------------------------------------------------------------------- 211 | \section{Derived types} 212 | \begin{frame}[fragile] 213 | \frametitle{Derived types} 214 | 2D grid of cells, each cell containing 4 different values. 215 | \begin{minted}[frame=single,linenos,fontsize=\footnotesize]{fortran} 216 | type cell 217 | real(kind=8) :: property1 218 | real(kind=8) :: property2 219 | real(kind=8) :: property3 220 | real(kind=8) :: property4 221 | end type 222 | 223 | type(cell), allocatable :: grid(:,:) 224 | 225 | do j = 1, ny 226 | do i = 1, nx 227 | grid(i,j)%property1 = update_1() 228 | grid(i,j)%property2 = update_2() 229 | grid(i,j)%property3 = update_3() 230 | grid(i,j)%property4 = update_4() 231 | end do 232 | end do 233 | \end{minted} 234 | \end{frame} 235 | 236 | %------------------------------------------------------------------------------- 237 | \begin{frame} 238 | \frametitle{Derived types} 239 | \begin{itemize} 240 | \item What do Fortran derived types look like in memory? 241 | \item Organised as an array of structures. 242 | \item<2-> What happens when we vectorise our loop over cells? 243 | \end{itemize} 244 | 245 | \begin{adjustbox}{max width={\textwidth}} 246 | \begin{tikzpicture} 247 | \draw[step=1cm] (0,0) grid (13,1); 248 | \foreach \i in {0,4,8,12} { 249 | \draw (\i+.5,.5) node {P1}; 250 | } 251 | \foreach \i in {0,4,8} { 252 | \draw (\i+1.5,.5) node {P2}; 253 | \draw (\i+2.5,.5) node {P3}; 254 | \draw (\i+3.5,.5) node {P4}; 255 | } 256 | 257 | \foreach \i in {0,4,8,12} { 258 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 259 | } 260 | \end{tikzpicture} 261 | \end{adjustbox} 262 | 263 | \begin{itemize} 264 | \item<4-> The \mintinline{fortran}|property1| values are gathered into a vector register. 
265 | \item<5-> After the computation, the results are scattered back into memory. 266 | \item<6-> A cache line is 64 bytes, so only the first two values are on the first cache line. 267 | \item<6-> Must read two cache lines to fill the vector up. 268 | \end{itemize} 269 | \end{frame} 270 | 271 | %------------------------------------------------------------------------------- 272 | \begin{frame}[fragile] 273 | \frametitle{Structure of arrays} 274 | Switch type around to have an array per property. 275 | \begin{minted}[frame=single,linenos,fontsize=\small]{fortran} 276 | type grid 277 | real(kind=8), allocatable :: property1(:,:) 278 | real(kind=8), allocatable :: property2(:,:) 279 | real(kind=8), allocatable :: property3(:,:) 280 | real(kind=8), allocatable :: property4(:,:) 281 | end type 282 | 283 | do j = 1, ny 284 | do i = 1, nx 285 | grid%property1(i,j) = update_1() 286 | grid%property2(i,j) = update_2() 287 | grid%property3(i,j) = update_3() 288 | grid%property4(i,j) = update_4() 289 | end do 290 | end do 291 | \end{minted} 292 | \end{frame} 293 | 294 | %------------------------------------------------------------------------------- 295 | \begin{frame} 296 | \frametitle{Structure of arrays} 297 | \begin{itemize} 298 | \item Order of data in memory has changed. 299 | \item<2-> What happens when we vectorise? 300 | \end{itemize} 301 | 302 | \begin{adjustbox}{max width={\textwidth}} 303 | \begin{tikzpicture} 304 | \draw[step=1cm] (0,0) grid (13,1); 305 | \foreach \i in {0,...,4} { 306 | \draw (\i+.5,.5) node {P1}; 307 | } 308 | \draw (5.5,.5) node {\dots}; 309 | 310 | \foreach \i in {5,...,9} { 311 | \draw (\i+1.5,.5) node {P2}; 312 | } 313 | \draw (11.5,.5) node {\dots}; 314 | 315 | \foreach \i in {10} { 316 | \draw (\i+2.5,.5) node {P3}; 317 | } 318 | 319 | \foreach \i in {0,...,3} { 320 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 321 | } 322 | \end{tikzpicture} 323 | \end{adjustbox} 324 | 325 | \onslide<4->{ 326 | \begin{itemize} 327 | \item Coalesced memory accesses are key for high performance code. 328 | \item Adjacent vector lanes read adjacent memory locations. 329 | \item A cache line is 64 bytes, so can fill the vector from a single cache line. 330 | \item More efficient vectorisation. 331 | \end{itemize} 332 | } 333 | \end{frame} 334 | 335 | %------------------------------------------------------------------------------- 336 | \section{Memory access patterns} 337 | \begin{frame}[fragile] 338 | \frametitle{Memory access patterns} 339 | \begin{minted}{fortran} 340 | do i = 1, N 341 | val = A(i) 342 | end do 343 | \end{minted} 344 | \begin{adjustbox}{max width={\textwidth}} 345 | \begin{tikzpicture} 346 | \draw[step=1cm] (-3,0) grid (11,1); 347 | \draw[dashed] (0,-.5) -- (0,1.5); 348 | \draw[dashed] (8,-.5) -- (8,1.5); 349 | \draw (0,-1) node {64 byte boundary}; 350 | \foreach \i in {0,...,7} { 351 | \draw[->] (\i+.5,2) -- (\i+.5,1.2); 352 | } 353 | \end{tikzpicture} 354 | \end{adjustbox} 355 | \begin{itemize} 356 | \item Ideal memory access pattern. 357 | \item All access is coalesced. 358 | \item Vectors are aligned to cache line boundary. 
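%------------------------------------------------------------------------------
% A commented sketch (not displayed on the slide): if the compiler cannot prove
% the cache-line alignment illustrated above, the simd aligned clause mentioned
% earlier is one way to assert it. This assumes the arrays really are 64-byte
% aligned -- the clause is a promise to the compiler, not something it checks.
%
% !$omp simd aligned(A,B,C:64)
% do i = 1, N
%   C(i) = A(i) + B(i)
% end do
% !$omp end simd
%------------------------------------------------------------------------------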
359 | \end{itemize} 360 | \end{frame} 361 | 362 | %------------------------------------------------------------------------------- 363 | \begin{frame}[fragile] 364 | \frametitle{Memory access patterns} 365 | \begin{minted}{fortran} 366 | do i = 1, N 367 | val = A(i+3) 368 | end do 369 | \end{minted} 370 | \begin{adjustbox}{max width={\textwidth}} 371 | \begin{tikzpicture} 372 | \draw[step=1cm] (-3,0) grid (11,1); 373 | \draw[dashed] (0,-.5) -- (0,1.5); 374 | \draw[dashed] (8,-.5) -- (8,1.5); 375 | \draw (0,-1) node {64 byte boundary}; 376 | \foreach \i in {0,...,7} { 377 | \draw[->] (\i+.5,2) -- (3+\i+.5,1.2); 378 | } 379 | \end{tikzpicture} 380 | \end{adjustbox} 381 | \begin{itemize} 382 | \item OK memory access pattern. 383 | \item All access is coalesced, but split across cache lines. 384 | \item Still get good use of cache lines, but not as efficient as aligned version. 385 | \end{itemize} 386 | \end{frame} 387 | 388 | %------------------------------------------------------------------------------- 389 | \begin{frame}[fragile] 390 | \frametitle{Memory access patterns} 391 | \begin{minted}{fortran} 392 | do i = 1, N 393 | val = A(j,i) ! equiv. A(j+3*i) 394 | end do 395 | \end{minted} 396 | \begin{adjustbox}{max width={\textwidth}} 397 | \begin{tikzpicture} 398 | \draw[step=1cm] (-3,0) grid (11,1); 399 | \draw[dashed] (0,-.5) -- (0,1.5); 400 | \draw[dashed] (8,-.5) -- (8,1.5); 401 | \draw (0,-1) node {64 byte boundary}; 402 | \foreach \i in {0,...,3} { 403 | \draw[->] (\i+.5,2) -- (3*\i+.5,1.2); 404 | } 405 | \end{tikzpicture} 406 | \end{adjustbox} 407 | \begin{itemize} 408 | \item Strided access results in multiple memory transactions. 409 | \item Kills throughput due to poor reuse of cached data. 410 | \item Very easy to fall into this trap with multi-dimensional arrays. 411 | \item Check your strides! 412 | \end{itemize} 413 | \end{frame} 414 | 415 | %------------------------------------------------------------------------------- 416 | \begin{frame}[fragile] 417 | \frametitle{Memory access patterns} 418 | \begin{minted}{fortran} 419 | do i = 1, N 420 | val = A(B(i)) 421 | end do 422 | \end{minted} 423 | \begin{adjustbox}{max width={\textwidth}} 424 | \begin{tikzpicture} 425 | \draw[step=1cm] (-3,0) grid (11,1); 426 | \draw[dashed] (0,-.5) -- (0,1.5); 427 | \draw[dashed] (8,-.5) -- (8,1.5); 428 | \draw (0,-1) node {64 byte boundary}; 429 | \draw[->] (0.5,2) -- (-3.5,1.2); 430 | \draw[->] (1.5,2) -- (3.5,1.2); 431 | \draw[->] (2.5,2) -- (0.5,1.2); 432 | \draw[->] (3.5,2) -- (8.5,1.2); 433 | \draw[->] (4.5,2) -- (-1.5,1.2); 434 | \draw[->] (5.5,2) -- (7.5,1.2); 435 | \draw[->] (6.5,2) -- (1.5,1.2); 436 | \draw[->] (7.5,2) -- (-2.5,1.2); 437 | \end{tikzpicture} 438 | \end{adjustbox} 439 | \begin{itemize} 440 | \item Essentially random access to memory. 441 | \item Little reuse of cache lines. 442 | \item Unpredictable pattern, so hardware prefetchers won't work efficiently. 443 | \item Very challenging! 
444 | \end{itemize} 445 | \end{frame} 446 | 447 | %------------------------------------------------------------------------------- 448 | \section{NUMA} 449 | \begin{frame} 450 | \frametitle{NUMA Architecture} 451 | 452 | Recall this cartoon of a dual-socket, shared memory system: 453 | \begin{center} 454 | \begin{tikzpicture} 455 | % Draw 4 cores for socket 0 456 | \draw (0,0) rectangle (1,1); 457 | \draw (1,0) rectangle (2,1); 458 | \draw (0,1) rectangle (1,2); 459 | \draw (1,1) rectangle (2,2); 460 | 461 | % Draw 4 cores for socket 1 462 | \draw (3,0) rectangle (4,1); 463 | \draw (4,0) rectangle (5,1); 464 | \draw (3,1) rectangle (4,2); 465 | \draw (4,1) rectangle (5,2); 466 | 467 | % Draw large memory 468 | \draw (-0.5,3) rectangle (5.5,4); 469 | \draw (2.5,3.5) node {Memory}; 470 | 471 | % Connect sockets to memory 472 | \draw (1,2) -- (1,3); 473 | \draw (4,2) -- (4,3); 474 | \draw[dashed] (2,1) -- (3,1); % QPI 475 | 476 | \end{tikzpicture} 477 | \end{center} 478 | 479 | \emph{All} threads (each running on a core) can access the same memory. 480 | 481 | \end{frame} 482 | %------------------------------------------------------------------------------- 483 | 484 | \begin{frame} 485 | \frametitle{NUMA Architecture} 486 | \begin{itemize} 487 | \item In reality on a dual-socket system each \emph{socket} is physically connected to half of the memory. 488 | \item Still shared memory: all cores can access all the memory. 489 | \item A core in the first socket wanting memory attached to the other socket must: 490 | \begin{itemize} 491 | \item Go via the socket-to-socket interconnect. 492 | \item Access memory via the other socket's memory controllers. 493 | \end{itemize} 494 | \item Accessing memory from other socket is slower than access from own socket. 495 | \end{itemize} 496 | \begin{center} 497 | \resizebox{!}{3.5cm}{ 498 | \begin{tikzpicture} 499 | % Draw 4 cores for socket 0 500 | \foreach \i in {0,1,3,4} { 501 | \foreach \j in {0, 1} { 502 | \draw (\i,\j) rectangle (\i+1,\j+1); 503 | } 504 | } 505 | 506 | % Draw sockets around cores 507 | \draw (-0.2, -0.2) rectangle (2.2, 2.2); 508 | \draw (2.8, -0.2) rectangle (5.2, 2.2); 509 | 510 | % Draw large memory 511 | \draw (-0.5,3) rectangle (2.3,4); 512 | \draw (2.7,3) rectangle (5.5,4); 513 | \draw[dashed] (-0.7,2.8) rectangle (5.7,4.2); 514 | 515 | % Connect sockets to memory 516 | \draw (1,2.2) -- (1,3); 517 | \draw (4,2.2) -- (4,3); 518 | \draw[dashed] (2.2,1) -- (2.8,1); % QPI 519 | 520 | % Show memory shared 521 | \pause 522 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 523 | \draw (3.5,1.5) node {Read}; 524 | \pause 525 | \draw[->,red,thick] (0.7,3.2) -- (0.7,2.1) -- (2.1,2.1) -- (2.1,1.1) -- (2.9,1.1) -- (3.5,1.2); 526 | 527 | \end{tikzpicture} 528 | } 529 | \end{center} 530 | \end{frame} 531 | 532 | %------------------------------------------------------------------------------- 533 | \begin{frame} 534 | \frametitle{Memory allocation} 535 | \begin{itemize} 536 | \item What happens when you run \mintinline{fortran}|allocate(A(1:N))|? 537 | \pause 538 | \item Allocating memory does not necessarily allocate memory! 539 | \item Memory is allocated when it's first used (i.e. \mintinline{fortran}|A(i) = 1.0|), one \emph{page} at a time. 540 | \item OS tends to use a \emph{first touch policy}. 541 | \item Memory is allocated in the closest NUMA region to the thread that first touches the data. 542 | \item Ideally want threads to use data in local NUMA region to reduce socket-to-socket interconnect transfers. 
543 | \end{itemize} 544 | \end{frame} 545 | 546 | %------------------------------------------------------------------------------- 547 | \subsection{First touch} 548 | \begin{frame}[fragile] 549 | \frametitle{Taking advantage of first touch} 550 | Parallelising your data initialisation routine might mean your main loops go faster! 551 | 552 | 553 | \begin{minted}[fontsize=\small,linenos,frame=single]{fortran} 554 | ! Allocate and initialise vectors 555 | allocate(A(N), B(N), C(N)) 556 | !$omp parallel do 557 | do i = 1, N 558 | A(i) = 1.0 559 | B(i) = 2.0 560 | C(i) = 0.0 561 | end do 562 | !$omp end parallel do 563 | 564 | ! Vector add 565 | !$omp parallel do 566 | do i = 1, N 567 | C(i) = A(i) + B(i) 568 | end do 569 | !$omp end parallel do 570 | \end{minted} 571 | 572 | \end{frame} 573 | 574 | %------------------------------------------------------------------------------- 575 | \begin{frame} 576 | \frametitle{NUMA-aware} 577 | \begin{itemize} 578 | \item Parallelise your initialisation routines the same way you parallelise the main loops. 579 | \item This means each thread touches the same data in initialisation and compute. 580 | \item Should reduce the number of remote memory accesses needed and improve run times. 581 | \item But, OS is allowed to move threads around cores, and between sockets. 582 | \item This will mess up your NUMA aware code! 583 | \end{itemize} 584 | \end{frame} 585 | 586 | %------------------------------------------------------------------------------- 587 | \section{Thread affinity} 588 | \begin{frame} 589 | \frametitle{Pinning threads} 590 | \begin{itemize} 591 | \item OpenMP gives you the controls to pin threads to specific cores. 592 | \item Exposed as \emph{places} and \emph{thread pinning policy} to those places. 593 | \item By default there is one place consisting of all the cores. 594 | \item Use the \mintinline{bash}|OMP_PROC_BIND| environment variable to set pinning for all \mintinline{fortran}|parallel| regions. 595 | \item Can use the \mintinline{bash}|proc_bind| clause for control of specific regions, but advise against this. 596 | \end{itemize} 597 | \end{frame} 598 | 599 | %------------------------------------------------------------------------------- 600 | \begin{frame} 601 | \frametitle{OMP\_PROC\_BIND} 602 | \begin{itemize} 603 | \item \mintinline{bash}|OMP_PROC_BIND=false|: Often the default; threads may move! \mintinline{fortran}|proc_bind| clauses ignored. 604 | \item \mintinline{bash}|OMP_PROC_BIND=true|: Threads won't move, and follow \mintinline{fortran}|proc_bind| clauses or else the implementation default pinning. 605 | \item \mintinline{bash}|OMP_PROC_BIND=master|: Threads pinned to same place as master thread. 606 | \item \mintinline{bash}|OMP_PROC_BIND=close|: Threads are assigned to places close to the master thread. 607 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to core 0; thread 1 will pin to core 1; etc 608 | \item \mintinline{bash}|OMP_PROC_BIND=spread|: Threads are assigned to places ``sparsely''. 609 | If \mintinline{bash}|OMP_NUM_THREADS.eq.ncores|: thread 0 will pin to socket 0 core 0; thread 1 will pin to socket 1 core 0; thread 2 will pin to socket 0 core 1; etc. 610 | \end{itemize} 611 | \end{frame} 612 | 613 | %------------------------------------------------------------------------------- 614 | \begin{frame} 615 | \frametitle{Places} 616 | \begin{itemize} 617 | \item The affinity (policy) defines how threads are assigned to places. 
618 | \item Places allow you to divide up the hardware resource, so that threads can be assigned to them. 619 | \item Default: one place with all cores. 620 | \item Use \mintinline{bash}|OMP_PLACES| environment variable to control. 621 | \item \mintinline{bash}|OMP_PLACES=threads|: each place is a single hardware thread. 622 | \item \mintinline{bash}|OMP_PLACES=cores|: each place is a single core (containing one or more hardware threads). 623 | \item \mintinline{bash}|OMP_PLACES=sockets|: each place contains the cores of a single socket. 624 | \item Can also use list notation: \mintinline{bash}|OMP_PLACES="{0:4},{4:4},{8:4},{12:4}"| 625 | \end{itemize} 626 | \end{frame} 627 | 628 | %------------------------------------------------------------------------------- 629 | \begin{frame} 630 | \frametitle{Thread pinning summary} 631 | \begin{itemize} 632 | \item In general, going to want to just use \mintinline{bash}|OMP_PROC_BIND=true|. 633 | \item Sometimes \mintinline{bash}|spread| or \mintinline{bash}|close| gets better performance. 634 | \item Pinning rules can get complicated when there are multiple places, so prefer to use the predefined values. 635 | \item Most effective with a NUMA-aware implementation. 636 | \item Also helps reduce run-to-run timing variability. 637 | \item But must be careful with MPI+OpenMP pinning. 638 | \end{itemize} 639 | \end{frame} 640 | 641 | %------------------------------------------------------------------------------- 642 | \section{Hybrid MPI and OpenMP} 643 | \begin{frame} 644 | \frametitle{Why combine MPI+OpenMP} 645 | \begin{itemize} 646 | \item Supercomputers are often constructed with a hierarchical structure: 647 | \begin{itemize} 648 | \item Shared memory nodes connected with a network. 649 | \end{itemize} 650 | \item Need MPI (or similar) to communicate between distributed nodes. 651 | \item With multi-core, could just run MPI everywhere (flat MPI). 652 | \item But there are advantages to running \emph{hybrid} MPI and OpenMP: 653 | \begin{itemize} 654 | \item Larger fewer messages to take advantage of network bandwidth. 655 | \item Fewer MPI ranks to manage (fewer to synchronise and for collectives). 656 | \item Can avoid memory copies for intra-node communication. 657 | \item Reduced memory footprint. 658 | \item Parallelise other problem dimensions not decomposed with MPI. 659 | \end{itemize} 660 | \end{itemize} 661 | \end{frame} 662 | 663 | %------------------------------------------------------------------------------- 664 | \begin{frame} 665 | \frametitle{Thread support levels} 666 | \begin{itemize} 667 | \item \mintinline{fortran}|MPI_THREAD_SINGLE| \\ 668 | Only one thread will execute (no threads allowed). 669 | 670 | \item \mintinline{fortran}|MPI_THREAD_FUNNELED| \\ 671 | May spawn threads, but only the original process may call MPI routines: the one that called \mintinline{fortran}|MPI_Init|. 672 | 673 | \item \mintinline{fortran}|MPI_THREAD_SERIALIZED| \\ 674 | May spawn threads and any thread can make MPI calls, but only one at a time. \emph{Your} responsibility to synchronise. 675 | 676 | \item \mintinline{fortran}|MPI_THREAD_MULTIPLE| \\ 677 | May spawn threads and any thread can make MPI calls. The MPI library has to deal with being called in parallel. 678 | \end{itemize} 679 | 680 | Remember to make sure ranks still match the MPI communications to avoid deadlock. 
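%------------------------------------------------------------------------------
% A commented sketch of requesting a support level and checking what the MPI
% library actually provided; the error handling shown (MPI_Abort) is an
% assumption about how a real code might react, not part of the course code.
%
% required = MPI_THREAD_FUNNELED
% call MPI_Init_thread(required, provided, ierr)
% if (provided .lt. required) then
%   write(*,*) "Error: insufficient MPI thread support"
%   call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
% end if
%------------------------------------------------------------------------------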
681 | 682 | \end{frame} 683 | 684 | %------------------------------------------------------------------------------- 685 | \section{Exercise} 686 | \begin{frame} 687 | \frametitle{Exercise} 688 | \begin{itemize} 689 | \item Take your parallel 5-point stencil code and optimise it. 690 | \item Think about: 691 | \begin{itemize} 692 | \item Memory access patterns 693 | \item Vectorisation 694 | \item NUMA 695 | \end{itemize} 696 | \item Note down the performance differences your optimisations make. 697 | \item Calculate the achieved memory bandwidth of your stencil code. 698 | \item Extension: consider these optimisaions for the Jacobi solver. 699 | \end{itemize} 700 | \end{frame} 701 | 702 | \end{document} 703 | -------------------------------------------------------------------------------- /slides/02-pi.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{2: Data sharing and Reductions} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | 14 | % \section{Outline} 15 | % \begin{frame} 16 | % \frametitle{Outline} 17 | % \begin{itemize} 18 | % \item Recap 19 | % \item Data sharing clauses 20 | % \item The Pi program 21 | % \item Critical regions 22 | % \item Atomics 23 | % \item False sharing issues 24 | % \item Reductions 25 | % \end{itemize} 26 | % \end{frame} 27 | %------------------------------------------------------------------------------- 28 | % \section{Recap} 29 | % \begin{frame}[fragile] 30 | % \frametitle{Recap} 31 | % \begin{itemize} 32 | % \item Fork/join execution model. 33 | 34 | % \item Shared memory model: 35 | % \begin{itemize} 36 | % \item All threads can read/write the \emph{same} memory. 37 | % \end{itemize} 38 | 39 | % \item Set number of threads with \mintinline{bash}|OMP_NUM_THREADS| environment variable. 40 | 41 | % \item Parallelise simple loops with worksharing clauses: 42 | % \begin{minted}[frame=single]{fortran} 43 | % !$omp parallel do 44 | % do i = 1, N 45 | % A(i) = ... 46 | % end do 47 | % !$omp end parallel do 48 | % \end{minted} 49 | 50 | % \item Talked about \mintinline{fortran}|collapse|, \mintinline{fortran}|nowait| and \mintinline{fortran}|schedule| clauses. 51 | 52 | % \end{itemize} 53 | % \end{frame} 54 | %------------------------------------------------------------------------------- 55 | \begin{frame}[fragile] 56 | \frametitle{The first exercise} 57 | 58 | \begin{itemize} 59 | \item Parallelise a serial 5-point stencil code using OpenMP. 60 | \item Solution is adding an OpenMP worksharing construct: 61 | \end{itemize} 62 | 63 | \begin{minted}[frame=single,breaklines]{fortran} 64 | !$omp parallel do collapse(2) 65 | do i = 1, nx 66 | do j = 1, ny 67 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 68 | end do 69 | end do 70 | !$omp end parallel do 71 | \end{minted} 72 | 73 | \begin{itemize} 74 | \item OpenMP threads are created. 75 | \item Loops are collapsed and iterations shared between threads. 76 | \item Each thread computes its assigned portion of iteration space. 77 | \item Threads synchronise and join. 
78 | \end{itemize}
79 |
80 | \end{frame}
81 |
82 | %-------------------------------------------------------------------------------
83 |
84 | \section{Data sharing}
85 | \begin{frame}
86 | \frametitle{Data sharing}
87 | Remember: OpenMP is a \emph{shared memory} programming model.
88 | \begin{itemize}
89 | \item By default, all data is available to all threads.
90 | \item There is a single copy of \emph{shared} data.
91 | \end{itemize}
92 |
93 | \vfill
94 |
95 | You must specify which data should be \emph{private} to each thread.
96 | \begin{itemize}
97 | \item Each thread then has local (stack) space for each private variable.
98 | \item Each copy is only visible to its associated thread.
99 | \end{itemize}
100 |
101 | \begin{block}{Notice}
102 | Because Fortran variables are declared at the top of the routine, you must think carefully about which of them should be private.
103 | \end{block}
104 |
105 | \end{frame}
106 |
107 |
108 | %-------------------------------------------------------------------------------
109 | \begin{frame}
110 | \frametitle{Variables on the heap}
111 | \begin{itemize}
112 | \item All data on the heap is shared.
113 | \item Therefore all the Fortran \mintinline{fortran}|allocatable| data is shared.
114 | \item You must ensure that different threads do not write to the same element of these arrays.
115 | \end{itemize}
116 |
117 | \begin{alertblock}{Caution}
118 | Setting a data sharing clause on a heap variable only affects the metadata of the variable.
119 | The pointer could be private, but the target will still be shared.
120 | \end{alertblock}
121 | \end{frame}
122 |
123 | %-------------------------------------------------------------------------------
124 | \section{Data clauses}
125 | \begin{frame}
126 | \frametitle{Data clauses}
127 | \begin{itemize}
128 | \item \mintinline{fortran}|shared(x)|
129 | There is one copy of the \mintinline{fortran}|x| variable. The programmer must ensure synchronisation.
130 | \item \mintinline{fortran}|private(x)|
131 | Each thread gets its own local \mintinline{fortran}|x| variable. It is not initialised. The value of the original \mintinline{fortran}|x| variable is undefined on region exit.
132 | \item \mintinline{fortran}|firstprivate(x)|
133 | Each thread gets its own \mintinline{fortran}|x| variable, and it is initialised to the value of the original variable on entering the region.
134 | \item \mintinline{fortran}|lastprivate(x)|
135 | Used for loops. Each thread gets its own \mintinline{fortran}|x| variable, and on exiting the region the original variable is updated taking the value from the sequentially last iteration.
136 | \end{itemize}
137 |
138 | These are the most common clauses that are needed.
139 | \end{frame}
140 |
141 | %-------------------------------------------------------------------------------
142 | \begin{frame}
143 | \frametitle{Data clauses}
144 | There is also the \mintinline{fortran}|threadprivate(x)| directive (not a clause).
145 | \begin{itemize}
146 | \item This says to take a copy of the data in \emph{thread local storage} which is persistent across parallel regions.
147 | \item The \mintinline{fortran}|copyin| clause is a means to initialise \mintinline{fortran}|threadprivate| data, copying from the master thread.
148 | \end{itemize}
149 |
150 | You are unlikely to need this directive.
151 | Might be useful if using \mintinline{fortran}|common| blocks (or \mintinline{c}|static| variables in C).
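\vfill

As a sketch of the syntax (for some saved module or \texttt{common} variable \texttt{x}): place \texttt{!\$omp threadprivate(x)} in the declaration part after \texttt{x} is declared, then \texttt{!\$omp parallel copyin(x)} initialises each thread's copy from the master thread's value.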
152 | \end{frame}
153 |
154 | %-------------------------------------------------------------------------------
155 | \subsection{Private example}
156 | \begin{frame}[fragile]
157 | \frametitle{Private example}
158 | Simple \mintinline{fortran}|do| loop, which just sets a variable to the iteration number.
159 | Each iteration prints out the current and next value of \mintinline{fortran}|x|, along with the thread number.
160 | We will see what happens with different data sharing clauses.
161 |
162 | \begin{minted}[linenos,breaklines,frame=single, fontsize=\small]{fortran}
163 | !$omp parallel do private(x) / firstprivate(x) / lastprivate(x)
164 | do i = 1, N
165 | write (*,"(2X,A,I0,A,I0,A,I0)") "Thread ", omp_get_thread_num(), " setting x=", x, " to ", i
166 | x = i
167 | end do
168 | !$omp end parallel do
169 | \end{minted}
170 | N is set to 10.
171 | Ran using 4 threads.
172 | Full implementation: \mintinline{bash}|private.f90|.
173 | \end{frame}
174 |
175 | %-------------------------------------------------------------------------------
176 | \begin{frame}[fragile]
177 | \frametitle{Private example}
178 | \begin{minted}{bash}
179 | private:
180 | before: x=-1
181 | Thread 1 setting x=0 to 4
182 | Thread 2 setting x=0 to 7
183 | Thread 3 setting x=0 to 9
184 | Thread 0 setting x=0 to 1
185 | Thread 1 setting x=4 to 5
186 | Thread 2 setting x=7 to 8
187 | Thread 3 setting x=9 to 10
188 | Thread 0 setting x=1 to 2
189 | Thread 1 setting x=5 to 6
190 | Thread 0 setting x=2 to 3
191 | after: x=-1
192 | \end{minted}
193 | Each thread starts with its own \mintinline{fortran}|x|.
194 | No guarantees of initial value, but happened to be zero this time.
195 | \end{frame}
196 |
197 | %-------------------------------------------------------------------------------
198 | \begin{frame}[fragile]
199 | \frametitle{Private example}
200 | \begin{minted}{bash}
201 | firstprivate:
202 | before: x=-1
203 | Thread 3 setting x=-1 to 9
204 | Thread 2 setting x=-1 to 7
205 | Thread 1 setting x=-1 to 4
206 | Thread 0 setting x=-1 to 1
207 | Thread 3 setting x=9 to 10
208 | Thread 2 setting x=7 to 8
209 | Thread 1 setting x=4 to 5
210 | Thread 0 setting x=1 to 2
211 | Thread 1 setting x=5 to 6
212 | Thread 0 setting x=2 to 3
213 | after: x=-1
214 | \end{minted}
215 | Each thread starts with its own \mintinline{fortran}|x|, which is set to the value of \mintinline{fortran}|x| before entering the \mintinline{fortran}|parallel| region, -1.
216 | \end{frame}
217 |
218 | %-------------------------------------------------------------------------------
219 | \begin{frame}[fragile]
220 | \frametitle{Private example}
221 | \begin{minted}{bash}
222 | lastprivate:
223 | before: x=-1
224 | Thread 3 setting x=3 to 9
225 | Thread 2 setting x=2 to 7
226 | Thread 1 setting x=1 to 4
227 | Thread 3 setting x=9 to 10
228 | Thread 0 setting x=0 to 1
229 | Thread 2 setting x=7 to 8
230 | Thread 1 setting x=4 to 5
231 | Thread 0 setting x=1 to 2
232 | Thread 1 setting x=5 to 6
233 | Thread 0 setting x=2 to 3
234 | after: x=10
235 | \end{minted}
236 | Each thread starts with its own \mintinline{fortran}|x|, which is set to a garbage value.
237 | On exiting the region, the original \mintinline{fortran}|x| is set to the value of the last iteration of the loop, 10.
238 | \end{frame} 239 | 240 | %------------------------------------------------------------------------------- 241 | \section{Default data sharing} 242 | \begin{frame} 243 | \frametitle{Choosing default data sharing} 244 | \begin{alertblock}{Note} 245 | It is especially important to list private variables in Fortran. 246 | All variables have \emph{global} scope within each \mintinline{fortran}|subroutine| so \emph{everything} is shared by default. 247 | In C, local scoping rules makes this easier. 248 | \end{alertblock} 249 | 250 | \begin{itemize} 251 | \item You can force yourself to specify everything manually by using the \mintinline{fortran}|default(none)| attribute. This is good practice. 252 | \item You can also \mintinline{fortran}|default(private)| or \mintinline{fortran}|default(firstprivate)| to make everything private by default --- this might save a lot of typing in an old code with many temporary variables. 253 | \end{itemize} 254 | 255 | \end{frame} 256 | 257 | %------------------------------------------------------------------------------- 258 | \section{Calculating Pi} 259 | \begin{frame} 260 | \frametitle{Calculating Pi} 261 | Use a simple program to numerically approximate $\pi$ to explore: 262 | \begin{itemize} 263 | \item Use of data sharing clauses. 264 | \item Updating a shared variable in parallel. 265 | \item Reductions. 266 | \end{itemize} 267 | \end{frame} 268 | 269 | %------------------------------------------------------------------------------- 270 | \begin{frame} 271 | \frametitle{Integration to calculate Pi} 272 | 273 | $$\int_{0}^{1} \frac{4}{1+x^2} dx = \pi$$ 274 | 275 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 276 | \begin{tikzpicture} 277 | \begin{axis}[xlabel={$x$},ylabel={$f(x)$},ymin=0] 278 | \addplot [name path=A, domain=0:1] {4/(1+x*x)}; 279 | \addplot[dashed] coordinates {(0,0) (0,4)}; 280 | \addplot[dashed] coordinates {(1,0) (1,2)}; 281 | \path [name path=axis] (axis cs:0,0) -- (axis cs:1,0); 282 | \addplot[blue!30] fill between [of=A and axis, domain=0:1]; 283 | \end{axis} 284 | \end{tikzpicture} 285 | \end{adjustbox} 286 | \end{frame} 287 | 288 | %------------------------------------------------------------------------------- 289 | \begin{frame} 290 | \frametitle{Trapezoidal rule} 291 | Sum the area of the boxes. Choose a small \emph{step} size to generate lots of boxes, and increase accuracy. 292 | 293 | \begin{adjustbox}{max totalsize={\textwidth}{0.6\textheight},center} 294 | \begin{tikzpicture} 295 | \begin{axis}[xlabel={$x$},ylabel={$f(x)$},ymin=0] 296 | \addplot [name path=A, domain=0:1] {4/(1+x*x)}; 297 | \addplot[dashed] coordinates {(0,0) (0,4)}; 298 | \addplot[dashed] coordinates {(1,0) (1,2)}; 299 | \path [name path=axis] (axis cs:0,0) -- (axis cs:1,0); 300 | \addplot[blue!30] fill between [of=A and axis, soft clip={domain=0:0.2}]; 301 | \addplot[red!30] fill between [of=A and axis, soft clip={domain=0.2:0.4}]; 302 | \addplot[green!30] fill between [of=A and axis, soft clip={domain=0.4:0.6}]; 303 | \addplot[gray!30] fill between [of=A and axis, soft clip={domain=0.6:0.8}]; 304 | \addplot[yellow!30] fill between [of=A and axis, soft clip={domain=0.8:1}]; 305 | \end{axis} 306 | \end{tikzpicture} 307 | \end{adjustbox} 308 | \end{frame} 309 | 310 | %------------------------------------------------------------------------------- 311 | \begin{frame}[fragile] 312 | \frametitle{Code} 313 | We will use this code which calculates the value of $\pi$ as an example for the remainder of this session. 
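In symbols, the loop accumulates $\mathrm{sum} = \sum_{i=1}^{N} \frac{4}{1+x_i^2}$ with $x_i = (i - \tfrac{1}{2})\,\mathrm{step}$, where $N$ is \texttt{num\_steps}, so that $\pi \approx \mathrm{step} \times \mathrm{sum}$.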
314 | 315 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 316 | step = 1.0/num_steps 317 | do ii = 1, num_steps 318 | x = (ii-0.5)*step 319 | sum = sum + (4.0/(1.0+x*x)) 320 | end do 321 | pi = step * sum 322 | \end{minted} 323 | 324 | With 100,000,000 steps, this takes 0.368s on my laptop. 325 | 326 | Full implementation: \mintinline{bash}|pi.f90|. 327 | \end{frame} 328 | 329 | %------------------------------------------------------------------------------- 330 | \begin{frame}[fragile] 331 | \frametitle{Parallelising the loop} 332 | 333 | Use a worksharing directive to parallelise the loop. 334 | 335 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 336 | step = 1.0/num_steps 337 | !$omp parallel do private(x) 338 | do ii = 1, num_steps 339 | x = (ii-0.5)*step 340 | sum = sum + (4.0/(1.0+x*x)) 341 | end do 342 | !$omp end parallel do 343 | pi = step * sum 344 | \end{minted} 345 | 346 | \vfill 347 | 348 | What about data sharing? 349 | \begin{itemize} 350 | \item \mintinline{fortran}|x| needs to be used independently by each thread, so mark as \mintinline{fortran}|private|. 351 | \item \mintinline{fortran}|sum| needs to be updated by \emph{all} threads, so leave as \mintinline{fortran}|shared|. 352 | \end{itemize} 353 | 354 | \end{frame} 355 | 356 | %------------------------------------------------------------------------------- 357 | \section{Critical regions} 358 | \begin{frame}[fragile] 359 | \frametitle{Parallelising with critical} 360 | \begin{itemize} 361 | \item But need to be careful changing the \mintinline{fortran}|shared| variable, \mintinline{fortran}|sum|. 362 | \item All threads can update this value directly! 363 | \item A \mintinline{fortran}|critical| region only allows one thread to execute at any one time. No guarantees of ordering. 364 | \end{itemize} 365 | 366 | \begin{minted}[linenos,breaklines,frame=single]{fortran} 367 | step = 1.0/num_steps 368 | !$omp parallel do private(x) 369 | do ii = 1, num_steps 370 | x = (ii-0.5)*step 371 | !$omp critical 372 | sum = sum + (4.0/(1.0+x*x)) 373 | !$omp end critical 374 | end do 375 | !$omp end parallel do 376 | pi = step * sum 377 | \end{minted} 378 | 379 | \end{frame} 380 | 381 | %------------------------------------------------------------------------------- 382 | \begin{frame} 383 | \frametitle{Runtimes} 384 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 385 | 386 | \vfill 387 | 388 | \begin{table} 389 | \begin{tabular}{cc} 390 | \toprule 391 | Implementation & Runtime (s) \\ 392 | \midrule 393 | Serial & 0.368 \\ 394 | Critical & 426.1 \\ 395 | \bottomrule 396 | \end{tabular} 397 | \end{table} 398 | 399 | Full implementation: \mintinline{bash}|pi_critical.f90|. 400 | 401 | \begin{center} 402 | \large Really slow! 403 | \end{center} 404 | 405 | \end{frame} 406 | 407 | %------------------------------------------------------------------------------- 408 | \section{Atomics} 409 | \begin{frame}[fragile] 410 | \frametitle{Atomics} 411 | A \mintinline{fortran}|critical| region protects a whole block of code. For a single operation, can use \mintinline{fortran}|atomic| instead. 412 | 413 | Atomic operations are with respect to the memory access of a scalar variable {\tt x}. 
414 | 415 | \begin{itemize} 416 | \item \mintinline{fortran}|read| for \mintinline{fortran}|v = x| 417 | \item \mintinline{fortran}|write| for \mintinline{fortran}|x = expr| 418 | \item \mintinline{fortran}|update| for \mintinline{fortran}|x = x op expr| 419 | \item \mintinline{fortran}|capture| for read and write/update. The result is retained: \mintinline{fortran}|x = x op expr; v = x| 420 | \end{itemize} 421 | 422 | Not specifying an atomic clause defaults to \mintinline{fortran}|update|. 423 | \end{frame} 424 | 425 | %------------------------------------------------------------------------------- 426 | \begin{frame}[fragile] 427 | \frametitle{Atomic pi} 428 | \begin{minted}[linenos,breaklines]{fortran} 429 | step = 1.0/num_steps 430 | !$omp parallel do private(x) 431 | do ii = 1, num_steps 432 | x = (ii-0.5)*step 433 | !$omp atomic 434 | sum = sum + (4.0/(1.0+x*x)) 435 | end do 436 | !$omp end parallel do 437 | pi = step * sum 438 | \end{minted} 439 | \end{frame} 440 | 441 | %------------------------------------------------------------------------------- 442 | \begin{frame} 443 | \frametitle{Runtimes} 444 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 445 | 446 | \vfill 447 | 448 | \begin{table} 449 | \begin{tabular}{cc} 450 | \toprule 451 | Implementation & Runtime (s) \\ 452 | \midrule 453 | Serial & 0.368 \\ 454 | Critical & 426.1 \\ 455 | Atomic & 8.3 \\ 456 | \bottomrule 457 | \end{tabular} 458 | \end{table} 459 | 460 | Full implementation: \mintinline{bash}|pi_atomic.f90|. 461 | 462 | \begin{center} 463 | \large Faster, but still slower than serial. 464 | \end{center} 465 | 466 | \end{frame} 467 | 468 | %------------------------------------------------------------------------------- 469 | \section{Avoiding critical regions} 470 | \begin{frame} 471 | \frametitle{Independent summation} 472 | \begin{itemize} 473 | \item Both methods cause threads to synchronise for every update to \mintinline{fortran}|sum|. 474 | \item But each thread could compute a partial sum independently, synchronising once to total at the end. 475 | \end{itemize} 476 | 477 | \vfill 478 | 479 | Make \mintinline{fortran}|sum| an array of length equal to the number of threads. 480 | \begin{itemize} 481 | \item Each thread stores its partial sum, and the array is totalled by the master thread serially at the end. 482 | \item As it's \emph{shared memory}, the \mintinline{fortran}|sum| array can be read just fine on the master rank. 483 | \end{itemize} 484 | \end{frame} 485 | 486 | %------------------------------------------------------------------------------- 487 | \begin{frame}[fragile] 488 | \frametitle{Independent summation} 489 | \begin{minted}[fontsize=\small,linenos,breaklines,frame=single]{fortran} 490 | step = 1.0/num_steps 491 | !$omp parallel private(x,tid) 492 | tid = omp_get_thread_num() 493 | sum(tid+1) = 0.0 494 | !$omp do 495 | do ii = 1, num_steps 496 | x = (ii-0.5)*step 497 | sum(tid+1) = sum(tid+1) + (4.0/(1.0+x*x)) 498 | !$omp flush(sum) 499 | end do 500 | !$omp end do 501 | !$omp end parallel 502 | do ii = 1, nthreads 503 | pi = pi + sum(ii) 504 | end do 505 | pi = pi * step 506 | \end{minted} 507 | \end{frame} 508 | 509 | %------------------------------------------------------------------------------- 510 | \begin{frame} 511 | \frametitle{Runtimes} 512 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 
513 | 514 | \vfill 515 | 516 | \begin{table} 517 | \begin{tabular}{cc} 518 | \toprule 519 | Implementation & Runtime (s) \\ 520 | \midrule 521 | Serial & 0.368 \\ 522 | Critical & 426.1 \\ 523 | Atomic & 8.3 \\ 524 | Array & 2.8 \\ 525 | \bottomrule 526 | \end{tabular} 527 | \end{table} 528 | 529 | Full implementation: \mintinline{bash}|pi_array.f90|. 530 | 531 | \begin{center} 532 | \large Fastest parallel version so far, but still slow. 533 | \end{center} 534 | 535 | \end{frame} 536 | 537 | %------------------------------------------------------------------------------- 538 | \section{False sharing} 539 | \begin{frame} 540 | \frametitle{False sharing} 541 | This code is susceptible to \emph{false sharing}. 542 | \begin{itemize} 543 | \item False sharing occurs when different threads update data on the same cache line. 544 | \item Cache system is coherent between cores, so data consistency must be maintained. 545 | \item The cache line is no longer up to date because another thread changed it (in their local cache). 546 | \item Therefore, cache line must be flushed to memory and reread into the other thread every time. 547 | \item This is an example of \emph{cache thrashing}. 548 | \item The performance is reduced as threads must wait for the cache lines to refresh. 549 | \end{itemize} 550 | \end{frame} 551 | 552 | %------------------------------------------------------------------------------- 553 | \begin{frame} 554 | \frametitle{Flush} 555 | \begin{itemize} 556 | \item The \mintinline{fortran}|flush()| construct ensures that the variables are consistent between the thread's memory and main memory. 557 | \item Don't want to go into complicated parts of the OpenMP memory model. 558 | \item In general, don't need to worry about this stuff. 559 | \item Without the flush, the write to memory will be lowered to after the loop, so false sharing only occurs once at the end. 560 | \item Here we use it to \emph{ensure} that false sharing occurs every time to highlight the performance hit. 561 | \end{itemize} 562 | \end{frame} 563 | 564 | %------------------------------------------------------------------------------- 565 | \begin{frame}[fragile] 566 | \frametitle{Firstprivate pi} 567 | Can use data sharing clauses to our advantage here: 568 | 569 | Give each thread a \emph{scalar} copy of \mintinline{fortran}|sum| to compute their partial sum, and reduce with only one critical (or atomic) region at the end. 570 | No false sharing, as value is just a single number (i.e.\ a register). 571 | \begin{minted}[linenos,breaklines,frame=single,fontsize=\footnotesize]{fortran} 572 | step = 1.0/num_steps 573 | !$omp parallel private(x) firstprivate(sum) 574 | !$omp do 575 | do ii = 1, num_steps 576 | x = (ii-0.5)*step 577 | sum = sum + (4.0/(1.0+x*x)) 578 | end do 579 | !$omp end do 580 | !$omp critical 581 | pi = pi + sum 582 | !$omp end critical 583 | !$omp end parallel 584 | pi = pi * step 585 | \end{minted} 586 | \end{frame} 587 | 588 | %------------------------------------------------------------------------------- 589 | \begin{frame} 590 | \frametitle{Runtimes} 591 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads. 
592 |
593 | \vfill
594 |
595 | \begin{table}
596 | \begin{tabular}{cc}
597 | \toprule
598 | Implementation & Runtime (s) \\
599 | \midrule
600 | Serial & 0.368 \\
601 | Critical & 426.1 \\
602 | Atomic & 8.3 \\
603 | Array & 2.8 \\
604 | First private & 0.104 \\
605 | \bottomrule
606 | \end{tabular}
607 | \end{table}
608 |
609 | Full implementation: \mintinline{bash}|pi_private.f90|.
610 |
611 | \begin{center}
612 | \large Finally faster than serial! Around 3.5X faster on 4 threads.
613 | \end{center}
614 |
615 | \end{frame}
616 |
617 | %-------------------------------------------------------------------------------
618 | \section{Reductions}
619 | \begin{frame}[fragile]
620 | \frametitle{Reductions}
621 | Much simpler to use the OpenMP \mintinline{fortran}|reduction| clause on a worksharing loop.
622 | Specify the operation and the variable.
623 | \begin{multicols}{2}
624 | \begin{itemize}
625 | \item \mintinline{fortran}|reduction(+:var)|
626 | \item \mintinline{fortran}|reduction(-:var)|
627 | \item \mintinline{fortran}|reduction(*:var)|
628 | \item \mintinline{fortran}|reduction(.and.:var)|
629 | \item \mintinline{fortran}|reduction(.or.:var)|
630 | \item \mintinline{fortran}|reduction(.eqv.:var)|
631 | \item \mintinline{fortran}|reduction(.neqv.:var)|
632 | \item \mintinline{fortran}|reduction(max:var)|
633 | \item \mintinline{fortran}|reduction(min:var)|
634 | \item \mintinline{fortran}|reduction(iand:var)|
635 | \item \mintinline{fortran}|reduction(ior:var)|
636 | \item \mintinline{fortran}|reduction(ieor:var)|
637 | \end{itemize}
638 | \end{multicols}
639 |
640 | Can also do array reductions. Each element of the array is treated as its own, separate, reduction.
641 | Similar to:
642 | \begin{minted}[breaklines]{fortran}
643 | MPI_Allreduce(MPI_IN_PLACE, arr, N, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
644 | \end{minted}
645 |
646 | \end{frame}
647 |
648 | %-------------------------------------------------------------------------------
649 | \begin{frame}[fragile]
650 | \frametitle{Pi reduction}
651 | Much simpler to write using the \mintinline{fortran}|reduction| clause --- just need a single directive:
652 | \begin{minted}[linenos,breaklines,frame=single]{fortran}
653 | step = 1.0/num_steps
654 | !$omp parallel do private(x) reduction(+:sum)
655 | do ii = 1, num_steps
656 | x = (ii-0.5)*step
657 | sum = sum + (4.0/(1.0+x*x))
658 | end do
659 | !$omp end parallel do
660 | pi = step * sum
661 | \end{minted}
662 |
663 | Full implementation: \mintinline{bash}|pi_reduction.f90|.
664 | \end{frame}
665 |
666 | %-------------------------------------------------------------------------------
667 | \begin{frame}
668 | \frametitle{Runtimes}
669 | Run on a MacBook Pro (Intel Core i7-4980HQ CPU @ 2.80GHz) with 4 threads.
670 |
671 | \vfill
672 |
673 | \begin{table}
674 | \begin{tabular}{cc}
675 | \toprule
676 | Implementation & Runtime (s) \\
677 | \midrule
678 | Serial & 0.368 \\
679 | Critical & 426.1 \\
680 | Atomic & 8.3 \\
681 | Array & 2.8 \\
682 | First private & 0.104 \\
683 | Reduction & 0.095 \\
684 | \bottomrule
685 | \end{tabular}
686 | \end{table}
687 |
688 | \vfill
689 |
690 | Around 3.9X faster on 4 threads!
691 |
692 | \vfill
693 |
694 |
695 | \begin{block}{Recommendation}
696 | Use the \mintinline{fortran}|reduction| clause for reductions.
697 | \end{block} 698 | 699 | \end{frame} 700 | 701 | %------------------------------------------------------------------------------- 702 | \section{Exercise} 703 | \begin{frame}[fragile] 704 | \frametitle{Exercise} 705 | \begin{itemize} 706 | \item Start with your parallel 5-point stencil code from last time. 707 | \item Change the code to print out the total of the cells (excluding halo) every timestep. 708 | \item You'll need to implement a parallel reduction to do this. 709 | \item Try the different techniques shown to implement reductions: 710 | \begin{itemize} 711 | \item Critical sections. 712 | \item Atomics. 713 | \item Reduction clause. 714 | \end{itemize} 715 | \item Extension: there is also a Jacobi code to parallelise --- it needs a reduction too. 716 | \end{itemize} 717 | \begin{minted}[frame=single,breaklines]{fortran} 718 | do i = 1, nx 719 | do j = 1, ny 720 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 721 | total = total + Anew(i,j) 722 | end do 723 | end do 724 | \end{minted} 725 | \end{frame} 726 | 727 | %------------------------------------------------------------------------------- 728 | \begin{frame} 729 | \frametitle{Summary} 730 | \begin{itemize} 731 | \item Have now covered the most common parts of OpenMP. 732 | \item 80/20 rule: Most programs will only use what you know so far. 733 | \item OpenMP is deceptively simple! 734 | \item In the remaining sessions you'll learn to program OpenMP on NUMA and GPU architectures. 735 | \end{itemize} 736 | \end{frame} 737 | 738 | %------------------------------------------------------------------------------- 739 | \end{document} 740 | -------------------------------------------------------------------------------- /slides/01-paralleldo.tex: -------------------------------------------------------------------------------- 1 | \documentclass[aspectratio=169]{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{1: Parallel worksharing} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | % \begin{frame} 14 | % \frametitle{The first exercise} 15 | % \begin{itemize} 16 | % \item At the end of this session, you will be able to parallelise a (simple) 5-point stencil code using OpenMP! 17 | % \item The other sessions provide you with details you might need for real world codes. 18 | % \end{itemize} 19 | % \end{frame} 20 | 21 | %------------------------------------------------------------------------------- 22 | \section{OpenMP introduction} 23 | \begin{frame} 24 | \frametitle{What is OpenMP?} 25 | 26 | A collection of compiler directives, library routines, and environment variables for parallelism for shared memory parallel programs. 27 | 28 | \begin{itemize} 29 | \item Create and manage parallel programs while permitting portability. 30 | \item User-directed parallelization. 31 | \end{itemize} 32 | 33 | A \emph{specification} of annotations you can make to your program in order to make it parallel. 34 | 35 | \end{frame} 36 | 37 | %------------------------------------------------------------------------------- 38 | \begin{frame}[fragile] 39 | \frametitle{Syntax} 40 | \begin{itemize} 41 | \item OpenMP mostly formed of \emph{compiler directives}\\ 42 | \begin{minted}{fortran} 43 | !$omp construct [clause [clause]...] 44 | \end{minted} 45 | These tell the compiler to insert some extra code on your behalf. 
46 | 47 | \item Compiler directives usually apply to a \emph{structured block} of statements. 48 | Limited scoping in Fortran means we often need to use \emph{end} directives. 49 | \begin{minted}{fortran} 50 | !$omp construct 51 | ... ! lines of Fortran code 52 | !$omp end construct 53 | \end{minted} 54 | 55 | \item Library API calls 56 | \begin{minted}{fortran} 57 | use omp_lib 58 | call omp_...() 59 | \end{minted} 60 | 61 | \end{itemize} 62 | \end{frame} 63 | 64 | %------------------------------------------------------------------------------- 65 | \subsection{Compiler flags} 66 | \begin{frame}[fragile] 67 | \frametitle{Building with OpenMP} 68 | 69 | Turn on OpenMP in the compiler: 70 | \begin{minted}{bash} 71 | gfortran *.f90 -fopenmp # GNU 72 | ifort *.f90 -qopenmp # Intel 73 | ftn *.f90 -homp # Cray (now off by default) 74 | pgf90 *.f90 -mp # PGI 75 | \end{minted} 76 | 77 | To also use the API calls within the code, use the library: 78 | \begin{minted}{fortran} 79 | USE omp_lib 80 | \end{minted} 81 | 82 | \begin{alertblock}{Note} 83 | No need to include the library if only using the compiler directives. 84 | The library only gets you the API calls. 85 | \end{alertblock} 86 | \end{frame} 87 | 88 | %------------------------------------------------------------------------------- 89 | \section{Memory and execution model} 90 | \begin{frame} 91 | \frametitle{Shared memory} 92 | OpenMP is for shared memory programming: all threads have access to a shared address space. 93 | 94 | A typical HPC node consisting of 2 multi-core CPUs. 95 | \begin{center} 96 | \begin{tikzpicture} 97 | % Draw 4 cores for socket 0 98 | \draw (0,0) rectangle (1,1); 99 | \draw (0.5,0.5) node {C0}; 100 | \draw (1,0) rectangle (2,1); 101 | \draw (1.5,0.5) node {C1}; 102 | \draw (0,1) rectangle (1,2); 103 | \draw (0.5,1.5) node {C2}; 104 | \draw (1,1) rectangle (2,2); 105 | \draw (1.5,1.5) node {C3}; 106 | \draw (1,-0.5) node {Socket 0}; 107 | 108 | % Draw 4 cores for socket 1 109 | \draw (3,0) rectangle (4,1); 110 | \draw (3.5,0.5) node {C0}; 111 | \draw (4,0) rectangle (5,1); 112 | \draw (4.5,0.5) node {C1}; 113 | \draw (3,1) rectangle (4,2); 114 | \draw (3.5,1.5) node {C2}; 115 | \draw (4,1) rectangle (5,2); 116 | \draw (4.5,1.5) node {C3}; 117 | \draw (4,-0.5) node {Socket 1}; 118 | 119 | % Draw large memory 120 | \draw (-0.5,3) rectangle (5.5,4); 121 | \draw (2.5,3.5) node {Memory}; 122 | 123 | % Connect sockets to memory 124 | \draw (1,2) -- (1,3); 125 | \draw (4,2) -- (4,3); 126 | \draw[dashed] (2,1) -- (3,1); % QPI 127 | 128 | % Show memory shared 129 | \pause 130 | \draw[fill=red] (0.5,3.2) rectangle (1,3.7); 131 | \draw[->] (0.5,1.8) -- (0.7,3.2); 132 | \draw[->] (0.7,3.2) -- (4.5,0.8); 133 | 134 | \end{tikzpicture} 135 | \end{center} 136 | \emph{All} threads (each running on a core) can access the same memory. 137 | 138 | Different to MPI, where one process cannot see the memory of another without explicit communication. 139 | 140 | \end{frame} 141 | 142 | %------------------------------------------------------------------------------- 143 | \begin{frame} 144 | \frametitle{Fork-join model} 145 | Serial/sequential execution: 146 | \begin{center} 147 | \begin{tikzpicture} 148 | \draw[->] (0,0) -- (8,0); 149 | \end{tikzpicture} 150 | \end{center} 151 | 152 | \pause 153 | 154 | In a \emph{fork-join} model, code starts serial, \emph{forks} a \emph{team} of threads then \emph{joins} them back to serial execution. 
155 | \begin{center} 156 | \begin{tikzpicture} 157 | \draw (0,0) -- (1,0); 158 | 159 | % Fork 160 | \draw (1,0) -- (2,1.5); 161 | \draw (1,0) -- (2,0.5); 162 | \draw (1,0) -- (2,-0.5); 163 | \draw (1,0) -- (2,-1.5); 164 | \draw (1,-1) node {Fork}; 165 | 166 | % Run in parallel 167 | \draw (2,1.5) -- (5,1.5); 168 | \draw (2,0.5) -- (5,0.5); 169 | \draw (2,-0.5) -- (5,-0.5); 170 | \draw (2,-1.5) -- (5,-1.5); 171 | \draw (3.5,0) node {Parallel execution}; 172 | 173 | % Join 174 | \draw (5,1.5) -- (6,0); 175 | \draw (5,0.5) -- (6,0); 176 | \draw (5,-0.5) -- (6,0); 177 | \draw (5,-1.5) -- (6,0); 178 | \draw (6,-1) node {Join}; 179 | 180 | % Serial end 181 | \draw[->] (6,0) -- (8,0); 182 | \end{tikzpicture} 183 | \end{center} 184 | 185 | Nested threads are allowed, where a thread forks its own team of threads. 186 | \end{frame} 187 | 188 | %------------------------------------------------------------------------------- 189 | \section{Going parallel} 190 | \begin{frame}[fragile] 191 | \frametitle{Creating OpenMP threads} 192 | \begin{minted}[frame=single, linenos]{fortran} 193 | program hello 194 | 195 | !$omp parallel 196 | print *, "Hello" 197 | !$omp end parallel 198 | 199 | end program hello 200 | \end{minted} 201 | 202 | Threads \emph{redundantly} execute code in the block. 203 | 204 | Each thread will output \mintinline{bash}|Hello|. 205 | 206 | Threads are synchronised at the end of the parallel region. 207 | 208 | \end{frame} 209 | 210 | %------------------------------------------------------------------------------- 211 | % \begin{frame}[fragile] 212 | % \frametitle{Pthreads} 213 | 214 | % \begin{minted}[fontsize=\small, linenos, frame=single]{fortran} 215 | % program hello 216 | % use fpthread 217 | % integer :: i, err 218 | % integer :: N = 4 219 | % type(fpthread_t) :: Tide(N) 220 | 221 | % do i = 1, N 222 | % call fpthread_create(tid(i), NULL, run, NULL, err) 223 | % end do 224 | % do i = 1, N 225 | % call fpthread_join(tid(i), NULL, err) 226 | % end do 227 | 228 | % subroutine run 229 | % print *, "Hello" 230 | % end subroutine run 231 | % end program hello 232 | % \end{minted} 233 | 234 | % \end{frame} 235 | 236 | % %------------------------------------------------------------------------------- 237 | % \begin{frame} 238 | % \frametitle{OpenMP and Pthreads} 239 | % \begin{itemize} 240 | % \item Pthreads is very error prone and verbose. 241 | % \item The OpenMP \mintinline{fortran}|!$omp parallel| abstracts this away. 242 | % \item The compiler directive inserts this extra code on your behalf. 243 | % \item Pthreads requires wrapping up your parallel work in subroutines. 244 | % \begin{itemize} 245 | % \item Kernels are a useful abstraction used in many programming models. 246 | % \end{itemize} 247 | % \item OpenMP much more convenient for \emph{incrementally} adding parallelism to your code. 248 | % \end{itemize} 249 | % \end{frame} 250 | 251 | %------------------------------------------------------------------------------- 252 | \begin{frame}[fragile] 253 | \frametitle{Setting number of threads} 254 | You might need to set the number of threads to launch (though typically you'll leave OpenMP to set the number of threads for you at run-time). 
255 | 256 | OpenMP has 3 ways to do this: 257 | \begin{itemize} 258 | \item Environment variables 259 | \begin{minted}{bash} 260 | OMP_NUM_THREADS=16 261 | \end{minted} 262 | 263 | \item API calls 264 | \begin{minted}{fortran} 265 | call omp_set_num_threads(16) 266 | \end{minted} 267 | 268 | \item Clauses 269 | \begin{minted}{fortran} 270 | !$omp parallel num_threads(16) 271 | !$omp end parallel 272 | \end{minted} 273 | \end{itemize} 274 | 275 | In general it's better to use environment variables if you need to do this, as this approach gives you more flexibility at runtime. 276 | \end{frame} 277 | 278 | %------------------------------------------------------------------------------- 279 | \begin{frame}[fragile] 280 | \frametitle{Thread API calls} 281 | Parallel programs often written in a SPMD style: \newline 282 | {\bf S}ingle {\bf P}rogram, {\bf M}ultiple {\bf D}ata. 283 | \begin{itemize} 284 | \item MPI has a SPMD model. 285 | \item Threads run the same code, and use their ID to work out which data to operate on. 286 | \end{itemize} 287 | 288 | The OpenMP API gives you calls to determine thread information when \emph{inside} a parallel region: 289 | \begin{itemize} 290 | \item Get number of threads 291 | \begin{minted}{fortran} 292 | nthreads = omp_get_num_threads() 293 | \end{minted} 294 | 295 | \item Get thread ID 296 | \begin{minted}{fortran} 297 | tid = omp_get_thread_num() 298 | \end{minted} 299 | 300 | \end{itemize} 301 | \end{frame} 302 | 303 | %------------------------------------------------------------------------------- 304 | \section{Example: vector addition} 305 | \begin{frame}[fragile] 306 | \frametitle{Vector add} 307 | Walkthrough parallelising vector addition using OpenMP. 308 | 309 | \begin{minted}[fontsize=\footnotesize,linenos,frame=single]{fortran} 310 | program vecadd 311 | integer :: N = 1024 ! Length of array 312 | ! Arrays 313 | real(kind=8), allocatable, dimension(:) :: A, B, C 314 | integer :: i ! Loop counter 315 | 316 | ! Allocate and initialise vectors 317 | allocate(A(N), B(N), C(N)) 318 | A = 1.0; B = 2.0; C = 0.0 319 | 320 | ! Vector add 321 | do i = 1, N 322 | C(i) = A(i) + B(i) 323 | end do 324 | 325 | deallocate(A,B,C) 326 | end program vecadd 327 | \end{minted} 328 | \end{frame} 329 | 330 | %------------------------------------------------------------------------------- 331 | \begin{frame}[fragile] 332 | \frametitle{Vector add: Step 1} 333 | Add parallel region around work 334 | \begin{minted}[frame=single]{fortran} 335 | !$omp parallel 336 | do i = 1, N 337 | C(i) = A(i) + B(i) 338 | end do 339 | !$omp end parallel 340 | \end{minted} 341 | Every thread will now do the entire vector addition --- redundantly! 342 | \end{frame} 343 | 344 | %------------------------------------------------------------------------------- 345 | \begin{frame}[fragile] 346 | \frametitle{Vector add: Step 2} 347 | Get thread IDs 348 | \begin{minted}[fontsize=\small,frame=single]{fortran} 349 | integer :: tid, nthreads 350 | 351 | !$omp parallel 352 | tid = omp_get_thread_num() 353 | nthreads = omp_get_num_threads() 354 | 355 | do i = 1, N 356 | C(i) = A(i) + B(i) 357 | end do 358 | !$omp end parallel 359 | \end{minted} 360 | 361 | \pause 362 | \begin{alertblock}{Incorrect behaviour at runtime} 363 | What's the problem here? 
364 | \end{alertblock} 365 | \end{frame} 366 | 367 | %------------------------------------------------------------------------------- 368 | \begin{frame}[fragile] 369 | \frametitle{Vector add: Step 2, take 2} 370 | 371 | \begin{itemize} 372 | \item In OpenMP, all variables are \emph{shared} between threads. 373 | \item But each thread needs its own copy of \mintinline{fortran}|tid|. 374 | \item Solution: use the \mintinline{fortran}|private| clause on the \mintinline{fortran}|parallel| region. 375 | \item This gives each thread its own unique copy in memory for the variable. 376 | \end{itemize} 377 | 378 | \begin{minted}[fontsize=\small,frame=single]{fortran} 379 | integer :: tid, nthreads 380 | 381 | !$omp parallel private(tid) 382 | tid = omp_get_thread_num() 383 | nthreads = omp_get_num_threads() 384 | 385 | do i = 1, N 386 | C(i) = A(i) + B(i) 387 | end do 388 | !$omp end parallel 389 | \end{minted} 390 | Much more information about data sharing clauses in next session. 391 | \end{frame} 392 | 393 | %------------------------------------------------------------------------------- 394 | \begin{frame}[fragile] 395 | \frametitle{Vector add: Step 3} 396 | Finally, distribute the iteration space across the threads. 397 | \begin{minted}[frame=single]{fortran} 398 | integer :: tid, nthreads 399 | 400 | !$omp parallel private(tid) 401 | tid = omp_get_thread_num() 402 | nthreads = omp_get_num_threads() 403 | 404 | do i = 1+(tid*N/nthreads), (tid+1)*N/nthreads 405 | C(i) = A(i) + B(i) 406 | end do 407 | !$omp end parallel 408 | \end{minted} 409 | \begin{block}{Remember} 410 | Thread IDs are numbered from 0 in OpenMP. 411 | Be careful with your index calculation. 412 | \end{block} 413 | \end{frame} 414 | 415 | %------------------------------------------------------------------------------- 416 | \begin{frame}[fragile] 417 | \frametitle{Barriers} 418 | A barrier simply synchronises threads in a parallel region. 419 | 420 | \begin{minted}[frame=single,linenos]{fortran} 421 | !$omp parallel private(tid) 422 | 423 | tid = omp_get_thread_num() 424 | A(tid) = big_work1(tid) 425 | 426 | !$omp barrier 427 | 428 | B(tid) = big_work2(A, tid) 429 | 430 | !$omp end parallel 431 | \end{minted} 432 | 433 | \begin{itemize} 434 | \item Running in parallel, need to compute \mintinline{fortran}|A(:)| before computing \mintinline{fortran}|B(:)|. 435 | \item The barrier ensures all threads wait between these statements. 436 | \item Must ensure all threads encounter the barrier. 437 | \end{itemize} 438 | 439 | \end{frame} 440 | 441 | 442 | %------------------------------------------------------------------------------- 443 | \section{Worksharing} 444 | \begin{frame}[fragile] 445 | \frametitle{Worksharing} 446 | 447 | \begin{itemize} 448 | \item The SPMD approach requires lots of bookkeeping. 449 | \item Common pattern of splitting loop iterations between threads. 450 | \item OpenMP has worksharing constructs to help with this. 451 | \item Used within a parallel region. 452 | \item The loop iterator is made \mintinline{fortran}|private| by default: no need for data sharing clause. 453 | \end{itemize} 454 | 455 | \begin{minted}[frame=single]{fortran} 456 | !$omp parallel 457 | !$omp do 458 | do i = 1, N 459 | C(i) = A(i) + B(i) 460 | end do 461 | !$omp end do 462 | !$omp end parallel 463 | \end{minted} 464 | 465 | Implicit synchronisation point at the \mintinline{fortran}|!$omp end do|. 
466 | 467 | \end{frame} 468 | 469 | %------------------------------------------------------------------------------- 470 | \begin{frame}[fragile] 471 | \frametitle{Combined worksharing directives} 472 | Generally it's convenient to combine the directives: 473 | \begin{minted}[frame=single]{fortran} 474 | !$omp parallel do 475 | do i = 1, N 476 | ... ! loop body 477 | end do 478 | !$omp end parallel do 479 | \end{minted} 480 | 481 | \begin{itemize} 482 | \item This starts a parallel region, forking some threads. 483 | \item Each thread then gets a portion of the iteration space and computes the loop body in parallel. 484 | \item Implicit synchronisation point at the \mintinline{fortran}|end do|. 485 | \item Threads finally join again; later code executes sequentially. 486 | \end{itemize} 487 | \end{frame} 488 | 489 | %------------------------------------------------------------------------------- 490 | \begin{frame} 491 | \frametitle{Vector add code} 492 | The vector add codes are available in the repository for you to look at: 493 | \begin{itemize} 494 | \item Serial: \mintinline{bash}|vadd.f90| 495 | \item SPMD: \mintinline{bash}|vadd_spmd.f90| 496 | \item Worksharing: \mintinline{bash}|vadd_paralleldo.f90| 497 | \end{itemize} 498 | \end{frame} 499 | 500 | %------------------------------------------------------------------------------- 501 | \section{Loops} 502 | \begin{frame}[fragile] 503 | \frametitle{Nested loops} 504 | \begin{itemize} 505 | \item Often have tightly nested loops in your code. 506 | \item E.g. 2D grid code, every cell is independent. 507 | \item OpenMP worksharing would only parallelise over first loop with each thread performing inner loop serially. 508 | \item Use the \mintinline{fortran}|collapse(...)| clause to combine iteration spaces. 509 | \item OpenMP then workshares the combined iteration space. 510 | \end{itemize} 511 | 512 | \begin{minted}[frame=single]{fortran} 513 | !$omp parallel do collapse(2) 514 | do i = 1, N 515 | do j = 1, N 516 | ... ! loop body 517 | end do 518 | end do 519 | !$omp end parallel do 520 | \end{minted} 521 | All $N^2$ iterations are distributed across threads, rather than just the $N$ of the outer loop. 522 | 523 | \end{frame} 524 | 525 | 526 | %------------------------------------------------------------------------------- 527 | % \begin{frame} 528 | % \frametitle{Nested loops} 529 | % \begin{block}{Performance note} 530 | % Collapsing loops may subtly effect the compiler's knowledge about alignment and could affect vectorisation. 531 | % More on this when we talk about vectorisation in a later session. 532 | % \end{block} 533 | 534 | % \end{frame} 535 | 536 | %------------------------------------------------------------------------------- 537 | % \begin{frame} 538 | % \frametitle{Taking stock} 539 | % \begin{itemize} 540 | % \item We've seen how to parallelise a simple program using OpenMP. 541 | % \item Shown the MPI-style SPMD approach for dividing work. 542 | % \item OpenMP worksharing constructs make this easier. 543 | % \end{itemize} 544 | 545 | % The rest of this session: 546 | % \begin{itemize} 547 | % \item Expands on the worksharing constructs. 548 | % \item The first example for you to try. 549 | % \end{itemize} 550 | 551 | % Then, onto the rest of the OpenMP common core. 
552 | % \end{frame} 553 | 554 | %------------------------------------------------------------------------------- 555 | \section{Exercise} 556 | \begin{frame} 557 | \frametitle{5-point stencil exercise} 558 | First exercise: parallelise a simple 5-point stencil code using OpenMP. 559 | 560 | \begin{center} 561 | \begin{tikzpicture} 562 | \draw[step=1cm,gray,very thin] (-1.9,-1.9) grid (2.9,2.9); 563 | \draw[fill=black] (0.5,0.5) circle (0.1cm); 564 | \draw[fill=black] (-0.5,0.5) circle (0.1cm); 565 | \draw[fill=black] (1.5,0.5) circle (0.1cm); 566 | \draw[fill=black] (0.5,1.5) circle (0.1cm); 567 | \draw[fill=black] (0.5,-0.5) circle (0.1cm); 568 | \draw (-0.5,0.5) -- (1.5,0.5); 569 | \draw (0.5,-0.5) -- (0.5,1.5); 570 | \end{tikzpicture} 571 | \end{center} 572 | 573 | Value in every cell is set to the average of its neighbours. 574 | \end{frame} 575 | 576 | %------------------------------------------------------------------------------- 577 | \begin{frame}[fragile] 578 | \frametitle{5-point stencil exercise} 579 | Take \mintinline{bash}|stencil.f90| and parallelise it using OpenMP: 580 | \begin{enumerate} 581 | \item Using a SPMD style. 582 | \item Using the OpenMP worksharing clauses. 583 | \item Vary the number of threads using \mintinline{bash}|OMP_NUM_THREADS|. 584 | \end{enumerate} 585 | 586 | Focus on parallelising the main loop(s): 587 | \begin{minted}[frame=single,breaklines]{fortran} 588 | do i = 1, nx 589 | do j = 1, ny 590 | Anew(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0 591 | end do 592 | end do 593 | \end{minted} 594 | 595 | Sample solutions are provided, but do try it yourself first. 596 | 597 | \end{frame} 598 | 599 | 600 | %------------------------------------------------------------------------------- 601 | \section{Scheduling} 602 | \begin{frame}[fragile] 603 | \frametitle{The Schedule clause} 604 | \begin{itemize} 605 | \item The worksharing clauses use default rules for assigning iterations to threads. 606 | \item Can use the \mintinline{fortran}|schedule| clause to specify the distribution. 607 | \item General format: 608 | \begin{minted}{fortran} 609 | !$omp parallel do schedule(...) 610 | \end{minted} 611 | \end{itemize} 612 | Next slides go through the options, using the following loop as an example: 613 | \begin{minted}[frame=single]{fortran} 614 | !$omp parallel do num_threads(4) 615 | do i = 1, 100 616 | ... ! loop body 617 | end do 618 | !$omp end parallel do 619 | \end{minted} 620 | 621 | \end{frame} 622 | 623 | %------------------------------------------------------------------------------- 624 | \begin{frame}[fragile] 625 | \frametitle{Static schedule} 626 | \begin{minted}{fortran} 627 | schedule(static) 628 | schedule(static,16) 629 | \end{minted} 630 | 631 | \begin{itemize} 632 | \item Static schedule divides iterations into chunks and assigns chunks to threads in round-robin. 633 | \item If no chunk size specified, iteration space divided roughly equally. 
634 | \end{itemize}
635 | For our example loop:
636 | \begin{columns}
637 | \begin{column}{0.5\textwidth}
638 | \mintinline{fortran}|schedule(static)|
639 | \begin{tabular}{cc}
640 | \toprule
641 | Thread ID & Iterations \\
642 | \midrule
643 | 0 & 1--25 \\
644 | 1 & 26--50 \\
645 | 2 & 51--75 \\
646 | 3 & 76--100 \\
647 | \bottomrule
648 | \end{tabular}
649 | \end{column}
650 |
651 | \begin{column}{0.5\textwidth}
652 | \mintinline{fortran}|schedule(static,16)|
653 | \begin{tabular}{cc}
654 | \toprule
655 | Thread ID & Iterations \\
656 | \midrule
657 | 0 & 1--16, 65--80 \\
658 | 1 & 17--32, 81--96 \\
659 | 2 & 33--48, 97--100 \\
660 | 3 & 49--64 \\
661 | \bottomrule
662 | \end{tabular}
663 | \end{column}
664 | \end{columns}
665 |
666 | \end{frame}
667 |
668 | %-------------------------------------------------------------------------------
669 | \begin{frame}[fragile]
670 | \frametitle{Dynamic schedule}
671 | \begin{minted}{fortran}
672 | schedule(dynamic)
673 | schedule(dynamic,16)
674 | \end{minted}
675 |
676 | \begin{itemize}
677 | \item Iteration space is divided into chunks according to chunk size.
678 | \item If no chunk size specified, default size is one.
679 | \item Each thread requests and executes a chunk, until no more chunks remain.
680 | \item Useful for unbalanced workloads if some threads complete work faster.
681 | \end{itemize}
682 |
683 | For our example with a chunk size of 16:
684 | \begin{itemize}
685 | \item The iteration space is split into chunks of 16 (the last chunk may be smaller).
686 | \item Each thread gets one chunk, then requests a new chunk to work on.
687 | \end{itemize}
688 |
689 | \end{frame}
690 |
691 | %-------------------------------------------------------------------------------
692 | \begin{frame}[fragile]
693 | \frametitle{Guided schedule}
694 | \begin{minted}{fortran}
695 | schedule(guided)
696 | schedule(guided,16)
697 | \end{minted}
698 |
699 | \begin{itemize}
700 | \item Similar to the \mintinline{fortran}|dynamic| schedule, except the chunk size decreases over time.
701 | \item Granularity of work chunks gets finer over time.
702 | \item If no chunk size is specified, the default size is one.
703 | \item Useful to try to mitigate overheads of a \mintinline{fortran}|dynamic| schedule by starting with large chunks of work.
704 | \end{itemize}
705 |
706 | For our example with a chunk size of 16:
707 | \begin{itemize}
708 | \item Each thread gets a chunk of 16 to work on.
709 | \item Each thread requests a new chunk, which might be smaller than 16.
710 | \end{itemize}
711 |
712 | \end{frame}
713 |
714 | %-------------------------------------------------------------------------------
715 | \begin{frame}[fragile]
716 | \frametitle{Other schedules}
717 | \begin{minted}{fortran}
718 | schedule(auto)
719 | \end{minted}
720 | \begin{itemize}
721 | \item Let the compiler or runtime choose the schedule.
722 | \end{itemize}
723 |
724 | \vfill
725 |
726 | \begin{minted}{fortran}
727 | schedule(runtime)
728 | \end{minted}
729 | \begin{itemize}
730 | \item Get the schedule from the \mintinline{bash}|OMP_SCHEDULE| environment variable.
731 | \end{itemize}
732 |
733 | \begin{block}{Recommendation}
734 | Just use a \mintinline{fortran}|static| schedule unless there is a good reason not to!
735 | \mintinline{fortran}|static| is usually the fastest of all the options.
736 | The choice of schedules is an advanced tuning option.
737 | \end{block} 738 | 739 | \end{frame} 740 | 741 | %------------------------------------------------------------------------------- 742 | \section{Synchronisation} 743 | \begin{frame}[fragile] 744 | \frametitle{The nowait clause} 745 | \begin{itemize} 746 | \item May have series of loops in your code which are independent. 747 | \item Threads must wait/synchronise at the end of the loop.% \mintinline{fortran}|!$omp end do|. 748 | \item But it might be possible to delay this synchronisation using the \mintinline{fortran}|nowait| clause. 749 | \item When a thread finishes the first loop, it starts on the next loop. 750 | \end{itemize} 751 | 752 | \begin{minted}[fontsize=\small, linenos, frame=single]{fortran} 753 | !$omp parallel 754 | !$omp do nowait 755 | do i = 1, N 756 | A(i) = i 757 | end do 758 | !$omp end do ! No barrier! 759 | !$omp do 760 | do i = 1, N 761 | B(i) = i 762 | end do 763 | !$omp end do ! Implicit barrier 764 | !$omp end parallel ! Implicit barrier 765 | \end{minted} 766 | \end{frame} 767 | 768 | %------------------------------------------------------------------------------- 769 | % \begin{frame} 770 | % \frametitle{Synchronisation} 771 | % A number of ways to synchronise the threads in OpenMP: 772 | % \begin{multicols}{2} 773 | % \begin{itemize} 774 | % \item Barriers 775 | % \item Critical 776 | % \item Atomics 777 | % \item Locks 778 | % \item Ordered 779 | % \item Single 780 | % \item Master 781 | % \item Flush 782 | % \end{itemize} 783 | % \end{multicols} 784 | 785 | % \vfill 786 | 787 | % \begin{itemize} 788 | % \item Will look at Critical and Atomic in Session 2. 789 | % \item Ordered, Single and Master in Session 6. 790 | % \item Won't formally cover Flush and Locks --- advanced stuff with esoteric use cases. 791 | % \end{itemize} 792 | 793 | % % Quickly cover barriers now. 794 | 795 | % \end{frame} 796 | 797 | %------------------------------------------------------------------------------- 798 | % \section{Miscellaneous} 799 | % \begin{frame}[fragile] 800 | % \frametitle{Nested threads} 801 | % \begin{itemize} 802 | % \item Turn on support with by setting the environment variable \mintinline{fortran}|OMP_NESTED=true|, otherwise inner region is default serial. 803 | % \item Every thread in the (outer) parallel region then spawns threads. 804 | % \item Control the number of threads with clauses or environment variable: \mintinline{bash}|OMP_NUM_THREADS=4,2|. 805 | % \end{itemize} 806 | 807 | % \begin{minted}[frame=single]{fortran} 808 | % !$omp parallel num_threads(4) 809 | % ... ! A parallel region 810 | % !$omp parallel num_threads(4) 811 | % ... ! Inner parallel region 812 | % !$omp end parallel 813 | % !$omp end parallel 814 | % \end{minted} 815 | 816 | % \end{frame} 817 | 818 | 819 | % %------------------------------------------------------------------------------- 820 | % \begin{frame} 821 | % \frametitle{Nested threads} 822 | % \begin{alertblock}{Warning!} 823 | % Be careful how you use nesting threads. 824 | % It's very easy to oversubscribe threads. 825 | % Thread affinity can be tricky. 826 | % You probably don't need to use nested threads! 827 | % \end{alertblock} 828 | % \end{frame} 829 | 830 | %------------------------------------------------------------------------------- 831 | % \begin{frame}[fragile] 832 | % \frametitle{Multi-line directives} 833 | % \begin{itemize} 834 | % \item Sometimes OpenMP directives can be quite long. 
835 | % \item Nicer to split up the directive across lines in the source file using line continuation character \mintinline{fortran}|&|: 836 | % \end{itemize} 837 | 838 | % \begin{minted}{fortran} 839 | % !$omp construct & 840 | % !$omp& clause 841 | % !$omp& clause 842 | % \end{minted} 843 | 844 | % \end{frame} 845 | 846 | %------------------------------------------------------------------------------- 847 | % \begin{frame} 848 | % \frametitle{Summary} 849 | % This section introduced the OpenMP programming model: 850 | % \begin{itemize} 851 | % \item Creating parallel regions: \mintinline{fortran}|!$omp parallel|/\mintinline{fortran}|!$omp end parallel| 852 | % \item Getting thread IDs: \mintinline{fortran}|omp_get_thread_num()|/\mintinline{fortran}|omp_get_num_threads()| 853 | % \item Worksharing constructs: \mintinline{fortran}|!$omp do|/\mintinline{fortran}|!$omp end do| 854 | % \item The \mintinline{fortran}|schedule| and \mintinline{fortran}|nowait| clauses 855 | % \item Synchronising threads with barriers: \mintinline{fortran}|!$omp barrier| 856 | % \end{itemize} 857 | % \end{frame} 858 | 859 | %------------------------------------------------------------------------------- 860 | % \begin{frame} 861 | % \frametitle{Resources} 862 | % \begin{itemize} 863 | % \item OpenMP website: \url{https://www.openmp.org} 864 | % \begin{itemize} 865 | % \item The specification (not for the faint hearted). 866 | % \item Download summary cards. 867 | % \item List of compiler support. 868 | % \item Example code for all the directives. 869 | % \item List of books: \url{https://www.openmp.org/resources/openmp-books/} 870 | % \end{itemize} 871 | % 872 | % \item cOMPunity 873 | % \begin{itemize} 874 | % \item \url{http://www.compunity.org} 875 | % \end{itemize} 876 | % 877 | % \item Online tutorials: 878 | % \begin{itemize} 879 | % \item Tim Mattson's YouTube tutorial: \url{https://youtu.be/nE-xN4Bf8XI} 880 | % \item SC'08 tutorial from Tim Mattson and Larry Meadows: \url{https://openmp.org/mp-documents/omp-hands-on-SC08.pdf} 881 | % \item From Lawrence Livermore National Lab: \url{https://computing.llnl.gov/tutorials/openMP/} 882 | % \end{itemize} 883 | % 884 | % \end{itemize} 885 | % 886 | % \end{frame} 887 | 888 | %------------------------------------------------------------------------------- 889 | 890 | \end{document} 891 | -------------------------------------------------------------------------------- /slides/03-opt.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \input{preamble.tex} 4 | 5 | \title{OpenMP for Computational Scientists} 6 | \subtitle{3: Vectorisation and optimisations} 7 | 8 | \begin{document} 9 | 10 | \frame{\titlepage} 11 | 12 | %------------------------------------------------------------------------------- 13 | \section{Outline} 14 | \begin{frame} 15 | \frametitle{Outline} 16 | Now you know how to parallelise programs using OpenMP, how do you write fast programs in OpenMP? 
17 | 
18 | \begin{itemize}
19 | \item The cache hierarchy
20 | \item Performance analysis
21 | \item Vectorisation
22 | \item Array of structures vs Structure of arrays
23 | \item Memory access patterns
24 | \item Memory alignment
25 | \end{itemize}
26 | \end{frame}
27 | 
28 | %-------------------------------------------------------------------------------
29 | \section{Recap}
30 | \begin{frame}
31 | \frametitle{Recap}
32 | 
33 | \begin{itemize}
34 | \item Data sharing clauses:
35 | \begin{itemize}
36 | \item \mintinline{fortran}|shared|, \mintinline{fortran}|private|, \mintinline{fortran}|firstprivate|, \mintinline{fortran}|lastprivate|
37 | \end{itemize}
38 | 
39 | \item Atomics and \mintinline{fortran}|critical| regions
40 | 
41 | \item False sharing and cache thrashing
42 | 
43 | \item Reductions with the \mintinline{fortran}|reduction| clause
44 | \end{itemize}
45 | 
46 | Combined with the \mintinline{fortran}|parallel| and worksharing constructs from before, we've covered the OpenMP ``common core''.
47 | 
48 | \end{frame}
49 | 
50 | 
51 | %-------------------------------------------------------------------------------
52 | \begin{frame}[fragile]
53 | \frametitle{Previous exercise}
54 | 
55 | Take your parallel 5-point stencil, and implement a reduction:
56 | \begin{minted}[frame=single,breaklines,fontsize=\small]{fortran}
57 | total = 0.0
58 | !$omp parallel do collapse(2) reduction(+:total)
59 | do i = 1, nx
60 |   do j = 1, ny
61 |     Atmp(i,j) = (A(i-1,j) + A(i+1,j) + A(i,j) + A(i,j-1) + A(i,j+1)) / 5.0
62 |     total = total + Atmp(i,j)
63 |   end do
64 | end do
65 | !$omp end parallel do
66 | \end{minted}
67 | 
68 | \begin{itemize}
69 | \item Well done if you managed this!
70 | \item 5-point stencil is simple, but captures the \emph{essence} of more complicated codes.
71 | \item Extension: did anyone try parallelising the Jacobi solver?
72 | \end{itemize}
73 | 
74 | \end{frame}
75 | 
76 | %-------------------------------------------------------------------------------
77 | \section{The cache hierarchy}
78 | \begin{frame}
79 | \frametitle{Cache hierarchy}
80 | \begin{center}
81 | \begin{adjustbox}{max width={.8\textwidth}}
82 | \begin{tikzpicture}
83 | % Triangle
84 | \draw (-3,0) -- (0,4.5) -- (3,0) -- (-3,0);
85 | 
86 | \node at (4,4) {For a Skylake processor};
87 | \node at (0,3.5) {L1};
88 | \node at (4,3.5) {4~cycles};
89 | 
90 | \draw[dashed] (-1,3) -- (1,3);
91 | \node at (0,2.5) {L2};
92 | \node at (4,2.5) {12~cycles};
93 | 
94 | \draw[dashed] (-1.67,2) -- (1.67,2);
95 | \node at (0,1.5) {L3};
96 | \node at (4,1.5) {$\sim$ 44~cycles};
97 | 
98 | \draw[dashed] (-2.337,1) -- (2.337,1);
99 | \node at (0,0.5) {DRAM};
100 | \node at (6,0.5) {$\sim$ 90~ns ($\sim$ 200 cycles @ 2.2~GHz)};
101 | \end{tikzpicture}
102 | \end{adjustbox}
103 | \end{center}
104 | 
105 | \begin{itemize}
106 | \item Most integer and floating point operations are single cycle.
107 | \item Memory access is relatively slow.
108 | \item Moving memory between nodes is hugely expensive: $\sim 3~\mu s$.
109 | \item How long is a nanosecond? 11.8 inches --- Grace Hopper: \url{https://youtu.be/JEpsKnWZrJ8}.
110 | \item Therefore very easy to become bound by memory movement.
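\item (Putting those numbers together: in the $\sim$200 cycles of a single DRAM access, a core could have issued $\sim$200 single-cycle arithmetic operations.)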
111 | 
112 | \end{itemize}
113 | \end{frame}
114 | 
115 | %-------------------------------------------------------------------------------
116 | \begin{frame}
117 | \frametitle{Cache bandwidth}
118 | 
119 | Graph of aggregate cache bandwidth on different architectures:
120 | \begin{center}
121 | \includegraphics[width=0.8\textwidth]{cache_bandwidth}
122 | \end{center}
123 | 
124 | \begin{itemize}
125 | \item Clear cliff edges at the cache capacity sizes (the working set is 3 times the x-axis value).
126 | \item As with latency: the levels closer to the core deliver more bandwidth.
127 | \end{itemize}
128 | 
129 | 
130 | \footnotetext[1]{{\Tiny Deakin, T., Price, J., and McIntosh-Smith, S. (2017). Portable Methods for Measuring Cache Hierarchy Performance (poster).\\In Supercomputing. Denver, CO.}}
131 | 
132 | \end{frame}
133 | 
134 | %-------------------------------------------------------------------------------
135 | \begin{frame}[fragile]
136 | \frametitle{Streaming data}
137 | 
138 | STREAM Triad kernel:
139 | 
140 | \begin{minted}[frame=single]{fortran}
141 | !$omp parallel do
142 | do i = 1, N
143 |   a(i) = b(i) + scalar * c(i)
144 | end do
145 | !$omp end parallel do
146 | \end{minted}
147 | 
148 | \begin{itemize}
149 | \item Where \mintinline{fortran}|N| is large, the arrays exceed cache capacity.
150 | \item This kernel has \emph{no} data reuse: data items are read or written once, then never used again.
151 | \item Example of a \emph{streaming} data access pattern.
152 | \item Performance is then bound by main memory bandwidth.
153 | \end{itemize}
154 | 
155 | \end{frame}
156 | 
157 | %-------------------------------------------------------------------------------
158 | \section{Performance analysis}
159 | \begin{frame}
160 | \frametitle{Performance analysis}
161 | \begin{itemize}
162 | \item Optimisations can help code go faster, but how do you know when it's performing \emph{well}?
163 | \item Helpful to think about characteristics of the algorithm:
164 | \begin{itemize}
165 | \item Algorithmic complexity for compute.
166 | \item Algorithmic complexity for data movement.
167 | \end{itemize}
168 | \item Examples:
169 | \begin{itemize}
170 | \item Vector-vector operations are $O(n)$ and matrix-vector is $O(n^2)$, for both compute and data movement.
171 | \item Matrix-matrix multiply is $O(n^3)$ for compute and $O(n^2)$ for data movement.
172 | \item Matrix multiplication becomes \emph{compute bound} at large enough $n$, but the other examples remain memory bandwidth bound.
173 | \end{itemize}
174 | \end{itemize}
175 | \end{frame}
176 | 
177 | %-------------------------------------------------------------------------------
178 | \begin{frame}
179 | \frametitle{Rate limiting factors}
180 | \begin{itemize}
181 | \item Most HPC codes are \emph{memory bandwidth bound}.
182 | \item A few are \emph{compute bound}.
183 | \item Other possibilities:
184 | \begin{itemize}
185 | \item Network bound (e.g. MPI communication).
186 | \item I/O bound (e.g. writing to the filesystem).
187 | \item Memory latency bound.
188 | \item Memory capacity bound.
189 | \item \dots
190 | \end{itemize}
191 | \item Worth thinking about the bound for your own code.
192 | \item Arithmetic (integer and floating point) is very cheap: typically a single cycle.
193 | \item Division, transcendentals, exponentials/logs are relatively slow.
194 | \item Load/store is 2--3 times slower than an arithmetic operation, even if it's an L1 cache hit (the best case).
195 | \item Consider the ratio of bytes moved vs. floating point operations.
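\item Worked example with the STREAM Triad kernel from earlier: each iteration moves 24 bytes (two FP64 loads and one store) but performs only 2 floating point operations --- a strong hint that it is memory bandwidth bound.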
196 | \end{itemize}
197 | \end{frame}
198 | %-------------------------------------------------------------------------------
199 | \begin{frame}
200 | \frametitle{Computational intensity}
201 | \begin{itemize}
202 | \item The ratio of floating point operations (FLOPs) to bytes moved is known as \emph{computational intensity}, or CI.
203 | \item Originally only DRAM traffic was counted, but this causes problems: the figure then depends on how well the caches happen to behave.
204 | \item Bytes moved is best calculated from the kernel's perspective.
205 | \item Take this example:
206 | \begin{itemize}
207 | \item \mintinline{fortran}|a(i) = a(i) + b(i) * c(i)|
208 | \item Assume FP64 arrays.
209 | \item Count the data movement and floating point operations for each \mintinline{fortran}|i|.
210 | \item 24 bytes loaded, 8 bytes stored.
211 | \item Two floating point operations: one \mintinline{fortran}|+| and one \mintinline{fortran}|*|.
212 | \item CI of $2/32 = 1/16$ FLOPs per byte.
213 | \end{itemize}
214 | \end{itemize}
215 | \end{frame}
216 | 
217 | %-------------------------------------------------------------------------------
218 | \begin{frame}
219 | \frametitle{Roofline model}
220 | Useful conceptual tool to establish whether a kernel is compute or memory bandwidth bound.
221 | 
222 | \begin{center}
223 | \begin{adjustbox}{max width={.6\textwidth}}
224 | \begin{tikzpicture}
225 | \draw[->] (0,0) -- (0,3);
226 | \draw[->] (0,0) -- (7,0);
227 | \draw (0,1) -- (2,2.5);
228 | \draw (2,2.5) -- (7,2.5);
229 | \node at (3.5,-0.5) {Computational intensity (FLOPs/byte)};
230 | \node[rotate=90] at (-0.5,1.5) {FLOP/s};
231 | \node at (7.5,2.4) {$F(I)$};
232 | \node at (4,1) {A};
233 | \node at (0.75,1) {B};
234 | \end{tikzpicture}
235 | \end{adjustbox}
236 | \end{center}
237 | 
238 | \begin{itemize}
239 | \item The roof $F(I)$ is found from tech sheet data and/or micro-benchmarks.
240 | \item Measured runtime gives the kernel's FLOP/s; counting FLOPs and bytes moved (previous slide) gives its CI.
241 | \item Kernel A is compute bound; Kernel B is memory bandwidth bound.
242 | \item Both kernels require optimisation!
243 | \item A kernel sitting on the roof is achieving its rate limiting factor.
244 | \end{itemize}
245 | \end{frame}
246 | 
247 | %-------------------------------------------------------------------------------
248 | \begin{frame}
249 | \frametitle{Intel Advisor: Roofline}
250 | \begin{itemize}
251 | \item Can use Intel Advisor to run a Roofline analysis on your code.
252 | \item First, it runs some micro-benchmarks to generate the Roofline model.
253 | \item Then, it runs your code, calculating the CI and performance.
254 | \item Helpful to visualise how close your code is to the hardware limits.
255 | \item More information: \url{https://software.intel.com/en-us/articles/intel-advisor-roofline}.
256 | \item But beware: the CI is calculated from executed instructions, so it is not always the whole picture.
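\item A rough hand check is also possible (illustrative numbers only, not a specific machine): a node with a peak of 1000 GFLOP/s and 100 GB/s of memory bandwidth has its ridge point at $1000/100 = 10$ FLOPs/byte, so any kernel with a lower CI sits under the sloped part of the roof.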
257 | \end{itemize} 258 | \end{frame} 259 | 260 | %------------------------------------------------------------------------------- 261 | \section{Vectorisation} 262 | \begin{frame} 263 | \frametitle{Vectorisation} 264 | $$C=A+B$$ 265 | \begin{columns} 266 | \begin{column}{0.5\textwidth} 267 | Scalar operations \\ 268 | \begin{tikzpicture} 269 | \draw (-0.5,2) rectangle (0.5,3); 270 | \draw (1,2) rectangle (2,3); 271 | \draw[->] (0,2) -- (.74,1.2); 272 | \draw[->] (1.5,2) -- (.76,1.2); 273 | \draw (.75,.75) circle (.4cm); 274 | \draw (.75,.75) node {$+$}; 275 | \draw[->] (.75,0.3) -- (.75,-0.5); 276 | \draw (.25,-1.5) rectangle (1.25,-0.5); 277 | \end{tikzpicture} 278 | \end{column} 279 | 280 | \begin{column}{0.5\textwidth} 281 | Vector operations \\ 282 | \begin{tikzpicture} 283 | \draw[step=1cm] (0,2) grid (4,3); 284 | \draw[step=1cm] (0,0) grid (4,1); 285 | \draw[->] (2,0) -- (2,-0.6); 286 | \draw[->] (0,2.5) -- (-0.5,2.5) -- (-0.5,-1) -- (1.6,-1); 287 | \draw (2,-1) circle (.4cm); 288 | \draw (2,-1) node {$+$}; 289 | \draw[->] (2,-1.4) -- (2,-1.9); 290 | \draw[step=1cm] (0,-3) grid (4,-2); 291 | \end{tikzpicture} 292 | \end{column} 293 | \end{columns} 294 | 295 | \end{frame} 296 | 297 | %------------------------------------------------------------------------------- 298 | \begin{frame} 299 | \frametitle{Why vectorise?} 300 | \begin{itemize} 301 | \item Vectorisation gives you more compute per cycle. 302 | \item Hence may increase the FLOP/s rate of the processor. 303 | \item Also results in fewer instructions to process (less pressure on instruction decode units). 304 | \item Vectors help make good use of the memory hierarchy (often the main benefit). 305 | \item Vectorisation helps you write code which has good access patterns to maximise bandwidth. 306 | \end{itemize} 307 | \end{frame} 308 | 309 | %------------------------------------------------------------------------------- 310 | \begin{frame} 311 | \frametitle{Auto-vectorisation} 312 | \begin{itemize} 313 | \item Modern compilers are very good at automatically vectorising your loops. 314 | \item Fortran helps as arrays can not alias (overlap), unlike C. 315 | \item But compiler needs to be sure it's safe to vectorise. 316 | \item Read compiler reports to see if it's already vectorising. 317 | \begin{itemize} 318 | \item Intel: \mintinline{bash}|-qopt-report=5| 319 | \item Cray: \mintinline{bash}|-hlist=a| 320 | \item GNU (old): \mintinline{bash}|-ftree-vectorizer-verbose=2| 321 | \item GNU (new): \mintinline{bash}|-fopt-info-vec| 322 | \item Clang: \mintinline{bash}|-Rpass=loop-vectorize| \mintinline{bash}|-Rpass-missed=loop-vectorize| \mintinline{bash}|-Rpass-analysis=loop-vectorize| 323 | \end{itemize} 324 | \item Often the memory access pattern prevents (efficient) auto-vectorisation. 325 | \end{itemize} 326 | \end{frame} 327 | 328 | %------------------------------------------------------------------------------- 329 | \subsection{OpenMP SIMD} 330 | \begin{frame}[fragile] 331 | \frametitle{OpenMP SIMD} 332 | \begin{itemize} 333 | \item Sometimes the compiler needs help in confirming loops are vectorisable. 334 | \item OpenMP \mintinline{fortran}|simd| constructs give this information. 335 | \item Can combine with \mintinline{fortran}|parallel do| construct to ensure a parallel vector loop: \mintinline{fortran}|omp parallel do simd| 336 | \item Generally want to vectorise inner loops and parallelise outer loops. 
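\item For a nested loop this typically means \mintinline{fortran}|!$omp parallel do| on the outer \mintinline{fortran}|j| loop and \mintinline{fortran}|!$omp simd| on the inner \mintinline{fortran}|i| loop (the example below shows the plain \mintinline{fortran}|simd| form on a single loop).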
337 | \end{itemize}
338 | 
339 | \begin{minted}[frame=single]{fortran}
340 | !$omp simd
341 | do i = 1, N
342 |   C(i) = A(i)+B(i)
343 | end do
344 | !$omp end simd
345 | \end{minted}
346 | \end{frame}
347 | 
348 | %-------------------------------------------------------------------------------
349 | \begin{frame}[fragile]
350 | \frametitle{SIMD functions}
351 | Say you've written a function to update values in the loop:
352 | \begin{minted}[frame=single]{fortran}
353 | do i = 1, N
354 |   A(i) = magic_maths(A(i))
355 | end do
356 | \end{minted}
357 | 
358 | \begin{itemize}
359 | \item The situation gets complicated.
360 | \item If the function is small, it will likely be inlined and the loop will auto-vectorise.
361 | \item Otherwise you need to use the \mintinline{fortran}|simd| construct, and the compiler must create a vector version of the function.
362 | \end{itemize}
363 | 
364 | \begin{minted}[frame=single]{fortran}
365 | function magic_maths(value) result(r)
366 |   !$omp declare simd(magic_maths)
367 |   implicit none
368 |   real(kind=8) :: value, r
369 |   r = value * value
370 | end function
371 | \end{minted}
372 | 
373 | \end{frame}
374 | 
375 | %-------------------------------------------------------------------------------
376 | \begin{frame}[fragile]
377 | \frametitle{SIMD clauses}
378 | \begin{itemize}
379 | \item All the usual data-sharing and reduction clauses can be applied.
380 | \item \mintinline{fortran}|safelen(4)|: the maximum distance between iterations over which it is safe to vectorise.
381 | \begin{minted}[frame=single]{fortran}
382 | !$omp simd safelen(4)
383 | do i = 1, N-4
384 |   A(i) = A(i) + A(i+4)
385 | end do
386 | !$omp end simd
387 | \end{minted}
388 | \item \mintinline{fortran}|simdlen(4)|: the preferred number of iterations to perform concurrently as a vector.
389 | Specifying explicit vector lengths builds obsolescence into the code, as hardware vector lengths continually change --- we don't recommend using this clause.
390 | \end{itemize}
391 | \end{frame}
392 | 
393 | %-------------------------------------------------------------------------------
394 | \begin{frame}[fragile]
395 | \frametitle{SIMD clauses}
396 | \begin{itemize}
397 | \item \mintinline{fortran}|linear(var)|: the variable is private and increases linearly with the loop iterator.
398 | \begin{minted}[frame=single]{fortran}
399 | !$omp simd linear(j)
400 | do i = 1, N
401 |   j = j + 1
402 |   A(j) = B(i)
403 | end do
404 | !$omp end simd
405 | \end{minted}
406 | \item \mintinline{fortran}|aligned(var)|: says the array is aligned (more on this shortly).
407 | \item \mintinline{fortran}|uniform(var)|: for the \mintinline{fortran}|declare simd| construct, the variable is the same in all vector lanes.
408 | \end{itemize}
409 | \end{frame}
410 | 
411 | %-------------------------------------------------------------------------------
412 | \begin{frame}
413 | \frametitle{SIMD summary}
414 | 
415 | \begin{itemize}
416 | \item Sometimes you need to force the compiler to vectorise (the correct) loop with the \mintinline{fortran}|simd| construct.
417 | \item As with \mintinline{fortran}|parallel|, you are telling the compiler it is safe to vectorise and to ignore its data dependence analysis.
418 | \item Check the compiler report before and after to check it did the right thing!
419 | \item Use \mintinline{fortran}|declare simd| and appropriate clauses if you need to create vectorised versions of functions.
420 | \begin{itemize}
421 | \item The clauses can give more information to the compiler so it does a better job.
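\item For example, \mintinline{fortran}|uniform| for arguments that are the same in every vector lane and \mintinline{fortran}|linear| for arguments that step with the loop iterator (see the clauses slides).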
422 | \end{itemize} 423 | \end{itemize} 424 | 425 | \end{frame} 426 | 427 | %------------------------------------------------------------------------------- 428 | \section{Derived types} 429 | \begin{frame}[fragile] 430 | \frametitle{Derived types} 431 | 2D grid of cells, each cell containing 4 different values. 432 | \begin{minted}[frame=single,linenos,fontsize=\small]{fortran} 433 | type cell 434 | real(kind=8) :: property1 435 | real(kind=8) :: property2 436 | real(kind=8) :: property3 437 | real(kind=8) :: property4 438 | end type 439 | 440 | type(cell), allocatable :: grid(:,:) 441 | 442 | do j = 1, ny 443 | do i = 1, nx 444 | grid(i,j)%property1 = update_1() 445 | grid(i,j)%property2 = update_2() 446 | grid(i,j)%property3 = update_3() 447 | grid(i,j)%property4 = update_4() 448 | end do 449 | end do 450 | \end{minted} 451 | \end{frame} 452 | 453 | %------------------------------------------------------------------------------- 454 | \begin{frame} 455 | \frametitle{Derived types} 456 | \begin{itemize} 457 | \item What do Fortran derived types look like in memory? 458 | \item Organised as an array of structures. 459 | \item<2-> What happens when we vectorise our loop over cells? 460 | \end{itemize} 461 | 462 | \begin{adjustbox}{max width={\textwidth}} 463 | \begin{tikzpicture} 464 | \draw[step=1cm] (0,0) grid (13,1); 465 | \foreach \i in {0,4,8,12} { 466 | \draw (\i+.5,.5) node {P1}; 467 | } 468 | \foreach \i in {0,4,8} { 469 | \draw (\i+1.5,.5) node {P2}; 470 | \draw (\i+2.5,.5) node {P3}; 471 | \draw (\i+3.5,.5) node {P4}; 472 | } 473 | 474 | \foreach \i in {0,4,8,12} { 475 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 476 | } 477 | \end{tikzpicture} 478 | \end{adjustbox} 479 | 480 | \begin{itemize} 481 | \item<4-> The \mintinline{fortran}|property1| values are gathered into a vector register. 482 | \item<5-> After the computation, the results are scattered back into memory. 483 | \item<6-> A cache line is 64 bytes, so only the first two values are on the first cache line. 484 | \item<6-> Must read two cache lines to fill the vector up. 485 | \end{itemize} 486 | \end{frame} 487 | 488 | %------------------------------------------------------------------------------- 489 | \begin{frame}[fragile] 490 | \frametitle{Structure of arrays} 491 | Switch type around to have an array per property. 492 | \begin{minted}[frame=single,linenos]{fortran} 493 | type grid 494 | real(kind=8), allocatable :: property1(:,:) 495 | real(kind=8), allocatable :: property2(:,:) 496 | real(kind=8), allocatable :: property3(:,:) 497 | real(kind=8), allocatable :: property4(:,:) 498 | end type 499 | 500 | do j = 1, ny 501 | do i = 1, nx 502 | grid%property1(i,j) = update_1() 503 | grid%property2(i,j) = update_2() 504 | grid%property3(i,j) = update_3() 505 | grid%property4(i,j) = update_4() 506 | end do 507 | end do 508 | \end{minted} 509 | \end{frame} 510 | 511 | %------------------------------------------------------------------------------- 512 | \begin{frame} 513 | \frametitle{Structure of arrays} 514 | \begin{itemize} 515 | \item Order of data in memory has changed. 516 | \item<2-> What happens when we vectorise? 
517 | \end{itemize} 518 | 519 | \begin{adjustbox}{max width={\textwidth}} 520 | \begin{tikzpicture} 521 | \draw[step=1cm] (0,0) grid (13,1); 522 | \foreach \i in {0,...,4} { 523 | \draw (\i+.5,.5) node {P1}; 524 | } 525 | \draw (5.5,.5) node {\dots}; 526 | 527 | \foreach \i in {5,...,9} { 528 | \draw (\i+1.5,.5) node {P2}; 529 | } 530 | \draw (11.5,.5) node {\dots}; 531 | 532 | \foreach \i in {10} { 533 | \draw (\i+2.5,.5) node {P3}; 534 | } 535 | 536 | \foreach \i in {0,...,3} { 537 | \draw<3->[->] (\i+.5,-1) -- (\i+.5,0); 538 | } 539 | \end{tikzpicture} 540 | \end{adjustbox} 541 | 542 | \onslide<4->{ 543 | \begin{itemize} 544 | \item Coalesced memory accesses are key for high performance code. 545 | \item Adjacent vector lanes read adjacent memory locations. 546 | \item A cache line is 64 bytes, so can fill the vector from a single cache line. 547 | \item More efficient vectorisation. 548 | \end{itemize} 549 | } 550 | \end{frame} 551 | 552 | %------------------------------------------------------------------------------- 553 | \section{Memory access patterns} 554 | \begin{frame}[fragile] 555 | \frametitle{Memory access patterns} 556 | \begin{minted}{fortran} 557 | do i = 1, N 558 | val = A(i) 559 | end do 560 | \end{minted} 561 | \begin{adjustbox}{max width={\textwidth}} 562 | \begin{tikzpicture} 563 | \draw[step=1cm] (-3,0) grid (11,1); 564 | \draw[dashed] (0,-.5) -- (0,1.5); 565 | \draw[dashed] (8,-.5) -- (8,1.5); 566 | \draw (0,-1) node {64 byte boundary}; 567 | \foreach \i in {0,...,7} { 568 | \draw[->] (\i+.5,2) -- (\i+.5,1.2); 569 | } 570 | \end{tikzpicture} 571 | \end{adjustbox} 572 | \begin{itemize} 573 | \item Ideal memory access pattern. 574 | \item All access is coalesced. 575 | \item Vectors are aligned to cache line boundary. 576 | \end{itemize} 577 | \end{frame} 578 | 579 | %------------------------------------------------------------------------------- 580 | \begin{frame}[fragile] 581 | \frametitle{Memory access patterns} 582 | \begin{minted}{fortran} 583 | do i = 1, N 584 | val = A(i+3) 585 | end do 586 | \end{minted} 587 | \begin{adjustbox}{max width={\textwidth}} 588 | \begin{tikzpicture} 589 | \draw[step=1cm] (-3,0) grid (11,1); 590 | \draw[dashed] (0,-.5) -- (0,1.5); 591 | \draw[dashed] (8,-.5) -- (8,1.5); 592 | \draw (0,-1) node {64 byte boundary}; 593 | \foreach \i in {0,...,7} { 594 | \draw[->] (\i+.5,2) -- (3+\i+.5,1.2); 595 | } 596 | \end{tikzpicture} 597 | \end{adjustbox} 598 | \begin{itemize} 599 | \item OK memory access pattern. 600 | \item All access is coalesced, but split across cache lines. 601 | \item Still get good use of cache lines, but not as efficient as aligned version. 602 | \end{itemize} 603 | \end{frame} 604 | 605 | %------------------------------------------------------------------------------- 606 | \begin{frame}[fragile] 607 | \frametitle{Memory access patterns} 608 | \begin{minted}{fortran} 609 | do i = 1, N 610 | val = A(j,i) ! equiv. A(j+3*i) 611 | end do 612 | \end{minted} 613 | \begin{adjustbox}{max width={\textwidth}} 614 | \begin{tikzpicture} 615 | \draw[step=1cm] (-3,0) grid (11,1); 616 | \draw[dashed] (0,-.5) -- (0,1.5); 617 | \draw[dashed] (8,-.5) -- (8,1.5); 618 | \draw (0,-1) node {64 byte boundary}; 619 | \foreach \i in {0,...,3} { 620 | \draw[->] (\i+.5,2) -- (3*\i+.5,1.2); 621 | } 622 | \end{tikzpicture} 623 | \end{adjustbox} 624 | \begin{itemize} 625 | \item Strided access results in multiple memory transactions. 626 | \item Kills throughput due to poor reuse of cached data. 
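\item Remember Fortran arrays are column major: the \emph{first} index is contiguous in memory, so it should normally be the innermost loop index.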
627 | \item Very easy to fall into this trap with multi-dimensional arrays. 628 | \item Check your strides! 629 | \end{itemize} 630 | \end{frame} 631 | 632 | %------------------------------------------------------------------------------- 633 | \begin{frame}[fragile] 634 | \frametitle{Memory access patterns} 635 | \begin{minted}{fortran} 636 | do i = 1, N 637 | val = A(B(i)) 638 | end do 639 | \end{minted} 640 | \begin{adjustbox}{max width={\textwidth}} 641 | \begin{tikzpicture} 642 | \draw[step=1cm] (-3,0) grid (11,1); 643 | \draw[dashed] (0,-.5) -- (0,1.5); 644 | \draw[dashed] (8,-.5) -- (8,1.5); 645 | \draw (0,-1) node {64 byte boundary}; 646 | \draw[->] (0.5,2) -- (-3.5,1.2); 647 | \draw[->] (1.5,2) -- (3.5,1.2); 648 | \draw[->] (2.5,2) -- (0.5,1.2); 649 | \draw[->] (3.5,2) -- (8.5,1.2); 650 | \draw[->] (4.5,2) -- (-1.5,1.2); 651 | \draw[->] (5.5,2) -- (7.5,1.2); 652 | \draw[->] (6.5,2) -- (1.5,1.2); 653 | \draw[->] (7.5,2) -- (-2.5,1.2); 654 | \end{tikzpicture} 655 | \end{adjustbox} 656 | \begin{itemize} 657 | \item Essentially random access to memory. 658 | \item Little reuse of cache lines. 659 | \item Unpredictable pattern, so hardware prefetchers won't work efficiently. 660 | \item Very challenging! 661 | \end{itemize} 662 | \end{frame} 663 | 664 | %------------------------------------------------------------------------------- 665 | \section{Alignment} 666 | \begin{frame} 667 | \frametitle{Alignment} 668 | \begin{itemize} 669 | \item If we can align arrays, we get better vectorisation; specifically load/stores are faster. 670 | \begin{itemize} 671 | \item Guarantee only one cache line needs updating and not split between two cache lines. 672 | \end{itemize} 673 | \item Taking advantage of alignment is a two stage process: 674 | \begin{enumerate} 675 | \item Align the memory on allocation. 676 | \item Tell the compiler the access is aligned. 677 | \end{enumerate} 678 | \item Aligned allocations in Fortran are (currently) unfortunately vendor specific. 679 | \item OpenMP can help with telling the compiler the data is aligned. 680 | \item Aligned allocations due in OpenMP 5.0. 681 | \end{itemize} 682 | \end{frame} 683 | 684 | %------------------------------------------------------------------------------- 685 | \begin{frame}[fragile] 686 | \frametitle{Step 1: Aligning allocations} 687 | Generally focus on the Intel compiler. 688 | Only need to use one of these methods, whichever is most convenient. 
689 | \begin{itemize} 690 | \item Align all allocations of arrays (not in derived types) with compiler flag: \mintinline{bash}|-align array64byte| 691 | \item Use an Intel compiler directive on array definition: 692 | \begin{minted}[frame=single,fontsize=\small]{fortran} 693 | real(kind=8), allocatable :: A(:,:) 694 | !dir$ attributes align:64 :: A 695 | \end{minted} 696 | \item Allocate memory in C, and convert to Fortran \mintinline{fortran}|pointer|: 697 | \begin{minted}[frame=single,breaklines,fontsize=\small]{c} 698 | double * alloc(int *len) { 699 | return (double *)aligned_alloc(64, sizeof(double)*(*len)); 700 | } 701 | \end{minted} 702 | \begin{minted}[frame=single,fontsize=\small]{fortran} 703 | real(kind=8), pointer :: A(:,:) 704 | type(c_ptr) :: A_ptr 705 | A_ptr = alloc(nx*ny) 706 | call c_f_pointer(A_ptr, A, (/ nx, ny/)) 707 | \end{minted} 708 | \end{itemize} 709 | \end{frame} 710 | 711 | %------------------------------------------------------------------------------- 712 | \begin{frame}[fragile] 713 | \frametitle{Step 2: Telling the compiler} 714 | \begin{itemize} 715 | \item Use OpenMP \mintinline{fortran}|simd aligned| clause: 716 | \begin{minted}[frame=single,fontsize=\small]{fortran} 717 | !$omp simd aligned(A:64) 718 | do i = 1, nx 719 | A(i,j) = A(i,j) + 1.0 720 | end do 721 | !$omp end simd 722 | \end{minted} 723 | \pause 724 | \item Unfortunately often not sufficient. 725 | \item Often need to use Intel specific directives to say loop extent is divisible by vector length. 726 | \begin{minted}[frame=single,fontsize=\small]{fortran} 727 | ! 64 byte aligned / 8 byte data type means mod 8 728 | !dir$ assume(mod(nx,8) .eq. 0) 729 | !$omp simd aligned(A:64) 730 | do i = 1, nx 731 | A(i,j) = A(i,j) + 1.0 732 | end do 733 | !$omp end simd 734 | \end{minted} 735 | \item Check the compiler report for aligned and unaligned access. 736 | \end{itemize} 737 | \end{frame} 738 | 739 | %------------------------------------------------------------------------------- 740 | \begin{frame}[fragile] 741 | \frametitle{Aligning 2D arrays} 742 | \begin{itemize} 743 | \item Aligning the memory only aligns the first entry. 744 | \item Multiples of the alignment factor will also be aligned. 745 | \item With 2D arrays you need to double check that access can be aligned. 746 | \item Example: 10-by-10 grid of FP64 numbers, aligned to 64 byte cache line: 747 | \end{itemize} 748 | 749 | \begin{adjustbox}{max width={\textwidth}} 750 | \begin{tikzpicture} 751 | \draw[step=1cm] (0,0) grid (17,1); 752 | \foreach \i in {0,8,16} { 753 | \draw[dashed] (\i,-.5) -- (\i,1.5); 754 | } 755 | \foreach \i in {1,...,10} { 756 | \draw (\i-0.5, 0.5) node {(\i,1)}; 757 | } 758 | \foreach \i in {1,...,7} { 759 | \draw (10+\i-0.5, 0.5) node {(\i,2)}; 760 | } 761 | \end{tikzpicture} 762 | \end{adjustbox} 763 | 764 | \begin{minted}[frame=single]{fortran} 765 | do j = 1, 10 766 | !$omp simd aligned(A:64) 767 | do i = 1, 10 768 | A(i,j) = A(i,j) + ... 
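  ! Note: with 10 FP64 elements per column, A(1,j) is only 64-byte
  ! aligned when (j-1)*80 bytes is a multiple of 64, i.e. j = 1, 5, 9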
769 | end do 770 | !$omp end simd 771 | end do 772 | \end{minted} 773 | 774 | \end{frame} 775 | %------------------------------------------------------------------------------- 776 | 777 | \begin{frame} 778 | \frametitle{Aligning 2D arrays} 779 | 780 | \begin{adjustbox}{max width={\textwidth}} 781 | \begin{tikzpicture} 782 | \draw[step=1cm] (0,0) grid (17,1); 783 | \foreach \i in {0,8,16} { 784 | \draw[dashed] (\i,-.5) -- (\i,1.5); 785 | } 786 | \foreach \i in {1,...,10} { 787 | \draw (\i-0.5, 0.5) node {(\i,1)}; 788 | } 789 | \foreach \i in {1,...,7} { 790 | \draw (10+\i-0.5, 0.5) node {(\i,2)}; 791 | } 792 | 793 | \draw<2>[red, very thick] (0.1,0.1) rectangle (3.9, 0.9); 794 | \draw<3>[red, very thick] (10.1,0.1) rectangle (13.9, 0.9); 795 | \end{tikzpicture} 796 | \end{adjustbox} 797 | 798 | \begin{itemize} 799 | \item The array is aligned to a 64-byte cache line. 800 | \item<2-> Accessing the vector \mintinline{fortran}|A(1:4,1)| is aligned. 801 | \item<3-> Accessing the vector \mintinline{fortran}|A(1:4,2)| is \emph{not} aligned. 802 | \vfill 803 | \item<4-> Need the inner stride to be a multiple of the alignment, and need to tell the compiler this is true (previous slide). 804 | \item<4-> Solution: pad the array, but beware of memory footprint. 805 | \item<4-> Example of why the \mintinline{fortran}|aligned| clause doesn't always ensure aligned load/stores. 806 | \end{itemize} 807 | 808 | \end{frame} 809 | %------------------------------------------------------------------------------- 810 | 811 | \section{Branches} 812 | \begin{frame} 813 | \frametitle{Branches} 814 | \begin{itemize} 815 | \item CPUs support speculative execution, GPUs tend not to. 816 | \item Branch instructions have high latency. 817 | \item GPUs hide this latency by fast context switching, CPUs by good branch predictors. 818 | \item In both cases, divergent execution within the vector unit reduces performance. 819 | \item Can use predication, selection and masking to convert conditional control flow into straight line code. 820 | \end{itemize} 821 | \end{frame} 822 | 823 | %------------------------------------------------------------------------------- 824 | \begin{frame}[fragile] 825 | \frametitle{Removing branches} 826 | \begin{columns} 827 | 828 | \begin{column}{0.5\textwidth} 829 | Conditional execution 830 | \begin{itemize} 831 | \item Only evaluate expression if condition is met 832 | \end{itemize} 833 | \begin{minted}[frame=single]{fortran} 834 | if (a .gt. b) then 835 | acc = acc + (a - b*c) 836 | end if 837 | \end{minted} 838 | 839 | \begin{minted}[frame=single]{C} 840 | if (a > b) 841 | acc += a - b*c; 842 | \end{minted} 843 | \end{column} 844 | 845 | \begin{column}{0.5\textwidth} 846 | Selection and masking 847 | \begin{itemize} 848 | \item Always evaluate expression and mask result 849 | \end{itemize} 850 | \begin{minted}[frame=single,breaklines]{fortran} 851 | temp = a - b*c 852 | mask = merge(1.0, 0.0, a .gt. b) 853 | acc = acc + (mask * temp) 854 | \end{minted} 855 | 856 | \begin{minted}[frame=single]{C} 857 | temp = a - b*c; 858 | mask = a > b ? 1.0 : 0.0; 859 | acc += mask * temp; 860 | \end{minted} 861 | \end{column} 862 | 863 | \end{columns} 864 | In practice, you may or may not see an improvement: the compiler may be doing something smart already. 
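Check the vectorisation report (e.g. \mintinline{bash}|-qopt-report=5| with Intel or \mintinline{bash}|-Rpass=loop-vectorize| with Clang, as listed earlier) to see whether the branch has already been converted into masked vector code.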
865 | \end{frame}
866 | 
867 | %-------------------------------------------------------------------------------
868 | \section{Exercise}
869 | \begin{frame}
870 | \frametitle{Exercise}
871 | \begin{itemize}
872 | \item Take your parallel 5-point stencil code and optimise it.
873 | \item Think about:
874 | \begin{itemize}
875 | \item Memory access patterns
876 | \item Vectorisation
877 | \end{itemize}
878 | \item Note down the performance differences your optimisations make.
879 | \item Calculate the achieved memory bandwidth of your stencil code (bytes read and written per iteration $\times$ number of iterations $\div$ runtime).
880 | \item Extension: consider these optimisations for the Jacobi solver.
881 | \end{itemize}
882 | \end{frame}
883 | 
884 | %-------------------------------------------------------------------------------
885 | \section{Summary}
886 | \begin{frame}
887 | \frametitle{Summary}
888 | 
889 | \begin{itemize}
890 | \item Performance of the cache hierarchy.
891 | \item Performance analysis with the Roofline model.
892 | \item Vectorisation:
893 | \begin{itemize}
894 | \item Compiler auto-vectorisation.
895 | \item OpenMP \mintinline{fortran}|simd| construct.
896 | \item Memory access patterns.
897 | \item Data alignment.
898 | \end{itemize}
899 | 
900 | \vfill
901 | 
902 | \item Next sessions:
903 | \begin{enumerate}
904 | \setcounter{enumi}{3}
905 | \item NUMA and MPI interoperability.
906 | \item GPU programming with OpenMP.
907 | \item Tasks and Tools.
908 | \end{enumerate}
909 | \end{itemize}
910 | 
911 | 
912 | \end{frame}
913 | 
914 | %-------------------------------------------------------------------------------
915 | 
916 | \end{document}
917 | 
--------------------------------------------------------------------------------