├── nways_cfd ├── English │ ├── C │ │ ├── source_code │ │ │ ├── cuda-c │ │ │ │ └── README │ │ │ ├── openmp │ │ │ │ └── README │ │ │ ├── stdpar │ │ │ │ └── README │ │ │ ├── openacc │ │ │ │ └── README │ │ │ └── serial │ │ │ │ ├── arraymalloc.h │ │ │ │ ├── boundary.h │ │ │ │ ├── cfdio.h │ │ │ │ ├── jacobi.h │ │ │ │ ├── arraymalloc.cpp │ │ │ │ ├── compare.py │ │ │ │ ├── boundary.cpp │ │ │ │ ├── jacobi.cpp │ │ │ │ ├── Makefile │ │ │ │ ├── cfdio.cpp │ │ │ │ └── cfd.cpp │ │ └── jupyter_notebook │ │ │ ├── images │ │ │ ├── cfd_flow.png │ │ │ └── Optimization_Cycle.jpg │ │ │ ├── minicfd_cudac.ipynb │ │ │ ├── .ipynb_checkpoints │ │ │ ├── minicfd_cudac-checkpoint.ipynb │ │ │ ├── minicfd_stdpar-checkpoint.ipynb │ │ │ ├── minicfd_openmp-checkpoint.ipynb │ │ │ ├── minicfd_openacc-checkpoint.ipynb │ │ │ └── minicfd-checkpoint.ipynb │ │ │ ├── minicfd_stdpar.ipynb │ │ │ ├── minicfd_openmp.ipynb │ │ │ ├── minicfd_openacc.ipynb │ │ │ └── minicfd.ipynb │ ├── Python │ │ ├── source_code │ │ │ ├── cupy │ │ │ │ └── README │ │ │ ├── numba │ │ │ │ └── README │ │ │ └── serial │ │ │ │ └── cfd.py │ │ └── jupyter_notebook │ │ │ ├── images │ │ │ ├── cfd_flow.png │ │ │ └── Optimization_Cycle.jpg │ │ │ ├── minicfd_cupy.ipynb │ │ │ ├── minicfd_numba.ipynb │ │ │ └── minicfd.ipynb │ ├── Fortran │ │ ├── source_code │ │ │ ├── openmp │ │ │ │ └── README │ │ │ ├── cudafortran │ │ │ │ └── README │ │ │ ├── doconcurrent │ │ │ │ └── README │ │ │ ├── openacc │ │ │ │ └── README │ │ │ └── serial │ │ │ │ ├── Makefile │ │ │ │ ├── boundary.f90 │ │ │ │ ├── jacobi.f90 │ │ │ │ ├── cfdio.f90 │ │ │ │ └── cfd.f90 │ │ └── jupyter_notebook │ │ │ ├── images │ │ │ ├── cfd_flow.png │ │ │ └── Optimization_Cycle.jpg │ │ │ ├── minicfd_cudafortran.ipynb │ │ │ ├── minicfd_do_concurrent.ipynb │ │ │ ├── minicfd_openmp.ipynb │ │ │ ├── minicfd_openacc.ipynb │ │ │ └── minicfd.ipynb │ └── minicfd.ipynb ├── Dockerfile ├── Singularity ├── Dockerfile_python ├── Singularity_python └── README.md ├── README.md └── LICENSE /nways_cfd/English/C/source_code/cuda-c/README: -------------------------------------------------------------------------------- 1 | CUDA C folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/openmp/README: -------------------------------------------------------------------------------- 1 | OpenMP folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/stdpar/README: -------------------------------------------------------------------------------- 1 | STDPAR folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/source_code/cupy/README: -------------------------------------------------------------------------------- 1 | CuPy folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/source_code/numba/README: -------------------------------------------------------------------------------- 1 | Numba folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/openacc/README: -------------------------------------------------------------------------------- 1 | OpenACC folder 2 | 3 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/openmp/README: -------------------------------------------------------------------------------- 1 | OpenMP folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/cudafortran/README: -------------------------------------------------------------------------------- 1 | CUDA C folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/doconcurrent/README: -------------------------------------------------------------------------------- 1 | STDPAR folder 2 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/openacc/README: -------------------------------------------------------------------------------- 1 | OpenACC folder 2 | 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | This private repository consists of all challenges given as part of the HPC Bootcamp. 3 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/arraymalloc.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void **arraymalloc2d(int nx, int ny, size_t typesize); 4 | 5 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/boundary.h: -------------------------------------------------------------------------------- 1 | void boundarypsi(double *psi, int m, int n, int b, int h, int w); 2 | void boundaryzet(double *zet, double *psi, int m, int n); 3 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/images/cfd_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/C/jupyter_notebook/images/cfd_flow.png -------------------------------------------------------------------------------- /nways_cfd/English/Python/jupyter_notebook/images/cfd_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Python/jupyter_notebook/images/cfd_flow.png -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/images/cfd_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Fortran/jupyter_notebook/images/cfd_flow.png -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/images/Optimization_Cycle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/C/jupyter_notebook/images/Optimization_Cycle.jpg -------------------------------------------------------------------------------- /nways_cfd/English/Python/jupyter_notebook/images/Optimization_Cycle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Python/jupyter_notebook/images/Optimization_Cycle.jpg -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/images/Optimization_Cycle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Fortran/jupyter_notebook/images/Optimization_Cycle.jpg -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/cfdio.h: -------------------------------------------------------------------------------- 1 | void writedatafiles(double *psi, int m, int n, int scale); 2 | 3 | void writeplotfile(int m, int n, int scale); 4 | 5 | void hue2rgb(double hue, int *r, int *g, int *b); 6 | 7 | double colfunc(double x); 8 | 9 | double gettime(void); 10 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/jacobi.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void jacobistep(double *psinew, double *psi, int m, int n); 4 | 5 | void jacobistepvort(double *zetnew, double *psinew, 6 | double *zet, double* psi, 7 | int m, int n, double re); 8 | 9 | double deltasq(double *newarr, double *oldarr, int m, int n); 10 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/arraymalloc.cpp: -------------------------------------------------------------------------------- 1 | #include "arraymalloc.h" 2 | #include 3 | 4 | void **arraymalloc2d(int nx, int ny, size_t typesize) 5 | { 6 | int i; 7 | void **array2d; 8 | 9 | size_t mallocsize; 10 | 11 | // total memory requirements including pointers 12 | 13 | mallocsize = nx*sizeof(void *) + nx*ny*typesize; 14 | 15 | array2d = (void **) malloc(mallocsize); 16 | 17 | // set first pointer to first element of data 18 | 19 | array2d[0] = (void *) (array2d + nx); 20 | 21 | for(i=1; i < nx; i++) 22 | { 23 | // set other pointers to point at subsequent rows 24 | 25 | array2d[i] = (void *) (((char *) array2d[i-1]) + ny*typesize); 26 | } 27 | 28 | return array2d; 29 | } 30 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/serial/Makefile: -------------------------------------------------------------------------------- 1 | # System dependent definitions 2 | 3 | FC= nvfortran 4 | FFLAGS= -O3 5 | LFLAGS= -lnvhpcwrapnvtx 6 | 7 | # System independent definitions 8 | 9 | MF= Makefile 10 | 11 | EXE= cfd 12 | 13 | SRC= \ 14 | boundary.f90 \ 15 | cfd.f90 \ 16 | cfdio.f90 \ 17 | jacobi.f90 18 | 19 | # 20 | # No need to edit below this line 21 | # 22 | 23 | .SUFFIXES: 24 | .SUFFIXES: .f90 .o 25 | 26 | OBJ= $(SRC:.f90=.o) 27 | 28 | .f90.o: 29 | $(FC) $(FFLAGS) -c $< 30 | 31 | all: $(EXE) 32 | 33 | $(EXE): $(OBJ) 34 | $(FC) $(FFLAGS) -o $@ $(OBJ) $(LFLAGS) 35 | 36 | $(OBJ): $(MF) 37 | 38 | cfd.o: boundary.o jacobi.o cfdio.o 39 | 40 | tar: 41 | tar cvf cfd.tar $(MF) $(INC) $(SRC) 42 | 43 | clean: 44 | rm -f $(OBJ) $(EXE) *.mod velocity.dat colourmap.dat cfd.plt core 45 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/compare.py: -------------------------------------------------------------------------------- 1 | # If you want to use this file to compare outputs, please put the output data in the correct locations. Otherwise, please compare the "error" value against the given slides. 2 | # Make sure to copy the "velocity.dat" and rename it to "orig_velocity". 3 | # To run: python3 compare.py 4 | 5 | import numpy as np 6 | 7 | # Original output data file 8 | orig_file = "orig_velocity.dat" 9 | new_file = "new_velocity.dat" 10 | 11 | orig_data = np.loadtxt(orig_file,delimiter = ' ') 12 | new_data = np.loadtxt(new_file,delimiter = ' ') 13 | 14 | 15 | diff_data = new_data - orig_data 16 | 17 | print("shape of orig_data:",orig_data.shape) 18 | print("shape of new_data:",new_data.shape) 19 | print("shape of diff_data:",diff_data.shape) 20 | 21 | maxError = np.amax(diff_data) 22 | 23 | print("shape of maxError:",maxError.shape) 24 | 25 | maxError_exp = "{:e}".format(maxError) 26 | 27 | print('Max Error is : ', maxError_exp) 28 | -------------------------------------------------------------------------------- /nways_cfd/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 2 | 3 | # To build the docker container, run: $ sudo docker build -f nways_Dockerfile -t nways:cf . 4 | # To run: $ sudo docker run --rm -it --runtime nvidia -p 8888:8888 nways:cf 5 | # Finally, open http://localhost:8888/ 6 | 7 | FROM nvcr.io/nvidia/nvhpc:24.1-devel-cuda_multi-ubuntu22.04 8 | 9 | RUN apt-get -y update && \ 10 | DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \ 11 | rm -rf /var/lib/apt/lists/* && \ 12 | pip3 install --upgrade pip &&\ 13 | pip3 install numpy &&\ 14 | pip3 install jupyterlab &&\ 15 | pip3 install ipywidgets &&\ 16 | pip3 install gdown 17 | 18 | ############################################ 19 | 20 | # TO COPY the data 21 | COPY English/ /labs/ 22 | 23 | ################################################# 24 | ENV PATH="/usr/local/bin:/opt/anaconda3/bin:/usr/bin:$PATH" 25 | ################################################# 26 | 27 | WORKDIR /labs 28 | CMD jupyter-lab --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs 29 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/boundary.cpp: -------------------------------------------------------------------------------- 1 | #include "boundary.h" 2 | #include 3 | 4 | //grid is parallelised in the x direction 5 | 6 | void boundarypsi(double *psi, int m, int n, int b, int h, int w) 7 | { 8 | 9 | int i,j; 10 | 11 | //BCs on bottom edge 12 | 13 | for (i=b+1;i<=b+w-1;i++) 14 | { 15 | psi[i*(m+2)+0] = (double)(i-b); 16 | } 17 | 18 | for (i=b+w;i<=m;i++) 19 | { 20 | psi[i*(m+2)+0] = (double)(w); 21 | } 22 | 23 | //BCS on RHS 24 | 25 | for (j=1; j <= h; j++) 26 | { 27 | psi[(m+1)*(m+2)+j] = (double) w; 28 | } 29 | 30 | for (j=h+1;j<=h+w-1; j++) 31 | { 32 | psi[(m+1)*(m+2)+j]=(double)(w-j+h); 33 | } 34 | } 35 | 36 | void boundaryzet(double *zet, double *psi, int m, int n) 37 | { 38 | int i,j; 39 | 40 | //set top/bottom BCs: 41 | 42 | for (i=1;i 2 | 3 | #include "jacobi.h" 4 | 5 | void jacobistep(double *psinew, double *psi, int m, int n) 6 | { 7 | int i, j; 8 | 9 | 10 | for(i=1;i<=m;i++) 11 | { 12 | for(j=1;j<=n;j++) 13 | { 14 | psinew[i*(m+2)+j]=0.25*(psi[(i-1)*(m+2)+j]+psi[(i+1)*(m+2)+j]+psi[i*(m+2)+j-1]+psi[i*(m+2)+j+1]); 15 | } 16 | } 17 | 18 | } 19 | 20 | void jacobistepvort(double *zetnew, double *psinew, 21 | double *zet, double *psi, 22 | int m, int n, double re) 23 | { 24 | int i, j; 25 | 26 | for(i=1;i<=m;i++) 27 | { 28 | for(j=1;j<=n;j++) 29 | { 30 | psinew[i*(m+2)+j]=0.25*( psi[(i-1)*(m+2)+j]+psi[(i+1)*(m+2)+j]+psi[i*(m+2)+j-1]+psi[i*(m+2)+j+1] 31 | - zet[i*(m+2)+j] ); 32 | } 33 | } 34 | 35 | for(i=1;i<=m;i++) 36 | { 37 | for(j=1;j<=n;j++) 38 | { 39 | zetnew[i*(m+2)+j]=0.25*(zet[(i-1)*(m+2)+j]+zet[(i+1)*(m+2)+j]+zet[i*(m+2)+j-1]+zet[i*(m+2)+j+1]) 40 | - re/16.0*( 41 | ( psi[i*(m+2)+j+1]-psi[i*(m+2)+j-1])*(zet[(i+1)*(m+2)+j]-zet[(i-1)*(m+2)+j]) 42 | - (psi[(i+1)*(m+2)+j]-psi[(i-1)*(m+2)+j])*(zet[i*(m+2)+j+1]-zet[i*(m+2)+j-1]) 43 | ); 44 | } 45 | } 46 | } 47 | 48 | double deltasq(double *newarr, double *oldarr, int m, int n) 49 | { 50 | int i, j; 51 | 52 | double dsq=0.0; 53 | double tmp; 54 | 55 | for(i=1;i<=m;i++) 56 | { 57 | for(j=1;j<=n;j++) 58 | { 59 | tmp = newarr[i*(m+2)+j]-oldarr[i*(m+2)+j]; 60 | dsq += tmp*tmp; 61 | } 62 | } 63 | 64 | return dsq; 65 | } 66 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 NVIDIA Corporation. All rights reserved. 2 | 3 | # HINT 1) comment out the CFLAGS line when you are compiling the parallel code as we no longer need to add the path to the location of nvtx 4 | 5 | # HINT 2) For the CUDA C version, remember to add -arch=native compiler option to use the default architecture on the system you are running. 6 | 7 | # HINT 3) For the CUDA C version, make sure the SUFFIXES and CC are correct 8 | 9 | CC= nvc++ 10 | CFLAGS := -O3 -w -ldl 11 | ACCFLAGS := -Minfo=accel 12 | VER=$(shell nvc -dumpversion) 13 | NVARCH=$(shell uname -s)_$(shell uname -m) 14 | CFLAGS= -I/opt/nvidia/hpc_sdk/$(NVARCH)/$(VER)/cuda/include 15 | LFLAGS= 16 | 17 | 18 | 19 | # System independent definitions 20 | 21 | MF= Makefile 22 | 23 | EXE= cfd 24 | 25 | INC= \ 26 | arraymalloc.h \ 27 | boundary.h \ 28 | cfdio.h \ 29 | jacobi.h 30 | 31 | # Replace/Add .cu files to the below and make the change for the rest of the locations. 32 | SRC= \ 33 | arraymalloc.cpp \ 34 | boundary.cpp \ 35 | cfd.cpp \ 36 | cfdio.cpp \ 37 | jacobi.cpp 38 | 39 | # 40 | # Make the necessary changes (hint: replace .cpp with .cu so the "make clean" does not remove them) 41 | # 42 | 43 | .SUFFIXES: 44 | .SUFFIXES: .cpp .o 45 | 46 | OBJ= $(SRC:.cpp=.o) 47 | 48 | .cpp.o: 49 | $(CC) $(CFLAGS) -c $< 50 | 51 | all: $(EXE) 52 | 53 | $(OBJ): $(INC) 54 | 55 | $(EXE): $(OBJ) 56 | $(CC) $(CFLAGS) -o $@ $(OBJ) $(LFLAGS) 57 | 58 | $(OBJ): $(MF) 59 | 60 | tar: 61 | tar cvf cfd.tar $(MF) $(INC) $(SRC) 62 | 63 | clean: 64 | rm -f $(OBJ) $(EXE) velocity.dat colourmap.dat cfd.plt core 65 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/serial/jacobi.f90: -------------------------------------------------------------------------------- 1 | module jacobi 2 | 3 | implicit none 4 | 5 | contains 6 | 7 | subroutine jacobistep(psinew, psi, m, n) 8 | 9 | integer :: m, n 10 | double precision, dimension(0:m+1, 0:n+1) :: psinew, psi 11 | 12 | psinew(1:m, 1:n) = 0.25d0*(psi(2:m+1, 1:n) + psi(0:m-1, 1:n) + & 13 | psi(1:m, 2:n+1) + psi(1:m, 0:n-1) ) 14 | 15 | end subroutine jacobistep 16 | 17 | subroutine jacobistepvort(zetnew, psinew, zet, psi, m, n, re) 18 | 19 | integer :: m, n 20 | double precision :: re 21 | double precision, dimension(0:m+1, 0:n+1) :: zetnew, zet, psinew, psi 22 | 23 | psinew(1:m, 1:n) = 0.25d0*(psi(2:m+1, 1:n) + psi(0:m-1, 1:n) + & 24 | psi(1:m, 2:n+1) + psi(1:m, 0:n-1) - & 25 | zet(1:m, 1:n)) 26 | 27 | zetnew(1:m, 1:n) = 0.25d0*(zet(2:m+1, 1:n) + zet(0:m-1, 1:n) + & 28 | zet(1:m, 2:n+1) + zet(1:m, 0:n-1) ) - & 29 | re/16.0*((psi(1:m, 2:n+1) - psi(1:m, 0:n-1)) * & 30 | (zet(2:m+1, 1:n) - zet(0:m-1, 1:n)) - & 31 | (psi(2:m+1, 1:n) - psi(0:m-1, 1:n)) * & 32 | (zet(1:m, 2:n+1) - zet(1:m, 0:n-1)) ) 33 | 34 | end subroutine jacobistepvort 35 | 36 | double precision function deltasq(new, old, m, n) 37 | 38 | integer :: m, n 39 | double precision, dimension(0:m+1, 0:n+1) :: new, old 40 | 41 | integer :: ierr 42 | 43 | deltasq = sum((new(1:m,1:n)-old(1:m,1:n))**2) 44 | 45 | end function deltasq 46 | 47 | end module jacobi 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /nways_cfd/Dockerfile_python: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 3 | 4 | # To build the docker container, run: $ sudo docker build -t nways-labs:latest . 5 | # To run: $ sudo docker run --rm -it --runtime nvidia -p 8888:8888 nways-labs:latest 6 | # Finally, open http://localhost:8888/ 7 | 8 | #FROM nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04 9 | FROM nvidia/cuda:11.4.2-devel-ubuntu20.04 10 | 11 | ##### 12 | # Read https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772 13 | RUN apt-key del 7fa2af80 14 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub 15 | ##### 16 | 17 | RUN apt-get -y update && \ 18 | DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \ 19 | python3-dev \ 20 | python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | RUN pip3 install --no-cache-dir -U install setuptools pip 24 | RUN pip3 install gdown 25 | RUN apt-get update -y 26 | RUN apt-get install -y git nvidia-modprobe 27 | # Install required python packages 28 | RUN pip3 install jupyterlab 29 | RUN pip3 install ipywidgets 30 | RUN pip3 install --upgrade numpy==1.21.1 31 | RUN pip3 install --no-cache-dir "cupy-cuda114==10.3.1" \ 32 | numba==0.53.1 scipy 33 | 34 | 35 | ############################################ 36 | # NVIDIA nsight-systems-cli-2022.1.1, nsight-compute-2022.1.1 37 | RUN apt-get update -y && \ 38 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 39 | apt-transport-https \ 40 | ca-certificates \ 41 | gnupg \ 42 | wget && \ 43 | #apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 && \ 44 | wget -qO - https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/nvidia.pub | apt-key add - \ 45 | echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list &&\ 46 | apt-get update -y 47 | 48 | 49 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-cli-2022.1.1 nsight-compute-2022.1.1 50 | 51 | # TO COPY the data 52 | COPY English/ /labs/ 53 | 54 | 55 | ################################################# 56 | ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.8/dist-packages:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 57 | ENV PATH="/opt/nvidia/nsight-systems/2022.1.1/bin:/opt/nvidia/nsight-compute/2022.1.1:/usr/local/bin:/bin:/usr/local/cuda/bin:/usr/bin${PATH:+:${PATH}}" 58 | 59 | 60 | WORKDIR /labs 61 | CMD service nginx start && jupyter-lab --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs 62 | -------------------------------------------------------------------------------- /nways_cfd/Singularity_python: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 2 | 3 | Bootstrap: docker 4 | #FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04 5 | FROM: nvidia/cuda:11.4.2-devel-ubuntu20.04 6 | 7 | %environment 8 | export XDG_RUNTIME_DIR= 9 | export PATH="$PATH:/usr/local/bin:/usr/bin" 10 | export PATH=/opt/nvidia/nsight-systems/2022.1.1/bin:/opt/nvidia/nsight-compute/2022.1.1:/bin:/usr/local/cuda/bin$PATH 11 | export LD_LIBRARY_PATH="/usr/include/python3.8:/usr/local/lib:/usr/local/lib/python3.8/dist-packages:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" 12 | 13 | 14 | %post 15 | build_tmp=$(mktemp -d) && cd ${build_tmp} 16 | 17 | ##### 18 | # Read https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772 19 | apt-key del 7fa2af80 20 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub 21 | ##### 22 | 23 | apt-get -y update 24 | apt-get -y dist-upgrade 25 | DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-dev \ 26 | m4 vim-nox emacs-nox nano zip \ 27 | python3-pip python3-setuptools nginx zip make build-essential libtbb-dev 28 | rm -rf /var/lib/apt/cache/* 29 | pip3 install --no-cache-dir -U install setuptools pip 30 | apt-get -y update 31 | apt-get -y install git nvidia-modprobe 32 | pip3 install 'chardet>=3.0.2,<3.1.0' 'idna>=2.5,<2.8' 'urllib3>=1.21.1,<1.24' 'certifi>=2017.4.17' 33 | pip3 install jupyterlab 34 | pip3 install ipywidgets 35 | pip3 install gdown 36 | pip3 install --upgrade numpy==1.21.1 37 | pip3 install --no-cache-dir "cupy-cuda114==10.3.1" \ 38 | numba==0.53.1 scipy 39 | 40 | 41 | apt-get install --no-install-recommends -y build-essential 42 | 43 | 44 | # NVIDIA nsight-systems-cli-2022.1.1, nsight-compute-2022.1.1 45 | apt-get update -y 46 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget 47 | # apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 48 | wget -qO - https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/nvidia.pub | apt-key add - 49 | echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list 50 | apt-get update -y 51 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-cli-2022.1.1 nsight-compute-2022.1.1 52 | #rm -rf /var/lib/apt/lists/* 53 | 54 | 55 | apt-get install --no-install-recommends -y build-essential 56 | 57 | 58 | cd / 59 | rm -rf ${build_tmp} 60 | 61 | %files 62 | English/ /labs 63 | %runscript 64 | "$@" 65 | 66 | %labels 67 | AUTHOR Tosin 68 | -------------------------------------------------------------------------------- /nways_cfd/English/minicfd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## CFD Simulation\n", 8 | "In this bootcamp we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n", 9 | "\n", 10 | "### Learning objectives\n", 11 | "Learn how to write a portable parallel program that can run on multicore CPUs and accelerators like GPUs using OpenACC, OpenMP,std::par, CuPy, and Numba. Also learn how to optimize using lower level constructs using languages like CUDA-C. \n", 12 | "\n", 13 | "### Bootcamp Duration\n", 14 | "The lab material is a challenge where the particiapnts will accelerate the application using mutiple approaches to GPU programming.\n", 15 | "\n", 16 | "### Content Level\n", 17 | "Beginner, Intermediate\n", 18 | "\n", 19 | "### Target Audience and Prerequisites\n", 20 | " The target audience for this tutorial is researchers, graduate students and developers who are interested in\n", 21 | "harnessing the power of GPUs to accelerate their scientific applications, and evaluate which programming approach best suites their needs. \n", 22 | "\n", 23 | "\n", 24 | "### Start Here\n", 25 | "You can choose any of the following:\n", 26 | "\n", 27 | "- [C-based code](C/jupyter_notebook/minicfd.ipynb)\n", 28 | "- [Fortran-version](Fortran/jupyter_notebook/minicfd.ipynb)\n", 29 | "- [Python-based](Python/jupyter_notebook/minicfd.ipynb)\n", 30 | "\n", 31 | "\n", 32 | "--- \n", 33 | "\n", 34 | "## Links and Resources\n", 35 | "\n", 36 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 37 | "\n", 38 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 39 | "\n", 40 | "--- \n", 41 | "\n", 42 | "## Licensing \n", 43 | "\n", 44 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "anaconda-cloud": {}, 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.7.4" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 4 70 | } 71 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/cfdio.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "cfdio.h" 6 | #include "arraymalloc.h" 7 | 8 | void writedatafiles(double *psi, int m, int n, int scale) 9 | { 10 | typedef double Vecvel[2]; 11 | typedef int Vecrgb[3]; 12 | 13 | Vecvel **vel; 14 | Vecrgb **rgb; 15 | 16 | FILE *cfile, *vfile; 17 | 18 | double modvsq, hue; 19 | int i,j, ix, iy; 20 | int nvel, nrgb; 21 | 22 | printf("\n\nWriting data files ...\n"); 23 | 24 | vel = (Vecvel **) arraymalloc2d(m,n,sizeof(Vecvel)); 25 | rgb = (Vecrgb **) arraymalloc2d(m,n,sizeof(Vecrgb)); 26 | 27 | //calculate velocities and hues 28 | 29 | double v1, v2; 30 | 31 | for (i=0;i x2) 127 | { 128 | return 0.0; 129 | } 130 | else if (absx < x1) 131 | { 132 | return 1.0; 133 | } 134 | else 135 | { 136 | return 1.0-pow((absx-x1)/(x2-x1),2); 137 | } 138 | } 139 | 140 | 141 | #include 142 | 143 | /* wall-clock time */ 144 | 145 | double gettime(void) 146 | { 147 | struct timeval tp; 148 | gettimeofday (&tp, NULL); 149 | return tp.tv_sec + tp.tv_usec/(double)1.0e6; 150 | } 151 | -------------------------------------------------------------------------------- /nways_cfd/README.md: -------------------------------------------------------------------------------- 1 | ## Application: 2 | 3 | # CFD 4 | 5 | Simple 2D regular-grid CFD simulation for teaching parallel scaling concepts 6 | 7 | This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) on non-viscous (no Reynolds 8 | number specified and no vortices in the flow). 9 | 10 | It is deliberately written to be very simple and easy to understand so it can be used as a teaching example. 11 | 12 | To build the application, just run the "make". This will produce a binary "cfd". To run the application, just run the executable. 13 | 14 | ## Checking Output: 15 | 16 | ## Prerequisites: 17 | 18 | To run this tutorial you will need a machine with NVIDIA GPU (**Tested on NVIDIA driver 525.105.17**) 19 | 20 | - Install the [Docker](https://docs.docker.com/get-docker/) or [Singularity](https://sylabs.io/docs/]). 21 | - Install Nvidia toolkit, [Nsight Systems (latest version)](https://developer.nvidia.com/nsight-systems). 22 | 23 | ## Creating containers 24 | 25 | To start with, you will have to build a Docker or Singularity container. 26 | 27 | **NOTE: Please build the container on the machine that you are planning to run the container on**. 28 | 29 | ### Docker Container 30 | 31 | To build a docker container for **C & Fortran**, run: 32 | 33 | `sudo docker build -t : .` 34 | 35 | For instance: 36 | 37 | `sudo docker build -t myimage:1.0 .` 38 | 39 | While in the case of **Python**, you have to specify the dockerfile name using flag **"-f"**, therefore run: 40 | 41 | `sudo docker build -f -t : .` 42 | 43 | For example : 44 | 45 | `sudo docker build -f Dockerfile_python -t myimage:1.0 .` 46 | 47 | For C, Fortran, and Python, the code labs have been written using Jupyter labs and a Dockerfile has been built to simplify deployment. In order to serve the docker instance for a student, it is necessary to expose port 8888 from the container, for instance, the following command would expose port 8888 inside the container as port 8888 on the lab machine: 48 | 49 | `sudo docker run --rm -it --gpus=all -p 8888:8888 myimage:1.0` 50 | 51 | When this command is run, you can browse to the serving machine on port 8888 using any web browser to access the labs. For instance, from if they are running on the local machine the web browser should be pointed to http://localhost:8888. The `--gpus` flag is used to enable `all` NVIDIA GPUs during container runtime. The `--rm` flag is used to clean an temporary images created during the running of the container. The `-it` flag enables killing the jupyter server with `ctrl-c`. This command may be customized for your hosting environment. 52 | 53 | Then, inside the container launch the Jupyter notebook assigning the port you opened: 54 | 55 | `jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root` 56 | 57 | Once inside the container, open the jupyter notebook in browser: http://localhost:8888, and start the lab by clicking on the `minicfd.ipynb` notebook. 58 | 59 | ### Singularity Container 60 | 61 | To build the singularity container for **C & Fortran**, run: 62 | 63 | `singularity build minicfd.simg Singularity` 64 | 65 | While in the case of **Python**, run: 66 | 67 | `singularity build minicfd.simg Singularity_python` 68 | 69 | Thereafter, for C, Fortran, and Python, copy the files to your local machine to make sure changes are stored locally: 70 | 71 | `singularity run minicfd.simg cp -rT /labs ~/labs` 72 | 73 | Then, run the container: 74 | 75 | `singularity run --nv minicfd.simg jupyter-lab --notebook-dir=~/labs` 76 | 77 | Once inside the container, open the jupyter notebook in browser: http://localhost:8888, and start the lab by clicking on the `minicfd.ipynb` notebook. 78 | 79 | ## Questions? 80 | 81 | Please join [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) for questions. 82 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/serial/cfdio.f90: -------------------------------------------------------------------------------- 1 | module cfdio 2 | 3 | implicit none 4 | 5 | contains 6 | 7 | subroutine writedatafiles(psi, m, n, scale) 8 | 9 | integer :: m, n, scale 10 | double precision :: psi(0:m+1, 0:n+1) 11 | 12 | double precision, allocatable :: vel(:,:,:) 13 | integer, allocatable :: rgb(:,:,:) 14 | 15 | double precision :: modvsq, hue 16 | integer :: i, j, k 17 | 18 | integer, parameter :: iounitvel = 10, iounitcol = 11 19 | 20 | ! Compute local velocities and colours 21 | 22 | allocate(rgb(3,m,n)) 23 | allocate(vel(2,m,n)) 24 | 25 | do i = 1, m 26 | do j = 1, n 27 | 28 | vel(1,i,j) = (psi(i,j+1)-psi(i,j-1)) / 2.0 29 | vel(2,i,j) = - (psi(i+1,j)-psi(i-1,j)) / 2.0 30 | 31 | modvsq = vel(1,i,j)**2 + vel(2,i,j)**2 32 | hue = modvsq**0.4 33 | 34 | call hue2rgb(hue, rgb(1,i,j), rgb(2,i,j), rgb(3,i,j)) 35 | 36 | end do 37 | end do 38 | 39 | ! Write out 40 | 41 | open(unit=iounitcol, file='colourmap.dat', form='formatted') 42 | open(unit=iounitvel, file='velocity.dat', form='formatted') 43 | 44 | do j = 1, n 45 | do i = 1, m 46 | 47 | ! Write colour map of velocity magnitude at every point 48 | 49 | write(iounitcol,fmt='(i4,1x,i4,1x,i3,1x,i3,1x,i3)') & 50 | i, j, rgb(1,i,j), rgb(2,i,j), rgb(3,i,j) 51 | 52 | ! Only write velocity vectors every "scale" points 53 | 54 | if (mod(i-1,scale) == (scale-1)/2 .and. & 55 | mod(j-1,scale) == (scale-1)/2 ) then 56 | 57 | write(iounitvel,fmt='(i4,1x,i4,1x,g12.5,1x,g12.5)') & 58 | i, j, vel(1,i,j), vel(2,i,j) 59 | end if 60 | 61 | end do 62 | end do 63 | 64 | close(unit=iounitcol) 65 | close(unit=iounitvel) 66 | 67 | end subroutine writedatafiles 68 | 69 | 70 | subroutine writeplotfile(m, n, scale) 71 | 72 | integer :: m, n, scale 73 | integer, parameter :: iounit = 10 74 | 75 | open(unit=iounit, file='cfd.plt', form='formatted') 76 | 77 | write(iounit,*) 'set size square' 78 | write(iounit,*) 'set key off' 79 | write(iounit,*) 'unset xtics' 80 | write(iounit,*) 'unset ytics' 81 | 82 | write(iounit,fmt='('' set xrange ['',i4,'':'',i4, '']'')') 1-scale, m+scale 83 | write(iounit,fmt='('' set yrange ['',i4,'':'',i4, '']'')') 1-scale, n+scale 84 | 85 | write(iounit,fmt='('' plot "colourmap.dat" w rgbimage, "velocity.dat" u 1:2:& 86 | &('',i2,''*0.75*$3/sqrt($3**2+$4**2)):& 87 | &('',i2,''*0.75*$4/sqrt($3**2+$4**2)) & 88 | &with vectors lc rgb "#7F7F7F"'')') scale, scale 89 | 90 | close(unit=iounit) 91 | 92 | end subroutine writeplotfile 93 | 94 | 95 | subroutine hue2rgb(hue, r, g, b) 96 | 97 | double precision :: hue 98 | 99 | integer :: r, g, b 100 | integer, parameter :: rgbmax = 255 101 | 102 | r = rgbmax*colfunc(hue-1.0) 103 | g = rgbmax*colfunc(hue-0.5) 104 | b = rgbmax*colfunc(hue ) 105 | 106 | end subroutine hue2rgb 107 | 108 | 109 | double precision function colfunc(x) 110 | 111 | double precision :: x, absx, val 112 | 113 | double precision, parameter :: x1 = 0.2, x2 = 0.5 114 | 115 | absx = abs(x) 116 | 117 | if (absx .gt. x2) then 118 | val = 0.0 119 | else if (absx .lt. x1) then 120 | val = 1.0 121 | else 122 | val = 1.0 - ((absx-x1)/(x2-x1))**2 123 | end if 124 | 125 | colfunc = val 126 | 127 | end function colfunc 128 | 129 | double precision function gettime() 130 | 131 | logical, save :: firstcall = .true. 132 | 133 | integer, parameter :: int32kind = selected_int_kind( 9) 134 | integer, parameter :: int64kind = selected_int_kind(18) 135 | 136 | integer, parameter :: intkind = int64kind 137 | 138 | integer(kind = intkind) :: count,rate 139 | 140 | double precision, save :: ticktime 141 | 142 | if (firstcall) then 143 | 144 | firstcall = .false. 145 | 146 | call system_clock(count, rate) 147 | 148 | ticktime = 1.0d0/dble(rate) 149 | gettime = dble(count)*ticktime 150 | 151 | ! write(*,*) 'Clock resolution is ', ticktime*1.0e6, ', usecs' 152 | 153 | else 154 | 155 | call system_clock(count) 156 | 157 | gettime = dble(count)*ticktime 158 | 159 | end if 160 | 161 | end function gettime 162 | 163 | end module cfdio 164 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/jupyter_notebook/minicfd_cupy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# CuPy Acceleration \n", 9 | "\n", 10 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "!nvidia-smi" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Copy the Serial code\n", 28 | "\n", 29 | "Before start modifying the serial code, let's copy the serial code to cupy folder by running the cell below." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "!cp ../source_code/serial/* ../source_code/cupy" 39 | ] 40 | }, 41 | { 42 | "attachments": {}, 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Run the Serial code" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "%run ../source_code/cupy/cfd.py 64 500" 56 | ] 57 | }, 58 | { 59 | "attachments": {}, 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "---\n", 64 | "\n", 65 | "# Start Adding CuPy Constructs\n", 66 | "\n", 67 | "Now, you can start modifying the Python code:\n", 68 | "\n", 69 | "[cfd.py](../source_code/cupy/cfd.py)\n", 70 | "\n", 71 | "Remember to **SAVE** your code after changes, before running below cells.\n", 72 | "\n", 73 | "#### Some Hints\n", 74 | "The serial code consists of the `main, jacobi, and write_data` functions. Focus more the jacobi and main functions. Remember to import the cupy library as: ```import cupy as cp ``` at the top of your code. Check if there is any data race in your code.\n", 75 | "\n", 76 | "## Run and Profile the CuPy code\n", 77 | " " 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "!cd ../source_code/cupy && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfdcupy_profile python3 cfd.py 64 500" 87 | ] 88 | }, 89 | { 90 | "attachments": {}, 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems.\n", 95 | "\n", 96 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/cupy/minicfdcupy_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 97 | "\n", 98 | "## Validating the Output\n", 99 | "\n", 100 | "Make sure the error value printed as output matches that of the serial code\n", 101 | "\n", 102 | "\n", 103 | "# Recommendations for adding CuPy Constructs\n", 104 | "\n", 105 | "After finding the hotspot function take an incremental approach: \n", 106 | "\n", 107 | "1) Add `@cp.fuse()` decorator at the top of the function or rewrite the function as a raw kernel(this is rather tedious)\n", 108 | "\n", 109 | "2) Ignore the I/O function\n", 110 | "\n", 111 | "3) Ensure that only required data moves from `host (CPU function)` to `device (GPU function)` and vice versa\n", 112 | "\n", 113 | "4) Cross check the output after incremental changes to check algorithmic scalability\n", 114 | "\n", 115 | "5) Start with a small problem size that reduces the execution time. \n", 116 | "\n", 117 | "\n", 118 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version.\n", 119 | "\n", 120 | "\n", 121 | "# Links and Resources\n", 122 | "\n", 123 | "[Introduction to CuPy](https://github.com/gpuhackathons-org/gpubootcamp/blob/master/hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_guide.ipynb)\n", 124 | "\n", 125 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 126 | "\n", 127 | "[NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n", 128 | "\n", 129 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 130 | "\n", 131 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 132 | "\n", 133 | "---\n", 134 | "## Licensing \n", 135 | "\n", 136 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.4" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/jupyter_notebook/minicfd_numba.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Numba Acceleration \n", 9 | "\n", 10 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "!nvidia-smi" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Copy the Serial code\n", 28 | "\n", 29 | "Before start modifying the serial code, let's copy the serial code to cupy folder by running the cell below." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "!cp ../source_code/serial/* ../source_code/numba" 39 | ] 40 | }, 41 | { 42 | "attachments": {}, 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Run the Serial code" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "%run ../source_code/numba/cfd.py 64 500" 56 | ] 57 | }, 58 | { 59 | "attachments": {}, 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "---\n", 64 | "\n", 65 | "# Start Adding Numba Constructs\n", 66 | "\n", 67 | "Now, you can start modifying the Python code: \n", 68 | "\n", 69 | "[cfd.py](../source_code/numba/cfd.py)\n", 70 | "\n", 71 | "Remember to **SAVE** your code after changes, before running below cells.\n", 72 | "\n", 73 | "#### Some Hints\n", 74 | "The serial code consists of the `main, jacobi, and write_data` functions. Focus more the jacobi and main functions. Remember to import the cupy library as: ```from numba import cuda ``` at the top of your code. Check if there is any data race in your code.\n", 75 | "\n", 76 | "## Run and Profile the CuPy code" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "!cd ../source_code/numba && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfdnumba_profile python3 cfd.py 64 500" 86 | ] 87 | }, 88 | { 89 | "attachments": {}, 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems.\n", 94 | "\n", 95 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/numba/minicfdnumba_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 96 | "\n", 97 | "\n", 98 | "## Validating the Output\n", 99 | "\n", 100 | "Make sure the error value printed as output matches that of the serial code\n", 101 | "\n", 102 | "\n", 103 | "# Recommendations for adding Numba Constructs\n", 104 | "\n", 105 | "After finding the hotspot function take an incremental approach: \n", 106 | "\n", 107 | "1) Add `@cuda.jit()` decorator at the top of the function or rewrite the function as a raw kernel(this is rather tedious)\n", 108 | "\n", 109 | "2) You may need to perform a copy-swap data in a different kernel function\n", 110 | "\n", 111 | "3) Ignore the I/O function\n", 112 | "\n", 113 | "4) Ensure that only required data moves from `host (CPU function)` to `device (GPU function)` and vice versa\n", 114 | "\n", 115 | "5) Cross check the output after incremental changes to check algorithmic scalability\n", 116 | "\n", 117 | "6) Start with a small problem size that reduces the execution time. \n", 118 | "\n", 119 | "\n", 120 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version.\n", 121 | "\n", 122 | "\n", 123 | "# Links and Resources\n", 124 | "\n", 125 | "[Introduction to Numba](https://github.com/gpuhackathons-org/gpubootcamp/tree/master/hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_guide.ipynb)\n", 126 | "\n", 127 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 128 | "\n", 129 | "[NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n", 130 | "\n", 131 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 132 | "\n", 133 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 134 | "\n", 135 | "\n", 136 | "---\n", 137 | "## Licensing \n", 138 | "\n", 139 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.7.4" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /nways_cfd/English/C/source_code/serial/cfd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "arraymalloc.h" 6 | #include "boundary.h" 7 | #include "jacobi.h" 8 | #include "cfdio.h" 9 | 10 | int main(int argc, char **argv) 11 | { 12 | int printfreq=1000; //output frequency 13 | double error, bnorm; 14 | double tolerance=0.0; //tolerance for convergence. <=0 means do not check 15 | 16 | //main arrays 17 | double *psi, *zet; 18 | //temporary versions of main arrays 19 | double *psitmp, *zettmp; 20 | 21 | //command line arguments 22 | int scalefactor, numiter; 23 | 24 | double re; // Reynold's number - must be less than 3.7 25 | 26 | //simulation sizes 27 | int bbase=10; 28 | int hbase=15; 29 | int wbase=5; 30 | int mbase=32; 31 | int nbase=32; 32 | 33 | int irrotational = 1, checkerr = 0; 34 | 35 | int m,n,b,h,w; 36 | int iter; 37 | int i,j; 38 | 39 | double tstart, tstop, ttot, titer; 40 | 41 | //do we stop because of tolerance? 42 | if (tolerance > 0) {checkerr=1;} 43 | 44 | //check command line parameters and parse them 45 | 46 | if (argc <3|| argc >4) 47 | { 48 | printf("Usage: cfd [reynolds]\n"); 49 | return 0; 50 | } 51 | 52 | scalefactor=atoi(argv[1]); 53 | numiter=atoi(argv[2]); 54 | 55 | if (argc == 4) 56 | { 57 | re=atof(argv[3]); 58 | irrotational=0; 59 | } 60 | else 61 | { 62 | re=-1.0; 63 | } 64 | 65 | if(!checkerr) 66 | { 67 | printf("Scale Factor = %i, iterations = %i\n",scalefactor, numiter); 68 | } 69 | else 70 | { 71 | printf("Scale Factor = %i, iterations = %i, tolerance= %g\n",scalefactor,numiter,tolerance); 72 | } 73 | 74 | if (irrotational) 75 | { 76 | printf("Irrotational flow\n"); 77 | } 78 | else 79 | { 80 | printf("Reynolds number = %f\n",re); 81 | } 82 | 83 | //Calculate b, h & w and m & n 84 | b = bbase*scalefactor; 85 | h = hbase*scalefactor; 86 | w = wbase*scalefactor; 87 | m = mbase*scalefactor; 88 | n = nbase*scalefactor; 89 | 90 | re = re / (double)scalefactor; 91 | 92 | printf("Running CFD on %d x %d grid in serial\n",m,n); 93 | 94 | //allocate arrays 95 | 96 | psi = (double *) malloc((m+2)*(n+2)*sizeof(double)); 97 | psitmp = (double *) malloc((m+2)*(n+2)*sizeof(double)); 98 | 99 | nvtxRangePush("Initialization"); 100 | //zero the psi array 101 | for (i=0;i numiter) iter=numiter; 264 | 265 | tstop=gettime(); 266 | 267 | ttot=tstop-tstart; 268 | titer=ttot/(double)iter; 269 | 270 | 271 | //print out some stats 272 | 273 | printf("\n... finished\n"); 274 | printf("After %d iterations, the error is %g\n",iter,error); 275 | printf("Time for %d iterations was %g seconds\n",iter,ttot); 276 | printf("Each iteration took %g seconds\n",titer); 277 | 278 | //output results 279 | 280 | writedatafiles(psi,m,n, scalefactor); 281 | 282 | writeplotfile(m,n,scalefactor); 283 | 284 | //free un-needed arrays 285 | free(psi); 286 | free(psitmp); 287 | 288 | if (!irrotational) 289 | { 290 | free(zet); 291 | free(zettmp); 292 | } 293 | 294 | printf("... finished\n"); 295 | 296 | return 0; 297 | } 298 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/source_code/serial/cfd.f90: -------------------------------------------------------------------------------- 1 | program cfd 2 | 3 | use boundary 4 | use jacobi 5 | use cfdio 6 | use nvtx 7 | 8 | implicit none 9 | 10 | ! Output frequency 11 | 12 | integer, parameter :: printfreq = 1000 13 | 14 | ! Variables associated with convergence 15 | 16 | double precision :: error, bnorm 17 | 18 | ! Set tolerance for convergence; zero or negative means do not check 19 | 20 | double precision, parameter :: tolerance = 0.0d0 21 | 22 | ! Main arrays 23 | 24 | double precision, allocatable :: psi(:,:), zet(:,:) 25 | double precision, allocatable :: psitmp(:,:), zettmp(:,:) 26 | 27 | ! Command-line arguments 28 | 29 | integer :: scalefactor, numiter 30 | 31 | double precision :: re ! re = 3.7 seems to be stability limit with Jacobi 32 | 33 | integer, parameter :: maxline = 32 34 | character(len=maxline) :: tmparg 35 | 36 | ! Basic sizes of simulation 37 | 38 | integer, parameter :: bbase = 10 39 | integer, parameter :: hbase = 15 40 | integer, parameter :: wbase = 5 41 | integer, parameter :: mbase = 32 42 | integer, parameter :: nbase = 32 43 | 44 | logical :: irrotational = .true., checkerr = .false. 45 | 46 | ! Some auxiliary parameters and variables 47 | 48 | integer :: m, n, b, h, w 49 | integer :: iter 50 | 51 | double precision :: tstart, tstop, ttot, titer, modvsq, hue 52 | 53 | ! Are we stopping based on tolerance? 54 | 55 | if (tolerance .gt. 0.0) checkerr = .true. 56 | 57 | ! Read in parameters 58 | 59 | if (command_argument_count() /= 2 .and. command_argument_count() /= 3) then 60 | 61 | write(*,*) 'Usage: cfd [reynolds]' 62 | stop 63 | 64 | end if 65 | 66 | call get_command_argument(1, tmparg) 67 | read(tmparg,*) scalefactor 68 | call get_command_argument(2, tmparg) 69 | read(tmparg,*) numiter 70 | 71 | if (command_argument_count() == 3) then 72 | 73 | irrotational = .false. 74 | call get_command_argument(3, tmparg) 75 | read(tmparg,*) re 76 | 77 | else 78 | 79 | re = -1.0 80 | 81 | end if 82 | 83 | if (.not. checkerr) then 84 | write(*,fmt='('' Scale factor = '',i3,'', iterations = '', i6)') & 85 | scalefactor, numiter 86 | else 87 | write(*,fmt='('' Scale factor = '',i3,'', iterations = '', i6, & 88 | &'', tolerance = '', g11.4)') scalefactor, numiter, tolerance 89 | end if 90 | 91 | if (irrotational) then 92 | 93 | write(*,*) 'Irrotational flow' 94 | 95 | else 96 | 97 | write(*,fmt='('' Reynolds number = '', f6.3)') re 98 | 99 | end if 100 | 101 | ! Calculate b, h & w and m & n 102 | 103 | b = bbase*scalefactor 104 | h = hbase*scalefactor 105 | w = wbase*scalefactor 106 | m = mbase*scalefactor 107 | n = nbase*scalefactor 108 | 109 | re = re / dble(scalefactor) 110 | 111 | write(*,fmt='('' Running CFD on '', i4, '' x '', i4, & 112 | &'' grid in serial '')') m, n 113 | 114 | ! Allocate arrays, including halos on psi and tmp 115 | 116 | allocate(psi(0:m+1, 0:n+1)) 117 | allocate(zet(0:m+1, 0:n+1)) 118 | 119 | allocate(psitmp(0:m+1, 0:n+1)) 120 | 121 | if (.not. irrotational) then 122 | 123 | allocate(zettmp(0:m+1, 0:n+1)) 124 | 125 | end if 126 | 127 | ! Zero the psi array 128 | call nvtxStartRange("Initialization") 129 | psi(:,:) = 0.0 130 | zet(:,:) = 0.0 131 | call nvtxEndRange 132 | 133 | ! Set the psi boundary condtions which are constant 134 | 135 | call nvtxStartRange("boundaryPSI") 136 | call boundarypsi(psi, m, n, b, h, w) 137 | call nvtxEndRange 138 | 139 | ! Compute normalisation factor for error 140 | 141 | bnorm = sum(psi(:,:)**2) 142 | 143 | if (.not. irrotational) then 144 | 145 | ! Update the zeta boundary condtions which depend on psi 146 | 147 | call boundaryzet(zet, psi, m, n) 148 | 149 | ! Update the normalisation 150 | 151 | bnorm = bnorm + sum(zet(:,:)**2) 152 | 153 | end if 154 | 155 | bnorm = sqrt(bnorm) 156 | 157 | ! Begin iterative Jacobi loop 158 | 159 | write(*,*) 160 | write(*,*) 'Starting main loop ...' 161 | write(*,*) 162 | 163 | tstart = gettime() 164 | 165 | call nvtxStartRange("Overall Iteration") 166 | do iter = 1, numiter 167 | 168 | ! Compute the new psi based on the old one 169 | 170 | call nvtxStartRange("Jacobi Step") 171 | if (irrotational) then 172 | 173 | ! Call function with no vorticity 174 | call jacobistep(psitmp, psi, m, n) 175 | 176 | else 177 | 178 | ! Call function containing vorticity 179 | 180 | call jacobistepvort(zettmp, psitmp, zet, psi, m, n, re) 181 | 182 | end if 183 | call nvtxEndRange 184 | 185 | ! Compute current error value if required 186 | 187 | call nvtxStartRange("Calculate Error") 188 | if (checkerr .or. iter == numiter) then 189 | 190 | error = deltasq(psitmp, psi, m, n) 191 | 192 | if (.not. irrotational) then 193 | 194 | error = error + deltasq(zettmp, zet, m, n) 195 | 196 | end if 197 | 198 | error = sqrt(error) 199 | 200 | error = error / bnorm 201 | 202 | end if 203 | call nvtxEndRange 204 | 205 | ! Quit early if we have reached required tolerance 206 | 207 | if (checkerr) then 208 | if (error .lt. tolerance) then 209 | write(*,*) 'CONVERGED iteration ', iter, ': terminating' 210 | exit 211 | end if 212 | end if 213 | 214 | ! Copy back 215 | 216 | call nvtxStartRange("Switch Array") 217 | psi(1:m, 1:n) = psitmp(1:m, 1:n) 218 | 219 | if (.not. irrotational) then 220 | 221 | zet(1:m, 1:n) = zettmp(1:m, 1:n) 222 | 223 | end if 224 | call nvtxEndRange 225 | 226 | if (.not. irrotational) then 227 | 228 | ! Update the zeta boundary condtions which depend on psi 229 | 230 | call boundaryzet(zet, psi, m, n) 231 | 232 | end if 233 | 234 | ! End iterative Jacobi loop 235 | 236 | if (mod(iter,printfreq) == 0) then 237 | 238 | if (.not. checkerr) then 239 | write(*,*) 'completed iteration ', iter 240 | else 241 | write(*,*) 'completed iteration ', iter, ', error = ', error 242 | end if 243 | 244 | end if 245 | 246 | end do 247 | call nvtxEndRange 248 | 249 | if (iter .gt. numiter) iter = numiter 250 | 251 | tstop = gettime() 252 | 253 | ttot = tstop-tstart 254 | titer = ttot/dble(iter) 255 | 256 | write(*,*) 257 | write(*,*) '... finished' 258 | write(*,*) 259 | write(*,fmt='('' After '', i6, '' iterations, error is '', g11.4)') & 260 | iter, error 261 | write(*,fmt='('' Time for '', i6, '' iterations was '',& 262 | &g11.4, '' seconds'')') iter, ttot 263 | write(*,fmt='('' Each individual iteration took '', g11.4, '' seconds'')') & 264 | titer 265 | write(*,*) 266 | write(*,*) 'Writing output file ...' 267 | 268 | ! Output results 269 | 270 | call writedatafiles(psi, m, n, scalefactor) 271 | 272 | ! Output gnuplot file 273 | 274 | call writeplotfile(m, n, scalefactor) 275 | 276 | ! Finish 277 | 278 | write(*,*) ' ... finished' 279 | write(*,*) 280 | write(*,*) 'CFD completed' 281 | write(*,*) 282 | 283 | end program cfd 284 | 285 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/minicfd_cudac.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CUDA C Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Copy and Compile the Serial code\n", 25 | "\n", 26 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "!cp ../source_code/serial/* ../source_code/cuda-c" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!cd ../source_code/cuda-c && make clean && make" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Run the Serial code" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/cuda-c && ./cfd 64 500" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "---\n", 68 | "\n", 69 | "# Start adding CUDA C constructs" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 77 | "\n", 78 | "[cfd code](../source_code/cuda-c/cfd.cpp) \n", 79 | "\n", 80 | "[Makefile](../source_code/cuda-c/Makefile)\n", 81 | "\n", 82 | "Remember to **SAVE** your code after changes, before running below cells.\n", 83 | "\n", 84 | "#### Some Hints\n", 85 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Compile and run CUDA C enabled code\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "!cd ../source_code/cuda-c && make clean && make" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Profile the CUDA C Code" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "!cd ../source_code/cuda-c && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudac_profile ./cfd 64 500" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/cuda-c/minicfdcudac_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Validating the Output\n", 139 | "\n", 140 | "Make sure the error value printed as output matches that of the serial code" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Recommendations for adding CUDA C\n", 148 | "\n", 149 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 150 | "\n", 151 | "1) Convert files using CUDA kernels to .cu \n", 152 | "\n", 153 | "2) Ignore the initialization, finalization and I/O functions\n", 154 | "\n", 155 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 156 | "\n", 157 | "4) Start with a small problem size that reduces the execution time. \n", 158 | "\n", 159 | "\n", 160 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Links and Resources\n", 168 | "\n", 169 | "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n", 170 | "\n", 171 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 172 | "\n", 173 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 174 | "\n", 175 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 176 | "\n", 177 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 178 | "\n", 179 | "--- \n", 180 | "\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Licensing \n", 188 | "\n", 189 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "anaconda-cloud": {}, 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.7.4" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/minicfd_cudafortran.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CUDA Fortran Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Copy and Compile the Serial code\n", 25 | "\n", 26 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "!cp ../source_code/serial/* ../source_code/cudafortran" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!cd ../source_code/cudafortran && make clean && make" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Run the Serial code" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/cudafortran && ./cfd 64 500" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "---\n", 68 | "\n", 69 | "# Start adding CUDA Fortran constructs" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Now, you can start modifying the Fortran code and the `Makefile`:\n", 77 | "\n", 78 | "[cfd code](../source_code/cudafortran/cfd.f90) \n", 79 | "\n", 80 | "[Makefile](../source_code/cudafortran/Makefile)\n", 81 | "\n", 82 | "Remember to **SAVE** your code after changes, before running below cells.\n", 83 | "\n", 84 | "#### Some Hints\n", 85 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Compile and run CUDA Fortran enabled code\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "!cd ../source_code/cudafortran && make clean && make" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Profile the CUDA Fortran Code" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "!cd ../source_code/cudafortran && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudafortran_profile ./cfd 64 500" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/cudafortran/minicfdcudafortran_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Validating the Output\n", 139 | "\n", 140 | "Make sure the error value printed as output matches that of the serial code" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Recommendations for adding CUDA Fortran\n", 148 | "\n", 149 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 150 | "\n", 151 | "1) Ignore the initialization, finalization and I/O functions\n", 152 | "\n", 153 | "2) Cross check the output after incremental changes to check algorithmic scalability\n", 154 | "\n", 155 | "3) Start with a small problem size that reduces the execution time. \n", 156 | "\n", 157 | "\n", 158 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Links and Resources\n", 166 | "\n", 167 | "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n", 168 | "\n", 169 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 170 | "\n", 171 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 172 | "\n", 173 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 174 | "\n", 175 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 176 | "\n", 177 | "--- \n", 178 | "\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## Licensing \n", 186 | "\n", 187 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "anaconda-cloud": {}, 193 | "kernelspec": { 194 | "display_name": "Python 3", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.7.4" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 4 213 | } 214 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_cudac-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CUDA C Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Copy and Compile the Serial code\n", 25 | "\n", 26 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "!cp ../source_code/serial/* ../source_code/cuda-c" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!cd ../source_code/cuda-c && make clean && make" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Run the Serial code" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/cuda-c && ./cfd 64 500" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "---\n", 68 | "\n", 69 | "# Start adding CUDA C constructs" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 77 | "\n", 78 | "[cfd code](../source_code/cuda-c/cfd.cpp) \n", 79 | "\n", 80 | "[Makefile](../source_code/cuda-c/Makefile)\n", 81 | "\n", 82 | "Remember to **SAVE** your code after changes, before running below cells.\n", 83 | "\n", 84 | "#### Some Hints\n", 85 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Compile and run CUDA C enabled code\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "!cd ../source_code/cuda-c && make clean && make" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## Profile the CUDA C Code" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "!cd ../source_code/cuda-c && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudac_profile ./cfd 64 500" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/cuda-c/minicfdcudac_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Validating the Output\n", 139 | "\n", 140 | "Make sure the error value printed as output matches that of the serial code" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Recommendations for adding CUDA C\n", 148 | "\n", 149 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 150 | "\n", 151 | "1) Convert files using CUDA kernels to .cu \n", 152 | "\n", 153 | "2) Ignore the initialization, finalization and I/O functions\n", 154 | "\n", 155 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 156 | "\n", 157 | "4) Start with a small problem size that reduces the execution time. \n", 158 | "\n", 159 | "\n", 160 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Links and Resources\n", 168 | "\n", 169 | "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n", 170 | "\n", 171 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 172 | "\n", 173 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 174 | "\n", 175 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 176 | "\n", 177 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 178 | "\n", 179 | "--- \n", 180 | "\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Licensing \n", 188 | "\n", 189 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "anaconda-cloud": {}, 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.7.4" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/minicfd_stdpar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# std::par Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/stdpar" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/stdpar && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/stdpar && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding std::par constructs" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/stdpar/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/stdpar/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Compile and run std::par enabled code\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "!cd ../source_code/stdpar && make clean && make" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Profile the std::par Code" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!cd ../source_code/stdpar && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/stdpar/minicfdstdpar_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 149 | "\n", 150 | "## Validating the Output\n", 151 | "\n", 152 | "Make sure the error value printed as output matches that of the serial code" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Recommendations for adding std::par \n", 160 | "\n", 161 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 162 | "\n", 163 | "1) Ignore the initialization, finalization and I/O functions\n", 164 | "\n", 165 | "2) Convert the allocations to dynamic stl array\n", 166 | "\n", 167 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 168 | "\n", 169 | "4) Start with a small problem size that reduces the execution time. \n", 170 | "\n", 171 | "\n", 172 | "\n", 173 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Links and Resources\n", 181 | "\n", 182 | "[std::par blog](https://developer.nvidia.com/blog/accelerating-standard-c-with-gpus-using-stdpar/)\n", 183 | "\n", 184 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 185 | "\n", 186 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 187 | "\n", 188 | "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n", 189 | "\n", 190 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 191 | "\n", 192 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 193 | "\n", 194 | "--- \n", 195 | "\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Licensing \n", 203 | "\n", 204 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "anaconda-cloud": {}, 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.7.4" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 4 230 | } 231 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_stdpar-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# std::par Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/stdpar" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/stdpar && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/stdpar && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding std::par constructs" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/stdpar/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/stdpar/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Compile and run std::par enabled code\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "!cd ../source_code/stdpar && make clean && make" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Profile the std::par Code" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!cd ../source_code/stdpar && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/stdpar/minicfdstdpar_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 149 | "\n", 150 | "## Validating the Output\n", 151 | "\n", 152 | "Make sure the error value printed as output matches that of the serial code" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Recommendations for adding std::par \n", 160 | "\n", 161 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 162 | "\n", 163 | "1) Ignore the initialization, finalization and I/O functions\n", 164 | "\n", 165 | "2) Convert the allocations to dynamic stl array\n", 166 | "\n", 167 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 168 | "\n", 169 | "4) Start with a small problem size that reduces the execution time. \n", 170 | "\n", 171 | "\n", 172 | "\n", 173 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Links and Resources\n", 181 | "\n", 182 | "[std::par blog](https://developer.nvidia.com/blog/accelerating-standard-c-with-gpus-using-stdpar/)\n", 183 | "\n", 184 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 185 | "\n", 186 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 187 | "\n", 188 | "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n", 189 | "\n", 190 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 191 | "\n", 192 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 193 | "\n", 194 | "--- \n", 195 | "\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Licensing \n", 203 | "\n", 204 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "anaconda-cloud": {}, 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.7.4" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 4 230 | } 231 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/minicfd_do_concurrent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DO CONCURRENT Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/doconcurrent" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/doconcurrent && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/doconcurrent && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding DO CONCURRENT constructs" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the Fortran code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/doconcurrent/cfd.f90) \n", 95 | "\n", 96 | "[Makefile](../source_code/doconcurrent/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Compile and run DO CONCURRENT enabled code\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "!cd ../source_code/doconcurrent && make clean && make" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Profile the parallel Code" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!cd ../source_code/doconcurrent && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/doconcurrent/minicfdstdpar_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 149 | "\n", 150 | "\n", 151 | "## Validating the Output\n", 152 | "\n", 153 | "Make sure the error value printed as output matches that of the serial code" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "# Recommendations for adding parallelism \n", 161 | "\n", 162 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 163 | "\n", 164 | "1) Ignore the initialization, finalization and I/O functions\n", 165 | "\n", 166 | "2) Convert the allocations to dynamic stl array\n", 167 | "\n", 168 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 169 | "\n", 170 | "4) Start with a small problem size that reduces the execution time. \n", 171 | "\n", 172 | "\n", 173 | "\n", 174 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "# Links and Resources\n", 182 | "\n", 183 | "[do concurrent blog](https://developer.nvidia.com/blog/accelerating-fortran-do-concurrent-with-gpus-and-the-nvidia-hpc-sdk/)\n", 184 | "\n", 185 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 186 | "\n", 187 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 188 | "\n", 189 | "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n", 190 | "\n", 191 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 192 | "\n", 193 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 194 | "\n", 195 | "--- \n", 196 | "\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "## Licensing \n", 204 | "\n", 205 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "anaconda-cloud": {}, 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.4" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 4 231 | } 232 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/minicfd_openmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenMP Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openmp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openmp && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openmp && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenMP Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/openmp/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/openmp/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n", 103 | "\n", 104 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Compile and run OpenMP enabled code\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!cd ../source_code/openmp && make clean && make" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Hint : Add `-Minfo=mp` to the `Makefile` to check that Kernel code indeed has been generated." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Profile the OpenMP Code" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 158 | "\n", 159 | "## Validating the Output\n", 160 | "\n", 161 | "Make sure the error value printed as output matches that of the serial code" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Recommendations for adding OpenMP Pragmas\n", 169 | "\n", 170 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 171 | "\n", 172 | "1) Ignore the initialization, finalization and I/O functions\n", 173 | "\n", 174 | "2) Take an incremental approach by adding pragmas one at a time\n", 175 | "\n", 176 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 177 | "\n", 178 | "4) Move on to using data clauses for better performance \n", 179 | "\n", 180 | "5) Start with a small problem size that reduces the execution time. \n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "# Links and Resources\n", 192 | "\n", 193 | "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n", 194 | "\n", 195 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 196 | "\n", 197 | "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n", 198 | "\n", 199 | "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n", 200 | "\n", 201 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 202 | "\n", 203 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 204 | "\n", 205 | "--- \n", 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Licensing \n", 214 | "\n", 215 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "anaconda-cloud": {}, 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.7.4" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/minicfd_openmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenMP Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openmp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openmp && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openmp && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenMP Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the Fortran code and the `Makefile`:\n", 93 | "\n", 94 | "\n", 95 | "[cfd code](../source_code/openmp/cfd.f90) \n", 96 | "\n", 97 | "[Makefile](../source_code/openmp/Makefile)\n", 98 | "\n", 99 | "Remember to **SAVE** your code after changes, before running below cells.\n", 100 | "\n", 101 | "#### Some Hints\n", 102 | "\n", 103 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n", 104 | "\n", 105 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## Compile and run OpenMP enabled code\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "!cd ../source_code/openmp && make clean && make" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Hint : Add `-Minfo=mp` to the `Makefile` to check that Kernel code indeed has been generated." 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Profile the OpenMP Code" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 159 | "\n", 160 | "## Validating the Output\n", 161 | "\n", 162 | "Make sure the error value printed as output matches that of the serial code" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Recommendations for adding OpenMP Pragmas\n", 170 | "\n", 171 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 172 | "\n", 173 | "1) Ignore the initialization, finalization and I/O functions\n", 174 | "\n", 175 | "2) Take an incremental approach by adding pragmas one at a time\n", 176 | "\n", 177 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 178 | "\n", 179 | "4) Move on to using data clauses for better performance \n", 180 | "\n", 181 | "5) Start with a small problem size that reduces the execution time. \n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Links and Resources\n", 193 | "\n", 194 | "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n", 195 | "\n", 196 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 197 | "\n", 198 | "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n", 199 | "\n", 200 | "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n", 201 | "\n", 202 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 203 | "\n", 204 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 205 | "\n", 206 | "--- \n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Licensing \n", 215 | "\n", 216 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "anaconda-cloud": {}, 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.7.4" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 4 242 | } 243 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_openmp-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenMP Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openmp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openmp && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openmp && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenMP Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/openmp/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/openmp/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n", 103 | "\n", 104 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Compile and run OpenMP enabled code\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!cd ../source_code/openmp && make clean && make" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Hint : Add `-Minfo=mp` to the `Makefile` to check that Kernel code indeed has been generated." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Profile the OpenMP Code" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 158 | "\n", 159 | "## Validating the Output\n", 160 | "\n", 161 | "Make sure the error value printed as output matches that of the serial code" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Recommendations for adding OpenMP Pragmas\n", 169 | "\n", 170 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 171 | "\n", 172 | "1) Ignore the initialization, finalization and I/O functions\n", 173 | "\n", 174 | "2) Take an incremental approach by adding pragmas one at a time\n", 175 | "\n", 176 | "3) Cross check the output after incremental changes to check algorithmic scalability\n", 177 | "\n", 178 | "4) Move on to using data clauses for better performance \n", 179 | "\n", 180 | "5) Start with a small problem size that reduces the execution time. \n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "# Links and Resources\n", 192 | "\n", 193 | "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n", 194 | "\n", 195 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 196 | "\n", 197 | "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n", 198 | "\n", 199 | "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n", 200 | "\n", 201 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 202 | "\n", 203 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 204 | "\n", 205 | "--- \n", 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Licensing \n", 214 | "\n", 215 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "anaconda-cloud": {}, 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.7.4" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/minicfd_openacc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenACC Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openacc" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openacc && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openacc && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenACC Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/openacc/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/openacc/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n", 103 | "\n", 104 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Compile and run OpenACC enabled code\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!cd ../source_code/openacc && make clean && make" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Hint : Add `-Minfo=accel` to the `Makefile` to check that Kernel code indeed has been generated." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Profile the OpenACC Code" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 158 | "\n", 159 | "## Validating the Output\n", 160 | "\n", 161 | "Make sure the error value printed as output matches that of the serial code" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Recommendations for adding OpenACC Pragmas\n", 169 | "\n", 170 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 171 | "\n", 172 | "1) Ignore the initialization, finalization and I/O functions\n", 173 | "\n", 174 | "2) Take an incremental approach by adding pragmas one at a time\n", 175 | "\n", 176 | "3) Unified Memory provides a good start point where you need not worry about the data transfers (`–ta=tesla:managed`)\n", 177 | "\n", 178 | "4) Cross check the output after incremental changes to check algorithmic scalability\n", 179 | "\n", 180 | "5) Move on to using data clauses for better performance \n", 181 | "\n", 182 | "6) Start with a small problem size that reduces the execution time. \n", 183 | "\n", 184 | "\n", 185 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Links and Resources\n", 193 | "\n", 194 | "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n", 195 | "\n", 196 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 197 | "\n", 198 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 199 | "\n", 200 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 201 | "\n", 202 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 203 | "\n", 204 | "--- \n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Licensing \n", 213 | "\n", 214 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "anaconda-cloud": {}, 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.7.4" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/minicfd_openacc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenACC Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openacc" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openacc && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openacc && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenACC Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the Fortran code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/openacc/cfd.f90) \n", 95 | "\n", 96 | "[Makefile](../source_code/openacc/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n", 103 | "\n", 104 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Compile and run OpenACC enabled code\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!cd ../source_code/openacc && make clean && make" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Hint : Add `-Minfo=accel` to the `Makefile` to check that Kernel code indeed has been generated." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Profile the OpenACC Code" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 158 | "\n", 159 | "## Validating the Output\n", 160 | "\n", 161 | "Make sure the error value printed as output matches that of the serial code" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Recommendations for adding OpenACC Pragmas\n", 169 | "\n", 170 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 171 | "\n", 172 | "1) Ignore the initialization, finalization and I/O functions\n", 173 | "\n", 174 | "2) Take an incremental approach by adding pragmas one at a time\n", 175 | "\n", 176 | "3) Unified Memory provides a good start point where you need not worry about the data transfers (`–ta=tesla:managed`)\n", 177 | "\n", 178 | "4) Cross check the output after incremental changes to check algorithmic scalability\n", 179 | "\n", 180 | "5) Move on to using data clauses for better performance \n", 181 | "\n", 182 | "6) Start with a small problem size that reduces the execution time. \n", 183 | "\n", 184 | "\n", 185 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Links and Resources\n", 193 | "\n", 194 | "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n", 195 | "\n", 196 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 197 | "\n", 198 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 199 | "\n", 200 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 201 | "\n", 202 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 203 | "\n", 204 | "--- \n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Licensing \n", 213 | "\n", 214 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "anaconda-cloud": {}, 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.7.4" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_openacc-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OpenACC Acceleration \n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Copy and Compile the Serial code\n", 41 | "\n", 42 | "Before start modifying the serial code, let's make a copy of the serial code and rename it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!cp ../source_code/serial/* ../source_code/openacc" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cd ../source_code/openacc && make clean && make" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Run the Serial code" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!cd ../source_code/openacc && ./cfd 64 500" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "---\n", 84 | "\n", 85 | "# Start adding OpenACC Pragmas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, you can start modifying the C++ code and the `Makefile`:\n", 93 | "\n", 94 | "[cfd code](../source_code/openacc/cfd.cpp) \n", 95 | "\n", 96 | "[Makefile](../source_code/openacc/Makefile)\n", 97 | "\n", 98 | "Remember to **SAVE** your code after changes, before running below cells.\n", 99 | "\n", 100 | "#### Some Hints\n", 101 | "\n", 102 | "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n", 103 | "\n", 104 | "2) Check if there is any data race in your code.( More details on data race is present in the Links and resources section below)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Compile and run OpenACC enabled code\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!cd ../source_code/openacc && make clean && make" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Hint : Add `-Minfo=accel` to the `Makefile` to check that Kernel code indeed has been generated." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Profile the OpenACC Code" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI.\n", 158 | "\n", 159 | "## Validating the Output\n", 160 | "\n", 161 | "Make sure the error value printed as output matches that of the serial code" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# Recommendations for adding OpenACC Pragmas\n", 169 | "\n", 170 | "After finding the hotspot function take an incremental approach to add pargmas. \n", 171 | "\n", 172 | "1) Ignore the initialization, finalization and I/O functions\n", 173 | "\n", 174 | "2) Take an incremental approach by adding pragmas one at a time\n", 175 | "\n", 176 | "3) Unified Memory provides a good start point where you need not worry about the data transfers (`–ta=tesla:managed`)\n", 177 | "\n", 178 | "4) Cross check the output after incremental changes to check algorithmic scalability\n", 179 | "\n", 180 | "5) Move on to using data clauses for better performance \n", 181 | "\n", 182 | "6) Start with a small problem size that reduces the execution time. \n", 183 | "\n", 184 | "\n", 185 | "**General tip:** Be aware of *Data Race* situation in which at least two threads access a shared variable at the same time. At least on thread tries to modify the variable. If data race happened, an incorrect result will be returned. So, make sure to validate your output against the serial version." 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "# Links and Resources\n", 193 | "\n", 194 | "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n", 195 | "\n", 196 | "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n", 197 | "\n", 198 | "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n", 199 | "\n", 200 | "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n", 201 | "\n", 202 | "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n", 203 | "\n", 204 | "--- \n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Licensing \n", 213 | "\n", 214 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "anaconda-cloud": {}, 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.7.4" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/jupyter_notebook/minicfd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Getting Started\n", 9 | "\n", 10 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "!nvidia-smi" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "!cat /proc/cpuinfo" 37 | ] 38 | }, 39 | { 40 | "attachments": {}, 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# A MINI-CFD APPLICATION\n", 45 | "\n", 46 | "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n", 47 | "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) on non-viscous (no Reynolds\n", 48 | "number specified and no vortices in the flow).\n", 49 | "\n", 50 | "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n", 51 | "\n", 52 | "\n", 53 | "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:" 54 | ] 55 | }, 56 | { 57 | "attachments": {}, 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "" 62 | ] 63 | }, 64 | { 65 | "attachments": {}, 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### The objective of this exercise is not to dwell into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance." 70 | ] 71 | }, 72 | { 73 | "attachments": {}, 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The general flow of the code is as shown in form of pseudo code" 78 | ] 79 | }, 80 | { 81 | "attachments": {}, 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "```cpp\n", 86 | "set the boundary values for Ψ \n", 87 | "while (convergence == FALSE) do \n", 88 | " for each interior grid point do \n", 89 | " update Ψ by averaging with its 4 nearest neighbours \n", 90 | " end do \n", 91 | " \n", 92 | " check for convergence \n", 93 | "end do \n", 94 | "\n", 95 | "for each interior grid point do \n", 96 | " calculate 𝑢𝑥 calculate 𝑢𝑦 \n", 97 | "end do\n", 98 | "\n", 99 | "```" 100 | ] 101 | }, 102 | { 103 | "attachments": {}, 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Steps to follow\n", 108 | "We will follow the Optimization cycle for porting and improving the code performance.\n", 109 | "\n", 110 | "\n" 111 | ] 112 | }, 113 | { 114 | "attachments": {}, 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Understand and Analyze the code\n", 119 | "Analyze the code :\n", 120 | "\n", 121 | "[cfd.py](../source_code/serial/cfd.py)" 122 | ] 123 | }, 124 | { 125 | "attachments": {}, 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## Run the CPU code" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "%run ../source_code/serial/cfd.py 64 500" 139 | ] 140 | }, 141 | { 142 | "attachments": {}, 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Profiling\n", 147 | "\n", 148 | "For this section, we will be using Nsight systems profiler and as the code is a CPU code, we will be tracing NVTX APIs (already integrated to the application). NVTX is useful for tracing of CPU events and time ranges. For more info on Nsight profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n", 149 | "\n", 150 | "### Viewing the profler output\n", 151 | "There are two ways to look at profiled code: \n", 152 | "\n", 153 | "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n", 154 | "\n", 155 | "2) NVIDIA Nsight System: Open the Nsight System profiler and click on file > open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, this requires that the local system has CUDA toolkit installed of same version. More details on where to download CUDA toolit can be found in the links in resources section below." 156 | ] 157 | }, 158 | { 159 | "attachments": {}, 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## Profile the CPU code to find hotspots" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile python3 cfd.py 64 500" 173 | ] 174 | }, 175 | { 176 | "attachments": {}, 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 181 | ] 182 | }, 183 | { 184 | "attachments": {}, 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "---\n", 189 | "\n", 190 | "# Start Accelerating code\n", 191 | "\n", 192 | "\n", 193 | "[CuPy](minicfd_cupy.ipynb)\n", 194 | "\n", 195 | "[Numba](minicfd_numba.ipynb)\n", 196 | "\n", 197 | "\n", 198 | "\n" 199 | ] 200 | }, 201 | { 202 | "attachments": {}, 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Final Results\n", 207 | "\n", 208 | "Modify and add timings for the accelerated code usinf different methods\n", 209 | "\n", 210 | "| | CuPy | Numba |\n", 211 | "| --- | --- | --- |\n", 212 | "| Multicore | | |\n", 213 | "| GPU | | | \n", 214 | "\n" 215 | ] 216 | }, 217 | { 218 | "attachments": {}, 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "---\n", 223 | "## Licensing \n", 224 | "\n", 225 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "anaconda-cloud": {}, 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "language": "python", 234 | "name": "python3" 235 | }, 236 | "language_info": { 237 | "codemirror_mode": { 238 | "name": "ipython", 239 | "version": 3 240 | }, 241 | "file_extension": ".py", 242 | "mimetype": "text/x-python", 243 | "name": "python", 244 | "nbconvert_exporter": "python", 245 | "pygments_lexer": "ipython3", 246 | "version": "3.7.4" 247 | } 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 4 251 | } 252 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/minicfd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Started\n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# A MINI-CFD APPLICATION\n", 41 | "\n", 42 | "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n", 43 | "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) on non-viscous (no Reynolds\n", 44 | "number specified and no vortices in the flow).\n", 45 | "\n", 46 | "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n", 47 | "\n", 48 | "\n", 49 | "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### The objective of this exercise is not to dwell into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "The general flow of the code is as shown in form of pseudo code" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "```cpp\n", 78 | "set the boundary values for Ψ \n", 79 | "while (convergence == FALSE) do \n", 80 | " for each interior grid point do \n", 81 | " update Ψ by averaging with its 4 nearest neighbours \n", 82 | " end do \n", 83 | " \n", 84 | " check for convergence \n", 85 | "end do \n", 86 | "\n", 87 | "for each interior grid point do \n", 88 | " calculate 𝑢𝑥 calculate 𝑢𝑦 \n", 89 | "end do\n", 90 | "\n", 91 | "```" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Steps to follow\n", 99 | "We will follow the Optimization cycle for porting and improving the code performance.\n", 100 | "\n", 101 | "\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Understand and Analyze the code\n", 109 | "Analyze the code and the Makefile for how to compile the code:\n", 110 | "\n", 111 | "[cfd code](../source_code/serial/cfd.cpp) \n", 112 | "\n", 113 | "[Makefile](../source_code/serial/Makefile)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Compile the code" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "!cd ../source_code/serial && make clean && make" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Run the CPU code" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "!cd ../source_code/serial && ./cfd 64 500" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Profiling\n", 153 | "\n", 154 | "For this section, we will be using Nsight systems profiler and as the code is a CPU code, we will be tracing NVTX APIs (already integrated to the application). NVTX is useful for tracing of CPU events and time ranges. For more info on Nsight profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n", 155 | "\n", 156 | "### Viewing the profler output\n", 157 | "There are two ways to look at profiled code: \n", 158 | "\n", 159 | "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n", 160 | "\n", 161 | "2) NVIDIA Nsight System: Open the Nsight System profiler and click on file > open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, this requires that the local system has CUDA toolkit installed of same version. More details on where to download CUDA toolit can be found in the links in resources section below." 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Profile the CPU code to find hotspots" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "---\n", 192 | "\n", 193 | "# Start Accelerating code\n", 194 | "\n", 195 | "[stdpar](minicfd_stdpar.ipynb)\n", 196 | "\n", 197 | "[OpenACC](minicfd_openacc.ipynb)\n", 198 | "\n", 199 | "[OpenMP](minicfd_openmp.ipynb)\n", 200 | "\n", 201 | "[CUDA C](minicfd_cudac.ipynb)\n", 202 | "\n", 203 | "\n", 204 | "\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Final Results\n", 212 | "\n", 213 | "Modify and add timings for the accelerated code usinf different methods\n", 214 | "\n", 215 | "| | OpenACC | OpenMP | stdpar | CUDA Languages ( C ) |\n", 216 | "| --- | --- | --- | --- | --- |\n", 217 | "| Multicore | | | | |\n", 218 | "| GPU | | | | |\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Licensing \n", 227 | "\n", 228 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "anaconda-cloud": {}, 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.7.4" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 4 254 | } 255 | -------------------------------------------------------------------------------- /nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Started\n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# A MINI-CFD APPLICATION\n", 41 | "\n", 42 | "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n", 43 | "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) on non-viscous (no Reynolds\n", 44 | "number specified and no vortices in the flow).\n", 45 | "\n", 46 | "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n", 47 | "\n", 48 | "\n", 49 | "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### The objective of this exercise is not to dwell into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "The general flow of the code is as shown in form of pseudo code" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "```cpp\n", 78 | "set the boundary values for Ψ \n", 79 | "while (convergence == FALSE) do \n", 80 | " for each interior grid point do \n", 81 | " update Ψ by averaging with its 4 nearest neighbours \n", 82 | " end do \n", 83 | " \n", 84 | " check for convergence \n", 85 | "end do \n", 86 | "\n", 87 | "for each interior grid point do \n", 88 | " calculate 𝑢𝑥 calculate 𝑢𝑦 \n", 89 | "end do\n", 90 | "\n", 91 | "```" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Steps to follow\n", 99 | "We will follow the Optimization cycle for porting and improving the code performance.\n", 100 | "\n", 101 | "\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Understand and Analyze the code\n", 109 | "Analyze the code and the Makefile for how to compile the code:\n", 110 | "\n", 111 | "[cfd code](../source_code/serial/cfd.cpp) \n", 112 | "\n", 113 | "[Makefile](../source_code/serial/Makefile)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Compile the code" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "!cd ../source_code/serial && make clean && make" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Run the CPU code" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "!cd ../source_code/serial && ./cfd 64 500" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Profiling\n", 153 | "\n", 154 | "For this section, we will be using Nsight systems profiler and as the code is a CPU code, we will be tracing NVTX APIs (already integrated to the application). NVTX is useful for tracing of CPU events and time ranges. For more info on Nsight profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n", 155 | "\n", 156 | "### Viewing the profler output\n", 157 | "There are two ways to look at profiled code: \n", 158 | "\n", 159 | "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n", 160 | "\n", 161 | "2) NVIDIA Nsight System: Open the Nsight System profiler and click on file > open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, this requires that the local system has CUDA toolkit installed of same version. More details on where to download CUDA toolit can be found in the links in resources section below." 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Profile the CPU code to find hotspots" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "---\n", 192 | "\n", 193 | "# Start Accelerating code\n", 194 | "\n", 195 | "[stdpar](minicfd_stdpar.ipynb)\n", 196 | "\n", 197 | "[OpenACC](minicfd_openacc.ipynb)\n", 198 | "\n", 199 | "[OpenMP](minicfd_openmp.ipynb)\n", 200 | "\n", 201 | "[CUDA C](minicfd_cudac.ipynb)\n", 202 | "\n", 203 | "\n", 204 | "\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Final Results\n", 212 | "\n", 213 | "Modify and add timings for the accelerated code usinf different methods\n", 214 | "\n", 215 | "| | OpenACC | OpenMP | stdpar | CUDA Languages ( C ) |\n", 216 | "| --- | --- | --- | --- | --- |\n", 217 | "| Multicore | | | | |\n", 218 | "| GPU | | | | |\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Licensing \n", 227 | "\n", 228 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "anaconda-cloud": {}, 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.7.4" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 4 254 | } 255 | -------------------------------------------------------------------------------- /nways_cfd/English/Fortran/jupyter_notebook/minicfd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Started\n", 8 | "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "!nvidia-smi" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Since the code will be run on Multicore as well try running the cell below and get details of the nnumber of core and CPU architecure on the system" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!cat /proc/cpuinfo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# A MINI-CFD APPLICATION\n", 41 | "\n", 42 | "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n", 43 | "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) on non-viscous (no Reynolds\n", 44 | "number specified and no vortices in the flow).\n", 45 | "\n", 46 | "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n", 47 | "\n", 48 | "\n", 49 | "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### The objective of this exercise is not to dwell into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "The general flow of the code is as shown in form of pseudo code" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "```cpp\n", 78 | "set the boundary values for Ψ \n", 79 | "while (convergence == FALSE) do \n", 80 | " for each interior grid point do \n", 81 | " update Ψ by averaging with its 4 nearest neighbours \n", 82 | " end do \n", 83 | " \n", 84 | " check for convergence \n", 85 | "end do \n", 86 | "\n", 87 | "for each interior grid point do \n", 88 | " calculate 𝑢𝑥 calculate 𝑢𝑦 \n", 89 | "end do\n", 90 | "\n", 91 | "```" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Steps to follow\n", 99 | "We will follow the Optimization cycle for porting and improving the code performance.\n", 100 | "\n", 101 | "\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Understand and Analyze the code\n", 109 | "Analyze the code and the Makefile for how to compile the code:\n", 110 | "\n", 111 | "\n", 112 | "[cfd code](../source_code/serial/cfd.f90) \n", 113 | "\n", 114 | "[Makefile](../source_code/serial/Makefile)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Compile the code" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "!cd ../source_code/serial && make clean && make" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Run the CPU code" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "!cd ../source_code/serial && ./cfd 64 500" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Profiling\n", 154 | "\n", 155 | "For this section, we will be using Nsight systems profiler and as the code is a CPU code, we will be tracing NVTX APIs (already integrated to the application). NVTX is useful for tracing of CPU events and time ranges. For more info on Nsight profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n", 156 | "\n", 157 | "### Viewing the profler output\n", 158 | "There are two ways to look at profiled code: \n", 159 | "\n", 160 | "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n", 161 | "\n", 162 | "2) NVIDIA Nsight System: Open the Nsight System profiler and click on file > open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, this requires that the local system has CUDA toolkit installed of same version. More details on where to download CUDA toolit can be found in the links in resources section below." 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Profile the CPU code to find hotspots" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Download and save the report file by holding down Shift and right-clicking [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing save Link As. Once done, open it via the GUI." 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "---\n", 193 | "\n", 194 | "# Start Accelerating code\n", 195 | "\n", 196 | "[doconcurrent](minicfd_do_concurrent.ipynb)\n", 197 | "\n", 198 | "[OpenACC](minicfd_openacc.ipynb)\n", 199 | "\n", 200 | "[OpenMP](minicfd_openmp.ipynb)\n", 201 | "\n", 202 | "[CUDA Fortran](minicfd_cudafortran.ipynb)\n", 203 | "\n", 204 | "\n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Final Results\n", 213 | "\n", 214 | "Modify and add timings for the accelerated code usinf different methods\n", 215 | "\n", 216 | "| | OpenACC | OpenMP | DO CONCURRENT | CUDA Languages ( Fortran ) |\n", 217 | "| --- | --- | --- | --- | --- |\n", 218 | "| Multicore | | | | |\n", 219 | "| GPU | | | | |\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Licensing \n", 228 | "\n", 229 | "Copyright © 2022 OpenACC-Standard.org. This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply." 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "anaconda-cloud": {}, 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.7.4" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 4 255 | } 256 | -------------------------------------------------------------------------------- /nways_cfd/English/Python/source_code/serial/cfd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 2 | #!/usr/bin/env python 3 | # 4 | # CFD Calculation 5 | # =============== 6 | # 7 | # Simulation of inviscid flow in a 2D box using the Jacobi algorithm. 8 | # 9 | # Python version - uses numpy and loops 10 | # 11 | # EPCC, 2014 12 | # 13 | import sys 14 | import time 15 | 16 | # Import numpy 17 | import numpy as np 18 | import math 19 | import sys 20 | import cupy.cuda.nvtx as nvtx 21 | from numba import njit, jit 22 | 23 | def main(argv): 24 | printfreq = 1000 #output frequency 25 | error = bnorm = 0.0 26 | tolerance = 0.0 #tolerance for convergence. <=0 means do not check 27 | 28 | error = 0.0 29 | # Set the minimum size parameters 30 | mbase = 32 31 | nbase = 32 32 | bbase = 10 33 | hbase = 15 34 | wbase = 5 35 | 36 | irrotational = 1 37 | checkerr = 0 38 | iter = 0 39 | 40 | 41 | # Test we have the correct number of arguments 42 | if len(argv) < 2: 43 | sys.stdout.write("Usage: cfd.py ") 44 | sys.exit(1) 45 | 46 | # Get the systen parameters from the arguments 47 | scalefactor = int(argv[0]) 48 | niter = int(argv[1]) 49 | 50 | sys.stdout.write("\n2D CFD Simulation\n") 51 | sys.stdout.write("=================\n") 52 | sys.stdout.write("Scale factor = {0}\n".format(scalefactor)) 53 | sys.stdout.write("Iterations = {0}\n".format(niter)) 54 | 55 | # do we stop because of tolerance? 56 | if (tolerance > 0): 57 | checkerr = 1 58 | 59 | # check command line parameters and parse them 60 | if (len(argv) < 2 or len(argv) > 3): 61 | print("Usage: cfd [reynolds]\n") 62 | return 0 63 | 64 | scalefactor = int(argv[0]) 65 | numiter = int(argv[1]) 66 | 67 | if len(argv) == 3: 68 | re = float(argv[2]) 69 | irrotational = 0 70 | else: 71 | re = -1.0 72 | 73 | if not checkerr: 74 | print("Scale Factor = {}, iterations = {}\n".format(scalefactor, numiter)) 75 | else: 76 | print("Scale Factor = {}, iterations = {}, tolerance= {}\n".format(scalefactor, numiter, tolerance)) 77 | 78 | if (irrotational): 79 | print("Irrotational flow\n") 80 | else: 81 | print("Reynolds number = {}\n".format(re)) 82 | 83 | # Set the parameters for boundary conditions 84 | #Calculate b, h & w and m & n 85 | b = bbase * scalefactor 86 | h = hbase * scalefactor 87 | w = wbase * scalefactor 88 | m = mbase * scalefactor 89 | n = nbase * scalefactor 90 | 91 | re = re / float(scalefactor) 92 | 93 | # Write the simulation details 94 | sys.stdout.write("\nRunning CFD on {0} x {1} grid in serial\n".format(m, n)) 95 | 96 | # allocate arrays 97 | nvtx.RangePush("Initialization") 98 | psi = np.zeros(((m + 2) * (n + 2)), dtype=np.float64) 99 | nvtx.RangePop() 100 | psitmp = np.zeros(psi.size, dtype=np.float64) 101 | 102 | if (not irrotational): 103 | # allocate arrays 104 | nvtx.RangePush("Initialization") 105 | zet = np.zeros(((m + 2) * (n + 2)), dtype=np.float64) 106 | nvtx.RangePop() 107 | zettmp = np.zeros(((m + 2) * (n + 2)), dtype=np.float64) 108 | 109 | nvtx.RangePush("Boundary_PSI") 110 | #set the psi boundary conditions 111 | psi = boundarypsi(psi, m, n, b, h, w) 112 | nvtx.RangePop() 113 | 114 | #compute normalisation factor for error 115 | bnorm = 0.0 116 | nvtx.RangePush("Compute_Normalization") 117 | for i in range(m + 2): 118 | for j in range(n + 2): 119 | bnorm += psi[i * (m + 2) + j] * psi[i * (m + 2) + j] 120 | nvtx.RangePop() 121 | # boundary set for zet 122 | if not irrotational: 123 | zet = boundaryzet(zet, psi, m, n) 124 | nvtx.RangePush("Compute_Normalization") 125 | for i in range(m + 2): 126 | for j in range(n + 2): 127 | bnorm += zet[i * (m + 2) + j] * zet[i * (m + 2) + j] 128 | nvtx.RangePop() 129 | bnorm = math.sqrt(bnorm) 130 | 131 | #begin iterative Jacobi loop 132 | print("\nStarting main loop...\n\n") 133 | tstart = time.time() 134 | 135 | nvtx.RangePush("Overall_Iteration") 136 | for iter in range(1, numiter+1): 137 | nvtx.RangePush("JacobiStep") 138 | if (irrotational): #calculate psi for next iteration 139 | psitmp = jacobistep(psitmp, psi, m, n) 140 | else: 141 | psitmp,zettmp = jacobistepvort(zettmp, psitmp, zet, psi, m, n, re) 142 | nvtx.RangePop() 143 | nvtx.RangePush("Calculate_Error") 144 | #calculate current error if required 145 | if checkerr or iter == numiter: 146 | error = deltasq(psitmp, psi, m, n) 147 | if not irrotational: 148 | error += deltasq(zettmp, zet, m, n) 149 | 150 | error = math.sqrt(error) 151 | error = error / bnorm 152 | nvtx.RangePop() 153 | #quit early if we have reached required tolerance 154 | if checkerr: 155 | if error < tolerance: 156 | print("Converged on iteration {0}\n".format(iter)) 157 | break 158 | #copy back 159 | nvtx.RangePush("Switch_Array") 160 | for i in range(1, m + 1): 161 | for j in range(1, n + 1): 162 | psi[i * (m + 2) + j] = psitmp[i * (m + 2) + j] 163 | 164 | if not irrotational: 165 | for i in range(1, m + 1): 166 | for j in range(1, n + 1): 167 | zet[i * (m + 2) + j] = zettmp[i * (m + 2) + j] 168 | nvtx.RangePop() 169 | if not irrotational: 170 | # update zeta BCs that depend on psi 171 | boundaryzet(zet, psi, m, n) 172 | 173 | # print loop information 174 | if iter % printfreq == 0: 175 | if not checkerr: 176 | print("Completed iteration {0}\n".format(iter)) 177 | else: 178 | print("Completed iteration {0}, error = {1}\n".format(iter, error)) 179 | nvtx.RangePop() 180 | if iter > numiter: 181 | iter=numiter 182 | tstop = time.time() 183 | ttot = tstop - tstart 184 | titer = ttot / float(iter) 185 | 186 | #print out some stats 187 | print("\n... finished\n") 188 | print("\nCalculation took {0:.5f}s\n\n".format(ttot)) 189 | print("After {0} iterations, the error is {1}\n".format(niter, error)) 190 | print("Time for {0} iterations was {1} seconds\n".format(niter, ttot)) 191 | print("Each iteration took {0} seconds\n".format(titer)) 192 | 193 | # Write the output files for subsequent visualisation 194 | nvtx.RangePush("output visualization") 195 | write_data(m, n, scalefactor, psi, "velocity.dat", "colourmap.dat") 196 | nvtx.RangePop() 197 | 198 | # Finish nicely 199 | sys.exit(0) 200 | 201 | 202 | def write_data(m, n, scale, psi, velfile, colfile): 203 | 204 | # Open the specified files 205 | velout = open(velfile, "w") 206 | velout.write("{0} {1}\n".format(m/scale, n/scale)) 207 | colout = open(colfile, "w") 208 | colout.write("{0} {1}\n".format(m, n)) 209 | 210 | # Loop over stream function array (excluding boundaries) 211 | for i in range(0, m): 212 | for j in range(0, n): 213 | 214 | # Compute velocities and magnitude 215 | ux = (psi[(i+1)*(m+2)+j+2]-psi[(i+1)*(m+2)+j])/2.0 216 | uy = -(psi[(i+2)*(m+2)+j+1]-psi[i*(m+2)+j+1])/2.0 217 | #umod = (ux**2 + uy**2) 218 | umod = (ux ** 2 + uy ** 2) ** 0.5 219 | 220 | # We are actually going to output a colour, in which 221 | # case it is useful to shift values towards a lighter 222 | # blue (for clarity) via the following kludge... 223 | hue = umod ** 0.6 224 | #hue = math.pow(umod, 0.4) 225 | colout.write("{0:5d} {1:5d} {2:10.5f}\n".format(i, j, hue)) 226 | 227 | # Only write velocity vectors every "scale" points 228 | if (i-1)%scale == (scale-1)/2 and (j-1)%scale == (scale-1)/2: 229 | velout.write("{0:5d} {1:5d} {2:10.5f} {3:10.5f}\n".format(i-1, j-1, ux, uy)) 230 | 231 | velout.close() 232 | colout.close() 233 | 234 | @jit() 235 | def jacobistep(psinew, psi, m, n): 236 | for i in range(1, m+1): 237 | for j in range(1, n+1): 238 | psinew[i * (m + 2) + j]=0.25 * (psi[(i-1) * (m+2)+j]+psi[(i+1) * (m+2)+j]+psi[i * (m+2)+j-1]+psi[i * (m+2)+j+1]) 239 | return psinew 240 | 241 | @jit() 242 | def jacobistepvort(zetnew, psinew,zet,psi,m,n,re): 243 | for i in range(1, m+1): 244 | for j in range(1, n+1): 245 | psinew[i * (m + 2) + j]=0.25 * (psi[(i-1) * (m+2)+j]+psi[(i+1) * (m+2)+j]+psi[i * (m+2)+j-1]+psi[i * (m+2)+j+1]- zet[i * (m+2)+j]) 246 | 247 | for i in range(1, m+1): 248 | for j in range(1, n+1): 249 | zetnew[i * (m + 2) + j] = 0.25 * (zet[(i - 1) * (m + 2) + j] + zet[(i + 1) * (m + 2) + j] + zet[i * (m + 2) + j - 1] + zet[i * (m + 2) + j + 1]) 250 | - re / 16.0 * ((psi[i * (m + 2) + j + 1] - psi[i * (m + 2) + j - 1]) * (zet[(i + 1) * (m + 2) + j] - zet[(i - 1) * (m + 2) + j]) 251 | - (psi[(i + 1) * (m + 2) + j] - psi[(i - 1) * (m + 2) + j]) * (zet[i * (m + 2) + j + 1] - zet[i * (m + 2) + j - 1])) 252 | 253 | return psinew, zetnew 254 | 255 | @jit() 256 | def deltasq (newarr, oldarr, m, n): 257 | dsq = 0.0 258 | for i in range(1, m+1): 259 | for j in range(1, n+1): 260 | tmp = newarr[i * (m + 2) + j] - oldarr[i * (m + 2) + j]; 261 | dsq += tmp * tmp 262 | 263 | return dsq 264 | 265 | @jit() 266 | def boundarypsi(psi,m,n,b,h,w): 267 | # Set the boundary conditions on bottom edge 268 | 269 | for i in range(b+1, b+w): 270 | psi[i*(m+2)+0] = float(i-b) 271 | 272 | for i in range(b + w, m + 1): 273 | psi[i*(m+2)+0] = float(w) 274 | 275 | # Set the boundary conditions on right edge 276 | for j in range(1, h + 1): 277 | psi[(m+1)*(m+2)+j] = float(w) 278 | 279 | for j in range(h + 1, h + w): 280 | psi[(m+1)*(m+2)+j] = float(w - j + h) 281 | 282 | return psi 283 | 284 | @jit() 285 | def boundaryzet(zet, psi, m, n): 286 | # set top/bottom BCs: 287 | for i in range(1, m + 1): 288 | zet[i * (m + 2) + 0] = 2.0 * (psi[i * (m + 2) + 1] - psi[i * (m + 2) + 0]) 289 | zet[i * (m + 2) + n + 1] = 2.0 * (psi[i * (m + 2) + n] - psi[i * (m + 2) + n + 1]) 290 | 291 | # set left BCs: 292 | for j in range(1, n + 1): 293 | zet[0 * (m + 2) + j] = 2.0 * (psi[1 * (m + 2) + j] - psi[0 * (m + 2) + j]) 294 | 295 | # set right BCs 296 | for j in range(1, n + 1): 297 | zet[(m + 1) * (m + 2) + j] = 2.0 * (psi[m * (m + 2) + j] - psi[(m + 1) * (m + 2) + j]) 298 | 299 | return zet 300 | 301 | if __name__ == "__main__": 302 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------