├── nways_cfd
    ├── English
    │   ├── C
    │   │   ├── source_code
    │   │   │   ├── cuda-c
    │   │   │   │   └── README
    │   │   │   ├── openmp
    │   │   │   │   └── README
    │   │   │   ├── stdpar
    │   │   │   │   └── README
    │   │   │   ├── openacc
    │   │   │   │   └── README
    │   │   │   └── serial
    │   │   │   │   ├── arraymalloc.h
    │   │   │   │   ├── boundary.h
    │   │   │   │   ├── cfdio.h
    │   │   │   │   ├── jacobi.h
    │   │   │   │   ├── arraymalloc.cpp
    │   │   │   │   ├── compare.py
    │   │   │   │   ├── boundary.cpp
    │   │   │   │   ├── jacobi.cpp
    │   │   │   │   ├── Makefile
    │   │   │   │   ├── cfdio.cpp
    │   │   │   │   └── cfd.cpp
    │   │   └── jupyter_notebook
    │   │   │   ├── images
    │   │   │       ├── cfd_flow.png
    │   │   │       └── Optimization_Cycle.jpg
    │   │   │   ├── minicfd_cudac.ipynb
    │   │   │   ├── .ipynb_checkpoints
    │   │   │       ├── minicfd_cudac-checkpoint.ipynb
    │   │   │       ├── minicfd_stdpar-checkpoint.ipynb
    │   │   │       ├── minicfd_openmp-checkpoint.ipynb
    │   │   │       ├── minicfd_openacc-checkpoint.ipynb
    │   │   │       └── minicfd-checkpoint.ipynb
    │   │   │   ├── minicfd_stdpar.ipynb
    │   │   │   ├── minicfd_openmp.ipynb
    │   │   │   ├── minicfd_openacc.ipynb
    │   │   │   └── minicfd.ipynb
    │   ├── Python
    │   │   ├── source_code
    │   │   │   ├── cupy
    │   │   │   │   └── README
    │   │   │   ├── numba
    │   │   │   │   └── README
    │   │   │   └── serial
    │   │   │   │   └── cfd.py
    │   │   └── jupyter_notebook
    │   │   │   ├── images
    │   │   │       ├── cfd_flow.png
    │   │   │       └── Optimization_Cycle.jpg
    │   │   │   ├── minicfd_cupy.ipynb
    │   │   │   ├── minicfd_numba.ipynb
    │   │   │   └── minicfd.ipynb
    │   ├── Fortran
    │   │   ├── source_code
    │   │   │   ├── openmp
    │   │   │   │   └── README
    │   │   │   ├── cudafortran
    │   │   │   │   └── README
    │   │   │   ├── doconcurrent
    │   │   │   │   └── README
    │   │   │   ├── openacc
    │   │   │   │   └── README
    │   │   │   └── serial
    │   │   │   │   ├── Makefile
    │   │   │   │   ├── boundary.f90
    │   │   │   │   ├── jacobi.f90
    │   │   │   │   ├── cfdio.f90
    │   │   │   │   └── cfd.f90
    │   │   └── jupyter_notebook
    │   │   │   ├── images
    │   │   │       ├── cfd_flow.png
    │   │   │       └── Optimization_Cycle.jpg
    │   │   │   ├── minicfd_cudafortran.ipynb
    │   │   │   ├── minicfd_do_concurrent.ipynb
    │   │   │   ├── minicfd_openmp.ipynb
    │   │   │   ├── minicfd_openacc.ipynb
    │   │   │   └── minicfd.ipynb
    │   └── minicfd.ipynb
    ├── Dockerfile
    ├── Singularity
    ├── Dockerfile_python
    ├── Singularity_python
    └── README.md
├── README.md
└── LICENSE


/nways_cfd/English/C/source_code/cuda-c/README:
--------------------------------------------------------------------------------
1 | CUDA C folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/openmp/README:
--------------------------------------------------------------------------------
1 | OpenMP folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/stdpar/README:
--------------------------------------------------------------------------------
1 | STDPAR folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/source_code/cupy/README:
--------------------------------------------------------------------------------
1 | CuPy folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/source_code/numba/README:
--------------------------------------------------------------------------------
1 | Numba folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/openacc/README:
--------------------------------------------------------------------------------
1 | OpenACC folder
2 | 
3 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/openmp/README:
--------------------------------------------------------------------------------
1 | OpenMP folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/cudafortran/README:
--------------------------------------------------------------------------------
1 | CUDA Fortran folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/doconcurrent/README:
--------------------------------------------------------------------------------
1 | STDPAR folder
2 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/openacc/README:
--------------------------------------------------------------------------------
1 | OpenACC folder
2 | 
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | This private repository consists of all challenges given as part of the HPC Bootcamp. 
3 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/arraymalloc.h:
--------------------------------------------------------------------------------
1 | #include <stddef.h>
2 | 
3 | void **arraymalloc2d(int nx, int ny, size_t typesize);
4 | 
5 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/boundary.h:
--------------------------------------------------------------------------------
1 | void boundarypsi(double *psi, int m, int n, int b, int h, int w);
2 | void boundaryzet(double *zet, double *psi, int m, int n);
3 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/images/cfd_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/C/jupyter_notebook/images/cfd_flow.png


--------------------------------------------------------------------------------
/nways_cfd/English/Python/jupyter_notebook/images/cfd_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Python/jupyter_notebook/images/cfd_flow.png


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/images/cfd_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Fortran/jupyter_notebook/images/cfd_flow.png


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/images/Optimization_Cycle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/C/jupyter_notebook/images/Optimization_Cycle.jpg


--------------------------------------------------------------------------------
/nways_cfd/English/Python/jupyter_notebook/images/Optimization_Cycle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Python/jupyter_notebook/images/Optimization_Cycle.jpg


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/images/Optimization_Cycle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/nways_accelerated_programming_challenge/master/nways_cfd/English/Fortran/jupyter_notebook/images/Optimization_Cycle.jpg


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/cfdio.h:
--------------------------------------------------------------------------------
 1 | void writedatafiles(double *psi, int m, int n, int scale);
 2 | 
 3 | void writeplotfile(int m, int n, int scale);
 4 | 
 5 | void hue2rgb(double hue, int *r, int *g, int *b);
 6 | 
 7 | double colfunc(double x);
 8 | 
 9 | double gettime(void);
10 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/jacobi.h:
--------------------------------------------------------------------------------
 1 | #include <nvtx3/nvToolsExt.h>
 2 | 
 3 | void jacobistep(double *psinew, double *psi, int m, int n);
 4 | 
 5 | void jacobistepvort(double *zetnew, double *psinew,
 6 | 		    double *zet,    double* psi,
 7 | 		    int m, int n, double re);
 8 | 
 9 | double deltasq(double *newarr, double *oldarr, int m, int n);
10 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/arraymalloc.cpp:
--------------------------------------------------------------------------------
 1 | #include "arraymalloc.h"
 2 | #include <stdlib.h>
 3 | 
 4 | void **arraymalloc2d(int nx, int ny, size_t typesize)
 5 | {
 6 |   int i;
 7 |   void **array2d;
 8 | 
 9 |   size_t mallocsize;
10 | 
11 |   // total memory requirements including pointers
12 | 
13 |   mallocsize = nx*sizeof(void *) + nx*ny*typesize;
14 | 
15 |   array2d = (void **) malloc(mallocsize);
16 | 
17 |   // set first pointer to first element of data
18 | 
19 |   array2d[0] = (void *) (array2d + nx);
20 | 
21 |   for(i=1; i < nx; i++)
22 |     {
23 |       // set other pointers to point at subsequent rows
24 | 
25 |       array2d[i] = (void *) (((char *) array2d[i-1]) + ny*typesize);
26 |     }
27 | 
28 |   return array2d;
29 | }
30 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/serial/Makefile:
--------------------------------------------------------------------------------
 1 | # System dependent definitions
 2 | 
 3 | FC=	nvfortran
 4 | FFLAGS=	-O3
 5 | LFLAGS= -lnvhpcwrapnvtx
 6 | 
 7 | # System independent definitions
 8 | 
 9 | MF=	Makefile
10 | 
11 | EXE=	cfd
12 | 
13 | SRC= \
14 | 	boundary.f90 \
15 | 	cfd.f90 \
16 | 	cfdio.f90 \
17 | 	jacobi.f90
18 | 
19 | #
20 | # No need to edit below this line
21 | #
22 | 
23 | .SUFFIXES:
24 | .SUFFIXES: .f90 .o
25 | 
26 | OBJ=	$(SRC:.f90=.o)
27 | 
28 | .f90.o:
29 | 	$(FC) $(FFLAGS) -c $<
30 | 
31 | all:	$(EXE)
32 | 
33 | $(EXE):	$(OBJ)
34 | 	$(FC) $(FFLAGS) -o $@ $(OBJ) $(LFLAGS)
35 | 
36 | $(OBJ):	$(MF)
37 | 
38 | cfd.o:	boundary.o jacobi.o cfdio.o
39 | 
40 | tar:
41 | 	tar cvf cfd.tar $(MF) $(INC) $(SRC)
42 | 
43 | clean:
44 | 	rm -f $(OBJ) $(EXE) *.mod velocity.dat colourmap.dat cfd.plt core
45 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/compare.py:
--------------------------------------------------------------------------------
 1 | # If you want to use this file to compare outputs, please put the output data in the correct locations. Otherwise, please compare the "error" value against the given slides.
 2 | # Make sure to copy the "velocity.dat" and rename it to "orig_velocity".
 3 | # To run: python3 compare.py
 4 | 
 5 | import numpy as np
 6 |  
 7 | # Original output data file
 8 | orig_file = "orig_velocity.dat"
 9 | new_file = "new_velocity.dat" 
10 | 
11 | orig_data = np.loadtxt(orig_file,delimiter = ' ')
12 | new_data = np.loadtxt(new_file,delimiter = ' ')
13 | 
14 | 
15 | diff_data = new_data - orig_data
16 | 
17 | print("shape of orig_data:",orig_data.shape)
18 | print("shape of new_data:",new_data.shape)
19 | print("shape of diff_data:",diff_data.shape)
20 | 
21 | maxError = np.amax(diff_data)
22 | 
23 | print("shape of maxError:",maxError.shape)
24 | 
25 | maxError_exp = "{:e}".format(maxError)
26 |  
27 | print('Max Error is : ', maxError_exp)
28 | 


--------------------------------------------------------------------------------
/nways_cfd/Dockerfile:
--------------------------------------------------------------------------------
# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 

# Container image for the labs copied from English/: NVIDIA HPC SDK base image plus JupyterLab.
# To build the docker container, run from the directory holding this file: $  sudo docker build -f Dockerfile -t nways:cf .
# To run: $ sudo docker run --rm -it --runtime nvidia -p 8888:8888 nways:cf
# Finally, open http://localhost:8888/

FROM nvcr.io/nvidia/nvhpc:24.1-devel-cuda_multi-ubuntu22.04

# Install the OS packages, drop the apt package lists, then install the
# Python packages (numpy, jupyterlab, ipywidgets, gdown) in the same layer.
RUN apt-get -y update && \
        DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \
        rm -rf /var/lib/apt/lists/* && \
        pip3 install --upgrade pip &&\
        pip3 install numpy &&\
        pip3 install jupyterlab &&\
        pip3 install ipywidgets &&\
        pip3 install gdown

############################################

# TO COPY the data
COPY English/ /labs/

#################################################
# NOTE(review): nothing in this file creates /opt/anaconda3/bin - confirm the
# base image provides it or whether the entry is a leftover.
ENV PATH="/usr/local/bin:/opt/anaconda3/bin:/usr/bin:$PATH"
#################################################

WORKDIR /labs
# Start JupyterLab on port 8888, listening on all interfaces, with an empty
# access token, serving /labs.
CMD jupyter-lab --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs
29 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/boundary.cpp:
--------------------------------------------------------------------------------
 1 | #include "boundary.h"
 2 | #include <stdio.h>
 3 | 
 4 | //grid is parallelised in the x direction
 5 | 
 6 | void boundarypsi(double *psi, int m, int n, int b, int h, int w)
 7 | {
 8 | 
 9 |   int i,j;
10 | 
11 |   //BCs on bottom edge
12 | 
13 |   for (i=b+1;i<=b+w-1;i++)
14 |     {
15 |       psi[i*(m+2)+0] = (double)(i-b);
16 |     }
17 | 
18 |   for (i=b+w;i<=m;i++)
19 |     {
20 |       psi[i*(m+2)+0] = (double)(w);
21 |     }
22 | 
23 |   //BCS on RHS
24 | 
25 |   for (j=1; j <= h; j++)
26 |     {
27 |       psi[(m+1)*(m+2)+j] = (double) w;
28 |     }
29 | 
30 |   for (j=h+1;j<=h+w-1; j++)
31 |     {
32 |       psi[(m+1)*(m+2)+j]=(double)(w-j+h);
33 |     }
34 | }
35 | 
/*
 * Set the boundary values of zet from the current psi.
 *
 * Both arrays are addressed as a[i*(m+2)+j].  Every boundary entry becomes
 * twice the difference between the psi value one step inside the grid and
 * the psi value on the boundary itself:
 *   - j = 0 and j = n+1 for rows i = 1 .. m,
 *   - i = 0 and i = m+1 for columns j = 1 .. n.
 * The four corner entries and the interior of zet are left unchanged.
 *
 * NOTE(review): the row pitch is m+2 whatever n is, which looks like it
 * assumes m == n - confirm against the allocation in the caller.
 */
void boundaryzet(double *zet, double *psi, int m, int n)
{
  const int pitch   = m + 2;          // distance between consecutive i rows
  const int lastrow = (m + 1)*pitch;  // offset of row i = m+1

  // set top/bottom BCs: the first and last column of every interior row

  for (int row = 1; row <= m; ++row)
    {
      const int first = row*pitch;      // element (row, 0)
      const int last  = first + n + 1;  // element (row, n+1)

      zet[first] = 2.0*(psi[first + 1] - psi[first]);
      zet[last]  = 2.0*(psi[last - 1]  - psi[last]);
    }

  // set left BCs: row 0, using row 1 as the inner neighbour

  for (int col = 1; col <= n; ++col)
    {
      zet[col] = 2.0*(psi[pitch + col] - psi[col]);
    }

  // set right BCs: row m+1, using row m as the inner neighbour

  for (int col = 1; col <= n; ++col)
    {
      zet[lastrow + col] = 2.0*(psi[lastrow - pitch + col] - psi[lastrow + col]);
    }
}
62 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/serial/boundary.f90:
--------------------------------------------------------------------------------
! Boundary-condition routines for the stream function (psi) and the
! vorticity-like field (zet).  Both fields are declared with bounds
! (0:m+1, 0:n+1).
module boundary

  implicit none

contains

! Write the fixed boundary values of psi.
!
!   psi      field to update, bounds (0:m+1, 0:n+1)
!   m, n     extents used for the declared bounds of psi
!   b, h, w  integers that position and size the two written segments:
!            psi(b+1:b+w-1, 0) receives the ramp 1 .. w-1,
!            psi(b+w:m,     0) receives w,
!            psi(m+1, 1:h)         receives w,
!            psi(m+1, h+1:h+w-1)   receives the ramp w-1 .. 1.
! No other element of psi is modified.
subroutine boundarypsi(psi, m, n, b, h, w)

  integer :: m, n, b, h, w
  
  double precision, dimension(0:m+1, 0:n+1) :: psi

  integer :: i, j

!  Set the boundary conditions on the bottom edge

  do i = b+1, b+w-1
     psi(i, 0) = float(i-b)
  end do

  do i = b+w, m
     psi(i, 0) = float(w)
  end do

  !  Set the boundary conditions on the right hand side

  do j = 1, h

     psi(m+1,j) = float(w)
     
  end do

  do j = h+1, h+w-1

     psi(m+1,j) = float(w-j+h)

  end do

end subroutine boundarypsi

! Set the boundary values of zet from the current psi.
!
! Each boundary entry becomes twice the difference between the psi value
! one step inside the grid and the psi value on the boundary itself.  The
! edges i = 0 and i = m+1 are set for j = 1 .. n, and the edges j = 0 and
! j = n+1 for i = 1 .. m; corner entries are not written.
subroutine boundaryzet(zet, psi, m, n)

  integer :: m, n
  
  double precision, dimension(0:m+1, 0:n+1) :: zet, psi

  integer :: i, j

! Set the zeta boundary conditions which depend on psi

  ! edges i = 0 and i = m+1
  do j = 1, n

     zet(0,  j) = 2.0*(psi(1,j) - psi(0,  j))
     zet(m+1,j) = 2.0*(psi(m,j) - psi(m+1,j))

  end do

  ! edge j = 0
  do i = 1, m
     zet(i,0) = 2.0*(psi(i,  1)-psi(i,0))
  end do

  ! edge j = n+1
  do i = 1, m
     zet(i,n+1) = 2.0*(psi(i,n)-psi(i,n+1))
  end do

end subroutine boundaryzet

end module boundary
69 | 


--------------------------------------------------------------------------------
/nways_cfd/Singularity:
--------------------------------------------------------------------------------
# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 

# Container definition for the labs copied from English/: NVIDIA HPC SDK base image plus JupyterLab.
# To build the singularity container, run from the directory holding this file: $  singularity build --fakeroot nways_c.simg Singularity
# To copy the content of the container: $ singularity run nways_c.simg cp -rT /labs ~/labs
# To run: $ singularity run --nv nways_c.simg jupyter-lab --notebook-dir=~/labs
# Finally, open http://localhost:8888/

Bootstrap: docker
FROM: nvcr.io/nvidia/nvhpc:24.1-devel-cuda_multi-ubuntu22.04

%environment
    # XDG_RUNTIME_DIR is exported empty.
    export XDG_RUNTIME_DIR=
    # NOTE(review): nothing in this file creates /opt/anaconda3/bin - confirm
    # the base image provides it or whether the entry is a leftover.
    export PATH="/usr/local/bin:/opt/anaconda3/bin:/usr/bin:$PATH"

%post
    # Work in a throw-away directory that is removed at the end of %post.
    build_tmp=$(mktemp -d) && cd ${build_tmp}

    apt-get -y update
    apt-get -y dist-upgrade 
    DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
	    m4 vim-nox emacs-nox nano zip\
    	python3-pip python3-setuptools git-core inotify-tools \
	    curl git-lfs \
	    build-essential libtbb-dev
    # NOTE(review): apt normally keeps its data under /var/cache/apt and
    # /var/lib/apt/lists, so this path is presumably empty and the command a
    # no-op - confirm which directory was meant.
    rm -rf /var/lib/apt/cache/* 

    # Python packages used by the notebooks.
    pip3 install --upgrade pip
    pip3 install numpy
    pip3 install jupyterlab
    pip3 install ipywidgets
    pip3 install gdown

    # NOTE(review): nothing is installed after this refresh - confirm it is
    # still needed.
    apt-get update -y 

    cd /
    rm -rf ${build_tmp}

%files
    English/ /labs

%runscript
    # Run whatever command line was passed to "singularity run".
    "$@"

%labels
    AUTHOR mozhgank


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/jacobi.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #include "jacobi.h"
 4 | 
 5 | void jacobistep(double *psinew, double *psi, int m, int n)
 6 | {
 7 |   int i, j;
 8 |   
 9 | 
10 |   for(i=1;i<=m;i++)
11 |     {
12 |       for(j=1;j<=n;j++)
13 | 	{
14 | 	  psinew[i*(m+2)+j]=0.25*(psi[(i-1)*(m+2)+j]+psi[(i+1)*(m+2)+j]+psi[i*(m+2)+j-1]+psi[i*(m+2)+j+1]);
15 |         }
16 |     }
17 |   
18 | }
19 | 
/*
 * One Jacobi sweep for the coupled psi / zet system.
 *
 * For every interior point (i = 1 .. m, j = 1 .. n):
 *   psinew = 0.25*(sum of the four psi neighbours - zet at the point)
 *   zetnew = 0.25*(sum of the four zet neighbours)
 *            - re/16 * ( dpsi_j*dzet_i - dpsi_i*dzet_j )
 * where dX_i and dX_j are the differences of X between the two neighbours
 * in the i and j directions.  psi and zet are only read; the psinew sweep
 * is completed before the zetnew sweep starts.  All arrays are addressed
 * as a[i*(m+2)+j].
 *
 * NOTE(review): the row pitch is m+2 whatever n is, which looks like it
 * assumes m == n - confirm against the allocation in the caller.
 */
void jacobistepvort(double *zetnew, double *psinew,
		    double *zet, double *psi,
		    int m, int n, double re)
{
  const int pitch = m + 2;  // distance between consecutive i rows

  // first sweep: psi update

  for (int row = 1; row <= m; ++row)
    {
      const int rowstart = row*pitch;

      for (int col = 1; col <= n; ++col)
        {
          const int c = rowstart + col;

          psinew[c] = 0.25*(  psi[c - pitch] + psi[c + pitch] + psi[c - 1] + psi[c + 1]
                            - zet[c] );
        }
    }

  // second sweep: zet update

  for (int row = 1; row <= m; ++row)
    {
      const int rowstart = row*pitch;

      for (int col = 1; col <= n; ++col)
        {
          const int c = rowstart + col;

          zetnew[c] = 0.25*(zet[c - pitch] + zet[c + pitch] + zet[c - 1] + zet[c + 1])
            - re/16.0*(
                       (  psi[c + 1]     - psi[c - 1])    *(zet[c + pitch] - zet[c - pitch])
                       - (psi[c + pitch] - psi[c - pitch])*(zet[c + 1]     - zet[c - 1])
                       );
        }
    }
}
47 | 
/*
 * Sum of squared differences between two fields over the interior points.
 *
 * Returns the sum, for i = 1 .. m and j = 1 .. n, of
 * (newarr[i*(m+2)+j] - oldarr[i*(m+2)+j])^2, accumulated row by row in
 * increasing j.  Neither array is modified.
 *
 * NOTE(review): the row pitch is m+2 whatever n is, which looks like it
 * assumes m == n - confirm against the allocation in the caller.
 */
double deltasq(double *newarr, double *oldarr, int m, int n)
{
  const int pitch = m + 2;  // distance between consecutive i rows

  double total = 0.0;

  for (int row = 1; row <= m; ++row)
    {
      const int rowstart = row*pitch;

      for (int col = 1; col <= n; ++col)
        {
          const double diff = newarr[rowstart + col] - oldarr[rowstart + col];

          total += diff*diff;
        }
    }

  return total;
}
66 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | # HINT 1) comment out the CFLAGS line when you are compiling the parallel code as we no longer need to add the path to the location of nvtx 
 4 | 
 5 | # HINT 2) For the CUDA C version, remember to add -arch=native compiler option to use the default architecture on the system you are running.
 6 | 
 7 | # HINT 3) For the CUDA C version, make sure the SUFFIXES and CC are correct  
 8 | 
 9 | CC=	nvc++
10 | CFLAGS := -O3 -w -ldl
11 | ACCFLAGS := -Minfo=accel
12 | VER=$(shell nvc -dumpversion)
13 | NVARCH=$(shell uname -s)_$(shell uname -m)
14 | CFLAGS= -I/opt/nvidia/hpc_sdk/$(NVARCH)/$(VER)/cuda/include
15 | LFLAGS=
16 | 
17 | 
18 | 
19 | # System independent definitions
20 | 
21 | MF=	Makefile
22 | 
23 | EXE=	cfd
24 | 
25 | INC= \
26 | 	arraymalloc.h \
27 | 	boundary.h \
28 | 	cfdio.h \
29 | 	jacobi.h
30 | 
31 | # Replace/Add .cu files to the below and make the change for the rest of the locations.
32 | SRC= \
33 | 	arraymalloc.cpp \
34 | 	boundary.cpp \
35 | 	cfd.cpp \
36 | 	cfdio.cpp \
37 | 	jacobi.cpp
38 | 
39 | #
40 | # Make the necessary changes (hint: replace .cpp with .cu so the "make clean" does not remove them)
41 | #
42 | 
43 | .SUFFIXES:
44 | .SUFFIXES: .cpp .o
45 | 
46 | OBJ=	$(SRC:.cpp=.o)
47 | 
48 | .cpp.o:
49 | 	$(CC) $(CFLAGS) -c $< 
50 | 
51 | all:	$(EXE)
52 | 
53 | $(OBJ):	$(INC)
54 | 
55 | $(EXE):	$(OBJ)
56 | 	$(CC) $(CFLAGS) -o $@ $(OBJ) $(LFLAGS) 
57 | 
58 | $(OBJ):	$(MF)
59 | 
60 | tar:
61 | 	tar cvf cfd.tar $(MF) $(INC) $(SRC)
62 | 
63 | clean:
64 | 	rm -f $(OBJ) $(EXE) velocity.dat colourmap.dat cfd.plt core
65 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/serial/jacobi.f90:
--------------------------------------------------------------------------------
! Jacobi iteration kernels.  Every field is declared with bounds
! (0:m+1, 0:n+1); the routines read the full arrays but write only the
! interior section (1:m, 1:n).
module jacobi

  implicit none

contains

! One Jacobi sweep for psi alone: each interior point of psinew receives
! one quarter of the sum of its four psi neighbours.  psi is only read.
subroutine jacobistep(psinew, psi, m, n)

  integer :: m, n
  double precision, dimension(0:m+1, 0:n+1) :: psinew, psi

  psinew(1:m, 1:n) = 0.25d0*(psi(2:m+1, 1:n) + psi(0:m-1, 1:n) + &
                             psi(1:m, 2:n+1) + psi(1:m, 0:n-1)     )

end subroutine jacobistep

! One Jacobi sweep for the coupled psi / zet system.
!
!   psinew = 0.25*(sum of the four psi neighbours - zet)
!   zetnew = 0.25*(sum of the four zet neighbours)
!            - re/16 * ( dpsi_j*dzet_i - dpsi_i*dzet_j )
!
! where dX_i and dX_j are the differences of X between the two neighbours
! along the first and second index.  psi and zet are only read.
subroutine jacobistepvort(zetnew, psinew, zet, psi, m, n, re)

  integer :: m, n
  double precision :: re 
  double precision, dimension(0:m+1, 0:n+1) :: zetnew, zet, psinew, psi

  psinew(1:m, 1:n) = 0.25d0*(psi(2:m+1, 1:n) + psi(0:m-1, 1:n) + &
                             psi(1:m, 2:n+1) + psi(1:m, 0:n-1) - &
                             zet(1:m,   1:n))

  zetnew(1:m, 1:n) = 0.25d0*(zet(2:m+1, 1:n) + zet(0:m-1, 1:n) +     &
                             zet(1:m, 2:n+1) + zet(1:m, 0:n-1)   ) - &
                   re/16.0*((psi(1:m, 2:n+1) - psi(1:m, 0:n-1)) *    &
                            (zet(2:m+1, 1:n) - zet(0:m-1, 1:n)) -    &
                            (psi(2:m+1, 1:n) - psi(0:m-1, 1:n)) *    &
                            (zet(1:m, 2:n+1) - zet(1:m, 0:n-1))  )

end subroutine jacobistepvort

! Return the sum over the interior section (1:m, 1:n) of (new - old)**2.
double precision function deltasq(new, old, m, n)

  integer :: m, n
  double precision, dimension(0:m+1, 0:n+1) :: new, old

  ! NOTE(review): ierr is declared but never used in this function.
  integer :: ierr

  deltasq =   sum((new(1:m,1:n)-old(1:m,1:n))**2)

end function deltasq

end module jacobi
48 |                                     
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/nways_cfd/Dockerfile_python:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 3 | 
 4 | # To build the docker container, run: $ sudo docker build -t nways-labs:latest .
 5 | # To run: $ sudo docker run --rm -it --runtime nvidia -p 8888:8888 nways-labs:latest
 6 | # Finally, open http://localhost:8888/
 7 | 
 8 | #FROM nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 9 | FROM nvidia/cuda:11.4.2-devel-ubuntu20.04
10 | 
11 | #####
12 | # Read https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772
13 | RUN apt-key del 7fa2af80
14 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
15 | #####
16 | 
17 | RUN apt-get -y update && \
18 |         DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
19 |         python3-dev \  
20 |         python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \
21 |         rm -rf /var/lib/apt/lists/*
22 | 
23 | RUN pip3 install --no-cache-dir -U install setuptools pip
24 | RUN pip3 install gdown
25 | RUN apt-get update -y
26 | RUN apt-get install -y git nvidia-modprobe
27 | # Install required python packages
28 | RUN pip3 install jupyterlab
29 | RUN pip3 install ipywidgets
30 | RUN pip3 install --upgrade numpy==1.21.1
31 | RUN pip3 install --no-cache-dir "cupy-cuda114==10.3.1" \
32 |     numba==0.53.1 scipy  
33 | 
34 | 
35 | ############################################
36 | # NVIDIA nsight-systems-cli-2022.1.1, nsight-compute-2022.1.1
37 | RUN apt-get update -y && \
38 |         DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
39 |         apt-transport-https \
40 |         ca-certificates \
41 |         gnupg \
42 |         wget && \
43 |         #apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 && \
44 |         wget -qO - https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/nvidia.pub | apt-key add - \
45 |         echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list &&\
46 |         apt-get update -y
47 | 
48 | 
49 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-cli-2022.1.1 nsight-compute-2022.1.1
50 | 
51 | # TO COPY the data
52 | COPY English/ /labs/
53 | 
54 | 
55 | #################################################
56 | ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib/python3.8/dist-packages:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
57 | ENV PATH="/opt/nvidia/nsight-systems/2022.1.1/bin:/opt/nvidia/nsight-compute/2022.1.1:/usr/local/bin:/bin:/usr/local/cuda/bin:/usr/bin${PATH:+:${PATH}}"
58 | 
59 | 
60 | WORKDIR /labs
61 | CMD service nginx start && jupyter-lab --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs
62 | 


--------------------------------------------------------------------------------
/nways_cfd/Singularity_python:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | Bootstrap: docker
 4 | #FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 5 | FROM:  nvidia/cuda:11.4.2-devel-ubuntu20.04
 6 | 
 7 | %environment
 8 |     export XDG_RUNTIME_DIR=
 9 |     export PATH="$PATH:/usr/local/bin:/usr/bin"
10 |     export PATH=/opt/nvidia/nsight-systems/2022.1.1/bin:/opt/nvidia/nsight-compute/2022.1.1:/bin:/usr/local/cuda/bin$PATH
11 |     export LD_LIBRARY_PATH="/usr/include/python3.8:/usr/local/lib:/usr/local/lib/python3.8/dist-packages:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
12 | 
13 | 
14 | %post
15 |     build_tmp=$(mktemp -d) && cd ${build_tmp}
16 | 
17 | #####
18 | # Read https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212772
19 |     apt-key del 7fa2af80
20 |     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
21 | #####
22 | 
23 |     apt-get -y update
24 |     apt-get -y dist-upgrade 
25 |     DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-dev \
26 |         m4 vim-nox emacs-nox nano zip \
27 |         python3-pip python3-setuptools nginx zip make build-essential libtbb-dev
28 |     rm -rf /var/lib/apt/cache/* 
29 |     pip3 install --no-cache-dir -U install setuptools pip
30 |     apt-get -y update
31 |     apt-get -y install git nvidia-modprobe
32 |     pip3 install 'chardet>=3.0.2,<3.1.0' 'idna>=2.5,<2.8' 'urllib3>=1.21.1,<1.24' 'certifi>=2017.4.17'
33 |     pip3 install jupyterlab
34 |     pip3 install ipywidgets
35 |     pip3 install gdown
36 |     pip3 install --upgrade numpy==1.21.1
37 |     pip3 install --no-cache-dir "cupy-cuda114==10.3.1" \
38 |     numba==0.53.1 scipy
39 | 
40 | 
41 |     apt-get install --no-install-recommends -y build-essential 
42 | 
43 | 
44 | # NVIDIA nsight-systems-cli-2022.1.1, nsight-compute-2022.1.1
45 |     apt-get update -y   
46 |     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget
47 |    # apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80
48 |     wget -qO - https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/nvidia.pub | apt-key add -
49 |     echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list 
50 |     apt-get update -y 
51 |     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-cli-2022.1.1 nsight-compute-2022.1.1
52 |     #rm -rf /var/lib/apt/lists/*
53 | 
54 | 
55 |     apt-get install --no-install-recommends -y build-essential
56 | 
57 | 
58 |     cd /
59 |     rm -rf ${build_tmp}
60 | 
61 | %files
62 |     English/ /labs
63 | %runscript
64 |     "$@"
65 | 
66 | %labels
67 |     AUTHOR Tosin
68 | 


--------------------------------------------------------------------------------
/nways_cfd/English/minicfd.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "## CFD Simulation\n",
 8 |     "In this bootcamp we will accelerate a simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n",
 9 |     "\n",
10 |     "### Learning objectives\n",
11 |     "Learn how to write a portable parallel program that can run on multicore CPUs and accelerators like GPUs using OpenACC, OpenMP, std::par, CuPy, and Numba. Also learn how to optimize using lower-level constructs in languages like CUDA C. \n",
12 |     "\n",
13 |     "### Bootcamp Duration\n",
14 |     "The lab material is a challenge where the participants will accelerate the application using multiple approaches to GPU programming.\n",
15 |     "\n",
16 |     "### Content Level\n",
17 |     "Beginner, Intermediate\n",
18 |     "\n",
19 |     "### Target Audience and Prerequisites\n",
20 |     " The target audience for this tutorial is researchers, graduate students and developers who are interested in\n",
21 |     "harnessing the power of GPUs to accelerate their scientific applications, and evaluate which programming approach best suits their needs. \n",
22 |     "\n",
23 |     "\n",
24 |     "### Start Here\n",
25 |     "You can choose any of the following:\n",
26 |     "\n",
27 |     "- [C-based code](C/jupyter_notebook/minicfd.ipynb)\n",
28 |     "- [Fortran-version](Fortran/jupyter_notebook/minicfd.ipynb)\n",
29 |     "- [Python-based](Python/jupyter_notebook/minicfd.ipynb)\n",
30 |     "\n",
31 |     "\n",
32 |     "--- \n",
33 |     "\n",
34 |     "## Links and Resources\n",
35 |     "\n",
36 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
37 |     "\n",
38 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
39 |     "\n",
40 |     "--- \n",
41 |     "\n",
42 |     "## Licensing \n",
43 |     "\n",
44 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
45 |    ]
46 |   }
47 |  ],
48 |  "metadata": {
49 |   "anaconda-cloud": {},
50 |   "kernelspec": {
51 |    "display_name": "Python 3",
52 |    "language": "python",
53 |    "name": "python3"
54 |   },
55 |   "language_info": {
56 |    "codemirror_mode": {
57 |     "name": "ipython",
58 |     "version": 3
59 |    },
60 |    "file_extension": ".py",
61 |    "mimetype": "text/x-python",
62 |    "name": "python",
63 |    "nbconvert_exporter": "python",
64 |    "pygments_lexer": "ipython3",
65 |    "version": "3.7.4"
66 |   }
67 |  },
68 |  "nbformat": 4,
69 |  "nbformat_minor": 4
70 | }
71 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/cfdio.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <math.h>
  4 | 
  5 | #include "cfdio.h"
  6 | #include "arraymalloc.h"
  7 | 
  8 | void writedatafiles(double *psi, int m, int n, int scale)
  9 | {
 10 |   typedef double Vecvel[2];
 11 |   typedef int    Vecrgb[3];
 12 | 
 13 |   Vecvel **vel;
 14 |   Vecrgb **rgb;
 15 | 
 16 |   FILE *cfile, *vfile;
 17 | 
 18 |   double modvsq, hue;
 19 |   int i,j, ix, iy;
 20 |   int nvel, nrgb;
 21 | 
 22 |   printf("\n\nWriting data files ...\n");
 23 | 
 24 |   vel = (Vecvel **) arraymalloc2d(m,n,sizeof(Vecvel));
 25 |   rgb = (Vecrgb **) arraymalloc2d(m,n,sizeof(Vecrgb));
 26 | 
 27 |   //calculate velocities and hues
 28 | 
 29 |   double v1, v2;
 30 | 
 31 |   for (i=0;i<m;i++)
 32 |     {
 33 |       for (j=0;j<n;j++)
 34 | 	{
 35 | 	  vel[i][j][0] =  (psi[(i+1)*(m+2)+j+2]-psi[(i+1)*(m+2)+j])/2.0;
 36 | 	  vel[i][j][1] = -(psi[(i+2)*(m+2)+j+1]-psi[i*(m+2)+j+1])/2.0;
 37 | 
 38 | 	  v1 = vel[i][j][0];
 39 | 	  v2=  vel[i][j][1];
 40 | 
 41 | 	  modvsq = v1*v1 + v2*v2;
 42 | 
 43 | 	  hue = pow(modvsq,0.4);
 44 | 
 45 | 	  hue2rgb(hue,&(rgb[i][j][0]),&(rgb[i][j][1]),&(rgb[i][j][2]));
 46 | 	}
 47 |     }
 48 | 
 49 |   //write data
 50 | 
 51 |   cfile=fopen("colourmap.dat","w");
 52 |   vfile=fopen("velocity.dat","w");
 53 | 
 54 |   for (i=0;i<m;i++)
 55 |     {
 56 |       ix = i+1;
 57 | 
 58 |       for (j=0;j<n;j++)
 59 | 	{
 60 | 	  iy = j+1;
 61 | 
 62 | 	  fprintf(cfile,"%i %i %i %i %i\n", ix, iy,
 63 | 		  rgb[i][j][0], rgb[i][j][1],rgb[i][j][2]);
 64 | 
 65 | 	  if ((ix-1)%scale == (scale-1)/2 &&
 66 | 	      (iy-1)%scale == (scale-1)/2    )
 67 | 	    {
 68 | 	      fprintf(vfile,"%i %i %f %f\n",
 69 | 		      ix,iy,vel[i][j][0],vel[i][j][1]);
 70 | 	    }
 71 | 	}
 72 |     }
 73 | 
 74 |   fclose(vfile);
 75 |   fclose(cfile);
 76 | 
 77 |   free(rgb);
 78 |   free(vel);
 79 | 
 80 |   printf("... done!\n");
 81 | }
 82 | 
 83 | void writeplotfile(int m, int n, int scale)
 84 | {
 85 |   FILE *gnuplot;
 86 | 
 87 |   gnuplot = fopen("cfd.plt","w");
 88 | 
 89 |   fprintf(gnuplot,"set terminal pngcairo\n");
 90 |   fprintf(gnuplot,"set output 'cfd_output.png'\n");
 91 |   fprintf(gnuplot,"set size square\n");
 92 |   fprintf(gnuplot,"set key off\n");
 93 |   fprintf(gnuplot,"unset xtics\n");
 94 |   fprintf(gnuplot,"unset ytics\n");
 95 | 
 96 |   fprintf(gnuplot,"set xrange [%i:%i]\n",1-scale,m+scale);
 97 |   fprintf(gnuplot,"set yrange [%i:%i]\n",1-scale,n+scale);
 98 | 
 99 |   fprintf(gnuplot,"plot \"colourmap.dat\" w rgbimage, \"velocity.dat\" u 1:2:(%d*0.75*$3/sqrt($3**2+$4**2)):(%d*0.75*$4/sqrt($3**2+$4**2)) with vectors  lc rgb \"#7F7F7F\"",scale,scale);
100 | 
101 |   fclose(gnuplot);
102 | 
103 |   printf("\nWritten gnuplot script 'cfd.plt'\n");
104 | }
105 | 
106 | 
107 | void hue2rgb(double hue, int *r, int *g, int *b)
108 | {
109 |   int rgbmax = 255;
110 | 
111 |   *r = (int)(rgbmax*colfunc(hue-1.0));
112 |   *g = (int)(rgbmax*colfunc(hue-0.5));
113 |   *b = (int)(rgbmax*colfunc(hue    ));
114 | }
115 | 
116 | 
/*
 * Symmetric colour ramp in |x|:
 *   |x| <  0.2          -> 1.0
 *   0.2 <= |x| <= 0.5   -> 1 - ((|x|-0.2)/(0.5-0.2))^2
 *   |x| >  0.5          -> 0.0
 */
double colfunc(double x)
{
  const double inner = 0.2;
  const double outer = 0.5;
  const double dist  = fabs(x);

  if (dist > outer)
    {
      return 0.0;
    }

  if (dist < inner)
    {
      return 1.0;
    }

  // quadratic fall-off between the inner and outer radius
  const double t = (dist-inner)/(outer-inner);
  return 1.0-pow(t,2);
}
139 | 
140 | 
141 | #include <sys/time.h>
142 | 
/* Wall-clock time in seconds, read via gettimeofday() (microsecond fields). */

double gettime(void)
{
  struct timeval now;

  gettimeofday (&now, NULL);

  const double micros = now.tv_usec/(double)1.0e6;
  return now.tv_sec + micros;
}
151 | 


--------------------------------------------------------------------------------
/nways_cfd/README.md:
--------------------------------------------------------------------------------
 1 | ## Application:
 2 | 
 3 | # CFD
 4 | 
 5 | Simple 2D regular-grid CFD simulation for teaching parallel scaling concepts
 6 | 
 7 | This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) or non-viscous (no Reynolds
 8 | number specified and no vortices in the flow).
 9 | 
10 | It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.
11 | 
12 | To build the application, just run `make`. This will produce a binary named `cfd`. To run the application, just run the executable.
13 | 
14 | ## Checking Output:
15 | 
16 | ## Prerequisites:
17 | 
18 | To run this tutorial you will need a machine with NVIDIA GPU (**Tested on NVIDIA driver 525.105.17**)
19 | 
20 | - Install [Docker](https://docs.docker.com/get-docker/) or [Singularity](https://sylabs.io/docs/).
21 | - Install Nvidia toolkit, [Nsight Systems (latest version)](https://developer.nvidia.com/nsight-systems).
22 | 
23 | ## Creating containers
24 | 
25 | To start with, you will have to build a Docker or Singularity container.
26 | 
27 | **NOTE: Please build the container on the machine that you are planning to run the container on**.
28 | 
29 | ### Docker Container
30 | 
31 | To build a docker container for **C & Fortran**, run:
32 | 
33 | `sudo docker build -t <imagename>:<tagnumber> .`
34 | 
35 | For instance:
36 | 
37 | `sudo docker build -t myimage:1.0 .`
38 | 
39 | While in the case of **Python**, you have to specify the dockerfile name using flag **"-f"**, therefore run:
40 | 
41 | `sudo docker build -f <dockerfile name> -t <imagename>:<tagnumber> .`
42 | 
43 | For example :
44 | 
45 | `sudo docker build -f Dockerfile_python -t myimage:1.0 .`
46 | 
47 | For C, Fortran, and Python, the code labs have been written using Jupyter labs and a Dockerfile has been built to simplify deployment. In order to serve the docker instance for a student, it is necessary to expose port 8888 from the container, for instance, the following command would expose port 8888 inside the container as port 8888 on the lab machine:
48 | 
49 | `sudo docker run --rm -it --gpus=all -p 8888:8888 myimage:1.0`
50 | 
51 | When this command is run, you can browse to the serving machine on port 8888 using any web browser to access the labs. For instance, if the labs are running on the local machine, the web browser should be pointed to http://localhost:8888. The `--gpus` flag is used to enable `all` NVIDIA GPUs during container runtime. The `--rm` flag is used to clean up any temporary images created while the container is running. The `-it` flag enables killing the jupyter server with `ctrl-c`. This command may be customized for your hosting environment.
52 | 
53 | Then, inside the container launch the Jupyter notebook assigning the port you opened:
54 | 
55 | `jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root`
56 | 
57 | Once inside the container, open the jupyter notebook in browser: http://localhost:8888, and start the lab by clicking on the `minicfd.ipynb` notebook.
58 | 
59 | ### Singularity Container
60 | 
61 | To build the singularity container for **C & Fortran**, run:
62 | 
63 | `singularity build minicfd.simg Singularity`
64 | 
65 | While in the case of **Python**, run:
66 | 
67 | `singularity build minicfd.simg Singularity_python`
68 | 
69 | Thereafter, for C, Fortran, and Python, copy the files to your local machine to make sure changes are stored locally:
70 | 
71 | `singularity run minicfd.simg cp -rT /labs ~/labs`
72 | 
73 | Then, run the container:
74 | 
75 | `singularity run --nv minicfd.simg jupyter-lab --notebook-dir=~/labs`
76 | 
77 | Once inside the container, open the jupyter notebook in browser: http://localhost:8888, and start the lab by clicking on the `minicfd.ipynb` notebook.
78 | 
79 | ## Questions?
80 | 
81 | Please join [OpenACC Slack Channel](https://openacclang.slack.com/messages/openaccusergroup) for questions.
82 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/serial/cfdio.f90:
--------------------------------------------------------------------------------
  1 | module cfdio
  2 | 
  3 |   implicit none
  4 | 
  5 | contains
  6 | 
  7 | subroutine writedatafiles(psi, m, n, scale)
  8 | 
  9 |   integer :: m, n, scale
 10 |   double precision ::  psi(0:m+1, 0:n+1)
 11 | 
 12 |   double precision, allocatable :: vel(:,:,:)
 13 |   integer, allocatable :: rgb(:,:,:)
 14 | 
 15 |   double precision :: modvsq, hue
 16 |   integer :: i, j, k
 17 | 
 18 |   integer, parameter :: iounitvel = 10, iounitcol = 11
 19 | 
 20 | ! Compute local velocities and colours
 21 | 
 22 |   allocate(rgb(3,m,n))
 23 |   allocate(vel(2,m,n))
 24 | 
 25 |   do i = 1, m
 26 |      do j = 1, n
 27 | 
 28 |         vel(1,i,j) =   (psi(i,j+1)-psi(i,j-1)) / 2.0
 29 |         vel(2,i,j) = - (psi(i+1,j)-psi(i-1,j)) / 2.0
 30 | 
 31 |         modvsq = vel(1,i,j)**2 + vel(2,i,j)**2
 32 |         hue = modvsq**0.4
 33 | 
 34 |         call hue2rgb(hue, rgb(1,i,j), rgb(2,i,j), rgb(3,i,j))
 35 | 
 36 |      end do
 37 |   end do
 38 | 
 39 | !  Write out
 40 | 
 41 |   open(unit=iounitcol, file='colourmap.dat', form='formatted')
 42 |   open(unit=iounitvel, file='velocity.dat',  form='formatted')
 43 | 
 44 |   do j = 1, n
 45 |      do i = 1, m
 46 | 
 47 | !  Write colour map of velocity magnitude at every point
 48 | 
 49 |         write(iounitcol,fmt='(i4,1x,i4,1x,i3,1x,i3,1x,i3)') &
 50 |               i, j, rgb(1,i,j), rgb(2,i,j), rgb(3,i,j)
 51 | 
 52 | !  Only write velocity vectors every "scale" points
 53 |            
 54 |         if (mod(i-1,scale) == (scale-1)/2 .and. &
 55 |             mod(j-1,scale) == (scale-1)/2         ) then
 56 | 
 57 |            write(iounitvel,fmt='(i4,1x,i4,1x,g12.5,1x,g12.5)') &
 58 |                  i, j, vel(1,i,j), vel(2,i,j)
 59 |         end if
 60 |         
 61 |      end do
 62 |   end do
 63 | 
 64 |   close(unit=iounitcol)
 65 |   close(unit=iounitvel)
 66 | 
 67 | end subroutine writedatafiles
 68 | 
 69 | 
 70 | subroutine writeplotfile(m, n, scale)
 71 | 
 72 |   integer :: m, n, scale
 73 |   integer, parameter :: iounit = 10
 74 | 
 75 |   open(unit=iounit, file='cfd.plt', form='formatted')
 76 | 
 77 |   write(iounit,*) 'set size square'
 78 |   write(iounit,*) 'set key off'
 79 |   write(iounit,*) 'unset xtics'
 80 |   write(iounit,*) 'unset ytics'
 81 | 
 82 |   write(iounit,fmt='('' set xrange ['',i4,'':'',i4, '']'')') 1-scale, m+scale
 83 |   write(iounit,fmt='('' set yrange ['',i4,'':'',i4, '']'')') 1-scale, n+scale
 84 | 
 85 |   write(iounit,fmt='('' plot "colourmap.dat" w rgbimage, "velocity.dat" u 1:2:&
 86 |        &('',i2,''*0.75*$3/sqrt($3**2+$4**2)):&
 87 |        &('',i2,''*0.75*$4/sqrt($3**2+$4**2)) &
 88 |        &with vectors  lc rgb "#7F7F7F"'')') scale, scale
 89 | 
 90 |   close(unit=iounit)
 91 | 
 92 | end subroutine writeplotfile
 93 | 
 94 | 
 95 | subroutine hue2rgb(hue, r, g, b)
 96 | 
 97 |   double precision :: hue
 98 | 
 99 |   integer :: r, g, b
100 |   integer, parameter :: rgbmax = 255
101 | 
102 |   r = rgbmax*colfunc(hue-1.0)
103 |   g = rgbmax*colfunc(hue-0.5)
104 |   b = rgbmax*colfunc(hue    )
105 | 
106 | end subroutine hue2rgb
107 | 
108 | 
109 | double precision function colfunc(x)
110 | 
111 |   double precision :: x, absx, val
112 | 
113 |   double precision, parameter :: x1 = 0.2, x2 = 0.5
114 | 
115 |   absx = abs(x)
116 | 
117 |   if (absx .gt. x2) then
118 |      val = 0.0
119 |   else if (absx .lt. x1) then
120 |      val = 1.0
121 |   else
122 |      val = 1.0 - ((absx-x1)/(x2-x1))**2
123 |   end if
124 | 
125 |   colfunc = val
126 |       
127 | end function colfunc
128 | 
129 | double precision function gettime()
130 | 
131 |   logical, save :: firstcall = .true.
132 | 
133 |   integer, parameter :: int32kind = selected_int_kind( 9)
134 |   integer, parameter :: int64kind = selected_int_kind(18)
135 | 
136 |   integer, parameter :: intkind = int64kind
137 | 
138 |   integer(kind = intkind) :: count,rate
139 | 
140 |   double precision, save :: ticktime
141 | 
142 |   if (firstcall) then
143 | 
144 |      firstcall = .false.
145 | 
146 |      call system_clock(count, rate)
147 | 
148 |      ticktime = 1.0d0/dble(rate)
149 |      gettime  = dble(count)*ticktime
150 | 
151 | !     write(*,*) 'Clock resolution is ', ticktime*1.0e6, ', usecs'
152 | 
153 |   else
154 | 
155 |      call system_clock(count)
156 | 
157 |      gettime = dble(count)*ticktime
158 | 
159 |   end if
160 | 
161 | end function gettime
162 | 
163 | end module cfdio
164 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/jupyter_notebook/minicfd_cupy.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# CuPy Acceleration \n",
  9 |     "\n",
 10 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "!nvidia-smi"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "attachments": {},
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "## Copy the Serial code\n",
 28 |     "\n",
 29 |     "Before you start modifying the serial code, let's copy it to the cupy folder by running the cell below."
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "!cp ../source_code/serial/* ../source_code/cupy"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "attachments": {},
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Run the Serial code"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "%run ../source_code/cupy/cfd.py 64 500"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "attachments": {},
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "---\n",
 64 |     "\n",
 65 |     "# Start Adding CuPy Constructs\n",
 66 |     "\n",
 67 |     "Now, you can start modifying the Python code:\n",
 68 |     "\n",
 69 |     "[cfd.py](../source_code/cupy/cfd.py)\n",
 70 |     "\n",
 71 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 72 |     "\n",
 73 |     "#### Some Hints\n",
 74 |     "The serial code consists of the `main, jacobi, and write_data` functions. Focus more on the jacobi and main functions. Remember to import the cupy library as: ```import cupy as cp ``` at the top of your code. Check if there is any data race in your code.\n",
 75 |     "\n",
 76 |     "##  Run and Profile the CuPy code\n",
 77 |     " "
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "!cd ../source_code/cupy && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfdcupy_profile python3 cfd.py 64 500"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "attachments": {},
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems.\n",
 95 |     "\n",
 96 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/cupy/minicfdcupy_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
 97 |     "\n",
 98 |     "## Validating the Output\n",
 99 |     "\n",
100 |     "Make sure the error value printed as output matches that of the serial code\n",
101 |     "\n",
102 |     "\n",
103 |     "# Recommendations for adding CuPy Constructs\n",
104 |     "\n",
105 |     "After finding the hotspot function take an incremental approach: \n",
106 |     "\n",
107 |     "1) Add `@cp.fuse()` decorator at the top of the function or rewrite the function as a raw kernel(this is rather tedious)\n",
108 |     "\n",
109 |     "2) Ignore the I/O function\n",
110 |     "\n",
111 |     "3) Ensure that only required data moves from `host (CPU function)` to `device (GPU function)` and vice versa\n",
112 |     "\n",
113 |     "4) Cross check the output after incremental changes to check algorithmic scalability\n",
114 |     "\n",
115 |     "5) Start with a small problem size that reduces the execution time. \n",
116 |     "\n",
117 |     "\n",
118 |     "**General tip:** Be aware of *data race* situations, in which at least two threads access a shared variable at the same time and at least one thread tries to modify the variable. If a data race occurs, an incorrect result will be returned. So, make sure to validate your output against the serial version.\n",
119 |     "\n",
120 |     "\n",
121 |     "# Links and Resources\n",
122 |     "\n",
123 |     "[Introduction to CuPy](https://github.com/gpuhackathons-org/gpubootcamp/blob/master/hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_guide.ipynb)\n",
124 |     "\n",
125 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
126 |     "\n",
127 |     "[NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n",
128 |     "\n",
129 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
130 |     "\n",
131 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
132 |     "\n",
133 |     "---\n",
134 |     "## Licensing \n",
135 |     "\n",
136 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
137 |    ]
138 |   }
139 |  ],
140 |  "metadata": {
141 |   "kernelspec": {
142 |    "display_name": "Python 3",
143 |    "language": "python",
144 |    "name": "python3"
145 |   },
146 |   "language_info": {
147 |    "codemirror_mode": {
148 |     "name": "ipython",
149 |     "version": 3
150 |    },
151 |    "file_extension": ".py",
152 |    "mimetype": "text/x-python",
153 |    "name": "python",
154 |    "nbconvert_exporter": "python",
155 |    "pygments_lexer": "ipython3",
156 |    "version": "3.7.4"
157 |   }
158 |  },
159 |  "nbformat": 4,
160 |  "nbformat_minor": 4
161 | }
162 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/jupyter_notebook/minicfd_numba.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Numba Acceleration \n",
  9 |     "\n",
 10 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "!nvidia-smi"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "attachments": {},
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "## Copy the Serial code\n",
 28 |     "\n",
 29 |     "Before you start modifying the serial code, let's copy it to the numba folder by running the cell below."
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "!cp ../source_code/serial/* ../source_code/numba"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "attachments": {},
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Run the Serial code"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "%run ../source_code/numba/cfd.py 64 500"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "attachments": {},
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "---\n",
 64 |     "\n",
 65 |     "# Start Adding Numba Constructs\n",
 66 |     "\n",
 67 |     "Now, you can start modifying the Python code: \n",
 68 |     "\n",
 69 |     "[cfd.py](../source_code/numba/cfd.py)\n",
 70 |     "\n",
 71 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 72 |     "\n",
 73 |     "#### Some Hints\n",
 74 |     "The serial code consists of the `main, jacobi, and write_data` functions. Focus more on the jacobi and main functions. Remember to import Numba's CUDA module as: ```from numba import cuda ``` at the top of your code. Check if there is any data race in your code.\n",
 75 |     "\n",
 76 |     "##  Run and Profile the Numba code"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "!cd ../source_code/numba && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfdnumba_profile python3 cfd.py 64 500"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "attachments": {},
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems.\n",
 94 |     "\n",
 95 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/numba/minicfdnumba_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
 96 |     "\n",
 97 |     "\n",
 98 |     "## Validating the Output\n",
 99 |     "\n",
100 |     "Make sure the error value printed as output matches that of the serial code\n",
101 |     "\n",
102 |     "\n",
103 |     "# Recommendations for adding Numba Constructs\n",
104 |     "\n",
105 |     "After finding the hotspot function take an incremental approach: \n",
106 |     "\n",
107 |     "1) Add `@cuda.jit()` decorator at the top of the function or rewrite the function as a raw kernel(this is rather tedious)\n",
108 |     "\n",
109 |     "2) You may need to perform a copy-swap data in a different kernel function\n",
110 |     "\n",
111 |     "3) Ignore the I/O function\n",
112 |     "\n",
113 |     "4) Ensure that only required data moves from `host (CPU function)` to `device (GPU function)` and vice versa\n",
114 |     "\n",
115 |     "5) Cross check the output after incremental changes to check algorithmic scalability\n",
116 |     "\n",
117 |     "6) Start with a small problem size that reduces the execution time. \n",
118 |     "\n",
119 |     "\n",
120 |     "**General tip:** Be aware of *data race* situations, in which at least two threads access a shared variable at the same time and at least one thread tries to modify the variable. If a data race occurs, an incorrect result will be returned. So, make sure to validate your output against the serial version.\n",
121 |     "\n",
122 |     "\n",
123 |     "# Links and Resources\n",
124 |     "\n",
125 |     "[Introduction to Numba](https://github.com/gpuhackathons-org/gpubootcamp/tree/master/hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_guide.ipynb)\n",
126 |     "\n",
127 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
128 |     "\n",
129 |     "[NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n",
130 |     "\n",
131 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
132 |     "\n",
133 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
134 |     "\n",
135 |     "\n",
136 |     "---\n",
137 |     "## Licensing \n",
138 |     "\n",
139 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": []
148 |   }
149 |  ],
150 |  "metadata": {
151 |   "kernelspec": {
152 |    "display_name": "Python 3",
153 |    "language": "python",
154 |    "name": "python3"
155 |   },
156 |   "language_info": {
157 |    "codemirror_mode": {
158 |     "name": "ipython",
159 |     "version": 3
160 |    },
161 |    "file_extension": ".py",
162 |    "mimetype": "text/x-python",
163 |    "name": "python",
164 |    "nbconvert_exporter": "python",
165 |    "pygments_lexer": "ipython3",
166 |    "version": "3.7.4"
167 |   }
168 |  },
169 |  "nbformat": 4,
170 |  "nbformat_minor": 4
171 | }
172 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/source_code/serial/cfd.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <math.h>
  4 | 
  5 | #include "arraymalloc.h"
  6 | #include "boundary.h"
  7 | #include "jacobi.h"
  8 | #include "cfdio.h"
  9 | 
 10 | int main(int argc, char **argv)
 11 | {
 12 |   int printfreq=1000; //output frequency
 13 |   double error, bnorm;
 14 |   double tolerance=0.0; //tolerance for convergence. <=0 means do not check
 15 | 
 16 |   //main arrays
 17 |   double *psi, *zet;
 18 |   //temporary versions of main arrays
 19 |   double *psitmp, *zettmp;
 20 | 
 21 |   //command line arguments
 22 |   int scalefactor, numiter;
 23 | 
 24 |   double re; // Reynold's number - must be less than 3.7
 25 | 
 26 |   //simulation sizes
 27 |   int bbase=10;
 28 |   int hbase=15;
 29 |   int wbase=5;
 30 |   int mbase=32;
 31 |   int nbase=32;
 32 | 
 33 |   int irrotational = 1, checkerr = 0;
 34 | 
 35 |   int m,n,b,h,w;
 36 |   int iter;
 37 |   int i,j;
 38 | 
 39 |   double tstart, tstop, ttot, titer;
 40 | 
 41 |   //do we stop because of tolerance?
 42 |   if (tolerance > 0) {checkerr=1;}
 43 | 
 44 |   //check command line parameters and parse them
 45 | 
 46 |   if (argc <3|| argc >4)
 47 |     {
 48 |       printf("Usage: cfd <scale> <numiter> [reynolds]\n");
 49 |       return 0;
 50 |     }
 51 | 
 52 |   scalefactor=atoi(argv[1]);
 53 |   numiter=atoi(argv[2]);
 54 | 
 55 |   if (argc == 4)
 56 |     {
 57 |       re=atof(argv[3]);
 58 |       irrotational=0;
 59 |     }
 60 |   else
 61 |     {
 62 |       re=-1.0;
 63 |     }
 64 | 
 65 |   if(!checkerr)
 66 |     {
 67 |       printf("Scale Factor = %i, iterations = %i\n",scalefactor, numiter);
 68 |     }
 69 |   else
 70 |     {
 71 |       printf("Scale Factor = %i, iterations = %i, tolerance= %g\n",scalefactor,numiter,tolerance);
 72 |     }
 73 | 
 74 |   if (irrotational)
 75 |     {
 76 |       printf("Irrotational flow\n");
 77 |     }
 78 |   else
 79 |     {
 80 |       printf("Reynolds number = %f\n",re);
 81 |     }
 82 | 
 83 |   //Calculate b, h & w and m & n
 84 |   b = bbase*scalefactor;
 85 |   h = hbase*scalefactor;
 86 |   w = wbase*scalefactor;
 87 |   m = mbase*scalefactor;
 88 |   n = nbase*scalefactor;
 89 | 
 90 |   re = re / (double)scalefactor;
 91 | 
 92 |   printf("Running CFD on %d x %d grid in serial\n",m,n);
 93 | 
 94 |   //allocate arrays
 95 | 
 96 |   psi    = (double *) malloc((m+2)*(n+2)*sizeof(double));
 97 |   psitmp = (double *) malloc((m+2)*(n+2)*sizeof(double));
 98 | 
 99 |     nvtxRangePush("Initialization");
100 |   //zero the psi array
101 |   for (i=0;i<m+2;i++)
102 |     {
103 |       for(j=0;j<n+2;j++)
104 | 	{
105 | 	  psi[i*(m+2)+j]=0.0;
106 | 	}
107 |     }
108 |     nvtxRangePop(); //pop 
109 | 
110 |   if (!irrotational)
111 |     {
112 |       //allocate arrays
113 | 
114 |       zet =   (double *) malloc((m+2)*(n+2)*sizeof(double));
115 |       zettmp =(double *) malloc((m+2)*(n+2)*sizeof(double));
116 | 
117 |       //zero the zeta array
118 |       nvtxRangePush("Initialization");
119 | 
120 |       for (i=0;i<m+2;i++)
121 | 	{
122 | 	  for(j=0;j<n+2;j++)
123 | 	    {
124 | 	      zet[i*(m+2)+j]=0.0;
125 | 	    }
126 | 	}
127 |        nvtxRangePop(); //pop for REading file
128 |     }
129 |   
130 |   //set the psi boundary conditions
131 |     nvtxRangePush("Boundary_PSI");
132 | 
133 |   boundarypsi(psi,m,n,b,h,w);
134 |     nvtxRangePop(); //pop 
135 | 
136 |   //compute normalisation factor for error
137 | 
138 |   bnorm=0.0;
139 |     nvtxRangePush("Compute_Normalization");
140 | 
141 |   for (i=0;i<m+2;i++)
142 |     {
143 |       for (j=0;j<n+2;j++)
144 | 	{
145 | 	  bnorm += psi[i*(m+2)+j]*psi[i*(m+2)+j];
146 | 	}
147 |     }
148 |      nvtxRangePop(); //pop 
149 | 
150 |   if (!irrotational)
151 |     {
152 |       //update zeta BCs that depend on psi
153 |       boundaryzet(zet,psi,m,n);
154 | 
155 |       //update normalisation
156 |       nvtxRangePush("Compute_Normalization");
157 |       for (i=0;i<m+2;i++)
158 | 	{
159 | 	  for (j=0;j<n+2;j++)
160 | 	    {
161 | 	      bnorm += zet[i*(m+2)+j]*zet[i*(m+2)+j];
162 | 	    }
163 | 	}
164 |        nvtxRangePop(); //pop 
165 |     }
166 | 
167 |   bnorm=sqrt(bnorm);
168 | 
169 |   //begin iterative Jacobi loop
170 | 
171 |   printf("\nStarting main loop...\n\n");
172 |   
173 |   tstart=gettime();
174 |    nvtxRangePush("Overall_Iteration");
175 | 
176 |   for(iter=1;iter<=numiter;iter++)
177 |     {
178 |       //calculate psi for next iteration
179 |       nvtxRangePush("JacobiStep");
180 |       if (irrotational)
181 | 	{
182 | 	  jacobistep(psitmp,psi,m,n);
183 | 	}
184 |       else
185 | 	{
186 | 	  jacobistepvort(zettmp,psitmp,zet,psi,m,n,re);
187 | 	}
188 |       nvtxRangePop(); //pop 
189 |       nvtxRangePush("Calculate_Error");
190 |       //calculate current error if required
191 |        
192 |       if (checkerr || iter == numiter)
193 | 	{
194 | 	  error = deltasq(psitmp,psi,m,n);
195 | 
196 | 	  if(!irrotational)
197 | 	    {
198 | 	      error += deltasq(zettmp,zet,m,n);
199 | 	    }
200 | 
201 | 	  error=sqrt(error);
202 | 	  error=error/bnorm;
203 | 	}
204 |       nvtxRangePop(); //pop 
205 | 
206 |       //quit early if we have reached required tolerance
207 | 
208 |       if (checkerr)
209 | 	{
210 | 	  if (error < tolerance)
211 | 	    {
212 | 	      printf("Converged on iteration %d\n",iter);
213 | 	      break;
214 | 	    }
215 | 	}
216 | 
217 |       //copy back
218 |       nvtxRangePush("Switch_Array");
219 | 
220 |       for(i=1;i<=m;i++)
221 | 	{
222 | 	  for(j=1;j<=n;j++)
223 | 	    {
224 | 	      psi[i*(m+2)+j]=psitmp[i*(m+2)+j];
225 | 	    }
226 | 	}
227 | 
228 |       if (!irrotational)
229 | 	{
230 | 	  for(i=1;i<=m;i++)
231 | 	    {
232 | 	      for(j=1;j<=n;j++)
233 | 		{
234 | 		  zet[i*(m+2)+j]=zettmp[i*(m+2)+j];
235 | 		}
236 | 	    }
237 | 	}
238 |       nvtxRangePop(); //pop 
239 | 
240 |       if (!irrotational)
241 | 	{
242 | 	  //update zeta BCs that depend on psi
243 | 	  boundaryzet(zet,psi,m,n);
244 | 	}
245 | 
246 |       //print loop information
247 | 
248 |       if(iter%printfreq == 0)
249 | 	{
250 | 	  if (!checkerr)
251 | 	    {
252 | 	      printf("Completed iteration %d\n",iter);
253 | 	    }
254 | 	  else
255 | 	    {
256 | 	      printf("Completed iteration %d, error = %g\n",iter,error);
257 | 	    }
258 | 	}
259 |      
260 |     }
261 |      nvtxRangePop(); //pop 
262 | 
263 |   if (iter > numiter) iter=numiter;
264 | 
265 |   tstop=gettime();
266 | 
267 |   ttot=tstop-tstart;
268 |   titer=ttot/(double)iter;
269 | 
270 | 
271 |   //print out some stats
272 | 
273 |   printf("\n... finished\n");
274 |   printf("After %d iterations, the error is %g\n",iter,error);
275 |   printf("Time for %d iterations was %g seconds\n",iter,ttot);
276 |   printf("Each iteration took %g seconds\n",titer);
277 | 
278 |   //output results
279 | 
280 |   writedatafiles(psi,m,n, scalefactor);
281 | 
282 |   writeplotfile(m,n,scalefactor);
283 | 
284 |   //free un-needed arrays
285 |   free(psi);
286 |   free(psitmp);
287 | 
288 |   if (!irrotational)
289 |     {
290 |       free(zet);
291 |       free(zettmp);
292 |     }
293 | 
294 |   printf("... finished\n");
295 | 
296 |   return 0;
297 | }
298 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/source_code/serial/cfd.f90:
--------------------------------------------------------------------------------
  1 | program cfd
  2 | ! Serial driver: runs up to numiter Jacobi steps on psi (and zet if a Reynolds number is given), then writes results.
  3 |   use boundary
  4 |   use jacobi
  5 |   use cfdio
  6 |   use nvtx
  7 | 
  8 |   implicit none
  9 | 
 10 | ! Output frequency: progress is printed every printfreq iterations
 11 |   
 12 |   integer, parameter :: printfreq = 1000
 13 | 
 14 | ! Variables associated with convergence: relative error and the factor bnorm it is divided by
 15 | 
 16 |   double precision :: error, bnorm
 17 | 
 18 | ! Set tolerance for convergence; zero or negative means do not check
 19 | 
 20 |   double precision, parameter :: tolerance = 0.0d0
 21 | 
 22 | ! Main arrays (the tmp arrays receive the result of each Jacobi step)
 23 | 
 24 |   double precision, allocatable ::  psi(:,:), zet(:,:)
 25 |   double precision, allocatable ::  psitmp(:,:), zettmp(:,:)
 26 | 
 27 | ! Command-line arguments
 28 | 
 29 |   integer :: scalefactor,  numiter
 30 | 
 31 |   double precision :: re  ! re = 3.7 seems to be stability limit with Jacobi
 32 | 
 33 |   integer, parameter :: maxline = 32
 34 |   character(len=maxline) :: tmparg
 35 | 
 36 | !  Basic sizes of simulation; each is multiplied by scalefactor below
 37 | 
 38 |   integer, parameter :: bbase = 10
 39 |   integer, parameter :: hbase = 15
 40 |   integer, parameter :: wbase =  5
 41 |   integer, parameter :: mbase = 32
 42 |   integer, parameter :: nbase = 32
 43 | ! Run-mode flags: irrotational is cleared when a third argument is given; checkerr is set when tolerance > 0
 44 |   logical :: irrotational = .true., checkerr = .false.
 45 | 
 46 | !  Some auxiliary parameters and variables
 47 | 
 48 |   integer :: m, n, b, h, w
 49 |   integer :: iter
 50 | ! Timing variables; NOTE(review): modvsq and hue are declared but never referenced in this program unit
 51 |   double precision :: tstart, tstop, ttot, titer, modvsq, hue
 52 | 
 53 | !  Are we stopping based on tolerance?
 54 | 
 55 |   if (tolerance .gt. 0.0) checkerr = .true.
 56 | 
 57 | !  Read in parameters: <scale> <numiter> [reynolds]
 58 | 
 59 |   if (command_argument_count() /= 2 .and. command_argument_count() /= 3) then
 60 | 
 61 |      write(*,*) 'Usage: cfd <scale> <numiter> [reynolds]'
 62 |      stop
 63 | 
 64 |   end if
 65 | 
 66 |   call get_command_argument(1, tmparg)
 67 |   read(tmparg,*) scalefactor
 68 |   call get_command_argument(2, tmparg)
 69 |   read(tmparg,*) numiter
 70 | 
 71 |   if (command_argument_count() == 3) then
 72 | ! A third argument supplies re and switches to the jacobistepvort path
 73 |      irrotational = .false.
 74 |      call get_command_argument(3, tmparg)
 75 |      read(tmparg,*) re
 76 |         
 77 |   else
 78 | ! Placeholder: re is only printed and passed to jacobistepvort when not irrotational
 79 |      re = -1.0
 80 |      
 81 |   end if
 82 | 
 83 |   if (.not. checkerr) then
 84 |      write(*,fmt='('' Scale factor = '',i3,'', iterations = '', i6)') &
 85 |            scalefactor, numiter
 86 |   else
 87 |      write(*,fmt='('' Scale factor = '',i3,'', iterations = '', i6, &
 88 |           &'', tolerance = '', g11.4)') scalefactor, numiter, tolerance
 89 |   end if
 90 | 
 91 |   if (irrotational) then
 92 |         
 93 |      write(*,*) 'Irrotational flow'
 94 |         
 95 |   else
 96 | 
 97 |      write(*,fmt='('' Reynolds number = '', f6.3)') re
 98 |         
 99 |   end if
100 | 
101 | !  Calculate b, h & w and m & n
102 |         
103 |   b = bbase*scalefactor 
104 |   h = hbase*scalefactor
105 |   w = wbase*scalefactor 
106 |   m = mbase*scalefactor
107 |   n = nbase*scalefactor
108 | ! Divide the Reynolds number by the scale factor (done after the value was printed above)
109 |   re = re / dble(scalefactor)
110 | 
111 |   write(*,fmt='('' Running CFD on '', i4, '' x '', i4, &
112 |        &'' grid in serial '')') m, n
113 | 
114 | !  Allocate arrays, including halos on psi and tmp (index range 0:m+1, 0:n+1)
115 | 
116 |   allocate(psi(0:m+1, 0:n+1))
117 |   allocate(zet(0:m+1, 0:n+1))
118 | 
119 |   allocate(psitmp(0:m+1, 0:n+1))
120 | ! zet is always allocated (it is zeroed below); zettmp only when not irrotational
121 |   if (.not. irrotational) then
122 | 
123 |      allocate(zettmp(0:m+1, 0:n+1))
124 | 
125 |   end if
126 | 
127 | !  Zero the psi and zet arrays
128 |   call nvtxStartRange("Initialization")
129 |   psi(:,:) = 0.0
130 |   zet(:,:) = 0.0
131 |   call nvtxEndRange
132 | 
133 | !  Set the psi boundary conditions which are constant
134 | 
135 |    call nvtxStartRange("boundaryPSI")
136 |    call boundarypsi(psi, m, n, b, h, w)
137 |    call nvtxEndRange
138 | 
139 | !  Compute normalisation factor for error (sum of squares over the whole array, halos included)
140 | 
141 |    bnorm = sum(psi(:,:)**2)
142 | 
143 |    if (.not. irrotational) then
144 | 
145 | !    Update the zeta boundary conditions which depend on psi
146 | 
147 |      call boundaryzet(zet, psi, m, n)
148 | 
149 | !    Update the normalisation
150 | 
151 |      bnorm = bnorm + sum(zet(:,:)**2)
152 | 
153 |   end if
154 | 
155 |    bnorm = sqrt(bnorm)
156 | 
157 | !  Begin iterative Jacobi loop
158 | 
159 |    write(*,*)
160 |    write(*,*) 'Starting main loop ...'
161 |    write(*,*)
162 | 
163 |    tstart = gettime()
164 | 
165 |   call nvtxStartRange("Overall Iteration")
166 |   do iter = 1, numiter
167 | 
168 | !  Compute the new psi based on the old one
169 | 
170 |      call nvtxStartRange("Jacobi Step")
171 |      if (irrotational) then
172 | 
173 | !  Call function with no vorticity
174 |         call jacobistep(psitmp, psi, m, n)
175 | 
176 |      else
177 | 
178 | !  Call function containing vorticity
179 | 
180 |         call jacobistepvort(zettmp, psitmp, zet, psi, m, n, re)
181 | 
182 |      end if
183 |      call nvtxEndRange
184 | 
185 | !  Compute current error value if required (every iteration when checkerr, otherwise only on the last)
186 |      
187 |      call nvtxStartRange("Calculate Error")
188 |      if (checkerr .or. iter == numiter) then
189 | 
190 |         error = deltasq(psitmp, psi, m, n)
191 | 
192 |         if (.not. irrotational) then
193 | 
194 |            error = error + deltasq(zettmp, zet, m, n)
195 | 
196 |         end if
197 | 
198 |         error = sqrt(error)
199 |         
200 |         error = error / bnorm
201 | 
202 |      end if
203 |      call nvtxEndRange
204 | 
205 | !  Quit early if we have reached required tolerance
206 | 
207 |      if (checkerr) then
208 |         if (error .lt. tolerance) then
209 |            write(*,*) 'CONVERGED iteration ', iter, ': terminating'
210 |            exit
211 |         end if
212 |      end if
213 | 
214 | !  Copy back the interior points 1:m, 1:n (the halo rows and columns are not copied)
215 | 
216 |      call nvtxStartRange("Switch Array")
217 |      psi(1:m, 1:n) = psitmp(1:m, 1:n)
218 | 
219 |      if (.not. irrotational) then
220 | 
221 |         zet(1:m, 1:n) = zettmp(1:m, 1:n)
222 | 
223 |      end if
224 |      call nvtxEndRange
225 | 
226 |      if (.not. irrotational) then
227 | 
228 | !    Update the zeta boundary conditions which depend on psi
229 | 
230 |         call boundaryzet(zet, psi, m, n)
231 |         
232 |      end if
233 | 
234 | !  Print loop information every printfreq iterations
235 | 
236 |      if (mod(iter,printfreq) == 0) then
237 | 
238 |         if (.not. checkerr) then
239 |            write(*,*) 'completed iteration ', iter
240 |         else
241 |            write(*,*) 'completed iteration ', iter, ', error = ', error
242 |         end if
243 | 
244 |      end if
245 | !  End iterative Jacobi loop
246 |   end do
247 |   call nvtxEndRange
248 | ! A loop that runs to completion leaves iter at numiter+1; clamp it so the statistics use the real count
249 |   if (iter .gt. numiter) iter = numiter
250 | ! NOTE(review): if numiter < 1 the loop never runs, so error is unset and iter is 0 (titer divides by zero)
251 |   tstop = gettime()
252 | 
253 |   ttot  = tstop-tstart
254 |   titer = ttot/dble(iter)
255 | 
256 |   write(*,*) 
257 |   write(*,*) '... finished'
258 |   write(*,*)
259 |   write(*,fmt='('' After    '', i6, '' iterations, error is '', g11.4)') &
260 |         iter, error
261 |   write(*,fmt='('' Time for '', i6, '' iterations was '',&
262 |         &g11.4, '' seconds'')') iter, ttot
263 |   write(*,fmt='('' Each individual iteration took '', g11.4, '' seconds'')') &
264 |         titer
265 |   write(*,*)
266 |   write(*,*) 'Writing output file ...'
267 | 
268 | !  Output results
269 | 
270 |   call writedatafiles(psi, m, n, scalefactor)
271 | 
272 | !  Output gnuplot file
273 | 
274 |   call writeplotfile(m, n, scalefactor)
275 | 
276 | ! Finish
277 | 
278 |   write(*,*) ' ... finished'
279 |   write(*,*)
280 |   write(*,*) 'CFD completed'
281 |   write(*,*)
282 | 
283 | end program cfd
284 | 
285 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/minicfd_cudac.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# CUDA C Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Copy and Compile the Serial code\n",
 25 |     "\n",
 26 |     "Before you start modifying the serial code, let's make a copy of it in the `cuda-c` directory."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "!cp ../source_code/serial/* ../source_code/cuda-c"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "!cd ../source_code/cuda-c && make clean && make"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "## Run the Serial code"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/cuda-c && ./cfd 64 500"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "---\n",
 68 |     "\n",
 69 |     "# Start adding CUDA C constructs"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 77 |     "\n",
 78 |     "[cfd code](../source_code/cuda-c/cfd.cpp) \n",
 79 |     "\n",
 80 |     "[Makefile](../source_code/cuda-c/Makefile)\n",
 81 |     "\n",
 82 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 83 |     "\n",
 84 |     "#### Some Hints\n",
 85 |     "Check whether there are any data races in your code. (More details on data races can be found in the Links and Resources section below.)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "## Compile and run CUDA C enabled code\n"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "!cd ../source_code/cuda-c && make clean && make"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "## Profile the CUDA C Code"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "!cd ../source_code/cuda-c && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudac_profile ./cfd 64 500"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/cuda-c/minicfdcudac_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "## Validating the Output\n",
139 |     "\n",
140 |     "Make sure the error value printed as output matches that of the serial code"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "# Recommendations for adding CUDA C\n",
148 |     "\n",
149 |     "After finding the hotspot function, take an incremental approach to adding CUDA C constructs. \n",
150 |     "\n",
151 |     "1) Convert files using CUDA kernels to .cu \n",
152 |     "\n",
153 |     "2) Ignore the initialization, finalization and I/O functions\n",
154 |     "\n",
155 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
156 |     "\n",
157 |     "4) Start with a small problem size that reduces the execution time. \n",
158 |     "\n",
159 |     "\n",
160 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned, so make sure to validate your output against the serial version."
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "# Links and Resources\n",
168 |     "\n",
169 |     "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n",
170 |     "\n",
171 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
172 |     "\n",
173 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
174 |     "\n",
175 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
176 |     "\n",
177 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
178 |     "\n",
179 |     "--- \n",
180 |     "\n"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "## Licensing \n",
188 |     "\n",
189 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
190 |    ]
191 |   }
192 |  ],
193 |  "metadata": {
194 |   "anaconda-cloud": {},
195 |   "kernelspec": {
196 |    "display_name": "Python 3",
197 |    "language": "python",
198 |    "name": "python3"
199 |   },
200 |   "language_info": {
201 |    "codemirror_mode": {
202 |     "name": "ipython",
203 |     "version": 3
204 |    },
205 |    "file_extension": ".py",
206 |    "mimetype": "text/x-python",
207 |    "name": "python",
208 |    "nbconvert_exporter": "python",
209 |    "pygments_lexer": "ipython3",
210 |    "version": "3.7.4"
211 |   }
212 |  },
213 |  "nbformat": 4,
214 |  "nbformat_minor": 4
215 | }
216 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/minicfd_cudafortran.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# CUDA Fortran Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Copy and Compile the Serial code\n",
 25 |     "\n",
 26 |     "Before you start modifying the serial code, let's make a copy of it in the `cudafortran` directory."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "!cp ../source_code/serial/* ../source_code/cudafortran"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "!cd ../source_code/cudafortran && make clean && make"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "## Run the Serial code"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/cudafortran && ./cfd 64 500"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "---\n",
 68 |     "\n",
 69 |     "# Start adding CUDA Fortran constructs"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "Now, you can start modifying the Fortran code and the `Makefile`:\n",
 77 |     "\n",
 78 |     "[cfd code](../source_code/cudafortran/cfd.f90) \n",
 79 |     "\n",
 80 |     "[Makefile](../source_code/cudafortran/Makefile)\n",
 81 |     "\n",
 82 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 83 |     "\n",
 84 |     "#### Some Hints\n",
 85 |     "Check whether there are any data races in your code. (More details on data races can be found in the Links and Resources section below.)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "## Compile and run CUDA Fortran enabled code\n"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "!cd ../source_code/cudafortran && make clean && make"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "## Profile the CUDA Fortran Code"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "!cd ../source_code/cudafortran && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudafortran_profile ./cfd 64 500"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/cudafortran/minicfdcudafortran_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "## Validating the Output\n",
139 |     "\n",
140 |     "Make sure the error value printed as output matches that of the serial code"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "# Recommendations for adding CUDA Fortran\n",
148 |     "\n",
149 |     "After finding the hotspot function, take an incremental approach to adding CUDA Fortran constructs. \n",
150 |     "\n",
151 |     "1) Ignore the initialization, finalization and I/O functions\n",
152 |     "\n",
153 |     "2) Cross check the output after incremental changes to check algorithmic scalability\n",
154 |     "\n",
155 |     "3) Start with a small problem size that reduces the execution time. \n",
156 |     "\n",
157 |     "\n",
158 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned, so make sure to validate your output against the serial version."
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "# Links and Resources\n",
166 |     "\n",
167 |     "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n",
168 |     "\n",
169 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
170 |     "\n",
171 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
172 |     "\n",
173 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
174 |     "\n",
175 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
176 |     "\n",
177 |     "--- \n",
178 |     "\n"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "## Licensing \n",
186 |     "\n",
187 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
188 |    ]
189 |   }
190 |  ],
191 |  "metadata": {
192 |   "anaconda-cloud": {},
193 |   "kernelspec": {
194 |    "display_name": "Python 3",
195 |    "language": "python",
196 |    "name": "python3"
197 |   },
198 |   "language_info": {
199 |    "codemirror_mode": {
200 |     "name": "ipython",
201 |     "version": 3
202 |    },
203 |    "file_extension": ".py",
204 |    "mimetype": "text/x-python",
205 |    "name": "python",
206 |    "nbconvert_exporter": "python",
207 |    "pygments_lexer": "ipython3",
208 |    "version": "3.7.4"
209 |   }
210 |  },
211 |  "nbformat": 4,
212 |  "nbformat_minor": 4
213 | }
214 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_cudac-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# CUDA C Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Copy and Compile the Serial code\n",
 25 |     "\n",
 26 |     "Before you start modifying the serial code, let's make a copy of it in the `cuda-c` directory."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "!cp ../source_code/serial/* ../source_code/cuda-c"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "!cd ../source_code/cuda-c && make clean && make"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "## Run the Serial code"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/cuda-c && ./cfd 64 500"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "---\n",
 68 |     "\n",
 69 |     "# Start adding CUDA C constructs"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 77 |     "\n",
 78 |     "[cfd code](../source_code/cuda-c/cfd.cpp) \n",
 79 |     "\n",
 80 |     "[Makefile](../source_code/cuda-c/Makefile)\n",
 81 |     "\n",
 82 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 83 |     "\n",
 84 |     "#### Some Hints\n",
 85 |     "Check whether there are any data races in your code. (More details on data races can be found in the Links and Resources section below.)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "## Compile and run CUDA C enabled code\n"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "!cd ../source_code/cuda-c && make clean && make"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "## Profile the CUDA C Code"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "!cd ../source_code/cuda-c && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdcudac_profile ./cfd 64 500"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/cuda-c/minicfdcudac_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "## Validating the Output\n",
139 |     "\n",
140 |     "Make sure the error value printed as output matches that of the serial code"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "# Recommendations for adding CUDA C\n",
148 |     "\n",
149 |     "After finding the hotspot function, take an incremental approach to add CUDA kernels. \n",
150 |     "\n",
151 |     "1) Convert files using CUDA kernels to .cu \n",
152 |     "\n",
153 |     "2) Ignore the initialization, finalization and I/O functions\n",
154 |     "\n",
155 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
156 |     "\n",
157 |     "4) Start with a small problem size that reduces the execution time. \n",
158 |     "\n",
159 |     "\n",
160 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "# Links and Resources\n",
168 |     "\n",
169 |     "[CUDA Introduction ](https://developer.nvidia.com/blog/even-easier-introduction-cuda/)\n",
170 |     "\n",
171 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
172 |     "\n",
173 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
174 |     "\n",
175 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
176 |     "\n",
177 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
178 |     "\n",
179 |     "--- \n",
180 |     "\n"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "## Licensing \n",
188 |     "\n",
189 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
190 |    ]
191 |   }
192 |  ],
193 |  "metadata": {
194 |   "anaconda-cloud": {},
195 |   "kernelspec": {
196 |    "display_name": "Python 3",
197 |    "language": "python",
198 |    "name": "python3"
199 |   },
200 |   "language_info": {
201 |    "codemirror_mode": {
202 |     "name": "ipython",
203 |     "version": 3
204 |    },
205 |    "file_extension": ".py",
206 |    "mimetype": "text/x-python",
207 |    "name": "python",
208 |    "nbconvert_exporter": "python",
209 |    "pygments_lexer": "ipython3",
210 |    "version": "3.7.4"
211 |   }
212 |  },
213 |  "nbformat": 4,
214 |  "nbformat_minor": 4
215 | }
216 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/minicfd_stdpar.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# std::par Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/stdpar"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/stdpar && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/stdpar && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding std::par constructs"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/stdpar/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/stdpar/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "Check if there is any data race in your code. (More details on data races are available in the Links and Resources section below.)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Compile and run std::par enabled code\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "!cd ../source_code/stdpar && make clean && make"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "## Profile the std::par Code"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "!cd ../source_code/stdpar && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/stdpar/minicfdstdpar_profile.nsys-rep), then choosing <mark>Save Link As</mark>. Once done, open it via the GUI.\n",
149 |     "\n",
150 |     "## Validating the Output\n",
151 |     "\n",
152 |     "Make sure the error value printed as output matches that of the serial code"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "# Recommendations for adding std::par \n",
160 |     "\n",
161 |     "After finding the hotspot function, take an incremental approach to add std::par constructs. \n",
162 |     "\n",
163 |     "1) Ignore the initialization, finalization and I/O functions\n",
164 |     "\n",
165 |     "2) Convert the allocations to dynamic stl array\n",
166 |     "\n",
167 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
168 |     "\n",
169 |     "4) Start with a small problem size that reduces the execution time. \n",
170 |     "\n",
171 |     "\n",
172 |     "\n",
173 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "# Links and Resources\n",
181 |     "\n",
182 |     "[std::par blog](https://developer.nvidia.com/blog/accelerating-standard-c-with-gpus-using-stdpar/)\n",
183 |     "\n",
184 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
185 |     "\n",
186 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
187 |     "\n",
188 |     "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n",
189 |     "\n",
190 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
191 |     "\n",
192 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
193 |     "\n",
194 |     "--- \n",
195 |     "\n"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "## Licensing \n",
203 |     "\n",
204 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
205 |    ]
206 |   }
207 |  ],
208 |  "metadata": {
209 |   "anaconda-cloud": {},
210 |   "kernelspec": {
211 |    "display_name": "Python 3",
212 |    "language": "python",
213 |    "name": "python3"
214 |   },
215 |   "language_info": {
216 |    "codemirror_mode": {
217 |     "name": "ipython",
218 |     "version": 3
219 |    },
220 |    "file_extension": ".py",
221 |    "mimetype": "text/x-python",
222 |    "name": "python",
223 |    "nbconvert_exporter": "python",
224 |    "pygments_lexer": "ipython3",
225 |    "version": "3.7.4"
226 |   }
227 |  },
228 |  "nbformat": 4,
229 |  "nbformat_minor": 4
230 | }
231 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_stdpar-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# std::par Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/stdpar"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/stdpar && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/stdpar && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding std::par constructs"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/stdpar/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/stdpar/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "Check if there is any data race in your code. (More details on data races are available in the Links and Resources section below.)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Compile and run std::par enabled code\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "!cd ../source_code/stdpar && make clean && make"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "## Profile the std::par Code"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "!cd ../source_code/stdpar && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/stdpar/minicfdstdpar_profile.nsys-rep), then choosing <mark>Save Link As</mark>. Once done, open it via the GUI.\n",
149 |     "\n",
150 |     "## Validating the Output\n",
151 |     "\n",
152 |     "Make sure the error value printed as output matches that of the serial code"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "# Recommendations for adding std::par \n",
160 |     "\n",
161 |     "After finding the hotspot function, take an incremental approach to add std::par constructs. \n",
162 |     "\n",
163 |     "1) Ignore the initialization, finalization and I/O functions\n",
164 |     "\n",
165 |     "2) Convert the allocations to dynamic stl array\n",
166 |     "\n",
167 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
168 |     "\n",
169 |     "4) Start with a small problem size that reduces the execution time. \n",
170 |     "\n",
171 |     "\n",
172 |     "\n",
173 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "# Links and Resources\n",
181 |     "\n",
182 |     "[std::par blog](https://developer.nvidia.com/blog/accelerating-standard-c-with-gpus-using-stdpar/)\n",
183 |     "\n",
184 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
185 |     "\n",
186 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
187 |     "\n",
188 |     "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n",
189 |     "\n",
190 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
191 |     "\n",
192 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
193 |     "\n",
194 |     "--- \n",
195 |     "\n"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "## Licensing \n",
203 |     "\n",
204 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
205 |    ]
206 |   }
207 |  ],
208 |  "metadata": {
209 |   "anaconda-cloud": {},
210 |   "kernelspec": {
211 |    "display_name": "Python 3",
212 |    "language": "python",
213 |    "name": "python3"
214 |   },
215 |   "language_info": {
216 |    "codemirror_mode": {
217 |     "name": "ipython",
218 |     "version": 3
219 |    },
220 |    "file_extension": ".py",
221 |    "mimetype": "text/x-python",
222 |    "name": "python",
223 |    "nbconvert_exporter": "python",
224 |    "pygments_lexer": "ipython3",
225 |    "version": "3.7.4"
226 |   }
227 |  },
228 |  "nbformat": 4,
229 |  "nbformat_minor": 4
230 | }
231 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/minicfd_do_concurrent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# DO CONCURRENT Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/doconcurrent"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/doconcurrent && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/doconcurrent && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding DO CONCURRENT constructs"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the Fortran code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/doconcurrent/cfd.f90) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/doconcurrent/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "Check if there is any data race in your code. (More details on data races are available in the Links and Resources section below.)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Compile and run DO CONCURRENT enabled code\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "!cd ../source_code/doconcurrent && make clean && make"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "## Profile the parallel Code"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "!cd ../source_code/doconcurrent && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdstdpar_profile ./cfd 64 500"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/doconcurrent/minicfdstdpar_profile.nsys-rep), then choosing <mark>Save Link As</mark>. Once done, open it via the GUI.\n",
149 |     "\n",
150 |     "\n",
151 |     "## Validating the Output\n",
152 |     "\n",
153 |     "Make sure the error value printed as output matches that of the serial code"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "# Recommendations for adding parallelism \n",
161 |     "\n",
162 |     "After finding the hotspot function, take an incremental approach to add DO CONCURRENT constructs. \n",
163 |     "\n",
164 |     "1) Ignore the initialization, finalization and I/O functions\n",
165 |     "\n",
166 |     "2) Convert the allocations to dynamic stl array\n",
167 |     "\n",
168 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
169 |     "\n",
170 |     "4) Start with a small problem size that reduces the execution time. \n",
171 |     "\n",
172 |     "\n",
173 |     "\n",
174 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one of them tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "# Links and Resources\n",
182 |     "\n",
183 |     "[do concurrent blog](https://developer.nvidia.com/blog/accelerating-fortran-do-concurrent-with-gpus-and-the-nvidia-hpc-sdk/)\n",
184 |     "\n",
185 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
186 |     "\n",
187 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
188 |     "\n",
189 |     "[Execution Policy Details](https://en.cppreference.com/w/cpp/algorithm/execution_policy_tag)\n",
190 |     "\n",
191 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
192 |     "\n",
193 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
194 |     "\n",
195 |     "--- \n",
196 |     "\n"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "## Licensing \n",
204 |     "\n",
205 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
206 |    ]
207 |   }
208 |  ],
209 |  "metadata": {
210 |   "anaconda-cloud": {},
211 |   "kernelspec": {
212 |    "display_name": "Python 3",
213 |    "language": "python",
214 |    "name": "python3"
215 |   },
216 |   "language_info": {
217 |    "codemirror_mode": {
218 |     "name": "ipython",
219 |     "version": 3
220 |    },
221 |    "file_extension": ".py",
222 |    "mimetype": "text/x-python",
223 |    "name": "python",
224 |    "nbconvert_exporter": "python",
225 |    "pygments_lexer": "ipython3",
226 |    "version": "3.7.4"
227 |   }
228 |  },
229 |  "nbformat": 4,
230 |  "nbformat_minor": 4
231 | }
232 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/minicfd_openmp.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenMP Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openmp"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openmp && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openmp && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenMP Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/openmp/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/openmp/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n",
103 |     "\n",
104 |     "2) Check if there is any data race in your code. (More details on data races are available in the Links and Resources section below.)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Compile and run OpenMP enabled code\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "!cd ../source_code/openmp && make clean && make"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Hint: Add `-Minfo=mp` to the `Makefile` to check that kernel code has indeed been generated."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Profile the OpenMP Code"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
158 |     "\n",
159 |     "## Validating the Output\n",
160 |     "\n",
161 |     "Make sure the error value printed as output matches that of the serial code"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "# Recommendations for adding OpenMP Pragmas\n",
169 |     "\n",
170 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
171 |     "\n",
172 |     "1) Ignore the initialization, finalization and I/O functions\n",
173 |     "\n",
174 |     "2) Take an incremental approach by adding pragmas one at a time\n",
175 |     "\n",
176 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
177 |     "\n",
178 |     "4) Move on to using data clauses for better performance \n",
179 |     "\n",
180 |     "5) Start with a small problem size that reduces the execution time. \n",
181 |     "\n",
182 |     "\n",
183 |     "\n",
184 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one thread tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "# Links and Resources\n",
192 |     "\n",
193 |     "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n",
194 |     "\n",
195 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
196 |     "\n",
197 |     "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n",
198 |     "\n",
199 |     "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n",
200 |     "\n",
201 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
202 |     "\n",
203 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
204 |     "\n",
205 |     "--- \n",
206 |     "\n"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Licensing \n",
214 |     "\n",
215 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
216 |    ]
217 |   }
218 |  ],
219 |  "metadata": {
220 |   "anaconda-cloud": {},
221 |   "kernelspec": {
222 |    "display_name": "Python 3",
223 |    "language": "python",
224 |    "name": "python3"
225 |   },
226 |   "language_info": {
227 |    "codemirror_mode": {
228 |     "name": "ipython",
229 |     "version": 3
230 |    },
231 |    "file_extension": ".py",
232 |    "mimetype": "text/x-python",
233 |    "name": "python",
234 |    "nbconvert_exporter": "python",
235 |    "pygments_lexer": "ipython3",
236 |    "version": "3.7.4"
237 |   }
238 |  },
239 |  "nbformat": 4,
240 |  "nbformat_minor": 4
241 | }
242 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/minicfd_openmp.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenMP Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on multicore CPUs as well, try running the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before you start modifying the serial code, let's make a copy of it in the `openmp` directory."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openmp"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openmp && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openmp && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenMP Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the Fortran code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "\n",
 95 |     "[cfd code](../source_code/openmp/cfd.f90) \n",
 96 |     "\n",
 97 |     "[Makefile](../source_code/openmp/Makefile)\n",
 98 |     "\n",
 99 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
100 |     "\n",
101 |     "#### Some Hints\n",
102 |     "\n",
103 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n",
104 |     "\n",
105 |     "2) Check if there is any data race in your code. (More details on data races are given in the Links and Resources section below.)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "markdown",
110 |    "metadata": {},
111 |    "source": [
112 |     "## Compile and run OpenMP enabled code\n"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "!cd ../source_code/openmp && make clean && make"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "Hint: Add `-Minfo=mp` to the `Makefile` to check that kernel code has indeed been generated."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "## Profile the OpenMP Code"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
159 |     "\n",
160 |     "## Validating the Output\n",
161 |     "\n",
162 |     "Make sure the error value printed as output matches that of the serial code"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "# Recommendations for adding OpenMP Pragmas\n",
170 |     "\n",
171 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
172 |     "\n",
173 |     "1) Ignore the initialization, finalization and I/O functions\n",
174 |     "\n",
175 |     "2) Take an incremental approach by adding pragmas one at a time\n",
176 |     "\n",
177 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
178 |     "\n",
179 |     "4) Move on to using data clauses for better performance \n",
180 |     "\n",
181 |     "5) Start with a small problem size that reduces the execution time. \n",
182 |     "\n",
183 |     "\n",
184 |     "\n",
185 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one thread tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "# Links and Resources\n",
193 |     "\n",
194 |     "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n",
195 |     "\n",
196 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
197 |     "\n",
198 |     "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n",
199 |     "\n",
200 |     "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n",
201 |     "\n",
202 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
203 |     "\n",
204 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
205 |     "\n",
206 |     "--- \n",
207 |     "\n"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "## Licensing \n",
215 |     "\n",
216 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
217 |    ]
218 |   }
219 |  ],
220 |  "metadata": {
221 |   "anaconda-cloud": {},
222 |   "kernelspec": {
223 |    "display_name": "Python 3",
224 |    "language": "python",
225 |    "name": "python3"
226 |   },
227 |   "language_info": {
228 |    "codemirror_mode": {
229 |     "name": "ipython",
230 |     "version": 3
231 |    },
232 |    "file_extension": ".py",
233 |    "mimetype": "text/x-python",
234 |    "name": "python",
235 |    "nbconvert_exporter": "python",
236 |    "pygments_lexer": "ipython3",
237 |    "version": "3.7.4"
238 |   }
239 |  },
240 |  "nbformat": 4,
241 |  "nbformat_minor": 4
242 | }
243 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_openmp-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenMP Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on multicore CPUs as well, try running the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before you start modifying the serial code, let's make a copy of it in the `openmp` directory."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openmp"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openmp && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openmp && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenMP Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/openmp/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/openmp/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=mp` flag to `Makefile`.\n",
103 |     "\n",
104 |     "2) Check if there is any data race in your code. (More details on data races are given in the Links and Resources section below.)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Compile and run OpenMP enabled code\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "!cd ../source_code/openmp && make clean && make"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Hint: Add `-Minfo=mp` to the `Makefile` to check that kernel code has indeed been generated."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Profile the OpenMP Code"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "!cd ../source_code/openmp && nsys profile -t nvtx,cuda --stats=true --force-overwrite true -o minicfdopenmp_profile ./cfd 64 500"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openmp/minicfdopenmp_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
158 |     "\n",
159 |     "## Validating the Output\n",
160 |     "\n",
161 |     "Make sure the error value printed as output matches that of the serial code"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "# Recommendations for adding OpenMP Pragmas\n",
169 |     "\n",
170 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
171 |     "\n",
172 |     "1) Ignore the initialization, finalization and I/O functions\n",
173 |     "\n",
174 |     "2) Take an incremental approach by adding pragmas one at a time\n",
175 |     "\n",
176 |     "3) Cross check the output after incremental changes to check algorithmic scalability\n",
177 |     "\n",
178 |     "4) Move on to using data clauses for better performance \n",
179 |     "\n",
180 |     "5) Start with a small problem size that reduces the execution time. \n",
181 |     "\n",
182 |     "\n",
183 |     "\n",
184 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one thread tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "# Links and Resources\n",
192 |     "\n",
193 |     "[OpenMP Specification](https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf)\n",
194 |     "\n",
195 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
196 |     "\n",
197 |     "[HPC SDK Download](https://developer.nvidia.com/hpc-sdk)\n",
198 |     "\n",
199 |     "[OpenMP on GPU](https://on-demand.gputechconf.com/gtc/2016/presentation/s6510-jeff-larkin-targeting-gpus-openmp.pdf)\n",
200 |     "\n",
201 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
202 |     "\n",
203 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
204 |     "\n",
205 |     "--- \n",
206 |     "\n"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Licensing \n",
214 |     "\n",
215 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
216 |    ]
217 |   }
218 |  ],
219 |  "metadata": {
220 |   "anaconda-cloud": {},
221 |   "kernelspec": {
222 |    "display_name": "Python 3",
223 |    "language": "python",
224 |    "name": "python3"
225 |   },
226 |   "language_info": {
227 |    "codemirror_mode": {
228 |     "name": "ipython",
229 |     "version": 3
230 |    },
231 |    "file_extension": ".py",
232 |    "mimetype": "text/x-python",
233 |    "name": "python",
234 |    "nbconvert_exporter": "python",
235 |    "pygments_lexer": "ipython3",
236 |    "version": "3.7.4"
237 |   }
238 |  },
239 |  "nbformat": 4,
240 |  "nbformat_minor": 4
241 | }
242 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/minicfd_openacc.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenACC Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on multicore CPUs as well, try running the cell below to get details of the number of cores and the CPU architecture of the system."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before you start modifying the serial code, let's make a copy of it in the `openacc` directory."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openacc"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openacc && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openacc && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenACC Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/openacc/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/openacc/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n",
103 |     "\n",
104 |     "2) Check if there is any data race in your code. (More details on data races are given in the Links and Resources section below.)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Compile and run OpenACC enabled code\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "!cd ../source_code/openacc && make clean && make"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Hint: Add `-Minfo=accel` to the `Makefile` to check that kernel code has indeed been generated."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Profile the OpenACC Code"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
158 |     "\n",
159 |     "## Validating the Output\n",
160 |     "\n",
161 |     "Make sure the error value printed as output matches that of the serial code"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "# Recommendations for adding OpenACC Pragmas\n",
169 |     "\n",
170 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
171 |     "\n",
172 |     "1) Ignore the initialization, finalization and I/O functions\n",
173 |     "\n",
174 |     "2) Take an incremental approach by adding pragmas one at a time\n",
175 |     "\n",
176 |     "3) Unified Memory provides a good starting point where you need not worry about the data transfers (`-ta=tesla:managed`)\n",
177 |     "\n",
178 |     "4) Cross check the output after incremental changes to check algorithmic scalability\n",
179 |     "\n",
180 |     "5) Move on to using data clauses for better performance \n",
181 |     "\n",
182 |     "6) Start with a small problem size that reduces the execution time. \n",
183 |     "\n",
184 |     "\n",
185 |     "**General tip:** Be aware of *data races*: situations in which at least two threads access a shared variable at the same time and at least one thread tries to modify it. If a data race occurs, an incorrect result may be returned. So, make sure to validate your output against the serial version."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "# Links and Resources\n",
193 |     "\n",
194 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
195 |     "\n",
196 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
197 |     "\n",
198 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
199 |     "\n",
200 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
201 |     "\n",
202 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
203 |     "\n",
204 |     "--- \n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Licensing \n",
213 |     "\n",
214 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
215 |    ]
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "anaconda-cloud": {},
220 |   "kernelspec": {
221 |    "display_name": "Python 3",
222 |    "language": "python",
223 |    "name": "python3"
224 |   },
225 |   "language_info": {
226 |    "codemirror_mode": {
227 |     "name": "ipython",
228 |     "version": 3
229 |    },
230 |    "file_extension": ".py",
231 |    "mimetype": "text/x-python",
232 |    "name": "python",
233 |    "nbconvert_exporter": "python",
234 |    "pygments_lexer": "ipython3",
235 |    "version": "3.7.4"
236 |   }
237 |  },
238 |  "nbformat": 4,
239 |  "nbformat_minor": 4
240 | }
241 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/minicfd_openacc.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenACC Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on Multicore as well, try running the cell below to get details of the number of cores and the CPU architecture on the system"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openacc"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openacc && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openacc && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenACC Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the Fortran code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/openacc/cfd.f90) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/openacc/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n",
103 |     "\n",
104 |     "2) Check if there is any data race in your code. (More details on data races are present in the Links and Resources section below.)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Compile and run OpenACC enabled code\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "!cd ../source_code/openacc && make clean && make"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Hint : Add `-Minfo=accel` to the `Makefile` to check that Kernel code indeed has been generated."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Profile the OpenACC Code"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
158 |     "\n",
159 |     "## Validating the Output\n",
160 |     "\n",
161 |     "Make sure the error value printed as output matches that of the serial code"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "# Recommendations for adding OpenACC Pragmas\n",
169 |     "\n",
170 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
171 |     "\n",
172 |     "1) Ignore the initialization, finalization and I/O functions\n",
173 |     "\n",
174 |     "2) Take an incremental approach by adding pragmas one at a time\n",
175 |     "\n",
176 |     "3) Unified Memory provides a good starting point where you need not worry about the data transfers (`-ta=tesla:managed`)\n",
177 |     "\n",
178 |     "4) Cross check the output after incremental changes to check algorithmic scalability\n",
179 |     "\n",
180 |     "5) Move on to using data clauses for better performance \n",
181 |     "\n",
182 |     "6) Start with a small problem size that reduces the execution time. \n",
183 |     "\n",
184 |     "\n",
185 |     "**General tip:** Be aware of the *Data Race* situation, in which at least two threads access a shared variable at the same time and at least one thread tries to modify the variable. If a data race happens, an incorrect result may be returned. So, make sure to validate your output against the serial version."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "# Links and Resources\n",
193 |     "\n",
194 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
195 |     "\n",
196 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
197 |     "\n",
198 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
199 |     "\n",
200 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
201 |     "\n",
202 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
203 |     "\n",
204 |     "--- \n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Licensing \n",
213 |     "\n",
214 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
215 |    ]
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "anaconda-cloud": {},
220 |   "kernelspec": {
221 |    "display_name": "Python 3",
222 |    "language": "python",
223 |    "name": "python3"
224 |   },
225 |   "language_info": {
226 |    "codemirror_mode": {
227 |     "name": "ipython",
228 |     "version": 3
229 |    },
230 |    "file_extension": ".py",
231 |    "mimetype": "text/x-python",
232 |    "name": "python",
233 |    "nbconvert_exporter": "python",
234 |    "pygments_lexer": "ipython3",
235 |    "version": "3.7.4"
236 |   }
237 |  },
238 |  "nbformat": 4,
239 |  "nbformat_minor": 4
240 | }
241 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd_openacc-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# OpenACC Acceleration \n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on Multicore as well, try running the cell below to get details of the number of cores and the CPU architecture on the system"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Copy and Compile the Serial code\n",
 41 |     "\n",
 42 |     "Before we start modifying the serial code, let's make a copy of the serial code and rename it."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "!cp ../source_code/serial/* ../source_code/openacc"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "!cd ../source_code/openacc && make clean && make"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Run the Serial code"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!cd ../source_code/openacc && ./cfd 64 500"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "---\n",
 84 |     "\n",
 85 |     "# Start adding OpenACC Pragmas"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now, you can start modifying the C++ code and the `Makefile`:\n",
 93 |     "\n",
 94 |     "[cfd code](../source_code/openacc/cfd.cpp) \n",
 95 |     "\n",
 96 |     "[Makefile](../source_code/openacc/Makefile)\n",
 97 |     "\n",
 98 |     "Remember to **SAVE** your code after changes, before running below cells.\n",
 99 |     "\n",
100 |     "#### Some Hints\n",
101 |     "\n",
102 |     "1) Notice implicit and explicit copy of variables --> Add `-Minfo=accel` flag to `Makefile`.\n",
103 |     "\n",
104 |     "2) Check if there is any data race in your code. (More details on data races are present in the Links and Resources section below.)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Compile and run OpenACC enabled code\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "!cd ../source_code/openacc && make clean && make"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Hint : Add `-Minfo=accel` to the `Makefile` to check that Kernel code indeed has been generated."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Profile the OpenACC Code"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "!cd ../source_code/openacc && nsys profile -t nvtx,openacc,cuda --stats=true --force-overwrite true -o minicfdopenacc_profile ./cfd 64 500"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "You can examine the output on the terminal or you can download the file and view the timeline by opening the output with the NVIDIA Nsight Systems."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/openacc/minicfdopenacc_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI.\n",
158 |     "\n",
159 |     "## Validating the Output\n",
160 |     "\n",
161 |     "Make sure the error value printed as output matches that of the serial code"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "# Recommendations for adding OpenACC Pragmas\n",
169 |     "\n",
170 |     "After finding the hotspot function, take an incremental approach to adding pragmas. \n",
171 |     "\n",
172 |     "1) Ignore the initialization, finalization and I/O functions\n",
173 |     "\n",
174 |     "2) Take an incremental approach by adding pragmas one at a time\n",
175 |     "\n",
176 |     "3) Unified Memory provides a good starting point where you need not worry about the data transfers (`-ta=tesla:managed`)\n",
177 |     "\n",
178 |     "4) Cross check the output after incremental changes to check algorithmic scalability\n",
179 |     "\n",
180 |     "5) Move on to using data clauses for better performance \n",
181 |     "\n",
182 |     "6) Start with a small problem size that reduces the execution time. \n",
183 |     "\n",
184 |     "\n",
185 |     "**General tip:** Be aware of the *Data Race* situation, in which at least two threads access a shared variable at the same time and at least one thread tries to modify the variable. If a data race happens, an incorrect result may be returned. So, make sure to validate your output against the serial version."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "# Links and Resources\n",
193 |     "\n",
194 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
195 |     "\n",
196 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
197 |     "\n",
198 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
199 |     "\n",
200 |     "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
201 |     "\n",
202 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
203 |     "\n",
204 |     "--- \n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Licensing \n",
213 |     "\n",
214 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
215 |    ]
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "anaconda-cloud": {},
220 |   "kernelspec": {
221 |    "display_name": "Python 3",
222 |    "language": "python",
223 |    "name": "python3"
224 |   },
225 |   "language_info": {
226 |    "codemirror_mode": {
227 |     "name": "ipython",
228 |     "version": 3
229 |    },
230 |    "file_extension": ".py",
231 |    "mimetype": "text/x-python",
232 |    "name": "python",
233 |    "nbconvert_exporter": "python",
234 |    "pygments_lexer": "ipython3",
235 |    "version": "3.7.4"
236 |   }
237 |  },
238 |  "nbformat": 4,
239 |  "nbformat_minor": 4
240 | }
241 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/jupyter_notebook/minicfd.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Getting Started\n",
  9 |     "\n",
 10 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "!nvidia-smi"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "attachments": {},
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "Since the code will be run on Multicore as well, try running the cell below to get details of the number of cores and the CPU architecture on the system"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "!cat /proc/cpuinfo"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "attachments": {},
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "# A MINI-CFD APPLICATION\n",
 45 |     "\n",
 46 |     "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n",
 47 |     "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) or non-viscous (no Reynolds\n",
 48 |     "number specified and no vortices in the flow).\n",
 49 |     "\n",
 50 |     "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n",
 51 |     "\n",
 52 |     "\n",
 53 |     "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "attachments": {},
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "<img src=\"images/cfd_flow.png\" width=\"50%\" height=\"50%\">"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "attachments": {},
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "### The objective of this exercise is not to delve into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance."
 70 |    ]
 71 |   },
 72 |   {
 73 |    "attachments": {},
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "The general flow of the code is as shown in form of pseudo code"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "attachments": {},
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "```cpp\n",
 86 |     "set the boundary values for Ψ \n",
 87 |     "while (convergence == FALSE)  do \n",
 88 |     "    for each interior grid point do \n",
 89 |     "        update Ψ by averaging with its 4 nearest neighbours \n",
 90 |     "    end do \n",
 91 |     "    \n",
 92 |     "    check for convergence \n",
 93 |     "end do \n",
 94 |     "\n",
 95 |     "for each interior grid point do \n",
 96 |     "    calculate 𝑢𝑥 calculate 𝑢𝑦 \n",
 97 |     "end do\n",
 98 |     "\n",
 99 |     "```"
100 |    ]
101 |   },
102 |   {
103 |    "attachments": {},
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "## Steps to follow\n",
108 |     "We will follow the Optimization cycle for porting and improving the code performance.\n",
109 |     "\n",
110 |     "<img src=\"images/Optimization_Cycle.jpg\" width=\"80%\" height=\"80%\">\n"
111 |    ]
112 |   },
113 |   {
114 |    "attachments": {},
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "### Understand and Analyze the code\n",
119 |     "Analyze the code :\n",
120 |     "\n",
121 |     "[cfd.py](../source_code/serial/cfd.py)"
122 |    ]
123 |   },
124 |   {
125 |    "attachments": {},
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "## Run the CPU code"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "%run ../source_code/serial/cfd.py 64 500"
139 |    ]
140 |   },
141 |   {
142 |    "attachments": {},
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "## Profiling\n",
147 |     "\n",
148 |     "For this section, we will be using Nsight systems profiler and as the code is a CPU code, we will be tracing NVTX APIs (already integrated to the application). NVTX is useful for tracing of CPU events and time ranges. For more info on Nsight profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n",
149 |     "\n",
150 |     "### Viewing the profiler output\n",
151 |     "There are two ways to look at profiled code: \n",
152 |     "\n",
153 |     "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n",
154 |     "\n",
155 |     "2) NVIDIA Nsight Systems: Open the Nsight Systems profiler and click on File > Open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, this requires that the local system has the same version of the CUDA toolkit installed. More details on where to download the CUDA toolkit can be found in the links in the resources section below."
156 |    ]
157 |   },
158 |   {
159 |    "attachments": {},
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "## Profile the CPU code to find hotspots"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile python3 cfd.py 64 500"
173 |    ]
174 |   },
175 |   {
176 |    "attachments": {},
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
181 |    ]
182 |   },
183 |   {
184 |    "attachments": {},
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "---\n",
189 |     "\n",
190 |     "# Start Accelerating code\n",
191 |     "\n",
192 |     "\n",
193 |     "[CuPy](minicfd_cupy.ipynb)\n",
194 |     "\n",
195 |     "[Numba](minicfd_numba.ipynb)\n",
196 |     "\n",
197 |     "\n",
198 |     "\n"
199 |    ]
200 |   },
201 |   {
202 |    "attachments": {},
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "## Final Results\n",
207 |     "\n",
208 |     "Modify and add timings for the accelerated code using different methods\n",
209 |     "\n",
210 |     "| | CuPy | Numba |\n",
211 |     "| --- | --- | --- |\n",
212 |     "| Multicore |   |  |\n",
213 |     "| GPU  |  |  | \n",
214 |     "\n"
215 |    ]
216 |   },
217 |   {
218 |    "attachments": {},
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "---\n",
223 |     "## Licensing \n",
224 |     "\n",
225 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
226 |    ]
227 |   }
228 |  ],
229 |  "metadata": {
230 |   "anaconda-cloud": {},
231 |   "kernelspec": {
232 |    "display_name": "Python 3",
233 |    "language": "python",
234 |    "name": "python3"
235 |   },
236 |   "language_info": {
237 |    "codemirror_mode": {
238 |     "name": "ipython",
239 |     "version": 3
240 |    },
241 |    "file_extension": ".py",
242 |    "mimetype": "text/x-python",
243 |    "name": "python",
244 |    "nbconvert_exporter": "python",
245 |    "pygments_lexer": "ipython3",
246 |    "version": "3.7.4"
247 |   }
248 |  },
249 |  "nbformat": 4,
250 |  "nbformat_minor": 4
251 | }
252 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/minicfd.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Getting Started\n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will be run on Multicore as well, try running the cell below to get details of the number of cores and the CPU architecture on the system"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "# A MINI-CFD APPLICATION\n",
 41 |     "\n",
 42 |     "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n",
 43 |     "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) or non-viscous (no Reynolds\n",
 44 |     "number specified and no vortices in the flow).\n",
 45 |     "\n",
 46 |     "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n",
 47 |     "\n",
 48 |     "\n",
 49 |     "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "<img src=\"images/cfd_flow.png\" width=\"50%\" height=\"50%\">"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### The objective of this exercise is not to delve into the Maths part of it but to make use of different approaches to GPU programming to parallelize and improve the performance."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "The general flow of the code is as shown in form of pseudo code"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "```cpp\n",
 78 |     "set the boundary values for Ψ \n",
 79 |     "while (convergence == FALSE)  do \n",
 80 |     "    for each interior grid point do \n",
 81 |     "        update Ψ by averaging with its 4 nearest neighbours \n",
 82 |     "    end do \n",
 83 |     "    \n",
 84 |     "    check for convergence \n",
 85 |     "end do \n",
 86 |     "\n",
 87 |     "for each interior grid point do \n",
 88 |     "    calculate 𝑢𝑥 calculate 𝑢𝑦 \n",
 89 |     "end do\n",
 90 |     "\n",
 91 |     "```"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "## Steps to follow\n",
 99 |     "We will follow the Optimization cycle for porting and improving the code performance.\n",
100 |     "\n",
101 |     "<img src=\"images/Optimization_Cycle.jpg\" width=\"80%\" height=\"80%\">\n"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "### Understand and Analyze the code\n",
109 |     "Analyze the code and the Makefile for how to compile the code:\n",
110 |     "\n",
111 |     "[cfd code](../source_code/serial/cfd.cpp) \n",
112 |     "\n",
113 |     "[Makefile](../source_code/serial/Makefile)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "## Compile the code"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "!cd ../source_code/serial && make clean && make"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "## Run the CPU code"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "!cd ../source_code/serial && ./cfd 64 500"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "## Profiling\n",
153 |     "\n",
154 |     "For this section, we will be using the Nsight Systems profiler and, as the code is a CPU code, we will be tracing NVTX APIs (already integrated into the application). NVTX is useful for tracing CPU events and time ranges. For more information on the Nsight Systems profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n",
155 |     "\n",
156 |     "### Viewing the profiler output\n",
157 |     "There are two ways to look at profiled code: \n",
158 |     "\n",
159 |     "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n",
160 |     "\n",
161 |     "2) NVIDIA Nsight Systems: Open the Nsight Systems profiler, click on File > Open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, the local system must have the same version of the CUDA toolkit installed. More details on where to download the CUDA toolkit can be found in the links in the resources section below."
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "## Profile the CPU code to find hotspots"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "---\n",
192 |     "\n",
193 |     "# Start Accelerating code\n",
194 |     "\n",
195 |     "[stdpar](minicfd_stdpar.ipynb)\n",
196 |     "\n",
197 |     "[OpenACC](minicfd_openacc.ipynb)\n",
198 |     "\n",
199 |     "[OpenMP](minicfd_openmp.ipynb)\n",
200 |     "\n",
201 |     "[CUDA C](minicfd_cudac.ipynb)\n",
202 |     "\n",
203 |     "\n",
204 |     "\n"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "## Final Results\n",
212 |     "\n",
213 |     "Modify and add timings for the accelerated code using different methods\n",
214 |     "\n",
215 |     "| | OpenACC | OpenMP | stdpar | CUDA Languages ( C ) |\n",
216 |     "| --- | --- | --- | --- | --- |\n",
217 |     "| Multicore |   |  |   |  |\n",
218 |     "| GPU  |  |  |  |  |\n",
219 |     "\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "## Licensing \n",
227 |     "\n",
228 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
229 |    ]
230 |   }
231 |  ],
232 |  "metadata": {
233 |   "anaconda-cloud": {},
234 |   "kernelspec": {
235 |    "display_name": "Python 3",
236 |    "language": "python",
237 |    "name": "python3"
238 |   },
239 |   "language_info": {
240 |    "codemirror_mode": {
241 |     "name": "ipython",
242 |     "version": 3
243 |    },
244 |    "file_extension": ".py",
245 |    "mimetype": "text/x-python",
246 |    "name": "python",
247 |    "nbconvert_exporter": "python",
248 |    "pygments_lexer": "ipython3",
249 |    "version": "3.7.4"
250 |   }
251 |  },
252 |  "nbformat": 4,
253 |  "nbformat_minor": 4
254 | }
255 | 


--------------------------------------------------------------------------------
/nways_cfd/English/C/jupyter_notebook/.ipynb_checkpoints/minicfd-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Getting Started\n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "# A MINI-CFD APPLICATION\n",
 41 |     "\n",
 42 |     "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n",
 43 |     "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) or non-viscous (no Reynolds\n",
 44 |     "number specified and no vortices in the flow).\n",
 45 |     "\n",
 46 |     "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n",
 47 |     "\n",
 48 |     "\n",
 49 |     "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "<img src=\"images/cfd_flow.png\" width=\"50%\" height=\"50%\">"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### The objective of this exercise is not to delve into the mathematics but to make use of different approaches to GPU programming to parallelize the code and improve its performance."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "The general flow of the code is as shown in form of pseudo code"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "```cpp\n",
 78 |     "set the boundary values for Ψ \n",
 79 |     "while (convergence == FALSE)  do \n",
 80 |     "    for each interior grid point do \n",
 81 |     "        update Ψ by averaging with its 4 nearest neighbours \n",
 82 |     "    end do \n",
 83 |     "    \n",
 84 |     "    check for convergence \n",
 85 |     "end do \n",
 86 |     "\n",
 87 |     "for each interior grid point do \n",
 88 |     "    calculate 𝑢𝑥 calculate 𝑢𝑦 \n",
 89 |     "end do\n",
 90 |     "\n",
 91 |     "```"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "## Steps to follow\n",
 99 |     "We will follow the Optimization cycle for porting and improving the code performance.\n",
100 |     "\n",
101 |     "<img src=\"images/Optimization_Cycle.jpg\" width=\"80%\" height=\"80%\">\n"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "### Understand and Analyze the code\n",
109 |     "Analyze the code and the Makefile for how to compile the code:\n",
110 |     "\n",
111 |     "[cfd code](../source_code/serial/cfd.cpp) \n",
112 |     "\n",
113 |     "[Makefile](../source_code/serial/Makefile)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "## Compile the code"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "!cd ../source_code/serial && make clean && make"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "## Run the CPU code"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "!cd ../source_code/serial && ./cfd 64 500"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "## Profiling\n",
153 |     "\n",
154 |     "For this section, we will be using the Nsight Systems profiler and, as the code is a CPU code, we will be tracing NVTX APIs (already integrated into the application). NVTX is useful for tracing CPU events and time ranges. For more information on the Nsight Systems profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n",
155 |     "\n",
156 |     "### Viewing the profiler output\n",
157 |     "There are two ways to look at profiled code: \n",
158 |     "\n",
159 |     "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n",
160 |     "\n",
161 |     "2) NVIDIA Nsight Systems: Open the Nsight Systems profiler, click on File > Open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, the local system must have the same version of the CUDA toolkit installed. More details on where to download the CUDA toolkit can be found in the links in the resources section below."
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "## Profile the CPU code to find hotspots"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "---\n",
192 |     "\n",
193 |     "# Start Accelerating code\n",
194 |     "\n",
195 |     "[stdpar](minicfd_stdpar.ipynb)\n",
196 |     "\n",
197 |     "[OpenACC](minicfd_openacc.ipynb)\n",
198 |     "\n",
199 |     "[OpenMP](minicfd_openmp.ipynb)\n",
200 |     "\n",
201 |     "[CUDA C](minicfd_cudac.ipynb)\n",
202 |     "\n",
203 |     "\n",
204 |     "\n"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "## Final Results\n",
212 |     "\n",
213 |     "Modify and add timings for the accelerated code using different methods\n",
214 |     "\n",
215 |     "| | OpenACC | OpenMP | stdpar | CUDA Languages ( C ) |\n",
216 |     "| --- | --- | --- | --- | --- |\n",
217 |     "| Multicore |   |  |   |  |\n",
218 |     "| GPU  |  |  |  |  |\n",
219 |     "\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "## Licensing \n",
227 |     "\n",
228 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
229 |    ]
230 |   }
231 |  ],
232 |  "metadata": {
233 |   "anaconda-cloud": {},
234 |   "kernelspec": {
235 |    "display_name": "Python 3",
236 |    "language": "python",
237 |    "name": "python3"
238 |   },
239 |   "language_info": {
240 |    "codemirror_mode": {
241 |     "name": "ipython",
242 |     "version": 3
243 |    },
244 |    "file_extension": ".py",
245 |    "mimetype": "text/x-python",
246 |    "name": "python",
247 |    "nbconvert_exporter": "python",
248 |    "pygments_lexer": "ipython3",
249 |    "version": "3.7.4"
250 |   }
251 |  },
252 |  "nbformat": 4,
253 |  "nbformat_minor": 4
254 | }
255 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Fortran/jupyter_notebook/minicfd.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Getting Started\n",
  8 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "!nvidia-smi"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Since the code will also be run on multicore CPUs, run the cell below to get details of the number of cores and the CPU architecture of the system"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "!cat /proc/cpuinfo"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "# A MINI-CFD APPLICATION\n",
 41 |     "\n",
 42 |     "In this lab we will accelerate a Simple 2D regular-grid CFD simulation for teaching GPU programming using multiple approaches.\n",
 43 |     "This is a simple simulation of an incompressible fluid flowing in a cavity using the 2D Navier-Stokes equation. The fluid flow can either be viscous (finite Reynolds number and vortices in the flow) or non-viscous (no Reynolds\n",
 44 |     "number specified and no vortices in the flow).\n",
 45 |     "\n",
 46 |     "It is deliberately written to be very simple and easy to understand so it can be used as a teaching example.\n",
 47 |     "\n",
 48 |     "\n",
 49 |     "In this exercise the finite difference approach is used to determine the flow pattern of a fluid in a cavity. For simplicity, the liquid is assumed to have zero viscosity which implies that there can be no vortices (i.e. no whirlpools) in the flow. The cavity is a square box with an inlet on one side and an outlet on another as shown below:"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "<img src=\"images/cfd_flow.png\" width=\"50%\" height=\"50%\">"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### The objective of this exercise is not to delve into the mathematics but to make use of different approaches to GPU programming to parallelize the code and improve its performance."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "The general flow of the code is as shown in form of pseudo code"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "```cpp\n",
 78 |     "set the boundary values for Ψ \n",
 79 |     "while (convergence == FALSE)  do \n",
 80 |     "    for each interior grid point do \n",
 81 |     "        update Ψ by averaging with its 4 nearest neighbours \n",
 82 |     "    end do \n",
 83 |     "    \n",
 84 |     "    check for convergence \n",
 85 |     "end do \n",
 86 |     "\n",
 87 |     "for each interior grid point do \n",
 88 |     "    calculate 𝑢𝑥 calculate 𝑢𝑦 \n",
 89 |     "end do\n",
 90 |     "\n",
 91 |     "```"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "## Steps to follow\n",
 99 |     "We will follow the Optimization cycle for porting and improving the code performance.\n",
100 |     "\n",
101 |     "<img src=\"images/Optimization_Cycle.jpg\" width=\"80%\" height=\"80%\">\n"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "### Understand and Analyze the code\n",
109 |     "Analyze the code and the Makefile for how to compile the code:\n",
110 |     "\n",
111 |     "\n",
112 |     "[cfd code](../source_code/serial/cfd.f90) \n",
113 |     "\n",
114 |     "[Makefile](../source_code/serial/Makefile)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "## Compile the code"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "!cd ../source_code/serial && make clean && make"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "## Run the CPU code"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "!cd ../source_code/serial && ./cfd 64 500"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "## Profiling\n",
154 |     "\n",
155 |     "For this section, we will be using the Nsight Systems profiler and, as the code is a CPU code, we will be tracing NVTX APIs (already integrated into the application). NVTX is useful for tracing CPU events and time ranges. For more information on the Nsight Systems profiler, please see the __[profiler documentation](https://docs.nvidia.com/nsight-systems/)__.\n",
156 |     "\n",
157 |     "### Viewing the profiler output\n",
158 |     "There are two ways to look at profiled code: \n",
159 |     "\n",
160 |     "1) Command line based: Use `nsys` to collect and view profiling data from the command-line. Profiling results are displayed in the console after the profiling data is collected.\n",
161 |     "\n",
162 |     "2) NVIDIA Nsight Systems: Open the Nsight Systems profiler, click on File > Open, and choose the profiler output called `minicfd_profile.nsys-rep`. If you would like to view this on your local machine, the local system must have the same version of the CUDA toolkit installed. More details on where to download the CUDA toolkit can be found in the links in the resources section below."
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "## Profile the CPU code to find hotspots"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "!cd ../source_code/serial && nsys profile -t nvtx --stats=true --force-overwrite true -o minicfd_profile ./cfd 64 500"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [here](../source_code/serial/minicfd_profile.nsys-rep) then choosing <mark>save Link As</mark>. Once done, open it via the GUI."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "---\n",
193 |     "\n",
194 |     "# Start Accelerating code\n",
195 |     "\n",
196 |     "[doconcurrent](minicfd_do_concurrent.ipynb)\n",
197 |     "\n",
198 |     "[OpenACC](minicfd_openacc.ipynb)\n",
199 |     "\n",
200 |     "[OpenMP](minicfd_openmp.ipynb)\n",
201 |     "\n",
202 |     "[CUDA Fortran](minicfd_cudafortran.ipynb)\n",
203 |     "\n",
204 |     "\n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Final Results\n",
213 |     "\n",
214 |     "Modify and add timings for the accelerated code using different methods\n",
215 |     "\n",
216 |     "| | OpenACC | OpenMP | DO CONCURRENT | CUDA Languages ( Fortran ) |\n",
217 |     "| --- | --- | --- | --- | --- |\n",
218 |     "| Multicore |   |  |   |  |\n",
219 |     "| GPU  |  |  |  |  |\n",
220 |     "\n"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "## Licensing \n",
228 |     "\n",
229 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
230 |    ]
231 |   }
232 |  ],
233 |  "metadata": {
234 |   "anaconda-cloud": {},
235 |   "kernelspec": {
236 |    "display_name": "Python 3",
237 |    "language": "python",
238 |    "name": "python3"
239 |   },
240 |   "language_info": {
241 |    "codemirror_mode": {
242 |     "name": "ipython",
243 |     "version": 3
244 |    },
245 |    "file_extension": ".py",
246 |    "mimetype": "text/x-python",
247 |    "name": "python",
248 |    "nbconvert_exporter": "python",
249 |    "pygments_lexer": "ipython3",
250 |    "version": "3.7.4"
251 |   }
252 |  },
253 |  "nbformat": 4,
254 |  "nbformat_minor": 4
255 | }
256 | 


--------------------------------------------------------------------------------
/nways_cfd/English/Python/source_code/serial/cfd.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
  2 | #!/usr/bin/env python
  3 | #
  4 | # CFD Calculation
  5 | # ===============
  6 | #
  7 | # Simulation of inviscid flow in a 2D box using the Jacobi algorithm.
  8 | #
  9 | # Python version - uses numpy and loops
 10 | #
 11 | # EPCC, 2014
 12 | #
 13 | import sys
 14 | import time
 15 | 
 16 | # Import numpy
 17 | import numpy as np
 18 | import math
 19 | import sys
 20 | import cupy.cuda.nvtx as nvtx
 21 | from numba import njit, jit
 22 | 
 23 | def main(argv):
 24 |     printfreq = 1000 #output frequency
 25 |     error = bnorm = 0.0
 26 |     tolerance = 0.0 #tolerance for convergence. <=0 means do not check
 27 | 
 28 |     error = 0.0
 29 |     # Set the minimum size parameters
 30 |     mbase = 32
 31 |     nbase = 32
 32 |     bbase = 10
 33 |     hbase = 15
 34 |     wbase = 5
 35 | 
 36 |     irrotational = 1
 37 |     checkerr = 0
 38 |     iter = 0
 39 | 
 40 | 
 41 |     # Test we have the correct number of arguments
 42 |     if len(argv) < 2:
 43 |         sys.stdout.write("Usage: cfd.py <scalefactor> <iterations>")
 44 |         sys.exit(1)
 45 | 
 46 |     # Get the systen parameters from the arguments
 47 |     scalefactor = int(argv[0])
 48 |     niter = int(argv[1])
 49 | 
 50 |     sys.stdout.write("\n2D CFD Simulation\n")
 51 |     sys.stdout.write("=================\n")
 52 |     sys.stdout.write("Scale factor = {0}\n".format(scalefactor))
 53 |     sys.stdout.write("Iterations   = {0}\n".format(niter))
 54 | 
 55 |     # do we stop because of tolerance?
 56 |     if (tolerance > 0):
 57 |         checkerr = 1
 58 | 
 59 |     # check command line parameters and parse them
 60 |     if (len(argv) < 2 or len(argv) > 3):
 61 |         print("Usage: cfd <scale> <numiter> [reynolds]\n")
 62 |         return 0
 63 | 
 64 |     scalefactor = int(argv[0])
 65 |     numiter = int(argv[1])
 66 | 
 67 |     if len(argv) == 3:
 68 |         re = float(argv[2])
 69 |         irrotational = 0
 70 |     else:
 71 |         re = -1.0
 72 | 
 73 |     if not checkerr:
 74 |         print("Scale Factor = {}, iterations = {}\n".format(scalefactor, numiter))
 75 |     else:
 76 |         print("Scale Factor = {}, iterations = {}, tolerance= {}\n".format(scalefactor, numiter, tolerance))
 77 | 
 78 |     if (irrotational):
 79 |         print("Irrotational flow\n")
 80 |     else:
 81 |         print("Reynolds number = {}\n".format(re))
 82 | 
 83 |     # Set the parameters for boundary conditions
 84 |     #Calculate b, h & w and m & n
 85 |     b = bbase * scalefactor
 86 |     h = hbase * scalefactor
 87 |     w = wbase * scalefactor
 88 |     m = mbase * scalefactor
 89 |     n = nbase * scalefactor
 90 | 
 91 |     re = re / float(scalefactor)
 92 | 
 93 |     # Write the simulation details
 94 |     sys.stdout.write("\nRunning CFD on {0} x {1} grid in serial\n".format(m, n))
 95 | 
 96 |     # allocate arrays
 97 |     nvtx.RangePush("Initialization")
 98 |     psi = np.zeros(((m + 2) * (n + 2)), dtype=np.float64)
 99 |     nvtx.RangePop()
100 |     psitmp = np.zeros(psi.size, dtype=np.float64)
101 | 
102 |     if (not irrotational):
103 |         # allocate arrays
104 |         nvtx.RangePush("Initialization")
105 |         zet = np.zeros(((m + 2) * (n + 2)), dtype=np.float64)
106 |         nvtx.RangePop()
107 |         zettmp = np.zeros(((m + 2) * (n + 2)), dtype=np.float64)
108 | 
109 |     nvtx.RangePush("Boundary_PSI")
110 |     #set the psi boundary conditions
111 |     psi = boundarypsi(psi, m, n, b, h, w)
112 |     nvtx.RangePop() 
113 | 
114 |     #compute normalisation factor for error
115 |     bnorm = 0.0
116 |     nvtx.RangePush("Compute_Normalization")
117 |     for i in range(m + 2):
118 |         for j in range(n + 2):
119 |             bnorm += psi[i * (m + 2) + j] * psi[i * (m + 2) + j]
120 |     nvtx.RangePop()
121 |     # boundary set for zet
122 |     if not irrotational:
123 |         zet = boundaryzet(zet, psi, m, n)
124 |         nvtx.RangePush("Compute_Normalization")
125 |         for i in range(m + 2):
126 |             for j in range(n + 2):
127 |                 bnorm += zet[i * (m + 2) + j] * zet[i * (m + 2) + j]
128 |         nvtx.RangePop()
129 |     bnorm = math.sqrt(bnorm)
130 | 
131 |     #begin iterative Jacobi loop
132 |     print("\nStarting main loop...\n\n")
133 |     tstart = time.time()
134 | 
135 |     nvtx.RangePush("Overall_Iteration")
136 |     for iter in range(1, numiter+1):
137 |         nvtx.RangePush("JacobiStep")
138 |         if (irrotational): #calculate psi for next iteration
139 |             psitmp = jacobistep(psitmp, psi, m, n)
140 |         else:
141 |             psitmp,zettmp = jacobistepvort(zettmp, psitmp, zet, psi, m, n, re)
142 |         nvtx.RangePop() 
143 |         nvtx.RangePush("Calculate_Error")
144 |         #calculate current error if required
145 |         if checkerr or iter == numiter:
146 |             error = deltasq(psitmp, psi, m, n)
147 |             if not irrotational:
148 |                 error += deltasq(zettmp, zet, m, n)
149 | 
150 |             error = math.sqrt(error)
151 |             error = error / bnorm
152 |         nvtx.RangePop()
153 |         #quit early if we have reached required tolerance
154 |         if checkerr:
155 |             if error < tolerance:
156 |                 print("Converged on iteration {0}\n".format(iter))
157 |                 break
158 |         #copy back
159 |         nvtx.RangePush("Switch_Array")
160 |         for i in range(1, m + 1):
161 |             for j in range(1, n + 1):
162 |                 psi[i * (m + 2) + j] = psitmp[i * (m + 2) + j]
163 |         
164 |         if not irrotational:
165 |             for i in range(1, m + 1):
166 |                 for j in range(1, n + 1):
167 |                     zet[i * (m + 2) + j] = zettmp[i * (m + 2) + j]
168 |         nvtx.RangePop()
169 |         if not irrotational:
170 |             # update zeta BCs that depend on psi
171 |             boundaryzet(zet, psi, m, n)
172 | 
173 |         # print loop information
174 |         if iter % printfreq == 0:
175 |             if not checkerr:
176 |                 print("Completed iteration {0}\n".format(iter))
177 |             else:
178 |                 print("Completed iteration {0}, error = {1}\n".format(iter, error))
179 |     nvtx.RangePop()
180 |     if iter > numiter:
181 |         iter=numiter
182 |     tstop = time.time()
183 |     ttot = tstop - tstart
184 |     titer = ttot / float(iter)
185 | 
186 |     #print out some stats
187 |     print("\n... finished\n")
188 |     print("\nCalculation took {0:.5f}s\n\n".format(ttot))
189 |     print("After {0} iterations, the error is {1}\n".format(niter, error))
190 |     print("Time for {0} iterations was {1} seconds\n".format(niter, ttot))
191 |     print("Each iteration took {0} seconds\n".format(titer))
192 | 
193 |     # Write the output files for subsequent visualisation
194 |     nvtx.RangePush("output visualization")
195 |     write_data(m, n, scalefactor, psi, "velocity.dat", "colourmap.dat")
196 |     nvtx.RangePop()
197 | 
198 |     # Finish nicely
199 |     sys.exit(0)
200 | 
201 | 
def write_data(m, n, scale, psi, velfile, colfile):
    """Write the velocity-vector and colour-map files for visualisation.

    Parameters
    ----------
    m, n : int
        Number of interior grid points looped over in each direction.
    scale : int
        Scale factor; one velocity vector is written every ``scale`` points.
    psi : indexable sequence of float
        Flattened stream function, indexed as ``psi[i * (m + 2) + j]``.
    velfile, colfile : str
        Paths of the velocity and colour-map output files (overwritten).

    Notes
    -----
    * Integer (floor) division is used for the header and for the sampling
      offset. With true division an even ``scale`` gives an offset of
      ``x.5``, which an integer remainder never equals, so no velocity
      vectors would be written.
    * The velocity file uses the same 0-based ``(i, j)`` coordinates as the
      colour-map file, so both files label a grid point identically.
    * Both files are closed even if formatting or indexing raises.
    """
    stride = m + 2
    sample = (scale - 1) // 2  # offset of the sampled point within a block

    with open(velfile, "w") as velout, open(colfile, "w") as colout:
        velout.write("{0} {1}\n".format(m // scale, n // scale))
        colout.write("{0} {1}\n".format(m, n))

        # Loop over stream function array (excluding boundaries)
        for i in range(0, m):
            for j in range(0, n):

                # Compute velocities and magnitude (central differences
                # around interior point (i + 1, j + 1) of the padded grid)
                ux = (psi[(i + 1) * stride + j + 2] - psi[(i + 1) * stride + j]) / 2.0
                uy = -(psi[(i + 2) * stride + j + 1] - psi[i * stride + j + 1]) / 2.0
                umod = (ux ** 2 + uy ** 2) ** 0.5

                # We are actually going to output a colour, in which
                # case it is useful to shift values towards a lighter
                # blue (for clarity) via the following kludge...
                hue = umod ** 0.6
                colout.write("{0:5d} {1:5d} {2:10.5f}\n".format(i, j, hue))

                # Only write velocity vectors every "scale" points
                if i % scale == sample and j % scale == sample:
                    velout.write("{0:5d} {1:5d} {2:10.5f} {3:10.5f}\n".format(i, j, ux, uy))
233 | 
@jit()
def jacobistep(psinew, psi, m, n):
    """Fill the interior of ``psinew`` with one four-neighbour average of ``psi``.

    Both arrays are flat and addressed as ``arr[i * (m + 2) + j]``.
    Entries with 1 <= i <= m and 1 <= j <= n are overwritten with a
    quarter of the sum of the four neighbouring ``psi`` values.
    ``psinew`` is modified in place and also returned.
    """
    row_len = m + 2
    for row in range(1, m + 1):
        cur_row = row * row_len
        prev_row = cur_row - row_len
        next_row = cur_row + row_len
        for col in range(1, n + 1):
            centre = cur_row + col
            psinew[centre] = 0.25 * (psi[prev_row + col] + psi[next_row + col] + psi[centre - 1] + psi[centre + 1])
    return psinew
240 | 
@jit()
def jacobistepvort(zetnew, psinew, zet, psi, m, n, re):
    """Perform one update of both ``psinew`` and ``zetnew``.

    All arrays are flat and addressed as ``arr[i * (m + 2) + j]``; only
    entries with 1 <= i <= m and 1 <= j <= n are written.

    Args:
        zetnew: Output array for the updated ``zet`` values (written in place).
        psinew: Output array for the updated ``psi`` values (written in place).
        zet: Current ``zet`` values (read only).
        psi: Current ``psi`` values (read only).
        m, n: Interior extents in the i and j directions.
        re: Scalar weighting the cross-difference term
            (presumably the Reynolds number -- confirm against caller).

    Returns:
        The tuple ``(psinew, zetnew)``.
    """
    for i in range(1, m+1):
        for j in range(1, n+1):
            psinew[i * (m + 2) + j] = 0.25 * (
                psi[(i - 1) * (m + 2) + j]
                + psi[(i + 1) * (m + 2) + j]
                + psi[i * (m + 2) + j - 1]
                + psi[i * (m + 2) + j + 1]
                - zet[i * (m + 2) + j]
            )

    for i in range(1, m+1):
        for j in range(1, n+1):
            # The whole right-hand side is parenthesised on purpose: without
            # the outer parentheses the "- re / 16.0 * (...)" line is parsed
            # as a separate expression statement and its value is discarded,
            # so the re-dependent term would never reach zetnew.
            zetnew[i * (m + 2) + j] = (
                0.25 * (
                    zet[(i - 1) * (m + 2) + j]
                    + zet[(i + 1) * (m + 2) + j]
                    + zet[i * (m + 2) + j - 1]
                    + zet[i * (m + 2) + j + 1]
                )
                - re / 16.0 * (
                    (psi[i * (m + 2) + j + 1] - psi[i * (m + 2) + j - 1])
                    * (zet[(i + 1) * (m + 2) + j] - zet[(i - 1) * (m + 2) + j])
                    - (psi[(i + 1) * (m + 2) + j] - psi[(i - 1) * (m + 2) + j])
                    * (zet[i * (m + 2) + j + 1] - zet[i * (m + 2) + j - 1])
                )
            )

    return psinew, zetnew
254 | 
@jit()
def deltasq (newarr, oldarr, m, n):
    """Return the sum of squared differences between two flat arrays.

    Only entries ``arr[i * (m + 2) + j]`` with 1 <= i <= m and
    1 <= j <= n contribute to the sum.
    """
    row_len = m + 2
    total = 0.0
    for row in range(1, m + 1):
        base = row * row_len
        for col in range(1, n + 1):
            diff = newarr[base + col] - oldarr[base + col]
            total += diff * diff

    return total
264 | 
@jit()
def boundarypsi(psi,m,n,b,h,w):
    """Write the boundary values of ``psi`` on the bottom and right edges.

    ``psi`` is flat and addressed as ``psi[i * (m + 2) + j]``. It is
    modified in place and also returned. ``n`` is accepted but not used.
    """
    row_len = m + 2

    # Bottom edge (j == 0): ramp from 1 up to w-1, then hold at w
    for i in range(b + 1, b + w):
        psi[i * row_len] = float(i - b)

    for i in range(b + w, m + 1):
        psi[i * row_len] = float(w)

    # Right edge (i == m+1): hold at w, then ramp down from w-1 to 1
    last_row = (m + 1) * row_len

    for j in range(1, h + 1):
        psi[last_row + j] = float(w)

    for j in range(h + 1, h + w):
        psi[last_row + j] = float(w - j + h)

    return psi
283 | 
@jit()
def boundaryzet(zet, psi, m, n):
    """Set the edge values of ``zet`` from differences of ``psi``.

    Both arrays are flat and addressed as ``arr[i * (m + 2) + j]``.
    Each edge entry becomes twice the difference between the adjacent
    interior ``psi`` value and the ``psi`` value on the edge itself.
    ``zet`` is modified in place and also returned.
    """
    row_len = m + 2

    # Top/bottom edges: j == 0 and j == n+1
    for i in range(1, m + 1):
        base = i * row_len
        zet[base] = 2.0 * (psi[base + 1] - psi[base])
        zet[base + n + 1] = 2.0 * (psi[base + n] - psi[base + n + 1])

    # Left edge: i == 0
    for j in range(1, n + 1):
        zet[j] = 2.0 * (psi[row_len + j] - psi[j])

    # Right edge: i == m+1
    inner_row = m * row_len
    edge_row = (m + 1) * row_len
    for j in range(1, n + 1):
        zet[edge_row + j] = 2.0 * (psi[inner_row + j] - psi[edge_row + j])

    return zet
300 | 
# Script entry point: pass the command-line arguments (without the
# program name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------