├── .gitignore
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── Singularity
└── _profiler
    ├── Presentations
        └── README.md
    ├── _start_profiling.ipynb
    ├── jupyter_notebook
        ├── images
        │   ├── 1_correlation.png
        │   ├── 1_gpu_row.png
        │   ├── 1_openacc_row.png
        │   ├── 1_timeline.png
        │   ├── 1_timeline_full.png
        │   ├── 3launch5skip.png
        │   ├── 5_init.png
        │   ├── MPI_Division.jpg
        │   ├── NIC.png
        │   ├── Nsight Diagram.png
        │   ├── Optimization_Cycle.jpg
        │   ├── Outer_Loop.jpg
        │   ├── Range-Kutta.jpg
        │   ├── SOL-compute.png
        │   ├── Screenshot from 2020-04-15 10-25-49.png
        │   ├── Semi_Discrete.jpg
        │   ├── Semi_Discrete_Step.jpg
        │   ├── Time.jpg
        │   ├── Time_Step.jpg
        │   ├── UM.png
        │   ├── X_Y.jpg
        │   ├── allsection-compute.png
        │   ├── baseline-compute.png
        │   ├── baseline1-compute.png
        │   ├── c2compute.png
        │   ├── ccompute.png
        │   ├── cexer5.png
        │   ├── cfeedback1-2.png
        │   ├── cfeedback1.png
        │   ├── cfeedback1_.png
        │   ├── cfeedback2.png
        │   ├── cfeedback2_.png
        │   ├── cfeedback3-1.png
        │   ├── cfeedback3-1_.png
        │   ├── cfeedback3.png
        │   ├── cfeedback3_.png
        │   ├── cfeedback4.png
        │   ├── cfeedback4_.png
        │   ├── charts-compute.png
        │   ├── checkerpy.png
        │   ├── cli-out.png
        │   ├── collapse_feedback.png
        │   ├── collapse_pre.png
        │   ├── collapse_thread.png
        │   ├── compare_23.png
        │   ├── compute-cli-1.png
        │   ├── compute-cli-2.png
        │   ├── compute-memory.png
        │   ├── compute-memtable-hover.png
        │   ├── compute-memtable.png
        │   ├── compute-open.png
        │   ├── compute-sections.png
        │   ├── compute-sets.png
        │   ├── compute.png
        │   ├── compute_analyz.png
        │   ├── compute_command.png
        │   ├── compute_command_line.png
        │   ├── compute_tags.png
        │   ├── correlation.png
        │   ├── cpu.png
        │   ├── cpu_row.png
        │   ├── cuda.png
        │   ├── cuda_api.png
        │   ├── cuda_hw_sw.png
        │   ├── cuda_indexing.png
        │   ├── cuda_row_0.png
        │   ├── cuda_row_1.png
        │   ├── cuda_vec_add.png
        │   ├── data_feedback.png
        │   ├── data_thread.png
        │   ├── diagram.png
        │   ├── e1-nvtx.png
        │   ├── e1-nvtx_gui.png
        │   ├── e1-nvtx_terminal.png
        │   ├── expand-compute.png
        │   ├── f2compute.png
        │   ├── fcompute.png
        │   ├── ffeedback1-0.png
        │   ├── ffeedback1-1.png
        │   ├── ffeedback1.png
        │   ├── ffeedback2.png
        │   ├── ffeedback3.png
        │   ├── ffeedback4.png
        │   ├── fortran_nvtx.png
        │   ├── fortranexer5.png
        │   ├── fulllaunch.png
        │   ├── gang_128.png
        │   ├── gang_256.png
        │   ├── gang_32.png
        │   ├── gang_vector.png
        │   ├── git_branching.jpg
        │   ├── gpu_feedback.png
        │   ├── gpu_metrics.png
        │   ├── header-compute.png
        │   ├── jacobi_1.png
        │   ├── jacobi_2.png
        │   ├── jacobi_2_.png
        │   ├── jacobi_3.png
        │   ├── jacobi_3_event.png
        │   ├── jacobi_4_.png
        │   ├── jacobi_4_1.png
        │   ├── jacobi_5_0.png
        │   ├── jacobi_5_1.png
        │   ├── jacobi_5_occ.png
        │   ├── jacobi_5_schedule.png
        │   ├── jacobi_5_sol.png
        │   ├── jacobi_5_source.png
        │   ├── jacobi_5_warp.png
        │   ├── jacobi_6_jacobi_base.png
        │   ├── jacobi_6_jacobi_schedule.png
        │   ├── jacobi_6_swap_base.png
        │   ├── jacobi_7_base.png
        │   ├── jacobi_7_roof.png
        │   ├── jacobi_7_sol.png
        │   ├── jacobi_8_nsight.png
        │   ├── jacobi_8_sol.png
        │   ├── jacobi_8_source.png
        │   ├── jacobi_formula.png
        │   ├── jacobi_swap_mem.png
        │   ├── jacobi_swap_sol.png
        │   ├── kernel_feedback.png
        │   ├── kernel_indep_feedback.png
        │   ├── laplas3.png
        │   ├── launch-compute.png
        │   ├── list-set.png
        │   ├── memory-compute.png
        │   ├── mpi_comm.png
        │   ├── nsight_open - Copy.png
        │   ├── nsight_open.png
        │   ├── nsight_sys_tags.png
        │   ├── nsys-compute-command.png
        │   ├── nsys-compute-command1.png
        │   ├── nsys-compute-command2.png
        │   ├── nsys_data_mv.png
        │   ├── nsys_fast_mv.png
        │   ├── nsys_fast_mv_.png
        │   ├── nsys_slow.png
        │   ├── nsys_slow_mv.png
        │   ├── nvtx.PNG
        │   ├── nvtx_gpu.png
        │   ├── nvtx_multicore.png
        │   ├── nvtx_serial.png
        │   ├── occu-1.png
        │   ├── occu-2.png
        │   ├── occu-2_.png
        │   ├── occu-3.png
        │   ├── occu-3_.png
        │   ├── openacc correlation.png
        │   ├── openacc_3_directives.png
        │   ├── openacc_construct.png
        │   ├── openacc_copyclause.png
        │   ├── openacc_parallel.png
        │   ├── openacc_parallel2.png
        │   ├── openacc_parallel_loop.png
        │   ├── openmp_fork_join.png
        │   ├── openmp_parallel_construct.png
        │   ├── openmp_parallelfor_construct.png
        │   ├── openmp_target_distribute.png
        │   ├── openmp_target_teams.png
        │   ├── openmp_teams.png
        │   ├── openmp_teams_for.png
        │   ├── page-compute.png
        │   ├── parallel_data.png
        │   ├── parallel_data_feedback.png
        │   ├── parallel_detailed.png
        │   ├── parallel_expand.png
        │   ├── parallel_loop.png
        │   ├── parallel_timeline.png
        │   ├── parallel_unified.png
        │   ├── q1-1.png
        │   ├── q1-2.png
        │   ├── q2-1.png
        │   ├── q2-1_zoom.png
        │   ├── q2-2_zoom.png
        │   ├── q3-1.png
        │   ├── q3-2.png
        │   ├── q4-1.png
        │   ├── q4-1_zoom.png
        │   ├── q4-1_zoom2.png
        │   ├── q4-2.PNG
        │   ├── q4-2_zoom.png
        │   ├── q4-2_zoom2.png
        │   ├── rdf.png
        │   ├── roofline-achieved.png
        │   ├── roofline-analysis.png
        │   ├── roofline-baseline.png
        │   ├── roofline-compute.png
        │   ├── roofline-overview.png
        │   ├── roofline.png
        │   ├── roofline_collapse.png
        │   ├── rule-compute.png
        │   ├── sass-compute.png
        │   ├── sass-reg-dependency.png
        │   ├── scheduler_collapse.png
        │   ├── sections-compute.png
        │   ├── serial.png
        │   ├── sol.png
        │   ├── sol_baseline.png
        │   ├── source-compute.png
        │   ├── source_collapse.png
        │   ├── source_hover.png
        │   ├── source_loc.png
        │   ├── source_sass_collapse.png
        │   ├── summary-compute.png
        │   ├── thread.png
        │   ├── thread_row.png
        │   ├── triangle.png
        │   ├── ucx.png
        │   ├── warning-compute.png
        │   ├── warp_collapse.png
        │   └── workflow.png
        ├── introduction.ipynb
        ├── miniweather.ipynb
        ├── nsight_advanced.ipynb
        ├── nsight_compute.ipynb
        ├── nsight_systems.ipynb
        ├── profiling_lab1.ipynb
        ├── profiling_lab2.ipynb
        ├── profiling_lab3.ipynb
        ├── profiling_lab4.ipynb
        ├── profiling_lab5.ipynb
        └── profiling_lab6.ipynb
    └── source_code
        ├── lab1
            ├── Makefile
            ├── miniWeather_serial.cpp
            └── miniWeather_serial.f90
        ├── lab2
            ├── Makefile
            ├── miniWeather_openacc.cpp
            └── miniWeather_openacc.f90
        ├── lab3
            ├── Makefile
            ├── miniWeather_openacc.cpp
            └── miniWeather_openacc.f90
        ├── lab4
            ├── Makefile
            ├── miniWeather_openacc.cpp
            └── miniWeather_openacc.f90
        ├── lab5
            ├── Makefile
            ├── miniWeather_openacc.cpp
            └── miniWeather_openacc.f90
        └── lab6
            ├── Makefile
            ├── jacobi.cpp
            ├── jacobi_step1.cpp
            ├── jacobi_step2.cpp
            ├── jacobi_step3.cpp
            ├── jacobi_step4.cpp
            ├── jacobi_step5.cpp
            ├── jacobi_step6.cpp
            ├── jacobi_step7.cpp
            └── jacobi_step8.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | */.ipynb_checkpoints/*
 3 | alk.traj.dcd
 4 | *.simg
 5 | *.so*
 6 | *.a
 7 | *.la
 8 | mgpm
 9 | *.o
10 | *.out
11 | */.ses/*
12 | */.log/*
13 | */not repo/*
14 | */.nsys-rep/*
15 | */.sqlite/*
16 | */.ncu-rep/*
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ------------
 3 | 
 4 | Please use the following guidelines when contributing to this project. 
 5 | 
 6 | Before contributing significant changes, please begin a discussion of the desired changes via a GitHub Issue to prevent doing unnecessary or overlapping work.
 7 | 
 8 | ## License
 9 | 
10 | The preferred license for source code contributed to this project is the Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0) and for documentation, including Jupyter notebooks and text documentation, is the Creative Commons Attribution 4.0 International (CC BY 4.0) (https://creativecommons.org/licenses/by/4.0/). Contributions under other, compatible licenses will be considered on a case-by-case basis.
11 | 
12 | ## Styling
13 | 
14 | Please use the following style guidelines when making contributions.
15 | 
16 | ### Source Code
17 | * Two-space indentation, no tabs
18 | * To the extent possible, variable names should be descriptive
19 | * Code should be documentation with detail like what function does and returns making the code readable. The code should also have proper license at the beginning of the file.
20 | * Fortran codes should use free-form source files
21 | * Fortran codes should not use implicit variable names and should use implicit none
22 | * The following file extensions should be used appropriately
23 | 	* C - .c
24 | 	* C++ - .cpp
25 | 	* CUDA C/C++ - .cu
26 | 	* CUDA Fortran - .cuf
27 | 	* Fortran - .F90
28 | 	* Python = .py
29 | 
30 | ### Jupyter Notebooks & Markdown
31 | * When they appear inline with the text; directive names, clauses, function or subroutine names, variable names, file names, commands and command-line arguments should appear between two backticks.
32 | * Code blocks should begin with three backticks and either 'cpp' or 'fortran' to enable appropriate source formatting and end with three backticks.
33 | * Leave an empty line before and after the codeblock.
34 | Emphasis, including quotes made for emphasis and introduction of new terms should be highlighted between a single pair of asterisks
35 | * A level 1 heading should appear at the top of the notebook as the title of the notebook.
36 | * A horizontal rule should appear between sections that begin with a level 2 heading.
37 | 
38 | Please refer to the following template for jupyter notebook styling in the [github](https://github.com/openhackathons-org/gpubootcamp/tree/master/misc).
39 | 
40 | ## Contributing Labs/Modules
41 | 
42 | ### Directory stucture for Github
43 | 
44 | Before starting to work on new lab it is important to follow the recommended git structure as shown below to avoid and reformatting:
45 | 
46 | ```
47 | ├── _profiler
48 | │   ├── Presentations
49 | │   ├── jupyter_notebook
50 | │      ├── images
51 | │      ├── x.ipynb
52 | │      └── ...
53 | │   ├── source_code
54 | │      ├── lab x
55 | │          ├── x.cpp
56 | │          ├── x.f90
57 | │          └── Makefile
58 | │      └── ...
59 | │   └── _start_profiling.ipynb
60 | ├── LICENSE
61 | ├── README.md
62 | ├── Dockerfile
63 | └── Singularity
64 | ```
65 | 
66 | Each lab will have following files/directories consisting of training material for the lab.
67 | * `jupyter_notebook` folder: Consists of jupyter notebooks and its corresponding images.  
68 | * `source_code` folder: Source codes are stored in a separate directory which contains all the programming lanugaes (C/C++/Fortran). Source code folder may optionally contain Makefile especially for HPC labs. This folder may also contains `SOLUTIONS` folder for all the related solutions to that particular lab.  
69 | * presentations: Consists of presentations for the labs ( pdf format is preferred )
70 | * Dockerfile and Singularity  : Each lab should have both Docker and Singularity recipes.
71 |  
72 | The lab optionally may also add custom license in case of any deviation from the top level directory license ( Apache 2.0 ). 
73 | 
74 | ### Git Branching
75 | 
76 | Adding a new feature/lab will follow a forking workflow. Which means a feature branch development will happen on a forked repo which later gets merged into our original project (GPUHackathons.org) repository.
77 | 
78 | 
79 | ![Git Branching Workflow](_profiler/jupyter_notebook/images/git_branching.jpg)
80 | 
81 | The 5 main steps depicted in image above are as follows:
82 | 1. Fork: To create a new lab/feature the GPUHackathons.org repository must be forked. Fork will create a snapshot of GPUHackathons.org repository at the time it was forked. Any new feature/lab that will be developed should be based on the develop branch of the repository.
83 | 2.  Clone: Developer can than clone this new repository to local machine
84 | Create Feature Branch: Create a new branch with a feature name in which your changes will be done. Recommend naming convention of feature branch is naming convention for branch: profiler-<feature_name>. The new changes that developer makes can be added, committed and pushed
85 | 3. Push: After the changes are committed, the developer pushes the changes to the remote branch. Push command helps the local changes to github repository
86 | 4. Pull: Submit a pull request. Upon receiving pull request a Hackathon team reviewer/owner will review the changes and upon accepting it can be merged into the develop branch of GpuHacakthons.org
87 | 
88 | Git Branch details are as follows:
89 | 
90 | * master branch: Consists of the stable branch. 
91 | 	* origin/master to be the main branch where the source code of HEAD always reflects a production-ready state
92 | 	* Merge request is possible through:  develop branch
93 | * develop branch: branched from master branch
94 | 	* Must branch from: master branch
95 | 	* Must merge back into: master branch
96 | 	* It is the main development branch where the source code of HEAD always reflects a state with the latest delivered development changes for the next release.
97 | 	* When the source code in the develop branch reaches a stable point and is ready to be released, all of the changes should be merged back into master somehow and then tagged with a release number
98 | 	* All feature development should happen by forking GPUHackathons.org and branching from develop branch only.
99 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | # To build: $ sudo docker build -t profiling:latest .
 4 | # To run: $ sudo docker run --rm -it --gpus=all -p 8888:8888 profiling:latest
 5 | # Finally, open http://127.0.0.1:8888/
 6 | 
 7 | FROM nvcr.io/nvidia/nvhpc:22.7-devel-cuda_multi-ubuntu20.04
 8 | 
 9 | RUN apt-get update -y && \
10 |     apt-get dist-upgrade -y && \
11 |     DEBIAN_FRONTEND=noninteractive apt-get -yq install  --no-install-recommends \
12 |     m4 vim-nox emacs-nox nano zip\
13 |     python3-pip python3-setuptools git-core inotify-tools \
14 |     curl git-lfs nginx\
15 |     build-essential openssh-server openssh-client && \
16 |     rm -rf /var/lib/apt/cache/* 
17 | 
18 | RUN apt-get update 
19 | RUN apt-get install --no-install-recommends -y python3
20 | RUN pip3 install --upgrade pip
21 | RUN apt-get update -y        
22 | RUN apt-get install -y git nvidia-modprobe
23 | RUN pip3 install jupyterlab
24 | # Install required python packages
25 | RUN pip3 install ipywidgets
26 | RUN pip3 install netcdf4
27 | 
28 | RUN apt-get install --no-install-recommends -y build-essential
29 | 
30 | ENV PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin" 
31 | 
32 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
33 |     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3  && \
34 |     rm Miniconda3-latest-Linux-x86_64.sh && \
35 |     /opt/anaconda3/bin/conda install -y -q netcdf4
36 | 
37 | ADD     _profiler /labs
38 | 
39 | WORKDIR /labs
40 | CMD jupyter-lab --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Profiling with NVIDIA Nsight Tools Bootcamp
 2 | This repository contains learning materials and exercises for NVIDIA Nsight Tools. Gola is to learn how to profile your application with NVIDIA Nsight Systems,Compute and NVTX API calls to find performance limiters and bottlenecks and apply incremental parallelization strategies. The content was tested on **NVIDIA driver 515.65**.
 3 | 
 4 | - Introduction: Overview of profiling tools and Mini Weather application
 5 | - Lab 1: Profile Serial application to find hotspots using NVIDIA Nsight System
 6 | - Lab 2: Parallelise the serial application using OpenACC compute directives
 7 | - Lab 3: Optimizing loops 
 8 | - Lab 4: Apply incremental parallelization strategies and use profiler's report for the next step
 9 | - Lab 5: Nsight Compute Kernel Level Analysis
10 | - [Optional]
11 |     - Lab 6:Performance Analysis of an application using Nsight Systems and Compute (CUDA example)
12 |     - Advanced: Multiprocess profiling 
13 | 
14 | ## Target Audience
15 | 
16 | The target audience for this lab is researchers/graduate students and developers who are interested in getting hands on experience with the NVIDIA Nsight System through profiling a real life parallel application.
17 | 
18 | While Labs 1-5 do not assume any expertise in CUDA experience, basic knowledge of OpenACC programming (e.g: compute constructs), GPU architecture, and programming experience with C/C++ is desirable.
19 | 
20 | The Optional lab 6 requires basic knowledge of CUDA programming, GPU architecture, and programming experience with C/C++.
21 | 
22 | ## Tutorial Duration
23 | 
24 | The lab material will be presented in a 2.5hr session. The link to the material is available for download at the end of each lab.
25 | 
26 | 
27 | ## Prerequisites:
28 | To run this content you will need a machine with NVIDIA GPUs (Nsight Systems supports Pascal and above (SM 60+), and Nsight Compute supports Volta and above (SM 70+)).
29 | 
30 | - Install the [Docker](https://docs.docker.com/get-docker/) or [Singularity](https://sylabs.io/docs/]).
31 | - Install Nvidia toolkit, [Nsight Systems (latest version)](https://developer.nvidia.com/nsight-systems) and [compute (latest version)](https://developer.nvidia.com/nsight-compute).
32 | - The base containers required for the lab may require users to create a NGC account and generate an API key (https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#registering-activating-ngc-account)
33 | 
34 | ## Creating containers
35 | To start with, you will have to build a Docker or Singularity container.
36 | 
37 | ### Docker Container
38 | To build a docker container, run: 
39 | `sudo docker build -t <imagename>:<tagnumber> .`
40 | 
41 | For instance:
42 | `sudo docker build -t profiling:latest .`
43 | 
44 | The code labs have been written using Jupyter lab and a Dockerfile has been built to simplify deployment. In order to serve the docker instance for a student, it is necessary to expose port 8000 from the container, for instance, the following command would expose port 8000 inside the container as port 8000 on the lab machine:
45 | 
46 | `sudo docker run --rm -it --gpus=all -p 8888:8888 profiling:latest`
47 | 
48 | When this command is run, you can browse to the serving machine on port 8000 using any web browser to access the labs. For instance, from if they are running on the local machine the web browser should be pointed to http://localhost:8000. The `--gpus` flag is used to enable `all` NVIDIA GPUs during container runtime. The `--rm` flag is used to clean an temporary images created during the running of the container. The `-it` flag enables killing the jupyter server with `ctrl-c`. This command may be customized for your hosting environment.
49 | 
50 | 
51 | Then, inside the container launch the Jupyter lab assigning the port you opened:
52 | 
53 | `jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root`
54 | 
55 | 
56 | Once inside the container, open the jupyter lab in browser: http://localhost:8888, and start the lab by clicking on the `_start_profiling.ipynb` notebook.
57 | 
58 | ### Singularity Container
59 | 
60 | To build the singularity container, run: 
61 | `sudo singularity build _profiler.simg Singularity` . If you do not have `sudo` rights, you can build the singularity container with `--fakeroot` option: `singularity build --fakeroot _profiler.simg Singularity`
62 | 
63 | and copy the files to your local machine to make sure changes are stored locally:
64 | `singularity run _profiler.simg cp -rT /labs ~/labs`
65 | 
66 | Then, run the container:
67 | `singularity run --nv _profiler.simg jupyter-lab --notebook-dir=~/labs`
68 | 
69 | Once inside the container, open the jupyter lab in browser: http://localhost:8888, and start the lab by clicking on the `_start_profiling.ipynb` notebook.
70 | 
71 | 
72 | ## Known issues
73 | - Please go through the list of exisiting bugs/issues or file a new issue at [Github](https://github.com/openhackathons-org/HPC_Profiler/issues).
74 | 


--------------------------------------------------------------------------------
/Singularity:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | Bootstrap: docker
 4 | FROM: nvcr.io/nvidia/nvhpc:22.7-devel-cuda_multi-ubuntu20.04
 5 | 
 6 | %environment
 7 |     export XDG_RUNTIME_DIR=
 8 |     export PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin"
 9 | 
10 | %post
11 |     build_tmp=$(mktemp -d) && cd ${build_tmp}
12 | 
13 |     apt-get -y update
14 |     apt-get -y dist-upgrade 
15 |     DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
16 | 	    m4 vim-nox emacs-nox nano zip\
17 |  	    python3-pip python3-setuptools git-core inotify-tools \
18 | 	    curl git-lfs \
19 | 	    build-essential
20 |     rm -rf /var/lib/apt/cache/* 
21 | 
22 |     pip3 install --upgrade pip
23 |     apt-get update -y
24 |     apt-get -y install git nvidia-modprobe
25 |     pip3 install jupyterlab
26 |     pip3 install ipywidgets
27 |     pip3 install jupyter netcdf4
28 | 
29 |     apt-get install --no-install-recommends -y build-essential 
30 | 
31 |     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
32 |     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 
33 |     rm Miniconda3-latest-Linux-x86_64.sh 
34 |     /opt/anaconda3/bin/conda install -y -q netcdf4
35 | 
36 |     cd /
37 |     rm -rf ${build_tmp}
38 | 
39 | %files
40 |     _profiler/ /labs
41 | 
42 | %runscript
43 |     "$@"
44 | 
45 | %labels
46 |     AUTHOR mozhgank
47 | 


--------------------------------------------------------------------------------
/_profiler/Presentations/README.md:
--------------------------------------------------------------------------------
1 | For Partners who are interested in delivering the critical hands-on skills needed to advance science in form of Bootcamp can reach out to us at [Open Hackathons Partner](https://www.openhackathons.org/s/about-open-hackathons) website. In addition to current bootcamp material the Partners will be provided with the following:
2 | 
3 | - Presentation: All the Bootcamps are accompanied with training material presentations which can be used during the Bootcamp session.
4 | - Mini challenge : To test the knowledge gained during this Bootcamp a mini application challenge is provided along with sample Solution.
5 | - Additional Support: On case to case basis the Partners can also be trained on how to effectively deliver the Bootcamp with maximal impact.


--------------------------------------------------------------------------------
/_profiler/_start_profiling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Profiling Tutorial\n",
  8 |     "\n",
  9 |     "### Learning objectives\n",
 10 |     "Learn how to profile your application with NVIDIA® Nsight™ Systems and NVIDIA Tools Extension SDK (NVTX) Application Programming Interface (API)  calls to find performance limiters and bottlenecks and apply incremental parallelization strategies. In this lab, you will:\n",
 11 |     "\n",
 12 |     "- Learn to follow a cyclical process (analyze, parallelize, optimize) to help you identify the portions of the code that would benefit from GPU acceleration and apply parallelization strategies and optimization techniques to see additional speedups and performance improvements\n",
 13 |     "- Understand what a profiler is and which NVIDIA Nsight tool to choose in order to profile your application\n",
 14 |     "- Profile a sequential weather modeling application (integrated with NVTX APIs) with Nsight Systems to capture and trace CPU events and time ranges\n",
 15 |     "- Understand how to use  Nsight Systems profiler reports to detect hotspots to port the application to the GPU\n",
 16 |     "- Learn how to use Nsight Systems to identify issues such as underutilized GPU devices and unnecessary data movements in the application and  apply optimization strategies step by step to expose more parallelism and utilize the computer’s CPU and GPU\n",
 17 |     "- Learn how to use occupancy to address performance limitations\n",
 18 |     "\n",
 19 |     "\n",
 20 |     "In this lab, we will be optimizing a serial application written in C programming language. The first 5 labs optimize the weather simulation application using the OpenACC programming model. The optional lab 6 optimizes an application based that uses Jacobi iterative method using CUDA. \n",
 21 |     "\n",
 22 |     "\n",
 23 |     "### Tutorial Outline\n",
 24 |     "- [Introduction](jupyter_notebook/introduction.ipynb)\n",
 25 |     "    - Overview of Nsight profiler tools ([Nsight Systems](jupyter_notebook/nsight_systems.ipynb) and [Nsight Compute](jupyter_notebook/nsight_compute.ipynb))\n",
 26 |     "    - Overview of [Mini Weather application](jupyter_notebook/miniweather.ipynb)\n",
 27 |     "    - Optimization Steps to parallel programming with OpenACC\n",
 28 |     "- [Lab 1](jupyter_notebook/profiling_lab1.ipynb)\n",
 29 |     "    - How to compile a serial application with NVIDIA HPC compiler\n",
 30 |     "    - How to profile a serial application with Nsight Systems and NVTX APIs\n",
 31 |     "    - How to use profiler's report to find hotspots\n",
 32 |     "    - Scaling and Amdahl's law and why it matters\n",
 33 |     "- [Lab 2](jupyter_notebook/profiling_lab2.ipynb) \n",
 34 |     "    - Parallelize the serial application using OpenACC compute directives\n",
 35 |     "    - How to compile a parallel application with NVIDIA HPC compiler\n",
 36 |     "    - What does the compiler feedback tell us?\n",
 37 |     "    - Profile with Nsight Systems\n",
 38 |     "    - Finding bottlenecks from Nsight Systems report\n",
 39 |     "- [Lab 3](jupyter_notebook/profiling_lab3.ipynb)\n",
 40 |     "    - How to combine the knowledge from compiler feedback and profiler to optimize the application\n",
 41 |     "    - What is occupancy?\n",
 42 |     "    - Demystifying Gangs, Workers, and Vectors\n",
 43 |     "    - Apply collapse clause to optimize the application further\n",
 44 |     "- [Lab 4](jupyter_notebook/profiling_lab4.ipynb) \n",
 45 |     "    - Inspect data movement from the profiler's report\n",
 46 |     "    - Data management with OpenACC\n",
 47 |     "    - Apply incremental parallelization strategies and use the profiler report for the next step\n",
 48 |     "- [Lab 5](jupyter_notebook/profiling_lab5.ipynb)\n",
 49 |     "    - When and How to use Nsight Compute\n",
 50 |     "    - What does the profiler tell us: where is the bottleneck?\n",
 51 |     "    - How to use baselines with Nsight Compute\n",
 52 |     "- **Optional**\n",
 53 |     "    - [Lab 6](jupyter_notebook/profiling_lab6.ipynb)\n",
 54 |     "        - Performance Analysis of an application using Nsight Systems and Compute (NVIDIA CUDA® example)\n",
 55 |     "    - [Advanced](jupyter_notebook/nsight_advanced.ipynb)\n",
 56 |     "        - What are GPU Metrics in Nsight Systems\n",
 57 |     "        - How to profile multi processes\n",
 58 |     "    \n",
 59 |     "\n",
 60 |     "### Tutorial Duration\n",
 61 |     "The lab material will be presented in a 2.5hr session. The link to the material is available for download at the end of each lab.\n",
 62 |     "\n",
 63 |     "### Content Level\n",
 64 |     "Beginner, Intermediate\n",
 65 |     "\n",
 66 |     "### Target Audience and Prerequisites\n",
 67 |     "The target audience for this lab is researchers,graduate students, and developers who are interested in getting hands on experience with the NVIDIA Nsight System through profiling a real-life parallel application.\n",
 68 |     "\n",
 69 |     "While Labs 1-5 do not assume any expertise in CUDA, basic knowledge of OpenACC programming (e.g: compute constructs), GPU architecture, and programming experience with C/C++ is desirable.\n",
 70 |     "\n",
 71 |     "The Optional lab 6 requires basic knowledge of CUDA programming, GPU architecture, and programming experience with C/C++.\n",
 72 |     "\n",
 73 |     "### Additional Resources\n",
 74 |     "- Please install [NVIDIA Nsight Compute](https://docs.nvidia.com/nsight-compute/index.html) and [NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/) on your local machine.\n",
 75 |     "\n",
 76 |     "- [Nsight Developer Tools Training Contents](https://github.com/NVIDIA/nsight-training)\n",
 77 |     "\n",
 78 |     "--- "
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "## Licensing \n",
 86 |     "\n",
 87 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
 88 |    ]
 89 |   }
 90 |  ],
 91 |  "metadata": {
 92 |   "anaconda-cloud": {},
 93 |   "kernelspec": {
 94 |    "display_name": "Python 3",
 95 |    "language": "python",
 96 |    "name": "python3"
 97 |   },
 98 |   "language_info": {
 99 |    "codemirror_mode": {
100 |     "name": "ipython",
101 |     "version": 3
102 |    },
103 |    "file_extension": ".py",
104 |    "mimetype": "text/x-python",
105 |    "name": "python",
106 |    "nbconvert_exporter": "python",
107 |    "pygments_lexer": "ipython3",
108 |    "version": "3.7.4"
109 |   }
110 |  },
111 |  "nbformat": 4,
112 |  "nbformat_minor": 4
113 | }
114 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/1_correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/1_correlation.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/1_gpu_row.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/1_gpu_row.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/1_openacc_row.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/1_openacc_row.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/1_timeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/1_timeline.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/1_timeline_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/1_timeline_full.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/3launch5skip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/3launch5skip.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/5_init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/5_init.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/MPI_Division.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/MPI_Division.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/NIC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/NIC.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Nsight Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Nsight Diagram.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Optimization_Cycle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Optimization_Cycle.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Outer_Loop.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Outer_Loop.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Range-Kutta.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Range-Kutta.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/SOL-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/SOL-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Screenshot from 2020-04-15 10-25-49.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Screenshot from 2020-04-15 10-25-49.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Semi_Discrete.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Semi_Discrete.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Semi_Discrete_Step.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Semi_Discrete_Step.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Time.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/Time_Step.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/Time_Step.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/UM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/UM.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/X_Y.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/X_Y.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/allsection-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/allsection-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/baseline-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/baseline-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/baseline1-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/baseline1-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/c2compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/c2compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ccompute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ccompute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cexer5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cexer5.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback1-2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback1_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback1_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback2_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback2_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback3-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback3-1_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback3-1_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback3.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback3_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback3_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback4.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cfeedback4_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cfeedback4_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/charts-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/charts-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/checkerpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/checkerpy.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cli-out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cli-out.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/collapse_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/collapse_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/collapse_pre.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/collapse_pre.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/collapse_thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/collapse_thread.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compare_23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compare_23.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-cli-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-cli-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-cli-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-cli-2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-memory.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-memtable-hover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-memtable-hover.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-memtable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-memtable.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-open.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-open.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-sections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-sections.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute-sets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute-sets.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute_analyz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute_analyz.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute_command.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute_command.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute_command_line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute_command_line.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/compute_tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/compute_tags.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/correlation.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cpu.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cpu_row.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cpu_row.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_api.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_hw_sw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_hw_sw.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_indexing.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_row_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_row_0.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_row_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_row_1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/cuda_vec_add.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/cuda_vec_add.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/data_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/data_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/data_thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/data_thread.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/diagram.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/e1-nvtx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/e1-nvtx.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/e1-nvtx_gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/e1-nvtx_gui.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/e1-nvtx_terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/e1-nvtx_terminal.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/expand-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/expand-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/f2compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/f2compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/fcompute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/fcompute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback1-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback1-0.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback1-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback3.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ffeedback4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ffeedback4.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/fortran_nvtx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/fortran_nvtx.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/fortranexer5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/fortranexer5.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/fulllaunch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/fulllaunch.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gang_128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gang_128.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gang_256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gang_256.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gang_32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gang_32.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gang_vector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gang_vector.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/git_branching.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/git_branching.jpg


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gpu_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gpu_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/gpu_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/gpu_metrics.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/header-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/header-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_2_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_2_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_3.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_3_event.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_3_event.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_4_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_4_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_4_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_4_1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_0.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_occ.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_occ.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_schedule.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_sol.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_source.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_5_warp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_5_warp.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_6_jacobi_base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_6_jacobi_base.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_6_jacobi_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_6_jacobi_schedule.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_6_swap_base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_6_swap_base.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_7_base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_7_base.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_7_roof.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_7_roof.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_7_sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_7_sol.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_8_nsight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_8_nsight.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_8_sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_8_sol.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_8_source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_8_source.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_formula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_formula.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_swap_mem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_swap_mem.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/jacobi_swap_sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/jacobi_swap_sol.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/kernel_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/kernel_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/kernel_indep_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/kernel_indep_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/laplas3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/laplas3.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/launch-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/launch-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/list-set.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/list-set.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/memory-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/memory-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/mpi_comm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/mpi_comm.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsight_open - Copy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsight_open - Copy.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsight_open.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsight_open.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsight_sys_tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsight_sys_tags.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys-compute-command.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys-compute-command.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys-compute-command1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys-compute-command1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys-compute-command2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys-compute-command2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys_data_mv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys_data_mv.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys_fast_mv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys_fast_mv.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys_fast_mv_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys_fast_mv_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys_slow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys_slow.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nsys_slow_mv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nsys_slow_mv.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nvtx.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nvtx.PNG


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nvtx_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nvtx_gpu.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nvtx_multicore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nvtx_multicore.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/nvtx_serial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/nvtx_serial.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/occu-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/occu-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/occu-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/occu-2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/occu-2_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/occu-2_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/occu-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/occu-3.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/occu-3_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/occu-3_.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc correlation.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_3_directives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_3_directives.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_construct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_construct.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_copyclause.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_copyclause.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_parallel.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_parallel2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_parallel2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openacc_parallel_loop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openacc_parallel_loop.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_fork_join.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_fork_join.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_parallel_construct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_parallel_construct.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_parallelfor_construct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_parallelfor_construct.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_target_distribute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_target_distribute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_target_teams.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_target_teams.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_teams.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_teams.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/openmp_teams_for.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/openmp_teams_for.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/page-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/page-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_data.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_data_feedback.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_data_feedback.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_detailed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_detailed.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_expand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_expand.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_loop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_loop.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_timeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_timeline.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/parallel_unified.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/parallel_unified.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q1-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q1-2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q2-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q2-1_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q2-1_zoom.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q2-2_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q2-2_zoom.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q3-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q3-2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-1.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-1_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-1_zoom.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-1_zoom2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-1_zoom2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-2.PNG


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-2_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-2_zoom.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/q4-2_zoom2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/q4-2_zoom2.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/rdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/rdf.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline-achieved.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline-achieved.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline-analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline-analysis.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline-baseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline-baseline.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline-overview.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/roofline_collapse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/roofline_collapse.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/rule-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/rule-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/sass-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/sass-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/sass-reg-dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/sass-reg-dependency.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/scheduler_collapse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/scheduler_collapse.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/sections-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/sections-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/serial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/serial.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/sol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/sol.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/sol_baseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/sol_baseline.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/source-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/source-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/source_collapse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/source_collapse.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/source_hover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/source_hover.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/source_loc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/source_loc.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/source_sass_collapse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/source_sass_collapse.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/summary-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/summary-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/thread.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/thread_row.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/thread_row.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/triangle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/triangle.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/ucx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/ucx.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/warning-compute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/warning-compute.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/warp_collapse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/warp_collapse.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/images/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openhackathons-org/HPC_Profiler/6cd4c6166cea9156bd9cba2a7f6c3166f1651708/_profiler/jupyter_notebook/images/workflow.png


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/introduction.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by  clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Learning objectives\n",
 24 |     "The **goal** of this lab is to:\n",
 25 |     "\n",
 26 |     "- Learn how to profile your application using NVIDIA profiling tools\n",
 27 |     "- Learn and understand the NVIDIA Nsight Systems and compute profiler report\n",
 28 |     "- Learn how to integrate NVIDIA Tools Extension SDK (NVTX) markers in your application to trace CPU events when profiling \n",
 29 |     "- Learn about the optimization cycle and how to find bottlenecks via the NVIDIA profiling tools\n",
 30 |     "\n",
 31 |     "We do not intend to cover:\n",
 32 |     "\n",
 33 |     "- OpenACC programming model and optimization techniques in detail\n",
 34 |     "\n",
 35 |     "# NVIDIA Profiler\n",
 36 |     "\n",
 37 |     "### What is Profiling\n",
 38 |     "Profiling is the first step in optimizing and tuning your application. Profiling an application helps us understand where most of the execution time is spent and gives us an understanding of an application’s  performance characteristics to easily identify parts of the code that present opportunities for improvement. Finding the hotspots and bottlenecks in your application can help you decide where to focus your optimization efforts.\n",
 39 |     "\n",
 40 |     "### NVIDIA Nsight Tools\n",
 41 |     "NVIDIA offers Nsight tools (Nsight Systems, Nsight Compute, Nsight Graphics), a collection of applications that enable developers to debug and, profile the performance of applications using CUDA, OpenACC, or OpenMP applications.\n",
 42 |     "\n",
 43 |     "Your profiling workflow will change to reflect the individual Nsight tools. Start with Nsight Systems to get a system-level overview of the workload and eliminate any system-level bottlenecks, such as unnecessary thread synchronization or data movement, and improve the system-level parallelism of your algorithms. Then, proceed to Nsight Compute or Nsight Graphics to optimize the most significant CUDA kernels or graphics workloads. Periodically return to Nsight Systems to ensure that you remain focused on the largest bottleneck as the bottleneck may have shifted and kernel level optimizations may not achieve the expected level of improvement.\n",
 44 |     "\n",
 45 |     "- **Nsight Systems** analyze application algorithms system-wide\n",
 46 |     "- **Nsight Compute** debugs and optimizes CUDA kernels \n",
 47 |     "- **Nsight Graphics** debugs and optimizes graphic workloads\n",
 48 |     "\n",
 49 |     "<img src=\"images/Nsight Diagram.png\" width=\"80%\" height=\"80%\">\n",
 50 |     "*The data flow between the NVIDIA Nsight tools.*\n",
 51 |     "\n",
 52 |     "**Please follow the links to learn more about [Nsight Systems](nsight_systems.ipynb) and [Nsight Compute](nsight_compute.ipynb) (links contain notebooks)**. Once you reviewed the content, follow the below sections."
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "### Steps to Follow\n",
 60 |     "<a name=\"steps\"></a>\n",
 61 |     "\n",
 62 |     "To obtain the best performance from the GPU and utilize the hardware, one should follow the cyclical process (analyze, parallelize, optimize). \n",
 63 |     "\n",
 64 |     "- **Analyze**: In this step, you first identify the portion of your code that includes most of the computation and where most of the execution time is spent. From here, you find the hotspots, evaluate the bottlenecks, and investigate GPU acceleration.\n",
 65 |     "\n",
 66 |     "- **Parallelize**: Once bottlenecks are identified, we use the techniques to paralellize the routines where most of the time is spent.\n",
 67 |     "\n",
 68 |     "- **Optimize**:  To further improve the performance, one can implement optimization strategies step-by-step in an iterative process: identifying optimization opportunities, applying and testing the optimization method, verifying and repeating the process.\n",
 69 |     "\n",
 70 |     "Note: The above optimization is done incrementally after investigating the profiler output.\n",
 71 |     "\n",
 72 |     "We will follow the optimization cycle for porting and improving the code performance.\n",
 73 |     "\n",
 74 |     "<img src=\"images/Optimization_Cycle.jpg\" width=\"80%\" height=\"80%\">"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "# Getting Started \n",
 82 |     "In the following sections, we parallelize and optimize the serial [mini weather application](miniweather.ipynb) following the above steps. The next section comprises five exercises, each will guide you through steps to detect performance limiters and overcome them. For each exercise, inspect the code, compile, and profile it. Then, investigate the profiler’s report to identify the bottlenecks and spot the optimization opportunities.  At each step, locate problem areas in the application and make improvements iteratively to increase performance.\n",
 83 |     "\n",
 84 |     "This lab comprises multiple exercises, each following the optimization cycle method. For each exercise, build the code by running the `make` and profile. In these labs, we focus on Nsight Systems to get the system-wide actionable insights to eliminate bottlenecks as well as deep diving into the kernel using Nsight Compute.\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "**NOTE**: Example screenshots are for reference only and you may not get an identical profiler report. In other words, some **screenshots represent profiler reports for the values of 400,200,1500. Throughout the notebook, we change these values/parameters to reduce the runtime.**"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "-----\n",
 95 |     "\n",
 96 |     "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[NEXT](profiling_lab1.ipynb)</div>\n",
 97 |     "\n",
 98 |     "-----"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "# Links and Resources\n",
106 |     "\n",
107 |     "\n",
108 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
109 |     "\n",
110 |     "\n",
111 |     "**NOTE**: To be able to see the Nsight System and Compute profiler outputs, please download the latest versions from below links:\n",
112 |     "\n",
113 |     "- https://developer.nvidia.com/nsight-systems\n",
114 |     "- https://developer.nvidia.com/nsight-compute\n",
115 |     "\n",
116 |     "\n",
117 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
118 |     "\n",
119 |     "--- \n",
120 |     "\n",
121 |     "## Licensing \n",
122 |     "\n",
123 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
124 |    ]
125 |   }
126 |  ],
127 |  "metadata": {
128 |   "anaconda-cloud": {},
129 |   "kernelspec": {
130 |    "display_name": "Python 3",
131 |    "language": "python",
132 |    "name": "python3"
133 |   },
134 |   "language_info": {
135 |    "codemirror_mode": {
136 |     "name": "ipython",
137 |     "version": 3
138 |    },
139 |    "file_extension": ".py",
140 |    "mimetype": "text/x-python",
141 |    "name": "python",
142 |    "nbconvert_exporter": "python",
143 |    "pygments_lexer": "ipython3",
144 |    "version": "3.7.4"
145 |   }
146 |  },
147 |  "nbformat": 4,
148 |  "nbformat_minor": 4
149 | }
150 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/miniweather.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# A MINI-WEATHER APPLICATION\n",
  8 |     "\n",
  9 |     "In this lab, we will accelerate a fluid simulation in the context of atmosphere and weather simulation.\n",
 10 |     "This mini weather code mimics the basic dynamics seen in atmospheric weather and climate.\n",
 11 |     "\n",
 12 |     "The figure below demonstrates how a narrow jet of fast and slightly cold wind is injected into a balanced, neutral atmosphere at rest from the left domain near the model.\n",
 13 |     "\n",
 14 |     "<img src=\"images/Time.jpg\" width=\"80%\" height=\"80%\">\n",
 15 |     "\n",
 16 |     "Simulation is a repetitive process from 0 to the desired simulated time, increasing by Δt on every iteration.\n",
 17 |     "Each Δt step is practically the same operation. Each simulation is solving a differential equation that represents how the flow of the atmosphere (fluid) changes according to small perturbations. To simplify this solution the code uses dimensional splitting: Each dimension X and Z are treated independently.\n",
 18 |     "\n",
 19 |     "\n",
 20 |     "<img src=\"images/X_Y.jpg\" width=\"80%\" height=\"80%\">\n",
 21 |     "\n",
 22 |     "\n",
 23 |     "The differential equation has a time derivative that needs integration, and a simple low-storage Runge-Kutta ordinary differential equations (ODE) solver is used to integrate the time derivative. In each time step, the order in which the dimensions are solved is reversed, giving second-order accuracy. \n",
 24 |     "\n",
 25 |     "\n",
 26 |     "<img src=\"images/Range-Kutta.jpg\" width=\"70%\" height=\"70%\">\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "### The objective of this exercise is not to dwell on the math but to make use of OpenACC to parallelize and improve performance.\n",
 30 |     "\n",
 31 |     "The general flow of the code is shown in the diagram below. For each time step, the differential equations are solved.\n",
 32 |     "\n",
 33 |     "\n",
 34 |     "<img src=\"images/Outer_Loop.jpg\" width=\"70%\" height=\"70%\">\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "\n",
 38 |     "```cpp\n",
 39 |     "while (etime < sim_time) {\n",
 40 |     "    //If the time step leads to exceeding the simulation time, shorten it for the last step\n",
 41 |     "    if (etime + dt > sim_time) { dt = sim_time - etime; }\n",
 42 |     "    //Perform a single time step\n",
 43 |     "    perform_timestep(state,state_tmp,flux,tend,dt);\n",
 44 |     "    //Inform the user\n",
 45 |     "    if (masterproc) { printf( \"Elapsed Time: %lf / %lf\\n\", etime , sim_time ); }\n",
 46 |     "    //Update the elapsed time and output counter\n",
 47 |     "    etime = etime + dt;\n",
 48 |     "    output_counter = output_counter + dt;\n",
 49 |     "    //If it's time for output, reset the counter, and do output\n",
 50 |     "    if (output_counter >= output_freq) {\n",
 51 |     "      output_counter = output_counter - output_freq;\n",
 52 |     "      output(state,etime);\n",
 53 |     "    }\n",
 54 |     "  }\n",
 55 |     "  \n",
 56 |     "```\n",
 57 |     "\n",
 58 |     "At every time step, the direction is reversed to get the second-order derivative.\n",
 59 |     "\n",
 60 |     "\n",
 61 |     "<img src=\"images/Time_Step.jpg\" width=\"70%\" height=\"70%\">\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "```cpp\n",
 65 |     "void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {\n",
 66 |     "  if (direction_switch) {\n",
 67 |     "    //x-direction first\n",
 68 |     "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
 69 |     "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
 70 |     "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
 71 |     "    //z-direction second\n",
 72 |     "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
 73 |     "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
 74 |     "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
 75 |     "  } else {\n",
 76 |     "    //z-direction second\n",
 77 |     "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
 78 |     "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
 79 |     "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
 80 |     "    //x-direction first\n",
 81 |     "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
 82 |     "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
 83 |     "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
 84 |     "  }\n",
 85 |     "  if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; }\n",
 86 |     "}\n",
 87 |     "```\n",
 88 |     "\n",
 89 |     "<img src=\"images/Semi_Discrete.jpg\" width=\"70%\" height=\"70%\">\n",
 90 |     "\n",
 91 |     "\n",
 92 |     "-----\n",
 93 |     "\n",
 94 |     "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[HOME](../_start_profiling.ipynb#steps)</div>\n",
 95 |     "\n",
 96 |     "-----\n"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "## Licensing \n",
104 |     "\n",
105 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
106 |    ]
107 |   }
108 |  ],
109 |  "metadata": {
110 |   "anaconda-cloud": {},
111 |   "kernelspec": {
112 |    "display_name": "Python 3",
113 |    "language": "python",
114 |    "name": "python3"
115 |   },
116 |   "language_info": {
117 |    "codemirror_mode": {
118 |     "name": "ipython",
119 |     "version": 3
120 |    },
121 |    "file_extension": ".py",
122 |    "mimetype": "text/x-python",
123 |    "name": "python",
124 |    "nbconvert_exporter": "python",
125 |    "pygments_lexer": "ipython3",
126 |    "version": "3.7.4"
127 |   }
128 |  },
129 |  "nbformat": 4,
130 |  "nbformat_minor": 4
131 | }
132 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/nsight_advanced.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Learning objectives\n",
 24 |     "The **goal** of this lab is to:\n",
 25 |     "\n",
 26 |     "- •\tLearn about tracing of Message Passing Interface (MPI), OpenSHMEM, NVSHMEM™ and NVIDIA Collective Communication Library (NCCL)\n",
 27 |     "- Learn how to do multi-process profiling\n",
 28 |     "\n",
 29 |     "We do not intend to cover:\n",
 30 |     "\n",
 31 |     "- How to use NVSHMEM, NCCL, and MPI\n",
 32 |     "\n",
 33 |     "### NVIDIA Nsight Systems \n",
 34 |     "NVIDIA Nsight™ Systems tool offers system-wide performance analysis to visualize the application’s algorithms, help identify optimization opportunities, and improve the performance of applications running on a system consisting of multiple CPUs and GPUs. Nsight Systems is packed with many features. A few of the features highlighted below:\n",
 35 |     "\n",
 36 |     "- Application programming interface (API)  tracing of CUDA libraries and deep learning frameworks\n",
 37 |     "- CPU utilization, CPU thread states and thread migration as well as CPU callstack sampling\n",
 38 |     "- Operating system (OS)  runtime library calls\n",
 39 |     "- GPU activities (kernels and memory copies) as well as GPU metrics\n",
 40 |     "\n",
 41 |     "This section briefly explores other features of Nsight Systems that were not covered as part of the other labs.\n",
 42 |     "\n",
 43 |     "#### GPU Metrics Sampling\n",
 44 |     "\n",
 45 |     "Nsight Systems has a GPU Metrics feature that is used to identify performance limiters in applications using GPU for computations. It uses periodic sampling to gather performance metrics and detailed timing statistics associated with different GPU hardware units taking advantage of specialized hardware to capture this data in a single pass with minimal overhead. These metrics provide an overview of GPU efficiency over time within compute and input/output (I/O) activities:\n",
 46 |     "\n",
 47 |     "- I/O throughputs: PCIe, NVLink, and dynamic random access memory (DRAM)\n",
 48 |     "- Streaming Multiprocessor / Shared Processor (SM)  utilization: SMs activity, Tensor Core activity, instructions issued, warp occupancy (including unallocated slots)\n",
 49 |     "\n",
 50 |     "These metrics can also help users answer the common questions:\n",
 51 |     "\n",
 52 |     "- Is my GPU idle? \n",
 53 |     "- Is my instruction rate low (possibly I/O bound)?\n",
 54 |     "- Is my GPU full? Sufficient kernel grids size and streams? Are my SMs and warp slots full?\n",
 55 |     "- Can I see GPU Direct Remote Direct Memory Access (RDMA)/Storage or other transfers?\n",
 56 |     "- Am I using TensorCores?\n",
 57 |     "- Am I possibly blocked on I/O, or number of warps, etc?\n",
 58 |     "\n",
 59 |     "Nsight Sytems GPU Metrics requires NVIDIA GPUs with the Turing™ architecture or newer. Learn more about GPU metrics at https://docs.nvidia.com/nsight-systems/UserGuide/index.html#gpu-metric-sampling\n",
 60 |     "\n",
 61 |     "<img src=\"images/gpu_metrics.png\">\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "#### TRACING OF MPI, OpenSHMEM, NVSHMEM AND NCCL\n",
 65 |     "\n",
 66 |     "Nsight Systems supports MPI as well as OpenSHMEM tracing. You can record MPI communication parameters and track the MPI communicators. So, if you want to follow the data and see which MPI ranks are communicating with each other and how much data is transferred, you can see this information in the report. \n",
 67 |     "\n",
 68 |     "<img src=\"images/mpi_comm.png\">\n",
 69 |     "\n",
 70 |     "In the above screenshot you see the tooltip for an `MPI_Irecv` (bottom right) with its MPI tag, the number of bytes that have been received, the sender of the data and also the communicator. In the case of MPI, you can also trace MPI for Fortran applications. \n",
 71 |     "\n",
 72 |     "Besides MPI and OpenSHMEM, where we intercept the library calls, Nsight Systems can also trace calls into NVSHMEM and NCCL libraries based on NVIDIA Tools Extension SDK (NVTX) explained in previous labs. The result looks similar to what is shown in the above screenshot, with the function names of the respective API and the respective row labels, e.g. NCCL and NVSHMEM rows instead of MPI and UCX as in the above screenshot.\n",
 73 |     "\n",
 74 |     "In the screenshot, you see the execution timeline of a super short range of an MPI program which triggers some `MPI_Isends` and `MPI_Irecvs`. For each MPI call you get the communication parameters, and you also get the Unified Communication X (UCX) API calls, which are triggered by the MPI implementation, which in this example is OpenMPI. \n",
 75 |     "\n",
 76 |     "\n",
 77 |     "#### UCX API TRACING\n",
 78 |     "\n",
 79 |     "The UCX layer is an open-source communication framework that acts as a common library and API for several higher-level communication libraries, for example Open MPI (including its OpenSHMEM implementation) and MPICH. If UCX library trace is selected Nsight Systems will trace the subset of functions of the UC Protocol layer (UCP) that are most likely involved in performance bottlenecks. If OpenSHMEM library trace is selected Nsight Systems will trace the subset of OpenSHMEM API functions that are most likely involved in performance bottlenecks. \n",
 80 |     "\n",
 81 |     "<img src=\"images/ucx.png\">\n",
 82 |     "\n",
 83 |     "In the above screenshot, we have both `MPI_Isend` and `MPI_Irecv` calls that trigger UCP API calls (you only see the `MPI_Isend` calls, because the `MPI_Irecv` calls are super short for this particular example). The bottom row in the timeline shows the processing of transfers from non-blocking UCP communication operations. In the UCX row, you see the submit functions and in the row below you see when the processing of the transfers starts (if we were to zoom out, we could also see when the processing ends).\n",
 84 |     "\n",
 85 |     "#### NIC Performance Metrics\n",
 86 |     "NVIDIA ConnectX® smart network interface cards (smart NICs) offer advanced hardware offloads and accelerations for network operations. Viewing smart NICs metrics, on Nsight Systems timeline, enables developers to better understand their application’s network usage and use this information to optimize the application’s performance.\n",
 87 |     "\n",
 88 |     "<img src=\"images/NIC.png\">\n",
 89 |     "\n",
 90 |     "The performance counters are displayed over the Nsight Systems timeline, letting you know when the application is sending and receiving data. There are also counters that indicate network congestion like the `IB Send Wait` counter that you see in the above screenshot.\n",
 91 |     "\n",
 92 |     "### Nsight Systems Multi-Process Profiling\n",
 93 |     "\n",
 94 |     "On compute clusters where you have to use a workload manager or want to do a run over multiple nodes, the `nsys profile` command is prefixed before the application. With this, a report file is generated for each process. If you can launch your application without a workload manager on a single node (e.g. with `mpirun`), you can prefix `nsys profile` before `mpirun` and  a single report including all processes is generated.\n",
 95 |     "\n",
 96 |     "- **Single Node**: `nsys profile [nsys_args] mpirun [mpirun_args] your_executable`. The command will create one report file.\n",
 97 |     "- **Multiple Nodes**: `mpirun [mpirun_args] nsys profile [nsys_args] your_executable`, you can set the output report name with `-o report_name_%q{OMPI_COMM_WORLD_RANK}`. (For OpenMPI, PMI_RANK for MPICH and SLURM_PROCID for Slurm). The command will create one report file per MPI rank.\n",
 98 |     "\n",
 99 |     "You can also profile only specific ranks: \n",
100 |     "\n",
101 |     "```\n",
102 |     "#!/bin/bash\n",
103 |     "# OMPI_COMM_WORLD_LOCAL_RANK for node local rank\n",
104 |     "if [ $OMPI_COMM_WORLD_RANK -eq 0 ]; then\n",
105 |     "    nsys profile -t mpi \"$@\"\n",
106 |     "else\n",
107 |     "    \"$@\"\n",
108 |     "fi\n",
109 |     "```\n",
110 |     "\n",
111 |     "Below is an example command that was run on a compute facility that uses the *SLURM* workload manager using two nodes.\n",
112 |     "\n",
113 |     "```\n",
114 |     "srun [SRUN_ARGS] nsys profile -t mpi,ucx -s none --nic-metrics=true -o ./report_mpi.%q{SLURM_PROCID} -f true ./myprogram [PROGRAM_ARGS]\n",
115 |     "```\n",
116 |     "\n",
117 |     "where,\n",
118 |     "\n",
119 |     "- `nsys profile`: starts a profiling session\n",
120 |     "- `-t, --trace=...`: sets the APIs to be traced, in this example, it is UCX and MPI\n",
121 |     "- `-s,--sample=[cpu|none]`: controls CPU IP sampling\n",
122 |     "- `--nic-metrics=[true|false]`: controls Network Interface Cards (NIC) metrics collection\n",
123 |     "- `-o, --output=report#`: output profile report file path; and\n",
124 |     "- `-f --force-overwrite`: overwrite the output report file, if it already exists\n",
125 |     "\n",
126 |     "To learn more about other switches to use with `nsys profile`, you can type `nsys profile --help` on the command line or read the [documentation](https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling).\n",
127 |     "\n",
128 |     "### Nsight Compute Multi-Process Profiling\n",
129 |     "\n",
130 |     "On a single-node submission, one Nsight Compute instance can profile all launched child processes and with the data for all processes stored in one report file.\n",
131 |     "\n",
132 |     "`ncu --target-processes all -o <singlereport-name> <app> <args>`\n",
133 |     "\n",
134 |     "On multi-node submissions, one tool instance can be used per node. Make sure instances don’t write to the same report file on a shared disk. \n",
135 |     "\n",
136 |     "`ncu -o report_%q{OMPI_COMM_WORLD_RANK} <app> <args>`\n",
137 |     "\n",
138 |     "Similar to Nsight Systems, consider profiling only a single rank, for example using a wrapper script (see below example):\n",
139 |     "\n",
140 |     "```\n",
141 |     "#!/bin/bash\n",
142 |     "if [[ \"$OMPI_COMM_WORLD_RANK\" == \"3\" ]] ; then\n",
143 |     "    /sw/cluster/cuda/11.1/ nsight-compute/ncu -o report_${OMPI_COMM_WORLD_RANK} --target-processes all $*\n",
144 |     "else\n",
145 |     "    $*\n",
146 |     "fi\n",
147 |     "```"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "-----\n",
155 |     "\n",
156 |     "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[HOME](../_start_profiling.ipynb#steps)</div>\n",
157 |     "\n",
158 |     "-----"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "# Links and Resources\n",
166 |     "\n",
167 |     "\n",
168 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
169 |     "\n",
170 |     "[NVIDIA Nsight Compute](https://docs.nvidia.com/nsight-compute/index.html)\n",
171 |     "\n",
172 |     "\n",
173 |     "**NOTE**: To be able to see the Nsight System and Compute profiler outputs, please download the latest versions them from below pages:\n",
174 |     "\n",
175 |     "- https://developer.nvidia.com/nsight-systems\n",
176 |     "- https://developer.nvidia.com/nsight-compute\n",
177 |     "\n",
178 |     "\n",
179 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
180 |     "\n",
181 |     "--- \n",
182 |     "\n",
183 |     "## Licensing \n",
184 |     "\n",
185 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
186 |    ]
187 |   }
188 |  ],
189 |  "metadata": {
190 |   "anaconda-cloud": {},
191 |   "kernelspec": {
192 |    "display_name": "Python 3",
193 |    "language": "python",
194 |    "name": "python3"
195 |   },
196 |   "language_info": {
197 |    "codemirror_mode": {
198 |     "name": "ipython",
199 |     "version": 3
200 |    },
201 |    "file_extension": ".py",
202 |    "mimetype": "text/x-python",
203 |    "name": "python",
204 |    "nbconvert_exporter": "python",
205 |    "pygments_lexer": "ipython3",
206 |    "version": "3.7.4"
207 |   }
208 |  },
209 |  "nbformat": 4,
210 |  "nbformat_minor": 4
211 | }
212 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/profiling_lab1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Exercise 1 \n",
 24 |     "\n",
 25 |     "###  Learning objectives\n",
 26 |     "\n",
 27 |     "The **goal** of this lab is to:\n",
 28 |     "\n",
 29 |     "- Learn how to compile your serial application with the NVIDIA HPC compiler\n",
 30 |     "- Learn how to benchmark and profile the serial code using NVIDIA Nsight Systems \n",
 31 |     "- Learn how to identify routines responsible for the bulk of the execution time via NVIDIA Tools Extension SDK (NVTX) markers shown on the Nsight System’s timeline\n",
 32 |     "- Learn about scaling and Amdahl’s law\n",
 33 |     "\n",
 34 |     "We do not intend to cover:\n",
 35 |     "\n",
 36 |     "- The OpenACC programming model\n",
 37 |     "- Advanced optimization techniques in detail\n",
 38 |     "\n",
 39 |     "Understanding the structure of the code is very important to identify opportunities and parallelize the code.\n",
 40 |     "\n",
 41 |     "**Understand and analyze** the code present at:\n",
 42 |     " \n",
 43 |     "[Serial Code](../source_code/lab1/miniWeather_serial.cpp) \n",
 44 |     "\n",
 45 |     "[Makefile](../source_code/lab1/Makefile)\n",
 46 |     "\n",
 47 |     "Open the downloaded file for inspection.\n",
 48 |     "\n",
 49 |     "**Compile** the code with the NVIDIA HPC compiler by running `make`. You can get compiler feedback by adding the `-Minfo` flag. Some of the available options are:\n",
 50 |     "\n",
 51 |     "- `accel` – Print compiler operations related to the accelerator\n",
 52 |     "- `all` – Print all compiler output\n",
 53 |     "- `intensity` – Print loop intensity information\n",
 54 |     "\n",
 55 |     "Example usage: `-Minfo=accel`"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "# compile the C/C++ code\n",
 65 |     "!cd ../source_code/lab1 && make clean && make "
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Now, we can **profile** the serial code via Nsight Systems command line (see below example command) and download the report."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "!cd ../source_code/lab1 && nsys profile -t nvtx --stats=true --force-overwrite true -o miniWeather_1 ./miniWeather"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "For the example command above, you can download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab1/miniWeather_1.nsys-rep), then choosing <mark>Save Link As</mark>. Once done open the report   via the Nsight Systems user interface (UI). To identify which step in the CPU algorithm is slowing the GPU down, we added an annotated timeline to mark the regions and different steps of the algorithm.From the timeline view, check the NVTX markers displayed as part of threads. **Why are we using NVTX?** Please see the section on [Using NVIDIA Tools Extension SDK (NVTX)](nsight_systems.ipynb#nvtx)\n",
 89 |     "\n",
 90 |     "<img src=\"images/e1-nvtx_gui.png\">\n",
 91 |     "\n",
 92 |     "You can also review NVTX statistics from the terminal console once the profiling session has ended and see most of the execution time is spent in `perform_timestep`. \n",
 93 |     "\n",
 94 |     "<img src=\"images/e1-nvtx_terminal.png\">\n",
 95 |     "\n",
 96 |     "#### Scaling and Amdahl's law\n",
 97 |     "<a name=\"amdahls\"></a>\n",
 98 |     "To plan an incremental parallelization strategy after identifying routines responsible for the bulk of the execution time, it is important to know how the application can scale. The amount of performance an application achieves by running on a GPU depends on the extent to which it can be parallelized. Code that cannot be sufficiently parallelized should run on the host, unless doing so would result in excessive transfers between the host and the device. It is very important to understand the relation between the problem size and computational performance as this can determine the amount of speedup and benefit you would get by parallelizing on the GPU.  \n",
 99 |     "\n",
100 |     "We can **Profile** the application again and run the executable with different values for `nx_glob`, `nz_glob` , and `sim_time` (40,20,10)."
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "**Note:** You can provide input values for `nx_glob`, `nz_glob` , and `sim_time` where,\n",
108 |     "\n",
109 |     "* `nx_glob` and `nz_glob` is the number of total cells in the x and z direction; and\n",
110 |     "* `sim_time` is the simulation time in seconds\n",
111 |     "\n",
112 |     "The number of total cells in the x-direction must be twice as large as the total number of cells in the z-directions. The default values are 400, 200, and 200 seconds.\n",
113 |     "\n",
114 |     "Now, we profile the code again and open the example expected output via the Nsight Systems UI.\n",
115 |     "\n",
116 |     "From the \"Timeline view\", take a closer look at the \"NVTX\" markers from the function table on the left side of the top pane and compare it with the timeline from the previous report. You can see now that the most time-consuming part of the application is the initialization. \n",
117 |     "\n",
118 |     "<img src=\"images/e1-nvtx.png\">\n",
119 |     "\n",
120 |     "Due to the small problem size (`nx_glob`, `nz_glob` , and `sim_time` in this example), most of the computation is dominated by the initialization and there is not enough work/computation to make it suitable for GPU. \n",
121 |     "\n",
122 |     "According to *Amdahl's law*, the speedup achieved by accelerating portions of an application is limited by the code sections that are not accelerated. Before parallelizing an application, it is important to know that the overall performance improvement gained by optimizing the portion of the code is limited by the fraction of time that the improved section is used. In other words, you may speedup a portion of the code by a factor of N, but if only a small fraction of time is spent in this portion of the code, then the overall performance has not been improved substantially.\n",
123 |     "\n",
124 |     "So, in this example, changing the problem size can hide the initialization part of the code and make it a better candidate for the GPU. Now that you have determined what the most important bottleneck is, modify the application to make this problem more appropriate for the GPU."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "## Post-Lab Summary\n",
132 |     "\n",
133 |     "If you would like to download this lab for later viewing, it is recommended you go to your browser's file menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip file of the files you have been working on, and download it with the link below."
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "%%bash\n",
143 |     "cd ..\n",
144 |     "rm -f _profiler_files.zip\n",
145 |     "zip -r _profiler_files.zip *"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "**After** executing the above zip command, you should be able to download and save the zip file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../_profiler_files.zip), then choosing <mark>Save Link As</mark>."
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "-----\n",
160 |     "\n",
161 |     "# <p style=\"text-align:center; border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../_start_profiling.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling_lab2.ipynb>NEXT</a></span> </p>\n",
162 |     "\n",
163 |     "-----"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "# Links and Resources\n",
171 |     "\n",
172 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
173 |     "\n",
174 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
175 |     "\n",
176 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
177 |     "\n",
178 |     "**NOTE**: To be able to see the Nsight System profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
179 |     "\n",
180 |     "\n",
181 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
182 |     "\n",
183 |     "\n",
184 |     "--- \n",
185 |     "\n",
186 |     "## Licensing \n",
187 |     "\n",
188 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
189 |    ]
190 |   }
191 |  ],
192 |  "metadata": {
193 |   "anaconda-cloud": {},
194 |   "kernelspec": {
195 |    "display_name": "Python 3",
196 |    "language": "python",
197 |    "name": "python3"
198 |   },
199 |   "language_info": {
200 |    "codemirror_mode": {
201 |     "name": "ipython",
202 |     "version": 3
203 |    },
204 |    "file_extension": ".py",
205 |    "mimetype": "text/x-python",
206 |    "name": "python",
207 |    "nbconvert_exporter": "python",
208 |    "pygments_lexer": "ipython3",
209 |    "version": "3.7.4"
210 |   }
211 |  },
212 |  "nbformat": 4,
213 |  "nbformat_minor": 4
214 | }
215 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/profiling_lab2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Exercise 2 \n",
 24 |     "\n",
 25 |     "### Learning objectives\n",
 26 |     "\n",
 27 |     "The **goal** of this lab is to:\n",
 28 |     "- Implement OpenACC parallelism using parallel directives to parallelize the serial application\n",
 29 |     "- Learn how to compile your parallel application with the NVIDIA HPC compiler\n",
 30 |     "- Benchmark and compare the parallel version of the application with the serial version\n",
 31 |     "- Learn how to interpret NVIDIA HPC compiler feedback to ensure the applied optimization was successful\n",
 32 |     "\n",
 33 |     "We do not intend to cover:\n",
 34 |     "\n",
 35 |     "- The OpenACC programming model\n",
 36 |     "- Advanced optimization techniques in detail"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "After inspecting the profiler report from the terminal, we noticed that most of the computation is done in the `perform_timestep`. So, we ported the code to the GPU using the OpenACC programming model and added OpenACC compute directives (`#pragma acc parallel`) around the expensive routines (loops) in the code. Click on the <b>[miniWeather_openacc.cpp](../source_code/lab2/miniWeather_openacc.cpp)</b> and <b>[Makefile](../source_code/lab2/Makefile)</b> and inspect the code before running the below cells. \n",
 44 |     "\n",
 45 |     "Once done, compile the code with `make`. View the NVIDIA HPC compiler feedback (enabled by adding `-Minfo=accel` flag) and investigate the compiler feedback for the OpenACC code. The compiler feedback provides useful information about applied optimizations."
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "# compile the C/C++ code\n",
 55 |     "!cd ../source_code/lab2 && make clean && make"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "Let's inspect part of the compiler feedback and see what it's telling us (the lines in the compiler feedback might be slightly different for you).\n",
 63 |     "\n",
 64 |     "<img src=\"images/cfeedback1_.png\">\n",
 65 |     "\n",
 66 |     "- Using `-ta=tesla:managed`, instruct the compiler to build for an NVIDIA GPU using \"CUDA Managed Memory\"\n",
 67 |     "- Using `-Minfo` command-line option, we will see all output from the compiler. In this example, we use `-Minfo=accel` to only see the output corresponding to the accelerator (in this case an NVIDIA GPU).\n",
 68 |     "- Let's look at the line starting with `compute_tendencies_x`. It tells us which function the following information is in reference to.\n",
 69 |     "- The line starting with 278, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `278, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
 70 |     "- The rest of the information concerns data movement. The compiler detected the possible need to move data and handled it for us. We will get into this later in this lab.\n",
 71 |     "\n",
 72 |     "It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it.\n",
 73 |     "\n",
 74 |     "Now, let's **profile** the application for smaller values of `nx_glob`,`nz_glob`, and `sim_time`: **40, 20, 100**."
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "!cd ../source_code/lab2 && nsys profile -t nvtx,openacc --stats=true --force-overwrite true -o miniWeather_2 ./miniWeather 40 20 100"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "You can see that the changes made actually slowed down the code and it runs slower compared to the non-accelerated CPU-only version. Let's review the profiler's report. Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab2/miniWeather_2.nsys-rep), then choosing <mark>Save Link As</mark>. Once done open the report via NVIDIA Nsight™ Systems user interface (UI) locally. \n",
 91 |     "\n",
 92 |     "The timeline of the application is shown below.\n",
 93 |     "\n",
 94 |     "<img src=\"images/1_timeline_full.png\" width=90%>\n",
 95 |     "\n",
 96 |     "Hovering over the blue chart in the CUDA device row, we see that the CUDA kernel coverage on the GPU is about 80-90% throughout. This means that the GPU is idle for the remaining 10-20% of the time.\n",
 97 |     "\n",
 98 |     "<img src=\"images/1_gpu_row.png\" width=90%>\n",
 99 |     "\n",
100 |     "**Let's zoom into the timeline to see what's going on.** Press the Ctrl key while moving the mouse scroll wheel up or down to zoom into or out of the area around the mouse pointer. Another way to zoom in is to select the region you want to zoom into and press *Shift*+*Z* keys.\n",
101 |     "\n",
102 |     "<img src=\"images/1_timeline.png\" width=90%>\n",
103 |     "\n",
104 |     "Zoom into the OpenACC row. Nsight Systems is capable of capturing information about OpenACC execution in the profiled process. Under the CPU rows in the timeline tree, each thread that uses OpenACC will show OpenACC trace information. You can click on an OpenACC application programming interface (API) call to see the correlation with the underlying CUDA API calls (highlighted in teal). If the OpenACC API results in GPU work, that will also be highlighted:\n",
105 |     "\n",
106 |     "<img src=\"images/1_correlation.png\" width=90%>\n",
107 |     "\n",
108 |     "If you hover over a particular OpenACC construct, it will bring up a tooltip with details about that construct:\n",
109 |     "\n",
110 |     "<img src=\"images/1_openacc_row.png\" width=90%>\n",
111 |     "\n",
112 |     "From the \"Timeline view\" on the top pane, double click on the \"CUDA\" from the function table on the left and expand it. Zoom in on the timeline and you can see a pattern similar to the screenshot below. Clearly, there is a repeating pattern where the GPU is idle for some time followed by a burst of kernel and memory operations. The blue boxes are the compute kernels and each of these groupings of kernels is surrounded by purple and teal boxes (annotated with red color) representing data movements. **Screenshots represent profiler report for the values of 400,200,200.**\n",
113 |     "\n",
114 |     "<img src=\"images/nsys_slow.png\" width=90%>\n",
115 |     "\n",
116 |     "Let's hover your mouse over kernels (blue boxes) one by one from each row and review the provided information.\n",
117 |     "\n",
118 |     "<img src=\"images/occu-1.png\" width=90% >\n",
119 |     "\n",
120 |     "\n",
121 |     "**Note**: In the next two exercises, we start optimizing the application by improving occupancy and reducing data movements."
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "## Post-Lab Summary\n",
129 |     "\n",
130 |     "If you would like to download this lab for later viewing, it is recommended you go to your browser's file menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip file of the files you have been working on, and download it with the link below."
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "%%bash\n",
140 |     "cd ..\n",
141 |     "rm -f _profiler_files.zip\n",
142 |     "zip -r _profiler_files.zip *"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "**After** executing the above zip command, you should be able to download and save the zip file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../_profiler_files.zip), then choosing <mark>Save Link As</mark>. "
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "-----\n",
157 |     "\n",
158 |     "# <p style=\"text-align:center; border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../_start_profiling.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling_lab3.ipynb>NEXT</a></span> </p>\n",
159 |     "\n",
160 |     "-----"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "# Links and Resources\n",
168 |     "\n",
169 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
170 |     "\n",
171 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
172 |     "\n",
173 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
174 |     "\n",
175 |     "**NOTE**: To be able to see the Nsight System profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
176 |     "\n",
177 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
178 |     "\n",
179 |     "--- \n",
180 |     "\n",
181 |     "## Licensing \n",
182 |     "\n",
183 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
184 |    ]
185 |   }
186 |  ],
187 |  "metadata": {
188 |   "anaconda-cloud": {},
189 |   "kernelspec": {
190 |    "display_name": "Python 3",
191 |    "language": "python",
192 |    "name": "python3"
193 |   },
194 |   "language_info": {
195 |    "codemirror_mode": {
196 |     "name": "ipython",
197 |     "version": 3
198 |    },
199 |    "file_extension": ".py",
200 |    "mimetype": "text/x-python",
201 |    "name": "python",
202 |    "nbconvert_exporter": "python",
203 |    "pygments_lexer": "ipython3",
204 |    "version": "3.7.4"
205 |   }
206 |  },
207 |  "nbformat": 4,
208 |  "nbformat_minor": 4
209 | }
210 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/profiling_lab4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Exercise 4\n",
 24 |     "\n",
 25 |     "### Learning objectives\n",
 26 |     "\n",
 27 |     "The **goal** of this lab is to:\n",
 28 |     "\n",
 29 |     "- Learn how to identify redundant memory copies via NVIDIA Nsight™ Systems\n",
 30 |     "- Learn how to improve efficiency by reducing extra data copies via OpenACC data directives\n",
 31 |     "- Learn how to use NVIDIA HPC compiler feedback for guidance on where to insert OpenACC data directives\n",
 32 |     "- Apply data directives to the parallel application, benchmark and profile the application\n",
 33 |     "\n",
 34 |     "We do not intend to cover:\n",
 35 |     "\n",
 36 |     "- The OpenACC programming model\n",
 37 |     "- Advanced optimization techniques in detail\n",
 38 |     "\n",
 39 |     "Let's inspect the profiler report from the previous exercise. From the \"Timeline view\" on the top pane, double-click on \"CUDA\" from the function table on the left and expand it. Zoom in on the timeline and you can see a pattern similar to the screenshot below. The blue boxes are the compute kernels and each of these groupings of kernels is surrounded by purple and green boxes (annotated with a green rectangle) representing data movements. If you hover your mouse over each box, you can see more details.\n",
 40 |     "\n",
 41 |     "What this graph is showing is that there is a lot of data movement between GPU and CPU.\n",
 42 |     "    \n",
 43 |     "<img src=\"images/nsys_data_mv.png\">\n",
 44 |     "\n",
 45 |     "The compiler feedback we collected earlier tells us quite a bit about data movement too. If we look again at the compiler feedback from above, we see the following (the lines in the compiler feedback might be slightly different for you).\n",
 46 |     "\n",
 47 |     "<img src=\"images/cfeedback3-1_.png\" width=\"80%\" height=\"80%\">\n",
 48 |     "\n",
 49 |     "The compiler feedback is telling us that the compiler has inserted data movement around our parallel region at line 278 which copies the `hy_dens_cell`, `hy_dens_theta_cell`, and `state` arrays in and out of GPU memory and also copies `flux` array out. \n",
 50 |     "\n",
 51 |     "The compiler can only work with the information we provide. It knows we need the `hy_dens_cell`, `hy_dens_theta_cell`, `state`, and `flux` arrays on the GPU for the accelerated section within the  `compute_tendencies_x` function, but we didn't tell the compiler anything about what happens to the data outside of those sections. Without this knowledge, the compiler has to copy the full arrays to the GPU and back to the CPU for each accelerated section. This is a good deal of unnecessary data transfers. \n",
 52 |     "\n",
 53 |     "Ideally, we would want to move the data (example: `hy_dens_cell`, `hy_dens_theta_cell`, `state` arrays) to the GPU at the beginning, and only transfer it back to the CPU at the end (if needed). And as for the `flux` array in this example, we do not need to copy any data back and forth. So we only need to create space on the device (GPU) for this array. \n",
 54 |     "\n",
 55 |     "We need to give the compiler information about how to reduce the extra and unnecessary data movement. By adding an OpenACC `data` directive to a structured code block, the compiler will know how to manage data according to the clauses. For information on the data directive clauses, please visit [OpenACC 3.0 Specification](https://www.openacc.org/sites/default/files/inline-images/Specification/OpenACC.3.0.pdf).\n",
 56 |     "\n",
 57 |     "We added `data` directives to the code. You can inspect the code at <b>[miniWeather_openacc.cpp](../source_code/lab4/miniWeather_openacc.cpp)</b>. Once done, let's compile it and review the compiler feedback and profile again."
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "!cd ../source_code/lab4 && make clean && make"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "Start inspecting the compiler feedback and see if it applied the optimizations (the lines in the compiler feedback might be slightly different for you). Here is the screenshot of expected compiler feedback after adding the `data` directives. You can see that on line 281, the compiler is generating default present for `hy_dens_cell`, `hy_dens_theta_cell`, `state`, and `flux` arrays. In other words, it is assuming that data is present on the GPU and it only copies data to the GPU only if the data do not exist.\n",
 74 |     "\n",
 75 |     "<img src=\"images/cfeedback4_.png\" width=\"90%\" >\n",
 76 |     "\n",
 77 |     "Now, **Profile** the code with Nsight Systems command line `nsys`."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "!cd ../source_code/lab4 && nsys profile -t nvtx,openacc --stats=true --force-overwrite true -o miniWeather_4 ./miniWeather"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab4/miniWeather_4.nsys-rep), then choosing <mark>Save Link As</mark>. Once done open the report via the Nsight System user interface (UI)  locally. Now, have look at the expected output example below:\n",
 94 |     "\n",
 95 |     "<img src=\"images/nsys_fast_mv_.png\">\n",
 96 |     "\n",
 97 |     "Look at the data movements annotated with green and purple colors and compare them with the previous versions. We have accelerated the application and reduced the execution time by eliminating the unnecessary data transfers between CPU and GPU.\n",
 98 |     "\n",
 99 |     "Let's look at the NVTX ranges to see how much speedup we achieved after multiple optimizations.\n",
100 |     "\n",
101 |     "|   | Serial | Parallel (lab2) | Parallel (lab3) | Parallel (lab4) |\n",
102 |     "| --- | ----------- |----------- |----------- |----------- |\n",
103 |     "| Total | 27.66 s |157.47 s | 7.14 s |1.27 s |\n",
104 |     "| While | 27.66 s  |157.09 s | 6.77 s |926.69 ms |\n",
105 |     "| perform_timestep| 22.72 s |131.09 ms |  4.89 ms |0.78399 ms | \n",
106 |     "\n",
107 |     "**Note**: The next exercise gives an overview of the introduction to NVIDIA Nsight Compute tool and is optional."
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "## Post-Lab Summary\n",
115 |     "\n",
116 |     "If you would like to download this lab for later viewing, it is recommended you go to your browser's file menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip file of the files you have been working on, and download it with the link below."
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "%%bash\n",
126 |     "cd ..\n",
127 |     "rm -f _profiler_files.zip\n",
128 |     "zip -r _profiler_files.zip *"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "**After** executing the above zip command, you should be able to download and save the zip file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../_profiler_files.zip), then choosing <mark>Save Link As</mark>. "
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "markdown",
140 |    "metadata": {},
141 |    "source": [
142 |     "-----\n",
143 |     "\n",
144 |     "# <p style=\"text-align:center; border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../_start_profiling.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling_lab5.ipynb>NEXT</a></span> </p>\n",
145 |     "\n",
146 |     "-----"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "# Links and Resources\n",
154 |     "\n",
155 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
156 |     "\n",
157 |     "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
158 |     "\n",
159 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
160 |     "\n",
161 |     "**NOTE**: To be able to see the Nsight System profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
162 |     "\n",
163 |     "\n",
164 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
165 |     "\n",
166 |     "\n",
167 |     "--- \n",
168 |     "\n",
169 |     "## Licensing \n",
170 |     "\n",
171 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
172 |    ]
173 |   }
174 |  ],
175 |  "metadata": {
176 |   "anaconda-cloud": {},
177 |   "kernelspec": {
178 |    "display_name": "Python 3",
179 |    "language": "python",
180 |    "name": "python3"
181 |   },
182 |   "language_info": {
183 |    "codemirror_mode": {
184 |     "name": "ipython",
185 |     "version": 3
186 |    },
187 |    "file_extension": ".py",
188 |    "mimetype": "text/x-python",
189 |    "name": "python",
190 |    "nbconvert_exporter": "python",
191 |    "pygments_lexer": "ipython3",
192 |    "version": "3.7.4"
193 |   }
194 |  },
195 |  "nbformat": 4,
196 |  "nbformat_minor": 4
197 | }
198 | 


--------------------------------------------------------------------------------
/_profiler/jupyter_notebook/profiling_lab5.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Before we begin, let us execute the below cell to display information about the NVIDIA® CUDA® driver and the GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl+Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "!nvidia-smi"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Exercise 5\n",
 24 |     "\n",
 25 |     "###  Learning objectives\n",
 26 |     "\n",
 27 |     "The **goal** of this lab is to:\n",
 28 |     "\n",
 29 |     "- Learn how to inspect the application's kernels with NVIDIA Nsight™ Compute\n",
 30 |     "- Learn how to execute rules inside the Nsight Computer profiler and find bottlenecks\n",
 31 |     "- Learn how to add baselines and compare results/reports\n",
 32 |     "\n",
 33 |     "We do not intend to cover:\n",
 34 |     "\n",
 35 |     "- The OpenACC programming model\n",
 36 |     "- Advanced optimization techniques in detail\n",
 37 |     "\n",
 38 |     "As mentioned earlier on, Nsight Compute and Nsight Systems each serve a different purpose in profiling and with different functionalities. In previous exercises, we inspected the timelines, measured activity durations, and tracked CPU events via the Nsight Systems profiler. The purpose of this exercise is to get familiar with the Nsight Compute tool. This tool provides access to kernel-level analysis using GPU performance metrics.\n",
 39 |     "\n",
 40 |     "We first profile the GPU application and identify certain areas in the code, that don't behave as expected. Then we isolate those kernels and profile them via Nsight Compute. \n",
 41 |     "\n",
 42 |     "**Understand and analyze** the code present at:\n",
 43 |     "\n",
 44 |     "[OpenACC Code](../source_code/lab5/miniWeather_openacc.cpp) \n",
 45 |     "\n",
 46 |     "Open the downloaded file for inspection. Once done, **Compile** the code with `make` and **Profile** it with `nsys`."
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "!cd ../source_code/lab5 && make clean && make"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "Now, **Profile** the code with Nsight System command line interface (CLI):"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "!cd ../source_code/lab5 && nsys profile -t nvtx,openacc --stats=true --force-overwrite true -o miniWeather_5 ./miniWeather"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab5/miniWeather_5.nsys-rep), then choosing <mark>Save Link As</mark>. Once done open the report via the Nsight System user interface(UI) locally. As shown in the example output, the initialization looks very expensive and the kernels are very small meaning that the GPU compute part of the problem is very small. Check how much time (what percentage) is spent in each kernel relative to the time it takes to run the code. \n",
 79 |     "\n",
 80 |     "<img src=\"images/5_init.png\" width=\"80%\">\n",
 81 |     "\n",
 82 |     "From the \"Timeline\" view, inspect the less efficient kernel. Next, inspect the most expensive kernel and see what the Nsight Compute recommends.\n",
 83 |     "\n",
 84 |     "<img src=\"images/cexer5.png\" width=\"80%\">"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "Now, **Profile** the application via Nsight Compute CLI (`ncu`): "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "!cd ../source_code/lab5 && ncu --set full -k regex:compute_tendencies_x --launch-skip 10 --launch-count 1 -f -o miniWeather1 ./miniWeather"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab5/miniWeather1.ncu-rep), then choosing <mark>Save Link As</mark>. Once done open the report via the Nsight Compute UI. This tool has many sections that focus on different areas of the GPU and presents them all on one page. \n",
108 |     "\n",
109 |     "**Note:** If you do not specify a specific kernel name when profiling, all kernels will be profiled and will slow down the profiling time.\n",
110 |     "\n",
111 |     "<img src=\"images/ccompute.png\">\n",
112 |     "\n",
113 |     "The \"GPU Speed Of Light Throughput\" section shows less than 1% Compute (SM) Throughput. As you can see from the example output below, the Nsight Compute profiler suggests looking at the \"Launch Statistics\" section because the kernel grid is too small to fill the available resources on the GPU. \n",
114 |     "\n",
115 |     "We previously discussed Amdahl's law in the first exercise. It is very important to understand the relation between the problem size and computational performance as this can determine the amount of speedup and benefit you get by parallelizing on GPU. Due to the small problem size (`nx_glob`, `nz_glob` , and `sim_time` in this example), most of the computation is dominated by the initialization and there is not enough work/computation to make it suitable for GPU. Run the application with different values `nx_glob`, `nz_glob` , and `sim_time` and profile the same kernel (`nx_glob` = 400 , `nz_glob`= 200 , and `sim_time`= 100). "
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "!cd ../source_code/lab5 && make clean && make"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "!cd ../source_code/lab5 && ncu --set full -k regex:compute_tendencies_x --launch-skip 100 --launch-count 1 -f -o miniWeather2 ./miniWeather 400 200 100"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "Download and save the report file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../source_code/lab5/miniWeather2.ncu-rep), then choosing <mark>Save Link As</mark>. Once done open the report via the Nsight Compute UI. \n",
141 |     "\n",
142 |     "**Diff the reports**\n",
143 |     "\n",
144 |     "Open both reports via the Nsight Compute UI. From the top of the first report, click on *Add Baseline*, then do the same for the second report shown in the second tab. Have a look at the expected output:\n",
145 |     "\n",
146 |     "<img src=\"images/c2compute.png\">\n",
147 |     "\n",
148 |     "You can now compare the two reports and see how changes you made to the cell size, affected specific metrics by looking at each section. By increasing the cell size, we increased the \"Compute Throughput\" and \"Memory Throughput\" by 35% and 26% respectively. \n",
149 |     "\n",
150 |     "\n",
151 |     "Next, look at the Roofline chart which shows a high-level overview of the utilization for compute and memory resources of the GPU. We can see that increasing cell sizes resulted in the performance getting  closer to the Rooflines. \n",
152 |     "\n",
153 |     "<img src=\"images/roofline.png\">\n",
154 |     "\n",
155 |     "However, the kernel is still too small to utilize the GPU and \"Compute\" and \"Memory\" are still less than 50% utilized. The \"GPU Speed of Light Throughput\" section, gives a high-level overview of the throughput for compute and memory resources of the GPU for each unit. Based on this information we can find the performance limiters and categorize them into four possible combinations:\n",
156 |     "\n",
157 |     "- Compute Bound: SM>50% & Mem<50%\n",
158 |     "- Bandwidth Bound: SM<50% & Mem>50%\n",
159 |     "- Latency Bound: SM<50% & Mem<50%\n",
160 |     "- Compute and Bandwidth Bound : SM>50% & Mem>50%\n",
161 |     "\n",
162 |     "\n",
163 |     "According to the Roofline, this kernel is fp64 bound, and we should consider using 32-bit precision floating point operations to improve its performance. \n",
164 |     "\n",
165 |     "The detailed \"Memory Workload Analysis\" section shows all the data traffic between various stages of the GPU and what your kernel is actually transferring. This section suggests that we need to look at the memory access pattern in the code as the load/store pattern is not optimal. The solution is to minimize how many cache lines need to be accessed per memory  request. \n",
166 |     "\n",
167 |     "\n",
168 |     "This is out of scope for this tutorial but  you can have a look at the algorithm and see if you can change anything to do more work per memory access."
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "## Post-Lab Summary\n",
176 |     "\n",
177 |     "If you would like to download this lab for later viewing, it is recommended you go to your browser's file menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip file of the files you have been working on, and download it with the link below."
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "%%bash\n",
187 |     "cd ..\n",
188 |     "rm -f _profiler_files.zip\n",
189 |     "zip -r _profiler_files.zip *"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "**After** executing the above zip command, you should be able to download and save the zip file by holding down <mark>Shift</mark> and <mark>right-clicking</mark> [Here](../_profiler_files.zip), then choosing <mark>Save Link As</mark>. "
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "-----\n",
204 |     "\n",
205 |     "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[HOME](../_start_profiling.ipynb)</div>\n",
206 |     "\n",
207 |     "-----"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "# Links and Resources\n",
215 |     "\n",
216 |     "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
217 |     "\n",
218 |     "[NVIDIA Nsight Compute](https://docs.nvidia.com/nsight-compute/index.html)\n",
219 |     "\n",
220 |     "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
221 |     "\n",
222 |     "**NOTE**: To be able to see the Nsight System and Compute profiler outputs, please download the latest versions from below pages:\n",
223 |     "\n",
224 |     "- https://developer.nvidia.com/nsight-systems\n",
225 |     "- https://developer.nvidia.com/nsight-compute\n",
226 |     "\n",
227 |     "\n",
228 |     "Don't forget to check out additional [Open Hackathons Resources](https://www.openhackathons.org/s/technical-resources) and join our [OpenACC and Hackathons Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
229 |     "\n",
230 |     "--- \n",
231 |     "\n",
232 |     "## Licensing \n",
233 |     "\n",
234 |     "Copyright © 2022 OpenACC-Standard.org.  This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0). These materials may include references to hardware and software developed by other entities; all applicable licensing and copyrights apply."
235 |    ]
236 |   }
237 |  ],
238 |  "metadata": {
239 |   "anaconda-cloud": {},
240 |   "kernelspec": {
241 |    "display_name": "Python 3",
242 |    "language": "python",
243 |    "name": "python3"
244 |   },
245 |   "language_info": {
246 |    "codemirror_mode": {
247 |     "name": "ipython",
248 |     "version": 3
249 |    },
250 |    "file_extension": ".py",
251 |    "mimetype": "text/x-python",
252 |    "name": "python",
253 |    "nbconvert_exporter": "python",
254 |    "pygments_lexer": "ipython3",
255 |    "version": "3.7.4"
256 |   }
257 |  },
258 |  "nbformat": 4,
259 |  "nbformat_minor": 4
260 | }
261 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab1/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvc++
 4 | CFLAGS := -O3 -w -ldl
 5 | ACCFLAGS := -Minfo=accel
 6 | NVTXLIB := -I/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.0/include
 7 | 
 8 | FC := nvfortran
 9 | FFLAGS := -fast
10 | LDFLAGS := -lnvhpcwrapnvtx 
11 |     
12 | miniWeather_c: miniWeather_serial.cpp
13 | 	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_serial.cpp ${NVTXLIB} 
14 | 
15 | miniWeather_f: miniWeather_serial.f90
16 | 	$(FC) $(FFLAGS) $(ACCFLAGS) miniWeather_serial.f90 -o miniWeather $(LDFLAGS)
17 |     
18 | clean:
19 | 	rm -f *.o miniWeather *.nsys-rep *.sqlite *.ncu-rep
20 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab2/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvc++
 4 | CFLAGS := -O3 -w
 5 | ACCFLAGS := -ta=tesla:managed -Minfo=accel
 6 | 
 7 | FC := nvfortran
 8 | FFLAGS := -fast
 9 | LDFLAGS := -lnvhpcwrapnvtx
10 |     
11 | miniWeather_c: miniWeather_openacc.cpp
12 | 	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
13 | 
14 | miniWeather_f: miniWeather_openacc.f90
15 | 	$(FC) $(FFLAGS) $(ACCFLAGS) miniWeather_openacc.f90 -o miniWeather $(LDFLAGS)
16 |     
17 | clean:
18 | 	rm -f *.o miniWeather *.nsys-rep *.sqlite *.ncu-rep
19 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab3/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvc++
 4 | CFLAGS := -O3 -w
 5 | ACCFLAGS := -ta=tesla:managed -Minfo=accel
 6 | 
 7 | FC := nvfortran
 8 | FFLAGS := -fast
 9 | LDFLAGS := -lnvhpcwrapnvtx
10 |     
11 | miniWeather_c: miniWeather_openacc.cpp
12 | 	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
13 | 
14 | miniWeather_f: miniWeather_openacc.f90
15 | 	$(FC) $(FFLAGS) $(ACCFLAGS) miniWeather_openacc.f90 -o miniWeather $(LDFLAGS)
16 |     
17 | clean:
18 | 	rm -f *.o miniWeather *.nsys-rep *.sqlite *.ncu-rep
19 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab4/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvc++
 4 | CFLAGS := -O3 -w
 5 | ACCFLAGS := -ta=tesla -Minfo=accel
 6 | 
 7 | FC := nvfortran
 8 | FFLAGS := -fast
 9 | LDFLAGS := -lnvhpcwrapnvtx
10 | 
11 | miniWeather_c: miniWeather_openacc.cpp
12 | 	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
13 | 
14 | miniWeather_f: miniWeather_openacc.f90
15 | 	$(FC) $(FFLAGS) $(ACCFLAGS) miniWeather_openacc.f90 -o miniWeather $(LDFLAGS)
16 |     
17 | clean:
18 | 	rm -f *.o miniWeather *.nsys-rep *.sqlite *.ncu-rep
19 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab5/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvc++
 4 | CFLAGS := -O3 -w
 5 | ACCFLAGS := -ta=tesla:managed -Minfo=accel
 6 | 
 7 | FC := nvfortran
 8 | FFLAGS := -fast
 9 | LDFLAGS := -lnvhpcwrapnvtx
10 | 
11 | miniWeather_c: miniWeather_openacc.cpp
12 | 	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
13 | 
14 | miniWeather_f: miniWeather_openacc.f90
15 | 	$(FC) $(FFLAGS) $(ACCFLAGS)   miniWeather_openacc.f90 -o miniWeather  $(LDFLAGS)
16 |     
17 | clean:
18 | 	rm -f *.o miniWeather *.nsys-rep *.sqlite *.ncu-rep
19 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved. 
 2 | 
 3 | CC := nvcc
 4 | CFLAGS := -x cu -lnvToolsExt -lineinfo
 5 | ACCFLAGS := -Minfo=accel
 6 | NVTXLIB := -I/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/cuda/11.0/include
 7 | 
 8 | FC := nvfortran
 9 | FFLAGS := -fast
10 | LDFLAGS := -lnvhpcwrapnvtx 
11 |     
12 | jacobi: jacobi.cpp
13 | 	${CC} -o jacobi jacobi.cpp  
14 | 
15 | jacobi_step1: jacobi_step1.cpp
16 | 	${CC} -o jacobi_step1 ${CFLAGS} jacobi_step1.cpp
17 |     
18 | jacobi_step2: jacobi_step2.cpp
19 | 	${CC} -o jacobi_step2 ${CFLAGS} jacobi_step2.cpp
20 | 
21 | jacobi_step3: jacobi_step3.cpp
22 | 	${CC} -o jacobi_step3 ${CFLAGS} jacobi_step3.cpp
23 |     
24 | jacobi_step4: jacobi_step4.cpp
25 | 	${CC} -o jacobi_step4 ${CFLAGS} jacobi_step4.cpp
26 | 
27 | jacobi_step5: jacobi_step5.cpp
28 | 	${CC} -o jacobi_step5 ${CFLAGS} jacobi_step5.cpp
29 |     
30 | jacobi_step6: jacobi_step6.cpp
31 | 	${CC} -o jacobi_step6 ${CFLAGS} jacobi_step6.cpp
32 | 
33 | jacobi_step7: jacobi_step7.cpp
34 | 	${CC} -o jacobi_step7 ${CFLAGS} jacobi_step7.cpp
35 |     
36 | jacobi_step8: jacobi_step8.cpp
37 | 	${CC} -o jacobi_step8 ${CFLAGS} jacobi_step8.cpp
38 |     
39 | clean:
40 | 	rm -f *.o jacobi_step1 jacobi_step2 jacobi_step3 jacobi_step4 jacobi_step5 jacobi_step6 jacobi_step7 jacobi_step8 jacobi *.nsys-rep *.sqlite *.ncu-rep
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <ctime>
  6 | #include <cmath>
  7 | #include <limits>
  8 | 
  9 | #define N 64
 10 | 
 11 | #define IDX(i, j) ((j) + (i) * N)
 12 | 
 13 | void allocate_memory (float** f, float** f_old, float** error) {
 14 |     *f = (float*) malloc(N * N * sizeof(float));
 15 |     *f_old = (float*) malloc(N * N * sizeof(float));
 16 |     *error = (float*) malloc(sizeof(float));
 17 | }
 18 | 
 19 | void free_memory (float* f, float* f_old, float* error) {
 20 |     free(f);
 21 |     free(f_old);
 22 |     free(error);
 23 | }
 24 | 
 25 | void initialize_data (float* f) {
 26 |     // Set up simple sinusoidal boundary conditions
 27 |     for (int j = 0; j < N; ++j) {
 28 |         for (int i = 0; i < N; ++i) {
 29 | 
 30 |             if (i == 0 || i == N-1) {
 31 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 32 |             }
 33 |             else if (j == 0 || j == N-1) {
 34 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 35 |             }
 36 |             else {
 37 |                 f[IDX(i,j)] = 0.0f;
 38 |             }
 39 | 
 40 |         }
 41 |     }
 42 | }
 43 | 
 44 | void jacobi_step (float* f, float* f_old, float* error) {
 45 |     for (int j = 1; j <= N-2; ++j) {
 46 |         for (int i = 1; i <= N-2; ++i) {
 47 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 48 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 49 | 
 50 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 51 |             *error += df * df;
 52 |         }
 53 |     }
 54 | }
 55 | 
 56 | void swap_data (float* f, float* f_old) {
 57 |     for (int j = 1; j <= N-2; ++j) {
 58 |         for (int i = 1; i <= N-2; ++i) {
 59 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 60 |         }
 61 |     }
 62 | }
 63 | 
 64 | int main () {
 65 |     // Begin wall timing
 66 |     std::clock_t start_time = std::clock();
 67 | 
 68 |     float* f;
 69 |     float* f_old;
 70 |     float* error;
 71 | 
 72 |     // Reserve space for the scalar field and the "old" copy of the data
 73 |     allocate_memory(&f, &f_old, &error);
 74 | 
 75 |     // Initialize data (we'll do this on both f and f_old, so that we don't
 76 |     // have to worry about the boundary points later)
 77 |     initialize_data(f);
 78 |     initialize_data(f_old);
 79 | 
 80 |     // Initialize error to a large number
 81 |     *error = std::numeric_limits<float>::max();
 82 |     const float tolerance = 1.e-4f;
 83 | 
 84 |     // Iterate until we're converged (but set a cap on the maximum number of
 85 |     // iterations to avoid any possible hangs)
 86 |     const int max_iters = 1000;
 87 |     int num_iters = 0;
 88 | 
 89 |     while (*error > tolerance && num_iters < max_iters) {
 90 |         // Initialize error to zero (we'll add to it the following step)
 91 |         *error = 0.0f;
 92 | 
 93 |         // Perform a Jacobi relaxation step
 94 |         jacobi_step(f, f_old, error);
 95 | 
 96 |         // Swap the old data and the new data
 97 |         // We're doing this explicitly for pedagogical purposes, even though
 98 |         // in this specific application a std::swap would have been OK
 99 |         swap_data(f, f_old);
100 | 
101 |         // Normalize the L2-norm of the error by the number of data points
102 |         // and then take the square root
103 |         *error = std::sqrt(*error / (N * N));
104 | 
105 |         // Periodically print out the current error
106 |         if (num_iters % 25 == 0) {
107 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
108 |         }
109 | 
110 |         // Increment the iteration count
111 |         ++num_iters;
112 |     }
113 | 
114 |     // If we took fewer than max_iters steps and the error is below the tolerance,
115 |     // we succeeded. Otherwise, we failed.
116 | 
117 |     if (*error <= tolerance && num_iters < max_iters) {
118 |         std::cout << "Success!" << std::endl;
119 |     }
120 |     else {
121 |         std::cout << "Failure!" << std::endl;
122 |         return -1;
123 |     }
124 | 
125 |     // Clean up memory allocations
126 |     free_memory(f, f_old, error);
127 | 
128 |     // End wall timing
129 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
130 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
131 | 
132 |     return 0;
133 | }
134 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step1.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 64
 11 | 
 12 | #define IDX(i, j) ((j) + (i) * N)
 13 | 
 14 | void allocate_memory (float** f, float** f_old, float** error) {
 15 |     *f = (float*) malloc(N * N * sizeof(float));
 16 |     *f_old = (float*) malloc(N * N * sizeof(float));
 17 |     *error = (float*) malloc(sizeof(float));
 18 | }
 19 | 
 20 | void free_memory (float* f, float* f_old, float* error) {
 21 |     free(f);
 22 |     free(f_old);
 23 |     free(error);
 24 | }
 25 | 
 26 | void initialize_data (float* f) {
 27 |     // Set up simple sinusoidal boundary conditions
 28 |     for (int j = 0; j < N; ++j) {
 29 |         for (int i = 0; i < N; ++i) {
 30 | 
 31 |             if (i == 0 || i == N-1) {
 32 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 33 |             }
 34 |             else if (j == 0 || j == N-1) {
 35 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 36 |             }
 37 |             else {
 38 |                 f[IDX(i,j)] = 0.0f;
 39 |             }
 40 | 
 41 |         }
 42 |     }
 43 | }
 44 | 
 45 | void jacobi_step (float* f, float* f_old, float* error) {
 46 |     for (int j = 1; j <= N-2; ++j) {
 47 |         for (int i = 1; i <= N-2; ++i) {
 48 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 49 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 50 | 
 51 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 52 |             *error += df * df;
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | void swap_data (float* f, float* f_old) {
 58 |     for (int j = 1; j <= N-2; ++j) {
 59 |         for (int i = 1; i <= N-2; ++i) {
 60 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | int main () {
 66 |     // Begin wall timing
 67 |     std::clock_t start_time = std::clock();
 68 | 
 69 |     float* f;
 70 |     float* f_old;
 71 |     float* error;
 72 | 
 73 |     // Reserve space for the scalar field and the "old" copy of the data
 74 |     nvtxRangePush("Allocate memory");
 75 |     allocate_memory(&f, &f_old, &error);
 76 |     nvtxRangePop();
 77 | 
 78 |     // Initialize data (we'll do this on both f and f_old, so that we don't
 79 |     // have to worry about the boundary points later)
 80 |     nvtxRangePush("Initialize data");
 81 |     initialize_data(f);
 82 |     initialize_data(f_old);
 83 |     nvtxRangePop();
 84 | 
 85 |     // Initialize error to a large number
 86 |     *error = std::numeric_limits<float>::max();
 87 |     const float tolerance = 1.e-4f;
 88 | 
 89 |     // Iterate until we're converged (but set a cap on the maximum number of
 90 |     // iterations to avoid any possible hangs)
 91 |     const int max_iters = 1000;
 92 |     int num_iters = 0;
 93 | 
 94 |     while (*error > tolerance && num_iters < max_iters) {
 95 |         // Initialize error to zero (we'll add to it the following step)
 96 |         *error = 0.0f;
 97 | 
 98 |         // Perform a Jacobi relaxation step
 99 |         nvtxRangePush("Jacobi step");
100 |         jacobi_step(f, f_old, error);
101 |         nvtxRangePop();
102 | 
103 |         // Swap the old data and the new data
104 |         // We're doing this explicitly for pedagogical purposes, even though
105 |         // in this specific application a std::swap would have been OK
106 |         nvtxRangePush("Swap data");
107 |         swap_data(f, f_old);
108 |         nvtxRangePop();
109 | 
110 |         // Normalize the L2-norm of the error by the number of data points
111 |         // and then take the square root
112 |         *error = std::sqrt(*error / (N * N));
113 | 
114 |         // Periodically print out the current error
115 |         if (num_iters % 25 == 0) {
116 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
117 |         }
118 | 
119 |         // Increment the iteration count
120 |         ++num_iters;
121 |     }
122 | 
123 |     // If we took fewer than max_iters steps and the error is below the tolerance,
124 |     // we succeeded. Otherwise, we failed.
125 | 
126 |     if (*error <= tolerance && num_iters < max_iters) {
127 |         std::cout << "Success!" << std::endl;
128 |     }
129 |     else {
130 |         std::cout << "Failure!" << std::endl;
131 |         return -1;
132 |     }
133 | 
134 |     // Clean up memory allocations
135 |     nvtxRangePush("Free memory");
136 |     free_memory(f, f_old, error);
137 |     nvtxRangePop();
138 | 
139 |     // End wall timing
140 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
141 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
142 | 
143 |     return 0;
144 | }
145 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step2.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 64
 11 | 
 12 | #define IDX(i, j) ((j) + (i) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | void jacobi_step (float* f, float* f_old, float* error) {
 61 |     for (int j = 1; j <= N-2; ++j) {
 62 |         for (int i = 1; i <= N-2; ++i) {
 63 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 64 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 65 | 
 66 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 67 |             *error += df * df;
 68 |         }
 69 |     }
 70 | }
 71 | 
 72 | void swap_data (float* f, float* f_old) {
 73 |     for (int j = 1; j <= N-2; ++j) {
 74 |         for (int i = 1; i <= N-2; ++i) {
 75 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 76 |         }
 77 |     }
 78 | }
 79 | 
 80 | int main () {
 81 |     // Begin wall timing
 82 |     std::clock_t start_time = std::clock();
 83 | 
 84 |     float* f;
 85 |     float* f_old;
 86 |     float* error;
 87 | 
 88 |     // Reserve space for the scalar field and the "old" copy of the data
 89 |     nvtxRangePush("Allocate memory");
 90 |     allocate_memory(&f, &f_old, &error);
 91 |     nvtxRangePop();
 92 | 
 93 |     // Initialize data (we'll do this on both f and f_old, so that we don't
 94 |     // have to worry about the boundary points later)
 95 |     nvtxRangePush("Initialize data");
 96 |     initialize_data(f);
 97 |     initialize_data(f_old);
 98 |     nvtxRangePop();
 99 | 
100 |     // Initialize error to a large number
101 |     *error = std::numeric_limits<float>::max();
102 |     const float tolerance = 1.e-4f;
103 | 
104 |     // Iterate until we're converged (but set a cap on the maximum number of
105 |     // iterations to avoid any possible hangs)
106 |     const int max_iters = 1000;
107 |     int num_iters = 0;
108 | 
109 |     while (*error > tolerance && num_iters < max_iters) {
110 |         // Initialize error to zero (we'll add to it the following step)
111 |         *error = 0.0f;
112 | 
113 |         // Perform a Jacobi relaxation step
114 |         nvtxRangePush("Jacobi step");
115 |         jacobi_step(f, f_old, error);
116 |         nvtxRangePop();
117 | 
118 |         // Swap the old data and the new data
119 |         // We're doing this explicitly for pedagogical purposes, even though
120 |         // in this specific application a std::swap would have been OK
121 |         nvtxRangePush("Swap data");
122 |         swap_data(f, f_old);
123 |         nvtxRangePop();
124 | 
125 |         // Normalize the L2-norm of the error by the number of data points
126 |         // and then take the square root
127 |         *error = std::sqrt(*error / (N * N));
128 | 
129 |         // Periodically print out the current error
130 |         if (num_iters % 25 == 0) {
131 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
132 |         }
133 | 
134 |         // Increment the iteration count
135 |         ++num_iters;
136 |     }
137 | 
138 |     // If we took fewer than max_iters steps and the error is below the tolerance,
139 |     // we succeeded. Otherwise, we failed.
140 | 
141 |     if (*error <= tolerance && num_iters < max_iters) {
142 |         std::cout << "Success!" << std::endl;
143 |     }
144 |     else {
145 |         std::cout << "Failure!" << std::endl;
146 |         return -1;
147 |     }
148 | 
149 |     // Clean up memory allocations
150 |     nvtxRangePush("Free memory");
151 |     free_memory(f, f_old, error);
152 |     nvtxRangePop();
153 | 
154 |     // End wall timing
155 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
156 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
157 | 
158 |     return 0;
159 | }
160 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step3.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((j) + (i) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | void jacobi_step (float* f, float* f_old, float* error) {
 61 |     for (int j = 1; j <= N-2; ++j) {
 62 |         for (int i = 1; i <= N-2; ++i) {
 63 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 64 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 65 | 
 66 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 67 |             *error += df * df;
 68 |         }
 69 |     }
 70 | }
 71 | 
 72 | void swap_data (float* f, float* f_old) {
 73 |     for (int j = 1; j <= N-2; ++j) {
 74 |         for (int i = 1; i <= N-2; ++i) {
 75 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 76 |         }
 77 |     }
 78 | }
 79 | 
 80 | int main () {
 81 |     // Begin wall timing
 82 |     std::clock_t start_time = std::clock();
 83 | 
 84 |     float* f;
 85 |     float* f_old;
 86 |     float* error;
 87 | 
 88 |     // Reserve space for the scalar field and the "old" copy of the data
 89 |     nvtxRangePush("Allocate memory");
 90 |     allocate_memory(&f, &f_old, &error);
 91 |     nvtxRangePop();
 92 | 
 93 |     // Initialize data (we'll do this on both f and f_old, so that we don't
 94 |     // have to worry about the boundary points later)
 95 |     nvtxRangePush("Initialize data");
 96 |     initialize_data(f);
 97 |     initialize_data(f_old);
 98 |     nvtxRangePop();
 99 | 
100 |     // Initialize error to a large number
101 |     *error = std::numeric_limits<float>::max();
102 |     const float tolerance = 1.e-4f;
103 | 
104 |     // Iterate until we're converged (but set a cap on the maximum number of
105 |     // iterations to avoid any possible hangs)
106 |     const int max_iters = 1000;
107 |     int num_iters = 0;
108 | 
109 |     while (*error > tolerance && num_iters < max_iters) {
110 |         // Initialize error to zero (we'll add to it the following step)
111 |         *error = 0.0f;
112 | 
113 |         // Perform a Jacobi relaxation step
114 |         nvtxRangePush("Jacobi step");
115 |         jacobi_step(f, f_old, error);
116 |         nvtxRangePop();
117 | 
118 |         // Swap the old data and the new data
119 |         // We're doing this explicitly for pedagogical purposes, even though
120 |         // in this specific application a std::swap would have been OK
121 |         nvtxRangePush("Swap data");
122 |         swap_data(f, f_old);
123 |         nvtxRangePop();
124 | 
125 |         // Normalize the L2-norm of the error by the number of data points
126 |         // and then take the square root
127 |         *error = std::sqrt(*error / (N * N));
128 | 
129 |         // Periodically print out the current error
130 |         if (num_iters % 25 == 0) {
131 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
132 |         }
133 | 
134 |         // Increment the iteration count
135 |         ++num_iters;
136 |     }
137 | 
138 |     // If we took fewer than max_iters steps and the error is below the tolerance,
139 |     // we succeeded. Otherwise, we failed.
140 | 
141 |     if (*error <= tolerance && num_iters < max_iters) {
142 |         std::cout << "Success!" << std::endl;
143 |     }
144 |     else {
145 |         std::cout << "Failure!" << std::endl;
146 |         return -1;
147 |     }
148 | 
149 |     // Clean up memory allocations
150 |     nvtxRangePush("Free memory");
151 |     free_memory(f, f_old, error);
152 |     nvtxRangePop();
153 | 
154 |     // End wall timing
155 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
156 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
157 | 
158 |     return 0;
159 | }
160 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step4.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((j) + (i) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | __global__ void jacobi_step (float* f, float* f_old, float* error) {
 61 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 62 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 63 | 
 64 |     if (j >= 1 && j <= N-2) {
 65 |         if (i >= 1 && i <= N-2) {
 66 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 67 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 68 | 
 69 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 70 |             atomicAdd(error, df * df);
 71 |         }
 72 |     }
 73 | }
 74 | 
 75 | void swap_data (float* f, float* f_old) {
 76 |     for (int j = 1; j <= N-2; ++j) {
 77 |         for (int i = 1; i <= N-2; ++i) {
 78 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 79 |         }
 80 |     }
 81 | }
 82 | 
 83 | int main () {
 84 |     // Begin wall timing
 85 |     std::clock_t start_time = std::clock();
 86 | 
 87 |     float* f;
 88 |     float* f_old;
 89 |     float* error;
 90 | 
 91 |     // Reserve space for the scalar field and the "old" copy of the data
 92 |     nvtxRangePush("Allocate memory");
 93 |     allocate_memory(&f, &f_old, &error);
 94 |     nvtxRangePop();
 95 | 
 96 |     // Initialize data (we'll do this on both f and f_old, so that we don't
 97 |     // have to worry about the boundary points later)
 98 |     nvtxRangePush("Initialize data");
 99 |     initialize_data(f);
100 |     initialize_data(f_old);
101 |     nvtxRangePop();
102 | 
103 |     // Initialize error to a large number
104 |     *error = std::numeric_limits<float>::max();
105 |     const float tolerance = 1.e-4f;
106 | 
107 |     // Iterate until we're converged (but set a cap on the maximum number of
108 |     // iterations to avoid any possible hangs)
109 |     const int max_iters = 1000;
110 |     int num_iters = 0;
111 | 
112 |     while (*error > tolerance && num_iters < max_iters) {
113 |         // Initialize error to zero (we'll add to it the following step)
114 |         *error = 0.0f;
115 | 
116 |         // Perform a Jacobi relaxation step
117 |         nvtxRangePush("Jacobi step");
118 |         jacobi_step<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old, error);
119 |         cudaDeviceSynchronize();
120 |         nvtxRangePop();
121 | 
122 |         // Swap the old data and the new data
123 |         // We're doing this explicitly for pedagogical purposes, even though
124 |         // in this specific application a std::swap would have been OK
125 |         nvtxRangePush("Swap data");
126 |         swap_data(f, f_old);
127 |         nvtxRangePop();
128 | 
129 |         // Normalize the L2-norm of the error by the number of data points
130 |         // and then take the square root
131 |         *error = std::sqrt(*error / (N * N));
132 | 
133 |         // Periodically print out the current error
134 |         if (num_iters % 25 == 0) {
135 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
136 |         }
137 | 
138 |         // Increment the iteration count
139 |         ++num_iters;
140 |     }
141 | 
142 |     // If we took fewer than max_iters steps and the error is below the tolerance,
143 |     // we succeeded. Otherwise, we failed.
144 | 
145 |     if (*error <= tolerance && num_iters < max_iters) {
146 |         std::cout << "Success!" << std::endl;
147 |     }
148 |     else {
149 |         std::cout << "Failure!" << std::endl;
150 |         return -1;
151 |     }
152 | 
153 |     // Clean up memory allocations
154 |     nvtxRangePush("Free memory");
155 |     free_memory(f, f_old, error);
156 |     nvtxRangePop();
157 | 
158 |     // End wall timing
159 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
160 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
161 | 
162 |     return 0;
163 | }
164 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step5.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((j) + (i) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | __global__ void jacobi_step (float* f, float* f_old, float* error) {
 61 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 62 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 63 | 
 64 |     if (j >= 1 && j <= N-2) {
 65 |         if (i >= 1 && i <= N-2) {
 66 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 67 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 68 | 
 69 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 70 |             atomicAdd(error, df * df);
 71 |         }
 72 |     }
 73 | }
 74 | 
 75 | __global__ void swap_data (float* f, float* f_old) {
 76 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 77 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 78 | 
 79 |     if (j >= 1 && j <= N-2) {
 80 |         if (i >= 1 && i <= N-2) {
 81 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 82 |         }
 83 |     }
 84 | }
 85 | 
 86 | int main () {
 87 |     // Begin wall timing
 88 |     std::clock_t start_time = std::clock();
 89 | 
 90 |     float* f;
 91 |     float* f_old;
 92 |     float* error;
 93 | 
 94 |     // Reserve space for the scalar field and the "old" copy of the data
 95 |     nvtxRangePush("Allocate memory");
 96 |     allocate_memory(&f, &f_old, &error);
 97 |     nvtxRangePop();
 98 | 
 99 |     // Initialize data (we'll do this on both f and f_old, so that we don't
100 |     // have to worry about the boundary points later)
101 |     nvtxRangePush("Initialize data");
102 |     initialize_data(f);
103 |     initialize_data(f_old);
104 |     nvtxRangePop();
105 | 
106 |     // Initialize error to a large number
107 |     *error = std::numeric_limits<float>::max();
108 |     const float tolerance = 1.e-4f;
109 | 
110 |     // Iterate until we're converged (but set a cap on the maximum number of
111 |     // iterations to avoid any possible hangs)
112 |     const int max_iters = 1000;
113 |     int num_iters = 0;
114 | 
115 |     while (*error > tolerance && num_iters < max_iters) {
116 |         // Initialize error to zero (we'll add to it the following step)
117 |         *error = 0.0f;
118 | 
119 |         // Perform a Jacobi relaxation step
120 |         nvtxRangePush("Jacobi step");
121 |         jacobi_step<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old, error);
122 |         cudaDeviceSynchronize();
123 |         nvtxRangePop();
124 | 
125 |         // Swap the old data and the new data
126 |         // We're doing this explicitly for pedagogical purposes, even though
127 |         // in this specific application a std::swap would have been OK
128 |         nvtxRangePush("Swap data");
129 |         swap_data<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old);
130 |         cudaDeviceSynchronize();
131 |         nvtxRangePop();
132 | 
133 |         // Normalize the L2-norm of the error by the number of data points
134 |         // and then take the square root
135 |         *error = std::sqrt(*error / (N * N));
136 | 
137 |         // Periodically print out the current error
138 |         if (num_iters % 25 == 0) {
139 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
140 |         }
141 | 
142 |         // Increment the iteration count
143 |         ++num_iters;
144 |     }
145 | 
146 |     // If we took fewer than max_iters steps and the error is below the tolerance,
147 |     // we succeeded. Otherwise, we failed.
148 | 
149 |     if (*error <= tolerance && num_iters < max_iters) {
150 |         std::cout << "Success!" << std::endl;
151 |     }
152 |     else {
153 |         std::cout << "Failure!" << std::endl;
154 |         return -1;
155 |     }
156 | 
157 |     // Clean up memory allocations
158 |     nvtxRangePush("Free memory");
159 |     free_memory(f, f_old, error);
160 |     nvtxRangePop();
161 | 
162 |     // End wall timing
163 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
164 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
165 | 
166 |     return 0;
167 | }
168 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step6.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((i) + (j) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | __global__ void jacobi_step (float* f, float* f_old, float* error) {
 61 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 62 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 63 | 
 64 |     if (j >= 1 && j <= N-2) {
 65 |         if (i >= 1 && i <= N-2) {
 66 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 67 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 68 | 
 69 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 70 |             atomicAdd(error, df * df);
 71 |         }
 72 |     }
 73 | }
 74 | 
 75 | __global__ void swap_data (float* f, float* f_old) {
 76 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 77 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 78 | 
 79 |     if (j >= 1 && j <= N-2) {
 80 |         if (i >= 1 && i <= N-2) {
 81 |             f_old[IDX(i,j)] = f[IDX(i,j)];
 82 |         }
 83 |     }
 84 | }
 85 | 
 86 | int main () {
 87 |     // Begin wall timing
 88 |     std::clock_t start_time = std::clock();
 89 | 
 90 |     float* f;
 91 |     float* f_old;
 92 |     float* error;
 93 | 
 94 |     // Reserve space for the scalar field and the "old" copy of the data
 95 |     nvtxRangePush("Allocate memory");
 96 |     allocate_memory(&f, &f_old, &error);
 97 |     nvtxRangePop();
 98 | 
 99 |     // Initialize data (we'll do this on both f and f_old, so that we don't
100 |     // have to worry about the boundary points later)
101 |     nvtxRangePush("Initialize data");
102 |     initialize_data(f);
103 |     initialize_data(f_old);
104 |     nvtxRangePop();
105 | 
106 |     // Initialize error to a large number
107 |     *error = std::numeric_limits<float>::max();
108 |     const float tolerance = 1.e-4f;
109 | 
110 |     // Iterate until we're converged (but set a cap on the maximum number of
111 |     // iterations to avoid any possible hangs)
112 |     const int max_iters = 1000;
113 |     int num_iters = 0;
114 | 
115 |     while (*error > tolerance && num_iters < max_iters) {
116 |         // Initialize error to zero (we'll add to it the following step)
117 |         *error = 0.0f;
118 | 
119 |         // Perform a Jacobi relaxation step
120 |         nvtxRangePush("Jacobi step");
121 |         jacobi_step<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old, error);
122 |         cudaDeviceSynchronize();
123 |         nvtxRangePop();
124 | 
125 |         // Swap the old data and the new data
126 |         // We're doing this explicitly for pedagogical purposes, even though
127 |         // in this specific application a std::swap would have been OK
128 |         nvtxRangePush("Swap data");
129 |         swap_data<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old);
130 |         cudaDeviceSynchronize();
131 |         nvtxRangePop();
132 | 
133 |         // Normalize the L2-norm of the error by the number of data points
134 |         // and then take the square root
135 |         *error = std::sqrt(*error / (N * N));
136 | 
137 |         // Periodically print out the current error
138 |         if (num_iters % 25 == 0) {
139 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
140 |         }
141 | 
142 |         // Increment the iteration count
143 |         ++num_iters;
144 |     }
145 | 
146 |     // If we took fewer than max_iters steps and the error is below the tolerance,
147 |     // we succeeded. Otherwise, we failed.
148 | 
149 |     if (*error <= tolerance && num_iters < max_iters) {
150 |         std::cout << "Success!" << std::endl;
151 |     }
152 |     else {
153 |         std::cout << "Failure!" << std::endl;
154 |         return -1;
155 |     }
156 | 
157 |     // Clean up memory allocations
158 |     nvtxRangePush("Free memory");
159 |     free_memory(f, f_old, error);
160 |     nvtxRangePop();
161 | 
162 |     // End wall timing
163 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
164 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
165 | 
166 |     return 0;
167 | }
168 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step7.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((i) + (j) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | __global__ void jacobi_step (float* f, float* f_old, float* error) {
 61 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 62 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 63 | 
 64 |     float err = 0.0f;
 65 | 
 66 |     if (j >= 1 && j <= N-2) {
 67 |         if (i >= 1 && i <= N-2) {
 68 |             f[IDX(i,j)] = 0.25f * (f_old[IDX(i+1,j)] + f_old[IDX(i-1,j)] +
 69 |                                    f_old[IDX(i,j+1)] + f_old[IDX(i,j-1)]);
 70 | 
 71 |             float df = f[IDX(i,j)] - f_old[IDX(i,j)];
 72 |             err = df * df;
 73 |         }
 74 |     }
 75 | 
 76 |     // Sum over threads in the warp
 77 |     // For simplicity, we do this outside the above conditional
 78 |     // so that all threads participate
 79 |     for (int offset = 16; offset > 0; offset /= 2) {
 80 |         err += __shfl_down_sync(0xffffffff, err, offset);
 81 |     }
 82 | 
 83 |     // If we're thread 0 in the warp, update our value to shared memory
 84 |     // Note that we're assuming exactly a 32x32 block and that the warp ID
 85 |     // is equivalent to threadIdx.y. For the general case, we would have to
 86 |     // write more careful code.
 87 |     __shared__ float reduction_array[32];
 88 |     if (threadIdx.x == 0) {
 89 |         reduction_array[threadIdx.y] = err;
 90 |     }
 91 | 
 92 |     // Synchronize the block before reading any values from smem
 93 |     __syncthreads();
 94 | 
 95 |     // Using the first warp in the block, reduce over the partial sums
 96 |     // in the shared memory array.
 97 |     if (threadIdx.y == 0) {
 98 |         err = reduction_array[threadIdx.x];
 99 |         for (int offset = 16; offset > 0; offset /= 2) {
100 |             err += __shfl_down_sync(0xffffffff, err, offset);
101 |         }
102 |         if (threadIdx.x == 0) {
103 |             atomicAdd(error, err);
104 |         }
105 |     }
106 | }
107 | 
108 | __global__ void swap_data (float* f, float* f_old) {
109 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
110 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
111 | 
112 |     if (j >= 1 && j <= N-2) {
113 |         if (i >= 1 && i <= N-2) {
114 |             f_old[IDX(i,j)] = f[IDX(i,j)];
115 |         }
116 |     }
117 | }
118 | 
119 | int main () {
120 |     // Begin wall timing
121 |     std::clock_t start_time = std::clock();
122 | 
123 |     float* f;
124 |     float* f_old;
125 |     float* error;
126 | 
127 |     // Reserve space for the scalar field and the "old" copy of the data
128 |     nvtxRangePush("Allocate memory");
129 |     allocate_memory(&f, &f_old, &error);
130 |     nvtxRangePop();
131 | 
132 |     // Initialize data (we'll do this on both f and f_old, so that we don't
133 |     // have to worry about the boundary points later)
134 |     nvtxRangePush("Initialize data");
135 |     initialize_data(f);
136 |     initialize_data(f_old);
137 |     nvtxRangePop();
138 | 
139 |     // Initialize error to a large number
140 |     *error = std::numeric_limits<float>::max();
141 |     const float tolerance = 1.e-4f;
142 | 
143 |     // Iterate until we're converged (but set a cap on the maximum number of
144 |     // iterations to avoid any possible hangs)
145 |     const int max_iters = 1000;
146 |     int num_iters = 0;
147 | 
148 |     while (*error > tolerance && num_iters < max_iters) {
149 |         // Initialize error to zero (we'll add to it the following step)
150 |         *error = 0.0f;
151 | 
152 |         // Perform a Jacobi relaxation step
153 |         nvtxRangePush("Jacobi step");
154 |         jacobi_step<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old, error);
155 |         cudaDeviceSynchronize();
156 |         nvtxRangePop();
157 | 
158 |         // Swap the old data and the new data
159 |         // We're doing this explicitly for pedagogical purposes, even though
160 |         // in this specific application a std::swap would have been OK
161 |         nvtxRangePush("Swap data");
162 |         swap_data<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old);
163 |         cudaDeviceSynchronize();
164 |         nvtxRangePop();
165 | 
166 |         // Normalize the L2-norm of the error by the number of data points
167 |         // and then take the square root
168 |         *error = std::sqrt(*error / (N * N));
169 | 
170 |         // Periodically print out the current error
171 |         if (num_iters % 25 == 0) {
172 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
173 |         }
174 | 
175 |         // Increment the iteration count
176 |         ++num_iters;
177 |     }
178 | 
179 |     // If we took fewer than max_iters steps and the error is below the tolerance,
180 |     // we succeeded. Otherwise, we failed.
181 | 
182 |     if (*error <= tolerance && num_iters < max_iters) {
183 |         std::cout << "Success!" << std::endl;
184 |     }
185 |     else {
186 |         std::cout << "Failure!" << std::endl;
187 |         return -1;
188 |     }
189 | 
190 |     // Clean up memory allocations
191 |     nvtxRangePush("Free memory");
192 |     free_memory(f, f_old, error);
193 |     nvtxRangePop();
194 | 
195 |     // End wall timing
196 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
197 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
198 | 
199 |     return 0;
200 | }
201 | 


--------------------------------------------------------------------------------
/_profiler/source_code/lab6/jacobi_step8.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <cmath>
  6 | #include <limits>
  7 | #include <ctime>
  8 | #include <nvToolsExt.h>
  9 | 
 10 | #define N 2048
 11 | 
 12 | #define IDX(i, j) ((i) + (j) * N)
 13 | 
 14 | // error checking macro
 15 | #define cudaCheckErrors(msg)                                    \
 16 |     do {                                                        \
 17 |         cudaError_t __err = cudaGetLastError();                 \
 18 |         if (__err != cudaSuccess) {                             \
 19 |             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",  \
 20 |                     msg, cudaGetErrorString(__err),             \
 21 |                     __FILE__, __LINE__);                        \
 22 |             fprintf(stderr, "*** FAILED - ABORTING\n");         \
 23 |             exit(1);                                            \
 24 |         }                                                       \
 25 |     } while (0)
 26 | 
 27 | void allocate_memory (float** f, float** f_old, float** error) {
 28 |     cudaMallocManaged(f, N * N * sizeof(float));
 29 |     cudaMallocManaged(f_old, N * N * sizeof(float));
 30 |     cudaMallocManaged(error, sizeof(float));
 31 |     cudaCheckErrors("Memory allocation");
 32 | }
 33 | 
 34 | void free_memory (float* f, float* f_old, float* error) {
 35 |     cudaFree(f);
 36 |     cudaFree(f_old);
 37 |     cudaFree(error);
 38 |     cudaCheckErrors("Memory deallocation");
 39 | }
 40 | 
 41 | void initialize_data (float* f) {
 42 |     // Set up simple sinusoidal boundary conditions
 43 |     for (int j = 0; j < N; ++j) {
 44 |         for (int i = 0; i < N; ++i) {
 45 | 
 46 |             if (i == 0 || i == N-1) {
 47 |                 f[IDX(i,j)] = sin(j * 2 * M_PI / (N - 1));
 48 |             }
 49 |             else if (j == 0 || j == N-1) {
 50 |                 f[IDX(i,j)] = sin(i * 2 * M_PI / (N - 1));
 51 |             }
 52 |             else {
 53 |                 f[IDX(i,j)] = 0.0f;
 54 |             }
 55 | 
 56 |         }
 57 |     }
 58 | }
 59 | 
 60 | __global__ void jacobi_step (float* f, float* f_old, float* error) {
 61 |     __shared__ float f_old_tile[34][34];
 62 | 
 63 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
 64 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
 65 | 
 66 |     // First read in the "interior" data, one value per thread
 67 |     // Note the offset by 1, to reserve space for the "left"/"bottom" halo
 68 | 
 69 |     f_old_tile[threadIdx.y+1][threadIdx.x+1] = f_old[IDX(i,j)];
 70 | 
 71 |     // Now read in the halo data; we'll pick the "closest" thread
 72 |     // to each element. When we do this, make sure we don't fall
 73 |     // off the end of the global memory array. Note that this
 74 |     // code does not fill the corners, as they are not used in
 75 |     // this stencil.
 76 | 
 77 |     if (threadIdx.x == 0 && i >= 1) {
 78 |         f_old_tile[threadIdx.y+1][threadIdx.x+0] = f_old[IDX(i-1,j)];
 79 |     }
 80 |     if (threadIdx.x == 31 && i <= N-2) {
 81 |         f_old_tile[threadIdx.y+1][threadIdx.x+2] = f_old[IDX(i+1,j)];
 82 |     }
 83 |     if (threadIdx.y == 0 && j >= 1) {
 84 |         f_old_tile[threadIdx.y+0][threadIdx.x+1] = f_old[IDX(i,j-1)];
 85 |     }
 86 |     if (threadIdx.y == 31 && j <= N-2) {
 87 |         f_old_tile[threadIdx.y+2][threadIdx.x+1] = f_old[IDX(i,j+1)];
 88 |     }
 89 | 
 90 |     // Synchronize all threads
 91 |     __syncthreads();
 92 | 
 93 |     float err = 0.0f;
 94 | 
 95 |     if (j >= 1 && j <= N-2) {
 96 |         if (i >= 1 && i <= N-2) {
 97 |             // Perform the read from shared memory
 98 |             f[IDX(i,j)] = 0.25f * (f_old_tile[threadIdx.y+1][threadIdx.x+2] + f_old_tile[threadIdx.y+1][threadIdx.x+0] +
 99 |                                    f_old_tile[threadIdx.y+2][threadIdx.x+1] + f_old_tile[threadIdx.y+0][threadIdx.x+1]);
100 | 
101 |             float df = f[IDX(i,j)] - f_old_tile[threadIdx.y+1][threadIdx.x+1];
102 |             err = df * df;
103 |         }
104 |     }
105 | 
106 |     // Sum over threads in the warp
107 |     // For simplicity, we do this outside the above conditional
108 |     // so that all threads participate
109 |     for (int offset = 16; offset > 0; offset /= 2) {
110 |         err += __shfl_down_sync(0xffffffff, err, offset);
111 |     }
112 | 
113 |     // If we're thread 0 in the warp, update our value to shared memory
114 |     // Note that we're assuming exactly a 32x32 block and that the warp ID
115 |     // is equivalent to threadIdx.y. For the general case, we would have to
116 |     // write more careful code.
117 |     __shared__ float reduction_array[32];
118 |     if (threadIdx.x == 0) {
119 |         reduction_array[threadIdx.y] = err;
120 |     }
121 | 
122 |     // Synchronize the block before reading any values from smem
123 |     __syncthreads();
124 | 
125 |     // Using the first warp in the block, reduce over the partial sums
126 |     // in the shared memory array.
127 |     if (threadIdx.y == 0) {
128 |         err = reduction_array[threadIdx.x];
129 |         for (int offset = 16; offset > 0; offset /= 2) {
130 |             err += __shfl_down_sync(0xffffffff, err, offset);
131 |         }
132 |         if (threadIdx.x == 0) {
133 |             atomicAdd(error, err);
134 |         }
135 |     }
136 | }
137 | 
138 | __global__ void swap_data (float* f, float* f_old) {
139 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
140 |     int j = threadIdx.y + blockIdx.y * blockDim.y;
141 | 
142 |     if (j >= 1 && j <= N-2) {
143 |         if (i >= 1 && i <= N-2) {
144 |             f_old[IDX(i,j)] = f[IDX(i,j)];
145 |         }
146 |     }
147 | }
148 | 
149 | int main () {
150 |     // Begin wall timing
151 |     std::clock_t start_time = std::clock();
152 | 
153 |     float* f;
154 |     float* f_old;
155 |     float* error;
156 | 
157 |     // Reserve space for the scalar field and the "old" copy of the data
158 |     nvtxRangePush("Allocate memory");
159 |     allocate_memory(&f, &f_old, &error);
160 |     nvtxRangePop();
161 | 
162 |     // Initialize data (we'll do this on both f and f_old, so that we don't
163 |     // have to worry about the boundary points later)
164 |     nvtxRangePush("Initialize data");
165 |     initialize_data(f);
166 |     initialize_data(f_old);
167 |     nvtxRangePop();
168 | 
169 |     // Initialize error to a large number
170 |     *error = std::numeric_limits<float>::max();
171 |     const float tolerance = 1.e-4f;
172 | 
173 |     // Iterate until we're converged (but set a cap on the maximum number of
174 |     // iterations to avoid any possible hangs)
175 |     const int max_iters = 1000;
176 |     int num_iters = 0;
177 | 
178 |     while (*error > tolerance && num_iters < max_iters) {
179 |         // Initialize error to zero (we'll add to it the following step)
180 |         *error = 0.0f;
181 | 
182 |         // Perform a Jacobi relaxation step
183 |         nvtxRangePush("Jacobi step");
184 |         jacobi_step<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old, error);
185 |         cudaDeviceSynchronize();
186 |         nvtxRangePop();
187 | 
188 |         // Swap the old data and the new data
189 |         // We're doing this explicitly for pedagogical purposes, even though
190 |         // in this specific application a std::swap would have been OK
191 |         nvtxRangePush("Swap data");
192 |         swap_data<<<dim3(N / 32, N / 32), dim3(32, 32)>>>(f, f_old);
193 |         cudaDeviceSynchronize();
194 |         nvtxRangePop();
195 | 
196 |         // Normalize the L2-norm of the error by the number of data points
197 |         // and then take the square root
198 |         *error = std::sqrt(*error / (N * N));
199 | 
200 |         // Periodically print out the current error
201 |         if (num_iters % 25 == 0) {
202 |             std::cout << "Error after iteration " << num_iters << " = " << *error << std::endl;
203 |         }
204 | 
205 |         // Increment the iteration count
206 |         ++num_iters;
207 |     }
208 | 
209 |     // If we took fewer than max_iters steps and the error is below the tolerance,
210 |     // we succeeded. Otherwise, we failed.
211 | 
212 |     if (*error <= tolerance && num_iters < max_iters) {
213 |         std::cout << "Success!" << std::endl;
214 |     }
215 |     else {
216 |         std::cout << "Failure!" << std::endl;
217 |         return -1;
218 |     }
219 | 
220 |     // CLean up memory allocations
221 |     nvtxRangePush("Free memory");
222 |     free_memory(f, f_old, error);
223 |     nvtxRangePop();
224 | 
225 |     // End wall timing
226 |     double duration = (std::clock() - start_time) / (double) CLOCKS_PER_SEC;
227 |     std::cout << "Run time = " << std::setprecision(4) << duration << " seconds" << std::endl;
228 | 
229 |     return 0;
230 | }
231 | 


--------------------------------------------------------------------------------