├── .dockerignore
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── amd64_10-1.dockerfile
├── ci
│   ├── build.sh
│   ├── env.sh
│   └── install_deps.sh
├── coalescing
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── include
│   │   └── argparse
│   │       └── argparse.hpp
│   ├── main.cu
│   └── rai_build.yml
├── ppc64le_10-1.dockerfile
├── sgemm
│   ├── 1_1_pinned_basic.cu
│   ├── 1_2_pinned_tiled.cu
│   ├── 1_3_pinned_joint.cu
│   ├── 2_1_pageable_basic.cu
│   ├── 2_2_pinned_basic.cu
│   ├── 2_3_pinned_tiled.cu
│   ├── 2_4_pinned_tiled_overlap.cu
│   ├── 2_5_pinned_joint.cu
│   ├── 2_6_pinned_joint_overlap.cu
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── common.hpp
│   ├── cpu.cpp
│   ├── include
│   │   └── argparse
│   │       └── argparse.hpp
│   └── rai_build.yml
├── slides
│   ├── 20200416_ece408.pdf
│   ├── 20200421_ece498_Nsight.pdf
│   ├── GEMM-joint-tiling.ppt
│   ├── memory_access_efficiency.pdf
│   └── s22141-what-the-profiler-is-telling-you-how-to-get-the-most-performance-out-of-your-hardware.pdf
└── test.cu
/.dockerignore:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | sgemm/build
2 | *.deb
3 | *.run
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: bionic
2 | language: minimal
3 | 
4 | jobs:
5 |   include:
6 |     - arch: ppc64le
7 |       env: BUILD_DOCKER=1
8 |       docker: true
9 |     - arch: amd64
10 |       env: BUILD_DOCKER=1
11 |       docker: true
12 |     - env: BUILD_TYPE=Release
13 |     - env: BUILD_TYPE=Debug
14 | 
15 | before_script:
16 |   - ci/install_deps.sh || travis_terminate 1;
17 | 
18 | script:
19 |   - ci/build.sh || travis_terminate 1;
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Building a Development Container
2 | 
3 | First, download the dependencies referenced in `ci/install_deps.sh`.
4 | 
5 | Build a test image:
6 | ```
7 | docker build -f amd64_10-1.dockerfile -t npt-dev .
8 | ```
9 | 
10 | Run an image:
11 | ```
12 | docker run -it --rm npt-dev
13 | ```
14 | 
15 | Delete all unused docker data:
16 | ```
17 | docker system prune
18 | ```
19 | 
20 | ## Travis
21 | 
22 | CI is done through travis-ci.com.
23 | Travis builds the example code, as well as all docker images.
24 | Carl Pearson's Docker Hub account is used to push images up to [cwpearson/nvidia-performance-tools on Docker Hub](https://hub.docker.com/repository/docker/cwpearson/nvidia-performance-tools).
25 | 
26 | ## Resources
27 | 
28 | * [Nvidia Docker Image Definitions](https://gitlab.com/nvidia/container-images/cuda/)
29 | 
30 | ## Roadmap
31 | 
32 | * [ ] Using Nsight Compute and Nsight Systems on EWS
33 | * [ ] Instructions for remote profiling
34 | * [ ] Nsight Systems: How to load missing source file
35 | * [ ] Definitions for Various Performance Terms
36 |   * [ ] Occupancy
37 |   * [ ] Memory Hierarchy
38 |   * [ ] Scheduling
39 |   * [ ] Stall reasons
40 | * [ ] cudaStreams, cudaEvents
41 | * [ ] CUDA Event and Stream timing examples
42 |   * [ ] single-device
43 |   * [ ] multi-device
44 | * [ ] interacting with `.qdrep` files.
45 | * [ ] interacting with `.nsight-cuprof-report` files.
46 | * [ ] Best Practices 47 | * [ ] Fixing GPU frequency 48 | * [ ] initial CUDA runtime cost 49 | * [ ] Warmup Kernels 50 | * [ ] `cuda-memcheck` race condition and sync check? 51 | * [ ] Is stream 0 the default stream? 52 | * [ ] Nsight System with MPI 53 | * [ ] Nsight System with multi-GPU 54 | * [ ] Nsight Compute multi-GPU 55 | 56 | 57 | ## Inspirations 58 | 59 | * https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s22141-what-the-profiler-is-telling-you-how-to-get-the-most-performance-out-of-your-hardware.pdf 60 | * https://developer.nvidia.com/gtc/2020/video/s22141 61 | * https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21351-scaling-the-transformer-model-implementation-in-pytorch-across-multiple-nodes.pdf 62 | * https://developer.nvidia.com/gtc/2020/video/s21351 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Carl Pearson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nvidia Performance Tools 2 | 3 | [![Build Status](https://travis-ci.com/cwpearson/nvidia-performance-tools.svg?branch=master)](https://travis-ci.com/cwpearson/nvidia-performance-tools) 4 | 5 | ## Docker images with Nvidia's Performance Tools 6 | 7 | [cwpearson/nvidia-performance-tools on Docker Hub](https://hub.docker.com/repository/docker/cwpearson/nvidia-performance-tools). 8 | 9 | ```bash 10 | docker pull cwpearson/nvidia-performance-tools:latest-amd64 # for x86 11 | docker pull cwpearson/nvidia-performance-tools:latest-ppc64le # for POWER 12 | ``` 13 | 14 | Typically, you'll want the `latest-amd64` or `latest-ppc64le` tags. 15 | If you are developing a workflow and want stability, choose a tag like `amd64-10.1-master-ce03360`, which describes the architecture, CUDA version, branch, and short SHA of the corresponding git commit for [cwpearson/nvidia-performance-tools on Github](github.com/cwpearson/nvidia-performance-tools). 16 | 17 | ## Presentations 18 | 19 | * April 21-23 2020 University of Illinois ECE 498 guest lecture for Professors Hwu, Chen, and Xiong. 
20 |   * [Slides](slides/20200421_ece498_Nsight.pdf)
21 |   * Recorded Lectures
22 |     * [Part 1: Intro, CUDA Events, Nsight Compute](https://youtu.be/JqlrVcfyp7E)
23 |     * [Part 2: Nsight Systems](https://youtu.be/GvK_vKb5Btk)
24 | * April 16 2020 University of Illinois ECE 408 guest lecture for Professor Lumetta.
25 |   * [Slides](slides/20200416_ece408.pdf)
26 |   * Recorded Lecture (75 mins)
27 |     * [Part 1: Intro](https://youtu.be/uN2qju175aE)
28 |     * [Part 2: CUDA Events](https://youtu.be/yI137sSOlkU)
29 |     * [Part 3: Nsight Compute](https://youtu.be/UNX0KNMQlW8)
30 |     * [Part 4: Nsight Systems](https://youtu.be/YHrmnaPgFfY)
31 | 
32 | ## Examples
33 | 
34 | * [sgemm](sgemm) Featuring basic, shared-memory tiled, and joint shared-memory and register tiling.
35 | * [coalescing](coalescing) Featuring a simple code with and without memory coalescing, and a discussion of how to analyze access efficiency in Nsight Compute.
36 | 
37 | ## Installing Nsight Systems and Nsight Compute
38 | 
39 | Each tool comes in a command-line (CLI) and a graphical (GUI) version.
40 | They are installed together, unless a CLI-only version is downloaded.
41 | 
42 | * macOS: You probably don't have CUDA installed, so download the Nsight Systems or Compute installer from the Nvidia website.
43 | * Windows:
44 |   * with CUDA: You may already find Nsight Systems or Compute in your start menu. You can download a more recent release from the Nvidia website. If you install it, you will have two entries in the start menu for different versions.
45 |   * without CUDA: Download the Nsight Systems or Compute installer from the Nvidia website.
46 | * Linux
47 |   * with CUDA: You may already have Nsight Systems and Compute (check `/usr/local/cuda/bin/nsight-sys` and `/usr/local/cuda/bin/nv-nsight-cu`). If so, you can still download the Nsight Systems or Compute `.deb` package to update. It may override the package that was installed with CUDA. You can also use the `.run` file, which you should install to a directory not managed by the package manager, and add the location of the resulting binary files to your path.
48 |   * without CUDA:
49 |     * `.deb`: Download the `.deb` package and install it. Requires root privileges.
50 |     * `.run`: Download the `.run` package and execute it. Choose a location that you have permission to install to, and then add the resulting binary directory to your path.
51 | 
52 | ## Preparing for Profiling
53 | 
54 | ### Source code annotations
55 | 
56 | ```c++
57 | #include <nvToolsExt.h>
58 | 
59 | nvtxRangePush("span 1");
60 | nvtxRangePush("a nested span");
61 | nvtxRangePop(); // end nested span
62 | nvtxRangePop(); // end span 1
63 | ```
64 | 
65 | Also link with `-lnvToolsExt`.
66 | 
67 | ### nvcc
68 | 
69 | Compile with optimizations turned on, and without debug information.
70 | The most likely relevant flags for `nvcc` are below:
71 | 
72 | ```
73 | --profile (-pg)
74 |     Instrument generated code/executable for use by gprof (Linux only).
75 | 
76 | --debug (-g)
77 |     Generate debug information for host code.
78 | 
79 | --device-debug (-G)
80 |     Generate debug information for device code. Turns off all optimizations.
81 |     Don't use for profiling; use -lineinfo instead.
82 | 
83 | --generate-line-info (-lineinfo)
84 |     Generate line-number information for device code.
85 | ```
86 | 
87 | So, change `nvcc -g/-pg/-G ...` to `nvcc -lineinfo ...`.
88 | 
89 | ### cuda-memcheck
90 | 
91 | If your code overwrites unallocated memory, it may corrupt the profiling process.
92 | If profiling fails, try running your code under `cuda-memcheck`.
93 | This will instrument your binary to detect bad GPU memory activity. 94 | Fix any errors that occur, and try profiling again. 95 | This will cause ~100x slowdown usually, so try a small dataset first. 96 | 97 | ``` 98 | cuda-memcheck my-binary 99 | ``` 100 | 101 | ### Nsight Systems Environment Check 102 | 103 | Run `nsys status -e`. You should see something like 104 | 105 | ``` 106 | Sampling Environment Check 107 | Linux Kernel Paranoid Level = 2: OK 108 | Linux Distribution = Ubuntu 109 | Linux Kernel Version = 4.16.15-41615: OK 110 | Linux perf_event_open syscall available: OK 111 | Sampling trigger event available: OK 112 | Intel(c) Last Branch Record support: Available 113 | Sampling Environment: OK 114 | ``` 115 | 116 | Errors may reduce the amount of information collected, or cause profiling to fail. 117 | Consult documentation for troubleshooting steps. 118 | 119 | ## Capturing a Profile with CLI 120 | 121 | Under this scheme, we 122 | * use the CLI on the target to record a profiling file 123 | * transfer that file to the client 124 | * use the GUI on the client to analyze the record 125 | 126 | ### Nsight Compute 127 | 128 | This command will 129 | * Generate `a.nsight-cuprof-report` with recorded profiling information 130 | * Measure metrics associated with all sections 131 | * Profile the 6th invocation of `__global__ void kernel_name(...)` 132 | * Run a.out 133 | 134 | ```bash 135 | nv-nsight-cu-cli \ 136 | -o a \ 137 | --sections ".*" \ 138 | --kernel-id ::kernel_name:6 \ 139 | a.out 140 | ``` 141 | 142 | To see sections that will be recorded for a command, add `--list-sections`. 143 | 144 | ``` 145 | nv-nsight-cu-cli --list-sections 146 | ---------------------------- ------------------------------- -------------------------------------------------- 147 | Identifier Display Name Filename 148 | ----------------------------- ------------------------------- -------------------------------------------------- 149 | ComputeWorkloadAnalysis Compute Workload Analysis .../../../sections/ComputeWorkloadAnalysis.section 150 | InstructionStats Instruction Statistics ...64/../../sections/InstructionStatistics.section 151 | LaunchStats Launch Statistics ...1_3-x64/../../sections/LaunchStatistics.section 152 | MemoryWorkloadAnalysis Memory Workload Analysis ...4/../../sections/MemoryWorkloadAnalysis.section 153 | MemoryWorkloadAnalysis_Chart Memory Workload Analysis Chart ..../sections/MemoryWorkloadAnalysis_Chart.section 154 | MemoryWorkloadAnalysis_Tables Memory Workload Analysis Tables .../sections/MemoryWorkloadAnalysis_Tables.section 155 | Occupancy Occupancy ...ibc_2_11_3-x64/../../sections/Occupancy.section 156 | SchedulerStats Scheduler Statistics ...-x64/../../sections/SchedulerStatistics.section 157 | SourceCounters Source Counters ..._11_3-x64/../../sections/SourceCounters.section 158 | SpeedOfLight GPU Speed Of Light ..._2_11_3-x64/../../sections/SpeedOfLight.section 159 | WarpStateStats Warp State Statistics ...-x64/../../sections/WarpStateStatistics.section 160 | ``` 161 | 162 | To see supported metrics on a device, do `nv-nsight-cu-cli --devices 0 --query-metrics` 163 | 164 | On some newer devices, the base metrics name will not work. You need to append an allowed suffix. To see all the legal names and suffices, do `nv-nsight-cu-cli --devices 0 --query-metrics --query-metrics-mode all` 165 | 166 | 167 | The `--kernel-id` flag takes a string like `context-id:stream-id:[name-operator:]kernel-name:invocation-nr`. 
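For example, to profile only the third invocation of a kernel named `mygemm` (the kernel name used by this repository's sgemm examples; `a.out` stands in for your binary):

```bash
nv-nsight-cu-cli --kernel-id "::mygemm:3" -o mygemm_3 a.out
```

Leaving the `context-id` and `stream-id` fields empty, as above, matches any context and stream.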
168 | Commonly, we might only use `kernel-name`, to select kernels to profile by name, and `invocation-nr`, to select which invocation of the kernels to profile.
169 | 
170 | ### Nsight Systems
171 | 
172 | This command will
173 | * Record profiling info to `a.qdrep`
174 | * Run a.out
175 | 
176 | ```bash
177 | nsys profile \
178 |   -o a \
179 |   a.out
180 | ```
181 | 
182 | ## Using the GUI on a client to view a recorded file from the target
183 | 
184 | In **Nsight Compute**:
185 | 
186 | File > Open File ... > file.nsight-cuprof-report
187 | 
188 | If you profiled on a different system than the GUI tool is running on, and you want to look at a View that includes the source, you may have to click the "resolve" button to navigate to a local copy of the source file.
189 | 
190 | In **Nsight Systems**:
191 | 
192 | File > Open > file.qdrep
193 | 
194 | ## Using the GUI on the client to Control Remote Profiling on the target
195 | 
196 | *instructions to come*
197 | 
198 | ## Managing docker images
199 | 
200 | * `docker ps -a`: list all containers
201 | * `docker rm $(docker ps -a -q)`: remove all containers
202 | * `docker system prune`: delete all unused docker data
203 | 
204 | Run a profiling container:
205 | ```bash
206 | docker run cwpearson/nvidia-performance-tools:latest-amd64
207 | ```
208 | 
209 | Resume a previously exited container:
210 | ```bash
211 | docker ps -a                 # find the ID of the exited container
212 | docker start <container-id>  # resume the exited container
213 | docker attach <container-id> # attach a terminal to the container
214 | ```
215 | 
216 | ## For Contributors
217 | 
218 | See [CONTRIBUTING.md](CONTRIBUTING.md)
219 | 
220 | ## Resources
221 | 
222 | * [Nvidia Nsight Systems Docs](https://docs.nvidia.com/nsight-systems/)
223 | * [Nvidia Nsight Compute Docs](https://docs.nvidia.com/nsight-compute/)
224 | * NVIDIA Developer Blog
225 |   * [Nsight Systems Exposes GPU Optimization (May 30 2018)](https://devblogs.nvidia.com/nsight-systems-exposes-gpu-optimization/)
226 |   * [Transitioning to Nsight Systems from NVIDIA Visual Profiler / nvprof (Aug 2 2019)](https://devblogs.nvidia.com/transitioning-nsight-systems-nvidia-visual-profiler-nvprof/)
227 |   * [Using Nsight Compute to Inspect your Kernels (Sep 16 2019)](https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/)
228 |   * [Using Nvidia Nsight Systems in Containers and the Cloud (Jan 29 2020)](https://devblogs.nvidia.com/nvidia-nsight-systems-containers-cloud/)
229 | 
230 | * Interpreting Nsight Compute Results
231 |   * Workload Memory Analysis
232 |     * [CUDA Memory Model](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-hierarchy)
233 |     * [Device Memory Access Performance Guidelines](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses)
234 |   * Stall Reasons
235 |     * [Nsight Graphics Docs: Stall Reasons](https://docs.nvidia.com/drive/drive_os_5.1.12.0L/nsight-graphics/activities/#shaderprofiler_stallreasons)
236 |   * Issue Efficiency
237 |     * [Issue Efficiency Nsight Visual Studio Edition](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/issueefficiency.htm)
238 |   * Occupancy
239 |     * [Nsight Visual Studio Edition](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm)
240 |   * Slides
241 |     * [slides/GEMM-joint-tiling.ppt](slides/GEMM-joint-tiling.ppt): Joint-tiling slide deck from ECE 508 Spring 2017
242 | 
243 | * GTC
244 |   * [Volta Architecture and Performance
Optimization](http://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf): Volta L1 will cache writes 245 | -------------------------------------------------------------------------------- /amd64_10-1.dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | 3 | # Set one or more individual labels 4 | LABEL maintainer="Carl Pearson" 5 | 6 | # prevent prompts during apt-get 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | RUN apt-get update && apt-get install -y -q --no-install-recommends --no-install-suggests \ 10 | cmake \ 11 | libglib2.0 \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | COPY test.cu . 15 | COPY nsight-compute-linux-2019.5.0.14-27346997.run nsight_compute.run 16 | COPY NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb nsight_systems.deb 17 | 18 | # install Nsight Compute 19 | # install script seems to want TERM set 20 | RUN chmod +x nsight_compute.run 21 | RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/opt/NVIDIA-Nsight-Compute 22 | ENV PATH /opt/NVIDIA-Nsight-Compute:${PATH} 23 | # work around rai jobs having a different path than docker run 24 | RUN ln -s /opt/NVIDIA-Nsight-Compute/nv-nsight-cu-cli /usr/local/bin/nv-nsight-cu-cli 25 | RUN rm nsight_compute.run 26 | 27 | # install Nsight Systems 28 | RUN dpkg -i nsight_systems.deb 29 | RUN rm nsight_systems.deb 30 | -------------------------------------------------------------------------------- /ci/build.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | set -e 3 | 4 | source ci/env.sh 5 | 6 | if [[ $BUILD_DOCKER == "1" ]]; then 7 | cd $TRAVIS_BUILD_DIR 8 | 9 | echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin 10 | 11 | TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7} 12 | DOCKER_REPO=nvidia-performance-tools 13 | DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO 14 | DOCKER_TAG=${TRAVIS_CPU_ARCH}-10.1-$TRAVIS_BRANCH-$TRAVIS_COMMIT 15 | 16 | 17 | docker build -f ${TRAVIS_CPU_ARCH}_10-1.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG . 18 | docker push $DOCKER_SLUG:$DOCKER_TAG 19 | 20 | 21 | if [[ $TRAVIS_BRANCH == master ]]; then 22 | docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH} 23 | docker push $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH} 24 | else 25 | docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH 26 | docker push $DOCKER_SLUG:$TRAVIS_BRANCH-${TRAVIS_CPU_ARCH} 27 | fi 28 | fi 29 | 30 | 31 | if [[ $BUILD_TYPE != '' ]]; then 32 | cd $TRAVIS_BUILD_DIR 33 | cd sgemm 34 | mkdir -p build 35 | cd build 36 | cmake .. 
-DCMAKE_BUILD_TYPE=$BUILD_TYPE 37 | make VERBOSE=1 38 | fi -------------------------------------------------------------------------------- /ci/env.sh: -------------------------------------------------------------------------------- 1 | CMAKE_PREFIX=$HOME/cmake 2 | 3 | export PATH=/usr/local/cuda/bin:$PATH 4 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 5 | 6 | export PATH=$CMAKE_PREFIX/bin:$PATH -------------------------------------------------------------------------------- /ci/install_deps.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | set -e 3 | 4 | source ci/env.sh 5 | 6 | # deps for building docker images 7 | if [[ $BUILD_DOCKER == "1" ]]; then 8 | cd $TRAVIS_BUILD_DIR 9 | 10 | if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then 11 | wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb; 12 | wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O nsight-compute-PPC64LE-2019.5.0.14-27346997.run; 13 | elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then 14 | wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb; 15 | wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O nsight-compute-linux-2019.5.0.14-27346997.run; 16 | fi 17 | fi 18 | 19 | # deps for building code 20 | if [[ $BUILD_TYPE != '' ]]; then 21 | cd $HOME 22 | 23 | ## install CMake 24 | wget -qSL https://github.com/Kitware/CMake/releases/download/v3.8.2/cmake-3.8.2-Linux-x86_64.tar.gz -O cmake.tar.gz 25 | mkdir -p $CMAKE_PREFIX 26 | tar -xf cmake.tar.gz --strip-components=1 -C $CMAKE_PREFIX 27 | rm cmake.tar.gz 28 | 29 | ## install CUDA 30 | sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub 31 | CUDA102="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb" 32 | wget -SL $CUDA102 -O cuda.deb 33 | sudo dpkg -i cuda.deb 34 | sudo apt-get update 35 | sudo apt-get install -y --no-install-recommends \ 36 | cuda-toolkit-10-2 37 | rm cuda.deb 38 | fi -------------------------------------------------------------------------------- /coalescing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(coalescing LANGUAGES CXX CUDA) 2 | 3 | # 3.8+ for CUDA 4 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 5 | 6 | if(NOT CMAKE_BUILD_TYPE) 7 | set(CMAKE_BUILD_TYPE "Release") 8 | message(STATUS "Setting CMAKE_BUILD_TYPE=Release") 9 | endif() 10 | 11 | set(CMAKE_CUDA_STANDARD 11) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | include_directories(PUBLIC SYSTEM include) 15 | 16 | # Add line info to binaries to help with profiling 17 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") 18 | 19 | add_executable(main main.cu) 20 | -------------------------------------------------------------------------------- /coalescing/README.md: -------------------------------------------------------------------------------- 1 | # Coalescing 2 | 3 | Examples for measuring memory access efficiency using Nsight Compute. 4 | 5 | To really see this in Nsight Compute, you need at least 2019.5. 6 | Earlier versions do not show requested command line profiler metrics. 7 | 8 | 9 | From the Programming guide section 5 10 | Device memory can be accessed in 32B, 64B or 128B transactions. 
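Whether a warp's accesses coalesce is determined by which of these transactions the 32 threads' addresses fall into. As a concrete sketch, here is a simplified form of the `indirect` kernel from this example's `main.cu` (shown in full below):

```cuda
// off is an index array prepared on the host.
// If off is the identity (off[i] == i), consecutive threads touch consecutive
// elements, and a warp's 32 4B loads fall into few 32B sectors (coalesced).
// If off is a random permutation, each load may land in its own sector
// (uncoalesced).
template <typename T>
__global__ void indirect(T *p, const int *off, size_t n) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
    p[off[i]] += 1; // one indirect read-modify-write per thread
  }
}
```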
11 | 
12 | 
13 | CC 3.x:
14 | Global accesses are cached in L2, and may be cached in L1 if read-only.
15 | 
16 | It appears that memory requests are 128B.
17 | 
18 | A cache line is 128 bytes.
19 | If in L1 and L2, served with 128-byte transactions.
20 | If in L2 only, 32-byte transactions.
21 | 
22 | Each memory request is broken down into cache line requests that are issued independently. So, a 128B request may turn into some number of 128B or 32B cache line requests / transactions.
23 | 
24 | CC prior to 7.x: Read/Write is not cached in L1.
25 | CC 7.x: Read/Write is cached in L1.
26 | 
27 | *sector*: an aligned 32B region, the memory access granularity.
28 | 
29 | ## Documents
30 | * slides
31 |   * [Google Slides, view-only](https://docs.google.com/presentation/d/1L3o1mNqtXqHtBKidrKcfYeLx-AigNiptTtRODgUk9-I/edit?usp=sharing)
32 |   * [pdf](../slides/memory_access_efficiency.pdf)
33 | 
34 | ## Measuring Memory Coalescing with Nsight Compute
35 | 
36 | Coalescing means we want to minimize the number of memory or cache transactions needed to serve a request.
37 | 
38 | There are four `indirect` kernels executed in each iteration of this code:
39 | * float, coalesced
40 | * float, uncoalesced
41 | * double, coalesced
42 | * double, uncoalesced
43 | 
44 | Each thread increments one element in global memory.
45 | The access is indirect through an integer array, which is permuted on the host to control coalescing.
46 | 
47 | **On Pascal (GTX 1070, GP104, CC 6.1)**
48 | 
49 | GP104 caches global loads in L2 only (Pascal Tuning Guide 1.4.3.2).
50 | 
51 | On Pascal, this code makes `LDG.E` accesses, and L2 accesses are 32B.
52 | 
53 | 10,000 floats -> 40KB.
54 | 40 KB at 32B / request = 1250 requests.
55 | If coalesced, each 32B request is served by a single sector, so also 1250 sectors.
56 | If uncoalesced, each 32B request may be 8 separate 4B floats, each from a different sector, so up to 10K sectors.
57 | 
58 | 
59 | We can look at the "Source" page for the `ld.global.f32` instruction:
60 | 
61 | | | coalesced | uncoalesced |
62 | |-|-|-|
63 | | Instructions Executed | 313 | 313 |
64 | | Predicated-On Thread Instructions | 10000 | 10000 |
65 | | Memory Access Size | 32 | 32 |
66 | | Memory L2 Transactions Global | 1250 | 9874 |
67 | 
68 | A memory access size of 32 means each thread performs a 32-bit load rather than a 64-bit one.
69 | 10,000 floats at 32 threads/warp is 313 instructions.
70 | If each of those 313 instructions were 32 full loads, each would be a minimum of 4 32B L2 transactions, or 1252 transactions.
71 | We can tell some of those 313 instructions are not full, since 313 * 32 = 10,016 and only 10,000 threads were predicated on (thus, 1250).
72 | 
73 | In any case, we expect ~1250 transactions, which we observe for coalesced.
74 | For uncoalesced, we see 9874, or about 8x too many.
75 | So each instruction's 32 loads become ~32 L2 transactions: fully uncoalesced.
76 | 
77 | A similar analysis with `double`s (`ld.global.f64`) instead of floats:
78 | 
79 | | | coalesced | uncoalesced |
80 | |-|-|-|
81 | | Instructions Executed | 313 | 313 |
82 | | Predicated-On Thread Instructions | 10000 | 10000 |
83 | | Memory Access Size | 64 | 64 |
84 | | Memory L2 Transactions Global | 2500 | 9950 |
85 | 
86 | Each load instruction is now 32 8B loads, or a minimum of 8 32B L2 transactions.
87 | 313 * 8 transactions = 2504 transactions.
88 | Instead we observe nearly 4x that number.
89 | The uncoalescing is half as bad here (4x instead of 8x) because each 32B L2 transaction can only hold 4 doubles instead of 8 floats.
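The expected counts in these tables are simple arithmetic; here is a small stand-alone check of the numbers used above (assuming 10,000 elements, 32-thread warps, and 32B L2 transactions):

```c++
#include <cstdio>

int main() {
  const int n = 10000, warpSize = 32, sectorBytes = 32;
  // warp-level load instructions: ceil(10000 / 32) = 313
  printf("instructions: %d\n", (n + warpSize - 1) / warpSize);
  // fully coalesced: total bytes / 32B per transaction
  printf("f32 coalesced L2 transactions: %d\n", n * 4 / sectorBytes); // 1250
  printf("f64 coalesced L2 transactions: %d\n", n * 8 / sectorBytes); // 2500
  // fully uncoalesced worst case: one 32B transaction per element
  printf("uncoalesced worst case: %d\n", n); // ~10,000
  return 0;
}
```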
90 | 
91 | We can also consider the kernel as a whole:
92 | 
93 | Coalesced accesses:
94 | ```
95 | nv-nsight-cu-cli --kernel-id "::indirect:1" --metrics tex__texin_requests_global_ld_uncached,lts__request_tex_read_sectors_global_ld_uncached,lts__request_tex_read_bytes_global_ld_uncached main
96 | ```
97 | 
98 | ```
99 | [24874] main@127.0.0.1
100 |   indirect, 2020-Apr-22 09:04:29, Context 1, Stream 7
101 |     Section: Command line profiler metrics
102 |     ---------------------------------------------------------------------- --------------- ------------------------------
103 |     lts__request_tex_read_bytes_global_ld_uncached                                   Kbyte                             80
104 |     lts__request_tex_read_sectors_global_ld_uncached                                sector                          2,500
105 |     tex__texin_requests_global_ld_uncached                                                                          2,500
106 |     ---------------------------------------------------------------------- --------------- ------------------------------
107 | ```
108 | 
109 | Uncoalesced accesses:
110 | ```
111 | nv-nsight-cu-cli --kernel-id "::indirect:2" --metrics tex__texin_requests_global_ld_uncached,lts__request_tex_read_sectors_global_ld_uncached,lts__request_tex_read_bytes_global_ld_uncached main
112 | ```
113 | 
114 | ```
115 | [24760] main@127.0.0.1
116 |   indirect, 2020-Apr-22 09:03:40, Context 1, Stream 7
117 |     Section: Command line profiler metrics
118 |     ---------------------------------------------------------------------- --------------- ------------------------------
119 |     lts__request_tex_read_bytes_global_ld_uncached                                   Kbyte                         357.02
120 |     lts__request_tex_read_sectors_global_ld_uncached                                sector                         11,157
121 |     tex__texin_requests_global_ld_uncached                                                                          2,500
122 |     ---------------------------------------------------------------------- --------------- ------------------------------
123 | ```
124 | 
125 | If coalesced: 1250 + 1250 = 2500 32B requests (index loads + float loads),
126 | and 1250 + 1250 = 2500 sectors.
127 | 
128 | If not: still 1250 + 1250 = 2500 32B requests,
129 | but 1250 + 10K = a maximum of 11250 sectors.
130 | 
131 | 
132 | **On Volta (Titan V, CC 7.0)**
133 | 
134 | On Volta, global memory loads are cached in L1.
135 | 
136 | On Volta, this code makes `LDG.E.SYS` accesses, so we get 128B L1 requests and 32B L2 requests.
137 | 
138 | Again, we can look at the "Source" page for the `ld.global.f32` instruction:
139 | 
140 | | | coalesced | uncoalesced |
141 | |-|-|-|
142 | | Instructions Executed | 313 | 313 |
143 | | Memory Access Size | 32 | 32 |
144 | | Memory L1 Transactions Global | 313 | 9515 |
145 | | Memory L2 Transactions Global | 1250 | 9885 |
146 | 
147 | Similarly, 313 instructions.
148 | When coalesced, this results in 313 L1 requests.
149 | If uncoalesced, each instruction's 128B request may ultimately be 32 separate 4B floats, each from a different sector, so up to 10K sectors / transactions.
150 | 
151 | When coalesced, this is ~313 * 4 = 1250 L2 requests.
152 | If uncoalesced, each instruction's 128B request may ultimately be 32 separate 4B floats, each from a different sector, so up to 10K sectors / transactions.
153 | 
154 | We can also consider the `ld.global.f64` version:
155 | 
156 | | | coalesced | uncoalesced |
157 | |-|-|-|
158 | | Instructions Executed | 313 | 313 |
159 | | Memory Access Size | 64 | 64 |
160 | | Memory L1 Transactions Global | 625 | 9740 |
161 | | Memory L2 Transactions Global | 2500 | 9947 |
162 | 
163 | Now, each instruction (32 8B loads) generates two 128B L1 transactions (coalesced), or up to 32 128B L1 transactions (uncoalesced).
164 | 
165 | Now, each instruction (32 8B loads) generates 8 32B L2 transactions (coalesced), or up to 32 32B L2 transactions (uncoalesced).
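Note that these Volta metric names differ from the Pascal-era names above, and exact spellings vary with the Nsight Compute version. To see which metrics your device exposes, you can filter the full metric query (the `grep` filter here is just one way to narrow the list):

```bash
nv-nsight-cu-cli --devices 0 --query-metrics | grep l1tex__t_
```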
166 | 
167 | We can also consider the entire kernel.
168 | 
169 | The coalesced results:
170 | ```
171 | nv-nsight-cu-cli --kernel-id "::indirect:1" --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum main
172 | ```
173 | 
174 | ```
175 | [362] main@127.0.0.1
176 |   indirect(float*,int*,unsigned long), 2020-Apr-22 14:25:36, Context 1, Stream 7
177 |     Section: Command line profiler metrics
178 |     ---------------------------------------------------------------------- --------------- ------------------------------
179 |     l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum                                request                            626
180 |     l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum                                 sector                            2500
181 |     ---------------------------------------------------------------------- --------------- ------------------------------
182 | ```
183 | 
184 | The uncoalesced results:
185 | ```
186 | nv-nsight-cu-cli --kernel-id "::indirect:2" --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum main
187 | ```
188 | 
189 | ```
190 | [387] main@127.0.0.1
191 |   indirect(float*,int*,unsigned long), 2020-Apr-22 14:25:36, Context 1, Stream 7
192 |     Section: Command line profiler metrics
193 |     ---------------------------------------------------------------------- --------------- ------------------------------
194 |     l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum                                request                            626
195 |     l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum                                 sector                           10093
196 |     ---------------------------------------------------------------------- --------------- ------------------------------
197 | ```
198 | 
199 | 
200 | 
201 | 
202 | ## Instruction Profiling
203 | 
204 | Compare `Instructions Executed`, `L1 Transactions`, and `L2 Transactions`.
205 | 
206 | ## Resources
207 | 
208 | This content is inspired by https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/
209 | 
210 | Metric names: https://docs.nvidia.com/cupti/Cupti/r_main.html#r_host_derived_metrics_api
211 | 
--------------------------------------------------------------------------------
/coalescing/common.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <chrono>  // Clock and Duration typedefs below
4 | #include <cstdio>  // fprintf
5 | #include <cstdlib> // exit
6 | 
7 | #ifdef __CUDACC__
8 | inline void checkCuda(cudaError_t result, const char *file, const int line) {
9 |   if (result != cudaSuccess) {
10 |     fprintf(stderr, "%s@%d: CUDA Runtime Error(%d): %s\n", file, line,
11 |             int(result), cudaGetErrorString(result));
12 |     exit(-1);
13 |   }
14 | }
15 | 
16 | #define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);
17 | #endif
18 | 
19 | typedef std::chrono::high_resolution_clock Clock;
20 | typedef std::chrono::duration<double> Duration;
--------------------------------------------------------------------------------
/coalescing/include/argparse/argparse.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <iostream>
4 | #include <sstream>
5 | #include <string>
6 | #include <vector>
7 | 
8 | namespace argparse {
9 | 
10 | class OptionBase {
11 | public:
12 |   virtual void set_val(const std::string &valStr) = 0;
13 |   virtual const std::string &long_str() = 0;
14 | };
15 | 
16 | template <typename T> class Option : public OptionBase {
17 |   std::string long_;
18 |   T *val_;
19 | 
20 | public:
21 |   Option(T &val, const std::string &l) : long_(l), val_(&val) {}
22 |   void set_val(const std::string &val) override { set_val((T *)nullptr, val); }
23 |   const std::string &long_str() override { return long_; }
24 | 
25 | private:
26 |   void set_val(size_t *, const std::string
&val) { // convert to size_t 27 | *val_ = std::stoull(val); 28 | } 29 | void set_val(double *, const std::string &val) { // convert to double 30 | *val_ = std::stod(val); 31 | } 32 | void set_val(float *, const std::string &val) { // convert to float 33 | *val_ = std::stof(val); 34 | } 35 | void set_val(int *, const std::string &val) { // convert to int 36 | *val_ = std::stoi(val); 37 | } 38 | void set_val(std::string *, const std::string &val) { // convert to string 39 | *val_ = val; 40 | } 41 | }; 42 | 43 | class Flag { 44 | std::string long_; 45 | std::string short_; 46 | std::string help_; 47 | bool *val_; 48 | 49 | public: 50 | Flag(bool &val, const std::string &l, const std::string &s) 51 | : long_(l), short_(s), val_(&val) {} 52 | 53 | const std::string &long_str() const noexcept { return long_; } 54 | const std::string &short_str() const noexcept { return short_; } 55 | 56 | void set() const noexcept { *val_ = true; } 57 | 58 | void help(const std::string &s) { help_ = s; } 59 | 60 | const std::string &help_str() const noexcept { return help_; } 61 | }; 62 | 63 | class PosnlBase { 64 | public: 65 | virtual bool is_required() = 0; 66 | virtual PosnlBase *required() = 0; 67 | virtual void set_val(const std::string &val) = 0; 68 | virtual bool found() = 0; 69 | }; 70 | 71 | template class Positional : public PosnlBase { 72 | bool required_; 73 | T *val_; 74 | bool found_; 75 | 76 | public: 77 | Positional(T &val) : required_(false), val_(&val), found_(false) {} 78 | 79 | PosnlBase *required() override { 80 | required_ = true; 81 | return this; 82 | } 83 | 84 | bool is_required() override { return required_; } 85 | 86 | // use nullpointer type to disambiguate call 87 | // https://stackoverflow.com/questions/5512910/explicit-specialization-of-template-class-member-function 88 | void set_val(const std::string &val) { 89 | found_ = true; 90 | set_val((T *)nullptr, val); 91 | } 92 | 93 | bool found() override { return found_; } 94 | 95 | private: 96 | // https://stackoverflow.com/questions/5512910/explicit-specialization-of-template-class-member-function 97 | template 98 | void get_as(C *, const std::string &val) { // to be overridden 99 | } 100 | void set_val(size_t *, const std::string &val) { // convert to size_t 101 | *val_ = std::stoull(val); 102 | } 103 | void set_val(double *, const std::string &val) { // convert to double 104 | *val_ = std::stod(val); 105 | } 106 | void set_val(float *, const std::string &val) { // convert to float 107 | *val_ = std::stof(val); 108 | } 109 | void set_val(int *, const std::string &val) { // convert to int 110 | *val_ = std::stoi(val); 111 | } 112 | void set_val(std::string *, const std::string &val) { // convert to string 113 | *val_ = val; 114 | } 115 | }; 116 | 117 | class Parser { 118 | 119 | std::string description_; 120 | bool noUnrecognized_; // error on unrecognized flags / opts 121 | bool help_; // help has been requested 122 | bool consume_; // remove consumed values from argc, argv 123 | 124 | std::vector opts_; 125 | std::vector flags_; 126 | std::vector posnls_; 127 | 128 | static bool starts_with(const std::string &s, const std::string &prefix) { 129 | if (s.rfind(prefix, 0) == 0) { 130 | return true; 131 | } 132 | return false; 133 | } 134 | 135 | OptionBase *match_opt(const char *arg) const { 136 | std::string sarg(arg); 137 | for (int64_t i = int64_t(opts_.size()) - 1; i >= 0; --i) { 138 | if (opts_[i]->long_str() == sarg) { 139 | return opts_[i]; 140 | } 141 | } 142 | return nullptr; 143 | } 144 | 145 | Flag *match_flag(const char 
*arg) { 146 | std::string sarg(arg); 147 | for (int64_t i = int64_t(flags_.size()) - 1; i >= 0; --i) { 148 | if (flags_[i].long_str() == sarg || flags_[i].short_str() == sarg) { 149 | return &flags_[i]; 150 | } 151 | } 152 | return nullptr; 153 | } 154 | 155 | public: 156 | Parser() : noUnrecognized_(false), help_(false), consume_(true) { 157 | add_flag(help_, "--help", "-h")->help("Print help message"); 158 | } 159 | Parser(const std::string &description) 160 | : description_(description), noUnrecognized_(false), help_(false), 161 | consume_(true) { 162 | add_flag(help_, "--help", "-h")->help("Print help message"); 163 | } 164 | 165 | bool parse(int &argc, char **argv) { 166 | 167 | std::vector newArgv; 168 | if (argc > 0) { 169 | newArgv.push_back(argv[0]); 170 | } 171 | 172 | size_t pi = 0; // positional argument position 173 | bool optsOkay = true; // okay to interpret as opt/flag 174 | for (int i = 1; i < argc; ++i) { 175 | 176 | // try interpreting as a flag or option if it looks like one 177 | if (optsOkay && starts_with(argv[i], "-")) { 178 | // '--' indicates only positional arguments follow 179 | if (argv[i] == std::string("--")) { 180 | optsOkay = false; 181 | continue; 182 | } 183 | OptionBase *opt = match_opt(argv[i]); 184 | if (opt) { 185 | opt->set_val(argv[i + 1]); 186 | ++i; 187 | continue; 188 | } 189 | Flag *flag = match_flag(argv[i]); 190 | if (flag) { 191 | flag->set(); 192 | continue; 193 | } 194 | newArgv.push_back(argv[i]); 195 | if (noUnrecognized_) { 196 | std::cerr << "unrecognized " << argv[i] << "\n"; 197 | return false; 198 | } 199 | } else { // otherwise try it as positional 200 | if (pi < posnls_.size()) { 201 | posnls_[pi]->set_val(argv[i]); 202 | ++pi; 203 | } else { 204 | newArgv.push_back(argv[i]); 205 | std::cerr << "encountered unexpected positional argument " << pi 206 | << ": " << argv[i] << "\n"; 207 | } 208 | } 209 | } 210 | 211 | for (; pi < posnls_.size(); ++pi) { 212 | if (posnls_[pi]->is_required()) { 213 | std::cerr << "missing required positional argument " << pi << "\n"; 214 | return false; 215 | } 216 | } 217 | 218 | if (consume_) { 219 | argc = newArgv.size(); 220 | for (int i = 0; i < argc; ++i) { 221 | argv[i] = newArgv[i]; 222 | } 223 | } 224 | 225 | return true; 226 | }; 227 | 228 | template void add_option(T &val, const std::string &l) { 229 | opts_.push_back(new Option(val, l)); 230 | } 231 | 232 | Flag *add_flag(bool &val, const std::string &l, const std::string &s = "") { 233 | flags_.push_back(Flag(val, l, s)); 234 | return &(flags_.back()); 235 | } 236 | 237 | template PosnlBase *add_positional(T &val) { 238 | posnls_.push_back(new Positional(val)); 239 | return posnls_.back(); 240 | } 241 | 242 | std::string help() const { 243 | std::stringstream ss; 244 | 245 | ss << description_ << "\n"; 246 | 247 | for (auto &o : opts_) { 248 | ss << o->long_str() << "\n"; 249 | } 250 | 251 | for (auto &f : flags_) { 252 | ss << " " << f.short_str() << ", " << f.long_str(); 253 | ss << "\t\t" << f.help_str(); 254 | ss << "\n"; 255 | } 256 | 257 | return ss.str(); 258 | } 259 | 260 | /*! \brief error on unrecognized flags and options 261 | */ 262 | void no_unrecognized() { noUnrecognized_ = true; } 263 | 264 | /*! 
\brief don't modify argc/argv
265 |   */
266 |   void no_consume() { consume_ = false; }
267 | 
268 |   bool need_help() const noexcept { return help_; }
269 | };
270 | 
271 | } // namespace argparse
--------------------------------------------------------------------------------
/coalescing/main.cu:
--------------------------------------------------------------------------------
1 | #include <algorithm> // std::shuffle
2 | #include <iostream>  // std::cerr
3 | #include <numeric>   // std::iota
4 | #include <random>    // std::mt19937, std::random_device
5 | #include <vector>
6 | 
7 | #include <argparse/argparse.hpp>
8 | 
9 | #include "common.hpp"
10 | 
11 | template <typename T>
12 | __global__ void indirect(T *p, const int *off, const size_t n) {
13 |   int tid = blockDim.x * blockIdx.x + threadIdx.x;
14 | 
15 |   for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
16 |     int idx = off[i];
17 |     T f = p[idx];
18 |     f += 1;
19 |     p[idx] = f;
20 |   }
21 | }
22 | 
23 | int main(int argc, char **argv) {
24 | 
25 |   argparse::Parser parser;
26 | 
27 |   int n = 10000;
28 |   int nIters = 5;
29 |   int nWarmup = 5;
30 |   parser.add_positional(n);
31 | 
32 |   if (!parser.parse(argc, argv)) {
33 |     std::cerr << parser.help();
34 |     exit(EXIT_FAILURE);
35 |   }
36 | 
37 |   // generate access patterns: cHost is the identity (coalesced),
38 |   // uHost is a random permutation (uncoalesced)
39 |   std::vector<int> cHost(n), uHost(n);
40 |   std::iota(cHost.begin(), cHost.end(), 0);
41 |   std::iota(uHost.begin(), uHost.end(), 0);
42 |   std::shuffle(uHost.begin(), uHost.end(), std::mt19937{std::random_device{}()});
43 | 
44 |   // allocate device data
45 |   float *fDev;
46 |   double *dDev;
47 |   int *cDev, *uDev;
48 |   CUDA_RUNTIME(cudaMalloc(&fDev, n * sizeof(*fDev)));
49 |   CUDA_RUNTIME(cudaMalloc(&dDev, n * sizeof(*dDev)));
50 |   CUDA_RUNTIME(cudaMalloc(&cDev, n * sizeof(int)));
51 |   CUDA_RUNTIME(cudaMalloc(&uDev, n * sizeof(int)));
52 | 
53 |   // copy indices to device
54 |   CUDA_RUNTIME(cudaMemcpy(cDev, cHost.data(), cHost.size() * sizeof(int), cudaMemcpyDefault));
55 |   CUDA_RUNTIME(cudaMemcpy(uDev, uHost.data(), uHost.size() * sizeof(int), cudaMemcpyDefault));
56 | 
57 |   // GPU kernel launch parameters
58 |   dim3 dimBlock(512, 1, 1);
59 |   dim3 dimGrid(1, 1, 1);
60 |   dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
61 | 
62 |   for (int i = 0; i < nIters + nWarmup; ++i) {
63 |     indirect<<<dimGrid, dimBlock>>>(fDev, cDev, n); // 1: float, coalesced
64 |     CUDA_RUNTIME(cudaDeviceSynchronize());
65 |     indirect<<<dimGrid, dimBlock>>>(fDev, uDev, n); // 2: float, uncoalesced
66 |     CUDA_RUNTIME(cudaDeviceSynchronize());
67 |     indirect<<<dimGrid, dimBlock>>>(dDev, cDev, n); // 3: double, coalesced
68 |     CUDA_RUNTIME(cudaDeviceSynchronize());
69 |     indirect<<<dimGrid, dimBlock>>>(dDev, uDev, n); // 4: double, uncoalesced
70 |     CUDA_RUNTIME(cudaDeviceSynchronize());
71 |   }
72 | 
73 |   CUDA_RUNTIME(cudaFree(fDev));
74 |   CUDA_RUNTIME(cudaFree(dDev));
75 |   CUDA_RUNTIME(cudaFree(cDev));
76 |   CUDA_RUNTIME(cudaFree(uDev));
77 | 
78 |   return 0;
79 | }
--------------------------------------------------------------------------------
/coalescing/rai_build.yml:
--------------------------------------------------------------------------------
1 | rai:
2 |   version: 0.2
3 |   image: cwpearson/nvidia-performance-tools:amd64-10.1-master-c4d1bb1
4 | resources:
5 |   cpu:
6 |     architecture: amd64
7 |   gpu:
8 |     count: 1
9 |   network: false
10 | cache: false
11 | commands:
12 |   build:
13 |     - which nv-nsight-cu-cli
14 |     - nv-nsight-cu-cli --version
15 |     - nvidia-smi
16 |     - cp -r /src .
17 | - cmake /src -DCMAKE_BUILD_TYPE=Release 18 | - make 19 | - echo "Check Nsight Configurations" 20 | - bash -c "nv-nsight-cu-cli --devices 0 --query-metrics --query-metrics-mode all > metrics.txt" 21 | - echo "Nsight Compute Results" 22 | - nv-nsight-cu-cli --kernel-id "::indirect:1" --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum main | tee coalesced.txt 23 | - nv-nsight-cu-cli --kernel-id "::indirect:2" --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum main | tee uncoalesced.txt 24 | - nv-nsight-cu-cli --kernel-id "::indirect:1" --section ".*" -o f32_coalesced main 25 | - nv-nsight-cu-cli --kernel-id "::indirect:2" --section ".*" -o f32_uncoalesced main 26 | - nv-nsight-cu-cli --kernel-id "::indirect:3" --section ".*" -o f64_coalesced main 27 | - nv-nsight-cu-cli --kernel-id "::indirect:4" --section ".*" -o f64_uncoalesced main 28 | -------------------------------------------------------------------------------- /ppc64le_10-1.dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda-ppc64le:10.1-devel-ubuntu18.04 2 | 3 | # Set one or more individual labels 4 | LABEL maintainer="Carl Pearson" 5 | 6 | # prevent prompts during apt-get 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | RUN apt-get update && apt-get install -y -q --no-install-recommends --no-install-suggests \ 10 | cmake \ 11 | libglib2.0 \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | COPY test.cu . 15 | COPY nsight-compute-PPC64LE-2019.5.0.14-27346997.run nsight_compute.run 16 | COPY NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb nsight_systems.deb 17 | 18 | # install Nsight Compute 19 | # install script seems to want TERM set 20 | RUN chmod +x nsight_compute.run 21 | RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/opt/NVIDIA-Nsight-Compute 22 | ENV PATH /opt/NVIDIA-Nsight-Compute:${PATH} 23 | # work around rai jobs having a different path than docker run 24 | RUN ln -s /opt/NVIDIA-Nsight-Compute/nv-nsight-cu-cli /usr/local/bin/nv-nsight-cu-cli 25 | RUN rm nsight_compute.run 26 | 27 | # install Nsight Systems 28 | RUN dpkg -i nsight_systems.deb 29 | RUN rm nsight_systems.deb 30 | -------------------------------------------------------------------------------- /sgemm/1_1_pinned_basic.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | /* NOTE: A and C are column major, B is row major 10 | */ 11 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 116 | CUDA_RUNTIME(cudaEventRecord(stop)); 117 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 118 | nvtxRangePop(); 119 | 120 | // check result once 121 | if (check && 0 == i) { 122 | // copy result to host 123 | CUDA_RUNTIME( 124 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 125 | 126 | // check result on host 127 | cpu_gemm(cExpected, aHost, bHost, m, n, k); 128 | 129 | for (size_t i = 0; i < m * n; ++i) { 130 | if (!equal(cExpected[i], cHost[i], 1e-6)) { 131 | std::cout << "Error!\n"; 132 | exit(EXIT_FAILURE); 133 | } 134 | } 135 | } 136 | 137 | float millis; 138 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 139 | std::cout << i << ": " << millis << (i >= nWarmup ? 
" *" : " ") << "\n"; 140 | 141 | // record time after warmup runs 142 | if (i >= nWarmup) { 143 | elapsed += millis; 144 | } 145 | } 146 | 147 | // print results 148 | double gflops = flop / ((elapsed / nIters) / 1000) / 1e9; 149 | std::cout << "kernel " << gflops << "GFLOPS (" << flop << " flop, " 150 | << (elapsed / nIters) / 1000 << "s)\n"; 151 | 152 | // release resources 153 | CUDA_RUNTIME(cudaEventDestroy(start)); 154 | CUDA_RUNTIME(cudaEventDestroy(stop)); 155 | CUDA_RUNTIME(cudaFree(aDev)); 156 | CUDA_RUNTIME(cudaFree(bDev)); 157 | CUDA_RUNTIME(cudaFree(cDev)); 158 | CUDA_RUNTIME(cudaFreeHost(aHost)); 159 | CUDA_RUNTIME(cudaFreeHost(bHost)); 160 | CUDA_RUNTIME(cudaFreeHost(cHost)); 161 | CUDA_RUNTIME(cudaFreeHost(cExpected)); 162 | return 0; 163 | } -------------------------------------------------------------------------------- /sgemm/1_2_pinned_tiled.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | #define TILE_WIDTH 32 10 | 11 | /* NOTE: A and C are column major, B is row major 12 | */ 13 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 136 | CUDA_RUNTIME(cudaEventRecord(stop)); 137 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 138 | 139 | // check result once 140 | if (check && 0 == i) { 141 | // copy result to host 142 | CUDA_RUNTIME( 143 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 144 | 145 | // check result on host 146 | cpu_gemm(cExpected, aHost, bHost, m, n, k); 147 | 148 | for (size_t i = 0; i < m * n; ++i) { 149 | if (!equal(cExpected[i], cHost[i], 1e-6)) { 150 | std::cout << "Error!\n"; 151 | exit(EXIT_FAILURE); 152 | } 153 | } 154 | } 155 | 156 | float millis; 157 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 158 | std::cout << i << ": " << millis << (i >= nWarmup ? 
" *" : " ") << "\n"; 159 | 160 | // record time after warmup runs 161 | if (i >= nWarmup) { 162 | elapsed += millis; 163 | } 164 | } 165 | 166 | // print results 167 | double gflops = flop / ((elapsed / nIters) / 1000) / 1e9; 168 | std::cout << "kernel " << gflops << "GFLOPS (" << flop << " flop, " 169 | << (elapsed / nIters) / 1000 << "s)\n"; 170 | 171 | // release resources 172 | CUDA_RUNTIME(cudaEventDestroy(start)); 173 | CUDA_RUNTIME(cudaEventDestroy(stop)); 174 | CUDA_RUNTIME(cudaFree(aDev)); 175 | CUDA_RUNTIME(cudaFree(bDev)); 176 | CUDA_RUNTIME(cudaFree(cDev)); 177 | CUDA_RUNTIME(cudaFreeHost(aHost)); 178 | CUDA_RUNTIME(cudaFreeHost(bHost)); 179 | CUDA_RUNTIME(cudaFreeHost(cHost)); 180 | CUDA_RUNTIME(cudaFreeHost(cExpected)); 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /sgemm/1_3_pinned_joint.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | #define TILE_SZ_A 64 10 | #define TILE_SZ_B 16 11 | #define TILE_SZ_RATIO (TILE_SZ_A / TILE_SZ_B) 12 | 13 | /* NOTE: A and C are column major, B is row major 14 | */ 15 | __global__ void mygemm(float * __restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 155 | CUDA_RUNTIME(cudaEventRecord(stop)); 156 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 157 | 158 | // check result once 159 | if (check && 0 == i) { 160 | // copy result to host 161 | CUDA_RUNTIME( 162 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 163 | 164 | // check result on host 165 | cpu_gemm(cExpected, aHost, bHost, m, n, k); 166 | 167 | for (size_t i = 0; i < m * n; ++i) { 168 | if (!equal(cExpected[i], cHost[i], 1e-6)) { 169 | std::cout << "Error!\n"; 170 | exit(EXIT_FAILURE); 171 | } 172 | } 173 | } 174 | 175 | float millis; 176 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 177 | std::cout << i << ": " << millis << (i >= nWarmup ? 
" *" : " ") << "\n"; 178 | 179 | // record time after warmup runs 180 | if (i >= nWarmup) { 181 | elapsed += millis; 182 | } 183 | } 184 | 185 | // print results 186 | double gflops = flop / ((elapsed / nIters) / 1000) / 1e9; 187 | std::cout << "kernel " << gflops << "GFLOPS (" << flop << " flop, " 188 | << (elapsed / nIters) / 1000 << "s)\n"; 189 | 190 | // release resources 191 | CUDA_RUNTIME(cudaEventDestroy(start)); 192 | CUDA_RUNTIME(cudaEventDestroy(stop)); 193 | CUDA_RUNTIME(cudaFree(aDev)); 194 | CUDA_RUNTIME(cudaFree(bDev)); 195 | CUDA_RUNTIME(cudaFree(cDev)); 196 | CUDA_RUNTIME(cudaFreeHost(aHost)); 197 | CUDA_RUNTIME(cudaFreeHost(bHost)); 198 | CUDA_RUNTIME(cudaFreeHost(cHost)); 199 | CUDA_RUNTIME(cudaFreeHost(cExpected)); 200 | return 0; 201 | } 202 | -------------------------------------------------------------------------------- /sgemm/2_1_pageable_basic.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | /* NOTE: A and C are column major, B is row major 10 | */ 11 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 114 | CUDA_RUNTIME(cudaEventRecord(stop)); 115 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 116 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 117 | 118 | // copy data back to host 119 | nvtxRangePush("device-to-host"); 120 | CUDA_RUNTIME( 121 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 122 | nvtxRangePop(); 123 | CUDA_RUNTIME(cudaDeviceSynchronize()); 124 | 125 | Duration wallElapsed = Clock::now() - wallStart; 126 | 127 | std::cout << iter << " kernel=" << millis / 1000 128 | << " wall=" << wallElapsed.count() 129 | << (iter >= nWarmup ? " *" : " ") << "\n"; 130 | 131 | // track time if no longer during warmup 132 | if (iter >= nWarmup) { 133 | wallTime += wallElapsed.count(); 134 | kernelTime += millis / 1000; // seconds 135 | } 136 | } 137 | 138 | // print results 139 | double kernelGflops = flop / 1e9 / kernelTime; 140 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 141 | << kernelTime << "s)\n"; 142 | double wallGflops = flop / 1e9 / wallTime; 143 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 144 | << wallTime << "s)\n"; 145 | // release resources 146 | CUDA_RUNTIME(cudaEventDestroy(start)); 147 | CUDA_RUNTIME(cudaEventDestroy(stop)); 148 | CUDA_RUNTIME(cudaFree(aDev)); 149 | CUDA_RUNTIME(cudaFree(bDev)); 150 | CUDA_RUNTIME(cudaFree(cDev)); 151 | delete[] aHost; 152 | delete[] bHost; 153 | delete[] cHost; 154 | return 0; 155 | } -------------------------------------------------------------------------------- /sgemm/2_2_pinned_basic.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include "common.hpp" 8 | 9 | /* NOTE: A and C are column major, B is row major 10 | */ 11 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 114 | CUDA_RUNTIME(cudaEventRecord(stop)); 115 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 116 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 117 | 118 | // copy data back to host 119 | nvtxRangePush("device-to-host"); 120 | CUDA_RUNTIME( 121 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 122 | nvtxRangePop(); 123 | CUDA_RUNTIME(cudaDeviceSynchronize()); 124 | 125 | Duration wallElapsed = Clock::now() - wallStart; 126 | 127 | std::cout << iter << " 
kernel=" << millis / 1000 128 | << " wall=" << wallElapsed.count() 129 | << (iter >= nWarmup ? " *" : " ") << "\n"; 130 | 131 | // track time if no longer during warmup 132 | if (iter >= nWarmup) { 133 | wallTime += wallElapsed.count(); 134 | kernelTime += millis / 1000; // seconds 135 | } 136 | } 137 | 138 | // print results 139 | double kernelGflops = flop / 1e9 / kernelTime; 140 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 141 | << kernelTime << "s)\n"; 142 | double wallGflops = flop / 1e9 / wallTime; 143 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 144 | << wallTime << "s)\n"; 145 | // release resources 146 | CUDA_RUNTIME(cudaEventDestroy(start)); 147 | CUDA_RUNTIME(cudaEventDestroy(stop)); 148 | CUDA_RUNTIME(cudaFree(aDev)); 149 | CUDA_RUNTIME(cudaFree(bDev)); 150 | CUDA_RUNTIME(cudaFree(cDev)); 151 | CUDA_RUNTIME(cudaFreeHost(aHost)); 152 | CUDA_RUNTIME(cudaFreeHost(bHost)); 153 | CUDA_RUNTIME(cudaFreeHost(cHost)); 154 | return 0; 155 | } -------------------------------------------------------------------------------- /sgemm/2_3_pinned_tiled.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "common.hpp" 9 | 10 | #define TILE_WIDTH 32 11 | 12 | /* NOTE: A and C are column major, B is row major 13 | */ 14 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 136 | CUDA_RUNTIME(cudaEventRecord(stop)); 137 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 138 | CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop)); 139 | 140 | // copy data back to host 141 | nvtxRangePush("device-to-host"); 142 | CUDA_RUNTIME( 143 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 144 | nvtxRangePop(); 145 | CUDA_RUNTIME(cudaDeviceSynchronize()); 146 | 147 | Duration wallElapsed = Clock::now() - wallStart; 148 | 149 | std::cout << iter << " kernel=" << millis / 1000 150 | << " wall=" << wallElapsed.count() 151 | << (iter >= nWarmup ? 
" *" : " ") << "\n"; 152 | 153 | // track time if no longer during warmup 154 | if (iter >= nWarmup) { 155 | wallTime += wallElapsed.count(); 156 | kernelTime += millis / 1000; // seconds 157 | } 158 | } 159 | 160 | // print results 161 | double kernelGflops = flop / 1e9 / kernelTime; 162 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 163 | << kernelTime << "s)\n"; 164 | double wallGflops = flop / 1e9 / wallTime; 165 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 166 | << wallTime << "s)\n"; 167 | // release resources 168 | CUDA_RUNTIME(cudaEventDestroy(start)); 169 | CUDA_RUNTIME(cudaEventDestroy(stop)); 170 | CUDA_RUNTIME(cudaFree(aDev)); 171 | CUDA_RUNTIME(cudaFree(bDev)); 172 | CUDA_RUNTIME(cudaFree(cDev)); 173 | CUDA_RUNTIME(cudaFreeHost(aHost)); 174 | CUDA_RUNTIME(cudaFreeHost(bHost)); 175 | CUDA_RUNTIME(cudaFreeHost(cHost)); 176 | return 0; 177 | } 178 | -------------------------------------------------------------------------------- /sgemm/2_4_pinned_tiled_overlap.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "common.hpp" 9 | 10 | #define TILE_WIDTH 32 11 | 12 | /* NOTE: A and C are column major, B is row major 13 | */ 14 | __global__ void mygemm(float *__restrict__ c, //>>(cDev[0][0], aDev[0], bDev[0], 158 | m / 2, n / 2, k); 159 | CUDA_RUNTIME(cudaEventRecord(waitC[0][0], kernelStream)); 160 | 161 | // copy a1 162 | CUDA_RUNTIME(cudaMemcpyAsync(aDev[1], aHost[1], m / 2 * k * sizeof(float), 163 | cudaMemcpyDefault, copyStream)); 164 | CUDA_RUNTIME(cudaEventRecord(waitForA1, kernelStream)); 165 | 166 | // launch c[1][0] = a[1] * b[0] after a[1] is on the GPU 167 | CUDA_RUNTIME(cudaStreamWaitEvent(kernelStream, waitForA1, 0)); 168 | mygemm<<>>(cDev[1][0], aDev[1], bDev[0], 169 | m / 2, n / 2, k); 170 | CUDA_RUNTIME(cudaEventRecord(waitC[1][0], kernelStream)); 171 | 172 | // copy b1 173 | CUDA_RUNTIME(cudaMemcpyAsync(bDev[1], bHost[1], k * n / 2 * sizeof(float), 174 | cudaMemcpyDefault, copyStream)); 175 | CUDA_RUNTIME(cudaEventRecord(waitForB1, kernelStream)); 176 | 177 | // launch c[0][1] = a[0] * b[1] after B1 is on the GPU 178 | CUDA_RUNTIME(cudaStreamWaitEvent(kernelStream, waitForB1, 0)); 179 | mygemm<<>>(cDev[0][1], aDev[0], bDev[1], 180 | m / 2, n / 2, k); 181 | CUDA_RUNTIME(cudaEventRecord(waitC[0][1], kernelStream)); 182 | 183 | // launch c[1][1] = a[1] * b[1] 184 | mygemm<<>>(cDev[1][1], aDev[1], bDev[1], 185 | m / 2, n / 2, k); 186 | CUDA_RUNTIME(cudaEventRecord(waitC[1][1], kernelStream)); 187 | 188 | // copy c back to CPU as kernels finish 189 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[0][0], 0)); 190 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[0][0], cDev[0][0], 191 | m / 2 * n / 2 * sizeof(float), 192 | cudaMemcpyDefault, copyStream)); 193 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[1][0], 0)); 194 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[1][0], cDev[1][0], 195 | m / 2 * n / 2 * sizeof(float), 196 | cudaMemcpyDefault, copyStream)); 197 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[0][1], 0)); 198 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[0][1], cDev[0][1], 199 | m / 2 * n / 2 * sizeof(float), 200 | cudaMemcpyDefault, copyStream)); 201 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[1][1], 0)); 202 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[1][1], cDev[1][1], 203 | m / 2 * n / 2 * sizeof(float), 204 | cudaMemcpyDefault, copyStream)); 205 | 206 | CUDA_RUNTIME(cudaDeviceSynchronize()); 207 | 
nvtxRangePop(); // wall time 208 | Duration wallElapsed = Clock::now() - wallStart; 209 | 210 | // kernel time 211 | float kernelElapsed; 212 | CUDA_RUNTIME(cudaEventSynchronize(waitC[1][1])); 213 | CUDA_RUNTIME(cudaEventElapsedTime(&kernelElapsed, waitForA0B0, waitC[1][1])); 214 | kernelElapsed /= 1000; // seconds 215 | 216 | std::cout << iter << " kernel=" << kernelElapsed 217 | << " wall=" << wallElapsed.count() 218 | << (iter >= nWarmup ? " *" : " ") << "\n"; 219 | 220 | if (iter >= nWarmup) { 221 | wallTime += wallElapsed.count(); 222 | kernelTime += kernelElapsed; 223 | } 224 | } 225 | 226 | // print results 227 | double kernelGflops = flop / 1e9 / kernelTime; 228 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 229 | << kernelTime << "s)\n"; 230 | double wallGflops = flop / 1e9 / wallTime; 231 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 232 | << wallTime << "s)\n"; 233 | // release resources 234 | 235 | CUDA_RUNTIME(cudaFree(aDev[0])); 236 | CUDA_RUNTIME(cudaFree(aDev[1])); 237 | CUDA_RUNTIME(cudaFree(bDev[0])); 238 | CUDA_RUNTIME(cudaFree(bDev[1])); 239 | return 0; 240 | } 241 | -------------------------------------------------------------------------------- /sgemm/2_5_pinned_joint.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "common.hpp" 9 | 10 | #define TILE_SZ_A 64 11 | #define TILE_SZ_B 16 12 | #define TILE_SZ_RATIO (TILE_SZ_A / TILE_SZ_B) 13 | 14 | /* NOTE: A and C are column major, B is row major 15 | */ 16 | __global__ void mygemm(float *__restrict__ c, //>>(cDev, aDev, bDev, m, n, k); 155 | CUDA_RUNTIME(cudaEventRecord(stop)); 156 | CUDA_RUNTIME(cudaEventSynchronize(stop)); 157 | CUDA_RUNTIME(cudaEventElapsedTime(&kernelElapsed, start, stop)); 158 | kernelElapsed /= 1000; // seconds 159 | 160 | // copy data back to host 161 | nvtxRangePush("device-to-host"); 162 | CUDA_RUNTIME( 163 | cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault)); 164 | nvtxRangePop(); 165 | CUDA_RUNTIME(cudaDeviceSynchronize()); 166 | 167 | Duration wallElapsed = Clock::now() - wallStart; 168 | 169 | std::cout << iter << " kernel=" << kernelElapsed 170 | << " wall=" << wallElapsed.count() 171 | << (iter >= nWarmup ? 
" *" : " ") << "\n"; 172 | 173 | // track time if no longer during warmup 174 | if (iter >= nWarmup) { 175 | wallTime += wallElapsed.count(); 176 | kernelTime += kernelElapsed; // seconds 177 | } 178 | } 179 | 180 | // print results 181 | double kernelGflops = flop / 1e9 / kernelTime; 182 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 183 | << kernelTime << "s)\n"; 184 | double wallGflops = flop / 1e9 / wallTime; 185 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 186 | << wallTime << "s)\n"; 187 | // release resources 188 | CUDA_RUNTIME(cudaEventDestroy(start)); 189 | CUDA_RUNTIME(cudaEventDestroy(stop)); 190 | CUDA_RUNTIME(cudaFree(aDev)); 191 | CUDA_RUNTIME(cudaFree(bDev)); 192 | CUDA_RUNTIME(cudaFree(cDev)); 193 | CUDA_RUNTIME(cudaFreeHost(aHost)); 194 | CUDA_RUNTIME(cudaFreeHost(bHost)); 195 | CUDA_RUNTIME(cudaFreeHost(cHost)); 196 | return 0; 197 | } -------------------------------------------------------------------------------- /sgemm/2_6_pinned_joint_overlap.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "common.hpp" 9 | 10 | #define TILE_SZ_A 64 11 | #define TILE_SZ_B 16 12 | #define TILE_SZ_RATIO (TILE_SZ_A / TILE_SZ_B) 13 | 14 | /* NOTE: A and C are column major, B is row major 15 | */ 16 | __global__ void mygemm(float *__restrict__ c, //>>(cDev[0][0], aDev[0], bDev[0], 177 | m / 2, n / 2, k); 178 | CUDA_RUNTIME(cudaEventRecord(waitC[0][0], kernelStream)); 179 | 180 | // copy a1 181 | CUDA_RUNTIME(cudaMemcpyAsync(aDev[1], aHost[1], m / 2 * k * sizeof(float), 182 | cudaMemcpyDefault, copyStream)); 183 | CUDA_RUNTIME(cudaEventRecord(waitForA1, kernelStream)); 184 | 185 | // launch c[1][0] = a[1] * b[0] after a[1] is on the GPU 186 | CUDA_RUNTIME(cudaStreamWaitEvent(kernelStream, waitForA1, 0)); 187 | mygemm<<>>(cDev[1][0], aDev[1], bDev[0], 188 | m / 2, n / 2, k); 189 | CUDA_RUNTIME(cudaEventRecord(waitC[1][0], kernelStream)); 190 | 191 | // copy b1 192 | CUDA_RUNTIME(cudaMemcpyAsync(bDev[1], bHost[1], k * n / 2 * sizeof(float), 193 | cudaMemcpyDefault, copyStream)); 194 | CUDA_RUNTIME(cudaEventRecord(waitForB1, kernelStream)); 195 | 196 | // launch c[0][1] = a[0] * b[1] after B1 is on the GPU 197 | CUDA_RUNTIME(cudaStreamWaitEvent(kernelStream, waitForB1, 0)); 198 | mygemm<<>>(cDev[0][1], aDev[0], bDev[1], 199 | m / 2, n / 2, k); 200 | CUDA_RUNTIME(cudaEventRecord(waitC[0][1], kernelStream)); 201 | 202 | // launch c[1][1] = a[1] * b[1] 203 | mygemm<<>>(cDev[1][1], aDev[1], bDev[1], 204 | m / 2, n / 2, k); 205 | CUDA_RUNTIME(cudaEventRecord(waitC[1][1], kernelStream)); 206 | 207 | // copy c back to CPU as kernels finish 208 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[0][0], 0)); 209 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[0][0], cDev[0][0], 210 | m / 2 * n / 2 * sizeof(float), 211 | cudaMemcpyDefault, copyStream)); 212 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[1][0], 0)); 213 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[1][0], cDev[1][0], 214 | m / 2 * n / 2 * sizeof(float), 215 | cudaMemcpyDefault, copyStream)); 216 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[0][1], 0)); 217 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[0][1], cDev[0][1], 218 | m / 2 * n / 2 * sizeof(float), 219 | cudaMemcpyDefault, copyStream)); 220 | CUDA_RUNTIME(cudaStreamWaitEvent(copyStream, waitC[1][1], 0)); 221 | CUDA_RUNTIME(cudaMemcpyAsync(cHost[1][1], cDev[1][1], 222 | m / 2 * n / 2 * sizeof(float), 223 | cudaMemcpyDefault, 
copyStream)); 224 | 225 | CUDA_RUNTIME(cudaDeviceSynchronize()); 226 | nvtxRangePop(); // wall time 227 | Duration wallElapsed = Clock::now() - wallStart; 228 | 229 | // kernel time 230 | float kernelElapsed; 231 | CUDA_RUNTIME(cudaEventElapsedTime(&kernelElapsed, waitForA0B0, waitC[1][1])); 232 | kernelElapsed /= 1000; // seconds 233 | 234 | std::cout << iter << " kernel=" << kernelElapsed 235 | << " wall=" << wallElapsed.count() 236 | << (iter >= nWarmup ? " *" : " ") << "\n"; 237 | 238 | if (iter >= nWarmup) { 239 | wallTime += wallElapsed.count(); 240 | kernelTime += kernelElapsed; 241 | } 242 | } 243 | 244 | // print results 245 | double kernelGflops = flop / 1e9 / kernelTime; 246 | std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, " 247 | << kernelTime << "s)\n"; 248 | double wallGflops = flop / 1e9 / wallTime; 249 | std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, " 250 | << wallTime << "s)\n"; 251 | // release resources 252 | 253 | CUDA_RUNTIME(cudaFree(aDev[0])); 254 | CUDA_RUNTIME(cudaFree(aDev[1])); 255 | CUDA_RUNTIME(cudaFree(bDev[0])); 256 | CUDA_RUNTIME(cudaFree(bDev[1])); 257 | return 0; 258 | } 259 | -------------------------------------------------------------------------------- /sgemm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(sgemm LANGUAGES CXX CUDA) 2 | 3 | # 3.8+ for CUDA 4 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 5 | 6 | if(NOT CMAKE_BUILD_TYPE) 7 | set(CMAKE_BUILD_TYPE "Release") 8 | message(STATUS "Setting CMAKE_BUILD_TYPE=Release") 9 | endif() 10 | 11 | set(CMAKE_CUDA_STANDARD 11) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | include_directories(PUBLIC SYSTEM include) 15 | 16 | # Add line info to binaries to help with profiling 17 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") 18 | 19 | add_executable(sgemm-cpu cpu.cpp) 20 | 21 | add_executable(1-1-pinned-basic 1_1_pinned_basic.cu) 22 | target_link_libraries(1-1-pinned-basic nvToolsExt) 23 | 24 | add_executable(1-2-pinned-tiled 1_2_pinned_tiled.cu) 25 | target_link_libraries(1-2-pinned-tiled nvToolsExt) 26 | 27 | add_executable(1-3-pinned-joint 1_3_pinned_joint.cu) 28 | target_link_libraries(1-3-pinned-joint nvToolsExt) 29 | 30 | add_executable(2-1-pageable-basic 2_1_pageable_basic.cu) 31 | target_link_libraries(2-1-pageable-basic nvToolsExt) 32 | 33 | add_executable(2-2-pinned-basic 2_2_pinned_basic.cu) 34 | target_link_libraries(2-2-pinned-basic nvToolsExt) 35 | 36 | add_executable(2-3-pinned-tiled 2_3_pinned_tiled.cu) 37 | target_link_libraries(2-3-pinned-tiled nvToolsExt) 38 | 39 | add_executable(2-4-pinned-tiled-overlap 2_4_pinned_tiled_overlap.cu) 40 | target_link_libraries(2-4-pinned-tiled-overlap nvToolsExt) 41 | 42 | add_executable(2-5-pinned-joint 2_5_pinned_joint.cu) 43 | target_link_libraries(2-5-pinned-joint nvToolsExt) 44 | 45 | add_executable(2-6-pinned-joint-overlap 2_6_pinned_joint_overlap.cu) 46 | target_link_libraries(2-6-pinned-joint-overlap nvToolsExt) -------------------------------------------------------------------------------- /sgemm/README.md: -------------------------------------------------------------------------------- 1 | # Matrix-Multiplication Profiling Examples 2 | 3 | This code contains a global memory, shared-memory tiled, and joint shared-memory and register-tiled matrix matrix multiplications. 4 | 5 | 6 | ## Module 1: Nvidia Nsight Compute 7 | 8 | Examples for using Nsight Compute to compare kernel performance. 
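For example, the `rai_build.yml` in this directory collects every section for one representative `mygemm` launch of each executable listed below. The `--kernel-id ::mygemm:6` filter selects the sixth invocation of `mygemm`, which with the default five warmup iterations should be the first measured one:

```bash
nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-1-pinned-basic 1-1-pinned-basic
nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-2-pinned-tiled 1-2-pinned-tiled
nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-3-pinned-joint 1-3-pinned-joint
```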
9 | 10 | * `1-1-pinned-basic`: (`1_1_pinned_basic.cu`) 11 | * `1-2-pinned-tiled`: (`1_2_pinned_tiled.cu`) 12 | * `1-3-pinned-joint`: (`1_3_pinned_joint.cu`) 13 | 14 | ## Module 2: Nvidia Nsight Systems 15 | 16 | Examples for using Nsight Systems to compare data transfer, and the relationship between data transfer and end-to-end time. 17 | 18 | * `2-1-pageable-basic`: (`2_1_pageable_basic.cu`) 19 | * `2-2-pinned-basic`: (`2_2_pinned_basic.cu`) 20 | * `2-3-pinned-tiled`: (`2_3_pinned_tiled.cu`) 21 | * `2-4-pinned-tiled-overlap`: (`2_4_pinned_tiled_overlap.cu`) 22 | * `2-5-pinned-joint`: (`2_5_pinned_joint.cu`) 23 | * `2-6-pinned-joint-overlap`: (`2_6_pinned_joint_overlap.cu`) 24 | 25 | All programs share the same basic options: 26 | 27 | * Three optional positional arguments to set M, N, and K. 28 | * `--iters <n>` the number of measured iterations (default `5`) 29 | * `--warmup <n>` the number of warmup iterations (default `5`) 30 | * `--check`: check correctness (default `false`). Only use for small multiplications 31 | -------------------------------------------------------------------------------- /sgemm/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifdef __CUDACC__ 6 | inline void checkCuda(cudaError_t result, const char *file, const int line) { 7 | if (result != cudaSuccess) { 8 | fprintf(stderr, "%s@%d: CUDA Runtime Error(%d): %s\n", file, line, 9 | int(result), cudaGetErrorString(result)); 10 | exit(-1); 11 | } 12 | } 13 | 14 | #define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__); 15 | #endif 16 | 17 | /* NOTE: A and C are column major, B is row major 18 | */ 19 | inline void cpu_gemm(float *c, // Duration; -------------------------------------------------------------------------------- /sgemm/cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "common.hpp" 6 | 7 | int main(int argc, char **argv) { 8 | 9 | int M = 2; 10 | int N = 2; 11 | int K = 2; 12 | 13 | // initialize host data 14 | std::vector<float> a(M * K), b(K * N), c(M * N); 15 | std::generate(a.begin(), a.end(), random_int); 16 | std::generate(b.begin(), b.end(), random_int); 17 | 18 | cpu_gemm(c.data(), a.data(), b.data(), M, N, K); 19 | 20 | #define A(_i, _j) a[(_i) + (_j)*M] 21 | #define B(_i, _j) b[(_i)*N + (_j)] 22 | #define C(_i, _j) c[(_i) + (_j)*M] 23 | 24 | float acc = 0; 25 | for (int k = 0; k < K; ++k) { 26 | acc += A(0, k) * B(k, 0); 27 | } 28 | 29 | if (equal(C(0, 0), acc, 1e-6)) { 30 | return 0; 31 | } else { 32 | return 1; 33 | } 34 | 35 | #undef A 36 | #undef B 37 | #undef C 38 | } -------------------------------------------------------------------------------- /sgemm/include/argparse/argparse.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace argparse { 9 | 10 | class OptionBase { 11 | public: 12 | virtual void set_val(const std::string &valStr) = 0; 13 | virtual const std::string &long_str() = 0; 14 | }; 15 | 16 | template <typename T> class Option : public OptionBase { 17 | std::string long_; 18 | T *val_; 19 | 20 | public: 21 | Option(T &val, const std::string &l) : long_(l), val_(&val) {} 22 | void set_val(const std::string &val) override { set_val((T *)nullptr, val); } 23 | const std::string &long_str() override { return long_; } 24 | 25 | private: 26 | void set_val(size_t *, const std::string &val) { // convert to
size_t 27 | *val_ = std::stoull(val); 28 | } 29 | void set_val(double *, const std::string &val) { // convert to double 30 | *val_ = std::stod(val); 31 | } 32 | void set_val(float *, const std::string &val) { // convert to float 33 | *val_ = std::stof(val); 34 | } 35 | void set_val(int *, const std::string &val) { // convert to int 36 | *val_ = std::stoi(val); 37 | } 38 | void set_val(std::string *, const std::string &val) { // convert to string 39 | *val_ = val; 40 | } 41 | }; 42 | 43 | class Flag { 44 | std::string long_; 45 | std::string short_; 46 | std::string help_; 47 | bool *val_; 48 | 49 | public: 50 | Flag(bool &val, const std::string &l, const std::string &s) 51 | : long_(l), short_(s), val_(&val) {} 52 | 53 | const std::string &long_str() const noexcept { return long_; } 54 | const std::string &short_str() const noexcept { return short_; } 55 | 56 | void set() const noexcept { *val_ = true; } 57 | 58 | void help(const std::string &s) { help_ = s; } 59 | 60 | const std::string &help_str() const noexcept { return help_; } 61 | }; 62 | 63 | class PosnlBase { 64 | public: 65 | virtual bool is_required() = 0; 66 | virtual PosnlBase *required() = 0; 67 | virtual void set_val(const std::string &val) = 0; 68 | virtual bool found() = 0; 69 | }; 70 | 71 | template <typename T> class Positional : public PosnlBase { 72 | bool required_; 73 | T *val_; 74 | bool found_; 75 | 76 | public: 77 | Positional(T &val) : required_(false), val_(&val), found_(false) {} 78 | 79 | PosnlBase *required() override { 80 | required_ = true; 81 | return this; 82 | } 83 | 84 | bool is_required() override { return required_; } 85 | 86 | // use nullpointer type to disambiguate call 87 | // https://stackoverflow.com/questions/5512910/explicit-specialization-of-template-class-member-function 88 | void set_val(const std::string &val) { 89 | found_ = true; 90 | set_val((T *)nullptr, val); 91 | } 92 | 93 | bool found() override { return found_; } 94 | 95 | private: 96 | // https://stackoverflow.com/questions/5512910/explicit-specialization-of-template-class-member-function 97 | template <typename C> 98 | void get_as(C *, const std::string &val) { // to be overridden 99 | } 100 | void set_val(size_t *, const std::string &val) { // convert to size_t 101 | *val_ = std::stoull(val); 102 | } 103 | void set_val(double *, const std::string &val) { // convert to double 104 | *val_ = std::stod(val); 105 | } 106 | void set_val(float *, const std::string &val) { // convert to float 107 | *val_ = std::stof(val); 108 | } 109 | void set_val(int *, const std::string &val) { // convert to int 110 | *val_ = std::stoi(val); 111 | } 112 | void set_val(std::string *, const std::string &val) { // convert to string 113 | *val_ = val; 114 | } 115 | }; 116 | 117 | class Parser { 118 | 119 | std::string description_; 120 | bool noUnrecognized_; // error on unrecognized flags / opts 121 | bool help_; // help has been requested 122 | bool consume_; // remove consumed values from argc, argv 123 | 124 | std::vector<OptionBase *> opts_; 125 | std::vector<Flag> flags_; 126 | std::vector<PosnlBase *> posnls_; 127 | 128 | static bool starts_with(const std::string &s, const std::string &prefix) { 129 | if (s.rfind(prefix, 0) == 0) { 130 | return true; 131 | } 132 | return false; 133 | } 134 | 135 | OptionBase *match_opt(const char *arg) const { 136 | std::string sarg(arg); 137 | for (int64_t i = int64_t(opts_.size()) - 1; i >= 0; --i) { 138 | if (opts_[i]->long_str() == sarg) { 139 | return opts_[i]; 140 | } 141 | } 142 | return nullptr; 143 | } 144 | 145 | Flag *match_flag(const char *arg) { 146 | 
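// flags are scanned newest-to-oldest, so a later add_flag with the same name shadows an earlier one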
std::string sarg(arg); 147 | for (int64_t i = int64_t(flags_.size()) - 1; i >= 0; --i) { 148 | if (flags_[i].long_str() == sarg || flags_[i].short_str() == sarg) { 149 | return &flags_[i]; 150 | } 151 | } 152 | return nullptr; 153 | } 154 | 155 | public: 156 | Parser() : noUnrecognized_(false), help_(false), consume_(true) { 157 | add_flag(help_, "--help", "-h")->help("Print help message"); 158 | } 159 | Parser(const std::string &description) 160 | : description_(description), noUnrecognized_(false), help_(false), 161 | consume_(true) { 162 | add_flag(help_, "--help", "-h")->help("Print help message"); 163 | } 164 | 165 | bool parse(int &argc, char **argv) { 166 | 167 | std::vector<char *> newArgv; 168 | if (argc > 0) { 169 | newArgv.push_back(argv[0]); 170 | } 171 | 172 | size_t pi = 0; // positional argument position 173 | bool optsOkay = true; // okay to interpret as opt/flag 174 | for (int i = 1; i < argc; ++i) { 175 | 176 | // try interpreting as a flag or option if it looks like one 177 | if (optsOkay && starts_with(argv[i], "-")) { 178 | // '--' indicates only positional arguments follow 179 | if (argv[i] == std::string("--")) { 180 | optsOkay = false; 181 | continue; 182 | } 183 | OptionBase *opt = match_opt(argv[i]); 184 | if (opt) { 185 | opt->set_val(argv[i + 1]); 186 | ++i; 187 | continue; 188 | } 189 | Flag *flag = match_flag(argv[i]); 190 | if (flag) { 191 | flag->set(); 192 | continue; 193 | } 194 | newArgv.push_back(argv[i]); 195 | if (noUnrecognized_) { 196 | std::cerr << "unrecognized " << argv[i] << "\n"; 197 | return false; 198 | } 199 | } else { // otherwise try it as positional 200 | if (pi < posnls_.size()) { 201 | posnls_[pi]->set_val(argv[i]); 202 | ++pi; 203 | } else { 204 | newArgv.push_back(argv[i]); 205 | std::cerr << "encountered unexpected positional argument " << pi 206 | << ": " << argv[i] << "\n"; 207 | } 208 | } 209 | } 210 | 211 | for (; pi < posnls_.size(); ++pi) { 212 | if (posnls_[pi]->is_required()) { 213 | std::cerr << "missing required positional argument " << pi << "\n"; 214 | return false; 215 | } 216 | } 217 | 218 | if (consume_) { 219 | argc = newArgv.size(); 220 | for (int i = 0; i < argc; ++i) { 221 | argv[i] = newArgv[i]; 222 | } 223 | } 224 | 225 | return true; 226 | }; 227 | 228 | template <typename T> void add_option(T &val, const std::string &l) { 229 | opts_.push_back(new Option<T>(val, l)); 230 | } 231 | 232 | Flag *add_flag(bool &val, const std::string &l, const std::string &s = "") { 233 | flags_.push_back(Flag(val, l, s)); 234 | return &(flags_.back()); 235 | } 236 | 237 | template <typename T> PosnlBase *add_positional(T &val) { 238 | posnls_.push_back(new Positional<T>(val)); 239 | return posnls_.back(); 240 | } 241 | 242 | std::string help() const { 243 | std::stringstream ss; 244 | 245 | ss << description_ << "\n"; 246 | 247 | for (auto &o : opts_) { 248 | ss << o->long_str() << "\n"; 249 | } 250 | 251 | for (auto &f : flags_) { 252 | ss << " " << f.short_str() << ", " << f.long_str(); 253 | ss << "\t\t" << f.help_str(); 254 | ss << "\n"; 255 | } 256 | 257 | return ss.str(); 258 | } 259 | 260 | /*! \brief error on unrecognized flags and options 261 | */ 262 | void no_unrecognized() { noUnrecognized_ = true; } 263 | 264 | /*! 
\brief don't modify argc/argv 265 | */ 266 | void no_consume() { consume_ = false; } 267 | 268 | bool need_help() const noexcept { return help_; } 269 | }; 270 | 271 | } // namespace argparse -------------------------------------------------------------------------------- /sgemm/rai_build.yml: -------------------------------------------------------------------------------- 1 | rai: 2 | version: 0.2 3 | image: cwpearson/nvidia-performance-tools:amd64-10.1-master-c4d1bb1 4 | resources: 5 | cpu: 6 | architecture: amd64 7 | gpu: 8 | count: 1 9 | network: false 10 | cache: false 11 | commands: 12 | build: 13 | - which nsys 14 | - nsys version 15 | - which nv-nsight-cu-cli 16 | - nv-nsight-cu-cli --version 17 | - nvidia-smi 18 | - echo "Check Nsight Configurations" 19 | - bash -c "nv-nsight-cu-cli --devices 0 --query-metrics > metrics.txt" 20 | - bash -c "nv-nsight-cu-cli --list-sections > sections.txt" 21 | - bash -c "nsys status -e 2>&1 > status.txt" 22 | - cp -r /src . 23 | - cmake /src -DCMAKE_BUILD_TYPE=Release 24 | - make 25 | - echo "run without profiling" 26 | - bash -c "./1-1-pinned-basic | tee 1-1-pinned-basic.txt" 27 | - bash -c "./1-2-pinned-tiled | tee 1-2-pinned-tiled.txt" 28 | - bash -c "./1-3-pinned-joint | tee 1-3-pinned-joint.txt" 29 | - bash -c "./2-1-pageable-basic | tee 2-1-pageable-basic.txt" 30 | - bash -c "./2-2-pinned-basic | tee 2-2-pinned-basic.txt" 31 | - bash -c "./2-3-pinned-tiled | tee 2-3-pinned-tiled.txt" 32 | - bash -c "./2-4-pinned-tiled-overlap | tee 2-4-pinned-tiled-overlap.txt" 33 | - bash -c "./2-5-pinned-joint | tee 2-5-pinned-joint.txt" 34 | - bash -c "./2-6-pinned-joint-overlap | tee 2-6-pinned-joint-overlap.txt" 35 | - echo "Nsight Compute Results" 36 | - nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-1-pinned-basic 1-1-pinned-basic 37 | - nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-2-pinned-tiled 1-2-pinned-tiled 38 | - nv-nsight-cu-cli --kernel-id ::mygemm:6 --section ".*" -o 1-3-pinned-joint 1-3-pinned-joint 39 | - echo "Nsight Systems Results" 40 | - nsys profile -o 2-1-pageable-basic 2-1-pageable-basic 41 | - nsys profile -o 2-2-pinned-basic 2-2-pinned-basic 42 | - nsys profile -o 2-3-pinned-tiled 2-3-pinned-tiled 43 | - nsys profile -o 2-4-pinned-tiled-overlap 2-4-pinned-tiled-overlap 44 | - nsys profile -o 2-5-pinned-joint 2-5-pinned-joint 45 | - nsys profile -o 2-6-pinned-joint-overlap 2-6-pinned-joint-overlap 46 | - du -sh . 
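# note: each nv-nsight-cu-cli run above writes a .nsight-cuprof-report file and each nsys run writes a .qdrep file; download these and open them in the Nsight Compute / Nsight Systems GUI to inspect the profiles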
47 | -------------------------------------------------------------------------------- /slides/20200416_ece408.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cwpearson/nvidia-performance-tools/90890e807ef9fc1532ee08938de6689444701686/slides/20200416_ece408.pdf -------------------------------------------------------------------------------- /slides/20200421_ece498_Nsight.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cwpearson/nvidia-performance-tools/90890e807ef9fc1532ee08938de6689444701686/slides/20200421_ece498_Nsight.pdf -------------------------------------------------------------------------------- /slides/GEMM-joint-tiling.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cwpearson/nvidia-performance-tools/90890e807ef9fc1532ee08938de6689444701686/slides/GEMM-joint-tiling.ppt -------------------------------------------------------------------------------- /slides/memory_access_efficiency.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cwpearson/nvidia-performance-tools/90890e807ef9fc1532ee08938de6689444701686/slides/memory_access_efficiency.pdf -------------------------------------------------------------------------------- /slides/s22141-what-the-profiler-is-telling-you-how-to-get-the-most-performance-out-of-your-hardware.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cwpearson/nvidia-performance-tools/90890e807ef9fc1532ee08938de6689444701686/slides/s22141-what-the-profiler-is-telling-you-how-to-get-the-most-performance-out-of-your-hardware.pdf -------------------------------------------------------------------------------- /test.cu: -------------------------------------------------------------------------------- 1 | __global__ void kernel(float *a, float *b, int n) { // n is unused; this is only a toolchain smoke test 2 | *a = *b; // copy a single (uninitialized) element device-to-device 3 | } 4 | 5 | int main(void) { 6 | float *a, *b; 7 | cudaMalloc(&a, 10 * sizeof(float)); 8 | cudaMalloc(&b, 10 * sizeof(float)); 9 | kernel<<<1,1>>>(a,b,10); // single-thread launch 10 | cudaDeviceSynchronize(); // wait for the kernel before exiting 11 | } --------------------------------------------------------------------------------
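All of the sgemm benchmarks above share one measurement idiom: `cudaEvent_t` records bracket the kernel launch to get device time, a `std::chrono` clock brackets the whole iteration to get wall time, and the first `nWarmup` iterations are printed but excluded from the totals. Below is a minimal self-contained sketch of that idiom, with a placeholder kernel standing in for `mygemm`; the `Clock`/`Duration` typedefs are assumptions consistent with how `common.hpp` uses them:

```cuda
#include <chrono>
#include <cstdio>

typedef std::chrono::system_clock Clock;        // assumed; matches usage in common.hpp
typedef std::chrono::duration<double> Duration; // seconds, as a double

__global__ void work(float *x) { *x += 1.0f; } // placeholder for mygemm

int main() {
  const int nWarmup = 5, nIters = 5; // the programs' defaults
  float *xDev;
  cudaMalloc(&xDev, sizeof(float));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  double kernelTime = 0, wallTime = 0; // accumulated over measured iterations only
  for (int iter = 0; iter < nIters + nWarmup; ++iter) {
    auto wallStart = Clock::now();

    cudaEventRecord(start);
    work<<<1, 1>>>(xDev);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // wait so the elapsed time below is valid

    float millis;
    cudaEventElapsedTime(&millis, start, stop); // device time between the events
    cudaDeviceSynchronize();
    Duration wallElapsed = Clock::now() - wallStart;

    // warmup iterations prime caches and the CUDA context; skip them
    if (iter >= nWarmup) {
      kernelTime += millis / 1000; // ms -> s
      wallTime += wallElapsed.count();
    }
  }
  printf("kernel %fs wall %fs over %d measured iterations\n", kernelTime,
         wallTime, nIters);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(xDev);
  return 0;
}
```

A GFLOPS figure then follows as `flop / 1e9 / kernelTime`, exactly as the printouts in the sgemm sources compute it.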