>>(n, d_x, a, Nz);
131 | synchronize;
132 | endtime
133 | if (my_repeat_counter > 1) std::cout << "\n";
134 |
135 | Nz += 1;
136 | }
137 |
138 | // copy data to the host and print
139 | /* HIP_CHECK(hipMemcpy(x, d_x, sizeof(double) * n, hipMemcpyDeviceToHost)); */
140 | /* printf("%f %f %f %f ... %f %f\n", */
141 | /* x[0], x[1], x[2], x[3], x[n-2], x[n-1]); */
142 |
143 | return 0;
144 | }
145 |
--------------------------------------------------------------------------------
/docs/05-fortran.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Fortran and HIP
3 | subtitle: GPU programming with HIP
4 | author: CSC Training
5 | date: 2025-03
6 | lang: en
7 | ---
8 |
9 | # Fortran
10 |
11 | * No native GPU support in Fortran:
12 | - HIP functions are callable from C, using wrappers; compiled with hipcc
13 | - interoperability with Fortran via `iso_c_binding`
14 | - linking with Fortran or `hipcc`
15 | * Fortran + HIP:
16 | - needs wrappers and interfaces for all HIP calls
17 | * Hipfort:
18 | - Fortran Interface For GPU Kernel Libraries
19 | - HIP: HIP runtime, hipBLAS, hipSPARSE, hipFFT, hipRAND, hipSOLVER
20 | - ROCm: rocBLAS, rocSPARSE, rocFFT, rocRAND, rocSOLVER
21 |         - memory management: `hipMalloc`, `hipMemcpy` (see the sketch below)
22 |
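To make the pattern concrete before the full SAXPY walk-through, here is a minimal, illustrative sketch of hipfort-style memory management from Fortran. It is not one of the course examples; it assumes hipfort is installed and only allocates device memory, copies host data over, and frees it.

```fortran
! Minimal sketch (assumes hipfort is available): device allocation,
! host-to-device copy, and cleanup, with every HIP call checked.
program hipfort_memory_sketch
  use iso_c_binding
  use hipfort        ! HIP runtime bindings: hipMalloc, hipMemcpy, hipFree, ...
  use hipfort_check  ! hipCheck() aborts with a message if a HIP call fails
  implicit none

  real, allocatable, target :: x(:)   ! host array
  type(c_ptr) :: dx = c_null_ptr      ! device memory is handled as a plain c_ptr
  integer(c_size_t) :: nbytes

  allocate(x(1000)); x = 1.0
  nbytes = size(x, kind=c_size_t) * 4_c_size_t   ! 4 bytes per default real

  call hipCheck(hipMalloc(dx, nbytes))                                  ! allocate on the device
  call hipCheck(hipMemcpy(dx, c_loc(x), nbytes, hipMemcpyHostToDevice)) ! host -> device
  call hipCheck(hipFree(dx))
  deallocate(x)
end program hipfort_memory_sketch
```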
23 | # HIPFort for SAXPY (`Y=Y+a*X`): Fortran Code
24 |
25 |
26 | ```fortran
27 | program saxpy
28 | use iso_c_binding
29 | use hipfort
30 | use hipfort_check
31 |
32 | implicit none
33 | interface
34 |   subroutine launch(dy,dx,a,N) bind(c)
35 | use iso_c_binding
36 | implicit none
37 | type(c_ptr),value :: dy,dx
38 | integer, value :: N
39 | real, value :: a
40 | end subroutine
41 | end interface
42 |
43 | type(c_ptr) :: dx = c_null_ptr
44 | type(c_ptr) :: dy = c_null_ptr
45 | integer, parameter :: N = 400000000
46 | integer(c_size_t), parameter :: bytes_per_element = 4
47 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element
48 | real, allocatable,target,dimension(:) :: x, y
49 | real, parameter :: a=2.0
50 | ```
51 |
52 |
53 |
54 | ```fortran
55 | allocate(x(N), y(N))
56 |
57 | x = 1.0; y = 2.0
58 |
59 | call hipCheck(hipMalloc(dx,Nbytes))
60 | call hipCheck(hipMalloc(dy,Nbytes))
61 |
62 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice))
63 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice))
64 |
65 | call launch(dy, dx, a, N)
66 |
67 | call hipCheck(hipDeviceSynchronize())
68 |
69 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost))
70 |
71 | write(*,*) "Max error: ", maxval(abs(y-4.0))
72 |
73 | call hipCheck(hipFree(dx));call hipCheck(hipFree(dy))
74 |
75 | deallocate(x);deallocate(y)
76 |
77 | end program saxpy
78 | ```
79 |
80 |
81 |
82 | # HIPFort for SAXPY (`Y=Y+a*X`): HIP code
83 |
84 | ```cpp
85 | #include <hip/hip_runtime.h>
86 | #include <cmath>
87 |
88 | __global__ void saxpy(float *dy, float *dx,
89 | float a, int n)
90 | {
91 | int i = blockDim.x*blockIdx.x+threadIdx.x;
92 | if (i < n) {
93 | dy[i] = dy[i] + a*dx[i];
94 | }
95 | }
96 | ```
97 |
98 |
99 |
100 |
101 | ``` cpp
102 | extern "C"{
103 | void launch(float *dy, float *dx,
104 | float a, int N)
105 | {
106 | dim3 tBlock(256,1,1);
107 | dim3 grid(ceil((float)N/tBlock.x),1,1);
108 |
109 |     saxpy<<<grid, tBlock>>>(dy, dx, a, N);
110 | }
111 | }
112 | ```
113 |
114 |
115 | # Compilation
116 |
117 | **NVIDIA: Mahti**
118 | ```
119 | gfortran -I$HIPFORT_HOME/include/hipfort/nvptx "-DHIPFORT_ARCH=\"nvptx\"" \
120 |          -L$HIPFORT_HOME/lib -lhipfort-nvptx -c <fortran_code>.f90
121 |
122 | hipcc "--gpu-architecture=sm_80" --x cu -c <hip_code>.cpp
123 |
124 | hipcc -lgfortran "--gpu-architecture=sm_80" -I$HIPFORT_HOME/include/hipfort/nvptx \
125 |       -L$HIPFORT_HOME/lib/ -lhipfort-nvptx <fortran_code>.o <hip_code>.o -o main
126 | ```
127 | **AMD: LUMI**
128 | ```
129 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \
130 |     -L$HIPFORT_HOME/lib -lhipfort-amdgcn -c <fortran_code>.f90
131 |
132 | hipcc --offload-arch=gfx90a -c <hip_code>.cpp
133 |
134 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \
135 |     -L$HIPFORT_HOME/lib -lhipfort-amdgcn <fortran_code>.o <hip_code>.o -o main
136 | ```
137 |
138 |
139 | # Summary
140 |
141 | * No native GPU support in Fortran
142 | * HIP functions are callable from C, using `extern "C"`
143 | - `iso_c_binding`
144 | - GPU objects are of type `c_ptr` in Fortran
145 | * Hipfort provides Fortran interfaces for GPU libraries
146 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
2 | FACTORY=docker
3 | OPTIONS=run -it --rm -v "$(ROOT_DIR)":"$(ROOT_DIR)":Z -w "$(ROOT_DIR)" ghcr.io/csc-training/slidefactory:3.2.0-beta.1
4 |
5 | SRC=$(wildcard *.md)
6 | HTML=$(patsubst %.md,%.html,$(SRC))
7 | PDF=$(patsubst %.md,%.pdf,$(SRC))
8 |
9 | .PHONY: html pdf clean
10 |
11 | html: $(HTML)
12 |
13 | pdf: $(PDF)
14 |
15 | clean:
16 | -rm -f $(HTML) $(PDF)
17 |
18 | %.html: %.md
19 | $(FACTORY) $(OPTIONS) slides --format html $<
20 |
21 | %.pdf: %.md
22 | $(FACTORY) $(OPTIONS) slides --format pdf $<
23 |
--------------------------------------------------------------------------------
/docs/img/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/01.png
--------------------------------------------------------------------------------
/docs/img/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/04.png
--------------------------------------------------------------------------------
/docs/img/AMD-GCN-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/AMD-GCN-3.png
--------------------------------------------------------------------------------
/docs/img/BankConflicts.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/BankConflicts.jpeg
--------------------------------------------------------------------------------
/docs/img/CU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CU.png
--------------------------------------------------------------------------------
/docs/img/CUgray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CUgray.png
--------------------------------------------------------------------------------
/docs/img/NoBankConflicts.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/NoBankConflicts.jpeg
--------------------------------------------------------------------------------
/docs/img/ThreadExecution.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution.jpg
--------------------------------------------------------------------------------
/docs/img/ThreadExecution_new.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution_new.jpg
--------------------------------------------------------------------------------
/docs/img/a100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100.png
--------------------------------------------------------------------------------
/docs/img/a100_fp32_core.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_fp32_core.png
--------------------------------------------------------------------------------
/docs/img/a100_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_sm.png
--------------------------------------------------------------------------------
/docs/img/a100_smsp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_smsp.png
--------------------------------------------------------------------------------
/docs/img/amd_computeunit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_computeunit.png
--------------------------------------------------------------------------------
/docs/img/amd_instinct_mi250x_oam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_instinct_mi250x_oam.png
--------------------------------------------------------------------------------
/docs/img/amd_m200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_m200.png
--------------------------------------------------------------------------------
/docs/img/amd_mi200.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.jpg
--------------------------------------------------------------------------------
/docs/img/amd_mi200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.png
--------------------------------------------------------------------------------
/docs/img/arrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/arrow.png
--------------------------------------------------------------------------------
/docs/img/block_sm_cu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/block_sm_cu.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_1.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_3.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_4.png
--------------------------------------------------------------------------------
/docs/img/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/comparison.png
--------------------------------------------------------------------------------
/docs/img/copy_d2h.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_d2h.png
--------------------------------------------------------------------------------
/docs/img/copy_h2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_h2d.png
--------------------------------------------------------------------------------
/docs/img/cpu_waits_on_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cpu_waits_on_gpu.png
--------------------------------------------------------------------------------
/docs/img/cu_sm_eu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cu_sm_eu.png
--------------------------------------------------------------------------------
/docs/img/cublas_cuda_hip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cublas_cuda_hip.png
--------------------------------------------------------------------------------
/docs/img/do_this_computation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/do_this_computation.png
--------------------------------------------------------------------------------
/docs/img/execution-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/execution-model.png
--------------------------------------------------------------------------------
/docs/img/gpu-bws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-bws.png
--------------------------------------------------------------------------------
/docs/img/gpu-cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-cluster.png
--------------------------------------------------------------------------------
/docs/img/gpuConnect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpuConnect.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_a_wide_vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_a_wide_vector_unit.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_cus_sms_eus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_cus_sms_eus.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_vector_units.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_vector_units_instructions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units_instructions.png
--------------------------------------------------------------------------------
/docs/img/gpu_is_a_separate_processor_with_own_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_is_a_separate_processor_with_own_memory.png
--------------------------------------------------------------------------------
/docs/img/gpufort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort.png
--------------------------------------------------------------------------------
/docs/img/gpufort1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort1.png
--------------------------------------------------------------------------------
/docs/img/gpufort2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort2.png
--------------------------------------------------------------------------------
/docs/img/grid-threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid-threads.png
--------------------------------------------------------------------------------
/docs/img/grid_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid_gpu.png
--------------------------------------------------------------------------------
/docs/img/hipblas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipblas.png
--------------------------------------------------------------------------------
/docs/img/hipfort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipfort.png
--------------------------------------------------------------------------------
/docs/img/kernel_cuda_hip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/kernel_cuda_hip.png
--------------------------------------------------------------------------------
/docs/img/lumi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.jpg
--------------------------------------------------------------------------------
/docs/img/lumi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.png
--------------------------------------------------------------------------------
/docs/img/many_blocks_to_one_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/many_blocks_to_one_sm.png
--------------------------------------------------------------------------------
/docs/img/memlayout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memlayout.png
--------------------------------------------------------------------------------
/docs/img/memory-hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memory-hierarchy.png
--------------------------------------------------------------------------------
/docs/img/memsch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memsch.png
--------------------------------------------------------------------------------
/docs/img/mi100-architecture.info:
--------------------------------------------------------------------------------
1 | Source:
2 | Introducing AMD CDNA Architecture,
3 | https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf
4 |
5 | Caption:
6 | Block diagram of the AMD Instinct MI100 accelerator, the first GPU
7 | powered by the AMD CDNA architecture.
8 |
--------------------------------------------------------------------------------
/docs/img/mi100-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100-architecture.png
--------------------------------------------------------------------------------
/docs/img/mi100_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100_arch.png
--------------------------------------------------------------------------------
/docs/img/mi250x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x.png
--------------------------------------------------------------------------------
/docs/img/mi250x_cu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu.png
--------------------------------------------------------------------------------
/docs/img/mi250x_cu_simd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu_simd.png
--------------------------------------------------------------------------------
/docs/img/microprocessor-trend-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/microprocessor-trend-data.png
--------------------------------------------------------------------------------
/docs/img/model_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/model_gpu.png
--------------------------------------------------------------------------------
/docs/img/new_hipfort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/new_hipfort.png
--------------------------------------------------------------------------------
/docs/img/no_block_to_many_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/no_block_to_many_sm.png
--------------------------------------------------------------------------------
/docs/img/not_gpu_as_a_wide_vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/not_gpu_as_a_wide_vector_unit.png
--------------------------------------------------------------------------------
/docs/img/oned_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_block.png
--------------------------------------------------------------------------------
/docs/img/oned_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_grid.png
--------------------------------------------------------------------------------
/docs/img/parallel_regions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parallel_regions.png
--------------------------------------------------------------------------------
/docs/img/parflow_single_node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parflow_single_node.png
--------------------------------------------------------------------------------
/docs/img/perfetto.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/perfetto.png
--------------------------------------------------------------------------------
/docs/img/runtimes_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/runtimes_annotated.png
--------------------------------------------------------------------------------
/docs/img/scalar_operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/scalar_operation.png
--------------------------------------------------------------------------------
/docs/img/single_proc_mpi_gpu2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_mpi_gpu2.png
--------------------------------------------------------------------------------
/docs/img/single_proc_multi_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_multi_gpu.png
--------------------------------------------------------------------------------
/docs/img/single_proc_thread_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_thread_gpu.png
--------------------------------------------------------------------------------
/docs/img/software_hardware_mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/software_hardware_mapping.png
--------------------------------------------------------------------------------
/docs/img/streams-example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-1.png
--------------------------------------------------------------------------------
/docs/img/streams-example-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-2.png
--------------------------------------------------------------------------------
/docs/img/streams.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams.png
--------------------------------------------------------------------------------
/docs/img/streams1_explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams1_explain.png
--------------------------------------------------------------------------------
/docs/img/streams2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2.png
--------------------------------------------------------------------------------
/docs/img/streams2_explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2_explain.png
--------------------------------------------------------------------------------
/docs/img/thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread.png
--------------------------------------------------------------------------------
/docs/img/thread_lane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread_lane.png
--------------------------------------------------------------------------------
/docs/img/threed_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/threed_block.png
--------------------------------------------------------------------------------
/docs/img/top500-perf-dev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-perf-dev.png
--------------------------------------------------------------------------------
/docs/img/top500-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-performance.png
--------------------------------------------------------------------------------
/docs/img/transpose_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/transpose_img.png
--------------------------------------------------------------------------------
/docs/img/twod_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_block.png
--------------------------------------------------------------------------------
/docs/img/twod_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_grid.png
--------------------------------------------------------------------------------
/docs/img/vector_operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_operation.png
--------------------------------------------------------------------------------
/docs/img/vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_unit.png
--------------------------------------------------------------------------------
/docs/img/virtual_memory_addressing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/virtual_memory_addressing.png
--------------------------------------------------------------------------------
/docs/img/warp_wavefron_smsp_simd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/warp_wavefron_smsp_simd.png
--------------------------------------------------------------------------------
/docs/index:
--------------------------------------------------------------------------------
1 | # List of PDFs to jam together
2 | # an index file for jam-it.sh (https://github.com/mlouhivu/jam-it)
3 |
4 | @title-course.pdf
5 |
6 | @title-intro.pdf
7 | 01-introduction.pdf
8 |
9 | @title-kernels.pdf
10 | 02-kernels.pdf
11 |
12 | @title-streams.pdf
13 | 03-streams.pdf
14 |
15 | @title-memory.pdf
16 | 04-memory.pdf
17 |
18 | @title-fortran.pdf
19 | 05-fortran.pdf
20 |
21 | @title-optimisation.pdf
22 | 06-optimisation.pdf
23 |
24 | @title-multi-gpu.pdf
25 | 07-multi-gpu.pdf
26 |
--------------------------------------------------------------------------------
/first_steps.md:
--------------------------------------------------------------------------------
1 | ## Accessing LUMI
2 |
3 | Are you able to `ssh` to LUMI? If not, have you followed the instructions [here](https://docs.lumi-supercomputer.eu/firststeps/)?
4 |
5 | If you haven't added the ssh-key correctly or cannot otherwise `ssh` to LUMI, you can use the [web interface](https://www.lumi.csc.fi/public/).
6 |
7 | See the [documentation](https://docs.lumi-supercomputer.eu/firststeps/loggingin-webui/) for more help.
8 |
9 | ## Getting the course material
10 |
11 | You can clone this git repository with `git clone https://github.com/csc-training/hip-programming.git`.
12 |
13 | This way you get local access to the lectures, as well as the exercises (which you need to run on LUMI).
14 |
15 | ## Using slurm
16 |
17 | Supercomputers like LUMI are shared resources, meaning multiple users are using them at the same time.
18 | To run something on LUMI, you need to use SLURM to submit a job.
19 |
20 | Read the [LUMI documentation](https://docs.lumi-supercomputer.eu/runjobs/) on running jobs to find out more.
21 |
22 | ## Motivation for the course
23 |
24 | Why do we teach GPU programming? Why should you learn to program GPUs?
25 |
26 | Because most of the Top 500 supercomputers use (and derive most of their compute capability from) GPUs:
27 | if you use any of these supercomputers, you cannot avoid using GPUs.
28 |
29 | Why are most of the Top 500 supercomputers using GPUs?
30 |
31 | 1. Because GPUs are designed and optimized to solve problems commonly encountered in HPC and ML/AI: floating point operations, matrix multiplications.
32 | 2. Because of power limitations: performance per Watt is much greater for GPUs than CPUs: https://top500.org/statistics/efficiency-power-cores/
33 |
--------------------------------------------------------------------------------
/hipfort/hiprand/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS =
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.mod: %.F90
23 | $(FC) $(FCFLAGS) -c $<
24 | clean:
25 | rm -f pi *.o *.mod
26 |
--------------------------------------------------------------------------------
/hipfort/hiprand/img/pi_MC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/hipfort/hiprand/img/pi_MC.png
--------------------------------------------------------------------------------
/hipfort/hiprand/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | ! TODO Add here the necessary modules for the GPU operation
5 |
6 |
7 | !OPTIONAL
8 | !TODO write an interface to the C wrapper which calls the reduction kernel.
9 |
10 | implicit none
11 |
12 | integer(kind=INT64) :: nsamples
13 | character(len=85) :: arg
14 | real :: pi1, pi2
15 | integer(c_size_t):: Nbytes
16 |
17 | if (command_argument_count() /= 1) then
18 | STOP 'Usage pi N where N is the number of samples'
19 | end if
20 |
21 | call get_command_argument(1, arg)
22 | read(arg, *) nsamples
23 |
24 | pi1 = cpu_pi(nsamples)
25 | write(*,*) 'Pi calculated with CPU', pi1
26 | pi2 = gpu_pi(nsamples)
27 | write(*,*) 'Pi calculated with GPU', pi2
28 |
29 | contains
30 |
31 | real function cpu_pi(n)
32 | implicit none
33 | integer(kind=INT64) :: n
34 | integer :: i, inside
35 |
36 | real, allocatable:: x(:),y(:)
37 |
38 |
39 | allocate(x(1:n))
40 | allocate(y(1:n))
41 |
42 | call random_number(x)
43 | call random_number(y)
44 |
45 | inside = 0
46 | do i = 1, n
47 | if (x(i)**2 + y(i)**2 < 1.0) then
48 | inside = inside + 1
49 | end if
50 | end do
51 |
52 | cpu_pi = 4.0 * real(inside) / real(n)
53 |
54 | end function cpu_pi
55 |
56 |
57 |
58 | real function gpu_pi(n)
59 | use hipfort
60 | use hipfort_check
61 | use hipfort_hiprand
62 | implicit none
63 | integer(kind=INT64) :: n
64 | integer :: i, inside
65 | type(c_ptr) :: gen = c_null_ptr
66 | type(c_ptr) :: x_d,y_d
67 | real(c_float), allocatable,target :: x(:),y(:)
68 | integer(c_size_t) :: istat
69 |
70 | allocate(x(1:n))
71 | allocate(y(1:n))
72 | Nbytes=sizeof(x)
73 |
74 | inside = 0
75 |   ! Initialization for the (optional) bonus task. Alternatively, one could initialize inside_d using a HIP kernel
76 | ! Sbytes = sizeof(inside)
77 | ! call hipCheck(hipMalloc(inside_d,Sbytes))
78 | ! call hipCheck(hipMemcpy( inside_d,c_loc(inside), Sbytes, hipMemcpyHostToDevice))
79 |
80 | !Allocate memory for the gpu arrays
81 |
82 | ! TODO Initialize the gpu random number generator
83 |
84 | ! TODO Fill the arrays x and y with random uniform distributed numbers
85 |
86 | ! TODO copy the random numbers from GPU to CPU
87 |
88 | ! TODO Bonus exercise: replace the below reduction loop done on the CPU with a GPU kernel
89 | ! The kernel is in the hip_kernels.cpp file.
90 |      ! You need to implement an interface to call the C function similarly to the saxpy example
91 | ! Note that in this case there is no need to transfer the x and y arrays to CPU,
92 | ! You only need to copy the final result, inside_d
93 |
94 | do i = 1, n
95 | if (x(i)**2 + y(i)**2 < 1.0) then
96 | inside = inside + 1
97 | end if
98 | end do
99 |
100 | gpu_pi = 4.0 * real(inside) / real(n)
101 |
102 | deallocate(x, y)
103 | end function gpu_pi
104 | end program
105 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS = -lhiprand
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.mod: %.F90
23 | $(FC) $(FCFLAGS) -c $<
24 | clean:
25 | rm -f pi *.o *.mod
26 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | use hipfort
5 | use hipfort_check
6 | use hipfort_hiprand
7 |
8 | implicit none
9 |
10 | integer(kind=INT64) :: nsamples
11 | character(len=85) :: arg
12 | real :: pi1, pi2
13 | integer(c_size_t):: Nbytes
14 |
15 | if (command_argument_count() /= 1) then
16 | STOP 'Usage pi N where N is the number of samples'
17 | end if
18 |
19 | call get_command_argument(1, arg)
20 | read(arg, *) nsamples
21 |
22 | pi1 = cpu_pi(nsamples)
23 | write(*,*) 'Pi calculated with CPU', pi1
24 | pi2 = gpu_pi(nsamples)
25 | write(*,*) 'Pi calculated with GPU', pi2
26 |
27 | contains
28 |
29 | real function cpu_pi(n)
30 | implicit none
31 | integer(kind=INT64) :: n
32 | integer :: i, inside
33 |
34 | real, allocatable:: x(:),y(:)
35 |
36 |
37 | allocate(x(1:n))
38 | allocate(y(1:n))
39 |
40 | call random_number(x)
41 | call random_number(y)
42 |
43 | inside = 0
44 | do i = 1, n
45 | if (x(i)**2 + y(i)**2 < 1.0) then
46 | inside = inside + 1
47 | end if
48 | end do
49 |
50 | cpu_pi = 4.0 * real(inside) / real(n)
51 |
52 | end function cpu_pi
53 |
54 |
55 |
56 | real function gpu_pi(n)
57 | use hipfort
58 | use hipfort_check
59 | use hipfort_hiprand
60 | implicit none
61 | integer(kind=INT64) :: n
62 | integer :: i, inside
63 | type(c_ptr) :: gen = c_null_ptr
64 | type(c_ptr) :: x_d,y_d
65 | real(c_float), allocatable,target :: x(:),y(:)
66 | integer(c_size_t) :: istat
67 |
68 | allocate(x(1:n))
69 | allocate(y(1:n))
70 | Nbytes=sizeof(x)
71 |
72 | call hipCheck(hipMalloc(x_d,Nbytes))
73 | call hipCheck(hipMalloc(y_d,Nbytes))
74 |
75 | inside = 0
76 |
77 |
78 | istat= hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT)
79 |
80 | istat= hiprandGenerateUniform(gen, x_d, n)
81 | istat= hiprandGenerateUniform(gen, y_d, n)
82 |
83 | call hipCheck(hipMemcpy(c_loc(x), x_d, Nbytes, hipMemcpyDeviceToHost))
84 | call hipCheck(hipMemcpy(c_loc(y), y_d, Nbytes, hipMemcpyDeviceToHost))
85 |
86 | do i = 1, n
87 | if (x(i)**2 + y(i)**2 < 1.0) then
88 | inside = inside + 1
89 | end if
90 | end do
91 |
92 | gpu_pi = 4.0 * real(inside) / real(n)
93 |
94 | deallocate(x, y)
95 | end function gpu_pi
96 | end program
97 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS = -lhiprand
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o hip_kernels.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.o: %.cpp
23 | $(CXX) -c -o $@ $<
24 |
25 | %.mod: %.F90
26 | $(FC) $(FCFLAGS) -c $<
27 | clean:
28 | rm -f pi *.o *.mod
29 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/hip_kernels.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cmath>
3 |
4 | __global__ void countInsideKernel(float *x, float *y, int *inside, int64_t n)
5 | {
6 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if (idx < n) {
9 | if (x[idx] * x[idx] + y[idx] * y[idx] < 1.0f) {
10 | // Atomic increment to avoid race condition
11 | atomicAdd(inside, 1);
12 | }
13 | }
14 | }
15 |
16 | extern "C"
17 | {
18 | void launch(float *x, float *y, int *inside_d, int64_t N)
19 | {
20 |
21 | dim3 tBlock(256,1,1);
22 | dim3 grid(ceil((float)N/tBlock.x),1,1);
23 |
24 |    countInsideKernel<<<grid, tBlock>>>(x, y, inside_d, N);
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | use hipfort
5 | use hipfort_check
6 | use hipfort_hiprand
7 |
8 | implicit none
9 |
10 | interface
11 | subroutine launch(x_d, y_d, inside_d, N) bind(c)
12 | use iso_c_binding
13 | implicit none
14 | type(c_ptr), value :: x_d, y_d, inside_d
15 | integer(c_int64_t), value :: N ! Ensure use of correct C type for INT64
16 | end subroutine
17 | end interface
18 |
19 | integer(c_int64_t) :: nsamples
20 | character(len=85) :: arg
21 | real :: pi1, pi2
22 | integer(c_size_t) :: Nbytes, Sbytes
23 |
24 | if (command_argument_count() /= 1) then
25 | STOP 'Usage: pi N where N is the number of samples'
26 | end if
27 |
28 | call get_command_argument(1, arg)
29 | read(arg, *) nsamples
30 |
31 | pi1 = cpu_pi(nsamples)
32 | write(*,*) 'Pi calculated with CPU', pi1
33 | pi2 = gpu_pi(nsamples)
34 | write(*,*) 'Pi calculated with GPU', pi2
35 |
36 | contains
37 |
38 | real function cpu_pi(n)
39 | implicit none
40 | integer(c_int64_t) :: n
41 | integer :: i, inside
42 |
43 | real, allocatable :: x(:), y(:)
44 |
45 | allocate(x(1:n))
46 | allocate(y(1:n))
47 |
48 | call random_number(x)
49 | call random_number(y)
50 |
51 | inside = 0
52 | do i = 1, n
53 | if (x(i)**2 + y(i)**2 < 1.0) then
54 | inside = inside + 1
55 | end if
56 | end do
57 |
58 | cpu_pi = 4.0 * real(inside) / real(n)
59 |
60 | end function cpu_pi
61 |
62 | real function gpu_pi(n)
63 | use hipfort
64 | use hipfort_check
65 | use hipfort_hiprand
66 | implicit none
67 | integer(c_int64_t) :: n
68 |     integer, target :: inside   ! target attribute needed for c_loc(inside) below
69 | type(c_ptr) :: gen = c_null_ptr
70 | type(c_ptr) :: x_d, y_d, inside_d
71 | real(c_float), allocatable, target :: x(:), y(:)
72 | integer(c_size_t) :: istat
73 |
74 | allocate(x(1:n))
75 | allocate(y(1:n))
76 | Nbytes = sizeof(x)
77 |
78 | call hipCheck(hipMalloc(x_d, Nbytes))
79 | call hipCheck(hipMalloc(y_d, Nbytes))
80 |
81 | istat = hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT)
82 |
83 | istat = hiprandGenerateUniform(gen, x_d, n)
84 | istat = hiprandGenerateUniform(gen, y_d, n)
85 |
86 | inside = 0
87 | Sbytes = sizeof(inside)
88 | call hipCheck(hipMalloc(inside_d, Sbytes))
89 | call hipCheck(hipMemcpy(inside_d, c_loc(inside), Sbytes, hipMemcpyHostToDevice))
90 |
91 | call launch(x_d, y_d, inside_d, n)
92 |
93 | call hipCheck(hipMemcpy(c_loc(inside), inside_d, Sbytes, hipMemcpyDeviceToHost))
94 |
95 | gpu_pi = 4.0 * real(inside) / real(n)
96 |
97 | deallocate(x, y)
98 | end function gpu_pi
99 |
100 | end program rand_test
101 |
--------------------------------------------------------------------------------
/hipfort/saxpy/cuda/main.cuf:
--------------------------------------------------------------------------------
1 | module mathOps
2 | contains
3 | attributes(global) subroutine saxpy(x, y, a)
4 | implicit none
5 | real :: x(:), y(:)
6 | real, value :: a
7 | integer :: i, n
8 | n = size(x)
9 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
10 | if (i <= n) y(i) = y(i) + a*x(i)
11 | end subroutine saxpy
12 | end module mathOps
13 |
14 | program testSaxpy
15 | use mathOps
16 | use cudafor
17 | implicit none
18 | integer, parameter :: N = 40000
19 | real :: x(N), y(N), a
20 | real, device :: x_d(N), y_d(N)
21 | type(dim3) :: grid, tBlock
22 |
23 | tBlock = dim3(256,1,1)
24 | grid = dim3(ceiling(real(N)/tBlock%x),1,1)
25 |
26 | x = 1.0; y = 2.0; a = 2.0
27 | x_d = x
28 | y_d = y
29 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
30 | y = y_d
31 | write(*,*) 'Max error: ', maxval(abs(y-4.0))
32 | end program testSaxpy
33 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/README.md:
--------------------------------------------------------------------------------
1 | # SAXPY using FORTRAN & HIPFORT
2 |
3 | Inspect the `saxpy` code in the present folder. The Fortran code follows the same logic as the HIP C code.
4 | First the data is created on the CPU. Then memory is allocated on the GPU and the data is transferred from the CPU to the GPU. When the transfer is completed, a kernel is executed to perform the work. In the end, the results of the computation are copied back to the CPU and processed further.
5 |
6 | **Note** Fortran compilers cannot compile HIP C code. The GPU code is located in a separate file, [hipsaxpy.cpp](hipsaxpy.cpp). The HIP kernel is launched via a C function which acts as a wrapper. Fortran calls this C wrapper using the `iso_c_binding` module.
7 |
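As a rough sketch (the declaration actually used in this exercise is in [main.f03](main.f03)), the Fortran side makes the wrapper callable by declaring an interface with `bind(c)` and passing the device pointers as `type(c_ptr)`; the explicit C kinds below are chosen for clarity, not taken from the exercise code:

```fortran
! Sketch of the interface to the C wrapper `launch` defined in hipsaxpy.cpp;
! see main.f03 for the version used in this exercise.
interface
  subroutine launch(y, x, a, n) bind(c)
    use iso_c_binding
    implicit none
    type(c_ptr), value :: y, x   ! device pointers obtained from hipMalloc
    real(c_float), value :: a    ! scalar passed by value
    integer(c_int), value :: n
  end subroutine
end interface
```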
8 | In this code all calls to the HIP API are done via HIPFORT. The exercise is to inspect and familiarize yourself with how the memory management (allocations and transfers) is done and how Fortran calls C functions using the `iso_c_binding` module.
9 | If you have previous experience with CUDA Fortran you can compare it to the equivalent code in the [cuda](../cuda) folder.
10 |
11 | In addition to the memory management, HIPFORT also provides bindings for the mathematical libraries running on GPUs. You can find examples of how various `hipxxx` & `rocxxx` libraries are called in `Fortran` programs in the [HIPFORT repository](https://github.com/ROCm/hipfort/tree/develop/test).
12 |
13 | The instructions for compilation are found in the [exercise-instructions page](../../../exercise-instructions.md#hipfort-on-lumi).
14 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/hipsaxpy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cmath>
3 |
4 | __global__ void saxpy(float *y, float *x, float a, int n)
5 | {
6 | size_t i = blockDim.x * blockIdx.x + threadIdx.x;
7 | if (i < n) y[i] = y[i] + a*x[i];
8 | }
9 |
10 |
11 | extern "C"
12 | {
13 | void launch(float *dout, float *da, float db, int N)
14 | {
15 |
16 | dim3 tBlock(256,1,1);
17 | dim3 grid(ceil((float)N/tBlock.x),1,1);
18 |
19 | hipLaunchKernelGGL((saxpy), grid, tBlock, 0, 0, dout, da, db, N);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/main.f03:
--------------------------------------------------------------------------------
1 | program testSaxpy
2 | use iso_c_binding
3 | use hipfort
4 | use hipfort_check
5 |
6 | implicit none
7 | interface
8 | subroutine launch(y,x,b,N) bind(c)
9 | use iso_c_binding
10 | implicit none
11 | type(c_ptr),value :: y,x
12 | integer, value :: N
13 | real, value :: b
14 | end subroutine
15 | end interface
16 |
17 | type(c_ptr) :: dx = c_null_ptr
18 | type(c_ptr) :: dy = c_null_ptr
19 | integer, parameter :: N = 40000
20 | integer, parameter :: bytes_per_element = 4
21 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element
22 | real, allocatable,target,dimension(:) :: x, y
23 |
24 |
25 | real, parameter :: a=2.0
26 | real :: x_d(N), y_d(N)
27 |
28 | call hipCheck(hipMalloc(dx,Nbytes))
29 | call hipCheck(hipMalloc(dy,Nbytes))
30 |
31 | allocate(x(N))
32 | allocate(y(N))
33 |
34 | x = 1.0;y = 2.0
35 |
36 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice))
37 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice))
38 |
39 | call launch(dy, dx, a, N)
40 |
41 | call hipCheck(hipDeviceSynchronize())
42 |
43 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost))
44 |
45 | write(*,*) 'Max error: ', maxval(abs(y-4.0))
46 |
47 | call hipCheck(hipFree(dx))
48 | call hipCheck(hipFree(dy))
49 |
50 | deallocate(x)
51 | deallocate(y)
52 |
53 | end program testSaxpy
54 |
--------------------------------------------------------------------------------
/kernels/01-hello-world/README.md:
--------------------------------------------------------------------------------
1 | # Hello world with HIP
2 |
3 | Compile and run a simple HIP test program provided as `hello.cpp`.
4 |
5 | Please follow the system-specific instructions provided in the
6 | [exercise instructions](../../exercise-instructions.md).
7 |
--------------------------------------------------------------------------------
/kernels/01-hello-world/hello.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | int main(void)
5 | {
6 | int count = 0;
7 | auto result = hipGetDeviceCount(&count);
8 |
9 | int device = 0;
10 | result = hipGetDevice(&device);
11 |
12 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
13 |
14 | return 0;
15 | }
16 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/README.md:
--------------------------------------------------------------------------------
1 | # Error checking with HIP
2 |
3 | Your task is to find a bug in the program, by implementing a HIP API error checking function.
4 | It's a good practice to wrap the API calls with the error checker to find any issues early.
5 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/error-checking.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
5 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
6 | static_assert(false, "TODO: remove me and implement the error checking. "
7 | "(Hint: check the slides)");
8 | }
9 |
10 | int main() {
11 | // There's a bug in this program, find out what it is by implementing the
12 | // function above, and correct it
13 | int count = 0;
14 | HIP_ERRCHK(hipGetDeviceCount(&count));
15 | HIP_ERRCHK(hipSetDevice(count));
16 |
17 | int device = 0;
18 | HIP_ERRCHK(hipGetDevice(&device));
19 |
20 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
21 |
22 | return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/solution/error-checking.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | /* HIP error handling macro */
5 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
6 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
7 | if (result != hipSuccess) {
8 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
9 | line);
10 | exit(EXIT_FAILURE);
11 | }
12 | }
13 |
14 | int main() {
15 | int count = 0;
16 | HIP_ERRCHK(hipGetDeviceCount(&count));
17 | // When setting the device, the argument must be 0 <= arg < #devices
18 | // See
19 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___device.html#ga43c1e7f15925eeb762195ccb5e063eae
20 | // for the API
21 | HIP_ERRCHK(hipSetDevice(count - 1));
22 |
23 | int device = 0;
24 | HIP_ERRCHK(hipGetDevice(&device));
25 |
26 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
27 |
28 | return 0;
29 | }
30 |
--------------------------------------------------------------------------------
/kernels/03-kernel-saxpy/README.md:
--------------------------------------------------------------------------------
1 | # Kernel: saxpy
2 |
3 | Write a device kernel that calculates the single precision BLAS operation
4 | **saxpy**, i.e. `y = a * x + y`.
5 |
6 | - Initialise the vectors `x` and `y` with some values on the CPU
7 | - Perform the computation on the host to generate reference values
8 | - Allocate memory on the device for `x` and `y`
9 | - Copy the host `x` to device `x`, and host `y` to device `y`
10 | - Perform the computation on the device
11 | - Copy the device `y` back to the host `y`
12 | - Confirm the correctness: Is the host computed `y` equal to the device computed `y`?
13 |
14 | You may start from a skeleton code provided in [saxpy.cpp](saxpy.cpp).
15 |
--------------------------------------------------------------------------------
/kernels/03-kernel-saxpy/saxpy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <math.h>
3 | #include <stdio.h>
4 | #include <vector>
5 |
6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
7 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
8 | if (result != hipSuccess) {
9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
10 | line);
11 | exit(EXIT_FAILURE);
12 | }
13 | }
14 |
15 | /*
16 | TODO: add a device kernel that calculates y = a * x + y for vectors x, y and
17 | constant a
18 |
19 | Hints:
20 |
21 | What attribute(s) do you need to add on a kernel declaration?
22 | - __device__?
23 | - __global__?
24 | - __shared__?
25 | - no attribute(s) needed?
26 |
27 | What is the return type of a kernel?
28 | - int?
29 | - float?
30 | - void?
31 | - depends on the kernel?
32 |
33 | What data do you need in the kernel to compute y = a * x + y, for vectors x, y,
34 | and constant a?
35 |
36 | What built-in variables can you use to calculate the (global) index for a
37 | thread?
38 | - Is threadIdx enough or do you need blockIdx, blockDim, gridDim?
39 | - Is the problem one or multi-dimensional?
40 | - Remember the grid, block, thread hierarchy and the launch parameters
41 | */
42 |
43 | int main() {
44 | // Use HIP_ERRCHK to help you find any errors you make with the API calls
45 |
46 | // Read the HIP Runtime API documentation to help you with the API calls:
47 | // Ctrl-click this to open it in a browser:
48 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___memory.html
49 |
50 | static constexpr size_t n = 1000000;
51 | static constexpr size_t num_bytes = sizeof(float) * n;
52 | static constexpr float a = 3.4f;
53 |
54 | std::vector<float> x(n);
55 | std::vector<float> y(n);
56 | std::vector<float> y_ref(n);
57 |
58 | // Initialise data and calculate reference values on CPU
59 | for (size_t i = 0; i < n; i++) {
60 | x[i] = sin(i) * 2.3;
61 | y[i] = cos(i) * 1.1;
62 | y_ref[i] = a * x[i] + y[i];
63 | }
64 |
65 | // TODO: Allocate + copy initial values
66 | // - hipMalloc, hipMemcpy
67 |
68 | // TODO: Define grid dimensions + launch the device kernel
69 | // int/dim3 threads = ...
70 | // int/dim3 blocks = ...
71 | // kernelName<<>>(arguments);
72 |
73 | // TODO: Copy results back to CPU
74 | // - hipMemcpy
75 |
76 | // TODO: Free device memory
77 | // - hipFree
78 |
79 | // Check the result of the GPU computation
80 | printf("reference: %f %f %f %f ... %f %f\n", y_ref[0], y_ref[1], y_ref[2],
81 | y_ref[3], y_ref[n - 2], y_ref[n - 1]);
82 | printf(" result: %f %f %f %f ... %f %f\n", y[0], y[1], y[2], y[3],
83 | y[n - 2], y[n - 1]);
84 |
85 | float error = 0.0;
86 | static constexpr float tolerance = 1e-6f;
87 | for (size_t i = 0; i < n; i++) {
88 | const auto diff = abs(y_ref[i] - y[i]);
89 | if (diff > tolerance)
90 | error += diff;
91 | }
92 | printf("total error: %f\n", error);
93 | printf(" reference: %f at (42)\n", y_ref[42]);
94 | printf(" result: %f at (42)\n", y[42]);
95 |
96 | return 0;
97 | }
98 |
--------------------------------------------------------------------------------
/kernels/04-kernel-copy2d/README.md:
--------------------------------------------------------------------------------
1 | # Kernel: copy2d
2 |
3 | Write a device kernel that performs the double precision BLAS operation
4 | **dcopy**, i.e. `y = x` using GPU threads in a 2D grid.
5 |
6 | - Assume that the vectors `x` and `y` are used to store a 400x600 matrix (in row-major format)
7 | - Initialise the matrix `x` with some values on the CPU
8 | - Allocate memory for `x` and `y` on the device
9 | - Copy the host `x` to the device `x`
10 | - Perform the operation on the device using a 2D kernel
11 | - Copy device `y` to host `y`
12 | - Compare host `x` to host `y`
13 |
14 | Are the values of `x` and `y` equal?
15 |
16 | You may start from a skeleton code provided in [copy2d.cpp](copy2d.cpp).
17 |
--------------------------------------------------------------------------------
/kernels/04-kernel-copy2d/copy2d.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <math.h>
3 | #include <stdio.h>
4 | #include <vector>
5 |
6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
7 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
8 | if (result != hipSuccess) {
9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
10 | line);
11 | exit(EXIT_FAILURE);
12 | }
13 | }
14 |
15 | // Copy all elements using threads in a 2D grid
16 | __global__ void copy2d(/*TODO: add arguments*/) {
17 | // TODO: compute row and col using
18 | // - threadIdx.x, threadIdx.y
19 | // - blockIdx.x, blockIdx.y
20 | // - blockDim.x, blockDim.y
21 |
22 | // TODO: Make sure there's no out-of-bounds access
23 | // row must be < number of rows
24 | // col must be < number of columns
25 |
26 | // We're computing 1D index from a 2D index and copying from src to dst
27 | const size_t index = row * num_cols + col;
28 | dst[index] = src[index];
29 | }
30 |
31 | int main() {
32 | static constexpr size_t num_cols = 600;
33 | static constexpr size_t num_rows = 400;
34 | static constexpr size_t num_values = num_cols * num_rows;
35 | static constexpr size_t num_bytes = sizeof(double) * num_values;
36 | std::vector<double> x(num_values);
37 | std::vector<double> y(num_values, 0.0);
38 |
39 | // Initialise data
40 | for (size_t i = 0; i < num_values; i++) {
41 | x[i] = static_cast<double>(i) / 1000.0;
42 | }
43 |
44 | // TODO: Allocate + copy initial values to GPU
45 |
46 | // TODO: Define grid dimensions
47 | // Use dim3 structure for threads and blocks
48 |
49 | // TODO: launch the device kernel
50 |
51 | // TODO: Copy results back to the CPU vector y
52 |
53 | // TODO: Free device memory
54 |
55 | // Check result of computation on the GPU
56 | double error = 0.0;
57 | for (size_t i = 0; i < num_values; i++) {
58 | error += abs(x[i] - y[i]);
59 | }
60 |
61 | printf("total error: %f\n", error);
62 | printf("  reference: %f at (42,42)\n", x[42 * num_cols + 42]);
63 | printf("     result: %f at (42,42)\n", y[42 * num_cols + 42]);
64 |
65 | return 0;
66 | }
67 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/README.md:
--------------------------------------------------------------------------------
1 | # Host-device lambda functions and general kernels
2 |
3 | The purpose of this exercise is to understand how host-device lambda functions work and how to create a general GPU kernel. Furthermore, differentiating between host and device code paths using the ```__HIP_DEVICE_COMPILE__``` macro is demonstrated.
4 |
5 | The task is to define two host-device lambda functions that can be passed to either the host or the device kernel. Both lambda functions require a single integer argument, and the intended locations of these definitions are indicated by `#error`. The first lambda function does not need to capture anything, but must call the predefined function ```helloFromThread(const int i)```. The second lambda function must capture the value of ```pi```, multiply the thread index by pi, and print this value from each thread.
6 |
7 | IMPORTANT NOTE! When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda`
8 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/lambda.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <math.h>
4 |
5 | /* Blocksize is small because we are printing from all threads */
6 | #define BLOCKSIZE 4
7 |
8 | /* CPU loop execution */
9 | template <typename Lambda>
10 | void cpuKernel(Lambda lambda, const int loop_size) {
11 | for(int i = 0; i < loop_size; i++){
12 | lambda(i);
13 | }
14 | }
15 |
16 | /* GPU loop execution */
17 | template <typename Lambda>
18 | __global__ void gpuKernel(Lambda lambda, const int loop_size)
19 | {
20 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
21 | if(i < loop_size)
22 | {
23 | lambda(i);
24 | }
25 | }
26 |
27 | /* Check if this function is running on CPU or GPU */
28 | __host__ __device__ void helloFromThread(const int i) {
29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU
30 | printf("Hello from GPU! I'm thread number %d\n", i);
31 | #else // If running on CPU
32 | printf("Hello from CPU! I'm thread number %d\n", i);
33 | #endif
34 | }
35 |
36 |
37 | /* The main function */
38 | int main()
39 | {
40 | // Set the problem dimensions
41 | const int loop_size = BLOCKSIZE;
42 | const int blocksize = BLOCKSIZE;
43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
44 |
45 | // Define lambda1 function with 1 integer argument,
46 | // the lambda must call helloFromThread with that argument
47 | # error put the first lambda function definition here
48 |
49 | // Run lambda1 on the CPU device
50 | cpuKernel(lambda1, loop_size);
51 |
52 | // Run lambda1 on the GPU device
53 | gpuKernel<<<gridsize, blocksize>>>(lambda1, loop_size);
54 | hipStreamSynchronize(0);
55 |
56 | // Store value of pi in pi
57 | double pi = M_PI;
58 |
59 | // Define lambda2 that captures pi (use [=] to capture by value),
60 | // and prints out the results for i * pi from each thread
61 | # error put the second lambda function definition here
62 |
63 | // Run lambda2 on the GPU device
64 | gpuKernel<<<gridsize, blocksize>>>(lambda2, loop_size);
65 | hipStreamSynchronize(0);
66 | }
67 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/solution/lambda.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <math.h>
4 |
5 | /* Blocksize is small because we are printing from all threads */
6 | #define BLOCKSIZE 4
7 |
8 | /* CPU loop execution */
9 | template <typename Lambda>
10 | void cpuKernel(Lambda lambda, const int loop_size) {
11 | for(int i = 0; i < loop_size; i++){
12 | lambda(i);
13 | }
14 | }
15 |
16 | /* GPU loop execution */
17 | template <typename Lambda>
18 | __global__ void gpuKernel(Lambda lambda, const int loop_size)
19 | {
20 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
21 | if(i < loop_size)
22 | {
23 | lambda(i);
24 | }
25 | }
26 |
27 | /* Check if this function is running on CPU or GPU */
28 | __host__ __device__ void helloFromThread(const int i) {
29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU
30 | printf("Hello from GPU! I'm thread number %d\n", i);
31 | #else // If running on CPU
32 | printf("Hello from CPU! I'm thread number %d\n", i);
33 | #endif
34 | }
35 |
36 |
37 | /* The main function */
38 | int main()
39 | {
40 | // Set the problem dimensions
41 | const int loop_size = BLOCKSIZE;
42 | const int blocksize = BLOCKSIZE;
43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
44 |
45 | // Define lambda1 function with 1 integer argument,
46 | // the lambda must call helloFromThread with that argument
47 | auto lambda1 = [] __host__ __device__ (const int i)
48 | {
49 | helloFromThread(i);
50 | };
51 |
52 | // Run lambda1 on the CPU device
53 | cpuKernel(lambda1, loop_size);
54 |
55 | // Run lambda1 on the GPU device
56 | gpuKernel<<<gridsize, blocksize>>>(lambda1, loop_size);
57 | hipStreamSynchronize(0);
58 |
59 | // Store value of pi in pi
60 | double pi = M_PI;
61 |
62 | // Define lambda2 that captures pi (use [=] to capture by value),
63 | // and prints out the results for i * pi from each thread
64 | auto lambda2 = [=] __host__ __device__ (const int i)
65 | {
66 | printf("i * pi = %f \n", (double)i * pi);
67 | };
68 |
69 | // Run lambda2 on the GPU device
70 | gpuKernel<<<gridsize, blocksize>>>(lambda2, loop_size);
71 | hipStreamSynchronize(0);
72 | }
73 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/README.md:
--------------------------------------------------------------------------------
1 | # Reductions with host-device lambdas and hipCUB
2 |
3 | The purpose of this exercise is to use host-device lambda functions and the hipCUB library to create an efficient reduction kernel. The locations of the missing parts of the kernel code are indicated by `#error`. The CUB library documentation may be useful, particularly [this example](https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html#a7632bd9c8950dd6a3528ca99fa3f0890). Note that hipCUB uses the namespace "hipcub" instead of the "cub" namespace used in the original CUDA library.
4 |
5 | IMPORTANT NOTE! When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda`
6 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/reduction.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include "../../third-party/hipcub/hipcub.hpp"
4 |
5 | /* Blocksize is divisible by the warp size */
6 | #define BLOCKSIZE 64
7 |
8 | /* CPU reduction loop */
9 | template <typename Lambda>
10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) {
11 | // Evaluate the loop body
12 | for(int i = 0; i < loop_size; i++){
13 | loop_body(i, *sum);
14 | }
15 | }
16 |
17 | /* GPU reduction kernel */
18 | template <typename Lambda>
19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum)
20 | {
21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int
22 | #error add here hipcub typedef
23 |
24 | // Use shared memory for the hipcub library temporary storage
25 | #error define the shared memory used by the hipcub library here
26 |
27 | // Get thread index
28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 |
30 | // Local storage for the thread summation value
31 | int thread_sum = 0;
32 |
33 | // Evaluate the loop body, the summation value is stored in thread_sum
34 | if(idx < loop_size)
35 | loop_body(idx, thread_sum);
36 |
37 | // Compute the block-wide sum (aggregate) for the first thread of each block
38 | int aggregate;
39 | #error call the hipcub function to perform block-wide sum and store the result into 'aggregate'
40 |
41 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics
42 | if(threadIdx.x == 0)
43 | #error use HIP native atomicAdd() function to sum the 'aggregate' of each block into 'sum'
44 | }
45 |
46 | /* Wrapper for the GPU reduction kernel */
47 | template <typename Lambda>
48 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) {
49 |
50 | // Set block and grid dimensions
51 | const uint blocksize = BLOCKSIZE;
52 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize;
53 |
54 | // Create GPU buffer for the reduction variable
55 | int* d_buf;
56 | hipMalloc(&d_buf, sizeof(int));
57 |
58 | // Launch the reduction kernel
59 | reduction_kernel<<<gridsize, blocksize>>>(loop_body, loop_size, d_buf);
60 | hipStreamSynchronize(0);
61 |
62 | // Copy reduction variable back to host from the GPU buffer
63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost);
64 | hipFree(d_buf);
65 | }
66 |
67 |
68 | /* The main function */
69 | int main()
70 | {
71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn'
72 | const int tn = 1000;
73 |
74 | // Calculate the triangular number on the GPU and store it in sum_gpu
75 | int sum_gpu = 0;
76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){
77 | int thread_idx = i;
78 | sum += thread_idx;
79 | }, &sum_gpu);
80 |
81 | // Calculate the triangular number on the CPU and store it in sum_cpu
82 | int sum_cpu = 0;
83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){
84 | int thread_idx = i;
85 | sum += thread_idx;
86 | }, &sum_cpu);
87 |
88 | // Check that the results match
89 | if(sum_gpu == sum_cpu)
90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu);
91 | else
92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu);
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/solution/reduction.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include "../../../third-party/hipcub/hipcub.hpp"
4 |
5 | /* Blocksize is divisible by the warp size */
6 | #define BLOCKSIZE 64
7 |
8 | /* CPU reduction loop */
9 | template <typename Lambda>
10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) {
11 | // Evaluate the loop body
12 | for(int i = 0; i < loop_size; i++){
13 | loop_body(i, *sum);
14 | }
15 | }
16 |
17 | /* GPU reduction kernel */
18 | template <typename Lambda>
19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum)
20 | {
21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int
22 | typedef hipcub::BlockReduce<int, BLOCKSIZE> BlockReduce;
23 |
24 | // Use shared memory for the hipcub library temporary storage
25 | __shared__ typename BlockReduce::TempStorage temp_storage;
26 |
27 | // Get thread index
28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 |
30 | // Local storage for the thread summation value
31 | int thread_sum = 0;
32 |
33 | // Evaluate the loop body, the summation value is stored in thread_sum
34 | if(idx < loop_size)
35 | loop_body(idx, thread_sum);
36 |
37 | // Compute the block-wide sum (aggregate) for the first thread of each block
38 | int aggregate = BlockReduce(temp_storage).Sum(thread_sum);
39 |
40 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics
41 | if(threadIdx.x == 0)
42 | atomicAdd(sum, aggregate);
43 | }
44 |
45 | /* Wrapper for the GPU reduction kernel */
46 | template <typename Lambda>
47 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) {
48 |
49 | // Set block and grid dimensions
50 | const uint blocksize = BLOCKSIZE;
51 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize;
52 |
53 | // Create GPU buffer for the reduction variable
54 | int* d_buf;
55 | hipMalloc(&d_buf, sizeof(int));
56 | hipMemcpy(d_buf, sum, sizeof(int), hipMemcpyHostToDevice);
57 |
58 | // Launch the reduction kernel
59 | reduction_kernel<<<gridsize, blocksize>>>(loop_body, loop_size, d_buf);
60 | hipStreamSynchronize(0);
61 |
62 | // Copy reduction variable back to host from the GPU buffer
63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost);
64 | hipFree(d_buf);
65 | }
66 |
67 |
68 | /* The main function */
69 | int main()
70 | {
71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn'
72 | const int tn = 1000;
73 |
74 | // Calculate the triangular number on the GPU and store it in sum_gpu
75 | int sum_gpu = 0;
76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){
77 | int thread_idx = i;
78 | sum += thread_idx;
79 | }, &sum_gpu);
80 |
81 | // Calculate the triangular number on the CPU and store it in sum_cpu
82 | int sum_cpu = 0;
83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){
84 | int thread_idx = i;
85 | sum += thread_idx;
86 | }, &sum_cpu);
87 |
88 | // Check that the results match
89 | if(sum_gpu == sum_cpu)
90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu);
91 | else
92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu);
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/Makefile:
--------------------------------------------------------------------------------
1 | default: build
2 | echo "Start Build"
3 |
4 | # Accelerator architecture
5 | ifeq ($(CUDA),1)
6 |
7 | CXX = nvcc
8 | CXXDEFS = -DHAVE_CUDA
9 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80
10 | EXE = bessel
11 |
12 | else ifeq ($(HIP),CUDA)
13 |
14 | CXX = hipcc
15 | CXXDEFS = -DHAVE_HIP -I$(shell pwd)/../../third-party/hiprand -I$(shell pwd)/../../third-party
16 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80
17 | EXE = bessel
18 |
19 | else ifeq ($(HIP),ROCM)
20 |
21 | CXX = hipcc
22 | CXXDEFS = -DHAVE_HIP -I/appl/eap/opt/rocm-4.3.1/hiprand/include/ -I/appl/eap/opt/rocm-4.3.1/rocrand/include/
23 | CXXFLAGS = -g -O3 --offload-arch=gfx90a
24 | FILETYPE = .cpp
25 | EXE = bessel
26 |
27 | else
28 |
29 | CXX = g++
30 | CXXFLAGS = -g -O3
31 | EXE = bessel
32 |
33 | endif
34 |
35 | # Message passing protocol
36 | ifeq ($(MPI),1)
37 |
38 | MPICXX = mpicxx
39 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(CXX) -DHAVE_MPI $(CXXDEFS) $(CXXFLAGS)'
40 | LDFLAGS += -L/appl/spack/install-tree/gcc-9.1.0/openmpi-4.1.1-vonyow/lib
41 | LIBS += -lmpi
42 |
43 | else
44 |
45 | MPICXX = $(CXX)
46 | MPICXXFLAGS = $(CXXDEFS) $(CXXFLAGS)
47 |
48 | endif
49 |
50 | SRC_PATH = src/
51 | SOURCES = $(shell ls src/*.cpp)
52 |
53 | OBJ_PATH = src/
54 | OBJECTS = $(shell for file in $(SOURCES);\
55 | do echo -n $$file | sed -e "s/\(.*\)\.cpp/\1\.o/";echo -n " ";\
56 | done)
57 |
58 | build: $(EXE)
59 |
60 | depend:
61 | makedepend $(CXXDEFS) -m $(SOURCES)
62 |
63 | test: $(EXE)
64 | ./$(EXE)
65 |
66 | $(EXE): $(OBJECTS)
67 | $(CXX) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $(EXE)
68 |
69 | clean: $(CLEAN)
70 | rm -f $(OBJECTS) $(EXE)
71 |
72 | # Compilation rules
73 | $(OBJ_PATH)%.o: $(SRC_PATH)%.cpp
74 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c $< -o $(SRC_PATH)$(notdir $@)
75 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/README.md:
--------------------------------------------------------------------------------
1 | # Monte Carlo simulation with hipRAND library
2 |
3 | ## Exercise description
4 |
5 | The HIP header file [devices_hip.h](src/devices_hip.h) has disappeared from the [src](src/) folder. Fortunately, the respective CUDA header, [devices_cuda.h](src/devices_cuda.h), is still present. The task is to use hipify tools to translate [devices_cuda.h](src/devices_cuda.h) to [devices_hip.h](src/devices_hip.h). What does the hipify tool translate? Is there anything that is not translated properly? You may compare the result with the original HIP header named [solution.h](src/solution.h). Instructions to compile the code with HIP are at the bottom.
6 |
7 | IMPORTANT NOTE on hipify-clang module usage on Puhti! Load hipify-clang to hipify CUDA code by
8 | ```
9 | ml hipify-clang
10 | ```
11 | and after loading and using hipify-clang, you must do the following before trying to compile any HIP code
12 | ```
13 | ml purge
14 | ml hip
15 | ```
16 | Otherwise the compilation fails (you cannot compile HIP while having hipify-clang module loaded).
17 | ## Code description
18 |
19 | This example uses the Monte Carlo method to simulate the value of Bessel's correction that minimizes the root mean squared error in the calculation of the sample standard deviation and variance for the chosen sample and population sizes. The sample standard deviation is typically calculated as $$s = \sqrt{\frac{1}{N - \beta}\sum_{i=1}^{N}(x_i - \bar{x})^2}$$ where $$\beta = 1.$$ The simulation calculates the root mean squared error for different values of $\beta$.
20 |
21 | The implementation uses a special construct for the parallel loops in [bessel.cpp](src/bessel.cpp) which is based on a lambda function, an approach similar to some accelerator frameworks such as SYCL, Kokkos, RAJA, etc. The approach allows conditional compilation of the loops for multiple architectures while keeping the source code clean and readable. An example of the usage of cuRAND and hipRAND random number generation libraries inside a GPU kernel are given in [devices_cuda.h](src/devices_cuda.h) and [devices_hip.h](src/devices_hip.h).
22 |
23 | The code can be conditionally compiled for either CUDA, HIP, or HOST execution with or without MPI. The correct definitions for each accelerator backend option are selected in [comms.h](src/comms.h) by choosing the respective header file. The compilation instructions are shown below:
24 |
25 | ```
26 | // Compile to run sequentially on CPU
27 | make
28 |
29 | // Compile to run parallel on CPUs with MPI
30 | make MPI=1
31 |
32 | // Compile to run parallel on GPU with CUDA
33 | make CUDA=1
34 |
35 | // Compile to run parallel on GPU with HIP
36 | make HIP=CUDA
37 |
38 | // Compile to run parallel on many GPUs with HIP and MPI
39 | make HIP=CUDA MPI=1
40 |
41 | ```
42 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/comms.cpp:
--------------------------------------------------------------------------------
1 | #include "comms.h"
2 |
3 | #if defined(HAVE_MPI)
4 |
5 | namespace comms{
6 |
7 | static int MPI_INITIALIZED = 0;
8 |
9 | int get_procs(){
10 | int comm_size = 1;
11 | if (MPI_INITIALIZED == 1){
12 | MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
13 | }
14 | return comm_size;
15 | }
16 |
17 | int get_rank(){
18 | int proc_rank = 0;
19 | if (MPI_INITIALIZED == 1){
20 | MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
21 | }
22 | return proc_rank;
23 | }
24 |
25 | int get_node_rank(){
26 | int node_rank = 0;
27 | if (MPI_INITIALIZED == 1){
28 | MPI_Comm node_comm = MPI_COMM_NULL;
29 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
30 |
31 | MPI_Comm_rank(node_comm, &node_rank);
32 | MPI_Comm_free(&node_comm);
33 | }
34 | return node_rank;
35 | }
36 |
37 | int get_node_procs(){
38 | int node_comm_size = 1;
39 | if (MPI_INITIALIZED == 1){
40 | MPI_Comm node_comm = MPI_COMM_NULL;
41 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
42 |
43 | MPI_Comm_size(node_comm, &node_comm_size);
44 | MPI_Comm_free(&node_comm);
45 | }
46 | return node_comm_size;
47 | }
48 |
49 | void barrier_procs(){
50 | // Synchronize across all MPI processes
51 | if (MPI_INITIALIZED == 1)
52 | MPI_Barrier(MPI_COMM_WORLD);
53 | }
54 |
55 | void reduce_procs(float *sbuf, int count){
56 | if (MPI_INITIALIZED == 1){
57 | float* rbuf;
58 | if(get_rank() == 0)
59 | rbuf = (float*)malloc(count * sizeof(float));
60 | MPI_Reduce(sbuf, rbuf, count, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
61 | if(get_rank() == 0){
62 | memcpy(sbuf, rbuf, count * sizeof(float));
63 | free((void*)rbuf);
64 | }
65 | }
66 | }
67 |
68 | void init_procs(int *argc, char **argv[]){
69 | if(*argc > 1){
70 | MPI_Init(argc, argv);
71 | MPI_INITIALIZED = 1;
72 | }
73 | // Some device backends require an initialization
74 | devices::init(get_node_rank());
75 | }
76 |
77 | void finalize_procs(){
78 | // Some device backends also require a finalization
79 | devices::finalize(get_rank());
80 |
81 | // Finalize MPI if it is used
82 | if (MPI_INITIALIZED == 1)
83 | MPI_Finalize();
84 | }
85 | }
86 |
87 | #else
88 |
89 | namespace comms{
90 | int get_procs(){
91 | int comm_size = 1;
92 | return comm_size;
93 | }
94 |
95 | int get_rank(){
96 | int proc_rank = 0;
97 | return proc_rank;
98 | }
99 |
100 | int get_node_rank(){
101 | int node_rank = 0;
102 | return node_rank;
103 | }
104 |
105 | int get_node_procs(){
106 | int node_comm_size = 1;
107 | return node_comm_size;
108 | }
109 |
110 | void barrier_procs(){
111 | }
112 |
113 | void reduce_procs(float *sbuf, int count){
114 | }
115 |
116 | void init_procs(int *argc, char **argv[]){
117 | // Some device backends require an initialization
118 | devices::init(get_node_rank());
119 | }
120 |
121 | void finalize_procs(){
122 | // Some device backends also require a finalization
123 | devices::finalize(get_rank());
124 | }
125 | }
126 |
127 | #endif
128 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/comms.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_COMMS_H
2 | #define BESSEL_COMMS_H
3 |
4 | #if defined(HAVE_MPI)
5 | #include "mpi.h"
6 | #endif
7 |
8 | #if defined(HAVE_CUDA)
9 | #include "devices_cuda.h"
10 | #elif defined(HAVE_HIP)
11 | #include "devices_hip.h"
12 | #else
13 | #include "devices_host.h"
14 | #endif
15 |
16 | namespace comms{
17 | int get_procs();
18 | int get_rank();
19 | int get_node_procs();
20 | int get_node_rank();
21 |
22 | void barrier_procs();
23 | void reduce_procs(float *sbuf, int count);
24 |
25 | void init_procs(int *argc, char **argv[]);
26 | void finalize_procs();
27 | }
28 |
29 | #endif // !BESSEL_COMMS_H
30 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/devices_cuda.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_CUDA_H
2 | #define BESSEL_DEVICES_CUDA_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <curand_kernel.h>
7 |
8 | #define CUDA_ERR(err) (cuda_error(err, __FILE__, __LINE__))
9 | inline static void cuda_error(cudaError_t err, const char *file, int line) {
10 | if (err != cudaSuccess) {
11 | printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
12 | exit(1);
13 | }
14 | }
15 |
16 | #define DEVICE_LAMBDA [=] __host__ __device__
17 |
18 | namespace devices
19 | {
20 | __forceinline__ static void init(int node_rank) {
21 | int num_devices = 0;
22 | CUDA_ERR(cudaGetDeviceCount(&num_devices));
23 | CUDA_ERR(cudaSetDevice(node_rank % num_devices));
24 | }
25 |
26 | __forceinline__ static void finalize(int rank) {
27 | printf("Rank %d, CUDA finalized.\n", rank);
28 | }
29 |
30 | __forceinline__ static void* allocate(size_t bytes) {
31 | void* ptr;
32 | CUDA_ERR(cudaMallocManaged(&ptr, bytes));
33 | return ptr;
34 | }
35 |
36 | __forceinline__ static void free(void* ptr) {
37 | CUDA_ERR(cudaFree(ptr));
38 | }
39 |
40 | __forceinline__ static void memcpy_d2d(void* dst, void* src, size_t bytes){
41 | CUDA_ERR(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
42 | }
43 |
44 | template <typename LambdaBody>
45 | __global__ static void cudaKernel(LambdaBody lambda, const int loop_size)
46 | {
47 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
48 | if(i < loop_size)
49 | {
50 | lambda(i);
51 | }
52 | }
53 |
54 | template <typename T>
55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) {
56 | const int blocksize = 64;
57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
58 | cudaKernel<<<gridsize, blocksize>>>(loop_body, loop_size);
59 | CUDA_ERR(cudaStreamSynchronize(0));
60 | }
61 |
62 | template <typename T>
63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){
64 | // Define this function depending on whether it runs on GPU or CPU
65 | #ifdef __CUDA_ARCH__
66 | atomicAdd(array_loc, value);
67 | #else
68 | *array_loc += value;
69 | #endif
70 | }
71 |
72 | template <typename T>
73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
74 |
75 | T var = 0;
76 | #ifdef __CUDA_ARCH__
77 | curandStatePhilox4_32_10_t state;
78 |
79 | // curand_init() reproduces the same random number with the same seed and seq
80 | curand_init(seed, seq, 0, &state);
81 |
82 | // curand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1
83 | var = stdev * curand_normal(&state) + mean;
84 | #endif
85 | return var;
86 | }
87 | }
88 |
89 | #endif // !BESSEL_DEVICES_CUDA_H
90 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/devices_host.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_HOST_H
2 | #define BESSEL_DEVICES_HOST_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <cmath>
8 |
9 | #define DEVICE_LAMBDA [=]
10 |
11 | namespace devices
12 | {
13 | inline static void init(int node_rank) {
14 | // Nothing needs to be done here
15 | }
16 |
17 | inline static void finalize(int rank) {
18 | printf("Rank %d, Host finalized.\n", rank);
19 | }
20 |
21 | inline static void* allocate(size_t bytes) {
22 | return malloc(bytes);
23 | }
24 |
25 | inline static void free(void* ptr) {
26 | ::free(ptr);
27 | }
28 |
29 | inline static void memcpy_d2d(void* dst, void* src, size_t bytes){
30 | memcpy(dst, src, bytes);
31 | }
32 |
33 | template <typename Lambda>
34 | inline static void parallel_for(int loop_size, Lambda loop_body) {
35 | for(int i = 0; i < loop_size; i++){
36 | loop_body(i);
37 | }
38 | }
39 |
40 | template <typename T>
41 | inline static void atomic_add(T *array_loc, T value){
42 | *array_loc += value;
43 | }
44 |
45 | template <typename T>
46 | inline static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
47 |
48 | // Re-seed the first case
49 | if(idx == 0){
50 | // Overflow is defined behavior with unsigned, and therefore ok here
51 | srand((unsigned int)seed + (unsigned int)seq);
52 | }
53 |
54 | // Use Box Muller algorithm to get a float from a normal distribution
55 | const float two_pi = 2.0f * M_PI;
56 | float u1 = (float) rand() / RAND_MAX;
57 | float u2 = (float) rand() / RAND_MAX;
58 | float factor = stdev * sqrtf (-2.0f * logf (u1));
59 | float trig_arg = two_pi * u2;
60 |
61 | // Box Muller algorithm produces two random normally distributed floats, z0 and z1
62 | float z0 = factor * cosf (trig_arg) + mean; // Need only one
63 | // float z1 = factor * sinf (trig_arg) + mean;
64 | return z0;
65 | }
66 | }
67 | #endif // !BESSEL_DEVICES_HOST_H
68 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/solution.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_HIP_H
2 | #define BESSEL_DEVICES_HIP_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <hiprand_kernel.h>
7 |
8 | #define HIP_ERR(err) (hip_error(err, __FILE__, __LINE__))
9 | inline static void hip_error(hipError_t err, const char *file, int line) {
10 | if (err != hipSuccess) {
11 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(err), file, line);
12 | exit(1);
13 | }
14 | }
15 |
16 | #define DEVICE_LAMBDA [=] __host__ __device__
17 |
18 | namespace devices
19 | {
20 | __forceinline__ static void init(int node_rank) {
21 | int num_devices = 0;
22 | HIP_ERR(hipGetDeviceCount(&num_devices));
23 | HIP_ERR(hipSetDevice(node_rank % num_devices));
24 | }
25 |
26 | __forceinline__ static void finalize(int rank) {
27 | printf("Rank %d, HIP finalized.\n", rank);
28 | }
29 |
30 | __forceinline__ static void* allocate(size_t bytes) {
31 | void* ptr;
32 | HIP_ERR(hipMallocManaged(&ptr, bytes));
33 | return ptr;
34 | }
35 |
36 | __forceinline__ static void free(void* ptr) {
37 | HIP_ERR(hipFree(ptr));
38 | }
39 |
40 | __forceinline__ static void memcpyd2d(void* dst, void* src, size_t bytes){
41 | HIP_ERR(hipMemcpy(dst, src, bytes, hipMemcpyDeviceToDevice));
42 | }
43 |
44 | template <typename LambdaBody>
45 | __global__ static void hipKernel(LambdaBody lambda, const int loop_size)
46 | {
47 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
48 | if(i < loop_size)
49 | {
50 | lambda(i);
51 | }
52 | }
53 |
54 | template <typename T>
55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) {
56 | const int blocksize = 64;
57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
58 | hipKernel<<<gridsize, blocksize>>>(loop_body, loop_size);
59 | HIP_ERR(hipStreamSynchronize(0));
60 | }
61 |
62 | template <typename T>
63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){
64 | // Define this function depending on whether it runs on GPU or CPU
65 | #if __HIP_DEVICE_COMPILE__
66 | atomicAdd(array_loc, value);
67 | #else
68 | *array_loc += value;
69 | #endif
70 | }
71 |
72 | template <typename T>
73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
74 |
75 | T var = 0;
76 | #if __HIP_DEVICE_COMPILE__
77 | hiprandStatePhilox4_32_10_t state;
78 |
79 | // hiprand_init() reproduces the same random number with the same seed and seq
80 | hiprand_init(seed, seq, 0, &state);
81 |
82 | // hiprand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1
83 | var = stdev * hiprand_normal(&state) + mean;
84 | #endif
85 | return var;
86 | }
87 | }
88 |
89 | #endif // !BESSEL_DEVICES_HIP_H
90 |
--------------------------------------------------------------------------------
/memory/01-prefetch/README.md:
--------------------------------------------------------------------------------
1 | # Memory management strategies
2 |
3 | The purpose of this exercise is to compare 6 different memory management
4 | strategies and their computational overhead. The following functions are called
5 | at the end of this file by the `main()` function:
6 |
7 | * The function `explicitMem()` represents a basic explicit memory management strategy
8 | * The function `explicitMemPinned()` represents an explicit memory management strategy with pinned host memory
9 | * The function `explicitMemNoCopy()` represents an explicit memory management strategy where the data can reside at GPU memory during an iterative loop (no recurring memory copies needed)
10 | * The function `unifiedMem()` represents a basic unified memory management strategy
11 | * The function `unifiedMemPrefetch()` represents a unified memory management strategy with prefetching
12 | * The function `unifiedMemNoCopy()` represents a unified memory management strategy where the data can reside at GPU memory during an iterative loop (no recurring memory copies needed)
13 |
14 | The task is to fill the missing function calls in the code indicated by lines beginning with `#error`, and followed by a descriptive instruction.
15 |
16 | ## Hints
17 |
18 | `int device;`
19 | `hipGetDevice(&device);`
20 |
21 | * prefetch:
22 | `hipMemPrefetchAsync((const void*) ptr, size_t count, int device, hipStream_t stream)`
23 |
24 | * prefetch to device on stream 0:
25 | `hipMemPrefetchAsync(A, size, device, 0);`
26 |
27 | * prefetch to host: use device `hipCpuDeviceId`
28 | `hipMemPrefetchAsync(A, size, hipCpuDeviceId, 0);`
29 |
30 | * Device memset:
31 | `hipMemset(A, 0, size);`
32 |
--------------------------------------------------------------------------------
/memory/02-mempools/README.md:
--------------------------------------------------------------------------------
1 | # The stream-ordered memory allocator and memory pools
2 |
3 | The purpose of this exercise is to compare different memory allocation strategies within a loop and to understand the performance impact of using or not using a memory pool. The following timed functions are called at the end of the source file by the `main()` function:
4 |
5 | * The function `noRecurringAlloc()` allocates memory only once, outside the loop
6 | * The function `recurringAllocNoMemPools()` allocates memory repeatedly within a loop
7 | * The function `recurringAllocMemPool()` obtains memory from a memory pool repeatedly within a loop
8 |
9 | The task is to fill the missing function calls in the code indicated by lines beginning with `#error`, and followed by a descriptive instruction.
10 |
--------------------------------------------------------------------------------
/memory/03-struct/README.md:
--------------------------------------------------------------------------------
1 | # Unified memory and structs
2 |
3 | The purpose of this exercise is to run a loop accessing a struct from host and
4 | device using different memory management strategies.
5 |
6 | The function `runHost()` demonstrates the execution on host and is already complete.
7 |
8 | The task is to fill the functions `runDeviceUnifiedMem()` and `runDeviceExplicitMem()` to do
9 | the same thing in parallel on the device. The latter function also requires explicitly specifying how the struct is copied to GPU memory, which is not always trivial. Therefore, you must also fill the GPU struct allocation and deallocation functions `createDeviceExample()` and `freeDeviceExample()`.
10 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/README.md:
--------------------------------------------------------------------------------
1 | # Peer to peer device access
2 |
3 | Benchmark memory copies with and without peer to peer device access using two
4 | GPUs.
5 |
6 | Skeleton code [p2pcopy.cpp](p2pcopy.cpp) tests peer-to-peer device access between two GPUs by doing a series of memory copies. The test is run both after calling `hipDeviceEnablePeerAccess()` and after calling `hipDeviceDisablePeerAccess()`. The program prints the calculated bandwidth and time for both cases. On a CUDA platform, there should be a difference in results, whereas on an AMD platform there is none.
7 |
8 | In order to make the code work, you need to fix the missing parts marked with TODOs.
9 |
10 | NOTE: Remember to request 2 GPUs when running this exercise.
11 | On Lumi, use
12 | ```
13 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
14 | ```
15 |
16 | When the code is running correctly, run it several times and observe the bandwidths. What are the bandwidths?
17 |
18 | Disable the DMA engine with `export HSA_ENABLE_SDMA=0` and then try the code again. What are the results now?
19 |
20 |
21 | On Mahti use
22 | ```
23 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
24 | ```
25 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/p2pcopy.cpp:
--------------------------------------------------------------------------------
1 | #include "stdio.h"
2 | #include "stdint.h"
3 | #include <hip/hip_runtime.h>
4 | #include <math.h>
5 |
6 |
7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) {
8 |
9 | // Enable peer access for GPUs?
10 | if (p2p)
11 | {
12 | // TODO: Enable peer access for GPU 0 and GPU 1
13 | }
14 |
15 | // Do a dummy copy without timing to remove the impact of the first one
16 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0
17 |
18 | // Do a series of timed P2P memory copies
19 | int N = 10;
20 | clock_t tStart = clock();
21 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0, repeat for N times to
22 | // get timings
23 | // TODO: After the memory copies, remember to synchronize the stream
24 | // before stopping the clock
25 | clock_t tStop = clock();
26 |
27 | // Calculate time and bandwidth
28 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC;
29 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s;
30 |
31 | // Disable peer access for GPUs?
32 | if (p2p) {
33 | // TODO: Disable peer access for GPU 0 and GPU 1
34 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
35 | bandwidth, time_s);
36 | } else {
37 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
38 | bandwidth, time_s);
39 | }
40 | }
41 |
42 |
43 | int main(int argc, char *argv[])
44 | {
45 | // Check that we have at least two GPUs
46 | int devcount;
47 | hipGetDeviceCount(&devcount);
48 | if(devcount < 2) {
49 | printf("Need at least two GPUs!\n");
50 | exit(EXIT_FAILURE);
51 | } else {
52 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount);
53 | }
54 |
55 | // Allocate memory for both GPUs
56 | int size = pow(2, 28);
57 | int gpu0 = 0, gpu1 = 1;
58 | int *dA_0, *dA_1;
59 | hipSetDevice(gpu0);
60 | hipMalloc((void**) &dA_0, size);
61 | hipSetDevice(gpu1);
62 | hipMalloc((void**) &dA_1, size);
63 |
64 | // Check peer accessibility between GPUs 0 and 1
65 | int peerAccess01;
66 | int peerAccess10;
67 | // TODO: Check for peer to peer accessibility from device 0 to 1
68 | // and from 1 to 0
69 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
70 | peerAccess01, gpu0, gpu1);
71 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
72 | peerAccess10, gpu1, gpu0);
73 |
74 | // Memcopy, P2P enabled
75 | if (peerAccess01 && peerAccess10)
76 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size);
77 |
78 | // Memcopy, P2P disabled
79 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size);
80 |
81 | // Deallocate device memory
82 | hipFree(dA_0);
83 | hipFree(dA_1);
84 | }
85 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/solution/p2pcopy.cpp:
--------------------------------------------------------------------------------
1 | #include "stdio.h"
2 | #include "stdint.h"
3 | #include <hip/hip_runtime.h>
4 | #include <math.h>
5 |
6 |
7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) {
8 |
9 | // Enable peer access for GPUs?
10 | if (p2p)
11 | {
12 | hipSetDevice(gpu0);
13 | hipDeviceEnablePeerAccess(gpu1, 0);
14 | hipSetDevice(gpu1);
15 | hipDeviceEnablePeerAccess(gpu0, 0);
16 | }
17 |
18 | // Do a dummy copy without timing to remove the impact of the first one
19 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault);
20 | hipMemcpy(dA_1, dA_0, size, hipMemcpyDefault);
21 |
22 | // Do a series of timed P2P memory copies
23 | int N = 10;
24 | clock_t tStart = clock();
25 | for (int i = 0; i < N; ++i) {
26 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault);
27 | }
28 | hipStreamSynchronize(0);
29 | clock_t tStop = clock();
30 |
31 | // Calculate time and bandwidth
32 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC;
33 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s;
34 |
35 | // Disable peer access for GPUs?
36 | if (p2p) {
37 | hipSetDevice(gpu0);
38 | hipDeviceDisablePeerAccess(gpu1);
39 | hipSetDevice(gpu1);
40 | hipDeviceDisablePeerAccess(gpu0);
41 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
42 | bandwidth, time_s);
43 | } else {
44 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
45 | bandwidth, time_s);
46 | }
47 | }
48 |
49 |
50 | int main(int argc, char *argv[])
51 | {
52 | // Check that we have at least two GPUs
53 | int devcount;
54 | hipGetDeviceCount(&devcount);
55 | if(devcount < 2) {
56 | printf("Need at least two GPUs!\n");
57 | exit(EXIT_FAILURE);
58 | } else {
59 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount);
60 | }
61 |
62 | // Allocate memory for both GPUs
63 | int size = pow(2, 28);
64 | int gpu0 = 0, gpu1 = 1;
65 | int *dA_0, *dA_1;
66 | hipSetDevice(gpu0);
67 | hipMalloc((void**) &dA_0, size);
68 | hipSetDevice(gpu1);
69 | hipMalloc((void**) &dA_1, size);
70 |
71 | // Check peer accessibility between GPUs 0 and 1
72 | int peerAccess01;
73 | int peerAccess10;
74 | hipDeviceCanAccessPeer(&peerAccess01, gpu0, gpu1);
75 | hipDeviceCanAccessPeer(&peerAccess10, gpu1, gpu0);
76 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
77 | peerAccess01, gpu0, gpu1);
78 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
79 | peerAccess10, gpu1, gpu0);
80 |
81 | // Memcopy, P2P enabled
82 | if (peerAccess01 && peerAccess10)
83 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size);
84 |
85 | // Memcopy, P2P disabled
86 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size);
87 |
88 | // Deallocate device memory
89 | hipFree(dA_0);
90 | hipFree(dA_1);
91 | }
92 |
--------------------------------------------------------------------------------
/multi-gpu/02-vector-sum/README.md:
--------------------------------------------------------------------------------
1 | # Vector sum on two GPUs without MPI
2 |
3 | Calculate the vector sum of two vectors (C = A + B) using two GPUs.
4 |
5 | Decompose the vectors into equal halves, copy data from host to device memory
6 | and launch a GPU kernel on each part asynchronously using streams. Copy the
7 | results back to the host to check for correctness. Add timing events to
8 | measure the time of execution.
9 |
10 | A skeleton code is provided in [vector-sum.cpp](vector-sum.cpp). Your task is to fill the locations indicated by
11 |
12 | ```// TODO:```
13 |
14 | NOTE: Remember to request 2 GPUs when running this exercise. On Lumi, use
15 | ```
16 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
17 | ```
18 | and on Mahti use
19 | ```
20 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
21 | ```
22 |
--------------------------------------------------------------------------------
/multi-gpu/02-vector-sum/vector-sum.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | // Data structure for storing decomposition information
6 | struct Decomp {
7 | int len; // length of the array for the current device
8 | int start; // start index for the array on the current device
9 | };
10 |
11 |
12 | /* HIP kernel for the addition of two vectors, i.e. C = A + B */
13 | __global__ void vector_add(double *C, const double *A, const double *B, int N)
14 | {
15 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
16 |
17 | // Do not try to access past the allocated memory
18 | if (idx < N) {
19 | C[idx] = A[idx] + B[idx];
20 | }
21 | }
22 |
23 |
24 | int main(int argc, char *argv[])
25 | {
26 | const int ThreadsInBlock = 128;
27 | double *dA[2], *dB[2], *dC[2];
28 | double *hA, *hB, *hC;
29 | int devicecount;
30 | int N = 100;
31 | hipEvent_t start, stop;
32 | hipStream_t strm[2];
33 | Decomp dec[2];
34 |
35 | // TODO: Check that we have two HIP devices available
36 |
37 | // Create timing events
38 | hipSetDevice(0);
39 | hipEventCreate(&start);
40 | hipEventCreate(&stop);
41 |
42 | // Allocate host memory
43 | // TODO: Allocate enough pinned host memory for hA, hB, and hC
44 | // to store N doubles each
45 |
46 | // Initialize host memory
47 | for(int i = 0; i < N; ++i) {
48 | hA[i] = 1.0;
49 | hB[i] = 2.0;
50 | }
51 |
52 | // Decomposition of data for each stream
53 | dec[0].len = N / 2;
54 | dec[0].start = 0;
55 | dec[1].len = N - N / 2;
56 | dec[1].start = dec[0].len;
57 |
58 | // Allocate memory for the devices and per device streams
59 | for (int i = 0; i < 2; ++i) {
60 | // TODO: Allocate enough device memory for dA[i], dB[i], dC[i]
61 | // to store dec[i].len doubles
62 | // TODO: Create a stream for each device
63 | }
64 |
65 | // Start timing
66 | hipSetDevice(0);
67 | hipEventRecord(start);
68 |
69 | /* Copy each decomposed part of the vectors from host to device memory
70 | and execute a kernel for each part.
71 | Note: one needs to use streams and asynchronous calls! Without this
72 | the execution is serialized because the memory copies block the
73 | execution of the host process. */
74 | for (int i = 0; i < 2; ++i) {
75 | // TODO: Set active device
76 | // TODO: Copy data from host to device asynchronously (hA[dec[i].start] -> dA[i], hB[dec[i].start] -> dB[i])
77 | // TODO: Launch 'vector_add()' kernel to calculate dC = dA + dB
78 | // TODO: Copy data from device to host (dC[i] -> hC[dec[0].start])
79 | }
80 |
81 | // Synchronize and destroy the streams
82 | for (int i = 0; i < 2; ++i) {
83 | // TODO: Add synchronization calls and destroy streams
84 | }
85 |
86 | // Stop timing
87 | // TODO: Add here the timing event stop calls
88 |
89 | // Free device memory
90 | for (int i = 0; i < 2; ++i) {
91 | // TODO: Deallocate device memory
92 | }
93 |
94 | // Check results
95 | int errorsum = 0;
96 | for (int i = 0; i < N; i++) {
97 | errorsum += hC[i] - 3.0;
98 | }
99 | printf("Error sum = %i\n", errorsum);
100 |
101 | // Calculate the elapsed time
102 | float gputime;
103 | hipSetDevice(0);
104 | hipEventElapsedTime(&gputime, start, stop);
105 | printf("Time elapsed: %f\n", gputime / 1000.);
106 |
107 | // Deallocate host memory
108 | hipHostFree((void*)hA);
109 | hipHostFree((void*)hB);
110 | hipHostFree((void*)hC);
111 |
112 | return 0;
113 | }
114 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/Makefile:
--------------------------------------------------------------------------------
1 | HIPCC = hipcc
2 | MPICXX = mpicxx
3 | MPICXXFLAGS = -g -O2 -w
4 |
5 | # Puhti
6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70'
7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib
8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib
9 | LIBS = -lmpi
10 |
11 | ping-pong: ping-pong.o
12 | $(HIPCC) $(LDFLAGS) -o $@ $< $(LIBS)
13 |
14 | %.o: %.cpp
15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $<
16 |
17 | .PHONY: clean
18 | clean:
19 | rm -f *.o ping-pong
20 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/README.md:
--------------------------------------------------------------------------------
1 | # Ping-pong with multiple GPUs and MPI
2 |
3 | Implement a simple ping-pong test for GPU-to-GPU communication using:
4 | a) indirect communication via the host, and b) direct communication with
5 | HIP-aware MPI.
6 |
7 | The ping-pong test consists of the following steps:
8 | 1. Send a vector from one GPU to another
9 | 2. The receiving GPU should increment all elements of the vector by one
10 | 3. Send the vector back to the original GPU
11 |
12 | For reference, there is also a CPU-to-CPU implementation in the skeleton
13 | code ([ping-pong.cpp](ping-pong.cpp)). Timing of all tests is also included to
14 | compare the execution times.
15 |
16 | On **Lumi**, one can compile the MPI example simply using the Cray compiler with
17 | ```
18 | CC -xhip ping-pong.cpp
19 | ```
20 |
21 | On LUMI, enable GPU-aware MPI at runtime (and when compiling) by executing:
22 | ```
23 | export MPICH_GPU_SUPPORT_ENABLED=1
24 | ```
25 | For running, one should use two GPUs and two MPI processes:
26 |
27 | ```
28 | srun --account=XXXXXX --partition=small-g -N1 --ntasks-per-node=2 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
29 | ```
30 |
31 |
32 | On **Mahti**, to compile, just load the required modules and type `make`. A gpu-aware MPI is
33 | available with:
34 | ```
35 | ml openmpi/4.1.4-cuda
36 | ```
37 | For running, one should use two GPUs and two MPI processes:
38 | ```
39 | srun --account=XXXXXX --partition=gputest -N1 -n2 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
40 | ```
41 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/solution/Makefile:
--------------------------------------------------------------------------------
1 | HIPCC = hipcc
2 | MPICXX = mpicxx
3 | MPICXXFLAGS = -g -O2 -w
4 |
5 | # Puhti
6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70'
7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib
8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib
9 | LIBS = -lmpi
10 |
11 | ping-pong: ping-pong.o
12 | $(HIPCC) $(LDFLAGS) -o $@ $< $(LIBS)
13 |
14 | %.o: %.cpp
15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $<
16 |
17 | .PHONY: clean
18 | clean:
19 | rm -f *.o ping-pong
20 |
--------------------------------------------------------------------------------
/optimization/01-coalescing/README.md:
--------------------------------------------------------------------------------
1 | # Performance counters and coalesced memory access
2 |
3 | ## Background and rocprof
4 |
5 | `rocprof` can collect performance metric counters (`pmc`) of gpu kernels:
6 | ```bash
7 | > rocprof -i metrics.txt -o metrics.csv ./copy
8 | ```
9 |
10 | The counters to be collected are listed in the `metrics.txt` file and they are
11 | written to the `metrics.csv` file. For example, if the file `metrics.txt` is
12 |
13 | ```
14 | pmc: VALUBusy, TCP_TCC_READ_REQ_sum
15 | pmc: TCC_EA_RDREQ_sum
16 | ```
17 | then `rocprof` will collect the derived metrics of how busy the vector
18 | arithmetic logic units (VALU) are, how many L2 read requests are issued
19 | (TCP_TCC_READ_REQ_sum), and how many global device memory read requests are
20 | issued (TCC_EA_RDREQ_sum).
21 |
22 | Here `TCP_TCC` refers to how many read requests the L1 (TCP) cache controller
23 | issues to the L2 cache (TCC), and `TCC_EA` refers to how many reads the L2 cache
24 | controller issues to the interconnect (`EA`).
25 |
26 | The options `--list-derived` and `--list-basic` will list the available derived
27 | and basic counters.
28 |
29 | *Note*: `rocprof --list-derived` and `rocprof --list-basic` must be
30 | executed on a node with a GPU present because they query the available
31 | counters from the hardware itself.
32 |
33 | An MI250x GCD has 8 MiB of L2 memory shared across the CUs and each CU has 16
34 | kiB of L1 memory.
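
To make the connection between stride and the counters concrete, a strided copy can be pictured with a small kernel like the sketch below; this is illustrative only, and the names and indexing are assumptions here, not the actual code in `copy.cpp`:

```cpp
#include <hip/hip_runtime.h>

// Illustrative strided copy: thread i touches element i*stride.
// With stride == 1 the accesses of a wavefront fall into the same cache lines
// (coalesced); with a large stride every access hits a different cache line,
// so the L1->L2 and L2->memory read requests grow while the useful data stays the same.
__global__ void strided_copy(const float *in, float *out, size_t n, int stride)
{
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t idx = i * stride;
    if (idx < n)
        out[idx] = in[idx];
}
```

Running `rocprof` with the `metrics.txt` above for a small and a large stride should show `TCC_EA_RDREQ_sum` growing with the stride even though the number of copied elements stays the same.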
35 |
36 | ## Exercise
37 |
38 | The code `copy.cpp` reads and writes a memory array of 4096*4096 float32
39 | entries with various strides (`(1<<stride)`).
--------------------------------------------------------------------------------
/optimization/01-coalescing/copy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | #define LOG2SIZE 12
7 | const static int width = 1<<LOG2SIZE;
24 | std::vector<float> matrix_in;
25 | std::vector<float> matrix_out;
26 |
27 | matrix_in.resize(width * height);
28 | matrix_out.resize(width * height);
29 |
30 | for (int i = 0; i < width * height; i++) {
31 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
32 | }
33 |
34 | float *d_in;
35 | float *d_out;
36 |
37 | hipMalloc((void **)&d_in, (width * height) * sizeof(float));
38 | hipMalloc((void **)&d_out, (width * height) * sizeof(float));
39 |
40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
41 | hipMemcpyHostToDevice);
42 |
43 | printf("Setup complete. Launching kernel \n");
44 | int block_x = width / tile_dim_x;
45 | int block_y = height / tile_dim_y;
46 |
47 |
48 | // Create events
49 |
50 | /* printf("Warm up the gpu!\n"); */
51 | /* for(int i=1;i<=10;i++){ */
52 | /* hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), */
53 | /* dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width, */
54 | /* height);} */
55 |
56 |
57 |
58 | for(int i=1;i<=21;i++){
59 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
60 | dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width,
61 | height, (1<<i));}  // stride argument assumed
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/copy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void copy_kernel(float *in, float *out, int width, int height) {
11 | int x_index = blockIdx.x * tile_dim + threadIdx.x;
12 | int y_index = blockIdx.y * tile_dim + threadIdx.y;
13 |
14 | int index = y_index * width + x_index;
15 |
16 | out[index] = in[index];
17 | }
18 |
19 |
20 |
21 | int main() {
22 | std::vector<float> matrix_in;
23 | std::vector<float> matrix_out;
24 |
25 | matrix_in.resize(width * height);
26 | matrix_out.resize(width * height);
27 |
28 | for (int i = 0; i < width * height; i++) {
29 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
30 | }
31 |
32 |
33 |
34 | float *d_in;
35 | float *d_out;
36 |
37 | hipMalloc((void **)&d_in, width * height * sizeof(float));
38 | hipMalloc((void **)&d_out, width * height * sizeof(float));
39 |
40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
41 | hipMemcpyHostToDevice);
42 |
43 | printf("Setup complete. Launching kernel \n");
44 | int block_x = width / tile_dim;
45 | int block_y = height / tile_dim;
46 |
47 |
48 | // Create events
49 | hipEvent_t start_kernel_event;
50 | hipEventCreate(&start_kernel_event);
51 | hipEvent_t end_kernel_event;
52 | hipEventCreate(&end_kernel_event);
53 |
54 | printf("Warm up the gpu!\n");
55 | for(int i=1;i<=10;i++){
56 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
57 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
58 | height);}
59 |
60 | hipEventRecord(start_kernel_event, 0);
61 |
62 |
63 | for(int i=1;i<=10;i++){
64 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
65 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
66 | height);}
67 |
68 | hipEventRecord(end_kernel_event, 0);
69 | hipEventSynchronize(end_kernel_event);
70 |
71 | hipDeviceSynchronize();
72 | float time_kernel;
73 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
74 |
75 | printf("Kernel execution complete \n");
76 | printf("Event timings:\n");
77 | printf(" %.6f ms - copy \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
78 |
79 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
80 | hipMemcpyDeviceToHost);
81 |
82 |
83 | return 0;
84 | }
85 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_naive.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_naive_kernel(float *in, float *out, int width, int height) {
11 | int x_index = blockIdx.x * tile_dim + threadIdx.x;
12 | int y_index = blockIdx.y * tile_dim + threadIdx.y;
13 |
14 | int in_index = y_index * width + x_index;
15 | int out_index = x_index * height + y_index;
16 |
17 | out[out_index] = in[in_index];
18 | }
19 |
20 |
21 |
22 | int main() {
23 | std::vector<float> matrix_in;
24 | std::vector<float> matrix_out;
25 |
26 | matrix_in.resize(width * height);
27 | matrix_out.resize(width * height);
28 |
29 | for (int i = 0; i < width * height; i++) {
30 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
31 | }
32 |
33 |
34 |
35 | float *d_in;
36 | float *d_out;
37 |
38 | hipMalloc((void **)&d_in, width * height * sizeof(float));
39 | hipMalloc((void **)&d_out, width * height * sizeof(float));
40 |
41 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
42 | hipMemcpyHostToDevice);
43 |
44 | printf("Setup complete. Launching kernel \n");
45 | int block_x = width / tile_dim;
46 | int block_y = height / tile_dim;
47 |
48 |
49 |
50 | // Create events
51 | hipEvent_t start_kernel_event;
52 | hipEventCreate(&start_kernel_event);
53 | hipEvent_t end_kernel_event;
54 | hipEventCreate(&end_kernel_event);
55 |
56 | printf("Warm up the gpu!\n");
57 |
58 |
59 | for(int i=1;i<=10;i++){
60 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y),
61 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
62 | height);}
63 |
64 |
65 | hipEventRecord(start_kernel_event, 0);
66 | for(int i=1;i<=10;i++){
67 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y),
68 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
69 | height);}
70 |
71 | hipEventRecord(end_kernel_event, 0);
72 | hipEventSynchronize(end_kernel_event);
73 |
74 | float time_kernel;
75 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
76 |
77 | printf("Kernel execution complete \n");
78 | printf("Event timings:\n");
79 | printf(" %.6f ms - naive transpose \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
80 |
81 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
82 | hipMemcpyDeviceToHost);
83 |
84 |
85 | return 0;
86 | }
87 |
88 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_with_SM.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_SM_kernel(float *in, float *out, int width,
11 | int height) {
12 | __shared__ float tile[tile_dim][tile_dim];
13 |
14 | int x_tile_index = blockIdx.x * tile_dim;
15 | int y_tile_index = blockIdx.y * tile_dim;
16 |
17 | int in_index =
18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x);
19 | int out_index =
20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x);
21 |
22 | tile[threadIdx.y][threadIdx.x] = in[in_index];
23 |
24 | __syncthreads();
25 |
26 | out[out_index] = tile[threadIdx.x][threadIdx.y];
27 | }
28 |
29 |
30 | int main() {
31 | std::vector<float> matrix_in;
32 | std::vector<float> matrix_out;
33 |
34 | matrix_in.resize(width * height);
35 | matrix_out.resize(width * height);
36 |
37 | for (int i = 0; i < width * height; i++) {
38 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
39 | }
40 |
41 |
42 |
43 | float *d_in;
44 | float *d_out;
45 |
46 | hipMalloc((void **)&d_in, width * height * sizeof(float));
47 | hipMalloc((void **)&d_out, width * height * sizeof(float));
48 |
49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
50 | hipMemcpyHostToDevice);
51 |
52 | printf("Setup complete. Launching kernel \n");
53 | int block_x = width / tile_dim;
54 | int block_y = height / tile_dim;
55 |
56 | // Create events
57 | hipEvent_t start_kernel_event;
58 | hipEventCreate(&start_kernel_event);
59 | hipEvent_t end_kernel_event;
60 | hipEventCreate(&end_kernel_event);
61 |
62 | printf("Warm up the gpu!\n");
63 |
64 |
65 | for(int i=1;i<=10;i++){
66 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y),
67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
68 | height);}
69 |
70 |
71 | hipEventRecord(start_kernel_event, 0);
72 |
73 | for(int i=1;i<=10;i++){
74 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y),
75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
76 | height);}
77 |
78 |
79 | hipEventRecord(end_kernel_event, 0);
80 | hipEventSynchronize(end_kernel_event);
81 |
82 | float time_kernel;
83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
84 |
85 | printf("Kernel execution complete \n");
86 | printf("Event timings:\n");
87 | printf(" %.6f ms - shared memory \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
88 |
89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
90 | hipMemcpyDeviceToHost);
91 |
92 | hipEventDestroy(start_kernel_event);
93 | hipEventDestroy(end_kernel_event);
94 |
95 | return 0;
96 | }
97 |
98 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_with_SM_nobc.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_SM_nobc_kernel(float *in, float *out, int width,
11 | int height) {
12 | __shared__ float tile[tile_dim][tile_dim+1]; // extra column of padding avoids shared memory bank conflicts
13 |
14 | int x_tile_index = blockIdx.x * tile_dim;
15 | int y_tile_index = blockIdx.y * tile_dim;
16 |
17 | int in_index =
18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x);
19 | int out_index =
20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x);
21 |
22 | tile[threadIdx.y][threadIdx.x] = in[in_index];
23 |
24 | __syncthreads();
25 |
26 | out[out_index] = tile[threadIdx.x][threadIdx.y];
27 | }
28 |
29 |
30 | int main() {
31 | std::vector<float> matrix_in;
32 | std::vector<float> matrix_out;
33 |
34 | matrix_in.resize(width * height);
35 | matrix_out.resize(width * height);
36 |
37 | for (int i = 0; i < width * height; i++) {
38 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
39 | }
40 |
41 |
42 |
43 | float *d_in;
44 | float *d_out;
45 |
46 | hipMalloc((void **)&d_in, width * height * sizeof(float));
47 | hipMalloc((void **)&d_out, width * height * sizeof(float));
48 |
49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
50 | hipMemcpyHostToDevice);
51 |
52 | printf("Setup complete. Launching kernel \n");
53 | int block_x = width / tile_dim;
54 | int block_y = height / tile_dim;
55 |
56 | // Create events
57 | hipEvent_t start_kernel_event;
58 | hipEventCreate(&start_kernel_event);
59 | hipEvent_t end_kernel_event;
60 | hipEventCreate(&end_kernel_event);
61 |
62 | printf("Warm up the gpu!\n");
63 |
64 |
65 | for(int i=1;i<=10;i++){
66 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
68 | height);}
69 |
70 |
71 | hipEventRecord(start_kernel_event, 0);
72 |
73 | for(int i=1;i<=10;i++){
74 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
76 | height);}
77 |
78 |
79 | hipEventRecord(end_kernel_event, 0);
80 | hipEventSynchronize(end_kernel_event);
81 |
82 | float time_kernel;
83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
84 |
85 | printf("Kernel execution complete \n");
86 | printf("Event timings:\n");
87 | printf(" %.6f ms - shared memory with no bank conflicts \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
88 |
89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
90 | hipMemcpyDeviceToHost);
91 |
92 | hipEventDestroy(start_kernel_event);
93 | hipEventDestroy(end_kernel_event);
94 |
95 | return 0;
96 | }
97 |
98 |
--------------------------------------------------------------------------------
/optimization/03-trace/README.md:
--------------------------------------------------------------------------------
1 | # Tracing with rocprof
2 |
3 | In this exercise your task is to trace execution of [streams/02-concurrency](../../streams/02-concurrency/solution/streams.cpp) exercise
4 | solution.
5 |
6 | Rocprof can be used to trace HIP API calls, among other things, with the option
7 |
8 | ```bash
9 | > rocprof --hip-trace
10 | ```
11 |
12 | It will output a file named `results.json`, which may be visualized for example
13 | with the Perfetto trace visualizer (https://ui.perfetto.dev/) or the chrome/chromium
14 | built-in visualizer (type `chrome://tracing/` in the URL field).
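
For example, assuming the concurrency solution was built into a binary called `streams` (the name here is just an assumption), the workflow could look like:

```bash
rocprof --hip-trace ./streams    # run on a GPU node; writes results.json in the working directory
# then open results.json at https://ui.perfetto.dev/ or via chrome://tracing/
```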
15 |
16 | ## Exercise
17 |
18 | - Trace the HIP API calls of the `streams.cpp` code and visualize the results.
19 | - Modify the `WORK` preprocessor macro to be so large that the kernel executions begin to
20 |   exceed the memory transfers.
21 | - Does the kernel execution order correspond to their stream numbering?
22 |
--------------------------------------------------------------------------------
/porting/README.md:
--------------------------------------------------------------------------------
1 | # Converting CUDA code to HIP
2 |
3 | The folder [codes](codes) contains a few examples of CUDA codes (vector addition, `saxpy` using a CUDA kernel, and `saxpy` using `cublas`). On Mahti or Puhti these codes will compile with the CUDA `nvcc` compiler and should run without issues.
4 |
5 | The tasks are to convert these codes to HIP. For shorter codes one can do a manual conversion, but for larger codes it is recommended to use the HIPIFY tools or to compile them with the [HOP](https://github.com/cschpc/hop) library.
6 |
7 | ## HIPIFY Tools
8 | 0. **Optional** Convert the codes to HIP manually. On Nvidia platforms the conversion can be done in an incremental way, because `hipcc` can compile mixed CUDA and HIP code. On AMD platforms `hipcc` cannot compile CUDA code; the whole code needs to be converted before it can be compiled.
9 | 1. Convert the codes using HIPIFY tools.
10 |
11 | A. Examine the code. Both `hipify-perl` and `hipify-clang` support the `--examine` option. Alternatively one can use the `hipexamine[.|-perl.]sh` scripts, which will scan whole directories. This procedure will not change the sources; it will just determine which files contain CUDA code and how much of the code can be converted automatically.
12 |
13 | B. Convert individual files with `hipify-[perl|clang] --inplace --print-stats`, or whole folders using the scripts `hipconvertinplace[.|-perl.]sh` (see the example below).
14 |
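For instance, converting the vector addition example in place might look like this (the relative path is just an example and depends on where the command is run):

```
hipify-perl --inplace --print-stats codes/Vector_Addition/cuda/vecadd.cu
```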
15 |
16 | **Note** that `hipify-clang` requires the CUDA toolkit. On LUMI this is available via a container.
17 | The image can be created using:
18 |
19 | ```
20 | singularity pull docker://nvcr.io/nvidia/cuda:11.4.3-devel-ubuntu20.04
21 | ```
22 | This step was already done; the image's path is `/projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif`.
23 | Then load all the modules necessary to compile HIP codes on LUMI.
24 | ```
25 | module load LUMI/24.03
26 | module load partition/G
27 | module load rocm
28 | ```
29 | Finally, open a shell in the container that has access to the working directory and the `rocm` installation:
30 | ```
31 | singularity shell -B $PWD,/opt:/opt /projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif
32 | export PATH=$ROCM_PATH/bin:$PATH
33 | ```
34 |
35 | The CUDA code can be converted now using:
36 | ```
37 | hipify-clang .cu --inplace --print-stats --cuda-path=/usr/local/cuda-11.4 -I /usr/local/cuda-11.4/include
38 | ```
39 | This command works as well on Nvidia platforms with HIP installed.
40 |
41 |
42 | 2. Compile CUDA codes on AMD platforms using `hipcc` + HOP, and compile HIP codes on Nvidia platforms using `nvcc` + HOP.
43 |
44 | First you need to clone the HOP repository into your working folder on scratch:
45 | ```
46 | git clone https://github.com/cschpc/hop.git
47 | ```
48 |
49 | **CUDA** ⇒ **HIP** on LUMI
50 | ```
51 | export HOP_ROOT=/path/to/hop
52 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/cuda -DHOP_TARGET_HIP"
53 | CC -x hip $HOP_FLAGS hello.cu -o hello
54 | ./hello
55 | ```
56 | **HIP** ⇒ **CUDA** on Mahti or Puhti
57 | ```
58 | export HOP_ROOT=/path/to/hop
59 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/hip -DHOP_TARGET_CUDA"
60 | CC -x cu $HOP_FLAGS hello.cpp -o hello
61 | ./hello
62 | ```
63 |
64 |
--------------------------------------------------------------------------------
/porting/codes/README.md:
--------------------------------------------------------------------------------
1 | # Directory with source codes for hands-on
2 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/Readme.md:
--------------------------------------------------------------------------------
1 | # Vector addition
2 |
3 | This is a simple vector addition example for the CUDA to HIP conversion exercise. The code executes `C[i]=A[i]+B[i]` for `i=1,...,N`.
4 |
5 | Compile the CUDA code: `nvcc -arch=sm_70 vecadd.cu -o vecadd`
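
After converting the code to HIP, it can be compiled, for example, with `hipcc`; the flags below are only suggestions (the NVIDIA variant mirrors the flags used in the course Makefiles):

```
hipcc vecadd.cu -o vecadd                                   # AMD GPUs (e.g. LUMI)
hipcc --x cu --gpu-architecture=sm_70 vecadd.cu -o vecadd   # NVIDIA V100 (e.g. Puhti)
```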
6 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/cuda/Readme.md:
--------------------------------------------------------------------------------
1 | # Vector addition
2 |
3 | This is a simple vector addition example for the CUDA to HIP conversion exercise. The code executes `C[i]=A[i]+B[i]` for `i=1,...,N`.
4 |
5 | Compile: `nvcc -arch=sm_70 vecadd.cu -o vecadd`
6 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/cuda/vecadd.cu:
--------------------------------------------------------------------------------
1 | /*
2 | nvcc vecadd.cu
3 | */
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <math.h>
7 | #include <time.h>
8 | #include <cuda_runtime.h>
9 |
10 | __global__ void vecAdd(int *A,int *B,int *C,int N)
11 | {
12 | int i = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(i<N) C[i] = A[i] + B[i];
14 | }
55 | vecAdd<<<(n+255)/256, 256>>>(a_d,b_d,c_d,n); // launch configuration assumed
56 | cudaDeviceSynchronize();
57 | clock_t end_d = clock();
58 | clock_t start_h = clock();
59 | printf("Doing CPU Vector add\n");
60 | vecAdd_h(a,b,c2,n);
61 | clock_t end_h = clock();
62 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
63 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
64 | cudaMemcpy(c,c_d,nBytes,cudaMemcpyDeviceToHost);
65 | printf("%d %f %f\n",n,time_d,time_h);
66 |
67 | for(int i=0; i1.0e-5)
70 | printf("Error at position %d.\n", i );
71 | }
72 | cudaFree(a_d);
73 | cudaFree(b_d);
74 | cudaFree(c_d);
75 | free(c2);
76 | free(c);
77 | free(a);
78 | free(b);
79 | return 0;
80 | }
81 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/hip_solution/vecadd.cu:
--------------------------------------------------------------------------------
1 | #include "hip/hip_runtime.h"
2 | /*
3 | nvcc vecadd.cu
4 | */
5 | #include <stdio.h>
6 | #include <stdlib.h>
7 | #include <math.h>
8 | #include <time.h>
9 | #include <hip/hip_runtime.h>
10 |
11 | __global__ void vecAdd(int *A,int *B,int *C,int N)
12 | {
13 | int i = blockIdx.x * blockDim.x + threadIdx.x;
14 | if(i<N) C[i] = A[i] + B[i];
15 | }
56 | vecAdd<<<(n+255)/256, 256>>>(a_d,b_d,c_d,n); // launch configuration assumed
57 | hipDeviceSynchronize();
58 | clock_t end_d = clock();
59 | clock_t start_h = clock();
60 | printf("Doing CPU Vector add\n");
61 | vecAdd_h(a,b,c2,n);
62 | clock_t end_h = clock();
63 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
64 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
65 | hipMemcpy(c,c_d,nBytes,hipMemcpyDeviceToHost);
66 | printf("%d %f %f\n",n,time_d,time_h);
67 |
68 | for(int i=0; i<n; i++){
70 | if(abs(c[i]-c2[i])>1.0e-5)
71 | printf("Error at position %d.\n", i );
72 | }
73 | hipFree(a_d);
74 | hipFree(b_d);
75 | hipFree(c_d);
76 | free(c2);
77 | free(c);
78 | free(a);
79 | free(b);
80 | return 0;
81 | }
82 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cublas/Makefile:
--------------------------------------------------------------------------------
1 | #===============================================================================
2 | # User Options
3 | #===============================================================================
4 | #
5 | # Compiler can be set below, or via environment variable
6 | CC = nvcc
7 | OPTIMIZE = yes
8 | #
9 | #===============================================================================
10 | # Program name & source code list
11 | #===============================================================================
12 | program = saxpy_cublas
13 | source = saxpy_cublas.cu
14 | obj = $(source:.cu=.o)
15 | #===============================================================================
16 | # Sets Flags
17 | #===============================================================================
18 | # Standard Flags
19 | CFLAGS := -Xcompiler -Wall
20 | # Linker Flags
21 | LDFLAGS = -lcublas
22 | # Optimization Flags
23 | ifeq ($(OPTIMIZE),yes)
24 | CFLAGS += -O3
25 | endif
26 |
27 | #===============================================================================
28 | # Targets to Build
29 | #===============================================================================
30 | #
31 | $(program): $(obj) Makefile
32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
33 |
34 | %.o: %.cu Makefile
35 | $(CC) $(CFLAGS) -c $< -o $@
36 |
37 | clean:
38 | rm -rf $(program) $(obj)
39 |
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cublas/saxpy_cublas.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include "cublas_v2.h"
3 | using namespace std;
4 |
5 | const int N = 1 << 30;
6 |
7 | int main(){
8 | float *a_h, *b_h;
9 | a_h = new float[N];
10 | b_h = new float[N];
11 | float *a_d, *b_d;
12 | for(int i = 0; i < N; i++){
13 | a_h[i] = 1.0f;
14 | b_h[i] = 2.0f ;
15 | }
16 | cublasHandle_t handle;
17 | cublasCreate(&handle);
18 | cudaMalloc((void**) &a_d, sizeof(float) * N);
19 | cudaMalloc((void**) &b_d, sizeof(float) * N);
20 | cublasSetVector( N, sizeof(float), a_h, 1, a_d, 1);
21 | cublasSetVector( N, sizeof(float), b_h, 1, b_d, 1);
22 | const float s = 2.0f;
23 | cublasSaxpy( handle, N, &s, a_d, 1, b_d, 1);
24 | cublasGetVector( N, sizeof(float), b_d, 1, b_h, 1);
25 | cudaFree(a_d);
26 | cudaFree(b_d);
27 | cublasDestroy(handle);
28 | float maxError = 0.0f;
29 |
30 | for(int i = 0; i < N; i++)
31 | maxError = fmax(maxError, abs(b_h[i]-4.0f));
32 |
33 | cout << "Max error: " << maxError << endl;
34 |
35 |
36 | delete[] a_h;
37 | delete[] b_h;
38 | return 0;
39 | }
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cuda/saxpy.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__
4 | void saxpy(int n, float a, float *x, float *y)
5 | {
6 | int i = blockIdx.x*blockDim.x + threadIdx.x;
7 | if (i < n) y[i] = a*x[i] + y[i];
8 | }
9 |
10 | int main(void)
11 | {
12 | int N = 1<<30;
13 | float *x, *y, *d_x, *d_y;
14 | x = (float*)malloc(N*sizeof(float));
15 | y = (float*)malloc(N*sizeof(float));
16 |
17 | cudaMalloc(&d_x, N*sizeof(float));
18 | cudaMalloc(&d_y, N*sizeof(float));
19 |
20 | for (int i = 0; i < N; i++) {
21 | x[i] = 1.0f;
22 | y[i] = 2.0f;
23 | }
24 |
25 | cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
26 | cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
27 |
28 | // Perform SAXPY on all N elements
29 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
30 |
31 | cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
32 |
33 | float maxError = 0.0f;
34 | for (int i = 0; i < N; i++)
35 | maxError = fmax(maxError, abs(y[i]-4.0f));
36 | printf("Max error: %f\n", maxError);
37 |
38 | cudaFree(d_x);
39 | cudaFree(d_y);
40 | free(x);
41 | free(y);
42 | }
43 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hip/README.md:
--------------------------------------------------------------------------------
1 | # Copy the files from the CUDA folder and hipify the example here.
2 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hip_solution/saxpy.cu:
--------------------------------------------------------------------------------
1 | #include "hip/hip_runtime.h"
2 | #include <stdio.h>
3 |
4 | __global__
5 | void saxpy(int n, float a, float *x, float *y)
6 | {
7 | int i = blockIdx.x*blockDim.x + threadIdx.x;
8 | if (i < n) y[i] = a*x[i] + y[i];
9 | }
10 |
11 | int main(void)
12 | {
13 | int N = 1<<30;
14 | float *x, *y, *d_x, *d_y;
15 | x = (float*)malloc(N*sizeof(float));
16 | y = (float*)malloc(N*sizeof(float));
17 |
18 | hipMalloc(&d_x, N*sizeof(float));
19 | hipMalloc(&d_y, N*sizeof(float));
20 |
21 | for (int i = 0; i < N; i++) {
22 | x[i] = 1.0f;
23 | y[i] = 2.0f;
24 | }
25 |
26 | hipMemcpy(d_x, x, N*sizeof(float), hipMemcpyHostToDevice);
27 | hipMemcpy(d_y, y, N*sizeof(float), hipMemcpyHostToDevice);
28 |
29 | // Perform SAXPY on all N elements
30 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
31 |
32 | hipMemcpy(y, d_y, N*sizeof(float), hipMemcpyDeviceToHost);
33 |
34 | float maxError = 0.0f;
35 | for (int i = 0; i < N; i++)
36 | maxError = fmax(maxError, abs(y[i]-4.0f));
37 | printf("Max error: %f\n", maxError);
38 |
39 | hipFree(d_x);
40 | hipFree(d_y);
41 | free(x);
42 | free(y);
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Copy the data from cuBLAS here and HIPIFY the example.
3 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas_solution/Makefile:
--------------------------------------------------------------------------------
1 | #===============================================================================
2 | # User Options
3 | #===============================================================================
4 | #
5 | # Compiler can be set below, or via environment variable
6 | CC = hipcc
7 | OPTIMIZE = yes
8 | #
9 | #===============================================================================
10 | # Program name & source code list
11 | #===============================================================================
12 | program = saxpy_cublas
13 | source = saxpy_cublas.cu
14 | obj = $(source:.cu=.o)
15 | #===============================================================================
16 | # Sets Flags
17 | #===============================================================================
18 | # Standard Flags
19 | CFLAGS := -Xcompiler -Wall -I/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/include
20 | # Linker Flags
21 | LDFLAGS = -L/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/lib/ -lhipblas
22 | # Optimization Flags
23 | ifeq ($(OPTIMIZE),yes)
24 | CFLAGS += -O3
25 | endif
26 |
27 | #===============================================================================
28 | # Targets to Build
29 | #===============================================================================
30 | #
31 | $(program): $(obj) Makefile
32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
33 |
34 | %.o: %.cu Makefile
35 | $(CC) $(CFLAGS) -c $< -o $@
36 |
37 | clean:
38 | rm -rf $(program) $(obj) out* error*
39 |
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas_solution/saxpy_cublas.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <hipblas.h>
3 | using namespace std;
4 |
5 | const int N = 1 << 30;
6 |
7 | int main(){
8 | float *a_h, *b_h;
9 | a_h = new float[N];
10 | b_h = new float[N];
11 | float *a_d, *b_d;
12 | for(int i = 0; i < N; i++){
13 | a_h[i] = 1.0f;
14 | b_h[i] = 2.0f ;
15 | }
16 | hipblasHandle_t handle;
17 | hipblasCreate(&handle);
18 | hipMalloc((void**) &a_d, sizeof(float) * N);
19 | hipMalloc((void**) &b_d, sizeof(float) * N);
20 | hipblasSetVector( N, sizeof(float), a_h, 1, a_d, 1);
21 | hipblasSetVector( N, sizeof(float), b_h, 1, b_d, 1);
22 | const float s = 2.0f;
23 | hipblasSaxpy( handle, N, &s, a_d, 1, b_d, 1);
24 | hipblasGetVector( N, sizeof(float), b_d, 1, b_h, 1);
25 | hipFree(a_d);
26 | hipFree(b_d);
27 | hipblasDestroy(handle);
28 | float maxError = 0.0f;
29 |
30 | for(int i = 0; i < N; i++)
31 | maxError = fmax(maxError, abs(b_h[i]-4.0f));
32 |
33 | cout << "Max error: " << maxError << endl;
34 |
35 |
36 | delete[] a_h;
37 | delete[] b_h;
38 | return 0;
39 | }
40 |
--------------------------------------------------------------------------------
/setup_env_lumi:
--------------------------------------------------------------------------------
1 | # Module environment
2 | ml PrgEnv-cray
3 | ml craype-accel-amd-gfx90a
4 | ml rocm/6.0.3
5 |
6 | # Environment variables for compiling
7 | export CXX=CC
8 | export CXXFLAGS='-xhip -O3'
9 |
10 | # Aliases for easy running
11 | alias runit='srun --reservation=HIPcourse --account=project_462000877 --partition=small-g --time=00:05:00 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gpus-per-task=1'
12 |
--------------------------------------------------------------------------------
/streams/01-event-record/README.md:
--------------------------------------------------------------------------------
1 | # Understanding asynchronicity using events
2 |
3 | The purpose of this exercise is to understand asynchronous operations and how they can be timed using HIP events. In the skeleton, the timing has been implemented using the `<chrono>` header and the `clock_t` type. This attempt to time asynchronous events, however, fails to measure the timings correctly. Your task is to implement the timings correctly using HIP events (you don't have to remove the `clock_t` timings, you can leave them in place to explore the difference). The locations where modifications are required are marked with `#error` together with an instruction. Basically, your task is to measure and print the timing of a GPU kernel, a device-to-host copy, and their combined time.
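
The general event-timing pattern looks roughly like the self-contained sketch below (illustrative only; the kernel and variable names are made up here, and the exercise asks you to apply this pattern inside `record.cpp`):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void fill(int *d_a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d_a[i] = i;
}

int main() {
    const int n = 1 << 20;
    int *d_a;
    hipMalloc((void**)&d_a, n * sizeof(int));

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    hipEventRecord(start, 0);                  // record on the default stream
    fill<<<(n + 255) / 256, 256>>>(d_a, n);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);                 // wait until the work before "stop" has finished

    float ms = 0.0f;
    hipEventElapsedTime(&ms, start, stop);     // elapsed GPU time in milliseconds
    printf("kernel: %.3f ms\n", ms);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    hipFree(d_a);
    return 0;
}
```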
4 |
--------------------------------------------------------------------------------
/streams/01-event-record/record.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <time.h>
4 | #include <chrono>
5 |
6 | #define get_mus(X) std::chrono::duration_cast<std::chrono::microseconds>(X).count()
7 | #define chrono_clock std::chrono::high_resolution_clock::now()
8 |
9 | /* A simple GPU kernel definition */
10 | __global__ void kernel(int *d_a, int n_total)
11 | {
12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(idx < n_total)
14 | d_a[idx] = idx;
15 | }
16 |
17 | /* The main function */
18 | int main(){
19 |
20 | // Problem size
21 | constexpr int n_total = 1<<22;
22 |
23 | // Device grid sizes
24 | constexpr int blocksize = 256;
25 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize;
26 |
27 | // Allocate host and device memory
28 | int *a, *d_a;
29 | const int bytes = n_total * sizeof(int);
30 | hipHostMalloc((void**)&a, bytes); // host pinned
31 | hipMalloc((void**)&d_a, bytes); // device pinned
32 |
33 | // Create events
34 | #error create the required timing events here
35 |
36 | // Create stream
37 | hipStream_t stream;
38 | hipStreamCreate(&stream);
39 |
40 | // Start timed GPU kernel and device-to-host copy
41 | #error record the events somewhere across the below lines of code
42 | #error such that you can get the timing for the kernel, the
43 | #error memory copy, and the total combined time of these
44 | auto start_kernel_clock = chrono_clock;
45 | kernel<<>>(d_a, n_total);
46 |
47 | auto start_d2h_clock = chrono_clock;
48 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
49 |
50 | auto stop_clock = chrono_clock;
51 | hipStreamSynchronize(stream);
52 |
53 | // Extract elapsed timings from event recordings
54 | #error get the elapsed time from the timing events
55 |
56 | // Check that the results are right
57 | int error = 0;
58 | for(int i = 0; i < n_total; ++i){
59 | if(a[i] != i)
60 | error = 1;
61 | }
62 |
63 | // Print results
64 | if(error)
65 | printf("Results are incorrect!\n");
66 | else
67 | printf("Results are correct!\n");
68 |
69 | // Print event timings
70 | printf("Event timings:\n");
71 | #error print event timings here
72 |
73 | // Print clock timings
74 | printf("clock_t timings:\n");
75 | printf(" %.3f ms - kernel\n", 1e-3 * (double)get_mus(start_d2h_clock - start_kernel_clock));
76 | printf(" %.3f ms - device to host copy\n", 1e-3 * (double)get_mus(stop_clock - start_d2h_clock));
77 | printf(" %.3f ms - total time\n", 1e-3 * (double)get_mus(stop_clock - start_kernel_clock));
78 |
79 | // Destroy Stream
80 | hipStreamDestroy(stream);
81 |
82 | // Destroy events
83 | #error destroy events here
84 |
85 | // Deallocations
86 | hipFree(d_a); // Device
87 | hipHostFree(a); // Host
88 | }
89 |
--------------------------------------------------------------------------------
/streams/01-event-record/solution/record.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <time.h>
4 | #include <chrono>
5 |
6 | #define get_mus(X) std::chrono::duration_cast<std::chrono::microseconds>(X).count()
7 | #define chrono_clock std::chrono::high_resolution_clock::now()
8 |
9 | /* A simple GPU kernel definition */
10 | __global__ void kernel(int *d_a, int n_total)
11 | {
12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(idx < n_total)
14 | d_a[idx] = idx;
15 | }
16 |
17 | /* The main function */
18 | int main(){
19 | // Problem size
20 | constexpr int n_total = 1<<22; // pow(2, 22);
21 |
22 | // Device grid sizes
23 | constexpr int blocksize = 256;
24 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize;
25 |
26 | // Allocate host and device memory
27 | int *a, *d_a;
28 | const int bytes = n_total * sizeof(int);
29 | hipHostMalloc((void**)&a, bytes); // host pinned
30 | hipMalloc((void**)&d_a, bytes); // device pinned
31 |
32 | hipEvent_t pre_kernel, post_kernel, end_event;
33 | // Create events
34 | hipEventCreate(&pre_kernel);
35 | hipEventCreate(&post_kernel);
36 | hipEventCreate(&end_event);
37 | float timing_a, timing_b, timing_c;
38 |
39 | // Create stream
40 | hipStream_t stream;
41 | hipStreamCreate(&stream);
42 |
43 | // Start timed GPU kernel and device-to-host copy
44 | hipEventRecord(pre_kernel, stream);
45 | auto start_time = chrono_clock;
46 |
47 | kernel<<>>(d_a, n_total);
48 |
49 | // Record event after kernel execution
50 | hipEventRecord(post_kernel, stream);
51 | auto d2h_time = chrono_clock;
52 |
53 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
54 |
55 | // Record event after D2H memory copy
56 | hipEventRecord(end_event, stream);
57 | auto end_time = chrono_clock;
58 |
59 | hipStreamSynchronize(stream);
60 |
61 | // Extract elapsed timings from event recordings
62 | hipEventElapsedTime(&timing_a, pre_kernel, post_kernel);
63 | hipEventElapsedTime(&timing_b, post_kernel, end_event);
64 | hipEventElapsedTime(&timing_c, pre_kernel, end_event);
65 |
66 | // Check that the results are right
67 | int error = 0;
68 | for(int i = 0; i < n_total; ++i){
69 | if(a[i] != i)
70 | error = 1;
71 | }
72 |
73 | // Print results
74 | if(error)
75 | printf("Results are incorrect!\n");
76 | else
77 | printf("Results are correct!\n");
78 |
79 | // Print event timings
80 | printf("Event timings:\n");
81 | printf(" %.3f ms - kernel\n", (timing_a) );
82 | printf(" %.3f ms - D2H copy\n", (timing_b) );
83 | printf(" %.3f ms - total time\n", (timing_c) );
84 | /* #error print event timings here */
85 |
86 | // Print clock timings
87 | printf("std::chrono timings:\n");
88 | printf(" %.3f ms - kernel\n", 1e-3 * (double)get_mus(d2h_time - start_time));
89 | printf(" %.3f ms - device to host copy\n", 1e-3 * (double)get_mus(end_time - d2h_time));
90 | printf(" %.3f ms - total time\n", 1e-3 * (double)get_mus(end_time - start_time));
91 |
92 | // Destroy Stream
93 | hipStreamDestroy(stream);
94 |
95 | // Destroy events
96 | /* #error destroy events here */
97 | hipEventDestroy(pre_kernel);
98 | hipEventDestroy(post_kernel);
99 | hipEventDestroy(end_event);
100 |
101 | // Deallocations
102 | hipFree(d_a); // Device
103 | hipHostFree(a); // Host
104 | }
105 |
--------------------------------------------------------------------------------
/streams/02-concurrency/README.md:
--------------------------------------------------------------------------------
1 | # Investigating streams and events
2 |
3 | This exercise demonstrates an asynchronous data transfer and computation. Three different asynchronous cases are created, and their timings are printed out. The timings are recorded with hipEvent calls.
4 |
5 | ## Instructions
6 |
7 | In the exercise, the following HIP functions are needed (a minimal usage sketch follows the list):
8 |
9 | * `hipStreamCreate()`
10 | * `hipMemcpyAsync()`
11 | * `hipEventRecord()`
12 | * `hipEventSynchronize()`
13 | * `hipEventElapsedTime()`
14 | * `hipStreamDestroy()`
15 |
16 | ### Case 0
17 |
18 | 1) Create and destroy `n_stream` streams in the main function in the locations marked by `#error`
19 | 2) The function `case_0()` is already complete and can be used as a reference
20 |
21 | ### Case 1
22 |
23 | 1) In the `case_1()` function, create a loop over `n_stream` and split the work done by the kernel call of Case 0 into multiple kernel calls (one kernel call per stream with an even workload per stream)
24 | 2) Record events using `start_event` and `stop_event` arrays for each stream before and after the kernel call
25 |
26 | ### Case 2
27 |
28 | 1) Create a loop in the function `case_2()`
29 |    1) In the loop: Split the data copy from host to device into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly between the streams)
30 |    2) In the loop: Launch the kernel for each stream similarly to Case 1
31 |    3) In the loop: Split the data copy from device to host into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly between the streams)
32 | 2) Record total timing of the loop, use `start_event[n_stream]` and `stop_event[n_stream]` array positions
33 | 3) Additionally, record events for each stream using `start_event` and `stop_event` arrays before H-to-D memcopy and after D-to-H memcopy, respectively
34 | 4) Synchronize host with each `stop_event[i] `
35 | 5) Get timings between each corresponding `start_event[i]` and `stop_event[i]`
36 |
37 | ### Case 3
38 |
39 | 1) Copy the case 2 here
40 | 2) Instead of doing the asynchronous memcopies and the kernel in the same loop as in Case 2, create a separate loop for each (3 loops in total)
41 | 3) Make sure you record events in appropriate locations to get correct timings
42 |
43 | ## Additional considerations
44 |
45 | * You can try setting `USE_PINNED_HOST_MEM` to `0` at line `#6`, to see how the timings change if we do not use pinned host memory.
46 |
--------------------------------------------------------------------------------
/third-party/hipcub/hipcub.hpp:
--------------------------------------------------------------------------------
1 | #include <cub/cub.cuh>
2 |
3 | #define hipcub cub
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_hcc.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_HCC_H_
22 | #define HIPRAND_HCC_H_
23 |
24 | #include <rocrand.h>
25 |
26 | typedef rocrand_generator_base_type hiprandGenerator_st;
27 |
28 | typedef struct rocrand_discrete_distribution_st hiprandDiscreteDistribution_st;
29 |
30 | #endif // HIPRAND_HCC_H_
31 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_kernel.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_KERNEL_H_
22 | #define HIPRAND_KERNEL_H_
23 |
24 | #ifndef QUALIFIERS
25 | #define QUALIFIERS __forceinline__ __device__
26 | #endif // QUALIFIERS
27 |
28 | #include
29 | #include
30 |
31 | /** \addtogroup hipranddevice
32 | *
33 | * @{
34 | */
35 |
36 | /**
37 | * \def HIPRAND_PHILOX4x32_DEFAULT_SEED
38 | * \brief Default seed for PHILOX4x32 PRNG.
39 | */
40 | #define HIPRAND_PHILOX4x32_DEFAULT_SEED 0ULL
41 | /**
42 | * \def HIPRAND_XORWOW_DEFAULT_SEED
43 | * \brief Default seed for XORWOW PRNG.
44 | */
45 | #define HIPRAND_XORWOW_DEFAULT_SEED 0ULL
46 | /**
47 | * \def HIPRAND_MRG32K3A_DEFAULT_SEED
48 | * \brief Default seed for MRG32K3A PRNG.
49 | */
50 | #define HIPRAND_MRG32K3A_DEFAULT_SEED 12345ULL
51 | /** @} */ // end of group hipranddevice
52 |
53 | #if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
54 | #include "hiprand/hiprand_kernel_hcc.h"
55 | #else
56 | #include "hiprand/hiprand_kernel_nvcc.h"
57 | #endif
58 |
59 | #endif // HIPRAND_KERNEL_H_
60 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_nvcc.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_NVCC_H_
22 | #define HIPRAND_NVCC_H_
23 |
24 | #include <curand.h>
25 |
26 | typedef struct curandGenerator_st hiprandGenerator_st;
27 |
28 | typedef struct curandDiscreteDistribution_st hiprandDiscreteDistribution_st;
29 |
30 | #endif // HIPRAND_NVCC_H_
31 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_version.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_VERSION_H_
22 | #define HIPRAND_VERSION_H_
23 |
24 | /// \def HIPRAND_VERSION
25 | /// \brief hipRAND library version
26 | ///
27 | /// Version number may not be visible in the documentation.
28 | ///
29 | /// HIPRAND_VERSION % 100 is the patch level,
30 | /// HIPRAND_VERSION / 100 % 1000 is the minor version,
31 | /// HIPRAND_VERSION / 100000 is the major version.
32 | ///
33 | /// For example, if HIPRAND_VERSION is 100500, then
34 | /// the major version is 1, the minor version is 5, and
35 | /// the patch level is 0.
36 | #define HIPRAND_VERSION 100500
37 |
38 | #endif // HIPRAND_VERSION_H_
39 |
--------------------------------------------------------------------------------