├── .github └── workflows │ ├── pages-html.yml │ ├── pages-pdf.yml │ └── pages.yml ├── LICENSE ├── README.md ├── about.yml ├── bonus └── heat-equation │ ├── README.md │ ├── common │ ├── bottle.dat │ ├── pngwriter.c │ └── pngwriter.h │ ├── cuda │ ├── Makefile │ ├── core.cpp │ ├── core_cuda.cu │ ├── heat.h │ ├── io.cpp │ ├── main.cpp │ ├── setup.cpp │ └── utilities.cpp │ ├── hip_solution │ └── Makefile │ └── serial │ ├── Makefile │ ├── core.cpp │ ├── fortran │ ├── Makefile │ ├── core.F90 │ ├── heat_mod.F90 │ ├── io.F90 │ ├── main.F90 │ ├── pngwriter_mod.F90 │ ├── setup.F90 │ └── utilities.F90 │ ├── heat.h │ ├── io.cpp │ ├── main.cpp │ ├── setup.cpp │ └── utilities.cpp ├── demos ├── device_management_hip.cpp ├── device_management_mpi_hip.cpp ├── device_properties_hip.cpp ├── fill.cpp ├── hello.cpp └── warp-div.cpp ├── docs ├── 01-introduction.md ├── 02-kernels.md ├── 03-streams.md ├── 04-memory.md ├── 05-fortran.md ├── 06-optimisation.md ├── 07-multi-gpu.md ├── 08-porting-to-hip.md ├── LICENSE ├── Makefile ├── img │ ├── 01.png │ ├── 04.png │ ├── AMD-GCN-3.png │ ├── BankConflicts.jpeg │ ├── CU.png │ ├── CUgray.png │ ├── NoBankConflicts.jpeg │ ├── ThreadExecution.jpg │ ├── ThreadExecution_new.jpg │ ├── a100.png │ ├── a100_fp32_core.png │ ├── a100_sm.png │ ├── a100_smsp.png │ ├── amd_computeunit.png │ ├── amd_instinct_mi250x_oam.png │ ├── amd_m200.png │ ├── amd_mi200.jpg │ ├── amd_mi200.png │ ├── arrow.png │ ├── block_sm_cu.png │ ├── coalesced.svg │ ├── coalesced_access_1.png │ ├── coalesced_access_3.png │ ├── coalesced_access_4.png │ ├── coarse_CU.svg │ ├── comparison.png │ ├── compp.svg │ ├── copy_d2h.png │ ├── copy_h2d.png │ ├── cpu_waits_on_gpu.png │ ├── cu_sm_eu.png │ ├── cublas_cuda_hip.png │ ├── do_this_computation.png │ ├── execution-model.png │ ├── execution-model.svg │ ├── global-mem-arrow.svg │ ├── gpu-bws.png │ ├── gpu-cluster.png │ ├── gpuConnect.png │ ├── gpu_as_a_wide_vector_unit.png │ ├── gpu_as_cus_sms_eus.png │ ├── gpu_as_vector_units.png │ ├── gpu_as_vector_units_instructions.png │ ├── gpu_is_a_separate_processor_with_own_memory.png │ ├── gpufort.png │ ├── gpufort1.png │ ├── gpufort2.png │ ├── grid-threads.png │ ├── grid_gpu.png │ ├── hip-programming-2025-images.excalidraw │ ├── hipblas.png │ ├── hipfort.png │ ├── kernel_cuda_hip.png │ ├── lumi.jpg │ ├── lumi.png │ ├── many_blocks_to_one_sm.png │ ├── memlayout.png │ ├── memory-hierarchy.png │ ├── memsch.png │ ├── mi100-architecture.info │ ├── mi100-architecture.png │ ├── mi100_arch.png │ ├── mi250x.png │ ├── mi250x_cu.png │ ├── mi250x_cu_simd.png │ ├── microprocessor-trend-data.png │ ├── model_gpu.png │ ├── new_hipfort.png │ ├── no_block_to_many_sm.png │ ├── not_gpu_as_a_wide_vector_unit.png │ ├── oned_block.png │ ├── oned_grid.png │ ├── parallel_regions.png │ ├── parflow_single_node.png │ ├── perfetto.png │ ├── processes-threads.svg │ ├── runtimes_annotated.png │ ├── scalar_operation.png │ ├── single_proc_mpi_gpu2.png │ ├── single_proc_multi_gpu.png │ ├── single_proc_thread_gpu.png │ ├── software_hardware_mapping.png │ ├── stream-example.svg │ ├── streams-example-1.png │ ├── streams-example-2.png │ ├── streams.png │ ├── streams1_explain.png │ ├── streams2.png │ ├── streams2_explain.png │ ├── thread.png │ ├── thread_lane.png │ ├── threed_block.png │ ├── top500-perf-dev.png │ ├── top500-performance.png │ ├── transpose_img.png │ ├── twod_block.png │ ├── twod_grid.png │ ├── uncoalesced.svg │ ├── vector_operation.png │ ├── vector_unit.png │ ├── virtual_memory_addressing.png │ └── warp_wavefron_smsp_simd.png └── index ├── 
exercise-instructions.md ├── first_steps.md ├── hipfort ├── hiprand │ ├── Makefile │ ├── README.md │ ├── img │ │ └── pi_MC.png │ ├── pi.F90 │ ├── solution │ │ ├── Makefile │ │ └── pi.F90 │ └── solution_bonus │ │ ├── Makefile │ │ ├── hip_kernels.cpp │ │ └── pi.F90 └── saxpy │ ├── cuda │ └── main.cuf │ └── hip │ ├── README.md │ ├── hipsaxpy.cpp │ └── main.f03 ├── kernels ├── 01-hello-world │ ├── README.md │ └── hello.cpp ├── 02-error-checking │ ├── README.md │ ├── error-checking.cpp │ └── solution │ │ └── error-checking.cpp ├── 03-kernel-saxpy │ ├── README.md │ ├── saxpy.cpp │ └── solution │ │ └── saxpy.cpp └── 04-kernel-copy2d │ ├── README.md │ ├── copy2d.cpp │ └── solution │ └── copy2d.cpp ├── lambdas ├── 01-lambda │ ├── README.md │ ├── lambda.cpp │ └── solution │ │ └── lambda.cpp ├── 02-reduction │ ├── README.md │ ├── reduction.cpp │ └── solution │ │ └── reduction.cpp └── 03-hipify │ ├── Makefile │ ├── README.md │ └── src │ ├── bessel.cpp │ ├── comms.cpp │ ├── comms.h │ ├── devices_cuda.h │ ├── devices_host.h │ └── solution.h ├── memory ├── 01-prefetch │ ├── README.md │ ├── prefetch.cpp │ └── solution │ │ └── prefetch.cpp ├── 02-mempools │ ├── README.md │ ├── mempools.cpp │ └── solution │ │ └── mempools.cpp └── 03-struct │ ├── README.md │ ├── solution │ └── struct.cpp │ └── struct.cpp ├── multi-gpu ├── 01-p2pcopy │ ├── README.md │ ├── p2pcopy.cpp │ └── solution │ │ └── p2pcopy.cpp ├── 02-vector-sum │ ├── README.md │ ├── solution │ │ └── vector-sum.cpp │ └── vector-sum.cpp └── 03-mpi │ ├── Makefile │ ├── README.md │ ├── ping-pong.cpp │ └── solution │ ├── Makefile │ └── ping-pong.cpp ├── optimization ├── 01-coalescing │ ├── README.md │ ├── copy.cpp │ └── metrics.txt ├── 02-matrix_transpose │ ├── README.md │ ├── copy.cpp │ ├── matrix_transpose_naive.cpp │ ├── matrix_transpose_with_SM.cpp │ └── matrix_transpose_with_SM_nobc.cpp └── 03-trace │ ├── README.md │ └── streams.cpp ├── porting ├── README.md └── codes │ ├── README.md │ ├── Vector_Addition │ ├── Readme.md │ ├── cuda │ │ ├── Readme.md │ │ └── vecadd.cu │ └── hip_solution │ │ └── vecadd.cu │ └── saxpy │ ├── cublas │ ├── Makefile │ └── saxpy_cublas.cu │ ├── cuda │ └── saxpy.cu │ ├── hip │ └── README.md │ ├── hip_solution │ └── saxpy.cu │ ├── hipblas │ └── README.md │ └── hipblas_solution │ ├── Makefile │ └── saxpy_cublas.cu ├── setup_env_lumi ├── streams ├── 01-event-record │ ├── README.md │ ├── record.cpp │ └── solution │ │ └── record.cpp └── 02-concurrency │ ├── README.md │ ├── solution │ └── streams.cpp │ └── streams.cpp └── third-party ├── hipcub └── hipcub.hpp └── hiprand ├── hiprand.h ├── hiprand.hpp ├── hiprand_hcc.h ├── hiprand_kernel.h ├── hiprand_kernel_hcc.h ├── hiprand_kernel_nvcc.h ├── hiprand_mtgp32_host.h ├── hiprand_nvcc.h └── hiprand_version.h /.github/workflows/pages-html.yml: -------------------------------------------------------------------------------- 1 | name: Deploy HTML slides to Pages 2 | 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: 7 | - "main" 8 | paths: 9 | - "docs/**" 10 | - ".github/workflows/pages.yml" 11 | 12 | # Allows you to run this workflow manually from the Actions tab 13 | workflow_dispatch: 14 | 15 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 16 | permissions: 17 | contents: read 18 | pages: write 19 | id-token: write 20 | 21 | jobs: 22 | pages-html: 23 | uses: ./.github/workflows/pages.yml 24 | with: 25 | include_pdf: false 26 | -------------------------------------------------------------------------------- 
/.github/workflows/pages-pdf.yml: -------------------------------------------------------------------------------- 1 | name: Deploy HTML and PDF slides to Pages 2 | 3 | on: 4 | # Runs after HTML deployment 5 | workflow_run: 6 | workflows: [Deploy HTML slides to Pages] 7 | types: 8 | - completed 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | jobs: 20 | pages-pdf: 21 | if: ${{ github.event.workflow_run.conclusion != 'failure' }} 22 | uses: ./.github/workflows/pages.yml 23 | with: 24 | include_pdf: true 25 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | # Script based on examples in https://github.com/actions/starter-workflows/tree/main/pages 2 | name: Deploy slides to Pages 3 | 4 | on: 5 | workflow_call: 6 | inputs: 7 | include_pdf: 8 | required: true 9 | type: boolean 10 | 11 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 12 | permissions: 13 | contents: read 14 | pages: write 15 | id-token: write 16 | 17 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 18 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: false 22 | 23 | jobs: 24 | build: 25 | timeout-minutes: 30 26 | runs-on: ubuntu-latest 27 | container: 28 | image: ghcr.io/csc-training/slidefactory:3.3.0 29 | steps: 30 | - name: Checkout 31 | uses: actions/checkout@v4 32 | - name: Setup Pages 33 | id: pages 34 | uses: actions/configure-pages@v4 35 | - name: Build slides 36 | env: 37 | INCLUDE_PDF: ${{ inputs.include_pdf }} 38 | shell: bash 39 | run: | 40 | git config --global --add safe.directory $PWD 41 | GIT_SHORT_SHA=$(git rev-parse --short $GITHUB_SHA) 42 | GIT_DATE=$(git show -s --format=%ci $GITHUB_SHA) 43 | 44 | ARGS="" 45 | [[ "$INCLUDE_PDF" == "true" ]] && ARGS="--with-pdf" 46 | 47 | slidefactory pages about.yml build --info_content "Updated for [$GIT_SHORT_SHA]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/commit/$GITHUB_SHA) ($GIT_DATE)" $ARGS 48 | 49 | - name: Upload artifact 50 | uses: actions/upload-pages-artifact@v3 51 | with: 52 | path: ./build 53 | 54 | deploy: 55 | environment: 56 | name: github-pages 57 | url: ${{ steps.deployment.outputs.page_url }} 58 | runs-on: ubuntu-latest 59 | needs: build 60 | steps: 61 | - name: Deploy to GitHub Pages 62 | id: deployment 63 | uses: actions/deploy-pages@v4 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | All material in the directory 'docs/' and its sub-directories as well as 2 | all images are licensed under a Creative Commons Attribution-ShareAlike 4.0 3 | International (CC-BY-SA 4.0) license unless otherwise noted. 4 | 5 | Full text of the license is available in the file 'docs/LICENSE' and at 6 | . 7 | 8 | Any other files are licensed under the MIT license (below) unless otherwise 9 | noted. 
10 | 11 | --- 12 | 13 | MIT License 14 | 15 | Copyright (c) 2021 CSC Training 16 | 17 | Permission is hereby granted, free of charge, to any person obtaining a copy 18 | of this software and associated documentation files (the "Software"), to deal 19 | in the Software without restriction, including without limitation the rights 20 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 21 | copies of the Software, and to permit persons to whom the Software is 22 | furnished to do so, subject to the following conditions: 23 | 24 | The above copyright notice and this permission notice shall be included in all 25 | copies or substantial portions of the Software. 26 | 27 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 28 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 31 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 32 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 33 | SOFTWARE. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPU programming with HIP 2 | 3 | Course material for the CSC course "GPU programming with HIP". The course is 4 | part of the EuroCC training activities at CSC. 5 | 6 | ## Agenda 7 | 8 | ### Day 1 9 | 10 | | Time | Topic | 11 | | ---- | ----- | 12 | | 09:00–09:30 | Welcome, LUMI access, slurm, git, web interface (JL) | 13 | | 09:30–09:45 | Break/debugging access | 14 | | 09:45–10:30 | Introduction to GPU programming (JL) | 15 | | 10:30–10:45 | Break & Snacks | 16 | | 10:45-11:30 | HIP and GPU kernels (JL) | 17 | | 11:30-11:45 | Break | 18 | | 11:45-12:15 | Exercises | 19 | | 12:15-13:00 | Lunch | 20 | | 13:00-13:45 | Streams, events, and synchronization (JK) | 21 | | 13:45-14:00 | Break | 22 | | 14:00-14:30 | Exercises | 23 | | 14:30-15:15 | Memory allocations, access and unified memory (JK) | 24 | | 15:15-15:30 | Break | 25 | | 15:30-16:00 | Exercises | 26 | | 16:00-16:15 | Day summary | 27 | 28 | 29 | ### Day 2 30 | 31 | | Time | Topic | 32 | | ---- | ----- | 33 | | 09:00–10:00 | Kernel optimizations (JK) | 34 | | 10:00–10:15 | Break & Snacks | 35 | | 10:15–10:45 | Exercises | 36 | | 10:45-11:30 | Multi-GPU programming, HIP+MPI (CA) | 37 | | 11:30-11:45 | Break | 38 | | 11:45-12:15 | Exercises | 39 | | 12:15-13:00 | Lunch | 40 | | 13:00-13:30 | Fortran and HIP (CA) | 41 | | 13:30-13:45 | Break | 42 | | 13:45-14:15 | Exercises | 43 | | 14:15-14:45 | Porting Applications to HIP (CA) | 44 | | 14:45-15:45 | Break & Exercises | 45 | | 15:45-16:00 | Close-up | 46 | 47 | 48 | ## Slides 49 | 50 | Link to [slides](https://csc-training.github.io/hip-programming/) 51 | 52 | ## First steps 53 | - [Which technologies have you used?](https://strawpoll.com/w4nWWYReQnA) 54 | - [First steps](first_steps.md) 55 | 56 | ## Exercises 57 | 58 | [General instructions](exercise-instructions.md) 59 | 60 | ### Introduction and GPU kernels 61 | 62 | - [Mental model quiz](https://siili.rahtiapp.fi/s/gpmWnLY8q#) 63 | - [Hello world](kernels/01-hello-world) 64 | - [Error checking](kernels/02-error-checking) 65 | - [Kernel saxpy](kernels/03-kernel-saxpy) 66 | - [Kernel copy2d](kernels/04-kernel-copy2d) 67 | 68 | ### Streams, events, and synchronization 69 | 70 | - 
[Understanding asynchronicity using events](streams/01-event-record) 71 | - [Investigating streams and events](streams/02-concurrency) 72 | 73 | ### Memory allocations, access, and unified memory 74 | 75 | - [Memory management strategies](memory/01-prefetch) 76 | - [The stream-ordered memory allocator and memory pools](memory/02-mempools) 77 | - [Unified memory and structs](memory/03-struct) 78 | 79 | ### Fortran and HIP 80 | 81 | - [SAXPY](hipfort/saxpy/hip/) 82 | - [HIPRAND](hipfort/hiprand/) 83 | 84 | ### Optimization 85 | 86 | - [Coalescing](optimization/01-coalescing) 87 | - [Matrix Transpose](optimization/02-matrix_transpose) 88 | - [Tracing](optimization/03-trace) 89 | 90 | ### Multi-GPU programming and HIP+MPI 91 | 92 | - [Peer to peer device access](multi-gpu/01-p2pcopy) 93 | - [Vector sum on two GPUs without MPI](multi-gpu/02-vector-sum) 94 | - [Ping-pong with multiple GPUs and MPI](multi-gpu/03-mpi) 95 | 96 | ### Porting to HIP 97 | 98 | - [Converting Tools & Portability](porting) 99 | 100 | #### Bonus 101 | - [Heat equation with HIP](bonus/heat-equation) 102 | -------------------------------------------------------------------------------- /about.yml: -------------------------------------------------------------------------------- 1 | # This file is used in the generation of the web page 2 | title: GPU programming with HIP 3 | slidesdir: docs 4 | -------------------------------------------------------------------------------- /bonus/heat-equation/README.md: -------------------------------------------------------------------------------- 1 | # Bonus: Heat equation solver with HIP 2 | 3 | Create a parallel version of a heat equation solver using HIP. 4 | 5 | Starting from a [serial heat equation solver](serial) (see below for details), 6 | port the code to GPUs using HIP. The main computational routine is the time 7 | evolution loop in the `core.cpp` file. 8 | 9 | Alternatively, you may start from a [CUDA+MPI version](cuda) and hipify the code to jump-start the work. 10 | 11 | Note: You may need to comment out the PNG generation parts if the system you 12 | are using doesn't have libpng installed. 13 | 14 | ## Heat equation solver 15 | 16 | The heat equation is a partial differential equation that describes the 17 | variation of temperature in a given region over time 18 | 19 | $$\frac{\partial u}{\partial t} = \alpha \nabla^2 u $$ 20 | 21 | where u(x, y, z, t) represents the temperature variation over space at a given 22 | time, and α is a thermal diffusivity constant. 23 | 24 | We limit ourselves to two dimensions (a plane) and discretize the equation onto 25 | a grid. The two-dimensional Laplacian can be 26 | discretized with finite differences as 27 | 28 | ```math 29 | \nabla^2 u = \frac{u(i-1,j)-2u(i,j)+u(i+1,j)}{(\Delta x)^2} + \frac{u(i,j-1)-2u(i,j)+u(i,j+1)}{(\Delta y)^2} 30 | 31 | ``` 32 | Given an initial condition (u(t=0) = u0), one can follow the time dependence 33 | of 34 | the temperature field with an explicit time evolution method: 35 | 36 | $$u^{m+1}(i,j) = u^m(i,j) + \Delta t \alpha \nabla^2 u^m(i,j) $$ 37 | 38 | Note: The algorithm is stable only when 39 | 40 | $$ \Delta t < \frac{1}{2 \alpha} \frac{(\Delta x \Delta y)^2}{(\Delta x)^2+ 41 | (\Delta y)^2} $$ 42 | 43 | There is a solver for the 2D equation implemented in C++ and Fortran. You can 44 | compile the program by adjusting the Makefile as needed and typing `make`. The 45 | solver carries out the time development of the 2D heat equation over the 46 | number of time steps provided by the user. 
The default geometry is a flat 47 | rectangle (with grid size provided by the user), but other shapes may be used 48 | via input files. Examples on how to run the binary: 49 | 50 | - `./heat` 51 | No arguments - the program will run with the default arguments: 200 x 200 52 | grid and 500 time steps 53 | - `./heat ../common/bottle.dat` 54 | One argument - start from a temperature grid provided in the given file 55 | for the default number of time steps 56 | - `./heat ../common/bottle.dat 1000` 57 | Two arguments - will run the program starting from a temperature grid 58 | provided in the given file for 1000 time steps 59 | - `./heat 1024 2048 1000` 60 | Three arguments - will run the program using the grid dimensions (1024 x 2048) and the number of time steps (1000) specified by the arguments 61 | 62 | The program will produce a `.png` image of the temperature field after every 63 | 100 iterations. You can change that from the parameter `image_interval`. You 64 | can visualise the images using the command animate: `animate heat_*.png`, or 65 | by using `eog heat_000.png` and using the arrow-keys to loop backward or 66 | forward through the files. 67 | -------------------------------------------------------------------------------- /bonus/heat-equation/common/pngwriter.h: -------------------------------------------------------------------------------- 1 | #ifndef PNGWRITER_H_ 2 | #define PNGWRITER_H_ 3 | 4 | #if __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | int save_png(double *data, const int nx, const int ny, const char *fname, 9 | const char lang); 10 | 11 | #if __cplusplus 12 | } 13 | #endif 14 | #endif 15 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=gnu 3 | endif 4 | 5 | COMMONDIR=../common 6 | 7 | ifeq ($(COMP),pgi) 8 | CXX=mpicxx 9 | CC=pgcc 10 | NVCC=nvcc -ccbin pgc++ 11 | NVCCFLAGS=-g -O3 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -I$(COMMONDIR) 12 | CCFLAGS=-g -O3 -I$(COMMONDIR) 13 | LDFLAGS=-L$(CUDA_INSTALL_ROOT)/lib64 14 | LIBS=-lpng -lcudart 15 | endif 16 | 17 | ifeq ($(COMP),gnu) 18 | CXX=mpicxx 19 | CC=gcc 20 | NVCC=nvcc 21 | NVCCFLAGS=-g -O3 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -I$(COMMONDIR) 22 | CCFLAGS=-g -O3 -Wall -I$(COMMONDIR) 23 | LDFLAGS= 24 | LIBS=-lpng -lcudart 25 | endif 26 | 27 | EXE=heat_cuda 28 | OBJS=main.o core.o core_cuda.o setup.o utilities.o io.o 29 | OBJS_PNG=$(COMMONDIR)/pngwriter.o 30 | 31 | 32 | all: $(EXE) 33 | 34 | 35 | core.o: core.cpp heat.h 36 | core_cuda.o: core_cuda.cu heat.h 37 | utilities.o: utilities.cpp heat.h 38 | setup.o: setup.cpp heat.h 39 | io.o: io.cpp heat.h 40 | main.o: main.cpp heat.h 41 | 42 | $(OBJS_PNG): C_COMPILER := $(CC) 43 | $(OBJS): C_COMPILER := $(CXX) 44 | 45 | $(EXE): $(OBJS) $(OBJS_PNG) 46 | $(CXX) $(CCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS) 47 | 48 | %.o: %.cpp 49 | $(CXX) $(CCFLAGS) -c $< -o $@ 50 | 51 | %.o: %.c 52 | $(CC) $(CCFLAGS) -c $< -o $@ 53 | 54 | %.o: %.cu 55 | $(NVCC) $(NVCCFLAGS) -c $< -o $@ 56 | 57 | .PHONY: clean 58 | clean: 59 | -/bin/rm -f $(EXE) a.out *.o *.png *~ 60 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/core.cpp: -------------------------------------------------------------------------------- 1 | /* Main solver routines for heat equation solver */ 2 | 3 | #include 4 | 
#include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "heat.h" 10 | 11 | /* Exchange the boundary values */ 12 | void exchange(field *temperature, parallel_data *parallel) 13 | { 14 | double *data; 15 | double *sbuf_up, *sbuf_down, *rbuf_up, *rbuf_down; 16 | 17 | data = temperature->devdata; 18 | 19 | // Send to the up, receive from down 20 | sbuf_up = data + temperature->ny + 2; // upper data 21 | rbuf_down = data + (temperature->nx + 1) * (temperature->ny + 2); // lower halo 22 | 23 | MPI_Sendrecv(sbuf_up, temperature->ny + 2, MPI_DOUBLE, 24 | parallel->nup, 11, 25 | rbuf_down, temperature->ny + 2, MPI_DOUBLE, 26 | parallel->ndown, 11, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 27 | 28 | // Send to the down, receive from up 29 | sbuf_down = data + temperature->nx * (temperature->ny + 2); // lower data 30 | rbuf_up = data; // upper halo 31 | 32 | MPI_Sendrecv(sbuf_down, temperature->ny + 2, MPI_DOUBLE, 33 | parallel->ndown, 12, 34 | rbuf_up, temperature->ny + 2, MPI_DOUBLE, 35 | parallel->nup, 12, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 36 | 37 | } 38 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/core_cuda.cu: -------------------------------------------------------------------------------- 1 | /* Main solver routines for heat equation solver */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "heat.h" 11 | 12 | /* CUDA error handling macro */ 13 | #define CUDA_ERR(err) (cuda_errchk(err, __FILE__, __LINE__ )) 14 | static inline void cuda_errchk(cudaError_t err, const char *file, int line) { 15 | if (err != cudaSuccess) { 16 | printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line); 17 | exit(EXIT_FAILURE); 18 | } 19 | } 20 | 21 | /* Update the temperature values using five-point stencil */ 22 | __global__ void evolve_kernel(double *currdata, double *prevdata, double a, double dt, int nx, int ny, 23 | double dx2, double dy2) 24 | { 25 | 26 | /* Determine the temperature field at next time step 27 | * As we have fixed boundary conditions, the outermost gridpoints 28 | * are not updated. 
*/ 29 | int ind, ip, im, jp, jm; 30 | 31 | // CUDA threads are arranged in column major order; thus j index from x, i from y 32 | int j = blockIdx.x * blockDim.x + threadIdx.x; 33 | int i = blockIdx.y * blockDim.y + threadIdx.y; 34 | 35 | if (i > 0 && j > 0 && i < nx+1 && j < ny+1) { 36 | ind = i * (ny + 2) + j; 37 | ip = (i + 1) * (ny + 2) + j; 38 | im = (i - 1) * (ny + 2) + j; 39 | jp = i * (ny + 2) + j + 1; 40 | jm = i * (ny + 2) + j - 1; 41 | currdata[ind] = prevdata[ind] + a * dt * 42 | ((prevdata[ip] -2.0 * prevdata[ind] + prevdata[im]) / dx2 + 43 | (prevdata[jp] - 2.0 * prevdata[ind] + prevdata[jm]) / dy2); 44 | 45 | } 46 | 47 | } 48 | 49 | void evolve(field *curr, field *prev, double a, double dt) 50 | { 51 | int nx, ny; 52 | double dx2, dy2; 53 | nx = prev->nx; 54 | ny = prev->ny; 55 | dx2 = prev->dx * prev->dx; 56 | dy2 = prev->dy * prev->dy; 57 | 58 | /* CUDA thread settings */ 59 | const int blocksize = 16; //!< CUDA thread block dimension 60 | dim3 dimBlock(blocksize, blocksize); 61 | // CUDA threads are arranged in column major order; thus make ny x nx grid 62 | dim3 dimGrid((ny + 2 + blocksize - 1) / blocksize, 63 | (nx + 2 + blocksize - 1) / blocksize); 64 | 65 | evolve_kernel<<<dimGrid, dimBlock>>>(curr->devdata, prev->devdata, a, dt, nx, ny, dx2, dy2); 66 | CUDA_ERR(cudaDeviceSynchronize()); 67 | } 68 | 69 | void enter_data(field *temperature1, field *temperature2) 70 | { 71 | size_t datasize; 72 | 73 | datasize = (temperature1->nx + 2) * (temperature1->ny + 2) * sizeof(double); 74 | 75 | CUDA_ERR(cudaMalloc(&temperature1->devdata, datasize)); 76 | CUDA_ERR(cudaMalloc(&temperature2->devdata, datasize)); 77 | 78 | CUDA_ERR(cudaMemcpy(temperature1->devdata, temperature1->data, datasize, cudaMemcpyHostToDevice)); 79 | CUDA_ERR(cudaMemcpy(temperature2->devdata, temperature2->data, datasize, cudaMemcpyHostToDevice)); 80 | } 81 | 82 | /* Copy a temperature field from the device to the host */ 83 | void update_host(field *temperature) 84 | { 85 | size_t datasize; 86 | 87 | datasize = (temperature->nx + 2) * (temperature->ny + 2) * sizeof(double); 88 | CUDA_ERR(cudaMemcpy(temperature->data, temperature->devdata, datasize, cudaMemcpyDeviceToHost)); 89 | } 90 | 91 | /* Copy a temperature field from the host to the device */ 92 | void update_device(field *temperature) 93 | { 94 | size_t datasize; 95 | 96 | datasize = (temperature->nx + 2) * (temperature->ny + 2) * sizeof(double); 97 | CUDA_ERR(cudaMemcpy(temperature->devdata, temperature->data, datasize, cudaMemcpyHostToDevice)); 98 | } 99 | 100 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/heat.h: -------------------------------------------------------------------------------- 1 | #ifndef __HEAT_H__ 2 | #define __HEAT_H__ 3 | 4 | 5 | /* Datatype for temperature field */ 6 | typedef struct { 7 | /* nx and ny are the true dimensions of the field. 
The array data 8 | * contains also ghost layers, so it will have dimensions nx+2 x ny+2 */ 9 | int nx; /* Local dimensions of the field */ 10 | int ny; 11 | int nx_full; /* Global dimensions of the field */ 12 | int ny_full; /* Global dimensions of the field */ 13 | double dx; 14 | double dy; 15 | double *data; 16 | double *devdata; /* Data in device */ 17 | } field; 18 | 19 | /* Datatype for basic parallelization information */ 20 | typedef struct { 21 | int size; /* Number of MPI tasks */ 22 | int rank; 23 | int nup, ndown; /* Ranks of neighbouring MPI tasks */ 24 | } parallel_data; 25 | 26 | 27 | /* We use here fixed grid spacing */ 28 | #define DX 0.01 29 | #define DY 0.01 30 | 31 | #if __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /* Function prototypes */ 36 | void set_field_dimensions(field *temperature, int nx, int ny, 37 | parallel_data *parallel); 38 | 39 | void parallel_setup(parallel_data *parallel, int nx, int ny); 40 | 41 | void parallel_set_dimensions(parallel_data *parallel, int nx, int ny); 42 | 43 | void initialize(int argc, char *argv[], field *temperature1, 44 | field *temperature2, int *nsteps, parallel_data *parallel); 45 | 46 | void generate_field(field *temperature, parallel_data *parallel); 47 | 48 | double average(field *temperature); 49 | 50 | void exchange(field *temperature, parallel_data *parallel); 51 | 52 | void evolve(field *curr, field *prev, double a, double dt); 53 | 54 | void write_field(field *temperature, int iter, parallel_data *parallel); 55 | 56 | void read_field(field *temperature1, field *temperature2, 57 | char *filename, parallel_data *parallel); 58 | 59 | void copy_field(field *temperature1, field *temperature2); 60 | 61 | void swap_fields(field *temperature1, field *temperature2); 62 | 63 | void allocate_field(field *temperature); 64 | 65 | void finalize(field *temperature1, field *temperature2); 66 | 67 | void enter_data(field *temperature1, field *temperature2); 68 | 69 | void update_host(field *temperature); 70 | 71 | void update_device(field *temperature); 72 | 73 | #if __cplusplus 74 | } 75 | #endif 76 | #endif /* __HEAT_H__ */ 77 | 78 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/main.cpp: -------------------------------------------------------------------------------- 1 | /* Heat equation solver in 2D. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include /* Needed for CUDA-aware check */ 9 | 10 | #include "heat.h" 11 | 12 | int main(int argc, char **argv) 13 | { 14 | double a = 0.5; //!< Diffusion constant 15 | field current, previous; //!< Current and previous temperature fields 16 | 17 | double dt; //!< Time step 18 | int nsteps; //!< Number of time steps 19 | 20 | int image_interval = 1500; //!< Image output interval 21 | 22 | parallel_data parallelization; //!< Parallelization info 23 | 24 | double dx2, dy2; //!< Delta x and y squared 25 | 26 | double average_temp; //!< Average temperature 27 | 28 | double start_clock, stop_clock; //!< Time stamps 29 | 30 | 31 | MPI_Init(&argc, &argv); 32 | 33 | if (1 != MPIX_Query_cuda_support()) { 34 | printf("CUDA aware MPI required\n"); 35 | fflush(stdout); 36 | MPI_Abort(MPI_COMM_WORLD, 5); 37 | } 38 | initialize(argc, argv, ¤t, &previous, &nsteps, ¶llelization); 39 | 40 | /* Output the initial field */ 41 | write_field(¤t, 0, ¶llelization); 42 | 43 | average_temp = average(¤t); 44 | if (parallelization.rank == 0) { 45 | printf("Average temperature at start: %f\n", average_temp); 46 | } 47 | 48 | 49 | /* Largest stable time step */ 50 | dx2 = current.dx * current.dx; 51 | dy2 = current.dy * current.dy; 52 | dt = dx2 * dy2 / (2.0 * a * (dx2 + dy2)); 53 | 54 | /* Get the start time stamp */ 55 | start_clock = MPI_Wtime(); 56 | 57 | /* Copy fields to device */ 58 | enter_data(¤t, &previous); 59 | 60 | /* Time evolve */ 61 | for (int iter = 1; iter <= nsteps; iter++) { 62 | exchange(&previous, ¶llelization); 63 | evolve(¤t, &previous, a, dt); 64 | if (iter % image_interval == 0) { 65 | update_host(¤t); 66 | write_field(¤t, iter, ¶llelization); 67 | } 68 | /* Swap current field so that it will be used 69 | as previous for next iteration step */ 70 | swap_fields(¤t, &previous); 71 | } 72 | 73 | update_host(&previous); 74 | stop_clock = MPI_Wtime(); 75 | 76 | /* Average temperature for reference */ 77 | average_temp = average(&previous); 78 | 79 | /* Determine the CPU time used for the iteration */ 80 | if (parallelization.rank == 0) { 81 | printf("Iteration took %.3f seconds.\n", (stop_clock - start_clock)); 82 | printf("Average temperature: %f\n", average_temp); 83 | if (argc == 1) { 84 | printf("Reference value with default arguments: 59.281239\n"); 85 | } 86 | } 87 | 88 | /* Output the final field */ 89 | write_field(&previous, nsteps, ¶llelization); 90 | 91 | finalize(¤t, &previous); 92 | MPI_Finalize(); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /bonus/heat-equation/cuda/utilities.cpp: -------------------------------------------------------------------------------- 1 | /* Utility functions for heat equation solver 2 | * NOTE: This file does not need to be edited! 
*/ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "heat.h" 11 | 12 | 13 | /* Copy data on temperature1 into temperature2 */ 14 | void copy_field(field *temperature1, field *temperature2) 15 | { 16 | assert(temperature1->nx == temperature2->nx); 17 | assert(temperature1->ny == temperature2->ny); 18 | memcpy(temperature2->data, temperature1->data, 19 | (temperature1->nx + 2) * (temperature1->ny + 2) * sizeof(double)); 20 | } 21 | 22 | /* Swap the data of fields temperature1 and temperature2 */ 23 | void swap_fields(field *temperature1, field *temperature2) 24 | { 25 | double *tmp; 26 | tmp = temperature1->data; 27 | temperature1->data = temperature2->data; 28 | temperature2->data = tmp; 29 | 30 | tmp = temperature1->devdata; 31 | temperature1->devdata = temperature2->devdata; 32 | temperature2->devdata = tmp; 33 | } 34 | 35 | /* Allocate memory for a temperature field and initialise it to zero */ 36 | void allocate_field(field *temperature) 37 | { 38 | // Allocate also ghost layers 39 | temperature->data = new double [(temperature->nx + 2) * (temperature->ny + 2)]; 40 | 41 | // Initialize to zero 42 | memset(temperature->data, 0.0, 43 | (temperature->nx + 2) * (temperature->ny + 2) * sizeof(double)); 44 | } 45 | 46 | /* Calculate average temperature */ 47 | double average(field *temperature) 48 | { 49 | double local_average = 0.0; 50 | double average = 0.0; 51 | 52 | for (int i = 1; i < temperature->nx + 1; i++) { 53 | for (int j = 1; j < temperature->ny + 1; j++) { 54 | int ind = i * (temperature->ny + 2) + j; 55 | local_average += temperature->data[ind]; 56 | } 57 | } 58 | 59 | MPI_Allreduce(&local_average, &average, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 60 | average /= (temperature->nx_full * temperature->ny_full); 61 | return average; 62 | } 63 | -------------------------------------------------------------------------------- /bonus/heat-equation/hip_solution/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=lumi 3 | endif 4 | 5 | COMMONDIR=../common 6 | 7 | ifeq ($(COMP),lumi) 8 | LIBPNG_DIR=/appl/lumi/SW/LUMI-22.12/C/EB/libpng/1.6.38-cpeCray-22.12 9 | CXX=CC 10 | CC=cc 11 | CXXFLAGS=-xhip -I$(COMMONDIR) 12 | CCFLAGS=-I$(LIBPNG_DIR)/include 13 | LDFLAGS=-L$(LIBPNG_DIR)/lib -Wl,-rpath=$(LIBPNG_DIR)/lib 14 | LIBS=-lpng 15 | endif 16 | 17 | EXE=heat_hip 18 | OBJS=main.o core.o core_hip.o setup.o utilities.o io.o 19 | OBJS_PNG=$(COMMONDIR)/pngwriter.o 20 | 21 | 22 | all: $(EXE) 23 | 24 | 25 | core.o: core.cpp heat.h 26 | core_hip.o: core_hip.cpp heat.h 27 | utilities.o: utilities.cpp heat.h 28 | setup.o: setup.cpp heat.h 29 | io.o: io.cpp heat.h 30 | main.o: main.cpp heat.h 31 | 32 | $(OBJS_PNG): C_COMPILER := $(CC) 33 | $(OBJS): C_COMPILER := $(CXX) 34 | 35 | $(EXE): $(OBJS) $(OBJS_PNG) 36 | $(CXX) $(CCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS) 37 | 38 | %.o: %.cpp 39 | $(CXX) $(CXXFLAGS) -c $< -o $@ 40 | 41 | %.o: %.c 42 | $(CC) $(CCFLAGS) -c $< -o $@ 43 | 44 | .PHONY: clean 45 | clean: 46 | -/bin/rm -f $(EXE) a.out *.o *.png *~ 47 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=gnu 3 | endif 4 | 5 | COMMONDIR=../common 6 | 7 | ifeq ($(COMP),pgi) 8 | CXX=pgCC 9 | CC=pgcc 10 | CCFLAGS=-g -O3 -fopenmp -I$(COMMONDIR) 11 | LDFLAGS= 12 | LIBS=-lpng 13 | endif 14 | 15 | ifeq ($(COMP),gnu) 16 | 
CXX=g++ 17 | CC=gcc 18 | CCFLAGS=-g -O3 -fopenmp -Wall -I$(COMMONDIR) 19 | LDFLAGS= 20 | LIBS=-lpng 21 | endif 22 | 23 | ifeq ($(COMP),intel) 24 | CXX=icpx 25 | CC=icx 26 | CCFLAGS=-g -O3 -fopenmp -I$(COMMONDIR) 27 | LDFLAGS= 28 | LIBS=-lpng 29 | endif 30 | 31 | EXE=heat_serial 32 | OBJS=main.o core.o setup.o utilities.o io.o 33 | OBJS_PNG=$(COMMONDIR)/pngwriter.o 34 | 35 | 36 | all: $(EXE) 37 | 38 | 39 | core.o: core.cpp heat.h 40 | utilities.o: utilities.cpp heat.h 41 | setup.o: setup.cpp heat.h 42 | io.o: io.cpp heat.h 43 | main.o: main.cpp heat.h 44 | 45 | $(OBJS_PNG): C_COMPILER := $(CC) 46 | $(OBJS): C_COMPILER := $(CXX) 47 | 48 | $(EXE): $(OBJS) $(OBJS_PNG) 49 | $(CXX) $(CCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS) 50 | 51 | %.o: %.cpp 52 | $(CXX) $(CCFLAGS) -c $< -o $@ 53 | 54 | %.o: %.c 55 | $(CC) $(CCFLAGS) -c $< -o $@ 56 | 57 | .PHONY: clean 58 | clean: 59 | -/bin/rm -f $(EXE) a.out *.o *.png *~ 60 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/core.cpp: -------------------------------------------------------------------------------- 1 | /* Main solver routines for heat equation solver */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "heat.h" 9 | 10 | /* Update the temperature values using five-point stencil */ 11 | void evolve(field *curr, field *prev, double a, double dt) 12 | { 13 | double dx2, dy2; 14 | int nx, ny; 15 | double *currdata, *prevdata; 16 | 17 | currdata = curr->data; 18 | prevdata = prev->data; 19 | nx = curr->nx; 20 | ny = curr->ny; 21 | 22 | /* Determine the temperature field at next time step 23 | * As we have fixed boundary conditions, the outermost gridpoints 24 | * are not updated. */ 25 | dx2 = prev->dx * prev->dx; 26 | dy2 = prev->dy * prev->dy; 27 | for (int i = 1; i < nx + 1; i++) { 28 | for (int j = 1; j < ny + 1; j++) { 29 | int ind = i * (ny + 2) + j; 30 | int ip = (i + 1) * (ny + 2) + j; 31 | int im = (i - 1) * (ny + 2) + j; 32 | int jp = i * (ny + 2) + j + 1; 33 | int jm = i * (ny + 2) + j - 1; 34 | currdata[ind] = prevdata[ind] + a * dt * 35 | ((prevdata[ip] -2.0 * prevdata[ind] + prevdata[im]) / dx2 + 36 | (prevdata[jp] - 2.0 * prevdata[ind] + prevdata[jm]) / dy2); 37 | } 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=gnu 3 | endif 4 | 5 | COMMONDIR=../../common 6 | 7 | ifeq ($(COMP),gnu) 8 | FC=gfortran 9 | CC=gcc 10 | FCFLAGS=-O3 -Wall -fopenmp 11 | CCFLAGS=-O3 -Wall -I$(COMMONDIR) 12 | LDFLAGS= 13 | LIBS=-lpng 14 | endif 15 | 16 | ifeq ($(COMP),pgi) 17 | FC=pgfortran 18 | CC=gcc 19 | FCFLAGS=-O3 -acc 20 | CCFLAGS=-O3 -I$(COMMONDIR) 21 | LDFLAGS= 22 | LIBS=-lpng 23 | endif 24 | 25 | 26 | EXE=heat_serial 27 | OBJS=main.o heat_mod.o core.o setup.o utilities.o io.o pngwriter_mod.o 28 | OBJS_PNG=$(COMMONDIR)/pngwriter.o 29 | 30 | all: $(EXE) 31 | 32 | $(COMMONDIR)/pngwriter.o: $(COMMONDIR)/pngwriter.c $(COMMONDIR)/pngwriter.h 33 | core.o: core.F90 heat_mod.o 34 | utilities.o: utilities.F90 heat_mod.o 35 | io.o: io.F90 heat_mod.o pngwriter_mod.o 36 | setup.o: setup.F90 heat_mod.o utilities.o io.o 37 | pngwriter_mod.o: pngwriter_mod.F90 heat_mod.o 38 | main.o: main.F90 heat_mod.o core.o io.o setup.o utilities.o 39 | 40 | $(EXE): $(OBJS) $(OBJS_PNG) 41 | $(FC) $(FCFLAGS) $(OBJS) $(OBJS_PNG) -o $@ $(LDFLAGS) $(LIBS) 42 | 43 | %.o: %.F90 44 | 
$(FC) $(FCFLAGS) -c $< -o $@ 45 | 46 | %.o: %.c 47 | $(CC) $(CCFLAGS) -c $< -o $@ 48 | 49 | .PHONY: clean 50 | clean: 51 | -/bin/rm -f $(EXE) a.out *.o *.mod *.png *~ 52 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/core.F90: -------------------------------------------------------------------------------- 1 | ! Main solver routines for heat equation solver 2 | module core 3 | use heat 4 | 5 | contains 6 | 7 | ! Compute one time step of temperature evolution 8 | ! Arguments: 9 | ! curr (type(field)): current temperature values 10 | ! prev (type(field)): values from previous time step 11 | ! a (real(dp)): update equation constant 12 | ! dt (real(dp)): time step value 13 | subroutine evolve(curr, prev, a, dt) 14 | 15 | implicit none 16 | 17 | type(field), intent(inout) :: curr, prev 18 | real(dp) :: a, dt 19 | integer :: i, j, nx, ny 20 | 21 | nx = curr%nx 22 | ny = curr%ny 23 | 24 | do j = 1, ny 25 | do i = 1, nx 26 | curr%data(i, j) = prev%data(i, j) + a * dt * & 27 | & ((prev%data(i-1, j) - 2.0 * prev%data(i, j) + & 28 | & prev%data(i+1, j)) / curr%dx**2 + & 29 | & (prev%data(i, j-1) - 2.0 * prev%data(i, j) + & 30 | & prev%data(i, j+1)) / curr%dy**2) 31 | end do 32 | end do 33 | end subroutine evolve 34 | 35 | end module core 36 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/heat_mod.F90: -------------------------------------------------------------------------------- 1 | ! Field metadata for heat equation solver 2 | module heat 3 | use iso_fortran_env, only : REAL64 4 | implicit none 5 | 6 | integer, parameter :: dp = REAL64 7 | real(dp), parameter :: DX = 0.01, DY = 0.01 ! Fixed grid spacing 8 | 9 | type :: field 10 | integer :: nx ! local dimension of the field 11 | integer :: ny 12 | integer :: nx_full ! global dimension of the field 13 | integer :: ny_full 14 | real(dp) :: dx 15 | real(dp) :: dy 16 | real(dp), dimension(:,:), allocatable :: data 17 | end type field 18 | 19 | contains 20 | ! Initialize the field type metadata 21 | ! Arguments: 22 | ! field0 (type(field)): input field 23 | ! nx, ny, dx, dy: field dimensions and spatial step size 24 | subroutine set_field_dimensions(field0, nx, ny) 25 | implicit none 26 | 27 | type(field), intent(out) :: field0 28 | integer, intent(in) :: nx, ny 29 | 30 | field0%dx = DX 31 | field0%dy = DY 32 | field0%nx = nx 33 | field0%ny = ny 34 | field0%nx_full = nx 35 | field0%ny_full = ny 36 | 37 | end subroutine set_field_dimensions 38 | 39 | end module heat 40 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/io.F90: -------------------------------------------------------------------------------- 1 | ! I/O routines for heat equation solver 2 | module io 3 | use heat 4 | 5 | contains 6 | 7 | ! Output routine, saves the temperature distribution as a png image 8 | ! Arguments: 9 | ! curr (type(field)): variable with the temperature data 10 | ! iter (integer): index of the time step 11 | subroutine write_field(curr, iter) 12 | 13 | use pngwriter 14 | implicit none 15 | type(field), intent(in) :: curr 16 | integer, intent(in) :: iter 17 | 18 | character(len=85) :: filename 19 | 20 | integer :: stat 21 | real(dp), dimension(:,:), allocatable, target :: full_data 22 | 23 | allocate(full_data(curr%nx_full, curr%ny_full)) 24 | ! 
Copy rand #0 data to the global array 25 | full_data(1:curr%nx, 1:curr%ny) = curr%data(1:curr%nx, 1:curr%ny) 26 | 27 | write(filename,'(A5,I4.4,A4,A)') 'heat_', iter, '.png' 28 | stat = save_png(full_data, curr%nx_full, curr%ny_full, filename) 29 | deallocate(full_data) 30 | 31 | end subroutine write_field 32 | 33 | 34 | ! Reads the temperature distribution from an input file 35 | ! Arguments: 36 | ! field0 (type(field)): field variable that will store the 37 | ! read data 38 | ! filename (char): name of the input file 39 | ! Note that this version assumes the input data to be in C memory layout 40 | subroutine read_field(field0, filename) 41 | 42 | implicit none 43 | type(field), intent(out) :: field0 44 | character(len=85), intent(in) :: filename 45 | 46 | integer :: nx, ny, i 47 | character(len=2) :: dummy 48 | 49 | real(dp), dimension(:,:), allocatable :: full_data 50 | 51 | open(10, file=filename) 52 | ! Read the header 53 | read(10, *) dummy, nx, ny 54 | 55 | call set_field_dimensions(field0, nx, ny) 56 | 57 | ! The arrays for temperature field contain also a halo region 58 | allocate(field0%data(0:field0%nx+1, 0:field0%ny+1)) 59 | 60 | allocate(full_data(nx, ny)) 61 | ! Read the data 62 | do i = 1, nx 63 | read(10, *) full_data(i, 1:ny) 64 | end do 65 | 66 | ! Copy to full array containing also boundaries 67 | field0%data(1:field0%nx, 1:field0%ny) = full_data(:,:) 68 | 69 | ! Set the boundary values 70 | field0%data(1:field0%nx, 0) = field0%data(1:field0%nx, 1) 71 | field0%data(1:field0%nx, field0%ny + 1) = field0%data(1:field0%nx, field0%ny) 72 | field0%data(0, 0:field0%ny + 1) = field0%data(1, 0:field0%ny + 1) 73 | field0%data(field0%nx + 1, 0:field0%ny + 1) = field0%data(field0%nx, 0:field0%ny + 1) 74 | 75 | close(10) 76 | deallocate(full_data) 77 | 78 | end subroutine read_field 79 | 80 | end module io 81 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/main.F90: -------------------------------------------------------------------------------- 1 | ! Heat equation solver in 2D. 2 | 3 | program heat_solve 4 | use heat 5 | use core 6 | use io 7 | use setup 8 | use utilities 9 | use omp_lib 10 | 11 | implicit none 12 | 13 | real(dp), parameter :: a = 0.5 ! Diffusion constant 14 | type(field) :: current, previous ! Current and previus temperature fields 15 | 16 | real(dp) :: dt ! Time step 17 | integer :: nsteps ! Number of time steps 18 | integer, parameter :: image_interval = 1500 ! Image output interval 19 | 20 | integer :: iter 21 | 22 | real(dp) :: average_temp ! Average temperature 23 | 24 | real(kind=dp) :: start, stop ! Timers 25 | 26 | call initialize(current, previous, nsteps) 27 | 28 | ! Draw the picture of the initial state 29 | call write_field(current, 0) 30 | 31 | average_temp = average(current) 32 | write(*,'(A,F9.6)') 'Average temperature at start: ', average_temp 33 | 34 | ! Largest stable time step 35 | dt = current%dx**2 * current%dy**2 / & 36 | & (2.0 * a * (current%dx**2 + current%dy**2)) 37 | 38 | ! Main iteration loop, save a picture every 39 | ! image_interval steps 40 | 41 | start = omp_get_wtime() 42 | 43 | do iter = 1, nsteps 44 | call evolve(current, previous, a, dt) 45 | if (mod(iter, image_interval) == 0) then 46 | call write_field(current, iter) 47 | end if 48 | call swap_fields(current, previous) 49 | end do 50 | 51 | stop = omp_get_wtime() 52 | 53 | ! 
Average temperature for reference 54 | average_temp = average(previous) 55 | 56 | write(*,'(A,F7.3,A)') 'Iteration took ', stop - start, ' seconds.' 57 | write(*,'(A,F9.6)') 'Average temperature: ', average_temp 58 | if (command_argument_count() == 0) then 59 | write(*,'(A,F9.6)') 'Reference value with default arguments: ', 59.281239 60 | end if 61 | 62 | call finalize(current, previous) 63 | 64 | end program heat_solve 65 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/pngwriter_mod.F90: -------------------------------------------------------------------------------- 1 | ! PNG writer for heat equation solver 2 | module pngwriter 3 | use heat 4 | 5 | contains 6 | 7 | function save_png(data, nx, ny, fname) result(stat) 8 | 9 | use, intrinsic :: ISO_C_BINDING 10 | implicit none 11 | 12 | real(dp), dimension(:,:), intent(in) :: data 13 | integer, intent(in) :: nx, ny 14 | character(len=*), intent(in) :: fname 15 | integer :: stat 16 | 17 | ! Interface for save_png C-function 18 | interface 19 | ! The C-function definition is 20 | ! int save_png(double *data, const int nx, const int ny, 21 | ! const char *fname) 22 | function save_png_c(data, nx, ny, fname, order) & 23 | & bind(C,name="save_png") result(stat) 24 | use, intrinsic :: ISO_C_BINDING 25 | implicit none 26 | real(kind=C_DOUBLE) :: data(*) 27 | integer(kind=C_INT), value, intent(IN) :: nx, ny 28 | character(kind=C_CHAR), intent(IN) :: fname(*) 29 | character(kind=C_CHAR), value, intent(IN) :: order 30 | integer(kind=C_INT) :: stat 31 | end function save_png_c 32 | end interface 33 | 34 | stat = save_png_c(data, nx, ny, trim(fname) // C_NULL_CHAR, 'f') 35 | if (stat /= 0) then 36 | write(*,*) 'save_png returned error!' 37 | end if 38 | 39 | end function save_png 40 | 41 | end module pngwriter 42 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/fortran/utilities.F90: -------------------------------------------------------------------------------- 1 | ! Utility routines for heat equation solver 2 | ! NOTE: This file does not need to be edited! 3 | module utilities 4 | use heat 5 | 6 | contains 7 | 8 | ! Swap the data fields of two variables of type field 9 | ! Arguments: 10 | ! curr, prev (type(field)): the two variables that are swapped 11 | subroutine swap_fields(curr, prev) 12 | 13 | implicit none 14 | 15 | type(field), intent(inout) :: curr, prev 16 | real(dp), allocatable, dimension(:,:) :: tmp 17 | 18 | call move_alloc(curr%data, tmp) 19 | call move_alloc(prev%data, curr%data) 20 | call move_alloc(tmp, prev%data) 21 | end subroutine swap_fields 22 | 23 | ! Copy the data from one field to another 24 | ! Arguments: 25 | ! from_field (type(field)): variable to copy from 26 | ! to_field (type(field)): variable to copy to 27 | subroutine copy_fields(from_field, to_field) 28 | 29 | implicit none 30 | 31 | type(field), intent(in) :: from_field 32 | type(field), intent(out) :: to_field 33 | 34 | ! Consistency checks 35 | if (.not.allocated(from_field%data)) then 36 | write (*,*) "Can not copy from a field without allocated data" 37 | stop 38 | end if 39 | if (.not.allocated(to_field%data)) then 40 | ! 
Target is not initialize, allocate memory 41 | allocate(to_field%data(lbound(from_field%data, 1):ubound(from_field%data, 1), & 42 | & lbound(from_field%data, 2):ubound(from_field%data, 2))) 43 | else if (any(shape(from_field%data) /= shape(to_field%data))) then 44 | write (*,*) "Wrong field data sizes in copy routine" 45 | print *, shape(from_field%data), shape(to_field%data) 46 | stop 47 | end if 48 | 49 | to_field%data = from_field%data 50 | 51 | to_field%nx = from_field%nx 52 | to_field%ny = from_field%ny 53 | to_field%nx_full = from_field%nx_full 54 | to_field%ny_full = from_field%ny_full 55 | to_field%dx = from_field%dx 56 | to_field%dy = from_field%dy 57 | end subroutine copy_fields 58 | 59 | function average(field0) 60 | 61 | implicit none 62 | 63 | real(dp) :: average 64 | type(field) :: field0 65 | 66 | real(dp) :: local_average 67 | integer :: rc 68 | 69 | average = sum(field0%data(1:field0%nx, 1:field0%ny)) 70 | average = average / (field0%nx_full * field0%ny_full) 71 | 72 | end function average 73 | 74 | end module utilities 75 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/heat.h: -------------------------------------------------------------------------------- 1 | #ifndef __HEAT_H__ 2 | #define __HEAT_H__ 3 | 4 | 5 | /* Datatype for temperature field */ 6 | typedef struct { 7 | /* nx and ny are the true dimensions of the field. The array data 8 | * contains also ghost layers, so it will have dimensions nx+2 x ny+2 */ 9 | int nx; /* Local dimensions of the field */ 10 | int ny; 11 | int nx_full; /* Global dimensions of the field */ 12 | int ny_full; /* Global dimensions of the field */ 13 | double dx; 14 | double dy; 15 | double *data; 16 | } field; 17 | 18 | /* We use here fixed grid spacing */ 19 | #define DX 0.01 20 | #define DY 0.01 21 | 22 | #if __cplusplus 23 | extern "C" { 24 | #endif 25 | /* Function prototypes */ 26 | void set_field_dimensions(field *temperature, int nx, int ny); 27 | 28 | void initialize(int argc, char *argv[], field *temperature1, 29 | field *temperature2, int *nsteps); 30 | 31 | void generate_field(field *temperature); 32 | 33 | double average(field *temperature); 34 | 35 | void evolve(field *curr, field *prev, double a, double dt); 36 | 37 | void write_field(field *temperature, int iter); 38 | 39 | void read_field(field *temperature1, field *temperature2, 40 | char *filename); 41 | 42 | void copy_field(field *temperature1, field *temperature2); 43 | 44 | void swap_fields(field *temperature1, field *temperature2); 45 | 46 | void allocate_field(field *temperature); 47 | 48 | void finalize(field *temperature1, field *temperature2); 49 | 50 | #if __cplusplus 51 | } 52 | #endif 53 | #endif /* __HEAT_H__ */ 54 | 55 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/io.cpp: -------------------------------------------------------------------------------- 1 | /* I/O related functions for heat equation solver */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "heat.h" 9 | #include "pngwriter.h" 10 | 11 | /* Output routine that prints out a picture of the temperature 12 | * distribution. */ 13 | void write_field(field *temperature, int iter) 14 | { 15 | char filename[64]; 16 | 17 | /* The actual write routine takes only the actual data 18 | * (without ghost layers) so we need array for that. 
*/ 19 | int height, width; 20 | double *full_data; 21 | 22 | height = temperature->nx; 23 | width = temperature->ny; 24 | 25 | /* Copy the inner data */ 26 | full_data = new double [height * width]; 27 | for (int i = 0; i < temperature->nx; i++) 28 | memcpy(&full_data[i * width], &temperature->data[(i + 1) * (width + 2) + 1], 29 | temperature->ny * sizeof(double)); 30 | 31 | /* Write out the data to a png file */ 32 | sprintf(filename, "%s_%04d.png", "heat", iter); 33 | save_png(full_data, height, width, filename, 'c'); 34 | delete[] full_data; 35 | } 36 | 37 | /* Read the initial temperature distribution from a file and 38 | * initialize the temperature fields temperature1 and 39 | * temperature2 to the same initial state. */ 40 | void read_field(field *temperature1, field *temperature2, char *filename) 41 | { 42 | FILE *fp; 43 | int nx, ny, ind; 44 | double *full_data; 45 | 46 | int nx_local, ny_local, count; 47 | 48 | fp = fopen(filename, "r"); 49 | /* Read the header */ 50 | count = fscanf(fp, "# %d %d \n", &nx, &ny); 51 | if (count < 2) { 52 | fprintf(stderr, "Error while reading the input file!\n"); 53 | exit(-1); 54 | } 55 | 56 | set_field_dimensions(temperature1, nx, ny); 57 | set_field_dimensions(temperature2, nx, ny); 58 | 59 | /* Allocate arrays (including ghost layers) */ 60 | temperature1->data = new double[(temperature1->nx + 2) * (temperature1->ny + 2)]; 61 | temperature2->data = new double[(temperature1->nx + 2) * (temperature1->ny + 2)]; 62 | 63 | /* Full array */ 64 | full_data = new double [nx * ny]; 65 | 66 | /* Read the actual data */ 67 | for (int i = 0; i < nx; i++) { 68 | for (int j = 0; j < ny; j++) { 69 | ind = i * ny + j; 70 | count = fscanf(fp, "%lf", &full_data[ind]); 71 | } 72 | } 73 | 74 | nx_local = temperature1->nx; 75 | ny_local = temperature1->ny; 76 | 77 | /* Copy to the array containing also boundaries */ 78 | for (int i = 0; i < nx_local; i++) 79 | memcpy(&temperature1->data[(i + 1) * (ny_local + 2) + 1], &full_data[i * ny_local], 80 | ny * sizeof(double)); 81 | 82 | /* Set the boundary values */ 83 | for (int i = 1; i < nx_local + 1; i++) { 84 | temperature1->data[i * (ny_local + 2)] = temperature1->data[i * (ny_local + 2) + 1]; 85 | temperature1->data[i * (ny_local + 2) + ny + 1] = temperature1->data[i * (ny_local + 2) + ny]; 86 | } 87 | for (int j = 0; j < ny + 2; j++) { 88 | temperature1->data[j] = temperature1->data[ny_local + j]; 89 | temperature1->data[(nx_local + 1) * (ny_local + 2) + j] = 90 | temperature1->data[nx_local * (ny_local + 2) + j]; 91 | } 92 | 93 | copy_field(temperature1, temperature2); 94 | 95 | delete[] full_data; 96 | fclose(fp); 97 | } 98 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/main.cpp: -------------------------------------------------------------------------------- 1 | /* Heat equation solver in 2D. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "heat.h" 10 | 11 | int main(int argc, char **argv) 12 | { 13 | double a = 0.5; //!< Diffusion constant 14 | field current, previous; //!< Current and previous temperature fields 15 | 16 | double dt; //!< Time step 17 | int nsteps; //!< Number of time steps 18 | 19 | int image_interval = 1500; //!< Image output interval 20 | 21 | double dx2, dy2; //!< Delta x and y squared 22 | 23 | double average_temp; //!< Average temperature 24 | 25 | double start_clock, stop_clock; //!< Time stamps 26 | 27 | 28 | initialize(argc, argv, ¤t, &previous, &nsteps); 29 | 30 | /* Output the initial field */ 31 | write_field(¤t, 0); 32 | 33 | average_temp = average(¤t); 34 | printf("Average temperature at start: %f\n", average_temp); 35 | 36 | 37 | /* Largest stable time step */ 38 | dx2 = current.dx * current.dx; 39 | dy2 = current.dy * current.dy; 40 | dt = dx2 * dy2 / (2.0 * a * (dx2 + dy2)); 41 | 42 | /* Get the start time stamp */ 43 | start_clock = omp_get_wtime(); 44 | 45 | /* Time evolve */ 46 | for (int iter = 1; iter <= nsteps; iter++) { 47 | evolve(¤t, &previous, a, dt); 48 | if (iter % image_interval == 0) { 49 | write_field(¤t, iter); 50 | } 51 | /* Swap current field so that it will be used 52 | as previous for next iteration step */ 53 | swap_fields(¤t, &previous); 54 | } 55 | 56 | stop_clock = omp_get_wtime(); 57 | 58 | /* Average temperature for reference */ 59 | average_temp = average(&previous); 60 | 61 | /* Determine the CPU time used for the iteration */ 62 | printf("Iteration took %.3f seconds.\n", (stop_clock - start_clock)); 63 | printf("Average temperature: %f\n", average_temp); 64 | if (argc == 1) { 65 | printf("Reference value with default arguments: 59.281239\n"); 66 | } 67 | 68 | /* Output the final field */ 69 | write_field(&previous, nsteps); 70 | 71 | finalize(¤t, &previous); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /bonus/heat-equation/serial/utilities.cpp: -------------------------------------------------------------------------------- 1 | /* Utility functions for heat equation solver 2 | * NOTE: This file does not need to be edited! 
*/ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "heat.h" 10 | 11 | 12 | /* Copy data on temperature1 into temperature2 */ 13 | void copy_field(field *temperature1, field *temperature2) 14 | { 15 | assert(temperature1->nx == temperature2->nx); 16 | assert(temperature1->ny == temperature2->ny); 17 | memcpy(temperature2->data, temperature1->data, 18 | (temperature1->nx + 2) * (temperature1->ny + 2) * sizeof(double)); 19 | } 20 | 21 | /* Swap the data of fields temperature1 and temperature2 */ 22 | void swap_fields(field *temperature1, field *temperature2) 23 | { 24 | double *tmp; 25 | tmp = temperature1->data; 26 | temperature1->data = temperature2->data; 27 | temperature2->data = tmp; 28 | } 29 | 30 | /* Allocate memory for a temperature field and initialise it to zero */ 31 | void allocate_field(field *temperature) 32 | { 33 | // Allocate also ghost layers 34 | temperature->data = new double [(temperature->nx + 2) * (temperature->ny + 2)]; 35 | 36 | // Initialize to zero 37 | memset(temperature->data, 0.0, 38 | (temperature->nx + 2) * (temperature->ny + 2) * sizeof(double)); 39 | } 40 | 41 | /* Calculate average temperature */ 42 | double average(field *temperature) 43 | { 44 | double average = 0.0; 45 | 46 | for (int i = 1; i < temperature->nx + 1; i++) { 47 | for (int j = 1; j < temperature->ny + 1; j++) { 48 | int ind = i * (temperature->ny + 2) + j; 49 | average += temperature->data[ind]; 50 | } 51 | } 52 | 53 | average /= (temperature->nx_full * temperature->ny_full); 54 | return average; 55 | } 56 | 57 | 58 | -------------------------------------------------------------------------------- /demos/device_management_hip.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char *argv[]) 5 | { 6 | int count, device; 7 | 8 | hipGetDeviceCount(&count); 9 | hipGetDevice(&device); 10 | 11 | printf("Hello! 
I'm GPU %d out of %d GPUs in total.\n", device, count); 12 | 13 | hipSetDevice(count - 1); 14 | hipGetDevice(&device); 15 | printf("Now I'm GPU %d.\n", device); 16 | 17 | return 0; 18 | } 19 | -------------------------------------------------------------------------------- /demos/device_management_mpi_hip.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main(int argc, char *argv[]) 6 | { 7 | MPI_Init(&argc, &argv); 8 | 9 | int size, rank; 10 | MPI_Comm_size(MPI_COMM_WORLD, &size); 11 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 12 | 13 | // Create communicator per node 14 | MPI_Comm comm_node; 15 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_node); 16 | int size_node, rank_node; 17 | MPI_Comm_size(comm_node, &size_node); 18 | MPI_Comm_rank(comm_node, &rank_node); 19 | 20 | int namelen; 21 | char procname[MPI_MAX_PROCESSOR_NAME]; 22 | MPI_Get_processor_name(procname, &namelen); 23 | 24 | int count, device; 25 | hipGetDeviceCount(&count); 26 | hipGetDevice(&device); 27 | 28 | printf("I'm MPI rank %2d/%-2d (world) %2d/%-2d (node) on %s with GPU %2d/%-2d\n", 29 | rank, size, rank_node, size_node, procname, device, count); 30 | 31 | fflush(stdout); 32 | MPI_Barrier(MPI_COMM_WORLD); 33 | 34 | hipSetDevice(rank_node % count); 35 | hipGetDevice(&device); 36 | printf("Now MPI rank %2d/%-2d (world) %2d/%-2d (node) on %s with GPU %2d/%-2d\n", 37 | rank, size, rank_node, size_node, procname, device, count); 38 | 39 | MPI_Finalize(); 40 | 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /demos/device_properties_hip.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main(int argc, char *argv[]) 6 | { 7 | int count, device; 8 | 9 | hipGetDeviceCount(&count); 10 | hipGetDevice(&device); 11 | 12 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count); 13 | 14 | hipDeviceProp_t prop; 15 | hipGetDeviceProperties(&prop, device); 16 | 17 | // Note: name is empty string on LUMI, see https://github.com/ROCm/ROCm/issues/1625 18 | printf("Name: %s\n", prop.name); 19 | printf("Memory: %.2f GiB\n", prop.totalGlobalMem / pow(1024., 3)); 20 | printf("Wavefront / warp size: %d\n", prop.warpSize); 21 | 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /demos/fill.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // GPU kernel 5 | __global__ void fill_kernel(int n, double *x, double a) 6 | { 7 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 8 | 9 | if(tid < n) 10 | x[tid] = tid * a; 11 | } 12 | 13 | int main(void) 14 | { 15 | // set problem size 16 | const int n = 1e4; 17 | 18 | // allocate device memory 19 | double *d_x; 20 | hipMalloc(&d_x, sizeof(double) * n); 21 | 22 | // launch kernel 23 | const int blocksize = 256; 24 | const int gridsize = (n - 1 + blocksize) / blocksize; 25 | fill_kernel<<>>(n, d_x, 3.0); 26 | 27 | // copy data to the host and print 28 | double x[n]; 29 | hipMemcpy(x, d_x, sizeof(double) * n, hipMemcpyDeviceToHost); 30 | printf("%f %f %f %f ... 
%f %f\n", 31 | x[0], x[1], x[2], x[3], x[n-2], x[n-1]); 32 | 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /demos/hello.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void) 5 | { 6 | int count = 0; 7 | int device = 0; 8 | 9 | auto success = hipGetDeviceCount(&count); 10 | success = hipGetDevice(&device); 11 | 12 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /demos/warp-div.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #define synchronize \ 8 | HIP_CHECK(hipGetLastError()); \ 9 | HIP_CHECK(hipDeviceSynchronize()) 10 | 11 | #define HIP_CHECK(expression) \ 12 | { \ 13 | const hipError_t status = expression; \ 14 | if(status != hipSuccess){ \ 15 | std::cerr << "HIP error " \ 16 | << status << ": " \ 17 | << hipGetErrorString(status) \ 18 | << " at " << __FILE__ << ":" \ 19 | << __LINE__ << std::endl; \ 20 | } \ 21 | } 22 | 23 | 24 | #define starttime { auto start = std::chrono::high_resolution_clock::now(); 25 | 26 | #define endtime \ 27 | auto stop = std::chrono::high_resolution_clock::now(); \ 28 | auto duration = std::chrono::duration_cast(stop-start).count(); \ 29 | if (my_repeat_counter > 1) std::cout << duration; \ 30 | } 31 | 32 | #define repeat(X) for(int my_repeat_counter=1;my_repeat_counter <= (X); ++my_repeat_counter) 33 | 34 | __device__ double f_1(double x, double a, int Nz) 35 | { 36 | double R = x; 37 | 38 | #pragma unroll 8 39 | for(int i = 0; i 1) std::cout << Nz << ", "; 116 | 117 | starttime 118 | fill_kernel_noif<<>>(n, d_x, a, Nz); 119 | synchronize; 120 | endtime 121 | if (my_repeat_counter > 1) std::cout << ", "; 122 | 123 | starttime 124 | fill_kernel_nodiv<<>>(n, d_x, a, Nz); 125 | synchronize; 126 | endtime 127 | if (my_repeat_counter > 1) std::cout << ", "; 128 | 129 | starttime 130 | fill_kernel_div<<>>(n, d_x, a, Nz); 131 | synchronize; 132 | endtime 133 | if (my_repeat_counter > 1) std::cout << "\n"; 134 | 135 | Nz += 1; 136 | } 137 | 138 | // copy data to the host and print 139 | /* HIP_CHECK(hipMemcpy(x, d_x, sizeof(double) * n, hipMemcpyDeviceToHost)); */ 140 | /* printf("%f %f %f %f ... %f %f\n", */ 141 | /* x[0], x[1], x[2], x[3], x[n-2], x[n-1]); */ 142 | 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /docs/05-fortran.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Fortran and HIP 3 | subtitle: GPU programming with HIP 4 | author: CSC Training 5 | date: 2025-03 6 | lang: en 7 | --- 8 | 9 | # Fortran 10 | 11 | * No native GPU support in Fortran: 12 | - HIP functions are callable from C, using wrappers; compiled with hipcc 13 | - interoperability with Fortran via `iso_c_binding` 14 | - linking with Fortran or `hipcc` 15 | * Fortran + HIP: 16 | - needs wrappers and interfaces for all HIP calls 17 | * Hipfort: 18 | - Fortran Interface For GPU Kernel Libraries 19 | - HIP: HIP runtime, hipBLAS, hipSPARSE, hipFFT, hipRAND, hipSOLVER 20 | - ROCm: rocBLAS, rocSPARSE, rocFFT, rocRAND, rocSOLVER 21 | - memory management: `hipMalloc`, `hipMemcpy` 22 | 23 | # HIPFort for SAXPY (`Y=Y+a*X`): Fortran Code 24 | 25 |
26 | ```fortran 27 | program saxpy 28 | use iso_c_binding 29 | use hipfort 30 | use hipfort_check 31 | 32 | implicit none 33 | interface 34 | subroutine launch(dy,dx,a,N) bind(c) 35 | use iso_c_binding 36 | implicit none 37 | type(c_ptr),value :: dy,dx 38 | integer, value :: N 39 | real, value :: a 40 | end subroutine 41 | end interface 42 | 43 | type(c_ptr) :: dx = c_null_ptr 44 | type(c_ptr) :: dy = c_null_ptr 45 | integer, parameter :: N = 400000000 46 | integer(c_size_t), parameter :: bytes_per_element = 4 47 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element 48 | real, allocatable,target,dimension(:) :: x, y 49 | real, parameter :: a=2.0 50 | ``` 51 |
52 | 53 |
54 | ```fortran 55 | allocate(x(N), y(N)) 56 | 57 | x = 1.0; y = 2.0 58 | 59 | call hipCheck(hipMalloc(dx,Nbytes)) 60 | call hipCheck(hipMalloc(dy,Nbytes)) 61 | 62 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice)) 63 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice)) 64 | 65 | call launch(dy, dx, a, N) 66 | 67 | call hipCheck(hipDeviceSynchronize()) 68 | 69 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost)) 70 | 71 | write(*,*) "Max error: ", maxval(abs(y-4.0)) 72 | 73 | call hipCheck(hipFree(dx));call hipCheck(hipFree(dy)) 74 | 75 | deallocate(x);deallocate(y) 76 | 77 | end program saxpy 78 | ``` 79 |
80 |
81 | 82 | # HIPFort for SAXPY (`Y=Y+a*X`): HIP code 83 |
84 | ```cpp 85 | #include 86 | #include 87 | 88 | __global__ void saxpy(float *dy, float *dx, 89 | float a, int n) 90 | { 91 | int i = blockDim.x*blockIdx.x+threadIdx.x; 92 | if (i < n) { 93 | dy[i] = dy[i] + a*dx[i]; 94 | } 95 | } 96 | ``` 97 | 98 |
99 | 100 |
101 | ```cpp 102 | extern "C"{ 103 | void launch(float *dy, float *dx, 104 | float a, int N) 105 | { 106 | dim3 tBlock(256,1,1); 107 | dim3 grid(ceil((float)N/tBlock.x),1,1); 108 | 109 | saxpy<<<grid, tBlock>>>(dy, dx, a, N); 110 | } 111 | } 112 | ``` 113 |
114 | 115 | # Compilation 116 | 117 | **NVIDIA: Mahti** 118 | ``` 119 | gfortran -I$HIPFORT_HOME/include/hipfort/nvptx "-DHIPFORT_ARCH=\"nvptx\"" \ 120 | -L$HIPFORT_HOME/lib -lhipfort-nvptx -c .f90 121 | 122 | hipcc "--gpu-architecture=sm_80" --x cu -c .cpp 123 | 124 | hipcc -lgfortran "--gpu-architecture=sm_80" -I$HIPFORT_HOME/include/hipfort/nvptx \ 125 | -L$HIPFORT_HOME/lib/ -lhipfort-nvptx .o .o -o main 126 | ``` 127 | **AMD: LUMI** 128 | ``` 129 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \ 130 | -L$HIPFORT_HOME/lib -lhipfort-amdgcn -c .f90 131 | 132 | hipcc --offload-arch=gfx90a -c .cpp 133 | 134 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \ 135 | -L$HIPFORT_HOME/lib -lhipfort-amdgcn .o .o -o main 136 | ``` 137 | 138 | 139 | # Summary 140 | 141 | * No native GPU support in Fortran 142 | * HIP functions are callable from C, using `extern C` 143 | - `iso_c_binding` 144 | - GPU objects are of type `c_ptr` in Fortran 145 | * Hipfort provides Fortran interfaces for GPU libraries 146 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 2 | FACTORY=docker 3 | OPTIONS=run -it --rm -v "$(ROOT_DIR)":"$(ROOT_DIR)":Z -w "$(ROOT_DIR)" ghcr.io/csc-training/slidefactory:3.2.0-beta.1 4 | 5 | SRC=$(wildcard *.md) 6 | HTML=$(patsubst %.md,%.html,$(SRC)) 7 | PDF=$(patsubst %.md,%.pdf,$(SRC)) 8 | 9 | .PHONY: html pdf clean 10 | 11 | html: $(HTML) 12 | 13 | pdf: $(PDF) 14 | 15 | clean: 16 | -rm -f $(HTML) $(PDF) 17 | 18 | %.html: %.md 19 | $(FACTORY) $(OPTIONS) slides --format html $< 20 | 21 | %.pdf: %.md 22 | $(FACTORY) $(OPTIONS) slides --format pdf $< 23 | -------------------------------------------------------------------------------- /docs/img/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/01.png -------------------------------------------------------------------------------- /docs/img/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/04.png -------------------------------------------------------------------------------- /docs/img/AMD-GCN-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/AMD-GCN-3.png -------------------------------------------------------------------------------- /docs/img/BankConflicts.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/BankConflicts.jpeg -------------------------------------------------------------------------------- /docs/img/CU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CU.png -------------------------------------------------------------------------------- /docs/img/CUgray.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CUgray.png -------------------------------------------------------------------------------- /docs/img/NoBankConflicts.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/NoBankConflicts.jpeg -------------------------------------------------------------------------------- /docs/img/ThreadExecution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution.jpg -------------------------------------------------------------------------------- /docs/img/ThreadExecution_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution_new.jpg -------------------------------------------------------------------------------- /docs/img/a100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100.png -------------------------------------------------------------------------------- /docs/img/a100_fp32_core.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_fp32_core.png -------------------------------------------------------------------------------- /docs/img/a100_sm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_sm.png -------------------------------------------------------------------------------- /docs/img/a100_smsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_smsp.png -------------------------------------------------------------------------------- /docs/img/amd_computeunit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_computeunit.png -------------------------------------------------------------------------------- /docs/img/amd_instinct_mi250x_oam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_instinct_mi250x_oam.png -------------------------------------------------------------------------------- /docs/img/amd_m200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_m200.png -------------------------------------------------------------------------------- /docs/img/amd_mi200.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.jpg -------------------------------------------------------------------------------- /docs/img/amd_mi200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.png -------------------------------------------------------------------------------- /docs/img/arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/arrow.png -------------------------------------------------------------------------------- /docs/img/block_sm_cu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/block_sm_cu.png -------------------------------------------------------------------------------- /docs/img/coalesced_access_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_1.png -------------------------------------------------------------------------------- /docs/img/coalesced_access_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_3.png -------------------------------------------------------------------------------- /docs/img/coalesced_access_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_4.png -------------------------------------------------------------------------------- /docs/img/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/comparison.png -------------------------------------------------------------------------------- /docs/img/copy_d2h.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_d2h.png -------------------------------------------------------------------------------- /docs/img/copy_h2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_h2d.png -------------------------------------------------------------------------------- /docs/img/cpu_waits_on_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cpu_waits_on_gpu.png -------------------------------------------------------------------------------- /docs/img/cu_sm_eu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cu_sm_eu.png -------------------------------------------------------------------------------- /docs/img/cublas_cuda_hip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cublas_cuda_hip.png -------------------------------------------------------------------------------- /docs/img/do_this_computation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/do_this_computation.png -------------------------------------------------------------------------------- /docs/img/execution-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/execution-model.png -------------------------------------------------------------------------------- /docs/img/gpu-bws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-bws.png -------------------------------------------------------------------------------- /docs/img/gpu-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-cluster.png -------------------------------------------------------------------------------- /docs/img/gpuConnect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpuConnect.png -------------------------------------------------------------------------------- /docs/img/gpu_as_a_wide_vector_unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_a_wide_vector_unit.png -------------------------------------------------------------------------------- /docs/img/gpu_as_cus_sms_eus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_cus_sms_eus.png -------------------------------------------------------------------------------- /docs/img/gpu_as_vector_units.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units.png -------------------------------------------------------------------------------- /docs/img/gpu_as_vector_units_instructions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units_instructions.png -------------------------------------------------------------------------------- 
/docs/img/gpu_is_a_separate_processor_with_own_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_is_a_separate_processor_with_own_memory.png -------------------------------------------------------------------------------- /docs/img/gpufort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort.png -------------------------------------------------------------------------------- /docs/img/gpufort1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort1.png -------------------------------------------------------------------------------- /docs/img/gpufort2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort2.png -------------------------------------------------------------------------------- /docs/img/grid-threads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid-threads.png -------------------------------------------------------------------------------- /docs/img/grid_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid_gpu.png -------------------------------------------------------------------------------- /docs/img/hipblas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipblas.png -------------------------------------------------------------------------------- /docs/img/hipfort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipfort.png -------------------------------------------------------------------------------- /docs/img/kernel_cuda_hip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/kernel_cuda_hip.png -------------------------------------------------------------------------------- /docs/img/lumi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.jpg -------------------------------------------------------------------------------- /docs/img/lumi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.png -------------------------------------------------------------------------------- /docs/img/many_blocks_to_one_sm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/many_blocks_to_one_sm.png -------------------------------------------------------------------------------- /docs/img/memlayout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memlayout.png -------------------------------------------------------------------------------- /docs/img/memory-hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memory-hierarchy.png -------------------------------------------------------------------------------- /docs/img/memsch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memsch.png -------------------------------------------------------------------------------- /docs/img/mi100-architecture.info: -------------------------------------------------------------------------------- 1 | Source: 2 | Introducing AMD CDNA Architecture, 3 | https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf 4 | 5 | Caption: 6 | Block diagram of the AMD Instinct MI100 accelerator, the first GPUs 7 | powered by the AMD CDNA architecture. 8 | -------------------------------------------------------------------------------- /docs/img/mi100-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100-architecture.png -------------------------------------------------------------------------------- /docs/img/mi100_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100_arch.png -------------------------------------------------------------------------------- /docs/img/mi250x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x.png -------------------------------------------------------------------------------- /docs/img/mi250x_cu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu.png -------------------------------------------------------------------------------- /docs/img/mi250x_cu_simd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu_simd.png -------------------------------------------------------------------------------- /docs/img/microprocessor-trend-data.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/microprocessor-trend-data.png -------------------------------------------------------------------------------- /docs/img/model_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/model_gpu.png -------------------------------------------------------------------------------- /docs/img/new_hipfort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/new_hipfort.png -------------------------------------------------------------------------------- /docs/img/no_block_to_many_sm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/no_block_to_many_sm.png -------------------------------------------------------------------------------- /docs/img/not_gpu_as_a_wide_vector_unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/not_gpu_as_a_wide_vector_unit.png -------------------------------------------------------------------------------- /docs/img/oned_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_block.png -------------------------------------------------------------------------------- /docs/img/oned_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_grid.png -------------------------------------------------------------------------------- /docs/img/parallel_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parallel_regions.png -------------------------------------------------------------------------------- /docs/img/parflow_single_node.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parflow_single_node.png -------------------------------------------------------------------------------- /docs/img/perfetto.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/perfetto.png -------------------------------------------------------------------------------- /docs/img/runtimes_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/runtimes_annotated.png -------------------------------------------------------------------------------- /docs/img/scalar_operation.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/scalar_operation.png -------------------------------------------------------------------------------- /docs/img/single_proc_mpi_gpu2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_mpi_gpu2.png -------------------------------------------------------------------------------- /docs/img/single_proc_multi_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_multi_gpu.png -------------------------------------------------------------------------------- /docs/img/single_proc_thread_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_thread_gpu.png -------------------------------------------------------------------------------- /docs/img/software_hardware_mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/software_hardware_mapping.png -------------------------------------------------------------------------------- /docs/img/streams-example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-1.png -------------------------------------------------------------------------------- /docs/img/streams-example-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-2.png -------------------------------------------------------------------------------- /docs/img/streams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams.png -------------------------------------------------------------------------------- /docs/img/streams1_explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams1_explain.png -------------------------------------------------------------------------------- /docs/img/streams2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2.png -------------------------------------------------------------------------------- /docs/img/streams2_explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2_explain.png 
-------------------------------------------------------------------------------- /docs/img/thread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread.png -------------------------------------------------------------------------------- /docs/img/thread_lane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread_lane.png -------------------------------------------------------------------------------- /docs/img/threed_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/threed_block.png -------------------------------------------------------------------------------- /docs/img/top500-perf-dev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-perf-dev.png -------------------------------------------------------------------------------- /docs/img/top500-performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-performance.png -------------------------------------------------------------------------------- /docs/img/transpose_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/transpose_img.png -------------------------------------------------------------------------------- /docs/img/twod_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_block.png -------------------------------------------------------------------------------- /docs/img/twod_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_grid.png -------------------------------------------------------------------------------- /docs/img/vector_operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_operation.png -------------------------------------------------------------------------------- /docs/img/vector_unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_unit.png -------------------------------------------------------------------------------- /docs/img/virtual_memory_addressing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/virtual_memory_addressing.png 
-------------------------------------------------------------------------------- /docs/img/warp_wavefron_smsp_simd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/warp_wavefron_smsp_simd.png -------------------------------------------------------------------------------- /docs/index: -------------------------------------------------------------------------------- 1 | # List of PDFs to jam together 2 | # an index file for jam-it.sh (https://github.com/mlouhivu/jam-it) 3 | 4 | @title-course.pdf 5 | 6 | @title-intro.pdf 7 | 01-introduction.pdf 8 | 9 | @title-kernels.pdf 10 | 02-kernels.pdf 11 | 12 | @title-streams.pdf 13 | 03-streams.pdf 14 | 15 | @title-memory.pdf 16 | 04-memory.pdf 17 | 18 | @title-fortran.pdf 19 | 05-fortran.pdf 20 | 21 | @title-optimisation.pdf 22 | 06-optimisation.pdf 23 | 24 | @title-multi-gpu.pdf 25 | 07-multi-gpu.pdf 26 | -------------------------------------------------------------------------------- /first_steps.md: -------------------------------------------------------------------------------- 1 | ## Accessing LUMI 2 | 3 | Are you able to `ssh` to LUMI? If not, have you followed the instructions [here](https://docs.lumi-supercomputer.eu/firststeps/)? 4 | 5 | If you haven't added the ssh-key correctly or cannot otherwise `ssh` to LUMI, you can use the [web interface](https://www.lumi.csc.fi/public/). 6 | 7 | See the [documentation](https://docs.lumi-supercomputer.eu/firststeps/loggingin-webui/) for more help. 8 | 9 | ## Getting the course material 10 | 11 | You can clone this git repository with `git clone https://github.com/csc-training/hip-programming.git`. 12 | 13 | This way you get local access to the lectures, as well as the exercises (which you need to run on LUMI). 14 | 15 | ## Using slurm 16 | 17 | Supercomputers like LUMI are shared resources, meaning multiple users are using them at the same time. 18 | To run something on LUMI, you need to use SLURM to submit a job. 19 | 20 | Read the [LUMI documentation](https://docs.lumi-supercomputer.eu/runjobs/) on running jobs to find out more. 21 | 22 | ## Motivation for the course 23 | 24 | Why do we teach GPU programming? Why should you learn to program GPUs? 25 | 26 | Because most of the Top 500 supercomputers use (and derive most of their compute capability from) GPUs 27 | --> if you use any of these supercomputers, you cannot avoid using GPUs. 28 | 29 | Why are most of the Top 500 supercomputers using GPUs? 30 | 31 | 1. Because GPUs are designed and optimized to solve problems commonly encountered in HPC and ML/AI: floating point operations, matrix multiplications. 32 | 2. 
Because of power limitations: performance per Watt is much greater for GPUs than CPUs: https://top500.org/statistics/efficiency-power-cores/ 33 | -------------------------------------------------------------------------------- /hipfort/hiprand/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=lumi 3 | endif 4 | 5 | ifeq ($(COMP),lumi) 6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT 7 | LIB_FLAGS = 8 | CXX = CC -xhip 9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS) 10 | endif 11 | 12 | OBJS=pi.o 13 | 14 | all: pi 15 | 16 | pi: $(OBJS) 17 | $(FC) -o $@ $(OBJS) $(FCFLAGS) 18 | 19 | %.o: %.F90 20 | $(FC) $(FCFLAGS) -c $< -o $@ 21 | 22 | %.mod: %.F90 23 | $(FC) $(FCFLAGS) -c $< 24 | clean: 25 | rm -f pi *.o *.mod 26 | -------------------------------------------------------------------------------- /hipfort/hiprand/img/pi_MC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/hipfort/hiprand/img/pi_MC.png -------------------------------------------------------------------------------- /hipfort/hiprand/pi.F90: -------------------------------------------------------------------------------- 1 | program rand_test 2 | use iso_c_binding 3 | use iso_fortran_env, only : INT64 4 | ! TODO Add here the necessary modules for the GPU operation 5 | 6 | 7 | !OPTIONAL 8 | !TODO write an interface to the C wrapper which calls the reduction kernel. 9 | 10 | implicit none 11 | 12 | integer(kind=INT64) :: nsamples 13 | character(len=85) :: arg 14 | real :: pi1, pi2 15 | integer(c_size_t):: Nbytes 16 | 17 | if (command_argument_count() /= 1) then 18 | STOP 'Usage pi N where N is the number of samples' 19 | end if 20 | 21 | call get_command_argument(1, arg) 22 | read(arg, *) nsamples 23 | 24 | pi1 = cpu_pi(nsamples) 25 | write(*,*) 'Pi calculated with CPU', pi1 26 | pi2 = gpu_pi(nsamples) 27 | write(*,*) 'Pi calculated with GPU', pi2 28 | 29 | contains 30 | 31 | real function cpu_pi(n) 32 | implicit none 33 | integer(kind=INT64) :: n 34 | integer :: i, inside 35 | 36 | real, allocatable:: x(:),y(:) 37 | 38 | 39 | allocate(x(1:n)) 40 | allocate(y(1:n)) 41 | 42 | call random_number(x) 43 | call random_number(y) 44 | 45 | inside = 0 46 | do i = 1, n 47 | if (x(i)**2 + y(i)**2 < 1.0) then 48 | inside = inside + 1 49 | end if 50 | end do 51 | 52 | cpu_pi = 4.0 * real(inside) / real(n) 53 | 54 | end function cpu_pi 55 | 56 | 57 | 58 | real function gpu_pi(n) 59 | use hipfort 60 | use hipfort_check 61 | use hipfort_hiprand 62 | implicit none 63 | integer(kind=INT64) :: n 64 | integer :: i, inside 65 | type(c_ptr) :: gen = c_null_ptr 66 | type(c_ptr) :: x_d,y_d 67 | real(c_float), allocatable,target :: x(:),y(:) 68 | integer(c_size_t) :: istat 69 | 70 | allocate(x(1:n)) 71 | allocate(y(1:n)) 72 | Nbytes=sizeof(x) 73 | 74 | inside = 0 75 | ! Initialization for (optional) task. Instead of this one can as well initialize inside_d using a hip kernel 76 | ! Sbytes = sizeof(inside) 77 | ! call hipCheck(hipMalloc(inside_d,Sbytes)) 78 | ! call hipCheck(hipMemcpy( inside_d,c_loc(inside), Sbytes, hipMemcpyHostToDevice)) 79 | 80 | !Allocate memory for the gpu arrays 81 | 82 | ! TODO Initialize the gpu random number generator 83 | 84 | ! TODO Fill the arrays x and y with random uniform distributed numbers 85 | 86 | ! 
TODO copy the random numbers from GPU to CPU 87 | 88 | ! TODO Bonus exercise: replace the below reduction loop done on the CPU with a GPU kernel 89 | ! The kernel is in the hip_kernels.cpp file. 90 | ! You need to implement an interface to call the C function simialrly to the saxpy example 91 | ! Note that in this case there is no need to transfer the x and y arrays to CPU, 92 | ! You only need to copy the final result, inside_d 93 | 94 | do i = 1, n 95 | if (x(i)**2 + y(i)**2 < 1.0) then 96 | inside = inside + 1 97 | end if 98 | end do 99 | 100 | gpu_pi = 4.0 * real(inside) / real(n) 101 | 102 | deallocate(x, y) 103 | end function gpu_pi 104 | end program 105 | -------------------------------------------------------------------------------- /hipfort/hiprand/solution/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=lumi 3 | endif 4 | 5 | ifeq ($(COMP),lumi) 6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT 7 | LIB_FLAGS = -lhiprand 8 | CXX = CC -xhip 9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS) 10 | endif 11 | 12 | OBJS=pi.o 13 | 14 | all: pi 15 | 16 | pi: $(OBJS) 17 | $(FC) -o $@ $(OBJS) $(FCFLAGS) 18 | 19 | %.o: %.F90 20 | $(FC) $(FCFLAGS) -c $< -o $@ 21 | 22 | %.mod: %.F90 23 | $(FC) $(FCFLAGS) -c $< 24 | clean: 25 | rm -f pi *.o *.mod 26 | -------------------------------------------------------------------------------- /hipfort/hiprand/solution/pi.F90: -------------------------------------------------------------------------------- 1 | program rand_test 2 | use iso_c_binding 3 | use iso_fortran_env, only : INT64 4 | use hipfort 5 | use hipfort_check 6 | use hipfort_hiprand 7 | 8 | implicit none 9 | 10 | integer(kind=INT64) :: nsamples 11 | character(len=85) :: arg 12 | real :: pi1, pi2 13 | integer(c_size_t):: Nbytes 14 | 15 | if (command_argument_count() /= 1) then 16 | STOP 'Usage pi N where N is the number of samples' 17 | end if 18 | 19 | call get_command_argument(1, arg) 20 | read(arg, *) nsamples 21 | 22 | pi1 = cpu_pi(nsamples) 23 | write(*,*) 'Pi calculated with CPU', pi1 24 | pi2 = gpu_pi(nsamples) 25 | write(*,*) 'Pi calculated with GPU', pi2 26 | 27 | contains 28 | 29 | real function cpu_pi(n) 30 | implicit none 31 | integer(kind=INT64) :: n 32 | integer :: i, inside 33 | 34 | real, allocatable:: x(:),y(:) 35 | 36 | 37 | allocate(x(1:n)) 38 | allocate(y(1:n)) 39 | 40 | call random_number(x) 41 | call random_number(y) 42 | 43 | inside = 0 44 | do i = 1, n 45 | if (x(i)**2 + y(i)**2 < 1.0) then 46 | inside = inside + 1 47 | end if 48 | end do 49 | 50 | cpu_pi = 4.0 * real(inside) / real(n) 51 | 52 | end function cpu_pi 53 | 54 | 55 | 56 | real function gpu_pi(n) 57 | use hipfort 58 | use hipfort_check 59 | use hipfort_hiprand 60 | implicit none 61 | integer(kind=INT64) :: n 62 | integer :: i, inside 63 | type(c_ptr) :: gen = c_null_ptr 64 | type(c_ptr) :: x_d,y_d 65 | real(c_float), allocatable,target :: x(:),y(:) 66 | integer(c_size_t) :: istat 67 | 68 | allocate(x(1:n)) 69 | allocate(y(1:n)) 70 | Nbytes=sizeof(x) 71 | 72 | call hipCheck(hipMalloc(x_d,Nbytes)) 73 | call hipCheck(hipMalloc(y_d,Nbytes)) 74 | 75 | inside = 0 76 | 77 | 78 | istat= hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT) 79 | 80 | istat= hiprandGenerateUniform(gen, x_d, n) 81 | istat= hiprandGenerateUniform(gen, y_d, n) 82 | 83 | call hipCheck(hipMemcpy(c_loc(x), x_d, Nbytes, hipMemcpyDeviceToHost)) 84 | call 
hipCheck(hipMemcpy(c_loc(y), y_d, Nbytes, hipMemcpyDeviceToHost)) 85 | 86 | do i = 1, n 87 | if (x(i)**2 + y(i)**2 < 1.0) then 88 | inside = inside + 1 89 | end if 90 | end do 91 | 92 | gpu_pi = 4.0 * real(inside) / real(n) 93 | 94 | deallocate(x, y) 95 | end function gpu_pi 96 | end program 97 | -------------------------------------------------------------------------------- /hipfort/hiprand/solution_bonus/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(COMP),) 2 | COMP=lumi 3 | endif 4 | 5 | ifeq ($(COMP),lumi) 6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT 7 | LIB_FLAGS = -lhiprand 8 | CXX = CC -xhip 9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS) 10 | endif 11 | 12 | OBJS=pi.o hip_kernels.o 13 | 14 | all: pi 15 | 16 | pi: $(OBJS) 17 | $(FC) -o $@ $(OBJS) $(FCFLAGS) 18 | 19 | %.o: %.F90 20 | $(FC) $(FCFLAGS) -c $< -o $@ 21 | 22 | %.o: %.cpp 23 | $(CXX) -c -o $@ $< 24 | 25 | %.mod: %.F90 26 | $(FC) $(FCFLAGS) -c $< 27 | clean: 28 | rm -f pi *.o *.mod 29 | -------------------------------------------------------------------------------- /hipfort/hiprand/solution_bonus/hip_kernels.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void countInsideKernel(float *x, float *y, int *inside, int64_t n) 5 | { 6 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 7 | 8 | if (idx < n) { 9 | if (x[idx] * x[idx] + y[idx] * y[idx] < 1.0f) { 10 | // Atomic increment to avoid race condition 11 | atomicAdd(inside, 1); 12 | } 13 | } 14 | } 15 | 16 | extern "C" 17 | { 18 | void launch(float *x, float *y, int *inside_d, int64_t N) 19 | { 20 | 21 | dim3 tBlock(256,1,1); 22 | dim3 grid(ceil((float)N/tBlock.x),1,1); 23 | 24 | countInsideKernel<<>>( x, y, inside_d, N); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /hipfort/hiprand/solution_bonus/pi.F90: -------------------------------------------------------------------------------- 1 | program rand_test 2 | use iso_c_binding 3 | use iso_fortran_env, only : INT64 4 | use hipfort 5 | use hipfort_check 6 | use hipfort_hiprand 7 | 8 | implicit none 9 | 10 | interface 11 | subroutine launch(x_d, y_d, inside_d, N) bind(c) 12 | use iso_c_binding 13 | implicit none 14 | type(c_ptr), value :: x_d, y_d, inside_d 15 | integer(c_int64_t), value :: N ! 
Ensure use of correct C type for INT64 16 | end subroutine 17 | end interface 18 | 19 | integer(c_int64_t) :: nsamples 20 | character(len=85) :: arg 21 | real :: pi1, pi2 22 | integer(c_size_t) :: Nbytes, Sbytes 23 | 24 | if (command_argument_count() /= 1) then 25 | STOP 'Usage: pi N where N is the number of samples' 26 | end if 27 | 28 | call get_command_argument(1, arg) 29 | read(arg, *) nsamples 30 | 31 | pi1 = cpu_pi(nsamples) 32 | write(*,*) 'Pi calculated with CPU', pi1 33 | pi2 = gpu_pi(nsamples) 34 | write(*,*) 'Pi calculated with GPU', pi2 35 | 36 | contains 37 | 38 | real function cpu_pi(n) 39 | implicit none 40 | integer(c_int64_t) :: n 41 | integer :: i, inside 42 | 43 | real, allocatable :: x(:), y(:) 44 | 45 | allocate(x(1:n)) 46 | allocate(y(1:n)) 47 | 48 | call random_number(x) 49 | call random_number(y) 50 | 51 | inside = 0 52 | do i = 1, n 53 | if (x(i)**2 + y(i)**2 < 1.0) then 54 | inside = inside + 1 55 | end if 56 | end do 57 | 58 | cpu_pi = 4.0 * real(inside) / real(n) 59 | 60 | end function cpu_pi 61 | 62 | real function gpu_pi(n) 63 | use hipfort 64 | use hipfort_check 65 | use hipfort_hiprand 66 | implicit none 67 | integer(c_int64_t) :: n 68 | integer :: inside 69 | type(c_ptr) :: gen = c_null_ptr 70 | type(c_ptr) :: x_d, y_d, inside_d 71 | real(c_float), allocatable, target :: x(:), y(:) 72 | integer(c_size_t) :: istat 73 | 74 | allocate(x(1:n)) 75 | allocate(y(1:n)) 76 | Nbytes = sizeof(x) 77 | 78 | call hipCheck(hipMalloc(x_d, Nbytes)) 79 | call hipCheck(hipMalloc(y_d, Nbytes)) 80 | 81 | istat = hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT) 82 | 83 | istat = hiprandGenerateUniform(gen, x_d, n) 84 | istat = hiprandGenerateUniform(gen, y_d, n) 85 | 86 | inside = 0 87 | Sbytes = sizeof(inside) 88 | call hipCheck(hipMalloc(inside_d, Sbytes)) 89 | call hipCheck(hipMemcpy(inside_d, c_loc(inside), Sbytes, hipMemcpyHostToDevice)) 90 | 91 | call launch(x_d, y_d, inside_d, n) 92 | 93 | call hipCheck(hipMemcpy(c_loc(inside), inside_d, Sbytes, hipMemcpyDeviceToHost)) 94 | 95 | gpu_pi = 4.0 * real(inside) / real(n) 96 | 97 | deallocate(x, y) 98 | end function gpu_pi 99 | 100 | end program rand_test 101 | -------------------------------------------------------------------------------- /hipfort/saxpy/cuda/main.cuf: -------------------------------------------------------------------------------- 1 | module mathOps 2 | contains 3 | attributes(global) subroutine saxpy(x, y, a) 4 | implicit none 5 | real :: x(:), y(:) 6 | real, value :: a 7 | integer :: i, n 8 | n = size(x) 9 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x 10 | if (i <= n) y(i) = y(i) + a*x(i) 11 | end subroutine saxpy 12 | end module mathOps 13 | 14 | program testSaxpy 15 | use mathOps 16 | use cudafor 17 | implicit none 18 | integer, parameter :: N = 40000 19 | real :: x(N), y(N), a 20 | real, device :: x_d(N), y_d(N) 21 | type(dim3) :: grid, tBlock 22 | 23 | tBlock = dim3(256,1,1) 24 | grid = dim3(ceiling(real(N)/tBlock%x),1,1) 25 | 26 | x = 1.0; y = 2.0; a = 2.0 27 | x_d = x 28 | y_d = y 29 | call saxpy<<>>(x_d, y_d, a) 30 | y = y_d 31 | write(*,*) 'Max error: ', maxval(abs(y-4.0)) 32 | end program testSaxpy 33 | -------------------------------------------------------------------------------- /hipfort/saxpy/hip/README.md: -------------------------------------------------------------------------------- 1 | # SAXPY using FORTRAN & HIPFORT 2 | 3 | Inspect `saxpy` code in the present folder. The Fortran code folows the same logic as the HIP C code. 4 | First the data is created on the cpu. 
Then memory is allocated on the GPU and the data is transferred from the CPU to the GPU. When the transfer is complete, a kernel is executed to perform the work. Finally, the results of the computation are copied back to the CPU and processed further. 5 | 6 | **Note:** Fortran cannot compile HIP C code. The GPU code is located in a separate file, [hipsaxpy.cpp](hipsaxpy.cpp). The HIP kernel is launched via a C function which acts as a wrapper. Fortran calls this C wrapper using the `iso_c_binding` module. 7 | 8 | In this code, all calls to the HIP API are done via HIPFORT. The exercise is to inspect and familiarize yourself with how the memory management (allocations and transfers) is done and how Fortran calls C functions using the `iso_c_binding` module. 9 | If you have previous experience with CUDA Fortran, you can compare it to the equivalent code in the [cuda](../cuda) folder. 10 | 11 | In addition to memory management, HIPFORT also provides bindings for the mathematical libraries running on GPUs. You can find examples of how various `hipxxx` & `rocxxx` libraries are called in Fortran programs in the [HIPFORT repository](https://github.com/ROCm/hipfort/tree/develop/test). 12 | 13 | The instructions for compilation are found in the [exercise-instructions page](../../../exercise-instructions.md#hipfort-on-lumi). 14 | -------------------------------------------------------------------------------- /hipfort/saxpy/hip/hipsaxpy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void saxpy(float *y, float *x, float a, int n) 5 | { 6 | size_t i = blockDim.x * blockIdx.x + threadIdx.x; 7 | if (i < n) y[i] = y[i] + a*x[i]; 8 | } 9 | 10 | 11 | extern "C" 12 | { 13 | void launch(float *dout, float *da, float db, int N) 14 | { 15 | 16 | dim3 tBlock(256,1,1); 17 | dim3 grid(ceil((float)N/tBlock.x),1,1); 18 | 19 | hipLaunchKernelGGL((saxpy), grid, tBlock, 0, 0, dout, da, db, N); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /hipfort/saxpy/hip/main.f03: -------------------------------------------------------------------------------- 1 | program testSaxpy 2 | use iso_c_binding 3 | use hipfort 4 | use hipfort_check 5 | 6 | implicit none 7 | interface 8 | subroutine launch(y,x,b,N) bind(c) 9 | use iso_c_binding 10 | implicit none 11 | type(c_ptr),value :: y,x 12 | integer, value :: N 13 | real, value :: b 14 | end subroutine 15 | end interface 16 | 17 | type(c_ptr) :: dx = c_null_ptr 18 | type(c_ptr) :: dy = c_null_ptr 19 | integer, parameter :: N = 40000 20 | integer, parameter :: bytes_per_element = 4 21 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element 22 | real, allocatable,target,dimension(:) :: x, y 23 | 24 | 25 | real, parameter :: a=2.0 26 | real :: x_d(N), y_d(N) 27 | 28 | call hipCheck(hipMalloc(dx,Nbytes)) 29 | call hipCheck(hipMalloc(dy,Nbytes)) 30 | 31 | allocate(x(N)) 32 | allocate(y(N)) 33 | 34 | x = 1.0;y = 2.0 35 | 36 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice)) 37 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice)) 38 | 39 | call launch(dy, dx, a, N) 40 | 41 | call hipCheck(hipDeviceSynchronize()) 42 | 43 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost)) 44 | 45 | write(*,*) 'Max error: ', maxval(abs(y-4.0)) 46 | 47 | call hipCheck(hipFree(dx)) 48 | call hipCheck(hipFree(dy)) 49 | 50 | deallocate(x) 51 | deallocate(y) 52 | 53 | end program testSaxpy 54 |
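! Build sketch (an assumption, not the official instructions; see the exercise-instructions
! page referenced in the README above): on LUMI one could compile the C wrapper with, e.g.,
! `CC -xhip -c hipsaxpy.cpp` and then link the Fortran program against hipfort with, e.g.,
! `ftn -I$HIPFORT_HOME/include/hipfort/amdgcn -L$HIPFORT_HOME/lib -lhipfort-amdgcn main.f03 hipsaxpy.o`,
! where HIPFORT_HOME points to a hipfort installation, as in the hiprand solution Makefile earlier in this repository.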
-------------------------------------------------------------------------------- /kernels/01-hello-world/README.md: -------------------------------------------------------------------------------- 1 | # Hello world with HIP 2 | 3 | Compile and run a simple HIP test program provided as `hello.cpp`. 4 | 5 | Please follow the system-specific instructions provided in the 6 | [exercise instructions](../../exercise-instructions.md). 7 | -------------------------------------------------------------------------------- /kernels/01-hello-world/hello.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void) 5 | { 6 | int count = 0; 7 | auto result = hipGetDeviceCount(&count); 8 | 9 | int device = 0; 10 | result = hipGetDevice(&device); 11 | 12 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /kernels/02-error-checking/README.md: -------------------------------------------------------------------------------- 1 | # Error checking with HIP 2 | 3 | Your task is to find a bug in the program, by implementing a HIP API error checking function. 4 | It's a good practice to wrap the API calls with the error checker to find any issues early. 5 | -------------------------------------------------------------------------------- /kernels/02-error-checking/error-checking.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__) 5 | static inline void hip_errchk(hipError_t result, const char *file, int line) { 6 | static_assert(false, "TODO: remove me and implement the error checking. " 7 | "(Hint: check the slides)"); 8 | } 9 | 10 | int main() { 11 | // There's a bug in this program, find out what it is by implementing the 12 | // function above, and correct it 13 | int count = 0; 14 | HIP_ERRCHK(hipGetDeviceCount(&count)); 15 | HIP_ERRCHK(hipSetDevice(count)); 16 | 17 | int device = 0; 18 | HIP_ERRCHK(hipGetDevice(&device)); 19 | 20 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count); 21 | 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /kernels/02-error-checking/solution/error-checking.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* HIP error handling macro */ 5 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__) 6 | static inline void hip_errchk(hipError_t result, const char *file, int line) { 7 | if (result != hipSuccess) { 8 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file, 9 | line); 10 | exit(EXIT_FAILURE); 11 | } 12 | } 13 | 14 | int main() { 15 | int count = 0; 16 | HIP_ERRCHK(hipGetDeviceCount(&count)); 17 | // When setting the device, the argument must be 0 <= arg < #devices 18 | // See 19 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___device.html#ga43c1e7f15925eeb762195ccb5e063eae 20 | // for the API 21 | HIP_ERRCHK(hipSetDevice(count - 1)); 22 | 23 | int device = 0; 24 | HIP_ERRCHK(hipGetDevice(&device)); 25 | 26 | printf("Hello! 
I'm GPU %d out of %d GPUs in total.\n", device, count); 27 | 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /kernels/03-kernel-saxpy/README.md: -------------------------------------------------------------------------------- 1 | # Kernel: saxpy 2 | 3 | Write a device kernel that calculates the single precision BLAS operation 4 | **saxpy**, i.e. `y = a * x + y`. 5 | 6 | - Initialise the vectors `x` and `y` with some values on the CPU 7 | - Perform the computation on the host to generate reference values 8 | - Allocate memory on the device for `x` and `y` 9 | - Copy the host `x` to device `x`, and host `y` to device `y` 10 | - Perform the computation on the device 11 | - Copy the device `y` back to the host `y` 12 | - Confirm the correctness: Is the host computed `y` equal to the device computed `y`? 13 | 14 | You may start from a skeleton code provided in [saxpy.cpp](saxpy.cpp). 15 | -------------------------------------------------------------------------------- /kernels/03-kernel-saxpy/saxpy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__) 7 | static inline void hip_errchk(hipError_t result, const char *file, int line) { 8 | if (result != hipSuccess) { 9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file, 10 | line); 11 | exit(EXIT_FAILURE); 12 | } 13 | } 14 | 15 | /* 16 | TODO: add a device kernel that calculates y = a * x + y for vectors x, y and 17 | constant a 18 | 19 | Hints: 20 | 21 | What attribute(s) do you need to add on a kernel declaration? 22 | - __device__? 23 | - __global__? 24 | - __shared__? 25 | - no attribute(s) needed? 26 | 27 | What is the return type of a kernel? 28 | - int? 29 | - float? 30 | - void? 31 | - depends on the kernel? 32 | 33 | What data do you need in the kernel to compute y = a * x + y, for vectors x, y, 34 | and constant a? 35 | 36 | What built-in variables can you use to calculate the (global) index for a 37 | thread? 38 | - Is threadIdx enough or do you need blockIdx, blockDim, gridDim? 39 | - Is the problem one or multi-dimensional? 40 | - Remember the grid, block, thread hierarchy and the launch parameters 41 | */ 42 | 43 | int main() { 44 | // Use HIP_ERRCHK to help you find any errors you make with the API calls 45 | 46 | // Read the HIP Runtime API documentation to help you with the API calls: 47 | // Ctrl-click this to open it in a browser: 48 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___memory.html 49 | 50 | static constexpr size_t n = 1000000; 51 | static constexpr size_t num_bytes = sizeof(float) * n; 52 | static constexpr float a = 3.4f; 53 | 54 | std::vector x(n); 55 | std::vector y(n); 56 | std::vector y_ref(n); 57 | 58 | // Initialise data and calculate reference values on CPU 59 | for (size_t i = 0; i < n; i++) { 60 | x[i] = sin(i) * 2.3; 61 | y[i] = cos(i) * 1.1; 62 | y_ref[i] = a * x[i] + y[i]; 63 | } 64 | 65 | // TODO: Allocate + copy initial values 66 | // - hipMalloc, hipMemcpy 67 | 68 | // TODO: Define grid dimensions + launch the device kernel 69 | // int/dim3 threads = ... 70 | // int/dim3 blocks = ... 71 | // kernelName<<>>(arguments); 72 | 73 | // TODO: Copy results back to CPU 74 | // - hipMemcpy 75 | 76 | // TODO: Free device memory 77 | // - hipFree 78 | 79 | // Check the result of the GPU computation 80 | printf("reference: %f %f %f %f ... 
%f %f\n", y_ref[0], y_ref[1], y_ref[2], 81 | y_ref[3], y_ref[n - 2], y_ref[n - 1]); 82 | printf(" result: %f %f %f %f ... %f %f\n", y[0], y[1], y[2], y[3], 83 | y[n - 2], y[n - 1]); 84 | 85 | float error = 0.0; 86 | static constexpr float tolerance = 1e-6f; 87 | for (size_t i = 0; i < n; i++) { 88 | const auto diff = abs(y_ref[i] - y[i]); 89 | if (diff > tolerance) 90 | error += diff; 91 | } 92 | printf("total error: %f\n", error); 93 | printf(" reference: %f at (42)\n", y_ref[42]); 94 | printf(" result: %f at (42)\n", y[42]); 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /kernels/04-kernel-copy2d/README.md: -------------------------------------------------------------------------------- 1 | # Kernel: copy2d 2 | 3 | Write a device kernel that performs the double precision BLAS operation 4 | **dcopy**, i.e. `y = x` using GPU threads in a 2D grid. 5 | 6 | - Assume that the vectors `x` and `y` are used to store a 400x600 matrix (in row-major format) 7 | - Initialise the matrix `x` with some values on the CPU 8 | - Allocate memory for `x` and `y` on the device 9 | - Copy the host `x` to the device `x` 10 | - Perform the operation on the device using a 2D kernel 11 | - Copy device `y` to host `y` 12 | - Compare host `x` to host `y` 13 | 14 | Are the values of `x` and `y` equal? 15 | 16 | You may start from a skeleton code provided in [copy2d.cpp](copy2d.cpp). 17 | -------------------------------------------------------------------------------- /kernels/04-kernel-copy2d/copy2d.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__) 7 | static inline void hip_errchk(hipError_t result, const char *file, int line) { 8 | if (result != hipSuccess) { 9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file, 10 | line); 11 | exit(EXIT_FAILURE); 12 | } 13 | } 14 | 15 | // Copy all elements using threads in a 2D grid 16 | __global__ void copy2d(/*TODO: add arguments*/) { 17 | // TODO: compute row and col using 18 | // - threadIdx.x, threadIdx.y 19 | // - blockIdx.x, blockIdx.y 20 | // - blockDim.x, blockDim.y 21 | 22 | // TODO: Make sure there's no out-of-bounds access 23 | // row must be < number of rows 24 | // col must be < number of columns 25 | 26 | // We're computing 1D index from a 2D index and copying from src to dst 27 | const size_t index = row * num_cols + col; 28 | dst[index] = src[index]; 29 | } 30 | 31 | int main() { 32 | static constexpr size_t num_cols = 600; 33 | static constexpr size_t num_rows = 400; 34 | static constexpr size_t num_values = num_cols * num_rows; 35 | static constexpr size_t num_bytes = sizeof(double) * num_values; 36 | std::vector x(num_values); 37 | std::vector y(num_values, 0.0); 38 | 39 | // Initialise data 40 | for (size_t i = 0; i < num_values; i++) { 41 | x[i] = static_cast(i) / 1000.0; 42 | } 43 | 44 | // TODO: Allocate + copy initial values to GPU 45 | 46 | // TODO: Define grid dimensions 47 | // Use dim3 structure for threads and blocks 48 | 49 | // TODO: launch the device kernel 50 | 51 | // TODO: Copy results back to the CPU vector y 52 | 53 | // TODO: Free device memory 54 | 55 | // Check result of computation on the GPU 56 | double error = 0.0; 57 | for (size_t i = 0; i < num_values; i++) { 58 | error += abs(x[i] - y[i]); 59 | } 60 | 61 | printf("total error: %f\n", error); 62 | printf(" reference: %f at (42,42)\n", x[42 * 
num_rows + 42]); 63 | printf(" result: %f at (42,42)\n", y[42 * num_rows + 42]); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /lambdas/01-lambda/README.md: -------------------------------------------------------------------------------- 1 | # Host-device lambda functions and general kernels 2 | 3 | The purpose of this exercise is to understand how the host-device lambda functions work, and how to create a general GPU kernel. Furthermore, differentiating between host and device code paths using ```__HIP_DEVICE_COMPILE__``` macro is demonstrated. 4 | 5 | The task is to define two host-device lambda functions that can be passed for the host or the device kernel. Both lambda functions require a single integer argument, and the intended location of these definitions are indicated by `#error`. The first lambda function does not need to capture anything, but must call the predefined function ```helloFromThread(const int i)```. The second lambda function must capture the value of ```pi```, and then must multiply the thread index by the pi, and print this value from each thread. 6 | 7 | IMPORTANT NOTE! When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda` 8 | -------------------------------------------------------------------------------- /lambdas/01-lambda/lambda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | /* Blocksize is small because we are printing from all threads */ 6 | #define BLOCKSIZE 4 7 | 8 | /* CPU loop execution */ 9 | template 10 | void cpuKernel(Lambda lambda, const int loop_size) { 11 | for(int i = 0; i < loop_size; i++){ 12 | lambda(i); 13 | } 14 | } 15 | 16 | /* GPU loop execution */ 17 | template 18 | __global__ void gpuKernel(Lambda lambda, const int loop_size) 19 | { 20 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 21 | if(i < loop_size) 22 | { 23 | lambda(i); 24 | } 25 | } 26 | 27 | /* Check if this function is running on CPU or GPU */ 28 | __host__ __device__ void helloFromThread(const int i) { 29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU 30 | printf("Hello from GPU! I'm thread number %d\n", i); 31 | #else // If running on CPU 32 | printf("Hello from CPU! 
I'm thread number %d\n", i); 33 | #endif 34 | } 35 | 36 | 37 | /* The main function */ 38 | int main() 39 | { 40 | // Set the problem dimensions 41 | const int loop_size = BLOCKSIZE; 42 | const int blocksize = BLOCKSIZE; 43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize; 44 | 45 | // Define lambda1 function with 1 integer argument, 46 | // the lamba must call helloFromThread with that argument 47 | # error put the first lambda funtion definition here 48 | 49 | // Run lambda1 on the CPU device 50 | cpuKernel(lambda1, loop_size); 51 | 52 | // Run lambda1 on the GPU device 53 | gpuKernel<<>>(lambda1, loop_size); 54 | hipStreamSynchronize(0); 55 | 56 | // Store value of pi in pi 57 | double pi = M_PI; 58 | 59 | // Define lambda2 that captures pi (use [=] to capture by value), 60 | // and prints out the results for i * pi from each thread 61 | # error put the second lambda funtion definition here 62 | 63 | // Run lambda2 on the GPU device 64 | gpuKernel<<>>(lambda2, loop_size); 65 | hipStreamSynchronize(0); 66 | } 67 | -------------------------------------------------------------------------------- /lambdas/01-lambda/solution/lambda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | /* Blocksize is small because we are printing from all threads */ 6 | #define BLOCKSIZE 4 7 | 8 | /* CPU loop execution */ 9 | template 10 | void cpuKernel(Lambda lambda, const int loop_size) { 11 | for(int i = 0; i < loop_size; i++){ 12 | lambda(i); 13 | } 14 | } 15 | 16 | /* GPU loop execution */ 17 | template 18 | __global__ void gpuKernel(Lambda lambda, const int loop_size) 19 | { 20 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 21 | if(i < loop_size) 22 | { 23 | lambda(i); 24 | } 25 | } 26 | 27 | /* Check if this function is running on CPU or GPU */ 28 | __host__ __device__ void helloFromThread(const int i) { 29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU 30 | printf("Hello from GPU! I'm thread number %d\n", i); 31 | #else // If running on CPU 32 | printf("Hello from CPU! 
I'm thread number %d\n", i); 33 | #endif 34 | } 35 | 36 | 37 | /* The main function */ 38 | int main() 39 | { 40 | // Set the problem dimensions 41 | const int loop_size = BLOCKSIZE; 42 | const int blocksize = BLOCKSIZE; 43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize; 44 | 45 | // Define lambda1 function with 1 integer argument, 46 | // the lamba must call helloFromThread with that argument 47 | auto lambda1 = [] __host__ __device__ (const int i) 48 | { 49 | helloFromThread(i); 50 | }; 51 | 52 | // Run lambda1 on the CPU device 53 | cpuKernel(lambda1, loop_size); 54 | 55 | // Run lambda1 on the GPU device 56 | gpuKernel<<>>(lambda1, loop_size); 57 | hipStreamSynchronize(0); 58 | 59 | // Store value of pi in pi 60 | double pi = M_PI; 61 | 62 | // Define lambda2 that captures pi (use [=] to capture by value), 63 | // and prints out the results for i * pi from each thread 64 | auto lambda2 = [=] __host__ __device__ (const int i) 65 | { 66 | printf("i * pi = %f \n", (double)i * pi); 67 | }; 68 | 69 | // Run lambda2 on the GPU device 70 | gpuKernel<<>>(lambda2, loop_size); 71 | hipStreamSynchronize(0); 72 | } 73 | -------------------------------------------------------------------------------- /lambdas/02-reduction/README.md: -------------------------------------------------------------------------------- 1 | # Reductions with host-device lambdas and hipCUB 2 | 3 | The purpose of this exercise is to use host-device lambda functions and the hipCUB library to create an efficient reduction kernel. The location of the missing parts of the kernel code are indicated by #error. The CUB library documentation may be useful, particularly [this example](https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html#a7632bd9c8950dd6a3528ca99fa3f0890). Note that hipCUB uses namespace "hipcub" instead of "cub" used in the original CUDA library. 4 | 5 | IMPORTANT NOTE! 
When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda` 6 | -------------------------------------------------------------------------------- /lambdas/02-reduction/reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../third-party/hipcub/hipcub.hpp" 4 | 5 | /* Blocksize is divisible by the warp size */ 6 | #define BLOCKSIZE 64 7 | 8 | /* CPU redution loop */ 9 | template 10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) { 11 | // Evaluate the loop body 12 | for(int i = 0; i < loop_size; i++){ 13 | loop_body(i, *sum); 14 | } 15 | } 16 | 17 | /* GPU redution kernel */ 18 | template 19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum) 20 | { 21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int 22 | #error add here hipcub typedef 23 | 24 | // Use shared memory for the hipcub library temporary storage 25 | #error define the shared memory used by the hipcub library here 26 | 27 | // Get thread index 28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | 30 | // Local storage for the thread summation value 31 | int thread_sum = 0; 32 | 33 | // Evaluate the loop body, the summation value is stored in thread_sum 34 | if(idx < loop_size) 35 | loop_body(idx, thread_sum); 36 | 37 | // Compute the block-wide sum (aggregate) for the first thread of each block 38 | int aggregate; 39 | #error call the hipcub function to perform block-wide sum and store the result into 'aggregate' 40 | 41 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics 42 | if(threadIdx.x == 0) 43 | #error use HIP native atomiAdd() function to sum the 'aggregate' of each block into 'sum' 44 | } 45 | 46 | /* Wrapper for the GPU redution kernel */ 47 | template 48 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) { 49 | 50 | // Set block and grid dimensions 51 | const uint blocksize = BLOCKSIZE; 52 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize; 53 | 54 | // Create GPU buffer for the reduction variable 55 | int* d_buf; 56 | hipMalloc(&d_buf, sizeof(int)); 57 | 58 | // Launch the reduction kernel 59 | reduction_kernel<<>>(loop_body, loop_size, d_buf); 60 | hipStreamSynchronize(0); 61 | 62 | // Copy reduction variable back to host from the GPU buffer 63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost); 64 | hipFree(d_buf); 65 | } 66 | 67 | 68 | /* The main function */ 69 | int main() 70 | { 71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn' 72 | const int tn = 1000; 73 | 74 | // Calculate the triangular number on the GPU and store it in sum_gpu 75 | int sum_gpu = 0; 76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){ 77 | int thread_idx = i; 78 | sum += thread_idx; 79 | }, &sum_gpu); 80 | 81 | // Calculate the triangular number on the CPU and store it in sum_cpu 82 | int sum_cpu = 0; 83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){ 84 | int thread_idx = i; 85 | sum += thread_idx; 86 | }, &sum_cpu); 87 | 88 | // Check that the results match 89 | if(sum_gpu == sum_cpu) 90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu); 91 | else 92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu); 93 | 94 | return 0; 95 | } 
96 | -------------------------------------------------------------------------------- /lambdas/02-reduction/solution/reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../../third-party/hipcub/hipcub.hpp" 4 | 5 | /* Blocksize is divisible by the warp size */ 6 | #define BLOCKSIZE 64 7 | 8 | /* CPU redution loop */ 9 | template 10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) { 11 | // Evaluate the loop body 12 | for(int i = 0; i < loop_size; i++){ 13 | loop_body(i, *sum); 14 | } 15 | } 16 | 17 | /* GPU redution kernel */ 18 | template 19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum) 20 | { 21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int 22 | typedef hipcub::BlockReduce BlockReduce; 23 | 24 | // Use shared memory for the hipcub library temporary storage 25 | __shared__ typename BlockReduce::TempStorage temp_storage; 26 | 27 | // Get thread index 28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | 30 | // Local storage for the thread summation value 31 | int thread_sum = 0; 32 | 33 | // Evaluate the loop body, the summation value is stored in thread_sum 34 | if(idx < loop_size) 35 | loop_body(idx, thread_sum); 36 | 37 | // Compute the block-wide sum (aggregate) for the first thread of each block 38 | int aggregate = BlockReduce(temp_storage).Sum(thread_sum); 39 | 40 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics 41 | if(threadIdx.x == 0) 42 | atomicAdd(sum, aggregate); 43 | } 44 | 45 | /* Wrapper for the GPU redution kernel */ 46 | template 47 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) { 48 | 49 | // Set block and grid dimensions 50 | const uint blocksize = BLOCKSIZE; 51 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize; 52 | 53 | // Create GPU buffer for the reduction variable 54 | int* d_buf; 55 | hipMalloc(&d_buf, sizeof(int)); 56 | hipMemcpy(d_buf, sum, sizeof(int), hipMemcpyHostToDevice); 57 | 58 | // Launch the reduction kernel 59 | reduction_kernel<<>>(loop_body, loop_size, d_buf); 60 | hipStreamSynchronize(0); 61 | 62 | // Copy reduction variable back to host from the GPU buffer 63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost); 64 | hipFree(d_buf); 65 | } 66 | 67 | 68 | /* The main function */ 69 | int main() 70 | { 71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn' 72 | const int tn = 1000; 73 | 74 | // Calculate the triangular number on the GPU and store it in sum_gpu 75 | int sum_gpu = 0; 76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){ 77 | int thread_idx = i; 78 | sum += thread_idx; 79 | }, &sum_gpu); 80 | 81 | // Calculate the triangular number on the CPU and store it in sum_cpu 82 | int sum_cpu = 0; 83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){ 84 | int thread_idx = i; 85 | sum += thread_idx; 86 | }, &sum_cpu); 87 | 88 | // Check that the results match 89 | if(sum_gpu == sum_cpu) 90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu); 91 | else 92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /lambdas/03-hipify/Makefile: 
-------------------------------------------------------------------------------- 1 | default: build 2 | echo "Start Build" 3 | 4 | # Accelerator architecture 5 | ifeq ($(CUDA),1) 6 | 7 | CXX = nvcc 8 | CXXDEFS = -DHAVE_CUDA 9 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 10 | EXE = bessel 11 | 12 | else ifeq ($(HIP),CUDA) 13 | 14 | CXX = hipcc 15 | CXXDEFS = -DHAVE_HIP -I$(shell pwd)/../../third-party/hiprand -I$(shell pwd)/../../third-party 16 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 17 | EXE = bessel 18 | 19 | else ifeq ($(HIP),ROCM) 20 | 21 | CXX = hipcc 22 | CXXDEFS = -DHAVE_HIP -I/appl/eap/opt/rocm-4.3.1/hiprand/include/ -I/appl/eap/opt/rocm-4.3.1/rocrand/include/ 23 | CXXFLAGS = -g -O3 --offload-arch=gfx90a 24 | FILETYPE = .cpp 25 | EXE = bessel 26 | 27 | else 28 | 29 | CXX = g++ 30 | CXXFLAGS = -g -O3 31 | EXE = bessel 32 | 33 | endif 34 | 35 | # Message passing protocol 36 | ifeq ($(MPI),1) 37 | 38 | MPICXX = mpicxx 39 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(CXX) -DHAVE_MPI $(CXXDEFS) $(CXXFLAGS)' 40 | LDFLAGS += -L/appl/spack/install-tree/gcc-9.1.0/openmpi-4.1.1-vonyow/lib 41 | LIBS += -lmpi 42 | 43 | else 44 | 45 | MPICXX = $(CXX) 46 | MPICXXFLAGS = $(CXXDEFS) $(CXXFLAGS) 47 | 48 | endif 49 | 50 | SRC_PATH = src/ 51 | SOURCES = $(shell ls src/*.cpp) 52 | 53 | OBJ_PATH = src/ 54 | OBJECTS = $(shell for file in $(SOURCES);\ 55 | do echo -n $$file | sed -e "s/\(.*\)\.cpp/\1\.o/";echo -n " ";\ 56 | done) 57 | 58 | build: $(EXE) 59 | 60 | depend: 61 | makedepend $(CXXDEFS) -m $(SOURCES) 62 | 63 | test: $(EXE) 64 | ./$(EXE) 65 | 66 | $(EXE): $(OBJECTS) 67 | $(CXX) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $(EXE) 68 | 69 | clean: $(CLEAN) 70 | rm -f $(OBJECTS) $(EXE) 71 | 72 | # Compilation rules 73 | $(OBJ_PATH)%.o: $(SRC_PATH)%.cpp 74 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c $< -o $(SRC_PATH)$(notdir $@) 75 | -------------------------------------------------------------------------------- /lambdas/03-hipify/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo simulation with hipRAND library 2 | 3 | ## Exercise description 4 | 5 | The HIP header file [devices_hip.h](src/devices_hip.h) has disappeared from the [src](src/) folder. Fortunately, the respective CUDA header, [devices_cuda.h](src/devices_cuda.h), is still present. The task is to use hipify tools to translate [devices_cuda.h](src/devices_cuda.h) to [devices_hip.h](src/devices_hip.h). What does the hipify tool translate? Is there anything that is not translated properly? You may compare the result with the original HIP header named [solution.h](src/solution.h). Instructions to compile the code with HIP at the bottom. 6 | 7 | IMPORTANT NOTE on hipify-clang module usage on Puhti! Load hipify-clang to hipify CUDA code by 8 | ``` 9 | ml hipify-clang 10 | ``` 11 | and after loading and using hipify-clang, you must do the following before trying to compile any HIP code 12 | ``` 13 | ml purge 14 | ml hip 15 | ``` 16 | Otherwise the compilation fails (you cannot compile HIP while having hipify-clang module loaded). 17 | ## Code description 18 | 19 | This example uses the Monte Carlo method to simulate the value of Bessel's correction that minimizes the root mean squared error in the calculation of the sample standard deviation and variance for the chosen sample and population sizes. 
The sample standard deviation is typically calculated as $$s = \sqrt{\frac{1}{N - \beta}\sum_{i=1}^{N}(x_i - \bar{x})^2}$$ where $$\beta = 1.$$ The simulation calculates the root mean squared error for different values of $\beta$. 20 | 21 | The implementation uses a special construct for the parallel loops in [bessel.cpp](src/bessel.cpp) which is based on a lambda function, an approach similar to some accelerator frameworks such as SYCL, Kokkos, RAJA, etc. The approach allows conditional compilation of the loops for multiple architectures while keeping the source code clean and readable. An example of the usage of cuRAND and hipRAND random number generation libraries inside a GPU kernel are given in [devices_cuda.h](src/devices_cuda.h) and [devices_hip.h](src/devices_hip.h). 22 | 23 | The code can be conditionally compiled for either CUDA, HIP, or HOST execution with or without MPI. The correct definitions for each accelerator backend option are selected in [comms.h](src/comms.h) by choosing the respective header file. The compilation instructions are shown below: 24 | 25 | ``` 26 | // Compile to run sequentially on CPU 27 | make 28 | 29 | // Compile to run parallel on CPUs with MPI 30 | make MPI=1 31 | 32 | // Compile to run parallel on GPU with CUDA 33 | make CUDA=1 34 | 35 | // Compile to run parallel on GPU with HIP 36 | make HIP=CUDA 37 | 38 | // Compile to run parallel on many GPUs with HIP and MPI 39 | make HIP=CUDA MPI=1 40 | 41 | ``` 42 | -------------------------------------------------------------------------------- /lambdas/03-hipify/src/comms.cpp: -------------------------------------------------------------------------------- 1 | #include "comms.h" 2 | 3 | #if defined(HAVE_MPI) 4 | 5 | namespace comms{ 6 | 7 | static int MPI_INITIALIZED = 0; 8 | 9 | int get_procs(){ 10 | int comm_size = 1; 11 | if (MPI_INITIALIZED == 1){ 12 | MPI_Comm_size(MPI_COMM_WORLD, &comm_size); 13 | } 14 | return comm_size; 15 | } 16 | 17 | int get_rank(){ 18 | int proc_rank = 0; 19 | if (MPI_INITIALIZED == 1){ 20 | MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank); 21 | } 22 | return proc_rank; 23 | } 24 | 25 | int get_node_rank(){ 26 | int node_rank = 0; 27 | if (MPI_INITIALIZED == 1){ 28 | MPI_Comm node_comm = MPI_COMM_NULL; 29 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm); 30 | 31 | MPI_Comm_rank(node_comm, &node_rank); 32 | MPI_Comm_free(&node_comm); 33 | } 34 | return node_rank; 35 | } 36 | 37 | int get_node_procs(){ 38 | int node_comm_size = 1; 39 | if (MPI_INITIALIZED == 1){ 40 | MPI_Comm node_comm = MPI_COMM_NULL; 41 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm); 42 | 43 | MPI_Comm_size(node_comm, &node_comm_size); 44 | MPI_Comm_free(&node_comm); 45 | } 46 | return node_comm_size; 47 | } 48 | 49 | void barrier_procs(){ 50 | // Synchronize across all MPI processes 51 | if (MPI_INITIALIZED == 1) 52 | MPI_Barrier(MPI_COMM_WORLD); 53 | } 54 | 55 | void reduce_procs(float *sbuf, int count){ 56 | if (MPI_INITIALIZED == 1){ 57 | float* rbuf; 58 | if(get_rank() == 0) 59 | rbuf = (float*)malloc(count * sizeof(float)); 60 | MPI_Reduce(sbuf, rbuf, count, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 61 | if(get_rank() == 0){ 62 | memcpy(sbuf, rbuf, count * sizeof(float)); 63 | free((void*)rbuf); 64 | } 65 | } 66 | } 67 | 68 | void init_procs(int *argc, char **argv[]){ 69 | if(*argc > 1){ 70 | MPI_Init(argc, argv); 71 | MPI_INITIALIZED = 1; 72 | } 73 | // Some device backends require an initialization 74 | 
devices::init(get_node_rank()); 75 | } 76 | 77 | void finalize_procs(){ 78 | // Some device backends also require a finalization 79 | devices::finalize(get_rank()); 80 | 81 | // Finalize MPI if it is used 82 | if (MPI_INITIALIZED == 1) 83 | MPI_Finalize(); 84 | } 85 | } 86 | 87 | #else 88 | 89 | namespace comms{ 90 | int get_procs(){ 91 | int comm_size = 1; 92 | return comm_size; 93 | } 94 | 95 | int get_rank(){ 96 | int proc_rank = 0; 97 | return proc_rank; 98 | } 99 | 100 | int get_node_rank(){ 101 | int node_rank = 0; 102 | return node_rank; 103 | } 104 | 105 | int get_node_procs(){ 106 | int node_comm_size = 1; 107 | return node_comm_size; 108 | } 109 | 110 | void barrier_procs(){ 111 | } 112 | 113 | void reduce_procs(float *sbuf, int count){ 114 | } 115 | 116 | void init_procs(int *argc, char **argv[]){ 117 | // Some device backends require an initialization 118 | devices::init(get_node_rank()); 119 | } 120 | 121 | void finalize_procs(){ 122 | // Some device backends also require a finalization 123 | devices::finalize(get_rank()); 124 | } 125 | } 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /lambdas/03-hipify/src/comms.h: -------------------------------------------------------------------------------- 1 | #ifndef BESSEL_COMMS_H 2 | #define BESSEL_COMMS_H 3 | 4 | #if defined(HAVE_MPI) 5 | #include "mpi.h" 6 | #endif 7 | 8 | #if defined(HAVE_CUDA) 9 | #include "devices_cuda.h" 10 | #elif defined(HAVE_HIP) 11 | #include "devices_hip.h" 12 | #else 13 | #include "devices_host.h" 14 | #endif 15 | 16 | namespace comms{ 17 | int get_procs(); 18 | int get_rank(); 19 | int get_node_procs(); 20 | int get_node_rank(); 21 | 22 | void barrier_procs(); 23 | void reduce_procs(float *sbuf, int count); 24 | 25 | void init_procs(int *argc, char **argv[]); 26 | void finalize_procs(); 27 | } 28 | 29 | #endif // !BESSEL_COMMS_H 30 | -------------------------------------------------------------------------------- /lambdas/03-hipify/src/devices_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef BESSEL_DEVICES_CUDA_H 2 | #define BESSEL_DEVICES_CUDA_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #define CUDA_ERR(err) (cuda_error(err, __FILE__, __LINE__)) 9 | inline static void cuda_error(cudaError_t err, const char *file, int line) { 10 | if (err != cudaSuccess) { 11 | printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line); 12 | exit(1); 13 | } 14 | } 15 | 16 | #define DEVICE_LAMBDA [=] __host__ __device__ 17 | 18 | namespace devices 19 | { 20 | __forceinline__ static void init(int node_rank) { 21 | int num_devices = 0; 22 | CUDA_ERR(cudaGetDeviceCount(&num_devices)); 23 | CUDA_ERR(cudaSetDevice(node_rank % num_devices)); 24 | } 25 | 26 | __forceinline__ static void finalize(int rank) { 27 | printf("Rank %d, CUDA finalized.\n", rank); 28 | } 29 | 30 | __forceinline__ static void* allocate(size_t bytes) { 31 | void* ptr; 32 | CUDA_ERR(cudaMallocManaged(&ptr, bytes)); 33 | return ptr; 34 | } 35 | 36 | __forceinline__ static void free(void* ptr) { 37 | CUDA_ERR(cudaFree(ptr)); 38 | } 39 | 40 | __forceinline__ static void memcpy_d2d(void* dst, void* src, size_t bytes){ 41 | CUDA_ERR(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice)); 42 | } 43 | 44 | template 45 | __global__ static void cudaKernel(LambdaBody lambda, const int loop_size) 46 | { 47 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 48 | if(i < loop_size) 49 | { 50 | lambda(i); 51 | } 52 | } 53 | 54 | template 
55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) { 56 | const int blocksize = 64; 57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize; 58 | cudaKernel<<>>(loop_body, loop_size); 59 | CUDA_ERR(cudaStreamSynchronize(0)); 60 | } 61 | 62 | template 63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){ 64 | // Define this function depending on whether it runs on GPU or CPU 65 | #ifdef __CUDA_ARCH__ 66 | atomicAdd(array_loc, value); 67 | #else 68 | *array_loc += value; 69 | #endif 70 | } 71 | 72 | template 73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){ 74 | 75 | T var = 0; 76 | #ifdef __CUDA_ARCH__ 77 | curandStatePhilox4_32_10_t state; 78 | 79 | // curand_init() reproduces the same random number with the same seed and seq 80 | curand_init(seed, seq, 0, &state); 81 | 82 | // curand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1 83 | var = stdev * curand_normal(&state) + mean; 84 | #endif 85 | return var; 86 | } 87 | } 88 | 89 | #endif // !BESSEL_DEVICES_CUDA_H 90 | -------------------------------------------------------------------------------- /lambdas/03-hipify/src/devices_host.h: -------------------------------------------------------------------------------- 1 | #ifndef BESSEL_DEVICES_HOST_H 2 | #define BESSEL_DEVICES_HOST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DEVICE_LAMBDA [=] 10 | 11 | namespace devices 12 | { 13 | inline static void init(int node_rank) { 14 | // Nothing needs to be done here 15 | } 16 | 17 | inline static void finalize(int rank) { 18 | printf("Rank %d, Host finalized.\n", rank); 19 | } 20 | 21 | inline static void* allocate(size_t bytes) { 22 | return malloc(bytes); 23 | } 24 | 25 | inline static void free(void* ptr) { 26 | ::free(ptr); 27 | } 28 | 29 | inline static void memcpy_d2d(void* dst, void* src, size_t bytes){ 30 | memcpy(dst, src, bytes); 31 | } 32 | 33 | template 34 | inline static void parallel_for(int loop_size, Lambda loop_body) { 35 | for(int i = 0; i < loop_size; i++){ 36 | loop_body(i); 37 | } 38 | } 39 | 40 | template 41 | inline static void atomic_add(T *array_loc, T value){ 42 | *array_loc += value; 43 | } 44 | 45 | template 46 | inline static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){ 47 | 48 | // Re-seed the first case 49 | if(idx == 0){ 50 | // Overflow is defined behavior with unsigned, and therefore ok here 51 | srand((unsigned int)seed + (unsigned int)seq); 52 | } 53 | 54 | // Use Box Muller algorithm to get a float from a normal distribution 55 | const float two_pi = 2.0f * M_PI; 56 | float u1 = (float) rand() / RAND_MAX; 57 | float u2 = (float) rand() / RAND_MAX; 58 | float factor = stdev * sqrtf (-2.0f * logf (u1)); 59 | float trig_arg = two_pi * u2; 60 | 61 | // Box Muller algorithm produces two random normally distributed floats, z0 and z1 62 | float z0 = factor * cosf (trig_arg) + mean; // Need only one 63 | // float z1 = factor * sinf (trig_arg) + mean; 64 | return z0; 65 | } 66 | } 67 | #endif // !BESSEL_DEVICES_HOST_H 68 | -------------------------------------------------------------------------------- /lambdas/03-hipify/src/solution.h: -------------------------------------------------------------------------------- 1 | #ifndef BESSEL_DEVICES_HIP_H 2 | #define BESSEL_DEVICES_HIP_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #define HIP_ERR(err) (hip_error(err, 
__FILE__, __LINE__)) 9 | inline static void hip_error(hipError_t err, const char *file, int line) { 10 | if (err != hipSuccess) { 11 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(err), file, line); 12 | exit(1); 13 | } 14 | } 15 | 16 | #define DEVICE_LAMBDA [=] __host__ __device__ 17 | 18 | namespace devices 19 | { 20 | __forceinline__ static void init(int node_rank) { 21 | int num_devices = 0; 22 | HIP_ERR(hipGetDeviceCount(&num_devices)); 23 | HIP_ERR(hipSetDevice(node_rank % num_devices)); 24 | } 25 | 26 | __forceinline__ static void finalize(int rank) { 27 | printf("Rank %d, HIP finalized.\n", rank); 28 | } 29 | 30 | __forceinline__ static void* allocate(size_t bytes) { 31 | void* ptr; 32 | HIP_ERR(hipMallocManaged(&ptr, bytes)); 33 | return ptr; 34 | } 35 | 36 | __forceinline__ static void free(void* ptr) { 37 | HIP_ERR(hipFree(ptr)); 38 | } 39 | 40 | __forceinline__ static void memcpyd2d(void* dst, void* src, size_t bytes){ 41 | HIP_ERR(hipMemcpy(dst, src, bytes, hipMemcpyDeviceToDevice)); 42 | } 43 | 44 | template 45 | __global__ static void hipKernel(LambdaBody lambda, const int loop_size) 46 | { 47 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 48 | if(i < loop_size) 49 | { 50 | lambda(i); 51 | } 52 | } 53 | 54 | template 55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) { 56 | const int blocksize = 64; 57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize; 58 | hipKernel<<>>(loop_body, loop_size); 59 | HIP_ERR(hipStreamSynchronize(0)); 60 | } 61 | 62 | template 63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){ 64 | // Define this function depending on whether it runs on GPU or CPU 65 | #if __HIP_DEVICE_COMPILE__ 66 | atomicAdd(array_loc, value); 67 | #else 68 | *array_loc += value; 69 | #endif 70 | } 71 | 72 | template 73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){ 74 | 75 | T var = 0; 76 | #if __HIP_DEVICE_COMPILE__ 77 | hiprandStatePhilox4_32_10_t state; 78 | 79 | // hiprand_init() reproduces the same random number with the same seed and seq 80 | hiprand_init(seed, seq, 0, &state); 81 | 82 | // hiprand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1 83 | var = stdev * hiprand_normal(&state) + mean; 84 | #endif 85 | return var; 86 | } 87 | } 88 | 89 | #endif // !BESSEL_DEVICES_HIP_H 90 | -------------------------------------------------------------------------------- /memory/01-prefetch/README.md: -------------------------------------------------------------------------------- 1 | # Memory management strategies 2 | 3 | The purpose of this exercise is to compare 6 different memory management 4 | strategies and their computational overhead. 
The following functions are called 5 | at the end of the source file by the `main()` function: 6 | 7 | * The function `explicitMem()` represents a basic explicit memory management strategy 8 | * The function `explicitMemPinned()` represents an explicit memory management strategy with pinned host memory 9 | * The function `explicitMemNoCopy()` represents an explicit memory management strategy where the data can reside in GPU memory during an iterative loop (no recurring memory copies needed) 10 | * The function `unifiedMem()` represents a basic unified memory management strategy 11 | * The function `unifiedMemPrefetch()` represents a unified memory management strategy with prefetching 12 | * The function `unifiedMemNoCopy()` represents a unified memory management strategy where the data can reside in GPU memory during an iterative loop (no recurring memory copies needed) 13 | 14 | The task is to fill in the missing function calls in the code, indicated by lines beginning with `#error`, each followed by a descriptive instruction. 15 | 16 | ## Hints 17 | 18 | `int device;` 19 | `hipGetDevice(&device);` 20 | 21 | * prefetch: 22 | `hipMemPrefetchAsync((const void*) ptr, size_t count, int device, hipStream_t stream)` 23 | 24 | * prefetch to device on stream 0: 25 | `hipMemPrefetchAsync(A, size, device, 0);` 26 | 27 | * prefetch to host: use device `hipCpuDeviceId` 28 | `hipMemPrefetchAsync(A, size, hipCpuDeviceId, 0);` 29 | 30 | * Device memset: 31 | `hipMemset(A, 0, size);` 32 | -------------------------------------------------------------------------------- /memory/02-mempools/README.md: -------------------------------------------------------------------------------- 1 | # The stream-ordered memory allocator and memory pools 2 | 3 | The purpose of this exercise is to compare different memory allocation strategies within a loop and to understand the performance impact of using or not using a memory pool. The following timed functions are called at the end of the source file by the `main()` function: 4 | 5 | * The function `noRecurringAlloc()` allocates memory only once, outside the loop 6 | * The function `recurringAllocNoMemPools()` recurringly allocates memory within a loop 7 | * The function `recurringAllocMemPool()` recurringly obtains memory from a pool within a loop 8 | 9 | The task is to fill in the missing function calls in the code, indicated by lines beginning with `#error`, each followed by a descriptive instruction. 10 | -------------------------------------------------------------------------------- /memory/03-struct/README.md: -------------------------------------------------------------------------------- 1 | # Unified memory and structs 2 | 3 | The purpose of this exercise is to run a loop accessing a struct from host and 4 | device using different memory management strategies. 5 | 6 | The function `runHost()` demonstrates the execution on the host and is already complete. 7 | 8 | The task is to fill in the functions `runDeviceUnifiedMem()` and `runDeviceExplicitMem()` to do 9 | the same thing in parallel on the device. The latter function also requires explicitly specifying how the struct is copied to GPU memory, which is not always trivial. Therefore, you must also fill in the GPU struct allocation and deallocation functions `createDeviceExample()` and `freeDeviceExample()`.
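For the explicit-memory case, the tricky part is that a struct member that is a pointer must be deep-copied. Below is a minimal sketch of one way to do it, assuming a hypothetical struct with a float array and its length; the struct layout and names are illustrative only and may not match [struct.cpp](struct.cpp), and error checking is omitted for brevity:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical struct for illustration; the exercise struct may differ
struct Example {
    float *x;   // becomes a device pointer in the device-side copy
    int size;
};

// Allocate a device-side copy of a host-side Example (sketch)
Example *createDeviceExample(Example &host) {
    Example tmp = host;  // shallow copy of the members
    hipMalloc((void **)&tmp.x, sizeof(float) * host.size);          // device buffer for the array
    hipMemcpy(tmp.x, host.x, sizeof(float) * host.size,
              hipMemcpyHostToDevice);                               // deep copy the array contents
    Example *d_ex;
    hipMalloc((void **)&d_ex, sizeof(Example));                     // device copy of the struct itself
    hipMemcpy(d_ex, &tmp, sizeof(Example), hipMemcpyHostToDevice);  // struct now holds the device pointer
    return d_ex;
}

// Free the device-side copy (sketch)
void freeDeviceExample(Example *d_ex) {
    Example tmp;
    hipMemcpy(&tmp, d_ex, sizeof(Example), hipMemcpyDeviceToHost);  // fetch the device pointer back
    hipFree(tmp.x);
    hipFree(d_ex);
}
```

With unified memory (`hipMallocManaged`) this bookkeeping is unnecessary, which is exactly the difference this exercise is meant to illustrate.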
10 | -------------------------------------------------------------------------------- /multi-gpu/01-p2pcopy/README.md: -------------------------------------------------------------------------------- 1 | # Peer to peer device access 2 | 3 | Benchmark memory copies with and without peer to peer device access using two 4 | GPUs. 5 | 6 | Skeleton code [p2pcopy.cpp](p2pcopy.cpp) tests peer to peer device access between two GPUs by doing a series of memory copies. The test is evaluated after calling `hipDeviceEnablePeerAccess()` and `hipDeviceDisablePeerAccess()`. The program prints calculated bandwith and time for both cases. On a CUDA platform, there should be a difference in results, whereas on an AMD platform there is none. 7 | 8 | In order to make the code work, you need to fix the missing parts marked with TODOs. 9 | 10 | NOTE: Remember to request 2 GPUs when running this exercise. 11 | On Lumi, use 12 | ``` 13 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition 14 | ``` 15 | 16 | When the code is running correct run it several times and observe the bandwidths. What are the bandwidths=? 17 | 18 | Disable the DMA engine with `export HSA_ENABLE_SDMA=0` and then try again code. What are the results now? 19 | 20 | 21 | On Mahti use 22 | ``` 23 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out 24 | ``` 25 | -------------------------------------------------------------------------------- /multi-gpu/01-p2pcopy/p2pcopy.cpp: -------------------------------------------------------------------------------- 1 | #include "stdio.h" 2 | #include "stdint.h" 3 | #include 4 | #include 5 | 6 | 7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) { 8 | 9 | // Enable peer access for GPUs? 10 | if (p2p) 11 | { 12 | // TODO: Enable peer access for GPU 0 and GPU 1 13 | } 14 | 15 | // Do a dummy copy without timing to remove the impact of the first one 16 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0 17 | 18 | // Do a series of timed P2P memory copies 19 | int N = 10; 20 | clock_t tStart = clock(); 21 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0, repeat for N times to 22 | // get timings 23 | // TODO: After the memory copies, remember to synchronize the stream 24 | // before stopping the clock 25 | clock_t tStop = clock(); 26 | 27 | // Calcute time and bandwith 28 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC; 29 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s; 30 | 31 | // Disable peer access for GPUs? 
32 | if (p2p) { 33 | // TODO: Disable peer access for GPU 0 and GPU 1 34 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n", 35 | bandwidth, time_s); 36 | } else { 37 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n", 38 | bandwidth, time_s); 39 | } 40 | } 41 | 42 | 43 | int main(int argc, char *argv[]) 44 | { 45 | // Check that we have at least two GPUs 46 | int devcount; 47 | hipGetDeviceCount(&devcount); 48 | if(devcount < 2) { 49 | printf("Need at least two GPUs!\n"); 50 | exit(EXIT_FAILURE); 51 | } else { 52 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount); 53 | } 54 | 55 | // Allocate memory for both GPUs 56 | int size = pow(2, 28); 57 | int gpu0 = 0, gpu1 = 1; 58 | int *dA_0, *dA_1; 59 | hipSetDevice(gpu0); 60 | hipMalloc((void**) &dA_0, size); 61 | hipSetDevice(gpu1); 62 | hipMalloc((void**) &dA_1, size); 63 | 64 | // Check peer accessibility between GPUs 0 and 1 65 | int peerAccess01; 66 | int peerAccess10; 67 | // TODO: Check for peer to peer accessibility from device 0 to 1 68 | // and from 1 to 0 69 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n", 70 | peerAccess01, gpu0, gpu1); 71 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n", 72 | peerAccess10, gpu1, gpu0); 73 | 74 | // Memcopy, P2P enabled 75 | if (peerAccess01 && peerAccess10) 76 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size); 77 | 78 | // Memcopy, P2P disabled 79 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size); 80 | 81 | // Deallocate device memory 82 | hipFree(dA_0); 83 | hipFree(dA_1); 84 | } 85 | -------------------------------------------------------------------------------- /multi-gpu/01-p2pcopy/solution/p2pcopy.cpp: -------------------------------------------------------------------------------- 1 | #include "stdio.h" 2 | #include "stdint.h" 3 | #include 4 | #include 5 | 6 | 7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) { 8 | 9 | // Enable peer access for GPUs? 10 | if (p2p) 11 | { 12 | hipSetDevice(gpu0); 13 | hipDeviceEnablePeerAccess(gpu1, 0); 14 | hipSetDevice(gpu1); 15 | hipDeviceEnablePeerAccess(gpu0, 0); 16 | } 17 | 18 | // Do a dummy copy without timing to remove the impact of the first one 19 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault); 20 | hipMemcpy(dA_1, dA_0, size, hipMemcpyDefault); 21 | 22 | // Do a series of timed P2P memory copies 23 | int N = 10; 24 | clock_t tStart = clock(); 25 | for (int i = 0; i < N; ++i) { 26 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault); 27 | } 28 | hipStreamSynchronize(0); 29 | clock_t tStop = clock(); 30 | 31 | // Calcute time and bandwith 32 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC; 33 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s; 34 | 35 | // Disable peer access for GPUs? 
36 | if (p2p) { 37 | hipSetDevice(gpu0); 38 | hipDeviceDisablePeerAccess(gpu1); 39 | hipSetDevice(gpu1); 40 | hipDeviceDisablePeerAccess(gpu0); 41 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n", 42 | bandwidth, time_s); 43 | } else { 44 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n", 45 | bandwidth, time_s); 46 | } 47 | } 48 | 49 | 50 | int main(int argc, char *argv[]) 51 | { 52 | // Check that we have at least two GPUs 53 | int devcount; 54 | hipGetDeviceCount(&devcount); 55 | if(devcount < 2) { 56 | printf("Need at least two GPUs!\n"); 57 | exit(EXIT_FAILURE); 58 | } else { 59 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount); 60 | } 61 | 62 | // Allocate memory for both GPUs 63 | int size = pow(2, 28); 64 | int gpu0 = 0, gpu1 = 1; 65 | int *dA_0, *dA_1; 66 | hipSetDevice(gpu0); 67 | hipMalloc((void**) &dA_0, size); 68 | hipSetDevice(gpu1); 69 | hipMalloc((void**) &dA_1, size); 70 | 71 | // Check peer accessibility between GPUs 0 and 1 72 | int peerAccess01; 73 | int peerAccess10; 74 | hipDeviceCanAccessPeer(&peerAccess01, gpu0, gpu1); 75 | hipDeviceCanAccessPeer(&peerAccess10, gpu1, gpu0); 76 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n", 77 | peerAccess01, gpu0, gpu1); 78 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n", 79 | peerAccess10, gpu1, gpu0); 80 | 81 | // Memcopy, P2P enabled 82 | if (peerAccess01 && peerAccess10) 83 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size); 84 | 85 | // Memcopy, P2P disabled 86 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size); 87 | 88 | // Deallocate device memory 89 | hipFree(dA_0); 90 | hipFree(dA_1); 91 | } 92 | -------------------------------------------------------------------------------- /multi-gpu/02-vector-sum/README.md: -------------------------------------------------------------------------------- 1 | # Vector sum on two GPUs without MPI 2 | 3 | Calculate the vector sum of two vectors (C = A + B) using two GPUs. 4 | 5 | Decompose the vectors into equal halves, copy data from host to device memory 6 | and launch a GPU kernel on each part asynchronously using streams. Copy the 7 | results back to the host to check for correctness. Add timing events to 8 | measure the time of execution. 9 | 10 | A skeleton code is provided in [vector-sum.cpp](vector-sum.cpp). Your task is to fill the locations indicated by 11 | 12 | ```// TODO:``` 13 | 14 | NOTE: Remember to request 2 GPUs when running this exercise. On Lumi, use 15 | ``` 16 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition 17 | ``` 18 | and on Mahti use 19 | ``` 20 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out 21 | ``` 22 | -------------------------------------------------------------------------------- /multi-gpu/02-vector-sum/vector-sum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Data structure for storing decomposition information 6 | struct Decomp { 7 | int len; // length of the array for the current device 8 | int start; // start index for the array on the current device 9 | }; 10 | 11 | 12 | /* HIP kernel for the addition of two vectors, i.e. 
C = A + B */ 13 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 14 | { 15 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | // Do not try to access past the allocated memory 18 | if (idx < N) { 19 | C[idx] = A[idx] + B[idx]; 20 | } 21 | } 22 | 23 | 24 | int main(int argc, char *argv[]) 25 | { 26 | const int ThreadsInBlock = 128; 27 | double *dA[2], *dB[2], *dC[2]; 28 | double *hA, *hB, *hC; 29 | int devicecount; 30 | int N = 100; 31 | hipEvent_t start, stop; 32 | hipStream_t strm[2]; 33 | Decomp dec[2]; 34 | 35 | // TODO: Check that we have two HIP devices available 36 | 37 | // Create timing events 38 | hipSetDevice(0); 39 | hipEventCreate(&start); 40 | hipEventCreate(&stop); 41 | 42 | // Allocate host memory 43 | // TODO: Allocate enough pinned host memory for hA, hB, and hC 44 | // to store N doubles each 45 | 46 | // Initialize host memory 47 | for(int i = 0; i < N; ++i) { 48 | hA[i] = 1.0; 49 | hB[i] = 2.0; 50 | } 51 | 52 | // Decomposition of data for each stream 53 | dec[0].len = N / 2; 54 | dec[0].start = 0; 55 | dec[1].len = N - N / 2; 56 | dec[1].start = dec[0].len; 57 | 58 | // Allocate memory for the devices and per device streams 59 | for (int i = 0; i < 2; ++i) { 60 | // TODO: Allocate enough device memory for dA[i], dB[i], dC[i] 61 | // to store dec[i].len doubles 62 | // TODO: Create a stream for each device 63 | } 64 | 65 | // Start timing 66 | hipSetDevice(0); 67 | hipEventRecord(start); 68 | 69 | /* Copy each decomposed part of the vectors from host to device memory 70 | and execute a kernel for each part. 71 | Note: one needs to use streams and asynchronous calls! Without this 72 | the execution is serialized because the memory copies block the 73 | execution of the host process. */ 74 | for (int i = 0; i < 2; ++i) { 75 | // TODO: Set active device 76 | // TODO: Copy data from host to device asynchronously (hA[dec[i].start] -> dA[i], hB[dec[i].start] -> dB[i]) 77 | // TODO: Launch 'vector_add()' kernel to calculate dC = dA + dB 78 | // TODO: Copy data from device to host (dC[i] -> hC[dec[0].start]) 79 | } 80 | 81 | // Synchronize and destroy the streams 82 | for (int i = 0; i < 2; ++i) { 83 | // TODO: Add synchronization calls and destroy streams 84 | } 85 | 86 | // Stop timing 87 | // TODO: Add here the timing event stop calls 88 | 89 | // Free device memory 90 | for (int i = 0; i < 2; ++i) { 91 | // TODO: Deallocate device memory 92 | } 93 | 94 | // Check results 95 | int errorsum = 0; 96 | for (int i = 0; i < N; i++) { 97 | errorsum += hC[i] - 3.0; 98 | } 99 | printf("Error sum = %i\n", errorsum); 100 | 101 | // Calculate the elapsed time 102 | float gputime; 103 | hipSetDevice(0); 104 | hipEventElapsedTime(&gputime, start, stop); 105 | printf("Time elapsed: %f\n", gputime / 1000.); 106 | 107 | // Deallocate host memory 108 | hipHostFree((void*)hA); 109 | hipHostFree((void*)hB); 110 | hipHostFree((void*)hC); 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /multi-gpu/03-mpi/Makefile: -------------------------------------------------------------------------------- 1 | HIPCC = hipcc 2 | MPICXX = mpicxx 3 | MPICXXFLAGS = -g -O2 -w 4 | 5 | # Puhti 6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70' 7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib 8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib 9 | LIBS = -lmpi 10 | 11 | ping-pong: ping-pong.o 12 | $(HIPCC) 
$(LDFLAGS) -o $@ $< $(LIBS) 13 | 14 | %.o: %.cpp 15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $< 16 | 17 | .PHONY: clean 18 | clean: 19 | rm -f *.o ping-pong 20 | -------------------------------------------------------------------------------- /multi-gpu/03-mpi/README.md: -------------------------------------------------------------------------------- 1 | # Ping-pong with multiple GPUs and MPI 2 | 3 | Implement a simple ping-pong test for GPU-to-GPU communication using: 4 | a) indirect communication via the host, and b) direct communication with 5 | HIP-aware MPI. 6 | 7 | The ping-pong test consists of the following steps: 8 | 1. Send a vector from one GPU to another 9 | 2. The receiving GPU should increment all elements of the vector by one 10 | 3. Send the vector back to the original GPU 11 | 12 | For reference, there is also a CPU-to-CPU implementation in the skeleton 13 | code ([ping-pong.cpp](ping-pong.cpp)). Timing of all tests is also included to 14 | compare the execution times. 15 | 16 | On **LUMI**, one can compile the MPI example simply using the Cray compiler with 17 | ``` 18 | CC -xhip ping-pong.cpp 19 | ``` 20 | 21 | On LUMI, enable GPU-aware MPI at runtime (and when compiling) by executing: 22 | ``` 23 | export MPICH_GPU_SUPPORT_ENABLED=1 24 | ``` 25 | For running, one should use two GPUs and two MPI processes: 26 | 27 | ``` 28 | srun --account=XXXXXX --partition=small-g -N1 --ntasks-per-node=2 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition 29 | ``` 30 | 31 | 32 | On **Mahti**, to compile, just load the required modules and type `make`. A GPU-aware MPI is 33 | available with: 34 | ``` 35 | ml openmpi/4.1.4-cuda 36 | ``` 37 | For running, one should use two GPUs and two MPI processes: 38 | ``` 39 | srun --account=XXXXXX --partition=gputest -N1 -n2 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out 40 | ``` 41 | -------------------------------------------------------------------------------- /multi-gpu/03-mpi/solution/Makefile: -------------------------------------------------------------------------------- 1 | HIPCC = hipcc 2 | MPICXX = mpicxx 3 | MPICXXFLAGS = -g -O2 -w 4 | 5 | # Puhti 6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70' 7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib 8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib 9 | LIBS = -lmpi 10 | 11 | ping-pong: ping-pong.o 12 | $(HIPCC) $(LDFLAGS) -o $@ $< $(LIBS) 13 | 14 | %.o: %.cpp 15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $< 16 | 17 | .PHONY: clean 18 | clean: 19 | rm -f *.o ping-pong 20 | -------------------------------------------------------------------------------- /optimization/01-coalescing/README.md: -------------------------------------------------------------------------------- 1 | # Performance counters and coalesced memory access 2 | 3 | ## Background and rocprof 4 | 5 | `rocprof` can collect performance metric counters (`pmc`) of GPU kernels: 6 | ```bash 7 | > rocprof -i metrics.txt -o metrics.csv ./copy 8 | ``` 9 | 10 | The counters to be collected are listed in the `metrics.txt` file and they are 11 | written to the `metrics.csv` file.
For example, if the file `metrics.txt` is 12 | 13 | ``` 14 | pmc: VALUBusy, TCP_TCC_READ_REQ_sum 15 | pmc: TCC_EA_RDREQ_sum 16 | ``` 17 | then `rocprof` will collect the derived metrics of how busy the vector 18 | arithmetic logic units (VALU), how many L2 read requests are issued 19 | (TCP_TCC_READ_REQ_sum) and how many global device memory read requests are 20 | issued (TCC_EA_RDREQ_sum). 21 | 22 | Here `TCP_TCC` refers to how many read requests the L1 (TCP) cache controller 23 | issues to the L2 cache (TCC) and `TCC_EA` refers to how many reads L2 cache 24 | controller issues to the interconnect (`EA`). 25 | 26 | The options `--list-derived` and `--list-basic` will list the available derived 27 | and basic counters. 28 | 29 | *Note*: `rocprof --list-derived` and `rocprof --list-basic` must be 30 | executed on a node with GPU present because it queries the available counters 31 | from the hardware itself. 32 | 33 | An MI250x GCD has 8 MiB of L2 memory shared across the CUs and each CU has 16 34 | kiB of L1 memory. 35 | 36 | ## Exercise 37 | 38 | The Code `copy.cpp` will read and write memory array of 4096*4096 float32 39 | entries and various strides (`(1< 2 | 3 | #include 4 | #include 5 | 6 | #define LOG2SIZE 12 7 | const static int width = 1< matrix_in; 25 | std::vector matrix_out; 26 | 27 | matrix_in.resize(width * height); 28 | matrix_out.resize(width * height); 29 | 30 | for (int i = 0; i < width * height; i++) { 31 | matrix_in[i] = (float)rand() / (float)RAND_MAX; 32 | } 33 | 34 | float *d_in; 35 | float *d_out; 36 | 37 | hipMalloc((void **)&d_in, (width * height) * sizeof(float)); 38 | hipMalloc((void **)&d_out, (width * height) * sizeof(float)); 39 | 40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float), 41 | hipMemcpyHostToDevice); 42 | 43 | printf("Setup complete. Launching kernel \n"); 44 | int block_x = width / tile_dim_x; 45 | int block_y = height / tile_dim_y; 46 | 47 | 48 | // Create events 49 | 50 | /* printf("Warm up the gpu!\n"); */ 51 | /* for(int i=1;i<=10;i++){ */ 52 | /* hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), */ 53 | /* dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width, */ 54 | /* height);} */ 55 | 56 | 57 | 58 | for(int i=1;i<=21;i++){ 59 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), 60 | dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width, 61 | height, (1< 2 | 3 | #include 4 | #include 5 | 6 | const static int width = 4096; 7 | const static int height = 4096; 8 | const static int tile_dim = 16; 9 | 10 | __global__ void copy_kernel(float *in, float *out, int width, int height) { 11 | int x_index = blockIdx.x * tile_dim + threadIdx.x; 12 | int y_index = blockIdx.y * tile_dim + threadIdx.y; 13 | 14 | int index = y_index * width + x_index; 15 | 16 | out[index] = in[index]; 17 | } 18 | 19 | 20 | 21 | int main() { 22 | std::vector matrix_in; 23 | std::vector matrix_out; 24 | 25 | matrix_in.resize(width * height); 26 | matrix_out.resize(width * height); 27 | 28 | for (int i = 0; i < width * height; i++) { 29 | matrix_in[i] = (float)rand() / (float)RAND_MAX; 30 | } 31 | 32 | 33 | 34 | float *d_in; 35 | float *d_out; 36 | 37 | hipMalloc((void **)&d_in, width * height * sizeof(float)); 38 | hipMalloc((void **)&d_out, width * height * sizeof(float)); 39 | 40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float), 41 | hipMemcpyHostToDevice); 42 | 43 | printf("Setup complete. 
Launching kernel \n"); 44 | int block_x = width / tile_dim; 45 | int block_y = height / tile_dim; 46 | 47 | 48 | // Create events 49 | hipEvent_t start_kernel_event; 50 | hipEventCreate(&start_kernel_event); 51 | hipEvent_t end_kernel_event; 52 | hipEventCreate(&end_kernel_event); 53 | 54 | printf("Warm up the gpu!\n"); 55 | for(int i=1;i<=10;i++){ 56 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), 57 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 58 | height);} 59 | 60 | hipEventRecord(start_kernel_event, 0); 61 | 62 | 63 | for(int i=1;i<=10;i++){ 64 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), 65 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 66 | height);} 67 | 68 | hipEventRecord(end_kernel_event, 0); 69 | hipEventSynchronize(end_kernel_event); 70 | 71 | hipDeviceSynchronize(); 72 | float time_kernel; 73 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event); 74 | 75 | printf("Kernel execution complete \n"); 76 | printf("Event timings:\n"); 77 | printf(" %.6f ms - copy \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024)); 78 | 79 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float), 80 | hipMemcpyDeviceToHost); 81 | 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /optimization/02-matrix_transpose/matrix_transpose_naive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | const static int width = 4096; 7 | const static int height = 4096; 8 | const static int tile_dim = 16; 9 | 10 | __global__ void transpose_naive_kernel(float *in, float *out, int width, int height) { 11 | int x_index = blockIdx.x * tile_dim + threadIdx.x; 12 | int y_index = blockIdx.y * tile_dim + threadIdx.y; 13 | 14 | int in_index = y_index * width + x_index; 15 | int out_index = x_index * height + y_index; 16 | 17 | out[out_index] = in[in_index]; 18 | } 19 | 20 | 21 | 22 | int main() { 23 | std::vector matrix_in; 24 | std::vector matrix_out; 25 | 26 | matrix_in.resize(width * height); 27 | matrix_out.resize(width * height); 28 | 29 | for (int i = 0; i < width * height; i++) { 30 | matrix_in[i] = (float)rand() / (float)RAND_MAX; 31 | } 32 | 33 | 34 | 35 | float *d_in; 36 | float *d_out; 37 | 38 | hipMalloc((void **)&d_in, width * height * sizeof(float)); 39 | hipMalloc((void **)&d_out, width * height * sizeof(float)); 40 | 41 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float), 42 | hipMemcpyHostToDevice); 43 | 44 | printf("Setup complete. 
Launching kernel \n"); 45 | int block_x = width / tile_dim; 46 | int block_y = height / tile_dim; 47 | 48 | 49 | 50 | // Create events 51 | hipEvent_t start_kernel_event; 52 | hipEventCreate(&start_kernel_event); 53 | hipEvent_t end_kernel_event; 54 | hipEventCreate(&end_kernel_event); 55 | 56 | printf("Warm up the gpu!\n"); 57 | 58 | 59 | for(int i=1;i<=10;i++){ 60 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y), 61 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 62 | height);} 63 | 64 | 65 | hipEventRecord(start_kernel_event, 0); 66 | for(int i=1;i<=10;i++){ 67 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y), 68 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 69 | height);} 70 | 71 | hipEventRecord(end_kernel_event, 0); 72 | hipEventSynchronize(end_kernel_event); 73 | 74 | float time_kernel; 75 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event); 76 | 77 | printf("Kernel execution complete \n"); 78 | printf("Event timings:\n"); 79 | printf(" %.6f ms - naive transpose \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024)); 80 | 81 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float), 82 | hipMemcpyDeviceToHost); 83 | 84 | 85 | return 0; 86 | } 87 | 88 | -------------------------------------------------------------------------------- /optimization/02-matrix_transpose/matrix_transpose_with_SM.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | const static int width = 4096; 7 | const static int height = 4096; 8 | const static int tile_dim = 16; 9 | 10 | __global__ void transpose_SM_kernel(float *in, float *out, int width, 11 | int height) { 12 | __shared__ float tile[tile_dim][tile_dim]; 13 | 14 | int x_tile_index = blockIdx.x * tile_dim; 15 | int y_tile_index = blockIdx.y * tile_dim; 16 | 17 | int in_index = 18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x); 19 | int out_index = 20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x); 21 | 22 | tile[threadIdx.y][threadIdx.x] = in[in_index]; 23 | 24 | __syncthreads(); 25 | 26 | out[out_index] = tile[threadIdx.x][threadIdx.y]; 27 | } 28 | 29 | 30 | int main() { 31 | std::vector matrix_in; 32 | std::vector matrix_out; 33 | 34 | matrix_in.resize(width * height); 35 | matrix_out.resize(width * height); 36 | 37 | for (int i = 0; i < width * height; i++) { 38 | matrix_in[i] = (float)rand() / (float)RAND_MAX; 39 | } 40 | 41 | 42 | 43 | float *d_in; 44 | float *d_out; 45 | 46 | hipMalloc((void **)&d_in, width * height * sizeof(float)); 47 | hipMalloc((void **)&d_out, width * height * sizeof(float)); 48 | 49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float), 50 | hipMemcpyHostToDevice); 51 | 52 | printf("Setup complete. 
Launching kernel \n"); 53 | int block_x = width / tile_dim; 54 | int block_y = height / tile_dim; 55 | 56 | // Create events 57 | hipEvent_t start_kernel_event; 58 | hipEventCreate(&start_kernel_event); 59 | hipEvent_t end_kernel_event; 60 | hipEventCreate(&end_kernel_event); 61 | 62 | printf("Warm up the gpu!\n"); 63 | 64 | 65 | for(int i=1;i<=10;i++){ 66 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y), 67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 68 | height);} 69 | 70 | 71 | hipEventRecord(start_kernel_event, 0); 72 | 73 | for(int i=1;i<=10;i++){ 74 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y), 75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 76 | height);} 77 | 78 | 79 | hipEventRecord(end_kernel_event, 0); 80 | hipEventSynchronize(end_kernel_event); 81 | 82 | float time_kernel; 83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event); 84 | 85 | printf("Kernel execution complete \n"); 86 | printf("Event timings:\n"); 87 | printf(" %.6f ms - shared memory \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024)); 88 | 89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float), 90 | hipMemcpyDeviceToHost); 91 | 92 | hipEventDestroy(start_kernel_event); 93 | hipEventDestroy(end_kernel_event); 94 | 95 | return 0; 96 | } 97 | 98 | -------------------------------------------------------------------------------- /optimization/02-matrix_transpose/matrix_transpose_with_SM_nobc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | const static int width = 4096; 7 | const static int height = 4096; 8 | const static int tile_dim = 16; 9 | 10 | __global__ void transpose_SM_nobc_kernel(float *in, float *out, int width, 11 | int height) { 12 | __shared__ float tile[tile_dim][tile_dim+1]; 13 | 14 | int x_tile_index = blockIdx.x * tile_dim; 15 | int y_tile_index = blockIdx.y * tile_dim; 16 | 17 | int in_index = 18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x); 19 | int out_index = 20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x); 21 | 22 | tile[threadIdx.y][threadIdx.x] = in[in_index]; 23 | 24 | __syncthreads(); 25 | 26 | out[out_index] = tile[threadIdx.x][threadIdx.y]; 27 | } 28 | 29 | 30 | int main() { 31 | std::vector matrix_in; 32 | std::vector matrix_out; 33 | 34 | matrix_in.resize(width * height); 35 | matrix_out.resize(width * height); 36 | 37 | for (int i = 0; i < width * height; i++) { 38 | matrix_in[i] = (float)rand() / (float)RAND_MAX; 39 | } 40 | 41 | 42 | 43 | float *d_in; 44 | float *d_out; 45 | 46 | hipMalloc((void **)&d_in, width * height * sizeof(float)); 47 | hipMalloc((void **)&d_out, width * height * sizeof(float)); 48 | 49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float), 50 | hipMemcpyHostToDevice); 51 | 52 | printf("Setup complete. 
Launching kernel \n"); 53 | int block_x = width / tile_dim; 54 | int block_y = height / tile_dim; 55 | 56 | // Create events 57 | hipEvent_t start_kernel_event; 58 | hipEventCreate(&start_kernel_event); 59 | hipEvent_t end_kernel_event; 60 | hipEventCreate(&end_kernel_event); 61 | 62 | printf("Warm up the gpu!\n"); 63 | 64 | 65 | for(int i=1;i<=10;i++){ 66 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y), 67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 68 | height);} 69 | 70 | 71 | hipEventRecord(start_kernel_event, 0); 72 | 73 | for(int i=1;i<=10;i++){ 74 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y), 75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width, 76 | height);} 77 | 78 | 79 | hipEventRecord(end_kernel_event, 0); 80 | hipEventSynchronize(end_kernel_event); 81 | 82 | float time_kernel; 83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event); 84 | 85 | printf("Kernel execution complete \n"); 86 | printf("Event timings:\n"); 87 | printf(" %.6f ms - shared memory with no bank conflicts \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024)); 88 | 89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float), 90 | hipMemcpyDeviceToHost); 91 | 92 | hipEventDestroy(start_kernel_event); 93 | hipEventDestroy(end_kernel_event); 94 | 95 | return 0; 96 | } 97 | 98 | -------------------------------------------------------------------------------- /optimization/03-trace/README.md: -------------------------------------------------------------------------------- 1 | # Tracing with rocprof 2 | 3 | In this exercise your task is to trace execution of [streams/02-concurrency](../../streams/02-concurrency/solution/streams.cpp) exercise 4 | solution. 5 | 6 | Rocprof can be used to trace HIP API calls, among others, with option 7 | 8 | ```bash 9 | > rocprof --hip-trace 10 | ``` 11 | 12 | It will output a file named `results.json` which may be visualized for example 13 | with perfetto trace visualizer (https://ui.perfetto.dev/) or chrome/chromium 14 | built in visualizer tools (type `chrome://tracing/` in the URL field). 15 | 16 | ## Exercise 17 | 18 | - Trace the HIP API calls of the `streams.cpp` code and visualize the results. 19 | - Modify `WORK` preprocessor macro to so large that kernel executions begin to 20 | exceed memory transfers. 21 | - Does the kernel execution order correspond to their stream numbering? 22 | -------------------------------------------------------------------------------- /porting/README.md: -------------------------------------------------------------------------------- 1 | # Converting CUDA code to HIP 2 | 3 | The folder [codes](codes) contains a few examples (vector addition, `saxpy` using HIP kernel, and `saxpy`using `cublas` of CUDA codes. On Mahti or Puhti these codes will compile with the CUDA `nvcc` compiler and should run without issues. 4 | 5 | The tasks are to convert these codes to HIP. For shorter code one can do a manual conversion, but for larger codes it is recomended to use HIPIFY tools or compile them with [HOP](https://github.com/cschpc/hop) library. 6 | 7 | ## HIPIFY Tools 8 | 0. **Optional** Convert the codes to HIP manually. On Nvidia platforms the conversion can be done in an incremental way because `hipcc` can compile mixed CUDA and HIP code. On AMD plaftorms `hipcc` can not compile CUDA code. The whole code needs to be converted in order to be able to compile it. 9 | 1. 
Convert the codes using HIPIFY tools. 10 | 11 | A. Examine the code. Both `hipify-perl` and `hipify-clang` support the `--examine` option. Alternatively one can use the `hipexamine[.|-perl.]sh` scripts, which will scan whole directories. This procedure will not change the source; it will just determine which files contain CUDA code and how much of the code can be converted automatically. 12 | 13 | B. Convert individual files with `hipify-[perl|clang] --inplace --print-stats`, or whole folders using the scripts `hipconvertinplace[.|-perl.]sh`. 14 | 15 | 16 | **Note** that `hipify-clang` requires the CUDA toolkit. On LUMI this is available via a container. 17 | The image can be created using: 18 | 19 | ``` 20 | singularity pull docker://nvcr.io/nvidia/cuda:11.4.3-devel-ubuntu20.04 21 | ``` 22 | This step was already done; the image's path is `/projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif` 23 | Then load all the modules necessary to compile HIP codes on LUMI. 24 | ``` 25 | module load LUMI/24.03 26 | module load partition/G 27 | module load rocm 28 | ``` 29 | Finally, open a shell in the container which has access to the working directory and the `rocm` installation: 30 | ``` 31 | singularity shell -B $PWD,/opt:/opt /projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif 32 | export PATH=$ROCM_PATH/bin:$PATH 33 | ``` 34 | 35 | The CUDA code can now be converted using: 36 | ``` 37 | hipify-clang .cu --inplace --print-stats --cuda-path=/usr/local/cuda-11.4 -I /usr/local/cuda-11.4/include 38 | ``` 39 | This command works as well on Nvidia platforms with HIP installed. 40 | 41 | 42 | 2. Compile CUDA codes on AMD platforms using `hipcc` + HOP and compile HIP codes on Nvidia platforms using `nvcc` + HOP. 43 | 44 | First you need to clone the HOP repository in your working folder on scratch: 45 | ``` 46 | git clone https://github.com/cschpc/hop.git 47 | ``` 48 | 49 | **CUDA** ⇒ **HIP** on LUMI 50 | ``` 51 | export HOP_ROOT=/path/to/hop 52 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/cuda -DHOP_TARGET_HIP" 53 | CC -x hip $HOP_FLAGS hello.cu -o hello 54 | ./hello 55 | ``` 56 | **HIP** ⇒ **CUDA** on Mahti or Puhti 57 | ``` 58 | export HOP_ROOT=/path/to/hop 59 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/hip -DHOP_TARGET_CUDA" 60 | CC -x cu $HOP_FLAGS hello.cpp -o hello 61 | ./hello 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /porting/codes/README.md: -------------------------------------------------------------------------------- 1 | # Directory with source codes for hands-on 2 | -------------------------------------------------------------------------------- /porting/codes/Vector_Addition/Readme.md: -------------------------------------------------------------------------------- 1 | # Vector addition 2 | 3 | This is a simple vector addition example for the [CUDA to HIP conversion]. The code executes `C[i]=A[i]+B[i]`, for `i=1,...,N`. 4 | 5 | Compile CUDA code: nvcc -arch=sm_70 vecadd.cu -o vecadd 6 | -------------------------------------------------------------------------------- /porting/codes/Vector_Addition/cuda/Readme.md: -------------------------------------------------------------------------------- 1 | # Vector addition 2 | 3 | This is a simple vector addition example for the [CUDA to HIP conversion]. The code executes `C[i]=A[i]+B[i]`, for `i=1,...,N`.
4 | 5 | Compile: nvcc -arch=sm_70 vecadd.cu -o vecadd 6 | -------------------------------------------------------------------------------- /porting/codes/Vector_Addition/cuda/vecadd.cu: -------------------------------------------------------------------------------- 1 | /* 2 | nvcc vecadd.cu 3 | */ 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | __global__ void vecAdd(int *A,int *B,int *C,int N) 11 | { 12 | int i = blockIdx.x * blockDim.x + threadIdx.x; 13 | if(i>>(a_d,b_d,c_d,n); 56 | cudaDeviceSynchronize(); 57 | clock_t end_d = clock(); 58 | clock_t start_h = clock(); 59 | printf("Doing CPU Vector add\n"); 60 | vecAdd_h(a,b,c2,n); 61 | clock_t end_h = clock(); 62 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC; 63 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC; 64 | cudaMemcpy(c,c_d,nBytes,cudaMemcpyDeviceToHost); 65 | printf("%d %f %f\n",n,time_d,time_h); 66 | 67 | for(int i=0; i1.0e-5) 70 | printf("Error at position %d.\n", i ); 71 | } 72 | cudaFree(a_d); 73 | cudaFree(b_d); 74 | cudaFree(c_d); 75 | free(c2); 76 | free(c); 77 | free(a); 78 | free(b); 79 | return 0; 80 | } 81 | -------------------------------------------------------------------------------- /porting/codes/Vector_Addition/hip_solution/vecadd.cu: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | /* 3 | nvcc vecadd.cu 4 | */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | __global__ void vecAdd(int *A,int *B,int *C,int N) 12 | { 13 | int i = blockIdx.x * blockDim.x + threadIdx.x; 14 | if(i>>(a_d,b_d,c_d,n); 57 | hipDeviceSynchronize(); 58 | clock_t end_d = clock(); 59 | clock_t start_h = clock(); 60 | printf("Doing CPU Vector add\n"); 61 | vecAdd_h(a,b,c2,n); 62 | clock_t end_h = clock(); 63 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC; 64 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC; 65 | hipMemcpy(c,c_d,nBytes,hipMemcpyDeviceToHost); 66 | printf("%d %f %f\n",n,time_d,time_h); 67 | 68 | for(int i=0; i1.0e-5) 71 | printf("Error at position %d.\n", i ); 72 | } 73 | hipFree(a_d); 74 | hipFree(b_d); 75 | hipFree(c_d); 76 | free(c2); 77 | free(c); 78 | free(a); 79 | free(b); 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /porting/codes/saxpy/cublas/Makefile: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # User Options 3 | #=============================================================================== 4 | # 5 | # Compiler can be set below, or via environment variable 6 | CC = nvcc 7 | OPTIMIZE = yes 8 | # 9 | #=============================================================================== 10 | # Program name & source code list 11 | #=============================================================================== 12 | program = saxpy_cublas 13 | source = saxpy_cublas.cu 14 | obj = $(source:.cu=.o) 15 | #=============================================================================== 16 | # Sets Flags 17 | #=============================================================================== 18 | # Standard Flags 19 | CFLAGS := -Xcompiler -Wall 20 | # Linker Flags 21 | LDFLAGS = -lcublas 22 | # Optimization Flags 23 | ifeq ($(OPTIMIZE),yes) 24 | CFLAGS += -O3 25 | endif 26 | 27 | #=============================================================================== 28 | # Targets to Build 29 | 
#=============================================================================== 30 | # 31 | $(program): $(obj) Makefile 32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS) 33 | 34 | %.o: %.cu Makefile 35 | $(CC) $(CFLAGS) -c $< -o $@ 36 | 37 | clean: 38 | rm -rf $(program) $(obj) 39 | 40 | -------------------------------------------------------------------------------- /porting/codes/saxpy/cublas/saxpy_cublas.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cublas_v2.h" 3 | using namespace std; 4 | 5 | const int N = 1 << 30; 6 | 7 | int main(){ 8 | float *a_h, *b_h; 9 | a_h = new float[N]; 10 | b_h = new float[N]; 11 | float *a_d, *b_d; 12 | for(int i = 0; i < N; i++){ 13 | a_h[i] = 1.0f; 14 | b_h[i] = 2.0f ; 15 | } 16 | cublasHandle_t handle; 17 | cublasCreate(&handle); 18 | cudaMalloc((void**) &a_d, sizeof(float) * N); 19 | cudaMalloc((void**) &b_d, sizeof(float) * N); 20 | cublasSetVector( N, sizeof(float), a_h, 1, a_d, 1); 21 | cublasSetVector( N, sizeof(float), b_h, 1, b_d, 1); 22 | const float s = 2.0f; 23 | cublasSaxpy( handle, N, &s, a_d, 1, b_d, 1); 24 | cublasGetVector( N, sizeof(float), b_d, 1, b_h, 1); 25 | cudaFree(a_d); 26 | cudaFree(b_d); 27 | cublasDestroy(handle); 28 | float maxError = 0.0f; 29 | 30 | for(int i = 0; i < N; i++) 31 | maxError = fmax(maxError, abs(b_h[i]-4.0f)); 32 | 33 | cout << "Max error: " << maxError << endl; 34 | 35 | 36 | delete[] a_h; 37 | delete[] b_h; 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /porting/codes/saxpy/cuda/saxpy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ 4 | void saxpy(int n, float a, float *x, float *y) 5 | { 6 | int i = blockIdx.x*blockDim.x + threadIdx.x; 7 | if (i < n) y[i] = a*x[i] + y[i]; 8 | } 9 | 10 | int main(void) 11 | { 12 | int N = 1<<30; 13 | float *x, *y, *d_x, *d_y; 14 | x = (float*)malloc(N*sizeof(float)); 15 | y = (float*)malloc(N*sizeof(float)); 16 | 17 | cudaMalloc(&d_x, N*sizeof(float)); 18 | cudaMalloc(&d_y, N*sizeof(float)); 19 | 20 | for (int i = 0; i < N; i++) { 21 | x[i] = 1.0f; 22 | y[i] = 2.0f; 23 | } 24 | 25 | cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); 26 | cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); 27 | 28 | // Perform SAXPY on 1M elements 29 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); 30 | 31 | cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); 32 | 33 | float maxError = 0.0f; 34 | for (int i = 0; i < N; i++) 35 | maxError = fmax(maxError, abs(y[i]-4.0f)); 36 | printf("Max error: %f\n", maxError); 37 | 38 | cudaFree(d_x); 39 | cudaFree(d_y); 40 | free(x); 41 | free(y); 42 | } 43 | -------------------------------------------------------------------------------- /porting/codes/saxpy/hip/README.md: -------------------------------------------------------------------------------- 1 | # Copy the files from the CUDA folder and hipify the example here. 
2 | -------------------------------------------------------------------------------- /porting/codes/saxpy/hip_solution/saxpy.cu: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | #include 3 | 4 | __global__ 5 | void saxpy(int n, float a, float *x, float *y) 6 | { 7 | int i = blockIdx.x*blockDim.x + threadIdx.x; 8 | if (i < n) y[i] = a*x[i] + y[i]; 9 | } 10 | 11 | int main(void) 12 | { 13 | int N = 1<<30; 14 | float *x, *y, *d_x, *d_y; 15 | x = (float*)malloc(N*sizeof(float)); 16 | y = (float*)malloc(N*sizeof(float)); 17 | 18 | hipMalloc(&d_x, N*sizeof(float)); 19 | hipMalloc(&d_y, N*sizeof(float)); 20 | 21 | for (int i = 0; i < N; i++) { 22 | x[i] = 1.0f; 23 | y[i] = 2.0f; 24 | } 25 | 26 | hipMemcpy(d_x, x, N*sizeof(float), hipMemcpyHostToDevice); 27 | hipMemcpy(d_y, y, N*sizeof(float), hipMemcpyHostToDevice); 28 | 29 | // Perform SAXPY on 1M elements 30 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); 31 | 32 | hipMemcpy(y, d_y, N*sizeof(float), hipMemcpyDeviceToHost); 33 | 34 | float maxError = 0.0f; 35 | for (int i = 0; i < N; i++) 36 | maxError = fmax(maxError, abs(y[i]-4.0f)); 37 | printf("Max error: %f\n", maxError); 38 | 39 | hipFree(d_x); 40 | hipFree(d_y); 41 | free(x); 42 | free(y); 43 | } 44 | 45 | -------------------------------------------------------------------------------- /porting/codes/saxpy/hipblas/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Copy the data from cuBLAS here and HIPIFY the example. 3 | -------------------------------------------------------------------------------- /porting/codes/saxpy/hipblas_solution/Makefile: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # User Options 3 | #=============================================================================== 4 | # 5 | # Compiler can be set below, or via environment variable 6 | CC = hipcc 7 | OPTIMIZE = yes 8 | # 9 | #=============================================================================== 10 | # Program name & source code list 11 | #=============================================================================== 12 | program = saxpy_cublas 13 | source = saxpy_cublas.cu 14 | obj = $(source:.cu=.o) 15 | #=============================================================================== 16 | # Sets Flags 17 | #=============================================================================== 18 | # Standard Flags 19 | CFLAGS := -Xcompiler -Wall -I/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/include 20 | # Linker Flags 21 | LDFLAGS = -L/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/lib/ -lhipblas 22 | # Optimization Flags 23 | ifeq ($(OPTIMIZE),yes) 24 | CFLAGS += -O3 25 | endif 26 | 27 | #=============================================================================== 28 | # Targets to Build 29 | #=============================================================================== 30 | # 31 | $(program): $(obj) Makefile 32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS) 33 | 34 | %.o: %.cu Makefile 35 | $(CC) $(CFLAGS) -c $< -o $@ 36 | 37 | clean: 38 | rm -rf $(program) $(obj) out* error* 39 | 40 | -------------------------------------------------------------------------------- /porting/codes/saxpy/hipblas_solution/saxpy_cublas.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace std; 4 | 5 | const int N = 
1 << 30; 6 | 7 | int main(){ 8 | float *a_h, *b_h; 9 | a_h = new float[N]; 10 | b_h = new float[N]; 11 | float *a_d, *b_d; 12 | for(int i = 0; i < N; i++){ 13 | a_h[i] = 1.0f; 14 | b_h[i] = 2.0f ; 15 | } 16 | hipblasHandle_t handle; 17 | hipblasCreate(&handle); 18 | hipMalloc((void**) &a_d, sizeof(float) * N); 19 | hipMalloc((void**) &b_d, sizeof(float) * N); 20 | hipblasSetVector( N, sizeof(float), a_h, 1, a_d, 1); 21 | hipblasSetVector( N, sizeof(float), b_h, 1, b_d, 1); 22 | const float s = 2.0f; 23 | hipblasSaxpy( handle, N, &s, a_d, 1, b_d, 1); 24 | hipblasGetVector( N, sizeof(float), b_d, 1, b_h, 1); 25 | hipFree(a_d); 26 | hipFree(b_d); 27 | hipblasDestroy(handle); 28 | float maxError = 0.0f; 29 | 30 | for(int i = 0; i < N; i++) 31 | maxError = fmax(maxError, abs(b_h[i]-4.0f)); 32 | 33 | cout << "Max error: " << maxError << endl; 34 | 35 | 36 | delete[] a_h; 37 | delete[] b_h; 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /setup_env_lumi: -------------------------------------------------------------------------------- 1 | # Module environment 2 | ml PrgEnv-cray 3 | ml craype-accel-amd-gfx90a 4 | ml rocm/6.0.3 5 | 6 | # Environment variables for compiling 7 | export CXX=CC 8 | export CXXFLAGS='-xhip -O3' 9 | 10 | # Aliases for easy running 11 | alias runit='srun --reservation=HIPcourse --account=project_462000877 --partition=small-g --time=00:05:00 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gpus-per-task=1' 12 | -------------------------------------------------------------------------------- /streams/01-event-record/README.md: -------------------------------------------------------------------------------- 1 | # Understanding asynchronicity using events 2 | 3 | The purpose of this exercise is to understand asynchronous operations and how they can be timed using HIP events. In the skeleton, the timing has been implemented using the `<chrono>` header and the `clock_t` type. This attempt to time asynchronous events, however, fails to measure the timings correctly. Your task is to implement the timings correctly using HIP events (you don't have to remove the `clock_t` timings, you can leave them in place to explore the difference). The locations where modifications are required are marked with `#error` together with an instruction. Basically, your task is to measure and print the timing of a GPU kernel, a device-to-host copy, and their combined time.
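
For orientation, a minimal, self-contained sketch of the event-based timing pattern that this exercise asks for is shown below. It uses a placeholder `dummy_kernel` and placeholder variable names; it is not the exercise solution.

```cpp
// Sketch of timing a kernel with HIP events (illustrative only).
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(int *d_a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d_a[i] = i;
}

int main() {
  const int n = 1 << 20;
  int *d_a;
  hipMalloc((void**)&d_a, n * sizeof(int));

  hipStream_t stream;
  hipStreamCreate(&stream);

  hipEvent_t start, stop;
  hipEventCreate(&start);
  hipEventCreate(&stop);

  hipEventRecord(start, stream);                     // marker enqueued before the kernel
  dummy_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_a, n);
  hipEventRecord(stop, stream);                      // marker enqueued after the kernel
  hipEventSynchronize(stop);                         // block the host until 'stop' has completed

  float ms = 0.0f;
  hipEventElapsedTime(&ms, start, stop);             // elapsed time between the markers in ms
  printf("kernel: %.3f ms\n", ms);

  hipEventDestroy(start);
  hipEventDestroy(stop);
  hipStreamDestroy(stream);
  hipFree(d_a);
  return 0;
}
```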
4 | -------------------------------------------------------------------------------- /streams/01-event-record/record.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define get_mus(X) std::chrono::duration_cast(X).count() 7 | #define chrono_clock std::chrono::high_resolution_clock::now() 8 | 9 | /* A simple GPU kernel definition */ 10 | __global__ void kernel(int *d_a, int n_total) 11 | { 12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | if(idx < n_total) 14 | d_a[idx] = idx; 15 | } 16 | 17 | /* The main function */ 18 | int main(){ 19 | 20 | // Problem size 21 | constexpr int n_total = 1<<22; 22 | 23 | // Device grid sizes 24 | constexpr int blocksize = 256; 25 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize; 26 | 27 | // Allocate host and device memory 28 | int *a, *d_a; 29 | const int bytes = n_total * sizeof(int); 30 | hipHostMalloc((void**)&a, bytes); // host pinned 31 | hipMalloc((void**)&d_a, bytes); // device pinned 32 | 33 | // Create events 34 | #error create the required timing events here 35 | 36 | // Create stream 37 | hipStream_t stream; 38 | hipStreamCreate(&stream); 39 | 40 | // Start timed GPU kernel and device-to-host copy 41 | #error record the events somewhere across the below lines of code 42 | #error such that you can get the timing for the kernel, the 43 | #error memory copy, and the total combined time of these 44 | auto start_kernel_clock = chrono_clock; 45 | kernel<<>>(d_a, n_total); 46 | 47 | auto start_d2h_clock = chrono_clock; 48 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream); 49 | 50 | auto stop_clock = chrono_clock; 51 | hipStreamSynchronize(stream); 52 | 53 | // Exctract elapsed timings from event recordings 54 | #error get the elapsed time from the timing events 55 | 56 | // Check that the results are right 57 | int error = 0; 58 | for(int i = 0; i < n_total; ++i){ 59 | if(a[i] != i) 60 | error = 1; 61 | } 62 | 63 | // Print results 64 | if(error) 65 | printf("Results are incorrect!\n"); 66 | else 67 | printf("Results are correct!\n"); 68 | 69 | // Print event timings 70 | printf("Event timings:\n"); 71 | #error print event timings here 72 | 73 | // Print clock timings 74 | printf("clock_t timings:\n"); 75 | printf(" %.3f ms - kernel\n", 1e3 * (double)get_mus(start_d2h_clock - start_kernel_clock)); 76 | printf(" %.3f ms - device to host copy\n", 1e3 * (double)get_mus(stop_clock - start_d2h_clock)); 77 | printf(" %.3f ms - total time\n", 1e3 * (double)get_mus(stop_clock - start_kernel_clock)); 78 | 79 | // Destroy Stream 80 | hipStreamDestroy(stream); 81 | 82 | // Destroy events 83 | #error destroy events here 84 | 85 | // Deallocations 86 | hipFree(d_a); // Device 87 | hipHostFree(a); // Host 88 | } 89 | -------------------------------------------------------------------------------- /streams/01-event-record/solution/record.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define get_mus(X) std::chrono::duration_cast(X).count() 7 | #define chrono_clock std::chrono::high_resolution_clock::now() 8 | 9 | /* A simple GPU kernel definition */ 10 | __global__ void kernel(int *d_a, int n_total) 11 | { 12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 13 | if(idx < n_total) 14 | d_a[idx] = idx; 15 | } 16 | 17 | /* The main function */ 18 | int main(){ 19 | // Problem size 20 | constexpr int n_total = 1<<22; // pow(2, 
22); 21 | 22 | // Device grid sizes 23 | constexpr int blocksize = 256; 24 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize; 25 | 26 | // Allocate host and device memory 27 | int *a, *d_a; 28 | const int bytes = n_total * sizeof(int); 29 | hipHostMalloc((void**)&a, bytes); // host pinned 30 | hipMalloc((void**)&d_a, bytes); // device pinned 31 | 32 | hipEvent_t pre_kernel, post_kernel, end_event; 33 | // Create events 34 | hipEventCreate(&pre_kernel); 35 | hipEventCreate(&post_kernel); 36 | hipEventCreate(&end_event); 37 | float timing_a, timing_b, timing_c; 38 | 39 | // Create stream 40 | hipStream_t stream; 41 | hipStreamCreate(&stream); 42 | 43 | // Start timed GPU kernel and device-to-host copy 44 | hipEventRecord(pre_kernel, stream); 45 | auto start_time = chrono_clock; 46 | 47 | kernel<<>>(d_a, n_total); 48 | 49 | // Record event after kernel execution 50 | hipEventRecord(post_kernel, stream); 51 | auto d2h_time = chrono_clock; 52 | 53 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream); 54 | 55 | // Record event after D2H memory copy 56 | hipEventRecord(end_event, stream); 57 | auto end_time = chrono_clock; 58 | 59 | hipStreamSynchronize(stream); 60 | 61 | // Exctract elapsed timings from event recordings 62 | hipEventElapsedTime(&timing_a, pre_kernel, post_kernel); 63 | hipEventElapsedTime(&timing_b, post_kernel, end_event); 64 | hipEventElapsedTime(&timing_c, pre_kernel, end_event); 65 | 66 | // Check that the results are right 67 | int error = 0; 68 | for(int i = 0; i < n_total; ++i){ 69 | if(a[i] != i) 70 | error = 1; 71 | } 72 | 73 | // Print results 74 | if(error) 75 | printf("Results are incorrect!\n"); 76 | else 77 | printf("Results are correct!\n"); 78 | 79 | // Print event timings 80 | printf("Event timings:\n"); 81 | printf(" %.3f ms - kernel\n", (timing_a) ); 82 | printf(" %.3f ms - D2H copy\n", (timing_b) ); 83 | printf(" %.3f ms - total time\n", (timing_c) ); 84 | /* #error print event timings here */ 85 | 86 | // Print clock timings 87 | printf("std::chrono timings:\n"); 88 | printf(" %.3f ms - kernel\n", 1e3 * ((double)get_mus(d2h_time - start_time)) / CLOCKS_PER_SEC); 89 | printf(" %.3f ms - device to host copy\n", 1e3 * ((double)get_mus(end_time - d2h_time)) / CLOCKS_PER_SEC); 90 | printf(" %.3f ms - total time\n", 1e3 * (double)get_mus(end_time-start_time) / CLOCKS_PER_SEC); 91 | 92 | // Destroy Stream 93 | hipStreamDestroy(stream); 94 | 95 | // Destroy events 96 | /* #error destroy events here */ 97 | hipEventDestroy(pre_kernel); 98 | hipEventDestroy(post_kernel); 99 | hipEventDestroy(end_event); 100 | 101 | // Deallocations 102 | hipFree(d_a); // Device 103 | hipHostFree(a); // Host 104 | } 105 | -------------------------------------------------------------------------------- /streams/02-concurrency/README.md: -------------------------------------------------------------------------------- 1 | # Investigating streams and events 2 | 3 | This exercise demonstrates an asynchronous data transfer and computation. Three different asynchronous cases are created, and their timings are printed out. The timings are recorded with hipEvent calls. 
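
The cases below all build on the same per-stream pattern: give each stream its own slice of the data, and issue that slice's host-to-device copy, kernel launch, and device-to-host copy into that stream. The following is a rough sketch of that pattern with placeholder names (`add_one`, `hA`, `dA`); it is not the exercise solution.

```cpp
// Minimal sketch of splitting work across HIP streams (illustrative only).
#include <hip/hip_runtime.h>
#include <cstdio>

constexpr int n_stream = 4;
constexpr int N = 1 << 22;          // assume N divides evenly by n_stream

__global__ void add_one(float *d, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d[i] += 1.0f;
}

int main() {
  float *hA, *dA;
  hipHostMalloc((void**)&hA, N * sizeof(float));   // pinned host memory, needed for async copies
  hipMalloc((void**)&dA, N * sizeof(float));
  for (int i = 0; i < N; ++i) hA[i] = 1.0f;

  hipStream_t stream[n_stream];
  for (int i = 0; i < n_stream; ++i) hipStreamCreate(&stream[i]);

  const int slice = N / n_stream;
  for (int i = 0; i < n_stream; ++i) {
    const int offset = i * slice;
    // Each stream copies, computes and copies back only its own slice
    hipMemcpyAsync(&dA[offset], &hA[offset], slice * sizeof(float),
                   hipMemcpyHostToDevice, stream[i]);
    add_one<<<(slice + 255) / 256, 256, 0, stream[i]>>>(&dA[offset], slice);
    hipMemcpyAsync(&hA[offset], &dA[offset], slice * sizeof(float),
                   hipMemcpyDeviceToHost, stream[i]);
  }
  for (int i = 0; i < n_stream; ++i) hipStreamSynchronize(stream[i]);

  printf("hA[0] = %.1f (expected 2.0)\n", hA[0]);

  for (int i = 0; i < n_stream; ++i) hipStreamDestroy(stream[i]);
  hipFree(dA);
  hipHostFree(hA);
  return 0;
}
```

In the cases you will additionally wrap these operations with `hipEventRecord()` calls to time them, as described in the instructions below.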
4 | 5 | ## Instructions 6 | 7 | In the exercise, the following HIP functions are needed: 8 | 9 | * `hipStreamCreate()` 10 | * `hipMemcpyAsync()` 11 | * `hipEventRecord()` 12 | * `hipEventSynchronize()` 13 | * `hipEventElapsedTime()` 14 | * `hipStreamDestroy()` 15 | 16 | ### Case 0 17 | 18 | 1) Create and destroy `n_stream` streams in the main function in the locations marked by `#error` 19 | 2) The function `case_0()` is already complete and can be used as a reference 20 | 21 | ### Case 1 22 | 23 | 1) In the `case_1()` function, create a loop over `n_stream` and split the work done by the kernel call of Case 0 into multiple kernel calls (one kernel call per stream with an even workload per stream) 24 | 2) Record events using the `start_event` and `stop_event` arrays for each stream before and after the kernel call 25 | 26 | ### Case 2 27 | 28 | 1) Create a loop in the function `case_2()` 29 | 1) In the loop: Split the data copy from host to device into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly for each stream) 30 | 2) In the loop: Launch the kernel for each stream similarly to Case 1 31 | 3) In the loop: Split the data copy from device to host into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly and issued asynchronously) 32 | 2) Record the total timing of the loop, using the `start_event[n_stream]` and `stop_event[n_stream]` array positions 33 | 3) Additionally, record events for each stream using the `start_event` and `stop_event` arrays before the H-to-D memcopy and after the D-to-H memcopy, respectively 34 | 4) Synchronize the host with each `stop_event[i]` 35 | 5) Get the timings between each corresponding `start_event[i]` and `stop_event[i]` 36 | 37 | ### Case 3 38 | 39 | 1) Copy the Case 2 implementation here 40 | 2) Instead of doing the asynchronous memcopies and the kernel in the same loop as in Case 2, create a separate loop for each (3 loops in total) 41 | 3) Make sure you record events in appropriate locations to get correct timings 42 | 43 | ## Additional considerations 44 | 45 | * You can try setting `USE_PINNED_HOST_MEM` to `0` at line `#6` to see how the timings change if we do not use pinned host memory. 46 | -------------------------------------------------------------------------------- /third-party/hipcub/hipcub.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define hipcub cub -------------------------------------------------------------------------------- /third-party/hiprand/hiprand_hcc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software.
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | #ifndef HIPRAND_HCC_H_ 22 | #define HIPRAND_HCC_H_ 23 | 24 | #include 25 | 26 | typedef rocrand_generator_base_type hiprandGenerator_st; 27 | 28 | typedef struct rocrand_discrete_distribution_st hiprandDiscreteDistribution_st; 29 | 30 | #endif // HIPRAND_HCC_H_ 31 | -------------------------------------------------------------------------------- /third-party/hiprand/hiprand_kernel.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | #ifndef HIPRAND_KERNEL_H_ 22 | #define HIPRAND_KERNEL_H_ 23 | 24 | #ifndef QUALIFIERS 25 | #define QUALIFIERS __forceinline__ __device__ 26 | #endif // QUALIFIERS 27 | 28 | #include 29 | #include 30 | 31 | /** \addtogroup hipranddevice 32 | * 33 | * @{ 34 | */ 35 | 36 | /** 37 | * \def HIPRAND_PHILOX4x32_DEFAULT_SEED 38 | * \brief Default seed for PHILOX4x32 PRNG. 39 | */ 40 | #define HIPRAND_PHILOX4x32_DEFAULT_SEED 0ULL 41 | /** 42 | * \def HIPRAND_XORWOW_DEFAULT_SEED 43 | * \brief Default seed for XORWOW PRNG. 44 | */ 45 | #define HIPRAND_XORWOW_DEFAULT_SEED 0ULL 46 | /** 47 | * \def HIPRAND_MRG32K3A_DEFAULT_SEED 48 | * \brief Default seed for MRG32K3A PRNG. 49 | */ 50 | #define HIPRAND_MRG32K3A_DEFAULT_SEED 12345ULL 51 | /** @} */ // end of group hipranddevice 52 | 53 | #if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__) 54 | #include "hiprand/hiprand_kernel_hcc.h" 55 | #else 56 | #include "hiprand/hiprand_kernel_nvcc.h" 57 | #endif 58 | 59 | #endif // HIPRAND_KERNEL_H_ 60 | -------------------------------------------------------------------------------- /third-party/hiprand/hiprand_nvcc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved. 
2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | #ifndef HIPRAND_NVCC_H_ 22 | #define HIPRAND_NVCC_H_ 23 | 24 | #include 25 | 26 | typedef struct curandGenerator_st hiprandGenerator_st; 27 | 28 | typedef struct curandDiscreteDistribution_st hiprandDiscreteDistribution_st; 29 | 30 | #endif // HIPRAND_NVCC_H_ 31 | -------------------------------------------------------------------------------- /third-party/hiprand/hiprand_version.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved. 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | #ifndef HIPRAND_VERSION_H_ 22 | #define HIPRAND_VERSION_H_ 23 | 24 | /// \def HIPRAND_VERSION 25 | /// \brief hipRAND library version 26 | /// 27 | /// Version number may not be visible in the documentation. 28 | /// 29 | /// HIPRAND_VERSION % 100 is the patch level, 30 | /// HIPRAND_VERSION / 100 % 1000 is the minor version, 31 | /// HIPRAND_VERSION / 100000 is the major version. 32 | /// 33 | /// For example, if HIPRAND_VERSION is 100500, then 34 | /// the major version is 1, the minor version is 5, and 35 | /// the patch level is 0. 
36 | #define HIPRAND_VERSION 100500 37 | 38 | #endif // HIPRAND_VERSION_H_ 39 | --------------------------------------------------------------------------------