>>(n, d_x, a, Nz);
131 | synchronize;
132 | endtime
133 | if (my_repeat_counter > 1) std::cout << "\n";
134 |
135 | Nz += 1;
136 | }
137 |
138 | // copy data to the host and print
139 | /* HIP_CHECK(hipMemcpy(x, d_x, sizeof(double) * n, hipMemcpyDeviceToHost)); */
140 | /* printf("%f %f %f %f ... %f %f\n", */
141 | /* x[0], x[1], x[2], x[3], x[n-2], x[n-1]); */
142 |
143 | return 0;
144 | }
145 |
--------------------------------------------------------------------------------
/docs/05-fortran.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Fortran and HIP
3 | subtitle: GPU programming with HIP
4 | author: CSC Training
5 | date: 2025-03
6 | lang: en
7 | ---
8 |
9 | # Fortran
10 |
11 | * No native GPU support in Fortran:
12 | - HIP functions are callable from C, using wrappers; compiled with hipcc
13 | - interoperability with Fortran via `iso_c_binding`
14 | - linking with Fortran or `hipcc`
15 | * Fortran + HIP:
16 | - needs wrappers and interfaces for all HIP calls
17 | * Hipfort:
18 | - Fortran Interface For GPU Kernel Libraries
19 | - HIP: HIP runtime, hipBLAS, hipSPARSE, hipFFT, hipRAND, hipSOLVER
20 | - ROCm: rocBLAS, rocSPARSE, rocFFT, rocRAND, rocSOLVER
21 |         - memory management: `hipMalloc`, `hipMemcpy` (see the sketch below)
22 |
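To make the pattern concrete before the full SAXPY walk-through, here is a minimal, illustrative sketch of hipfort-style memory management from Fortran. It is not one of the course examples; it assumes hipfort is installed and only allocates device memory, copies host data over, and frees it.

```fortran
! Minimal sketch (assumes hipfort is available): device allocation,
! host-to-device copy, and cleanup, with every HIP call checked.
program hipfort_memory_sketch
  use iso_c_binding
  use hipfort        ! HIP runtime bindings: hipMalloc, hipMemcpy, hipFree, ...
  use hipfort_check  ! hipCheck() aborts with a message if a HIP call fails
  implicit none

  real, allocatable, target :: x(:)   ! host array
  type(c_ptr) :: dx = c_null_ptr      ! device memory is handled as a plain c_ptr
  integer(c_size_t) :: nbytes

  allocate(x(1000)); x = 1.0
  nbytes = size(x, kind=c_size_t) * 4_c_size_t   ! 4 bytes per default real

  call hipCheck(hipMalloc(dx, nbytes))                                  ! allocate on the device
  call hipCheck(hipMemcpy(dx, c_loc(x), nbytes, hipMemcpyHostToDevice)) ! host -> device
  call hipCheck(hipFree(dx))
  deallocate(x)
end program hipfort_memory_sketch
```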
23 | # HIPFort for SAXPY (`Y=Y+a*X`): Fortran Code
24 |
25 |
26 | ```fortran
27 | program saxpy
28 | use iso_c_binding
29 | use hipfort
30 | use hipfort_check
31 |
32 | implicit none
33 | interface
34 |   subroutine launch(dy,dx,a,N) bind(c)
35 | use iso_c_binding
36 | implicit none
37 | type(c_ptr),value :: dy,dx
38 | integer, value :: N
39 | real, value :: a
40 | end subroutine
41 | end interface
42 |
43 | type(c_ptr) :: dx = c_null_ptr
44 | type(c_ptr) :: dy = c_null_ptr
45 | integer, parameter :: N = 400000000
46 | integer(c_size_t), parameter :: bytes_per_element = 4
47 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element
48 | real, allocatable,target,dimension(:) :: x, y
49 | real, parameter :: a=2.0
50 | ```
51 |
52 |
53 |
54 | ```fortran
55 | allocate(x(N), y(N))
56 |
57 | x = 1.0; y = 2.0
58 |
59 | call hipCheck(hipMalloc(dx,Nbytes))
60 | call hipCheck(hipMalloc(dy,Nbytes))
61 |
62 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice))
63 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice))
64 |
65 | call launch(dy, dx, a, N)
66 |
67 | call hipCheck(hipDeviceSynchronize())
68 |
69 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost))
70 |
71 | write(*,*) "Max error: ", maxval(abs(y-4.0))
72 |
73 | call hipCheck(hipFree(dx));call hipCheck(hipFree(dy))
74 |
75 | deallocate(x);deallocate(y)
76 |
77 | end program saxpy
78 | ```
79 |
80 |
81 |
82 | # HIPFort for SAXPY (`Y=Y+a*X`): HIP code
83 |
84 | ```cpp
85 | #include <hip/hip_runtime.h>
86 | #include <cmath>
87 |
88 | __global__ void saxpy(float *dy, float *dx,
89 | float a, int n)
90 | {
91 | int i = blockDim.x*blockIdx.x+threadIdx.x;
92 | if (i < n) {
93 | dy[i] = dy[i] + a*dx[i];
94 | }
95 | }
96 | ```
97 |
98 |
99 |
100 |
101 | ``` cpp
102 | extern "C"{
103 | void launch(float *dy, float *dx,
104 | float a, int N)
105 | {
106 | dim3 tBlock(256,1,1);
107 | dim3 grid(ceil((float)N/tBlock.x),1,1);
108 |
109 |     saxpy<<<grid, tBlock>>>(dy, dx, a, N);
110 | }
111 | }
112 | ```
113 |
114 |
115 | # Compilation
116 |
117 | **NVIDIA: Mahti**
118 | ```
119 | gfortran -I$HIPFORT_HOME/include/hipfort/nvptx "-DHIPFORT_ARCH=\"nvptx\"" \
120 |          -L$HIPFORT_HOME/lib -lhipfort-nvptx -c <fortran_code>.f90
121 |
122 | hipcc "--gpu-architecture=sm_80" --x cu -c <hip_code>.cpp
123 |
124 | hipcc -lgfortran "--gpu-architecture=sm_80" -I$HIPFORT_HOME/include/hipfort/nvptx \
125 |       -L$HIPFORT_HOME/lib/ -lhipfort-nvptx <fortran_code>.o <hip_code>.o -o main
126 | ```
127 | **AMD: LUMI**
128 | ```
129 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \
130 |     -L$HIPFORT_HOME/lib -lhipfort-amdgcn -c <fortran_code>.f90
131 |
132 | hipcc --offload-arch=gfx90a -c <hip_code>.cpp
133 |
134 | ftn -I$HIPFORT_HOME/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" \
135 |     -L$HIPFORT_HOME/lib -lhipfort-amdgcn <fortran_code>.o <hip_code>.o -o main
136 | ```
137 |
138 |
139 | # Summary
140 |
141 | * No native GPU support in Fortran
142 | * HIP functions are callable from C, using `extern "C"`
143 | - `iso_c_binding`
144 | - GPU objects are of type `c_ptr` in Fortran
145 | * Hipfort provides Fortran interfaces for GPU libraries
146 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
2 | FACTORY=docker
3 | OPTIONS=run -it --rm -v "$(ROOT_DIR)":"$(ROOT_DIR)":Z -w "$(ROOT_DIR)" ghcr.io/csc-training/slidefactory:3.2.0-beta.1
4 |
5 | SRC=$(wildcard *.md)
6 | HTML=$(patsubst %.md,%.html,$(SRC))
7 | PDF=$(patsubst %.md,%.pdf,$(SRC))
8 |
9 | .PHONY: html pdf clean
10 |
11 | html: $(HTML)
12 |
13 | pdf: $(PDF)
14 |
15 | clean:
16 | -rm -f $(HTML) $(PDF)
17 |
18 | %.html: %.md
19 | $(FACTORY) $(OPTIONS) slides --format html $<
20 |
21 | %.pdf: %.md
22 | $(FACTORY) $(OPTIONS) slides --format pdf $<
23 |
--------------------------------------------------------------------------------
/docs/img/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/01.png
--------------------------------------------------------------------------------
/docs/img/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/04.png
--------------------------------------------------------------------------------
/docs/img/AMD-GCN-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/AMD-GCN-3.png
--------------------------------------------------------------------------------
/docs/img/BankConflicts.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/BankConflicts.jpeg
--------------------------------------------------------------------------------
/docs/img/CU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CU.png
--------------------------------------------------------------------------------
/docs/img/CUgray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/CUgray.png
--------------------------------------------------------------------------------
/docs/img/NoBankConflicts.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/NoBankConflicts.jpeg
--------------------------------------------------------------------------------
/docs/img/ThreadExecution.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution.jpg
--------------------------------------------------------------------------------
/docs/img/ThreadExecution_new.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/ThreadExecution_new.jpg
--------------------------------------------------------------------------------
/docs/img/a100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100.png
--------------------------------------------------------------------------------
/docs/img/a100_fp32_core.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_fp32_core.png
--------------------------------------------------------------------------------
/docs/img/a100_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_sm.png
--------------------------------------------------------------------------------
/docs/img/a100_smsp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/a100_smsp.png
--------------------------------------------------------------------------------
/docs/img/amd_computeunit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_computeunit.png
--------------------------------------------------------------------------------
/docs/img/amd_instinct_mi250x_oam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_instinct_mi250x_oam.png
--------------------------------------------------------------------------------
/docs/img/amd_m200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_m200.png
--------------------------------------------------------------------------------
/docs/img/amd_mi200.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.jpg
--------------------------------------------------------------------------------
/docs/img/amd_mi200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/amd_mi200.png
--------------------------------------------------------------------------------
/docs/img/arrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/arrow.png
--------------------------------------------------------------------------------
/docs/img/block_sm_cu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/block_sm_cu.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_1.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_3.png
--------------------------------------------------------------------------------
/docs/img/coalesced_access_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/coalesced_access_4.png
--------------------------------------------------------------------------------
/docs/img/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/comparison.png
--------------------------------------------------------------------------------
/docs/img/copy_d2h.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_d2h.png
--------------------------------------------------------------------------------
/docs/img/copy_h2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/copy_h2d.png
--------------------------------------------------------------------------------
/docs/img/cpu_waits_on_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cpu_waits_on_gpu.png
--------------------------------------------------------------------------------
/docs/img/cu_sm_eu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cu_sm_eu.png
--------------------------------------------------------------------------------
/docs/img/cublas_cuda_hip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/cublas_cuda_hip.png
--------------------------------------------------------------------------------
/docs/img/do_this_computation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/do_this_computation.png
--------------------------------------------------------------------------------
/docs/img/execution-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/execution-model.png
--------------------------------------------------------------------------------
/docs/img/gpu-bws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-bws.png
--------------------------------------------------------------------------------
/docs/img/gpu-cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu-cluster.png
--------------------------------------------------------------------------------
/docs/img/gpuConnect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpuConnect.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_a_wide_vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_a_wide_vector_unit.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_cus_sms_eus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_cus_sms_eus.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_vector_units.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units.png
--------------------------------------------------------------------------------
/docs/img/gpu_as_vector_units_instructions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_as_vector_units_instructions.png
--------------------------------------------------------------------------------
/docs/img/gpu_is_a_separate_processor_with_own_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpu_is_a_separate_processor_with_own_memory.png
--------------------------------------------------------------------------------
/docs/img/gpufort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort.png
--------------------------------------------------------------------------------
/docs/img/gpufort1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort1.png
--------------------------------------------------------------------------------
/docs/img/gpufort2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/gpufort2.png
--------------------------------------------------------------------------------
/docs/img/grid-threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid-threads.png
--------------------------------------------------------------------------------
/docs/img/grid_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/grid_gpu.png
--------------------------------------------------------------------------------
/docs/img/hipblas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipblas.png
--------------------------------------------------------------------------------
/docs/img/hipfort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/hipfort.png
--------------------------------------------------------------------------------
/docs/img/kernel_cuda_hip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/kernel_cuda_hip.png
--------------------------------------------------------------------------------
/docs/img/lumi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.jpg
--------------------------------------------------------------------------------
/docs/img/lumi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/lumi.png
--------------------------------------------------------------------------------
/docs/img/many_blocks_to_one_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/many_blocks_to_one_sm.png
--------------------------------------------------------------------------------
/docs/img/memlayout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memlayout.png
--------------------------------------------------------------------------------
/docs/img/memory-hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memory-hierarchy.png
--------------------------------------------------------------------------------
/docs/img/memsch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/memsch.png
--------------------------------------------------------------------------------
/docs/img/mi100-architecture.info:
--------------------------------------------------------------------------------
1 | Source:
2 | Introducing AMD CDNA Architecture,
3 | https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf
4 |
5 | Caption:
6 | Block diagram of the AMD Instinct MI100 accelerator, the first GPU
7 | powered by the AMD CDNA architecture.
8 |
--------------------------------------------------------------------------------
/docs/img/mi100-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100-architecture.png
--------------------------------------------------------------------------------
/docs/img/mi100_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi100_arch.png
--------------------------------------------------------------------------------
/docs/img/mi250x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x.png
--------------------------------------------------------------------------------
/docs/img/mi250x_cu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu.png
--------------------------------------------------------------------------------
/docs/img/mi250x_cu_simd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/mi250x_cu_simd.png
--------------------------------------------------------------------------------
/docs/img/microprocessor-trend-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/microprocessor-trend-data.png
--------------------------------------------------------------------------------
/docs/img/model_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/model_gpu.png
--------------------------------------------------------------------------------
/docs/img/new_hipfort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/new_hipfort.png
--------------------------------------------------------------------------------
/docs/img/no_block_to_many_sm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/no_block_to_many_sm.png
--------------------------------------------------------------------------------
/docs/img/not_gpu_as_a_wide_vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/not_gpu_as_a_wide_vector_unit.png
--------------------------------------------------------------------------------
/docs/img/oned_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_block.png
--------------------------------------------------------------------------------
/docs/img/oned_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/oned_grid.png
--------------------------------------------------------------------------------
/docs/img/parallel_regions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parallel_regions.png
--------------------------------------------------------------------------------
/docs/img/parflow_single_node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/parflow_single_node.png
--------------------------------------------------------------------------------
/docs/img/perfetto.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/perfetto.png
--------------------------------------------------------------------------------
/docs/img/runtimes_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/runtimes_annotated.png
--------------------------------------------------------------------------------
/docs/img/scalar_operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/scalar_operation.png
--------------------------------------------------------------------------------
/docs/img/single_proc_mpi_gpu2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_mpi_gpu2.png
--------------------------------------------------------------------------------
/docs/img/single_proc_multi_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_multi_gpu.png
--------------------------------------------------------------------------------
/docs/img/single_proc_thread_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/single_proc_thread_gpu.png
--------------------------------------------------------------------------------
/docs/img/software_hardware_mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/software_hardware_mapping.png
--------------------------------------------------------------------------------
/docs/img/streams-example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-1.png
--------------------------------------------------------------------------------
/docs/img/streams-example-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams-example-2.png
--------------------------------------------------------------------------------
/docs/img/streams.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams.png
--------------------------------------------------------------------------------
/docs/img/streams1_explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams1_explain.png
--------------------------------------------------------------------------------
/docs/img/streams2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2.png
--------------------------------------------------------------------------------
/docs/img/streams2_explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/streams2_explain.png
--------------------------------------------------------------------------------
/docs/img/thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread.png
--------------------------------------------------------------------------------
/docs/img/thread_lane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/thread_lane.png
--------------------------------------------------------------------------------
/docs/img/threed_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/threed_block.png
--------------------------------------------------------------------------------
/docs/img/top500-perf-dev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-perf-dev.png
--------------------------------------------------------------------------------
/docs/img/top500-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/top500-performance.png
--------------------------------------------------------------------------------
/docs/img/transpose_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/transpose_img.png
--------------------------------------------------------------------------------
/docs/img/twod_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_block.png
--------------------------------------------------------------------------------
/docs/img/twod_grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/twod_grid.png
--------------------------------------------------------------------------------
/docs/img/vector_operation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_operation.png
--------------------------------------------------------------------------------
/docs/img/vector_unit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/vector_unit.png
--------------------------------------------------------------------------------
/docs/img/virtual_memory_addressing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/virtual_memory_addressing.png
--------------------------------------------------------------------------------
/docs/img/warp_wavefron_smsp_simd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/docs/img/warp_wavefron_smsp_simd.png
--------------------------------------------------------------------------------
/docs/index:
--------------------------------------------------------------------------------
1 | # List of PDFs to jam together
2 | # an index file for jam-it.sh (https://github.com/mlouhivu/jam-it)
3 |
4 | @title-course.pdf
5 |
6 | @title-intro.pdf
7 | 01-introduction.pdf
8 |
9 | @title-kernels.pdf
10 | 02-kernels.pdf
11 |
12 | @title-streams.pdf
13 | 03-streams.pdf
14 |
15 | @title-memory.pdf
16 | 04-memory.pdf
17 |
18 | @title-fortran.pdf
19 | 05-fortran.pdf
20 |
21 | @title-optimisation.pdf
22 | 06-optimisation.pdf
23 |
24 | @title-multi-gpu.pdf
25 | 07-multi-gpu.pdf
26 |
--------------------------------------------------------------------------------
/first_steps.md:
--------------------------------------------------------------------------------
1 | ## Accessing LUMI
2 |
3 | Are you able to `ssh` to LUMI? If not, have you followed the instructions [here](https://docs.lumi-supercomputer.eu/firststeps/)?
4 |
5 | If you haven't added the ssh-key correctly or cannot otherwise `ssh` to LUMI, you can use the [web interface](https://www.lumi.csc.fi/public/).
6 |
7 | See the [documentation](https://docs.lumi-supercomputer.eu/firststeps/loggingin-webui/) for more help.
8 |
9 | ## Getting the course material
10 |
11 | You can clone this git repository with `git clone https://github.com/csc-training/hip-programming.git`.
12 |
13 | This way you get local access to the lectures, as well as the exercises (which you need to run on LUMI).
14 |
15 | ## Using slurm
16 |
17 | Supercomputers like LUMI are shared resources, meaning multiple users are using them at the same time.
18 | To run something on LUMI, you need to use SLURM to submit a job.
19 |
20 | Read the [LUMI documentation](https://docs.lumi-supercomputer.eu/runjobs/) on running jobs to find out more.
21 |
22 | ## Motivation for the course
23 |
24 | Why do we teach GPU programming? Why should you learn to program GPUs?
25 |
26 | Because most of the Top 500 supercomputers use (and derive most of their compute capability from) GPUs:
27 | if you use any of these supercomputers, you cannot avoid using GPUs.
28 |
29 | Why are most of the Top 500 supercomputers using GPUs?
30 |
31 | 1. Because GPUs are designed and optimized to solve problems commonly encountered in HPC and ML/AI: floating point operations, matrix multiplications.
32 | 2. Because of power limitations: performance per Watt is much greater for GPUs than CPUs: https://top500.org/statistics/efficiency-power-cores/
33 |
--------------------------------------------------------------------------------
/hipfort/hiprand/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS =
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.mod: %.F90
23 | $(FC) $(FCFLAGS) -c $<
24 | clean:
25 | rm -f pi *.o *.mod
26 |
--------------------------------------------------------------------------------
/hipfort/hiprand/img/pi_MC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/csc-training/hip-programming/037b271a5c963c65b2037714ea0695f6be6d81dd/hipfort/hiprand/img/pi_MC.png
--------------------------------------------------------------------------------
/hipfort/hiprand/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | ! TODO Add here the necessary modules for the GPU operation
5 |
6 |
7 | !OPTIONAL
8 | !TODO write an interface to the C wrapper which calls the reduction kernel.
9 |
10 | implicit none
11 |
12 | integer(kind=INT64) :: nsamples
13 | character(len=85) :: arg
14 | real :: pi1, pi2
15 | integer(c_size_t):: Nbytes
16 |
17 | if (command_argument_count() /= 1) then
18 | STOP 'Usage pi N where N is the number of samples'
19 | end if
20 |
21 | call get_command_argument(1, arg)
22 | read(arg, *) nsamples
23 |
24 | pi1 = cpu_pi(nsamples)
25 | write(*,*) 'Pi calculated with CPU', pi1
26 | pi2 = gpu_pi(nsamples)
27 | write(*,*) 'Pi calculated with GPU', pi2
28 |
29 | contains
30 |
31 | real function cpu_pi(n)
32 | implicit none
33 | integer(kind=INT64) :: n
34 | integer :: i, inside
35 |
36 | real, allocatable:: x(:),y(:)
37 |
38 |
39 | allocate(x(1:n))
40 | allocate(y(1:n))
41 |
42 | call random_number(x)
43 | call random_number(y)
44 |
45 | inside = 0
46 | do i = 1, n
47 | if (x(i)**2 + y(i)**2 < 1.0) then
48 | inside = inside + 1
49 | end if
50 | end do
51 |
52 | cpu_pi = 4.0 * real(inside) / real(n)
53 |
54 | end function cpu_pi
55 |
56 |
57 |
58 | real function gpu_pi(n)
59 | use hipfort
60 | use hipfort_check
61 | use hipfort_hiprand
62 | implicit none
63 | integer(kind=INT64) :: n
64 | integer :: i, inside
65 | type(c_ptr) :: gen = c_null_ptr
66 | type(c_ptr) :: x_d,y_d
67 | real(c_float), allocatable,target :: x(:),y(:)
68 | integer(c_size_t) :: istat
69 |
70 | allocate(x(1:n))
71 | allocate(y(1:n))
72 | Nbytes=sizeof(x)
73 |
74 | inside = 0
75 |   ! Initialization for the (optional) bonus task. Alternatively, one could initialize inside_d using a HIP kernel
76 | ! Sbytes = sizeof(inside)
77 | ! call hipCheck(hipMalloc(inside_d,Sbytes))
78 | ! call hipCheck(hipMemcpy( inside_d,c_loc(inside), Sbytes, hipMemcpyHostToDevice))
79 |
80 | !Allocate memory for the gpu arrays
81 |
82 | ! TODO Initialize the gpu random number generator
83 |
84 | ! TODO Fill the arrays x and y with random uniform distributed numbers
85 |
86 | ! TODO copy the random numbers from GPU to CPU
87 |
88 | ! TODO Bonus exercise: replace the below reduction loop done on the CPU with a GPU kernel
89 | ! The kernel is in the hip_kernels.cpp file.
90 |      ! You need to implement an interface to call the C function similarly to the saxpy example
91 | ! Note that in this case there is no need to transfer the x and y arrays to CPU,
92 | ! You only need to copy the final result, inside_d
93 |
94 | do i = 1, n
95 | if (x(i)**2 + y(i)**2 < 1.0) then
96 | inside = inside + 1
97 | end if
98 | end do
99 |
100 | gpu_pi = 4.0 * real(inside) / real(n)
101 |
102 | deallocate(x, y)
103 | end function gpu_pi
104 | end program
105 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS = -lhiprand
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.mod: %.F90
23 | $(FC) $(FCFLAGS) -c $<
24 | clean:
25 | rm -f pi *.o *.mod
26 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | use hipfort
5 | use hipfort_check
6 | use hipfort_hiprand
7 |
8 | implicit none
9 |
10 | integer(kind=INT64) :: nsamples
11 | character(len=85) :: arg
12 | real :: pi1, pi2
13 | integer(c_size_t):: Nbytes
14 |
15 | if (command_argument_count() /= 1) then
16 | STOP 'Usage pi N where N is the number of samples'
17 | end if
18 |
19 | call get_command_argument(1, arg)
20 | read(arg, *) nsamples
21 |
22 | pi1 = cpu_pi(nsamples)
23 | write(*,*) 'Pi calculated with CPU', pi1
24 | pi2 = gpu_pi(nsamples)
25 | write(*,*) 'Pi calculated with GPU', pi2
26 |
27 | contains
28 |
29 | real function cpu_pi(n)
30 | implicit none
31 | integer(kind=INT64) :: n
32 | integer :: i, inside
33 |
34 | real, allocatable:: x(:),y(:)
35 |
36 |
37 | allocate(x(1:n))
38 | allocate(y(1:n))
39 |
40 | call random_number(x)
41 | call random_number(y)
42 |
43 | inside = 0
44 | do i = 1, n
45 | if (x(i)**2 + y(i)**2 < 1.0) then
46 | inside = inside + 1
47 | end if
48 | end do
49 |
50 | cpu_pi = 4.0 * real(inside) / real(n)
51 |
52 | end function cpu_pi
53 |
54 |
55 |
56 | real function gpu_pi(n)
57 | use hipfort
58 | use hipfort_check
59 | use hipfort_hiprand
60 | implicit none
61 | integer(kind=INT64) :: n
62 | integer :: i, inside
63 | type(c_ptr) :: gen = c_null_ptr
64 | type(c_ptr) :: x_d,y_d
65 | real(c_float), allocatable,target :: x(:),y(:)
66 | integer(c_size_t) :: istat
67 |
68 | allocate(x(1:n))
69 | allocate(y(1:n))
70 | Nbytes=sizeof(x)
71 |
72 | call hipCheck(hipMalloc(x_d,Nbytes))
73 | call hipCheck(hipMalloc(y_d,Nbytes))
74 |
75 | inside = 0
76 |
77 |
78 | istat= hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT)
79 |
80 | istat= hiprandGenerateUniform(gen, x_d, n)
81 | istat= hiprandGenerateUniform(gen, y_d, n)
82 |
83 | call hipCheck(hipMemcpy(c_loc(x), x_d, Nbytes, hipMemcpyDeviceToHost))
84 | call hipCheck(hipMemcpy(c_loc(y), y_d, Nbytes, hipMemcpyDeviceToHost))
85 |
86 | do i = 1, n
87 | if (x(i)**2 + y(i)**2 < 1.0) then
88 | inside = inside + 1
89 | end if
90 | end do
91 |
92 | gpu_pi = 4.0 * real(inside) / real(n)
93 |
94 | deallocate(x, y)
95 | end function gpu_pi
96 | end program
97 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/Makefile:
--------------------------------------------------------------------------------
1 | ifeq ($(COMP),)
2 | COMP=lumi
3 | endif
4 |
5 | ifeq ($(COMP),lumi)
6 | HIPFORT_HOME = /projappl/project_462000877/apps/HIPFORT
7 | LIB_FLAGS = -lhiprand
8 | CXX = CC -xhip
9 | FC = ftn -I$(HIPFORT_HOME)/include/hipfort/amdgcn "-DHIPFORT_ARCH=\"amd\"" -L$(HIPFORT_HOME)/lib -lhipfort-amdgcn $(LIB_FLAGS)
10 | endif
11 |
12 | OBJS=pi.o hip_kernels.o
13 |
14 | all: pi
15 |
16 | pi: $(OBJS)
17 | $(FC) -o $@ $(OBJS) $(FCFLAGS)
18 |
19 | %.o: %.F90
20 | $(FC) $(FCFLAGS) -c $< -o $@
21 |
22 | %.o: %.cpp
23 | $(CXX) -c -o $@ $<
24 |
25 | %.mod: %.F90
26 | $(FC) $(FCFLAGS) -c $<
27 | clean:
28 | rm -f pi *.o *.mod
29 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/hip_kernels.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cmath>
3 |
4 | __global__ void countInsideKernel(float *x, float *y, int *inside, int64_t n)
5 | {
6 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
7 |
8 | if (idx < n) {
9 | if (x[idx] * x[idx] + y[idx] * y[idx] < 1.0f) {
10 | // Atomic increment to avoid race condition
11 | atomicAdd(inside, 1);
12 | }
13 | }
14 | }
15 |
16 | extern "C"
17 | {
18 | void launch(float *x, float *y, int *inside_d, int64_t N)
19 | {
20 |
21 | dim3 tBlock(256,1,1);
22 | dim3 grid(ceil((float)N/tBlock.x),1,1);
23 |
24 |    countInsideKernel<<<grid, tBlock>>>(x, y, inside_d, N);
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/hipfort/hiprand/solution_bonus/pi.F90:
--------------------------------------------------------------------------------
1 | program rand_test
2 | use iso_c_binding
3 | use iso_fortran_env, only : INT64
4 | use hipfort
5 | use hipfort_check
6 | use hipfort_hiprand
7 |
8 | implicit none
9 |
10 | interface
11 | subroutine launch(x_d, y_d, inside_d, N) bind(c)
12 | use iso_c_binding
13 | implicit none
14 | type(c_ptr), value :: x_d, y_d, inside_d
15 | integer(c_int64_t), value :: N ! Ensure use of correct C type for INT64
16 | end subroutine
17 | end interface
18 |
19 | integer(c_int64_t) :: nsamples
20 | character(len=85) :: arg
21 | real :: pi1, pi2
22 | integer(c_size_t) :: Nbytes, Sbytes
23 |
24 | if (command_argument_count() /= 1) then
25 | STOP 'Usage: pi N where N is the number of samples'
26 | end if
27 |
28 | call get_command_argument(1, arg)
29 | read(arg, *) nsamples
30 |
31 | pi1 = cpu_pi(nsamples)
32 | write(*,*) 'Pi calculated with CPU', pi1
33 | pi2 = gpu_pi(nsamples)
34 | write(*,*) 'Pi calculated with GPU', pi2
35 |
36 | contains
37 |
38 | real function cpu_pi(n)
39 | implicit none
40 | integer(c_int64_t) :: n
41 | integer :: i, inside
42 |
43 | real, allocatable :: x(:), y(:)
44 |
45 | allocate(x(1:n))
46 | allocate(y(1:n))
47 |
48 | call random_number(x)
49 | call random_number(y)
50 |
51 | inside = 0
52 | do i = 1, n
53 | if (x(i)**2 + y(i)**2 < 1.0) then
54 | inside = inside + 1
55 | end if
56 | end do
57 |
58 | cpu_pi = 4.0 * real(inside) / real(n)
59 |
60 | end function cpu_pi
61 |
62 | real function gpu_pi(n)
63 | use hipfort
64 | use hipfort_check
65 | use hipfort_hiprand
66 | implicit none
67 | integer(c_int64_t) :: n
68 |     integer, target :: inside   ! target attribute needed for c_loc(inside) below
69 | type(c_ptr) :: gen = c_null_ptr
70 | type(c_ptr) :: x_d, y_d, inside_d
71 | real(c_float), allocatable, target :: x(:), y(:)
72 | integer(c_size_t) :: istat
73 |
74 | allocate(x(1:n))
75 | allocate(y(1:n))
76 | Nbytes = sizeof(x)
77 |
78 | call hipCheck(hipMalloc(x_d, Nbytes))
79 | call hipCheck(hipMalloc(y_d, Nbytes))
80 |
81 | istat = hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT)
82 |
83 | istat = hiprandGenerateUniform(gen, x_d, n)
84 | istat = hiprandGenerateUniform(gen, y_d, n)
85 |
86 | inside = 0
87 | Sbytes = sizeof(inside)
88 | call hipCheck(hipMalloc(inside_d, Sbytes))
89 | call hipCheck(hipMemcpy(inside_d, c_loc(inside), Sbytes, hipMemcpyHostToDevice))
90 |
91 | call launch(x_d, y_d, inside_d, n)
92 |
93 | call hipCheck(hipMemcpy(c_loc(inside), inside_d, Sbytes, hipMemcpyDeviceToHost))
94 |
95 | gpu_pi = 4.0 * real(inside) / real(n)
96 |
97 | deallocate(x, y)
98 | end function gpu_pi
99 |
100 | end program rand_test
101 |
--------------------------------------------------------------------------------
/hipfort/saxpy/cuda/main.cuf:
--------------------------------------------------------------------------------
1 | module mathOps
2 | contains
3 | attributes(global) subroutine saxpy(x, y, a)
4 | implicit none
5 | real :: x(:), y(:)
6 | real, value :: a
7 | integer :: i, n
8 | n = size(x)
9 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
10 | if (i <= n) y(i) = y(i) + a*x(i)
11 | end subroutine saxpy
12 | end module mathOps
13 |
14 | program testSaxpy
15 | use mathOps
16 | use cudafor
17 | implicit none
18 | integer, parameter :: N = 40000
19 | real :: x(N), y(N), a
20 | real, device :: x_d(N), y_d(N)
21 | type(dim3) :: grid, tBlock
22 |
23 | tBlock = dim3(256,1,1)
24 | grid = dim3(ceiling(real(N)/tBlock%x),1,1)
25 |
26 | x = 1.0; y = 2.0; a = 2.0
27 | x_d = x
28 | y_d = y
29 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
30 | y = y_d
31 | write(*,*) 'Max error: ', maxval(abs(y-4.0))
32 | end program testSaxpy
33 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/README.md:
--------------------------------------------------------------------------------
1 | # SAXPY using FORTRAN & HIPFORT
2 |
3 | Inspect the `saxpy` code in the present folder. The Fortran code follows the same logic as the HIP C code.
4 | First the data is created on the CPU. Then memory is allocated on the GPU and the data is transferred from the CPU to the GPU. When the transfer is completed, a kernel is executed to perform the work. In the end, the results of the computation are copied back to the CPU and processed further.
5 |
6 | **Note** Fortran compilers cannot compile HIP C code. The GPU code is located in a separate file, [hipsaxpy.cpp](hipsaxpy.cpp). The HIP kernel is launched via a C function which acts as a wrapper. Fortran calls this C wrapper using the `iso_c_binding` module.
7 |
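As a rough sketch (the declaration actually used in this exercise is in [main.f03](main.f03)), the Fortran side makes the wrapper callable by declaring an interface with `bind(c)` and passing the device pointers as `type(c_ptr)`; the explicit C kinds below are chosen for clarity, not taken from the exercise code:

```fortran
! Sketch of the interface to the C wrapper `launch` defined in hipsaxpy.cpp;
! see main.f03 for the version used in this exercise.
interface
  subroutine launch(y, x, a, n) bind(c)
    use iso_c_binding
    implicit none
    type(c_ptr), value :: y, x   ! device pointers obtained from hipMalloc
    real(c_float), value :: a    ! scalar passed by value
    integer(c_int), value :: n
  end subroutine
end interface
```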
8 | In this code all calls to the HIP API are done via HIPFORT. The exercise is to inspect and familiarize yourself with how the memory management (allocations and transfers) is done and how Fortran calls C functions using the `iso_c_binding` module.
9 | If you have previous experience with CUDA Fortran you can compare it to the equivalent code in the [cuda](../cuda) folder.
10 |
11 | In addition to the memory management, HIPFORT also provides bindings for the mathematical libraries running on GPUs. You can find examples of how various `hipxxx` & `rocxxx` libraries are called in `Fortran` programs in the [HIPFORT repository](https://github.com/ROCm/hipfort/tree/develop/test).
12 |
13 | The instructions for compilation are found in the [exercise-instructions page](../../../exercise-instructions.md#hipfort-on-lumi).
14 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/hipsaxpy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cmath>
3 |
4 | __global__ void saxpy(float *y, float *x, float a, int n)
5 | {
6 | size_t i = blockDim.x * blockIdx.x + threadIdx.x;
7 | if (i < n) y[i] = y[i] + a*x[i];
8 | }
9 |
10 |
11 | extern "C"
12 | {
13 | void launch(float *dout, float *da, float db, int N)
14 | {
15 |
16 | dim3 tBlock(256,1,1);
17 | dim3 grid(ceil((float)N/tBlock.x),1,1);
18 |
19 | hipLaunchKernelGGL((saxpy), grid, tBlock, 0, 0, dout, da, db, N);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/hipfort/saxpy/hip/main.f03:
--------------------------------------------------------------------------------
1 | program testSaxpy
2 | use iso_c_binding
3 | use hipfort
4 | use hipfort_check
5 |
6 | implicit none
7 | interface
8 | subroutine launch(y,x,b,N) bind(c)
9 | use iso_c_binding
10 | implicit none
11 | type(c_ptr),value :: y,x
12 | integer, value :: N
13 | real, value :: b
14 | end subroutine
15 | end interface
16 |
17 | type(c_ptr) :: dx = c_null_ptr
18 | type(c_ptr) :: dy = c_null_ptr
19 | integer, parameter :: N = 40000
20 | integer, parameter :: bytes_per_element = 4
21 | integer(c_size_t), parameter :: Nbytes = N*bytes_per_element
22 | real, allocatable,target,dimension(:) :: x, y
23 |
24 |
25 | real, parameter :: a=2.0
26 | real :: x_d(N), y_d(N)
27 |
28 | call hipCheck(hipMalloc(dx,Nbytes))
29 | call hipCheck(hipMalloc(dy,Nbytes))
30 |
31 | allocate(x(N))
32 | allocate(y(N))
33 |
34 | x = 1.0;y = 2.0
35 |
36 | call hipCheck(hipMemcpy(dx, c_loc(x), Nbytes, hipMemcpyHostToDevice))
37 | call hipCheck(hipMemcpy(dy, c_loc(y), Nbytes, hipMemcpyHostToDevice))
38 |
39 | call launch(dy, dx, a, N)
40 |
41 | call hipCheck(hipDeviceSynchronize())
42 |
43 | call hipCheck(hipMemcpy(c_loc(y), dy, Nbytes, hipMemcpyDeviceToHost))
44 |
45 | write(*,*) 'Max error: ', maxval(abs(y-4.0))
46 |
47 | call hipCheck(hipFree(dx))
48 | call hipCheck(hipFree(dy))
49 |
50 | deallocate(x)
51 | deallocate(y)
52 |
53 | end program testSaxpy
54 |
--------------------------------------------------------------------------------
/kernels/01-hello-world/README.md:
--------------------------------------------------------------------------------
1 | # Hello world with HIP
2 |
3 | Compile and run a simple HIP test program provided as `hello.cpp`.
4 |
5 | Please follow the system-specific instructions provided in the
6 | [exercise instructions](../../exercise-instructions.md).
7 |
--------------------------------------------------------------------------------
/kernels/01-hello-world/hello.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | int main(void)
5 | {
6 | int count = 0;
7 | auto result = hipGetDeviceCount(&count);
8 |
9 | int device = 0;
10 | result = hipGetDevice(&device);
11 |
12 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
13 |
14 | return 0;
15 | }
16 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/README.md:
--------------------------------------------------------------------------------
1 | # Error checking with HIP
2 |
3 | Your task is to find a bug in the program, by implementing a HIP API error checking function.
4 | It's a good practice to wrap the API calls with the error checker to find any issues early.
5 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/error-checking.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
5 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
6 | static_assert(false, "TODO: remove me and implement the error checking. "
7 | "(Hint: check the slides)");
8 | }
9 |
10 | int main() {
11 | // There's a bug in this program, find out what it is by implementing the
12 | // function above, and correct it
13 | int count = 0;
14 | HIP_ERRCHK(hipGetDeviceCount(&count));
15 | HIP_ERRCHK(hipSetDevice(count));
16 |
17 | int device = 0;
18 | HIP_ERRCHK(hipGetDevice(&device));
19 |
20 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
21 |
22 | return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/kernels/02-error-checking/solution/error-checking.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <cstdio>
3 |
4 | /* HIP error handling macro */
5 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
6 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
7 | if (result != hipSuccess) {
8 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
9 | line);
10 | exit(EXIT_FAILURE);
11 | }
12 | }
13 |
14 | int main() {
15 | int count = 0;
16 | HIP_ERRCHK(hipGetDeviceCount(&count));
17 | // When setting the device, the argument must be 0 <= arg < #devices
18 | // See
19 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___device.html#ga43c1e7f15925eeb762195ccb5e063eae
20 | // for the API
21 | HIP_ERRCHK(hipSetDevice(count - 1));
22 |
23 | int device = 0;
24 | HIP_ERRCHK(hipGetDevice(&device));
25 |
26 | printf("Hello! I'm GPU %d out of %d GPUs in total.\n", device, count);
27 |
28 | return 0;
29 | }
30 |
--------------------------------------------------------------------------------
/kernels/03-kernel-saxpy/README.md:
--------------------------------------------------------------------------------
1 | # Kernel: saxpy
2 |
3 | Write a device kernel that calculates the single precision BLAS operation
4 | **saxpy**, i.e. `y = a * x + y`.
5 |
6 | - Initialise the vectors `x` and `y` with some values on the CPU
7 | - Perform the computation on the host to generate reference values
8 | - Allocate memory on the device for `x` and `y`
9 | - Copy the host `x` to device `x`, and host `y` to device `y`
10 | - Perform the computation on the device
11 | - Copy the device `y` back to the host `y`
12 | - Confirm the correctness: Is the host computed `y` equal to the device computed `y`?
13 |
14 | You may start from a skeleton code provided in [saxpy.cpp](saxpy.cpp).
15 |
--------------------------------------------------------------------------------
/kernels/03-kernel-saxpy/saxpy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <math.h>
3 | #include <stdio.h>
4 | #include <vector>
5 |
6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
7 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
8 | if (result != hipSuccess) {
9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
10 | line);
11 | exit(EXIT_FAILURE);
12 | }
13 | }
14 |
15 | /*
16 | TODO: add a device kernel that calculates y = a * x + y for vectors x, y and
17 | constant a
18 |
19 | Hints:
20 |
21 | What attribute(s) do you need to add on a kernel declaration?
22 | - __device__?
23 | - __global__?
24 | - __shared__?
25 | - no attribute(s) needed?
26 |
27 | What is the return type of a kernel?
28 | - int?
29 | - float?
30 | - void?
31 | - depends on the kernel?
32 |
33 | What data do you need in the kernel to compute y = a * x + y, for vectors x, y,
34 | and constant a?
35 |
36 | What built-in variables can you use to calculate the (global) index for a
37 | thread?
38 | - Is threadIdx enough or do you need blockIdx, blockDim, gridDim?
39 | - Is the problem one or multi-dimensional?
40 | - Remember the grid, block, thread hierarchy and the launch parameters
41 | */
42 |
43 | int main() {
44 | // Use HIP_ERRCHK to help you find any errors you make with the API calls
45 |
46 | // Read the HIP Runtime API documentation to help you with the API calls:
47 | // Ctrl-click this to open it in a browser:
48 | // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___memory.html
49 |
50 | static constexpr size_t n = 1000000;
51 | static constexpr size_t num_bytes = sizeof(float) * n;
52 | static constexpr float a = 3.4f;
53 |
54 | std::vector<float> x(n);
55 | std::vector<float> y(n);
56 | std::vector<float> y_ref(n);
57 |
58 | // Initialise data and calculate reference values on CPU
59 | for (size_t i = 0; i < n; i++) {
60 | x[i] = sin(i) * 2.3;
61 | y[i] = cos(i) * 1.1;
62 | y_ref[i] = a * x[i] + y[i];
63 | }
64 |
65 | // TODO: Allocate + copy initial values
66 | // - hipMalloc, hipMemcpy
67 |
68 | // TODO: Define grid dimensions + launch the device kernel
69 | // int/dim3 threads = ...
70 | // int/dim3 blocks = ...
71 | // kernelName<<>>(arguments);
72 |
73 | // TODO: Copy results back to CPU
74 | // - hipMemcpy
75 |
76 | // TODO: Free device memory
77 | // - hipFree
78 |
79 | // Check the result of the GPU computation
80 | printf("reference: %f %f %f %f ... %f %f\n", y_ref[0], y_ref[1], y_ref[2],
81 | y_ref[3], y_ref[n - 2], y_ref[n - 1]);
82 | printf(" result: %f %f %f %f ... %f %f\n", y[0], y[1], y[2], y[3],
83 | y[n - 2], y[n - 1]);
84 |
85 | float error = 0.0;
86 | static constexpr float tolerance = 1e-6f;
87 | for (size_t i = 0; i < n; i++) {
88 | const auto diff = abs(y_ref[i] - y[i]);
89 | if (diff > tolerance)
90 | error += diff;
91 | }
92 | printf("total error: %f\n", error);
93 | printf(" reference: %f at (42)\n", y_ref[42]);
94 | printf(" result: %f at (42)\n", y[42]);
95 |
96 | return 0;
97 | }
98 |
--------------------------------------------------------------------------------
/kernels/04-kernel-copy2d/README.md:
--------------------------------------------------------------------------------
1 | # Kernel: copy2d
2 |
3 | Write a device kernel that performs the double precision BLAS operation
4 | **dcopy**, i.e. `y = x` using GPU threads in a 2D grid.
5 |
6 | - Assume that the vectors `x` and `y` are used to store a 400x600 matrix (in row-major format)
7 | - Initialise the matrix `x` with some values on the CPU
8 | - Allocate memory for `x` and `y` on the device
9 | - Copy the host `x` to the device `x`
10 | - Perform the operation on the device using a 2D kernel
11 | - Copy device `y` to host `y`
12 | - Compare host `x` to host `y`
13 |
14 | Are the values of `x` and `y` equal?
15 |
16 | You may start from a skeleton code provided in [copy2d.cpp](copy2d.cpp).
17 |
--------------------------------------------------------------------------------
/kernels/04-kernel-copy2d/copy2d.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <math.h>
3 | #include <stdio.h>
4 | #include <vector>
5 |
6 | #define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)
7 | static inline void hip_errchk(hipError_t result, const char *file, int line) {
8 | if (result != hipSuccess) {
9 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(result), file,
10 | line);
11 | exit(EXIT_FAILURE);
12 | }
13 | }
14 |
15 | // Copy all elements using threads in a 2D grid
16 | __global__ void copy2d(/*TODO: add arguments*/) {
17 | // TODO: compute row and col using
18 | // - threadIdx.x, threadIdx.y
19 | // - blockIdx.x, blockIdx.y
20 | // - blockDim.x, blockDim.y
21 |
22 | // TODO: Make sure there's no out-of-bounds access
23 | // row must be < number of rows
24 | // col must be < number of columns
25 |
26 | // We're computing 1D index from a 2D index and copying from src to dst
27 | const size_t index = row * num_cols + col;
28 | dst[index] = src[index];
29 | }
30 |
31 | int main() {
32 | static constexpr size_t num_cols = 600;
33 | static constexpr size_t num_rows = 400;
34 | static constexpr size_t num_values = num_cols * num_rows;
35 | static constexpr size_t num_bytes = sizeof(double) * num_values;
36 | std::vector<double> x(num_values);
37 | std::vector<double> y(num_values, 0.0);
38 |
39 | // Initialise data
40 | for (size_t i = 0; i < num_values; i++) {
41 | x[i] = static_cast<double>(i) / 1000.0;
42 | }
43 |
44 | // TODO: Allocate + copy initial values to GPU
45 |
46 | // TODO: Define grid dimensions
47 | // Use dim3 structure for threads and blocks
48 |
49 | // TODO: launch the device kernel
50 |
51 | // TODO: Copy results back to the CPU vector y
52 |
53 | // TODO: Free device memory
54 |
55 | // Check result of computation on the GPU
56 | double error = 0.0;
57 | for (size_t i = 0; i < num_values; i++) {
58 | error += abs(x[i] - y[i]);
59 | }
60 |
61 | printf("total error: %f\n", error);
62 | printf("  reference: %f at (42,42)\n", x[42 * num_cols + 42]);
63 | printf("     result: %f at (42,42)\n", y[42 * num_cols + 42]);
64 |
65 | return 0;
66 | }
67 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/README.md:
--------------------------------------------------------------------------------
1 | # Host-device lambda functions and general kernels
2 |
3 | The purpose of this exercise is to understand how host-device lambda functions work and how to create a general GPU kernel. Furthermore, differentiating between host and device code paths using the ```__HIP_DEVICE_COMPILE__``` macro is demonstrated.
4 |
5 | The task is to define two host-device lambda functions that can be passed to either the host or the device kernel. Both lambda functions require a single integer argument, and the intended locations of these definitions are indicated by `#error`. The first lambda function does not need to capture anything, but must call the predefined function ```helloFromThread(const int i)```. The second lambda function must capture the value of ```pi```, multiply the thread index by pi, and print this value from each thread.
6 |
7 | IMPORTANT NOTE! When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda`
8 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/lambda.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <math.h>
4 |
5 | /* Blocksize is small because we are printing from all threads */
6 | #define BLOCKSIZE 4
7 |
8 | /* CPU loop execution */
9 | template <typename Lambda>
10 | void cpuKernel(Lambda lambda, const int loop_size) {
11 | for(int i = 0; i < loop_size; i++){
12 | lambda(i);
13 | }
14 | }
15 |
16 | /* GPU loop execution */
17 | template <typename Lambda>
18 | __global__ void gpuKernel(Lambda lambda, const int loop_size)
19 | {
20 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
21 | if(i < loop_size)
22 | {
23 | lambda(i);
24 | }
25 | }
26 |
27 | /* Check if this function is running on CPU or GPU */
28 | __host__ __device__ void helloFromThread(const int i) {
29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU
30 | printf("Hello from GPU! I'm thread number %d\n", i);
31 | #else // If running on CPU
32 | printf("Hello from CPU! I'm thread number %d\n", i);
33 | #endif
34 | }
35 |
36 |
37 | /* The main function */
38 | int main()
39 | {
40 | // Set the problem dimensions
41 | const int loop_size = BLOCKSIZE;
42 | const int blocksize = BLOCKSIZE;
43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
44 |
45 | // Define lambda1 function with 1 integer argument,
46 | // the lambda must call helloFromThread with that argument
47 | # error put the first lambda function definition here
48 |
49 | // Run lambda1 on the CPU device
50 | cpuKernel(lambda1, loop_size);
51 |
52 | // Run lambda1 on the GPU device
53 | gpuKernel<<<gridsize, blocksize>>>(lambda1, loop_size);
54 | hipStreamSynchronize(0);
55 |
56 | // Store value of pi in pi
57 | double pi = M_PI;
58 |
59 | // Define lambda2 that captures pi (use [=] to capture by value),
60 | // and prints out the results for i * pi from each thread
61 | # error put the second lambda function definition here
62 |
63 | // Run lambda2 on the GPU device
64 | gpuKernel<<<gridsize, blocksize>>>(lambda2, loop_size);
65 | hipStreamSynchronize(0);
66 | }
67 |
--------------------------------------------------------------------------------
/lambdas/01-lambda/solution/lambda.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <math.h>
4 |
5 | /* Blocksize is small because we are printing from all threads */
6 | #define BLOCKSIZE 4
7 |
8 | /* CPU loop execution */
9 | template <typename Lambda>
10 | void cpuKernel(Lambda lambda, const int loop_size) {
11 | for(int i = 0; i < loop_size; i++){
12 | lambda(i);
13 | }
14 | }
15 |
16 | /* GPU loop execution */
17 | template <typename Lambda>
18 | __global__ void gpuKernel(Lambda lambda, const int loop_size)
19 | {
20 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
21 | if(i < loop_size)
22 | {
23 | lambda(i);
24 | }
25 | }
26 |
27 | /* Check if this function is running on CPU or GPU */
28 | __host__ __device__ void helloFromThread(const int i) {
29 | #ifdef __HIP_DEVICE_COMPILE__ // If running on GPU
30 | printf("Hello from GPU! I'm thread number %d\n", i);
31 | #else // If running on CPU
32 | printf("Hello from CPU! I'm thread number %d\n", i);
33 | #endif
34 | }
35 |
36 |
37 | /* The main function */
38 | int main()
39 | {
40 | // Set the problem dimensions
41 | const int loop_size = BLOCKSIZE;
42 | const int blocksize = BLOCKSIZE;
43 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
44 |
45 | // Define lambda1 function with 1 integer argument,
46 | // the lambda must call helloFromThread with that argument
47 | auto lambda1 = [] __host__ __device__ (const int i)
48 | {
49 | helloFromThread(i);
50 | };
51 |
52 | // Run lambda1 on the CPU device
53 | cpuKernel(lambda1, loop_size);
54 |
55 | // Run lambda1 on the GPU device
56 | gpuKernel<<<gridsize, blocksize>>>(lambda1, loop_size);
57 | hipStreamSynchronize(0);
58 |
59 | // Store value of pi in pi
60 | double pi = M_PI;
61 |
62 | // Define lambda2 that captures pi (use [=] to capture by value),
63 | // and prints out the results for i * pi from each thread
64 | auto lambda2 = [=] __host__ __device__ (const int i)
65 | {
66 | printf("i * pi = %f \n", (double)i * pi);
67 | };
68 |
69 | // Run lambda2 on the GPU device
70 | gpuKernel<<<gridsize, blocksize>>>(lambda2, loop_size);
71 | hipStreamSynchronize(0);
72 | }
73 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/README.md:
--------------------------------------------------------------------------------
1 | # Reductions with host-device lambdas and hipCUB
2 |
3 | The purpose of this exercise is to use host-device lambda functions and the hipCUB library to create an efficient reduction kernel. The locations of the missing parts of the kernel code are indicated by `#error`. The CUB library documentation may be useful, particularly [this example](https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html#a7632bd9c8950dd6a3528ca99fa3f0890). Note that hipCUB uses the namespace "hipcub" instead of the "cub" namespace used in the original CUDA library.
4 |
5 | IMPORTANT NOTE! When using the host-device lambda function with NVIDIA architectures, the following compiler argument must be added for hipcc: `--extended-lambda`
6 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/reduction.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include "../../third-party/hipcub/hipcub.hpp"
4 |
5 | /* Blocksize is divisible by the warp size */
6 | #define BLOCKSIZE 64
7 |
8 | /* CPU reduction loop */
9 | template <typename Lambda>
10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) {
11 | // Evaluate the loop body
12 | for(int i = 0; i < loop_size; i++){
13 | loop_body(i, *sum);
14 | }
15 | }
16 |
17 | /* GPU reduction kernel */
18 | template <typename Lambda>
19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum)
20 | {
21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int
22 | #error add here hipcub typedef
23 |
24 | // Use shared memory for the hipcub library temporary storage
25 | #error define the shared memory used by the hipcub library here
26 |
27 | // Get thread index
28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 |
30 | // Local storage for the thread summation value
31 | int thread_sum = 0;
32 |
33 | // Evaluate the loop body, the summation value is stored in thread_sum
34 | if(idx < loop_size)
35 | loop_body(idx, thread_sum);
36 |
37 | // Compute the block-wide sum (aggregate) for the first thread of each block
38 | int aggregate;
39 | #error call the hipcub function to perform block-wide sum and store the result into 'aggregate'
40 |
41 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics
42 | if(threadIdx.x == 0)
43 | #error use HIP native atomicAdd() function to sum the 'aggregate' of each block into 'sum'
44 | }
45 |
46 | /* Wrapper for the GPU reduction kernel */
47 | template <typename Lambda>
48 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) {
49 |
50 | // Set block and grid dimensions
51 | const uint blocksize = BLOCKSIZE;
52 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize;
53 |
54 | // Create GPU buffer for the reduction variable
55 | int* d_buf;
56 | hipMalloc(&d_buf, sizeof(int));
57 |
58 | // Launch the reduction kernel
59 | reduction_kernel<<<gridsize, blocksize>>>(loop_body, loop_size, d_buf);
60 | hipStreamSynchronize(0);
61 |
62 | // Copy reduction variable back to host from the GPU buffer
63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost);
64 | hipFree(d_buf);
65 | }
66 |
67 |
68 | /* The main function */
69 | int main()
70 | {
71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn'
72 | const int tn = 1000;
73 |
74 | // Calculate the triangular number on the GPU and store it in sum_gpu
75 | int sum_gpu = 0;
76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){
77 | int thread_idx = i;
78 | sum += thread_idx;
79 | }, &sum_gpu);
80 |
81 | // Calculate the triangular number on the CPU and store it in sum_cpu
82 | int sum_cpu = 0;
83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){
84 | int thread_idx = i;
85 | sum += thread_idx;
86 | }, &sum_cpu);
87 |
88 | // Check that the results match
89 | if(sum_gpu == sum_cpu)
90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu);
91 | else
92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu);
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/lambdas/02-reduction/solution/reduction.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include "../../../third-party/hipcub/hipcub.hpp"
4 |
5 | /* Blocksize is divisible by the warp size */
6 | #define BLOCKSIZE 64
7 |
8 | /* CPU reduction loop */
9 | template <typename Lambda>
10 | void parallel_reduce_cpu(const int loop_size, Lambda loop_body, int *sum) {
11 | // Evaluate the loop body
12 | for(int i = 0; i < loop_size; i++){
13 | loop_body(i, *sum);
14 | }
15 | }
16 |
17 | /* GPU reduction kernel */
18 | template <typename Lambda>
19 | __global__ void reduction_kernel(Lambda loop_body, const int loop_size, int *sum)
20 | {
21 | // Specialize BlockReduce for a 1D block of BLOCKSIZE threads of type int
22 | typedef hipcub::BlockReduce<int, BLOCKSIZE> BlockReduce;
23 |
24 | // Use shared memory for the hipcub library temporary storage
25 | __shared__ typename BlockReduce::TempStorage temp_storage;
26 |
27 | // Get thread index
28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 |
30 | // Local storage for the thread summation value
31 | int thread_sum = 0;
32 |
33 | // Evaluate the loop body, the summation value is stored in thread_sum
34 | if(idx < loop_size)
35 | loop_body(idx, thread_sum);
36 |
37 | // Compute the block-wide sum (aggregate) for the first thread of each block
38 | int aggregate = BlockReduce(temp_storage).Sum(thread_sum);
39 |
40 | // The first thread of each block stores the block-wide aggregate to 'sum' using atomics
41 | if(threadIdx.x == 0)
42 | atomicAdd(sum, aggregate);
43 | }
44 |
45 | /* Wrapper for the GPU reduction kernel */
46 | template <typename Lambda>
47 | void parallel_reduce_gpu(const uint loop_size, Lambda loop_body, int *sum) {
48 |
49 | // Set block and grid dimensions
50 | const uint blocksize = BLOCKSIZE;
51 | const uint gridsize = (loop_size - 1 + blocksize) / blocksize;
52 |
53 | // Create GPU buffer for the reduction variable
54 | int* d_buf;
55 | hipMalloc(&d_buf, sizeof(int));
56 | hipMemcpy(d_buf, sum, sizeof(int), hipMemcpyHostToDevice);
57 |
58 | // Launch the reduction kernel
59 | reduction_kernel<<<gridsize, blocksize>>>(loop_body, loop_size, d_buf);
60 | hipStreamSynchronize(0);
61 |
62 | // Copy reduction variable back to host from the GPU buffer
63 | hipMemcpy(sum, d_buf, sizeof(int), hipMemcpyDeviceToHost);
64 | hipFree(d_buf);
65 | }
66 |
67 |
68 | /* The main function */
69 | int main()
70 | {
71 | // Calculate the triangular number up to 'tn', ie, a sum of numbers from 0 to 'tn'
72 | const int tn = 1000;
73 |
74 | // Calculate the triangular number on the GPU and store it in sum_gpu
75 | int sum_gpu = 0;
76 | parallel_reduce_gpu(tn, [] __host__ __device__ (const int i, int &sum){
77 | int thread_idx = i;
78 | sum += thread_idx;
79 | }, &sum_gpu);
80 |
81 | // Calculate the triangular number on the CPU and store it in sum_cpu
82 | int sum_cpu = 0;
83 | parallel_reduce_cpu(tn, [] __host__ __device__ (const int i, int &sum){
84 | int thread_idx = i;
85 | sum += thread_idx;
86 | }, &sum_cpu);
87 |
88 | // Check that the results match
89 | if(sum_gpu == sum_cpu)
90 | printf("The results calculated by GPU = %d and CPU = %d match!\n", sum_gpu, sum_cpu);
91 | else
92 | printf("The results calculated by GPU = %d and CPU = %d do not match!\n", sum_gpu, sum_cpu);
93 |
94 | return 0;
95 | }
96 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/Makefile:
--------------------------------------------------------------------------------
1 | default: build
2 | echo "Start Build"
3 |
4 | # Accelerator architecture
5 | ifeq ($(CUDA),1)
6 |
7 | CXX = nvcc
8 | CXXDEFS = -DHAVE_CUDA
9 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80
10 | EXE = bessel
11 |
12 | else ifeq ($(HIP),CUDA)
13 |
14 | CXX = hipcc
15 | CXXDEFS = -DHAVE_HIP -I$(shell pwd)/../../third-party/hiprand -I$(shell pwd)/../../third-party
16 | CXXFLAGS = -g -O3 --x=cu --extended-lambda -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80
17 | EXE = bessel
18 |
19 | else ifeq ($(HIP),ROCM)
20 |
21 | CXX = hipcc
22 | CXXDEFS = -DHAVE_HIP -I/appl/eap/opt/rocm-4.3.1/hiprand/include/ -I/appl/eap/opt/rocm-4.3.1/rocrand/include/
23 | CXXFLAGS = -g -O3 --offload-arch=gfx90a
24 | FILETYPE = .cpp
25 | EXE = bessel
26 |
27 | else
28 |
29 | CXX = g++
30 | CXXFLAGS = -g -O3
31 | EXE = bessel
32 |
33 | endif
34 |
35 | # Message passing protocol
36 | ifeq ($(MPI),1)
37 |
38 | MPICXX = mpicxx
39 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(CXX) -DHAVE_MPI $(CXXDEFS) $(CXXFLAGS)'
40 | LDFLAGS += -L/appl/spack/install-tree/gcc-9.1.0/openmpi-4.1.1-vonyow/lib
41 | LIBS += -lmpi
42 |
43 | else
44 |
45 | MPICXX = $(CXX)
46 | MPICXXFLAGS = $(CXXDEFS) $(CXXFLAGS)
47 |
48 | endif
49 |
50 | SRC_PATH = src/
51 | SOURCES = $(shell ls src/*.cpp)
52 |
53 | OBJ_PATH = src/
54 | OBJECTS = $(shell for file in $(SOURCES);\
55 | do echo -n $$file | sed -e "s/\(.*\)\.cpp/\1\.o/";echo -n " ";\
56 | done)
57 |
58 | build: $(EXE)
59 |
60 | depend:
61 | makedepend $(CXXDEFS) -m $(SOURCES)
62 |
63 | test: $(EXE)
64 | ./$(EXE)
65 |
66 | $(EXE): $(OBJECTS)
67 | $(CXX) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $(EXE)
68 |
69 | clean: $(CLEAN)
70 | rm -f $(OBJECTS) $(EXE)
71 |
72 | # Compilation rules
73 | $(OBJ_PATH)%.o: $(SRC_PATH)%.cpp
74 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c $< -o $(SRC_PATH)$(notdir $@)
75 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/README.md:
--------------------------------------------------------------------------------
1 | # Monte Carlo simulation with hipRAND library
2 |
3 | ## Exercise description
4 |
5 | The HIP header file [devices_hip.h](src/devices_hip.h) has disappeared from the [src](src/) folder. Fortunately, the respective CUDA header, [devices_cuda.h](src/devices_cuda.h), is still present. The task is to use hipify tools to translate [devices_cuda.h](src/devices_cuda.h) to [devices_hip.h](src/devices_hip.h). What does the hipify tool translate? Is there anything that is not translated properly? You may compare the result with the original HIP header named [solution.h](src/solution.h). Instructions to compile the code with HIP are at the bottom.
6 |
7 | IMPORTANT NOTE on hipify-clang module usage on Puhti! Load hipify-clang to hipify CUDA code by
8 | ```
9 | ml hipify-clang
10 | ```
11 | and after loading and using hipify-clang, you must do the following before trying to compile any HIP code
12 | ```
13 | ml purge
14 | ml hip
15 | ```
16 | Otherwise the compilation fails (you cannot compile HIP while having hipify-clang module loaded).
17 | ## Code description
18 |
19 | This example uses the Monte Carlo method to simulate the value of Bessel's correction that minimizes the root mean squared error in the calculation of the sample standard deviation and variance for the chosen sample and population sizes. The sample standard deviation is typically calculated as $$s = \sqrt{\frac{1}{N - \beta}\sum_{i=1}^{N}(x_i - \bar{x})^2}$$ where $$\beta = 1.$$ The simulation calculates the root mean squared error for different values of $\beta$.
20 |
21 | The implementation uses a special construct for the parallel loops in [bessel.cpp](src/bessel.cpp) which is based on a lambda function, an approach similar to some accelerator frameworks such as SYCL, Kokkos, RAJA, etc. The approach allows conditional compilation of the loops for multiple architectures while keeping the source code clean and readable. An example of the usage of cuRAND and hipRAND random number generation libraries inside a GPU kernel are given in [devices_cuda.h](src/devices_cuda.h) and [devices_hip.h](src/devices_hip.h).
22 |
23 | The code can be conditionally compiled for either CUDA, HIP, or HOST execution with or without MPI. The correct definitions for each accelerator backend option are selected in [comms.h](src/comms.h) by choosing the respective header file. The compilation instructions are shown below:
24 |
25 | ```
26 | // Compile to run sequentially on CPU
27 | make
28 |
29 | // Compile to run parallel on CPUs with MPI
30 | make MPI=1
31 |
32 | // Compile to run parallel on GPU with CUDA
33 | make CUDA=1
34 |
35 | // Compile to run parallel on GPU with HIP
36 | make HIP=CUDA
37 |
38 | // Compile to run parallel on many GPUs with HIP and MPI
39 | make HIP=CUDA MPI=1
40 |
41 | ```
42 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/comms.cpp:
--------------------------------------------------------------------------------
1 | #include "comms.h"
2 |
3 | #if defined(HAVE_MPI)
4 |
5 | namespace comms{
6 |
7 | static int MPI_INITIALIZED = 0;
8 |
9 | int get_procs(){
10 | int comm_size = 1;
11 | if (MPI_INITIALIZED == 1){
12 | MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
13 | }
14 | return comm_size;
15 | }
16 |
17 | int get_rank(){
18 | int proc_rank = 0;
19 | if (MPI_INITIALIZED == 1){
20 | MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
21 | }
22 | return proc_rank;
23 | }
24 |
25 | int get_node_rank(){
26 | int node_rank = 0;
27 | if (MPI_INITIALIZED == 1){
28 | MPI_Comm node_comm = MPI_COMM_NULL;
29 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
30 |
31 | MPI_Comm_rank(node_comm, &node_rank);
32 | MPI_Comm_free(&node_comm);
33 | }
34 | return node_rank;
35 | }
36 |
37 | int get_node_procs(){
38 | int node_comm_size = 1;
39 | if (MPI_INITIALIZED == 1){
40 | MPI_Comm node_comm = MPI_COMM_NULL;
41 | MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
42 |
43 | MPI_Comm_size(node_comm, &node_comm_size);
44 | MPI_Comm_free(&node_comm);
45 | }
46 | return node_comm_size;
47 | }
48 |
49 | void barrier_procs(){
50 | // Synchronize across all MPI processes
51 | if (MPI_INITIALIZED == 1)
52 | MPI_Barrier(MPI_COMM_WORLD);
53 | }
54 |
55 | void reduce_procs(float *sbuf, int count){
56 | if (MPI_INITIALIZED == 1){
57 | float* rbuf;
58 | if(get_rank() == 0)
59 | rbuf = (float*)malloc(count * sizeof(float));
60 | MPI_Reduce(sbuf, rbuf, count, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
61 | if(get_rank() == 0){
62 | memcpy(sbuf, rbuf, count * sizeof(float));
63 | free((void*)rbuf);
64 | }
65 | }
66 | }
67 |
68 | void init_procs(int *argc, char **argv[]){
69 | if(*argc > 1){
70 | MPI_Init(argc, argv);
71 | MPI_INITIALIZED = 1;
72 | }
73 | // Some device backends require an initialization
74 | devices::init(get_node_rank());
75 | }
76 |
77 | void finalize_procs(){
78 | // Some device backends also require a finalization
79 | devices::finalize(get_rank());
80 |
81 | // Finalize MPI if it is used
82 | if (MPI_INITIALIZED == 1)
83 | MPI_Finalize();
84 | }
85 | }
86 |
87 | #else
88 |
89 | namespace comms{
90 | int get_procs(){
91 | int comm_size = 1;
92 | return comm_size;
93 | }
94 |
95 | int get_rank(){
96 | int proc_rank = 0;
97 | return proc_rank;
98 | }
99 |
100 | int get_node_rank(){
101 | int node_rank = 0;
102 | return node_rank;
103 | }
104 |
105 | int get_node_procs(){
106 | int node_comm_size = 1;
107 | return node_comm_size;
108 | }
109 |
110 | void barrier_procs(){
111 | }
112 |
113 | void reduce_procs(float *sbuf, int count){
114 | }
115 |
116 | void init_procs(int *argc, char **argv[]){
117 | // Some device backends require an initialization
118 | devices::init(get_node_rank());
119 | }
120 |
121 | void finalize_procs(){
122 | // Some device backends also require a finalization
123 | devices::finalize(get_rank());
124 | }
125 | }
126 |
127 | #endif
128 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/comms.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_COMMS_H
2 | #define BESSEL_COMMS_H
3 |
4 | #if defined(HAVE_MPI)
5 | #include "mpi.h"
6 | #endif
7 |
8 | #if defined(HAVE_CUDA)
9 | #include "devices_cuda.h"
10 | #elif defined(HAVE_HIP)
11 | #include "devices_hip.h"
12 | #else
13 | #include "devices_host.h"
14 | #endif
15 |
16 | namespace comms{
17 | int get_procs();
18 | int get_rank();
19 | int get_node_procs();
20 | int get_node_rank();
21 |
22 | void barrier_procs();
23 | void reduce_procs(float *sbuf, int count);
24 |
25 | void init_procs(int *argc, char **argv[]);
26 | void finalize_procs();
27 | }
28 |
29 | #endif // !BESSEL_COMMS_H
30 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/devices_cuda.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_CUDA_H
2 | #define BESSEL_DEVICES_CUDA_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <curand_kernel.h>
7 |
8 | #define CUDA_ERR(err) (cuda_error(err, __FILE__, __LINE__))
9 | inline static void cuda_error(cudaError_t err, const char *file, int line) {
10 | if (err != cudaSuccess) {
11 | printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
12 | exit(1);
13 | }
14 | }
15 |
16 | #define DEVICE_LAMBDA [=] __host__ __device__
17 |
18 | namespace devices
19 | {
20 | __forceinline__ static void init(int node_rank) {
21 | int num_devices = 0;
22 | CUDA_ERR(cudaGetDeviceCount(&num_devices));
23 | CUDA_ERR(cudaSetDevice(node_rank % num_devices));
24 | }
25 |
26 | __forceinline__ static void finalize(int rank) {
27 | printf("Rank %d, CUDA finalized.\n", rank);
28 | }
29 |
30 | __forceinline__ static void* allocate(size_t bytes) {
31 | void* ptr;
32 | CUDA_ERR(cudaMallocManaged(&ptr, bytes));
33 | return ptr;
34 | }
35 |
36 | __forceinline__ static void free(void* ptr) {
37 | CUDA_ERR(cudaFree(ptr));
38 | }
39 |
40 | __forceinline__ static void memcpy_d2d(void* dst, void* src, size_t bytes){
41 | CUDA_ERR(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
42 | }
43 |
44 | template <typename LambdaBody>
45 | __global__ static void cudaKernel(LambdaBody lambda, const int loop_size)
46 | {
47 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
48 | if(i < loop_size)
49 | {
50 | lambda(i);
51 | }
52 | }
53 |
54 | template <typename T>
55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) {
56 | const int blocksize = 64;
57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
58 | cudaKernel<<<gridsize, blocksize>>>(loop_body, loop_size);
59 | CUDA_ERR(cudaStreamSynchronize(0));
60 | }
61 |
62 | template <typename T>
63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){
64 | // Define this function depending on whether it runs on GPU or CPU
65 | #ifdef __CUDA_ARCH__
66 | atomicAdd(array_loc, value);
67 | #else
68 | *array_loc += value;
69 | #endif
70 | }
71 |
72 | template <typename T>
73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
74 |
75 | T var = 0;
76 | #ifdef __CUDA_ARCH__
77 | curandStatePhilox4_32_10_t state;
78 |
79 | // curand_init() reproduces the same random number with the same seed and seq
80 | curand_init(seed, seq, 0, &state);
81 |
82 | // curand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1
83 | var = stdev * curand_normal(&state) + mean;
84 | #endif
85 | return var;
86 | }
87 | }
88 |
89 | #endif // !BESSEL_DEVICES_CUDA_H
90 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/devices_host.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_HOST_H
2 | #define BESSEL_DEVICES_HOST_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <cmath>
8 |
9 | #define DEVICE_LAMBDA [=]
10 |
11 | namespace devices
12 | {
13 | inline static void init(int node_rank) {
14 | // Nothing needs to be done here
15 | }
16 |
17 | inline static void finalize(int rank) {
18 | printf("Rank %d, Host finalized.\n", rank);
19 | }
20 |
21 | inline static void* allocate(size_t bytes) {
22 | return malloc(bytes);
23 | }
24 |
25 | inline static void free(void* ptr) {
26 | ::free(ptr);
27 | }
28 |
29 | inline static void memcpy_d2d(void* dst, void* src, size_t bytes){
30 | memcpy(dst, src, bytes);
31 | }
32 |
33 | template <typename Lambda>
34 | inline static void parallel_for(int loop_size, Lambda loop_body) {
35 | for(int i = 0; i < loop_size; i++){
36 | loop_body(i);
37 | }
38 | }
39 |
40 | template <typename T>
41 | inline static void atomic_add(T *array_loc, T value){
42 | *array_loc += value;
43 | }
44 |
45 | template <typename T>
46 | inline static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
47 |
48 | // Re-seed the first case
49 | if(idx == 0){
50 | // Overflow is defined behavior with unsigned, and therefore ok here
51 | srand((unsigned int)seed + (unsigned int)seq);
52 | }
53 |
54 | // Use Box Muller algorithm to get a float from a normal distribution
55 | const float two_pi = 2.0f * M_PI;
56 | float u1 = (float) rand() / RAND_MAX;
57 | float u2 = (float) rand() / RAND_MAX;
58 | float factor = stdev * sqrtf (-2.0f * logf (u1));
59 | float trig_arg = two_pi * u2;
60 |
61 | // Box Muller algorithm produces two random normally distributed floats, z0 and z1
62 | float z0 = factor * cosf (trig_arg) + mean; // Need only one
63 | // float z1 = factor * sinf (trig_arg) + mean;
64 | return z0;
65 | }
66 | }
67 | #endif // !BESSEL_DEVICES_HOST_H
68 |
--------------------------------------------------------------------------------
/lambdas/03-hipify/src/solution.h:
--------------------------------------------------------------------------------
1 | #ifndef BESSEL_DEVICES_HIP_H
2 | #define BESSEL_DEVICES_HIP_H
3 |
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <hiprand_kernel.h>
7 |
8 | #define HIP_ERR(err) (hip_error(err, __FILE__, __LINE__))
9 | inline static void hip_error(hipError_t err, const char *file, int line) {
10 | if (err != hipSuccess) {
11 | printf("\n\n%s in %s at line %d\n", hipGetErrorString(err), file, line);
12 | exit(1);
13 | }
14 | }
15 |
16 | #define DEVICE_LAMBDA [=] __host__ __device__
17 |
18 | namespace devices
19 | {
20 | __forceinline__ static void init(int node_rank) {
21 | int num_devices = 0;
22 | HIP_ERR(hipGetDeviceCount(&num_devices));
23 | HIP_ERR(hipSetDevice(node_rank % num_devices));
24 | }
25 |
26 | __forceinline__ static void finalize(int rank) {
27 | printf("Rank %d, HIP finalized.\n", rank);
28 | }
29 |
30 | __forceinline__ static void* allocate(size_t bytes) {
31 | void* ptr;
32 | HIP_ERR(hipMallocManaged(&ptr, bytes));
33 | return ptr;
34 | }
35 |
36 | __forceinline__ static void free(void* ptr) {
37 | HIP_ERR(hipFree(ptr));
38 | }
39 |
40 | __forceinline__ static void memcpyd2d(void* dst, void* src, size_t bytes){
41 | HIP_ERR(hipMemcpy(dst, src, bytes, hipMemcpyDeviceToDevice));
42 | }
43 |
44 | template <typename LambdaBody>
45 | __global__ static void hipKernel(LambdaBody lambda, const int loop_size)
46 | {
47 | const int i = blockIdx.x * blockDim.x + threadIdx.x;
48 | if(i < loop_size)
49 | {
50 | lambda(i);
51 | }
52 | }
53 |
54 | template <typename T>
55 | __forceinline__ static void parallel_for(int loop_size, T loop_body) {
56 | const int blocksize = 64;
57 | const int gridsize = (loop_size - 1 + blocksize) / blocksize;
58 | hipKernel<<<gridsize, blocksize>>>(loop_body, loop_size);
59 | HIP_ERR(hipStreamSynchronize(0));
60 | }
61 |
62 | template <typename T>
63 | __host__ __device__ __forceinline__ static void atomic_add(T *array_loc, T value){
64 | // Define this function depending on whether it runs on GPU or CPU
65 | #if __HIP_DEVICE_COMPILE__
66 | atomicAdd(array_loc, value);
67 | #else
68 | *array_loc += value;
69 | #endif
70 | }
71 |
72 | template <typename T>
73 | __host__ __device__ static T random_float(unsigned long long seed, unsigned long long seq, int idx, T mean, T stdev){
74 |
75 | T var = 0;
76 | #if __HIP_DEVICE_COMPILE__
77 | hiprandStatePhilox4_32_10_t state;
78 |
79 | // hiprand_init() reproduces the same random number with the same seed and seq
80 | hiprand_init(seed, seq, 0, &state);
81 |
82 | // hiprand_normal() gives a random float from a normal distribution with mean = 0 and stdev = 1
83 | var = stdev * hiprand_normal(&state) + mean;
84 | #endif
85 | return var;
86 | }
87 | }
88 |
89 | #endif // !BESSEL_DEVICES_HIP_H
90 |
--------------------------------------------------------------------------------
/memory/01-prefetch/README.md:
--------------------------------------------------------------------------------
1 | # Memory management strategies
2 |
3 | The purpose of this exercise is to compare 6 different memory management
4 | strategies and their computational overhead. The following functions are called
5 | at the end of this file by the `main()` function:
6 |
7 | * The function `explicitMem()` represents a basic explicit memory management strategy
8 | * The function `explicitMemPinned()` represents an explicit memory management strategy with pinned host memory
9 | * The function `explicitMemNoCopy()` represents an explicit memory management strategy where the data can reside at GPU memory during an iterative loop (no recurring memory copies needed)
10 | * The function `unifiedMem()` represents a basic unified memory management strategy
11 | * The function `unifiedMemPrefetch()` represents a unified memory management strategy with prefetching
12 | * The function `unifiedMemNoCopy()` represents a unified memory management strategy where the data can reside at GPU memory during an iterative loop (no recurring memory copies needed)
13 |
14 | The task is to fill the missing function calls in the code indicated by lines beginning with `#error`, and followed by a descriptive instruction.
15 |
16 | ## Hints
17 |
18 | `int device;`
19 | `hipGetDevice(&device);`
20 |
21 | * prefetch:
22 | `hipMemPrefetchAsync((const void*) ptr, size_t count, int device, hipStream_t stream)`
23 |
24 | * prefetch to device on stream 0:
25 | `hipMemPrefetchAsync(A, size, device, 0);`
26 |
27 | * prefetch to host: use device `hipCpuDeviceId`
28 | `hipMemPrefetchAsync(A, size, hipCpuDeviceId, 0);`
29 |
30 | * Device memset:
31 | `hipMemset(A, 0, size);`
32 |
--------------------------------------------------------------------------------
/memory/02-mempools/README.md:
--------------------------------------------------------------------------------
1 | # The stream-ordered memory allocator and memory pools
2 |
3 | The purpose of this exercise is to compare different memory allocation strategies within a loop and to understand the performance impact of using or not using a memory pool. The following timed functions are called at the end of the source file by the `main()` function:
4 |
5 | * The function `noRecurringAlloc()` allocates memory only once, outside the loop
6 | * The function `recurringAllocNoMemPools()` allocates memory repeatedly within a loop
7 | * The function `recurringAllocMemPool()` obtains memory from a memory pool repeatedly within a loop
8 |
9 | The task is to fill the missing function calls in the code indicated by lines beginning with `#error`, and followed by a descriptive instruction.
10 |
--------------------------------------------------------------------------------
/memory/03-struct/README.md:
--------------------------------------------------------------------------------
1 | # Unified memory and structs
2 |
3 | The purpose of this exercise is to run a loop accessing a struct from host and
4 | device using different memory management strategies.
5 |
6 | The function `runHost()` demonstrates the execution on host and is already complete.
7 |
8 | The task is to fill the functions `runDeviceUnifiedMem()` and `runDeviceExplicitMem()` to do
9 | the same thing in parallel on the device. The latter function also requires explicitly specifying how the struct is copied to GPU memory, which is not always trivial. Therefore, you must also fill the GPU struct allocation and deallocation functions `createDeviceExample()` and `freeDeviceExample()`.
10 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/README.md:
--------------------------------------------------------------------------------
1 | # Peer to peer device access
2 |
3 | Benchmark memory copies with and without peer to peer device access using two
4 | GPUs.
5 |
6 | Skeleton code [p2pcopy.cpp](p2pcopy.cpp) tests peer-to-peer device access between two GPUs by doing a series of memory copies. The test is run both after calling `hipDeviceEnablePeerAccess()` and after calling `hipDeviceDisablePeerAccess()`. The program prints the calculated bandwidth and time for both cases. On a CUDA platform, there should be a difference in results, whereas on an AMD platform there is none.
7 |
8 | In order to make the code work, you need to fix the missing parts marked with TODOs.
9 |
10 | NOTE: Remember to request 2 GPUs when running this exercise.
11 | On Lumi, use
12 | ```
13 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
14 | ```
15 |
16 | When the code is running correctly, run it several times and observe the bandwidths. What are the bandwidths?
17 |
18 | Disable the DMA engine with `export HSA_ENABLE_SDMA=0` and then try the code again. What are the results now?
19 |
20 |
21 | On Mahti use
22 | ```
23 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
24 | ```
25 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/p2pcopy.cpp:
--------------------------------------------------------------------------------
1 | #include "stdio.h"
2 | #include "stdint.h"
3 | #include <hip/hip_runtime.h>
4 | #include <math.h>
5 |
6 |
7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) {
8 |
9 | // Enable peer access for GPUs?
10 | if (p2p)
11 | {
12 | // TODO: Enable peer access for GPU 0 and GPU 1
13 | }
14 |
15 | // Do a dummy copy without timing to remove the impact of the first one
16 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0
17 |
18 | // Do a series of timed P2P memory copies
19 | int N = 10;
20 | clock_t tStart = clock();
21 | // TODO: Copy dA_1 on device 1 to dA_0 on device 0, repeat for N times to
22 | // get timings
23 | // TODO: After the memory copies, remember to synchronize the stream
24 | // before stopping the clock
25 | clock_t tStop = clock();
26 |
27 | // Calculate time and bandwidth
28 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC;
29 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s;
30 |
31 | // Disable peer access for GPUs?
32 | if (p2p) {
33 | // TODO: Disable peer access for GPU 0 and GPU 1
34 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
35 | bandwidth, time_s);
36 | } else {
37 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
38 | bandwidth, time_s);
39 | }
40 | }
41 |
42 |
43 | int main(int argc, char *argv[])
44 | {
45 | // Check that we have at least two GPUs
46 | int devcount;
47 | hipGetDeviceCount(&devcount);
48 | if(devcount < 2) {
49 | printf("Need at least two GPUs!\n");
50 | exit(EXIT_FAILURE);
51 | } else {
52 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount);
53 | }
54 |
55 | // Allocate memory for both GPUs
56 | int size = pow(2, 28);
57 | int gpu0 = 0, gpu1 = 1;
58 | int *dA_0, *dA_1;
59 | hipSetDevice(gpu0);
60 | hipMalloc((void**) &dA_0, size);
61 | hipSetDevice(gpu1);
62 | hipMalloc((void**) &dA_1, size);
63 |
64 | // Check peer accessibility between GPUs 0 and 1
65 | int peerAccess01;
66 | int peerAccess10;
67 | // TODO: Check for peer to peer accessibility from device 0 to 1
68 | // and from 1 to 0
69 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
70 | peerAccess01, gpu0, gpu1);
71 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
72 | peerAccess10, gpu1, gpu0);
73 |
74 | // Memcopy, P2P enabled
75 | if (peerAccess01 && peerAccess10)
76 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size);
77 |
78 | // Memcopy, P2P disabled
79 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size);
80 |
81 | // Deallocate device memory
82 | hipFree(dA_0);
83 | hipFree(dA_1);
84 | }
85 |
--------------------------------------------------------------------------------
/multi-gpu/01-p2pcopy/solution/p2pcopy.cpp:
--------------------------------------------------------------------------------
1 | #include "stdio.h"
2 | #include "stdint.h"
3 | #include <hip/hip_runtime.h>
4 | #include <math.h>
5 |
6 |
7 | void copyP2P(int p2p, int gpu0, int gpu1, int* dA_0, int* dA_1, int size) {
8 |
9 | // Enable peer access for GPUs?
10 | if (p2p)
11 | {
12 | hipSetDevice(gpu0);
13 | hipDeviceEnablePeerAccess(gpu1, 0);
14 | hipSetDevice(gpu1);
15 | hipDeviceEnablePeerAccess(gpu0, 0);
16 | }
17 |
18 | // Do a dummy copy without timing to remove the impact of the first one
19 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault);
20 | hipMemcpy(dA_1, dA_0, size, hipMemcpyDefault);
21 |
22 | // Do a series of timed P2P memory copies
23 | int N = 10;
24 | clock_t tStart = clock();
25 | for (int i = 0; i < N; ++i) {
26 | hipMemcpy(dA_0, dA_1, size, hipMemcpyDefault);
27 | }
28 | hipStreamSynchronize(0);
29 | clock_t tStop = clock();
30 |
31 | // Calculate time and bandwidth
32 | double time_s = (double) (tStop - tStart) / CLOCKS_PER_SEC;
33 | double bandwidth = (double) size * (double) N / (double) 1e9 / time_s;
34 |
35 | // Disable peer access for GPUs?
36 | if (p2p) {
37 | hipSetDevice(gpu0);
38 | hipDeviceDisablePeerAccess(gpu1);
39 | hipSetDevice(gpu1);
40 | hipDeviceDisablePeerAccess(gpu0);
41 | printf("P2P enabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
42 | bandwidth, time_s);
43 | } else {
44 | printf("P2P disabled - Bandwith: %.3f (GB/s), Time: %.3f s\n",
45 | bandwidth, time_s);
46 | }
47 | }
48 |
49 |
50 | int main(int argc, char *argv[])
51 | {
52 | // Check that we have at least two GPUs
53 | int devcount;
54 | hipGetDeviceCount(&devcount);
55 | if(devcount < 2) {
56 | printf("Need at least two GPUs!\n");
57 | exit(EXIT_FAILURE);
58 | } else {
59 | printf("Found %d GPU devices, using GPUs 0 and 1!\n", devcount);
60 | }
61 |
62 | // Allocate memory for both GPUs
63 | int size = pow(2, 28);
64 | int gpu0 = 0, gpu1 = 1;
65 | int *dA_0, *dA_1;
66 | hipSetDevice(gpu0);
67 | hipMalloc((void**) &dA_0, size);
68 | hipSetDevice(gpu1);
69 | hipMalloc((void**) &dA_1, size);
70 |
71 | // Check peer accessibility between GPUs 0 and 1
72 | int peerAccess01;
73 | int peerAccess10;
74 | hipDeviceCanAccessPeer(&peerAccess01, gpu0, gpu1);
75 | hipDeviceCanAccessPeer(&peerAccess10, gpu1, gpu0);
76 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
77 | peerAccess01, gpu0, gpu1);
78 | printf("hipDeviceCanAccessPeer: %d (GPU %d to GPU %d)\n",
79 | peerAccess10, gpu1, gpu0);
80 |
81 | // Memcopy, P2P enabled
82 | if (peerAccess01 && peerAccess10)
83 | copyP2P(1, gpu0, gpu1, dA_0, dA_1, size);
84 |
85 | // Memcopy, P2P disabled
86 | copyP2P(0, gpu0, gpu1, dA_0, dA_1, size);
87 |
88 | // Deallocate device memory
89 | hipFree(dA_0);
90 | hipFree(dA_1);
91 | }
92 |
--------------------------------------------------------------------------------
/multi-gpu/02-vector-sum/README.md:
--------------------------------------------------------------------------------
1 | # Vector sum on two GPUs without MPI
2 |
3 | Calculate the vector sum of two vectors (C = A + B) using two GPUs.
4 |
5 | Decompose the vectors into equal halves, copy data from host to device memory
6 | and launch a GPU kernel on each part asynchronously using streams. Copy the
7 | results back to the host to check for correctness. Add timing events to
8 | measure the time of execution.
9 |
10 | A skeleton code is provided in [vector-sum.cpp](vector-sum.cpp). Your task is to fill the locations indicated by
11 |
12 | ```// TODO:```
13 |
14 | NOTE: Remember to request 2 GPUs when running this exercise. On Lumi, use
15 | ```
16 | srun --account=XXXXXX --partition=small-g -N1 -n1 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
17 | ```
18 | and on Mahti use
19 | ```
20 | srun --account=XXXXXX --partition=gputest -N1 -n1 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
21 | ```
22 |
--------------------------------------------------------------------------------
/multi-gpu/02-vector-sum/vector-sum.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | // Data structure for storing decomposition information
6 | struct Decomp {
7 | int len; // length of the array for the current device
8 | int start; // start index for the array on the current device
9 | };
10 |
11 |
12 | /* HIP kernel for the addition of two vectors, i.e. C = A + B */
13 | __global__ void vector_add(double *C, const double *A, const double *B, int N)
14 | {
15 | int idx = blockIdx.x * blockDim.x + threadIdx.x;
16 |
17 | // Do not try to access past the allocated memory
18 | if (idx < N) {
19 | C[idx] = A[idx] + B[idx];
20 | }
21 | }
22 |
23 |
24 | int main(int argc, char *argv[])
25 | {
26 | const int ThreadsInBlock = 128;
27 | double *dA[2], *dB[2], *dC[2];
28 | double *hA, *hB, *hC;
29 | int devicecount;
30 | int N = 100;
31 | hipEvent_t start, stop;
32 | hipStream_t strm[2];
33 | Decomp dec[2];
34 |
35 | // TODO: Check that we have two HIP devices available
36 |
37 | // Create timing events
38 | hipSetDevice(0);
39 | hipEventCreate(&start);
40 | hipEventCreate(&stop);
41 |
42 | // Allocate host memory
43 | // TODO: Allocate enough pinned host memory for hA, hB, and hC
44 | // to store N doubles each
45 |
46 | // Initialize host memory
47 | for(int i = 0; i < N; ++i) {
48 | hA[i] = 1.0;
49 | hB[i] = 2.0;
50 | }
51 |
52 | // Decomposition of data for each stream
53 | dec[0].len = N / 2;
54 | dec[0].start = 0;
55 | dec[1].len = N - N / 2;
56 | dec[1].start = dec[0].len;
57 |
58 | // Allocate memory for the devices and per device streams
59 | for (int i = 0; i < 2; ++i) {
60 | // TODO: Allocate enough device memory for dA[i], dB[i], dC[i]
61 | // to store dec[i].len doubles
62 | // TODO: Create a stream for each device
63 | }
64 |
65 | // Start timing
66 | hipSetDevice(0);
67 | hipEventRecord(start);
68 |
69 | /* Copy each decomposed part of the vectors from host to device memory
70 | and execute a kernel for each part.
71 | Note: one needs to use streams and asynchronous calls! Without this
72 | the execution is serialized because the memory copies block the
73 | execution of the host process. */
74 | for (int i = 0; i < 2; ++i) {
75 | // TODO: Set active device
76 | // TODO: Copy data from host to device asynchronously (hA[dec[i].start] -> dA[i], hB[dec[i].start] -> dB[i])
77 | // TODO: Launch 'vector_add()' kernel to calculate dC = dA + dB
78 | // TODO: Copy data from device to host (dC[i] -> hC[dec[0].start])
79 | }
80 |
81 | // Synchronize and destroy the streams
82 | for (int i = 0; i < 2; ++i) {
83 | // TODO: Add synchronization calls and destroy streams
84 | }
85 |
86 | // Stop timing
87 | // TODO: Add here the timing event stop calls
88 |
89 | // Free device memory
90 | for (int i = 0; i < 2; ++i) {
91 | // TODO: Deallocate device memory
92 | }
93 |
94 | // Check results
95 | int errorsum = 0;
96 | for (int i = 0; i < N; i++) {
97 | errorsum += hC[i] - 3.0;
98 | }
99 | printf("Error sum = %i\n", errorsum);
100 |
101 | // Calculate the elapsed time
102 | float gputime;
103 | hipSetDevice(0);
104 | hipEventElapsedTime(&gputime, start, stop);
105 | printf("Time elapsed: %f\n", gputime / 1000.);
106 |
107 | // Deallocate host memory
108 | hipHostFree((void*)hA);
109 | hipHostFree((void*)hB);
110 | hipHostFree((void*)hC);
111 |
112 | return 0;
113 | }
114 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/Makefile:
--------------------------------------------------------------------------------
1 | HIPCC = hipcc
2 | MPICXX = mpicxx
3 | MPICXXFLAGS = -g -O2 -w
4 |
5 | # Puhti
6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70'
7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib
8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib
9 | LIBS = -lmpi
10 |
11 | ping-pong: ping-pong.o
12 | $(HIPCC) $(LDFLAGS) -o $@ $< $(LIBS)
13 |
14 | %.o: %.cpp
15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $<
16 |
17 | .PHONY: clean
18 | clean:
19 | rm -f *.o ping-pong
20 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/README.md:
--------------------------------------------------------------------------------
1 | # Ping-pong with multiple GPUs and MPI
2 |
3 | Implement a simple ping-pong test for GPU-to-GPU communication using:
4 | a) indirect communication via the host, and b) direct communication with
5 | HIP-aware MPI.
6 |
7 | The ping-pong test consists of the following steps:
8 | 1. Send a vector from one GPU to another
9 | 2. The receiving GPU should increment all elements of the vector by one
10 | 3. Send the vector back to the original GPU
11 |
12 | For reference, there is also a CPU-to-CPU implementation in the skeleton
13 | code ([ping-pong.cpp](ping-pong.cpp)). Timing of all tests is also included to
14 | compare the execution times.
15 |
16 | On **Lumi**, one can compile the MPI example simply using the Cray compiler with
17 | ```
18 | CC -xhip ping-pong.cpp
19 | ```
20 |
21 | On LUMI, enable GPU-aware MPI at runtime (and when compiling) by executing:
22 | ```
23 | export MPICH_GPU_SUPPORT_ENABLED=1
24 | ```
25 | For running, one should use two GPUs and two MPI processes:
26 |
27 | ```
28 | srun --account=XXXXXX --partition=small-g -N1 --ntasks-per-node=2 --cpus-per-task=1 --gpus-per-node=2 --time=00:15:00 ./a.out # The reservation is for small-g partition
29 | ```
30 |
31 |
32 | On **Mahti**, to compile, just load the required modules and type `make`. A gpu-aware MPI is
33 | available with:
34 | ```
35 | ml openmpi/4.1.4-cuda
36 | ```
37 | For running, one should use two GPUs and two MPI processes:
38 | ```
39 | srun --account=XXXXXX --partition=gputest -N1 -n2 --cpus-per-task=1 --gres=gpu:v100:2 --time=00:15:00 ./a.out
40 | ```
41 |
--------------------------------------------------------------------------------
/multi-gpu/03-mpi/solution/Makefile:
--------------------------------------------------------------------------------
1 | HIPCC = hipcc
2 | MPICXX = mpicxx
3 | MPICXXFLAGS = -g -O2 -w
4 |
5 | # Puhti
6 | MPICXXENV = OMPI_CXXFLAGS='' OMPI_CXX='$(HIPCC) --x cu --gpu-architecture=sm_70'
7 | # LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-w2aekq/lib
8 | LDFLAGS = -L/appl/spack/v018/install-tree/gcc-11.3.0/openmpi-4.1.4-gkv6dx/lib
9 | LIBS = -lmpi
10 |
11 | ping-pong: ping-pong.o
12 | $(HIPCC) $(LDFLAGS) -o $@ $< $(LIBS)
13 |
14 | %.o: %.cpp
15 | $(MPICXXENV) $(MPICXX) $(MPICXXFLAGS) -c -o $@ $<
16 |
17 | .PHONY: clean
18 | clean:
19 | rm -f *.o ping-pong
20 |
--------------------------------------------------------------------------------
/optimization/01-coalescing/README.md:
--------------------------------------------------------------------------------
1 | # Performance counters and coalesced memory access
2 |
3 | ## Background and rocprof
4 |
5 | `rocprof` can collect performance metric counters (`pmc`) of gpu kernels:
6 | ```bash
7 | > rocprof -i metrics.txt -o metrics.csv ./copy
8 | ```
9 |
10 | The counters to be collected are listed in the `metrics.txt` file and they are
11 | written to the `metrics.csv` file. For example, if the file `metrics.txt` is
12 |
13 | ```
14 | pmc: VALUBusy, TCP_TCC_READ_REQ_sum
15 | pmc: TCC_EA_RDREQ_sum
16 | ```
17 | then `rocprof` will collect the derived metrics of how busy the vector
18 | arithmetic logic units (VALU) are, how many L2 read requests are issued
19 | (TCP_TCC_READ_REQ_sum), and how many global device memory read requests are
20 | issued (TCC_EA_RDREQ_sum).
21 |
22 | Here `TCP_TCC` refers to how many read requests the L1 (TCP) cache controller
23 | issues to the L2 cache (TCC), and `TCC_EA` refers to how many reads the L2 cache
24 | controller issues to the interconnect (`EA`).
25 |
26 | The options `--list-derived` and `--list-basic` will list the available derived
27 | and basic counters.
28 |
29 | *Note*: `rocprof --list-derived` and `rocprof --list-basic` must be
30 | executed on a node with a GPU present because they query the available
31 | counters from the hardware itself.
32 |
33 | An MI250x GCD has 8 MiB of L2 memory shared across the CUs and each CU has 16
34 | kiB of L1 memory.
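
To make the connection between stride and the counters concrete, a strided copy can be pictured with a small kernel like the sketch below; this is illustrative only, and the names and indexing are assumptions here, not the actual code in `copy.cpp`:

```cpp
#include <hip/hip_runtime.h>

// Illustrative strided copy: thread i touches element i*stride.
// With stride == 1 the accesses of a wavefront fall into the same cache lines
// (coalesced); with a large stride every access hits a different cache line,
// so the L1->L2 and L2->memory read requests grow while the useful data stays the same.
__global__ void strided_copy(const float *in, float *out, size_t n, int stride)
{
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t idx = i * stride;
    if (idx < n)
        out[idx] = in[idx];
}
```

Running `rocprof` with the `metrics.txt` above for a small and a large stride should show `TCC_EA_RDREQ_sum` growing with the stride even though the number of copied elements stays the same.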
35 |
36 | ## Exercise
37 |
38 | The code `copy.cpp` reads and writes a memory array of 4096*4096 float32
39 | entries with various strides (`(1<<stride)`).
--------------------------------------------------------------------------------
/optimization/01-coalescing/copy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | #define LOG2SIZE 12
7 | const static int width = 1<<LOG2SIZE;
24 | std::vector<float> matrix_in;
25 | std::vector<float> matrix_out;
26 |
27 | matrix_in.resize(width * height);
28 | matrix_out.resize(width * height);
29 |
30 | for (int i = 0; i < width * height; i++) {
31 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
32 | }
33 |
34 | float *d_in;
35 | float *d_out;
36 |
37 | hipMalloc((void **)&d_in, (width * height) * sizeof(float));
38 | hipMalloc((void **)&d_out, (width * height) * sizeof(float));
39 |
40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
41 | hipMemcpyHostToDevice);
42 |
43 | printf("Setup complete. Launching kernel \n");
44 | int block_x = width / tile_dim_x;
45 | int block_y = height / tile_dim_y;
46 |
47 |
48 | // Create events
49 |
50 | /* printf("Warm up the gpu!\n"); */
51 | /* for(int i=1;i<=10;i++){ */
52 | /* hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y), */
53 | /* dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width, */
54 | /* height);} */
55 |
56 |
57 |
58 | for(int i=1;i<=21;i++){
59 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
60 | dim3(tile_dim_x, tile_dim_y), 0, 0, d_in, d_out, width,
61 | height, (1<<i));}  // stride argument assumed
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/copy.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void copy_kernel(float *in, float *out, int width, int height) {
11 | int x_index = blockIdx.x * tile_dim + threadIdx.x;
12 | int y_index = blockIdx.y * tile_dim + threadIdx.y;
13 |
14 | int index = y_index * width + x_index;
15 |
16 | out[index] = in[index];
17 | }
18 |
19 |
20 |
21 | int main() {
22 | std::vector<float> matrix_in;
23 | std::vector<float> matrix_out;
24 |
25 | matrix_in.resize(width * height);
26 | matrix_out.resize(width * height);
27 |
28 | for (int i = 0; i < width * height; i++) {
29 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
30 | }
31 |
32 |
33 |
34 | float *d_in;
35 | float *d_out;
36 |
37 | hipMalloc((void **)&d_in, width * height * sizeof(float));
38 | hipMalloc((void **)&d_out, width * height * sizeof(float));
39 |
40 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
41 | hipMemcpyHostToDevice);
42 |
43 | printf("Setup complete. Launching kernel \n");
44 | int block_x = width / tile_dim;
45 | int block_y = height / tile_dim;
46 |
47 |
48 | // Create events
49 | hipEvent_t start_kernel_event;
50 | hipEventCreate(&start_kernel_event);
51 | hipEvent_t end_kernel_event;
52 | hipEventCreate(&end_kernel_event);
53 |
54 | printf("Warm up the gpu!\n");
55 | for(int i=1;i<=10;i++){
56 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
57 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
58 | height);}
59 |
60 | hipEventRecord(start_kernel_event, 0);
61 |
62 |
63 | for(int i=1;i<=10;i++){
64 | hipLaunchKernelGGL(copy_kernel, dim3(block_x, block_y),
65 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
66 | height);}
67 |
68 | hipEventRecord(end_kernel_event, 0);
69 | hipEventSynchronize(end_kernel_event);
70 |
71 | hipDeviceSynchronize();
72 | float time_kernel;
73 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
74 |
75 | printf("Kernel execution complete \n");
76 | printf("Event timings:\n");
77 | printf(" %.6f ms - copy \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
78 |
79 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
80 | hipMemcpyDeviceToHost);
81 |
82 |
83 | return 0;
84 | }
85 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_naive.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_naive_kernel(float *in, float *out, int width, int height) {
11 | int x_index = blockIdx.x * tile_dim + threadIdx.x;
12 | int y_index = blockIdx.y * tile_dim + threadIdx.y;
13 |
14 | int in_index = y_index * width + x_index;
15 | int out_index = x_index * height + y_index;
16 |
17 | out[out_index] = in[in_index];
18 | }
19 |
20 |
21 |
22 | int main() {
23 | std::vector<float> matrix_in;
24 | std::vector<float> matrix_out;
25 |
26 | matrix_in.resize(width * height);
27 | matrix_out.resize(width * height);
28 |
29 | for (int i = 0; i < width * height; i++) {
30 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
31 | }
32 |
33 |
34 |
35 | float *d_in;
36 | float *d_out;
37 |
38 | hipMalloc((void **)&d_in, width * height * sizeof(float));
39 | hipMalloc((void **)&d_out, width * height * sizeof(float));
40 |
41 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
42 | hipMemcpyHostToDevice);
43 |
44 | printf("Setup complete. Launching kernel \n");
45 | int block_x = width / tile_dim;
46 | int block_y = height / tile_dim;
47 |
48 |
49 |
50 | // Create events
51 | hipEvent_t start_kernel_event;
52 | hipEventCreate(&start_kernel_event);
53 | hipEvent_t end_kernel_event;
54 | hipEventCreate(&end_kernel_event);
55 |
56 | printf("Warm up the gpu!\n");
57 |
58 |
59 | for(int i=1;i<=10;i++){
60 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y),
61 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
62 | height);}
63 |
64 |
65 | hipEventRecord(start_kernel_event, 0);
66 | for(int i=1;i<=10;i++){
67 | hipLaunchKernelGGL(transpose_naive_kernel, dim3(block_x, block_y),
68 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
69 | height);}
70 |
71 | hipEventRecord(end_kernel_event, 0);
72 | hipEventSynchronize(end_kernel_event);
73 |
74 | float time_kernel;
75 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
76 |
77 | printf("Kernel execution complete \n");
78 | printf("Event timings:\n");
79 | printf(" %.6f ms - naive transpose \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
80 |
81 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
82 | hipMemcpyDeviceToHost);
83 |
84 |
85 | return 0;
86 | }
87 |
88 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_with_SM.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_SM_kernel(float *in, float *out, int width,
11 | int height) {
12 | __shared__ float tile[tile_dim][tile_dim];
13 |
14 | int x_tile_index = blockIdx.x * tile_dim;
15 | int y_tile_index = blockIdx.y * tile_dim;
16 |
17 | int in_index =
18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x);
19 | int out_index =
20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x);
21 |
22 | tile[threadIdx.y][threadIdx.x] = in[in_index];
23 |
24 | __syncthreads();
25 |
26 | out[out_index] = tile[threadIdx.x][threadIdx.y];
27 | }
28 |
29 |
30 | int main() {
31 | std::vector<float> matrix_in;
32 | std::vector<float> matrix_out;
33 |
34 | matrix_in.resize(width * height);
35 | matrix_out.resize(width * height);
36 |
37 | for (int i = 0; i < width * height; i++) {
38 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
39 | }
40 |
41 |
42 |
43 | float *d_in;
44 | float *d_out;
45 |
46 | hipMalloc((void **)&d_in, width * height * sizeof(float));
47 | hipMalloc((void **)&d_out, width * height * sizeof(float));
48 |
49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
50 | hipMemcpyHostToDevice);
51 |
52 | printf("Setup complete. Launching kernel \n");
53 | int block_x = width / tile_dim;
54 | int block_y = height / tile_dim;
55 |
56 | // Create events
57 | hipEvent_t start_kernel_event;
58 | hipEventCreate(&start_kernel_event);
59 | hipEvent_t end_kernel_event;
60 | hipEventCreate(&end_kernel_event);
61 |
62 | printf("Warm up the gpu!\n");
63 |
64 |
65 | for(int i=1;i<=10;i++){
66 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y),
67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
68 | height);}
69 |
70 |
71 | hipEventRecord(start_kernel_event, 0);
72 |
73 | for(int i=1;i<=10;i++){
74 | hipLaunchKernelGGL(transpose_SM_kernel, dim3(block_x, block_y),
75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
76 | height);}
77 |
78 |
79 | hipEventRecord(end_kernel_event, 0);
80 | hipEventSynchronize(end_kernel_event);
81 |
82 | float time_kernel;
83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
84 |
85 | printf("Kernel execution complete \n");
86 | printf("Event timings:\n");
87 | printf(" %.6f ms - shared memory \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
88 |
89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
90 | hipMemcpyDeviceToHost);
91 |
92 | hipEventDestroy(start_kernel_event);
93 | hipEventDestroy(end_kernel_event);
94 |
95 | return 0;
96 | }
97 |
98 |
--------------------------------------------------------------------------------
/optimization/02-matrix_transpose/matrix_transpose_with_SM_nobc.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 |
3 | #include <cstdio>
4 | #include <vector>
5 |
6 | const static int width = 4096;
7 | const static int height = 4096;
8 | const static int tile_dim = 16;
9 |
10 | __global__ void transpose_SM_nobc_kernel(float *in, float *out, int width,
11 | int height) {
12 | __shared__ float tile[tile_dim][tile_dim+1]; // extra column of padding avoids shared memory bank conflicts
13 |
14 | int x_tile_index = blockIdx.x * tile_dim;
15 | int y_tile_index = blockIdx.y * tile_dim;
16 |
17 | int in_index =
18 | (y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x);
19 | int out_index =
20 | (x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x);
21 |
22 | tile[threadIdx.y][threadIdx.x] = in[in_index];
23 |
24 | __syncthreads();
25 |
26 | out[out_index] = tile[threadIdx.x][threadIdx.y];
27 | }
28 |
29 |
30 | int main() {
31 | std::vector<float> matrix_in;
32 | std::vector<float> matrix_out;
33 |
34 | matrix_in.resize(width * height);
35 | matrix_out.resize(width * height);
36 |
37 | for (int i = 0; i < width * height; i++) {
38 | matrix_in[i] = (float)rand() / (float)RAND_MAX;
39 | }
40 |
41 |
42 |
43 | float *d_in;
44 | float *d_out;
45 |
46 | hipMalloc((void **)&d_in, width * height * sizeof(float));
47 | hipMalloc((void **)&d_out, width * height * sizeof(float));
48 |
49 | hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
50 | hipMemcpyHostToDevice);
51 |
52 | printf("Setup complete. Launching kernel \n");
53 | int block_x = width / tile_dim;
54 | int block_y = height / tile_dim;
55 |
56 | // Create events
57 | hipEvent_t start_kernel_event;
58 | hipEventCreate(&start_kernel_event);
59 | hipEvent_t end_kernel_event;
60 | hipEventCreate(&end_kernel_event);
61 |
62 | printf("Warm up the gpu!\n");
63 |
64 |
65 | for(int i=1;i<=10;i++){
66 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
67 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
68 | height);}
69 |
70 |
71 | hipEventRecord(start_kernel_event, 0);
72 |
73 | for(int i=1;i<=10;i++){
74 | hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
75 | dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
76 | height);}
77 |
78 |
79 | hipEventRecord(end_kernel_event, 0);
80 | hipEventSynchronize(end_kernel_event);
81 |
82 | float time_kernel;
83 | hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
84 |
85 | printf("Kernel execution complete \n");
86 | printf("Event timings:\n");
87 | printf(" %.6f ms - shared memory with no bank conflicts \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
88 |
89 | hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
90 | hipMemcpyDeviceToHost);
91 |
92 | hipEventDestroy(start_kernel_event);
93 | hipEventDestroy(end_kernel_event);
94 |
95 | return 0;
96 | }
97 |
98 |
--------------------------------------------------------------------------------
/optimization/03-trace/README.md:
--------------------------------------------------------------------------------
1 | # Tracing with rocprof
2 |
3 | In this exercise your task is to trace execution of [streams/02-concurrency](../../streams/02-concurrency/solution/streams.cpp) exercise
4 | solution.
5 |
6 | Rocprof can be used to trace HIP API calls, among other things, with the option
7 |
8 | ```bash
9 | > rocprof --hip-trace
10 | ```
11 |
12 | It will output a file named `results.json`, which may be visualized for example
13 | with the Perfetto trace visualizer (https://ui.perfetto.dev/) or the chrome/chromium
14 | built-in visualizer (type `chrome://tracing/` in the URL field).
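
For example, assuming the concurrency solution was built into a binary called `streams` (the name here is just an assumption), the workflow could look like:

```bash
rocprof --hip-trace ./streams    # run on a GPU node; writes results.json in the working directory
# then open results.json at https://ui.perfetto.dev/ or via chrome://tracing/
```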
15 |
16 | ## Exercise
17 |
18 | - Trace the HIP API calls of the `streams.cpp` code and visualize the results.
19 | - Modify the `WORK` preprocessor macro to be so large that the kernel executions begin to
20 |   exceed the memory transfers.
21 | - Does the kernel execution order correspond to their stream numbering?
22 |
--------------------------------------------------------------------------------
/porting/README.md:
--------------------------------------------------------------------------------
1 | # Converting CUDA code to HIP
2 |
3 | The folder [codes](codes) contains a few examples of CUDA codes (vector addition, `saxpy` using a CUDA kernel, and `saxpy` using `cublas`). On Mahti or Puhti these codes will compile with the CUDA `nvcc` compiler and should run without issues.
4 |
5 | The tasks are to convert these codes to HIP. For shorter codes one can do a manual conversion, but for larger codes it is recommended to use the HIPIFY tools or to compile them with the [HOP](https://github.com/cschpc/hop) library.
6 |
7 | ## HIPIFY Tools
8 | 0. **Optional** Convert the codes to HIP manually. On Nvidia platforms the conversion can be done in an incremental way, because `hipcc` can compile mixed CUDA and HIP code. On AMD platforms `hipcc` cannot compile CUDA code; the whole code needs to be converted before it can be compiled.
9 | 1. Convert the codes using HIPIFY tools.
10 |
11 | A. Examine the code. Both `hipify-perl` and `hipify-clang` support the `--examine` option. Alternatively one can use the `hipexamine[.|-perl.]sh` scripts, which will scan whole directories. This procedure will not change the sources; it will just determine which files contain CUDA code and how much of the code can be converted automatically.
12 |
13 | B. Convert individual files with `hipify-[perl|clang] --inplace --print-stats`, or whole folders using the scripts `hipconvertinplace[.|-perl.]sh` (see the example below).
14 |
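For instance, converting the vector addition example in place might look like this (the relative path is just an example and depends on where the command is run):

```
hipify-perl --inplace --print-stats codes/Vector_Addition/cuda/vecadd.cu
```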
15 |
16 | **Note** that `hipify-clang` requires the CUDA toolkit. On LUMI this is available via a container.
17 | The image can be created using:
18 |
19 | ```
20 | singularity pull docker://nvcr.io/nvidia/cuda:11.4.3-devel-ubuntu20.04
21 | ```
22 | This step was already done; the image's path is `/projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif`.
23 | Then load all the modules necessary to compile HIP codes on LUMI.
24 | ```
25 | module load LUMI/24.03
26 | module load partition/G
27 | module load rocm
28 | ```
29 | Finally, open a shell in the container that has access to the working directory and the `rocm` installation:
30 | ```
31 | singularity shell -B $PWD,/opt:/opt /projappl/project_462000877/apps/cuda_11.4.3-devel-ubuntu20.04.sif
32 | export PATH=$ROCM_PATH/bin:$PATH
33 | ```
34 |
35 | The CUDA code can be converted now using:
36 | ```
37 | hipify-clang .cu --inplace --print-stats --cuda-path=/usr/local/cuda-11.4 -I /usr/local/cuda-11.4/include
38 | ```
39 | This command works as well on Nvidia platforms with HIP installed.
40 |
41 |
42 | 2. Compile CUDA codes on AMD platforms using `hipcc` + HOP, and compile HIP codes on Nvidia platforms using `nvcc` + HOP.
43 |
44 | First you need to clone the HOP repository into your working folder on scratch:
45 | ```
46 | git clone https://github.com/cschpc/hop.git
47 | ```
48 |
49 | **CUDA** ⇒ **HIP** on LUMI
50 | ```
51 | export HOP_ROOT=/path/to/hop
52 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/cuda -DHOP_TARGET_HIP"
53 | CC -x hip $HOP_FLAGS hello.cu -o hello
54 | ./hello
55 | ```
56 | **HIP** ⇒ **CUDA** on Mahti or Puhti
57 | ```
58 | export HOP_ROOT=/path/to/hop
59 | export HOP_FLAGS="-I$HOP_ROOT -I$HOP_ROOT/source/hip -DHOP_TARGET_CUDA"
60 | CC -x cu $HOP_FLAGS hello.cpp -o hello
61 | ./hello
62 | ```
63 |
64 |
--------------------------------------------------------------------------------
/porting/codes/README.md:
--------------------------------------------------------------------------------
1 | # Directory with source codes for hands-on
2 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/Readme.md:
--------------------------------------------------------------------------------
1 | # Vector addition
2 |
3 | This is a simple vector addition example for the CUDA to HIP conversion exercise. The code executes `C[i]=A[i]+B[i]` for `i=1,...,N`.
4 |
5 | Compile the CUDA code: `nvcc -arch=sm_70 vecadd.cu -o vecadd`
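
After converting the code to HIP, it can be compiled, for example, with `hipcc`; the flags below are only suggestions (the NVIDIA variant mirrors the flags used in the course Makefiles):

```
hipcc vecadd.cu -o vecadd                                   # AMD GPUs (e.g. LUMI)
hipcc --x cu --gpu-architecture=sm_70 vecadd.cu -o vecadd   # NVIDIA V100 (e.g. Puhti)
```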
6 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/cuda/Readme.md:
--------------------------------------------------------------------------------
1 | # Vector addition
2 |
3 | This is a simple vector addition example for the CUDA to HIP conversion exercise. The code executes `C[i]=A[i]+B[i]` for `i=1,...,N`.
4 |
5 | Compile: `nvcc -arch=sm_70 vecadd.cu -o vecadd`
6 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/cuda/vecadd.cu:
--------------------------------------------------------------------------------
1 | /*
2 | nvcc vecadd.cu
3 | */
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <math.h>
7 | #include <time.h>
8 | #include <cuda_runtime.h>
9 |
10 | __global__ void vecAdd(int *A,int *B,int *C,int N)
11 | {
12 | int i = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(i<N) C[i] = A[i] + B[i];
14 | }
55 | vecAdd<<<(n+255)/256, 256>>>(a_d,b_d,c_d,n); // launch configuration assumed
56 | cudaDeviceSynchronize();
57 | clock_t end_d = clock();
58 | clock_t start_h = clock();
59 | printf("Doing CPU Vector add\n");
60 | vecAdd_h(a,b,c2,n);
61 | clock_t end_h = clock();
62 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
63 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
64 | cudaMemcpy(c,c_d,nBytes,cudaMemcpyDeviceToHost);
65 | printf("%d %f %f\n",n,time_d,time_h);
66 |
67 | for(int i=0; i1.0e-5)
70 | printf("Error at position %d.\n", i );
71 | }
72 | cudaFree(a_d);
73 | cudaFree(b_d);
74 | cudaFree(c_d);
75 | free(c2);
76 | free(c);
77 | free(a);
78 | free(b);
79 | return 0;
80 | }
81 |
--------------------------------------------------------------------------------
/porting/codes/Vector_Addition/hip_solution/vecadd.cu:
--------------------------------------------------------------------------------
1 | #include "hip/hip_runtime.h"
2 | /*
3 | nvcc vecadd.cu
4 | */
5 | #include <stdio.h>
6 | #include <stdlib.h>
7 | #include <math.h>
8 | #include <time.h>
9 | #include <hip/hip_runtime.h>
10 |
11 | __global__ void vecAdd(int *A,int *B,int *C,int N)
12 | {
13 | int i = blockIdx.x * blockDim.x + threadIdx.x;
14 | if(i<N) C[i] = A[i] + B[i];
15 | }
56 | vecAdd<<<(n+255)/256, 256>>>(a_d,b_d,c_d,n); // launch configuration assumed
57 | hipDeviceSynchronize();
58 | clock_t end_d = clock();
59 | clock_t start_h = clock();
60 | printf("Doing CPU Vector add\n");
61 | vecAdd_h(a,b,c2,n);
62 | clock_t end_h = clock();
63 | double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;
64 | double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;
65 | hipMemcpy(c,c_d,nBytes,hipMemcpyDeviceToHost);
66 | printf("%d %f %f\n",n,time_d,time_h);
67 |
68 | for(int i=0; i<n; i++){
70 | if(abs(c[i]-c2[i])>1.0e-5)
71 | printf("Error at position %d.\n", i );
72 | }
73 | hipFree(a_d);
74 | hipFree(b_d);
75 | hipFree(c_d);
76 | free(c2);
77 | free(c);
78 | free(a);
79 | free(b);
80 | return 0;
81 | }
82 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cublas/Makefile:
--------------------------------------------------------------------------------
1 | #===============================================================================
2 | # User Options
3 | #===============================================================================
4 | #
5 | # Compiler can be set below, or via environment variable
6 | CC = nvcc
7 | OPTIMIZE = yes
8 | #
9 | #===============================================================================
10 | # Program name & source code list
11 | #===============================================================================
12 | program = saxpy_cublas
13 | source = saxpy_cublas.cu
14 | obj = $(source:.cu=.o)
15 | #===============================================================================
16 | # Sets Flags
17 | #===============================================================================
18 | # Standard Flags
19 | CFLAGS := -Xcompiler -Wall
20 | # Linker Flags
21 | LDFLAGS = -lcublas
22 | # Optimization Flags
23 | ifeq ($(OPTIMIZE),yes)
24 | CFLAGS += -O3
25 | endif
26 |
27 | #===============================================================================
28 | # Targets to Build
29 | #===============================================================================
30 | #
31 | $(program): $(obj) Makefile
32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
33 |
34 | %.o: %.cu Makefile
35 | $(CC) $(CFLAGS) -c $< -o $@
36 |
37 | clean:
38 | rm -rf $(program) $(obj)
39 |
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cublas/saxpy_cublas.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include "cublas_v2.h"
3 | using namespace std;
4 |
5 | const int N = 1 << 30;
6 |
7 | int main(){
8 | float *a_h, *b_h;
9 | a_h = new float[N];
10 | b_h = new float[N];
11 | float *a_d, *b_d;
12 | for(int i = 0; i < N; i++){
13 | a_h[i] = 1.0f;
14 | b_h[i] = 2.0f ;
15 | }
16 | cublasHandle_t handle;
17 | cublasCreate(&handle);
18 | cudaMalloc((void**) &a_d, sizeof(float) * N);
19 | cudaMalloc((void**) &b_d, sizeof(float) * N);
20 | cublasSetVector( N, sizeof(float), a_h, 1, a_d, 1);
21 | cublasSetVector( N, sizeof(float), b_h, 1, b_d, 1);
22 | const float s = 2.0f;
23 | cublasSaxpy( handle, N, &s, a_d, 1, b_d, 1);
24 | cublasGetVector( N, sizeof(float), b_d, 1, b_h, 1);
25 | cudaFree(a_d);
26 | cudaFree(b_d);
27 | cublasDestroy(handle);
28 | float maxError = 0.0f;
29 |
30 | for(int i = 0; i < N; i++)
31 | maxError = fmax(maxError, abs(b_h[i]-4.0f));
32 |
33 | cout << "Max error: " << maxError << endl;
34 |
35 |
36 | delete[] a_h;
37 | delete[] b_h;
38 | return 0;
39 | }
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/cuda/saxpy.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__
4 | void saxpy(int n, float a, float *x, float *y)
5 | {
6 | int i = blockIdx.x*blockDim.x + threadIdx.x;
7 | if (i < n) y[i] = a*x[i] + y[i];
8 | }
9 |
10 | int main(void)
11 | {
12 | int N = 1<<30;
13 | float *x, *y, *d_x, *d_y;
14 | x = (float*)malloc(N*sizeof(float));
15 | y = (float*)malloc(N*sizeof(float));
16 |
17 | cudaMalloc(&d_x, N*sizeof(float));
18 | cudaMalloc(&d_y, N*sizeof(float));
19 |
20 | for (int i = 0; i < N; i++) {
21 | x[i] = 1.0f;
22 | y[i] = 2.0f;
23 | }
24 |
25 | cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
26 | cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
27 |
28 | // Perform SAXPY on all N elements
29 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
30 |
31 | cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
32 |
33 | float maxError = 0.0f;
34 | for (int i = 0; i < N; i++)
35 | maxError = fmax(maxError, abs(y[i]-4.0f));
36 | printf("Max error: %f\n", maxError);
37 |
38 | cudaFree(d_x);
39 | cudaFree(d_y);
40 | free(x);
41 | free(y);
42 | }
43 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hip/README.md:
--------------------------------------------------------------------------------
1 | # Copy the files from the CUDA folder and hipify the example here.
2 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hip_solution/saxpy.cu:
--------------------------------------------------------------------------------
1 | #include "hip/hip_runtime.h"
2 | #include <stdio.h>
3 |
4 | __global__
5 | void saxpy(int n, float a, float *x, float *y)
6 | {
7 | int i = blockIdx.x*blockDim.x + threadIdx.x;
8 | if (i < n) y[i] = a*x[i] + y[i];
9 | }
10 |
11 | int main(void)
12 | {
13 | int N = 1<<30;
14 | float *x, *y, *d_x, *d_y;
15 | x = (float*)malloc(N*sizeof(float));
16 | y = (float*)malloc(N*sizeof(float));
17 |
18 | hipMalloc(&d_x, N*sizeof(float));
19 | hipMalloc(&d_y, N*sizeof(float));
20 |
21 | for (int i = 0; i < N; i++) {
22 | x[i] = 1.0f;
23 | y[i] = 2.0f;
24 | }
25 |
26 | hipMemcpy(d_x, x, N*sizeof(float), hipMemcpyHostToDevice);
27 | hipMemcpy(d_y, y, N*sizeof(float), hipMemcpyHostToDevice);
28 |
29 | // Perform SAXPY on all N elements
30 | saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
31 |
32 | hipMemcpy(y, d_y, N*sizeof(float), hipMemcpyDeviceToHost);
33 |
34 | float maxError = 0.0f;
35 | for (int i = 0; i < N; i++)
36 | maxError = fmax(maxError, abs(y[i]-4.0f));
37 | printf("Max error: %f\n", maxError);
38 |
39 | hipFree(d_x);
40 | hipFree(d_y);
41 | free(x);
42 | free(y);
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Copy the data from cuBLAS here and HIPIFY the example.
3 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas_solution/Makefile:
--------------------------------------------------------------------------------
1 | #===============================================================================
2 | # User Options
3 | #===============================================================================
4 | #
5 | # Compiler can be set below, or via environment variable
6 | CC = hipcc
7 | OPTIMIZE = yes
8 | #
9 | #===============================================================================
10 | # Program name & source code list
11 | #===============================================================================
12 | program = saxpy_cublas
13 | source = saxpy_cublas.cu
14 | obj = $(source:.cu=.o)
15 | #===============================================================================
16 | # Sets Flags
17 | #===============================================================================
18 | # Standard Flags
19 | CFLAGS := -Xcompiler -Wall -I/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/include
20 | # Linker Flags
21 | LDFLAGS = -L/appl/opt/rocm/rocm-4.0.0c/hipblas/hipblas/lib/ -lhipblas
22 | # Optimization Flags
23 | ifeq ($(OPTIMIZE),yes)
24 | CFLAGS += -O3
25 | endif
26 |
27 | #===============================================================================
28 | # Targets to Build
29 | #===============================================================================
30 | #
31 | $(program): $(obj) Makefile
32 | $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
33 |
34 | %.o: %.cu Makefile
35 | $(CC) $(CFLAGS) -c $< -o $@
36 |
37 | clean:
38 | rm -rf $(program) $(obj) out* error*
39 |
40 |
--------------------------------------------------------------------------------
/porting/codes/saxpy/hipblas_solution/saxpy_cublas.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <hipblas.h>
3 | using namespace std;
4 |
5 | const int N = 1 << 30;
6 |
7 | int main(){
8 | float *a_h, *b_h;
9 | a_h = new float[N];
10 | b_h = new float[N];
11 | float *a_d, *b_d;
12 | for(int i = 0; i < N; i++){
13 | a_h[i] = 1.0f;
14 | b_h[i] = 2.0f ;
15 | }
16 | hipblasHandle_t handle;
17 | hipblasCreate(&handle);
18 | hipMalloc((void**) &a_d, sizeof(float) * N);
19 | hipMalloc((void**) &b_d, sizeof(float) * N);
20 | hipblasSetVector( N, sizeof(float), a_h, 1, a_d, 1);
21 | hipblasSetVector( N, sizeof(float), b_h, 1, b_d, 1);
22 | const float s = 2.0f;
23 | hipblasSaxpy( handle, N, &s, a_d, 1, b_d, 1);
24 | hipblasGetVector( N, sizeof(float), b_d, 1, b_h, 1);
25 | hipFree(a_d);
26 | hipFree(b_d);
27 | hipblasDestroy(handle);
28 | float maxError = 0.0f;
29 |
30 | for(int i = 0; i < N; i++)
31 | maxError = fmax(maxError, abs(b_h[i]-4.0f));
32 |
33 | cout << "Max error: " << maxError << endl;
34 |
35 |
36 | delete[] a_h;
37 | delete[] b_h;
38 | return 0;
39 | }
40 |
--------------------------------------------------------------------------------
/setup_env_lumi:
--------------------------------------------------------------------------------
1 | # Module environment
2 | ml PrgEnv-cray
3 | ml craype-accel-amd-gfx90a
4 | ml rocm/6.0.3
5 |
6 | # Environment variables for compiling
7 | export CXX=CC
8 | export CXXFLAGS='-xhip -O3'
9 |
10 | # Aliases for easy running
11 | alias runit='srun --reservation=HIPcourse --account=project_462000877 --partition=small-g --time=00:05:00 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gpus-per-task=1'
12 |
--------------------------------------------------------------------------------
/streams/01-event-record/README.md:
--------------------------------------------------------------------------------
1 | # Understanding asynchronicity using events
2 |
3 | The purpose of this exercise is to understand asynchronous operations and how they can be timed using HIP events. In the skeleton, the timing has been implemented using the `<chrono>` header and the `clock_t` type. This attempt to time asynchronous events, however, fails to measure the timings correctly. Your task is to implement the timings correctly using HIP events (you don't have to remove the `clock_t` timings, you can leave them in place to explore the difference). The locations where modifications are required are marked with `#error` together with an instruction. Basically, your task is to measure and print the timing of a GPU kernel, a device-to-host copy, and their combined time.
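
The general event-timing pattern looks roughly like the self-contained sketch below (illustrative only; the kernel and variable names are made up here, and the exercise asks you to apply this pattern inside `record.cpp`):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void fill(int *d_a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d_a[i] = i;
}

int main() {
    const int n = 1 << 20;
    int *d_a;
    hipMalloc((void**)&d_a, n * sizeof(int));

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    hipEventRecord(start, 0);                  // record on the default stream
    fill<<<(n + 255) / 256, 256>>>(d_a, n);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);                 // wait until the work before "stop" has finished

    float ms = 0.0f;
    hipEventElapsedTime(&ms, start, stop);     // elapsed GPU time in milliseconds
    printf("kernel: %.3f ms\n", ms);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    hipFree(d_a);
    return 0;
}
```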
4 |
--------------------------------------------------------------------------------
/streams/01-event-record/record.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <time.h>
4 | #include <chrono>
5 |
6 | #define get_mus(X) std::chrono::duration_cast<std::chrono::microseconds>(X).count()
7 | #define chrono_clock std::chrono::high_resolution_clock::now()
8 |
9 | /* A simple GPU kernel definition */
10 | __global__ void kernel(int *d_a, int n_total)
11 | {
12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(idx < n_total)
14 | d_a[idx] = idx;
15 | }
16 |
17 | /* The main function */
18 | int main(){
19 |
20 | // Problem size
21 | constexpr int n_total = 1<<22;
22 |
23 | // Device grid sizes
24 | constexpr int blocksize = 256;
25 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize;
26 |
27 | // Allocate host and device memory
28 | int *a, *d_a;
29 | const int bytes = n_total * sizeof(int);
30 | hipHostMalloc((void**)&a, bytes); // host pinned
31 | hipMalloc((void**)&d_a, bytes); // device pinned
32 |
33 | // Create events
34 | #error create the required timing events here
35 |
36 | // Create stream
37 | hipStream_t stream;
38 | hipStreamCreate(&stream);
39 |
40 | // Start timed GPU kernel and device-to-host copy
41 | #error record the events somewhere across the below lines of code
42 | #error such that you can get the timing for the kernel, the
43 | #error memory copy, and the total combined time of these
44 | auto start_kernel_clock = chrono_clock;
45 | kernel<<>>(d_a, n_total);
46 |
47 | auto start_d2h_clock = chrono_clock;
48 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
49 |
50 | auto stop_clock = chrono_clock;
51 | hipStreamSynchronize(stream);
52 |
53 | // Extract elapsed timings from event recordings
54 | #error get the elapsed time from the timing events
55 |
56 | // Check that the results are right
57 | int error = 0;
58 | for(int i = 0; i < n_total; ++i){
59 | if(a[i] != i)
60 | error = 1;
61 | }
62 |
63 | // Print results
64 | if(error)
65 | printf("Results are incorrect!\n");
66 | else
67 | printf("Results are correct!\n");
68 |
69 | // Print event timings
70 | printf("Event timings:\n");
71 | #error print event timings here
72 |
73 | // Print clock timings
74 | printf("clock_t timings:\n");
75 | printf(" %.3f ms - kernel\n", 1e-3 * (double)get_mus(start_d2h_clock - start_kernel_clock));
76 | printf(" %.3f ms - device to host copy\n", 1e-3 * (double)get_mus(stop_clock - start_d2h_clock));
77 | printf(" %.3f ms - total time\n", 1e-3 * (double)get_mus(stop_clock - start_kernel_clock));
78 |
79 | // Destroy Stream
80 | hipStreamDestroy(stream);
81 |
82 | // Destroy events
83 | #error destroy events here
84 |
85 | // Deallocations
86 | hipFree(d_a); // Device
87 | hipHostFree(a); // Host
88 | }
89 |
--------------------------------------------------------------------------------
/streams/01-event-record/solution/record.cpp:
--------------------------------------------------------------------------------
1 | #include <hip/hip_runtime.h>
2 | #include <stdio.h>
3 | #include <time.h>
4 | #include <chrono>
5 |
6 | #define get_mus(X) std::chrono::duration_cast<std::chrono::microseconds>(X).count()
7 | #define chrono_clock std::chrono::high_resolution_clock::now()
8 |
9 | /* A simple GPU kernel definition */
10 | __global__ void kernel(int *d_a, int n_total)
11 | {
12 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 | if(idx < n_total)
14 | d_a[idx] = idx;
15 | }
16 |
17 | /* The main function */
18 | int main(){
19 | // Problem size
20 | constexpr int n_total = 1<<22; // pow(2, 22);
21 |
22 | // Device grid sizes
23 | constexpr int blocksize = 256;
24 | constexpr int gridsize = (n_total - 1 + blocksize) / blocksize;
25 |
26 | // Allocate host and device memory
27 | int *a, *d_a;
28 | const int bytes = n_total * sizeof(int);
29 | hipHostMalloc((void**)&a, bytes); // host pinned
30 | hipMalloc((void**)&d_a, bytes); // device pinned
31 |
32 | hipEvent_t pre_kernel, post_kernel, end_event;
33 | // Create events
34 | hipEventCreate(&pre_kernel);
35 | hipEventCreate(&post_kernel);
36 | hipEventCreate(&end_event);
37 | float timing_a, timing_b, timing_c;
38 |
39 | // Create stream
40 | hipStream_t stream;
41 | hipStreamCreate(&stream);
42 |
43 | // Start timed GPU kernel and device-to-host copy
44 | hipEventRecord(pre_kernel, stream);
45 | auto start_time = chrono_clock;
46 |
47 | kernel<<>>(d_a, n_total);
48 |
49 | // Record event after kernel execution
50 | hipEventRecord(post_kernel, stream);
51 | auto d2h_time = chrono_clock;
52 |
53 | hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
54 |
55 | // Record event after D2H memory copy
56 | hipEventRecord(end_event, stream);
57 | auto end_time = chrono_clock;
58 |
59 | hipStreamSynchronize(stream);
60 |
61 | // Extract elapsed timings from event recordings
62 | hipEventElapsedTime(&timing_a, pre_kernel, post_kernel);
63 | hipEventElapsedTime(&timing_b, post_kernel, end_event);
64 | hipEventElapsedTime(&timing_c, pre_kernel, end_event);
65 |
66 | // Check that the results are right
67 | int error = 0;
68 | for(int i = 0; i < n_total; ++i){
69 | if(a[i] != i)
70 | error = 1;
71 | }
72 |
73 | // Print results
74 | if(error)
75 | printf("Results are incorrect!\n");
76 | else
77 | printf("Results are correct!\n");
78 |
79 | // Print event timings
80 | printf("Event timings:\n");
81 | printf(" %.3f ms - kernel\n", (timing_a) );
82 | printf(" %.3f ms - D2H copy\n", (timing_b) );
83 | printf(" %.3f ms - total time\n", (timing_c) );
84 | /* #error print event timings here */
85 |
86 | // Print clock timings
87 | printf("std::chrono timings:\n");
88 | printf(" %.3f ms - kernel\n", 1e-3 * (double)get_mus(d2h_time - start_time));
89 | printf(" %.3f ms - device to host copy\n", 1e-3 * (double)get_mus(end_time - d2h_time));
90 | printf(" %.3f ms - total time\n", 1e-3 * (double)get_mus(end_time - start_time));
91 |
92 | // Destroy Stream
93 | hipStreamDestroy(stream);
94 |
95 | // Destroy events
96 | /* #error destroy events here */
97 | hipEventDestroy(pre_kernel);
98 | hipEventDestroy(post_kernel);
99 | hipEventDestroy(end_event);
100 |
101 | // Deallocations
102 | hipFree(d_a); // Device
103 | hipHostFree(a); // Host
104 | }
105 |
--------------------------------------------------------------------------------
/streams/02-concurrency/README.md:
--------------------------------------------------------------------------------
1 | # Investigating streams and events
2 |
3 | This exercise demonstrates an asynchronous data transfer and computation. Three different asynchronous cases are created, and their timings are printed out. The timings are recorded with hipEvent calls.
4 |
5 | ## Instructions
6 |
7 | In the exercise, the following HIP functions are needed (a minimal usage sketch follows the list):
8 |
9 | * `hipStreamCreate()`
10 | * `hipMemcpyAsync()`
11 | * `hipEventRecord()`
12 | * `hipEventSynchronize()`
13 | * `hipEventElapsedTime()`
14 | * `hipStreamDestroy()`
15 |
16 | ### Case 0
17 |
18 | 1) Create and destroy `n_stream` streams in the main function in the locations marked by `#error`
19 | 2) The function `case_0()` is already complete and can be used as a reference
20 |
21 | ### Case 1
22 |
23 | 1) In the `case_1()` function, create a loop over `n_stream` and split the work done by the kernel call of Case 0 into multiple kernel calls (one kernel call per stream with an even workload per stream)
24 | 2) Record events using `start_event` and `stop_event` arrays for each stream before and after the kernel call
25 |
26 | ### Case 2
27 |
28 | 1) Create a loop in the function `case_2()`
29 |    1) In the loop: Split the data copy from host to device into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly between the streams)
30 |    2) In the loop: Launch the kernel for each stream similarly to Case 1
31 |    3) In the loop: Split the data copy from device to host into `n_stream` asynchronous memcopies, one for each stream (make sure the memcopies are split evenly between the streams)
32 | 2) Record total timing of the loop, use `start_event[n_stream]` and `stop_event[n_stream]` array positions
33 | 3) Additionally, record events for each stream using `start_event` and `stop_event` arrays before H-to-D memcopy and after D-to-H memcopy, respectively
34 | 4) Synchronize host with each `stop_event[i] `
35 | 5) Get timings between each corresponding `start_event[i]` and `stop_event[i]`
36 |
37 | ### Case 3
38 |
39 | 1) Copy the case 2 here
40 | 2) Instead of doing the asynchronous memcopies and the kernel in the same loop as in Case 2, create a separate loop for each (3 loops in total)
41 | 3) Make sure you record events in appropriate locations to get correct timings
42 |
43 | ## Additional considerations
44 |
45 | * You can try setting `USE_PINNED_HOST_MEM` to `0` at line `#6`, to see how the timings change if we do not use pinned host memory.
46 |
--------------------------------------------------------------------------------
/third-party/hipcub/hipcub.hpp:
--------------------------------------------------------------------------------
1 | #include <cub/cub.cuh>
2 |
3 | #define hipcub cub
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_hcc.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_HCC_H_
22 | #define HIPRAND_HCC_H_
23 |
24 | #include <rocrand.h>
25 |
26 | typedef rocrand_generator_base_type hiprandGenerator_st;
27 |
28 | typedef struct rocrand_discrete_distribution_st hiprandDiscreteDistribution_st;
29 |
30 | #endif // HIPRAND_HCC_H_
31 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_kernel.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_KERNEL_H_
22 | #define HIPRAND_KERNEL_H_
23 |
24 | #ifndef QUALIFIERS
25 | #define QUALIFIERS __forceinline__ __device__
26 | #endif // QUALIFIERS
27 |
28 | #include
29 | #include
30 |
31 | /** \addtogroup hipranddevice
32 | *
33 | * @{
34 | */
35 |
36 | /**
37 | * \def HIPRAND_PHILOX4x32_DEFAULT_SEED
38 | * \brief Default seed for PHILOX4x32 PRNG.
39 | */
40 | #define HIPRAND_PHILOX4x32_DEFAULT_SEED 0ULL
41 | /**
42 | * \def HIPRAND_XORWOW_DEFAULT_SEED
43 | * \brief Default seed for XORWOW PRNG.
44 | */
45 | #define HIPRAND_XORWOW_DEFAULT_SEED 0ULL
46 | /**
47 | * \def HIPRAND_MRG32K3A_DEFAULT_SEED
48 | * \brief Default seed for MRG32K3A PRNG.
49 | */
50 | #define HIPRAND_MRG32K3A_DEFAULT_SEED 12345ULL
51 | /** @} */ // end of group hipranddevice
52 |
53 | #if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
54 | #include "hiprand/hiprand_kernel_hcc.h"
55 | #else
56 | #include "hiprand/hiprand_kernel_nvcc.h"
57 | #endif
58 |
59 | #endif // HIPRAND_KERNEL_H_
60 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_nvcc.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_NVCC_H_
22 | #define HIPRAND_NVCC_H_
23 |
24 | #include <curand.h>
25 |
26 | typedef struct curandGenerator_st hiprandGenerator_st;
27 |
28 | typedef struct curandDiscreteDistribution_st hiprandDiscreteDistribution_st;
29 |
30 | #endif // HIPRAND_NVCC_H_
31 |
--------------------------------------------------------------------------------
/third-party/hiprand/hiprand_version.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | #ifndef HIPRAND_VERSION_H_
22 | #define HIPRAND_VERSION_H_
23 |
24 | /// \def HIPRAND_VERSION
25 | /// \brief hipRAND library version
26 | ///
27 | /// Version number may not be visible in the documentation.
28 | ///
29 | /// HIPRAND_VERSION % 100 is the patch level,
30 | /// HIPRAND_VERSION / 100 % 1000 is the minor version,
31 | /// HIPRAND_VERSION / 100000 is the major version.
32 | ///
33 | /// For example, if HIPRAND_VERSION is 100500, then
34 | /// the major version is 1, the minor version is 5, and
35 | /// the patch level is 0.
36 | #define HIPRAND_VERSION 100500
37 |
38 | #endif // HIPRAND_VERSION_H_
39 |
--------------------------------------------------------------------------------