├── .gitignore
├── Makefile
├── Presentation.pdf
├── README.md
├── bla.hpp
├── bla_lib.cu
├── bla_lib.hpp
├── main.cpp
├── matrix.hpp
├── memory.cpp
├── memory.hpp
├── timer.cpp
└── timer.hpp
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.mod
3 | *.modmic
4 | *.ptx
5 | *.i
6 | *.ii
7 | *.cudafe*
8 | *.fatbin*
9 | *.cubin
10 | *.module_id
11 | *.hash
12 | *.a
13 | *.so
14 | *.x
15 | *.log
16 | *.out
17 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BINARY_NAME = bla_test.x
2 |
3 | CXX_COMP = g++
4 | CXX_FLAGS_DEV = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES -g
5 | CXX_FLAGS_OPT = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES
6 | CXX_FLAGS = $(CXX_FLAGS_OPT)
7 | CXX_INC =
8 | CXX_LIB = -lstdc++
9 |
10 | CUDA_COMP = nvcc
11 | CUDA_HOST = /usr/bin/g++
12 | CUDA_ARCH = sm_35
13 | CUDA_INC = -I/usr/local/cuda/include
14 | CUDA_LIB = -L/usr/local/cuda/lib64 -lcublas -lcudart
15 | CUDA_FLAGS_DEV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES -g -G
16 | CUDA_FLAGS_OPT = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES
17 | CUDA_FLAGS_ADV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -lineinfo -Xcompiler -fPIC -D_FORCE_INLINES
18 | CUDA_FLAGS = $(CUDA_FLAGS_ADV)
19 |
20 | LINK_FLAGS = -fPIC
21 |
22 | OBJS = timer.o memory.o bla_lib.o main.o
23 |
24 | $(BINARY_NAME): $(OBJS)
25 | $(CXX_COMP) $(OBJS) $(LINK_FLAGS) $(CXX_LIB) $(CUDA_LIB) -o $(BINARY_NAME)
26 |
27 | timer.o: timer.cpp timer.hpp
28 | $(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) timer.cpp
29 |
30 | memory.o: memory.cpp memory.hpp
31 | $(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) memory.cpp
32 |
33 | bla_lib.o: bla_lib.cu bla_lib.hpp matrix.hpp memory.hpp timer.hpp
34 | $(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) --ptx --source-in-ptx bla_lib.cu -o bla_lib.ptx
35 | $(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) bla_lib.cu
36 |
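# Note: the first nvcc invocation in the bla_lib.o rule only emits source-annotated PTX
# (bla_lib.ptx) for inspection; the second invocation compiles the actual bla_lib.o object.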
37 | main.o: main.cpp bla.hpp matrix.hpp bla_lib.hpp memory.hpp
38 | $(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) main.cpp
39 |
40 |
41 | .PHONY: clean
42 | clean:
43 | rm -f *.out *.x *.a *.so *.o *.mod *.modmic *.ptx *.log
44 |
--------------------------------------------------------------------------------
/Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DmitryLyakh/CUDA_Tutorial/a7fc4021d8843c997c06fa5faf2a31a1431f2dca/Presentation.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | CUDA Tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | AUTHOR: Dmitry I. Lyakh (Liakh): quant4me@gmail.com, liakhdi@ornl.gov
4 |
5 | Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
6 | Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
7 |
8 | LICENSE: GNU Lesser General Public License v.3
9 |
10 | Persistent location:
11 | https://github.com/DmitryLyakh/CUDA_Tutorial.git
12 |
13 | Presentation from the Petascale Computing Institute 2019:
14 | Presentation.pdf
15 |
16 | YouTube video of this tutorial:
17 | https://youtu.be/Zqfa80APkDk
18 |
19 | BUILD:
20 | 1. Prerequisites: Linux, g++ 5+, CUDA 9+.
21 | 2. Update CUDA_INC and CUDA_LIB paths in the Makefile (if needed).
22 | 3. Adjust CUDA_ARCH in the Makefile to your GPU compute capability.
23 | 4. If your g++ compiler is too new for CUDA, provide an older one in CUDA_HOST.
24 | 5. make
25 |
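RUN:
./bla_test.x (the executable name is set by BINARY_NAME in the Makefile)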
--------------------------------------------------------------------------------
/bla.hpp:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #ifndef BLA_HPP_
22 | #define BLA_HPP_
23 |
24 | #include "matrix.hpp"
25 | #include "bla_lib.hpp"
26 |
27 | #endif //BLA_HPP_
28 |
--------------------------------------------------------------------------------
/bla_lib.cu:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #include "bla_lib.hpp"
22 |
23 | #include <cstdio>
24 | #include <cassert>
25 | #include <cmath>
26 | #include <algorithm>
27 | #include <iostream>
28 | #include <string>
29 | #include <complex>
30 | #include <cublas_v2.h>
31 | 
32 | namespace bla{
33 |
34 | //GPU device constants:
35 | __device__ __constant__ static float zero_fp32 = 0.0f;
36 | __device__ __constant__ static float unity_fp32 = 1.0f;
37 | __device__ __constant__ static double zero_fp64 = 0.0;
38 | __device__ __constant__ static double unity_fp64 = 1.0;
39 |
40 |
41 | //CUDA floating point data type selector:
42 | template <typename T> struct CudaFPData{};
43 | template <> struct CudaFPData<float>{
44 | using type = float;
45 | static constexpr cudaDataType_t kind = CUDA_R_32F;
46 | };
47 | template <> struct CudaFPData<double>{
48 | using type = double;
49 | static constexpr cudaDataType_t kind = CUDA_R_64F;
50 | };
51 | template <> struct CudaFPData<std::complex<float>>{
52 | using type = cuComplex;
53 | static constexpr cudaDataType_t kind = CUDA_C_32F;
54 | };
55 | template <> struct CudaFPData<std::complex<double>>{
56 | using type = cuDoubleComplex;
57 | static constexpr cudaDataType_t kind = CUDA_C_64F;
58 | };
59 |
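//Illustrative compile-time check (a sketch, not part of the tutorial API): the selector above
//maps a host scalar type to its CUDA data-type tag, which the cuBLAS path in
//matrix_multiplication_gpu_ below relies on:
static_assert(CudaFPData<float>::kind == CUDA_R_32F, "fp32 maps to CUDA_R_32F");
static_assert(CudaFPData<double>::kind == CUDA_R_64F, "fp64 maps to CUDA_R_64F");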
60 |
61 | //Number of present GPU devices:
62 | static int totalNumGPUs = 0;
63 |
64 | //Current GEMM algorithm:
65 | static int gemmAlgorithm = 0;
66 |
67 | //CUDA device properties (for all GPU devices):
68 | cudaDeviceProp * gpuProperty;
69 |
70 | //cuBLAS handles (one per device):
71 | cublasHandle_t * cublasHandle;
72 |
73 |
74 | //Internal tests:
75 | bool test_hello();
76 | bool test_norm();
77 |
78 |
79 | //CUDA kernel prototypes:
80 | __global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src);
81 |
82 |
83 | template <typename T>
84 | __global__ void gpu_array_norm2(size_t arr_size, const T * __restrict__ arr, volatile T * norm);
85 | __device__ static unsigned int norm_wr_lock = 0; //reduction lock (per GPU)
86 |
87 |
88 | template <typename T>
89 | __global__ void gpu_array_add(size_t arr_size, T * __restrict__ arr0, const T * __restrict__ arr1, T alpha);
90 |
91 |
92 | template <typename T>
93 | __global__ void gpu_gemm_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
94 |
95 | template <typename T, int TILE_EXT_N = 16, int TILE_EXT_M = 16, int TILE_EXT_K = 16>
96 | __global__ void gpu_gemm_sh_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
97 |
98 | template <typename T, int TILE_EXT_N = 64, int TILE_EXT_M = 64, int TILE_EXT_K = 16>
99 | __global__ void gpu_gemm_sh_reg_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
100 |
101 | template <typename T>
102 | __global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
103 |
104 | template <typename T, int TILE_EXT_N = 16, int TILE_EXT_M = 16, int TILE_EXT_K = 16>
105 | __global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
106 |
107 | template <typename T, int TILE_EXT_N = 64, int TILE_EXT_M = 64, int TILE_EXT_K = 16>
108 | __global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
109 |
110 | template <typename T>
111 | __global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
112 |
113 | template <typename T, int TILE_EXT_N = 16, int TILE_EXT_M = 16, int TILE_EXT_K = 16>
114 | __global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
115 |
116 | template <typename T, int TILE_EXT_N = 64, int TILE_EXT_M = 64, int TILE_EXT_K = 16>
117 | __global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
118 |
119 | template <typename T>
120 | __global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
121 |
122 | template <typename T, int TILE_EXT_N = 16, int TILE_EXT_M = 16, int TILE_EXT_K = 16>
123 | __global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
124 |
125 | template <typename T, int TILE_EXT_N = 64, int TILE_EXT_M = 64, int TILE_EXT_K = 16>
126 | __global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
127 |
128 | //template <typename T, int TILE_EXT_N = 64, int TILE_EXT_M = 64, int TILE_EXT_K = 16, int FRAG_EXT_N = 4, int FRAG_EXT_M = 4>
129 | //__global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
130 |
131 |
132 | cublasStatus_t cublasGemm(cublasHandle_t handle,
133 | cublasOperation_t transa, cublasOperation_t transb,
134 | int m, int n, int k, const float * alpha,
135 | const float * A, int lda, const float * B, int ldb,
136 | const float * beta, float * C, int ldc)
137 | {
138 | return cublasSgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
139 | }
140 |
141 | cublasStatus_t cublasGemm(cublasHandle_t handle,
142 | cublasOperation_t transa, cublasOperation_t transb,
143 | int m, int n, int k, const double * alpha,
144 | const double * A, int lda, const double * B, int ldb,
145 | const double * beta, double * C, int ldc)
146 | {
147 | return cublasDgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
148 | }
149 |
150 |
151 | //Dispatch wrappers:
152 | template <typename T>
153 | T matrix_norm2_gpu_(size_t num_elems,
154 | const T * matrix_body);
155 |
156 | template <typename T>
157 | void matrix_addition_gpu_(size_t num_elems,
158 | T * matrix0_body,
159 | const T * matrix1_body,
160 | T alpha);
161 |
162 | template <typename T>
163 | void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
164 | T * matrix0_body, int nrows0, int ncols0,
165 | const T * matrix1_body, int nrows1, int ncols1,
166 | const T * matrix2_body, int nrows2, int ncols2);
167 |
168 |
169 | //IMPLEMENTATION:
170 | __global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src)
171 | {
172 | int tid = blockIdx.x * blockDim.x + threadIdx.x;
173 | while(tid < str_len){
174 | dst[tid] = src[tid];
175 | tid += gridDim.x * blockDim.x;
176 | }
177 | return;
178 | }
179 |
180 |
181 | template <typename T>
182 | __global__ void gpu_array_norm2(size_t arr_size, //in: array size
183 | const T * __restrict__ arr, //in: pointer to arr[arr_size]
184 | volatile T * norm) //inout: sum of the squared elements of the array
185 | {
186 | extern __shared__ double thread_norm[]; //blockDim.x
187 |
188 | size_t n = gridDim.x * blockDim.x;
189 | double tnorm = 0.0;
190 | for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) tnorm += arr[i] * arr[i];
191 | thread_norm[threadIdx.x] = tnorm;
192 | __syncthreads();
193 |
194 | unsigned int s = blockDim.x;
195 | while(s > 1){
196 | unsigned int j = (s+1U)>>1; //=(s+1)/2
197 | if(threadIdx.x + j < s) thread_norm[threadIdx.x] += thread_norm[threadIdx.x+j];
198 | __syncthreads();
199 | s = j;
200 | }
201 |
202 | if(threadIdx.x == 0){
203 | unsigned int j = 1;
204 | while(j){j = atomicMax(&norm_wr_lock,1);} //lock
205 | __threadfence();
206 | *norm += thread_norm[0]; //accumulate
207 | __threadfence();
208 | j=atomicExch(&norm_wr_lock,0); //unlock
209 | }
210 | __syncthreads();
211 | return;
212 | }
213 |
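//Launch sketch (illustration) for gpu_array_norm2, as used in matrix_norm2_gpu_ and test_norm below:
//the third launch parameter must supply blockDim.x * sizeof(double) bytes of dynamic shared memory
//for the extern __shared__ buffer, and *norm must be zeroed beforehand since the kernel accumulates:
// gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(arr_size,arr,norm);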
214 |
215 | template <typename T>
216 | __global__ void gpu_array_add(size_t arr_size, //in: array size
217 | T * __restrict__ arr0, //inout: pointer to arr0[arr_size]
218 | const T * __restrict__ arr1, //in: pointer to arr1[arr_size]
219 | T alpha) //in: scaling factor
220 | {
221 | size_t n = gridDim.x * blockDim.x;
222 | for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) arr0[i] += arr1[i] * alpha;
223 | return;
224 | }
225 |
226 |
227 | template <typename T>
228 | __global__ void gpu_gemm_nn(int m, int n, int k, //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
229 | T * __restrict__ dest, //inout: pointer to C matrix data
230 | const T * __restrict__ left, //in: pointer to A matrix data
231 | const T * __restrict__ right) //in: pointer to B matrix data
232 | {
233 | size_t ty = blockIdx.y*blockDim.y + threadIdx.y; //global thread index Y
234 | size_t tx = blockIdx.x*blockDim.x + threadIdx.x; //global thread index X
235 |
236 | size_t n_pos = ty;
237 | while(n_pos < n){
238 |
239 | size_t m_pos = tx;
240 | while(m_pos < m){
241 |
242 | T tmp = static_cast<T>(0.0);
243 | for(size_t k_pos = 0; k_pos < k; ++k_pos){
244 | tmp += left[k_pos*m + m_pos] * right[n_pos*k + k_pos];
245 | }
246 | dest[n_pos*m + m_pos] += tmp;
247 |
248 | m_pos += gridDim.x*blockDim.x;
249 | }
250 |
251 | n_pos += gridDim.y*blockDim.y;
252 | }
253 | return;
254 | }
255 |
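//Indexing note for gpu_gemm_nn: all matrices are stored column-major, so C(i,j) lives at
//dest[j*m + i], A(i,l) at left[l*m + i] and B(l,j) at right[j*k + l]; each thread strides over
//(m_pos,n_pos) output positions and accumulates the full dot product over k for each one
//before adding it to C.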
256 |
257 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
258 | __global__ void gpu_gemm_sh_nn(int m, int n, int k, //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
259 | T * __restrict__ dest, //inout: pointer to C matrix data
260 | const T * __restrict__ left, //in: pointer to A matrix data
261 | const T * __restrict__ right) //in: pointer to B matrix data
262 | {
263 | using int_t = int; //either int or size_t
264 | __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];
265 |
266 | for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension
267 |
268 | for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension
269 |
270 | T tmp = static_cast<T>(0.0); //accumulator
271 |
272 | for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
273 | int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;
274 |
275 | //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
276 | if(m_pos + threadIdx.x < m){
277 | for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
278 | lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
279 | }
280 | }
281 |
282 | //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
283 | if(n_pos + threadIdx.y < n){
284 | for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
285 | rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
286 | }
287 | }
288 | __syncthreads();
289 |
290 | //Multiply two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
291 | if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
292 | if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
293 | #pragma unroll
294 | for(int_t l = 0; l < TILE_EXT_K; ++l){
295 | tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
296 | }
297 | }else{ //number of loop iterations is not known at compile time
298 | for(int_t l = 0; l < (k_end - k_pos); ++l){
299 | tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
300 | }
301 | }
302 | }
303 | __syncthreads();
304 |
305 | } //k_pos
306 |
307 | //Store element of the C matrix in global memory:
308 | if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n)
309 | dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;
310 |
311 | } //m_pos
312 |
313 | } //n_pos
314 | return;
315 | }
316 |
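//Tiling note for gpu_gemm_sh_nn: the 16x16 thread block (as launched in matrix_multiplication_gpu_
//below) stages a TILE_EXT_M x TILE_EXT_K tile of A and a TILE_EXT_K x TILE_EXT_N tile of B in
//shared memory, so each loaded element is reused across the whole block instead of being re-read
//from global memory for every output element.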
317 |
318 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
319 | __global__ void gpu_gemm_sh_reg_nn(int m, int n, int k, //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
320 | T * __restrict__ dest, //inout: pointer to C matrix data
321 | const T * __restrict__ left, //in: pointer to A matrix data
322 | const T * __restrict__ right) //in: pointer to B matrix data
323 | {
324 | using int_t = int; //either int or size_t
325 | __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];
326 |
327 | for(int_t n_pos = blockIdx.y*TILE_EXT_N; n_pos < n; n_pos += gridDim.y*TILE_EXT_N){ //tile offset in Y dimension
328 | int_t n_end = n_pos + TILE_EXT_N; if(n_end > n) n_end = n;
329 |
330 | for(int_t m_pos = blockIdx.x*TILE_EXT_M; m_pos < m; m_pos += gridDim.x*TILE_EXT_M){ //tile offset in X dimension
331 | int_t m_end = m_pos + TILE_EXT_M; if(m_end > m) m_end = m;
332 |
333 | if((m_end - m_pos == TILE_EXT_M) && (n_end - n_pos == TILE_EXT_N)){ //complete tile C(TILE_EXT_M,TILE_EXT_N)
334 |
335 | //Initialize registers to zero:
336 | T dreg[4][4] = {static_cast<T>(0.0)};
337 | T rreg[4] = {static_cast<T>(0.0)};
338 | T lreg[4] = {static_cast<T>(0.0)};
339 |
340 | for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
341 | int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;
342 |
343 | //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
344 | for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
345 | for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
346 | lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
347 | }
348 | }
349 |
350 | //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
351 | for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
352 | for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
353 | rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
354 | }
355 | }
356 | __syncthreads();
357 |
358 | //Multiply two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
359 | if(k_end - k_pos == TILE_EXT_K){
360 | #pragma unroll
361 | for(int_t l = 0; l < TILE_EXT_K; ++l){
362 | #pragma unroll
363 | for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
364 | #pragma unroll
365 | for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
366 | #pragma unroll
367 | for(int_t j = 0; j < 4; ++j){
368 | #pragma unroll
369 | for(int_t i = 0; i < 4; ++i){
370 | dreg[j][i] += lreg[i] * rreg[j];
371 | }
372 | }
373 | }
374 | }else{
375 | for(int_t l = 0; l < (k_end - k_pos); ++l){
376 | #pragma unroll
377 | for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
378 | #pragma unroll
379 | for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
380 | #pragma unroll
381 | for(int_t j = 0; j < 4; ++j){
382 | #pragma unroll
383 | for(int_t i = 0; i < 4; ++i){
384 | dreg[j][i] += lreg[i] * rreg[j];
385 | }
386 | }
387 | }
388 | }
389 | __syncthreads();
390 |
391 | } //k_pos
392 |
393 | //Store elements of the C matrix in global memory:
394 | #pragma unroll
395 | for(int_t j = 0; j < 4; ++j){
396 | #pragma unroll
397 | for(int_t i = 0; i < 4; ++i){
398 | dest[(n_pos + threadIdx.y + blockDim.y*j)*m + (m_pos + threadIdx.x + blockDim.x*i)] += dreg[j][i];
399 | }
400 | }
401 |
402 | }else{ //incomplete tile of C
403 |
404 | //Initialize registers to zero:
405 | T dreg[4][4] = {static_cast<T>(0.0)};
406 | T rreg[4] = {static_cast<T>(0.0)};
407 | T lreg[4] = {static_cast<T>(0.0)};
408 |
409 | for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
410 | int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;
411 |
412 | //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
413 | for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
414 | for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
415 | lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
416 | }
417 | }
418 |
419 | //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
420 | for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
421 | for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
422 | rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
423 | }
424 | }
425 | __syncthreads();
426 |
427 | //Multiply two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
428 | for(int_t l = 0; l < (k_end - k_pos); ++l){
429 | for(int_t i = 0, j = threadIdx.y; j < n_end - n_pos; j += blockDim.y, i++) rreg[i] = rbuf[j][l];
430 | for(int_t i = 0, j = threadIdx.x; j < m_end - m_pos; j += blockDim.x, i++) lreg[i] = lbuf[l][j];
431 | #pragma unroll
432 | for(int_t j = 0; j < 4; ++j){
433 | #pragma unroll
434 | for(int_t i = 0; i < 4; ++i){
435 | dreg[j][i] += lreg[i] * rreg[j];
436 | }
437 | }
438 | }
439 | __syncthreads();
440 |
441 | } //k_pos
442 |
443 | //Store element of the C matrix in global memory:
444 | for(int_t j = 0, n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y, j++){
445 | for(int_t i = 0, m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x, i++){
446 | dest[n_loc*m + m_loc] += dreg[j][i];
447 | }
448 | }
449 |
450 | }
451 |
452 | } //m_pos
453 |
454 | } //n_pos
455 | return;
456 | }
457 |
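//Register-tiling note for gpu_gemm_sh_reg_nn: each thread accumulates a 4x4 block of C in dreg,
//so a 16x16 thread block covers a 64x64 tile of C; lreg/rreg cache strips of the shared-memory
//tiles and the inner loops form a 4x4 outer product per k step, raising arithmetic intensity
//relative to gpu_gemm_sh_nn.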
458 |
459 | template <typename T>
460 | __global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
461 | {
462 | //`Finish
463 | return;
464 | }
465 |
466 |
467 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
468 | __global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
469 | {
470 | //`Finish
471 | return;
472 | }
473 |
474 |
475 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
476 | __global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
477 | {
478 | //`Finish
479 | return;
480 | }
481 |
482 |
483 | template <typename T>
484 | __global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
485 | {
486 | //`Finish
487 | return;
488 | }
489 |
490 |
491 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
492 | __global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
493 | {
494 | //`Finish
495 | return;
496 | }
497 |
498 |
499 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
500 | __global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
501 | {
502 | //`Finish
503 | return;
504 | }
505 |
506 |
507 | template <typename T>
508 | __global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
509 | {
510 | //`Finish
511 | return;
512 | }
513 |
514 |
515 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
516 | __global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
517 | {
518 | //`Finish
519 | return;
520 | }
521 |
522 |
523 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K>
524 | __global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
525 | {
526 | //`Finish
527 | return;
528 | }
529 |
530 |
531 | /*
532 | template <typename T, int TILE_EXT_N, int TILE_EXT_M, int TILE_EXT_K, int FRAG_EXT_N, int FRAG_EXT_M>
533 | __global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
534 | {
535 | using int_t = int; //either int or size_t
536 | __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];
537 | T lreg[FRAG_EXT_M], rreg[FRAG_EXT_N], dreg[FRAG_EXT_N][FRAG_EXT_M];
538 |
539 | const int_t wyb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) / (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_N;
540 | const int_t wxb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) % (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_M;
541 | const int_t ln = (threadIdx.y*blockDim.x + threadIdx.x) % warpSize; //thread lane index inside a warp
542 | const int_t lny = ln / FRAG_EXT_M; //Y position inside warp fragment
543 | const int_t lnx = ln % FRAG_EXT_M; //X position inside warp fragment
544 |
545 | for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension
546 |
547 | for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension
548 |
549 | if((m_pos + TILE_EXT_M <= m) && (n_pos + TILE_EXT_N <= n)){ //complete tile (TILE_EXT_N * TILE_EXT_M)
550 |
551 | //Initialize C accumulators to zero:
552 | #pragma unroll
553 | for(int_t j = 0; j < FRAG_EXT_N; ++j){
554 | #pragma unroll
555 | for(int_t i = 0; i < FRAG_EXT_M; ++i){
556 | dreg[j][i] = static_cast<T>(0.0);
557 | }
558 | }
559 |
560 | for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
561 | int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;
562 |
563 | //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
564 | for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
565 | lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
566 | }
567 |
568 | //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
569 | for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
570 | rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
571 | }
572 | __syncthreads();
573 |
574 | //Multiply two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
575 | for(int_t l = ln; l < (k_end - k_pos); l += warpSize){
576 | //Load fragments of shared memory tiles into registers:
577 | #pragma unroll
578 | for(int_t j = 0; j < FRAG_EXT_N; ++j) rreg[j] = rbuf[wyb + j][l];
579 | #pragma unroll
580 | for(int_t j = 0; j < FRAG_EXT_M; ++j) lreg[j] = lbuf[l][wxb + j];
581 | //Compute outer product of tile fragments in registers:
582 | #pragma unroll
583 | for(int_t j = 0; j < FRAG_EXT_N; ++j){
584 | #pragma unroll
585 | for(int_t i = 0; i < FRAG_EXT_M; ++i){
586 | dreg[j][i] += lreg[i] * rreg[j];
587 | }
588 | }
589 | }
590 | __syncthreads();
591 |
592 | } //k_pos
593 |
594 | //Perform reduction of the C fragment within each warp:
595 | #pragma unroll
596 | for(int_t j = 0; j < FRAG_EXT_N; ++j){
597 | #pragma unroll
598 | for(int_t i = 0; i < FRAG_EXT_M; ++i){
599 | #pragma unroll
600 | dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],16);
601 | dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],8);
602 | dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],4);
603 | dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],2);
604 | dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],1);
605 | }
606 | }
607 |
608 | //Upload C fragments into C matrix in global memory:
609 | dest[(n_pos + wyb + lny)*m + (m_pos + wxb + lnx)] = dreg[lny][lnx];
610 |
611 | }else{ //incomplete tile
612 |
613 | //Initialize accumulator to zero:
614 | T tmp = static_cast<T>(0.0);
615 |
616 | for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
617 | int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;
618 |
619 | //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
620 | if(m_pos + threadIdx.x < m){
621 | for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
622 | lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
623 | }
624 | }
625 |
626 | //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
627 | if(n_pos + threadIdx.y < n){
628 | for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
629 | rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
630 | }
631 | }
632 | __syncthreads();
633 |
634 | //Multiply two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
635 | if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
636 | if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
637 | #pragma unroll
638 | for(int_t l = 0; l < TILE_EXT_K; ++l){
639 | tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
640 | }
641 | }else{ //number of loop iterations is not known at compile time
642 | for(int_t l = 0; l < (k_end - k_pos); ++l){
643 | tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
644 | }
645 | }
646 | }
647 | __syncthreads();
648 |
649 | } //k_pos
650 |
651 | //Store in C matrix into global memory:
652 | if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n) dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;
653 |
654 | }
655 |
656 | } //m_pos
657 |
658 | } //n_pos
659 | return;
660 | }
661 | */
662 |
663 |
664 | template <typename T>
665 | T matrix_norm2_gpu_(size_t num_elems, const T * matrix_body)
666 | {
667 | T norm2 = static_cast<T>(0);
668 | int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
669 | T * dnorm2 = static_cast<T*>(allocate(sizeof(T),dev,MemKind::Regular));
670 | cuerr = cudaMemset((void*)dnorm2,0,sizeof(T)); assert(cuerr == cudaSuccess);
671 | unsigned int num_blocks = 1024; unsigned int num_threads = 256;
672 | gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(num_elems,matrix_body,dnorm2);
673 | cuerr = cudaDeviceSynchronize();
674 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
675 | cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(T),cudaMemcpyDefault);
676 | deallocate((void*)dnorm2);
677 | return norm2;
678 | }
679 |
680 |
681 | template <typename T>
682 | void matrix_addition_gpu_(size_t num_elems, T * matrix0_body, const T * matrix1_body, T alpha)
683 | {
684 | unsigned int num_blocks = 4096; unsigned int num_threads = 256;
685 | gpu_array_add<<<num_blocks,num_threads>>>(num_elems,matrix0_body,matrix1_body,alpha);
686 | cudaError_t cuerr = cudaDeviceSynchronize();
687 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
688 | return;
689 | }
690 |
691 |
692 | template <typename T>
693 | void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
694 | T * matrix0_body, int nrows0, int ncols0,
695 | const T * matrix1_body, int nrows1, int ncols1,
696 | const T * matrix2_body, int nrows2, int ncols2)
697 | {
698 | if(gemmAlgorithm == 0){ //BLA GEMM brute-force
699 | if(!left_transp && !right_transp){
700 | int m = nrows0, n = ncols0, k = ncols1;
701 | dim3 threads(32,32);
702 | dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
703 | gpu_gemm_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
704 | }else if(left_transp && !right_transp){
705 | int m = nrows0, n = ncols0, k = nrows1;
706 | dim3 threads(32,32);
707 | dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
708 | gpu_gemm_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
709 | }else if(!left_transp && right_transp){
710 | int m = nrows0, n = ncols0, k = ncols1;
711 | dim3 threads(32,32);
712 | dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
713 | gpu_gemm_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
714 | }else if(left_transp && right_transp){
715 | int m = nrows0, n = ncols0, k = nrows1;
716 | dim3 threads(32,32);
717 | dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
718 | gpu_gemm_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
719 | }
720 | }else if(gemmAlgorithm == 1){ //BLA GEMM with shared memory
721 | if(!left_transp && !right_transp){
722 | int m = nrows0, n = ncols0, k = ncols1;
723 | dim3 threads(16,16);
724 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
725 | gpu_gemm_sh_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
726 | }else if(left_transp && !right_transp){
727 | int m = nrows0, n = ncols0, k = nrows1;
728 | dim3 threads(16,16);
729 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
730 | gpu_gemm_sh_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
731 | }else if(!left_transp && right_transp){
732 | int m = nrows0, n = ncols0, k = ncols1;
733 | dim3 threads(16,16);
734 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
735 | gpu_gemm_sh_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
736 | }else if(left_transp && right_transp){
737 | int m = nrows0, n = ncols0, k = nrows1;
738 | dim3 threads(16,16);
739 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
740 | gpu_gemm_sh_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
741 | }
742 | }else if(gemmAlgorithm == 2){ //BLA GEMM with shared memory and register file
743 | if(!left_transp && !right_transp){
744 | int m = nrows0, n = ncols0, k = ncols1;
745 | dim3 threads(16,16);
746 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
747 | gpu_gemm_sh_reg_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
748 | }else if(left_transp && !right_transp){
749 | int m = nrows0, n = ncols0, k = nrows1;
750 | dim3 threads(16,16);
751 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
752 | //gpu_gemm_sh_reg_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
753 | }else if(!left_transp && right_transp){
754 | int m = nrows0, n = ncols0, k = ncols1;
755 | dim3 threads(16,16);
756 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
757 | //gpu_gemm_sh_reg_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
758 | }else if(left_transp && right_transp){
759 | int m = nrows0, n = ncols0, k = nrows1;
760 | dim3 threads(16,16);
761 | dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
762 | //gpu_gemm_sh_reg_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
763 | }
764 | }else{ //cuBLAS GEMM
765 | int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
766 | int m = nrows1; cublasOperation_t transa = CUBLAS_OP_N;
767 | if(left_transp){m = ncols1; transa = CUBLAS_OP_T;}
768 | int n = ncols2; cublasOperation_t transb = CUBLAS_OP_N;
769 | if(right_transp){n = nrows2; transb = CUBLAS_OP_T;}
770 | int k = ncols1; if(left_transp) k = nrows1;
771 | T *alpha, *beta;
772 | if(CudaFPData<T>::kind == CUDA_R_32F){
773 | cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp32); assert(cuerr == cudaSuccess);
774 | cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp32); assert(cuerr == cudaSuccess);
775 | }else if(CudaFPData<T>::kind == CUDA_R_64F){
776 | cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp64); assert(cuerr == cudaSuccess);
777 | cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp64); assert(cuerr == cudaSuccess);
778 | }else{
779 | assert(false);
780 | }
781 | #ifdef USE_CUBLAS_GEMM_EX
782 | cublasStatus_t custat = cublasGemmEx(cublasHandle[dev],
783 | transa,transb,
784 | m,n,k,
785 | alpha,
786 | matrix1_body,CudaFPData<T>::kind,nrows1,
787 | matrix2_body,CudaFPData<T>::kind,nrows2,
788 | beta,
789 | matrix0_body,CudaFPData<T>::kind,nrows0,
790 | CudaFPData<T>::kind, CUBLAS_GEMM_DEFAULT);
791 | #else
792 | cublasStatus_t custat = cublasGemm(cublasHandle[dev],
793 | transa,transb,
794 | m,n,k,
795 | alpha,
796 | matrix1_body,nrows1,
797 | matrix2_body,nrows2,
798 | beta,
799 | matrix0_body,nrows0);
800 | #endif
801 | if(custat != CUBLAS_STATUS_SUCCESS) std::cout << "#ERROR(bla::matrix_multiplication_gpu_): cuBLAS GEMM error " << custat << std::endl;
802 | assert(custat == CUBLAS_STATUS_SUCCESS);
803 | }
804 | cudaError_t cuerr = cudaDeviceSynchronize();
805 | cuerr = cudaGetLastError();
806 | if(cuerr != cudaSuccess){
807 | const char * error_str = cudaGetErrorString(cuerr);
808 | std::cout << "ERROR(bla::matrix_multiplication_gpu_): CUDA kernel launch failure: " << std::endl;
809 | printf("%s\n",error_str);
810 | }
811 | assert(cuerr == cudaSuccess);
812 | return;
813 | }
814 |
815 |
816 | float matrix_norm2_gpu(size_t num_elems, const float * matrix_body)
817 | {
818 | return matrix_norm2_gpu_(num_elems,matrix_body);
819 | }
820 |
821 | double matrix_norm2_gpu(size_t num_elems, const double * matrix_body)
822 | {
823 | return matrix_norm2_gpu_(num_elems,matrix_body);
824 | }
825 |
826 |
827 | void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha)
828 | {
829 | return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
830 | }
831 |
832 | void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha)
833 | {
834 | return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
835 | }
836 |
837 |
838 | void matrix_multiplication_gpu(bool left_transp, bool right_transp,
839 | float * matrix0_body, int nrows0, int ncols0,
840 | const float * matrix1_body, int nrows1, int ncols1,
841 | const float * matrix2_body, int nrows2, int ncols2)
842 | {
843 | return matrix_multiplication_gpu_(left_transp,right_transp,
844 | matrix0_body,nrows0,ncols0,
845 | matrix1_body,nrows1,ncols1,
846 | matrix2_body,nrows2,ncols2);
847 | }
848 |
849 | void matrix_multiplication_gpu(bool left_transp, bool right_transp,
850 | double * matrix0_body, int nrows0, int ncols0,
851 | const double * matrix1_body, int nrows1, int ncols1,
852 | const double * matrix2_body, int nrows2, int ncols2)
853 | {
854 | return matrix_multiplication_gpu_(left_transp,right_transp,
855 | matrix0_body,nrows0,ncols0,
856 | matrix1_body,nrows1,ncols1,
857 | matrix2_body,nrows2,ncols2);
858 | }
859 |
860 |
861 | void init()
862 | {
863 | totalNumGPUs = 0;
864 | cudaError_t cuerr = cudaGetDeviceCount(&totalNumGPUs); assert(cuerr == cudaSuccess);
865 | std::cout << "Found " << totalNumGPUs << " NVIDIA GPU" << std::endl;
866 | if(totalNumGPUs > 0){
867 | cublasStatus_t cuberr;
868 | gpuProperty = new cudaDeviceProp[totalNumGPUs];
869 | cublasHandle = new cublasHandle_t[totalNumGPUs];
870 | //Init each GPU:
871 | for(int i = (totalNumGPUs - 1); i >= 0; --i){
872 | cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
873 | cuerr = cudaGetDeviceProperties(&(gpuProperty[i]),i); assert(cuerr == cudaSuccess);
874 | cuberr = cublasCreate(&(cublasHandle[i])); assert(cuberr == CUBLAS_STATUS_SUCCESS);
875 | cuberr = cublasSetPointerMode(cublasHandle[i],CUBLAS_POINTER_MODE_DEVICE); assert(cuberr == CUBLAS_STATUS_SUCCESS);
876 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
877 | std::cout << "Initialized GPU " << i << std::endl;
878 | }
879 | //Enable P2P access between GPU:
880 | if(totalNumGPUs > 1){
881 | for(int i = (totalNumGPUs - 1); i >= 0; --i){
882 | if(gpuProperty[i].unifiedAddressing != 0){
883 | cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
884 | for(int j = (totalNumGPUs - 1); j >= 0; --j){
885 | if(j != i){
886 | if(gpuProperty[j].unifiedAddressing != 0){
887 | cuerr = cudaDeviceEnablePeerAccess(j,0);
888 | if(cuerr == cudaSuccess){
889 | std::cout << "GPU " << i << " can access peer GPU " << j << std::endl;
890 | }else{
891 | std::cout << "GPU " << i << " cannot access peer GPU " << j << std::endl;
892 | }
893 | }
894 | }
895 | }
896 | }
897 | }
898 | }
899 | cuerr = cudaGetLastError();
900 | }
901 | std::cout << "BLA library initialized successfully" << std::endl;
902 | return;
903 | }
904 |
905 |
906 | void shutdown()
907 | {
908 | if(totalNumGPUs > 0){
909 | cudaError_t cuerr;
910 | cublasStatus_t cuberr;
911 | for(int i = 0; i < totalNumGPUs; ++i){
912 | cuberr = cublasDestroy(cublasHandle[i]); assert(cuberr == CUBLAS_STATUS_SUCCESS);
913 | cuerr = cudaDeviceReset(); assert(cuerr == cudaSuccess);
914 | std::cout << "Destroyed primary context on GPU " << i << std::endl;
915 | }
916 | delete [] cublasHandle;
917 | delete [] gpuProperty;
918 | }
919 | totalNumGPUs = 0;
920 | std::cout << "BLA library shut down successfully" << std::endl;
921 | return;
922 | }
923 |
924 |
925 | void print_device_properties(int device)
926 | {
927 | cudaDeviceProp prop;
928 | cudaError_t cuerr = cudaGetDeviceProperties(&prop,device);
929 | if(cuerr == cudaSuccess){
930 | std::cout << "Properties of NVIDIA GPU " << device << std::endl;
931 | std::cout << " Compute capability: " << prop.major << "." << prop.minor << std::endl;
932 | std::cout << " Register file size: " << prop.regsPerBlock << std::endl;
933 | std::cout << " Shared memory size: " << prop.sharedMemPerBlock << std::endl;
934 | }else{
935 | std::cout << "#ERROR(bla::print_device_properties): Unable to get properties for device " << device << std::endl;
936 | assert(false);
937 | }
938 | return;
939 | }
940 |
941 |
942 | void reset_gemm_algorithm(int algo)
943 | {
944 | gemmAlgorithm = algo;
945 | return;
946 | }
947 |
948 |
949 | bool test_hello()
950 | {
951 | std::cout << "Testing presence on GPU ..." << std::endl;
952 | const std::string s1("Am I really on GPU?");
953 | const std::string s2("Waiting for the answer ...");
954 | const std::string s3("Yes, you are!");
955 |
956 | size_t max_len = std::max(s1.size(),std::max(s2.size(),s3.size()));
957 | size_t str_len = max_len+1;
958 |
959 | char * hs1 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs1 != nullptr);
960 | char * ds1 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds1 != nullptr);
961 | int i = 0; for(const char & symb: s1) hs1[i++]=symb; hs1[s1.size()]='\0';
962 | printf("%s ",hs1);
963 |
964 | char * hs3 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs3 != nullptr);
965 | char * ds3 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds3 != nullptr);
966 | i = 0; for(const char & symb: s3) hs3[i++]=symb; hs3[s3.size()]='\0';
967 |
968 | cudaError_t cuerr = cudaMemcpy((void*)ds1,(void*)hs1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
969 | cuerr = cudaMemcpy((void*)ds3,(void*)hs3,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
970 |
971 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
972 | gpu_test_presence<<<16,256>>>(str_len,ds1,ds3);
973 | std::cout << s2 << " ";
974 | cuerr = cudaDeviceSynchronize();
975 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
976 |
977 | cuerr = cudaMemcpy((void*)hs1,(void*)ds1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
978 | printf("%s\n",hs1);
979 |
980 | deallocate((void*)ds3);
981 | deallocate((void*)hs3);
982 |
983 | deallocate((void*)ds1);
984 | deallocate((void*)hs1);
985 |
986 | return true;
987 | }
988 |
989 |
990 | bool test_norm()
991 | {
992 | std::cout << "Testing norm2 on GPU 0 ... ";
993 | const float num_tolerance = 1e-5;
994 | const size_t vol = 1000000;
995 | const size_t dsize = vol * sizeof(float);
996 | float * arr0 = static_cast<float*>(allocate(dsize,-1,MemKind::Pinned));
997 | float * arr1 = static_cast<float*>(allocate(dsize,0,MemKind::Regular));
998 | float * dnorm2 = static_cast<float*>(allocate(sizeof(float),0,MemKind::Regular));
999 |
1000 | for(size_t i = 0; i < vol; ++i) arr0[i]=1.0f/sqrt((float)vol); //value of each element to make norm equal 1
1001 |
1002 | cudaError_t cuerr = cudaMemcpy((void*)arr1,(void*)arr0,dsize,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
1003 |
1004 | unsigned int num_blocks = 1024; unsigned int num_threads = 256;
1005 | gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(vol,arr1,dnorm2);
1006 | cuerr = cudaDeviceSynchronize();
1007 | cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
1008 |
1009 | float norm2 = 0.0f;
1010 | cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(float),cudaMemcpyDefault);
1011 | std::cout << "Norm2 = " << norm2 << " (correct value is 1.0)" << std::endl;
1012 | assert(std::abs(norm2-1.0f) < num_tolerance);
1013 |
1014 | deallocate((void*)dnorm2);
1015 | deallocate((void*)arr1);
1016 | deallocate((void*)arr0);
1017 | return true;
1018 | }
1019 |
1020 |
1021 | bool test_bla()
1022 | {
1023 | if(!test_hello()) return false;
1024 | if(!test_norm()) return false;
1025 | return true;
1026 | }
1027 |
1028 | } //namespace bla
1029 |
--------------------------------------------------------------------------------
/bla_lib.hpp:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #ifndef BLA_LIB_HPP_
22 | #define BLA_LIB_HPP_
23 |
24 | #include "memory.hpp"
25 | #include "timer.hpp"
26 |
27 | #include <cuda_runtime.h>
28 | 
29 | #include <cstddef>
30 |
31 | namespace bla{
32 |
33 | /** Initialization of BLA **/
34 | void init();
35 |
36 | /** Shutdown of BLA **/
37 | void shutdown();
38 |
39 | /** Testing BLA **/
40 | bool test_bla();
41 |
42 | /** Device properties **/
43 | void print_device_properties(int device);
44 |
45 | /** Resets GEMM algorithm:
46 |     0: BLA GEMM brute-force; 1: BLA GEMM with shared memory;
47 |     2: BLA GEMM with shared memory and registers; any other value: cuBLAS GEMM. **/
48 | void reset_gemm_algorithm(int algo);
49 |
50 | /** Matrix squared "norm" (sum of the squared elements) **/
51 | float matrix_norm2_gpu(size_t num_elems, const float * matrix_body);
52 | double matrix_norm2_gpu(size_t num_elems, const double * matrix_body);
53 |
54 | /** Matrix addition **/
55 | void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha);
56 | void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha);
57 |
58 | /** Matrix multiplication **/
59 | void matrix_multiplication_gpu(bool left_transp, bool right_transp,
60 | float * matrix0_body, int nrows0, int ncols0,
61 | const float * matrix1_body, int nrows1, int ncols1,
62 | const float * matrix2_body, int nrows2, int ncols2);
63 | void matrix_multiplication_gpu(bool left_transp, bool right_transp,
64 | double * matrix0_body, int nrows0, int ncols0,
65 | const double * matrix1_body, int nrows1, int ncols1,
66 | const double * matrix2_body, int nrows2, int ncols2);
67 |
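/** Usage sketch (illustration only; main.cpp gives the complete example): for column-major
    matrices C(m,n), A(m,k), B(k,n) already resident on the active GPU:
      bla::init();
      bla::reset_gemm_algorithm(1);                                   //BLA GEMM with shared memory
      bla::matrix_multiplication_gpu(false,false,C,m,n,A,m,k,B,k,n);  //C += A*B
      bla::shutdown(); **/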
68 | } //namespace bla
69 |
70 | #endif //BLA_LIB_HPP_
71 |
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #include "bla.hpp"
22 |
23 | #include <iostream>
24 |
25 | void use_bla()
26 | {
27 | //Pick which GEMM tests you enable:
28 | const bool TEST_BLA_GEMM_BRUTE = true; //enables/disables testing of brute-force GEMM
29 | const bool TEST_BLA_GEMM_SHARED = true; //enables/disables testing of shared memory GEMM
30 | const bool TEST_BLA_GEMM_REGISTER = true; //enables/disables testing of register-based GEMM
31 |
32 | std::cout << "Let's try to use BLA library ..." << std::endl;
33 |
34 | //Create matrix A:
35 | bla::Matrix<float> A(2000,2000);
36 | //Allocate matrix A body on Host:
37 | A.allocateBody(-1,bla::MemKind::Pinned);
38 | //Set matrix A body to some non-trivial value on Host:
39 | A.setBodyHost();
40 |
41 | //Create matrix B:
42 | bla::Matrix<float> B(2000,2000);
43 | //Allocate matrix B body on Host:
44 | B.allocateBody(-1,bla::MemKind::Pinned);
45 | //Set matrix B body to some non-trivial value on Host:
46 | B.setBodyHost();
47 |
48 | //Create matrix C:
49 | bla::Matrix<float> C(2000,2000);
50 | //Allocate matrix C body on GPU#0:
51 | C.allocateBody(0,bla::MemKind::Regular);
52 |
53 | //Create matrix D:
54 | bla::Matrix<float> D(2000,2000);
55 | //Allocate matrix D body on GPU#0:
56 | D.allocateBody(0,bla::MemKind::Regular);
57 |
58 | //Copy matrix A to GPU#0 from Host:
59 | A.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
60 | //Compute matrix A norm on GPU#0:
61 | auto normA = A.computeNorm(0);
62 | std::cout << "Matrix A norm = " << normA << std::endl;
63 |
64 | //Copy matrix B to GPU#0 from Host:
65 | B.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
66 | //Compute matrix B norm on GPU#0:
67 | auto normB = B.computeNorm(0);
68 | std::cout << "Matrix B norm = " << normB << std::endl;
69 |
70 | //Determine total number of floating point operations:
71 | double flops = 2.0 * std::sqrt(static_cast<double>(A.getVolume()) *
72 | static_cast<double>(B.getVolume()) *
73 | static_cast<double>(C.getVolume()));
74 | std::cout << "Matrix multiplication C+=A*B requires " << flops/1e9 << " Gflop" << std::endl;
75 |
76 | //Perform reference matrix multiplication on GPU#0 with cuBLAS:
77 | for(int repeat = 0; repeat < 2; ++repeat){
78 | C.zeroBody(0); //set matrix C body to zero on GPU#0
79 | bla::reset_gemm_algorithm(7);
80 | std::cout << "Performing matrix multiplication C+=A*B with cuBLAS ... ";
81 | double tms = bla::time_sys_sec();
82 | C.multiplyAdd(false,false,A,B,0);
83 | double tmf = bla::time_sys_sec();
84 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl;
85 | //Compute C norm on GPU#0:
86 | auto normC = C.computeNorm(0); //correct C matrix norm
87 | std::cout << "Matrix C norm = " << normC << std::endl;
88 | D.zeroBody(0); //set matrix D body to zero on GPU#0
89 | D.add(C,-1.0f,0); //make matrix D = -C for later correctness checks
90 | }
91 |
92 | //Perform matrix multiplication on GPU#0 with BLA GEMM brute-force:
93 | if(TEST_BLA_GEMM_BRUTE){
94 | for(int repeat = 0; repeat < 2; ++repeat){
95 | C.zeroBody(0); //set matrix C body to zero on GPU#0
96 | bla::reset_gemm_algorithm(0);
97 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM brute-force ... ";
98 | double tms = bla::time_sys_sec();
99 | C.multiplyAdd(false,false,A,B,0);
100 | double tmf = bla::time_sys_sec();
101 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl;
102 | //Check correctness on GPU#0:
103 | C.add(D,1.0f,0);
104 | auto norm_diff = C.computeNorm(0);
105 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl;
106 | if(std::abs(norm_diff) > 1e-7){
107 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl;
108 | std::exit(1);
109 | }
110 | }
111 | }
112 |
113 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory:
114 | if(TEST_BLA_GEMM_SHARED){
115 | for(int repeat = 0; repeat < 2; ++repeat){
116 | C.zeroBody(0); //set matrix C body to zero on GPU#0
117 | bla::reset_gemm_algorithm(1);
118 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory ... ";
119 | double tms = bla::time_sys_sec();
120 | C.multiplyAdd(false,false,A,B,0);
121 | double tmf = bla::time_sys_sec();
122 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl;
123 | //Check correctness on GPU#0:
124 | C.add(D,1.0f,0);
125 | auto norm_diff = C.computeNorm(0);
126 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl;
127 | if(std::abs(norm_diff) > 1e-7){
128 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl;
129 | std::exit(1);
130 | }
131 | }
132 | }
133 |
134 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory and registers:
135 | if(TEST_BLA_GEMM_REGISTER){
136 | for(int repeat = 0; repeat < 2; ++repeat){
137 | C.zeroBody(0); //set matrix C body to zero on GPU#0
138 | bla::reset_gemm_algorithm(2);
139 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory and registers ... ";
140 | double tms = bla::time_sys_sec();
141 | C.multiplyAdd(false,false,A,B,0);
142 | double tmf = bla::time_sys_sec();
143 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl;
144 | //Check correctness on GPU#0:
145 | C.add(D,1.0f,0);
146 | auto norm_diff = C.computeNorm(0);
147 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl;
148 | if(std::abs(norm_diff) > 1e-7){
149 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl;
150 | std::exit(1);
151 | }
152 | }
153 | }
154 |
155 | std::cout << "Seems like it works!" << std::endl;
156 | return;
157 | }
158 |
159 |
160 | int main(int argc, char ** argv)
161 | {
162 | //Initialize BLA library:
163 | bla::init();
164 | bla::print_device_properties(0); //check compute capability
165 |
166 | //Test BLA library:
167 | bla::test_bla();
168 |
169 | //Use BLA library:
170 | use_bla();
171 |
172 | //Shutdown BLA library:
173 | bla::shutdown();
174 |
175 | return 0;
176 | }
177 |
--------------------------------------------------------------------------------
/matrix.hpp:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #ifndef MATRIX_HPP_
22 | #define MATRIX_HPP_
23 |
24 | #include "bla_lib.hpp"
25 |
26 | #include <cuda_runtime.h>
27 |
28 | #include <cstdlib>
29 | #include <cstring>
30 | #include <cmath>
31 | #include <cassert>
32 | #include <iostream>
33 | #include <list>
34 | #include <type_traits>
35 |
36 | namespace bla{
37 |
38 | template <typename T>
39 | class Matrix{
40 |
41 | public:
42 |
43 | explicit Matrix(int nrows, int ncols);
44 |
45 | Matrix(const Matrix & matrix) = delete;
46 | Matrix & operator=(const Matrix &) = delete;
47 | Matrix(Matrix && matrix) noexcept = default;
48 | Matrix & operator=(Matrix && matrix) noexcept = default;
49 | virtual ~Matrix();
50 |
51 | /** Returns the number of rows in the matrix **/
52 | int getNumRows() const;
53 | /** Returns the number of columns in the matrix **/
54 | int getNumCols() const;
55 | /** Returns the volume of the matrix (number of elements) **/
56 | std::size_t getVolume() const;
57 | /** Returns the size of the matrix in bytes **/
58 | std::size_t getSize() const;
59 | /** Returns a pointer to the memory resource on requested device (if any) **/
60 | T * getBodyPtr(int device) const;
61 | /** Allocates memory resource of requested kind on requested device **/
62 | void allocateBody(int device, MemKind memkind = MemKind::Regular);
63 | /** Deallocates memory resource on requested device **/
64 | void deallocateBody(int device);
65 | /** Marks matrix body status on a given device as up-to-date or not (outdated) **/
66 | void markBodyStatus(int device, bool status);
67 | /** Initializes matrix body to zero on a given device **/
68 | void zeroBody(int device);
69 | /** Initializes matrix body to some non-trivial value on Host **/
70 | void setBodyHost();
71 | /** Synchronizes matrix body on a given device with the body from another device.
72 | By default the source device is Host (if up to date). **/
73 | void syncBody(int device, int source_device = -1);
74 |
75 | /** Computes the norm of the matrix on a given device **/
76 | double computeNorm(int device = -1);
77 | /** Performs matrix addition on a given device **/
78 | void add(Matrix & Amat, T alpha = static_cast<T>(1.0), int device = -1);
79 | /** Performs matrix multiplication on a given device **/
80 | void multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device = -1);
81 |
82 | private:
83 |
84 | //Memory resource descriptor:
85 | typedef struct{
86 | int device;
87 | void * ptr;
88 | MemKind memkind;
89 | bool uptodate;
90 | } Resource;
91 |
92 | //Data members:
93 | int nrows_; //number of rows
94 | int ncols_; //number of columns
95 | std::size_t elem_size_; //matrix element size in bytes
96 | std::list<Resource> location_; //list of memory resources occupied by the matrix
97 | };
98 |
99 |
100 | //TEMPLATE DEFINITIONS:
101 | template <typename T>
102 | Matrix<T>::Matrix(int nrows, int ncols):
103 | nrows_(nrows), ncols_(ncols), elem_size_(sizeof(T))
104 | {
105 | static_assert(std::is_floating_point<T>::value,"#ERROR(BLA::Matrix::Matrix): Matrix type must be floating point!");
106 | assert(nrows_ > 0 && ncols_ > 0 && elem_size_ > 0);
107 | std::cout << "Matrix created with dimensions (" << nrows_ << "," << ncols_ << ")" << std::endl;
108 | }
109 |
110 |
111 | template <typename T>
112 | Matrix<T>::~Matrix()
113 | {
114 | for(auto & loc: location_) deallocate(loc.ptr);
115 | std::cout << "Matrix destroyed" << std::endl;
116 | }
117 |
118 |
119 | template <typename T>
120 | int Matrix<T>::getNumRows() const
121 | {
122 | return nrows_;
123 | }
124 |
125 |
126 | template <typename T>
127 | int Matrix<T>::getNumCols() const
128 | {
129 | return ncols_;
130 | }
131 |
132 |
133 | template <typename T>
134 | std::size_t Matrix<T>::getVolume() const
135 | {
136 | return (static_cast<std::size_t>(nrows_)*static_cast<std::size_t>(ncols_)); //number of elements
137 | }
138 |
139 |
140 | template <typename T>
141 | std::size_t Matrix<T>::getSize() const
142 | {
143 | return (static_cast<std::size_t>(nrows_)*static_cast<std::size_t>(ncols_)*elem_size_); //matrix size in bytes
144 | }
145 |
146 |
147 | template <typename T>
148 | T * Matrix<T>::getBodyPtr(int device) const
149 | {
150 | T * ptr = nullptr;
151 | for(const auto & loc: location_){
152 | if(loc.device == device){
153 | ptr = static_cast<T*>(loc.ptr);
154 | break;
155 | }
156 | }
157 | return ptr;
158 | }
159 |
160 |
161 | template <typename T>
162 | void Matrix<T>::allocateBody(int device, MemKind memkind)
163 | {
164 | std::size_t mat_size = this->getSize(); //matrix size in bytes
165 | void * ptr = allocate(mat_size,device,memkind); //allocate memory of requested kind on requested device
166 | assert(ptr != nullptr);
167 | location_.emplace_back(Resource{device,ptr,memkind,false}); //save the new memory descriptor (Resource)
168 | std::cout << "New resource acquired on device " << device << std::endl;
169 | return;
170 | }
171 |
172 |
173 | template <typename T>
174 | void Matrix<T>::deallocateBody(int device)
175 | {
176 | for(auto & loc: location_){
177 | if(loc.device == device){
178 | deallocate(loc.ptr);
179 | std::cout << "Resource released on device " << device << std::endl;
180 | }
181 | }
182 | location_.remove_if([device](const Resource & res){return (res.device == device);});
183 | return;
184 | }
185 |
186 |
187 | template <typename T>
188 | void Matrix<T>::markBodyStatus(int device, bool status)
189 | {
190 | for(auto & loc: location_){
191 | if(loc.device == device) loc.uptodate = status;
192 | }
193 | return;
194 | }
195 |
196 |
197 | template <typename T>
198 | void Matrix<T>::zeroBody(int device)
199 | {
200 | T * mat = this->getBodyPtr(device);
201 | if(mat != nullptr){
202 | std::size_t mat_size = this->getSize();
203 | assert(mat_size > 0);
204 | if(device < 0){ //Host
205 | std::memset(((void*)mat),0,mat_size);
206 | }else{ //GPU device
207 | int dev;
208 | cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
209 | if(device != dev){
210 | cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
211 | }
212 | cuerr = cudaMemset(((void*)mat),0,mat_size); assert(cuerr == cudaSuccess);
213 | if(device != dev){
214 | cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
215 | }
216 | }
217 | this->markBodyStatus(device,true); //mark matrix body on device as up-to-date
218 | }else{
219 | std::cout << "#ERROR(BLA::Matrix::zeroBody): Matrix does not exist on device " << device << std::endl;
220 | assert(false);
221 | }
222 | return;
223 | }
224 |
225 |
226 | template <typename T>
227 | void Matrix<T>::setBodyHost()
228 | {
229 | T * mat = this->getBodyPtr(-1); //-1 is Host device id
230 | if(mat != nullptr){
231 | for(std::size_t j = 0; j < ncols_; ++j){
232 | std::size_t offset = j*nrows_;
233 | for(std::size_t i = 0; i < nrows_; ++i){
234 | //mat[offset+i] = static_cast<T>(1)/(static_cast<T>(i+7) + static_cast<T>(j+13)); //some value
235 | mat[offset+i] = static_cast<T>(1)/std::log(static_cast<T>(std::rand()+13)); //some value
236 | }
237 | }
238 | this->markBodyStatus(-1,true); //mark matrix body on Host as up-to-date
239 | }else{
240 | std::cout << "#ERROR(BLA::Matrix::setBodyHost): Matrix does not exist on Host!" << std::endl;
241 | assert(false);
242 | }
243 | return;
244 | }
245 |
246 |
247 | template <typename T>
248 | void Matrix<T>::syncBody(int device, int source_device)
249 | {
250 | if(device != source_device){
251 | Resource destination_resource, source_resource;
252 | bool destination_found = false;
253 | bool source_found = false;
254 | for(auto & loc: location_){
255 | if(!source_found && loc.device == source_device && loc.uptodate){
256 | source_resource = loc;
257 | source_found = true;
258 | }
259 | if(!destination_found && loc.device == device){
260 | destination_resource = loc;
261 | destination_found = true;
262 | }
263 | }
264 | if(!destination_found){
265 | this->allocateBody(device,MemKind::Regular);
266 | for(const auto & loc: location_){
267 | if(loc.device == device){
268 | destination_resource = loc;
269 | destination_found = true;
270 | break;
271 | }
272 | }
273 | }
274 | if(source_found){
275 | cudaError_t cuerr = cudaMemcpy(destination_resource.ptr,source_resource.ptr,this->getSize(),cudaMemcpyDefault);
276 | assert(cuerr == cudaSuccess);
277 | this->markBodyStatus(device,true); //mark matrix body on device as up-to-date
278 | }else{
279 | std::cout << "#ERROR(BLA::Matrix::syncBody): Provided source device " << source_device << " has no up-to-date matrix body!" << std::endl;
280 | assert(false);
281 | }
282 | }
283 | return;
284 | }
285 |
286 |
287 | template <typename T>
288 | double Matrix<T>::computeNorm(int device)
289 | {
290 | std::size_t vol = this->getVolume();
291 | T * matrix_body = this->getBodyPtr(device); assert(matrix_body != nullptr);
292 | double result = 0.0;
293 | if(device >= 0){ //GPU
294 | int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
295 | if(device != dev){
296 | cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
297 | }
298 | result = matrix_norm2_gpu(vol,matrix_body);
299 | if(device != dev){
300 | cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
301 | }
302 | }else{ //Host
303 | //`Implement
304 | assert(false);
305 | }
306 | return result;
307 | }
308 |
309 |
310 | template <typename T>
311 | void Matrix<T>::add(Matrix & Amat, T alpha, int device)
312 | {
313 | std::size_t vol = this->getVolume();
314 | assert(Amat.getVolume() == vol);
315 | T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
316 | const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
317 | if(device >= 0){ //GPU
318 | int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
319 | if(device != dev){
320 | cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
321 | }
322 | matrix_addition_gpu(vol,matrix0_body,matrix1_body,alpha);
323 | if(device != dev){
324 | cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
325 | }
326 | }else{ //Host
327 | //`Implement
328 | assert(false);
329 | }
330 | return;
331 | }
332 |
333 |
334 | template <typename T>
335 | void Matrix<T>::multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device)
336 | {
337 | T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
338 | const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
339 | const T * matrix2_body = Bmat.getBodyPtr(device); assert(matrix2_body != nullptr);
340 | if(device >= 0){ //GPU
341 | int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
342 | if(device != dev){
343 | cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
344 | }
345 | matrix_multiplication_gpu(left_transp,right_transp,
346 | matrix0_body,this->getNumRows(),this->getNumCols(),
347 | matrix1_body,Amat.getNumRows(),Amat.getNumCols(),
348 | matrix2_body,Bmat.getNumRows(),Bmat.getNumCols());
349 | if(device != dev){
350 | cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
351 | }
352 | }else{ //Host
353 | //`Implement
354 | assert(false);
355 | }
356 | return;
357 | }
358 |
359 | } //namespace bla
360 |
361 | #endif //MATRIX_HPP_
362 |
--------------------------------------------------------------------------------
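
The Host branches of computeNorm(), add(), and multiplyAdd() above are intentionally left as exercises (the `Implement markers). Purely as a hedged illustration, the two simplest of them could be filled in along the lines of the standalone sketch below; the names bla_sketch, add_host, and norm2_host are hypothetical, and whether matrix_norm2_gpu returns the 2-norm or its square is not visible in this excerpt, so the Host analogue may need adjusting to match.

//Hedged sketch of possible CPU fallbacks for the Host branches (not part of the original library):
#include <cstddef>

namespace bla_sketch{

//Elementwise matrix0 += alpha*matrix1 over vol elements, i.e. what the Host branch of
//Matrix<T>::add() could do with the pointers returned by getBodyPtr(-1):
template <typename T>
void add_host(std::size_t vol, T * matrix0_body, const T * matrix1_body, T alpha)
{
 for(std::size_t i = 0; i < vol; ++i) matrix0_body[i] += alpha * matrix1_body[i];
}

//Sum of squared elements, a Host analogue of the GPU reduction used by computeNorm():
template <typename T>
double norm2_host(std::size_t vol, const T * matrix_body)
{
 double result = 0.0;
 for(std::size_t i = 0; i < vol; ++i){
  const double elem = static_cast<double>(matrix_body[i]);
  result += elem * elem;
 }
 return result;
}

} //namespace bla_sketch
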
/memory.cpp:
--------------------------------------------------------------------------------
1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library
2 |
3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)
5 |
6 | !This file is part of CUDA BLA tutorial.
7 |
8 | !CUDA BLA is free software: you can redistribute it and/or modify
9 | !it under the terms of the GNU Lesser General Public License as published
10 | !by the Free Software Foundation, either version 3 of the License, or
11 | !(at your option) any later version.
12 |
13 | !CUDA BLA is distributed in the hope that it will be useful,
14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | !GNU Lesser General Public License for more details.
17 |
18 | !You should have received a copy of the GNU Lesser General Public License
19 | !along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */
20 |
21 | #include "memory.hpp"
22 |
23 | #include <cuda_runtime.h>
24 |
25 | #include <cstdlib>
26 |
27 | #include <iostream>
28 | #include <cassert>