├── .gitignore
├── Makefile
├── Presentation.pdf
├── README.md
├── bla.hpp
├── bla_lib.cu
├── bla_lib.hpp
├── main.cpp
├── matrix.hpp
├── memory.cpp
├── memory.hpp
├── timer.cpp
└── timer.hpp

/.gitignore:
--------------------------------------------------------------------------------
*.o
*.mod
*.modmic
*.ptx
*.i
*.ii
*.cudafe*
*.fatbin*
*.cubin
*.module_id
*.hash
*.a
*.so
*.x
*.log
*.out
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
BINARY_NAME = bla_test.x

CXX_COMP = g++
CXX_FLAGS_DEV = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES -g
CXX_FLAGS_OPT = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES
CXX_FLAGS = $(CXX_FLAGS_OPT)
CXX_INC =
CXX_LIB = -lstdc++

CUDA_COMP = nvcc
CUDA_HOST = /usr/bin/g++
CUDA_ARCH = sm_35
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64 -lcublas -lcudart
CUDA_FLAGS_DEV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES -g -G
CUDA_FLAGS_OPT = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES
CUDA_FLAGS_ADV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -lineinfo -Xcompiler -fPIC -D_FORCE_INLINES
CUDA_FLAGS = $(CUDA_FLAGS_ADV)

LINK_FLAGS = -fPIC

OBJS = timer.o memory.o bla_lib.o main.o

$(BINARY_NAME): $(OBJS)
	$(CXX_COMP) $(OBJS) $(LINK_FLAGS) $(CXX_LIB) $(CUDA_LIB) -o $(BINARY_NAME)

timer.o: timer.cpp timer.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) timer.cpp

memory.o: memory.cpp memory.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) memory.cpp

bla_lib.o: bla_lib.cu bla_lib.hpp matrix.hpp memory.hpp timer.hpp
	$(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) --ptx --source-in-ptx bla_lib.cu -o bla_lib.ptx
	$(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) bla_lib.cu

main.o: main.cpp bla.hpp bla_lib.hpp matrix.hpp memory.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) main.cpp


.PHONY: clean
clean:
	rm -f *.out *.x *.a *.so *.o *.mod *.modmic *.ptx *.log
--------------------------------------------------------------------------------
/Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DmitryLyakh/CUDA_Tutorial/a7fc4021d8843c997c06fa5faf2a31a1431f2dca/Presentation.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
CUDA Tutorial: Basic Linear Algebra (BLA) Library

AUTHOR: Dmitry I. Lyakh (Liakh): quant4me@gmail.com, liakhdi@ornl.gov

Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

LICENSE: GNU Lesser General Public License v.3

Persistent location:
https://github.com/DmitryLyakh/CUDA_Tutorial.git

Presentation from the Petascale Computing Institute 2019:
Presentation.pdf

YouTube video of this tutorial:
https://youtu.be/Zqfa80APkDk

BUILD:
1. Prerequisites: Linux, g++ 5+, CUDA 9+.
2. Update CUDA_INC and CUDA_LIB paths in the Makefile (if needed).
3. Adjust CUDA_ARCH in the Makefile to your GPU compute capability.
4. If your g++ compiler is too new for CUDA, provide an older one in CUDA_HOST.
5. make
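Note: Makefile variables can also be overridden from the command line without
editing the file; the architecture value below is only an illustration, pick
the one matching your GPU (e.g. compute capability 7.0):

    make CUDA_ARCH=sm_70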
--------------------------------------------------------------------------------
/bla.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef BLA_HPP_
#define BLA_HPP_

#include "matrix.hpp"
#include "bla_lib.hpp"

#endif //BLA_HPP_
--------------------------------------------------------------------------------
/bla_lib.cu:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "bla_lib.hpp"

#include <cstdio>
#include <cassert>

#include <cmath>
#include <iostream>
#include <string>
#include <complex>

#include <cublas_v2.h>

namespace bla{

//GPU device constants:
__device__ __constant__ static float zero_fp32 = 0.0f;
__device__ __constant__ static float unity_fp32 = 1.0f;
__device__ __constant__ static double zero_fp64 = 0.0;
__device__ __constant__ static double unity_fp64 = 1.0;
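
//NOTE: These GPU-resident scalar constants exist because bla::init() below
//switches the cuBLAS handles to CUBLAS_POINTER_MODE_DEVICE, so the GEMM
//alpha/beta scalars must reside in GPU memory; host code retrieves their
//addresses with cudaGetSymbolAddress(), e.g. (illustrative sketch):
//  float * one = nullptr;
//  cudaError_t err = cudaGetSymbolAddress((void**)&one,unity_fp32);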

//CUDA floating point data type selector:
template <typename T> struct CudaFPData{};
template <> struct CudaFPData<float>{
 using type = float;
 static constexpr cudaDataType_t kind = CUDA_R_32F;
};
template <> struct CudaFPData<double>{
 using type = double;
 static constexpr cudaDataType_t kind = CUDA_R_64F;
};
template <> struct CudaFPData<std::complex<float>>{
 using type = cuComplex;
 static constexpr cudaDataType_t kind = CUDA_C_32F;
};
template <> struct CudaFPData<std::complex<double>>{
 using type = cuDoubleComplex;
 static constexpr cudaDataType_t kind = CUDA_C_64F;
};


//Number of present GPU devices:
static int totalNumGPUs = 0;

//Current GEMM algorithm:
static int gemmAlgorithm = 0;

//CUDA device properties (for all GPU devices):
cudaDeviceProp * gpuProperty;

//cuBLAS handles (one per device):
cublasHandle_t * cublasHandle;


//Internal tests:
bool test_hello();
bool test_norm();


//CUDA kernel prototypes (the tile-extent template defaults below are
//reconstructed assumptions, chosen to be consistent with the 16x16 thread
//blocks and the 4x4 per-thread register tiling used in the kernel bodies):
__global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src);


template <typename T>
__global__ void gpu_array_norm2(size_t arr_size, const T * __restrict__ arr, volatile T * norm);
__device__ static unsigned int norm_wr_lock = 0; //reduction lock (per GPU)


template <typename T>
__global__ void gpu_array_add(size_t arr_size, T * __restrict__ arr0, const T * __restrict__ arr1, T alpha);


template <typename T>
__global__ void gpu_gemm_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

//template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16, int FRAG_EXT_M = 4, int FRAG_EXT_N = 4>
//__global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
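
//The CudaFPData<T> selector above maps C++ element types to CUDA/cuBLAS data
//type enums at compile time; a minimal compile-time sanity check (an
//illustrative addition, not part of the original code):
static_assert(CudaFPData<float>::kind == CUDA_R_32F,"FP32 type mapping");
static_assert(CudaFPData<double>::kind == CUDA_R_64F,"FP64 type mapping");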

cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k, const float * alpha,
                          const float * A, int lda, const float * B, int ldb,
                          const float * beta, float * C, int ldc)
{
 return cublasSgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}

cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k, const double * alpha,
                          const double * A, int lda, const double * B, int ldb,
                          const double * beta, double * C, int ldc)
{
 return cublasDgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}


//Dispatch wrappers:
template <typename T>
T matrix_norm2_gpu_(size_t num_elems,
                    const T * matrix_body);

template <typename T>
void matrix_addition_gpu_(size_t num_elems,
                          T * matrix0_body,
                          const T * matrix1_body,
                          T alpha);

template <typename T>
void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
                                T * matrix0_body, int nrows0, int ncols0,
                                const T * matrix1_body, int nrows1, int ncols1,
                                const T * matrix2_body, int nrows2, int ncols2);


//IMPLEMENTATION:
__global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src)
{
 int tid = blockIdx.x * blockDim.x + threadIdx.x;
 while(tid < str_len){
  dst[tid] = src[tid];
  tid += gridDim.x * blockDim.x;
 }
 return;
}


template <typename T>
__global__ void gpu_array_norm2(size_t arr_size,            //in: array size
                                const T * __restrict__ arr, //in: pointer to arr[arr_size]
                                volatile T * norm)          //inout: sum of the squared elements of the array
{
 extern __shared__ double thread_norm[]; //blockDim.x elements (dynamic shared memory)

 size_t n = gridDim.x * blockDim.x;
 double tnorm = 0.0;
 for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) tnorm += arr[i] * arr[i];
 thread_norm[threadIdx.x] = tnorm;
 __syncthreads();

 unsigned int s = blockDim.x;
 while(s > 1){
  unsigned int j = (s+1U)>>1; //=(s+1)/2
  if(threadIdx.x + j < s) thread_norm[threadIdx.x] += thread_norm[threadIdx.x+j];
  __syncthreads();
  s = j;
 }

 if(threadIdx.x == 0){
  unsigned int j = 1;
  while(j){j = atomicMax(&norm_wr_lock,1);} //lock
  __threadfence();
  *norm += thread_norm[0]; //accumulate
  __threadfence();
  j = atomicExch(&norm_wr_lock,0); //unlock
 }
 __syncthreads();
 return;
}
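
//NOTE: The spin lock above serializes the per-block accumulation into *norm.
//On devices of compute capability >= 6.0 the same final step could be done
//with a native double-precision atomicAdd (hedged sketch; it would not
//compile for the sm_35 target in the Makefile, hence shown as a comment):
//  if(threadIdx.x == 0) atomicAdd((T*)norm, static_cast<T>(thread_norm[0]));
//which would make the lock, the two __threadfence() calls, and norm_wr_lock
//unnecessary.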

template <typename T>
__global__ void gpu_array_add(size_t arr_size,             //in: array size
                              T * __restrict__ arr0,       //inout: pointer to arr0[arr_size]
                              const T * __restrict__ arr1, //in: pointer to arr1[arr_size]
                              T alpha)                     //in: scaling factor
{
 size_t n = gridDim.x * blockDim.x;
 for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) arr0[i] += arr1[i] * alpha;
 return;
}


template <typename T>
__global__ void gpu_gemm_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                            T * __restrict__ dest,        //inout: pointer to C matrix data
                            const T * __restrict__ left,  //in: pointer to A matrix data
                            const T * __restrict__ right) //in: pointer to B matrix data
{
 size_t ty = blockIdx.y*blockDim.y + threadIdx.y; //global thread index Y
 size_t tx = blockIdx.x*blockDim.x + threadIdx.x; //global thread index X

 size_t n_pos = ty;
 while(n_pos < n){

  size_t m_pos = tx;
  while(m_pos < m){

   T tmp = static_cast<T>(0.0);
   for(size_t k_pos = 0; k_pos < k; ++k_pos){
    tmp += left[k_pos*m + m_pos] * right[n_pos*k + k_pos];
   }
   dest[n_pos*m + m_pos] += tmp;

   m_pos += gridDim.x*blockDim.x;
  }

  n_pos += gridDim.y*blockDim.y;
 }
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                               T * __restrict__ dest,        //inout: pointer to C matrix data
                               const T * __restrict__ left,  //in: pointer to A matrix data
                               const T * __restrict__ right) //in: pointer to B matrix data
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];

 for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension

  for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension

   T tmp = static_cast<T>(0.0); //accumulator

   for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
    int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

    //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
    if(m_pos + threadIdx.x < m){
     for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
      lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
     }
    }

    //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
    if(n_pos + threadIdx.y < n){
     for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
      rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
     }
    }
    __syncthreads();

    //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
    if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
     if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
#pragma unroll
      for(int_t l = 0; l < TILE_EXT_K; ++l){
       tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
      }
     }else{ //number of loop iterations is not known at compile time
      for(int_t l = 0; l < (k_end - k_pos); ++l){
       tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
      }
     }
    }
    __syncthreads();

   } //k_pos

   //Store the element of the C matrix in global memory:
   if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n)
    dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;

  } //m_pos

 } //n_pos
 return;
}
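
//Shared-memory footprint of gpu_gemm_sh_nn per thread block (using the
//assumed default tile extents TILE_EXT_M = TILE_EXT_N = 16, TILE_EXT_K = 64):
//  sizeof(T) * (TILE_EXT_K*TILE_EXT_M + TILE_EXT_N*TILE_EXT_K)
//  = 8 * (64*16 + 16*64) = 16 KB for T = double,
//comfortably below the 48 KB static shared-memory limit per block.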

template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                                   T * __restrict__ dest,        //inout: pointer to C matrix data
                                   const T * __restrict__ left,  //in: pointer to A matrix data
                                   const T * __restrict__ right) //in: pointer to B matrix data
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];

 for(int_t n_pos = blockIdx.y*TILE_EXT_N; n_pos < n; n_pos += gridDim.y*TILE_EXT_N){ //tile offset in Y dimension
  int_t n_end = n_pos + TILE_EXT_N; if(n_end > n) n_end = n;

  for(int_t m_pos = blockIdx.x*TILE_EXT_M; m_pos < m; m_pos += gridDim.x*TILE_EXT_M){ //tile offset in X dimension
   int_t m_end = m_pos + TILE_EXT_M; if(m_end > m) m_end = m;

   if((m_end - m_pos == TILE_EXT_M) && (n_end - n_pos == TILE_EXT_N)){ //complete tile C(TILE_EXT_M,TILE_EXT_N)

    //Initialize registers to zero:
    T dreg[4][4] = {static_cast<T>(0.0)};
    T rreg[4] = {static_cast<T>(0.0)};
    T lreg[4] = {static_cast<T>(0.0)};

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     if(k_end - k_pos == TILE_EXT_K){
#pragma unroll
      for(int_t l = 0; l < TILE_EXT_K; ++l){
#pragma unroll
       for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
#pragma unroll
       for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
#pragma unroll
       for(int_t j = 0; j < 4; ++j){
#pragma unroll
        for(int_t i = 0; i < 4; ++i){
         dreg[j][i] += lreg[i] * rreg[j];
        }
       }
      }
     }else{
      for(int_t l = 0; l < (k_end - k_pos); ++l){
#pragma unroll
       for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
#pragma unroll
       for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
#pragma unroll
       for(int_t j = 0; j < 4; ++j){
#pragma unroll
        for(int_t i = 0; i < 4; ++i){
         dreg[j][i] += lreg[i] * rreg[j];
        }
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store elements of the C matrix in global memory:
#pragma unroll
    for(int_t j = 0; j < 4; ++j){
#pragma unroll
     for(int_t i = 0; i < 4; ++i){
      dest[(n_pos + threadIdx.y + blockDim.y*j)*m + (m_pos + threadIdx.x + blockDim.x*i)] += dreg[j][i];
     }
    }

   }else{ //incomplete tile of C

    //Initialize registers to zero:
    T dreg[4][4] = {static_cast<T>(0.0)};
    T rreg[4] = {static_cast<T>(0.0)};
    T lreg[4] = {static_cast<T>(0.0)};

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     for(int_t l = 0; l < (k_end - k_pos); ++l){
      for(int_t i = 0, j = threadIdx.y; j < n_end - n_pos; j += blockDim.y, i++) rreg[i] = rbuf[j][l];
      for(int_t i = 0, j = threadIdx.x; j < m_end - m_pos; j += blockDim.x, i++) lreg[i] = lbuf[l][j];
#pragma unroll
      for(int_t j = 0; j < 4; ++j){
#pragma unroll
       for(int_t i = 0; i < 4; ++i){
        dreg[j][i] += lreg[i] * rreg[j];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store elements of the C matrix in global memory:
    for(int_t j = 0, n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y, j++){
     for(int_t i = 0, m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x, i++){
      dest[n_loc*m + m_loc] += dreg[j][i];
     }
    }

   }

  } //m_pos

 } //n_pos
 return;
}


template <typename T>
__global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T>
__global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T>
__global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}
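
//Hedged hint for the exercise stubs above (not part of the original code):
//in column-major storage a transposed left operand means A is kept as A(k,m),
//so the brute-force body of gpu_gemm_tn differs from gpu_gemm_nn only in the
//A indexing inside the k loop, e.g.
//  tmp += left[m_pos*k + k_pos] * right[n_pos*k + k_pos];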

template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


/*
template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16, int FRAG_EXT_M = 4, int FRAG_EXT_N = 4>
__global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];
 T lreg[FRAG_EXT_M], rreg[FRAG_EXT_N], dreg[FRAG_EXT_N][FRAG_EXT_M];

 const int_t wyb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) / (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_N;
 const int_t wxb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) % (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_M;
 const int_t ln = (threadIdx.y*blockDim.x + threadIdx.x) % warpSize; //thread lane index inside a warp
 const int_t lny = ln / FRAG_EXT_M; //Y position inside warp fragment
 const int_t lnx = ln % FRAG_EXT_M; //X position inside warp fragment

 for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension

  for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension

   if((m_pos + TILE_EXT_M <= m) && (n_pos + TILE_EXT_N <= n)){ //complete tile (TILE_EXT_N * TILE_EXT_M)

    //Initialize C accumulators to zero:
#pragma unroll
    for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
     for(int_t i = 0; i < FRAG_EXT_M; ++i){
      dreg[j][i] = static_cast<T>(0.0);
     }
    }

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
      lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
      rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     for(int_t l = ln; l < (k_end - k_pos); l += warpSize){
      //Load fragments of shared memory tiles into registers:
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_N; ++j) rreg[j] = rbuf[wyb + j][l];
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_M; ++j) lreg[j] = lbuf[l][wxb + j];
      //Compute the outer product of the tile fragments in registers:
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
       for(int_t i = 0; i < FRAG_EXT_M; ++i){
        dreg[j][i] += lreg[i] * rreg[j];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Perform reduction of the C fragment within each warp:
#pragma unroll
    for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
     for(int_t i = 0; i < FRAG_EXT_M; ++i){
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],16);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],8);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],4);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],2);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],1);
     }
    }

    //Upload the C fragments into the C matrix in global memory:
    dest[(n_pos + wyb + lny)*m + (m_pos + wxb + lnx)] = dreg[lny][lnx];

   }else{ //incomplete tile

    //Initialize the accumulator to zero:
    T tmp = static_cast<T>(0.0);

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     if(m_pos + threadIdx.x < m){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     if(n_pos + threadIdx.y < n){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
      if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
#pragma unroll
       for(int_t l = 0; l < TILE_EXT_K; ++l){
        tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
       }
      }else{ //number of loop iterations is not known at compile time
       for(int_t l = 0; l < (k_end - k_pos); ++l){
        tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store the C matrix element in global memory:
    if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n) dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;

   }

  } //m_pos

 } //n_pos
 return;
}
*/
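
//The commented-out kernel above reduces each C fragment across a warp with
//__shfl_xor_sync() butterflies. The same pattern as a standalone helper
//(an illustrative sketch, assuming a 32-thread warp; unused elsewhere):
__device__ inline double warp_reduce_sum(double val)
{
 for(int offset = 16; offset > 0; offset >>= 1)
  val += __shfl_xor_sync(0xffffffff,val,offset); //XOR butterfly across lanes
 return val;
}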

template <typename T>
T matrix_norm2_gpu_(size_t num_elems, const T * matrix_body)
{
 T norm2 = static_cast<T>(0);
 int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
 T * dnorm2 = static_cast<T*>(allocate(sizeof(T),dev,MemKind::Regular));
 cuerr = cudaMemset((void*)dnorm2,0,sizeof(T)); assert(cuerr == cudaSuccess);
 unsigned int num_blocks = 1024; unsigned int num_threads = 256;
 gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(num_elems,matrix_body,dnorm2);
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(T),cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 deallocate((void*)dnorm2);
 return norm2;
}


template <typename T>
void matrix_addition_gpu_(size_t num_elems, T * matrix0_body, const T * matrix1_body, T alpha)
{
 unsigned int num_blocks = 4096; unsigned int num_threads = 256;
 gpu_array_add<<<num_blocks,num_threads>>>(num_elems,matrix0_body,matrix1_body,alpha);
 cudaError_t cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 return;
}
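
//Dimension conventions used by matrix_multiplication_gpu_ below (summary of
//the code that follows, column-major storage assumed throughout):
//  C(m,n) += op(A)(m,k) * op(B)(k,n), with m = nrows0 and n = ncols0;
//  k = ncols1 when op(A) = A and k = nrows1 when op(A) = A^T.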

template <typename T>
void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
                                T * matrix0_body, int nrows0, int ncols0,
                                const T * matrix1_body, int nrows1, int ncols1,
                                const T * matrix2_body, int nrows2, int ncols2)
{
 if(gemmAlgorithm == 0){ //BLA GEMM brute-force
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else if(gemmAlgorithm == 1){ //BLA GEMM with shared memory
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else if(gemmAlgorithm == 2){ //BLA GEMM with shared memory and register file
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_reg_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else{ //cuBLAS GEMM
  int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
  int m = nrows1; cublasOperation_t transa = CUBLAS_OP_N;
  if(left_transp){m = ncols1; transa = CUBLAS_OP_T;}
  int n = ncols2; cublasOperation_t transb = CUBLAS_OP_N;
  if(right_transp){n = nrows2; transb = CUBLAS_OP_T;}
  int k = ncols1; if(left_transp) k = nrows1;
  T *alpha, *beta;
  if(CudaFPData<T>::kind == CUDA_R_32F){
   cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp32); assert(cuerr == cudaSuccess);
   cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp32); assert(cuerr == cudaSuccess);
  }else if(CudaFPData<T>::kind == CUDA_R_64F){
   cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp64); assert(cuerr == cudaSuccess);
   cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp64); assert(cuerr == cudaSuccess);
  }else{
   assert(false);
  }
#ifdef USE_CUBLAS_GEMM_EX
  cublasStatus_t custat = cublasGemmEx(cublasHandle[dev],
                                       transa,transb,
                                       m,n,k,
                                       alpha,
                                       matrix1_body,CudaFPData<T>::kind,nrows1,
                                       matrix2_body,CudaFPData<T>::kind,nrows2,
                                       beta,
                                       matrix0_body,CudaFPData<T>::kind,nrows0,
                                       CudaFPData<T>::kind, CUBLAS_GEMM_DEFAULT);
#else
  cublasStatus_t custat = cublasGemm(cublasHandle[dev],
                                     transa,transb,
                                     m,n,k,
                                     alpha,
                                     matrix1_body,nrows1,
                                     matrix2_body,nrows2,
                                     beta,
                                     matrix0_body,nrows0);
#endif
  if(custat != CUBLAS_STATUS_SUCCESS) std::cout << "#ERROR(cublasGemm): Error " << custat << std::endl;
  assert(custat == CUBLAS_STATUS_SUCCESS);
 }
 cudaError_t cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError();
 if(cuerr != cudaSuccess){
  const char * error_str = cudaGetErrorString(cuerr);
  std::cout << "ERROR(bla::matrix_multiplication_gpu_): CUDA kernel launch failure: " << std::endl;
  printf("%s\n",error_str);
 }
 assert(cuerr == cudaSuccess);
 return;
}


float matrix_norm2_gpu(size_t num_elems, const float * matrix_body)
{
 return matrix_norm2_gpu_(num_elems,matrix_body);
}

double matrix_norm2_gpu(size_t num_elems, const double * matrix_body)
{
 return matrix_norm2_gpu_(num_elems,matrix_body);
}


void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha)
{
 return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
}

void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha)
{
 return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
}


void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               float * matrix0_body, int nrows0, int ncols0,
                               const float * matrix1_body, int nrows1, int ncols1,
                               const float * matrix2_body, int nrows2, int ncols2)
{
 return matrix_multiplication_gpu_(left_transp,right_transp,
                                   matrix0_body,nrows0,ncols0,
                                   matrix1_body,nrows1,ncols1,
                                   matrix2_body,nrows2,ncols2);
}

void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               double * matrix0_body, int nrows0, int ncols0,
                               const double * matrix1_body, int nrows1, int ncols1,
                               const double * matrix2_body, int nrows2, int ncols2)
{
 return matrix_multiplication_gpu_(left_transp,right_transp,
                                   matrix0_body,nrows0,ncols0,
                                   matrix1_body,nrows1,ncols1,
                                   matrix2_body,nrows2,ncols2);
}
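
//A minimal (hypothetical) host-side call of the public wrappers above,
//assuming column-major device buffers dA, dB, dC of a 1024x1024 problem:
//  bla::matrix_multiplication_gpu(false,false,
//                                 dC,1024,1024,
//                                 dA,1024,1024,
//                                 dB,1024,1024); //dC += dA * dB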

void init()
{
 totalNumGPUs = 0;
 cudaError_t cuerr = cudaGetDeviceCount(&totalNumGPUs); assert(cuerr == cudaSuccess);
 std::cout << "Found " << totalNumGPUs << " NVIDIA GPU(s)" << std::endl;
 if(totalNumGPUs > 0){
  cublasStatus_t cuberr;
  gpuProperty = new cudaDeviceProp[totalNumGPUs];
  cublasHandle = new cublasHandle_t[totalNumGPUs];
  //Init each GPU:
  for(int i = (totalNumGPUs - 1); i >= 0; --i){
   cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
   cuerr = cudaGetDeviceProperties(&(gpuProperty[i]),i); assert(cuerr == cudaSuccess);
   cuberr = cublasCreate(&(cublasHandle[i])); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuberr = cublasSetPointerMode(cublasHandle[i],CUBLAS_POINTER_MODE_DEVICE); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
   std::cout << "Initialized GPU " << i << std::endl;
  }
  //Enable P2P access between GPUs:
  if(totalNumGPUs > 1){
   for(int i = (totalNumGPUs - 1); i >= 0; --i){
    if(gpuProperty[i].unifiedAddressing != 0){
     cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
     for(int j = (totalNumGPUs - 1); j >= 0; --j){
      if(j != i){
       if(gpuProperty[j].unifiedAddressing != 0){
        cuerr = cudaDeviceEnablePeerAccess(j,0);
        if(cuerr == cudaSuccess){
         std::cout << "GPU " << i << " can access peer GPU " << j << std::endl;
        }else{
         std::cout << "GPU " << i << " cannot access peer GPU " << j << std::endl;
        }
       }
      }
     }
    }
   }
  }
  cuerr = cudaGetLastError(); //clear any non-fatal error left by peer-access probing
 }
 std::cout << "BLA library initialized successfully" << std::endl;
 return;
}


void shutdown()
{
 if(totalNumGPUs > 0){
  cudaError_t cuerr;
  cublasStatus_t cuberr;
  for(int i = 0; i < totalNumGPUs; ++i){
   cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess); //reset must target device i, not the current device
   cuberr = cublasDestroy(cublasHandle[i]); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuerr = cudaDeviceReset(); assert(cuerr == cudaSuccess);
   std::cout << "Destroyed primary context on GPU " << i << std::endl;
  }
  delete [] cublasHandle;
  delete [] gpuProperty;
 }
 totalNumGPUs = 0;
 std::cout << "BLA library shut down successfully" << std::endl;
 return;
}


void print_device_properties(int device)
{
 cudaDeviceProp prop;
 cudaError_t cuerr = cudaGetDeviceProperties(&prop,device);
 if(cuerr == cudaSuccess){
  std::cout << "Properties of NVIDIA GPU " << device << std::endl;
  std::cout << " Compute capability: " << prop.major << "." << prop.minor << std::endl;
  std::cout << " Register file size: " << prop.regsPerBlock << std::endl;
  std::cout << " Shared memory size: " << prop.sharedMemPerBlock << std::endl;
 }else{
  std::cout << "#ERROR(bla::print_device_properties): Unable to get properties for device " << device << std::endl;
  assert(false);
 }
 return;
}


void reset_gemm_algorithm(int algo)
{
 gemmAlgorithm = algo;
 return;
}


bool test_hello()
{
 std::cout << "Testing presence on GPU ..." << std::endl;
 const std::string s1("Am I really on GPU?");
 const std::string s2("Waiting for the answer ...");
 const std::string s3("Yes, you are!");

 size_t max_len = std::max(s1.size(),std::max(s2.size(),s3.size()));
 size_t str_len = max_len+1;

 char * hs1 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs1 != nullptr);
 char * ds1 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds1 != nullptr);
 int i = 0; for(const char & symb: s1) hs1[i++]=symb; hs1[s1.size()]='\0';
 printf("%s ",hs1);

 char * hs3 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs3 != nullptr);
 char * ds3 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds3 != nullptr);
 i = 0; for(const char & symb: s3) hs3[i++]=symb; hs3[s3.size()]='\0';

 cudaError_t cuerr = cudaMemcpy((void*)ds1,(void*)hs1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 cuerr = cudaMemcpy((void*)ds3,(void*)hs3,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);

 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 gpu_test_presence<<<16,256>>>(str_len,ds1,ds3);
 std::cout << s2 << " ";
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);

 cuerr = cudaMemcpy((void*)hs1,(void*)ds1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 printf("%s\n",hs1);

 deallocate((void*)ds3);
 deallocate((void*)hs3);

 deallocate((void*)ds1);
 deallocate((void*)hs1);

 return true;
}


bool test_norm()
{
 std::cout << "Testing norm2 on GPU 0 ... ";
 const float num_tolerance = 1e-5f;
 const size_t vol = 1000000;
 const size_t dsize = vol * sizeof(float);
 float * arr0 = static_cast<float*>(allocate(dsize,-1,MemKind::Pinned));
 float * arr1 = static_cast<float*>(allocate(dsize,0,MemKind::Regular));
 float * dnorm2 = static_cast<float*>(allocate(sizeof(float),0,MemKind::Regular));

 for(size_t i = 0; i < vol; ++i) arr0[i] = 1.0f/sqrt((float)vol); //value of each element making the norm equal to 1

 cudaError_t cuerr = cudaMemcpy((void*)arr1,(void*)arr0,dsize,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 cuerr = cudaMemset((void*)dnorm2,0,sizeof(float)); assert(cuerr == cudaSuccess); //the accumulator must start from zero

 unsigned int num_blocks = 1024; unsigned int num_threads = 256;
 gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(vol,arr1,dnorm2);
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);

 float norm2 = 0.0f;
 cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(float),cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 std::cout << "Norm2 = " << norm2 << " (correct value is 1.0)" << std::endl;
 assert(std::abs(norm2 - 1.0f) < num_tolerance);

 deallocate((void*)dnorm2);
 deallocate((void*)arr1);
 deallocate((void*)arr0);
 return true;
}


bool test_bla()
{
 if(!test_hello()) return false;
 if(!test_norm()) return false;
 return true;
}

} //namespace bla
--------------------------------------------------------------------------------
/bla_lib.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef BLA_LIB_HPP_
#define BLA_LIB_HPP_

#include "memory.hpp"
#include "timer.hpp"

#include <cstddef>

#include <cuda_runtime.h>

namespace bla{

/** Initialization of BLA **/
void init();

/** Shutdown of BLA **/
void shutdown();

/** Testing BLA **/
bool test_bla();

/** Device properties **/
void print_device_properties(int device);

/** Resets the GEMM algorithm:
    0: custom brute-force GEMM;
    1: custom GEMM with shared memory;
    2: custom GEMM with shared memory and register tiling;
    any other value: cuBLAS GEMM. **/
void reset_gemm_algorithm(int algo);

/** Matrix squared "norm" (sum of the squared elements) **/
float matrix_norm2_gpu(size_t num_elems, const float * matrix_body);
double matrix_norm2_gpu(size_t num_elems, const double * matrix_body);

/** Matrix addition **/
void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha);
void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha);

/** Matrix multiplication **/
void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               float * matrix0_body, int nrows0, int ncols0,
                               const float * matrix1_body, int nrows1, int ncols1,
                               const float * matrix2_body, int nrows2, int ncols2);
void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               double * matrix0_body, int nrows0, int ncols0,
                               const double * matrix1_body, int nrows1, int ncols1,
                               const double * matrix2_body, int nrows2, int ncols2);

} //namespace bla

#endif //BLA_LIB_HPP_
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "bla.hpp"

#include <cmath>

void use_bla()
{
 //Pick which GEMM tests you enable:
 const bool TEST_BLA_GEMM_BRUTE = true;    //enables/disables testing of the brute-force GEMM
 const bool TEST_BLA_GEMM_SHARED = true;   //enables/disables testing of the shared-memory GEMM
 const bool TEST_BLA_GEMM_REGISTER = true; //enables/disables testing of the register-based GEMM

 std::cout << "Let's try to use the BLA library ..." << std::endl;

 //Create matrix A:
 bla::Matrix<float> A(2000,2000);
 //Allocate matrix A body on Host:
 A.allocateBody(-1,bla::MemKind::Pinned);
 //Set matrix A body to some non-trivial value on Host:
 A.setBodyHost();

 //Create matrix B:
 bla::Matrix<float> B(2000,2000);
 //Allocate matrix B body on Host:
 B.allocateBody(-1,bla::MemKind::Pinned);
 //Set matrix B body to some non-trivial value on Host:
 B.setBodyHost();

 //Create matrix C:
 bla::Matrix<float> C(2000,2000);
 //Allocate matrix C body on GPU#0:
 C.allocateBody(0,bla::MemKind::Regular);

 //Create matrix D:
 bla::Matrix<float> D(2000,2000);
 //Allocate matrix D body on GPU#0:
 D.allocateBody(0,bla::MemKind::Regular);

 //Copy matrix A to GPU#0 from Host:
 A.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
 //Compute matrix A norm on GPU#0:
 auto normA = A.computeNorm(0);
 std::cout << "Matrix A norm = " << normA << std::endl;

 //Copy matrix B to GPU#0 from Host:
 B.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
 //Compute matrix B norm on GPU#0:
 auto normB = B.computeNorm(0);
 std::cout << "Matrix B norm = " << normB << std::endl;

 //Determine the total number of floating point operations:
 double flops = 2.0 * std::sqrt(static_cast<double>(A.getVolume()) *
                                static_cast<double>(B.getVolume()) *
                                static_cast<double>(C.getVolume()));
 std::cout << "Matrix multiplication C+=A*B requires " << flops/1e9 << " Gflop" << std::endl;
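
 //Note: 2*sqrt((m*k)*(k*n)*(m*n)) == 2*m*n*k, i.e. one multiply plus one
 //add per term of the inner products forming C(m,n) += A(m,k)*B(k,n).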
"; 98 | double tms = bla::time_sys_sec(); 99 | C.multiplyAdd(false,false,A,B,0); 100 | double tmf = bla::time_sys_sec(); 101 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 102 | //Check correctness on GPU#0: 103 | C.add(D,1.0f,0); 104 | auto norm_diff = C.computeNorm(0); 105 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 106 | if(std::abs(norm_diff) > 1e-7){ 107 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 108 | std::exit(1); 109 | } 110 | } 111 | } 112 | 113 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory: 114 | if(TEST_BLA_GEMM_SHARED){ 115 | for(int repeat = 0; repeat < 2; ++repeat){ 116 | C.zeroBody(0); //set matrix C body to zero on GPU#0 117 | bla::reset_gemm_algorithm(1); 118 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory ... "; 119 | double tms = bla::time_sys_sec(); 120 | C.multiplyAdd(false,false,A,B,0); 121 | double tmf = bla::time_sys_sec(); 122 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 123 | //Check correctness on GPU#0: 124 | C.add(D,1.0f,0); 125 | auto norm_diff = C.computeNorm(0); 126 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 127 | if(std::abs(norm_diff) > 1e-7){ 128 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 129 | std::exit(1); 130 | } 131 | } 132 | } 133 | 134 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory and registers: 135 | if(TEST_BLA_GEMM_REGISTER){ 136 | for(int repeat = 0; repeat < 2; ++repeat){ 137 | C.zeroBody(0); //set matrix C body to zero on GPU#0 138 | bla::reset_gemm_algorithm(2); 139 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory and registers ... "; 140 | double tms = bla::time_sys_sec(); 141 | C.multiplyAdd(false,false,A,B,0); 142 | double tmf = bla::time_sys_sec(); 143 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 144 | //Check correctness on GPU#0: 145 | C.add(D,1.0f,0); 146 | auto norm_diff = C.computeNorm(0); 147 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 148 | if(std::abs(norm_diff) > 1e-7){ 149 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 150 | std::exit(1); 151 | } 152 | } 153 | } 154 | 155 | std::cout << "Seems like it works!" << std::endl; 156 | return; 157 | } 158 | 159 | 160 | int main(int argc, char ** argv) 161 | { 162 | //Initialize BLA library: 163 | bla::init(); 164 | bla::print_device_properties(0); //check compute capability 165 | 166 | //Test BLA library: 167 | bla::test_bla(); 168 | 169 | //Use BLA library: 170 | use_bla(); 171 | 172 | //Shutdown BLA library: 173 | bla::shutdown(); 174 | 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /matrix.hpp: -------------------------------------------------------------------------------- 1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library 2 | 3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh) 4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle) 5 | 6 | !This file is part of CUDA BLA tutorial. 
7 | 8 | !CUDA BLA is free software: you can redistribute it and/or modify 9 | !it under the terms of the GNU Lesser General Public License as published 10 | !by the Free Software Foundation, either version 3 of the License, or 11 | !(at your option) any later version. 12 | 13 | !CUDA BLA is distributed in the hope that it will be useful, 14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | !GNU Lesser General Public License for more details. 17 | 18 | !You should have received a copy of the GNU Lesser General Public License 19 | !along with CUDA BLA. If not, see . */ 20 | 21 | #ifndef MATRIX_HPP_ 22 | #define MATRIX_HPP_ 23 | 24 | #include "bla_lib.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace bla{ 37 | 38 | template 39 | class Matrix{ 40 | 41 | public: 42 | 43 | explicit Matrix(int nrows, int ncols); 44 | 45 | Matrix(const Matrix & matrix) = delete; 46 | Matrix & operator=(const Matrix &) = delete; 47 | Matrix(Matrix && matrix) noexcept = default; 48 | Matrix & operator=(Matrix && matrix) noexcept = default; 49 | virtual ~Matrix(); 50 | 51 | /** Returns the number of rows in the matrix **/ 52 | int getNumRows() const; 53 | /** Returns the number of columns in the matrix **/ 54 | int getNumCols() const; 55 | /** Returns the volume of the matrix (number of elements) **/ 56 | std::size_t getVolume() const; 57 | /** Returns the size of the matrix in bytes **/ 58 | std::size_t getSize() const; 59 | /** Returns a pointer to the memory resource on requested device (if any) **/ 60 | T * getBodyPtr(int device) const; 61 | /** Allocates memory resource of requested kind on requested device **/ 62 | void allocateBody(int device, MemKind memkind = MemKind::Regular); 63 | /** Deallocates memory resource on requested device **/ 64 | void deallocateBody(int device); 65 | /** Marks matrix body status on a given device as up-to-date or not (outdated) **/ 66 | void markBodyStatus(int device, bool status); 67 | /** Initializes matrix body to zero on a given device **/ 68 | void zeroBody(int device); 69 | /** Initializes matrix body to some non-trivial value on Host **/ 70 | void setBodyHost(); 71 | /** Synchronizes matrix body on a given device with the body from another device. 72 | By default the source device is Host (if up to date). 
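
//A minimal usage sketch of the Matrix class declared above (it mirrors what
//main.cpp does; the device id 0 and the extents are illustrative):
//  bla::Matrix<float> M(512,512);
//  M.allocateBody(-1,bla::MemKind::Pinned); //pinned Host buffer
//  M.setBodyHost();                         //fill on Host
//  M.syncBody(0,-1);                        //copy Host --> GPU#0
//  double nrm = M.computeNorm(0);           //norm on GPU#0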
**/ 73 | void syncBody(int device, int source_device = -1); 74 | 75 | /** Computes the norm of the matrix on a given device **/ 76 | double computeNorm(int device = -1); 77 | /** Performs matrix addition on a given device **/ 78 | void add(Matrix & Amat, T alpha = static_cast(1.0), int device = -1); 79 | /** Performs matrix multiplication on a given device **/ 80 | void multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device = -1); 81 | 82 | private: 83 | 84 | //Memory resource descriptor: 85 | typedef struct{ 86 | int device; 87 | void * ptr; 88 | MemKind memkind; 89 | bool uptodate; 90 | } Resource; 91 | 92 | //Data members: 93 | int nrows_; //number of rows 94 | int ncols_; //number of columns 95 | std::size_t elem_size_; //matrix element size in bytes 96 | std::list location_; //list of memory resources occupied by the matrix 97 | }; 98 | 99 | 100 | //TEMPLATE DEFINITIONS: 101 | template 102 | Matrix::Matrix(int nrows, int ncols): 103 | nrows_(nrows), ncols_(ncols), elem_size_(sizeof(T)) 104 | { 105 | static_assert(std::is_floating_point::value,"#ERROR(BLA::Matrix::Matrix): Matrix type must be floating point!"); 106 | assert(nrows_ > 0 && ncols_ > 0 && elem_size_ > 0); 107 | std::cout << "Matrix created with dimensions (" << nrows_ << "," << ncols_ << ")" << std::endl; 108 | } 109 | 110 | 111 | template 112 | Matrix::~Matrix() 113 | { 114 | for(auto & loc: location_) deallocate(loc.ptr); 115 | std::cout << "Matrix destroyed" << std::endl; 116 | } 117 | 118 | 119 | template 120 | int Matrix::getNumRows() const 121 | { 122 | return nrows_; 123 | } 124 | 125 | 126 | template 127 | int Matrix::getNumCols() const 128 | { 129 | return ncols_; 130 | } 131 | 132 | 133 | template 134 | std::size_t Matrix::getVolume() const 135 | { 136 | return (static_cast(nrows_)*static_cast(ncols_)); //number of elements 137 | } 138 | 139 | 140 | template 141 | std::size_t Matrix::getSize() const 142 | { 143 | return (static_cast(nrows_)*static_cast(ncols_)*elem_size_); //matrix size in bytes 144 | } 145 | 146 | 147 | template 148 | T * Matrix::getBodyPtr(int device) const 149 | { 150 | T * ptr = nullptr; 151 | for(const auto & loc: location_){ 152 | if(loc.device == device){ 153 | ptr = static_cast(loc.ptr); 154 | break; 155 | } 156 | } 157 | return ptr; 158 | } 159 | 160 | 161 | template 162 | void Matrix::allocateBody(int device, MemKind memkind) 163 | { 164 | std::size_t mat_size = this->getSize(); //matrix size in bytes 165 | void * ptr = allocate(mat_size,device,memkind); //allocate memory of requested kind on requested device 166 | assert(ptr != nullptr); 167 | location_.emplace_back(Resource{device,ptr,memkind,false}); //save the new memory descriptor (Resource) 168 | std::cout << "New resource acquired on device " << device << std::endl; 169 | return; 170 | } 171 | 172 | 173 | template 174 | void Matrix::deallocateBody(int device) 175 | { 176 | for(auto & loc: location_){ 177 | if(loc.device == device){ 178 | deallocate(loc.ptr); 179 | std::cout << "Resource released on device " << device << std::endl; 180 | } 181 | } 182 | location_.remove_if([device](const Resource & res){return (res.device == device);}); 183 | return; 184 | } 185 | 186 | 187 | template 188 | void Matrix::markBodyStatus(int device, bool status) 189 | { 190 | for(auto & loc: location_){ 191 | if(loc.device == device) loc.uptodate = status; 192 | } 193 | return; 194 | } 195 | 196 | 197 | template 198 | void Matrix::zeroBody(int device) 199 | { 200 | T * mat = this->getBodyPtr(device); 201 | if(mat != 
template <typename T>
void Matrix<T>::zeroBody(int device)
{
  T * mat = this->getBodyPtr(device);
  if(mat != nullptr){
    std::size_t mat_size = this->getSize();
    assert(mat_size > 0);
    if(device < 0){ //Host
      std::memset(((void*)mat),0,mat_size);
    }else{ //GPU device
      int dev;
      cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
      if(device != dev){
        cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
      }
      cuerr = cudaMemset(((void*)mat),0,mat_size); assert(cuerr == cudaSuccess);
      if(device != dev){
        cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess); //restore the original device
      }
    }
    this->markBodyStatus(device,true); //mark the matrix body on the device as up-to-date
  }else{
    std::cout << "#ERROR(BLA::Matrix::zeroBody): Matrix does not exist on device " << device << std::endl;
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::setBodyHost()
{
  T * mat = this->getBodyPtr(-1); //-1 is the Host device id
  if(mat != nullptr){
    for(std::size_t j = 0; j < ncols_; ++j){
      std::size_t offset = j*nrows_;
      for(std::size_t i = 0; i < nrows_; ++i){
        //mat[offset+i] = static_cast<T>(1)/(static_cast<T>(i+7) + static_cast<T>(j+13)); //some value
        mat[offset+i] = static_cast<T>(1)/std::log(static_cast<T>(std::rand()+13)); //some value
      }
    }
    this->markBodyStatus(-1,true); //mark the matrix body on Host as up-to-date
  }else{
    std::cout << "#ERROR(BLA::Matrix::setBodyHost): Matrix does not exist on Host!" << std::endl;
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::syncBody(int device, int source_device)
{
  if(device != source_device){
    Resource destination_resource, source_resource;
    bool destination_found = false;
    bool source_found = false;
    for(auto & loc: location_){
      if(!source_found && loc.device == source_device && loc.uptodate){
        source_resource = loc;
        source_found = true;
      }
      if(!destination_found && loc.device == device){
        destination_resource = loc;
        destination_found = true;
      }
    }
    if(!destination_found){ //no body on the destination device yet: acquire one
      this->allocateBody(device,MemKind::Regular);
      for(const auto & loc: location_){
        if(loc.device == device){
          destination_resource = loc;
          destination_found = true;
          break;
        }
      }
    }
    if(source_found){
      cudaError_t cuerr = cudaMemcpy(destination_resource.ptr,source_resource.ptr,this->getSize(),cudaMemcpyDefault);
      assert(cuerr == cudaSuccess);
      this->markBodyStatus(device,true); //mark the matrix body on the device as up-to-date
    }else{
      std::cout << "#ERROR(BLA::Matrix::syncBody): Provided source device " << source_device << " has no up-to-date matrix body!" << std::endl;
      assert(false);
    }
  }
  return;
}
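
//The Host branches in the three routines below are deliberately left
//unimplemented as tutorial exercises. Assuming matrix_norm2_gpu returns the
//sum of squared elements, a matching Host branch for computeNorm could be
//sketched as:
//
//  for(std::size_t i = 0; i < vol; ++i) result += matrix_body[i]*matrix_body[i];
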
template <typename T>
double Matrix<T>::computeNorm(int device)
{
  std::size_t vol = this->getVolume();
  T * matrix_body = this->getBodyPtr(device); assert(matrix_body != nullptr);
  double result = 0.0;
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    result = matrix_norm2_gpu(vol,matrix_body);
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return result;
}


template <typename T>
void Matrix<T>::add(Matrix & Amat, T alpha, int device)
{
  std::size_t vol = this->getVolume();
  assert(Amat.getVolume() == vol);
  T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
  const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    matrix_addition_gpu(vol,matrix0_body,matrix1_body,alpha);
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device)
{
  T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
  const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
  const T * matrix2_body = Bmat.getBodyPtr(device); assert(matrix2_body != nullptr);
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    matrix_multiplication_gpu(left_transp,right_transp,
                              matrix0_body,this->getNumRows(),this->getNumCols(),
                              matrix1_body,Amat.getNumRows(),Amat.getNumCols(),
                              matrix2_body,Bmat.getNumRows(),Bmat.getNumCols());
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return;
}
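
//Usage sketch (illustrative): as the name suggests, multiplyAdd accumulates
//this += op(A) * op(B), where op transposes its argument when the
//corresponding flag is true. For C(m,n) += A(m,k) * B(k,n) on GPU 0, with all
//three bodies already resident on device 0:
//
//  C.multiplyAdd(false,false,A,B,0);
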
} //namespace bla

#endif //MATRIX_HPP_
--------------------------------------------------------------------------------
/memory.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "memory.hpp"

#include <cuda_runtime.h>

#include <cstdlib>

#include <cassert>
#include <iostream>
#include <map>

namespace bla{

//Memory chunk descriptor:
typedef struct{
  int device;       //device id (-1: Host; >=0: GPU)
  MemKind mem_kind; //memory kind
  size_t mem_size;  //memory chunk size in bytes
} MemChunkDescr;


//Register of allocated memory chunks:
std::map<void*,MemChunkDescr> mem_reg;


void * allocate(size_t size, int device, MemKind mem_kind)
{
  void * ptr = nullptr;
  cudaError_t cuerr;

  if(size > 0){
    //Allocate memory:
    switch(mem_kind){
      case MemKind::Regular:
        if(device < 0){ //Host
          ptr = malloc(size);
        }else{ //GPU device
          int dev;
          cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
          if(device != dev){
            cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
          }
          cuerr = cudaMalloc(&ptr,size); assert(cuerr == cudaSuccess);
          if(device != dev){
            cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
          }
        }
        break;
      case MemKind::Pinned:
        if(device < 0){ //Host
          cuerr = cudaHostAlloc(&ptr,size,cudaHostAllocPortable); assert(cuerr == cudaSuccess);
        }else{ //GPU device
          std::cout << "#ERROR(BLA::memory::allocate): Pinned memory is not available on GPU!" << std::endl;
          assert(false);
        }
        break;
      case MemKind::Mapped:
        if(device < 0){ //Host
          cuerr = cudaHostAlloc(&ptr,size,cudaHostAllocPortable|cudaHostAllocMapped); assert(cuerr == cudaSuccess);
        }else{ //GPU device
          std::cout << "#ERROR(BLA::memory::allocate): Mapped pinned memory is not available on GPU!" << std::endl;
          assert(false);
        }
        break;
      case MemKind::Unified:
        std::cout << "#ERROR(BLA::memory::allocate): Unified memory allocation is not implemented!" << std::endl;
        assert(false);
        break;
    }
  }
  //Register the memory chunk with BLA:
  if(ptr != nullptr){
    auto res = mem_reg.emplace(std::make_pair(ptr,MemChunkDescr{device,mem_kind,size}));
    assert(res.second); //the same pointer must not be registered twice
  }
  return ptr;
}
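
//Because every allocation is recorded in mem_reg, deallocate() below needs
//only the pointer: the device and memory kind are recovered from the
//registry. A hypothetical caller (from outside the bla namespace):
//
//  void * buf = bla::allocate(n*sizeof(double),-1,bla::MemKind::Pinned);
//  ... use the buffer ...
//  bla::deallocate(buf); //kind and device are looked up internally
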
void deallocate(void * ptr)
{
  assert(ptr != nullptr);
  //Find the memory chunk descriptor:
  auto pos = mem_reg.find(ptr);
  if(pos == mem_reg.end()){
    std::cout << "#ERROR(BLA::memory::deallocate): Attempt to deallocate a pointer not allocated by BLA!" << std::endl;
    assert(false);
  }
  auto device = pos->second.device;
  auto mem_kind = pos->second.mem_kind;
  //Deallocate memory:
  cudaError_t cuerr;
  switch(mem_kind){
    case MemKind::Regular:
      if(device < 0){ //Host
        free(ptr);
      }else{ //GPU device
        int dev;
        cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
        if(device != dev){
          cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
        }
        cuerr = cudaFree(ptr); assert(cuerr == cudaSuccess);
        if(device != dev){
          cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
        }
      }
      break;
    case MemKind::Pinned:
      cuerr = cudaFreeHost(ptr); assert(cuerr == cudaSuccess);
      break;
    case MemKind::Mapped:
      cuerr = cudaFreeHost(ptr); assert(cuerr == cudaSuccess);
      break;
    case MemKind::Unified:
      std::cout << "#ERROR(BLA::memory::deallocate): Unified memory deallocation is not implemented!" << std::endl;
      assert(false);
      break;
  }
  //Delete the memory chunk descriptor:
  mem_reg.erase(ptr);
  return;
}

} //namespace bla
--------------------------------------------------------------------------------
/memory.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef MEMORY_HPP_
#define MEMORY_HPP_

#include <cstddef> //size_t

namespace bla{

//Memory kinds:
enum class MemKind{
  Regular, //regular global memory (either Host or GPU device)
  Pinned,  //pinned memory (Host only)
  Mapped,  //mapped pinned memory (Host only)
  Unified  //unified memory (accessible from Host and GPU)
};

/** Allocates memory on any device:
    Host (CPU): device = -1;
    GPU device: device >= 0. **/
void * allocate(size_t size,                          //in: requested memory size in bytes
                int device = -1,                      //in: device (-1: Host; >=0: corresponding GPU)
                MemKind mem_kind = MemKind::Regular); //in: requested memory kind (see above)

/** Deallocates previously allocated memory on any device. **/
void deallocate(void * ptr); //in: pointer to the previously allocated memory chunk
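
//Illustrative note (not part of the original tutorial): MemKind::Mapped
//returns a Host pointer whose memory is also visible to the GPU; the
//device-side alias would be obtained via the CUDA runtime, e.g. (sketch):
//
//  void * host_ptr = allocate(bytes,-1,MemKind::Mapped);
//  void * dev_ptr = nullptr;
//  cudaError_t cuerr = cudaHostGetDevicePointer(&dev_ptr,host_ptr,0);
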
} //namespace bla

#endif //MEMORY_HPP_
--------------------------------------------------------------------------------
/timer.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "timer.hpp"

#include <chrono>

namespace bla{

double time_sys_sec()
{
  auto stamp = std::chrono::system_clock::now(); //current time point
  auto durat = std::chrono::duration<double>(stamp.time_since_epoch()); //duration (sec) since the beginning of the clock epoch
  return durat.count(); //number of seconds
}


double time_high_sec()
{
  auto stamp = std::chrono::high_resolution_clock::now(); //current time point
  auto durat = std::chrono::duration<double>(stamp.time_since_epoch()); //duration (sec) since the beginning of the clock epoch
  return durat.count(); //number of seconds
}

} //namespace bla
--------------------------------------------------------------------------------
/timer.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef TIMER_HPP_
#define TIMER_HPP_

namespace bla{

/** System time stamp in seconds (thread-global) **/
double time_sys_sec();

/** High-resolution time stamp in seconds **/
double time_high_sec();

} //namespace bla

#endif //TIMER_HPP_
--------------------------------------------------------------------------------