├── .gitignore
├── Makefile
├── Presentation.pdf
├── README.md
├── bla.hpp
├── bla_lib.cu
├── bla_lib.hpp
├── main.cpp
├── matrix.hpp
├── memory.cpp
├── memory.hpp
├── timer.cpp
└── timer.hpp

/.gitignore:
--------------------------------------------------------------------------------
*.o
*.mod
*.modmic
*.ptx
*.i
*.ii
*.cudafe*
*.fatbin*
*.cubin
*.module_id
*.hash
*.a
*.so
*.x
*.log
*.out
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
BINARY_NAME = bla_test.x

CXX_COMP = g++
CXX_FLAGS_DEV = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES -g
CXX_FLAGS_OPT = -c -O3 -std=c++11 -fPIC -D_FORCE_INLINES
CXX_FLAGS = $(CXX_FLAGS_OPT)
CXX_INC =
CXX_LIB = -lstdc++

CUDA_COMP = nvcc
CUDA_HOST = /usr/bin/g++
CUDA_ARCH = sm_35
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64 -lcublas -lcudart
CUDA_FLAGS_DEV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES -g -G
CUDA_FLAGS_OPT = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -Xcompiler -fPIC -D_FORCE_INLINES
CUDA_FLAGS_ADV = --compile -ccbin $(CUDA_HOST) -std=c++11 -arch=$(CUDA_ARCH) -O3 -m64 -w --resource-usage --ptxas-options=-v -lineinfo -Xcompiler -fPIC -D_FORCE_INLINES
CUDA_FLAGS = $(CUDA_FLAGS_ADV)

LINK_FLAGS = -fPIC

OBJS = timer.o memory.o bla_lib.o main.o

$(BINARY_NAME): $(OBJS)
	$(CXX_COMP) $(OBJS) $(LINK_FLAGS) $(CXX_LIB) $(CUDA_LIB) -o $(BINARY_NAME)

timer.o: timer.cpp timer.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) timer.cpp

memory.o: memory.cpp memory.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) memory.cpp

bla_lib.o: bla_lib.cu bla_lib.hpp matrix.hpp memory.hpp timer.hpp
	$(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) --ptx --source-in-ptx bla_lib.cu -o bla_lib.ptx
	$(CUDA_COMP) $(CUDA_FLAGS) $(CXX_INC) $(CUDA_INC) bla_lib.cu

main.o: main.cpp bla.hpp bla_lib.hpp matrix.hpp memory.hpp
	$(CXX_COMP) $(CXX_FLAGS) $(CXX_INC) $(CUDA_INC) main.cpp


.PHONY: clean
clean:
	rm -f *.out *.x *.a *.so *.o *.mod *.modmic *.ptx *.log
--------------------------------------------------------------------------------
/Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DmitryLyakh/CUDA_Tutorial/a7fc4021d8843c997c06fa5faf2a31a1431f2dca/Presentation.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
CUDA Tutorial: Basic Linear Algebra (BLA) Library

AUTHOR: Dmitry I. Lyakh (Liakh): quant4me@gmail.com, liakhdi@ornl.gov

Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

LICENSE: GNU Lesser General Public License v.3

Persistent location:
https://github.com/DmitryLyakh/CUDA_Tutorial.git

Presentation from the Petascale Computing Institute 2019:
Presentation.pdf

YouTube video of this tutorial:
https://youtu.be/Zqfa80APkDk

BUILD:
1. Prerequisites: Linux, g++ 5+, CUDA 9+.
2. Update CUDA_INC and CUDA_LIB paths in the Makefile (if needed).
3. Adjust CUDA_ARCH in the Makefile to your GPU compute capability.
4. If your g++ compiler is too new for CUDA, provide an older one in CUDA_HOST.
5. make
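Note: Makefile variables can also be overridden from the command line without
editing the file; the architecture value below is only an illustration, pick
the one matching your GPU (e.g. compute capability 7.0):

    make CUDA_ARCH=sm_70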
--------------------------------------------------------------------------------
/bla.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef BLA_HPP_
#define BLA_HPP_

#include "matrix.hpp"
#include "bla_lib.hpp"

#endif //BLA_HPP_
--------------------------------------------------------------------------------
/bla_lib.cu:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "bla_lib.hpp"

#include <cstdio>
#include <cassert>

#include <cmath>
#include <iostream>
#include <string>
#include <complex>

#include <cublas_v2.h>

namespace bla{

//GPU device constants:
__device__ __constant__ static float zero_fp32 = 0.0f;
__device__ __constant__ static float unity_fp32 = 1.0f;
__device__ __constant__ static double zero_fp64 = 0.0;
__device__ __constant__ static double unity_fp64 = 1.0;
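
//NOTE: These GPU-resident scalar constants exist because bla::init() below
//switches the cuBLAS handles to CUBLAS_POINTER_MODE_DEVICE, so the GEMM
//alpha/beta scalars must reside in GPU memory; host code retrieves their
//addresses with cudaGetSymbolAddress(), e.g. (illustrative sketch):
//  float * one = nullptr;
//  cudaError_t err = cudaGetSymbolAddress((void**)&one,unity_fp32);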

//CUDA floating point data type selector:
template <typename T> struct CudaFPData{};
template <> struct CudaFPData<float>{
 using type = float;
 static constexpr cudaDataType_t kind = CUDA_R_32F;
};
template <> struct CudaFPData<double>{
 using type = double;
 static constexpr cudaDataType_t kind = CUDA_R_64F;
};
template <> struct CudaFPData<std::complex<float>>{
 using type = cuComplex;
 static constexpr cudaDataType_t kind = CUDA_C_32F;
};
template <> struct CudaFPData<std::complex<double>>{
 using type = cuDoubleComplex;
 static constexpr cudaDataType_t kind = CUDA_C_64F;
};


//Number of present GPU devices:
static int totalNumGPUs = 0;

//Current GEMM algorithm:
static int gemmAlgorithm = 0;

//CUDA device properties (for all GPU devices):
cudaDeviceProp * gpuProperty;

//cuBLAS handles (one per device):
cublasHandle_t * cublasHandle;


//Internal tests:
bool test_hello();
bool test_norm();


//CUDA kernel prototypes (the tile-extent template defaults below are
//reconstructed assumptions, chosen to be consistent with the 16x16 thread
//blocks and the 4x4 per-thread register tiling used in the kernel bodies):
__global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src);


template <typename T>
__global__ void gpu_array_norm2(size_t arr_size, const T * __restrict__ arr, volatile T * norm);
__device__ static unsigned int norm_wr_lock = 0; //reduction lock (per GPU)


template <typename T>
__global__ void gpu_array_add(size_t arr_size, T * __restrict__ arr0, const T * __restrict__ arr1, T alpha);


template <typename T>
__global__ void gpu_gemm_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T>
__global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 16, int TILE_EXT_N = 16, int TILE_EXT_K = 64>
__global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16>
__global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);

//template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16, int FRAG_EXT_M = 4, int FRAG_EXT_N = 4>
//__global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right);
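
//The CudaFPData<T> selector above maps C++ element types to CUDA/cuBLAS data
//type enums at compile time; a minimal compile-time sanity check (an
//illustrative addition, not part of the original code):
static_assert(CudaFPData<float>::kind == CUDA_R_32F,"FP32 type mapping");
static_assert(CudaFPData<double>::kind == CUDA_R_64F,"FP64 type mapping");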

cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k, const float * alpha,
                          const float * A, int lda, const float * B, int ldb,
                          const float * beta, float * C, int ldc)
{
 return cublasSgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}

cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k, const double * alpha,
                          const double * A, int lda, const double * B, int ldb,
                          const double * beta, double * C, int ldc)
{
 return cublasDgemm(handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}


//Dispatch wrappers:
template <typename T>
T matrix_norm2_gpu_(size_t num_elems,
                    const T * matrix_body);

template <typename T>
void matrix_addition_gpu_(size_t num_elems,
                          T * matrix0_body,
                          const T * matrix1_body,
                          T alpha);

template <typename T>
void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
                                T * matrix0_body, int nrows0, int ncols0,
                                const T * matrix1_body, int nrows1, int ncols1,
                                const T * matrix2_body, int nrows2, int ncols2);


//IMPLEMENTATION:
__global__ void gpu_test_presence(size_t str_len, char * __restrict__ dst, const char * __restrict__ src)
{
 int tid = blockIdx.x * blockDim.x + threadIdx.x;
 while(tid < str_len){
  dst[tid] = src[tid];
  tid += gridDim.x * blockDim.x;
 }
 return;
}


template <typename T>
__global__ void gpu_array_norm2(size_t arr_size,            //in: array size
                                const T * __restrict__ arr, //in: pointer to arr[arr_size]
                                volatile T * norm)          //inout: sum of the squared elements of the array
{
 extern __shared__ double thread_norm[]; //blockDim.x elements (dynamic shared memory)

 size_t n = gridDim.x * blockDim.x;
 double tnorm = 0.0;
 for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) tnorm += arr[i] * arr[i];
 thread_norm[threadIdx.x] = tnorm;
 __syncthreads();

 unsigned int s = blockDim.x;
 while(s > 1){
  unsigned int j = (s+1U)>>1; //=(s+1)/2
  if(threadIdx.x + j < s) thread_norm[threadIdx.x] += thread_norm[threadIdx.x+j];
  __syncthreads();
  s = j;
 }

 if(threadIdx.x == 0){
  unsigned int j = 1;
  while(j){j = atomicMax(&norm_wr_lock,1);} //lock
  __threadfence();
  *norm += thread_norm[0]; //accumulate
  __threadfence();
  j = atomicExch(&norm_wr_lock,0); //unlock
 }
 __syncthreads();
 return;
}
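
//NOTE: The spin lock above serializes the per-block accumulation into *norm.
//On devices of compute capability >= 6.0 the same final step could be done
//with a native double-precision atomicAdd (hedged sketch; it would not
//compile for the sm_35 target in the Makefile, hence shown as a comment):
//  if(threadIdx.x == 0) atomicAdd((T*)norm, static_cast<T>(thread_norm[0]));
//which would make the lock, the two __threadfence() calls, and norm_wr_lock
//unnecessary.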

template <typename T>
__global__ void gpu_array_add(size_t arr_size,             //in: array size
                              T * __restrict__ arr0,       //inout: pointer to arr0[arr_size]
                              const T * __restrict__ arr1, //in: pointer to arr1[arr_size]
                              T alpha)                     //in: scaling factor
{
 size_t n = gridDim.x * blockDim.x;
 for(size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < arr_size; i += n) arr0[i] += arr1[i] * alpha;
 return;
}


template <typename T>
__global__ void gpu_gemm_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                            T * __restrict__ dest,        //inout: pointer to C matrix data
                            const T * __restrict__ left,  //in: pointer to A matrix data
                            const T * __restrict__ right) //in: pointer to B matrix data
{
 size_t ty = blockIdx.y*blockDim.y + threadIdx.y; //global thread index Y
 size_t tx = blockIdx.x*blockDim.x + threadIdx.x; //global thread index X

 size_t n_pos = ty;
 while(n_pos < n){

  size_t m_pos = tx;
  while(m_pos < m){

   T tmp = static_cast<T>(0.0);
   for(size_t k_pos = 0; k_pos < k; ++k_pos){
    tmp += left[k_pos*m + m_pos] * right[n_pos*k + k_pos];
   }
   dest[n_pos*m + m_pos] += tmp;

   m_pos += gridDim.x*blockDim.x;
  }

  n_pos += gridDim.y*blockDim.y;
 }
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                               T * __restrict__ dest,        //inout: pointer to C matrix data
                               const T * __restrict__ left,  //in: pointer to A matrix data
                               const T * __restrict__ right) //in: pointer to B matrix data
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];

 for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension

  for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension

   T tmp = static_cast<T>(0.0); //accumulator

   for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
    int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

    //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
    if(m_pos + threadIdx.x < m){
     for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
      lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
     }
    }

    //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
    if(n_pos + threadIdx.y < n){
     for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
      rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
     }
    }
    __syncthreads();

    //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
    if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
     if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
#pragma unroll
      for(int_t l = 0; l < TILE_EXT_K; ++l){
       tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
      }
     }else{ //number of loop iterations is not known at compile time
      for(int_t l = 0; l < (k_end - k_pos); ++l){
       tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
      }
     }
    }
    __syncthreads();

   } //k_pos

   //Store the element of the C matrix in global memory:
   if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n)
    dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;

  } //m_pos

 } //n_pos
 return;
}
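
//Shared-memory footprint of gpu_gemm_sh_nn per thread block (using the
//assumed default tile extents TILE_EXT_M = TILE_EXT_N = 16, TILE_EXT_K = 64):
//  sizeof(T) * (TILE_EXT_K*TILE_EXT_M + TILE_EXT_N*TILE_EXT_K)
//  = 8 * (64*16 + 16*64) = 16 KB for T = double,
//comfortably below the 48 KB static shared-memory limit per block.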

template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_nn(int m, int n, int k,          //in: matrix dimensions: C(m,n)+=A(m,k)*B(k,n)
                                   T * __restrict__ dest,        //inout: pointer to C matrix data
                                   const T * __restrict__ left,  //in: pointer to A matrix data
                                   const T * __restrict__ right) //in: pointer to B matrix data
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];

 for(int_t n_pos = blockIdx.y*TILE_EXT_N; n_pos < n; n_pos += gridDim.y*TILE_EXT_N){ //tile offset in Y dimension
  int_t n_end = n_pos + TILE_EXT_N; if(n_end > n) n_end = n;

  for(int_t m_pos = blockIdx.x*TILE_EXT_M; m_pos < m; m_pos += gridDim.x*TILE_EXT_M){ //tile offset in X dimension
   int_t m_end = m_pos + TILE_EXT_M; if(m_end > m) m_end = m;

   if((m_end - m_pos == TILE_EXT_M) && (n_end - n_pos == TILE_EXT_N)){ //complete tile C(TILE_EXT_M,TILE_EXT_N)

    //Initialize registers to zero:
    T dreg[4][4] = {static_cast<T>(0.0)};
    T rreg[4] = {static_cast<T>(0.0)};
    T lreg[4] = {static_cast<T>(0.0)};

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     if(k_end - k_pos == TILE_EXT_K){
#pragma unroll
      for(int_t l = 0; l < TILE_EXT_K; ++l){
#pragma unroll
       for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
#pragma unroll
       for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
#pragma unroll
       for(int_t j = 0; j < 4; ++j){
#pragma unroll
        for(int_t i = 0; i < 4; ++i){
         dreg[j][i] += lreg[i] * rreg[j];
        }
       }
      }
     }else{
      for(int_t l = 0; l < (k_end - k_pos); ++l){
#pragma unroll
       for(int_t j = 0; j < 4; ++j) rreg[j] = rbuf[threadIdx.y + blockDim.y*j][l];
#pragma unroll
       for(int_t j = 0; j < 4; ++j) lreg[j] = lbuf[l][threadIdx.x + blockDim.x*j];
#pragma unroll
       for(int_t j = 0; j < 4; ++j){
#pragma unroll
        for(int_t i = 0; i < 4; ++i){
         dreg[j][i] += lreg[i] * rreg[j];
        }
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store elements of the C matrix in global memory:
#pragma unroll
    for(int_t j = 0; j < 4; ++j){
#pragma unroll
     for(int_t i = 0; i < 4; ++i){
      dest[(n_pos + threadIdx.y + blockDim.y*j)*m + (m_pos + threadIdx.x + blockDim.x*i)] += dreg[j][i];
     }
    }

   }else{ //incomplete tile of C

    //Initialize registers to zero:
    T dreg[4][4] = {static_cast<T>(0.0)};
    T rreg[4] = {static_cast<T>(0.0)};
    T lreg[4] = {static_cast<T>(0.0)};

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc - k_pos][m_loc - m_pos] = left[k_loc*m + m_loc];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[n_loc - n_pos][k_loc - k_pos] = right[n_loc*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     for(int_t l = 0; l < (k_end - k_pos); ++l){
      for(int_t i = 0, j = threadIdx.y; j < n_end - n_pos; j += blockDim.y, i++) rreg[i] = rbuf[j][l];
      for(int_t i = 0, j = threadIdx.x; j < m_end - m_pos; j += blockDim.x, i++) lreg[i] = lbuf[l][j];
#pragma unroll
      for(int_t j = 0; j < 4; ++j){
#pragma unroll
       for(int_t i = 0; i < 4; ++i){
        dreg[j][i] += lreg[i] * rreg[j];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store elements of the C matrix in global memory:
    for(int_t j = 0, n_loc = n_pos + threadIdx.y; n_loc < n_end; n_loc += blockDim.y, j++){
     for(int_t i = 0, m_loc = m_pos + threadIdx.x; m_loc < m_end; m_loc += blockDim.x, i++){
      dest[n_loc*m + m_loc] += dreg[j][i];
     }
    }

   }

  } //m_pos

 } //n_pos
 return;
}


template <typename T>
__global__ void gpu_gemm_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_tn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T>
__global__ void gpu_gemm_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_nt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T>
__global__ void gpu_gemm_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}
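
//Hedged hint for the exercise stubs above (not part of the original code):
//in column-major storage a transposed left operand means A is kept as A(k,m),
//so the brute-force body of gpu_gemm_tn differs from gpu_gemm_nn only in the
//A indexing inside the k loop, e.g.
//  tmp += left[m_pos*k + k_pos] * right[n_pos*k + k_pos];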

template <typename T, int TILE_EXT_M, int TILE_EXT_N, int TILE_EXT_K>
__global__ void gpu_gemm_sh_reg_tt(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 //`Finish
 return;
}


/*
template <typename T, int TILE_EXT_M = 64, int TILE_EXT_N = 64, int TILE_EXT_K = 16, int FRAG_EXT_M = 4, int FRAG_EXT_N = 4>
__global__ void gpu_gemm_sh_reg_old_nn(int m, int n, int k, T * __restrict__ dest, const T * __restrict__ left, const T * __restrict__ right)
{
 using int_t = int; //either int or size_t
 __shared__ T lbuf[TILE_EXT_K][TILE_EXT_M], rbuf[TILE_EXT_N][TILE_EXT_K];
 T lreg[FRAG_EXT_M], rreg[FRAG_EXT_N], dreg[FRAG_EXT_N][FRAG_EXT_M];

 const int_t wyb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) / (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_N;
 const int_t wxb = ((threadIdx.y*blockDim.x + threadIdx.x) / warpSize) % (TILE_EXT_M/FRAG_EXT_M) * FRAG_EXT_M;
 const int_t ln = (threadIdx.y*blockDim.x + threadIdx.x) % warpSize; //thread lane index inside a warp
 const int_t lny = ln / FRAG_EXT_M; //Y position inside warp fragment
 const int_t lnx = ln % FRAG_EXT_M; //X position inside warp fragment

 for(int_t n_pos = blockIdx.y*blockDim.y; n_pos < n; n_pos += gridDim.y*blockDim.y){ //tile offset in Y dimension

  for(int_t m_pos = blockIdx.x*blockDim.x; m_pos < m; m_pos += gridDim.x*blockDim.x){ //tile offset in X dimension

   if((m_pos + TILE_EXT_M <= m) && (n_pos + TILE_EXT_N <= n)){ //complete tile (TILE_EXT_N * TILE_EXT_M)

    //Initialize C accumulators to zero:
#pragma unroll
    for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
     for(int_t i = 0; i < FRAG_EXT_M; ++i){
      dreg[j][i] = static_cast<T>(0.0);
     }
    }

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
      lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
      rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     for(int_t l = ln; l < (k_end - k_pos); l += warpSize){
      //Load fragments of shared memory tiles into registers:
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_N; ++j) rreg[j] = rbuf[wyb + j][l];
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_M; ++j) lreg[j] = lbuf[l][wxb + j];
      //Compute the outer product of the tile fragments in registers:
#pragma unroll
      for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
       for(int_t i = 0; i < FRAG_EXT_M; ++i){
        dreg[j][i] += lreg[i] * rreg[j];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Perform reduction of the C fragment within each warp:
#pragma unroll
    for(int_t j = 0; j < FRAG_EXT_N; ++j){
#pragma unroll
     for(int_t i = 0; i < FRAG_EXT_M; ++i){
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],16);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],8);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],4);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],2);
      dreg[j][i] += __shfl_xor_sync(0xffffffff,dreg[j][i],1);
     }
    }

    //Upload the C fragments into the C matrix in global memory:
    dest[(n_pos + wyb + lny)*m + (m_pos + wxb + lnx)] = dreg[lny][lnx];

   }else{ //incomplete tile

    //Initialize the accumulator to zero:
    T tmp = static_cast<T>(0.0);

    for(int_t k_pos = 0; k_pos < k; k_pos += TILE_EXT_K){ //k_pos is the position of the CUDA thread along the K dimension
     int_t k_end = k_pos + TILE_EXT_K; if(k_end > k) k_end = k;

     //Load a tile of matrix A(m_pos:TILE_EXT_M, k_pos:TILE_EXT_K):
     if(m_pos + threadIdx.x < m){
      for(int_t k_loc = k_pos + threadIdx.y; k_loc < k_end; k_loc += blockDim.y){
       lbuf[k_loc-k_pos][threadIdx.x] = left[k_loc*m + (m_pos+threadIdx.x)];
      }
     }

     //Load a tile of matrix B(k_pos:TILE_EXT_K, n_pos:TILE_EXT_N):
     if(n_pos + threadIdx.y < n){
      for(int_t k_loc = k_pos + threadIdx.x; k_loc < k_end; k_loc += blockDim.x){
       rbuf[threadIdx.y][k_loc-k_pos] = right[(n_pos+threadIdx.y)*k + k_loc];
      }
     }
     __syncthreads();

     //Multiply the two loaded tiles to produce a tile of matrix C(m_pos:TILE_EXT_M,n_pos:TILE_EXT_N):
     if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n){
      if(k_end - k_pos == TILE_EXT_K){ //number of loop iterations is known at compile time: Unroll it
#pragma unroll
       for(int_t l = 0; l < TILE_EXT_K; ++l){
        tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
       }
      }else{ //number of loop iterations is not known at compile time
       for(int_t l = 0; l < (k_end - k_pos); ++l){
        tmp += lbuf[l][threadIdx.x] * rbuf[threadIdx.y][l];
       }
      }
     }
     __syncthreads();

    } //k_pos

    //Store the C matrix element in global memory:
    if(m_pos + threadIdx.x < m && n_pos + threadIdx.y < n) dest[(n_pos+threadIdx.y)*m + (m_pos+threadIdx.x)] += tmp;

   }

  } //m_pos

 } //n_pos
 return;
}
*/
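
//The commented-out kernel above reduces each C fragment across a warp with
//__shfl_xor_sync() butterflies. The same pattern as a standalone helper
//(an illustrative sketch, assuming a 32-thread warp; unused elsewhere):
__device__ inline double warp_reduce_sum(double val)
{
 for(int offset = 16; offset > 0; offset >>= 1)
  val += __shfl_xor_sync(0xffffffff,val,offset); //XOR butterfly across lanes
 return val;
}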

template <typename T>
T matrix_norm2_gpu_(size_t num_elems, const T * matrix_body)
{
 T norm2 = static_cast<T>(0);
 int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
 T * dnorm2 = static_cast<T*>(allocate(sizeof(T),dev,MemKind::Regular));
 cuerr = cudaMemset((void*)dnorm2,0,sizeof(T)); assert(cuerr == cudaSuccess);
 unsigned int num_blocks = 1024; unsigned int num_threads = 256;
 gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(num_elems,matrix_body,dnorm2);
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(T),cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 deallocate((void*)dnorm2);
 return norm2;
}


template <typename T>
void matrix_addition_gpu_(size_t num_elems, T * matrix0_body, const T * matrix1_body, T alpha)
{
 unsigned int num_blocks = 4096; unsigned int num_threads = 256;
 gpu_array_add<<<num_blocks,num_threads>>>(num_elems,matrix0_body,matrix1_body,alpha);
 cudaError_t cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 return;
}
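
//Dimension conventions used by matrix_multiplication_gpu_ below (summary of
//the code that follows, column-major storage assumed throughout):
//  C(m,n) += op(A)(m,k) * op(B)(k,n), with m = nrows0 and n = ncols0;
//  k = ncols1 when op(A) = A and k = nrows1 when op(A) = A^T.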

template <typename T>
void matrix_multiplication_gpu_(bool left_transp, bool right_transp,
                                T * matrix0_body, int nrows0, int ncols0,
                                const T * matrix1_body, int nrows1, int ncols1,
                                const T * matrix2_body, int nrows2, int ncols2)
{
 if(gemmAlgorithm == 0){ //BLA GEMM brute-force
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(32,32);
   dim3 blocks((nrows0-1)/32+1,(ncols0-1)/32+1);
   gpu_gemm_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else if(gemmAlgorithm == 1){ //BLA GEMM with shared memory
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else if(gemmAlgorithm == 2){ //BLA GEMM with shared memory and register file
  if(!left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   gpu_gemm_sh_reg_nn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && !right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_tn<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(!left_transp && right_transp){
   int m = nrows0, n = ncols0, k = ncols1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_nt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }else if(left_transp && right_transp){
   int m = nrows0, n = ncols0, k = nrows1;
   dim3 threads(16,16);
   dim3 blocks((nrows0-1)/16+1,(ncols0-1)/16+1);
   //gpu_gemm_sh_reg_tt<<<blocks,threads>>>(m,n,k,matrix0_body,matrix1_body,matrix2_body);
  }
 }else{ //cuBLAS GEMM
  int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
  int m = nrows1; cublasOperation_t transa = CUBLAS_OP_N;
  if(left_transp){m = ncols1; transa = CUBLAS_OP_T;}
  int n = ncols2; cublasOperation_t transb = CUBLAS_OP_N;
  if(right_transp){n = nrows2; transb = CUBLAS_OP_T;}
  int k = ncols1; if(left_transp) k = nrows1;
  T *alpha, *beta;
  if(CudaFPData<T>::kind == CUDA_R_32F){
   cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp32); assert(cuerr == cudaSuccess);
   cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp32); assert(cuerr == cudaSuccess);
  }else if(CudaFPData<T>::kind == CUDA_R_64F){
   cuerr = cudaGetSymbolAddress((void**)&alpha,unity_fp64); assert(cuerr == cudaSuccess);
   cuerr = cudaGetSymbolAddress((void**)&beta,unity_fp64); assert(cuerr == cudaSuccess);
  }else{
   assert(false);
  }
#ifdef USE_CUBLAS_GEMM_EX
  cublasStatus_t custat = cublasGemmEx(cublasHandle[dev],
                                       transa,transb,
                                       m,n,k,
                                       alpha,
                                       matrix1_body,CudaFPData<T>::kind,nrows1,
                                       matrix2_body,CudaFPData<T>::kind,nrows2,
                                       beta,
                                       matrix0_body,CudaFPData<T>::kind,nrows0,
                                       CudaFPData<T>::kind, CUBLAS_GEMM_DEFAULT);
#else
  cublasStatus_t custat = cublasGemm(cublasHandle[dev],
                                     transa,transb,
                                     m,n,k,
                                     alpha,
                                     matrix1_body,nrows1,
                                     matrix2_body,nrows2,
                                     beta,
                                     matrix0_body,nrows0);
#endif
  if(custat != CUBLAS_STATUS_SUCCESS) std::cout << "#ERROR(cublasGemm): Error " << custat << std::endl;
  assert(custat == CUBLAS_STATUS_SUCCESS);
 }
 cudaError_t cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError();
 if(cuerr != cudaSuccess){
  const char * error_str = cudaGetErrorString(cuerr);
  std::cout << "ERROR(bla::matrix_multiplication_gpu_): CUDA kernel launch failure: " << std::endl;
  printf("%s\n",error_str);
 }
 assert(cuerr == cudaSuccess);
 return;
}


float matrix_norm2_gpu(size_t num_elems, const float * matrix_body)
{
 return matrix_norm2_gpu_(num_elems,matrix_body);
}

double matrix_norm2_gpu(size_t num_elems, const double * matrix_body)
{
 return matrix_norm2_gpu_(num_elems,matrix_body);
}


void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha)
{
 return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
}

void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha)
{
 return matrix_addition_gpu_(num_elems,matrix0_body,matrix1_body,alpha);
}


void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               float * matrix0_body, int nrows0, int ncols0,
                               const float * matrix1_body, int nrows1, int ncols1,
                               const float * matrix2_body, int nrows2, int ncols2)
{
 return matrix_multiplication_gpu_(left_transp,right_transp,
                                   matrix0_body,nrows0,ncols0,
                                   matrix1_body,nrows1,ncols1,
                                   matrix2_body,nrows2,ncols2);
}

void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               double * matrix0_body, int nrows0, int ncols0,
                               const double * matrix1_body, int nrows1, int ncols1,
                               const double * matrix2_body, int nrows2, int ncols2)
{
 return matrix_multiplication_gpu_(left_transp,right_transp,
                                   matrix0_body,nrows0,ncols0,
                                   matrix1_body,nrows1,ncols1,
                                   matrix2_body,nrows2,ncols2);
}
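
//A minimal (hypothetical) host-side call of the public wrappers above,
//assuming column-major device buffers dA, dB, dC of a 1024x1024 problem:
//  bla::matrix_multiplication_gpu(false,false,
//                                 dC,1024,1024,
//                                 dA,1024,1024,
//                                 dB,1024,1024); //dC += dA * dB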

void init()
{
 totalNumGPUs = 0;
 cudaError_t cuerr = cudaGetDeviceCount(&totalNumGPUs); assert(cuerr == cudaSuccess);
 std::cout << "Found " << totalNumGPUs << " NVIDIA GPU(s)" << std::endl;
 if(totalNumGPUs > 0){
  cublasStatus_t cuberr;
  gpuProperty = new cudaDeviceProp[totalNumGPUs];
  cublasHandle = new cublasHandle_t[totalNumGPUs];
  //Init each GPU:
  for(int i = (totalNumGPUs - 1); i >= 0; --i){
   cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
   cuerr = cudaGetDeviceProperties(&(gpuProperty[i]),i); assert(cuerr == cudaSuccess);
   cuberr = cublasCreate(&(cublasHandle[i])); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuberr = cublasSetPointerMode(cublasHandle[i],CUBLAS_POINTER_MODE_DEVICE); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
   std::cout << "Initialized GPU " << i << std::endl;
  }
  //Enable P2P access between GPUs:
  if(totalNumGPUs > 1){
   for(int i = (totalNumGPUs - 1); i >= 0; --i){
    if(gpuProperty[i].unifiedAddressing != 0){
     cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess);
     for(int j = (totalNumGPUs - 1); j >= 0; --j){
      if(j != i){
       if(gpuProperty[j].unifiedAddressing != 0){
        cuerr = cudaDeviceEnablePeerAccess(j,0);
        if(cuerr == cudaSuccess){
         std::cout << "GPU " << i << " can access peer GPU " << j << std::endl;
        }else{
         std::cout << "GPU " << i << " cannot access peer GPU " << j << std::endl;
        }
       }
      }
     }
    }
   }
  }
  cuerr = cudaGetLastError(); //clear any non-fatal error left by peer-access probing
 }
 std::cout << "BLA library initialized successfully" << std::endl;
 return;
}


void shutdown()
{
 if(totalNumGPUs > 0){
  cudaError_t cuerr;
  cublasStatus_t cuberr;
  for(int i = 0; i < totalNumGPUs; ++i){
   cuerr = cudaSetDevice(i); assert(cuerr == cudaSuccess); //reset must target device i, not the current device
   cuberr = cublasDestroy(cublasHandle[i]); assert(cuberr == CUBLAS_STATUS_SUCCESS);
   cuerr = cudaDeviceReset(); assert(cuerr == cudaSuccess);
   std::cout << "Destroyed primary context on GPU " << i << std::endl;
  }
  delete [] cublasHandle;
  delete [] gpuProperty;
 }
 totalNumGPUs = 0;
 std::cout << "BLA library shut down successfully" << std::endl;
 return;
}


void print_device_properties(int device)
{
 cudaDeviceProp prop;
 cudaError_t cuerr = cudaGetDeviceProperties(&prop,device);
 if(cuerr == cudaSuccess){
  std::cout << "Properties of NVIDIA GPU " << device << std::endl;
  std::cout << " Compute capability: " << prop.major << "." << prop.minor << std::endl;
  std::cout << " Register file size: " << prop.regsPerBlock << std::endl;
  std::cout << " Shared memory size: " << prop.sharedMemPerBlock << std::endl;
 }else{
  std::cout << "#ERROR(bla::print_device_properties): Unable to get properties for device " << device << std::endl;
  assert(false);
 }
 return;
}


void reset_gemm_algorithm(int algo)
{
 gemmAlgorithm = algo;
 return;
}


bool test_hello()
{
 std::cout << "Testing presence on GPU ..." << std::endl;
 const std::string s1("Am I really on GPU?");
 const std::string s2("Waiting for the answer ...");
 const std::string s3("Yes, you are!");

 size_t max_len = std::max(s1.size(),std::max(s2.size(),s3.size()));
 size_t str_len = max_len+1;

 char * hs1 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs1 != nullptr);
 char * ds1 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds1 != nullptr);
 int i = 0; for(const char & symb: s1) hs1[i++]=symb; hs1[s1.size()]='\0';
 printf("%s ",hs1);

 char * hs3 = static_cast<char*>(allocate(str_len,-1,MemKind::Pinned)); assert(hs3 != nullptr);
 char * ds3 = static_cast<char*>(allocate(str_len,0,MemKind::Regular)); assert(ds3 != nullptr);
 i = 0; for(const char & symb: s3) hs3[i++]=symb; hs3[s3.size()]='\0';

 cudaError_t cuerr = cudaMemcpy((void*)ds1,(void*)hs1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 cuerr = cudaMemcpy((void*)ds3,(void*)hs3,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);

 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);
 gpu_test_presence<<<16,256>>>(str_len,ds1,ds3);
 std::cout << s2 << " ";
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);

 cuerr = cudaMemcpy((void*)hs1,(void*)ds1,str_len,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 printf("%s\n",hs1);

 deallocate((void*)ds3);
 deallocate((void*)hs3);

 deallocate((void*)ds1);
 deallocate((void*)hs1);

 return true;
}


bool test_norm()
{
 std::cout << "Testing norm2 on GPU 0 ... ";
 const float num_tolerance = 1e-5f;
 const size_t vol = 1000000;
 const size_t dsize = vol * sizeof(float);
 float * arr0 = static_cast<float*>(allocate(dsize,-1,MemKind::Pinned));
 float * arr1 = static_cast<float*>(allocate(dsize,0,MemKind::Regular));
 float * dnorm2 = static_cast<float*>(allocate(sizeof(float),0,MemKind::Regular));

 for(size_t i = 0; i < vol; ++i) arr0[i] = 1.0f/sqrt((float)vol); //value of each element making the norm equal to 1

 cudaError_t cuerr = cudaMemcpy((void*)arr1,(void*)arr0,dsize,cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 cuerr = cudaMemset((void*)dnorm2,0,sizeof(float)); assert(cuerr == cudaSuccess); //the accumulator must start from zero

 unsigned int num_blocks = 1024; unsigned int num_threads = 256;
 gpu_array_norm2<<<num_blocks,num_threads,num_threads*sizeof(double)>>>(vol,arr1,dnorm2);
 cuerr = cudaDeviceSynchronize();
 cuerr = cudaGetLastError(); assert(cuerr == cudaSuccess);

 float norm2 = 0.0f;
 cuerr = cudaMemcpy((void*)(&norm2),(void*)dnorm2,sizeof(float),cudaMemcpyDefault); assert(cuerr == cudaSuccess);
 std::cout << "Norm2 = " << norm2 << " (correct value is 1.0)" << std::endl;
 assert(std::abs(norm2 - 1.0f) < num_tolerance);

 deallocate((void*)dnorm2);
 deallocate((void*)arr1);
 deallocate((void*)arr0);
 return true;
}


bool test_bla()
{
 if(!test_hello()) return false;
 if(!test_norm()) return false;
 return true;
}

} //namespace bla
--------------------------------------------------------------------------------
/bla_lib.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef BLA_LIB_HPP_
#define BLA_LIB_HPP_

#include "memory.hpp"
#include "timer.hpp"

#include <cstddef>

#include <cuda_runtime.h>

namespace bla{

/** Initialization of BLA **/
void init();

/** Shutdown of BLA **/
void shutdown();

/** Testing BLA **/
bool test_bla();

/** Device properties **/
void print_device_properties(int device);

/** Resets the GEMM algorithm:
    0: custom brute-force GEMM;
    1: custom GEMM with shared memory;
    2: custom GEMM with shared memory and register tiling;
    any other value: cuBLAS GEMM. **/
void reset_gemm_algorithm(int algo);

/** Matrix squared "norm" (sum of the squared elements) **/
float matrix_norm2_gpu(size_t num_elems, const float * matrix_body);
double matrix_norm2_gpu(size_t num_elems, const double * matrix_body);

/** Matrix addition **/
void matrix_addition_gpu(size_t num_elems, float * matrix0_body, const float * matrix1_body, float alpha);
void matrix_addition_gpu(size_t num_elems, double * matrix0_body, const double * matrix1_body, double alpha);

/** Matrix multiplication **/
void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               float * matrix0_body, int nrows0, int ncols0,
                               const float * matrix1_body, int nrows1, int ncols1,
                               const float * matrix2_body, int nrows2, int ncols2);
void matrix_multiplication_gpu(bool left_transp, bool right_transp,
                               double * matrix0_body, int nrows0, int ncols0,
                               const double * matrix1_body, int nrows1, int ncols1,
                               const double * matrix2_body, int nrows2, int ncols2);

} //namespace bla

#endif //BLA_LIB_HPP_
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "bla.hpp"

#include <cmath>

void use_bla()
{
 //Pick which GEMM tests you enable:
 const bool TEST_BLA_GEMM_BRUTE = true;    //enables/disables testing of the brute-force GEMM
 const bool TEST_BLA_GEMM_SHARED = true;   //enables/disables testing of the shared-memory GEMM
 const bool TEST_BLA_GEMM_REGISTER = true; //enables/disables testing of the register-based GEMM

 std::cout << "Let's try to use the BLA library ..." << std::endl;

 //Create matrix A:
 bla::Matrix<float> A(2000,2000);
 //Allocate matrix A body on Host:
 A.allocateBody(-1,bla::MemKind::Pinned);
 //Set matrix A body to some non-trivial value on Host:
 A.setBodyHost();

 //Create matrix B:
 bla::Matrix<float> B(2000,2000);
 //Allocate matrix B body on Host:
 B.allocateBody(-1,bla::MemKind::Pinned);
 //Set matrix B body to some non-trivial value on Host:
 B.setBodyHost();

 //Create matrix C:
 bla::Matrix<float> C(2000,2000);
 //Allocate matrix C body on GPU#0:
 C.allocateBody(0,bla::MemKind::Regular);

 //Create matrix D:
 bla::Matrix<float> D(2000,2000);
 //Allocate matrix D body on GPU#0:
 D.allocateBody(0,bla::MemKind::Regular);

 //Copy matrix A to GPU#0 from Host:
 A.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
 //Compute matrix A norm on GPU#0:
 auto normA = A.computeNorm(0);
 std::cout << "Matrix A norm = " << normA << std::endl;

 //Copy matrix B to GPU#0 from Host:
 B.syncBody(0,-1); //Host (-1) --> GPU#0 (0)
 //Compute matrix B norm on GPU#0:
 auto normB = B.computeNorm(0);
 std::cout << "Matrix B norm = " << normB << std::endl;

 //Determine the total number of floating point operations:
 double flops = 2.0 * std::sqrt(static_cast<double>(A.getVolume()) *
                                static_cast<double>(B.getVolume()) *
                                static_cast<double>(C.getVolume()));
 std::cout << "Matrix multiplication C+=A*B requires " << flops/1e9 << " Gflop" << std::endl;
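
 //Note: 2*sqrt((m*k)*(k*n)*(m*n)) == 2*m*n*k, i.e. one multiply plus one
 //add per term of the inner products forming C(m,n) += A(m,k)*B(k,n).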
"; 98 | double tms = bla::time_sys_sec(); 99 | C.multiplyAdd(false,false,A,B,0); 100 | double tmf = bla::time_sys_sec(); 101 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 102 | //Check correctness on GPU#0: 103 | C.add(D,1.0f,0); 104 | auto norm_diff = C.computeNorm(0); 105 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 106 | if(std::abs(norm_diff) > 1e-7){ 107 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 108 | std::exit(1); 109 | } 110 | } 111 | } 112 | 113 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory: 114 | if(TEST_BLA_GEMM_SHARED){ 115 | for(int repeat = 0; repeat < 2; ++repeat){ 116 | C.zeroBody(0); //set matrix C body to zero on GPU#0 117 | bla::reset_gemm_algorithm(1); 118 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory ... "; 119 | double tms = bla::time_sys_sec(); 120 | C.multiplyAdd(false,false,A,B,0); 121 | double tmf = bla::time_sys_sec(); 122 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 123 | //Check correctness on GPU#0: 124 | C.add(D,1.0f,0); 125 | auto norm_diff = C.computeNorm(0); 126 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 127 | if(std::abs(norm_diff) > 1e-7){ 128 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 129 | std::exit(1); 130 | } 131 | } 132 | } 133 | 134 | //Perform matrix multiplication on GPU#0 with BLA GEMM with shared memory and registers: 135 | if(TEST_BLA_GEMM_REGISTER){ 136 | for(int repeat = 0; repeat < 2; ++repeat){ 137 | C.zeroBody(0); //set matrix C body to zero on GPU#0 138 | bla::reset_gemm_algorithm(2); 139 | std::cout << "Performing matrix multiplication C+=A*B with BLA GEMM with shared memory and registers ... "; 140 | double tms = bla::time_sys_sec(); 141 | C.multiplyAdd(false,false,A,B,0); 142 | double tmf = bla::time_sys_sec(); 143 | std::cout << "Done: Time = " << tmf-tms << " s: Gflop/s = " << flops/(tmf-tms)/1e9 << std::endl; 144 | //Check correctness on GPU#0: 145 | C.add(D,1.0f,0); 146 | auto norm_diff = C.computeNorm(0); 147 | std::cout << "Norm of the matrix C deviation from correct = " << norm_diff << std::endl; 148 | if(std::abs(norm_diff) > 1e-7){ 149 | std::cout << "#FATAL: Matrix C is incorrect, fix your GPU kernel implementation!" << std::endl; 150 | std::exit(1); 151 | } 152 | } 153 | } 154 | 155 | std::cout << "Seems like it works!" << std::endl; 156 | return; 157 | } 158 | 159 | 160 | int main(int argc, char ** argv) 161 | { 162 | //Initialize BLA library: 163 | bla::init(); 164 | bla::print_device_properties(0); //check compute capability 165 | 166 | //Test BLA library: 167 | bla::test_bla(); 168 | 169 | //Use BLA library: 170 | use_bla(); 171 | 172 | //Shutdown BLA library: 173 | bla::shutdown(); 174 | 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /matrix.hpp: -------------------------------------------------------------------------------- 1 | /* CUDA tutorial: Basic Linear Algebra (BLA) Library 2 | 3 | !Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh) 4 | !Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle) 5 | 6 | !This file is part of CUDA BLA tutorial. 
7 | 8 | !CUDA BLA is free software: you can redistribute it and/or modify 9 | !it under the terms of the GNU Lesser General Public License as published 10 | !by the Free Software Foundation, either version 3 of the License, or 11 | !(at your option) any later version. 12 | 13 | !CUDA BLA is distributed in the hope that it will be useful, 14 | !but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | !MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | !GNU Lesser General Public License for more details. 17 | 18 | !You should have received a copy of the GNU Lesser General Public License 19 | !along with CUDA BLA. If not, see . */ 20 | 21 | #ifndef MATRIX_HPP_ 22 | #define MATRIX_HPP_ 23 | 24 | #include "bla_lib.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace bla{ 37 | 38 | template 39 | class Matrix{ 40 | 41 | public: 42 | 43 | explicit Matrix(int nrows, int ncols); 44 | 45 | Matrix(const Matrix & matrix) = delete; 46 | Matrix & operator=(const Matrix &) = delete; 47 | Matrix(Matrix && matrix) noexcept = default; 48 | Matrix & operator=(Matrix && matrix) noexcept = default; 49 | virtual ~Matrix(); 50 | 51 | /** Returns the number of rows in the matrix **/ 52 | int getNumRows() const; 53 | /** Returns the number of columns in the matrix **/ 54 | int getNumCols() const; 55 | /** Returns the volume of the matrix (number of elements) **/ 56 | std::size_t getVolume() const; 57 | /** Returns the size of the matrix in bytes **/ 58 | std::size_t getSize() const; 59 | /** Returns a pointer to the memory resource on requested device (if any) **/ 60 | T * getBodyPtr(int device) const; 61 | /** Allocates memory resource of requested kind on requested device **/ 62 | void allocateBody(int device, MemKind memkind = MemKind::Regular); 63 | /** Deallocates memory resource on requested device **/ 64 | void deallocateBody(int device); 65 | /** Marks matrix body status on a given device as up-to-date or not (outdated) **/ 66 | void markBodyStatus(int device, bool status); 67 | /** Initializes matrix body to zero on a given device **/ 68 | void zeroBody(int device); 69 | /** Initializes matrix body to some non-trivial value on Host **/ 70 | void setBodyHost(); 71 | /** Synchronizes matrix body on a given device with the body from another device. 72 | By default the source device is Host (if up to date). 
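
//A minimal usage sketch of the Matrix class declared above (it mirrors what
//main.cpp does; the device id 0 and the extents are illustrative):
//  bla::Matrix<float> M(512,512);
//  M.allocateBody(-1,bla::MemKind::Pinned); //pinned Host buffer
//  M.setBodyHost();                         //fill on Host
//  M.syncBody(0,-1);                        //copy Host --> GPU#0
//  double nrm = M.computeNorm(0);           //norm on GPU#0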
**/ 73 | void syncBody(int device, int source_device = -1); 74 | 75 | /** Computes the norm of the matrix on a given device **/ 76 | double computeNorm(int device = -1); 77 | /** Performs matrix addition on a given device **/ 78 | void add(Matrix & Amat, T alpha = static_cast(1.0), int device = -1); 79 | /** Performs matrix multiplication on a given device **/ 80 | void multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device = -1); 81 | 82 | private: 83 | 84 | //Memory resource descriptor: 85 | typedef struct{ 86 | int device; 87 | void * ptr; 88 | MemKind memkind; 89 | bool uptodate; 90 | } Resource; 91 | 92 | //Data members: 93 | int nrows_; //number of rows 94 | int ncols_; //number of columns 95 | std::size_t elem_size_; //matrix element size in bytes 96 | std::list location_; //list of memory resources occupied by the matrix 97 | }; 98 | 99 | 100 | //TEMPLATE DEFINITIONS: 101 | template 102 | Matrix::Matrix(int nrows, int ncols): 103 | nrows_(nrows), ncols_(ncols), elem_size_(sizeof(T)) 104 | { 105 | static_assert(std::is_floating_point::value,"#ERROR(BLA::Matrix::Matrix): Matrix type must be floating point!"); 106 | assert(nrows_ > 0 && ncols_ > 0 && elem_size_ > 0); 107 | std::cout << "Matrix created with dimensions (" << nrows_ << "," << ncols_ << ")" << std::endl; 108 | } 109 | 110 | 111 | template 112 | Matrix::~Matrix() 113 | { 114 | for(auto & loc: location_) deallocate(loc.ptr); 115 | std::cout << "Matrix destroyed" << std::endl; 116 | } 117 | 118 | 119 | template 120 | int Matrix::getNumRows() const 121 | { 122 | return nrows_; 123 | } 124 | 125 | 126 | template 127 | int Matrix::getNumCols() const 128 | { 129 | return ncols_; 130 | } 131 | 132 | 133 | template 134 | std::size_t Matrix::getVolume() const 135 | { 136 | return (static_cast(nrows_)*static_cast(ncols_)); //number of elements 137 | } 138 | 139 | 140 | template 141 | std::size_t Matrix::getSize() const 142 | { 143 | return (static_cast(nrows_)*static_cast(ncols_)*elem_size_); //matrix size in bytes 144 | } 145 | 146 | 147 | template 148 | T * Matrix::getBodyPtr(int device) const 149 | { 150 | T * ptr = nullptr; 151 | for(const auto & loc: location_){ 152 | if(loc.device == device){ 153 | ptr = static_cast(loc.ptr); 154 | break; 155 | } 156 | } 157 | return ptr; 158 | } 159 | 160 | 161 | template 162 | void Matrix::allocateBody(int device, MemKind memkind) 163 | { 164 | std::size_t mat_size = this->getSize(); //matrix size in bytes 165 | void * ptr = allocate(mat_size,device,memkind); //allocate memory of requested kind on requested device 166 | assert(ptr != nullptr); 167 | location_.emplace_back(Resource{device,ptr,memkind,false}); //save the new memory descriptor (Resource) 168 | std::cout << "New resource acquired on device " << device << std::endl; 169 | return; 170 | } 171 | 172 | 173 | template 174 | void Matrix::deallocateBody(int device) 175 | { 176 | for(auto & loc: location_){ 177 | if(loc.device == device){ 178 | deallocate(loc.ptr); 179 | std::cout << "Resource released on device " << device << std::endl; 180 | } 181 | } 182 | location_.remove_if([device](const Resource & res){return (res.device == device);}); 183 | return; 184 | } 185 | 186 | 187 | template 188 | void Matrix::markBodyStatus(int device, bool status) 189 | { 190 | for(auto & loc: location_){ 191 | if(loc.device == device) loc.uptodate = status; 192 | } 193 | return; 194 | } 195 | 196 | 197 | template 198 | void Matrix::zeroBody(int device) 199 | { 200 | T * mat = this->getBodyPtr(device); 201 | if(mat != 
template <typename T>
void Matrix<T>::zeroBody(int device)
{
  T * mat = this->getBodyPtr(device);
  if(mat != nullptr){
    std::size_t mat_size = this->getSize();
    assert(mat_size > 0);
    if(device < 0){ //Host
      std::memset(((void*)mat),0,mat_size);
    }else{ //GPU device
      int dev;
      cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
      if(device != dev){
        cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
      }
      cuerr = cudaMemset(((void*)mat),0,mat_size); assert(cuerr == cudaSuccess);
      if(device != dev){
        cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess); //restore the original device
      }
    }
    this->markBodyStatus(device,true); //mark the matrix body on the device as up-to-date
  }else{
    std::cout << "#ERROR(BLA::Matrix::zeroBody): Matrix does not exist on device " << device << std::endl;
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::setBodyHost()
{
  T * mat = this->getBodyPtr(-1); //-1 is the Host device id
  if(mat != nullptr){
    for(std::size_t j = 0; j < ncols_; ++j){
      std::size_t offset = j*nrows_;
      for(std::size_t i = 0; i < nrows_; ++i){
        //mat[offset+i] = static_cast<T>(1)/(static_cast<T>(i+7) + static_cast<T>(j+13)); //some value
        mat[offset+i] = static_cast<T>(1)/std::log(static_cast<T>(std::rand()+13)); //some value
      }
    }
    this->markBodyStatus(-1,true); //mark the matrix body on Host as up-to-date
  }else{
    std::cout << "#ERROR(BLA::Matrix::setBodyHost): Matrix does not exist on Host!" << std::endl;
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::syncBody(int device, int source_device)
{
  if(device != source_device){
    Resource destination_resource, source_resource;
    bool destination_found = false;
    bool source_found = false;
    for(auto & loc: location_){
      if(!source_found && loc.device == source_device && loc.uptodate){
        source_resource = loc;
        source_found = true;
      }
      if(!destination_found && loc.device == device){
        destination_resource = loc;
        destination_found = true;
      }
    }
    if(!destination_found){ //no body on the destination device yet: acquire one
      this->allocateBody(device,MemKind::Regular);
      for(const auto & loc: location_){
        if(loc.device == device){
          destination_resource = loc;
          destination_found = true;
          break;
        }
      }
    }
    if(source_found){
      cudaError_t cuerr = cudaMemcpy(destination_resource.ptr,source_resource.ptr,this->getSize(),cudaMemcpyDefault);
      assert(cuerr == cudaSuccess);
      this->markBodyStatus(device,true); //mark the matrix body on the device as up-to-date
    }else{
      std::cout << "#ERROR(BLA::Matrix::syncBody): Provided source device " << source_device << " has no up-to-date matrix body!" << std::endl;
      assert(false);
    }
  }
  return;
}
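
//The Host branches in the three routines below are deliberately left
//unimplemented as tutorial exercises. Assuming matrix_norm2_gpu returns the
//sum of squared elements, a matching Host branch for computeNorm could be
//sketched as:
//
//  for(std::size_t i = 0; i < vol; ++i) result += matrix_body[i]*matrix_body[i];
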
template <typename T>
double Matrix<T>::computeNorm(int device)
{
  std::size_t vol = this->getVolume();
  T * matrix_body = this->getBodyPtr(device); assert(matrix_body != nullptr);
  double result = 0.0;
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    result = matrix_norm2_gpu(vol,matrix_body);
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return result;
}


template <typename T>
void Matrix<T>::add(Matrix & Amat, T alpha, int device)
{
  std::size_t vol = this->getVolume();
  assert(Amat.getVolume() == vol);
  T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
  const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    matrix_addition_gpu(vol,matrix0_body,matrix1_body,alpha);
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return;
}


template <typename T>
void Matrix<T>::multiplyAdd(bool left_transp, bool right_transp, Matrix & Amat, Matrix & Bmat, int device)
{
  T * matrix0_body = this->getBodyPtr(device); assert(matrix0_body != nullptr);
  const T * matrix1_body = Amat.getBodyPtr(device); assert(matrix1_body != nullptr);
  const T * matrix2_body = Bmat.getBodyPtr(device); assert(matrix2_body != nullptr);
  if(device >= 0){ //GPU
    int dev; cudaError_t cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
    if(device != dev){
      cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
    }
    matrix_multiplication_gpu(left_transp,right_transp,
                              matrix0_body,this->getNumRows(),this->getNumCols(),
                              matrix1_body,Amat.getNumRows(),Amat.getNumCols(),
                              matrix2_body,Bmat.getNumRows(),Bmat.getNumCols());
    if(device != dev){
      cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
    }
  }else{ //Host
    //`Implement
    assert(false);
  }
  return;
}
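
//Usage sketch (illustrative): as the name suggests, multiplyAdd accumulates
//this += op(A) * op(B), where op transposes its argument when the
//corresponding flag is true. For C(m,n) += A(m,k) * B(k,n) on GPU 0, with all
//three bodies already resident on device 0:
//
//  C.multiplyAdd(false,false,A,B,0);
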
} //namespace bla

#endif //MATRIX_HPP_
--------------------------------------------------------------------------------
/memory.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "memory.hpp"

#include <cuda_runtime.h>

#include <cstdlib>

#include <cassert>
#include <iostream>
#include <map>

namespace bla{

//Memory chunk descriptor:
typedef struct{
  int device;       //device id (-1: Host; >=0: GPU)
  MemKind mem_kind; //memory kind
  size_t mem_size;  //memory chunk size in bytes
} MemChunkDescr;


//Register of allocated memory chunks:
std::map<void*,MemChunkDescr> mem_reg;


void * allocate(size_t size, int device, MemKind mem_kind)
{
  void * ptr = nullptr;
  cudaError_t cuerr;

  if(size > 0){
    //Allocate memory:
    switch(mem_kind){
      case MemKind::Regular:
        if(device < 0){ //Host
          ptr = malloc(size);
        }else{ //GPU device
          int dev;
          cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
          if(device != dev){
            cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
          }
          cuerr = cudaMalloc(&ptr,size); assert(cuerr == cudaSuccess);
          if(device != dev){
            cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
          }
        }
        break;
      case MemKind::Pinned:
        if(device < 0){ //Host
          cuerr = cudaHostAlloc(&ptr,size,cudaHostAllocPortable); assert(cuerr == cudaSuccess);
        }else{ //GPU device
          std::cout << "#ERROR(BLA::memory::allocate): Pinned memory is not available on GPU!" << std::endl;
          assert(false);
        }
        break;
      case MemKind::Mapped:
        if(device < 0){ //Host
          cuerr = cudaHostAlloc(&ptr,size,cudaHostAllocPortable|cudaHostAllocMapped); assert(cuerr == cudaSuccess);
        }else{ //GPU device
          std::cout << "#ERROR(BLA::memory::allocate): Mapped pinned memory is not available on GPU!" << std::endl;
          assert(false);
        }
        break;
      case MemKind::Unified:
        std::cout << "#ERROR(BLA::memory::allocate): Unified memory allocation is not implemented!" << std::endl;
        assert(false);
        break;
    }
  }
  //Register the memory chunk with BLA:
  if(ptr != nullptr){
    auto res = mem_reg.emplace(std::make_pair(ptr,MemChunkDescr{device,mem_kind,size}));
    assert(res.second); //the same pointer must not be registered twice
  }
  return ptr;
}
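
//Because every allocation is recorded in mem_reg, deallocate() below needs
//only the pointer: the device and memory kind are recovered from the
//registry. A hypothetical caller (from outside the bla namespace):
//
//  void * buf = bla::allocate(n*sizeof(double),-1,bla::MemKind::Pinned);
//  ... use the buffer ...
//  bla::deallocate(buf); //kind and device are looked up internally
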
void deallocate(void * ptr)
{
  assert(ptr != nullptr);
  //Find the memory chunk descriptor:
  auto pos = mem_reg.find(ptr);
  if(pos == mem_reg.end()){
    std::cout << "#ERROR(BLA::memory::deallocate): Attempt to deallocate a pointer not allocated by BLA!" << std::endl;
    assert(false);
  }
  auto device = pos->second.device;
  auto mem_kind = pos->second.mem_kind;
  //Deallocate memory:
  cudaError_t cuerr;
  switch(mem_kind){
    case MemKind::Regular:
      if(device < 0){ //Host
        free(ptr);
      }else{ //GPU device
        int dev;
        cuerr = cudaGetDevice(&dev); assert(cuerr == cudaSuccess);
        if(device != dev){
          cuerr = cudaSetDevice(device); assert(cuerr == cudaSuccess);
        }
        cuerr = cudaFree(ptr); assert(cuerr == cudaSuccess);
        if(device != dev){
          cuerr = cudaSetDevice(dev); assert(cuerr == cudaSuccess);
        }
      }
      break;
    case MemKind::Pinned:
      cuerr = cudaFreeHost(ptr); assert(cuerr == cudaSuccess);
      break;
    case MemKind::Mapped:
      cuerr = cudaFreeHost(ptr); assert(cuerr == cudaSuccess);
      break;
    case MemKind::Unified:
      std::cout << "#ERROR(BLA::memory::deallocate): Unified memory deallocation is not implemented!" << std::endl;
      assert(false);
      break;
  }
  //Delete the memory chunk descriptor:
  mem_reg.erase(ptr);
  return;
}

} //namespace bla
--------------------------------------------------------------------------------
/memory.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef MEMORY_HPP_
#define MEMORY_HPP_

#include <cstddef> //size_t

namespace bla{

//Memory kinds:
enum class MemKind{
  Regular, //regular global memory (either Host or GPU device)
  Pinned,  //pinned memory (Host only)
  Mapped,  //mapped pinned memory (Host only)
  Unified  //unified memory (accessible from Host and GPU)
};

/** Allocates memory on any device:
    Host (CPU): device = -1;
    GPU device: device >= 0. **/
void * allocate(size_t size,                          //in: requested memory size in bytes
                int device = -1,                      //in: device (-1: Host; >=0: corresponding GPU)
                MemKind mem_kind = MemKind::Regular); //in: requested memory kind (see above)

/** Deallocates previously allocated memory on any device. **/
void deallocate(void * ptr); //in: pointer to the previously allocated memory chunk
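
//Illustrative note (not part of the original tutorial): MemKind::Mapped
//returns a Host pointer whose memory is also visible to the GPU; the
//device-side alias would be obtained via the CUDA runtime, e.g. (sketch):
//
//  void * host_ptr = allocate(bytes,-1,MemKind::Mapped);
//  void * dev_ptr = nullptr;
//  cudaError_t cuerr = cudaHostGetDevicePointer(&dev_ptr,host_ptr,0);
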
} //namespace bla

#endif //MEMORY_HPP_
--------------------------------------------------------------------------------
/timer.cpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#include "timer.hpp"

#include <chrono>

namespace bla{

double time_sys_sec()
{
  auto stamp = std::chrono::system_clock::now(); //current time point
  auto durat = std::chrono::duration<double>(stamp.time_since_epoch()); //duration (sec) since the beginning of the clock epoch
  return durat.count(); //number of seconds
}


double time_high_sec()
{
  auto stamp = std::chrono::high_resolution_clock::now(); //current time point
  auto durat = std::chrono::duration<double>(stamp.time_since_epoch()); //duration (sec) since the beginning of the clock epoch
  return durat.count(); //number of seconds
}

} //namespace bla
--------------------------------------------------------------------------------
/timer.hpp:
--------------------------------------------------------------------------------
/* CUDA tutorial: Basic Linear Algebra (BLA) Library

!Copyright (C) 2018-2018 Dmitry I. Lyakh (Liakh)
!Copyright (C) 2018-2018 Oak Ridge National Laboratory (UT-Battelle)

!This file is part of CUDA BLA tutorial.

!CUDA BLA is free software: you can redistribute it and/or modify
!it under the terms of the GNU Lesser General Public License as published
!by the Free Software Foundation, either version 3 of the License, or
!(at your option) any later version.

!CUDA BLA is distributed in the hope that it will be useful,
!but WITHOUT ANY WARRANTY; without even the implied warranty of
!MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!GNU Lesser General Public License for more details.

!You should have received a copy of the GNU Lesser General Public License
!along with CUDA BLA. If not, see <http://www.gnu.org/licenses/>. */

#ifndef TIMER_HPP_
#define TIMER_HPP_

namespace bla{

/** System time stamp in seconds (thread-global) **/
double time_sys_sec();

/** High-resolution time stamp in seconds **/
double time_high_sec();

} //namespace bla

#endif //TIMER_HPP_
--------------------------------------------------------------------------------