├── .gitignore ├── LICENSE ├── OpSparse ├── Makefile ├── inc │ ├── CSR.h │ ├── Meta.h │ ├── Timings.h │ ├── binning.cuh │ ├── cuda_common.h │ ├── cusparse_spgemm.h │ ├── define.h │ ├── kernel_wrapper.cuh │ ├── numeric.cuh │ ├── setup.cuh │ └── symbolic.cuh ├── readme.md └── src │ ├── CSR.cu │ ├── Meta.cu │ ├── Timings.cu │ ├── opsparse.cu │ ├── reg_cusparse.cu │ └── reg_opsparse.cu ├── download_matrix.sh ├── nsparse ├── Makefile ├── inc │ ├── BIN.hpp │ ├── CSR.hpp │ ├── HashSpGEMM.hpp │ ├── HashSpGEMM_volta.hpp │ ├── Plan.hpp │ ├── SpGEMM.hpp │ ├── Timing.hpp │ ├── cuda_common.h │ ├── nsparse.hpp │ └── nsparse_asm.hpp ├── nsparse.cu ├── readme.md └── reg_nsparse.cu ├── readme.md └── spECK ├── Makefile ├── config.ini ├── include ├── COO.h ├── CSR.h ├── CUDATools │ ├── error.h │ ├── event.h │ ├── memory.h │ ├── memory_space.h │ ├── stream.h │ └── unique_handle.h ├── Compare.h ├── Config.h ├── DataLoader.h ├── Executor.h ├── GPU │ ├── BlockRange.cuh │ ├── Hash.cuh │ ├── HelperFunctions.cuh │ ├── consistent_gpu_memory.h │ ├── limits.cuh │ ├── profiler.cuh │ ├── scan_largearray_kernel.cuh │ ├── spECKKernels.h │ ├── spECK_HashLoadBalancer.cuh │ └── spECK_HashSpGEMM.cuh ├── HashMap.cuh ├── INIReader.h ├── Multiply.h ├── RunConfig.h ├── Timings.h ├── Transpose.h ├── Vector.h ├── WorkDistribution.h ├── common.cuh ├── common.h ├── cuSparseMultiply.h ├── cuda_common.h ├── dCSR.h ├── meta_utils.h ├── multi_arch_build.h └── spECKConfig.h ├── readme.md └── source ├── COO.cpp ├── CSR.cpp ├── Config.cpp ├── DataLoader.cpp ├── Executor.cpp ├── GPU ├── Compare.cu ├── Multiply.cu ├── Transpose.cu ├── common.cu ├── memory.cpp └── profiler.cu ├── RunConfig.cpp ├── cuSparseMultiply.cu ├── dCSR.cpp ├── reg_runspECK.cpp └── runspECK.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.d 3 | *.o 4 | *.obj 5 | *.out 6 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Zhaoyang Du 4 | Copyright (c) 2017 Tokyo Institute of Technology 5 | Copyright (c) 2019 Mathias Parger 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | -------------------------------------------------------------------------------- /OpSparse/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | NVCC = nvcc 3 | 4 | #CUDAFLAGS = $(GENCODE) -g -lineinfo 5 | CUDAFLAGS = $(GENCODE) -O3 6 | 7 | #CUDAFLAGS = $(GENCODE) -g -G 8 | 9 | 10 | GENCODE = -arch=compute_70 -code=sm_70 11 | CUDAFLAGS += -Xcompiler -fopenmp 12 | 13 | # for Device Code 14 | CUDA_PATH = /usr/local/cuda 15 | LDFLAGS += -lcusparse $(CUDAFLAGS) 16 | INCLUDE += -I${CUDA_PATH}/include 17 | INCLUDE += -I${CUDA_PATH}/samples/common/inc 18 | INCLUDE += -I./inc 19 | 20 | BIN = ./bin 21 | SRC = ./src 22 | OBJ = ./obj 23 | INC = ./inc 24 | 25 | OBJ_LIB = $(OBJ)/Meta.o $(OBJ)/CSR.o $(OBJ)/Timings.o 26 | 27 | COMMON_DEP = $(INC)/cuda_common.h $(INC)/define.h 28 | 29 | $(OBJ)/%.o : $(SRC)/%.cu $(INC)/%.h $(COMMON_DEP) 30 | mkdir -p $(dir $@) 31 | @echo $^ 32 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 33 | 34 | $(OBJ)/%.o : $(SRC)/%.cu $(COMMON_DEP) 35 | mkdir -p $(dir $@) 36 | @echo $^ 37 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 38 | 39 | opsparse : $(OBJ_LIB) $(OBJ)/opsparse.o 40 | $(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE) 41 | 42 | reg_opsparse : $(OBJ_LIB) $(OBJ)/reg_opsparse.o 43 | $(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE) 44 | 45 | reg_cusparse : $(OBJ_LIB) $(OBJ)/reg_cusparse.o 46 | $(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE) 47 | 48 | clean : 49 | rm -rf $(BIN) 50 | rm -rf $(OBJ) 51 | -------------------------------------------------------------------------------- /OpSparse/inc/CSR.h: -------------------------------------------------------------------------------- 1 | #ifndef Z_CSR_H_ 2 | #define Z_CSR_H_ 3 | #include 4 | #include 5 | #include "cuda_common.h" 6 | 7 | class CSR{ 8 | public: 9 | mint M; 10 | mint N; 11 | mint nnz; 12 | mint *rpt; 13 | mint *col; 14 | mdouble *val; 15 | 16 | mint *d_rpt; 17 | mint *d_col; 18 | mdouble *d_val; 19 | CSR():M(0), N(0), nnz(0), 20 | rpt(nullptr), 
class CSR;

// Bookkeeping for one SpGEMM run: binning metadata on device and host, the
// CUB temporary storage, and the CUDA streams used by the helper methods.
//
// Every member has a default initializer so that a default-constructed Meta
// (and one built with Meta(CSR&), which only calls allocate_rpt) can be
// destroyed safely: ~Meta() calls release(), which frees d_combined_mem and
// combined_mem and tests `stream` against nullptr.
class Meta{
    public:
    // first, allocate C.rpt.
    // d_row_flop, d_estimated_row_nnz, d_row_nnz are all reused with C.rpt

    // combined memory
    mint *d_combined_mem = nullptr; // second, allocate for all others (device)
    mint *combined_mem = nullptr;   // second, allocate for all others (host)

    // meta data
    mint M = 0;                      // number of rows
    mint N = 0;                      // number of cols
    mint *d_bins = nullptr;          // size M
    mint *d_bin_size = nullptr;      // size NUM_BIN
    mint *d_bin_offset = nullptr;    // size NUM_BIN
    mint *d_max_row_nnz = nullptr;   // size 1
    mint *d_total_nnz = nullptr;     // size 1
    mint *d_cub_storage = nullptr;   // size variable
    mint *bin_size = nullptr;        // size NUM_BIN
    mint *bin_offset = nullptr;      // size NUM_BIN
    mint *max_row_nnz = nullptr;     // size 1
    mint *total_nnz = nullptr;       // size 1
    size_t cub_storage_size = 0;     // bytes, as reported by the CUB size query
    cudaStream_t *stream = nullptr;  // array of NUM_BIN streams once allocated

    // symbolic global and numeric global, is allocated at runtime
    mint* d_global_mem_pool = nullptr;     // size unknown, allocated at runtime
    size_t global_mem_pool_size = 0;
    bool global_mem_pool_malloced = false;

    // ********************************************************
    // public method
    Meta(){}
    Meta(const Meta&) = delete;             // owns raw buffers: not copyable
    Meta &operator=(const Meta&) = delete;
    Meta(CSR &C);               // init and first malloc
    void allocate_rpt(CSR& C);
    void allocate(CSR &C);      // malloc combined mem and pin the variables
    void release();

    void memset_bin_size(mint stream_idx);  // set d_bin_size only to 0
    void memset_all(mint stream_idx);       // set d_bin_size and other to 0
    void D2H_bin_size(mint stream_idx);
    void D2H_all(mint stream_idx);
    void H2D_bin_offset(mint stream_idx);
    ~Meta();
};
// Second pass of symbolic binning: writes each row index into d_bins, grouped
// by the bin its entry in d_row_flop falls into.
//
// One thread handles one row (threads with index >= M only take part in the
// barriers). Threads 0..NUM_BIN-1 of each block reserve the block's slice of
// every bin by atomically adding the block-local count to d_bin_size; the
// value returned by that atomicAdd plus d_bin_offset[bin] is the position in
// d_bins where this block starts writing for that bin.
// blockDim.x must be at least NUM_BIN for the per-bin init/reserve steps.
// NOTE(review): d_bin_size is presumably zeroed and d_bin_offset filled by the
// host before the launch -- confirm against the caller.
__global__ void __launch_bounds__ (1024, 2) k_symbolic_binning2(
    mint * __restrict__ d_row_flop,
    int M,
    mint * __restrict__ d_bins,
    mint * __restrict__ d_bin_size,
    mint * __restrict__ d_bin_offset){

    __shared__ mint shared_bin_size[NUM_BIN];
    __shared__ mint shared_bin_offset[NUM_BIN];

    const bool owns_bin = threadIdx.x < NUM_BIN;
    if(owns_bin){
        shared_bin_size[threadIdx.x] = 0;
    }
    __syncthreads();

    // Inclusive upper bound of each bin; the last entry catches everything.
    //mint range[NUM_BIN] = {32, 512, 1024, 2048, 4096, 8192, 12287, INT_MAX}; // 1x
    mint range[NUM_BIN] = {26, 426, 853, 1706, 3413, 6826, 10240, INT_MAX}; // 1.2x
    //mint range[NUM_BIN] = {21, 341, 682, 1365, 2730, 5461, 8191, INT_MAX}; // 1.5x

    const mint row = threadIdx.x + blockIdx.x * blockDim.x;
    mint bin = -1;  // stays -1 for threads past the last row

    // Count phase: find the first bin whose bound is not exceeded.
    if(row < M){
        const mint flop = d_row_flop[row];
        for(mint b = 0; b < NUM_BIN; b++){
            if(flop <= range[b]){
                bin = b;
                break;
            }
        }
        if(bin >= 0){
            atomicAdd(shared_bin_size + bin, 1);
        }
    }
    __syncthreads();

    // Reserve phase: claim this block's slice of each bin, then reuse the
    // shared counters as per-bin write cursors.
    if(owns_bin){
        const mint claimed = atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]);
        shared_bin_offset[threadIdx.x] = claimed + d_bin_offset[threadIdx.x];
        shared_bin_size[threadIdx.x] = 0;
    }
    __syncthreads();

    // Scatter phase: the order of rows inside a bin is not deterministic.
    if(bin >= 0){
        const mint slot = atomicAdd(shared_bin_size + bin, 1);
        d_bins[shared_bin_offset[bin] + slot] = row;
    }
}
// Second pass of numeric binning: writes each row index into d_bins, grouped
// by the bin its entry in d_row_nnz falls into.
//
// One thread handles one row; threads with index >= M only take part in the
// barriers. Threads 0..NUM_BIN-1 of each block reserve the block's slice of
// every bin by atomically adding the block-local count to d_bin_size; the
// returned running count plus d_bin_offset[bin] is where this block starts
// writing in d_bins for that bin.
// blockDim.x must be at least NUM_BIN (and at most 1024, see the launch bounds).
// NOTE(review): d_bin_size is presumably zeroed and d_bin_offset filled by the
// host before the launch -- confirm against the caller.
//
// The launch-bounds qualifier is placed before the kernel name, matching the
// other kernels in this header and the form shown in the CUDA programming guide.
__global__ void __launch_bounds__ (1024, 2) k_numeric_binning2(
    mint * __restrict__ d_row_nnz,
    int M,
    mint * __restrict__ d_bins,
    mint * __restrict__ d_bin_size,
    mint * __restrict__ d_bin_offset){

    __shared__ mint shared_bin_size[NUM_BIN];
    __shared__ mint shared_bin_offset[NUM_BIN];
    if(threadIdx.x < NUM_BIN){
        shared_bin_size[threadIdx.x] = 0;
    }
    __syncthreads();

    // Inclusive upper bound of each bin; the last entry catches everything.
    //mint range[NUM_BIN] = {31, 255, 511, 1022, 2047, 4095, 8191, INT_MAX}; // 1x
    //mint range[NUM_BIN] = {21, 192, 384, 768, 1536, 3072, 5460, INT_MAX}; // 1.5x
    mint range[NUM_BIN] = {16, 128, 256, 512, 1024, 2048, 4095, INT_MAX}; // 2x
    //mint range[NUM_BIN] = {10, 85, 170, 341, 682, 1365, 2730, INT_MAX}; // 3x

    mint i = threadIdx.x + blockIdx.x * blockDim.x;
    mint bin = -1;  // bin of this thread's row; -1 when the thread has no row

    // Count phase: look the bin up once and remember it for the scatter phase.
    if(i < M){
        const mint row_nnz = d_row_nnz[i];
        for(mint j = 0; j < NUM_BIN; j++){
            if(row_nnz <= range[j]){
                bin = j;
                break;
            }
        }
        if(bin >= 0){
            atomicAdd(shared_bin_size + bin, 1);
        }
    }
    __syncthreads();

    // Reserve phase: claim this block's slice of each bin, then reuse the
    // shared counters as per-bin write cursors.
    if(threadIdx.x < NUM_BIN){
        shared_bin_offset[threadIdx.x] = atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]);
        shared_bin_offset[threadIdx.x] += d_bin_offset[threadIdx.x];
        shared_bin_size[threadIdx.x] = 0;
    }
    __syncthreads();

    // Scatter phase: the order of rows inside a bin is not deterministic.
    if(bin >= 0){
        const mint index = atomicAdd(shared_bin_size + bin, 1);
        d_bins[shared_bin_offset[bin] + index] = i;
    }
}
// Checks the status returned by a cuSPARSE call.
//
// status:   value returned by the cuSPARSE API.
// errorMsg: optional caller-supplied context, printed in front of the
//           library's own description of the status.
// Throws std::exception when status is not CUSPARSE_STATUS_SUCCESS, after
// printing the context and the status description to stdout.
//
// The message is taken by const reference so that successful calls do not
// copy a std::string; string literals still bind to it.
inline void CHECK_CUSPARSE(cusparseStatus_t status, const std::string &errorMsg="")
{
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cout << "CuSparse error: " << errorMsg
                  << " (" << cusparseGetErrorString(status) << ")" << std::endl;
        throw std::exception();
    }
}
// Executes the x86 CPUID instruction (GCC-style inline assembly).
//
// info: output array of four ints, filled with EAX, EBX, ECX, EDX in that order.
// eax:  leaf number loaded into EAX before the instruction.
// ecx:  sub-leaf number loaded into ECX before the instruction (default 0).
//
// The sub-leaf is passed through a matching constraint on the ECX output
// operand; without it ECX holds an arbitrary value and leaves that are
// indexed by ECX return unpredictable data.
inline void cpuid(int *info, int eax, int ecx = 0){
    int ax, bx, cx, dx;
    __asm__ __volatile__ ("cpuid"
                          : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx)
                          : "0" (eax), "2" (ecx));

    info[0] = ax;
    info[1] = bx;
    info[2] = cx;
    info[3] = dx;
}
| void *dBuffer1 = NULL, *dBuffer2 = NULL; 16 | size_t bufferSize1 = 0, bufferSize2 = 0; 17 | CHECK_CUSPARSE( cusparseCreateCsr(&matA, M, K, nnz_A, 18 | d_row_ptr_A, d_col_idx_A, d_csr_values_A, 19 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 20 | CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matA" ); 21 | CHECK_CUSPARSE( cusparseCreateCsr(&matB, K, N, nnz_B, 22 | d_row_ptr_B, d_col_idx_B, d_csr_values_B, 23 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 24 | CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matB" ); 25 | CHECK_CUSPARSE( cusparseCreateCsr(&matC, M, N, 0, 26 | NULL, NULL, NULL, 27 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 28 | CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matC" ); 29 | cusparseSpGEMMDescr_t spgemmDescr; 30 | CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDescr), "create spgemm descr"); 31 | double alpha = 1.0f; 32 | double beta = 0.0f; 33 | cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 34 | cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; 35 | cudaDataType computeType = CUDA_R_64F; 36 | 37 | CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB, 38 | &alpha, matA, matB, &beta, matC, 39 | computeType, CUSPARSE_SPGEMM_DEFAULT, 40 | spgemmDescr, &bufferSize1, NULL), 41 | "first work estimation"); 42 | CHECK_CUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 43 | CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB, 44 | &alpha, matA, matB, &beta, matC, 45 | computeType, CUSPARSE_SPGEMM_DEFAULT, 46 | spgemmDescr, &bufferSize1, dBuffer1), 47 | "second work estimation"); 48 | CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB, 49 | &alpha, matA, matB, &beta, matC, 50 | computeType, CUSPARSE_SPGEMM_DEFAULT, 51 | spgemmDescr, &bufferSize2, NULL), 52 | "first compute"); 53 | 54 | CHECK_CUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 55 | CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB, 56 | &alpha, matA, matB, &beta, matC, 57 | computeType, CUSPARSE_SPGEMM_DEFAULT, 58 | spgemmDescr, 
// Computes C = A * B on the device with cuSPARSE.
//
// Only the device-side arrays of A and B (d_rpt, d_col, d_val) are read.
// On return C.d_rpt, C.d_col and C.d_val point to buffers allocated by
// cusparse_spgemm_inner, and C.M, C.N, C.nnz describe the product.
// Device pointers previously stored in C are overwritten, not freed.
// A.N is passed as the inner dimension; nothing here checks it against B.M.
//
// Marked inline because this header has no include guard and defines the
// functions in place.
inline void cusparse_spgemm(const CSR& A, const CSR& B, CSR& C){
    int tmp_nnz;
    cusparse_spgemm_inner(A.d_rpt, A.d_col, A.d_val,
            B.d_rpt, B.d_col, B.d_val,
            &(C.d_rpt), &(C.d_col), &(C.d_val),
            A.M, A.N, B.N, A.nnz, B.nnz, &(tmp_nnz));
    C.M = A.M;
    C.N = B.N;
    C.nnz = tmp_nnz;
}

// Pointer flavour of cusparse_spgemm; forwards to the reference overload so
// the two cannot drift apart. All three pointers must be non-null.
inline void cusparse_spgemm(CSR *a, CSR *b, CSR *c){
    cusparse_spgemm(*a, *b, *c);
}
// Computes, for every row i of A, the number of intermediate products of
// row i of A*B:  d_row_flop[i] = sum over nonzeros (i, k) of A of the length
// of row k of B (d_brpt[k + 1] - d_brpt[k]).
// Also folds the largest per-row value into *d_max_row_flop with atomicMax.
//
// d_arpt / d_acol: CSR row pointers and column indices of A.
// d_brpt:          CSR row pointers of B.
// M:               number of rows of A; one thread handles one row.
// d_row_flop:      output, at least M entries.
// d_max_row_flop:  single running maximum, combined with atomicMax.
// NOTE(review): presumably initialized by the caller before launch -- confirm.
//
// Threads past the last row must not return early: every thread of the block
// has to reach both __syncthreads() calls, so the row work is guarded instead.
__global__ void __launch_bounds__(1024, 2) k_compute_flop(
    const mint* __restrict__ d_arpt,
    const mint* __restrict__ d_acol,
    const mint* __restrict__ d_brpt,
    mint M,
    mint *d_row_flop,
    mint *d_max_row_flop){

    __shared__ mint shared_max_row_flop[1];
    if(threadIdx.x == 0){
        shared_max_row_flop[0] = 0;
    }
    __syncthreads();

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < M){
        mint row_flop = 0;
        const mint arow_end = d_arpt[i+1];
        for(mint j = d_arpt[i]; j < arow_end; j++){
            const mint acol = d_acol[j];
            row_flop += d_brpt[acol + 1] - d_brpt[acol];
        }
        d_row_flop[i] = row_flop;
        // block-local maximum first, so only one global atomic per block
        atomicMax(shared_max_row_flop, row_flop);
    }
    __syncthreads();

    // As before, a block publishes its maximum only when its first thread
    // owns a valid row, i.e. when the block covers at least one row.
    if(threadIdx.x == 0 && i < M){
        atomicMax(d_max_row_flop, shared_max_row_flop[0]);
    }
}
// Builds a Meta for C and allocates C.d_rpt. Every member is given a defined
// value first, so that the destructor (which calls release()) is safe even if
// allocate() is never called.
Meta::Meta(CSR &C)
    : d_combined_mem(nullptr), combined_mem(nullptr), M(0), N(0),
      d_bins(nullptr), d_bin_size(nullptr), d_bin_offset(nullptr),
      d_max_row_nnz(nullptr), d_total_nnz(nullptr), d_cub_storage(nullptr),
      bin_size(nullptr), bin_offset(nullptr),
      max_row_nnz(nullptr), total_nnz(nullptr),
      cub_storage_size(0), stream(nullptr),
      d_global_mem_pool(nullptr), global_mem_pool_size(0),
      global_mem_pool_malloced(false)
{
    allocate_rpt(C);
}

// Allocates the device row-pointer array of C (C.M + 1 entries).
void Meta::allocate_rpt(CSR &C){
    CHECK_ERROR(cudaMalloc(&C.d_rpt, (C.M + 1)*sizeof(mint)));
}

// Creates NUM_BIN streams and allocates the combined device and host buffers,
// then points the individual members into them.
//
// Device layout (in mint elements):
//   d_bins[M] | d_bin_size[NUM_BIN] | d_max_row_nnz[1] | d_total_nnz[1]
//   | d_bin_offset[NUM_BIN] | d_cub_storage[...]
// Host layout:
//   bin_size[NUM_BIN] | max_row_nnz[1] | total_nnz[1] | bin_offset[NUM_BIN]
// The first NUM_BIN + 2 entries after d_bins mirror the start of the host
// buffer, which is what memset_all / D2H_all rely on.
void Meta::allocate(CSR& C){
    M = C.M;
    N = C.N;
    stream = new cudaStream_t [NUM_BIN];
    for(int i = 0; i < NUM_BIN; i++){
        CHECK_ERROR(cudaStreamCreate(stream + i));
    }

    // Size query only (null temp storage): fills cub_storage_size in bytes.
    CHECK_ERROR(cub::DeviceScan::ExclusiveSum(nullptr, cub_storage_size, C.d_rpt, C.d_rpt, M + 1));

    // Round the byte count up to whole mint elements so the scratch area is
    // never smaller than what CUB asked for.
    const size_t cub_storage_elems = (cub_storage_size + sizeof(mint) - 1) / sizeof(mint);
    const size_t d_combined_size = (size_t)M + 2 * NUM_BIN + 2 + cub_storage_elems;
    CHECK_ERROR(cudaMalloc(&d_combined_mem, d_combined_size * sizeof(mint)));

    // new[] pairs with the delete[] in release(); it throws on failure.
    const size_t combined_size = 2 * NUM_BIN + 2;
    combined_mem = new mint [combined_size];

    d_bins = (mint *)d_combined_mem; // size M
    d_bin_size = (mint *)d_combined_mem + M; // size NUM_BIN
    d_max_row_nnz = d_bin_size + NUM_BIN; // size 1
    d_total_nnz = d_bin_size + NUM_BIN + 1; // size 1
    d_bin_offset = d_total_nnz + 1; // size NUM_BIN
    // The scratch area starts after all NUM_BIN offsets; starting at
    // d_bin_offset + 1 would make the scan overwrite d_bin_offset[1..].
    d_cub_storage = d_bin_offset + NUM_BIN;

    bin_size = (mint*) combined_mem; // size NUM_BIN
    max_row_nnz = bin_size + NUM_BIN; // size 1
    total_nnz = bin_size + NUM_BIN + 1; // size 1
    bin_offset = bin_size + NUM_BIN + 2; // size NUM_BIN

    d_global_mem_pool = nullptr;
    global_mem_pool_size = 0;
    global_mem_pool_malloced = false;
}

// Frees the combined buffers and destroys the streams. Safe to call more than
// once. Return codes are deliberately ignored because this also runs from the
// destructor. The pointers derived from the combined buffers (d_bins,
// bin_size, total_nnz, ...) are left dangling until the next allocate().
// NOTE(review): d_global_mem_pool is not freed here -- confirm who owns it.
void Meta::release(){
    cudaFree(d_combined_mem);
    d_combined_mem = nullptr;
    if(stream != nullptr){
        for(int i = 0; i < NUM_BIN; i++){
            cudaStreamDestroy(stream[i]);
        }
        delete [] stream;
        stream = nullptr;
    }
    delete [] combined_mem;
    combined_mem = nullptr;
}

Meta::~Meta(){
    release();
}


// Asynchronously zeroes d_bin_size, d_max_row_nnz and d_total_nnz
// (NUM_BIN + 2 contiguous entries) on the given stream.
void Meta::memset_all(mint stream_idx = 1){
    CHECK_ERROR(cudaMemsetAsync(d_bin_size, 0, (NUM_BIN + 2) * sizeof(mint), stream[stream_idx]));
}

// Asynchronously zeroes d_bin_size only (NUM_BIN entries) on the given stream.
void Meta::memset_bin_size(mint stream_idx = 1){
    CHECK_ERROR(cudaMemsetAsync(d_bin_size, 0, NUM_BIN * sizeof(mint), stream[stream_idx]));
}

// Copies d_bin_size, d_max_row_nnz and d_total_nnz to their host mirrors
// (bin_size, max_row_nnz, total_nnz). Issued with cudaMemcpyAsync: synchronize
// the stream before reading the host values.
void Meta::D2H_all(mint stream_idx = 0){
    CHECK_ERROR(cudaMemcpyAsync(bin_size, d_bin_size, (NUM_BIN + 2) * sizeof(mint), cudaMemcpyDeviceToHost, stream[stream_idx]));
}

// Copies d_bin_size to bin_size (NUM_BIN entries). Issued with
// cudaMemcpyAsync: synchronize the stream before reading bin_size.
void Meta::D2H_bin_size(mint stream_idx = 0){
    CHECK_ERROR(cudaMemcpyAsync(bin_size, d_bin_size, NUM_BIN * sizeof(mint), cudaMemcpyDeviceToHost, stream[stream_idx]));
}

// Copies the host bin_offset array to d_bin_offset (NUM_BIN entries).
void Meta::H2D_bin_offset(mint stream_idx = 0){
    CHECK_ERROR(cudaMemcpyAsync(d_bin_offset, bin_offset, NUM_BIN * sizeof(mint), cudaMemcpyHostToDevice, stream[stream_idx]));
}
// Starts with both measurement flags enabled and every timer at zero.
Timings::Timings()
    : measure_separate(true), measure_total(true),
      setup(0), symbolic_binning(0), symbolic(0), reduce(0),
      numeric_binning(0), prefix(0), allocate(0), numeric(0),
      cleanup(0), total(0)
{}

// Adds every timer of b to the matching timer of this object.
void Timings::operator+=(const Timings& b){
    setup += b.setup;
    symbolic_binning += b.symbolic_binning;
    symbolic += b.symbolic;
    reduce += b.reduce;
    numeric_binning += b.numeric_binning;
    prefix += b.prefix;
    allocate += b.allocate;
    numeric += b.numeric;
    cleanup += b.cleanup;
    total += b.total;
}

// Divides every timer by x (used to average accumulated runs).
void Timings::operator/=(const double x){
    setup /= x;
    symbolic_binning /= x;
    symbolic /= x;
    reduce /= x;
    numeric_binning /= x;
    prefix /= x;
    allocate /= x;
    numeric /= x;
    cleanup /= x;
    total /= x;
}

// Prints the per-phase breakdown: each timer multiplied by 1000 under the
// "time(ms)" heading with its share of `total`, then total_flop / 1e9 divided
// by each timer under "perf(Gflops)". The breakdown is printed only when
// measure_separate is set. A timer that is zero yields inf/nan in the output.
// The colour escapes are spelled \033 (same byte as the GCC-only \e escape).
void Timings::print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("total flop %lf\n", total_flop);
    double sum_total = setup + symbolic_binning + symbolic + numeric_binning
        + reduce + prefix + allocate + numeric + cleanup;
    if(measure_separate){
        printf("time(ms):\n");
        printf(" setup %8.3lfms %6.2lf%%\n", 1000*setup, setup/total*100);
        printf("\033[1;31m symbolic_binning %8.3lfms %6.2lf%%\n\033[0m", 1000*symbolic_binning, symbolic_binning/total*100);
        printf("\033[1;31m symbolic %8.3lfms %6.2lf%%\n\033[0m", 1000*symbolic, symbolic/total*100);
        printf(" reduce %8.3lfms %6.2lf%%\n", 1000*reduce, reduce/total*100);
        printf("\033[1;31m numeric_binning %8.3lfms %6.2lf%%\n\033[0m", 1000*numeric_binning, numeric_binning/total*100);
        printf(" prefix %8.3lfms %6.2lf%%\n", 1000*prefix, prefix/total*100);
        printf(" allocate %8.3lfms %6.2lf%%\n", 1000*allocate, allocate/total*100);
        printf("\033[1;31m numeric %8.3lfms %6.2lf%%\n\033[0m", 1000*numeric, numeric/total*100);
        printf(" cleanup %8.3lfms %6.2lf%%\n", 1000*cleanup, cleanup/total*100);
        printf(" sum_total %8.3lfms %6.2lf%%\n", 1000*sum_total, sum_total/total*100);
        printf(" total %8.3lfms %6.2lf%%\n", 1000*total, total/total*100);
        printf("perf(Gflops):\n");
        printf(" setup %6.2lf\n", total_flop_G/setup);
        printf(" symbolic_binning %6.2lf\n", total_flop_G/symbolic_binning);
        printf(" symbolic %6.2lf\n", total_flop_G/symbolic);
        printf(" reduce %6.2lf\n", total_flop_G/reduce);
        printf(" numeric_binning %6.2lf\n", total_flop_G/numeric_binning);
        printf(" prefix %6.2lf\n", total_flop_G/prefix);
        printf(" allocate %6.2lf\n", total_flop_G/allocate);
        printf(" numeric %6.2lf\n", total_flop_G/numeric);
        printf(" cleanup %6.2lf\n", total_flop_G/cleanup);
        printf(" total %6.2lf\n", total_flop_G/total);
    }
}

// Prints one number: total_flop / 1e9 divided by the total time.
void Timings::reg_print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("%6.2lf\n", total_flop_G/total);
}

// Prints total_flop / 1e9 divided by the symbolic and by the numeric time.
void Timings::perf_print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("%6.2lf %6.2lf\n", total_flop_G/symbolic, total_flop_G/numeric);
}

// Prints the combined binning time and its percentage of the total time.
// total_flop is not used; the parameter is kept for the common signature.
void Timings::binning_print(double total_flop){
    (void)total_flop;
    double total_binning_time = symbolic_binning + numeric_binning;
    printf("%.4le %.4lf\n", total_binning_time, 100*total_binning_time/total);
}
// Runs one full SpGEMM C = A * B and records the wall-clock time of each phase
// in `timing`.
//
// A, B:   inputs; the phase helpers are handed the CSR objects.
// C:      output; M, N and nnz are set here, d_val and d_col are allocated
//         here with cudaMalloc. NOTE(review): C.d_rpt is presumably allocated
//         inside h_setup (see Meta::allocate_rpt) -- confirm.
// meta:   scratch state; released at the end of every call.
// timing: every phase timer except `reduce` is overwritten, plus `total`.
//
// Each phase is followed by cudaDeviceSynchronize so that the host clock
// brackets the device work of that phase only. The h_* helpers are not defined
// in this file.
void opsparse(const CSR& A, const CSR& B, CSR& C, Meta& meta, Timings& timing){

    double t0, t1;
    t1 = t0 = fast_clock_time();
    C.M = A.M;
    C.N = B.N;
    C.nnz = 0;
    h_setup(A, B, C, meta, timing);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.setup = fast_clock_time() - t0;

    // symbolic binning
    t0 = fast_clock_time();
    h_symbolic_binning(C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic_binning = fast_clock_time() - t0;

    // symbolic phase
    t0 = fast_clock_time();
    h_symbolic(A, B, C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic = fast_clock_time() - t0;

    // numeric binning
    t0 = fast_clock_time();
    h_numeric_binning(C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.numeric_binning = fast_clock_time() - t0;

    // malloc C: the nonzero count is read from the host copy in meta.
    // NOTE(review): assumes h_numeric_binning copied total_nnz back to the host.
    t0 = fast_clock_time();
    C.nnz = *meta.total_nnz;
    CHECK_ERROR(cudaMalloc(&C.d_val, C.nnz * sizeof(mdouble)));
    CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
    timing.allocate = fast_clock_time() - t0;

    // prefix sum: in-place exclusive scan over the M + 1 entries of C.d_rpt.
    // The scan returns a cudaError_t, which is checked like every other call.
    t0 = fast_clock_time();
    CHECK_ERROR(cub::DeviceScan::ExclusiveSum(meta.d_cub_storage, meta.cub_storage_size, C.d_rpt, C.d_rpt, C.M + 1));
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.prefix = fast_clock_time() - t0;

    // numeric
    t0 = fast_clock_time();
    h_numeric_full_occu(A, B, C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.numeric = fast_clock_time() - t0;

    // cleanup
    t0 = fast_clock_time();
    meta.release();
    timing.cleanup = fast_clock_time() - t0;
    timing.total = fast_clock_time() - t1;
}
char **argv) 68 | { 69 | std::string mat1, mat2; 70 | mat1 = "can_24"; 71 | mat2 = "can_24"; 72 | if(argc == 2){ 73 | mat1 = argv[1]; 74 | mat2 = argv[1]; 75 | } 76 | if(argc >= 3){ 77 | mat1 = argv[1]; 78 | mat2 = argv[2]; 79 | } 80 | std::string mat1_file; 81 | if(mat1.find("ER") != std::string::npos){ 82 | mat1_file = "../matrix/ER/" + mat1 +".mtx"; 83 | } 84 | else if(mat1.find("G500") != std::string::npos){ 85 | mat1_file = "../matrix/G500/" + mat1 +".mtx"; 86 | } 87 | else{ 88 | mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx"; 89 | } 90 | std::string mat2_file; 91 | if(mat2.find("ER") != std::string::npos){ 92 | mat2_file = "../matrix/ER/" + mat2 +".mtx"; 93 | } 94 | else if(mat2.find("G500") != std::string::npos){ 95 | mat2_file = "../matrix/G500/" + mat2 +".mtx"; 96 | } 97 | else{ 98 | mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx"; 99 | } 100 | 101 | CSR A, B; 102 | A.construct(mat1_file); 103 | if(mat1 == mat2){ 104 | B = A; 105 | } 106 | else{ 107 | B.construct(mat2_file); 108 | if(A.N == B.M){ 109 | // do nothing 110 | } 111 | else if(A.N < B.M){ 112 | CSR tmp(B, A.N, B.N, 0, 0); 113 | B = tmp; 114 | } 115 | else{ 116 | CSR tmp(A, A.M, B.M, 0, 0); 117 | A = tmp; 118 | } 119 | } 120 | 121 | A.H2D(); 122 | B.H2D(); 123 | 124 | long total_flop = compute_flop(A, B); 125 | CSR C; 126 | cudaruntime_warmup(); 127 | Meta meta; 128 | { 129 | Timings timing; 130 | opsparse(A, B, C, meta, timing); 131 | C.release(); 132 | } 133 | mint iter = 10; 134 | Timings timing, bench_timing; 135 | for(mint i = 0; i < iter; i++){ 136 | opsparse(A, B, C, meta, timing); 137 | bench_timing += timing; 138 | if(i < iter - 1){ 139 | C.release(); 140 | } 141 | } 142 | bench_timing /= iter; 143 | 144 | printf("%s ",mat1.c_str()); 145 | bench_timing.print(total_flop * 2); 146 | 147 | // compare result 148 | 149 | //C.D2H(); 150 | //CSR C_ref; 151 | //cusparse_spgemm(&A, &B, &C_ref); 152 | //C_ref.D2H(); 153 | //if(C == C_ref){ 154 | // 
printf("pass\n"); 155 | //} 156 | //else{ 157 | // printf("error\n"); 158 | //} 159 | 160 | A.release(); 161 | B.release(); 162 | 163 | C.release(); 164 | return 0; 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /OpSparse/src/reg_cusparse.cu: -------------------------------------------------------------------------------- 1 | #include "kernel_wrapper.cuh" 2 | #include 3 | #include 4 | #include 5 | #include "Timings.h" 6 | #include "cusparse_spgemm.h" 7 | 8 | 9 | int main(int argc, char **argv) 10 | { 11 | std::string mat1, mat2; 12 | mat1 = "can_24"; 13 | mat2 = "can_24"; 14 | if(argc == 2){ 15 | mat1 = argv[1]; 16 | mat2 = argv[1]; 17 | } 18 | if(argc >= 3){ 19 | mat1 = argv[1]; 20 | mat2 = argv[2]; 21 | } 22 | std::string mat1_file; 23 | if(mat1.find("ER") != std::string::npos){ 24 | mat1_file = "../matrix/ER/" + mat1 +".mtx"; 25 | } 26 | else if(mat1.find("G500") != std::string::npos){ 27 | mat1_file = "../matrix/G500/" + mat1 +".mtx"; 28 | } 29 | else{ 30 | mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx"; 31 | } 32 | std::string mat2_file; 33 | if(mat2.find("ER") != std::string::npos){ 34 | mat2_file = "../matrix/ER/" + mat2 +".mtx"; 35 | } 36 | else if(mat2.find("G500") != std::string::npos){ 37 | mat2_file = "../matrix/G500/" + mat2 +".mtx"; 38 | } 39 | else{ 40 | mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx"; 41 | } 42 | 43 | CSR A, B; 44 | A.construct(mat1_file); 45 | if(mat1 == mat2){ 46 | B = A; 47 | } 48 | else{ 49 | B.construct(mat2_file); 50 | if(A.N == B.M){ 51 | // do nothing 52 | } 53 | else if(A.N < B.M){ 54 | CSR tmp(B, A.N, B.N, 0, 0); 55 | B = tmp; 56 | } 57 | else{ 58 | CSR tmp(A, A.M, B.M, 0, 0); 59 | A = tmp; 60 | } 61 | } 62 | 63 | 64 | A.H2D(); 65 | B.H2D(); 66 | 67 | long total_flop = compute_flop(A, B); 68 | double total_flop_G = double(total_flop) * 2/1000000000; 69 | 70 | CSR C; 71 | double t0 = fast_clock_time(), t1; 72 | 
cusparse_spgemm(&A, &B, &C);
    C.release();

    int iter = 10;
    t1 = 0;
    for(int i = 0; i < iter; i++){
        t0 = fast_clock_time();
        cusparse_spgemm(&A, &B, &C);
        t1 += fast_clock_time() - t0;
        //printf("iter %d %le\n", i, fast_clock_time() - t0);
        if(i < iter - 1){
            C.release();
        }
    }
    t1 /= iter;
    //printf("executione time %le, flops %lf\n\n", t1, total_flop_G / t1);
    printf("%s %lf\n", mat1.c_str(), total_flop_G / t1);

    A.release();
    B.release();
    C.release();
    return 0;
}


--------------------------------------------------------------------------------
/OpSparse/src/reg_opsparse.cu:
--------------------------------------------------------------------------------

#include "kernel_wrapper.cuh"
// NOTE(review): the three system header names that stood here were lost when
// this file was extracted (only bare `#include` tokens survived). The headers
// below are the ones the visible code needs (printf, std::string,
// cub::DeviceScan) -- confirm against the repository.
#include <cstdio>
#include <string>
#include <cub/cub.cuh>
#include "cusparse_spgemm.h"
#include "Timings.h"


/*
 * Variant of the SpGEMM pipeline C = A * B in which numeric binning, the
 * prefix sum and the allocation of C are interleaved instead of being timed
 * as separate phases. Only timing.setup, symbolic_binning, symbolic, numeric,
 * cleanup and total are written; numeric_binning, prefix and allocate are
 * left untouched.
 *
 * NOTE(review): every kernel launch configuration in this function was
 * stripped to `<<>>` by the extraction. The configurations below are rebuilt
 * from the surrounding code (BS/GS are computed right before the launches and
 * the host synchronizes on meta.stream[0] right after) -- confirm against the
 * repository.
 */
void opsparse(const CSR& A, const CSR& B, CSR& C, Meta& meta, Timings& timing){

    double t0, t1;
    t1 = t0 = fast_clock_time();
    C.M = A.M;
    C.N = B.N;
    C.nnz = 0;
    h_setup(A, B, C, meta, timing);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.setup = fast_clock_time() - t0;

    // symbolic binning
    t0 = fast_clock_time();
    h_symbolic_binning(C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic_binning = fast_clock_time() - t0;


    // symbolic phase
    t0 = fast_clock_time();
    h_symbolic(A, B, C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic = fast_clock_time() - t0;



    // numeric binning, exclusive sum, and allocate C
    meta.memset_all(0);
    mint BS = 1024;
    mint GS = div_up(C.M, BS);
    k_numeric_binning<<<GS, BS, 0, meta.stream[0]>>>(C.d_rpt, C.M,
        meta.d_bin_size, meta.d_total_nnz, meta.d_max_row_nnz);
    CHECK_ERROR(cudaGetLastError());    // launch errors are not reported by the launch itself
    meta.D2H_all(0);
    CHECK_ERROR(cudaStreamSynchronize(meta.stream[0]));
    // the host copies read below are only valid after the synchronize above
    C.nnz = *meta.total_nnz;

    if(*meta.max_row_nnz <= 16){
        // every row is small: all C.M rows go to bin 0
        k_binning_small<<<GS, BS>>>(meta.d_bins, C.M);
        CHECK_ERROR(cudaGetLastError());
        CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
        meta.bin_size[0] = C.M;
        for(int i = 1; i< NUM_BIN; i++){
            meta.bin_size[i] = 0;
        }
        meta.bin_offset[0] = 0;
        for(int i = 1; i < NUM_BIN; i++){
            meta.bin_offset[i] = C.M;
        }
    }
    else{
        meta.memset_bin_size(0);
        // host-side exclusive scan of the bin sizes gives each bin's start offset
        meta.bin_offset[0] = 0;
        for(int i = 0; i < NUM_BIN - 1; i++){
            meta.bin_offset[i+1] = meta.bin_offset[i] + meta.bin_size[i];
        }
        meta.H2D_bin_offset(0);

        k_numeric_binning2<<<GS, BS, 0, meta.stream[0]>>>(C.d_rpt, C.M,
            meta.d_bins, meta.d_bin_size, meta.d_bin_offset);
        CHECK_ERROR(cudaGetLastError());
        CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
    }
    CHECK_ERROR(cudaDeviceSynchronize());

    // in-place exclusive prefix sum over the C.M + 1 entries of C.d_rpt;
    // the returned status was previously dropped, it is now checked
    CHECK_ERROR(cub::DeviceScan::ExclusiveSum(meta.d_cub_storage, meta.cub_storage_size, C.d_rpt, C.d_rpt, C.M + 1));
    CHECK_ERROR(cudaMalloc(&C.d_val, C.nnz * sizeof(mdouble)));
    CHECK_ERROR(cudaDeviceSynchronize());

    // numeric
    t0 = fast_clock_time();
    h_numeric_full_occu(A, B, C, meta);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.numeric= fast_clock_time() - t0;

    // cleanup
    t0 = fast_clock_time();
    meta.release();
    timing.cleanup = fast_clock_time() - t0;
    timing.total = fast_clock_time() - t1;
}

/*
 * Maps a matrix name to the path of its .mtx file:
 *   name contains "ER"   -> ../matrix/ER/<name>.mtx
 *   name contains "G500" -> ../matrix/G500/<name>.mtx
 *   otherwise            -> ../matrix/suite_sparse/<name>/<name>.mtx
 */
static std::string resolve_matrix_path(const std::string& name){
    if(name.find("ER") != std::string::npos){
        return "../matrix/ER/" + name + ".mtx";
    }
    if(name.find("G500") != std::string::npos){
        return "../matrix/G500/" + name + ".mtx";
    }
    return "../matrix/suite_sparse/" + name + "/" + name + ".mtx";
}

/*
 * Regression driver: same argument handling as the full benchmark (zero, one
 * or two matrix names, default "can_24"), one warm-up multiplication followed
 * by 10 timed ones; prints the matrix name and the value produced by
 * Timings::reg_print for the averaged timings.
 */
int main(int argc, char **argv)
{
    std::string mat1 = "can_24";
    std::string mat2 = "can_24";
    if(argc == 2){
        mat1 = argv[1];
        mat2 = argv[1];
    }
    if(argc >= 3){
        mat1 = argv[1];
        mat2 = argv[2];
    }
    std::string mat1_file = resolve_matrix_path(mat1);
    std::string mat2_file = resolve_matrix_path(mat2);

    CSR A, B;
    A.construct(mat1_file);
    if(mat1 == mat2){
        B = A;
    }
    else{
        B.construct(mat2_file);
        if(A.N == B.M){
            // do nothing
        }
        else if(A.N < B.M){
            CSR tmp(B, A.N, B.N, 0, 0);
            B = tmp;
        }
        else{
            CSR tmp(A, A.M, B.M, 0, 0);
            A = tmp;
        }
    }

    A.H2D();
    B.H2D();

    long total_flop = compute_flop(A, B);
    CSR C;
    cudaruntime_warmup();
    Meta meta;
    {
        // warm-up run, its timings are discarded
        Timings timing;
        opsparse(A, B, C, meta, timing);
        C.release();
    }

    mint iter = 10;
    Timings timing, bench_timing;
    for(mint i = 0; i < iter; i++){
        opsparse(A, B, C, meta, timing);
        bench_timing += timing;
        // keep the result of the last iteration for the optional check below
        if(i < iter - 1){
            C.release();
        }
    }
    bench_timing /= iter;

    printf("%s ",mat1.c_str());
    bench_timing.reg_print(total_flop * 2);

    // compare result

    //C.D2H();
    //CSR C_ref;
    //cusparse_spgemm(&A, &B, &C_ref);
    //C_ref.D2H();
    //if(C == C_ref){
    //    printf("pass\n");
    //}
    //else{
    //    printf("error\n");
    //}

    A.release();
    B.release();

    C.release();
    return 0;
}


--------------------------------------------------------------------------------
/download_matrix.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ -d matrix/suite_sparse ]; then
    cd matrix/suite_sparse
else
    mkdir -p matrix/suite_sparse
    cd matrix/suite_sparse
fi

# download webbase-1M
if [ !
-e webbase-1M/webbase-1M.mtx ]; then
    wget https://suitesparse-collection-website.herokuapp.com/MM/Williams/webbase-1M.tar.gz
    tar zxvf webbase-1M.tar.gz
fi
echo Successfully downloaded the matrix.

--------------------------------------------------------------------------------
/nsparse/Makefile:
--------------------------------------------------------------------------------
# Builds one executable per .cu file found in $(SRC).
# Each source is compiled with -DDOUBLE into $(OBJ)/<name>.d.o and linked into
# $(BIN)/<name>_d. Matching -DFLOAT rules (.s.o / _s) are provided, but the
# default `spgemm` target only requests the double-precision binaries.

CXX = nvcc
NVCC = nvcc

#CFLAGS = -O3 -g
#CFLAGS = -g -lineinfo
CFLAGS = -O3
CFLAGS += -L. ${REAL} -lm
LDFLAGS = ${CFLAGS}

# for Device Code
CUDA_PATH = /usr/local/cuda
LDFLAGS += -L${CUDA_PATH}/lib64
LDFLAGS += -arch=sm_70 -lcudart -lcusparse
INCLUDE = -I./inc
INCLUDE += -I${CUDA_PATH}/include
INCLUDE += -I${CUDA_PATH}/samples/common/inc

BIN = ./bin
SRC = ./
OBJ = ./obj
INC = ./inc

OBJ_SUF = .o
OS_SUF = .s.o
OD_SUF = .d.o
TS_SUF = _s
TD_SUF = _d


SRC_SPGEMM = $(SRC)
SAMPLE_SPGEMM = $(wildcard $(SRC_SPGEMM)/*.cu)
SAMPLE_SPGEMM_TARGET = $(SAMPLE_SPGEMM:$(SRC)%=$(BIN)%)

# none of these names is a file; without .PHONY a stray file called `all`,
# `spgemm` or `clean` would silently disable the target
.PHONY : all spgemm clean

# depend on spgemm directly instead of spawning a hard-coded `make`, so the
# invoking make's flags (-j, -n, ...) apply
all : spgemm

spgemm: $(SAMPLE_SPGEMM_TARGET:.cu=$(TD_SUF))

$(BIN)/%$(TS_SUF): $(OBJ)/%$(OS_SUF)
	mkdir -p $(dir $@)
	$(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE)

$(BIN)/%$(TD_SUF): $(OBJ)/%$(OD_SUF)
	mkdir -p $(dir $@)
	$(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE)

# LDFLAGS also carries -O3 and -arch (it starts out as a copy of CFLAGS), which
# is why it is passed on the compile lines as well
$(OBJ)/%$(OS_SUF) : $(SRC)/%.cu
	mkdir -p $(dir $@)
	$(NVCC) -c -DFLOAT $(LDFLAGS) $(INCLUDE) -o $@ $<

$(OBJ)/%$(OD_SUF) : $(SRC)/%.cu
	mkdir -p $(dir $@)
	$(NVCC) -c -DDOUBLE $(LDFLAGS) $(INCLUDE) -o $@ $<

$(OBJ)/%$(OS_SUF) : $(SRC)/%.cpp
	mkdir -p $(dir $@)
	$(NVCC) -c -DFLOAT $(LDFLAGS) $(INCLUDE) -o $@ $<

$(OBJ)/%$(OD_SUF) : $(SRC)/%.cpp
	mkdir -p $(dir $@)
	$(NVCC) -c -DDOUBLE $(LDFLAGS) $(INCLUDE) -o $@ $<

clean :
	rm -rf $(BIN)/*
	rm -rf $(OBJ)/*
--------------------------------------------------------------------------------
/nsparse/inc/CSR.hpp:
--------------------------------------------------------------------------------
// NOTE(review): the system header names that stood here were lost when this
// file was extracted (only bare `#include` tokens survived). The headers below
// are the ones the visible code needs (cout, string, fopen/fgets/sscanf,
// strstr/strchr, atoi/atof/exit) -- confirm against the repository.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

using namespace std;

#ifndef CSR_H
#define CSR_H
/*
 * Sparse matrix in CSR form with a host copy (rpt / colids / values) and a
 * device copy (d_rpt / d_colids / d_values).
 *
 * NOTE(review): the template parameter lists in this file were stripped by
 * the extraction; <idType, valType> is rebuilt from the names used in the
 * body -- confirm against the repository.
 */
template <class idType, class valType>
class CSR
{
public:
    /* All pointers start as NULL so that the NULL test in operator== and the
     * delete[] calls in release_cpu_csr() are well defined on a fresh object. */
    CSR(): rpt(NULL), colids(NULL), values(NULL),
           d_rpt(NULL), d_colids(NULL), d_values(NULL),
           nrow(0), ncolumn(0), nnz(0), host_malloc(false), device_malloc(false)
    {
    }
    ~CSR()
    {
    }
    /* Frees the host arrays. The pointers are reset afterwards so a second
     * call is harmless. */
    void release_cpu_csr()
    {
        delete[] rpt;
        delete[] colids;
        delete[] values;
        rpt = NULL;
        colids = NULL;
        values = NULL;
    }
    /* Frees the device arrays if device_malloc is set. */
    void release_csr()
    {
        if (device_malloc) {
            cudaFree(d_rpt);
            cudaFree(d_colids);
            cudaFree(d_values);
            d_rpt = NULL;
            d_colids = NULL;
            d_values = NULL;
        }
        device_malloc = false;
    }
    /*
     * Compares the host copies of two matrices: dimensions, nnz, rpt and
     * colids must match exactly; a value mismatches when
     * |values[i] - mat.values[i]| * scale * 100 > |values[i]|, with
     * scale = 1000 (1000 * 1000 when valType has the size of double).
     * Up to 10 mismatching values are printed. The result is false as soon as
     * one value mismatches (previously 1 to 9 mismatches still returned true).
     */
    bool operator==(CSR mat)
    {
        bool f = false;
        if (nrow != mat.nrow) {
            cout << "Number of row is not correct: " << nrow << ", " << mat.nrow << endl;
            return f;
        }
        if (ncolumn != mat.ncolumn) {
            cout << "Number of column is not correct" << ncolumn << ", " << mat.ncolumn << endl;
            return f;
        }
        if (nnz != mat.nnz) {
            cout << "Number of nz is not correct" << nnz << ", " << mat.nnz << endl;
            return f;
        }
        if (rpt == NULL || mat.rpt == NULL || colids == NULL || mat.colids == NULL || values == NULL || mat.values == NULL) {
            cout << "NULL Pointer" << endl;
            return f;
        }
        for (idType i = 0; i < nrow + 1; ++i) {
            if (rpt[i] != mat.rpt[i]) {
                cout << "rpt[" << i << "] is not correct" << endl;
                return f;
            }
        }
        for (idType i = 0; i < nnz; ++i) {
            if (colids[i] != mat.colids[i]) {
                cout << "colids[" << i << "] is not correct" << endl;
                return f;
            }
        }
        const idType max_report = 10;
        idType total_fail = max_report;
        valType delta, base, scale;
        for (idType i = 0; i < nnz; ++i) {
            delta = values[i] - mat.values[i];
            base = values[i];
            if (delta < 0) {
                delta *= -1;
            }
            if (base < 0) {
                base *= -1;
            }
            scale = 1000;
            if (sizeof(valType) == sizeof(double)) {
                scale *= 1000;
            }
            if (delta * scale * 100 > base) {
                cout << i << ": " << values[i] << ", " << mat.values[i] << endl;
                total_fail--;
            }
            if (total_fail == 0) {
                cout << "values[" << i << "] is not correct" << endl;
                return f;
            }
        }
        /* any mismatch below the report limit is still a mismatch */
        f = (total_fail == max_report);
        return f;
    }

    void init_data_from_mtx(string file_path);
    /* Copies the host arrays to the device, allocating the device arrays on
     * the first call (device_malloc == false). */
    void memcpyHtD()
    {
        if (!device_malloc) {
            //cout << "Allocating memory space for matrix data on device memory" << endl;
            cudaMalloc((void **)&d_rpt, sizeof(idType) * (nrow + 1));
            cudaMalloc((void **)&d_colids, sizeof(idType) * nnz);
            cudaMalloc((void **)&d_values, sizeof(valType) * nnz);
        }
        //cout << "Copying matrix data to GPU device" << endl;
        cudaMemcpy(d_rpt, rpt, sizeof(idType) * (nrow + 1), cudaMemcpyHostToDevice);
        cudaMemcpy(d_colids, colids, sizeof(idType) * nnz, cudaMemcpyHostToDevice);
        cudaMemcpy(d_values, values, sizeof(valType) * nnz, cudaMemcpyHostToDevice);
        device_malloc = true;
    }
    /* Allocates fresh host arrays sized from nrow / nnz and copies the device
     * arrays into them. Host arrays that were set before are not freed here. */
    void memcpyDtH()
    {
        rpt = new idType[nrow + 1];
        colids = new idType[nnz];
        values = new valType[nnz];
        //cout << "Matrix data is copied to Host" << endl;
        cudaMemcpy(rpt, d_rpt, sizeof(idType) * (nrow + 1), cudaMemcpyDeviceToHost);
        cudaMemcpy(colids, d_colids, sizeof(idType) * nnz, cudaMemcpyDeviceToHost);
        cudaMemcpy(values, d_values, sizeof(valType) * nnz, cudaMemcpyDeviceToHost);
    }

    void spmv_cpu(valType *x, valType *y);

    idType *rpt;
    idType *colids;
    valType *values;
    idType *d_rpt;
    idType *d_colids;
    valType *d_values;
    idType nrow;
    idType ncolumn;
    idType nnz;
    bool host_malloc;
    bool device_malloc;
};

/*
 * Loads a Matrix Market coordinate file into the host CSR arrays.
 *
 * - When the first line does not contain the word "general", every
 *   off-diagonal entry (i, j) is also stored as (j, i).
 * - An entry line without a value field gets the value 1.0.
 * - Indices in the file are 1-based and are stored 0-based.
 *
 * Terminates the process with exit(1) when the file cannot be opened, the
 * header cannot be read, or an entry index lies outside the declared size.
 * Lines without a space separator are skipped, and at most the declared
 * number of entries is read, so a malformed file can no longer overrun the
 * temporary buffers.
 */
template <class idType, class valType>
void CSR<idType, valType>::init_data_from_mtx(string file_path)
{
    idType i, num;
    bool isUnsy;
    char *line, *ch;
    FILE *fp;
    idType *col_coo, *row_coo, *nnz_num, *each_row_index;
    valType *val_coo;
    const int LINE_LENGTH_MAX = 256;

    device_malloc = false;

    isUnsy = false;
    line = new char[LINE_LENGTH_MAX];

    /* Open File */
    fp = fopen(file_path.c_str(), "r");
    if (fp == NULL) {
        cout << "Cannot find file" << endl;
        exit(1);
    }

    /* First line: decides whether entries have to be mirrored */
    if (fgets(line, LINE_LENGTH_MAX, fp) == NULL) {
        cout << "Cannot read matrix header" << endl;
        exit(1);
    }
    if (strstr(line, "general")) {
        isUnsy = true;
    }
    /* Skip '%' comment lines; stop on EOF instead of spinning forever */
    do {
        if (fgets(line, LINE_LENGTH_MAX, fp) == NULL) {
            cout << "Cannot read matrix header" << endl;
            exit(1);
        }
    } while(line[0] == '%');

    /* Get size info; parsed as int first because idType need not be int */
    int file_rows = 0, file_cols = 0, file_nnz = 0;
    if (sscanf(line, "%d %d %d", &file_rows, &file_cols, &file_nnz) != 3
        || file_rows < 0 || file_cols < 0 || file_nnz < 0) {
        cout << "Cannot read matrix header" << endl;
        exit(1);
    }
    nrow = (idType)file_rows;
    ncolumn = (idType)file_cols;
    const idType declared_nnz = (idType)file_nnz;

    /* Store in COO format */
    num = 0;
    col_coo = new idType[declared_nnz];
    row_coo = new idType[declared_nnz];
    val_coo = new valType[declared_nnz];

    while (num < declared_nnz && fgets(line, LINE_LENGTH_MAX, fp)) {
        ch = line;
        /* Read first word (row id)*/
        const int row_id = atoi(ch) - 1;
        ch = strchr(ch, ' ');
        if (ch == NULL) {
            /* blank or malformed line: nothing to read */
            continue;
        }
        ch++;
        /* Read second word (column id)*/
        const int col_id = atoi(ch) - 1;
        ch = strchr(ch, ' ');

        /* The mirrored entry is stored in row col_id, so that must be a valid row too */
        if (row_id < 0 || row_id >= file_rows || col_id < 0 || col_id >= file_cols
            || (!isUnsy && (col_id >= file_rows || row_id >= file_cols))) {
            cout << "Matrix entry index is out of range" << endl;
            exit(1);
        }
        row_coo[num] = (idType)row_id;
        col_coo[num] = (idType)col_id;

        if (ch != NULL) {
            ch++;
            /* Read third word (value data)*/
            val_coo[num] = (valType)atof(ch);
        }
        else {
            val_coo[num] = 1.0;
        }
        num++;
    }
    fclose(fp);
    delete[] line;

    /* Count the number of non-zero in each row.
     * nnz starts from the number of entries actually read, so a file that is
     * shorter than its header claims does not leave unfilled slots. */
    nnz = num;
    nnz_num = new idType[nrow];
    for (i = 0; i < nrow; i++) {
        nnz_num[i] = 0;
    }
    for (i = 0; i < num; i++) {
        nnz_num[row_coo[i]]++;
        if(col_coo[i] != row_coo[i] && isUnsy == false) {
            nnz_num[col_coo[i]]++;
            nnz++;
        }
    }

    /* Allocation of rpt, col, val */
    rpt = new idType[nrow + 1];
    colids = new idType[nnz];
    values = new valType[nnz];

    rpt[0] = 0;
    for (i = 0; i < nrow; i++) {
        rpt[i + 1] = rpt[i] + nnz_num[i];
    }

    /* each_row_index[r] = number of entries already placed in row r */
    each_row_index = new idType[nrow];
    for (i = 0; i < nrow; i++) {
        each_row_index[i] = 0;
    }

    for (i = 0; i < num; i++) {
        colids[rpt[row_coo[i]] + each_row_index[row_coo[i]]] = col_coo[i];
        values[rpt[row_coo[i]] + each_row_index[row_coo[i]]++] = val_coo[i];

        if (col_coo[i] != row_coo[i] && isUnsy == false) {
            colids[rpt[col_coo[i]] + each_row_index[col_coo[i]]] = row_coo[i];
            values[rpt[col_coo[i]] + each_row_index[col_coo[i]]++] = val_coo[i];
        }
    }

    //cout << "Row: " << nrow << ", Column: " << ncolumn << ", Nnz: " << nnz << endl;

    delete[] nnz_num;
    delete[] row_coo;
    delete[] col_coo;
    delete[] val_coo;
    delete[] each_row_index;

}

/*
 * Host reference SpMV: y[i] = sum over the entries of row i of
 * values[k] * x[colids[k]]. x is indexed by column id, y by row id.
 */
template <class idType, class valType>
void CSR<idType, valType>::spmv_cpu(valType *x, valType *y)
{
    idType i, j;
    valType ans;

    for (i = 0; i < nrow; ++i) {
        ans = 0;
        for (j = 0; j < (rpt[i + 1] - rpt[i]); j++) {
            ans += values[rpt[i] + j] * x[colids[rpt[i] + j]];
        }
        y[i] = ans;
    }
}

#endif

--------------------------------------------------------------------------------
/nsparse/inc/Plan.hpp:
--------------------------------------------------------------------------------
// NOTE(review): the header name was lost in extraction; <climits> provides
// the INT_MAX used below -- confirm against the repository.
#include <climits>

// NOTE(review): template parameter list rebuilt from the body (only idType is
// used) -- confirm against the repository.
template <class idType>
class Plan
{
public:
    Plan(): isPlan(false), seg_size(1), block_size(1), memory_access(INT_MAX), min_msec(sfFLT_MAX)
    {
    }
    /* memory_access and min_msec get the same initial values as in the
     * default constructor; they used to be left uninitialized here. */
    Plan(idType segment, idType block): isPlan(true), memory_access(INT_MAX), min_msec(sfFLT_MAX)
    {
        seg_size = segment;
        if (seg_size > USHORT_MAX) {
            seg_size = USHORT_MAX;
        }
        block_size = block;
        if (block_size < 1 || block_size > MAX_BLOCK_SIZE) {
            block_size = 1;
        }
    }
    ~Plan()
    {
    }
    void set_plan(idType s_size, idType b_size)
    {
        seg_size = s_size;
        block_size = b_size;
        isPlan = true;
    }

    idType thread_grid;
    idType thread_block;
    bool isPlan;
    idType SIGMA;
    idType
seg_size;
    idType seg_num;
    idType block_size;
    idType memory_access;
    float min_msec;
};


--------------------------------------------------------------------------------
/nsparse/inc/Timing.hpp:
--------------------------------------------------------------------------------
#ifndef __Z_TIMING_H__
#define __Z_TIMING_H__
// NOTE(review): the header name was lost in extraction; <cstdio> provides the
// printf used below -- confirm against the repository.
#include <cstdio>

/*
 * Per-phase wall-clock times of one SpGEMM run, with helpers to accumulate,
 * average and print them. The printing code multiplies the stored values by
 * 1000 and labels the result "ms".
 *
 * All member functions are defined in this header and are therefore marked
 * inline; without it, including the header from two translation units gives
 * multiple-definition link errors.
 */
class Timing {
    public:
    bool measure_separate;
    bool measure_total;
    double setup;
    double symbolic_binning;
    double symbolic;
    double numeric_binning;
    double prefix;
    double allocate;
    double numeric;
    double cleanup;
    double total;
    Timing();

    void operator+=(const Timing& b);

    void operator/=(const double x);
    void print(const double total_flop);
    void reg_print(const double total_flop);
    void binning_print(const double total_flop);
};

/* Both measure_* flags start as true, every time field as 0. */
inline Timing::Timing(){
    measure_separate = true;
    measure_total = true;
    setup = 0;
    symbolic_binning = 0;
    symbolic = 0;
    numeric_binning = 0;
    prefix = 0;
    allocate = 0;
    numeric = 0;
    cleanup = 0;
    total = 0;
}

/* Adds every time field of b to this object; the flags are not touched. */
inline void Timing::operator+=(const Timing& b){
    setup += b.setup;
    symbolic_binning += b.symbolic_binning;
    symbolic += b.symbolic;
    numeric_binning += b.numeric_binning;
    prefix += b.prefix;
    allocate += b.allocate;
    numeric += b.numeric;
    cleanup += b.cleanup;
    total += b.total;
}

/* Divides every time field by x (used to average accumulated runs). */
inline void Timing::operator/=(const double x){
    setup /= x;
    symbolic_binning /= x;
    symbolic /= x;
    numeric_binning /= x;
    prefix /= x;
    allocate /= x;
    numeric /= x;
    cleanup /= x;
    total /= x;
}

/*
 * Prints total_flop and, when measure_separate is set, each phase's time,
 * its share of `total`, and total_flop / 1e9 divided by that phase's time.
 */
inline void Timing::print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("total flop %lf\n", total_flop);
    if(measure_separate){
        //printf("time(ms): setup %.3lf symbolic_binning %.3lf symbolic %.3lf numeric_binning %.3lf prefix_allocate %.3lf numeric %.3lf cleanup %.3lf total %.3lf",)
        printf("time(ms):\n");
        printf(" setup %8.3lfms %6.2lf%%\n", 1000*setup, setup/total*100);
        printf("\e[1;31m symbolic_binning %8.3lfms %6.2lf%%\n\e[0m", 1000*symbolic_binning, symbolic_binning/total*100);
        printf("\e[1;31m symbolic %8.3lfms %6.2lf%%\n\e[0m", 1000*symbolic, symbolic/total*100);
        printf("\e[1;31m numeric_binning %8.3lfms %6.2lf%%\n\e[0m", 1000*numeric_binning, numeric_binning/total*100);
        printf(" prefix %8.3lfms %6.2lf%%\n", 1000*prefix, prefix/total*100);
        printf(" allocate %8.3lfms %6.2lf%%\n", 1000*allocate, allocate/total*100);
        printf("\e[1;31m numeric %8.3lfms %6.2lf%%\n\e[0m", 1000*numeric, numeric/total*100);
        printf(" cleanup %8.3lfms %6.2lf%%\n", 1000*cleanup, cleanup/total*100);
        printf(" total %8.3lfms %6.2lf%%\n", 1000*total, total/total*100);
        printf("perf(Gflops):\n");
        printf(" setup %6.2lf\n", total_flop_G/setup);
        printf(" symbolic_binning %6.2lf\n", total_flop_G/symbolic_binning);
        printf(" symbolic %6.2lf\n", total_flop_G/symbolic);
        printf(" numeric_binning %6.2lf\n", total_flop_G/numeric_binning);
        printf(" prefix %6.2lf\n", total_flop_G/prefix);
        printf(" allocate %6.2lf\n", total_flop_G/allocate);
        printf(" numeric %6.2lf\n", total_flop_G/numeric);
        printf(" cleanup %6.2lf\n", total_flop_G/cleanup);
        printf(" total %6.2lf\n", total_flop_G/total);
    }
}

/* Prints only total_flop / 1e9 divided by `total`. */
inline void Timing::reg_print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("%6.2lf\n", total_flop_G/total);
}

/*
 * Prints symbolic_binning + numeric_binning and its share of `total` in
 * percent. total_flop is not needed for either figure; the parameter is kept
 * so the signature matches the other *_print members.
 */
inline void Timing::binning_print(double total_flop){
    (void)total_flop;   // intentionally unused (the original computed a dead local from it)
    double total_binning_time = symbolic_binning + numeric_binning;
    printf("%.4le %.4lf\n", total_binning_time, 100*total_binning_time/total);
}

#endif

--------------------------------------------------------------------------------
/nsparse/inc/cuda_common.h:
--------------------------------------------------------------------------------
#ifndef _Z_COMMON_
#define _Z_COMMON_

// NOTE(review): the seven header names that stood here were lost when this
// file was extracted (only bare `#include` tokens survived). The headers below
// are the ones the visible code needs -- confirm against the repository.
#include <cstdio>
#include <chrono>
#include <exception>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <cusparse.h>

/* !!(x) normalizes any scalar (pointer, wide integer) to 0/1 before it is
 * handed to __builtin_expect, which takes a long */
#define likely(x) __builtin_expect(!!(x),1)
#define unlikely(x) __builtin_expect(!!(x),0)

/* Prints the CUDA error string with file/line and throws std::exception when
 * err is not cudaSuccess. */
inline static void checkCUDA(cudaError_t err,
        const char *file,
        int line)
{
    if (unlikely(err != cudaSuccess))
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err),
                file, line);
        throw std::exception();
    }
}
// #ifdef _DEBUG || NDEBUG || DEBUG
#define CHECK_CUDA(err) (checkCUDA(err, __FILE__, __LINE__))
#define CHECK_ERROR(err) (checkCUDA(err, __FILE__, __LINE__))

/* Prints errorMsg and throws std::exception when status is not
 * CUSPARSE_STATUS_SUCCESS. */
inline void CHECK_CUSPARSE(cusparseStatus_t status, std::string errorMsg="")
{
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cout << "CuSparse error: " << errorMsg << std::endl;
        throw std::exception();
    }
}

/* Stores the 64-bit value read by `rdtsc` (after an `lfence`) into Var. */
#define HP_TIMING_NOW(Var) \
 ({ unsigned int _hi, _lo; \
  asm volatile ("lfence\n\trdtsc" : "=a" (_lo), "=d" (_hi)); \
  (Var) = ((unsigned long long int) _hi << 32) | _lo; })

/* precision is 1 clock cycle.
 * execute time is roughly 50 or 140 cycles depends on cpu family */

/* Executes CPUID for leaf `eax` / sub-leaf `ecx` and stores EAX, EBX, ECX,
 * EDX in info[0..3]. The sub-leaf is now actually loaded into ECX; it used to
 * be ignored, so ECX held whatever the register contained. */
inline void cpuid(int *info, int eax, int ecx = 0){
    int ax, bx, cx, dx;
    __asm__ __volatile__ ("cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (eax), "c" (ecx));

    info[0] = ax;
    info[1] = bx;
    info[2] = cx;
    info[3] = dx;
}

/*
 * Returns the frequency in Hz used to convert the counter read by
 * HP_TIMING_NOW into seconds, or 0 when it cannot be determined.
 * The value is CPUID leaf 0x16 EAX (bits 15:0) times 1e6, i.e. the processor
 * base frequency reported in MHz. The leaf is only queried when CPUID leaf 0
 * reports it as supported; the probe runs once.
 */
inline long get_tsc_freq(){
    static long freq = 0;
    static bool probed = false;
    if(unlikely(!probed)){
        probed = true;
        int raw[4];
        cpuid(raw, 0);      // raw[0] = highest supported basic leaf
        if(raw[0] >= 0x16){
            cpuid(raw, 0x16); // get cpu freq
            freq = long(raw[0] & 0xffff) * 1000000;
        }
        //printf("static first call %f\n", freq);
    }
    return freq;
}

/*
 * Returns a timestamp in seconds; only differences between two calls are
 * meaningful. Uses the time-stamp counter divided by get_tsc_freq(); when the
 * frequency is unknown (0) it falls back to std::chrono::steady_clock instead
 * of dividing by zero.
 */
inline double fast_clock_time(){
    const long freq = get_tsc_freq();
    if(unlikely(freq <= 0)){
        return std::chrono::duration<double>(
                std::chrono::steady_clock::now().time_since_epoch()).count();
    }
    unsigned long long counter;
    HP_TIMING_NOW(counter);
    return double(counter)/freq;
}

/* Blocking copies; `size` is a byte count that is passed to cudaMemcpy as is.
 * NOTE(review): the template parameter lists were stripped by the extraction
 * and are rebuilt from the signatures -- confirm against the repository. */
template <typename T>
inline void D2H(T *dst, const T* src, size_t size){
    CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
}

template <typename T>
inline void H2D(T *dst, const T* src, size_t size){
    CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
}

template <typename T>
inline void D2D(T *dst, const T* src, size_t size){
    CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
}


#endif

--------------------------------------------------------------------------------
/nsparse/inc/nsparse.hpp:
--------------------------------------------------------------------------------
// NOTE(review): the header name that stood here was lost in extraction; the
// headers below are the ones the visible code needs (printf, srand48/drand48,
// time, typeid) -- confirm against the repository.
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <typeinfo>

#ifndef NSPARSE_H
#define NSPARSE_H

/* Ceiling division. The arguments are parenthesized so that expressions such
 * as div_round_up(n + 1, 32) expand correctly; each argument is still
 * evaluated more than once, so do not pass expressions with side effects. */
#define div_round_up(a, b) ((((a) % (b)) == 0)? (a) / (b) : (a) / (b) + 1)

/* Hardware Specific Parameters */
#define warp_BIT 5
#define warp 32
#define MAX_LOCAL_THREAD_NUM 1024
#define MAX_THREAD_BLOCK (MAX_LOCAL_THREAD_NUM / warp)

/* Number of SpMV Execution for Evaluation or Test */
//#define TRI_NUM 101
#define TEST_NUM 2
#define SpGEMM_TRI_NUM 10

/* Power-of-two related constants.
 * SHORT_MAX / USHORT_MAX are 2^15 / 2^16, i.e. one past the largest value of
 * a 16-bit signed / unsigned integer. */
#define sfFLT_MAX 1000000000
#define SHORT_MAX 32768
#define SHORT_MAX_BIT 15
#define USHORT_MAX 65536
#define USHORT_MAX_BIT 16

#define SCL_BORDER 16
#define SCL_BIT ((1 << SCL_BORDER) - 1)

#define MAX_BLOCK_SIZE 20

/* Check the answer */
//#define sfDEBUG

/* Structure of Formats*/
/* Initializing vector: fills x[0..row) with drand48() values. The generator
 * is re-seeded from time(NULL) on every call.
 * NOTE(review): template parameter lists in this file were stripped by the
 * extraction and are rebuilt from the signatures -- confirm. */
template <class valType>
void init_vector(valType *x, int row)
{
    int i;

    srand48((unsigned)time(NULL));

    for (i = 0; i < row; i++) {
        x[i] = drand48();
    }
}

/* Compare the vectors: element i mismatches when
 * |ans_vec[i] - csr_ans[i]| * 100 * scale > |ans_vec[i]|, with scale = 1000
 * for float and 1000 * 1000 otherwise. Prints up to 10 mismatches and then a
 * one-line verdict. */
template <class idType, class valType>
void check_answer(valType *csr_ans, valType *ans_vec, idType nrow)
{
    idType i;
    int total_fail = 10;
    valType delta, base;
    valType scale;
    if (typeid(valType) == typeid(float)) {
        scale = 1000;
    }
    else {
        scale = 1000 * 1000;
    }

    for (i = 0; i < nrow; i++) {
        delta = ans_vec[i] - csr_ans[i];
        base = ans_vec[i];

        if (delta < 0) {
            delta *= -1;
        }
        if (base < 0) {
            base *= -1;
        }
        if (delta * 100 * scale > base) {
            /* explicit casts: %d / %e expect int / double whatever the
             * template types are */
            printf("i=%d, ans=%e, csr=%e, delta=%e\n", (int)i, (double)ans_vec[i], (double)csr_ans[i], (double)delta);
            total_fail--;
            if(total_fail == 0)
                break;
        }
    }
    if (total_fail != 10){
        printf("Calculation Result is Incorrect\n");
    }
    else {
        printf("Calculation Result is Correct\n");
    }
}

#endif

/*
 * Release MemObjects of Each Format structure
 */
/* void release_cpu_amb(sfAMB mat); */
/* void
release_amb(sfAMB mat); */

/*
 * Converting matrix to AMB format
 */
/* void init_plan(sfPlan *plan); */
/* void set_plan(sfPlan *plan, size_t seg_size, int block_size); */
/* void sf_csr2amb(sfAMB *mat, sfCSR *csr_mat, real *d_x, sfPlan *plan); */

/*
 * SpMV Kernel
 */
/* void csr_ans_check(real *val, int *col, int *rpt, real *rhs_vec, real *csr_ans, int N); */
/* void sf_spmv_amb(real *d_y, sfAMB *mat, real *d_x, sfPlan *plan); */


--------------------------------------------------------------------------------
/nsparse/inc/nsparse_asm.hpp:
--------------------------------------------------------------------------------
/*
 * Inline PTX
 */
#ifndef NSPARSE_ASM_H
#define NSPARSE_ASM_H

/* Loads through `ld.global.cv`, one overload per element type. */
__device__ __inline__ float ld_gbl_val(const float *val)
{
    float return_value;
    asm("ld.global.cv.f32 %0, [%1];" : "=f"(return_value) : "l"(val));
    return return_value;
}

__device__ __inline__ double ld_gbl_val(const double *val)
{
    double return_value;
    asm("ld.global.cv.f64 %0, [%1];" : "=d"(return_value) : "l"(val));
    return return_value;
}

__device__ __inline__ int ld_gbl_col(const int *col)
{
    int return_value;
    asm("ld.global.cv.s32 %0, [%1];" : "=r"(return_value) : "l"(col));
    return return_value;
}

__device__ __inline__ short ld_gbl_col(const short *col)
{
    short return_value;
    asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
    return return_value;
}

__device__ __inline__ unsigned short ld_gbl_col(const unsigned short *col)
{
    unsigned short return_value;
    asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
    return return_value;
}

/* Stores through `st.global.cs`. The pointer parameter is declared const in
 * the existing interface even though the pointee is written. */
__device__ __inline__ void st_gbl_val(const float *ptr, float val)
{
    asm("st.global.cs.f32 [%0], %1;" :: "l"(ptr) , "f"(val));

}

__device__ __inline__ void st_gbl_val(const double *ptr, double val)
{
    asm("st.global.cs.f64 [%0], %1;" :: "l"(ptr) , "d"(val));
}

/*
 * Multiply and Add
 * NOTE(review): the template parameter lists in this file were stripped by
 * the extraction and are rebuilt from the bodies -- confirm against the
 * repository.
 */
template <class T>
class Add
{
public:
    __device__ __inline__ T operator()(T a, T b)
    {
        return a + b;
    }
};

template <class T>
class Multiply
{
public:
    __device__ __inline__ T operator()(T a, T b)
    {
        return a * b;
    }
};

/* Functor form of atomicAdd: adds v to *a and returns the previous value. */
template <class T>
class AtomicAdd
{
public:
    __device__ T operator()(T* a, T v);
};

template <>
__device__ __inline__ float AtomicAdd<float>::operator()(float *a, float v)
{
    return atomicAdd(a, v);
}

template <>
__device__ __inline__ double AtomicAdd<double>::operator()(double *a, double v)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(a, v);
#else
    /* compare-and-swap loop for targets without a native double atomicAdd */
    unsigned long long int *a_ull = (unsigned long long int *)(a);
    unsigned long long int old = *a_ull;
    unsigned long long int assumed;
    do {
        assumed = old;
        old = atomicCAS(a_ull, assumed, __double_as_longlong(v + __longlong_as_double(assumed)));
    } while (assumed != old);
    /* `old` holds the bit pattern of the previous double; it must be
     * reinterpreted, not numerically converted, to give the previous value */
    return __longlong_as_double(old);
#endif
}

#endif

--------------------------------------------------------------------------------
/nsparse/nsparse.cu:
--------------------------------------------------------------------------------
// NOTE(review): the eleven header names that stood here were lost when this
// file was extracted (only bare `#include` tokens survived, grouped 3/1/3/4).
// The names below are a reconstruction from the identifiers this file uses
// and from the files listed under nsparse/inc -- confirm against the
// repository.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

#include <math.h>

#include <cuda.h>
#include <helper_cuda.h>
#include <cusparse_v2.h>

#include <nsparse.hpp>
#include <CSR.hpp>
#include <SpGEMM.hpp>
#include <HashSpGEMM_volta.hpp>
#include "Timing.hpp"

typedef int IT;
//#ifdef FLOAT
//typedef float VT;
//#else
//typedef double VT;
//#endif
typedef double VT;


/*
 * Runs one hash-based SpGEMM c = a * b on the device and stores the
 * wall-clock time of every phase in `timing`.
 *
 * a and b must already be on the device (d_rpt / d_colids / d_values are
 * read). On return c.nrow, c.ncolumn, c.nnz are set, c.d_rpt / c.d_colids /
 * c.d_values are allocated and c.device_malloc is true.
 * Every CUDA runtime call is now wrapped in CHECK_ERROR; the allocations and
 * the nnz copy used to ignore their status.
 *
 * NOTE(review): the template parameter lists and the BIN template arguments
 * were stripped by the extraction; <idType, BIN_NUM> is a reconstruction --
 * confirm against BIN.hpp.
 */
template <class idType, class valType>
void SpGEMM_Hash_Detail(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c, Timing& timing)
{
    double t0, t1;
    t0 = t1 = fast_clock_time();

    BIN<idType, BIN_NUM>* bin = new BIN<idType, BIN_NUM>(a.nrow);

    c.nrow = a.nrow;
    c.ncolumn = b.ncolumn;
    c.device_malloc = true;
    CHECK_ERROR(cudaMalloc((void **)&(c.d_rpt), sizeof(idType) * (c.nrow + 1)));
    timing.setup = fast_clock_time() - t0;

    t0 = fast_clock_time();
    bin->set_max_bin(a.d_rpt, a.d_colids, b.d_rpt, a.nrow, TS_S_P, TS_S_T);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic_binning = fast_clock_time() - t0;

    t0 = fast_clock_time();
    hash_symbolic(a, b, c, *bin);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.symbolic = fast_clock_time() - t0;

    /* exclusive scan of bin->d_count into c.d_rpt; the last entry of c.d_rpt
     * is then copied back as c.nnz (the blocking copy also ends the timed
     * region) */
    t0 = fast_clock_time();
    thrust::exclusive_scan(thrust::device, bin->d_count, bin->d_count + (a.nrow + 1), c.d_rpt, 0);
    CHECK_ERROR(cudaMemcpy(&(c.nnz), c.d_rpt + c.nrow, sizeof(idType), cudaMemcpyDeviceToHost));
    timing.prefix = fast_clock_time() - t0;

    t0 = fast_clock_time();
    CHECK_ERROR(cudaMalloc((void **)&(c.d_colids), sizeof(idType) * (c.nnz)));
    CHECK_ERROR(cudaMalloc((void **)&(c.d_values), sizeof(valType) * (c.nnz)));
    timing.allocate = fast_clock_time() - t0;

    t0 = fast_clock_time();
    bin->set_min_bin(a.nrow, TS_N_P, TS_N_T);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.numeric_binning = fast_clock_time() - t0;

    t0 = fast_clock_time();
    hash_numeric(a, b, c, *bin);
    CHECK_ERROR(cudaDeviceSynchronize());
    timing.numeric = fast_clock_time() - t0;

    t0 = fast_clock_time();
    delete bin;
    timing.cleanup = fast_clock_time() - t0;
    timing.total = fast_clock_time() - t1;

}


/*
 * Benchmark wrapper: uploads a and b, runs one warm-up multiplication and
 * SpGEMM_TRI_NUM timed ones, prints the averaged timings, copies the last
 * result to the host copy of c and frees all device arrays.
 * With sfDEBUG defined the result is additionally compared against cuSPARSE.
 */
template <class idType, class valType>
void run_spgemm(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c)
{

    /* Memcpy A and B from Host to Device */
    a.memcpyHtD();
    b.memcpyHtD();

    /* Count flop of SpGEMM computation */
    long long int flop_count;
    get_spgemm_flop(a, b, flop_count);

    /* Execution of SpGEMM on Device */
    Timing warmup_timing, bench_timing, timing;

    SpGEMM_Hash_Detail(a, b, c, warmup_timing);
    c.release_csr();

    for (int i = 0; i < SpGEMM_TRI_NUM; i++) {
        SpGEMM_Hash_Detail(a, b, c, bench_timing);
        /* the result of the last run is kept for the copy-back below */
        if (i < SpGEMM_TRI_NUM - 1) {
            c.release_csr();
        }
        timing += bench_timing;
    }
    timing /= SpGEMM_TRI_NUM;
    timing.print(flop_count);


    c.memcpyDtH();
    c.release_csr();

#ifdef sfDEBUG
    CSR<idType, valType> cusparse_c;
    SpGEMM_cuSPARSE(a, b, cusparse_c);
    cusparse_c.memcpyDtH();
    if (c == cusparse_c) {
        //cout << "HashSpGEMM is correctly executed" << endl;
        cout << "pass" << endl;
    }
    else{
        cout << "fail" << endl;
    }
    cout << "Nnz of A: " << a.nnz << endl;
    cout << "Number of intermediate products: " << flop_count / 2 << endl;
    cout << "Nnz of C: " << c.nnz << endl;
    cusparse_c.release_cpu_csr();
#endif

    a.release_csr();
    b.release_csr();

}

/*Main Function*/
int main(int argc, char *argv[])
{
    CSR<IT, VT> a, b, c;

    std::string mat1, mat2;
    mat1 = "can_24";
    mat2 = "can_24";
    if(argc == 2){
        mat1 = argv[1];
        mat2 = argv[1];
    }
    if(argc >= 3){
        mat1 = argv[1];
        mat2 = argv[2];
    }
    std::string mat1_file;
    if(mat1.find("ER") != std::string::npos){
        mat1_file = "../matrix/ER/" + mat1 +".mtx";
    }
    else if(mat1.find("G500") != std::string::npos){
        mat1_file = "../matrix/G500/" + mat1 +".mtx";
    }
    else{
        mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx";
    }
    std::string mat2_file;
    if(mat2.find("ER") != std::string::npos){
        mat2_file = "../matrix/ER/" + mat2 +".mtx";
    }
    else if(mat2.find("G500") != std::string::npos){
        mat2_file = "../matrix/G500/" + mat2 +".mtx";
    }
    else{
        mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx";
    }

    /* Set CSR reading from MM file or generating random
matrix */ 170 | //cout << "Initialize Matrix A" << endl; 171 | //cout << "Read matrix data from " << argv[1] << endl; 172 | a.init_data_from_mtx(mat1_file); 173 | 174 | //cout << "Initialize Matrix B" << endl; 175 | //cout << "Read matrix data from " << argv[1] << endl; 176 | b.init_data_from_mtx(mat2_file); 177 | 178 | /* Execution of SpGEMM on GPU */ 179 | printf("%s ", mat1.c_str()); 180 | run_spgemm(a, b, c); 181 | 182 | a.release_cpu_csr(); 183 | b.release_cpu_csr(); 184 | c.release_cpu_csr(); 185 | 186 | return 0; 187 | 188 | } 189 | 190 | -------------------------------------------------------------------------------- /nsparse/readme.md: -------------------------------------------------------------------------------- 1 | # Get started 2 | 1 Compile source code 3 | 4 | ``` $> make all``` 5 | 6 | 2 Execute nsparse 7 | 8 | 2.1 Profile nsparse 9 | 10 | ``` $> ./bin/nsparse_d webbase-1M ``` 11 | 12 | 2.2 Overall performance of nsparse 13 | 14 | ```$> ./bin/reg_nsparse_d webbase-1M ``` 15 | -------------------------------------------------------------------------------- /nsparse/reg_nsparse.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "Timing.hpp" 16 | 17 | typedef int IT; 18 | //#ifdef FLOAT 19 | //typedef float VT; 20 | //#else 21 | //typedef double VT; 22 | //#endif 23 | typedef double VT; 24 | 25 | 26 | template 27 | void SpGEMM_Hash_Detail(CSR& a, CSR& b, CSR &c, Timing& timing) 28 | { 29 | double t0, t1; 30 | t0 = t1 = fast_clock_time(); 31 | 32 | BIN* bin = new BIN(a.nrow); 33 | 34 | c.nrow = a.nrow; 35 | c.ncolumn = b.ncolumn; 36 | c.device_malloc = true; 37 | cudaMalloc((void **)&(c.d_rpt), sizeof(idType) * (c.nrow + 1)); 38 | timing.setup = fast_clock_time() - t0; 39 | 40 | t0 = fast_clock_time(); 41 | bin->set_max_bin(a.d_rpt, a.d_colids, 
b.d_rpt, a.nrow, TS_S_P, TS_S_T); 42 | CHECK_ERROR(cudaDeviceSynchronize()); 43 | timing.symbolic_binning = fast_clock_time() - t0; 44 | 45 | t0 = fast_clock_time(); 46 | hash_symbolic(a, b, c, *bin); 47 | CHECK_ERROR(cudaDeviceSynchronize()); 48 | timing.symbolic = fast_clock_time() - t0; 49 | 50 | t0 = fast_clock_time(); 51 | thrust::exclusive_scan(thrust::device, bin->d_count, bin->d_count + (a.nrow + 1), c.d_rpt, 0); 52 | cudaMemcpy(&(c.nnz), c.d_rpt + c.nrow, sizeof(idType), cudaMemcpyDeviceToHost); 53 | timing.prefix = fast_clock_time() - t0; 54 | 55 | t0 = fast_clock_time(); 56 | cudaMalloc((void **)&(c.d_colids), sizeof(idType) * (c.nnz)); 57 | cudaMalloc((void **)&(c.d_values), sizeof(valType) * (c.nnz)); 58 | timing.allocate = fast_clock_time() - t0; 59 | 60 | t0 = fast_clock_time(); 61 | bin->set_min_bin(a.nrow, TS_N_P, TS_N_T); 62 | CHECK_ERROR(cudaDeviceSynchronize()); 63 | timing.numeric_binning = fast_clock_time() - t0; 64 | 65 | t0 = fast_clock_time(); 66 | hash_numeric(a, b, c, *bin); 67 | CHECK_ERROR(cudaDeviceSynchronize()); 68 | timing.numeric = fast_clock_time() - t0; 69 | 70 | t0 = fast_clock_time(); 71 | delete bin; 72 | timing.cleanup = fast_clock_time() - t0; 73 | timing.total = fast_clock_time() - t1; 74 | 75 | } 76 | 77 | 78 | template 79 | void run_spgemm(CSR& a, CSR& b, CSR &c) 80 | { 81 | 82 | /* Memcpy A and B from Host to Device */ 83 | a.memcpyHtD(); 84 | b.memcpyHtD(); 85 | 86 | /* Count flop of SpGEMM computation */ 87 | long long int flop_count; 88 | get_spgemm_flop(a, b, flop_count); 89 | 90 | /* Execution of SpGEMM on Device */ 91 | Timing warmup_timing, bench_timing, timing; 92 | 93 | SpGEMM_Hash_Detail(a, b, c, warmup_timing); 94 | c.release_csr(); 95 | 96 | for (int i = 0; i < SpGEMM_TRI_NUM; i++) { 97 | SpGEMM_Hash_Detail(a, b, c, bench_timing); 98 | if (i < SpGEMM_TRI_NUM - 1) { 99 | c.release_csr(); 100 | } 101 | timing += bench_timing; 102 | } 103 | timing /= SpGEMM_TRI_NUM; 104 | //timing.print(flop_count); 105 | 
timing.reg_print(flop_count); 106 | 107 | 108 | c.memcpyDtH(); 109 | c.release_csr(); 110 | 111 | #ifdef sfDEBUG 112 | CSR cusparse_c; 113 | SpGEMM_cuSPARSE(a, b, cusparse_c); 114 | cusparse_c.memcpyDtH(); 115 | if (c == cusparse_c) { 116 | //cout << "HashSpGEMM is correctly executed" << endl; 117 | cout << "pass" << endl; 118 | } 119 | else{ 120 | cout << "fail" << endl; 121 | } 122 | cout << "Nnz of A: " << a.nnz << endl; 123 | cout << "Number of intermediate products: " << flop_count / 2 << endl; 124 | cout << "Nnz of C: " << c.nnz << endl; 125 | cusparse_c.release_cpu_csr(); 126 | #endif 127 | 128 | a.release_csr(); 129 | b.release_csr(); 130 | 131 | } 132 | 133 | /*Main Function*/ 134 | int main(int argc, char *argv[]) 135 | { 136 | CSR a, b, c; 137 | 138 | std::string mat1, mat2; 139 | mat1 = "can_24"; 140 | mat2 = "can_24"; 141 | if(argc == 2){ 142 | mat1 = argv[1]; 143 | mat2 = argv[1]; 144 | } 145 | if(argc >= 3){ 146 | mat1 = argv[1]; 147 | mat2 = argv[2]; 148 | } 149 | std::string mat1_file; 150 | if(mat1.find("ER") != std::string::npos){ 151 | mat1_file = "../matrix/ER/" + mat1 +".mtx"; 152 | } 153 | else if(mat1.find("G500") != std::string::npos){ 154 | mat1_file = "../matrix/G500/" + mat1 +".mtx"; 155 | } 156 | else{ 157 | mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx"; 158 | } 159 | std::string mat2_file; 160 | if(mat2.find("ER") != std::string::npos){ 161 | mat2_file = "../matrix/ER/" + mat2 +".mtx"; 162 | } 163 | else if(mat2.find("G500") != std::string::npos){ 164 | mat2_file = "../matrix/G500/" + mat2 +".mtx"; 165 | } 166 | else{ 167 | mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx"; 168 | } 169 | 170 | /* Set CSR reding from MM file or generating random matrix */ 171 | //cout << "Initialize Matrix A" << endl; 172 | //cout << "Read matrix data from " << argv[1] << endl; 173 | a.init_data_from_mtx(mat1_file); 174 | 175 | //cout << "Initialize Matrix B" << endl; 176 | //cout << "Read matrix data from " << 
argv[1] << endl; 177 | b.init_data_from_mtx(mat2_file); 178 | 179 | /* Execution of SpGEMM on GPU */ 180 | printf("%s ", mat1.c_str()); 181 | run_spgemm(a, b, c); 182 | 183 | a.release_cpu_csr(); 184 | b.release_cpu_csr(); 185 | c.release_cpu_csr(); 186 | 187 | return 0; 188 | 189 | } 190 | 191 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | The Source Code of OpSparse 2 | ======== 3 | 4 | This repository contains the source code of OpSparse, and part of the source code from [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/index.html), [nsparse](https://github.com/EBD-CREST/nsparse.git), and [spECK](https://github.com/GPUPeople/spECK.git). 5 | ## Tested environment 6 | CUDA 11.2, NVIDIA Tesla V100 GPU, Ubuntu 18.04 LTS 7 | 8 | ## Get started 9 | 1 Execute ```$> bash download_matrix.sh``` in the current directory to download the matrix webbase-1M into the matrix/suite_sparse directory 10 | 11 | 2 For detailed execution instructions, refer to the readme.md in the OpSparse, nsparse, and spECK sub-directories 12 | 13 | ## Bibtex 14 | ``` 15 | @ARTICLE{9851653, 16 | author={Du, Zhaoyang and Guan, Yijin and Guan, Tianchan and Niu, Dimin and Huang, Linyong and Zheng, Hongzhong and Xie, Yuan}, 17 | journal={IEEE Access}, 18 | title={OpSparse: A Highly Optimized Framework for Sparse General Matrix Multiplication on GPUs}, 19 | year={2022}, 20 | volume={10}, 21 | number={}, 22 | pages={85960-85974}, 23 | doi={10.1109/ACCESS.2022.3196940}} 24 | ``` 25 | -------------------------------------------------------------------------------- /spECK/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | NVCC = nvcc 3 | 4 | 5 | GENCODE = -arch=compute_70 -code=sm_70 6 | 7 | 8 | #CUDAFLAGS = $(GENCODE) -g -lineinfo 9 | #CUDAFLAGS = $(GENCODE) -g -G 10 | CUDAFLAGS = $(GENCODE) -O3 11 | 12 | CUDAFLAGS += -Xcompiler 
-fopenmp 13 | # cannot solve shared race problem, cannot compile with -G 14 | 15 | # for Device Code 16 | CUDA_PATH = /usr/local/cuda 17 | #LDFLAGS += -L${CUDA_PATH}/lib64 18 | #LDFLAGS += -lm -lcudart -lcusparse $(CUDAFLAGS) 19 | LDFLAGS += -lcusparse $(CUDAFLAGS) 20 | INCLUDE = -I./include -I./CUDATools -I./GPU 21 | INCLUDE += -I${CUDA_PATH}/include 22 | INCLUDE += -I${CUDA_PATH}/samples/common/inc 23 | 24 | BIN = ./bin 25 | SRC = ./source 26 | OBJ = ./obj 27 | INC = ./include 28 | 29 | OBJ_LIB = $(OBJ)/CSR.o $(OBJ)/Config.o $(OBJ)/COO.o $(OBJ)/DataLoader.o $(OBJ)/Executor.o $(OBJ)/RunConfig.o $(OBJ)/dCSR.o $(OBJ)/cuSparseMultiply.o 30 | OBJ_LIB2 = $(OBJ)/GPU/Compare.o $(OBJ)/GPU/Transpose.o $(OBJ)/GPU/memory.o $(OBJ)/GPU/Multiply.o 31 | #COMMON_DEP = cuda_common.h 32 | 33 | $(OBJ)/%.o : $(SRC)/%.cu $(INC)/%.h 34 | mkdir -p $(dir $@) 35 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 36 | 37 | $(OBJ)/%.o : $(SRC)/%.cu 38 | mkdir -p $(dir $@) 39 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 40 | 41 | $(OBJ)/%.o : $(SRC)/%.cpp 42 | mkdir -p $(dir $@) 43 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 44 | 45 | $(OBJ)/%.o : $(SRC)/%.cpp $(INC)/%.h 46 | mkdir -p $(dir $@) 47 | $(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $< 48 | 49 | speck : $(OBJ_LIB2) $(OBJ_LIB) $(OBJ)/runspECK.o 50 | $(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE) 51 | 52 | reg_speck : $(OBJ_LIB2) $(OBJ_LIB) $(OBJ)/reg_runspECK.o 53 | $(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE) 54 | 55 | all : speck reg_speck 56 | 57 | clean : 58 | rm -rf $(BIN)/* 59 | rm -rf $(OBJ)/* 60 | -------------------------------------------------------------------------------- /spECK/config.ini: -------------------------------------------------------------------------------- 1 | ; if the complete time should be measured. has only little impact on performance 2 | TrackCompleteTimes=true 3 | 4 | ; tracks and prints timings for all stages of spECK and all iterations. 
Has significant impact on performance 5 | ; TrackIndividualTimes=false 6 | TrackIndividualTimes=true 7 | 8 | ; compares C row lengths and column indices with CUSPARSE and prints an error if they do not match 9 | ; (we only compare indices, because values are not always the same, since spECK is not bit stable) 10 | ; no impact on measured performance, but can make overall execution much slower, because CUSPARSE can be very slow for some matrices 11 | CompareResult=false 12 | 13 | ; how many iterations should be run to raise GPU clock before measuring the time 14 | ; note that first iteration will be significantly slower, because of result matrix memory allocation 15 | IterationsWarmUp=1 16 | 17 | ; how many iterations are accumulated to calculate the mean execution time 18 | IterationsExecution=10 19 | 20 | ; enter a path to an input matrix here -> this overrides the matrix selected in the command line 21 | ; InputFile= 22 | -------------------------------------------------------------------------------- /spECK/include/COO.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Vector.h" 4 | 5 | #include 6 | 7 | 8 | template 9 | struct COO 10 | { 11 | size_t rows, cols, nnz; 12 | 13 | std::unique_ptr data; 14 | std::unique_ptr row_ids; 15 | std::unique_ptr col_ids; 16 | 17 | COO() : rows(0), cols(0), nnz(0) { } 18 | void alloc(size_t rows, size_t cols, size_t nnz); 19 | }; 20 | 21 | template 22 | COO loadMTX(const char* file); 23 | template 24 | COO loadCOO(const char* file); 25 | template 26 | void storeCOO(const COO& mat, const char* file); 27 | 28 | template 29 | void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose = false); 30 | -------------------------------------------------------------------------------- /spECK/include/CSR.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 
#include 7 | 8 | template 9 | struct COO; 10 | 11 | template 12 | struct DenseVector; 13 | 14 | template 15 | struct CSR 16 | { 17 | struct Statistics 18 | { 19 | double mean; 20 | double std_dev; 21 | size_t max; 22 | size_t min; 23 | }; 24 | 25 | void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) 26 | { 27 | // running variance by Welford 28 | size_t count = 0; 29 | mean = 0; 30 | double M2 = 0; 31 | max = 0; 32 | min = cols; 33 | for (size_t i = 0; i < rows; ++i) 34 | { 35 | size_t r_length = row_offsets[i + 1] - row_offsets[i]; 36 | min = std::min(min, r_length); 37 | max = std::max(max, r_length); 38 | ++count; 39 | double newValue = static_cast(r_length); 40 | double delta = newValue - mean; 41 | mean = mean + delta / count; 42 | double delta2 = newValue - mean; 43 | M2 = M2 + delta * delta2; 44 | } 45 | if (count < 2) 46 | std_dev = 0; 47 | else 48 | std_dev = sqrt(M2 / (count - 1)); 49 | } 50 | 51 | Statistics rowStatistics() 52 | { 53 | Statistics stats; 54 | computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min); 55 | return stats; 56 | } 57 | 58 | size_t rows, cols, nnz; 59 | 60 | //std::unique_ptr data; 61 | //std::unique_ptr row_offsets; 62 | //std::unique_ptr col_ids; 63 | T *data; 64 | int *row_offsets; 65 | int *col_ids; 66 | 67 | CSR() : rows(0), cols(0), nnz(0) { } 68 | void alloc(size_t rows, size_t cols, size_t nnz); 69 | 70 | CSR(const CSR& A, int rows, int cols, int row_start, int row_end); 71 | CSR& operator=(const CSR& A); 72 | // CSR& operator=(CSR other) 73 | // { 74 | // this->rows = other.rows; 75 | // this->cols = other.cols; 76 | // this->nnz = other.nnz; 77 | // this->data = std::move(other.data); 78 | // this->row_offsets = std::move(other.row_offsets); 79 | // this->col_ids = std::move(other.col_ids); 80 | // return *this; 81 | // } 82 | 83 | // CSR(const CSR& other) 84 | // { 85 | // this->rows = other.rows; 86 | // this->cols = other.cols; 87 | // this->nnz = other.nnz; 88 | // this->data = 
std::make_unique(other.nnz); 89 | // memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz); 90 | // this->col_ids = std::make_unique(other.nnz); 91 | // memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz); 92 | // this->row_offsets = std::make_unique(other.rows + 1); 93 | // memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1)); 94 | // } 95 | 96 | }; 97 | 98 | 99 | template 100 | CSR loadCSR(const char* file); 101 | template 102 | void storeCSR(const CSR& mat, const char* file); 103 | 104 | template 105 | void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose = false); 106 | 107 | template 108 | void convert(CSR& res, const COO& coo); 109 | -------------------------------------------------------------------------------- /spECK/include/CUDATools/event.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #ifndef INCLUDED_CUDA_EVENT 5 | #define INCLUDED_CUDA_EVENT 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | #include 12 | 13 | 14 | namespace CU 15 | { 16 | struct EventDestroyDeleter 17 | { 18 | void operator ()(CUevent event) const 19 | { 20 | cuEventDestroy(event); 21 | } 22 | }; 23 | 24 | using unique_event = unique_handle; 25 | 26 | unique_event createEvent(unsigned int flags = CU_EVENT_DEFAULT); 27 | } 28 | 29 | #endif // INCLUDED_CUDA_EVENT 30 | -------------------------------------------------------------------------------- /spECK/include/CUDATools/memory.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #ifndef INCLUDED_CUDA_MEMORY 5 | #define INCLUDED_CUDA_MEMORY 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | 16 | namespace CU 17 | { 18 | struct MemFreeDeleter 19 | { 20 | void operator ()(CUdeviceptr ptr) const 21 | { 22 | cudaFree(reinterpret_cast(ptr)); 23 | } 24 | }; 25 | 26 | using unique_ptr = unique_handle; 
27 | 28 | /* Move-only pairing of a unique_ptr handle with the pitch value stored alongside it. */ 29 | struct pitched_memory 30 | { 31 | pitched_memory(const pitched_memory&) = delete; 32 | pitched_memory& operator =(const pitched_memory&) = delete; 33 | 34 | unique_ptr memory; 35 | std::size_t pitch; 36 | 37 | pitched_memory() : pitch(0) {} /* default state: default-constructed handle and a defined zero pitch (was left uninitialized) */ 38 | 39 | pitched_memory(unique_ptr memory, std::size_t pitch) 40 | : memory(std::move(memory)), 41 | pitch(pitch) 42 | { 43 | } 44 | 45 | pitched_memory(pitched_memory&& m) 46 | : memory(std::move(m.memory)), 47 | pitch(m.pitch) 48 | { 49 | } 50 | 51 | pitched_memory& operator =(pitched_memory&& m) 52 | { 53 | using std::swap; 54 | swap(memory, m.memory); 55 | pitch = m.pitch; 56 | return *this; 57 | } 58 | }; 59 | 60 | 61 | unique_ptr allocMemory(std::size_t size); 62 | unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size); 63 | pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size); 64 | } 65 | 66 | #endif // INCLUDED_CUDA_MEMORY 67 | -------------------------------------------------------------------------------- /spECK/include/CUDATools/memory_space.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace HiSparse 4 | { 5 | enum class MemorySpace 6 | { 7 | host, 8 | device 9 | }; 10 | } -------------------------------------------------------------------------------- /spECK/include/CUDATools/stream.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #ifndef INCLUDED_CUDA_STREAM 5 | #define INCLUDED_CUDA_STREAM 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | #include 12 | 13 | 14 | namespace CU 15 | { 16 | struct StreamDestroyDeleter 17 | { 18 | void operator ()(CUstream stream) const 19 | { 20 | cuStreamDestroy(stream); 21 | } 22 | }; 23 | 24 | using unique_stream = unique_handle; 25 | 26 | unique_stream createStream(unsigned int flags = CU_STREAM_DEFAULT); 27 | } 28 | 29 | #endif // 
INCLUDED_CUDA_STREAM 30 | -------------------------------------------------------------------------------- /spECK/include/CUDATools/unique_handle.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #ifndef INCLUDED_CUDA_UNIQUE_HANDLE 5 | #define INCLUDED_CUDA_UNIQUE_HANDLE 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | 12 | namespace CU 13 | { 14 | template 15 | class unique_handle : Deleter 16 | { 17 | T h; 18 | 19 | void free(T handle) noexcept 20 | { 21 | if (handle != NULL_VALUE) 22 | Deleter::operator ()(handle); 23 | } 24 | 25 | public: 26 | unique_handle(const unique_handle&) = delete; 27 | unique_handle& operator =(const unique_handle&) = delete; 28 | 29 | using handle_type = T; 30 | using deleter_type = Deleter; 31 | 32 | static constexpr T null_value = NULL_VALUE; 33 | 34 | explicit unique_handle(T handle = NULL_VALUE) noexcept 35 | : h(handle) 36 | { 37 | } 38 | 39 | void consume(T handle) noexcept { h = handle; } 40 | 41 | 42 | unique_handle(T handle, const Deleter& d) noexcept 43 | : Deleter(d), 44 | h(handle) 45 | { 46 | } 47 | 48 | unique_handle(T handle, Deleter&& d) noexcept 49 | : Deleter(std::move(d)), 50 | h(handle) 51 | { 52 | } 53 | 54 | unique_handle(unique_handle&& h) noexcept 55 | : Deleter(std::move(static_cast(h))), 56 | h(h.h) 57 | { 58 | h.h = NULL_VALUE; 59 | } 60 | 61 | ~unique_handle() 62 | { 63 | free(h); 64 | } 65 | 66 | operator T() const noexcept { return h; } 67 | 68 | template 69 | DataType* get() const noexcept { return reinterpret_cast(h); } 70 | 71 | template 72 | DataType* getRelease() noexcept { DataType* tmp = reinterpret_cast(h); h = 0ULL; return tmp; } 73 | 74 | unique_handle& operator =(unique_handle&& h) noexcept 75 | { 76 | using std::swap; 77 | swap(*this, h); 78 | return *this; 79 | } 80 | 81 | T release() noexcept 82 | { 83 | T temp = h; 84 | h = NULL_VALUE; 85 | return temp; 86 | } 87 | 88 | void reset(T handle = null_value) noexcept 89 | { 90 | using std::swap; 91 
| swap(this->h, handle); 92 | free(handle); 93 | } 94 | 95 | friend void swap(unique_handle& a, unique_handle& b) noexcept 96 | { 97 | using std::swap; 98 | swap(a.h, b.h); 99 | } 100 | }; 101 | } 102 | 103 | #endif // INCLUDED_CUDA_UNIQUE_HANDLE 104 | -------------------------------------------------------------------------------- /spECK/include/Compare.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "dCSR.h" 4 | 5 | namespace spECK { 6 | template 7 | bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); 8 | } -------------------------------------------------------------------------------- /spECK/include/Config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "INIReader.h" 6 | 7 | class Config 8 | { 9 | public: 10 | enum Key 11 | { 12 | BlockNnzFillRatio, 13 | MaxRowsPerBlock, 14 | LoadBalanceScanMode, 15 | HashScanSupportRestarts, 16 | InputFile, 17 | LoadBalanceModeCounting, 18 | LoadBalanceModeNumeric, 19 | ReprocessLoadBalancingForNumeric, 20 | MaxNnzPerBlockNumeric, 21 | MaxRowsPerBlockNumeric, 22 | IterationsWarmUp, 23 | IterationsExecution, 24 | SupportGlobalFallback, 25 | TrackIndividualTimes, 26 | TrackCompleteTimes, 27 | Debug, 28 | SortMode, 29 | CompareResult, 30 | AutoSelectKernelSizeMode, 31 | CountingBlockLimit, 32 | LogsEnabled, 33 | BlocksPerSM, 34 | DenseThresholdNumericExternalSorting, 35 | DenseThresholdNumericInternalSorting, 36 | GlobalDenseThresholdNumeric, 37 | DenseThresholdCounting, 38 | SpGEMMMethodCounting, 39 | SpGEMMMethodNumeric, 40 | ThreadsPerNnzOffset, 41 | add3MinLength, 42 | add2MinLength, 43 | add1MinLength, 44 | add3MaxCols, 45 | add2MaxCols, 46 | add1MaxCols, 47 | sub2MaxCols, 48 | sub1MaxCols, 49 | sub2MinThreads, 50 | sub1MinThreads, 51 | add2MinConcurrentOps, 52 | add1MinConcurrentOps, 53 | maxOpsWeight64, 54 | maxOpsWeight128, 55 
| maxOpsWeight256, 56 | maxOpsWeight512, 57 | maxOpsWeight1024, 58 | staticThreadsPerRow 59 | }; 60 | 61 | enum ScanMode 62 | { 63 | Std = 0, 64 | Cub = 1, 65 | Thrust = 2, 66 | WorkEfficient = 3 67 | }; 68 | 69 | enum SpGEMMMethods 70 | { 71 | AutoSpGEMM = 0, 72 | HashSpGEMM = 1, 73 | DenseSpGEMM = 2 74 | }; 75 | 76 | enum LoadBalanceModes 77 | { 78 | AutoEnable = 0, 79 | ForceEnable = 1, 80 | ForceDisable = 2 81 | }; 82 | 83 | enum SortModes 84 | { 85 | None = 0, 86 | Separate = 1, 87 | InPlace = 2, 88 | Auto = 3, 89 | CubSegmentedSort = 4 90 | }; 91 | 92 | private: 93 | std::map keyToString; 94 | std::map overrides; 95 | INIReader reader; 96 | 97 | void addKeyToString() { 98 | keyToString = { 99 | {BlockNnzFillRatio, "BlockNnzFillRatio"}, 100 | {MaxRowsPerBlock, "MaxRowsPerBlock"}, 101 | {LoadBalanceScanMode, "LoadBalanceScanMode"}, 102 | {HashScanSupportRestarts, "HashScanSupportRestarts"}, 103 | {InputFile, "InputFile"}, 104 | {ReprocessLoadBalancingForNumeric, "ReprocessLoadBalancingForNumeric"}, 105 | {MaxNnzPerBlockNumeric, "MaxNnzPerBlockNumeric"}, 106 | {MaxRowsPerBlockNumeric, "MaxRowsPerBlockNumeric"}, 107 | {IterationsWarmUp, "IterationsWarmUp"}, 108 | {IterationsExecution, "IterationsExecution"}, 109 | {SupportGlobalFallback, "SupportGlobalFallback"}, 110 | {TrackIndividualTimes, "TrackIndividualTimes"}, 111 | {TrackCompleteTimes, "TrackCompleteTimes"}, 112 | {Debug, "Debug"}, 113 | {LoadBalanceModeCounting, "LoadBalanceModeCounting"}, 114 | {LoadBalanceModeNumeric, "LoadBalanceModeNumeric"}, 115 | {SortMode, "SortMode"}, 116 | {CompareResult, "CompareResult"}, 117 | {AutoSelectKernelSizeMode, "AutoSelectKernelSizeMode"}, 118 | {CountingBlockLimit, "CountingBlockLimit"}, 119 | {LogsEnabled, "LogsEnabled"}, 120 | {BlocksPerSM, "BlocksPerSM"}, 121 | {DenseThresholdNumericExternalSorting, "DenseThresholdNumericExternalSorting"}, 122 | {DenseThresholdNumericInternalSorting, "DenseThresholdNumericInternalSorting"}, 123 | {DenseThresholdCounting, 
"DenseThresholdCounting"}, 124 | {SpGEMMMethodNumeric, "SpGEMMMethodNumeric"}, 125 | {SpGEMMMethodCounting, "SpGEMMMethodCounting"}, 126 | {GlobalDenseThresholdNumeric, "GlobalDenseThresholdNumeric"}, 127 | {add3MinLength, "add3MinLength"}, 128 | {add2MinLength, "add2MinLength"}, 129 | {add1MinLength, "add1MinLength"}, 130 | {add3MaxCols, "add3MaxCols"}, 131 | {add2MaxCols, "add2MaxCols"}, 132 | {add1MaxCols, "add1MaxCols"}, 133 | {sub2MaxCols, "sub2MaxCols"}, 134 | {sub1MaxCols, "sub1MaxCols"}, 135 | {sub2MinThreads, "sub2MinThreads"}, 136 | {sub1MinThreads, "sub1MinThreads"}, 137 | {add2MinConcurrentOps, "add2MinConcurrentOps"}, 138 | {add1MinConcurrentOps, "add1MinConcurrentOps"}, 139 | {ThreadsPerNnzOffset, "ThreadsPerNnzOffset"}, 140 | {maxOpsWeight64, "maxOpsWeight64"}, 141 | {maxOpsWeight128, "maxOpsWeight128"}, 142 | {maxOpsWeight256, "maxOpsWeight256"}, 143 | {maxOpsWeight512, "maxOpsWeight512"}, 144 | {maxOpsWeight1024, "maxOpsWeight1024"}, 145 | {staticThreadsPerRow, "staticThreadsPerRow"} }; 146 | } 147 | 148 | Config() 149 | { 150 | reader = INIReader(); 151 | addKeyToString(); 152 | } 153 | 154 | Config(std::string configPath) 155 | { 156 | reader = INIReader(configPath); 157 | addKeyToString(); 158 | } 159 | 160 | static Config &Instance() 161 | { 162 | if (_instance == nullptr) 163 | throw std::exception(); 164 | 165 | return *_instance; 166 | } 167 | 168 | public: 169 | static Config *_instance; 170 | static void init(std::string path); 171 | static void init(); 172 | 173 | static int getInt(Key key, int fallback = -1); 174 | static int setInt(Key key, int newVal); 175 | static std::string getString(Key key, std::string fallback = ""); 176 | static bool getBool(Key key, bool fallback = false); 177 | static float getFloat(Key key, float fallback = 0.0); 178 | }; 179 | 180 | 181 | -------------------------------------------------------------------------------- /spECK/include/DataLoader.h: 
--------------------------------------------------------------------------------
// NOTE(review): in this copy the text between angle brackets is missing
// (template parameter lists, template arguments, <...> include targets).
// Those tokens are kept exactly as they appear here and must be restored
// from the upstream header before this compiles.
#pragma once

#include
#include "CSR.h"
#include "dCSR.h"


// Bundles the two operand matrices in two representations each:
// cpuA/cpuB as CSR and gpuA/gpuB as dCSR.
// NOTE(review): presumably host-side and device-side copies - confirm against CSR.h / dCSR.h.
template
struct Matrices
{
    CSR cpuA, cpuB;
    dCSR gpuA, gpuB;
};

// Owns a Matrices instance. The constructor takes two path strings; its
// definition is not part of this header.
template
class DataLoader
{
public:
    DataLoader(std::string path, std::string path2);
    ~DataLoader() = default;
    Matrices matrices;
};

--------------------------------------------------------------------------------
/spECK/include/Executor.h:
--------------------------------------------------------------------------------
#pragma once
#include "RunConfig.h"

// Entry-point object: builds a RunConfig from the command line and exposes
// run() / run_detail(), both declared here and defined elsewhere.
template
class Executor
{
public:
    // Forwards argc/argv unchanged to the RunConfig constructor.
    Executor(int argc, char *argv[]) : runConfig(argc, argv) {}
    ~Executor() = default;
    int run();
    int run_detail();

private:
    RunConfig runConfig;
    // Default iteration counts: no warm-up iteration, one execution iteration.
    int iterationsWarmup = 0;
    int iterationsExecution = 1;
};

--------------------------------------------------------------------------------
/spECK/include/GPU/BlockRange.cuh:
--------------------------------------------------------------------------------
#ifndef spECK_BlockRange
#define spECK_BlockRange
#pragma once
#include "limits.cuh"


// A run of consecutive rows starting at `first`, together with an nnz count.
// numRows == 0 marks the range as invalid (see valid() / setInvalid()).
template
struct BlockRange
{
    // inclusive
    INDEX_TYPE first;
    // if 1, then single row, if > 1 then multiple rows. if 0, then merged with others and must not be used
    ROW_COUNT_TYPE numRows;
    // if nnZ == numeric_limits::max(), then this must not be merged with others
    INDEX_TYPE nnz;

    // Default state: `first` holds the numeric_limits max sentinel, no rows, no nnz.
    __host__ __device__ BlockRange() : first(spECK::numeric_limits::max()), numRows(0), nnz(0) {}

    // numRows and nnz are clamped with min() against a numeric_limits maximum
    // before being stored.
    __host__ __device__ BlockRange(INDEX_TYPE first, INDEX_TYPE numRows, INDEX_TYPE nnz) : first(first)
    {
        this->numRows = min(numRows, spECK::numeric_limits::max());
        this->nnz = min(nnz, spECK::numeric_limits::max());
    }

    // Member-wise copy of first, numRows and nnz.
    __host__ __device__ BlockRange& operator=(const BlockRange& a)
    {
        first = a.first;
        numRows = a.numRows;
        nnz = a.nnz;
        return *this;
    }

    // First row index after this range (exclusive end).
    __host__ __device__ __forceinline__ INDEX_TYPE nextRow() const { return first + numRows; }
    // Last row index inside this range (inclusive); the numeric_limits max
    // sentinel is returned when the range holds no rows.
    __host__ __device__ __forceinline__ INDEX_TYPE last() const
    {
        if (numRows == 0)
            return spECK::numeric_limits::max();

        return first + numRows - 1;
    }
    // A range is valid as long as it holds at least one row.
    __host__ __device__ __forceinline__ bool valid() const { return numRows; }
    __host__ __device__ __forceinline__ void setInvalid() { numRows = 0; }

    // Ordering by start row only; numRows and nnz do not take part.
    __host__ __device__ int operator >(const BlockRange &b)
    {
        return first > b.first;
    }
};


// Same layout and behaviour as BlockRange, plus an int8_t kernelScale that is
// stored and copied but not interpreted anywhere in this header.
template
struct BlockRangeKernelScale
{
    // inclusive
    INDEX_TYPE first;
    // if 1, then single row, if > 1 then multiple rows. if 0, then merged with others and must not be used
    ROW_COUNT_TYPE numRows;
    // if nnZ == numeric_limits::max(), then this must not be merged with others
    INDEX_TYPE nnz;
    int8_t kernelScale;

    // Default state: sentinel start row, no rows, no nnz, kernelScale 0.
    __host__ __device__ BlockRangeKernelScale() : first(spECK::numeric_limits::max()), numRows(0), nnz(0), kernelScale(0) {}

    // numRows and nnz are clamped with min() against a numeric_limits maximum;
    // first and kernelScale are stored as given.
    __host__ __device__ BlockRangeKernelScale(INDEX_TYPE first, INDEX_TYPE numRows, INDEX_TYPE nnz, int8_t kernelScale) : first(first), kernelScale(kernelScale)
    {
        this->numRows = min(numRows, spECK::numeric_limits::max());
        this->nnz = min(nnz, spECK::numeric_limits::max());
    }

    // Member-wise copy including kernelScale.
    __host__ __device__ BlockRangeKernelScale& operator=(const BlockRangeKernelScale& a)
    {
        first = a.first;
        numRows = a.numRows;
        nnz = a.nnz;
        kernelScale = a.kernelScale;
        return *this;
    }

    // First row index after this range (exclusive end).
    __host__ __device__ __forceinline__ INDEX_TYPE nextRow() const { return first + numRows; }
    // Last row index inside this range (inclusive); sentinel when empty.
    __host__ __device__ __forceinline__ INDEX_TYPE last() const
    {
        if (numRows == 0)
            return spECK::numeric_limits::max();

        return first + numRows - 1;
    }
    __host__ __device__ __forceinline__ bool valid() const { return numRows; }
    __host__ __device__ __forceinline__ void setInvalid() { numRows = 0; }

    // Ordering by start row only.
    __host__ __device__ int operator >(const BlockRangeKernelScale &b)
    {
        return first > b.first;
    }
};
#endif
--------------------------------------------------------------------------------
/spECK/include/GPU/Hash.cuh:
--------------------------------------------------------------------------------
#pragma once
#include

// Multiplicative hash: id * 11 in 32-bit unsigned arithmetic (wraps on overflow).
__host__ __device__ __forceinline__ uint32_t currentHash(uint32_t id) {
    return id * 11;
}
--------------------------------------------------------------------------------
/spECK/include/GPU/consistent_gpu_memory.h:
--------------------------------------------------------------------------------
#pragma once
// NOTE(review): in this copy the text between angle brackets is missing
// (template parameter lists, template arguments, <...> include targets).
// Those tokens are kept exactly as they appear here and must be restored
// from the upstream headers before this compiles.

#include
#include
// Everything below in this header is commented out; it is kept for reference only.
/*#include "../memory_space.h"
#include "../consistent_memory.h"

namespace spECK {
	template<>
	class ConsistentMemory : RegisteredMemory
	{
		size_t _size;
		CU::unique_ptr _ptr;

		size_t clear() override
		{
			auto s = _size;
			reset(0);
			return s;
		}
	public:
		ConsistentMemory() : _size(0)
		{
			register_consistent_memory(this);
		}

		~ConsistentMemory()
		{
			unregister_consistent_memory(this);
		}

		operator CUdeviceptr() const noexcept { return _ptr; }

		template
		T* get() const noexcept { return reinterpret_cast(_ptr.operator long long unsigned int()); }

		void increaseMemRetainData(size_t size)
		{
			CU::unique_ptr tmp_ptr = CU::allocMemory(_size + size);
			cudaMemcpy(tmp_ptr.get(), _ptr.get(), _size, cudaMemcpyDeviceToDevice);
			_ptr.reset();
			_ptr = std::move(tmp_ptr);
			_size += size;
		}

		void assure(size_t size)
		{
			if (size > _size)
			{
				_ptr.reset();
				_ptr = CU::allocMemory(size);
				_size = size;
			}
		}
		void reset(size_t size = 0)
		{
			_ptr.reset();
			_size = 0;
			assure(size);
		}
	};
}*/
--------------------------------------------------------------------------------
/spECK/include/GPU/limits.cuh:
--------------------------------------------------------------------------------
#ifndef spECK_Limits
#define spECK_Limits
#pragma once
#include
#include

namespace spECK {

	// Host/device counterpart of std::numeric_limits restricted to min()/max().
	// The primary template returns a value-initialized _Tp for both; the
	// specializations below return the matching NPP_MIN_* / NPP_MAX_* constants.
	template
	struct numeric_limits
	{
		/** The minimum finite value, or for floating types with
		denormalization, the minimum positive normalized value.  */
		__host__ __device__ static constexpr _Tp
			min() noexcept { return _Tp(); }

		/** The maximum finite value.  */
		__host__ __device__ static constexpr _Tp
			max() noexcept { return _Tp(); }
	};

	// Specialization returning uint32_t limits (NPP_MIN_32U / NPP_MAX_32U).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr uint32_t
			min() noexcept { return NPP_MIN_32U; }

		__host__ __device__ static constexpr uint32_t
			max() noexcept { return NPP_MAX_32U; }
	};

	// Specialization returning int32_t limits (NPP_MIN_32S / NPP_MAX_32S).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr int32_t
			min() noexcept { return NPP_MIN_32S; }

		__host__ __device__ static constexpr int32_t
			max() noexcept { return NPP_MAX_32S; }
	};

	// Specialization returning uint16_t limits (NPP_MIN_16U / NPP_MAX_16U).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr uint16_t
			min() noexcept { return NPP_MIN_16U; }

		__host__ __device__ static constexpr uint16_t
			max() noexcept { return NPP_MAX_16U; }
	};

	// Specialization returning int16_t limits (NPP_MIN_16S / NPP_MAX_16S).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr int16_t
			min() noexcept { return NPP_MIN_16S; }

		__host__ __device__ static constexpr int16_t
			max() noexcept { return NPP_MAX_16S; }
	};

	// Specialization returning uint8_t limits (NPP_MIN_8U / NPP_MAX_8U).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr uint8_t
			min() noexcept { return NPP_MIN_8U; }

		__host__ __device__ static constexpr uint8_t
			max() noexcept { return NPP_MAX_8U; }
	};

	// Specialization returning int8_t limits (NPP_MIN_8S / NPP_MAX_8S).
	template<>
	struct numeric_limits
	{
		__host__ __device__ static constexpr int8_t
			min() noexcept { return NPP_MIN_8S; }

		__host__ __device__ static constexpr int8_t
			max() noexcept { return NPP_MAX_8S; }
	};
}
#endif
--------------------------------------------------------------------------------
/spECK/include/GPU/profiler.cuh:
--------------------------------------------------------------------------------
#pragma once

#include
#include
#include
#include

// Structure to hold data collected by callback
typedef struct RuntimeApiTrace_st {
	const char *functionName;
	uint64_t startTimestamp;
	uint64_t endTimestamp;
	size_t memcpy_bytes;
	enum cudaMemcpyKind memcpy_kind;
	size_t currentMemoryUsage;
} RuntimeApiTrace_t;

// Indices into CuProfiler::trace; LAUNCH_LAST doubles as the array length.
enum launchOrder{ MEMCPY_H2D1, MEMCPY_H2D2, MEMCPY_D2H, KERNEL, THREAD_SYNC, LAUNCH_LAST};

// Collects RuntimeApiTrace records through a CUPTI subscriber handle.
// initialize(), finalize() and displayTimestamps() are defined elsewhere.
class CuProfiler {
private:
	std::vector records;
	CUpti_SubscriberHandle subscriber;
	unsigned long long int startMem;
	bool initialized;

public:
	RuntimeApiTrace_t trace[LAUNCH_LAST];

	// Only `initialized` is set here; subscriber, startMem and trace keep
	// indeterminate values until something else assigns them.
	CuProfiler() {
		initialized = false;
	}


	// void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
	// 	CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo);

	// Largest currentMemoryUsage over all records.
	// With no records this returns -1 converted to size_t, i.e. the maximum
	// size_t value, so callers must compare against that rather than "< 0".
	size_t getPeakMemoryUsage() {
		if (records.size() == 0)
			return -1;

		size_t max = 0;
		for (auto& entry : records) {
			if (entry.currentMemoryUsage > max)
				max = entry.currentMemoryUsage;
		}
		return max;
	}

	// Human-readable name for the two host<->device copy kinds;
	// every other kind maps to "".
	static const char *memcpyKindStr(enum cudaMemcpyKind kind)
	{
		switch (kind) {
		case cudaMemcpyHostToDevice:
			return "HostToDevice";
		case cudaMemcpyDeviceToHost:
			return "DeviceToHost";
		default:
			break;
		}

		return "";
	}

	void displayTimestamps();


	// Releases three device buffers with cudaFree and three host buffers with
	// free(); null pointers are skipped. cudaFree return codes are not checked.
	static void cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C)
	{
		if (d_A)
			cudaFree(d_A);
		if (d_B)
			cudaFree(d_B);
		if (d_C)
			cudaFree(d_C);

		// Free host memory
		if (h_A)
			free(h_A);
		if (h_B)
			free(h_B);
		if (h_C)
			free(h_C);
	}

	void initialize(bool subtractCurrentMem = true);
	void finalize();
};
--------------------------------------------------------------------------------
/spECK/include/GPU/spECKKernels.h:
--------------------------------------------------------------------------------

#pragma once

// NOTE(review): in this copy the text between angle brackets is missing
// (template parameter lists, template arguments, <...> include targets).
// Those tokens are kept exactly as they appear here and must be restored
// from the upstream header before this compiles; they are not reconstructed
// because parameter names and order cannot be recovered from what is visible.
#include
typedef unsigned long long int uint64_t;

#include
#include "Config.h"
#include "GPU/BlockRange.cuh"

// Host-side launcher facade for the spECK kernels.
// The object only stores a launch configuration (grid size, block size,
// dynamic shared memory size, stream); every h_* member is a declaration
// whose definition lives in another translation unit.
class spECKKernels
{
public:
    // Every launch parameter gets a defined value up front. Previously only
    // blockDim was initialized, so gridDim, sharedMem and stream were
    // indeterminate until setLaunchDimensions() had been called.
    spECKKernels(uint32_t blockDim=128):
        blockDim{blockDim},
        gridDim{0},
        sharedMem{0},
        stream{0}
    {}

    // Replaces the whole launch configuration in one call.
    // _stream defaults to stream 0, _blockDim to 128, _sharedMem to 0.
    void setLaunchDimensions(uint32_t _gridDim, cudaStream_t _stream = 0, uint32_t _blockDim = 128, uint32_t _sharedMem = 0)
    {
        gridDim = _gridDim;
        blockDim = _blockDim;
        stream = _stream;
        sharedMem = _sharedMem;
    }

    // #####################################################################
    // Numeric Hash SpGEMM
    //
    template
    void h_HashSpGEMMNumeric(dCSRNoDealloc matA, dCSRNoDealloc matB, dCSRNoDealloc matC, GlobalMap *maps, INDEX_TYPE mapCount,
        INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, Config::SortModes sortColumns, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
        INDEX_TYPE *rowMaxOperations, bool setSortedBit, uint32_t rowsPerBlock);

    // #####################################################################
    // Numeric SpGEMM Launcher
    //
    template
    void h_SpGEMMNumericLauncher(dCSRNoDealloc matA, dCSRNoDealloc matB, dCSRNoDealloc matC,
        GlobalHashMap *hashMaps, INDEX_TYPE hashMapCount, GlobalRowOffsetMap *rowOffsetMaps, INDEX_TYPE rowOffsetMapCount,
        INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, Config::SortModes sortColumns, uint32_t numberBlocks, const INDEX_TYPE* rowColMinMax,
        INDEX_TYPE *rowMaxOperations, uint32_t minimumDensity, bool setSortedBit, uint32_t rowsPerBlock);


    // #####################################################################
    // Numeric Dense SpGEMM
    //
    template
    void h_DenseSpGEMMNumeric(dCSRNoDealloc matA, dCSRNoDealloc matB, dCSRNoDealloc matC, GlobalMap *maps, INDEX_TYPE mapCount,
        INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
        INDEX_TYPE *rowMaxOperations, bool setSortedBit, uint32_t rowsPerBlock);

    // #####################################################################
    // Symbolic Dense SpGEMM
    //
    template
    void h_DenseSpGEMMCount(dCSRNoDealloc matA, dCSRNoDealloc matB, GlobalMap *maps, INDEX_TYPE mapCount,
        INDEX_TYPE *matCRowOffsets, INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
        INDEX_TYPE *rowMaxOperations, uint32_t *maxNnzPerRow, uint32_t rowsPerBlock);

    // #####################################################################
    // Symbolic Hash SpGEMM used for counting NNZ elements of output matrix C
    //
    template
    void h_HashSpGEMMCount(dCSRNoDealloc matA, dCSRNoDealloc matB, GlobalMap *maps, INDEX_TYPE mapCount, INDEX_TYPE *matCNnzRow,
        INDEX_TYPE* rowOperations, INDEX_TYPE *blockStartRow, uint32_t numberBlocks, const INDEX_TYPE* rowColMinMax,
        INDEX_TYPE *rowMaxOperations, uint32_t *maxNnzPerRow, uint32_t rowsPerBlock);

    // #####################################################################
    // Symbolic SpGEMM launcher used for counting NNZ elements of output matrix C
    //
    template
    void h_SpGEMMCountLauncher(dCSRNoDealloc matA, dCSRNoDealloc matB,
        GlobalMap *hashMaps, INDEX_TYPE hashMapCount, GlobalRowOffsetsMap *rowOffsetMaps, INDEX_TYPE rowOffsetMapsCount,
        INDEX_TYPE *matCNnzRow, INDEX_TYPE *rowOperations, INDEX_TYPE *blockStartRow,
        uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
        INDEX_TYPE *rowMaxOperations, uint32_t minimumDensity, INDEX_TYPE *maxNnzPerRow, uint32_t rowsPerBlock);

    // #####################################################################
    // Sorts results of Symbolic Hash SpGEMM
    //
    template
    void h_HashSpGEMMSorting(dCSRNoDealloc matC, INDEX_TYPE *blockStartRow, uint32_t numberBlocks, bool bitShiftNumRows);

    // Initializes `count` global maps that carry both ids and values.
    template
    void h_InitializeGlobalMaps(Map *maps, int count, INDEX_TYPE *ids, VALUE_TYPE *values, size_t elementsPerMap);

    // Initializes `count` global maps that carry ids only.
    template
    void h_InitializeGlobalMapsNoVal(Map *maps, int count, INDEX_TYPE *ids, size_t elementsPerMap, uint32_t maxRowsPerBlock);


    // #####################################################################
    // Load Balancer for HashSpGEMM by assigning blocks to rows -> 1 block can have multiple rows, but 1 row is never shared by multiple blocks
    // this load balancer uses the amount of operations per row for balancing
    //
    template
    void h_AssignHashSpGEMMBlocksToRowsOfSameSizeOperations(dCSRNoDealloc &matA, dCSRNoDealloc &matB, uint32_t *rowOperations,
        INDEX_TYPE *blockStartRows, INDEX_TYPE *numBlockStarts, INDEX_TYPE (&h_numBlockStarts)[KERNEL_COUNT], INDEX_TYPE *blockStartRowsCombined,
        uint32_t maxNnzPerBlock, uint32_t maxNnzPerBlockDynamicSharedMem, uint32_t maxRowsPerBlock, uint32_t actualKernelCount, uint32_t &h_rowsRequiringGlobal);



    // #####################################################################
    // Load Balancer for HashSpGEMM by assigning blocks to rows -> 1 block can have multiple rows, but 1 row is never shared by multiple blocks
    // this load balancer tries to combine rows which fit into one as small as possible kernel
    //
    template
    void h_AssignHashSpGEMMBlocksToRowsOfSameSize(dCSRNoDealloc &matA,
        INDEX_TYPE *blockStartRows, INDEX_TYPE *blockStartRowsCombined, INDEX_TYPE *numBlockStarts, INDEX_TYPE(&h_numBlockStarts)[KERNEL_COUNT],
        uint32_t maxNnzPerBlock, uint32_t maxNnzPerBlockDynamicSharedMem, uint32_t maxRowsPerBlock, uint32_t actualKernelCount, uint32_t &h_rowsRequiringGlobal);


private:
    // Declaration order matters: it is the order the constructor's
    // initializer list runs in.
    uint32_t blockDim;
    uint32_t gridDim;
    uint32_t sharedMem;
    cudaStream_t stream;
};

--------------------------------------------------------------------------------
/spECK/include/HashMap.cuh:
--------------------------------------------------------------------------------
// NOTE(review): in this copy the text between angle brackets is missing
// (template parameter lists, template arguments, <...> include targets).
// Those tokens are kept exactly as they appear here and must be restored
// from the upstream headers before this compiles; they are not reconstructed
// because parameter names and order cannot be recovered from what is visible.
#pragma once

#include
#include
#include "GPU/limits.cuh"
#include

// Packs a (row, col) pair into one 32-bit key: row in the upper 5 bits,
// col in the lower 27 bits. Inputs outside those widths overlap or wrap.
__device__ __forceinline__ uint32_t toHashEntry(uint32_t row, uint32_t col)
{
    return (row << 27) + col;
}

// Lower 27 bits of a packed key.
__device__ __forceinline__ uint32_t hashEntryToColumn(uint32_t hashEntry)
{
    return hashEntry & 0x7FFFFFF;
}

// Upper 5 bits of a packed key.
__device__ __forceinline__ uint32_t hashEntryToRow(uint32_t hashEntry)
{
    return hashEntry >> 27;
}

// Open-addressing hash map with linear probing over caller-provided `ids` and
// `values` arrays of `capacity` slots. A slot is free while its id equals UNUSED().
template
struct HashMap
{
public:
    // no default values or else union does not work
    INDEX_TYPE *ids;
    VALUE_TYPE *values;
    uint32_t capacity;

    // Sentinel id marking an empty slot.
    __device__ __forceinline__ static INDEX_TYPE UNUSED() { return spECK::numeric_limits::max(); }

    // Block-cooperative reset: each thread clears the slots threadIdx.x,
    // threadIdx.x + blockDim.x, ... No barrier is issued here.
    __device__ void init()
    {
        for (int i = threadIdx.x; i < capacity; i += blockDim.x)
        {
            ids[i] = UNUSED();
            values[i] = VALUE_TYPE(0);
        }
    }

    // Returns the slot holding `id`, claiming a free slot with atomicCAS if the
    // id is not present yet. Probing wraps around modulo getSize() and only
    // ends on a hit or a successful claim, so a full map without `id` never returns.
    __device__ __forceinline__ INDEX_TYPE indexOf(INDEX_TYPE id)
    {
        INDEX_TYPE hashed_id = currentHash(id);
        INDEX_TYPE map_id = hashed_id % getSize();
        do
        {
            auto entry_id = ids[map_id];
            if (entry_id == id)
            {
                return map_id;
            }

            if (entry_id == UNUSED())
            {
                auto old_id = atomicCAS(ids + map_id, UNUSED(), id);
                // Either this thread claimed the slot, or another thread
                // claimed it for the same id in the meantime.
                if (old_id == UNUSED() || old_id == id)
                {
                    return map_id;
                }
            }

            map_id = (map_id + 1) % getSize();
        } while (true);
    }

    // Reference to the value stored for `id` (inserting the id if needed).
    __device__ __forceinline__ VALUE_TYPE &operator[](INDEX_TYPE id)
    {
        return values[indexOf(id)];
    }

    __device__ __forceinline__ INDEX_TYPE coordToId(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        return toHashEntry(rowA, colB);
    }

    __device__ __forceinline__ static INDEX_TYPE idToRow(INDEX_TYPE id) { return hashEntryToRow(id); }

    __device__ __forceinline__ static INDEX_TYPE idToCol(INDEX_TYPE id) { return hashEntryToColumn(id); }

    __device__ __forceinline__ VALUE_TYPE &at(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        return this->operator[](coordToId(rowA, colB));
    }

    // Direct (non-hashed) accumulation: colB is used as the slot index itself,
    // so the caller must guarantee colB < capacity.
    __device__ __forceinline__ void atomic_add_direct(INDEX_TYPE rowA, INDEX_TYPE colB, VALUE_TYPE val)
    {
        atomicAdd_block(&values[colB], val);
        ids[colB] = coordToId(rowA, colB);
    }

    __device__ __forceinline__ void atomic_add_direct(INDEX_TYPE colB, VALUE_TYPE val)
    {
        atomicAdd_block(&values[colB], val);
        ids[colB] = colB;
    }

    __device__ __forceinline__ void atomic_add(INDEX_TYPE rowA, INDEX_TYPE colB, VALUE_TYPE val)
    {
        atomic_add(coordToId(rowA, colB), val);
    }

    // Hashed accumulation with a block-scoped atomic add.
    __device__ __forceinline__ void atomic_add(INDEX_TYPE id, VALUE_TYPE val)
    {
        atomicAdd_block(values + this->indexOf(id), val);
    }

    __device__ __forceinline__ uint32_t getSize() const { return capacity; }
};

// Spins over `count` maps, starting at blockIdx.x % count, until one can be
// reserved by flipping its `reserved` flag from 0 to 1.
template
__device__ __forceinline__ HashMap *reserveMap(HashMap *maps, uint32_t count)
{
    uint32_t index = blockIdx.x % count;

    while (true)
    {
        if (atomicCAS(&maps[index].reserved, 0, 1) == 0)
        {
            return &maps[index];
        }
        index = (index + 1) % count;
    }
}

// Clears the `reserved` flag. The trailing assignment only changes the local
// parameter copy; the caller's pointer is not reset.
template
__device__ __forceinline__ void freeMap(HashMap *map)
{
    if (map == nullptr)
        return;
    map->reserved = 0;
    map = nullptr;
}

// Key-only variant of the hash map used for counting: tracks total occupancy
// and occupancy per row instead of values.
template
struct HashMapNoValue
{
private:
    uint32_t limit;

public:
    __device__ INDEX_TYPE UNUSED() const { return spECK::numeric_limits::max(); }
    INDEX_TYPE *ids;
    INDEX_TYPE *occupancyPerRow;
    INDEX_TYPE *occupancy;

    // no default values or else union does not work
    int reserved;
    uint32_t capacity;

    // Block-cooperative reset of ids and per-row counters; the thread flagged
    // as mainThread additionally resets the total counter and the probe limit.
    __device__ void init(bool mainThread)
    {
        for (int i = threadIdx.x; i < capacity; i += blockDim.x)
            ids[i] = UNUSED();

        for (int i = threadIdx.x; i < MAX_ROW_COUNT; i += blockDim.x)
            occupancyPerRow[i] = 0;

        if (mainThread)
        {
            *occupancy = 0;
            limit = capacity;
        }
    }

    // Inserts `id` if it is not present. Counters are bumped only by the thread
    // whose atomicCAS actually claimed the slot.
    __device__ __forceinline__ void operator[](INDEX_TYPE id)
    {
        INDEX_TYPE hashed_id = currentHash(id);
        INDEX_TYPE map_id = hashed_id % getSize();

        do
        {
            auto entry = ids[map_id];
            if (entry == id)
                return;

            if (entry == UNUSED())
            {
                auto old_id = atomicCAS(ids + map_id, UNUSED(), id);

                if (old_id == UNUSED() || old_id == id)
                {
                    if (old_id == UNUSED())
                    {
                        atomicAdd_block(occupancy, 1);
                        atomicAdd_block(&occupancyPerRow[idToRow(id)], 1);
                    }
                    return;
                }
            }

            map_id = (map_id + 1) % getSize();
        } while (true);
    }

    // Restricts probing to the first min(limit, capacity) slots.
    __device__ __forceinline__ void limitSize(uint32_t limit)
    {
        this->limit = min(limit, capacity);
    }

    __device__ __forceinline__ INDEX_TYPE coordToId(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        return toHashEntry(rowA, colB);
    }

    __device__ __forceinline__ static INDEX_TYPE idToRow(INDEX_TYPE id) { return hashEntryToRow(id); }

    __device__ __forceinline__ static INDEX_TYPE idToCol(INDEX_TYPE id) { return hashEntryToColumn(id); }

    __device__ __forceinline__ void at(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        this->operator[](coordToId(rowA, colB));
    }

    // Direct (non-hashed) insert: colB is the slot index itself.
    __device__ __forceinline__ void atDirect(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        if (ids[colB] != UNUSED())
            return;

        INDEX_TYPE retVal = atomicCAS(&ids[colB], UNUSED(), coordToId(rowA, colB));
        if (retVal == UNUSED())
        {
            atomicAdd_block(occupancy, 1);
            atomicAdd_block(&occupancyPerRow[rowA], 1);
        }
    }

    __device__ __forceinline__ size_t getSize() const { return limit; }
};
--------------------------------------------------------------------------------
/spECK/include/Multiply.h:
--------------------------------------------------------------------------------

#pragma once

#include "dCSR.h"
#include "Timings.h"
#include "spECKConfig.h"

// REPLACE THESE VALUES WITH YOUR ACTUAL DEVICE SPECIFICATIONS

static constexpr int spECK_STATIC_MEM_PER_BLOCK {49152};
static constexpr int spECK_DYNAMIC_MEM_PER_BLOCK{98304};

namespace spECK
{
    template
    void MultiplyspECK(const dCSR &A, const dCSR &B, dCSR &matOut, spECKConfig &config, Timings &timings);

    // NOTE(review): the default argument binds a temporary to a non-const
    // lvalue reference, which standard C++ rejects (some compilers accept it
    // as an extension). Left unchanged because the fix needs the template
    // parameter list, which is missing in this copy.
    template
    void MultiplyspECKImplementation(const dCSR &A, const dCSR &B, dCSR &matOut, spECKConfig &config, Timings &timings = Timings());
} // namespace spECK
--------------------------------------------------------------------------------
/spECK/include/RunConfig.h:
--------------------------------------------------------------------------------
#pragma once

#include

// Command-line configuration: two matrix file paths and two matrix names.
// Constructor and destructor are defined elsewhere.
class RunConfig
{
public:
    RunConfig(int argc, char *argv[]);
    ~RunConfig();
    std::string filePath;
    std::string mat_name;
    std::string filePath2;
    std::string mat_name2;
};

--------------------------------------------------------------------------------
/spECK/include/Timings.h:
--------------------------------------------------------------------------------
#pragma once
#include

// Per-stage timings (milliseconds, judging by the printed units) of one SpGEMM
// run, plus derived aggregate stages that print()/reg_print() fill in.
class Timings {
public:
    bool measureAll;
    bool measureCompleteTime;
    // Measured stages.
    float init;
    float countProducts;
    float loadBalanceCounting;
    float globalMapsCounting;
    float spGEMMCounting;
    float allocC;
    float loadBalanceNumeric;
    float globalMapsNumeric;
    float spGEMMNumeric;
    float sorting;
    float cleanup;
    float complete;

    // Derived stages, recomputed from the measured ones inside print().
    float setup;
    float symbolic_binning;
    float symbolic;
    float numeric_binning;
    float prefix;
    float allocate;
    float numeric;
    float total;

    Timings(){
        measureAll = false;
        measureCompleteTime = false;
        init = 0.0f;
        countProducts = 0.0f;
        loadBalanceCounting = 0.0f;
        globalMapsCounting = 0.0f;
        spGEMMCounting = 0.0f;
        allocC = 0.0f;
        loadBalanceNumeric = 0.0f;
        globalMapsNumeric = 0.0f;
        spGEMMNumeric = 0.0f;
        sorting = 0.0f;
        cleanup = 0.0f;
        complete = 0.0f;
        // The derived members used to stay indeterminate until print() ran;
        // give them a defined value so copying or inspecting a fresh object is safe.
        setup = 0.0f;
        symbolic_binning = 0.0f;
        symbolic = 0.0f;
        numeric_binning = 0.0f;
        prefix = 0.0f;
        allocate = 0.0f;
        numeric = 0.0f;
        total = 0.0f;
    }
    // Accumulates the measured stages only; derived stages are left alone
    // because print() recomputes them.
    void operator+=(const Timings& b) {
        init += b.init;
        countProducts += b.countProducts;
        loadBalanceCounting += b.loadBalanceCounting;
        globalMapsCounting += b.globalMapsCounting;
        spGEMMCounting += b.spGEMMCounting;
        allocC += b.allocC;
        loadBalanceNumeric += b.loadBalanceNumeric;
        globalMapsNumeric += b.globalMapsNumeric;
        spGEMMNumeric += b.spGEMMNumeric;
        sorting += b.sorting;
        cleanup += b.cleanup;
        complete += b.complete;
    }

    // Divides the measured stages by x (e.g. to average over iterations).
    void operator/=(const float& x) {
        init /= x;
        countProducts /= x;
        loadBalanceCounting /= x;
        globalMapsCounting /= x;
        spGEMMCounting /= x;
        allocC /= x;
        loadBalanceNumeric /= x;
        globalMapsNumeric /= x;
        spGEMMNumeric /= x;
        sorting /= x;
        cleanup /= x;
        complete /= x;
    }
    // Prints the raw stage timings, then the aggregated stages with their share
    // of the total, then a throughput figure per stage (total_flop / 1e6 / ms).
    // `prefix` is always 0, so its throughput line divides by zero and prints
    // inf under IEEE float arithmetic; the percentage lines do the same
    // whenever `complete` is 0.
    void print(long long total_flop){
        float total_flop_d = float(total_flop)/1000000;
        setup = init + countProducts;
        symbolic_binning = loadBalanceCounting;
        symbolic = globalMapsCounting + spGEMMCounting;
        numeric_binning = loadBalanceNumeric;
        prefix = 0;
        allocate = allocC;
        numeric = globalMapsNumeric + spGEMMNumeric + sorting;
        total = complete;

        //if (measureAll){
        printf("spECK initial mallocs = %f ms\n", init);
        printf("spECK count computations = %f ms\n", countProducts);
        printf("spECK load-balancer = %f ms\n", loadBalanceCounting);
        printf("spECK GlobalMaps Cnt = %f ms\n", globalMapsCounting);
        printf("spECK counting kernel = %f ms\n", spGEMMCounting);
        printf("spECK malloc mat C = %f ms\n", allocC);
        printf("spECK num load-balancer = %f ms\n", loadBalanceNumeric);
        printf("spECK init GlobalMaps = %f ms\n", globalMapsNumeric);
        printf("spECK numeric kernel = %f ms\n", spGEMMNumeric);
        printf("spECK Sorting kernel = %f ms\n", sorting);
        printf("spECK cleanup = %f ms\n", cleanup);
        printf("--------------------------------------------------------------\n");
        //}

        //if(measureAll){
        printf("time(ms):\n");
        printf(" setup %8.3lfms %6.2lf%%\n", setup, setup/total*100);
        printf("\e[1;31m symbolic_binning %8.3lfms %6.2lf%%\n\e[0m", symbolic_binning, symbolic_binning/total*100);
        printf("\e[1;31m symbolic %8.3lfms %6.2lf%%\n\e[0m", symbolic, symbolic/total*100);
        printf("\e[1;31m numeric_binning %8.3lfms %6.2lf%%\n\e[0m", numeric_binning, numeric_binning/total*100);
        printf(" prefix %8.3lfms %6.2lf%%\n", prefix, prefix/total*100);
        printf(" allocate %8.3lfms %6.2lf%%\n", allocate, allocate/total*100);
        printf("\e[1;31m numeric %8.3lfms %6.2lf%%\n\e[0m", numeric, numeric/total*100);
        printf(" cleanup %8.3lfms %6.2lf%%\n", cleanup, cleanup/total*100);
        printf(" total %8.3lfms %6.2lf%%\n", total, total/total*100);
        printf("perf(Gflops):\n");
        printf(" setup %6.2lf\n", total_flop_d/setup);
        printf(" symbolic_binning %6.2lf\n", total_flop_d/symbolic_binning);
        printf(" symbolic %6.2lf\n", total_flop_d/symbolic);
        printf(" numeric_binning %6.2lf\n", total_flop_d/numeric_binning);
        printf(" prefix %6.2lf\n", total_flop_d/prefix);
        printf(" allocate %6.2lf\n", total_flop_d/allocate);
        printf(" numeric %6.2lf\n", total_flop_d/numeric);
        printf(" cleanup %6.2lf\n", total_flop_d/cleanup);
        printf(" total %6.2lf\n", total_flop_d/total);
        //}
    }
    // Prints only the overall throughput figure.
    void reg_print(long long total_flop){
        float total_flop_d = float(total_flop)/1000000;
        total = complete;
        printf("%6.2lf\n", total_flop_d/total);
    }
    // Prints the combined load-balancing time divided by 1000 and its
    // percentage share of `complete`. total_flop is accepted for signature
    // parity with the other printers but is not needed here.
    void binning_print(long long total_flop){
        (void)total_flop;
        float total_binning_time = loadBalanceCounting + loadBalanceNumeric;
        printf("%.4e %.4f\n", total_binning_time/1000, 100*total_binning_time/complete);
    }

};



--------------------------------------------------------------------------------
/spECK/include/Transpose.h:
--------------------------------------------------------------------------------
#pragma once

#include "dCSR.h"

namespace spECK {
    // Writes the transpose of matIn into matTransposeOut (defined elsewhere).
    template
    void Transpose(const dCSR& matIn, dCSR& matTransposeOut);
}
--------------------------------------------------------------------------------
/spECK/include/Vector.h:
--------------------------------------------------------------------------------
#pragma once

#include

// Minimal dense vector: a raw buffer pointer plus element count.
// The struct has no destructor, so the buffer is never released here;
// whoever calls alloc() owns `data` and must delete[] it.
template
struct DenseVector
{
    size_t size;
    //std::unique_ptr data;
    T* data;

    // `data` used to be left indeterminate; start from a null buffer instead.
    DenseVector() : size(0), data(nullptr) { }
    // Allocates s value-initialized elements.
    // The previous body assigned the result of std::make_unique to the raw
    // pointer member, which cannot compile once alloc() is instantiated.
    // A buffer from an earlier alloc() is not freed here because `data` is
    // public and may point at memory this struct does not own.
    void alloc(size_t s)
    {
        data = new T[s]();
        size = s;
    }
};
--------------------------------------------------------------------------------
/spECK/include/WorkDistribution.h:
--------------------------------------------------------------------------------
#pragma once
#include "stdint.h"
// NOTE(review): stdint.h already provides uint64_t. Where that type is
// `unsigned long` (LP64 Linux) this typedef is a conflicting redeclaration -
// confirm whether the header is built on such targets before removing it.
typedef unsigned long long int uint64_t;

// Tunable thresholds and weights for the work-distribution heuristics.
// Only the field layout is defined here; the values are supplied by users of the struct.
struct WorkDistributionConfig
{
    uint32_t threadsPerNnzOffset;
    uint32_t add3MinLength;
    uint32_t add2MinLength;
    uint32_t add1MinLength;
    uint32_t add3MaxCols;
    uint32_t add2MaxCols;
    uint32_t add1MaxCols;
    uint32_t sub2MaxCols;
    uint32_t sub1MaxCols;
    uint32_t sub2MinThreads;
    uint32_t sub1MinThreads;
    uint32_t add2MinConcurrentOps;
    uint32_t add1MinConcurrentOps;
    float maxOpsWeight64;
    float maxOpsWeight128;
    float maxOpsWeight256;
    float maxOpsWeight512;
    float maxOpsWeight1024;
    int staticThreadsPerRow;
};

// Constant weight tables for a two-layer model: layer 1 maps 6 inputs to
// 5 outputs, layer 2 maps 5 inputs to 11 outputs. Values are grouped in rows
// of `outputs` entries, as in the original listing.
const int layer1inputs = 6;
const int layer1outputs = 5;
const float layer1weights[30] = {
    -1.4186962,
    0.07587334,
    -1.7805182,
    0.04314838,
    -0.6445114,

    -0.13512687,
    0.04315747,
    -0.17808716,
    -0.04465475,
    -0.066692226,

    0.0752962,
    -0.104078434,
    0.16903225,
    -0.014818254,
    0.041726623,

    0.60707116,
    0.5149234,
    0.036716104,
    -0.070126966,
    0.37001306,

    -0.18412519,
    -0.11984752,
    -0.0021386633,
    -0.046877146,
    -0.16237561,

    0.8256881,
    0.7394887,
    0.07848209,
    0.058255166,
    0.9127046,

};
const float layer1offsets[5] = {
    20.326342,
    4.904586,
    15.644517,
    -0.018791957,
    17.353731,
};

const int layer2inputs = 5;
const int layer2outputs = 11;
const float layer2weights[55] = {
    1.1972289,
    1.9218329,
    3.7349546,
    9.81511,
    2.6227558,
    -6.426236,
    -43.629112,
    -22.914429,
    -0.781369,
    -0.45372608,
    -0.5380075,

    -5.0447526,
    -0.009247302,
    -0.008086299,
    -7.826033,
    -0.09450004,
    -0.11218474,
    -0.027270528,
    -0.0042230682,
    -0.0005600392,
    0.0005161164,
    0.004799865,

    3.116825,
    -0.26401007,
    -1.3744258,
    -8.859539,
    -140.25873,
    -18.334105,
    -8.593332,
    -1.0478442,
    -1.3645409,
    -1.2001375,
    -0.9904336,

    -0.0058234227,
    -0.063901104,
    -0.03595568,
    -0.008307365,
    -0.01685345,
    -0.012366829,
    0.028791403,
    0.024446918,
    0.028665425,
    -0.060744636,
    0.072986275,

    -0.069886416,
    -1.427334,
    -2.6894572,
    -1.6754347,
    0.0068549747,
    1.0374902,
    -0.71136826,
    -71.97164,
    -66.504616,
    -73.23368,
    -47.109512,

};
const float layer2offsets[11] = {
    -27.462452,
    -10.207701,
    -7.577945,
    -7.4156075,
    -3.9403565,
    -0.64222777,
    0.68561864,
    -1.2962743,
    -2.180761,
    -1.9912802,
    -4.157279,
};
--------------------------------------------------------------------------------
/spECK/include/common.h:
--------------------------------------------------------------------------------
#ifndef spECK_Common
#define spECK_Common
#pragma once

// Ceiling division for positive integers: smallest q with q * b >= a.
// (a + b - 1) can overflow when a is close to the maximum of T.
template
__host__ __device__ __forceinline__ T divup(T a, T b)
{
    return (a + b - 1) / b;
}


// Restricts a to the closed interval [min, max].
template
__host__ __device__ __forceinline__ T clamp(const T& a, const T& min, const T& max)
{
    return a < min ? min : (a > max ? max : a);
}

// Prints the CUDA error string with file and line, then throws std::exception.
// Does nothing when err == cudaSuccess.
inline static void HandleError(cudaError_t err,
                               const char *file,
                               int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err),
               file, line);
        throw std::exception();
    }
}
// #ifdef _DEBUG || NDEBUG || DEBUG
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// #else
// #define HANDLE_ERROR(err) err
// #endif

// The include guard used to close right after clamp(), leaving HandleError and
// HANDLE_ERROR protected by `#pragma once` alone; it now spans the whole header.
#endif
--------------------------------------------------------------------------------
/spECK/include/cuSparseMultiply.h:
--------------------------------------------------------------------------------
#pragma once

#include "dCSR.h"
#include
#include
#include

namespace cuSPARSE {

    // Thin wrapper around a cuSPARSE handle and three general, zero-based
    // matrix descriptors. Multiply(), Transpose() and the two cusparse*
    // dispatch members are defined elsewhere.
    template
    class CuSparseTest
    {
        cusparseHandle_t handle;
        cusparseStatus_t status;
        cusparseMatDescr_t descr;
        cusparseMatDescr_t descrB;
        cusparseMatDescr_t descrC;

    public:
        // Creates the handle and the descriptors; throws std::exception (via
        // checkCuSparseError) if any creation call fails.
        // `status` previously had no initial value.
        CuSparseTest(): handle(0), status(CUSPARSE_STATUS_SUCCESS)
        {
            checkCuSparseError(cusparseCreate(&handle), "init failed");
            checkCuSparseError(cusparseCreateMatDescr(&descr), "Matrix descriptor init failed");
            checkCuSparseError(cusparseCreateMatDescr(&descrB), "Matrix descriptor init failed");
            checkCuSparseError(cusparseCreateMatDescr(&descrC), "Matrix descriptor init failed");
            cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
            cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
            cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
            cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
            cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
            cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
        }

        // Destructors are implicitly noexcept, so letting checkCuSparseError
        // throw from here would end in std::terminate. Failures are reported
        // with the same message text and the remaining resources are still released.
        ~CuSparseTest()
        {
            reportDestroyFailure(cusparseDestroyMatDescr(descr));
            reportDestroyFailure(cusparseDestroyMatDescr(descrB));
            reportDestroyFailure(cusparseDestroyMatDescr(descrC));
            cusparseDestroy(handle);
        }

        // Multiply two CSR matrices
        float Multiply(const dCSR& A, const dCSR& B, dCSR& matOut, uint32_t& cusparse_nnz);

        void Transpose(const dCSR& A, dCSR& AT);

        // Returns `status` unchanged on success; otherwise prints errorMsg and
        // throws std::exception.
        cusparseStatus_t checkCuSparseError(cusparseStatus_t status, std::string errorMsg)
        {
            if (status != CUSPARSE_STATUS_SUCCESS) {
                std::cout << "CuSparse error: " << errorMsg << std::endl;
                throw std::exception();
            }
            return status;
        }

        cusparseStatus_t CUSPARSEAPI cusparseMultiply(cusparseHandle_t handle,
            cusparseOperation_t transA,
            cusparseOperation_t transB,
            int m,
            int n,
            int k,
            const cusparseMatDescr_t descrA,
            int nnzA,
            const DataType *csrSortedValA,
            const int *csrSortedRowPtrA,
            const int *csrSortedColIndA,
            const cusparseMatDescr_t descrB,
            int nnzB,
            const DataType *csrSortedValB,
            const int *csrSortedRowPtrB,
            const int *csrSortedColIndB,
            const cusparseMatDescr_t descrC,
            DataType *csrSortedValC,
            const int *csrSortedRowPtrC,
            int *csrSortedColIndC);

        cusparseStatus_t CUSPARSEAPI cusparseTranspose(cusparseHandle_t handle,
            int m,
            int n,
            int nnz,
            const DataType *csrSortedVal,
            const int *csrSortedRowPtr,
            const int *csrSortedColInd,
            DataType *cscSortedVal,
            int *cscSortedRowInd,
            int *cscSortedColPtr,
            cusparseAction_t copyValues,
            cusparseIndexBase_t idxBase);

    private:
        // Non-throwing failure report used by the destructor.
        static void reportDestroyFailure(cusparseStatus_t status) noexcept
        {
            if (status != CUSPARSE_STATUS_SUCCESS)
                std::cout << "CuSparse error: " << "Matrix descriptor destruction failed" << std::endl;
        }
    };
}
--------------------------------------------------------------------------------
/spECK/include/cuda_common.h:
--------------------------------------------------------------------------------
#ifndef _Z_COMMON_
#define _Z_COMMON_

#include
#include
#include
#include
#include
#include
#include
#include

// Branch-prediction hints (GCC/Clang builtin).
#define likely(x) __builtin_expect(x,1)
#define unlikely(x) __builtin_expect(x,0)

// Prints the CUDA error string with file and line, then throws std::exception.
// Does nothing when err == cudaSuccess.
inline static void checkCUDA(cudaError_t err,
                             const char *file,
                             int line)
{
    if (unlikely(err != cudaSuccess))
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err),
               file, line);
        throw std::exception();
    }
}
// #ifdef _DEBUG || NDEBUG || DEBUG
#define CHECK_CUDA(err) (checkCUDA(err, __FILE__, __LINE__))
#define CHECK_ERROR(err) (checkCUDA(err, __FILE__, __LINE__))

// Throws std::exception after printing errorMsg when status is not
// CUSPARSE_STATUS_SUCCESS.
inline void CHECK_CUSPARSE(cusparseStatus_t status, std::string errorMsg="")
{
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cout << "CuSparse error: " << errorMsg << std::endl;
        throw std::exception();
    }
}

// Reads the x86 time-stamp counter (lfence + rdtsc) into Var.
// Uses a GNU statement expression, so it needs GCC/Clang on x86.
#define HP_TIMING_NOW(Var) \
 ({ unsigned int _hi, _lo; \
 asm volatile ("lfence\n\trdtsc" : "=a" (_lo), "=d" (_hi)); \
 (Var) = ((unsigned long long int) _hi << 32) | _lo; })

/* precision is 1 clock cycle.
45 | * execute time is roughly 50 or 140 cycles depends on cpu family */
/**
 * Execute the x86 CPUID instruction and store the four result registers.
 *
 * @param info  Output array of at least four ints; receives EAX, EBX, ECX and
 *              EDX (in that order) as left by the instruction.
 * @param eax   CPUID leaf, loaded into EAX before the instruction runs.
 * @param ecx   CPUID sub-leaf, loaded into ECX before the instruction runs
 *              (defaults to 0).
 */
inline void cpuid(int *info, int eax, int ecx = 0){
    int ax, bx, cx, dx;
    // ECX has to be listed as an input operand. Previously the parameter was
    // accepted but never handed to the instruction, so ECX held whatever the
    // compiler had left in it and sub-leaf indexed queries were unpredictable.
    __asm__ __volatile__ ("cpuid"
                          : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx)
                          : "a" (eax), "c" (ecx));

    info[0] = ax;
    info[1] = bx;
    info[2] = cx;
    info[3] = dx;
}
55 | 56 | inline long get_tsc_freq(){ 57 | static long freq = 0; 58 | if(unlikely((freq == 0))){ 59 | int raw[4]; 60 | cpuid(raw, 0x16); // get cpu freq 61 | freq = long(raw[0]) * 1000000; 62 | //printf("static first call %f\n", freq); 63 | } 64 | return freq; 65 | } 66 | 67 | inline double fast_clock_time(){ 68 | long counter; 69 | HP_TIMING_NOW(counter); 70 | return double(counter)/get_tsc_freq(); 71 | } 72 | 73 | template 74 | inline void D2H(T *dst, T* src, size_t size){ 75 | CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 76 | } 77 | 78 | template 79 | inline void H2D(T *dst, T* src, size_t size){ 80 | CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice)); 81 | } 82 | 83 | template 84 | inline void D2D(T *dst, T* src, size_t size){ 85 | CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice)); 86 | } 87 | 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /spECK/include/dCSR.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | struct CSR; 8 | 9 | template 10 | struct dCSR 11 | { 12 | size_t rows, cols, nnz; 13 | 14 | T* data; 15 | unsigned int* row_offsets; 16 | unsigned int* col_ids; 17 | 18 | dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } 19 | void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true); 20 | void reset(); 21 | virtual ~dCSR(); 22 | }; 23 | 24 | template 25 | struct dCSRNoDealloc 26 | { 27 | size_t rows, cols, nnz; 28 | 29 | T* data; 30 | unsigned int* row_offsets; 31 |
unsigned int* col_ids; 32 | 33 | dCSRNoDealloc(const dCSR& a) : rows(a.rows), cols(a.cols), data(a.data), nnz(a.nnz), row_offsets(a.row_offsets), col_ids(a.col_ids) {} 34 | dCSRNoDealloc() = default; 35 | }; 36 | 37 | template 38 | void convert(dCSR& dcsr, const CSR& csr, unsigned int padding = 0); 39 | 40 | template 41 | void convert(dCSR& dcsr, const dCSR& csr, unsigned int padding = 0); 42 | 43 | template 44 | void convert(CSR& csr, const dCSR& dcsr, unsigned int padding = 0); 45 | 46 | template 47 | void convert(CSR& csr, const CSR& dcsr, unsigned int padding = 0); -------------------------------------------------------------------------------- /spECK/include/meta_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef INCLUDED_HIS_META_UTILS 3 | #define INCLUDED_HIS_META_UTILS 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include "multi_arch_build.h" 10 | 11 | 12 | using std::enable_if; 13 | using std::declval; 14 | using std::is_empty; 15 | using std::conditional; 16 | 17 | template 18 | struct type_match 19 | { 20 | static const bool value = false; 21 | }; 22 | 23 | template 24 | struct type_match 25 | { 26 | static const bool value = true; 27 | }; 28 | 29 | template 30 | struct static_divup 31 | { 32 | static const int value = (X + Y - 1) / Y; 33 | }; 34 | 35 | template 36 | struct static_popcnt 37 | { 38 | static const int value = ((X & 0x1) + static_popcnt< (X >> 1) >::value); 39 | }; 40 | template<> 41 | struct static_popcnt<0> 42 | { 43 | static const int value = 0; 44 | }; 45 | 46 | template 47 | struct static_clz 48 | { 49 | static const int value = (X & 0x80000000) ? 
Completed : static_clz< (X << 1), Completed + 1 >::value; 50 | }; 51 | template 52 | struct static_clz 53 | { 54 | static const int value = 32; 55 | }; 56 | 57 | template 58 | struct static_max; 59 | 60 | template 61 | struct static_max 62 | { 63 | static const int value = VALUE; 64 | }; 65 | 66 | template 67 | struct static_max 68 | { 69 | static const int next_value = static_max::value; 70 | static const int value = VALUE > next_value ? VALUE : next_value; 71 | }; 72 | 73 | template 74 | struct static_min; 75 | 76 | template 77 | struct static_min 78 | { 79 | static const int value = VALUE; 80 | }; 81 | 82 | template 83 | struct static_min 84 | { 85 | static const int next_value = static_min::value; 86 | static const int value = VALUE < next_value ? VALUE : next_value; 87 | }; 88 | 89 | template 90 | struct choose; 91 | 92 | template 93 | struct choose 94 | { 95 | typedef typename choose::type type; 96 | }; 97 | template 98 | struct choose<0, NC, NCS...> 99 | { 100 | typedef NC type; 101 | }; 102 | 103 | 104 | template 105 | struct conditional_eval; 106 | 107 | template<> 108 | struct conditional_eval 109 | { 110 | template 111 | DUAL_BUILD_FUNCTION static void eval(F f) 112 | { 113 | f(); 114 | } 115 | }; 116 | template<> 117 | struct conditional_eval 118 | { 119 | template 120 | DUAL_BUILD_FUNCTION static void eval(F f) 121 | { 122 | } 123 | }; 124 | 125 | template class CONSUMER, int V, int END, int STEP, bool DONE, int... VALUES> 126 | struct static_for_impl 127 | { 128 | using type = typename static_for_impl < CONSUMER, V+STEP, END, STEP, (V + STEP < END), VALUES..., V>::type; 129 | }; 130 | template class CONSUMER, int V, int END, int STEP, int... 
VALUES> 131 | struct static_for_impl 132 | { 133 | using type = CONSUMER ; 134 | }; 135 | 136 | template class CONSUMER, int END, int BEGIN = 0, int STEP = 1> 137 | struct static_for 138 | { 139 | using type = typename static_for_impl < CONSUMER, BEGIN, END, STEP, (BEGIN < END)>::type; 140 | }; 141 | 142 | 143 | template 144 | struct type_list { }; 145 | 146 | template class APPLIER, class COMBLIST, class... TYPELISTS> 147 | struct apply_list_impl; 148 | template class APPLIER, class... DONETYPES, class... NEWTYPES, class... REMTYPELISTS> 149 | struct apply_list_impl, type_list, REMTYPELISTS...> 150 | { 151 | using type = typename apply_list_impl, REMTYPELISTS...>::type; 152 | }; 153 | template class APPLIER, class... DONETYPES> 154 | struct apply_list_impl> 155 | { 156 | using type = APPLIER; 157 | }; 158 | template class APPLIER, class... TYPELISTS> 159 | struct apply_list 160 | { 161 | using type = typename apply_list_impl, TYPELISTS... >::type; 162 | }; 163 | 164 | template 165 | struct inverse_list_impl; 166 | template 167 | struct inverse_list_impl, type_list> 168 | { 169 | using type = typename inverse_list_impl, type_list>::type; 170 | }; 171 | template 172 | struct inverse_list_impl> 173 | { 174 | using type = INVERSE_LIST; 175 | }; 176 | template 177 | struct inverse_list 178 | { 179 | using type = typename inverse_list_impl, TYPELIST>::type; 180 | }; 181 | 182 | 183 | template 184 | struct sequence { }; 185 | 186 | template class APPLIER, class SEQUENCE> 187 | struct apply_sequence; 188 | template class APPLIER, int... 
NUMS> 189 | struct apply_sequence> 190 | { 191 | using type = APPLIER; 192 | }; 193 | 194 | template 195 | struct select_from_impl; 196 | template 197 | struct select_from_impl, sequence> 198 | { 199 | using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; 200 | }; 201 | template 202 | struct select_from_impl, sequence> 203 | { 204 | using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; 205 | }; 206 | template 207 | struct select_from_impl, sequence<>> 208 | { 209 | using type = sequence; 210 | }; 211 | template 212 | struct select_from 213 | { 214 | using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<>, SEQUENCE > ::type; 215 | }; 216 | 217 | 218 | template class LOGICAL, class SEQUENCE> 219 | struct sequence_any; 220 | template class LOGICAL, int NUM, int...NUMS> 221 | struct sequence_any > 222 | { 223 | static const bool value = LOGICAL::value || sequence_any>::value; 224 | }; 225 | template class LOGICAL> 226 | struct sequence_any > 227 | { 228 | static const bool value = false; 229 | }; 230 | 231 | template 232 | struct static_is_zero 233 | { 234 | static const bool value = false; 235 | }; 236 | template<> 237 | struct static_is_zero<0> 238 | { 239 | static const bool value = true; 240 | }; 241 | 242 | 243 | #endif //INCLUDED_HIS_META_UTILS -------------------------------------------------------------------------------- /spECK/include/multi_arch_build.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define DUAL_BUILD_FUNCTION __host__ __device__ 5 | #else 6 | #define DUAL_BUILD_FUNCTION 7 | #endif 8 | 9 | #ifndef __CUDA_ARCH__ 10 | inline float __uint_as_float(unsigned t) 11 | { 12 | return *reinterpret_cast(&t); 13 | } 14 | #endif -------------------------------------------------------------------------------- /spECK/include/spECKConfig.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "CUDATools/stream.h" 4 | #include "stdio.h" 5 | 6 | namespace spECK { 7 | // get device attributes for best performance and creates cudaStreams 8 | struct spECKConfig { 9 | int sm; 10 | int maxStaticSharedMemoryPerBlock; 11 | int maxDynamicSharedMemoryPerBlock; 12 | std::vector streams; 13 | cudaEvent_t completeStart = 0, completeEnd = 0, individualStart = 0, individualEnd = 0; 14 | 15 | static spECKConfig initialize(int cudaDeviceNumber) { 16 | spECKConfig config; 17 | cudaDeviceProp prop; 18 | cudaGetDeviceProperties(&prop, cudaDeviceNumber); 19 | config.sm = prop.multiProcessorCount; 20 | config.maxStaticSharedMemoryPerBlock = prop.sharedMemPerBlock; 21 | config.maxDynamicSharedMemoryPerBlock = std::max(prop.sharedMemPerBlockOptin, prop.sharedMemPerBlock); 22 | 23 | for (int i = 0; i < 6; i++) { 24 | config.streams.push_back(0); 25 | cudaStreamCreate(&config.streams[i]); 26 | } 27 | cudaEventCreate(&config.completeStart); 28 | cudaEventCreate(&config.completeEnd); 29 | cudaEventCreate(&config.individualStart); 30 | cudaEventCreate(&config.individualEnd); 31 | return config; 32 | } 33 | 34 | void cleanup() { 35 | for (auto s : streams) { 36 | cudaStreamDestroy(s); 37 | } 38 | cudaEventDestroy(completeStart); 39 | cudaEventDestroy(completeEnd); 40 | cudaEventDestroy(individualStart); 41 | cudaEventDestroy(individualEnd); 42 | streams.clear(); 43 | } 44 | 45 | ~spECKConfig() { 46 | // cleanup(); 47 | } 48 | 49 | private: 50 | spECKConfig() { 51 | 52 | } 53 | }; 54 | } -------------------------------------------------------------------------------- /spECK/readme.md: -------------------------------------------------------------------------------- 1 | # Get started 2 | 1 Profile speck 3 | 4 | 1.1 ``` $> make speck ``` 5 | 6 | 1.2 ``` $> ./speck webbase-1M ``` 7 | 8 | 2 Overall performance of speck 9 | 10 | 2.1 ``` $> make reg_speck ``` 11 | 12 | 
2.2 ``` $> ./reg_speck webbase-1M ``` 13 | -------------------------------------------------------------------------------- /spECK/source/COO.cpp: -------------------------------------------------------------------------------- 1 | #include "COO.h" 2 | #include "Vector.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace { 12 | template 13 | struct DataTypeValidator { 14 | static const bool validate(std::string type) { 15 | return false; 16 | } 17 | }; 18 | 19 | template<> 20 | struct DataTypeValidator { 21 | static const bool validate(std::string type) { 22 | return type.compare("real") == 0 || type.compare("integer") == 0 || type.compare("double") == 0; 23 | } 24 | }; 25 | template<> 26 | struct DataTypeValidator { 27 | static const bool validate(std::string type) { 28 | return type.compare("real") == 0 || type.compare("integer") == 0 || type.compare("double") == 0; 29 | } 30 | }; 31 | 32 | template<> 33 | struct DataTypeValidator { 34 | static const bool validate(std::string type) { 35 | return type.compare("integer") == 0; 36 | } 37 | }; 38 | } 39 | 40 | template 41 | void COO::alloc(size_t r, size_t c, size_t n) 42 | { 43 | rows = r; 44 | cols = c; 45 | nnz = n; 46 | 47 | data = std::make_unique(n); 48 | row_ids = std::make_unique(n); 49 | col_ids = std::make_unique(n); 50 | } 51 | 52 | template 53 | COO loadMTX(const char * file) 54 | { 55 | std::ifstream fstream(file); 56 | if (!fstream.is_open()) 57 | throw std::runtime_error(std::string("could not open \"") + file + "\""); 58 | 59 | COO resmatrix; 60 | size_t num_rows, num_columns, num_non_zeroes; 61 | 62 | size_t line_counter = 0; 63 | std::string line; 64 | bool pattern = false; 65 | bool hermitian = false; 66 | // read header; 67 | std::getline(fstream, line); 68 | if (line.compare(0, 32, "%%MatrixMarket matrix coordinate") != 0) 69 | throw std::runtime_error("Can only read MatrixMarket format that is in coordinate form"); 70 | std::istringstream 
iss(line); 71 | std::vector tokens{ std::istream_iterator{iss}, std::istream_iterator{} }; 72 | bool complex = false; 73 | 74 | if (tokens[3] == "pattern") 75 | pattern = true; 76 | else if (tokens[3] == "complex") 77 | complex = true; 78 | else if (DataTypeValidator::validate(tokens[3]) == false) 79 | throw std::runtime_error("MatrixMarket data type does not match matrix format"); 80 | bool symmetric = false; 81 | if (tokens[4].compare("general") == 0) 82 | symmetric = false; 83 | else if (tokens[4].compare("symmetric") == 0) 84 | symmetric = true; 85 | else if (tokens[4].compare("Hermitian") == 0) 86 | hermitian = true; 87 | else 88 | throw std::runtime_error("Can only read MatrixMarket format that is either symmetric, general or hermitian"); 89 | 90 | while (std::getline(fstream, line)) 91 | { 92 | ++line_counter; 93 | if (line[0] == '%') 94 | continue; 95 | std::istringstream liness(line); 96 | liness >> num_rows >> num_columns >> num_non_zeroes; 97 | if (liness.fail()) 98 | throw std::runtime_error(std::string("Failed to read matrix market header from \"") + file + "\""); 99 | //std::cout << "Read matrix header" << std::endl; 100 | //std::cout << "rows: " << rows << " columns: " << columns << " nnz: " << nnz << std::endl; 101 | break; 102 | } 103 | 104 | size_t reserve = num_non_zeroes; 105 | if (symmetric || hermitian) 106 | reserve *= 2; 107 | 108 | resmatrix.alloc(num_rows, num_columns, reserve); 109 | 110 | //read data 111 | size_t read = 0; 112 | while (std::getline(fstream, line)) 113 | { 114 | ++line_counter; 115 | if (line[0] == '%') 116 | continue; 117 | 118 | std::istringstream liness(line); 119 | 120 | 121 | do 122 | { 123 | char ch; 124 | liness.get(ch); 125 | if (!isspace(ch)) 126 | { 127 | liness.putback(ch); 128 | break; 129 | } 130 | 131 | } while (!liness.eof()); 132 | if (liness.eof() || line.length() == 0) 133 | continue; 134 | 135 | uint32_t r, c; 136 | T d; 137 | liness >> r >> c; 138 | if (pattern) 139 | d = 1; 140 | else 141 | liness >> 
d; 142 | if (liness.fail()) 143 | throw std::runtime_error(std::string("Failed to read data at line ") + std::to_string(line_counter) + " from matrix market file \"" + file + "\""); 144 | if (r > num_rows) 145 | throw std::runtime_error(std::string("Row index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); 146 | if (c > num_columns) 147 | throw std::runtime_error(std::string("Column index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); 148 | 149 | resmatrix.row_ids[read] = r - 1; 150 | resmatrix.col_ids[read] = c - 1; 151 | resmatrix.data[read] = d; 152 | ++read; 153 | if ((symmetric || hermitian) && r != c) 154 | { 155 | resmatrix.row_ids[read] = c - 1; 156 | resmatrix.col_ids[read] = r - 1; 157 | resmatrix.data[read] = d; 158 | ++read; 159 | } 160 | } 161 | 162 | resmatrix.nnz = read; 163 | return resmatrix; 164 | } 165 | 166 | template 167 | COO loadCOO(const char * file) 168 | { 169 | return COO(); 170 | } 171 | 172 | template 173 | void storeCOO(const COO& mat, const char * file) 174 | { 175 | 176 | } 177 | 178 | template 179 | void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose) 180 | { 181 | if (transpose && v.size != m.rows) 182 | throw std::runtime_error("SPMV dimensions mismatch"); 183 | if (!transpose && v.size != m.cols) 184 | throw std::runtime_error("SPMV dimensions mismatch"); 185 | 186 | size_t outsize = transpose ? 
m.cols : m.rows; 187 | if (res.size < outsize) 188 | //res.data = std::make_unique(outsize); 189 | res.data = new T [outsize]; 190 | res.size = outsize; 191 | 192 | std::fill(&res.data[0], &res.data[0] + outsize, 0); 193 | 194 | 195 | if(transpose) 196 | for (size_t i = 0; i < m.nnz; ++i) 197 | res.data[m.col_ids[i]] += m.data[i] * v.data[m.row_ids[i]]; 198 | else 199 | for (size_t i = 0; i < m.nnz; ++i) 200 | res.data[m.row_ids[i]] += m.data[i] * v.data[m.col_ids[i]]; 201 | } 202 | 203 | 204 | template void COO::alloc(size_t, size_t, size_t); 205 | template void COO::alloc(size_t, size_t, size_t); 206 | 207 | template COO loadMTX(const char * file); 208 | template COO loadMTX(const char * file); 209 | 210 | template void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose); 211 | template void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose); 212 | -------------------------------------------------------------------------------- /spECK/source/CSR.cpp: -------------------------------------------------------------------------------- 1 | #include "CSR.h" 2 | #include "COO.h" 3 | 4 | #include 5 | typedef unsigned long long int uint64_t; 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace { 17 | template 18 | struct State 19 | { 20 | typedef VALUE_TYPE ValueType; 21 | 22 | ValueType scaling; 23 | bool transpose; 24 | 25 | State() : scaling(1), transpose(false) { } 26 | State(ValueType scaling, bool transpose) : scaling(scaling), transpose(transpose) { } 27 | }; 28 | 29 | struct CSRIOHeader 30 | { 31 | static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' }; 32 | 33 | char magic[sizeof(Magic)]; 34 | uint64_t typesize; 35 | uint64_t compresseddir; 36 | uint64_t indexsize; 37 | uint64_t fixedoffset; 38 | uint64_t offsetsize; 39 | uint64_t num_rows, num_columns; 40 | uint64_t num_non_zeroes; 41 | 42 | CSRIOHeader() = 
default; 43 | 44 | 45 | template 46 | static uint64_t typeSize() 47 | { 48 | return sizeof(T); 49 | } 50 | 51 | template 52 | CSRIOHeader(const CSR& mat) 53 | { 54 | for (size_t i = 0; i < sizeof(Magic); ++i) 55 | magic[i] = Magic[i]; 56 | typesize = typeSize(); 57 | compresseddir = 0; 58 | indexsize = typeSize(); 59 | fixedoffset = 0; 60 | offsetsize = typeSize(); 61 | 62 | num_rows = mat.rows; 63 | num_columns = mat.cols; 64 | num_non_zeroes = mat.nnz; 65 | } 66 | 67 | bool checkMagic() const 68 | { 69 | for (size_t i = 0; i < sizeof(Magic); ++i) 70 | if (magic[i] != Magic[i]) 71 | return false; 72 | return true; 73 | } 74 | }; 75 | constexpr char CSRIOHeader::Magic[]; 76 | } 77 | 78 | template 79 | void CSR::alloc(size_t r, size_t c, size_t n) 80 | { 81 | rows = r; 82 | cols = c; 83 | nnz = n; 84 | 85 | //data = std::make_unique(n); 86 | //col_ids = std::make_unique(n); 87 | //row_offsets = std::make_unique(r+1); 88 | data = new T [n]; 89 | col_ids = new int [n]; 90 | row_offsets = new int [r+1]; 91 | } 92 | 93 | template 94 | CSR loadCSR(const char * file) 95 | { 96 | std::ifstream fstream(file, std::fstream::binary); 97 | if (!fstream.is_open()) 98 | throw std::runtime_error(std::string("could not open \"") + file + "\""); 99 | 100 | CSRIOHeader header; 101 | State state; 102 | fstream.read(reinterpret_cast(&header), sizeof(CSRIOHeader)); 103 | if (!fstream.good()) 104 | throw std::runtime_error("Could not read CSR header"); 105 | if (!header.checkMagic()) 106 | throw std::runtime_error("File does not appear to be a CSR Matrix"); 107 | 108 | fstream.read(reinterpret_cast(&state), sizeof(state)); 109 | if (!fstream.good()) 110 | throw std::runtime_error("Could not read CompressedMatrix state"); 111 | if (header.typesize != CSRIOHeader::typeSize()) 112 | throw std::runtime_error("File does not contain a CSR matrix with matching type"); 113 | 114 | CSR res; 115 | res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes); 116 | 117 | 
fstream.read(reinterpret_cast(&res.data[0]), res.nnz * sizeof(T)); 118 | fstream.read(reinterpret_cast(&res.col_ids[0]), res.nnz * sizeof(unsigned int)); 119 | fstream.read(reinterpret_cast(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int)); 120 | 121 | if (!fstream.good()) 122 | throw std::runtime_error("Could not read CSR matrix data"); 123 | 124 | return res; 125 | } 126 | 127 | template 128 | void storeCSR(const CSR& mat, const char * file) 129 | { 130 | std::ofstream fstream(file, std::fstream::binary); 131 | if (!fstream.is_open()) 132 | throw std::runtime_error(std::string("could not open \"") + file + "\""); 133 | 134 | CSRIOHeader header(mat); 135 | State state; 136 | fstream.write(reinterpret_cast(&header), sizeof(CSRIOHeader)); 137 | fstream.write(reinterpret_cast(&state), sizeof(state)); 138 | fstream.write(reinterpret_cast(&mat.data[0]), mat.nnz * sizeof(T)); 139 | fstream.write(reinterpret_cast(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int)); 140 | fstream.write(reinterpret_cast(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int)); 141 | 142 | } 143 | 144 | template 145 | void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose) 146 | { 147 | if (transpose && v.size != m.rows) 148 | throw std::runtime_error("SPMV dimensions mismatch"); 149 | if (!transpose && v.size != m.cols) 150 | throw std::runtime_error("SPMV dimensions mismatch"); 151 | 152 | size_t outsize = transpose ? 
m.cols : m.rows; 153 | if (res.size < outsize) 154 | //res.data = std::make_unique(outsize); 155 | res.data = new T [outsize]; 156 | res.size = outsize; 157 | 158 | if (transpose) 159 | { 160 | std::fill(&res.data[0], &res.data[0] + m.cols, 0); 161 | for (size_t i = 0; i < m.rows; ++i) 162 | { 163 | for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) 164 | res.data[m.col_ids[o]] += m.data[o] * v.data[i]; 165 | } 166 | } 167 | else 168 | { 169 | for (size_t i = 0; i < m.rows; ++i) 170 | { 171 | T val = 0; 172 | for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) 173 | val += m.data[o] * v.data[m.col_ids[o]]; 174 | res.data[i] = val; 175 | } 176 | } 177 | } 178 | 179 | template 180 | void convert(CSR& res, const COO& coo) 181 | { 182 | struct Entry 183 | { 184 | unsigned int r, c; 185 | T v; 186 | bool operator < (const Entry& other) 187 | { 188 | if (r != other.r) 189 | return r < other.r; 190 | return c < other.c; 191 | } 192 | }; 193 | 194 | std::vector entries; 195 | //std::cout << "coo.nnz" << coo.nnz << std::endl; 196 | entries.reserve(coo.nnz); 197 | for (size_t i = 0; i < coo.nnz; ++i) 198 | entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] }); 199 | std::sort(std::begin(entries), std::end(entries)); 200 | 201 | res.alloc(coo.rows, coo.cols, coo.nnz); 202 | std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0); 203 | for (size_t i = 0; i < coo.nnz; ++i) 204 | { 205 | res.data[i] = entries[i].v; 206 | res.col_ids[i] = entries[i].c; 207 | ++res.row_offsets[entries[i].r]; 208 | } 209 | 210 | unsigned int off = 0; 211 | for (size_t i = 0; i < coo.rows; ++i) 212 | { 213 | unsigned int n = off + res.row_offsets[i]; 214 | res.row_offsets[i] = off; 215 | off = n; 216 | } 217 | res.row_offsets[coo.rows] = off; 218 | } 219 | 220 | template 221 | CSR::CSR(const CSR &A, int r, int c, int row_start, int col_start){ 222 | assert(r + row_start <= A.rows && "matrix subsect error M"); 223 | assert(c + col_start <= 
A.cols && "matrix subsect error N"); 224 | int row_end = row_start + r; 225 | int col_end = col_start + c; 226 | rows = r; 227 | cols = c; 228 | int *row_size = new int [rows]; 229 | memset(row_size, 0, rows*sizeof(int)); 230 | for(int i = row_start; i < row_end; i++){ 231 | for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){ 232 | if(A.col_ids[j]>= col_start && A.col_ids[j] < col_end){ 233 | row_size[i - row_start]++; 234 | } 235 | } 236 | } 237 | int nnz = 0; 238 | for(int i =0; i < rows; i++){ 239 | nnz += row_size[i]; 240 | } 241 | alloc(rows, cols, nnz); 242 | 243 | row_offsets[0] = 0; 244 | for(int i = 0; i < rows; i++){ 245 | row_offsets[i+1] = row_offsets[i] + row_size[i]; 246 | } 247 | delete [] row_size; 248 | 249 | for(int i = row_start; i < row_end; i++){ 250 | int jj = row_offsets[i - row_start]; 251 | for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){ 252 | if(A.col_ids[j]>= col_start && A.col_ids[j] < col_end){ 253 | col_ids[jj] = A.col_ids[j] - col_start; 254 | data[jj++] = A.data[j]; 255 | } 256 | } 257 | } 258 | 259 | } 260 | template 261 | CSR& CSR::operator=(const CSR& src){ 262 | alloc(src.rows, src.cols, src.nnz); 263 | rows = src.rows; nnz = src.nnz; cols = src.cols; 264 | memcpy(data, src.data, nnz * sizeof(T)); 265 | memcpy(col_ids, src.col_ids, nnz * sizeof(int)); 266 | memcpy(row_offsets, src.row_offsets, (rows + 1) * sizeof(int)); 267 | return *this; 268 | } 269 | 270 | template class CSR; 271 | template class CSR; 272 | 273 | //template void CSR::alloc(size_t, size_t, size_t); 274 | //template void CSR::alloc(size_t, size_t, size_t); 275 | 276 | template CSR loadCSR(const char * file); 277 | template CSR loadCSR(const char * file); 278 | 279 | template void storeCSR(const CSR& mat, const char * file); 280 | template void storeCSR(const CSR& mat, const char * file); 281 | 282 | template void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose); 283 | template void spmv(DenseVector& res, const CSR& 
m, const DenseVector& v, bool transpose); 284 | 285 | 286 | template void convert(CSR& res, const COO& coo); 287 | template void convert(CSR& res, const COO& coo); 288 | -------------------------------------------------------------------------------- /spECK/source/Config.cpp: -------------------------------------------------------------------------------- 1 | #include "Config.h" 2 | Config *Config::_instance = nullptr; 3 | 4 | void Config::init(std::string path) 5 | { 6 | _instance = new Config(path); 7 | } 8 | 9 | void Config::init() 10 | { 11 | _instance = new Config(); 12 | } 13 | 14 | int Config::getInt(Key key, int fallback) 15 | { 16 | if (Instance().overrides.find(key) != Instance().overrides.end()) 17 | return Instance().overrides[key]; 18 | 19 | return Instance().reader.GetInteger("", Instance().keyToString[key], fallback); 20 | } 21 | 22 | int Config::setInt(Key key, int newVal) 23 | { 24 | return Instance().overrides[key] = newVal; 25 | } 26 | 27 | string Config::getString(Key key, std::string fallback) 28 | { 29 | return Instance().reader.Get("", Instance().keyToString[key], fallback); 30 | } 31 | 32 | bool Config::getBool(Key key, bool fallback) 33 | { 34 | return Instance().reader.GetBoolean("", Instance().keyToString[key], fallback); 35 | } 36 | 37 | float Config::getFloat(Key key, float fallback) 38 | { 39 | return (float) Instance().reader.GetReal("", Instance().keyToString[key], fallback); 40 | } -------------------------------------------------------------------------------- /spECK/source/DataLoader.cpp: -------------------------------------------------------------------------------- 1 | #include "DataLoader.h" 2 | 3 | #include 4 | #include "COO.h" 5 | #include 6 | 7 | template 8 | std::string typeExtension(); 9 | template<> 10 | std::string typeExtension() 11 | { 12 | return std::string(""); 13 | } 14 | template<> 15 | std::string typeExtension() 16 | { 17 | return std::string("d_"); 18 | } 19 | 20 | template class DataLoader; 21 | template 
class DataLoader; 22 | 23 | template 24 | DataLoader::DataLoader(std::string path, std::string path2) : matrices() 25 | { 26 | std::string csrPath = path + typeExtension() + ".hicsr"; 27 | 28 | try 29 | { 30 | //std::cout << "trying to load csr file \"" << csrPath << "\"\n"; 31 | matrices.cpuA = loadCSR(csrPath.c_str()); 32 | //std::cout << "successfully loaded: \"" << csrPath << "\"\n"; 33 | } 34 | catch (std::exception& ex) 35 | { 36 | //std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; 37 | try 38 | { 39 | //std::cout << "trying to load mtx file \"" << path << "\"\n"; 40 | COO cooMat = loadMTX(path.c_str()); 41 | convert(matrices.cpuA, cooMat); 42 | //std::cout << "successfully loaded and converted: \"" << csrPath << "\"\n"; 43 | } 44 | catch (std::exception& ex) 45 | { 46 | std::cout << ex.what() << std::endl; 47 | std::cout << "could not load mtx file: \"" << path << "\"\n"; 48 | throw "could not load mtx file"; 49 | } 50 | 51 | try 52 | { 53 | //std::cout << "write csr file for future use in" << csrPath.c_str() << "\n"; 54 | //storeCSR(matrices.cpuA, csrPath.c_str()); 55 | } 56 | catch (std::exception& ex) 57 | { 58 | std::cout << ex.what() << std::endl; 59 | } 60 | } 61 | 62 | //cuSPARSE::CuSparseTest cuSparse; 63 | 64 | //calculate the transpose if matrix is not square 65 | if(path == path2){ 66 | convert(matrices.cpuB, matrices.cpuA, 0); 67 | } 68 | else{ 69 | try 70 | { 71 | //std::cout << "trying to load mtx file \"" << path << "\"\n"; 72 | COO cooMat = loadMTX(path2.c_str()); 73 | convert(matrices.cpuB, cooMat); 74 | //std::cout << "successfully loaded and converted: \"" << csrPath << "\"\n"; 75 | } 76 | catch (std::exception& ex) 77 | { 78 | std::cout << ex.what() << std::endl; 79 | std::cout << "could not load mtx file: \"" << path << "\"\n"; 80 | throw "could not load mtx file"; 81 | } 82 | if(matrices.cpuA.cols == matrices.cpuB.rows){ 83 | // do nothing 84 | } 85 | else if(matrices.cpuA.cols < matrices.cpuB.rows){ 86 | CSR 
tmp(matrices.cpuB, matrices.cpuA.cols, matrices.cpuB.cols, 0, 0); 87 | matrices.cpuB = tmp; 88 | } 89 | else{ 90 | CSR tmp(matrices.cpuA, matrices.cpuA.rows, matrices.cpuB.rows, 0, 0); 91 | matrices.cpuA = tmp; 92 | } 93 | } 94 | 95 | //if (matrices.gpuA.rows != matrices.gpuA.cols) 96 | //{ 97 | // cuSparse.Transpose(matrices.gpuA, matrices.gpuB); 98 | // convert(matrices.cpuB, matrices.gpuB); 99 | //} 100 | //else 101 | //{ 102 | // convert(matrices.gpuB, matrices.cpuA, 0); 103 | // convert(matrices.cpuB, matrices.cpuA, 0); 104 | //} 105 | convert(matrices.gpuA, matrices.cpuA, 0); 106 | convert(matrices.gpuB, matrices.cpuB, 0); 107 | } 108 | -------------------------------------------------------------------------------- /spECK/source/Executor.cpp: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "Executor.h" 3 | #include "Multiply.h" 4 | #include "DataLoader.h" 5 | #include 6 | #include "Config.h" 7 | #include "Compare.h" 8 | #include 9 | #include "Timings.h" 10 | #include "spECKConfig.h" 11 | #include "common.h" 12 | #include "cuda_common.h" 13 | 14 | template 15 | long compt_flop(const CSR &A, const CSR &B){ 16 | int M = A.rows; 17 | long total_flop = 0; 18 | for(int i = 0; i < M; i++){ 19 | for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){ 20 | total_flop += B.row_offsets[A.col_ids[j]+1] - B.row_offsets[A.col_ids[j]]; 21 | } 22 | } 23 | return total_flop; 24 | } 25 | 26 | 27 | template 28 | int Executor::run() 29 | { 30 | iterationsWarmup = Config::getInt(Config::IterationsWarmUp, 1); 31 | iterationsExecution = Config::getInt(Config::IterationsExecution, 10); 32 | //iterationsWarmup = 1; 33 | //iterationsExecution = 1; 34 | DataLoader data(runConfig.filePath, runConfig.filePath2); 35 | //std::cout << runConfig.filePath << std::endl; 36 | auto& matrices = data.matrices; 37 | //std::cout << "Matrix: " << matrices.cpuA.rows << "x" << matrices.cpuA.cols << ": " << 
matrices.cpuA.nnz << " nonzeros\n"; 38 | 39 | long total_flops = compt_flop(matrices.cpuA, matrices.cpuB); 40 | 41 | dCSR dCsrHiRes, dCsrReference; 42 | Timings timings, warmupTimings, benchTimings; 43 | //bool measureAll = Config::getBool(Config::TrackIndividualTimes, false); 44 | bool measureAll = false; 45 | bool measureCompleteTimes = Config::getBool(Config::TrackCompleteTimes, true); 46 | auto config = spECK::spECKConfig::initialize(0); 47 | 48 | //bool compareData = false; 49 | bool compareData = true; 50 | 51 | if(Config::getBool(Config::CompareResult)) 52 | { 53 | unsigned cuSubdiv_nnz = 0; 54 | cuSPARSE::CuSparseTest cusparse; 55 | cusparse.Multiply(matrices.gpuA, matrices.gpuB, dCsrReference, cuSubdiv_nnz); 56 | 57 | if(!compareData) 58 | { 59 | cudaFree(dCsrReference.data); 60 | dCsrReference.data = nullptr; 61 | } 62 | } 63 | 64 | // Warmup iterations for multiplication 65 | for (int i = 0; i < iterationsWarmup; ++i) 66 | { 67 | timings = Timings(); 68 | timings.measureAll = measureAll; 69 | timings.measureCompleteTime = measureCompleteTimes; 70 | spECK::MultiplyspECK(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings); 71 | warmupTimings += timings; 72 | 73 | if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult)) 74 | { 75 | printf("compare data \n"); 76 | //if (!spECK::Compare(dCsrReference, dCsrHiRes, false)) 77 | if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData)) 78 | printf("Error: Matrix incorrect\n"); 79 | } 80 | dCsrHiRes.reset(); 81 | } 82 | 83 | // Multiplication 84 | for (int i = 0; i < iterationsExecution; ++i) 85 | { 86 | timings = Timings(); 87 | timings.measureAll = measureAll; 88 | timings.measureCompleteTime = measureCompleteTimes; 89 | spECK::MultiplyspECK 90 | (matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings); 91 | benchTimings += timings; 92 | 93 | // if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult)) 94 | 
// { 95 | // if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData)) 96 | // printf("Error: Matrix incorrect\n"); 97 | // } 98 | dCsrHiRes.reset(); 99 | } 100 | 101 | benchTimings /= iterationsExecution; 102 | benchTimings.reg_print(total_flops * 2); 103 | 104 | return 0; 105 | } 106 | 107 | template 108 | int Executor::run_detail() 109 | { 110 | iterationsWarmup = Config::getInt(Config::IterationsWarmUp, 1); 111 | iterationsExecution = Config::getInt(Config::IterationsExecution, 10); 112 | //iterationsWarmup = 1; 113 | //iterationsExecution = 1; 114 | DataLoader data(runConfig.filePath, runConfig.filePath2); 115 | //std::cout << runConfig.filePath << std::endl; 116 | auto& matrices = data.matrices; 117 | //std::cout << "Matrix: " << matrices.cpuA.rows << "x" << matrices.cpuA.cols << ": " << matrices.cpuA.nnz << " nonzeros\n"; 118 | 119 | long total_flops = compt_flop(matrices.cpuA, matrices.cpuB); 120 | 121 | dCSR dCsrHiRes, dCsrReference; 122 | Timings timings, warmupTimings, benchTimings; 123 | bool measureAll = true; 124 | bool measureCompleteTimes = Config::getBool(Config::TrackCompleteTimes, true); 125 | auto config = spECK::spECKConfig::initialize(0); 126 | 127 | //bool compareData = false; 128 | bool compareData = true; 129 | 130 | if(Config::getBool(Config::CompareResult)) 131 | { 132 | unsigned cuSubdiv_nnz = 0; 133 | cuSPARSE::CuSparseTest cusparse; 134 | cusparse.Multiply(matrices.gpuA, matrices.gpuB, dCsrReference, cuSubdiv_nnz); 135 | 136 | if(!compareData) 137 | { 138 | cudaFree(dCsrReference.data); 139 | dCsrReference.data = nullptr; 140 | } 141 | } 142 | 143 | // Warmup iterations for multiplication 144 | for (int i = 0; i < iterationsWarmup; ++i) 145 | { 146 | timings = Timings(); 147 | timings.measureAll = measureAll; 148 | timings.measureCompleteTime = measureCompleteTimes; 149 | spECK::MultiplyspECK(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings); 150 | warmupTimings += timings; 151 | 152 | if (dCsrHiRes.data != nullptr && 
dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
        {
            printf("compare data \n");
            //if (!spECK::Compare(dCsrReference, dCsrHiRes, false))
            if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
                printf("Error: Matrix incorrect\n");
        }
        dCsrHiRes.reset();
    }

    // Multiplication
    for (int i = 0; i < iterationsExecution; ++i)
    {
        timings = Timings();
        timings.measureAll = measureAll;
        timings.measureCompleteTime = measureCompleteTimes;
        spECK::MultiplyspECK
            (matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings);
        benchTimings += timings;

        // if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
        // {
        //     if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
        //         printf("Error: Matrix incorrect\n");
        // }
        dCsrHiRes.reset();
    }

    benchTimings /= iterationsExecution;
    benchTimings.print(total_flops * 2);

    return 0;
}

// NOTE(review): the instantiated value type is not visible here - confirm.
template class Executor;
--------------------------------------------------------------------------------
/spECK/source/GPU/Compare.cu:
--------------------------------------------------------------------------------
// Global includes
#include  // NOTE(review): header name is not visible here - confirm
#include  // NOTE(review): header name is not visible here - confirm
typedef unsigned long long int uint64_t;

// Local includes
#include "Compare.h"
#include "common.h"

#define VERIFICATION_TEXT

// Row-wise comparison of two CSR matrices; one thread handles one row.
// Expects a 1D launch with at least in_rows threads in total.
// *verification is set to 1 on the first structural mismatch of a row (row
// length or column index) and on every value whose relative deviation from
// the compared value exceeds 1%. Compared values equal to 0 are skipped by the
// value check. `in_cols` and `epsilon` are accepted but not used.
template <typename DataType>
__global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values,
    const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= in_rows)
        return;

    uint32_t ref_offset = reference_offset[tid];
    uint32_t comp_offset = compare_offset[tid];
    uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset;
    uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset;

    if (ref_number_entries != comp_number_entries)
    {
#ifdef VERIFICATION_TEXT
        printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries);
#endif
        *verification = 1;
        return;
    }

    uint32_t num_entries = min(ref_number_entries, comp_number_entries);

    for (uint32_t i = 0; i < num_entries; ++i)
    {
        if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i])
        {
#ifdef VERIFICATION_TEXT
            printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries);
#endif
            *verification = 1;
            return;
        }
        if (compare_data)
        {
            if (compare_values[comp_offset + i] != 0 && std::abs(reference_values[ref_offset + i] / compare_values[comp_offset + i] - 1) > 0.01)
            {
#ifdef VERIFICATION_TEXT
                printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u - col %u\n", tid, reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries, reference_indices[ref_offset + i]);
#endif
                *verification = 1;
                // keep scanning so every deviating value of the row is reported
            }
        }
    }

    return;
}

namespace spECK {
    // Compares compare_mat against reference_mat on the device.
    // Returns true only when d_compare reported no mismatch AND every CUDA
    // call succeeded; any allocation, launch or copy failure yields false.
    // compare_data selects whether values are checked in addition to the
    // sparsity pattern.
    template <typename DataType>
    bool Compare(const dCSR<DataType>& reference_mat, const dCSR<DataType>& compare_mat, bool compare_data)
    {
        int blockSize(256);
        int gridSize(divup(reference_mat.rows + 1, blockSize));
        double epsilon = 0.1;

        // Start from "mismatch" so a failed copy can never read as success.
        uint32_t h_verification = 1;
        uint32_t* verification = nullptr;
        if (cudaMalloc(&verification, sizeof(uint32_t)) != cudaSuccess)
            return false;

        cudaError_t status = cudaMemset(verification, 0, sizeof(uint32_t));
        if (status == cudaSuccess)
        {
            d_compare<DataType><<<gridSize, blockSize>>>(reference_mat.rows, reference_mat.cols,
                reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data,
                compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data,
                compare_data, epsilon, verification);
            status = cudaGetLastError();
        }
        if (status == cudaSuccess)
            status = cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost);

        // The flag buffer used to be leaked on every call.
        cudaFree(verification);
        return status == cudaSuccess && h_verification == 0;
    }

    // NOTE(review): the value types of these two explicit instantiations are
    // not visible here - confirm.
    template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data);
    template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data);
}
--------------------------------------------------------------------------------
/spECK/source/GPU/Transpose.cu:
--------------------------------------------------------------------------------
// Global includes
#include  // NOTE(review): header name is not visible here - confirm
#include  // NOTE(review): header name is not visible here - confirm
typedef unsigned long long int uint64_t;
#include "device_launch_parameters.h"

// Local includes
#include "Transpose.h"
#include "common.h"

// Counts, per column of the input, how many entries it holds: one thread per
// input row atomically increments output_offset[column] for each entry.
// Expects a 1D launch with at least in_rows threads and a zeroed output_offset.
__global__ void d_calulateTransposeDistribution(int in_rows, int in_cols,
    const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, uint32_t* output_offset)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= in_rows)
        return;

    uint32_t offset = input_offset[tid];
    uint32_t number_entries = input_offset[tid + 1] - offset;

    for (uint32_t i = 0; i < number_entries; ++i)
    {
        atomicAdd(output_offset + input_indices[offset + i], 1);
    }

    return;
}

template <typename DataType>
__global__ void d_findPosition(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices,
    const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if
(tid >= in_rows)
        return;

    uint32_t offset = input_offset[tid];
    uint32_t number_entries = input_offset[tid + 1] - offset;

    for (uint32_t i = 0; i < number_entries; ++i)
    {
        uint32_t row_index = input_indices[offset + i];
        uint32_t insert_position = atomicAdd(helper + row_index, 1);
        uint32_t o_offset = output_offset[row_index];
        helper_position[o_offset + insert_position] = tid;
    }

    return;
}

// Writes the transposed entries. For each entry (tid, c) of the input, the
// slot inside output row c is the number of recorded source rows of that
// output row (helper_position) that are smaller than tid, which keeps the
// column indices of every output row in ascending order.
// Expects a 1D launch with at least in_rows threads, run after d_findPosition.
template <typename DataType>
__global__ void d_writeTranspose(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices,
    const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= in_rows)
        return;

    const uint32_t row_begin = input_offset[tid];
    const uint32_t row_length = input_offset[tid + 1] - row_begin;

    for (uint32_t e = 0; e < row_length; ++e)
    {
        const uint32_t out_row = input_indices[row_begin + e];
        const uint32_t out_row_length = helper[out_row];
        const uint32_t out_begin = output_offset[out_row];

        // Rank of this source row among all source rows feeding out_row.
        uint32_t rank = 0;
        for (uint32_t j = 0; j < out_row_length; ++j)
        {
            if (helper_position[out_begin + j] < tid)
                ++rank;
        }

        output_indices[out_begin + rank] = tid;
        output_values[out_begin + rank] = input_values[row_begin + e];
    }
}


namespace spECK {
    // Builds the transpose of matIn in matTransposeOut (re-allocated here) in
    // four steps: per-column counting, exclusive scan into row offsets,
    // recording the source rows per output row, and the sorted write-out.
    template <typename DataType>
    void Transpose(const dCSR<DataType>& matIn, dCSR<DataType>& matTransposeOut)
    {
        int blockSize(256);
        int gridSize(divup(matIn.rows + 1, blockSize));

        matTransposeOut.alloc(matIn.cols, matIn.rows, matIn.nnz);

        // Scratch buffers: per-output-row fill counters and source-row ids.
        uint32_t* d_helper_pointer, *d_helper_position;
        const size_t offsetBytes = sizeof(uint32_t) * (matTransposeOut.rows + 1);
        cudaMalloc(&d_helper_pointer, offsetBytes);
        cudaMalloc(&d_helper_position, sizeof(uint32_t) * (matTransposeOut.nnz));
        cudaMemset(d_helper_pointer, 0, offsetBytes);
        cudaMemset(matTransposeOut.row_offsets, 0, (matTransposeOut.rows + 1) * sizeof(uint32_t));

        // Step 1: entries per output row.
        d_calulateTransposeDistribution<<<gridSize, blockSize>>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matTransposeOut.row_offsets);

        // Step 2: counts -> offsets.
        thrust::device_ptr<uint32_t> th_offset_vector(matTransposeOut.row_offsets);
        thrust::exclusive_scan(th_offset_vector, th_offset_vector + matTransposeOut.rows + 1, th_offset_vector);

        // Step 3: remember which source rows land in which output row.
        d_findPosition<<<gridSize, blockSize>>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position);

        // Step 4: write indices and values in sorted order.
        d_writeTranspose<<<gridSize, blockSize>>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position);

        cudaFree(d_helper_pointer);
        cudaFree(d_helper_position);
    }

    // NOTE(review): the value types of these two explicit instantiations are
    // not visible here - confirm.
    template void Transpose(const dCSR& matIn, dCSR& matTransposeOut);
    template void Transpose(const dCSR& matIn, dCSR& matTransposeOut);
}
--------------------------------------------------------------------------------
/spECK/source/GPU/common.cu:
--------------------------------------------------------------------------------
#include "common.cuh"
--------------------------------------------------------------------------------
/spECK/source/GPU/memory.cpp:
--------------------------------------------------------------------------------
#include  // NOTE(review): header name is not visible here - confirm
#include  // NOTE(review): header name is not visible here - confirm
#include  // NOTE(review): header name is not visible here - confirm
#include  // NOTE(review): header name is not visible here - confirm


namespace CU
8 | { 9 | unique_ptr allocMemory(std::size_t size) 10 | { 11 | CUdeviceptr ptr; 12 | cudaMalloc(reinterpret_cast(&ptr), size); 13 | return unique_ptr(ptr); 14 | } 15 | 16 | unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size) 17 | { 18 | CUdeviceptr ptr; 19 | cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); 20 | return unique_ptr(ptr); 21 | } 22 | 23 | pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size) 24 | { 25 | CUdeviceptr ptr; 26 | std::size_t pitch; 27 | cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); 28 | return pitched_memory(unique_ptr(ptr), pitch); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spECK/source/GPU/profiler.cu: -------------------------------------------------------------------------------- 1 | #include "GPU/profiler.cuh" 2 | 3 | #define CHECK_CU_ERROR(err, cufunc) \ 4 | if (err != CUDA_SUCCESS) \ 5 | { \ 6 | printf ("%s:%d: error %d for CUDA Driver API function '%s'\n", \ 7 | __FILE__, __LINE__, err, cufunc); \ 8 | exit(-1); \ 9 | } 10 | 11 | #define CHECK_CUPTI_ERROR(err, cuptifunc) \ 12 | if (err != CUPTI_SUCCESS) \ 13 | { \ 14 | const char *errstr; \ 15 | cuptiGetResultString(err, &errstr); \ 16 | printf ("%s:%d:Error %s for CUPTI API function '%s'.\n", \ 17 | __FILE__, __LINE__, errstr, cuptifunc); \ 18 | exit(-1); \ 19 | } 20 | 21 | 22 | // void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain, 23 | // CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo) 24 | // { 25 | // static int memTransCount = 0; 26 | // uint64_t startTimestamp; 27 | // uint64_t endTimestamp; 28 | // RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata; 29 | // CUptiResult cuptiErr; 30 | 31 | // // Data is collected only for the following API 32 | // if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) || 33 | // 
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000) || 34 | // (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) || 35 | // (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)) { 36 | 37 | // // Set pointer depending on API 38 | // if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) || 39 | // (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)) 40 | // { 41 | // traceData = traceData + KERNEL; 42 | // } 43 | // else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) 44 | // traceData = traceData + THREAD_SYNC; 45 | // else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) 46 | // traceData = traceData + MEMCPY_H2D1 + memTransCount; 47 | // size_t freeMem = 0, totalMem = 0; 48 | // cudaMemGetInfo(&freeMem, &totalMem); 49 | // traceData->currentMemoryUsage = totalMem - freeMem; 50 | 51 | // if (cbInfo->callbackSite == CUPTI_API_ENTER) { 52 | // // for a kernel launch report the kernel name, otherwise use the API 53 | // // function name. 54 | // if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 || 55 | // cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000) 56 | // { 57 | // traceData->functionName = cbInfo->symbolName; 58 | // } 59 | // else { 60 | // traceData->functionName = cbInfo->functionName; 61 | // } 62 | 63 | // // Store parameters passed to cudaMemcpy 64 | // if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) { 65 | // traceData->memcpy_bytes = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->count; 66 | // traceData->memcpy_kind = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->kind; 67 | // } 68 | 69 | // // Collect timestamp for API start 70 | // cuptiErr = cuptiDeviceGetTimestamp(cbInfo->context, &startTimestamp); 71 | // CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp"); 72 | 73 | // traceData->startTimestamp = startTimestamp; 74 | // } 75 | 76 | // if (cbInfo->callbackSite == CUPTI_API_EXIT) { 77 | // // Collect timestamp for API exit 78 | // cuptiErr = 
cuptiDeviceGetTimestamp(cbInfo->context, &endTimestamp); 79 | // CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp"); 80 | 81 | // traceData->endTimestamp = endTimestamp; 82 | 83 | // // Advance to the next memory transfer operation 84 | // if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) { 85 | // memTransCount++; 86 | // } 87 | // } 88 | // } 89 | // } 90 | 91 | void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain, 92 | CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo) 93 | { 94 | if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020) 95 | return; 96 | 97 | size_t freeMem = 0, totalMem = 0; 98 | cudaMemGetInfo(&freeMem, &totalMem); 99 | 100 | RuntimeApiTrace_t traceData; 101 | traceData.functionName = cbInfo->functionName; 102 | traceData.currentMemoryUsage = totalMem - freeMem; 103 | 104 | auto &records = *((std::vector *) userdata); 105 | records.push_back(traceData); 106 | 107 | 108 | // printf("current usage=%llu. Entry nr=%llu\n", traceData.currentMemoryUsage, (long long unsigned int) records.size()); 109 | } 110 | 111 | void CuProfiler::initialize(bool subtractCurrentMem) { 112 | CUcontext context = 0; 113 | CUdevice device = 0; 114 | CUresult cuerr; 115 | CUptiResult cuptierr; 116 | 117 | startMem = 0; 118 | if(subtractCurrentMem) { 119 | size_t freeMem = 0, totalMem = 0; 120 | cudaMemGetInfo(&freeMem, &totalMem); 121 | startMem = totalMem - freeMem; 122 | } 123 | // RuntimeApiTrace_t trace[LAUNCH_LAST]; 124 | 125 | // cuerr = cuInit(0); 126 | // CHECK_CU_ERROR(cuerr, "cuInit"); 127 | 128 | // cuerr = cuCtxCreate(&context, 0, device); 129 | // cuerr = cuCtxGetCurrent(&context); 130 | // CHECK_CU_ERROR(cuerr, "cuCtxCreate"); 131 | 132 | cuptierr = cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getTimestampCallback, &records); 133 | CHECK_CUPTI_ERROR(cuptierr, "cuptiSubscribe"); 134 | 135 | cuptierr = cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API); 136 | CHECK_CUPTI_ERROR(cuptierr, 
"cuptiEnableDomain"); 137 | 138 | initialized = true; 139 | } 140 | 141 | void CuProfiler::finalize() { 142 | if (!initialized) 143 | return; 144 | 145 | for (auto& entry : records) { 146 | if (entry.currentMemoryUsage < startMem) 147 | startMem = entry.currentMemoryUsage; 148 | } 149 | 150 | for (auto& entry : records) { 151 | entry.currentMemoryUsage -= startMem; 152 | } 153 | 154 | // displayTimestamps(); 155 | 156 | CUptiResult cuptierr = cuptiUnsubscribe(subscriber); 157 | CHECK_CUPTI_ERROR(cuptierr, "cuptiUnsubscribe"); 158 | 159 | // cleanUp(h_A, h_B, h_C, d_A, d_B, d_C); 160 | cudaDeviceSynchronize(); 161 | } 162 | 163 | 164 | 165 | void CuProfiler::displayTimestamps() 166 | { 167 | for (auto entry : records) { 168 | printf("MemUsage:%llu\n", entry.currentMemoryUsage); 169 | } 170 | // Calculate timestamp of kernel based on timestamp from 171 | // cudaDeviceSynchronize() call 172 | // trace[KERNEL].endTimestamp = trace[THREAD_SYNC].endTimestamp; 173 | 174 | // printf("startTimeStamp/gpuTime reported in nano-seconds\n\n"); 175 | // printf("Name\t\tStart Time\t\tGPU Time\tBytes\tKind\tcurrentMemUsage\n"); 176 | // printf("%s\t%llu\t%llu\t\t%llu\t%s\t\%llu\n", trace[MEMCPY_H2D1].functionName, 177 | // (unsigned long long)trace[MEMCPY_H2D1].startTimestamp, 178 | // (unsigned long long)trace[MEMCPY_H2D1].endTimestamp - trace[MEMCPY_H2D1].startTimestamp, 179 | // (unsigned long long)trace[MEMCPY_H2D1].memcpy_bytes, 180 | // memcpyKindStr(trace[MEMCPY_H2D1].memcpy_kind), 181 | // (unsigned long long)trace[MEMCPY_H2D1].currentMemoryUsage); 182 | // printf("%s\t%llu\t%llu\t\t%llu\t%s\t\%llu\n", trace[MEMCPY_H2D2].functionName, 183 | // (unsigned long long)trace[MEMCPY_H2D2].startTimestamp, 184 | // (unsigned long long)trace[MEMCPY_H2D2].endTimestamp - trace[MEMCPY_H2D2].startTimestamp, 185 | // (unsigned long long)trace[MEMCPY_H2D2].memcpy_bytes, 186 | // memcpyKindStr(trace[MEMCPY_H2D2].memcpy_kind), 187 | // (unsigned long 
long)trace[MEMCPY_H2D1].currentMemoryUsage); 188 | // printf("%s\t%llu\t%llu\t\tNA\tNA\t\%llu\n", trace[KERNEL].functionName, 189 | // (unsigned long long)trace[KERNEL].startTimestamp, 190 | // (unsigned long long)trace[KERNEL].endTimestamp - trace[KERNEL].startTimestamp, 191 | // (unsigned long long)trace[MEMCPY_H2D1].currentMemoryUsage); 192 | // printf("%s\t%llu\t%llu\t\t%llu\t%s\t\%llu\n", trace[MEMCPY_D2H].functionName, 193 | // (unsigned long long)trace[MEMCPY_D2H].startTimestamp, 194 | // (unsigned long long)trace[MEMCPY_D2H].endTimestamp - trace[MEMCPY_D2H].startTimestamp, 195 | // (unsigned long long)trace[MEMCPY_D2H].memcpy_bytes, 196 | // memcpyKindStr(trace[MEMCPY_D2H].memcpy_kind), 197 | // (unsigned long long)trace[MEMCPY_H2D1].currentMemoryUsage); 198 | } -------------------------------------------------------------------------------- /spECK/source/RunConfig.cpp: -------------------------------------------------------------------------------- 1 | #include "RunConfig.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include "Config.h" 7 | 8 | RunConfig::RunConfig(int argc, char *argv[]) 9 | { 10 | std::string mat1, mat2; 11 | mat1 = "can_24"; 12 | mat2 = "can_24"; 13 | if(argc == 2){ 14 | mat1 = argv[1]; 15 | mat2 = argv[1]; 16 | } 17 | if(argc >= 3){ 18 | mat1 = argv[1]; 19 | mat2 = argv[2]; 20 | } 21 | std::string mat1_file; 22 | if(mat1.find("ER") != std::string::npos){ 23 | mat1_file = "../matrix/ER/" + mat1 +".mtx"; 24 | } 25 | else if(mat1.find("G500") != std::string::npos){ 26 | mat1_file = "../matrix/G500/" + mat1 +".mtx"; 27 | } 28 | else{ 29 | mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx"; 30 | } 31 | std::string mat2_file; 32 | if(mat2.find("ER") != std::string::npos){ 33 | mat2_file = "../matrix/ER/" + mat2 +".mtx"; 34 | } 35 | else if(mat2.find("G500") != std::string::npos){ 36 | mat2_file = "../matrix/G500/" + mat2 +".mtx"; 37 | } 38 | else{ 39 | mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx"; 40 | 
}
    filePath = mat1_file;
    filePath2 = mat2_file;
    mat_name = mat1;
    mat_name2 = mat2;
    //printf("in RunConfig.cpp %s %s\n", filePath.c_str(), filePath2.c_str());
    printf("%s %s ", mat1.c_str(), mat2.c_str());
    Config::init("config.ini");
}


RunConfig::~RunConfig()
{
}
--------------------------------------------------------------------------------
/spECK/source/cuSparseMultiply.cu:
--------------------------------------------------------------------------------
#include "cuSparseMultiply.h"
#include  // NOTE(review): header name is not visible here - confirm
#include "common.h"


namespace cuSPARSE {
    // Single-precision CSR -> CSC conversion through cusparseCsr2cscEx2.
    // The work buffer is allocated and released inside the call.
    template<>
    cusparseStatus_t CUSPARSEAPI CuSparseTest<float>::cusparseTranspose(cusparseHandle_t handle, int m, int n, int nnz,
        const float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
        float *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase)
    {
        void *buffer = nullptr;
        size_t buffer_size = 0;
        checkCuSparseError(cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
            cscSortedColPtr, cscSortedRowInd, CUDA_R_32F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, &buffer_size), "buffer size failed");
        HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));

        auto retVal = checkCuSparseError(cusparseCsr2cscEx2(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
            cscSortedColPtr, cscSortedRowInd, CUDA_R_32F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, buffer), "transpose failed");
        HANDLE_ERROR(cudaFree(buffer));
        return retVal;
    }

    // Double-precision CSR -> CSC conversion through cusparseCsr2cscEx2.
    template<>
    cusparseStatus_t CUSPARSEAPI CuSparseTest<double>::cusparseTranspose(cusparseHandle_t handle, int m, int n, int nnz,
        const double *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
        double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase)
    {
        void *buffer = nullptr;
        size_t buffer_size = 0;
        checkCuSparseError(cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
            cscSortedColPtr, cscSortedRowInd, CUDA_R_64F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, &buffer_size), "buffer size failed");
        HANDLE_ERROR(cudaDeviceSynchronize());
        HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));

        auto retVal = checkCuSparseError(cusparseCsr2cscEx2(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
            cscSortedColPtr, cscSortedRowInd, CUDA_R_64F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, buffer), "transpose failed");
        HANDLE_ERROR(cudaFree(buffer));
        return retVal;
    }

    // Computes matOut = A * B with the cuSPARSE generic SpGEMM API
    // (work estimation -> compute -> copy) and returns the elapsed time in
    // milliseconds measured with CUDA events. cusparse_nnz receives the number
    // of non-zeros of the product. The three result arrays are allocated here
    // and handed over to matOut; all temporary buffers, matrix descriptors and
    // events are released before returning.
    template <typename DataType>
    float CuSparseTest<DataType>::Multiply(const dCSR<DataType>& A, const dCSR<DataType>& B, dCSR<DataType>& matOut, uint32_t& cusparse_nnz)
    {
        float duration;
        DataType alpha = (DataType) 1.0f;
        DataType beta = (DataType) 0.0f;

        cudaEvent_t start, stop;
        HANDLE_ERROR(cudaEventCreate(&start));
        HANDLE_ERROR(cudaEventCreate(&stop));

        // ############################
        HANDLE_ERROR(cudaEventRecord(start));
        // ############################

        auto computeType = sizeof(DataType) == 4 ? CUDA_R_32F : CUDA_R_64F;
        cusparseSpMatDescr_t matA, matB, matC;
        checkCuSparseError( cusparseCreateCsr(&matA, A.rows, A.cols, A.nnz,
                                        A.row_offsets, A.col_ids, A.data,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                        CUSPARSE_INDEX_BASE_ZERO, computeType), "A failed");
        checkCuSparseError( cusparseCreateCsr(&matB, B.rows, B.cols, B.nnz,
                                        B.row_offsets, B.col_ids, B.data,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                        CUSPARSE_INDEX_BASE_ZERO, computeType), "B failed");
        checkCuSparseError( cusparseCreateCsr(&matC, A.rows, B.cols, 0,
                                        NULL, NULL, NULL,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                        CUSPARSE_INDEX_BASE_ZERO, computeType), "C failed");

        void* dBuffer1 = NULL, *dBuffer2 = NULL;
        size_t bufferSize1 = 0, bufferSize2 = 0;
        cusparseSpGEMMDescr_t spgemmDesc;
        checkCuSparseError( cusparseSpGEMM_createDescr(&spgemmDesc), "create description failed");
        auto opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
        auto opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
        // Result arrays of C; ownership passes to matOut below.
        int *dC_csrOffsets = nullptr, *dC_columns = nullptr;
        DataType *dC_values = nullptr;

        // ask bufferSize1 bytes for external memory
        checkCuSparseError(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                        &alpha, matA, matB, &beta, matC,
                                        computeType, CUSPARSE_SPGEMM_DEFAULT,
                                        spgemmDesc, &bufferSize1, 0), "workestimation0 failed");
        HANDLE_ERROR(cudaMalloc((void**) &dBuffer1, bufferSize1));
        // inspect the matrices A and B to understand the memory requirement for
        // the next step
        checkCuSparseError(cusparseSpGEMM_workEstimation(handle, opA, opB,
                                        &alpha, matA, matB, &beta, matC,
                                        computeType, CUSPARSE_SPGEMM_DEFAULT,
                                        spgemmDesc, &bufferSize1, dBuffer1), "workestimation1 failed");

        // ask bufferSize2 bytes for external memory
        checkCuSparseError(cusparseSpGEMM_compute(handle, opA, opB,
                                &alpha, matA, matB, &beta, matC,
                                computeType, CUSPARSE_SPGEMM_DEFAULT,
                                spgemmDesc, &bufferSize2, NULL), "compute0 failed");
        HANDLE_ERROR(cudaMalloc((void**) &dBuffer2, bufferSize2));

        // compute the intermediate product of A * B
        checkCuSparseError(cusparseSpGEMM_compute(handle, opA, opB,
                                            &alpha, matA, matB, &beta, matC,
                                            computeType, CUSPARSE_SPGEMM_DEFAULT,
                                            spgemmDesc, &bufferSize2, dBuffer2), "compute1 failed");
        // get matrix C non-zero entries C_num_nnz1
        int64_t C_num_rows1, C_num_cols1, C_num_nnz1;
        checkCuSparseError(cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_num_nnz1), "get size failed");
        // allocate matrix C
        HANDLE_ERROR(cudaMalloc((void**) &dC_csrOffsets, (C_num_rows1 + 1) * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**) &dC_columns, C_num_nnz1 * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**) &dC_values,  C_num_nnz1 * sizeof(DataType)));
        // update matC with the new pointers
        checkCuSparseError(cusparseCsrSetPointers(matC, dC_csrOffsets, dC_columns, dC_values), "get pointers failed");

        // copy the final products to the matrix C
        checkCuSparseError(cusparseSpGEMM_copy(
            handle,
            opA,
            opB,
            &alpha,
            matA,
            matB,
            &beta,
            matC,
            computeType,
            CUSPARSE_SPGEMM_DEFAULT,
            spgemmDesc),
            "copy failed");

        cusparseIndexType_t _rowType, _columnType;
        cusparseIndexBase_t _indexBase;
        cudaDataType _baseOff;
        // NOTE(review): writes 64-bit sizes straight into matOut.rows/cols/nnz;
        // this assumes those members are 64 bits wide - confirm.
        checkCuSparseError(cusparseCsrGet(matC,
            (int64_t*) &matOut.rows,
            (int64_t*) &matOut.cols,
            (int64_t*) &matOut.nnz,
            (void**) &matOut.row_offsets,
            (void**) &matOut.col_ids,
            (void**) &matOut.data,
            &_rowType,
            &_columnType,
            &_indexBase,
            &_baseOff
        ), "get failed");
        // destroy matrix/vector descriptors
        checkCuSparseError( cusparseSpGEMM_destroyDescr(spgemmDesc), "destroy failed" );
        HANDLE_ERROR(cudaFree(dBuffer1));
        HANDLE_ERROR(cudaFree(dBuffer2));

        // ############################
        HANDLE_ERROR(cudaDeviceSynchronize());
        HANDLE_ERROR(cudaEventRecord(stop));
        HANDLE_ERROR(cudaEventSynchronize(stop));
        // ############################

        HANDLE_ERROR(cudaEventElapsedTime(&duration, start, stop));
        cusparse_nnz = matOut.nnz;

        // Release the handles that used to leak on every call. Destroying the
        // descriptors does not free the CSR arrays, which now belong to matOut.
        // Done after the stop event so the measured time is unaffected.
        checkCuSparseError(cusparseDestroySpMat(matA), "destroy A failed");
        checkCuSparseError(cusparseDestroySpMat(matB), "destroy B failed");
        checkCuSparseError(cusparseDestroySpMat(matC), "destroy C failed");
        HANDLE_ERROR(cudaEventDestroy(start));
        HANDLE_ERROR(cudaEventDestroy(stop));

        return duration;
    }

    template float CuSparseTest<float>::Multiply(const dCSR<float>& A, const dCSR<float>& B, dCSR<float>& matOut, uint32_t& cusparse_nnz);
    template float CuSparseTest<double>::Multiply(const dCSR<double>& A, const dCSR<double>& B, dCSR<double>& matOut, uint32_t& cusparse_nnz);

    // Writes the transpose of A into AT (re-allocated here) via cusparseTranspose.
    template <typename DataType>
    void CuSparseTest<DataType>::Transpose(const dCSR<DataType>& A, dCSR<DataType>& AT)
    {
        AT.alloc(A.cols, A.rows, A.nnz);

        checkCuSparseError(cusparseTranspose(handle, A.rows, A.cols, A.nnz,
            reinterpret_cast<const DataType*>(A.data), reinterpret_cast<const int*>(A.row_offsets), reinterpret_cast<const int*>(A.col_ids),
            reinterpret_cast<DataType*>(AT.data), reinterpret_cast<int*>(AT.col_ids), reinterpret_cast<int*>(AT.row_offsets),
            CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO), "transpose failed");
    }

    template void CuSparseTest<float>::Transpose(const dCSR<float>& A, dCSR<float>& AT);
    template void CuSparseTest<double>::Transpose(const dCSR<double>& A, dCSR<double>& AT);
}
--------------------------------------------------------------------------------
/spECK/source/dCSR.cpp:
--------------------------------------------------------------------------------
#include "dCSR.h"
#include "CSR.h"

#include  // NOTE(review): header name is not visible here - confirm

namespace
{
    // Frees the three device arrays of mat and returns it to the empty state
    // (null pointers, zero rows / cols / nnz).
    template <typename T>
    void dealloc(dCSR<T>& mat)
    {
        if (mat.col_ids != nullptr)
            cudaFree(mat.col_ids);
        if (mat.data != nullptr)
            cudaFree(mat.data);
        if (mat.row_offsets != nullptr)
            cudaFree(mat.row_offsets);
        mat.col_ids = nullptr;
        mat.data = nullptr;
        mat.row_offsets = nullptr;
        mat.nnz = 0;
        mat.rows = 0;
        // cols used to keep its old value after a reset.
        mat.cols = 0;
    }
}

template <typename T>
void dCSR<T>::alloc(size_t r, size_t c, size_t n, bool
allocOffsets) 27 | { 28 | dealloc(*this); 29 | rows = r; 30 | cols = c; 31 | nnz = n; 32 | cudaMalloc(&data, sizeof(T)*n); 33 | cudaMalloc(&col_ids, sizeof(unsigned int)*n); 34 | if (allocOffsets) 35 | cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1)); 36 | } 37 | template 38 | dCSR::~dCSR() 39 | { 40 | dealloc(*this); 41 | } 42 | 43 | template 44 | void dCSR::reset() 45 | { 46 | dealloc(*this); 47 | } 48 | 49 | 50 | template 51 | void convert(dCSR& dst, const CSR& src, unsigned int padding) 52 | { 53 | dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding); 54 | dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; 55 | cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice); 56 | cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice); 57 | cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); 58 | 59 | if (padding) 60 | { 61 | cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T)); 62 | cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int)); 63 | cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int)); 64 | } 65 | } 66 | 67 | template 68 | void convert(CSR& dst, const dCSR& src, unsigned int padding) 69 | { 70 | dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); 71 | dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; 72 | cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); 73 | cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); 74 | cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); 75 | } 76 | 77 | template 78 | void convert(dCSR& dst, const dCSR& src, unsigned int padding) 79 | { 80 | dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); 81 | dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; 82 | 
cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice); 83 | cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice); 84 | cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice); 85 | } 86 | 87 | /* convert (host CSR to host CSR): allocates dst with padding extra rows and 8*padding extra nnz slots, sets dst's rows, nnz and cols back to src's values, and copies data, col_ids and row_offsets with memcpy; the padding slots are not explicitly zeroed here. */ template 88 | void convert(CSR& dst, const CSR& src, unsigned int padding) 89 | { 90 | dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); 91 | dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; 92 | memcpy(dst.data, src.data, dst.nnz * sizeof(T)); 93 | memcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int)); 94 | memcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int)); 95 | } 96 | 97 | /* Explicit instantiations of the member functions and convert overloads defined in this file. */ template void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets); 98 | template void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets); 99 | template void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets); 100 | 101 | template dCSR::~dCSR(); 102 | template dCSR::~dCSR(); 103 | template dCSR::~dCSR(); 104 | 105 | template void dCSR::reset(); 106 | template void dCSR::reset(); 107 | template void dCSR::reset(); 108 | 109 | template void convert(dCSR& dcsr, const CSR& csr, unsigned int); 110 | template void convert(dCSR& dcsr, const CSR& csr, unsigned int); 111 | //template void convert(dCSR& dcsr, const CSR& csr, unsigned int); 112 | 113 | template void convert(CSR& csr, const dCSR& dcsr, unsigned int padding); 114 | template void convert(CSR& csr, const dCSR& dcsr, unsigned int padding); 115 | //template void convert(CSR& csr, const dCSR& dcsr, unsigned int padding); 116 | 117 | template void convert(dCSR& dcsr, const dCSR& csr, unsigned int); 118 | template void convert(dCSR& dcsr, const dCSR& csr, unsigned int); 119 | //template void convert(dCSR& dcsr, const dCSR& csr, unsigned int); 120 | 121 | template void convert(CSR& csr, const CSR& dcsr, unsigned int padding); 122 | template void 
convert(CSR& csr, const CSR& dcsr, unsigned int padding); 123 | //template void convert(CSR& csr, const CSR& dcsr, unsigned int padding); 124 | -------------------------------------------------------------------------------- /spECK/source/reg_runspECK.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | //suppress crash notification windows (close or debug program window) 4 | #define WIN32_LEAN_AND_MEAN 5 | #define NOMINMAX 6 | #include 7 | #else 8 | #include 9 | #endif 10 | #include 11 | #include "Executor.h" 12 | 13 | /* Entry point: on Windows adds SEM_NOGPFAULTERRORBOX to the process error mode, then builds an Executor from the command line and returns the result of run(). */ int main(int argc, char *argv[]) 14 | { 15 | #ifdef _WIN32 16 | //suppress crash notification windows (close or debug program window) 17 | SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); 18 | #endif 19 | Executor exe(argc, argv); 20 | return exe.run(); 21 | } 22 | -------------------------------------------------------------------------------- /spECK/source/runspECK.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | //suppress crash notification windows (close or debug program window) 4 | #define WIN32_LEAN_AND_MEAN 5 | #define NOMINMAX 6 | #include 7 | #else 8 | #include 9 | #endif 10 | #include 11 | #include "Executor.h" 12 | 13 | /* Entry point: on Windows adds SEM_NOGPFAULTERRORBOX to the process error mode, then builds an Executor from the command line and returns the result of run_detail(). */ int main(int argc, char *argv[]) 14 | { 15 | #ifdef _WIN32 16 | //suppress crash notification windows (close or debug program window) 17 | SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); 18 | #endif 19 | Executor exe(argc, argv); 20 | return exe.run_detail(); 21 | } 22 | --------------------------------------------------------------------------------