├── .gitignore
├── LICENSE
├── OpSparse
    ├── Makefile
    ├── inc
    │   ├── CSR.h
    │   ├── Meta.h
    │   ├── Timings.h
    │   ├── binning.cuh
    │   ├── cuda_common.h
    │   ├── cusparse_spgemm.h
    │   ├── define.h
    │   ├── kernel_wrapper.cuh
    │   ├── numeric.cuh
    │   ├── setup.cuh
    │   └── symbolic.cuh
    ├── readme.md
    └── src
    │   ├── CSR.cu
    │   ├── Meta.cu
    │   ├── Timings.cu
    │   ├── opsparse.cu
    │   ├── reg_cusparse.cu
    │   └── reg_opsparse.cu
├── download_matrix.sh
├── nsparse
    ├── Makefile
    ├── inc
    │   ├── BIN.hpp
    │   ├── CSR.hpp
    │   ├── HashSpGEMM.hpp
    │   ├── HashSpGEMM_volta.hpp
    │   ├── Plan.hpp
    │   ├── SpGEMM.hpp
    │   ├── Timing.hpp
    │   ├── cuda_common.h
    │   ├── nsparse.hpp
    │   └── nsparse_asm.hpp
    ├── nsparse.cu
    ├── readme.md
    └── reg_nsparse.cu
├── readme.md
└── spECK
    ├── Makefile
    ├── config.ini
    ├── include
        ├── COO.h
        ├── CSR.h
        ├── CUDATools
        │   ├── error.h
        │   ├── event.h
        │   ├── memory.h
        │   ├── memory_space.h
        │   ├── stream.h
        │   └── unique_handle.h
        ├── Compare.h
        ├── Config.h
        ├── DataLoader.h
        ├── Executor.h
        ├── GPU
        │   ├── BlockRange.cuh
        │   ├── Hash.cuh
        │   ├── HelperFunctions.cuh
        │   ├── consistent_gpu_memory.h
        │   ├── limits.cuh
        │   ├── profiler.cuh
        │   ├── scan_largearray_kernel.cuh
        │   ├── spECKKernels.h
        │   ├── spECK_HashLoadBalancer.cuh
        │   └── spECK_HashSpGEMM.cuh
        ├── HashMap.cuh
        ├── INIReader.h
        ├── Multiply.h
        ├── RunConfig.h
        ├── Timings.h
        ├── Transpose.h
        ├── Vector.h
        ├── WorkDistribution.h
        ├── common.cuh
        ├── common.h
        ├── cuSparseMultiply.h
        ├── cuda_common.h
        ├── dCSR.h
        ├── meta_utils.h
        ├── multi_arch_build.h
        └── spECKConfig.h
    ├── readme.md
    └── source
        ├── COO.cpp
        ├── CSR.cpp
        ├── Config.cpp
        ├── DataLoader.cpp
        ├── Executor.cpp
        ├── GPU
            ├── Compare.cu
            ├── Multiply.cu
            ├── Transpose.cu
            ├── common.cu
            ├── memory.cpp
            └── profiler.cu
        ├── RunConfig.cpp
        ├── cuSparseMultiply.cu
        ├── dCSR.cpp
        ├── reg_runspECK.cpp
        └── runspECK.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.d
3 | *.o
4 | *.obj
5 | *.out
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Zhaoyang Du
 4 | Copyright (c) 2017 Tokyo Institute of Technology
 5 | Copyright (c) 2019 Mathias Parger
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/OpSparse/Makefile:
--------------------------------------------------------------------------------
 1 | CXX = g++
 2 | NVCC = nvcc
 3 | # Compiler flags: keep exactly one CUDAFLAGS line active (optimised build by default, debug variants commented out).
 4 | #CUDAFLAGS = $(GENCODE)  -g -lineinfo 
 5 | CUDAFLAGS =  $(GENCODE)  -O3
 6 | 
 7 | #CUDAFLAGS = $(GENCODE)  -g -G
 8 | 
 9 | # GENCODE is expanded lazily by CUDAFLAGS, so defining it after its first use is fine.
10 | GENCODE = -arch=compute_70 -code=sm_70
11 | CUDAFLAGS += -Xcompiler -fopenmp 
12 | 
13 | # for Device Code
14 | CUDA_PATH = /usr/local/cuda
15 | LDFLAGS += -lcusparse $(CUDAFLAGS)
16 | INCLUDE += -I${CUDA_PATH}/include
17 | INCLUDE += -I${CUDA_PATH}/samples/common/inc
18 | INCLUDE += -I./inc
19 | 
20 | BIN = ./bin
21 | SRC = ./src
22 | OBJ = ./obj
23 | INC = ./inc
24 | 
25 | OBJ_LIB = $(OBJ)/Meta.o $(OBJ)/CSR.o $(OBJ)/Timings.o
26 | 
27 | COMMON_DEP = $(INC)/cuda_common.h $(INC)/define.h 
28 | # Sources with a same-named header under inc/ are rebuilt when that header changes.
29 | $(OBJ)/%.o : $(SRC)/%.cu $(INC)/%.h $(COMMON_DEP)
30 | 	mkdir -p $(dir $@)
31 | 	@echo $^
32 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
33 | # Fallback rule for sources without a matching header.
34 | $(OBJ)/%.o : $(SRC)/%.cu $(COMMON_DEP) 
35 | 	mkdir -p $(dir $@)
36 | 	@echo $^
37 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
38 | # The executables below are linked into the current directory (not $(BIN)).
39 | opsparse : $(OBJ_LIB) $(OBJ)/opsparse.o
40 | 	$(NVCC)  -o $@ $^ $(LDFLAGS) $(INCLUDE)
41 | 
42 | reg_opsparse : $(OBJ_LIB) $(OBJ)/reg_opsparse.o
43 | 	$(NVCC)  -o $@ $^ $(LDFLAGS) $(INCLUDE)
44 | 
45 | reg_cusparse : $(OBJ_LIB) $(OBJ)/reg_cusparse.o
46 | 	$(NVCC)  -o $@ $^ $(LDFLAGS) $(INCLUDE)
47 | .PHONY : clean
48 | clean :
49 | 	rm -rf $(BIN)
50 | 	rm -rf $(OBJ)
51 | 	rm -f opsparse reg_opsparse reg_cusparse


--------------------------------------------------------------------------------
/OpSparse/inc/CSR.h:
--------------------------------------------------------------------------------
 1 | #ifndef Z_CSR_H_
 2 | #define Z_CSR_H_
 3 | #include <string>
 4 | #include <vector>
 5 | #include "cuda_common.h"
 6 | 
 7 | class CSR{ // Sparse matrix in CSR form: dimensions, non-zero count and two sets of the three CSR arrays.
 8 |     public:
 9 |     mint M; // number of rows (d_rpt holds M + 1 entries)
10 |     mint N; // number of columns
11 |     mint nnz; // number of stored non-zeros
12 |     mint *rpt; // row pointers; presumably the host-side counterpart of d_rpt - confirm in CSR.cu
13 |     mint *col; // column indices; presumably host-side
14 |     mdouble *val; // values; presumably host-side
15 | 
16 |     mint *d_rpt; // device row pointers (handed to cuSPARSE as the CSR row offsets)
17 |     mint *d_col; // device column indices
18 |     mdouble *d_val; // device values
19 |     CSR():M(0), N(0), nnz(0),  // empty matrix: zero sizes, every array pointer null
20 |             rpt(nullptr), col(nullptr), val(nullptr),
21 |             d_rpt(nullptr), d_col(nullptr), d_val(nullptr)
22 |         {}
23 |     CSR(const std::string &mtx_file); // NOTE(review): presumably loads the matrix from a file - defined in CSR.cu
24 |     CSR(const CSR& A);
25 |     CSR(const CSR& A, mint M_, mint N_, mint M_start, mint N_start); // NOTE(review): looks like a sub-matrix copy - confirm
26 |     ~CSR();
27 | 
28 |     void hrelease(); // NOTE(review): names suggest host / device / both release - confirm in CSR.cu
29 |     void drelease();
30 |     void release();
31 |     void D2H(); // NOTE(review): presumably copies the d_* arrays to the host arrays
32 |     void H2D(); // NOTE(review): presumably copies the host arrays to the d_* arrays
33 |     bool operator==(const CSR& A);
34 |     CSR& operator=(const CSR& A);
35 |     void construct(const std::string &mtx_file);
36 |           
37 | };
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/OpSparse/inc/Meta.h:
--------------------------------------------------------------------------------
 1 | #ifndef __Z_META__
 2 | #define __Z_META__
 3 | 
 4 | #include "cuda_common.h"
 5 | #include "define.h"
 6 | 
 7 | class CSR;
 8 | class Meta{ // Device/host scratch buffers, per-bin counters and CUDA streams; the layout is set up in Meta::allocate.
 9 |     public:
10 |     // first, allocate C.rpt. 
11 |     // d_row_flop, d_estimated_row_nnz, d_row_nnz are all reused with C.rpt
12 |     // Every member has a default initializer so release()/~Meta() never free indeterminate pointers.
13 |     // combined memory
14 |     mint *d_combined_mem = nullptr; // second, allocate for all others
15 |     mint *combined_mem = nullptr; // second, allocate for all others
16 | 
17 |     // meta data
18 |     mint M = 0; // number of rows
19 |     mint N = 0; // number of cols
20 |     mint *d_bins = nullptr; // size M
21 |     mint *d_bin_size = nullptr; // size NUM_BIN
22 |     mint *d_bin_offset = nullptr; // size NUM_BIN
23 |     mint *d_max_row_nnz = nullptr; // size 1
24 |     mint *d_total_nnz = nullptr; // size 1
25 |     mint *d_cub_storage = nullptr; // size variable
26 |     mint *bin_size = nullptr; // size NUM_BIN
27 |     mint *bin_offset = nullptr; // size NUM_BIN
28 |     mint *max_row_nnz = nullptr; // size 1
29 |     mint *total_nnz = nullptr; // size 1
30 |     size_t cub_storage_size = 0;
31 |     cudaStream_t *stream = nullptr;
32 | 
33 | 
34 |     // symbolic global and numeric global, is allocated at runtime
35 |     mint* d_global_mem_pool = nullptr; // size unknown, allocated at runtime
36 |     size_t global_mem_pool_size = 0;
37 |     bool global_mem_pool_malloced = false;
38 | 
39 |     // ********************************************************
40 |     // public method
41 |     Meta(){}
42 |     Meta(const Meta&) = delete;
43 |     Meta &operator=(const Meta&) = delete;
44 |     Meta(CSR &C); // init and first malloc
45 |     void allocate_rpt(CSR& C);
46 |     void allocate(CSR &C); // malloc combined mem and pin the variables
47 |     void release();
48 | 
49 |     void memset_bin_size(mint stream_idx); // set d_bin_size only to 0
50 |     void memset_all(mint stream_idx); // set d_bin_size and other to 0
51 |     void D2H_bin_size(mint stream_idx);
52 |     void D2H_all(mint stream_idx);
53 |     void H2D_bin_offset(mint stream_idx);
54 |     ~Meta();
55 | };
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/OpSparse/inc/Timings.h:
--------------------------------------------------------------------------------
 1 | #ifndef __Z_TIMING_H__
 2 | #define __Z_TIMING_H__
 3 | 
 4 | class Timings { // Per-phase timing record; NOTE(review): time unit is not visible here - confirm in Timings.cu
 5 |     public:
 6 |     bool measure_separate; // set to true by the constructor
 7 |     bool measure_total; // set to true by the constructor
 8 |     double setup; // per-phase accumulators follow; the constructor starts them at 0
 9 |     double symbolic_binning;
10 |     double symbolic;
11 |     double reduce;
12 |     double numeric_binning;
13 |     double prefix;
14 |     double allocate;
15 |     double numeric;
16 |     double cleanup;
17 |     double total;
18 |     Timings();
19 | 
20 |     void operator+=(const Timings& b); // NOTE(review): presumably field-wise accumulation - defined in Timings.cu
21 | 
22 |     void operator/=(const double x); // NOTE(review): presumably field-wise division (averaging) - confirm
23 |     void print(const double total_flop);
24 |     void reg_print(const double total_flop);
25 |     void perf_print(const double total_flop);
26 |     void binning_print(const double total_flop);
27 | };
28 | 
29 | #endif
30 | 
31 | 


--------------------------------------------------------------------------------
/OpSparse/inc/binning.cuh:
--------------------------------------------------------------------------------
  1 | #ifndef __Z_ESTIMATE_SYMBOLIC_BINNING_CUH__
  2 | #define __Z_ESTIMATE_SYMBOLIC_BINNING_CUH__
  3 | 
  4 | #include "cuda_common.h"
  5 | #include "define.h"
  6 | 
  7 | 
  8 | // Counts rows 0..M-1 of d_row_flop per flop-range bin and adds the counts to d_bin_size[0..NUM_BIN).
  9 | __global__ void __launch_bounds__(1024, 2) k_symbolic_binning(
 10 |     mint *d_row_flop, int M, mint *d_bin_size){
 11 |     // block-local histogram, flushed to global memory once at the end (needs blockDim.x >= NUM_BIN)
 12 |     __shared__ mint shared_bin_size[NUM_BIN];
 13 |     if(threadIdx.x < NUM_BIN){
 14 |         shared_bin_size[threadIdx.x] = 0;
 15 |     }
 16 |     __syncthreads();
 17 | 
 18 |     const mint row = threadIdx.x + blockIdx.x * blockDim.x;
 19 |     // inclusive upper bound of each bin; a row lands in the first bin whose bound it does not exceed
 20 |     //mint range[NUM_BIN] = {32, 512, 1024, 2048,     4096, 8192, 12287, INT_MAX}; // 1x
 21 |     mint range[NUM_BIN] = {26, 426, 853, 1706,     3413, 6826, 10240, INT_MAX}; // 1.2x
 22 |     //mint range[NUM_BIN] = {21, 341, 682, 1365,     2730, 5461, 8191, INT_MAX}; // 1.5x
 23 |     if(row < M){
 24 |         const mint flop = d_row_flop[row];
 25 |         //#pragma unroll
 26 |         for(mint bin = 0; bin < NUM_BIN; bin++){
 27 |             if(flop <= range[bin]){
 28 |                 atomicAdd(shared_bin_size + bin, 1);
 29 |                 break;
 30 |             }
 31 |         }
 32 |     }
 33 |     // every thread of the block, in range or not, arrives at this barrier
 34 |     __syncthreads();
 35 |     if(threadIdx.x < NUM_BIN){
 36 |         atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]);
 37 |     }
 38 | }
 39 | 
 40 | // Scatter pass: writes every row index i < M into d_bins, grouped by the flop-range bin of d_row_flop[i].
 41 | __global__ void __launch_bounds__ (1024, 2) k_symbolic_binning2(
 42 |     mint * __restrict__ d_row_flop, // per-row value that is compared against range[]
 43 |     int M, // number of rows
 44 |     mint * __restrict__ d_bins, // output: row indices, one contiguous segment per bin
 45 |     mint * __restrict__ d_bin_size, // per-bin running cursor, advanced atomically by each block
 46 |     mint * __restrict__ d_bin_offset){ // start of each bin's segment inside d_bins
 47 |     // NOTE(review): d_bin_size is used as a cursor, so it presumably has to be zero at launch - confirm with the caller.
 48 |     // Requires blockDim.x >= NUM_BIN: the first NUM_BIN threads own the shared counters.
 49 |     __shared__ mint shared_bin_size[NUM_BIN];
 50 |     __shared__ mint shared_bin_offset[NUM_BIN];
 51 |     if(threadIdx.x < NUM_BIN){
 52 |         shared_bin_size[threadIdx.x] = 0;
 53 |     }
 54 |     __syncthreads();
 55 | 
 56 |     mint i = threadIdx.x + blockIdx.x * blockDim.x;
 57 |     mint row_nnz, j; // row_nnz is only read under i < M, where it has been assigned
 58 |     //mint range[NUM_BIN] = {32, 512, 1024, 2048,     4096, 8192, 12287, INT_MAX}; // 1x
 59 |     mint range[NUM_BIN] = {26, 426, 853, 1706,     3413, 6826, 10240, INT_MAX}; // 1.2x
 60 |     //mint range[NUM_BIN] = {21, 341, 682, 1365,     2730, 5461, 8191, INT_MAX}; // 1.5x
 61 |     if(i < M){
 62 |         row_nnz = d_row_flop[i];
 63 |         //#pragma unroll
 64 |         for(j = 0; j < NUM_BIN; j++){
 65 |             if(row_nnz <= range[j]){ // first bin whose inclusive bound fits
 66 |                 atomicAdd(shared_bin_size + j, 1);
 67 |                 goto before_end;
 68 |             }
 69 |         }
 70 |     }
 71 |     before_end:
 72 |     // pass 1 done: shared_bin_size holds this block's row count per bin
 73 |     __syncthreads();
 74 |     if(threadIdx.x < NUM_BIN){
 75 |         shared_bin_offset[threadIdx.x] = atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]); // old value = this block's first slot in the bin
 76 |         shared_bin_offset[threadIdx.x] += d_bin_offset[threadIdx.x]; // make it an absolute index into d_bins
 77 |         shared_bin_size[threadIdx.x] = 0; // reused below as the in-block slot counter
 78 |     }
 79 |     __syncthreads();
 80 | 
 81 |     mint index;
 82 |     if(i < M){
 83 |         //#pragma unroll
 84 |         for(j = 0; j < NUM_BIN; j++){
 85 |             if(row_nnz <= range[j]){
 86 |                 index = atomicAdd(shared_bin_size + j, 1); // claim the next free slot of this block's range
 87 |                 d_bins[shared_bin_offset[j] + index] = i;
 88 |                 return; // no barrier follows, so returning here is safe
 89 |             }
 90 |         }
 91 |     }
 92 | }
 93 | 
 94 | // Fills d_bins[0..M) with the identity mapping d_bins[i] = i, one thread per element.
 95 | __global__ void k_binning_small(
 96 |     mint *d_bins, mint M){
 97 | 
 98 |     const mint row = threadIdx.x + blockIdx.x * blockDim.x;
 99 |     // threads past the end of the array have nothing to write
100 |     if(row < M){
101 |         d_bins[row] = row;
102 |     }
103 | }
104 | 
105 | 
106 | // Histogram of d_row_nnz[0..M) over the nnz-range bins, plus the sum and the maximum of all row sizes.
107 | __global__ void __launch_bounds__ (1024, 2) k_numeric_binning(
108 |     mint * __restrict__ d_row_nnz, // per-row non-zero counts
109 |     int M, // number of rows
110 |     mint * __restrict__ d_bin_size, // per-bin row counts, accumulated with atomicAdd
111 |     mint * __restrict__ d_total_nnz, // sum of all row counts, accumulated with atomicAdd
112 |     mint * __restrict__ d_max_row_nnz){ // largest row count, raised with atomicMax
113 |     // Precondition: blockDim.x > 64 - threads 32 and 64 initialise / publish the block totals below.
114 |     __shared__ mint shared_bin_size[NUM_BIN];
115 |     __shared__ mint shared_local_nnz[1];
116 |     __shared__ mint shared_max_row_nnz[1];
117 |     if(threadIdx.x < NUM_BIN){
118 |         shared_bin_size[threadIdx.x] = 0;
119 |     }
120 |     if(threadIdx.x == 32){ // a thread outside the first NUM_BIN ones zeroes the two scalars
121 |         shared_local_nnz[0] = 0;
122 |         shared_max_row_nnz[0] = 0;
123 |     }
124 |     __syncthreads();
125 |     //mint range[NUM_BIN] = {31, 255, 511, 1022,    2047, 4095, 8191, INT_MAX}; // 1x
126 |     //mint range[NUM_BIN] = {21, 192, 384, 768,    1536, 3072, 5460, INT_MAX}; // 1.5x
127 |     mint range[NUM_BIN] = {16, 128, 256, 512,    1024, 2048, 4095, INT_MAX}; // 2x
128 |     //mint range[NUM_BIN] = {10, 85, 170, 341,    682, 1365, 2730, INT_MAX}; // 3x
129 |     mint i = threadIdx.x + blockIdx.x * blockDim.x;
130 |     mint row_nnz, j;
131 |     if(i < M){
132 |         row_nnz = d_row_nnz[i];
133 |         atomicAdd(shared_local_nnz, row_nnz); // block-local sum
134 |         atomicMax(shared_max_row_nnz, row_nnz); // block-local maximum
135 |         //#pragma unroll
136 |         for(j = 0; j < NUM_BIN; j++){
137 |             if(row_nnz <= range[j]){ // first bin whose inclusive bound fits
138 |                 atomicAdd(shared_bin_size + j, 1);
139 |                 goto before_end;
140 |             }
141 |         }
142 |     }
143 |     before_end:
144 |     // all threads, in range or not, meet at the barrier before the block totals are published
145 | 
146 |     __syncthreads();
147 |     if(threadIdx.x < NUM_BIN){
148 |         atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]);
149 |     }
150 |     if(threadIdx.x == 32){
151 |         atomicAdd(d_total_nnz, shared_local_nnz[0]);
152 |     }
153 |     if(threadIdx.x == 64){
154 |         atomicMax(d_max_row_nnz, shared_max_row_nnz[0]);
155 |     }
156 | }
157 | // Scatter pass: writes every row index i < M into d_bins, grouped by the nnz-range bin of d_row_nnz[i].
158 | __global__ void __launch_bounds__ (1024, 2) k_numeric_binning2(
159 |     mint * __restrict__ d_row_nnz, // per-row non-zero counts
160 |     int M, // number of rows
161 |     mint * __restrict__ d_bins, // output: row indices, one contiguous segment per bin
162 |     mint * __restrict__ d_bin_size, // per-bin running cursor, advanced atomically by each block
163 |     mint * __restrict__ d_bin_offset){ // start of each bin's segment inside d_bins
164 |     // __launch_bounds__ now precedes the kernel name, matching the sibling kernels and the documented placement.
165 |     __shared__ mint shared_bin_size[NUM_BIN];
166 |     __shared__ mint shared_bin_offset[NUM_BIN];
167 |     if(threadIdx.x < NUM_BIN){
168 |         shared_bin_size[threadIdx.x] = 0;
169 |     }
170 |     __syncthreads();
171 |     //mint range[NUM_BIN] = {31, 255, 511, 1022,    2047, 4095, 8191, INT_MAX}; // 1x
172 |     //mint range[NUM_BIN] = {21, 192, 384, 768,    1536, 3072, 5460, INT_MAX}; // 1.5x
173 |     mint range[NUM_BIN] = {16, 128, 256, 512,    1024, 2048, 4095, INT_MAX}; // 2x
174 |     //mint range[NUM_BIN] = {10, 85, 170, 341,    682, 1365, 2730, INT_MAX}; // 3x
175 |     mint i = threadIdx.x + blockIdx.x * blockDim.x;
176 |     mint row_nnz, j; // row_nnz is only read under i < M, where it has been assigned
177 |     if(i < M){
178 |         row_nnz = d_row_nnz[i];
179 |         //#pragma unroll
180 |         for(j = 0; j < NUM_BIN; j++){
181 |             if(row_nnz <= range[j]){ // first bin whose inclusive bound fits
182 |                 atomicAdd(shared_bin_size + j, 1);
183 |                 goto before_end;
184 |             }
185 |         }
186 |     }
187 |     before_end:
188 |     // pass 1 done: shared_bin_size holds this block's row count per bin
189 | 
190 |     __syncthreads();
191 |     if(threadIdx.x < NUM_BIN){
192 |         shared_bin_offset[threadIdx.x] = atomicAdd(d_bin_size + threadIdx.x, shared_bin_size[threadIdx.x]); // old value = this block's first slot in the bin
193 |         shared_bin_offset[threadIdx.x] += d_bin_offset[threadIdx.x]; // make it an absolute index into d_bins
194 |         shared_bin_size[threadIdx.x] = 0; // reused below as the in-block slot counter
195 |     }
196 |     __syncthreads();
197 |     mint index;
198 |     if(i < M){
199 |         //#pragma unroll
200 |         for(j = 0; j < NUM_BIN; j++){
201 |             if(row_nnz <= range[j]){
202 |                 index = atomicAdd(shared_bin_size + j, 1); // claim the next free slot of this block's range
203 |                 d_bins[shared_bin_offset[j] + index] = i;
204 |                 return; // no barrier follows, so returning here is safe
205 |             }
206 |         }
207 |     }
208 | }
209 | 
210 | #endif
211 | 


--------------------------------------------------------------------------------
/OpSparse/inc/cuda_common.h:
--------------------------------------------------------------------------------
 1 | #ifndef _Z_COMMON_
 2 | #define _Z_COMMON_
 3 | 
 4 | #include <cuda_runtime.h>
 5 | #include <stdio.h>
 6 | #include <exception>
 7 | #include <cusparse.h>
 8 | #include <iostream>
 9 | #include <omp.h>
10 | #include <stdlib.h>
11 | #include <algorithm>
12 | // Branch-prediction hints; !!(x) folds any truthy value (including pointers) to 0/1 before the comparison.
13 | #define likely(x) __builtin_expect(!!(x),1)
14 | #define unlikely(x) __builtin_expect(!!(x),0)
15 | 
16 | //typedef unsigned int mint;
17 | typedef int mint;
18 | typedef double mdouble;
19 | 
20 | inline static void checkCUDA(cudaError_t err,
21 | 							   const char *file,
22 | 							   int line)
23 | {
24 | 	// Success returns immediately; any other status is printed with its call site and raised as std::exception.
25 | 	if (likely(err == cudaSuccess))
26 | 		return;
27 | 	const char *reason = cudaGetErrorString(err);
28 | 	printf("%s in %s at line %d\n", reason, file, line);
29 | 	throw std::exception();
30 | }
31 | // Both macros forward a cudaError_t to checkCUDA together with the file and line of the call site.
32 | #define CHECK_CUDA(err) (checkCUDA(err, __FILE__, __LINE__))
33 | #define CHECK_ERROR(err) (checkCUDA(err, __FILE__, __LINE__))
34 | // Throws std::exception when status is not CUSPARSE_STATUS_SUCCESS, after printing errorMsg and the status text.
35 | inline void CHECK_CUSPARSE(cusparseStatus_t status, const std::string &errorMsg="")
36 | {
37 | 	if (status != CUSPARSE_STATUS_SUCCESS) {
38 | 		std::cout << "CuSparse error: " << errorMsg << " (" << cusparseGetErrorString(status) << ")" << std::endl; // status text added: many callers pass no message
39 | 		throw std::exception();
40 | 	}
41 | }
42 | // HP_TIMING_NOW(Var): runs lfence then rdtsc and stores the 64-bit counter (EDX:EAX) in Var; x86 inline asm inside a GNU statement expression.
43 | #define HP_TIMING_NOW(Var) \
44 |   ({ unsigned int _hi, _lo; \
45 |      asm volatile ("lfence\n\trdtsc" : "=a" (_lo), "=d" (_hi)); \
46 |      (Var) = ((unsigned long long int) _hi << 32) | _lo; })
47 | 
48 | /* precision is 1 clock cycle.
49 |  * execute time is roughly 50 or 140 cycles depends on cpu family */
50 | // Runs the x86 CPUID instruction for leaf `eax`, sub-leaf `ecx`.
51 | // info[0..3] receive the resulting EAX, EBX, ECX and EDX values, in that order.
52 | inline void cpuid(int *info, int eax, int ecx = 0){
53 |     int ax, bx, cx, dx;
54 |     // ECX is an input too: sub-leaf-indexed leaves read it, so it is now loaded instead of left undefined.
55 |     __asm__ __volatile__ ("cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (eax), "c" (ecx));
56 |     info[0] = ax; info[1] = bx;
57 |     info[2] = cx; info[3] = dx;
58 | }
59 | 
60 | inline long get_tsc_freq(){ // CPUID leaf 0x16 EAX times 1000000, kept in a function-local static after the first query
61 |     static long cached_hz = 0;
62 |     if(likely(cached_hz != 0)){
63 |         return cached_hz;
64 |     }
65 |     int regs[4];
66 |     cpuid(regs, 0x16); // get cpu freq
67 |     cached_hz = long(regs[0]) * 1000000;
68 |     return cached_hz;
69 | }
70 | 
71 | inline double fast_clock_time(){ // timestamp-counter reading divided by get_tsc_freq()
72 |     long ticks;
73 |     HP_TIMING_NOW(ticks);
74 |     return static_cast<double>(ticks) / get_tsc_freq();
75 | }
76 | // Device-to-host cudaMemcpy wrapper; `size` is a byte count forwarded unchanged, failures go through CHECK_ERROR.
77 | template <typename T>
78 | inline void D2H(T *dst, T* src, size_t size){
79 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
80 | }
81 | // Host-to-device cudaMemcpy wrapper; `size` is a byte count forwarded unchanged, failures go through CHECK_ERROR.
82 | template <typename T>
83 | inline void H2D(T *dst, T* src, size_t size){
84 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
85 | }
86 | // Device-to-device cudaMemcpy wrapper; `size` is a byte count forwarded unchanged, failures go through CHECK_ERROR.
87 | template <typename T>
88 | inline void D2D(T *dst, T* src, size_t size){
89 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
90 | }
91 | 
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/OpSparse/inc/cusparse_spgemm.h:
--------------------------------------------------------------------------------
  1 | #include <cusparse.h>
  2 | #include <cuda_runtime.h>
  3 | #include "cuda_common.h"
  4 | #include "CSR.h"
  5 | // C = A * B (alpha = 1, beta = 0, no transposes) via cuSPARSE SpGEMM; the three C arrays are cudaMalloc'ed here and never freed by this function.
  6 | void cusparse_spgemm_inner(int *d_row_ptr_A, int *d_col_idx_A, double *d_csr_values_A,
  7 |                        int *d_row_ptr_B, int *d_col_idx_B, double *d_csr_values_B,
  8 |                        int **d_row_ptr_C, int **d_col_idx_C, double **d_csr_values_C,
  9 |                        int M, int K, int N, int nnz_A, int nnz_B, int* nnz_C){
 10 |     CHECK_CUDA(cudaMalloc((void**) d_row_ptr_C, (M+1) * sizeof(int))); // row offsets of C: M + 1 entries
 11 |     // A is M x K, B is K x N; C starts as an M x N descriptor with no arrays attached.
 12 |     cusparseHandle_t handle;
 13 |     CHECK_CUSPARSE(cusparseCreate(&handle), "create cusparse handle");
 14 |     cusparseSpMatDescr_t matA, matB, matC;
 15 |     void *dBuffer1 = NULL, *dBuffer2 = NULL;
 16 |     size_t bufferSize1 = 0, bufferSize2 = 0;
 17 |     CHECK_CUSPARSE( cusparseCreateCsr(&matA, M, K, nnz_A,
 18 |                                       d_row_ptr_A, d_col_idx_A, d_csr_values_A,
 19 |                                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 20 |                                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matA" );
 21 |     CHECK_CUSPARSE( cusparseCreateCsr(&matB, K, N, nnz_B,
 22 |                                       d_row_ptr_B, d_col_idx_B, d_csr_values_B,
 23 |                                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 24 |                                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matB" );
 25 |     CHECK_CUSPARSE( cusparseCreateCsr(&matC, M, N, 0,
 26 |                                       NULL, NULL, NULL,
 27 |                                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 28 |                                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F), "create matC" );
 29 |     cusparseSpGEMMDescr_t spgemmDescr;
 30 |     CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDescr), "create spgemm descr");
 31 |     double               alpha       = 1.0f;
 32 |     double               beta        = 0.0f;
 33 |     cusparseOperation_t opA         = CUSPARSE_OPERATION_NON_TRANSPOSE;
 34 |     cusparseOperation_t opB         = CUSPARSE_OPERATION_NON_TRANSPOSE;
 35 |     cudaDataType        computeType = CUDA_R_64F;
 36 |     // Each phase is called twice: first with a NULL buffer to obtain the size, then with the allocated buffer.
 37 |     CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
 38 |                                                  &alpha, matA, matB, &beta, matC,
 39 |                                                  computeType, CUSPARSE_SPGEMM_DEFAULT,
 40 |                                                  spgemmDescr, &bufferSize1, NULL), 
 41 |                                                  "first work estimation");
 42 |     CHECK_CUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
 43 |     CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, opA, opB,
 44 |                                                  &alpha, matA, matB, &beta, matC,
 45 |                                                  computeType, CUSPARSE_SPGEMM_DEFAULT,
 46 |                                                  spgemmDescr, &bufferSize1, dBuffer1), 
 47 |                                                  "second work estimation");
 48 |     CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
 49 |                                           &alpha, matA, matB, &beta, matC,
 50 |                                           computeType, CUSPARSE_SPGEMM_DEFAULT,
 51 |                                           spgemmDescr, &bufferSize2, NULL), 
 52 |                                           "first compute");
 53 | 
 54 |     CHECK_CUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
 55 |     CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
 56 |                                           &alpha, matA, matB, &beta, matC,
 57 |                                           computeType, CUSPARSE_SPGEMM_DEFAULT,
 58 |                                           spgemmDescr, &bufferSize2, dBuffer2), 
 59 |                                           "second compute");
 60 |     // The descriptor now knows nnz(C): read it back, allocate the column/value arrays and attach all three arrays.
 61 |     int64_t M_C, N_C, nnz_C_64I;
 62 |     CHECK_CUSPARSE( cusparseSpMatGetSize(matC, &M_C, &N_C, &nnz_C_64I) );
 63 |     *nnz_C = nnz_C_64I; // narrowed from int64_t to int without a range check
 64 |     CHECK_CUDA(cudaMalloc((void**)d_col_idx_C, *nnz_C*sizeof(int)));
 65 |     CHECK_CUDA(cudaMalloc((void**)d_csr_values_C, *nnz_C*sizeof(double)));
 66 |     CHECK_CUSPARSE(cusparseCsrSetPointers(matC, *d_row_ptr_C, *d_col_idx_C, *d_csr_values_C));
 67 |     // Copy the computed result into the arrays that were just attached to matC.
 68 |     CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, opA, opB,
 69 |                                        &alpha, matA, matB, &beta, matC,
 70 |                                        computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDescr),
 71 |                                        "spgemm copy");
 72 |     CHECK_CUSPARSE( cusparseSpGEMM_destroyDescr(spgemmDescr) );
 73 |     CHECK_CUSPARSE( cusparseDestroySpMat(matA) );
 74 |     CHECK_CUSPARSE( cusparseDestroySpMat(matB) );
 75 |     CHECK_CUSPARSE( cusparseDestroySpMat(matC) );
 76 |     CHECK_CUSPARSE( cusparseDestroy(handle) );
 77 |     // Only the two work buffers are released; a throwing check above skips this cleanup.
 78 |     CHECK_CUDA(cudaFree(dBuffer1));
 79 |     CHECK_CUDA(cudaFree(dBuffer2));
 80 | 
 81 |     CHECK_CUDA(cudaDeviceSynchronize());
 82 | }
 83 | 
 84 | // Pointer overload: *c receives the device arrays, shape and nnz of a * b as computed by cusparse_spgemm_inner.
 85 | void cusparse_spgemm(CSR *a, CSR *b, CSR *c){
 86 | 	int nnz_c;
 87 | 	cusparse_spgemm_inner(
 88 | 		a->d_rpt, a->d_col, a->d_val, b->d_rpt, b->d_col, b->d_val,
 89 | 		&c->d_rpt, &c->d_col, &c->d_val,
 90 | 		a->M, a->N, b->N, a->nnz, b->nnz, &nnz_c);
 91 | 	c->nnz = nnz_c;
 92 | 	c->N = b->N;
 93 | 	c->M = a->M;
 94 | }
 95 | 
 96 | // Reference overload: C receives the device arrays, shape and nnz of A * B as computed by cusparse_spgemm_inner.
 97 | void cusparse_spgemm(const CSR& A, const CSR& B, CSR& C){
 98 | 	int nnz_c;
 99 | 	cusparse_spgemm_inner(
100 | 		A.d_rpt, A.d_col, A.d_val, B.d_rpt, B.d_col, B.d_val,
101 | 		&C.d_rpt, &C.d_col, &C.d_val,
102 | 		A.M, A.N, B.N, A.nnz, B.nnz, &nnz_c);
103 | 	C.nnz = nnz_c;
104 | 	C.N = B.N;
105 | 	C.M = A.M;
106 | }
107 | 


--------------------------------------------------------------------------------
/OpSparse/inc/define.h:
--------------------------------------------------------------------------------
 1 | #ifndef _Z_DEFINE_H_
 2 | #define _Z_DEFINE_H_
 3 | // Ceiling division for positive integers; arguments are parenthesised so expressions expand correctly (each is still evaluated more than once).
 4 | #define div_up(a, b) (((a)+(b)-1)/(b))
 5 | #define div_round_up(a, b) (((a)+(b)-1)/(b))
 6 | 
 7 | #define NUM_BIN 8
 8 | #define WSIZE 32
 9 | 
10 | #define PWARP 4
11 | #define PWARP_ROWS 256
12 | #define PWARP_TSIZE 32
13 | #define PWARP_BLOCK_SIZE (PWARP * PWARP_ROWS)
14 | 
15 | #define NUMERIC_PWARP 8
16 | #define NUMERIC_PWARP_ROWS 128
17 | #define NUMERIC_PWARP_TSIZE 32
18 | #define NUMERIC_PWARP_BLOCK_SIZE (NUMERIC_PWARP * NUMERIC_PWARP_ROWS)
19 | 
20 | #define HASH_SINGLE
21 | //#define HASH_MULTI
22 | 
23 | // cannot define WARP, since thrust source code uses WARP
24 | //#define WARP 32
25 | 
26 | #define SYMBOLIC_SCALE_SMALL 1
27 | #define SYMBOLIC_SCALE_LARGE 1
28 | #define NUMERIC_SCALE_LARGE 2
29 | #define NUMERIC_SCALE 1.5
30 | #define THRESH_SCALE 0.8
31 | #define HASH_SCALE 107
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/OpSparse/inc/setup.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef __Z_SETUP_CUH__
 2 | #define __Z_SETUP_CUH__
 3 | 
 4 | #include "cuda_common.h"
 5 | #include "define.h"
 6 | // One thread per row i of A: d_row_flop[i] = sum over the row's column indices c of (d_brpt[c+1] - d_brpt[c]).
 7 | // The largest per-row value is reduced per block in shared memory, then folded into *d_max_row_flop with atomicMax.
 8 | // *d_max_row_flop is only ever raised here, so the caller has to initialise it before the launch.
 9 | __global__ void __launch_bounds__(1024, 2) k_compute_flop(
10 |     const mint* __restrict__ d_arpt,
11 |     const mint* __restrict__ d_acol,
12 |     const mint* __restrict__ d_brpt,
13 |     mint M,
14 |     mint *d_row_flop,
15 |     mint *d_max_row_flop){
16 | 
17 |     __shared__ mint shared_max_row_flop[1];
18 |     if(threadIdx.x == 0){
19 |         shared_max_row_flop[0] = 0;
20 |     }
21 |     __syncthreads();
22 | 
23 |     // Out-of-range threads skip the work but stay alive, so both barriers are reached by the whole block
24 |     // (the former early return left the barriers of the tail block divergent).
25 |     const int i = blockIdx.x * blockDim.x + threadIdx.x;
26 |     if(i < M){
27 |         mint row_flop = 0;
28 |         const mint arow_end = d_arpt[i+1];
29 |         for(mint j = d_arpt[i]; j < arow_end; j++){
30 |             const mint acol = d_acol[j];
31 |             row_flop += d_brpt[acol + 1] - d_brpt[acol];
32 |         }
33 |         d_row_flop[i] = row_flop;
34 |         atomicMax(shared_max_row_flop, row_flop);
35 |     }
36 |     __syncthreads();
37 |     if(threadIdx.x == 0 && i < M){ // as before, a block with no valid row publishes nothing
38 |         atomicMax(d_max_row_flop, shared_max_row_flop[0]);
39 |     }
40 | }
41 | 
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/OpSparse/readme.md:
--------------------------------------------------------------------------------
 1 | # Get started
 2 | 1 Profile opsparse
 3 | 
 4 | 1.1 ```$> make opsparse ```
 5 | 
 6 | 1.2 ```$> ./opsparse webbase-1M```
 7 | 
 8 | 2 Overall performance of opsparse
 9 | 
10 | 2.1 ```$> make reg_opsparse```
11 | 
12 | 2.2 ```$> ./reg_opsparse webbase-1M```
13 | 
14 | 3 Overall performance of cusparse
15 | 
16 | 3.1 ```$> make reg_cusparse```
17 | 
18 | 3.2 ```$> ./reg_cusparse webbase-1M```
19 | 
20 | 


--------------------------------------------------------------------------------
/OpSparse/src/Meta.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "Meta.h"
 3 | #include "CSR.h"
 4 | #include <cub/cub.cuh>
 5 | 
 6 | Meta::Meta(CSR &C): d_combined_mem(nullptr), combined_mem(nullptr), stream(nullptr){ // null what release() frees, in case allocate() is never called
 7 |     allocate_rpt(C); // only C.d_rpt is allocated here; the remaining buffers come from allocate()
 8 | }
 9 | 
10 | void Meta::allocate_rpt(CSR &C){ // device-allocates C.M + 1 row-pointer entries into C.d_rpt
11 |     CHECK_ERROR(cudaMalloc(&C.d_rpt, (C.M + 1)*sizeof(mint))); // any previous C.d_rpt value is overwritten, not freed
12 | }
13 | // Creates the NUM_BIN streams and carves one device allocation and one host allocation into the Meta fields.
14 | void Meta::allocate(CSR& C){
15 |     M = C.M;
16 |     N = C.N;
17 |     stream = new cudaStream_t [NUM_BIN];
18 |     for(int i = 0; i < NUM_BIN; i++){
19 |         CHECK_ERROR(cudaStreamCreate(stream + i));
20 |     }
21 |     // Null temp-storage pointer: CUB only reports the scratch size in bytes; its status is now checked as well.
22 |     CHECK_ERROR(cub::DeviceScan::ExclusiveSum(nullptr, cub_storage_size, C.d_rpt, C.d_rpt, M + 1));
23 |     // Round the byte count up to whole mint elements so the tail of the scratch area is not truncated.
24 |     size_t d_combined_size = M  + 2 * NUM_BIN + 2 + (cub_storage_size + sizeof(mint) - 1)/(sizeof(mint));
25 |     CHECK_ERROR(cudaMalloc(&d_combined_mem, d_combined_size * sizeof(mint)));
26 |     mint combined_size = 2 * NUM_BIN + 2;
27 |     // new[] instead of malloc: release() hands this buffer to delete [], and new[] throws rather than returning null.
28 |     combined_mem = new mint[combined_size];
29 | 
30 |     d_bins = (mint *)d_combined_mem; // size M
31 |     d_bin_size = (mint *)d_combined_mem + M; // size NUM_BIN
32 |     d_max_row_nnz = d_bin_size + NUM_BIN; // size 1
33 |     d_total_nnz = d_bin_size + NUM_BIN + 1; // size 1
34 |     d_bin_offset = d_total_nnz + 1; // size NUM_BIN
35 |     d_cub_storage = d_bin_offset + NUM_BIN; // scratch starts after all NUM_BIN offsets (was + 1, overlapping them)
36 | 
37 |     bin_size = (mint*) combined_mem; // size NUM_BIN
38 |     max_row_nnz = bin_size + NUM_BIN; // size 1
39 |     total_nnz = bin_size + NUM_BIN + 1; // size 1
40 |     bin_offset = bin_size + NUM_BIN + 2; // size NUM_BIN
41 |     
42 |     d_global_mem_pool = nullptr;
43 |     global_mem_pool_size = 0;
44 |     global_mem_pool_malloced = false;
45 | }
46 | 
47 | void Meta::release(){ // frees the combined device buffer, the streams and the host buffer, nulling each pointer
48 |     cudaFree(d_combined_mem);
49 |     d_combined_mem = nullptr;
50 |     if(stream != nullptr){
51 |         for(cudaStream_t *s = stream; s != stream + NUM_BIN; ++s){
52 |             cudaStreamDestroy(*s);
53 |         }
54 |         delete [] stream;
55 |         stream = nullptr;
56 |     }
57 |     delete [] combined_mem;
58 |     combined_mem = nullptr;
59 | }
60 | 
// Destructor: hands all cleanup to release().
Meta::~Meta()
{
    this->release();
}
64 | 
65 | 
// Asynchronously zero NUM_BIN + 2 mint entries starting at d_bin_size, issued
// on stream[stream_idx].
void Meta::memset_all(mint stream_idx = 1){
    const size_t zero_bytes = (NUM_BIN + 2) * sizeof(mint);
    cudaStream_t target_stream = stream[stream_idx];
    CHECK_ERROR(cudaMemsetAsync(d_bin_size, 0, zero_bytes, target_stream));
}
// Asynchronously zero the NUM_BIN mint entries of d_bin_size, issued on
// stream[stream_idx].
void Meta::memset_bin_size(mint stream_idx = 1){
    const size_t zero_bytes = NUM_BIN * sizeof(mint);
    cudaStream_t target_stream = stream[stream_idx];
    CHECK_ERROR(cudaMemsetAsync(d_bin_size, 0, zero_bytes, target_stream));
}
74 | 
// Asynchronously copy NUM_BIN + 2 mint entries from d_bin_size (device) to
// bin_size (host) on stream[stream_idx]. The copy is only enqueued here; the
// caller has to synchronise the stream before reading the host values.
void Meta::D2H_all(mint stream_idx = 0){
    const size_t copy_bytes = (NUM_BIN + 2) * sizeof(mint);
    cudaStream_t target_stream = stream[stream_idx];
    CHECK_ERROR(cudaMemcpyAsync(bin_size, d_bin_size, copy_bytes,
                                cudaMemcpyDeviceToHost, target_stream));
}
79 | 
// Asynchronously copy the NUM_BIN mint entries of d_bin_size (device) to
// bin_size (host) on stream[stream_idx].
void Meta::D2H_bin_size(mint stream_idx = 0){
    const size_t copy_bytes = NUM_BIN * sizeof(mint);
    cudaStream_t target_stream = stream[stream_idx];
    CHECK_ERROR(cudaMemcpyAsync(bin_size, d_bin_size, copy_bytes,
                                cudaMemcpyDeviceToHost, target_stream));
}
84 | 
// Asynchronously copy the NUM_BIN mint entries of bin_offset (host) to
// d_bin_offset (device) on stream[stream_idx].
void Meta::H2D_bin_offset(mint stream_idx = 0){
    const size_t copy_bytes = NUM_BIN * sizeof(mint);
    cudaStream_t target_stream = stream[stream_idx];
    CHECK_ERROR(cudaMemcpyAsync(d_bin_offset, bin_offset, copy_bytes,
                                cudaMemcpyHostToDevice, target_stream));
}
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/OpSparse/src/Timings.cu:
--------------------------------------------------------------------------------
 1 | #include "Timings.h"
 2 | #include <stdio.h>
 3 | 
 4 | Timings::Timings(){
 5 |     measure_separate = true;
 6 |     measure_total = true;
 7 |     setup = 0;
 8 |     symbolic_binning = 0;
 9 |     symbolic = 0;
10 |     reduce = 0;
11 |     numeric_binning = 0;
12 |     prefix = 0;
13 |     allocate = 0;
14 |     numeric = 0;
15 |     cleanup = 0;
16 |     total = 0;
17 | }
18 | 
// Accumulate every timer of `b` into the matching timer of this object.
// The measure_* flags are left untouched.
void Timings::operator+=(const Timings& b){
    // end-to-end timer
    total += b.total;

    // per-stage timers
    setup            += b.setup;
    symbolic_binning += b.symbolic_binning;
    symbolic         += b.symbolic;
    reduce           += b.reduce;
    numeric_binning  += b.numeric_binning;
    prefix           += b.prefix;
    allocate         += b.allocate;
    numeric          += b.numeric;
    cleanup          += b.cleanup;
}
31 | 
// Divide every timer by x (used to turn accumulated sums into averages).
// The measure_* flags are left untouched.
void Timings::operator/=(const double x){
    // end-to-end timer
    total /= x;

    // per-stage timers
    setup            /= x;
    symbolic_binning /= x;
    symbolic         /= x;
    reduce           /= x;
    numeric_binning  /= x;
    prefix           /= x;
    allocate         /= x;
    numeric          /= x;
    cleanup          /= x;
}
44 | 
// Print a full report: the flop count, then (only when measure_separate is
// set) each timer scaled by 1000 with its share of `total`, followed by
// total_flop / 1e9 divided by each timer.
// Selected rows are wrapped in ANSI colour escapes (bold red, then reset).
//   total_flop: flop count used for the throughput figures.
void Timings::print(double total_flop){
    double total_flop_G = total_flop/1000000000;
    printf("total flop %lf\n", total_flop);
    if(measure_separate){
        // Sum of the individual stages, printed next to `total` for comparison.
        double sum_total = setup + symbolic_binning + symbolic + numeric_binning
            + reduce + prefix + allocate + numeric + cleanup;
        // "\033" is the portable spelling of the ESC byte; "\e" is a compiler
        // extension that not every host compiler accepts.
        printf("time(ms):\n");
        printf("    setup            %8.3lfms %6.2lf%%\n", 1000*setup, setup/total*100);
        printf("\033[1;31m    symbolic_binning %8.3lfms %6.2lf%%\n\033[0m", 1000*symbolic_binning, symbolic_binning/total*100);
        printf("\033[1;31m    symbolic         %8.3lfms %6.2lf%%\n\033[0m", 1000*symbolic, symbolic/total*100);
        printf("    reduce            %8.3lfms %6.2lf%%\n", 1000*reduce, reduce/total*100);
        printf("\033[1;31m    numeric_binning  %8.3lfms %6.2lf%%\n\033[0m", 1000*numeric_binning, numeric_binning/total*100);
        printf("    prefix           %8.3lfms %6.2lf%%\n", 1000*prefix, prefix/total*100);
        printf("    allocate         %8.3lfms %6.2lf%%\n", 1000*allocate, allocate/total*100);
        printf("\033[1;31m    numeric          %8.3lfms %6.2lf%%\n\033[0m", 1000*numeric, numeric/total*100);
        printf("    cleanup          %8.3lfms %6.2lf%%\n", 1000*cleanup, cleanup/total*100);
        printf("    sum_total        %8.3lfms %6.2lf%%\n", 1000*sum_total, sum_total/total*100);
        printf("    total            %8.3lfms %6.2lf%%\n", 1000*total, total/total*100);
        printf("perf(Gflops):\n");
        printf("    setup            %6.2lf\n", total_flop_G/setup);
        printf("    symbolic_binning %6.2lf\n", total_flop_G/symbolic_binning);
        printf("    symbolic         %6.2lf\n", total_flop_G/symbolic);
        printf("    reduce           %6.2lf\n", total_flop_G/reduce);
        printf("    numeric_binning  %6.2lf\n", total_flop_G/numeric_binning);
        printf("    prefix           %6.2lf\n", total_flop_G/prefix);
        printf("    allocate         %6.2lf\n", total_flop_G/allocate);
        printf("    numeric          %6.2lf\n", total_flop_G/numeric);
        printf("    cleanup          %6.2lf\n", total_flop_G/cleanup);
        printf("    total            %6.2lf\n", total_flop_G/total);
    }
}
77 |         
// Print one number: (total_flop / 1e9) divided by the `total` timer.
void Timings::reg_print(double total_flop){
    const double giga_flop = total_flop/1000000000;
    const double overall_rate = giga_flop/total;
    printf("%6.2lf\n", overall_rate);
}
82 | 
// Print two numbers on one line: (total_flop / 1e9) divided by the `symbolic`
// timer and by the `numeric` timer.
void Timings::perf_print(double total_flop){
    const double giga_flop = total_flop/1000000000;
    const double symbolic_rate = giga_flop/symbolic;
    const double numeric_rate = giga_flop/numeric;
    printf("%6.2lf %6.2lf\n", symbolic_rate, numeric_rate);
}
87 | 
// Print the combined binning time (symbolic_binning + numeric_binning) and
// its percentage of the `total` timer.
//   total_flop: accepted for signature parity with the other print helpers;
//               it is not used by this report.
void Timings::binning_print(double total_flop){
    (void)total_flop; // intentionally unused
    double total_binning_time = symbolic_binning + numeric_binning;
    printf("%.4le %.4lf\n", total_binning_time, 100*total_binning_time/total);
}
93 | 


--------------------------------------------------------------------------------
/OpSparse/src/opsparse.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "kernel_wrapper.cuh"
  3 | #include <fstream>
  4 | #include <cuda_profiler_api.h>
  5 | #include <cub/cub.cuh>
  6 | #include "cusparse_spgemm.h"
  7 | #include "Timings.h"
  8 | 
  9 | 
 10 | void opsparse(const CSR& A, const CSR& B, CSR& C, Meta& meta, Timings& timing){
 11 |     
 12 |     double t0, t1;
 13 |     t1 = t0 = fast_clock_time();
 14 |     C.M = A.M;
 15 |     C.N = B.N;
 16 |     C.nnz = 0;
 17 |     h_setup(A, B, C, meta, timing);
 18 |     CHECK_ERROR(cudaDeviceSynchronize());
 19 |     timing.setup = fast_clock_time() - t0;
 20 | 
 21 |     // symbolic binning
 22 |     t0 = fast_clock_time();
 23 |     h_symbolic_binning(C, meta);
 24 |     CHECK_ERROR(cudaDeviceSynchronize());
 25 |     timing.symbolic_binning = fast_clock_time() - t0;
 26 | 
 27 | 
 28 |     // symbolic phase
 29 |     t0 = fast_clock_time();
 30 |     h_symbolic(A, B, C, meta);
 31 |     CHECK_ERROR(cudaDeviceSynchronize());
 32 |     timing.symbolic = fast_clock_time() - t0;
 33 | 
 34 | 
 35 |     // numeric binning
 36 |     t0 = fast_clock_time();
 37 |     h_numeric_binning(C, meta);
 38 |     CHECK_ERROR(cudaDeviceSynchronize());
 39 |     timing.numeric_binning = fast_clock_time() - t0;
 40 | 
 41 |     // malloc C
 42 |     t0 = fast_clock_time();
 43 |     C.nnz = *meta.total_nnz;
 44 |     CHECK_ERROR(cudaMalloc(&C.d_val, C.nnz * sizeof(mdouble)));
 45 |     CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
 46 |     timing.allocate = fast_clock_time() - t0;
 47 | 
 48 |     // prefix sum and malloc
 49 |     t0 = fast_clock_time();
 50 |     cub::DeviceScan::ExclusiveSum(meta.d_cub_storage, meta.cub_storage_size, C.d_rpt, C.d_rpt, C.M + 1);
 51 |     CHECK_ERROR(cudaDeviceSynchronize());
 52 |     timing.prefix = fast_clock_time() - t0;
 53 | 
 54 |     // numeric   
 55 |     t0 = fast_clock_time();
 56 |     h_numeric_full_occu(A, B, C, meta);
 57 |     CHECK_ERROR(cudaDeviceSynchronize());
 58 |     timing.numeric= fast_clock_time() - t0;
 59 | 
 60 |     // cleanup
 61 |     t0 = fast_clock_time();
 62 |     meta.release();
 63 |     timing.cleanup = fast_clock_time() - t0;
 64 |     timing.total = fast_clock_time() - t1;
 65 | }
 66 | 
 67 | int main(int argc, char **argv)
 68 | {
 69 |     std::string mat1, mat2;
 70 |     mat1 = "can_24";
 71 |     mat2 = "can_24";
 72 |     if(argc == 2){
 73 |         mat1 = argv[1];
 74 |         mat2 = argv[1];
 75 |     }
 76 |     if(argc >= 3){
 77 |         mat1 = argv[1];
 78 |         mat2 = argv[2];
 79 |     }
 80 |     std::string mat1_file;
 81 |     if(mat1.find("ER") != std::string::npos){
 82 |         mat1_file = "../matrix/ER/" + mat1 +".mtx";
 83 |     }
 84 |     else if(mat1.find("G500") != std::string::npos){
 85 |         mat1_file = "../matrix/G500/" + mat1 +".mtx";
 86 |     }
 87 |     else{
 88 |         mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx";
 89 |     }
 90 |     std::string mat2_file;
 91 |     if(mat2.find("ER") != std::string::npos){
 92 |         mat2_file = "../matrix/ER/" + mat2 +".mtx";
 93 |     }
 94 |     else if(mat2.find("G500") != std::string::npos){
 95 |         mat2_file = "../matrix/G500/" + mat2 +".mtx";
 96 |     }
 97 |     else{
 98 |         mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx";
 99 |     }
100 | 	
101 |     CSR A, B;
102 |     A.construct(mat1_file);
103 |     if(mat1 == mat2){
104 |         B = A;
105 |     }
106 |     else{
107 |         B.construct(mat2_file);
108 |         if(A.N == B.M){
109 |             // do nothing
110 |         }
111 |         else if(A.N < B.M){
112 |             CSR tmp(B, A.N, B.N, 0, 0);
113 |             B = tmp;
114 |         }
115 |         else{
116 |             CSR tmp(A, A.M, B.M, 0, 0);
117 |             A = tmp;
118 |         }
119 |     }
120 | 
121 |     A.H2D();
122 |     B.H2D();
123 | 
124 |     long total_flop = compute_flop(A, B);
125 |     CSR C;
126 |     cudaruntime_warmup();
127 |     Meta meta;
128 |     {
129 |         Timings timing;
130 |         opsparse(A, B, C, meta, timing);
131 |         C.release();
132 |     }
133 |     mint iter = 10;
134 |     Timings timing, bench_timing;
135 |     for(mint i = 0; i < iter; i++){
136 |         opsparse(A, B, C, meta, timing);
137 |         bench_timing += timing;
138 |         if(i < iter - 1){
139 |             C.release();
140 |         }
141 |     }
142 |     bench_timing /= iter;
143 | 
144 |     printf("%s ",mat1.c_str());
145 |     bench_timing.print(total_flop * 2);
146 | 
147 |     // compare result
148 | 
149 |     //C.D2H();
150 |     //CSR C_ref;
151 |     //cusparse_spgemm(&A, &B, &C_ref);
152 |     //C_ref.D2H();
153 |     //if(C == C_ref){
154 |     //    printf("pass\n");
155 |     //}
156 |     //else{
157 |     //    printf("error\n");
158 |     //}
159 |     
160 |     A.release();
161 |     B.release();
162 | 
163 |     C.release();
164 |     return 0;
165 | }
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/OpSparse/src/reg_cusparse.cu:
--------------------------------------------------------------------------------
 1 | #include "kernel_wrapper.cuh"
 2 | #include <fstream>
 3 | #include <cuda_profiler_api.h>
 4 | #include <cub/cub.cuh>
 5 | #include "Timings.h"
 6 | #include "cusparse_spgemm.h"
 7 | 
 8 | 
 9 | int main(int argc, char **argv)
10 | {
11 |     std::string mat1, mat2;
12 |     mat1 = "can_24";
13 |     mat2 = "can_24";
14 |     if(argc == 2){
15 |         mat1 = argv[1];
16 |         mat2 = argv[1];
17 |     }
18 |     if(argc >= 3){
19 |         mat1 = argv[1];
20 |         mat2 = argv[2];
21 |     }
22 |     std::string mat1_file;
23 |     if(mat1.find("ER") != std::string::npos){
24 |         mat1_file = "../matrix/ER/" + mat1 +".mtx";
25 |     }
26 |     else if(mat1.find("G500") != std::string::npos){
27 |         mat1_file = "../matrix/G500/" + mat1 +".mtx";
28 |     }
29 |     else{
30 |         mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx";
31 |     }
32 |     std::string mat2_file;
33 |     if(mat2.find("ER") != std::string::npos){
34 |         mat2_file = "../matrix/ER/" + mat2 +".mtx";
35 |     }
36 |     else if(mat2.find("G500") != std::string::npos){
37 |         mat2_file = "../matrix/G500/" + mat2 +".mtx";
38 |     }
39 |     else{
40 |         mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx";
41 |     }
42 | 	
43 |     CSR A, B;
44 |     A.construct(mat1_file);
45 |     if(mat1 == mat2){
46 |         B = A;
47 |     }
48 |     else{
49 |         B.construct(mat2_file);
50 |         if(A.N == B.M){
51 |             // do nothing
52 |         }
53 |         else if(A.N < B.M){
54 |             CSR tmp(B, A.N, B.N, 0, 0);
55 |             B = tmp;
56 |         }
57 |         else{
58 |             CSR tmp(A, A.M, B.M, 0, 0);
59 |             A = tmp;
60 |         }
61 |     }
62 | 
63 | 
64 |     A.H2D();
65 |     B.H2D();
66 | 
67 |     long total_flop = compute_flop(A, B);
68 |     double total_flop_G = double(total_flop) * 2/1000000000;
69 |  
70 |     CSR C;
71 |     double t0 = fast_clock_time(), t1;
72 |     cusparse_spgemm(&A, &B, &C);
73 |     C.release();
74 | 
75 |     int iter = 10;
76 |     t1 = 0;
77 |     for(int i = 0; i < iter; i++){
78 |         t0 = fast_clock_time();
79 |         cusparse_spgemm(&A, &B, &C);
80 |         t1 += fast_clock_time() - t0;
81 |         //printf("iter %d %le\n", i, fast_clock_time() - t0);
82 |         if(i < iter - 1){
83 |             C.release();
84 |         }
85 |     }
86 |     t1 /= iter;
87 |     //printf("executione time %le, flops %lf\n\n", t1, total_flop_G / t1);
88 |     printf("%s %lf\n", mat1.c_str(), total_flop_G / t1);
89 | 
90 |     A.release();
91 |     B.release();
92 |     C.release();
93 |     return 0;
94 | }
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/OpSparse/src/reg_opsparse.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "kernel_wrapper.cuh"
  3 | #include <fstream>
  4 | #include <cuda_profiler_api.h>
  5 | #include <cub/cub.cuh>
  6 | #include "cusparse_spgemm.h"
  7 | #include "Timings.h"
  8 | 
  9 | 
 10 | void opsparse(const CSR& A, const CSR& B, CSR& C, Meta& meta, Timings& timing){
 11 |     
 12 |     double t0, t1;
 13 |     t1 = t0 = fast_clock_time();
 14 |     C.M = A.M;
 15 |     C.N = B.N;
 16 |     C.nnz = 0;
 17 |     h_setup(A, B, C, meta, timing);
 18 |     CHECK_ERROR(cudaDeviceSynchronize());
 19 |     timing.setup = fast_clock_time() - t0;
 20 | 
 21 |     // symbolic binning
 22 |     t0 = fast_clock_time();
 23 |     h_symbolic_binning(C, meta);
 24 |     CHECK_ERROR(cudaDeviceSynchronize());
 25 |     timing.symbolic_binning = fast_clock_time() - t0;
 26 | 
 27 | 
 28 |     // symbolic phase
 29 |     t0 = fast_clock_time();
 30 |     h_symbolic(A, B, C, meta);
 31 |     CHECK_ERROR(cudaDeviceSynchronize());
 32 |     timing.symbolic = fast_clock_time() - t0;
 33 | 
 34 | 
 35 | 
 36 |     // numeric binning, exclusive sum, and allocate C
 37 |     meta.memset_all(0);
 38 |     mint BS = 1024;
 39 |     mint GS = div_up(C.M, BS);
 40 |     k_numeric_binning<<<GS, BS, 0 , meta.stream[0]>>>(C.d_rpt, C.M,
 41 |         meta.d_bin_size, meta.d_total_nnz, meta.d_max_row_nnz);
 42 |     meta.D2H_all(0);
 43 |     CHECK_ERROR(cudaStreamSynchronize(meta.stream[0]));
 44 |     C.nnz = *meta.total_nnz;
 45 | 
 46 |     if(*meta.max_row_nnz <= 16){
 47 |         k_binning_small<<<GS, BS>>>(meta.d_bins, C.M);
 48 |         CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
 49 |         meta.bin_size[0] = C.M;
 50 |         for(int i = 1; i< NUM_BIN; i++){
 51 |             meta.bin_size[i] = 0;
 52 |         }
 53 |         meta.bin_offset[0] = 0;
 54 |         for(int i = 1; i < NUM_BIN; i++){
 55 |             meta.bin_offset[i] = C.M;
 56 |         }
 57 |     }
 58 |     else{
 59 |         meta.memset_bin_size(0);
 60 |         meta.bin_offset[0] = 0;
 61 |         for(int i = 0; i < NUM_BIN - 1; i++){
 62 |             meta.bin_offset[i+1] = meta.bin_offset[i] + meta.bin_size[i];
 63 |         }
 64 |         meta.H2D_bin_offset(0);
 65 | 
 66 |         k_numeric_binning2<<<GS, BS, 0, meta.stream[0]>>>(C.d_rpt, C.M,
 67 |             meta.d_bins, meta.d_bin_size, meta.d_bin_offset);
 68 |         CHECK_ERROR(cudaMalloc(&C.d_col, C.nnz * sizeof(mint)));
 69 |     }
 70 |     CHECK_ERROR(cudaDeviceSynchronize());
 71 | 
 72 |     cub::DeviceScan::ExclusiveSum(meta.d_cub_storage, meta.cub_storage_size, C.d_rpt, C.d_rpt, C.M + 1);
 73 |     CHECK_ERROR(cudaMalloc(&C.d_val, C.nnz * sizeof(mdouble)));
 74 |     CHECK_ERROR(cudaDeviceSynchronize());
 75 | 
 76 |     // numeric   
 77 |     t0 = fast_clock_time();
 78 |     h_numeric_full_occu(A, B, C, meta);
 79 |     CHECK_ERROR(cudaDeviceSynchronize());
 80 |     timing.numeric= fast_clock_time() - t0;
 81 | 
 82 |     // cleanup
 83 |     t0 = fast_clock_time();
 84 |     meta.release();
 85 |     timing.cleanup = fast_clock_time() - t0;
 86 |     timing.total = fast_clock_time() - t1;
 87 | }
 88 | 
 89 | int main(int argc, char **argv)
 90 | {
 91 |     std::string mat1, mat2;
 92 |     mat1 = "can_24";
 93 |     mat2 = "can_24";
 94 |     if(argc == 2){
 95 |         mat1 = argv[1];
 96 |         mat2 = argv[1];
 97 |     }
 98 |     if(argc >= 3){
 99 |         mat1 = argv[1];
100 |         mat2 = argv[2];
101 |     }
102 |     std::string mat1_file;
103 |     if(mat1.find("ER") != std::string::npos){
104 |         mat1_file = "../matrix/ER/" + mat1 +".mtx";
105 |     }
106 |     else if(mat1.find("G500") != std::string::npos){
107 |         mat1_file = "../matrix/G500/" + mat1 +".mtx";
108 |     }
109 |     else{
110 |         mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx";
111 |     }
112 |     std::string mat2_file;
113 |     if(mat2.find("ER") != std::string::npos){
114 |         mat2_file = "../matrix/ER/" + mat2 +".mtx";
115 |     }
116 |     else if(mat2.find("G500") != std::string::npos){
117 |         mat2_file = "../matrix/G500/" + mat2 +".mtx";
118 |     }
119 |     else{
120 |         mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx";
121 |     }
122 | 	
123 |     CSR A, B;
124 |     A.construct(mat1_file);
125 |     if(mat1 == mat2){
126 |         B = A;
127 |     }
128 |     else{
129 |         B.construct(mat2_file);
130 |         if(A.N == B.M){
131 |             // do nothing
132 |         }
133 |         else if(A.N < B.M){
134 |             CSR tmp(B, A.N, B.N, 0, 0);
135 |             B = tmp;
136 |         }
137 |         else{
138 |             CSR tmp(A, A.M, B.M, 0, 0);
139 |             A = tmp;
140 |         }
141 |     }
142 | 
143 |     A.H2D();
144 |     B.H2D();
145 | 
146 |     long total_flop = compute_flop(A, B);
147 |     CSR C;
148 |     cudaruntime_warmup();
149 |     Meta meta;
150 |     {
151 |         Timings timing;
152 |         opsparse(A, B, C, meta, timing);
153 |         C.release();
154 |     }
155 |     
156 |     mint iter = 10;
157 |     Timings timing, bench_timing;
158 |     for(mint i = 0; i < iter; i++){
159 |         opsparse(A, B, C, meta, timing);
160 |         bench_timing += timing;
161 |         if(i < iter - 1){
162 |             C.release();
163 |         }
164 |     }
165 |     bench_timing /= iter;
166 | 
167 |     printf("%s ",mat1.c_str());
168 |     bench_timing.reg_print(total_flop * 2);
169 | 
170 |     // compare result
171 | 
172 |     //C.D2H();
173 |     //CSR C_ref;
174 |     //cusparse_spgemm(&A, &B, &C_ref);
175 |     //C_ref.D2H();
176 |     //if(C == C_ref){
177 |     //    printf("pass\n");
178 |     //}
179 |     //else{
180 |     //    printf("error\n");
181 |     //}
182 |     
183 |     A.release();
184 |     B.release();
185 | 
186 |     C.release();
187 |     return 0;
188 | }
189 | 
190 | 
191 | 


--------------------------------------------------------------------------------
/download_matrix.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -d matrix/suite_sparse ]; then
 4 |     cd matrix/suite_sparse
 5 | else
 6 |     mkdir -p matrix/suite_sparse
 7 |     cd matrix/suite_sparse
 8 | fi
 9 | 
10 | # download webbase-1M
11 | if [ ! -e webbase-1M/webbase-1M.mtx ]; then
12 |     wget https://suitesparse-collection-website.herokuapp.com/MM/Williams/webbase-1M.tar.gz
13 |     tar zxvf webbase-1M.tar.gz
14 | fi
15 | echo Successfully downloaded the matrix.
16 | 
17 | 


--------------------------------------------------------------------------------
/nsparse/Makefile:
--------------------------------------------------------------------------------
 1 | CXX = nvcc
 2 | NVCC = nvcc
 3 | 
 4 | #CFLAGS = -O3 -g
 5 | #CFLAGS = -g -lineinfo
 6 | CFLAGS = -O3
 7 | CFLAGS += -L. ${REAL} -lm
 8 | LDFLAGS = ${CFLAGS}
 9 | 
10 | # for Device Code
11 | CUDA_PATH = /usr/local/cuda
12 | LDFLAGS += -L${CUDA_PATH}/lib64
13 | LDFLAGS += -arch=sm_70 -lcudart -lcusparse
14 | INCLUDE = -I./inc
15 | INCLUDE += -I${CUDA_PATH}/include
16 | INCLUDE += -I${CUDA_PATH}/samples/common/inc
17 | 
18 | BIN = ./bin
19 | SRC = ./
20 | OBJ = ./obj
21 | INC = ./inc
22 | 
23 | OBJ_SUF = .o
24 | OS_SUF = .s.o
25 | OD_SUF = .d.o
26 | TS_SUF = _s
27 | TD_SUF = _d
28 | 
29 | 
30 | SRC_SPGEMM = $(SRC)
31 | SAMPLE_SPGEMM = $(wildcard $(SRC_SPGEMM)/*.cu)
32 | SAMPLE_SPGEMM_TARGET = $(SAMPLE_SPGEMM:$(SRC)%=$(BIN)%)
33 | 
34 | all :
35 | 	make spgemm
36 | 
37 | spgemm: $(SAMPLE_SPGEMM_TARGET:.cu=$(TD_SUF))
38 | 
39 | $(BIN)/%$(TS_SUF): $(OBJ)/%$(OS_SUF)
40 | 	mkdir -p $(dir $@)
41 | 	$(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE)
42 | 
43 | $(BIN)/%$(TD_SUF): $(OBJ)/%$(OD_SUF)
44 | 	mkdir -p $(dir $@)
45 | 	$(NVCC) -o $@ $^ $(LDFLAGS) $(INCLUDE)
46 | 
47 | $(OBJ)/%$(OS_SUF) : $(SRC)/%.cu
48 | 	mkdir -p $(dir $@)
49 | 	$(NVCC) -c -DFLOAT $(LDFLAGS) $(INCLUDE) -o $@ $<
50 | 
51 | $(OBJ)/%$(OD_SUF) : $(SRC)/%.cu
52 | 	mkdir -p $(dir $@)
53 | 	$(NVCC) -c -DDOUBLE $(LDFLAGS) $(INCLUDE) -o $@ $<
54 | 
55 | $(OBJ)/%$(OS_SUF) : $(SRC)/%.cpp
56 | 	mkdir -p $(dir $@)
57 | 	$(NVCC) -c -DFLOAT $(LDFLAGS) $(INCLUDE) -o $@ $<
58 | 
59 | $(OBJ)/%$(OD_SUF) : $(SRC)/%.cpp
60 | 	mkdir -p $(dir $@)
61 | 	$(NVCC) -c -DDOUBLE $(LDFLAGS) $(INCLUDE) -o $@ $<
62 | 
63 | clean :
64 | 	rm -rf $(BIN)/*
65 | 	rm -rf $(OBJ)/*
66 | 


--------------------------------------------------------------------------------
/nsparse/inc/CSR.hpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <string>
  3 | #include <cuda.h>
  4 | 
  5 | using namespace std;
  6 | 
  7 | #ifndef CSR_H
  8 | #define CSR_H
  9 | template <class idType, class valType>
 10 | class CSR
 11 | {
 12 | public:
 13 |     CSR():nrow(0), ncolumn(0), nnz(0), device_malloc(false)
 14 |     {
 15 |     }
 16 |     ~CSR()
 17 |     {
 18 |     }
 19 |     void release_cpu_csr()
 20 |     {
 21 |         delete[] rpt;
 22 |         delete[] colids;
 23 |         delete[] values;
 24 |     }
 25 |     void release_csr()
 26 |     {
 27 |         if (device_malloc) {
 28 |             cudaFree(d_rpt);
 29 |             cudaFree(d_colids);
 30 |             cudaFree(d_values);
 31 |         }
 32 |         device_malloc = false;
 33 |     }
 34 |     bool operator==(CSR mat)
 35 |     {
 36 |         bool f = false;
 37 |         if (nrow != mat.nrow) {
 38 |             cout << "Number of row is not correct: " << nrow << ", " << mat.nrow << endl;
 39 |             return f;
 40 |         }
 41 |         if (ncolumn != mat.ncolumn) {
 42 |             cout << "Number of column is not correct" << ncolumn << ", " << mat.ncolumn << endl;
 43 |             return f;
 44 |         }
 45 |         if (nnz != mat.nnz) {
 46 |             cout << "Number of nz is not correct" << nnz << ", " << mat.nnz << endl;
 47 |             return f;
 48 |         }
 49 |         if (rpt == NULL || mat.rpt == NULL || colids == NULL || mat.colids == NULL || values == NULL || mat.values == NULL) {
 50 |             cout << "NULL Pointer" << endl;
 51 |             return f;
 52 |         }
 53 |         for (idType i = 0; i < nrow + 1; ++i) {
 54 |             if (rpt[i] != mat.rpt[i]) {
 55 |                 cout << "rpt[" << i << "] is not correct" << endl;
 56 |                 return f;
 57 |             }
 58 |         }
 59 |         for (idType i = 0; i < nnz; ++i) {
 60 |             if (colids[i] != mat.colids[i]) {
 61 |                 cout << "colids[" << i << "] is not correct" << endl;
 62 |                 return f;
 63 |             }
 64 |         }
 65 |         idType total_fail = 10;
 66 |         valType delta, base, scale;
 67 |         for (idType i = 0; i < nnz; ++i) {
 68 |             delta = values[i] - mat.values[i];
 69 |             base = values[i];
 70 |             if (delta < 0) {
 71 |                 delta *= -1;
 72 |             }
 73 |             if (base < 0) {
 74 |                 base *= -1;
 75 |             }
 76 |             scale = 1000;
 77 |             if (sizeof(valType) == sizeof(double)) {
 78 |                 scale *= 1000;
 79 |             }
 80 |             if (delta * scale * 100 > base) {
 81 |                 cout << i << ": " << values[i] << ", " << mat.values[i] << endl;
 82 |                 total_fail--;
 83 |             }
 84 |             if (total_fail == 0) {
 85 |                 cout << "values[" << i << "] is not correct" << endl;
 86 |                 return f;
 87 |             }
 88 |         }
 89 |         f = true;
 90 |         return f;
 91 |     }
 92 | 
 93 |     void init_data_from_mtx(string file_path);
 94 |     void memcpyHtD()
 95 |     {
 96 |         if (!device_malloc) {
 97 |             //cout << "Allocating memory space for matrix data on device memory" << endl;
 98 |             cudaMalloc((void **)&d_rpt, sizeof(idType) * (nrow + 1));
 99 |             cudaMalloc((void **)&d_colids, sizeof(idType) * nnz);
100 |             cudaMalloc((void **)&d_values, sizeof(valType) * nnz);
101 |         }
102 |         //cout << "Copying matrix data to GPU device" << endl;
103 |         cudaMemcpy(d_rpt, rpt, sizeof(idType) * (nrow + 1), cudaMemcpyHostToDevice);
104 |         cudaMemcpy(d_colids, colids, sizeof(idType) * nnz, cudaMemcpyHostToDevice);
105 |         cudaMemcpy(d_values, values, sizeof(valType) * nnz, cudaMemcpyHostToDevice);
106 |         device_malloc = true;
107 |     }
108 |     void memcpyDtH()
109 |     {
110 |         rpt = new idType[nrow + 1];
111 |         colids = new idType[nnz];
112 |         values = new valType[nnz];
113 |         //cout << "Matrix data is copied to Host" << endl;
114 |         cudaMemcpy(rpt, d_rpt, sizeof(idType) * (nrow + 1), cudaMemcpyDeviceToHost);
115 |         cudaMemcpy(colids, d_colids, sizeof(idType) * nnz, cudaMemcpyDeviceToHost);
116 |         cudaMemcpy(values, d_values, sizeof(valType) * nnz, cudaMemcpyDeviceToHost);
117 |     }
118 | 
119 |     void spmv_cpu(valType *x, valType *y);
120 |     
121 |     idType *rpt;
122 |     idType *colids;
123 |     valType *values;
124 |     idType *d_rpt;
125 |     idType *d_colids;
126 |     valType *d_values;
127 |     idType nrow;
128 |     idType ncolumn;
129 |     idType nnz;
130 |     bool host_malloc;
131 |     bool device_malloc;
132 | };
133 | 
134 | template <class idType, class valType>
135 | void CSR<idType, valType>::init_data_from_mtx(string file_path)
136 | {
137 |     idType i, num;
138 |     bool isUnsy;
139 |     char *line, *ch;
140 |     FILE *fp;
141 |     idType *col_coo, *row_coo, *nnz_num, *each_row_index;
142 |     valType *val_coo;
143 |     idType LINE_LENGTH_MAX = 256;
144 | 
145 |     device_malloc = false;
146 |     
147 |     isUnsy = false;
148 |     line = new char[LINE_LENGTH_MAX];
149 |   
150 |     /* Open File */
151 |     fp = fopen(file_path.c_str(), "r");
152 |     if (fp == NULL) {
153 |         cout << "Cannot find file" << endl;
154 |         exit(1);
155 |     }
156 | 
157 |     fgets(line, LINE_LENGTH_MAX, fp);
158 |     if (strstr(line, "general")) {
159 |         isUnsy = true;
160 |     }
161 |     do {
162 |         fgets(line, LINE_LENGTH_MAX, fp);
163 |     } while(line[0] == '%');
164 |   
165 |     /* Get size info */
166 |     sscanf(line, "%d %d %d", &nrow, &ncolumn, &nnz);
167 |     
168 |     /* Store in COO format */
169 |     num = 0;
170 |     col_coo = new idType[nnz];
171 |     row_coo = new idType[nnz];
172 |     val_coo = new valType[nnz];
173 | 
174 |     while (fgets(line, LINE_LENGTH_MAX, fp)) {
175 |         ch = line;
176 |         /* Read first word (row id)*/
177 |         row_coo[num] = (idType)(atoi(ch) - 1);
178 |         ch = strchr(ch, ' ');
179 |         ch++;
180 |         /* Read second word (column id)*/
181 |         col_coo[num] = (idType)(atoi(ch) - 1);
182 |         ch = strchr(ch, ' ');
183 | 
184 |         if (ch != NULL) {
185 |             ch++;
186 |             /* Read third word (value data)*/
187 |             val_coo[num] = (valType)atof(ch);
188 |             ch = strchr(ch, ' ');
189 |         }
190 |         else {
191 |             val_coo[num] = 1.0;
192 |         }
193 |         num++;
194 |     }
195 |     fclose(fp);
196 |     delete[] line;
197 | 
198 |     /* Count the number of non-zero in each row */
199 |     nnz_num = new idType[nrow];
200 |     for (i = 0; i < nrow; i++) {
201 |         nnz_num[i] = 0;
202 |     }
203 |     for (i = 0; i < num; i++) {
204 |         nnz_num[row_coo[i]]++;
205 |         if(col_coo[i] != row_coo[i] && isUnsy == false) {
206 |             nnz_num[col_coo[i]]++;
207 |             nnz++;
208 |         }
209 |     }
210 | 
211 |     /* Allocation of rpt, col, val */
212 |     rpt = new idType[nrow + 1];
213 |     colids = new idType[nnz];
214 |     values = new valType[nnz];
215 | 
216 |     rpt[0] = 0;
217 |     for (i = 0; i < nrow; i++) {
218 |         rpt[i + 1] = rpt[i] + nnz_num[i];
219 |     }
220 | 
221 |     each_row_index = new idType[nrow];
222 |     for (i = 0; i < nrow; i++) {
223 |         each_row_index[i] = 0;
224 |     }
225 |   
226 |     for (i = 0; i < num; i++) {
227 |         colids[rpt[row_coo[i]] + each_row_index[row_coo[i]]] = col_coo[i];
228 |         values[rpt[row_coo[i]] + each_row_index[row_coo[i]]++] = val_coo[i];
229 |     
230 |         if (col_coo[i] != row_coo[i] && isUnsy == false) {
231 |             colids[rpt[col_coo[i]] + each_row_index[col_coo[i]]] = row_coo[i];
232 |             values[rpt[col_coo[i]] + each_row_index[col_coo[i]]++] = val_coo[i];
233 |         }
234 |     }
235 | 
236 |     //cout << "Row: " << nrow << ", Column: " << ncolumn << ", Nnz: " << nnz << endl;
237 | 
238 |     delete[] nnz_num;
239 |     delete[] row_coo;
240 |     delete[] col_coo;
241 |     delete[] val_coo;
242 |     delete[] each_row_index;
243 | 
244 | }
245 | 
246 | template <class idType, class valType>
247 | void CSR<idType, valType>::spmv_cpu(valType *x, valType *y)
248 | {
249 |     idType i, j;
250 |     valType ans;
251 |   
252 |     for (i = 0; i < nrow; ++i) {
253 |         ans = 0;
254 |         for (j = 0; j < (rpt[i + 1] - rpt[i]); j++) {
255 |             ans += values[rpt[i] + j] * x[colids[rpt[i] + j]];
256 |         }
257 |         y[i] = ans;
258 |     }
259 | }
260 | 
261 | #endif
262 | 


--------------------------------------------------------------------------------
/nsparse/inc/Plan.hpp:
--------------------------------------------------------------------------------
 1 | #include <nsparse.hpp>
 2 | 
 3 | template <class idType>
 4 | class Plan
 5 | {
 6 | public:
 7 |     Plan(): isPlan(false), seg_size(1), block_size(1), memory_access(INT_MAX), min_msec(sfFLT_MAX)
 8 |     {
 9 |     }
10 |     Plan(idType segment, idType block): isPlan(true)
11 |     {
12 |         seg_size = segment;
13 |         if (seg_size > USHORT_MAX) {
14 |             seg_size = USHORT_MAX;
15 |         }
16 |         block_size = block;
17 |         if (block_size < 1 || block_size > MAX_BLOCK_SIZE) {
18 |             block_size = 1;
19 |         }
20 |     }
21 |     ~Plan()
22 |     {
23 |     }
24 |     void set_plan(idType s_size, idType b_size)
25 |     {
26 |         seg_size = s_size;
27 |         block_size = b_size;
28 |         isPlan = true;
29 |     }
30 | 
31 |     idType thread_grid;
32 |     idType thread_block;
33 |     bool isPlan;
34 |     idType SIGMA;
35 |     idType seg_size;
36 |     idType seg_num;
37 |     idType block_size;
38 |     idType memory_access;
39 |     float min_msec;
40 | };
41 | 
42 | 


--------------------------------------------------------------------------------
/nsparse/inc/Timing.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __Z_TIMING_H__
  2 | #define __Z_TIMING_H__
  3 | #include <stdio.h>
  4 | 
  5 | class Timing {
  6 |     public:
  7 |     bool measure_separate;
  8 |     bool measure_total;
  9 |     double setup;
 10 |     double symbolic_binning;
 11 |     double symbolic;
 12 |     double numeric_binning;
 13 |     double prefix;
 14 |     double allocate;
 15 |     double numeric;
 16 |     double cleanup;
 17 |     double total;
 18 |     Timing();
 19 | 
 20 |     void operator+=(const Timing& b);
 21 | 
 22 |     void operator/=(const double x);
 23 |     void print(const double total_flop);
 24 |     void reg_print(const double total_flop);
 25 |     void binning_print(const double total_flop);
 26 | };
 27 | 
 28 | Timing::Timing(){
 29 |     measure_separate = true;
 30 |     measure_total = true;
 31 |     setup = 0;
 32 |     symbolic_binning = 0;
 33 |     symbolic = 0;
 34 |     numeric_binning = 0;
 35 |     prefix = 0;
 36 |     allocate = 0;
 37 |     numeric = 0;
 38 |     cleanup = 0;
 39 |     total = 0;
 40 | }
 41 | 
 42 | void Timing::operator+=(const Timing& b){
 43 |     setup += b.setup;
 44 |     symbolic_binning += b.symbolic_binning;
 45 |     symbolic += b.symbolic;
 46 |     numeric_binning += b.numeric_binning;
 47 |     prefix += b.prefix;
 48 |     allocate += b.allocate;
 49 |     numeric += b.numeric;
 50 |     cleanup += b.cleanup;
 51 |     total += b.total;
 52 | }
 53 | 
 54 | void Timing::operator/=(const double x){
 55 |     setup /= x;
 56 |     symbolic_binning /= x;
 57 |     symbolic /= x;
 58 |     numeric_binning /= x;
 59 |     prefix /= x;
 60 |     allocate /= x;
 61 |     numeric /= x;
 62 |     cleanup /= x;
 63 |     total /= x;
 64 | }
 65 | 
 66 | void Timing::print(double total_flop){
 67 |     double total_flop_G = total_flop/1000000000;
 68 |     printf("total flop %lf\n", total_flop);
 69 |     if(measure_separate){
 70 |         //printf("time(ms): setup %.3lf symbolic_binning %.3lf symbolic %.3lf numeric_binning %.3lf prefix_allocate %.3lf numeric %.3lf cleanup %.3lf total %.3lf",)
 71 |         printf("time(ms):\n");
 72 |         printf("    setup            %8.3lfms %6.2lf%%\n", 1000*setup, setup/total*100);
 73 |         printf("\e[1;31m    symbolic_binning %8.3lfms %6.2lf%%\n\e[0m", 1000*symbolic_binning, symbolic_binning/total*100);
 74 |         printf("\e[1;31m    symbolic         %8.3lfms %6.2lf%%\n\e[0m", 1000*symbolic, symbolic/total*100);
 75 |         printf("\e[1;31m    numeric_binning  %8.3lfms %6.2lf%%\n\e[0m", 1000*numeric_binning, numeric_binning/total*100);
 76 |         printf("    prefix           %8.3lfms %6.2lf%%\n", 1000*prefix, prefix/total*100);
 77 |         printf("    allocate         %8.3lfms %6.2lf%%\n", 1000*allocate, allocate/total*100);
 78 |         printf("\e[1;31m    numeric          %8.3lfms %6.2lf%%\n\e[0m", 1000*numeric, numeric/total*100);
 79 |         printf("    cleanup          %8.3lfms %6.2lf%%\n", 1000*cleanup, cleanup/total*100);
 80 |         printf("    total            %8.3lfms %6.2lf%%\n", 1000*total, total/total*100);
 81 |         printf("perf(Gflops):\n");
 82 |         printf("    setup            %6.2lf\n", total_flop_G/setup);
 83 |         printf("    symbolic_binning %6.2lf\n", total_flop_G/symbolic_binning);
 84 |         printf("    symbolic         %6.2lf\n", total_flop_G/symbolic);
 85 |         printf("    numeric_binning  %6.2lf\n", total_flop_G/numeric_binning);
 86 |         printf("    prefix           %6.2lf\n", total_flop_G/prefix);
 87 |         printf("    allocate         %6.2lf\n", total_flop_G/allocate);
 88 |         printf("    numeric          %6.2lf\n", total_flop_G/numeric);
 89 |         printf("    cleanup          %6.2lf\n", total_flop_G/cleanup);
 90 |         printf("    total            %6.2lf\n", total_flop_G/total);
 91 |     }
 92 | }
 93 |         
 94 | void Timing::reg_print(double total_flop){
 95 |     double total_flop_G = total_flop/1000000000;
 96 |     printf("%6.2lf\n", total_flop_G/total);
 97 | }
 98 | 
 99 | void Timing::binning_print(double total_flop){
100 |     double total_flop_G = total_flop/1000000000;
101 |     double total_binning_time = symbolic_binning + numeric_binning;
102 |     printf("%.4le %.4lf\n", total_binning_time, 100*total_binning_time/total);
103 | }
104 | 
105 | #endif
106 | 
107 | 


--------------------------------------------------------------------------------
/nsparse/inc/cuda_common.h:
--------------------------------------------------------------------------------
 1 | #ifndef _Z_COMMON_
 2 | #define _Z_COMMON_
 3 | 
 4 | #include <cuda_runtime.h>
 5 | #include <stdio.h>
 6 | #include <exception>
 7 | #include <cusparse.h>
 8 | #include <iostream>
 9 | #include <omp.h>
10 | #include <stdlib.h>
11 | 
12 | #define likely(x) __builtin_expect(x,1)
13 | #define unlikely(x) __builtin_expect(x,0)
14 | 
15 | inline static void checkCUDA(cudaError_t err,
16 | 							   const char *file,
17 | 							   int line)
18 | {
19 | 	if (unlikely(err != cudaSuccess))
20 | 	{
21 | 		printf("%s in %s at line %d\n", cudaGetErrorString(err),
22 | 			   file, line);
23 | 		throw std::exception();
24 | 	}
25 | }
26 | // #ifdef _DEBUG || NDEBUG || DEBUG
27 | #define CHECK_CUDA(err) (checkCUDA(err, __FILE__, __LINE__))
28 | #define CHECK_ERROR(err) (checkCUDA(err, __FILE__, __LINE__))
29 | 
30 | inline void CHECK_CUSPARSE(cusparseStatus_t status, std::string errorMsg="")
31 | 		{
32 | 			if (status != CUSPARSE_STATUS_SUCCESS) {
33 | 				std::cout << "CuSparse error: " << errorMsg << std::endl;
34 | 				throw std::exception();
35 | 			}
36 | 		}
37 | 
38 | #define HP_TIMING_NOW(Var) \
39 |   ({ unsigned int _hi, _lo; \
40 |      asm volatile ("lfence\n\trdtsc" : "=a" (_lo), "=d" (_hi)); \
41 |      (Var) = ((unsigned long long int) _hi << 32) | _lo; })
42 | 
43 | /* precision is 1 clock cycle.
44 |  * execute time is roughly 50 or 140 cycles depends on cpu family */
/*
 * Executes the x86 CPUID instruction.
 *
 * info : receives EAX, EBX, ECX, EDX (in that order) in info[0..3]
 * eax  : leaf to query
 * ecx  : sub-leaf to query (defaults to 0)
 *
 * Both eax and ecx are passed to the instruction as inputs; sub-leaf
 * dependent leaves would otherwise see whatever value ECX happened to hold.
 */
inline void cpuid(int *info, int eax, int ecx = 0){
    int ax, bx, cx, dx;
    __asm__ __volatile__ ("cpuid"
                          : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx)
                          : "a" (eax), "c" (ecx));

    info[0] = ax;
    info[1] = bx;
    info[2] = cx;
    info[3] = dx;
}
54 | 
55 | inline long get_tsc_freq(){
56 |     static long freq = 0;
57 |     if(unlikely((freq == 0))){
58 |         int raw[4];
59 |         cpuid(raw, 0x16); // get cpu freq
60 |         freq = long(raw[0]) * 1000000;
61 |         //printf("static first call %f\n", freq);
62 |     }
63 |     return freq;
64 | }
65 | 
66 | inline double fast_clock_time(){
67 |     long counter;
68 |     HP_TIMING_NOW(counter);
69 |     return double(counter)/get_tsc_freq();
70 | }
71 | 
72 | template <typename T>
73 | inline void D2H(T *dst, T* src, size_t size){
74 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
75 | }
76 | 
77 | template <typename T>
78 | inline void H2D(T *dst, T* src, size_t size){
79 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
80 | }
81 | 
82 | template <typename T>
83 | inline void D2D(T *dst, T* src, size_t size){
84 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
85 | }
86 | 
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/nsparse/inc/nsparse.hpp:
--------------------------------------------------------------------------------
  1 | #include <typeinfo>
  2 | 
  3 | #ifndef NSPARSE_H
  4 | #define NSPARSE_H
  5 | 
  6 | #define div_round_up(a, b) ((a % b == 0)? a / b : a / b + 1)
  7 | 
  8 | /* Hardware Specific Parameters */
  9 | #define warp_BIT 5
 10 | #define warp 32
 11 | #define MAX_LOCAL_THREAD_NUM 1024
 12 | #define MAX_THREAD_BLOCK (MAX_LOCAL_THREAD_NUM / warp)
 13 | 
 14 | /* Number of SpMV Execution for Evaluation or Test */
 15 | //#define TRI_NUM 101
 16 | #define TEST_NUM 2
 17 | #define SpGEMM_TRI_NUM 10
 18 | 
 19 | /* Power-of-two related constants */
 20 | #define sfFLT_MAX 1000000000
 21 | #define SHORT_MAX 32768
 22 | #define SHORT_MAX_BIT 15
 23 | #define USHORT_MAX 65536
 24 | #define USHORT_MAX_BIT 16
 25 | 
 26 | #define SCL_BORDER 16
 27 | #define SCL_BIT ((1 << SCL_BORDER) - 1)
 28 | 
 29 | #define MAX_BLOCK_SIZE 20
 30 | 
 31 | /* Check the answer */
 32 | //#define sfDEBUG
 33 | 
 34 | /* Structure of Formats*/
 35 | /* Initializing vector */
 36 | template <class idType, class valType>
 37 | void init_vector(valType *x, int row)
 38 | {
 39 |     int i;
 40 | 
 41 |     srand48((unsigned)time(NULL));
 42 | 
 43 |     for (i = 0; i < row; i++) {
 44 |         x[i] = drand48();
 45 |     }
 46 | }
 47 | 
 48 | /* Compare the vectors */
 49 | template <class idType, class valType>
 50 | void check_answer(valType *csr_ans, valType *ans_vec, idType nrow)
 51 | {
 52 |     idType i;
 53 |     int total_fail = 10;
 54 |     valType delta, base;
 55 |     valType scale;
 56 |     if (typeid(valType) == typeid(float)) {
 57 |         scale = 1000;
 58 |     }
 59 |     else {
 60 |         scale = 1000 * 1000;
 61 |     }
 62 |   
 63 |     for (i = 0; i < nrow; i++) {
 64 |         delta = ans_vec[i] - csr_ans[i];
 65 |         base = ans_vec[i];
 66 | 
 67 |         if (delta < 0) {
 68 |             delta *= -1;
 69 |         }
 70 |         if (base < 0) {
 71 |             base *= -1;
 72 |         }
 73 |         if (delta * 100 * scale > base) {
 74 |             printf("i=%d, ans=%e, csr=%e, delta=%e\n", i, ans_vec[i], csr_ans[i], delta);
 75 |             total_fail--;
 76 |             if(total_fail == 0)
 77 |                 break;
 78 |         }
 79 |     }
 80 |     if (total_fail != 10){
 81 |         printf("Calculation Result is Incorrect\n");
 82 |     }
 83 |     else {
 84 |         printf("Calculation Result is Correct\n");
 85 |     }
 86 | }
 87 | 
 88 | #endif
 89 | 
 90 | /*
 91 |  * Release MemObjects of Each Format structure
 92 |  */
 93 | /* void release_cpu_amb(sfAMB mat); */
 94 | /* void release_amb(sfAMB mat); */
 95 | 
 96 | /*
 97 |  * Converting matrix to AMB format
 98 |  */
 99 | /* void init_plan(sfPlan *plan); */
100 | /* void set_plan(sfPlan *plan, size_t seg_size, int block_size); */
101 | /* void sf_csr2amb(sfAMB *mat, sfCSR *csr_mat, real *d_x, sfPlan *plan); */
102 | 
103 | /*
104 |  * SpMV Kernel
105 |  */
106 | /* void csr_ans_check(real *val, int *col, int *rpt, real *rhs_vec, real *csr_ans, int N); */
107 | /* void sf_spmv_amb(real *d_y, sfAMB *mat, real *d_x, sfPlan *plan); */
108 | 
109 | 


--------------------------------------------------------------------------------
/nsparse/inc/nsparse_asm.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Inline PTX
  3 |  */
  4 | #ifndef NSPARSE_ASM_H
  5 | #define NSPARSE_ASM_H
  6 | 
  7 | __device__ __inline__ float ld_gbl_val(const float *val)
  8 | {
  9 |     float return_value;
 10 |     asm("ld.global.cv.f32 %0, [%1];" : "=f"(return_value) : "l"(val));
 11 |     return return_value;
 12 | }
 13 | 
 14 | __device__ __inline__ double ld_gbl_val(const double *val)
 15 | {
 16 |     double return_value;
 17 |     asm("ld.global.cv.f64 %0, [%1];" : "=d"(return_value) : "l"(val));
 18 |     return return_value;
 19 | }
 20 | 
 21 | __device__ __inline__ int ld_gbl_col(const int *col)
 22 | {
 23 |     int return_value;
 24 |     asm("ld.global.cv.s32 %0, [%1];" : "=r"(return_value) : "l"(col));
 25 |     return return_value;
 26 | }
 27 | 
 28 | __device__ __inline__ short ld_gbl_col(const short *col)
 29 | {
 30 |     short return_value;
 31 |     asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
 32 |     return return_value;
 33 | }
 34 | 
 35 | __device__ __inline__ unsigned short ld_gbl_col(const unsigned short *col)
 36 | {
 37 |     unsigned short return_value;
 38 |     asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
 39 |     return return_value;
 40 | }
 41 | 
 42 | __device__ __inline__ void st_gbl_val(const float *ptr, float val)
 43 | {
 44 |     asm("st.global.cs.f32 [%0], %1;" :: "l"(ptr) , "f"(val));
 45 | 
 46 | }
 47 | 
 48 | __device__ __inline__ void st_gbl_val(const double *ptr, double val)
 49 | {
 50 |     asm("st.global.cs.f64 [%0], %1;" :: "l"(ptr) , "d"(val));
 51 | }
 52 | 
 53 | /*
 54 |  * Multiply and Add
 55 |  */
 56 | template <class T>
 57 | class Add
 58 | {
 59 | public:
 60 |     __device__ __inline__ T operator()(T a, T b)
 61 |     {
 62 |         return a + b;
 63 |     }
 64 | };
 65 | 
 66 | template <class T>
 67 | class Multiply
 68 | {
 69 | public:
 70 |     __device__ __inline__ T operator()(T a, T b)
 71 |     {
 72 |         return a * b;
 73 |     }
 74 | };
 75 | 
 76 | template <class T>
 77 | class AtomicAdd
 78 | {
 79 | public:
 80 |     __device__ T operator()(T* a, T v);
 81 | };
 82 | 
 83 | template <>
 84 | __device__ __inline__ float AtomicAdd<float>::operator()(float *a, float v)
 85 | {
 86 |     return atomicAdd(a, v);
 87 | }
 88 | 
 89 | template <>
 90 | __device__ __inline__ double AtomicAdd<double>::operator()(double *a, double v)
 91 | {
 92 | #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
 93 |     return atomicAdd(a, v);
 94 | #else
 95 |     unsigned long long int *a_ull = (unsigned long long int *)(a);
 96 |     unsigned long long int old = *a_ull;
 97 |     unsigned long long int assumed;
 98 |     do {
 99 |         assumed = old;
100 |         old = atomicCAS(a_ull, assumed, __double_as_longlong(v + __longlong_as_double(assumed)));
101 |     } while (assumed != old);
102 |     return old;
103 | #endif
104 | }
105 | 
106 | #endif
107 | 


--------------------------------------------------------------------------------
/nsparse/nsparse.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <sys/time.h>
  4 | 
  5 | #include <math.h>
  6 | 
  7 | #include <cuda.h>
  8 | #include <helper_cuda.h>
  9 | #include <cusparse_v2.h>
 10 | 
 11 | #include <nsparse.hpp>
 12 | #include <CSR.hpp>
 13 | #include <SpGEMM.hpp>
 14 | #include <HashSpGEMM_volta.hpp>
 15 | #include "Timing.hpp"
 16 | 
 17 | typedef int IT;
 18 | //#ifdef FLOAT
 19 | //typedef float VT;
 20 | //#else
 21 | //typedef double VT;
 22 | //#endif
 23 | typedef double VT;
 24 | 
 25 | 
 26 | template <bool sort, class idType, class valType>
 27 | void SpGEMM_Hash_Detail(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c, Timing& timing)
 28 | {
 29 |     double t0, t1;
 30 |     t0 = t1  = fast_clock_time();
 31 | 
 32 |     BIN<idType, BIN_NUM>* bin = new BIN<idType, BIN_NUM>(a.nrow);
 33 | 
 34 |     c.nrow = a.nrow;
 35 |     c.ncolumn = b.ncolumn;
 36 |     c.device_malloc = true;
 37 |     cudaMalloc((void **)&(c.d_rpt), sizeof(idType) * (c.nrow + 1));
 38 |     timing.setup = fast_clock_time() - t0;
 39 | 
 40 |     t0 = fast_clock_time();
 41 |     bin->set_max_bin(a.d_rpt, a.d_colids, b.d_rpt, a.nrow, TS_S_P, TS_S_T);
 42 |     CHECK_ERROR(cudaDeviceSynchronize());
 43 |     timing.symbolic_binning = fast_clock_time() - t0;
 44 | 
 45 |     t0 = fast_clock_time();
 46 |     hash_symbolic(a, b, c, *bin);
 47 |     CHECK_ERROR(cudaDeviceSynchronize());
 48 |     timing.symbolic = fast_clock_time() - t0;
 49 |     
 50 |     t0 = fast_clock_time();
 51 |     thrust::exclusive_scan(thrust::device, bin->d_count, bin->d_count + (a.nrow + 1), c.d_rpt, 0);
 52 |     cudaMemcpy(&(c.nnz), c.d_rpt + c.nrow, sizeof(idType), cudaMemcpyDeviceToHost);
 53 |     timing.prefix = fast_clock_time() - t0;
 54 |     
 55 |     t0 = fast_clock_time();
 56 |     cudaMalloc((void **)&(c.d_colids), sizeof(idType) * (c.nnz));
 57 |     cudaMalloc((void **)&(c.d_values), sizeof(valType) * (c.nnz));
 58 |     timing.allocate = fast_clock_time() - t0;
 59 | 
 60 |     t0 = fast_clock_time();
 61 |     bin->set_min_bin(a.nrow, TS_N_P, TS_N_T);
 62 |     CHECK_ERROR(cudaDeviceSynchronize());
 63 |     timing.numeric_binning = fast_clock_time() - t0;
 64 | 
 65 |     t0 = fast_clock_time();
 66 |     hash_numeric<idType, valType, sort>(a, b, c, *bin);
 67 |     CHECK_ERROR(cudaDeviceSynchronize());
 68 |     timing.numeric = fast_clock_time() - t0;
 69 | 
 70 |     t0 = fast_clock_time();
 71 |     delete bin;
 72 |     timing.cleanup = fast_clock_time() - t0;
 73 |     timing.total = fast_clock_time() - t1;
 74 | 
 75 | }
 76 | 
 77 | 
 78 | template <class idType, class valType>
 79 | void run_spgemm(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c)
 80 | {
 81 | 
 82 |     /* Memcpy A and B from Host to Device */
 83 |     a.memcpyHtD();
 84 |     b.memcpyHtD();
 85 |   
 86 |     /* Count flop of SpGEMM computation */
 87 |     long long int flop_count;
 88 |     get_spgemm_flop(a, b, flop_count);
 89 | 
 90 |     /* Execution of SpGEMM on Device */
 91 |     Timing warmup_timing, bench_timing, timing;
 92 |     
 93 |     SpGEMM_Hash_Detail<true, idType, valType>(a, b, c, warmup_timing);
 94 |     c.release_csr();
 95 | 
 96 |     for (int i = 0; i < SpGEMM_TRI_NUM; i++) {
 97 |         SpGEMM_Hash_Detail<true, idType, valType>(a, b, c, bench_timing);
 98 |         if (i < SpGEMM_TRI_NUM - 1) {
 99 |             c.release_csr();
100 |         }
101 |         timing += bench_timing;
102 |     }
103 |     timing /= SpGEMM_TRI_NUM;
104 |     timing.print(flop_count);
105 | 
106 | 
107 |     c.memcpyDtH();
108 |     c.release_csr();
109 | 
110 | #ifdef sfDEBUG
111 |     CSR<IT, VT> cusparse_c;
112 |     SpGEMM_cuSPARSE(a, b, cusparse_c);
113 |     cusparse_c.memcpyDtH();
114 |     if (c == cusparse_c) {
115 |         //cout << "HashSpGEMM is correctly executed" << endl;
116 |         cout << "pass" << endl;
117 |     }
118 |     else{
119 |         cout << "fail" << endl;
120 |     }
121 |     cout << "Nnz of A: " << a.nnz << endl; 
122 |     cout << "Number of intermediate products: " << flop_count / 2 << endl; 
123 |     cout << "Nnz of C: " << c.nnz << endl; 
124 |     cusparse_c.release_cpu_csr();
125 | #endif
126 | 
127 |     a.release_csr();
128 |     b.release_csr();
129 | 
130 | }
131 | 
/*
 * Maps a matrix name to the path of its .mtx file:
 *   name contains "ER"   -> ../matrix/ER/<name>.mtx
 *   name contains "G500" -> ../matrix/G500/<name>.mtx
 *   otherwise            -> ../matrix/suite_sparse/<name>/<name>.mtx
 * "ER" is tested first, as in the original inline logic.
 */
static std::string matrix_file_for(const std::string& name)
{
    if (name.find("ER") != std::string::npos) {
        return "../matrix/ER/" + name + ".mtx";
    }
    if (name.find("G500") != std::string::npos) {
        return "../matrix/G500/" + name + ".mtx";
    }
    return "../matrix/suite_sparse/" + name + "/" + name + ".mtx";
}

/*
 * Main function.
 *
 * Usage: <program> [matA [matB]]
 *   no argument   : A = B = "can_24"
 *   one argument  : A = B = argv[1]
 *   two or more   : A = argv[1], B = argv[2]
 * Loads both matrices from their .mtx files, prints the name of A and runs
 * the SpGEMM benchmark.
 */
int main(int argc, char *argv[])
{
    CSR<IT, VT> a, b, c;

    std::string mat1 = "can_24";
    std::string mat2 = "can_24";
    if(argc == 2){
        mat1 = argv[1];
        mat2 = argv[1];
    }
    if(argc >= 3){
        mat1 = argv[1];
        mat2 = argv[2];
    }
    std::string mat1_file = matrix_file_for(mat1);
    std::string mat2_file = matrix_file_for(mat2);

    /* Read both operands from their Matrix Market files */
    a.init_data_from_mtx(mat1_file);
    b.init_data_from_mtx(mat2_file);

    /* Execution of SpGEMM on GPU */
    printf("%s ", mat1.c_str());
    run_spgemm(a, b, c);

    a.release_cpu_csr();
    b.release_cpu_csr();
    c.release_cpu_csr();

    return 0;

}
189 | 
190 | 


--------------------------------------------------------------------------------
/nsparse/readme.md:
--------------------------------------------------------------------------------
 1 | # Get started
 2 | 1 Compile source code
 3 | 
 4 | ``` $> make all```
 5 | 
 6 | 2 Execute nsparse
 7 | 
 8 | 2.1 Profile nsparse
 9 | 
10 | ``` $> ./bin/nsparse_d webbase-1M ```
11 | 
12 | 2.2 Overall performance of nsparse
13 | 
14 | ```$> ./bin/reg_nsparse_d webbase-1M ```
15 | 


--------------------------------------------------------------------------------
/nsparse/reg_nsparse.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <sys/time.h>
  4 | 
  5 | #include <math.h>
  6 | 
  7 | #include <cuda.h>
  8 | #include <helper_cuda.h>
  9 | #include <cusparse_v2.h>
 10 | 
 11 | #include <nsparse.hpp>
 12 | #include <CSR.hpp>
 13 | #include <SpGEMM.hpp>
 14 | #include <HashSpGEMM_volta.hpp>
 15 | #include "Timing.hpp"
 16 | 
 17 | typedef int IT;
 18 | //#ifdef FLOAT
 19 | //typedef float VT;
 20 | //#else
 21 | //typedef double VT;
 22 | //#endif
 23 | typedef double VT;
 24 | 
 25 | 
 26 | template <bool sort, class idType, class valType>
 27 | void SpGEMM_Hash_Detail(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c, Timing& timing)
 28 | {
 29 |     double t0, t1;
 30 |     t0 = t1  = fast_clock_time();
 31 | 
 32 |     BIN<idType, BIN_NUM>* bin = new BIN<idType, BIN_NUM>(a.nrow);
 33 | 
 34 |     c.nrow = a.nrow;
 35 |     c.ncolumn = b.ncolumn;
 36 |     c.device_malloc = true;
 37 |     cudaMalloc((void **)&(c.d_rpt), sizeof(idType) * (c.nrow + 1));
 38 |     timing.setup = fast_clock_time() - t0;
 39 | 
 40 |     t0 = fast_clock_time();
 41 |     bin->set_max_bin(a.d_rpt, a.d_colids, b.d_rpt, a.nrow, TS_S_P, TS_S_T);
 42 |     CHECK_ERROR(cudaDeviceSynchronize());
 43 |     timing.symbolic_binning = fast_clock_time() - t0;
 44 | 
 45 |     t0 = fast_clock_time();
 46 |     hash_symbolic(a, b, c, *bin);
 47 |     CHECK_ERROR(cudaDeviceSynchronize());
 48 |     timing.symbolic = fast_clock_time() - t0;
 49 |     
 50 |     t0 = fast_clock_time();
 51 |     thrust::exclusive_scan(thrust::device, bin->d_count, bin->d_count + (a.nrow + 1), c.d_rpt, 0);
 52 |     cudaMemcpy(&(c.nnz), c.d_rpt + c.nrow, sizeof(idType), cudaMemcpyDeviceToHost);
 53 |     timing.prefix = fast_clock_time() - t0;
 54 |     
 55 |     t0 = fast_clock_time();
 56 |     cudaMalloc((void **)&(c.d_colids), sizeof(idType) * (c.nnz));
 57 |     cudaMalloc((void **)&(c.d_values), sizeof(valType) * (c.nnz));
 58 |     timing.allocate = fast_clock_time() - t0;
 59 | 
 60 |     t0 = fast_clock_time();
 61 |     bin->set_min_bin(a.nrow, TS_N_P, TS_N_T);
 62 |     CHECK_ERROR(cudaDeviceSynchronize());
 63 |     timing.numeric_binning = fast_clock_time() - t0;
 64 | 
 65 |     t0 = fast_clock_time();
 66 |     hash_numeric<idType, valType, sort>(a, b, c, *bin);
 67 |     CHECK_ERROR(cudaDeviceSynchronize());
 68 |     timing.numeric = fast_clock_time() - t0;
 69 | 
 70 |     t0 = fast_clock_time();
 71 |     delete bin;
 72 |     timing.cleanup = fast_clock_time() - t0;
 73 |     timing.total = fast_clock_time() - t1;
 74 | 
 75 | }
 76 | 
 77 | 
 78 | template <class idType, class valType>
 79 | void run_spgemm(CSR<idType, valType>& a, CSR<idType, valType>& b, CSR<idType, valType> &c)
 80 | {
 81 | 
 82 |     /* Memcpy A and B from Host to Device */
 83 |     a.memcpyHtD();
 84 |     b.memcpyHtD();
 85 |   
 86 |     /* Count flop of SpGEMM computation */
 87 |     long long int flop_count;
 88 |     get_spgemm_flop(a, b, flop_count);
 89 | 
 90 |     /* Execution of SpGEMM on Device */
 91 |     Timing warmup_timing, bench_timing, timing;
 92 |     
 93 |     SpGEMM_Hash_Detail<true, idType, valType>(a, b, c, warmup_timing);
 94 |     c.release_csr();
 95 | 
 96 |     for (int i = 0; i < SpGEMM_TRI_NUM; i++) {
 97 |         SpGEMM_Hash_Detail<true, idType, valType>(a, b, c, bench_timing);
 98 |         if (i < SpGEMM_TRI_NUM - 1) {
 99 |             c.release_csr();
100 |         }
101 |         timing += bench_timing;
102 |     }
103 |     timing /= SpGEMM_TRI_NUM;
104 |     //timing.print(flop_count);
105 |     timing.reg_print(flop_count);
106 | 
107 | 
108 |     c.memcpyDtH();
109 |     c.release_csr();
110 | 
111 | #ifdef sfDEBUG
112 |     CSR<IT, VT> cusparse_c;
113 |     SpGEMM_cuSPARSE(a, b, cusparse_c);
114 |     cusparse_c.memcpyDtH();
115 |     if (c == cusparse_c) {
116 |         //cout << "HashSpGEMM is correctly executed" << endl;
117 |         cout << "pass" << endl;
118 |     }
119 |     else{
120 |         cout << "fail" << endl;
121 |     }
122 |     cout << "Nnz of A: " << a.nnz << endl; 
123 |     cout << "Number of intermediate products: " << flop_count / 2 << endl; 
124 |     cout << "Nnz of C: " << c.nnz << endl; 
125 |     cusparse_c.release_cpu_csr();
126 | #endif
127 | 
128 |     a.release_csr();
129 |     b.release_csr();
130 | 
131 | }
132 | 
/*
 * Maps a matrix name to the path of its .mtx file:
 *   name contains "ER"   -> ../matrix/ER/<name>.mtx
 *   name contains "G500" -> ../matrix/G500/<name>.mtx
 *   otherwise            -> ../matrix/suite_sparse/<name>/<name>.mtx
 * "ER" is tested first, as in the original inline logic.
 */
static std::string matrix_file_for(const std::string& name)
{
    if (name.find("ER") != std::string::npos) {
        return "../matrix/ER/" + name + ".mtx";
    }
    if (name.find("G500") != std::string::npos) {
        return "../matrix/G500/" + name + ".mtx";
    }
    return "../matrix/suite_sparse/" + name + "/" + name + ".mtx";
}

/*
 * Main function.
 *
 * Usage: <program> [matA [matB]]
 *   no argument   : A = B = "can_24"
 *   one argument  : A = B = argv[1]
 *   two or more   : A = argv[1], B = argv[2]
 * Loads both matrices from their .mtx files, prints the name of A and runs
 * the SpGEMM regression measurement.
 */
int main(int argc, char *argv[])
{
    CSR<IT, VT> a, b, c;

    std::string mat1 = "can_24";
    std::string mat2 = "can_24";
    if(argc == 2){
        mat1 = argv[1];
        mat2 = argv[1];
    }
    if(argc >= 3){
        mat1 = argv[1];
        mat2 = argv[2];
    }
    std::string mat1_file = matrix_file_for(mat1);
    std::string mat2_file = matrix_file_for(mat2);

    /* Read both operands from their Matrix Market files */
    a.init_data_from_mtx(mat1_file);
    b.init_data_from_mtx(mat2_file);

    /* Execution of SpGEMM on GPU */
    printf("%s ", mat1.c_str());
    run_spgemm(a, b, c);

    a.release_cpu_csr();
    b.release_cpu_csr();
    c.release_cpu_csr();

    return 0;

}
190 | 
191 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | The Source Code of OpSparse
 2 | ========
 3 | 
 4 | This repository contains the source code of OpSparse, and part of the source code from [cuSPARSE](https://docs.nvidia.com/cuda/cusparse/index.html), [nsparse](https://github.com/EBD-CREST/nsparse.git), and [spECK](https://github.com/GPUPeople/spECK.git).
 5 | ## Tested environment
 6 | CUDA 11.2, NVIDIA Tesla V100 GPU, Ubuntu 18.04 LTS
 7 | 
 8 | ## Get started
 9 | 1 Execute ```$> bash download_matrix.sh``` in the current directory to download the matrix webbase-1M into matrix/suite_sparse directory
10 | 
11 | 2 For detailed execution instructions, refer to the readme.md in the OpSparse, nsparse, and spECK sub-directories
12 | 
13 | ## Bibtex
14 | ```
15 | @ARTICLE{9851653,
16 |   author={Du, Zhaoyang and Guan, Yijin and Guan, Tianchan and Niu, Dimin and Huang, Linyong and Zheng, Hongzhong and Xie, Yuan},
17 |   journal={IEEE Access}, 
18 |   title={OpSparse: A Highly Optimized Framework for Sparse General Matrix Multiplication on GPUs}, 
19 |   year={2022},
20 |   volume={10},
21 |   number={},
22 |   pages={85960-85974},
23 |   doi={10.1109/ACCESS.2022.3196940}}
24 |   ```
25 | 


--------------------------------------------------------------------------------
/spECK/Makefile:
--------------------------------------------------------------------------------
 1 | CXX = g++
 2 | NVCC = nvcc
 3 | 
 4 | 
 5 | GENCODE = -arch=compute_70 -code=sm_70
 6 | 
 7 | 
 8 | #CUDAFLAGS = $(GENCODE)  -g -lineinfo 
 9 | #CUDAFLAGS = $(GENCODE)  -g -G
10 | CUDAFLAGS =  $(GENCODE)  -O3
11 | 
12 | CUDAFLAGS += -Xcompiler -fopenmp 
13 | # cannot solve shared race problem, cannot compile with -G
14 | 
15 | # for Device Code
16 | CUDA_PATH = /usr/local/cuda
17 | #LDFLAGS += -L${CUDA_PATH}/lib64
18 | #LDFLAGS += -lm -lcudart -lcusparse $(CUDAFLAGS)
19 | LDFLAGS += -lcusparse $(CUDAFLAGS)
20 | INCLUDE = -I./include -I./CUDATools -I./GPU 
21 | INCLUDE += -I${CUDA_PATH}/include
22 | INCLUDE += -I${CUDA_PATH}/samples/common/inc
23 | 
24 | BIN = ./bin
25 | SRC = ./source
26 | OBJ = ./obj
27 | INC = ./include
28 | 
29 | OBJ_LIB = $(OBJ)/CSR.o $(OBJ)/Config.o $(OBJ)/COO.o $(OBJ)/DataLoader.o $(OBJ)/Executor.o $(OBJ)/RunConfig.o $(OBJ)/dCSR.o  $(OBJ)/cuSparseMultiply.o
30 | OBJ_LIB2 = $(OBJ)/GPU/Compare.o $(OBJ)/GPU/Transpose.o $(OBJ)/GPU/memory.o $(OBJ)/GPU/Multiply.o 
31 | #COMMON_DEP = cuda_common.h 
32 | 
33 | $(OBJ)/%.o : $(SRC)/%.cu $(INC)/%.h
34 | 	mkdir -p $(dir $@)
35 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
36 | 
37 | $(OBJ)/%.o : $(SRC)/%.cu 
38 | 	mkdir -p $(dir $@)
39 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
40 | 
41 | $(OBJ)/%.o : $(SRC)/%.cpp
42 | 	mkdir -p $(dir $@)
43 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
44 | 
45 | $(OBJ)/%.o : $(SRC)/%.cpp $(INC)/%.h
46 | 	mkdir -p $(dir $@)
47 | 	$(NVCC) -c $(CUDAFLAGS) $(INCLUDE) -o $@ $<
48 | 
49 | speck : $(OBJ_LIB2) $(OBJ_LIB) $(OBJ)/runspECK.o
50 | 	$(NVCC)  -o $@ $^ $(LDFLAGS) $(INCLUDE)
51 | 
52 | reg_speck : $(OBJ_LIB2) $(OBJ_LIB) $(OBJ)/reg_runspECK.o
53 | 	$(NVCC)  -o $@ $^ $(LDFLAGS) $(INCLUDE)
54 | 
# 'all' and 'clean' do not name files; declaring them phony keeps a stray
# file called 'all' or 'clean' from suppressing the rule.
.PHONY : all clean

all : speck reg_speck

# The link rules above write 'speck' and 'reg_speck' into the current
# directory (-o $@), so they are removed here as well.
clean :
	rm -rf $(BIN)/*
	rm -rf $(OBJ)/*
	rm -f speck reg_speck
60 | 


--------------------------------------------------------------------------------
/spECK/config.ini:
--------------------------------------------------------------------------------
 1 | ; if the complete time should be measured. has only little impact on performance
 2 | TrackCompleteTimes=true
 3 | 
 4 | ; tracks and prints timings for all stages of spECK and all iterations. Has significant impact on performance
 5 | ; TrackIndividualTimes=false
 6 | TrackIndividualTimes=true
 7 | 
 8 | ; compares C row lengths and column indices with CUSPARSE and prints an error if they do not match
 9 | ; (we only compare indices, because values are not always the same, since spECK is not bit stable)
10 | ; no impact on measured performance, but can make overall execution much slower, because CUSPARSE can be very slow for some matrices
11 | CompareResult=false
12 | 
13 | ; how many iterations should be run to raise GPU clock before measuring the time
14 | ; note that first iteration will be significantly slower, because of result matrix memory allocation
15 | IterationsWarmUp=1
16 | 
17 | ; how many iterations are accumulated to calculate the mean execution time
18 | IterationsExecution=10
19 | 
20 | ; enter a path to an input matrix here -> this overrides the matrix selected in the command line
21 | ; InputFile=<path-to-matrix>
22 | 


--------------------------------------------------------------------------------
/spECK/include/COO.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "Vector.h"
 4 | 
 5 | #include <memory>
 6 | 
 7 | 
 8 | template<typename T>
 9 | struct COO // matrix container holding three arrays (values, row ids, column ids), each owned through a unique_ptr
10 | {
11 | 	size_t rows, cols, nnz; // dimensions and entry count (presumably number of non-zeros); zeroed by the default constructor
12 | 
13 | 	std::unique_ptr<T[]> data; // entry values
14 | 	std::unique_ptr<unsigned int[]> row_ids; // row index per entry - presumably parallel to data; confirm in the implementation
15 | 	std::unique_ptr<unsigned int[]> col_ids; // column index per entry - presumably parallel to data; confirm in the implementation
16 | 
17 | 	COO() : rows(0), cols(0), nnz(0) { } // empty matrix; the three unique_ptr members start out null
18 | 	void alloc(size_t rows, size_t cols, size_t nnz); // defined elsewhere; presumably sizes the arrays for nnz entries - confirm
19 | };
20 | 
21 | template<typename T>
22 | COO<T> loadMTX(const char* file);
23 | template<typename T>
24 | COO<T> loadCOO(const char* file);
25 | template<typename T>
26 | void storeCOO(const COO<T>& mat, const char* file);
27 | 
28 | template<typename T>
29 | void spmv(DenseVector<T>& res, const COO<T>& m, const DenseVector<T>& v, bool transpose = false);
30 | 


--------------------------------------------------------------------------------
/spECK/include/CSR.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <memory>
  4 | #include <algorithm>
  5 | #include <math.h>
  6 | #include <cstring>
  7 | 
  8 | template<typename T>
  9 | struct COO;
 10 | 
 11 | template<typename T>
 12 | struct DenseVector;
 13 | 
 14 | template<typename T>
 15 | struct CSR // matrix container with raw, manually managed arrays (data, row_offsets, col_ids)
 16 | {
 17 | 	struct Statistics // result bundle of rowStatistics(): per-row entry-count statistics
 18 | 	{
 19 | 		double mean;
 20 | 		double std_dev;
 21 | 		size_t max;
 22 | 		size_t min;
 23 | 	};
 24 | 
 25 | 	void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) // writes all four outputs; reads row_offsets[0..rows]
 26 | 	{
 27 | 		// single-pass running mean / variance (Welford's algorithm) over the row lengths
 28 | 		size_t count = 0;
 29 | 		mean = 0;
 30 | 		double M2 = 0;
 31 | 		max = 0;
 32 | 		min = cols; // start value; stays at cols when the matrix has no rows
 33 | 		for (size_t i = 0; i < rows; ++i)
 34 | 		{
 35 | 			size_t r_length = row_offsets[i + 1] - row_offsets[i]; // length of row i from consecutive offsets
 36 | 			min = std::min(min, r_length);
 37 | 			max = std::max(max, r_length);
 38 | 			++count;
 39 | 			double newValue = static_cast<double>(r_length);
 40 | 			double delta = newValue - mean;
 41 | 			mean = mean + delta / count;
 42 | 			double delta2 = newValue - mean;
 43 | 			M2 = M2 + delta * delta2;
 44 | 		}
 45 | 		if (count < 2)
 46 | 			std_dev = 0; // sample deviation is undefined for fewer than two rows; report 0
 47 | 		else
 48 | 			std_dev = sqrt(M2 / (count - 1)); // sample (n - 1) standard deviation
 49 | 	}
 50 | 
 51 | 	Statistics rowStatistics() // convenience wrapper returning computeStatistics() results by value
 52 | 	{
 53 | 		Statistics stats;
 54 | 		computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min);
 55 | 		return stats;
 56 | 	}
 57 | 
 58 | 	size_t rows, cols, nnz;
 59 | 
 60 | 	//std::unique_ptr<T[]> data;
 61 | 	//std::unique_ptr<unsigned int[]> row_offsets;
 62 | 	//std::unique_ptr<unsigned int[]> col_ids;
 63 |     T *data; // raw pointer: no destructor is declared in this struct, so lifetime is managed by the code using it
 64 |     int *row_offsets; // read at indices 0..rows by computeStatistics
 65 |     int *col_ids;
 66 | 
 67 | 	CSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } // null the raw pointers so an empty matrix never carries indeterminate addresses
 68 | 	void alloc(size_t rows, size_t cols, size_t nnz);
 69 | 
 70 |     CSR(const CSR<T>& A, int rows, int cols, int row_start, int row_end); // defined elsewhere; presumably builds a row slice of A - confirm
 71 |     CSR<T>& operator=(const CSR<T>& A); // defined elsewhere
 72 | 	// CSR<T>& operator=(CSR<T> other)
 73 | 	// {
 74 | 	// 	this->rows = other.rows;
 75 | 	// 	this->cols = other.cols;
 76 | 	// 	this->nnz = other.nnz;
 77 | 	// 	this->data = std::move(other.data);
 78 | 	// 	this->row_offsets = std::move(other.row_offsets);
 79 | 	// 	this->col_ids = std::move(other.col_ids);
 80 | 	// 	return *this;
 81 | 	// }
 82 | 
 83 | 	// CSR(const CSR<T>& other)
 84 | 	// {
 85 | 	// 	this->rows = other.rows;
 86 | 	// 	this->cols = other.cols;
 87 | 	// 	this->nnz = other.nnz;
 88 | 	// 	this->data = std::make_unique<T[]>(other.nnz);
 89 | 	// 	memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz);
 90 | 	// 	this->col_ids = std::make_unique<unsigned int[]>(other.nnz);
 91 | 	// 	memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz);
 92 | 	// 	this->row_offsets = std::make_unique<unsigned int[]>(other.rows + 1);
 93 | 	// 	memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1));
 94 | 	// }
 95 | 
 96 | };
 97 | 
 98 | 
 99 | template<typename T>
100 | CSR<T> loadCSR(const char* file);
101 | template<typename T>
102 | void storeCSR(const CSR<T>& mat, const char* file);
103 | 
104 | template<typename T>
105 | void spmv(DenseVector<T>& res, const CSR<T>& m, const DenseVector<T>& v, bool transpose = false);
106 | 
107 | template<typename T>
108 | void convert(CSR<T>& res, const COO<T>& coo);
109 | 


--------------------------------------------------------------------------------
/spECK/include/CUDATools/event.h:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #ifndef INCLUDED_CUDA_EVENT
 5 | #define INCLUDED_CUDA_EVENT
 6 | 
 7 | #pragma once
 8 | 
 9 | #include <cuda.h>
10 | 
11 | #include <CUDATools/unique_handle.h>
12 | 
13 | 
14 | namespace CU
15 | {
16 | 	struct EventDestroyDeleter // deleter functor used by unique_event to dispose of a CUevent
17 | 	{
18 | 		void operator ()(CUevent event) const
19 | 		{
20 | 			cuEventDestroy(event); // return code is deliberately not inspected here
21 | 		}
22 | 	};
23 | 	
24 | 	using unique_event = unique_handle<CUevent, nullptr, EventDestroyDeleter>; // owning event handle; nullptr is the "no event" value
25 | 	
26 | 	unique_event createEvent(unsigned int flags = CU_EVENT_DEFAULT); // defined elsewhere; flags default to CU_EVENT_DEFAULT
27 | }
28 | 
29 | #endif  // INCLUDED_CUDA_EVENT
30 | 


--------------------------------------------------------------------------------
/spECK/include/CUDATools/memory.h:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #ifndef INCLUDED_CUDA_MEMORY
 5 | #define INCLUDED_CUDA_MEMORY
 6 | 
 7 | #pragma once
 8 | 
 9 | #include <cstddef>
10 | 
11 | #include <cuda_runtime.h>
12 | 
13 | #include <CUDATools/unique_handle.h>
14 | 
15 | 
16 | namespace CU
17 | {
18 | 	struct MemFreeDeleter // deleter functor that releases a device allocation through cudaFree
19 | 	{
20 | 		void operator ()(CUdeviceptr ptr) const
21 | 		{
22 | 			cudaFree(reinterpret_cast<void*>(ptr)); // handle is cast to the void* cudaFree expects; return code not inspected
23 | 		}
24 | 	};
25 | 	
26 | 	using unique_ptr = unique_handle<CUdeviceptr, 0ULL, MemFreeDeleter>; // owning device pointer; 0 is the "no allocation" value
27 | 	
28 | 	
29 | 	struct pitched_memory // move-only pair of an owned allocation and its row pitch
30 | 	{
31 | 		pitched_memory(const pitched_memory&) = delete;
32 | 		pitched_memory& operator =(const pitched_memory&) = delete;
33 | 		
34 | 		unique_ptr memory;
35 | 		std::size_t pitch;
36 | 		
37 | 		pitched_memory() : pitch(0) {} // empty object: null memory handle and a well-defined zero pitch
38 | 		
39 | 		pitched_memory(unique_ptr memory, std::size_t pitch)
40 | 			: memory(std::move(memory)),
41 | 			  pitch(pitch)
42 | 		{
43 | 		}
44 | 		
45 | 		pitched_memory(pitched_memory&& m)
46 | 			: memory(std::move(m.memory)),
47 | 			  pitch(m.pitch)
48 | 		{
49 | 		}
50 | 		
51 | 		pitched_memory& operator =(pitched_memory&& m)
52 | 		{
53 | 			using std::swap;
54 | 			swap(memory, m.memory); // m takes over the previous allocation and frees it when destroyed
55 | 			swap(pitch, m.pitch); // keep every pitch paired with the allocation it describes
56 | 			return *this;
57 | 		}
58 | 	};
59 | 	
60 | 	
61 | 	unique_ptr allocMemory(std::size_t size); // defined elsewhere
62 | 	unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size); // defined elsewhere; pitch is an output parameter
63 | 	pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size); // defined elsewhere; returns allocation and pitch together
64 | }
65 | 
66 | #endif  // INCLUDED_CUDA_MEMORY
67 | 


--------------------------------------------------------------------------------
/spECK/include/CUDATools/memory_space.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace HiSparse
 4 | {
 5 | 	enum class MemorySpace // scoped enum with two values; presumably tags where a buffer lives - confirm at use sites
 6 | 	{
 7 | 		host, // presumably CPU-side memory
 8 | 		device // presumably GPU memory
 9 | 	};
10 | }


--------------------------------------------------------------------------------
/spECK/include/CUDATools/stream.h:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #ifndef INCLUDED_CUDA_STREAM
 5 | #define INCLUDED_CUDA_STREAM
 6 | 
 7 | #pragma once
 8 | 
 9 | #include <cuda.h>
10 | 
11 | #include <CUDATools/unique_handle.h>
12 | 
13 | 
14 | namespace CU
15 | {
16 | 	struct StreamDestroyDeleter // deleter functor used by unique_stream to dispose of a CUstream
17 | 	{
18 | 		void operator ()(CUstream stream) const
19 | 		{
20 | 			cuStreamDestroy(stream); // return code is deliberately not inspected here
21 | 		}
22 | 	};
23 | 	
24 | 	using unique_stream = unique_handle<CUstream, nullptr, StreamDestroyDeleter>; // owning stream handle; nullptr is the "no stream" value
25 | 	
26 | 	unique_stream createStream(unsigned int flags = CU_STREAM_DEFAULT); // defined elsewhere; flags default to CU_STREAM_DEFAULT
27 | }
28 | 
29 | #endif  // INCLUDED_CUDA_STREAM
30 | 


--------------------------------------------------------------------------------
/spECK/include/CUDATools/unique_handle.h:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | #ifndef INCLUDED_CUDA_UNIQUE_HANDLE
  5 | #define INCLUDED_CUDA_UNIQUE_HANDLE
  6 | 
  7 | #pragma once
  8 | 
  9 | #include <utility>
 10 | 
 11 | 
 12 | namespace CU
 13 | {
 14 | 	template <typename T, T NULL_VALUE, typename Deleter>
 15 | 	class unique_handle : Deleter // move-only owner of a handle of type T; NULL_VALUE means "owns nothing"; Deleter is a private base
 16 | 	{
 17 | 		T h;
 18 | 		
 19 | 		void free(T handle) noexcept // invokes the deleter unless handle is the null value
 20 | 		{
 21 | 			if (handle != NULL_VALUE)
 22 | 				Deleter::operator ()(handle);
 23 | 		}
 24 | 		
 25 | 	public:
 26 | 		unique_handle(const unique_handle&) = delete;
 27 | 		unique_handle& operator =(const unique_handle&) = delete;
 28 | 		
 29 | 		using handle_type = T;
 30 | 		using deleter_type = Deleter;
 31 | 		
 32 | 		static constexpr T null_value = NULL_VALUE;
 33 | 		
 34 | 		explicit unique_handle(T handle = NULL_VALUE) noexcept
 35 | 			: h(handle)
 36 | 		{
 37 | 		}
 38 | 
 39 | 		void consume(T handle) noexcept { h = handle; } // NOTE(review): overwrites h without freeing the previous handle; use reset() to free it
 40 | 
 41 | 		
 42 | 		unique_handle(T handle, const Deleter& d) noexcept
 43 | 			: Deleter(d),
 44 | 			  h(handle)
 45 | 		{
 46 | 		}
 47 | 		
 48 | 		unique_handle(T handle, Deleter&& d) noexcept
 49 | 			: Deleter(std::move(d)),
 50 | 			  h(handle)
 51 | 		{
 52 | 		}
 53 | 		
 54 | 		unique_handle(unique_handle&& h) noexcept // the parameter shadows the member: h(h.h) copies the source's handle into this object
 55 | 			: Deleter(std::move(static_cast<Deleter&&>(h))),
 56 | 			  h(h.h)
 57 | 		{
 58 | 			h.h = NULL_VALUE; // here h is the parameter, so this empties the moved-from object
 59 | 		}
 60 | 		
 61 | 		~unique_handle()
 62 | 		{
 63 | 			free(h);
 64 | 		}
 65 | 		
 66 | 		operator T() const noexcept { return h; } // implicit conversion to the raw handle; ownership is kept
 67 | 
 68 | 		template <typename DataType = void>
 69 | 		DataType* get() const noexcept { return reinterpret_cast<DataType*>(h); } // typed view of the handle; ownership is kept
 70 | 
 71 | 		template <typename DataType = void>
 72 | 		DataType* getRelease() noexcept { DataType* tmp = reinterpret_cast<DataType*>(h); h = NULL_VALUE; return tmp; } // like release(), but returns a typed pointer; resets to NULL_VALUE so the destructor skips the deleter
 73 | 		
 74 | 		unique_handle& operator =(unique_handle&& h) noexcept // swaps handles; the previous handle is freed when the argument is destroyed
 75 | 		{
 76 | 			using std::swap;
 77 | 			swap(*this, h);
 78 | 			return *this;
 79 | 		}
 80 | 		
 81 | 		T release() noexcept // gives up ownership: returns the handle and leaves this object empty
 82 | 		{
 83 | 			T temp = h;
 84 | 			h = NULL_VALUE;
 85 | 			return temp;
 86 | 		}
 87 | 		
 88 | 		void reset(T handle = null_value) noexcept // takes ownership of handle and frees the one held before
 89 | 		{
 90 | 			using std::swap;
 91 | 			swap(this->h, handle);
 92 | 			free(handle);
 93 | 		}
 94 | 		
 95 | 		friend void swap(unique_handle& a, unique_handle& b) noexcept // exchanges the handles only; the Deleter bases are not swapped
 96 | 		{
 97 | 			using std::swap;
 98 | 			swap(a.h, b.h);
 99 | 		}
100 | 	};
101 | }
102 | 
103 | #endif  // INCLUDED_CUDA_UNIQUE_HANDLE
104 | 


--------------------------------------------------------------------------------
/spECK/include/Compare.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "dCSR.h"
4 | 
5 | namespace spECK {
6 | 	template <typename DataType>
7 | 	bool Compare(const dCSR<DataType>& reference_mat, const dCSR<DataType>& compare_mat, bool compare_data); // declaration only; presumably true when both matrices match, with compare_data selecting whether values are checked too - confirm in the definition
8 | }


--------------------------------------------------------------------------------
/spECK/include/Config.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <string>
  4 | #include <map>
  5 | #include "INIReader.h"
  6 | 
  7 | class Config // configuration store reached through static accessors; constructors are private and the single object hangs off _instance
  8 | {
  9 | public:
 10 |   enum Key // one identifier per supported setting; addKeyToString() maps each to its textual name
 11 |   {
 12 | 	  BlockNnzFillRatio,
 13 | 	  MaxRowsPerBlock,
 14 | 	  LoadBalanceScanMode,
 15 | 	  HashScanSupportRestarts,
 16 | 	  InputFile,
 17 | 	  LoadBalanceModeCounting,
 18 | 	  LoadBalanceModeNumeric,
 19 | 	  ReprocessLoadBalancingForNumeric,
 20 | 	  MaxNnzPerBlockNumeric,
 21 | 	  MaxRowsPerBlockNumeric,
 22 | 	  IterationsWarmUp,
 23 | 	  IterationsExecution,
 24 | 	  SupportGlobalFallback,
 25 | 	  TrackIndividualTimes,
 26 | 	  TrackCompleteTimes,
 27 | 	  Debug,
 28 | 	  SortMode,
 29 | 	  CompareResult,
 30 | 	  AutoSelectKernelSizeMode,
 31 | 	  CountingBlockLimit,
 32 | 	  LogsEnabled,
 33 | 	  BlocksPerSM,
 34 | 	  DenseThresholdNumericExternalSorting,
 35 | 	  DenseThresholdNumericInternalSorting,
 36 | 	  GlobalDenseThresholdNumeric,
 37 | 	  DenseThresholdCounting,
 38 | 	  SpGEMMMethodCounting,
 39 | 	  SpGEMMMethodNumeric,
 40 | 	  ThreadsPerNnzOffset,
 41 | 	  add3MinLength,
 42 | 	  add2MinLength,
 43 | 	  add1MinLength,
 44 | 	  add3MaxCols,
 45 | 	  add2MaxCols,
 46 | 	  add1MaxCols,
 47 | 	  sub2MaxCols,
 48 | 	  sub1MaxCols,
 49 | 	  sub2MinThreads,
 50 | 	  sub1MinThreads,
 51 | 	  add2MinConcurrentOps,
 52 | 	  add1MinConcurrentOps,
 53 | 	  maxOpsWeight64,
 54 | 	  maxOpsWeight128,
 55 | 	  maxOpsWeight256,
 56 | 	  maxOpsWeight512,
 57 | 	  maxOpsWeight1024,
 58 | 	  staticThreadsPerRow
 59 |   };
 60 | 
 61 |   enum ScanMode // integer codes; presumably the values accepted for LoadBalanceScanMode - confirm in Config.cpp
 62 |   {
 63 | 	  Std = 0,
 64 | 	  Cub = 1,
 65 | 	  Thrust = 2,
 66 | 	  WorkEfficient = 3
 67 |   };
 68 | 
 69 |   enum SpGEMMMethods // integer codes; presumably the values accepted for SpGEMMMethodCounting / SpGEMMMethodNumeric - confirm
 70 |   {
 71 | 	  AutoSpGEMM = 0,
 72 | 	  HashSpGEMM = 1,
 73 | 	  DenseSpGEMM = 2
 74 |   };
 75 | 
 76 |   enum LoadBalanceModes // integer codes; presumably the values accepted for LoadBalanceModeCounting / LoadBalanceModeNumeric - confirm
 77 |   {
 78 | 	  AutoEnable = 0,
 79 | 	  ForceEnable = 1,
 80 | 	  ForceDisable = 2
 81 |   };
 82 | 
 83 |   enum SortModes // integer codes; presumably the values accepted for SortMode - confirm
 84 |   {
 85 | 	  None = 0,
 86 | 	  Separate = 1,
 87 | 	  InPlace = 2,
 88 | 	  Auto = 3,
 89 | 	  CubSegmentedSort = 4
 90 |   };
 91 | 
 92 | private:
 93 | 	std::map<Key, std::string> keyToString; // filled by addKeyToString(): Key -> name of the setting
 94 | 	std::map<Key, int> overrides; // per-key int values; presumably written by setInt - confirm in Config.cpp
 95 | 	INIReader reader;
 96 | 
 97 | 	void addKeyToString() { // (re)builds keyToString; every enumerator of Key has exactly one entry here
 98 | 		keyToString = {
 99 | 			{BlockNnzFillRatio, "BlockNnzFillRatio"},
100 | 			{MaxRowsPerBlock, "MaxRowsPerBlock"},
101 | 			{LoadBalanceScanMode, "LoadBalanceScanMode"},
102 | 			{HashScanSupportRestarts, "HashScanSupportRestarts"},
103 | 			{InputFile, "InputFile"},
104 | 			{ReprocessLoadBalancingForNumeric, "ReprocessLoadBalancingForNumeric"},
105 | 			{MaxNnzPerBlockNumeric, "MaxNnzPerBlockNumeric"},
106 | 			{MaxRowsPerBlockNumeric, "MaxRowsPerBlockNumeric"},
107 | 			{IterationsWarmUp, "IterationsWarmUp"},
108 | 			{IterationsExecution, "IterationsExecution"},
109 | 			{SupportGlobalFallback, "SupportGlobalFallback"},
110 | 			{TrackIndividualTimes, "TrackIndividualTimes"},
111 | 			{TrackCompleteTimes, "TrackCompleteTimes"},
112 | 			{Debug, "Debug"},
113 | 			{LoadBalanceModeCounting, "LoadBalanceModeCounting"},
114 | 			{LoadBalanceModeNumeric, "LoadBalanceModeNumeric"},
115 | 			{SortMode, "SortMode"},
116 | 			{CompareResult, "CompareResult"},
117 | 			{AutoSelectKernelSizeMode, "AutoSelectKernelSizeMode"},
118 | 			{CountingBlockLimit, "CountingBlockLimit"},
119 | 			{LogsEnabled, "LogsEnabled"},
120 | 			{BlocksPerSM, "BlocksPerSM"},
121 | 			{DenseThresholdNumericExternalSorting, "DenseThresholdNumericExternalSorting"},
122 | 			{DenseThresholdNumericInternalSorting, "DenseThresholdNumericInternalSorting"},
123 | 			{DenseThresholdCounting, "DenseThresholdCounting"},
124 | 			{SpGEMMMethodNumeric, "SpGEMMMethodNumeric"},
125 | 			{SpGEMMMethodCounting, "SpGEMMMethodCounting"},
126 | 			{GlobalDenseThresholdNumeric, "GlobalDenseThresholdNumeric"},
127 | 			{add3MinLength, "add3MinLength"},
128 | 			{add2MinLength, "add2MinLength"},
129 | 			{add1MinLength, "add1MinLength"},
130 | 			{add3MaxCols, "add3MaxCols"},
131 | 			{add2MaxCols, "add2MaxCols"},
132 | 			{add1MaxCols, "add1MaxCols"},
133 | 			{sub2MaxCols, "sub2MaxCols"},
134 | 			{sub1MaxCols, "sub1MaxCols"},
135 | 			{sub2MinThreads, "sub2MinThreads"},
136 | 			{sub1MinThreads, "sub1MinThreads"},
137 | 			{add2MinConcurrentOps, "add2MinConcurrentOps"},
138 | 			{add1MinConcurrentOps, "add1MinConcurrentOps"},
139 | 			{ThreadsPerNnzOffset, "ThreadsPerNnzOffset"},
140 | 			{maxOpsWeight64, "maxOpsWeight64"},
141 | 			{maxOpsWeight128, "maxOpsWeight128"},
142 | 			{maxOpsWeight256, "maxOpsWeight256"},
143 | 			{maxOpsWeight512, "maxOpsWeight512"},
144 | 			{maxOpsWeight1024, "maxOpsWeight1024"},
145 | 			{staticThreadsPerRow, "staticThreadsPerRow"} };
146 | 	}
147 | 
148 | 	Config() // private: builds a configuration around a default-constructed INIReader
149 | 	{
150 | 		reader = INIReader();
151 | 		addKeyToString();
152 | 	}
153 | 
154 | 	Config(std::string configPath) // private: builds a configuration around INIReader(configPath)
155 | 	{
156 | 		reader = INIReader(configPath);
157 | 		addKeyToString();
158 | 	}
159 | 
160 | 	static Config &Instance() // private accessor; throws std::exception while _instance is still nullptr
161 | 	{
162 | 		if (_instance == nullptr)
163 | 			throw std::exception();
164 | 	
165 | 		return *_instance;
166 | 	}
167 | 
168 | public:
169 | 	static Config *_instance; // nullptr check in Instance() implies it must be set first, presumably by init - confirm in Config.cpp
170 | 	static void init(std::string path); // defined elsewhere
171 | 	static void init(); // defined elsewhere
172 | 
173 | 	static int getInt(Key key, int fallback = -1); // defined elsewhere; fallback defaults to -1
174 | 	static int setInt(Key key, int newVal); // defined elsewhere
175 | 	static std::string getString(Key key, std::string fallback = ""); // defined elsewhere; fallback defaults to the empty string
176 | 	static bool getBool(Key key, bool fallback = false); // defined elsewhere; fallback defaults to false
177 | 	static float getFloat(Key key, float fallback = 0.0); // defined elsewhere; fallback defaults to 0
178 | };
179 | 
180 | 
181 | 


--------------------------------------------------------------------------------
/spECK/include/DataLoader.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include "CSR.h"
 5 | #include "dCSR.h"
 6 | 
 7 | 
 8 | template<typename ValueType>
 9 | struct Matrices // bundles two CSR and two dCSR matrices named A and B
10 | {
11 | 	CSR<ValueType> cpuA, cpuB; // CSR copies; presumably host-side - confirm in DataLoader's implementation
12 | 	dCSR<ValueType> gpuA, gpuB; // dCSR copies; presumably device-side counterparts of cpuA / cpuB - confirm
13 | };
14 | 
15 | template<typename ValueType>
16 | class DataLoader // owns a Matrices bundle; the constructor is defined elsewhere
17 | {
18 | public:
19 | 	DataLoader(std::string path, std::string path2); // presumably loads matrix A from path and B from path2 - confirm in the definition
20 | 	~DataLoader() = default;
21 | 	Matrices<ValueType> matrices; // public so that callers can read the loaded matrices directly
22 | };
23 | 


--------------------------------------------------------------------------------
/spECK/include/Executor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "RunConfig.h"
 3 | 
 4 | template<typename ValueType>
 5 | class Executor // driver object built from the command line; run() and run_detail() are defined elsewhere
 6 | {
 7 | public:
 8 | 	Executor(int argc, char *argv[]) : runConfig(argc, argv) {} // forwards the command-line arguments to RunConfig
 9 | 	~Executor() = default;
10 | 	int run(); // returns an int; presumably an exit/status code - confirm in the definition
11 | 	int run_detail(); // returns an int; presumably an exit/status code - confirm in the definition
12 | 
13 | private:
14 | 	RunConfig runConfig;
15 | 	int iterationsWarmup = 0; // in-class default; presumably replaced from the configuration - confirm
16 | 	int iterationsExecution = 1; // in-class default; presumably replaced from the configuration - confirm
17 | };
18 | 
19 | 


--------------------------------------------------------------------------------
/spECK/include/GPU/BlockRange.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef spECK_BlockRange
 2 | #define spECK_BlockRange
 3 | #pragma once
 4 | #include "limits.cuh"
 5 | 
 6 | 
 7 | template<class INDEX_TYPE, class ROW_COUNT_TYPE>
 8 | struct BlockRange // describes a run of numRows consecutive rows starting at row "first"
 9 | {
10 | 	// first row of the range (inclusive)
11 | 	INDEX_TYPE first;
12 | 	// if 1, then single row, if > 1 then multiple rows. if 0, then merged with others and must not be used
13 | 	ROW_COUNT_TYPE numRows;
14 | 	// if nnz == numeric_limits<INDEX_TYPE>::max(), then this must not be merged with others
15 | 	INDEX_TYPE nnz;
16 | 
17 | 	__host__ __device__ BlockRange() : first(spECK::numeric_limits<INDEX_TYPE>::max()), numRows(0), nnz(0) {} // invalid range: numRows == 0
18 | 
19 | 	__host__ __device__ BlockRange(INDEX_TYPE first, INDEX_TYPE numRows, INDEX_TYPE nnz) : first(first)
20 | 	{
21 | 		this->numRows = min(numRows, spECK::numeric_limits<ROW_COUNT_TYPE>::max()); // clamp so the count fits the narrower ROW_COUNT_TYPE
22 | 		this->nnz = min(nnz, spECK::numeric_limits<INDEX_TYPE>::max());
23 | 	}
24 | 
25 | 	__host__ __device__ BlockRange& operator=(const BlockRange& a)
26 | 	{
27 | 		first = a.first;
28 | 		numRows = a.numRows;
29 | 		nnz = a.nnz;
30 | 		return *this;
31 | 	}
32 | 
33 | 	__host__ __device__ __forceinline__ INDEX_TYPE nextRow() const { return first + numRows; } // first row after the range
34 | 	__host__ __device__ __forceinline__ INDEX_TYPE last() const // last row of the range (inclusive); max() sentinel when invalid
35 | 	{
36 | 		if (numRows == 0)
37 | 			return spECK::numeric_limits<INDEX_TYPE>::max();
38 | 
39 | 		return first + numRows - 1;
40 | 	}
41 | 	__host__ __device__ __forceinline__ bool valid() const { return numRows; } // true when numRows != 0
42 | 	__host__ __device__ __forceinline__ void setInvalid() { numRows = 0; }
43 | 
44 | 	__host__ __device__ int operator >(const BlockRange<INDEX_TYPE, ROW_COUNT_TYPE> &b) const // orders by first row only; const so const ranges can be compared
45 | 	{
46 | 		return first > b.first;
47 | 	}
48 | };
49 | 
50 | 
51 | template<class INDEX_TYPE, class ROW_COUNT_TYPE>
52 | struct BlockRangeKernelScale // same fields and helpers as BlockRange plus an int8_t kernelScale value
53 | {
54 | 	// first row of the range (inclusive)
55 | 	INDEX_TYPE first;
56 | 	// if 1, then single row, if > 1 then multiple rows. if 0, then merged with others and must not be used
57 | 	ROW_COUNT_TYPE numRows;
58 | 	// if nnz == numeric_limits<INDEX_TYPE>::max(), then this must not be merged with others
59 | 	INDEX_TYPE nnz;
60 | 	int8_t kernelScale; // stored and copied verbatim here; its interpretation lives in the code using it
61 | 
62 | 	__host__ __device__ BlockRangeKernelScale() : first(spECK::numeric_limits<INDEX_TYPE>::max()), numRows(0), nnz(0), kernelScale(0) {} // invalid range: numRows == 0
63 | 
64 | 	__host__ __device__ BlockRangeKernelScale(INDEX_TYPE first, INDEX_TYPE numRows, INDEX_TYPE nnz, int8_t kernelScale) : first(first), kernelScale(kernelScale)
65 | 	{
66 | 		this->numRows = min(numRows, spECK::numeric_limits<ROW_COUNT_TYPE>::max()); // clamp so the count fits the narrower ROW_COUNT_TYPE
67 | 		this->nnz = min(nnz, spECK::numeric_limits<INDEX_TYPE>::max());
68 | 	}
69 | 
70 | 	__host__ __device__ BlockRangeKernelScale& operator=(const BlockRangeKernelScale& a)
71 | 	{
72 | 		first = a.first;
73 | 		numRows = a.numRows;
74 | 		nnz = a.nnz;
75 | 		kernelScale = a.kernelScale;
76 | 		return *this;
77 | 	}
78 | 
79 | 	__host__ __device__ __forceinline__ INDEX_TYPE nextRow() const { return first + numRows; } // first row after the range
80 | 	__host__ __device__ __forceinline__ INDEX_TYPE last() const // last row of the range (inclusive); max() sentinel when invalid
81 | 	{
82 | 		if (numRows == 0)
83 | 			return spECK::numeric_limits<INDEX_TYPE>::max();
84 | 
85 | 		return first + numRows - 1;
86 | 	}
87 | 	__host__ __device__ __forceinline__ bool valid() const { return numRows; } // true when numRows != 0
88 | 	__host__ __device__ __forceinline__ void setInvalid() { numRows = 0; }
89 | 
90 | 	__host__ __device__ int operator >(const BlockRangeKernelScale<INDEX_TYPE, ROW_COUNT_TYPE> &b) const // orders by first row only; const so const ranges can be compared
91 | 	{
92 | 		return first > b.first;
93 | 	}
94 | };
95 | #endif


--------------------------------------------------------------------------------
/spECK/include/GPU/Hash.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <cuda_runtime.h>
3 | 
4 | __host__ __device__ __forceinline__ uint32_t currentHash(uint32_t id) { // callable from host and device code
5 | 	return id * 11; // unsigned 32-bit multiply, so the result wraps modulo 2^32
6 | }


--------------------------------------------------------------------------------
/spECK/include/GPU/consistent_gpu_memory.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <utility>
 4 | #include <CUDATools/memory.h>
 5 | /*#include "../memory_space.h"
 6 | #include "../consistent_memory.h"
 7 | 
 8 | namespace spECK {
 9 | 	template<>
10 | 	class ConsistentMemory<MemorySpace::device> : RegisteredMemory
11 | 	{
12 | 		size_t _size;
13 | 		CU::unique_ptr _ptr;
14 | 
15 | 		size_t clear() override
16 | 		{
17 | 			auto s = _size;
18 | 			reset(0);
19 | 			return s;
20 | 		}
21 | 	public:
22 | 		ConsistentMemory() : _size(0)
23 | 		{
24 | 			register_consistent_memory(this);
25 | 		}
26 | 
27 | 		~ConsistentMemory()
28 | 		{
29 | 			unregister_consistent_memory(this);
30 | 		}
31 | 
32 | 		operator CUdeviceptr() const noexcept { return _ptr; }
33 | 
34 | 		template <typename T = void>
35 | 		T* get() const noexcept { return reinterpret_cast<T*>(_ptr.operator long long unsigned int()); }
36 | 
37 | 		void increaseMemRetainData(size_t size)
38 | 		{
39 | 			CU::unique_ptr tmp_ptr = CU::allocMemory(_size + size);
40 | 			cudaMemcpy(tmp_ptr.get(), _ptr.get(), _size, cudaMemcpyDeviceToDevice);
41 | 			_ptr.reset();
42 | 			_ptr = std::move(tmp_ptr);
43 | 			_size += size;
44 | 		}
45 | 
46 | 		void assure(size_t size)
47 | 		{
48 | 			if (size > _size)
49 | 			{
50 | 				_ptr.reset();
51 | 				_ptr = CU::allocMemory(size);
52 | 				_size = size;
53 | 			}
54 | 		}
55 | 		void reset(size_t size = 0)
56 | 		{
57 | 			_ptr.reset();
58 | 			_size = 0;
59 | 			assure(size);
60 | 		}
61 | 	};
62 | }*/


--------------------------------------------------------------------------------
/spECK/include/GPU/limits.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef spECK_Limits
 2 | #define spECK_Limits
 3 | #pragma once
 4 | #include <cstdint>
 5 | #include <npp.h>
 6 | 
 7 | namespace spECK {
 8 | 
 9 | template<typename _Tp>
10 | struct numeric_limits // host/device-callable stand-in for std::numeric_limits; real limits exist only in the specializations below
11 | {
12 |   /** Generic fallback: returns a value-initialized _Tp, NOT a real minimum.
13 | Only the fixed-width integer specializations below report actual limits.  */
14 |    __host__ __device__ static constexpr _Tp
15 |   min() noexcept { return _Tp(); }
16 | 
17 |   /** Generic fallback: returns a value-initialized _Tp, NOT the maximum finite value.  */
18 |   __host__ __device__ static constexpr _Tp
19 |   max() noexcept { return _Tp(); }
20 | };
21 | 
22 | template<>
23 | struct numeric_limits<uint32_t> // limits taken from the NPP_MIN_32U / NPP_MAX_32U macros of <npp.h>
24 | {
25 |   __host__ __device__ static constexpr uint32_t
26 |  min() noexcept { return NPP_MIN_32U; }
27 | 
28 |  __host__ __device__ static constexpr uint32_t
29 |  max() noexcept { return NPP_MAX_32U; }
30 | };
31 | 
32 | template<>
33 | struct numeric_limits<int32_t> // limits taken from the NPP_MIN_32S / NPP_MAX_32S macros of <npp.h>
34 | {
35 |   __host__ __device__ static constexpr int32_t
36 |  min() noexcept { return NPP_MIN_32S; }
37 | 
38 |  __host__ __device__ static constexpr int32_t
39 |  max() noexcept { return NPP_MAX_32S; }
40 | };
41 | 
42 | template<>
43 | struct numeric_limits<uint16_t> // limits taken from the NPP_MIN_16U / NPP_MAX_16U macros of <npp.h>
44 | {
45 | 	__host__ __device__ static constexpr uint16_t
46 | 		min() noexcept { return NPP_MIN_16U; }
47 | 
48 | 	__host__ __device__ static constexpr uint16_t
49 | 		max() noexcept { return NPP_MAX_16U; }
50 | };
51 | 
52 | template<>
53 | struct numeric_limits<int16_t> // limits taken from the NPP_MIN_16S / NPP_MAX_16S macros of <npp.h>
54 | {
55 | 	__host__ __device__ static constexpr int16_t
56 | 		min() noexcept { return NPP_MIN_16S; }
57 | 
58 | 	__host__ __device__ static constexpr int16_t
59 | 		max() noexcept { return NPP_MAX_16S; }
60 | };
61 | 
62 | template<>
63 | struct numeric_limits<uint8_t> // limits taken from the NPP_MIN_8U / NPP_MAX_8U macros of <npp.h>
64 | {
65 | 	__host__ __device__ static constexpr uint8_t
66 | 		min() noexcept { return NPP_MIN_8U; }
67 | 
68 | 	__host__ __device__ static constexpr uint8_t
69 | 		max() noexcept { return NPP_MAX_8U; }
70 | };
71 | 
72 | template<>
73 | struct numeric_limits<int8_t> // limits taken from the NPP_MIN_8S / NPP_MAX_8S macros of <npp.h>
74 | {
75 | 	__host__ __device__ static constexpr int8_t
76 | 		min() noexcept { return NPP_MIN_8S; }
77 | 
78 | 	__host__ __device__ static constexpr int8_t
79 | 		max() noexcept { return NPP_MAX_8S; }
80 | };
81 | }
82 | #endif


--------------------------------------------------------------------------------
/spECK/include/GPU/profiler.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdio.h>
 4 | #include <cuda.h>
 5 | #include <cupti.h>
 6 | #include <vector>
 7 | 
 8 | // Structure to hold data collected by callback
 9 | typedef struct RuntimeApiTrace_st { // one trace record; also available under the alias RuntimeApiTrace_t
10 |   const char *functionName; // non-owning C string pointer
11 |   uint64_t startTimestamp; // units are not visible here; presumably as delivered by CUPTI - confirm
12 |   uint64_t endTimestamp;
13 |   size_t memcpy_bytes;
14 |   enum cudaMemcpyKind memcpy_kind;
15 |   size_t currentMemoryUsage; // scanned by CuProfiler::getPeakMemoryUsage() to find the peak
16 | } RuntimeApiTrace_t;
17 | 
18 | enum launchOrder{ MEMCPY_H2D1, MEMCPY_H2D2, MEMCPY_D2H, KERNEL, THREAD_SYNC, LAUNCH_LAST}; // LAUNCH_LAST is a count sentinel: it sizes CuProfiler::trace
19 | 
20 | class CuProfiler { // collects RuntimeApiTrace_st records; initialize(), finalize() and displayTimestamps() are defined elsewhere
21 | private:
22 |     std::vector<RuntimeApiTrace_st> records;
23 |     CUpti_SubscriberHandle subscriber; // not set by the constructor; presumably assigned in initialize() - confirm
24 |     unsigned long long int startMem; // not set by the constructor; presumably assigned in initialize() - confirm
25 |     bool initialized;
26 | 
27 | public:
28 |     RuntimeApiTrace_t trace[LAUNCH_LAST]; // one slot per launchOrder value; left uninitialized by the constructor
29 | 
30 |     CuProfiler() {
31 |         initialized = false; // only this flag is initialized here
32 |     }
33 | 
34 | 
35 |     // void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
36 |     //                     CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo);
37 | 
38 |     size_t getPeakMemoryUsage() { // largest currentMemoryUsage over all records
39 |         if (records.size() == 0)
40 |             return -1; // NOTE: converted to size_t, so "no records" is reported as the largest size_t value
41 | 
42 |         size_t max = 0;
43 |         for (auto& entry : records) {
44 |             if (entry.currentMemoryUsage > max)
45 |                 max = entry.currentMemoryUsage;
46 |         }
47 |         return max;
48 |     }
49 | 
50 |     static const char *memcpyKindStr(enum cudaMemcpyKind kind) // printable name for the two host<->device kinds, "<unknown>" otherwise
51 |     {
52 |         switch (kind) {
53 |         case cudaMemcpyHostToDevice:
54 |             return "HostToDevice";
55 |         case cudaMemcpyDeviceToHost:
56 |             return "DeviceToHost";
57 |         default:
58 |             break;
59 |         }
60 | 
61 |         return "<unknown>";
62 |     }
63 | 
64 |     void displayTimestamps();
65 | 
66 | 
67 |     static void cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C) // frees three device and three host buffers; null pointers are skipped
68 |     {
69 |         if (d_A)
70 |             cudaFree(d_A);
71 |         if (d_B)
72 |             cudaFree(d_B);
73 |         if (d_C)
74 |             cudaFree(d_C);
75 | 
76 |         // Free host memory (released with free(), so it must not come from new[] or cudaMallocHost)
77 |         if (h_A)
78 |             free(h_A);
79 |         if (h_B)
80 |             free(h_B);
81 |         if (h_C)
82 |             free(h_C);
83 |     }
84 | 
85 |     void initialize(bool subtractCurrentMem = true); // defined elsewhere; subtractCurrentMem defaults to true
86 |     void finalize(); // defined elsewhere
87 | };


--------------------------------------------------------------------------------
/spECK/include/GPU/spECKKernels.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #pragma once
  3 | 
  4 | #include <stdint.h>
  5 | typedef unsigned long long int uint64_t;
  6 | 
  7 | #include <cuda.h>
  8 | #include "Config.h"
  9 | #include "GPU/BlockRange.cuh"
 10 | 
 11 | class spECKKernels
 12 | {
 13 | public:
 14 | 	spECKKernels(uint32_t blockDim=128):
 15 | 	blockDim{blockDim}
 16 | 	{}
 17 | 
 18 | 	void setLaunchDimensions(uint32_t _gridDim, cudaStream_t _stream = 0, uint32_t _blockDim = 128, uint32_t _sharedMem = 0)
 19 | 	{
 20 | 		gridDim = _gridDim;
 21 | 		blockDim = _blockDim;
 22 | 		stream = _stream;
 23 | 		sharedMem = _sharedMem;
 24 | 	}
 25 | 
 26 | 	// #####################################################################
 27 | 	// Numeric Hash SpGEMM
 28 | 	//
 29 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, class GlobalMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 30 | 	 void h_HashSpGEMMNumeric(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB, dCSRNoDealloc<VALUE_TYPE> matC, GlobalMap *maps, INDEX_TYPE mapCount,
 31 | 		INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, Config::SortModes sortColumns, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
 32 | 		INDEX_TYPE *rowMaxOperations, bool setSortedBit, uint32_t rowsPerBlock);
 33 | 
 34 | 	 // #####################################################################
 35 | 	 // Numeric SpGEMM Launcher
 36 | 	 //
 37 | 	template <typename INDEX_TYPE, typename VALUE_TYPE, class GlobalHashMap, class GlobalRowOffsetMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 38 | 	void h_SpGEMMNumericLauncher(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB, dCSRNoDealloc<VALUE_TYPE> matC,
 39 | 		GlobalHashMap *hashMaps, INDEX_TYPE hashMapCount, GlobalRowOffsetMap *rowOffsetMaps, INDEX_TYPE rowOffsetMapCount,
 40 | 		INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, Config::SortModes sortColumns, uint32_t numberBlocks, const INDEX_TYPE* rowColMinMax,
 41 | 		INDEX_TYPE *rowMaxOperations, uint32_t minimumDensity, bool setSortedBit, uint32_t rowsPerBlock);
 42 | 
 43 | 
 44 | 	 // #####################################################################
 45 | 	 // Numeric Dense SpGEMM
 46 | 	 //
 47 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, class GlobalMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 48 | 	 void h_DenseSpGEMMNumeric(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB, dCSRNoDealloc<VALUE_TYPE> matC, GlobalMap *maps, INDEX_TYPE mapCount,
 49 | 		INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
 50 | 		INDEX_TYPE *rowMaxOperations, bool setSortedBit, uint32_t rowsPerBlock);
 51 | 
 52 | 	// #####################################################################
 53 | 	// Symbolic Dense SpGEMM
 54 | 	//
 55 | 	template <typename INDEX_TYPE, typename VALUE_TYPE, class GlobalMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 56 | 	void h_DenseSpGEMMCount(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB, GlobalMap *maps, INDEX_TYPE mapCount,
 57 | 		INDEX_TYPE *matCRowOffsets, INDEX_TYPE *blockStartRow, INDEX_TYPE *rowOperations, uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
 58 | 		INDEX_TYPE *rowMaxOperations, uint32_t *maxNnzPerRow, uint32_t rowsPerBlock);
 59 | 
 60 | 	 // #####################################################################
 61 | 	 // Symbolic Hash SpGEMM used for counting NNZ elements of output matrix C
 62 | 	 //
 63 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, unsigned MAX_ROWS_PER_BLOCK, class GlobalMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 64 | 	 void h_HashSpGEMMCount(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB, GlobalMap *maps, INDEX_TYPE mapCount, INDEX_TYPE *matCNnzRow,
 65 | 		 INDEX_TYPE* rowOperations, INDEX_TYPE *blockStartRow, uint32_t numberBlocks, const INDEX_TYPE* rowColMinMax,
 66 | 		  INDEX_TYPE *rowMaxOperations, uint32_t *maxNnzPerRow, uint32_t rowsPerBlock);
 67 | 
 68 | 	 // #####################################################################
 69 | 	 // Symbolic SpGEMM launcher used for counting NNZ elements of output matrix C
 70 | 	 //
 71 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, unsigned MAX_ROWS_PER_BLOCK, class GlobalMap, class GlobalRowOffsetsMap, uint32_t SHARED_HASH_SIZE, bool SUPPORT_GLOBAL, uint32_t THREADS>
 72 | 	 void h_SpGEMMCountLauncher(dCSRNoDealloc<VALUE_TYPE> matA, dCSRNoDealloc<VALUE_TYPE> matB,
 73 | 								GlobalMap *hashMaps, INDEX_TYPE hashMapCount, GlobalRowOffsetsMap *rowOffsetMaps, INDEX_TYPE rowOffsetMapsCount,
 74 | 								INDEX_TYPE *matCNnzRow, INDEX_TYPE *rowOperations, INDEX_TYPE *blockStartRow,
 75 | 								uint32_t numberBlocks, const INDEX_TYPE *rowColMinMax,
 76 | 								INDEX_TYPE *rowMaxOperations, uint32_t minimumDensity, INDEX_TYPE *maxNnzPerRow, uint32_t rowsPerBlock);
 77 | 
 78 | 	 // #####################################################################
 79 | 	 // Sorts results of Symbolic Hash SpGEMM
 80 | 	 //
 81 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, uint32_t THREADS, uint32_t entriesPerBlock>
 82 | 	 void h_HashSpGEMMSorting(dCSRNoDealloc<VALUE_TYPE> matC, INDEX_TYPE *blockStartRow, uint32_t numberBlocks, bool bitShiftNumRows);
 83 | 
 84 | 	 template <typename Map, typename INDEX_TYPE, typename VALUE_TYPE>
 85 | 	 void h_InitializeGlobalMaps(Map *maps, int count, INDEX_TYPE *ids, VALUE_TYPE *values, size_t elementsPerMap);
 86 | 
 87 | 	 template <typename Map, typename INDEX_TYPE>
 88 | 	 void h_InitializeGlobalMapsNoVal(Map *maps, int count, INDEX_TYPE *ids, size_t elementsPerMap, uint32_t maxRowsPerBlock);
 89 | 
 90 | 	 	 
 91 | 	 // #####################################################################
 92 | 	 // Load Balancer for HashSpGEMM by assigning blocks to rows -> 1 block can have multiple rows, but 1 row is never shared by multiple blocks
 93 | 	 // this load balancer works uses the amount of operations per row for balancing
 94 | 	 //
 95 | 	 template <typename INDEX_TYPE, typename VALUE_TYPE, typename ROW_COUNT_TYPE, uint8_t KERNEL_COUNT>
 96 | 	 void h_AssignHashSpGEMMBlocksToRowsOfSameSizeOperations(dCSRNoDealloc<VALUE_TYPE> &matA, dCSRNoDealloc<VALUE_TYPE> &matB, uint32_t *rowOperations,
 97 | 		 INDEX_TYPE *blockStartRows, INDEX_TYPE *numBlockStarts, INDEX_TYPE (&h_numBlockStarts)[KERNEL_COUNT], INDEX_TYPE *blockStartRowsCombined,
 98 | 		 uint32_t maxNnzPerBlock, uint32_t maxNnzPerBlockDynamicSharedMem, uint32_t maxRowsPerBlock, uint32_t actualKernelCount, uint32_t &h_rowsRequiringGlobal);
 99 | 
100 | 
101 | 
102 | 	// #####################################################################
103 | 	// Load Balancer for HashSpGEMM by assigning blocks to rows -> 1 block can have multiple rows, but 1 row is never shared by multiple blocks
104 | 	// this load balancer tries to combine rows which fit into one as small as possible kernel
105 | 	//
106 | 	template <typename INDEX_TYPE, typename VALUE_TYPE, typename ROW_COUNT_TYPE, uint8_t KERNEL_COUNT>
107 | 	void h_AssignHashSpGEMMBlocksToRowsOfSameSize(dCSRNoDealloc<VALUE_TYPE> &matA,
108 | 		INDEX_TYPE *blockStartRows, INDEX_TYPE *blockStartRowsCombined, INDEX_TYPE *numBlockStarts, INDEX_TYPE(&h_numBlockStarts)[KERNEL_COUNT],
109 | 		uint32_t maxNnzPerBlock, uint32_t maxNnzPerBlockDynamicSharedMem, uint32_t maxRowsPerBlock, uint32_t actualKernelCount, uint32_t &h_rowsRequiringGlobal);
110 | 
111 | 
112 | private:
113 | 	uint32_t blockDim;
114 | 	uint32_t gridDim;
115 | 	uint32_t sharedMem;
116 | 	cudaStream_t stream;
117 | };
118 | 
119 | 


--------------------------------------------------------------------------------
/spECK/include/HashMap.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <type_traits>
  5 | #include "GPU/limits.cuh"
  6 | #include <device_launch_parameters.h>
  7 | 
  8 | __device__ __forceinline__ uint32_t toHashEntry(uint32_t row, uint32_t col)
  9 | {
 10 |     return (row << 27) + col;
 11 | }
 12 | 
 13 | __device__ __forceinline__ uint32_t hashEntryToColumn(uint32_t hashEntry)
 14 | {
 15 |     return hashEntry & 0x7FFFFFF;
 16 | }
 17 | 
 18 | __device__ __forceinline__ uint32_t hashEntryToRow(uint32_t hashEntry)
 19 | {
 20 |     return hashEntry >> 27;
 21 | }
 22 | 
 23 | template <typename INDEX_TYPE, typename VALUE_TYPE>
 24 | struct HashMap
 25 | {
 26 |   public:
 27 |     // no default values or else union does not work
 28 |     INDEX_TYPE *ids;
 29 |     VALUE_TYPE *values;
 30 |     uint32_t capacity;
 31 | 
 32 |     __device__ __forceinline__ static INDEX_TYPE UNUSED() { return spECK::numeric_limits<INDEX_TYPE>::max(); }
 33 | 
 34 |     __device__ void init()
 35 |     {
 36 |         for (int i = threadIdx.x; i < capacity; i += blockDim.x)
 37 |         {
 38 |             ids[i] = UNUSED();
 39 |             values[i] = VALUE_TYPE(0);
 40 |         }
 41 |     }
 42 | 
 43 |     __device__ __forceinline__ INDEX_TYPE indexOf(INDEX_TYPE id)
 44 |     {
 45 |         INDEX_TYPE hashed_id = currentHash(id);
 46 |         INDEX_TYPE map_id = hashed_id % getSize();
 47 |         do
 48 |         {
 49 |             auto entry_id = ids[map_id];
 50 |             if (entry_id == id)
 51 |             {
 52 |                 return map_id;
 53 |             }
 54 | 
 55 |             if (entry_id == UNUSED())
 56 |             {
 57 |                 auto old_id = atomicCAS(ids + map_id, UNUSED(), id);
 58 |                 if (old_id == UNUSED() || old_id == id)
 59 |                 {
 60 |                     return map_id;
 61 |                 }
 62 |             }
 63 | 
 64 |             map_id = (map_id + 1) % getSize();
 65 |         } while (true);
 66 |     }
 67 | 
 68 |     __device__ __forceinline__ VALUE_TYPE &operator[](INDEX_TYPE id)
 69 |     {
 70 |         return values[indexOf(id)];
 71 |     }
 72 | 
 73 |     __device__ __forceinline__ INDEX_TYPE coordToId(INDEX_TYPE rowA, INDEX_TYPE colB)
 74 |     {
 75 |         return toHashEntry(rowA, colB);
 76 |     }
 77 | 
 78 |     __device__ __forceinline__ static INDEX_TYPE idToRow(INDEX_TYPE id) { return hashEntryToRow(id); }
 79 | 
 80 |     __device__ __forceinline__ static INDEX_TYPE idToCol(INDEX_TYPE id) { return hashEntryToColumn(id); }
 81 | 
 82 |     __device__ __forceinline__ VALUE_TYPE &at(INDEX_TYPE rowA, INDEX_TYPE colB)
 83 |     {
 84 |         return this->operator[](coordToId(rowA, colB));
 85 |     }
 86 | 
 87 |     __device__ __forceinline__ void atomic_add_direct(INDEX_TYPE rowA, INDEX_TYPE colB, VALUE_TYPE val)
 88 |     {
 89 |         atomicAdd_block(&values[colB], val);
 90 |         ids[colB] = coordToId(rowA, colB);
 91 |     }
 92 | 
 93 |     __device__ __forceinline__ void atomic_add_direct(INDEX_TYPE colB, VALUE_TYPE val)
 94 |     {
 95 |         atomicAdd_block(&values[colB], val);
 96 |         ids[colB] = colB;
 97 |     }
 98 | 
 99 |     __device__ __forceinline__ void atomic_add(INDEX_TYPE rowA, INDEX_TYPE colB, VALUE_TYPE val)
100 |     {
101 |         atomic_add(coordToId(rowA, colB), val);
102 |     }
103 | 
104 |     __device__ __forceinline__ void atomic_add(INDEX_TYPE id, VALUE_TYPE val)
105 |     {
106 |         atomicAdd_block(values + this->indexOf(id), val);
107 |     }
108 | 
109 |     __device__ __forceinline__ uint32_t getSize() const { return capacity; }
110 | };
111 | 
// Claims one of `count` maps for the calling block. The search starts at
// blockIdx.x % count and walks the array round-robin, flipping the first
// `reserved` flag it finds at 0 to 1 with an atomic compare-and-swap.
// Spins until a map becomes available.
template <class HashMap>
__device__ __forceinline__ HashMap *reserveMap(HashMap *maps, uint32_t count)
{
    for (uint32_t candidate = blockIdx.x % count;; candidate = (candidate + 1) % count)
    {
        const bool claimed = atomicCAS(&maps[candidate].reserved, 0, 1) == 0;
        if (claimed)
        {
            return &maps[candidate];
        }
    }
}
126 | 
// Releases a map previously claimed with reserveMap() by resetting its
// `reserved` flag to 0. A null pointer is ignored.
//
// `map` is passed by value, so the caller's own pointer is NOT cleared by this
// call; the former `map = nullptr;` statement only changed the local copy and
// has been removed because it suggested otherwise.
template <class HashMap>
__device__ __forceinline__ void freeMap(HashMap *map)
{
    if (map != nullptr)
        map->reserved = 0;
}
135 | 
// Device-side hash set with linear probing: stores ids only and counts how
// many distinct ids were inserted, in total (`occupancy`) and per row
// (`occupancyPerRow`). All arrays are referenced through raw pointers and are
// never allocated or freed by this struct.
template <typename INDEX_TYPE, size_t MAX_ROW_COUNT>
struct HashMapNoValue
{
  private:
    // Probing modulus; set to `capacity` by init() and lowered by limitSize().
    uint32_t limit;

  public:
    // Sentinel id marking an empty slot (largest value of INDEX_TYPE).
    __device__ INDEX_TYPE UNUSED() const { return spECK::numeric_limits<INDEX_TYPE>::max(); }
    INDEX_TYPE *ids;
    INDEX_TYPE *occupancyPerRow;
    INDEX_TYPE *occupancy;

    // Members must stay without default initializers: the type is used inside a union.
    int reserved;
    uint32_t capacity;

    // Empties all `capacity` slots and zeroes the first MAX_ROW_COUNT per-row
    // counters, strided over threadIdx.x / blockDim.x. Only the thread with
    // mainThread == true resets the total counter and the probing limit.
    // No barrier is issued here.
    __device__ void init(bool mainThread)
    {
        int slot = threadIdx.x;
        while (slot < capacity)
        {
            ids[slot] = UNUSED();
            slot += blockDim.x;
        }

        int row = threadIdx.x;
        while (row < MAX_ROW_COUNT)
        {
            occupancyPerRow[row] = 0;
            row += blockDim.x;
        }

        if (!mainThread)
            return;

        *occupancy = 0;
        limit = capacity;
    }

    // Inserts `id` if it is not present yet. Probing starts at
    // currentHash(id) % getSize() and advances by one slot with wrap-around.
    // Only the thread that actually claims an empty slot bumps the counters.
    // The loop has no exit for a set that is full and does not contain `id`.
    __device__ __forceinline__ void operator[](INDEX_TYPE id)
    {
        INDEX_TYPE hashed = currentHash(id);
        for (INDEX_TYPE slot = hashed % getSize();; slot = (slot + 1) % getSize())
        {
            auto seen = ids[slot];
            if (seen == id)
                return;

            // Occupied by a different id: keep probing.
            if (seen != UNUSED())
                continue;

            auto previous = atomicCAS(ids + slot, UNUSED(), id);
            if (previous == UNUSED())
            {
                // This thread inserted the id: count it once.
                atomicAdd_block(occupancy, 1);
                atomicAdd_block(&occupancyPerRow[idToRow(id)], 1);
                return;
            }

            // Another thread inserted the same id first: nothing left to do.
            if (previous == id)
                return;
        }
    }

    // Restricts probing to the first min(limit, capacity) slots.
    __device__ __forceinline__ void limitSize(uint32_t limit)
    {
        const uint32_t bounded = min(limit, capacity);
        this->limit = bounded;
    }

    // Packs a (row of A, column of B) pair into a single id.
    __device__ __forceinline__ INDEX_TYPE coordToId(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        return toHashEntry(rowA, colB);
    }

    // Row part of a packed id.
    __device__ __forceinline__ static INDEX_TYPE idToRow(INDEX_TYPE id) { return hashEntryToRow(id); }

    // Column part of a packed id.
    __device__ __forceinline__ static INDEX_TYPE idToCol(INDEX_TYPE id) { return hashEntryToColumn(id); }

    // Inserts the (rowA, colB) pair via hashing.
    __device__ __forceinline__ void at(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        (*this)[coordToId(rowA, colB)];
    }

    // Inserts the (rowA, colB) pair at slot `colB` without hashing. A slot that
    // is already occupied is left untouched. No bounds check is performed.
    __device__ __forceinline__ void atDirect(INDEX_TYPE rowA, INDEX_TYPE colB)
    {
        if (ids[colB] == UNUSED())
        {
            INDEX_TYPE previous = atomicCAS(&ids[colB], UNUSED(), coordToId(rowA, colB));
            if (previous == UNUSED())
            {
                atomicAdd_block(occupancy, 1);
                atomicAdd_block(&occupancyPerRow[rowA], 1);
            }
        }
    }

    // Current probing modulus.
    __device__ __forceinline__ size_t getSize() const { return limit; }
};


--------------------------------------------------------------------------------
/spECK/include/Multiply.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #pragma once
 3 | 
 4 | #include "dCSR.h"
 5 | #include "Timings.h"
 6 | #include "spECKConfig.h"
 7 | 
// REPLACE THESE VALUES WITH YOUR ACTUAL DEVICE SPECIFICATIONS
// NOTE(review): presumably per-block shared-memory budgets in bytes - confirm against the callers.

// 49152 = 48 * 1024
static constexpr int spECK_STATIC_MEM_PER_BLOCK {49152};
// 98304 = 96 * 1024
static constexpr int spECK_DYNAMIC_MEM_PER_BLOCK{98304};
12 | 
namespace spECK
{
    // Declaration only; the definition lives elsewhere.
    // NOTE(review): presumably computes matOut = A * B and records per-stage
    // times in `timings` - confirm in the implementation.
    template <typename DataType, int BLOCKS_PER_SM, int THREADS_PER_BLOCK, int MAX_DYNAMIC_SHARED, int MAX_STATIC_SHARED>
    void MultiplyspECK(const dCSR<DataType> &A, const dCSR<DataType> &B, dCSR<DataType> &matOut, spECKConfig &config, Timings &timings);

    // Declaration only; the definition lives elsewhere.
    // The former default argument `Timings &timings = Timings()` bound a
    // non-const lvalue reference to a temporary, which ISO C++ rejects (it is
    // only accepted as a compiler extension). Calls without a Timings object
    // are served by the forwarding overload below instead.
    template <typename DataType, int BLOCKS_PER_SM, int THREADS_PER_BLOCK, int MAX_DYNAMIC_SHARED, int MAX_STATIC_SHARED>
    void MultiplyspECKImplementation(const dCSR<DataType> &A, const dCSR<DataType> &B, dCSR<DataType> &matOut, spECKConfig &config, Timings &timings);

    // Convenience overload for callers that are not interested in timings:
    // forwards to the five-argument version with a throw-away Timings object.
    template <typename DataType, int BLOCKS_PER_SM, int THREADS_PER_BLOCK, int MAX_DYNAMIC_SHARED, int MAX_STATIC_SHARED>
    inline void MultiplyspECKImplementation(const dCSR<DataType> &A, const dCSR<DataType> &B, dCSR<DataType> &matOut, spECKConfig &config)
    {
        Timings discardedTimings;
        MultiplyspECKImplementation<DataType, BLOCKS_PER_SM, THREADS_PER_BLOCK, MAX_DYNAMIC_SHARED, MAX_STATIC_SHARED>(A, B, matOut, config, discardedTimings);
    }
} // namespace spECK
21 | 


--------------------------------------------------------------------------------
/spECK/include/RunConfig.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
// Run configuration built from the program's command-line arguments.
// Constructor and destructor are only declared here; how argv is interpreted
// is defined elsewhere.
class RunConfig
{
public:
	RunConfig(int argc, char *argv[]);
	~RunConfig();
	// NOTE(review): presumably path and display name of the first input matrix - confirm in the constructor.
	std::string filePath;
    std::string mat_name;
	// NOTE(review): presumably path and display name of a second input matrix - confirm in the constructor.
    std::string filePath2;
    std::string mat_name2;
};
15 | 
16 | 


--------------------------------------------------------------------------------
/spECK/include/Timings.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <vector>
  3 | 
  4 | class Timings {
  5 | public:
  6 |     bool measureAll;
  7 |     bool measureCompleteTime;
  8 |     float init;
  9 |     float countProducts;
 10 |     float loadBalanceCounting;
 11 |     float globalMapsCounting;
 12 |     float spGEMMCounting;
 13 |     float allocC;
 14 |     float loadBalanceNumeric;
 15 |     float globalMapsNumeric;
 16 |     float spGEMMNumeric;
 17 |     float sorting;
 18 |     float cleanup;
 19 |     float complete;
 20 | 
 21 |     float setup;
 22 |     float symbolic_binning;
 23 |     float symbolic;
 24 |     float numeric_binning;
 25 |     float prefix;
 26 |     float allocate;
 27 |     float numeric;
 28 |     float total;
 29 |     
 30 |     Timings(){
 31 |         measureAll = false;
 32 |         measureCompleteTime = false;
 33 |         init = 0.0f;
 34 |         countProducts = 0.0f;
 35 |         loadBalanceCounting = 0.0f;
 36 |         globalMapsCounting = 0.0f;
 37 |         spGEMMCounting = 0.0f;
 38 |         allocC = 0.0f;
 39 |         loadBalanceNumeric = 0.0f;
 40 |         globalMapsNumeric = 0.0f;
 41 |         spGEMMNumeric = 0.0f;
 42 |         sorting = 0.0f;
 43 |         cleanup = 0.0f;
 44 |         complete = 0.0f;
 45 |     }
 46 |     void operator+=(const Timings& b) {
 47 |         init += b.init;
 48 |         countProducts += b.countProducts;
 49 |         loadBalanceCounting += b.loadBalanceCounting;
 50 |         globalMapsCounting += b.globalMapsCounting;
 51 |         spGEMMCounting += b.spGEMMCounting;
 52 |         allocC += b.allocC;
 53 |         loadBalanceNumeric += b.loadBalanceNumeric;
 54 |         globalMapsNumeric += b.globalMapsNumeric;
 55 |         spGEMMNumeric += b.spGEMMNumeric;
 56 |         sorting += b.sorting;
 57 |         cleanup += b.cleanup;
 58 |         complete += b.complete;
 59 |     }
 60 | 
 61 |     void operator/=(const float& x) {
 62 |         init /= x;
 63 |         countProducts /= x;
 64 |         loadBalanceCounting /= x;
 65 |         globalMapsCounting /= x;
 66 |         spGEMMCounting /= x;
 67 |         allocC /= x;
 68 |         loadBalanceNumeric /= x;
 69 |         globalMapsNumeric /= x;
 70 |         spGEMMNumeric /= x;
 71 |         sorting /= x;
 72 |         cleanup /= x;
 73 |         complete /= x;
 74 |     }
 75 |     void print(long long total_flop){
 76 |         float total_flop_d = float(total_flop)/1000000;
 77 |         setup = init + countProducts;
 78 |         symbolic_binning = loadBalanceCounting;
 79 |         symbolic = globalMapsCounting + spGEMMCounting;
 80 |         numeric_binning = loadBalanceNumeric;
 81 |         prefix = 0;
 82 |         allocate = allocC;
 83 |         numeric = globalMapsNumeric + spGEMMNumeric + sorting;
 84 |         total = complete;
 85 | 
 86 |         //if (measureAll){
 87 |             printf("spECK     initial mallocs = %f ms\n", init);
 88 |             printf("spECK  count computations = %f ms\n", countProducts);
 89 |             printf("spECK       load-balancer = %f ms\n", loadBalanceCounting);
 90 |             printf("spECK      GlobalMaps Cnt = %f ms\n", globalMapsCounting);
 91 |             printf("spECK     counting kernel = %f ms\n", spGEMMCounting);
 92 |             printf("spECK        malloc mat C = %f ms\n", allocC);
 93 |             printf("spECK   num load-balancer = %f ms\n", loadBalanceNumeric);
 94 |             printf("spECK     init GlobalMaps = %f ms\n", globalMapsNumeric);
 95 |             printf("spECK      numeric kernel = %f ms\n", spGEMMNumeric);
 96 |             printf("spECK      Sorting kernel = %f ms\n", sorting);
 97 |             printf("spECK             cleanup = %f ms\n", cleanup);
 98 |             printf("--------------------------------------------------------------\n");
 99 |         //}
100 | 
101 |         //if(measureAll){
102 |             printf("time(ms):\n");
103 |             printf("    setup            %8.3lfms %6.2lf%%\n", setup, setup/total*100);
104 |             printf("\e[1;31m    symbolic_binning %8.3lfms %6.2lf%%\n\e[0m", symbolic_binning, symbolic_binning/total*100);
105 |             printf("\e[1;31m    symbolic         %8.3lfms %6.2lf%%\n\e[0m", symbolic, symbolic/total*100);
106 |             printf("\e[1;31m    numeric_binning  %8.3lfms %6.2lf%%\n\e[0m", numeric_binning, numeric_binning/total*100);
107 |             printf("    prefix           %8.3lfms %6.2lf%%\n", prefix, prefix/total*100);
108 |             printf("    allocate         %8.3lfms %6.2lf%%\n", allocate, allocate/total*100);
109 |             printf("\e[1;31m    numeric          %8.3lfms %6.2lf%%\n\e[0m", numeric, numeric/total*100);
110 |             printf("    cleanup          %8.3lfms %6.2lf%%\n", cleanup, cleanup/total*100);
111 |             printf("    total            %8.3lfms %6.2lf%%\n", total, total/total*100);
112 |             printf("perf(Gflops):\n");
113 |             printf("    setup            %6.2lf\n", total_flop_d/setup);
114 |             printf("    symbolic_binning %6.2lf\n", total_flop_d/symbolic_binning);
115 |             printf("    symbolic         %6.2lf\n", total_flop_d/symbolic);
116 |             printf("    numeric_binning  %6.2lf\n", total_flop_d/numeric_binning);
117 |             printf("    prefix           %6.2lf\n", total_flop_d/prefix);
118 |             printf("    allocate         %6.2lf\n", total_flop_d/allocate);
119 |             printf("    numeric          %6.2lf\n", total_flop_d/numeric);
120 |             printf("    cleanup          %6.2lf\n", total_flop_d/cleanup);
121 |             printf("    total            %6.2lf\n", total_flop_d/total);
122 |         //}
123 |     }
124 |     void reg_print(long long total_flop){
125 |         float total_flop_d = float(total_flop)/1000000;
126 |         total = complete;
127 |         printf("%6.2lf\n", total_flop_d/total);
128 |     }
129 |     void binning_print(long long total_flop){
130 |         float total_flop_d = float(total_flop)/1000000;
131 |         float total_binning_time = loadBalanceCounting + loadBalanceNumeric;
132 |         printf("%.4e %.4f\n", total_binning_time/1000, 100*total_binning_time/complete);
133 |     }
134 | 
135 | };
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/spECK/include/Transpose.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "dCSR.h"
4 | 
namespace spECK {
	// Declaration only; the definition lives elsewhere.
	// NOTE(review): presumably writes the transpose of matIn into matTransposeOut - confirm in the implementation.
	template <typename DataType>
	void Transpose(const dCSR<DataType>& matIn, dCSR<DataType>& matTransposeOut);
}


--------------------------------------------------------------------------------
/spECK/include/Vector.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <memory>
 4 | 
 5 | template<typename T>
 6 | struct DenseVector
 7 | {
 8 | 	size_t size;
 9 | 	//std::unique_ptr<T[]> data;
10 | 	T* data;
11 | 
12 | 	DenseVector() : size(0) { }
13 | 	void alloc(size_t s)
14 | 	{
15 | 		data = std::make_unique<T[]>(s);
16 | 		size = s;
17 | 	}
18 | };
19 | 


--------------------------------------------------------------------------------
/spECK/include/WorkDistribution.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "stdint.h"
  3 | typedef unsigned long long int uint64_t;
  4 | 
// Plain bundle of tuning parameters for work distribution. This header only
// declares the fields; how each threshold is applied is decided by the code
// that consumes the struct.
// NOTE(review): field meanings below are inferred from the names - confirm against the consumer.
struct WorkDistributionConfig
{
    uint32_t threadsPerNnzOffset;
    // presumably minimum lengths / maximum column counts gating the "add N" adjustments
    uint32_t add3MinLength;
    uint32_t add2MinLength;
    uint32_t add1MinLength;
    uint32_t add3MaxCols;
    uint32_t add2MaxCols;
    uint32_t add1MaxCols;
    // presumably maximum column counts / minimum thread counts gating the "sub N" adjustments
    uint32_t sub2MaxCols;
    uint32_t sub1MaxCols;
    uint32_t sub2MinThreads;
    uint32_t sub1MinThreads;
    uint32_t add2MinConcurrentOps;
    uint32_t add1MinConcurrentOps;
    // presumably one weight per block size (64 ... 1024 threads)
    float maxOpsWeight64;
    float maxOpsWeight128;
    float maxOpsWeight256;
    float maxOpsWeight512;
    float maxOpsWeight1024;
    int staticThreadsPerRow;
};
 27 | 
// First layer of a small hard-coded weight table: 6 inputs, 5 outputs.
// NOTE(review): presumably a trained model used for work-distribution decisions - confirm against the consumer.
const int layer1inputs = 6;
const int layer1outputs = 5;
// 30 = layer1inputs * layer1outputs values, written as 6 groups of 5.
// NOTE(review): presumably one group per input (input-major order) - confirm against the code that indexes this table.
const float layer1weights[30] = {
    -1.4186962,
    0.07587334,
    -1.7805182,
    0.04314838,
    -0.6445114,

    -0.13512687,
    0.04315747,
    -0.17808716,
    -0.04465475,
    -0.066692226,

    0.0752962,
    -0.104078434,
    0.16903225,
    -0.014818254,
    0.041726623,

    0.60707116,
    0.5149234,
    0.036716104,
    -0.070126966,
    0.37001306,

    -0.18412519,
    -0.11984752,
    -0.0021386633,
    -0.046877146,
    -0.16237561,

    0.8256881,
    0.7394887,
    0.07848209,
    0.058255166,
    0.9127046,

};
// One offset per layer-1 output (5 = layer1outputs).
const float layer1offsets[5] = {
    20.326342,
    4.904586,
    15.644517,
    -0.018791957,
    17.353731,
};
 75 | 
// Second layer of the hard-coded weight table: 5 inputs (matching the 5
// outputs of layer 1), 11 outputs.
const int layer2inputs = 5;
const int layer2outputs = 11;
// 55 = layer2inputs * layer2outputs values, written as 5 groups of 11.
// NOTE(review): presumably one group per input (input-major order) - confirm against the code that indexes this table.
const float layer2weights[55] = {
    1.1972289,
    1.9218329,
    3.7349546,
    9.81511,
    2.6227558,
    -6.426236,
    -43.629112,
    -22.914429,
    -0.781369,
    -0.45372608,
    -0.5380075,

    -5.0447526,
    -0.009247302,
    -0.008086299,
    -7.826033,
    -0.09450004,
    -0.11218474,
    -0.027270528,
    -0.0042230682,
    -0.0005600392,
    0.0005161164,
    0.004799865,

    3.116825,
    -0.26401007,
    -1.3744258,
    -8.859539,
    -140.25873,
    -18.334105,
    -8.593332,
    -1.0478442,
    -1.3645409,
    -1.2001375,
    -0.9904336,

    -0.0058234227,
    -0.063901104,
    -0.03595568,
    -0.008307365,
    -0.01685345,
    -0.012366829,
    0.028791403,
    0.024446918,
    0.028665425,
    -0.060744636,
    0.072986275,

    -0.069886416,
    -1.427334,
    -2.6894572,
    -1.6754347,
    0.0068549747,
    1.0374902,
    -0.71136826,
    -71.97164,
    -66.504616,
    -73.23368,
    -47.109512,

};
// One offset per layer-2 output (11 = layer2outputs).
const float layer2offsets[11] = {
    -27.462452,
    -10.207701,
    -7.577945,
    -7.4156075,
    -3.9403565,
    -0.64222777,
    0.68561864,
    -1.2962743,
    -2.180761,
    -1.9912802,
    -4.157279,
};
153 | 


--------------------------------------------------------------------------------
/spECK/include/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef spECK_Common
 2 | #define spECK_Common
 3 | #pragma once
 4 | 
 5 | template<typename T>
 6 | __host__ __device__ __forceinline__ T divup(T a, T b)
 7 | {
 8 | 	return (a + b - 1) / b;
 9 | }
10 | 
11 | 
// Restricts `a` to the closed interval [min, max]: values below `min` yield
// `min`, values above `max` yield `max`, everything else is returned as is.
template<typename T>
__host__ __device__ __forceinline__ T clamp(const T& a, const T& min, const T& max)
{
	if (a < min)
		return min;
	if (a > max)
		return max;
	return a;
}
17 | #endif
18 | 
// Reports a failed CUDA runtime call: prints the error string together with
// the source location to stdout and throws std::exception. Does nothing when
// `err` is cudaSuccess.
static inline void HandleError(cudaError_t err, const char *file, int line)
{
	if (err == cudaSuccess)
		return;

	printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	throw std::exception();
}
// Wraps a CUDA runtime call so that failures are reported with the call site's
// file and line. The check is always active; the conditional variant below is
// disabled.
// #ifdef _DEBUG || NDEBUG || DEBUG
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// #else
// #define HANDLE_ERROR(err) err
// #endif


--------------------------------------------------------------------------------
/spECK/include/cuSparseMultiply.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "dCSR.h"
 4 | #include <cusparse.h>
 5 | #include <iostream>
 6 | #include <string>
 7 | 
 8 | namespace cuSPARSE {
 9 | 
10 | 	template <typename DataType>
11 | 	class CuSparseTest
12 | 	{
13 | 		cusparseHandle_t handle;
14 | 		cusparseStatus_t status;
15 | 		cusparseMatDescr_t descr;
16 | 		cusparseMatDescr_t descrB;
17 | 		cusparseMatDescr_t descrC;
18 | 
19 | 	public:
20 | 		CuSparseTest(): handle(0)
21 | 		{
22 | 			checkCuSparseError(cusparseCreate(&handle), "init failed");
23 | 			checkCuSparseError(cusparseCreateMatDescr(&descr), "Matrix descriptor init failed");
24 | 			checkCuSparseError(cusparseCreateMatDescr(&descrB), "Matrix descriptor init failed");
25 | 			checkCuSparseError(cusparseCreateMatDescr(&descrC), "Matrix descriptor init failed");
26 | 			cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
27 | 			cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
28 | 			cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
29 | 			cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
30 | 			cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
31 | 			cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
32 | 		}
33 | 
34 | 		~CuSparseTest()
35 | 		{
36 | 			checkCuSparseError(cusparseDestroyMatDescr(descr), "Matrix descriptor destruction failed");
37 | 			checkCuSparseError(cusparseDestroyMatDescr(descrB), "Matrix descriptor destruction failed");
38 | 			checkCuSparseError(cusparseDestroyMatDescr(descrC), "Matrix descriptor destruction failed");
39 | 			cusparseDestroy(handle);
40 | 		}
41 | 
42 | 		// Multiply two CSR matrices
43 | 		float Multiply(const dCSR<DataType>& A, const dCSR<DataType>& B, dCSR<DataType>& matOut, uint32_t& cusparse_nnz);
44 | 
45 | 		void Transpose(const dCSR<DataType>& A, dCSR<DataType>& AT);
46 | 
47 | 		cusparseStatus_t checkCuSparseError(cusparseStatus_t status, std::string errorMsg)
48 | 		{
49 | 			if (status != CUSPARSE_STATUS_SUCCESS) {
50 | 				std::cout << "CuSparse error: " << errorMsg << std::endl;
51 | 				throw std::exception();
52 | 			}
53 | 			return status;
54 | 		}
55 | 
56 | 		cusparseStatus_t CUSPARSEAPI cusparseMultiply(cusparseHandle_t handle,
57 | 			cusparseOperation_t transA,
58 | 			cusparseOperation_t transB,
59 | 			int m,
60 | 			int n,
61 | 			int k,
62 | 			const cusparseMatDescr_t descrA,
63 | 			int nnzA,
64 | 			const DataType *csrSortedValA,
65 | 			const int *csrSortedRowPtrA,
66 | 			const int *csrSortedColIndA,
67 | 			const cusparseMatDescr_t descrB,
68 | 			int nnzB,
69 | 			const DataType *csrSortedValB,
70 | 			const int *csrSortedRowPtrB,
71 | 			const int *csrSortedColIndB,
72 | 			const cusparseMatDescr_t descrC,
73 | 			DataType *csrSortedValC,
74 | 			const int *csrSortedRowPtrC,
75 | 			int *csrSortedColIndC);
76 | 
77 | 		cusparseStatus_t CUSPARSEAPI cusparseTranspose(cusparseHandle_t handle,
78 | 			int m,
79 | 			int n,
80 | 			int nnz,
81 | 			const DataType  *csrSortedVal,
82 | 			const int *csrSortedRowPtr,
83 | 			const int *csrSortedColInd,
84 | 			DataType *cscSortedVal,
85 | 			int *cscSortedRowInd,
86 | 			int *cscSortedColPtr,
87 | 			cusparseAction_t copyValues,
88 | 			cusparseIndexBase_t idxBase);		
89 | 	};	
90 | }
91 | 


--------------------------------------------------------------------------------
/spECK/include/cuda_common.h:
--------------------------------------------------------------------------------
 1 | #ifndef _Z_COMMON_
 2 | #define _Z_COMMON_
 3 | 
 4 | #include <cuda_runtime.h>
 5 | #include <stdio.h>
 6 | #include <exception>
 7 | #include <cusparse.h>
 8 | #include <iostream>
 9 | #include <omp.h>
10 | #include <stdlib.h>
11 | #include <algorithm>
12 | 
// Branch-prediction hints built on the GCC/Clang builtin __builtin_expect:
// likely(x) tells the compiler that x is expected to equal 1, unlikely(x) that
// it is expected to equal 0. The value of the expression itself is unchanged.
// The argument is passed through as-is (no parentheses and no `!!`
// normalization), so pass a fully parenthesized boolean expression.
#define likely(x) __builtin_expect(x,1)
#define unlikely(x) __builtin_expect(x,0)
15 | 
// Reports a failed CUDA runtime call: prints the error string together with
// the source location to stdout and throws std::exception. Success is hinted
// to the compiler as the expected path.
static inline void checkCUDA(cudaError_t err, const char *file, int line)
{
	if (likely(err == cudaSuccess))
		return;

	printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	throw std::exception();
}
// Both macros wrap a CUDA runtime call and forward the call site's file and
// line to checkCUDA; they are interchangeable.
// #ifdef _DEBUG || NDEBUG || DEBUG
#define CHECK_CUDA(err) (checkCUDA(err, __FILE__, __LINE__))
#define CHECK_ERROR(err) (checkCUDA(err, __FILE__, __LINE__))
30 | 
// Checks a cuSPARSE status code: on anything other than
// CUSPARSE_STATUS_SUCCESS prints "CuSparse error: <errorMsg>" to stdout and
// throws std::exception.
inline void CHECK_CUSPARSE(cusparseStatus_t status, std::string errorMsg="")
{
	if (status == CUSPARSE_STATUS_SUCCESS)
		return;

	std::cout << "CuSparse error: " << errorMsg << std::endl;
	throw std::exception();
}
38 | 
// Stores the current time-stamp counter in Var. Executes `lfence` followed by
// `rdtsc` and combines the two 32-bit halves (EDX:EAX) into one 64-bit value.
// Relies on x86 inline assembly and on the GNU statement-expression extension
// `({ ... })`, so it is only usable with GCC-compatible compilers on x86.
#define HP_TIMING_NOW(Var) \
  ({ unsigned int _hi, _lo; \
     asm volatile ("lfence\n\trdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = ((unsigned long long int) _hi << 32) | _lo; })
43 | 
44 | /* precision is 1 clock cycle.
45 |  * execute time is roughly 50 or 140 cycles depends on cpu family */
// Executes the x86 CPUID instruction and stores the resulting registers.
//   info - receives EAX, EBX, ECX, EDX in info[0..3] (must hold four ints)
//   eax  - leaf to query
//   ecx  - sub-leaf to query (defaults to 0)
inline void cpuid(int *info, int eax, int ecx = 0){
    int ax, bx, cx, dx;
    // ECX has to be loaded explicitly: leaves with sub-leaves select them through
    // ECX, so omitting the input would make the result depend on whatever value
    // happened to be in the register.
    __asm__ __volatile__ ("cpuid"
                          : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx)
                          : "a" (eax), "c" (ecx));

    info[0] = ax;
    info[1] = bx;
    info[2] = cx;
    info[3] = dx;
}
55 | 
56 | inline long get_tsc_freq(){
57 |     static long freq = 0;
58 |     if(unlikely((freq == 0))){
59 |         int raw[4];
60 |         cpuid(raw, 0x16); // get cpu freq
61 |         freq = long(raw[0]) * 1000000;
62 |         //printf("static first call %f\n", freq);
63 |     }
64 |     return freq;
65 | }
66 | 
67 | inline double fast_clock_time(){
68 |     long counter;
69 |     HP_TIMING_NOW(counter);
70 |     return double(counter)/get_tsc_freq();
71 | }
72 | 
73 | template <typename T>
74 | inline void D2H(T *dst, T* src, size_t size){
75 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
76 | }
77 | 
78 | template <typename T>
79 | inline void H2D(T *dst, T* src, size_t size){
80 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
81 | }
82 | 
83 | template <typename T>
84 | inline void D2D(T *dst, T* src, size_t size){
85 |     CHECK_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
86 | }
87 | 
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/spECK/include/dCSR.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstddef>
 4 | #include <algorithm>
 5 | 
 6 | template<typename T>
 7 | struct CSR;
 8 | 
 9 | template<typename T>
10 | struct dCSR
11 | {
12 | 	size_t rows, cols, nnz;
13 | 
14 | 	T* data;
15 | 	unsigned int* row_offsets;
16 | 	unsigned int* col_ids;
17 | 
18 | 	dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { }
19 | 	void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true);
20 | 	void reset();
21 | 	virtual ~dCSR();
22 | };
23 | 
24 | template <typename T>
25 | struct dCSRNoDealloc
26 | {
27 | 	size_t rows, cols, nnz;
28 | 
29 | 	T* data;
30 | 	unsigned int* row_offsets;
31 | 	unsigned int* col_ids;
32 | 
33 | 	dCSRNoDealloc(const dCSR<T>& a) : rows(a.rows), cols(a.cols), data(a.data), nnz(a.nnz), row_offsets(a.row_offsets), col_ids(a.col_ids) {}
34 | 	dCSRNoDealloc() = default;
35 | };
36 | 
37 | template<typename T>
38 | void convert(dCSR<T>& dcsr, const CSR<T>& csr, unsigned int padding = 0);
39 | 
40 | template<typename T>
41 | void convert(dCSR<T>& dcsr, const dCSR<T>& csr, unsigned int padding = 0);
42 | 
43 | template<typename T>
44 | void convert(CSR<T>& csr, const dCSR<T>& dcsr, unsigned int padding = 0);
45 | 
46 | template<typename T>
47 | void convert(CSR<T>& csr, const CSR<T>& dcsr, unsigned int padding = 0);


--------------------------------------------------------------------------------
/spECK/include/meta_utils.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef INCLUDED_HIS_META_UTILS
  3 | #define INCLUDED_HIS_META_UTILS
  4 | 
  5 | #pragma once
  6 | 
  7 | #include <utility>
  8 | #include <type_traits>
  9 | #include "multi_arch_build.h"
 10 | 
 11 | 
 12 | 	using std::enable_if;
 13 | 	using std::declval;
 14 | 	using std::is_empty;
 15 | 	using std::conditional;
 16 | 
	// Compile-time type comparison: `value` is true only when both arguments are the same type.
	template <class LHS, class RHS>
	struct type_match
	{
		static constexpr bool value = false;
	};

	// Chosen when both template arguments name the identical type.
	template <class SAME>
	struct type_match<SAME, SAME>
	{
		static constexpr bool value = true;
	};
 28 | 
	// Compile-time division rounding up: `value` is (NUM + DEN - 1) / DEN.
	template<int NUM, int DEN>
	struct static_divup
	{
		static constexpr int value = (NUM + DEN - 1) / DEN;
	};
 34 | 
	// Compile-time population count: `value` is the number of set bits in the
	// two's-complement representation of X.
	template<int X>
	struct static_popcnt
	{
		// The shift is performed on the unsigned representation so that zeros are
		// shifted in from the top. A signed shift of a negative X would keep the sign
		// bit set, never reach the <0> specialization and recurse without end.
		static const int value = ((X & 0x1) + static_popcnt< static_cast<int>(static_cast<unsigned int>(X) >> 1) >::value);
	};
	// Recursion anchor: no bits left.
	template<>
	struct static_popcnt<0>
	{
		static const int value = 0;
	};
 45 | 
	// Compile-time count of leading zero bits of a 32-bit value.
	// BITS is shifted left one position per recursion step while SHIFTED counts the
	// steps; the first step that finds bit 31 set yields the result.
	template<unsigned int BITS, int SHIFTED = 0>
	struct static_clz
	{
		static constexpr int value = (BITS & 0x80000000) ? SHIFTED : static_clz< (BITS << 1), SHIFTED + 1 >::value;
	};
	// Recursion anchor after 32 shifts: no set bit was found.
	template<unsigned int BITS>
	struct static_clz<BITS, 32>
	{
		static constexpr int value = 32;
	};
 56 | 
	// Compile-time maximum of a non-empty list of ints.
	template<int... VALUES>
	struct static_max;

	// A single value is its own maximum.
	template<int ONLY>
	struct static_max<ONLY>
	{
		static constexpr int value = ONLY;
	};

	// Compares the first value with the maximum of the remaining ones.
	template<int FIRST, int... REST>
	struct static_max<FIRST, REST...>
	{
		static constexpr int next_value = static_max<REST...>::value;
		static constexpr int value = (next_value < FIRST) ? FIRST : next_value;
	};
 72 | 
	// Compile-time minimum of a non-empty list of ints.
	template<int... VALUES>
	struct static_min;

	// A single value is its own minimum.
	template<int ONLY>
	struct static_min<ONLY>
	{
		static constexpr int value = ONLY;
	};

	// Compares the first value with the minimum of the remaining ones.
	template<int FIRST, int... REST>
	struct static_min<FIRST, REST...>
	{
		static constexpr int next_value = static_min<REST...>::value;
		static constexpr int value = (next_value > FIRST) ? FIRST : next_value;
	};
 88 | 
	// Selects the I-th type (0-based) from a list of types and exposes it as `type`.
	template<int I, class... NCS>
	struct choose;

	// Drops the first candidate and continues with index I - 1.
	template<int I, class HEAD, class... TAIL>
	struct choose<I, HEAD, TAIL...>
	{
		using type = typename choose<I - 1, TAIL...>::type;
	};
	// Index 0 reached: the first remaining candidate is the result.
	template<class HEAD, class... TAIL>
	struct choose<0, HEAD, TAIL...>
	{
		using type = HEAD;
	};
102 | 
103 | 
104 | 	template<bool COND>
105 | 	struct conditional_eval;
106 | 
107 | 	template<>
108 | 	struct conditional_eval<true>
109 | 	{
110 | 		template<class F>
111 | 		DUAL_BUILD_FUNCTION static void eval(F f)
112 | 		{
113 | 			f();
114 | 		}
115 | 	};
116 | 	template<>
117 | 	struct conditional_eval<false>
118 | 	{
119 | 		template<class F>
120 | 		DUAL_BUILD_FUNCTION static void eval(F f)
121 | 		{
122 | 		}
123 | 	};
124 | 
125 | 	template<template<int...> class CONSUMER, int V, int END, int STEP, bool DONE, int... VALUES>
126 | 	struct static_for_impl
127 | 	{
128 | 		using type = typename static_for_impl < CONSUMER, V+STEP, END, STEP, (V + STEP < END), VALUES..., V>::type;
129 | 	};
130 | 	template<template<int...> class CONSUMER, int V, int END, int STEP, int... VALUES>
131 | 	struct static_for_impl<CONSUMER, V, END, STEP, false, VALUES...>
132 | 	{
133 | 		using type = CONSUMER <VALUES...>;
134 | 	};
135 | 
136 | 	template<template<int...> class CONSUMER, int END, int BEGIN = 0, int STEP = 1>
137 | 	struct static_for
138 | 	{
139 | 		using type = typename static_for_impl < CONSUMER, BEGIN, END, STEP, (BEGIN < END)>::type;
140 | 	};
141 | 
142 | 
143 | 	template<class...> 
144 | 	struct type_list { };
145 | 
146 | 	template<template<class...> class APPLIER, class COMBLIST, class... TYPELISTS>
147 | 	struct apply_list_impl;
148 | 	template<template<class...> class APPLIER, class... DONETYPES, class... NEWTYPES, class... REMTYPELISTS>
149 | 	struct apply_list_impl<APPLIER, type_list<DONETYPES...>, type_list<NEWTYPES...>, REMTYPELISTS...>
150 | 	{
151 | 		using type = typename apply_list_impl<APPLIER, type_list<DONETYPES..., NEWTYPES...>, REMTYPELISTS...>::type;
152 | 	};
153 | 	template<template<class...> class APPLIER, class... DONETYPES>
154 | 	struct apply_list_impl<APPLIER, type_list<DONETYPES...>>
155 | 	{
156 | 		using type = APPLIER<DONETYPES...>;
157 | 	};
158 | 	template<template<class...> class APPLIER, class... TYPELISTS>
159 | 	struct apply_list
160 | 	{
161 | 		using type = typename apply_list_impl<APPLIER, type_list<>, TYPELISTS... >::type;
162 | 	};
163 | 
164 | 	template<class INVERSE_LIST, class FORWARD_LIST>
165 | 	struct inverse_list_impl;
166 | 	template<class... INVERSE_TYPES, class FIRST, class... REMAINING>
167 | 	struct inverse_list_impl<type_list<INVERSE_TYPES...>, type_list<FIRST, REMAINING...>>
168 | 	{
169 | 		using type = typename inverse_list_impl<type_list<FIRST, INVERSE_TYPES...>, type_list<REMAINING...>>::type;
170 | 	};
171 | 	template<class INVERSE_LIST>
172 | 	struct inverse_list_impl<INVERSE_LIST, type_list<>>
173 | 	{
174 | 		using type = INVERSE_LIST;
175 | 	};
176 | 	template<class TYPELIST>
177 | 	struct inverse_list
178 | 	{
179 | 		using type = typename inverse_list_impl<type_list<>, TYPELIST>::type;
180 | 	};
181 | 
182 | 
183 | 	template<int... >
184 | 	struct sequence { };
185 | 
186 | 	template<template<int...> class APPLIER, class SEQUENCE>
187 | 	struct apply_sequence;
188 | 	template<template<int...> class APPLIER, int... NUMS>
189 | 	struct apply_sequence<APPLIER, sequence<NUMS...>>
190 | 	{
191 | 		using type = APPLIER<NUMS...>;
192 | 	};
193 | 
194 | 	template<unsigned MASK, bool TAKE, class TAKEN_SEQUENCE, class REM_SEQUENCE>
195 | 	struct select_from_impl;
196 | 	template<unsigned MASK, int... TAKEN, int NUM, int... NUMS>
197 | 	struct select_from_impl<MASK, true, sequence<TAKEN...>, sequence<NUM, NUMS...>>
198 | 	{
199 | 		using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<TAKEN..., NUM>, sequence<NUMS...> > ::type;
200 | 	};
201 | 	template<unsigned MASK, int... TAKEN, int NUM, int... NUMS>
202 | 	struct select_from_impl<MASK, false, sequence<TAKEN...>, sequence<NUM, NUMS...>>
203 | 	{
204 | 		using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<TAKEN...>, sequence<NUMS...> > ::type;
205 | 	};
206 | 	template<unsigned MASK, bool TAKE, int... TAKEN>
207 | 	struct select_from_impl<MASK, TAKE, sequence<TAKEN...>, sequence<>>
208 | 	{
209 | 		using type = sequence<TAKEN...>;
210 | 	};
211 | 	template<unsigned MASK, class SEQUENCE>
212 | 	struct select_from
213 | 	{
214 | 		using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<>, SEQUENCE > ::type;
215 | 	};
216 | 	
217 | 
218 | 	template<template<int> class LOGICAL, class SEQUENCE>
219 | 	struct sequence_any;
220 | 	template<template<int> class LOGICAL, int NUM, int...NUMS>
221 | 	struct sequence_any<LOGICAL, sequence<NUM, NUMS...> >
222 | 	{
223 | 		static const bool value = LOGICAL<NUM>::value || sequence_any<LOGICAL, sequence<NUMS...>>::value;
224 | 	};
225 | 	template<template<int> class LOGICAL>
226 | 	struct sequence_any<LOGICAL, sequence<> >
227 | 	{
228 | 		static const bool value = false;
229 | 	};
230 | 
231 | 	template<int A>
232 | 	struct static_is_zero
233 | 	{
234 | 		static const bool value = false;
235 | 	};
236 | 	template<>
237 | 	struct static_is_zero<0>
238 | 	{
239 | 		static const bool value = true;
240 | 	};
241 | 
242 | 
243 | #endif //INCLUDED_HIS_META_UTILS


--------------------------------------------------------------------------------
/spECK/include/multi_arch_build.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef __CUDACC__
 4 | #define DUAL_BUILD_FUNCTION __host__ __device__
 5 | #else
 6 | #define DUAL_BUILD_FUNCTION 
 7 | #endif
 8 | 
 9 | #ifndef __CUDA_ARCH__
// Host-side stand-in, compiled only when __CUDA_ARCH__ is not defined:
// returns the float whose object representation equals the bits of t.
// NOTE(review): reading an unsigned through a float pointer relies on the compiler
// tolerating this kind of type punning.
inline float __uint_as_float(unsigned t)
{
	const float* const asFloat = reinterpret_cast<const float*>(&t);
	return *asFloat;
}
14 | #endif


--------------------------------------------------------------------------------
/spECK/include/spECKConfig.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include "CUDATools/stream.h"
 4 | #include "stdio.h"
 5 | 
 6 | namespace spECK {
 7 |     // get device attributes for best performance and creates cudaStreams
 8 |     struct spECKConfig {
 9 |         int sm;
10 |         int maxStaticSharedMemoryPerBlock;
11 |         int maxDynamicSharedMemoryPerBlock;
12 |         std::vector<CUstream> streams;
13 |         cudaEvent_t completeStart = 0, completeEnd = 0, individualStart = 0, individualEnd = 0;
14 | 
15 |         static spECKConfig initialize(int cudaDeviceNumber) {
16 | 			spECKConfig config;
17 |             cudaDeviceProp prop;
18 |             cudaGetDeviceProperties(&prop, cudaDeviceNumber);
19 |             config.sm = prop.multiProcessorCount;
20 |             config.maxStaticSharedMemoryPerBlock = prop.sharedMemPerBlock;
21 |             config.maxDynamicSharedMemoryPerBlock = std::max(prop.sharedMemPerBlockOptin, prop.sharedMemPerBlock);
22 | 
23 |             for (int i = 0; i < 6; i++) {
24 |                 config.streams.push_back(0);
25 |                 cudaStreamCreate(&config.streams[i]);
26 |             }
27 |             cudaEventCreate(&config.completeStart);
28 |             cudaEventCreate(&config.completeEnd);
29 |             cudaEventCreate(&config.individualStart);
30 |             cudaEventCreate(&config.individualEnd);
31 |             return config;
32 |         }
33 | 
34 |         void cleanup() {
35 |             for (auto s : streams) {
36 |                 cudaStreamDestroy(s);
37 |             }
38 |             cudaEventDestroy(completeStart);
39 |             cudaEventDestroy(completeEnd);
40 |             cudaEventDestroy(individualStart);
41 |             cudaEventDestroy(individualEnd);
42 |             streams.clear();
43 |         }
44 | 
45 |         ~spECKConfig() {
46 |             // cleanup();
47 |         }
48 | 
49 |     private:
50 | 		spECKConfig() {
51 | 
52 |         }
53 |     };
54 | }


--------------------------------------------------------------------------------
/spECK/readme.md:
--------------------------------------------------------------------------------
 1 | # Get started
 2 | 1 Profile speck
 3 | 
 4 | 1.1 ``` $> make speck ```
 5 | 
 6 | 1.2 ``` $> ./speck webbase-1M ```
 7 | 
 8 | 2 Overall performance of speck
 9 | 
10 | 2.1 ``` $> make reg_speck ```
11 | 
12 | 2.2 ``` $> ./reg_speck webbase-1M ```
13 | 


--------------------------------------------------------------------------------
/spECK/source/COO.cpp:
--------------------------------------------------------------------------------
  1 | #include "COO.h"
  2 | #include "Vector.h"
  3 | 
  4 | #include <string>
  5 | #include <sstream>
  6 | #include <fstream>
  7 | #include <stdexcept>
  8 | #include <iterator>
  9 | #include <vector>
 10 | 
 11 | namespace {
 12 | 	template<typename VALUE_TYPE>
 13 | 	struct DataTypeValidator {
 14 | 		static const bool validate(std::string type) {
 15 | 			return false;
 16 | 		}
 17 | 	};
 18 | 
 19 | 	template<>
 20 | 	struct DataTypeValidator<float> {
 21 | 		static const bool validate(std::string type) {
 22 | 			return type.compare("real") == 0 || type.compare("integer") == 0 || type.compare("double") == 0;
 23 | 		}
 24 | 	};
 25 | 	template<>
 26 | 	struct DataTypeValidator<double> {
 27 | 		static const bool validate(std::string type) {
 28 | 			return type.compare("real") == 0 || type.compare("integer") == 0 || type.compare("double") == 0;
 29 | 		}
 30 | 	};
 31 | 
 32 | 	template<>
 33 | 	struct DataTypeValidator<int> {
 34 | 		static const bool validate(std::string type) {
 35 | 			return type.compare("integer") == 0;
 36 | 		}
 37 | 	};
 38 | }
 39 | 
 40 | template<typename T>
 41 | void COO<T>::alloc(size_t r, size_t c, size_t n)
 42 | {
 43 | 	rows = r;
 44 | 	cols = c;
 45 | 	nnz = n;
 46 | 
 47 | 	data = std::make_unique<T[]>(n);
 48 | 	row_ids = std::make_unique<unsigned int[]>(n);
 49 | 	col_ids = std::make_unique<unsigned int[]>(n);
 50 | }
 51 | 
 52 | template<typename T>
 53 | COO<T> loadMTX(const char * file)
 54 | {
 55 | 	std::ifstream fstream(file);
 56 | 	if (!fstream.is_open())
 57 | 		throw std::runtime_error(std::string("could not open \"") + file + "\"");
 58 | 	
 59 | 	COO<T> resmatrix;
 60 | 	size_t num_rows, num_columns, num_non_zeroes;
 61 | 
 62 | 	size_t line_counter = 0;
 63 | 	std::string line;
 64 | 	bool pattern = false;
 65 | 	bool hermitian = false;
 66 | 	// read header;
 67 | 	std::getline(fstream, line);
 68 | 	if (line.compare(0, 32, "%%MatrixMarket matrix coordinate") != 0)
 69 | 		throw std::runtime_error("Can only read MatrixMarket format that is in coordinate form");
 70 | 	std::istringstream iss(line);
 71 | 	std::vector<std::string> tokens{ std::istream_iterator<std::string>{iss}, std::istream_iterator<std::string>{} };
 72 | 	bool complex = false;
 73 | 
 74 | 	if (tokens[3] == "pattern")
 75 | 		pattern = true;
 76 | 	else if (tokens[3] == "complex")
 77 | 		complex = true;
 78 | 	else if (DataTypeValidator<T>::validate(tokens[3]) == false)
 79 | 		throw std::runtime_error("MatrixMarket data type does not match matrix format");
 80 | 	bool symmetric = false;
 81 | 	if (tokens[4].compare("general") == 0)
 82 | 		symmetric = false;
 83 | 	else if (tokens[4].compare("symmetric") == 0)
 84 | 		symmetric = true;
 85 | 	else if (tokens[4].compare("Hermitian") == 0)
 86 | 		hermitian = true;
 87 | 	else
 88 | 		throw std::runtime_error("Can only read MatrixMarket format that is either symmetric, general or hermitian");
 89 | 
 90 | 	while (std::getline(fstream, line))
 91 | 	{
 92 | 		++line_counter;
 93 | 		if (line[0] == '%')
 94 | 			continue;
 95 | 		std::istringstream liness(line);
 96 | 		liness >> num_rows >> num_columns >> num_non_zeroes;
 97 | 		if (liness.fail())
 98 | 			throw std::runtime_error(std::string("Failed to read matrix market header from \"") + file + "\"");
 99 | 		//std::cout << "Read matrix header" << std::endl;
100 | 		//std::cout << "rows: " << rows << " columns: " << columns << " nnz: " << nnz << std::endl;
101 | 		break;
102 | 	}
103 | 
104 | 	size_t reserve = num_non_zeroes;
105 | 	if (symmetric || hermitian)
106 | 		reserve *= 2;
107 | 
108 | 	resmatrix.alloc(num_rows, num_columns, reserve);
109 | 
110 | 	//read data
111 | 	size_t read = 0;
112 | 	while (std::getline(fstream, line))
113 | 	{
114 | 		++line_counter;
115 | 		if (line[0] == '%')
116 | 			continue;
117 | 
118 | 		std::istringstream liness(line);
119 | 
120 | 
121 | 		do
122 | 		{
123 | 			char ch;
124 | 			liness.get(ch);
125 | 			if (!isspace(ch))
126 | 			{
127 | 				liness.putback(ch);
128 | 				break;
129 | 			}
130 | 
131 | 		} while (!liness.eof());
132 | 		if (liness.eof() || line.length() == 0)
133 | 			continue;
134 | 
135 | 		uint32_t r, c;
136 | 		T d;
137 | 		liness >> r >> c;
138 | 		if (pattern)
139 | 			d = 1;
140 | 		else
141 | 			liness >> d;
142 | 		if (liness.fail())
143 | 			throw std::runtime_error(std::string("Failed to read data at line ") + std::to_string(line_counter) + " from matrix market file \"" + file + "\"");
144 | 		if (r > num_rows)
145 | 			throw std::runtime_error(std::string("Row index out of bounds at line  ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\"");
146 | 		if (c > num_columns)
147 | 			throw std::runtime_error(std::string("Column index out of bounds at line  ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\"");
148 | 		
149 | 		resmatrix.row_ids[read] = r - 1;
150 | 		resmatrix.col_ids[read] = c - 1;
151 | 		resmatrix.data[read] = d;
152 | 		++read;
153 | 		if ((symmetric || hermitian) && r != c)
154 | 		{
155 | 			resmatrix.row_ids[read] = c - 1;
156 | 			resmatrix.col_ids[read] = r - 1;
157 | 			resmatrix.data[read] = d;
158 | 			++read;
159 | 		}
160 | 	}
161 | 
162 | 	resmatrix.nnz = read;
163 | 	return resmatrix;
164 | }
165 | 
166 | template<typename T>
167 | COO<T> loadCOO(const char * file)
168 | {
169 | 	return COO<T>();
170 | }
171 | 
172 | template<typename T>
173 | void storeCOO(const COO<T>& mat, const char * file)
174 | {
175 | 
176 | }
177 | 
178 | template<typename T>
179 | void spmv(DenseVector<T>& res, const COO<T>& m, const DenseVector<T>& v, bool transpose)
180 | {
181 | 	if (transpose && v.size != m.rows)
182 | 		throw std::runtime_error("SPMV dimensions mismatch");
183 | 	if (!transpose && v.size != m.cols)
184 | 		throw std::runtime_error("SPMV dimensions mismatch");
185 | 
186 | 	size_t outsize = transpose ? m.cols : m.rows;
187 | 	if (res.size < outsize)
188 | 		//res.data = std::make_unique<T[]>(outsize);
189 |         res.data = new T [outsize];
190 | 	res.size = outsize;
191 | 
192 | 	std::fill(&res.data[0], &res.data[0] + outsize, 0);
193 | 
194 | 	
195 | 	if(transpose)
196 | 		for (size_t i = 0; i < m.nnz; ++i)
197 | 			res.data[m.col_ids[i]] += m.data[i] * v.data[m.row_ids[i]];
198 | 	else
199 | 		for (size_t i = 0; i < m.nnz; ++i)
200 | 			res.data[m.row_ids[i]] += m.data[i] * v.data[m.col_ids[i]];
201 | }
202 | 
203 | 
204 | template void COO<float>::alloc(size_t, size_t, size_t);
205 | template void COO<double>::alloc(size_t, size_t, size_t);
206 | 
207 | template COO<float> loadMTX(const char * file);
208 | template COO<double> loadMTX(const char * file);
209 | 
210 | template void spmv(DenseVector<float>& res, const COO<float>& m, const DenseVector<float>& v, bool transpose);
211 | template void spmv(DenseVector<double>& res, const COO<double>& m, const DenseVector<double>& v, bool transpose);
212 | 


--------------------------------------------------------------------------------
/spECK/source/CSR.cpp:
--------------------------------------------------------------------------------
  1 | #include "CSR.h"
  2 | #include "COO.h"
  3 | 
  4 | #include <stdint.h>
  5 | typedef unsigned long long int uint64_t;
  6 | #include <string>
  7 | #include <fstream>
  8 | #include <stdexcept>
  9 | #include <iterator>
 10 | #include <vector>
 11 | #include <algorithm>
 12 | #include <memory>
 13 | #include <iostream>
 14 | #include <cassert>
 15 | 
 16 | namespace {
	// Record that loadCSR/storeCSR read and write as raw bytes right after the file
	// header, so the member order must stay as it is.
	// Default state: scaling 1, not transposed.
	template<typename VALUE_TYPE>
	struct State
	{
		typedef VALUE_TYPE ValueType;

		ValueType scaling;
		bool transpose;

		State() : State(ValueType(1), false) { }
		State(ValueType scaling, bool transpose) : scaling(scaling), transpose(transpose) { }
	};
 28 | 
 29 | 	struct CSRIOHeader
 30 | 	{
 31 | 		static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' };
 32 | 
 33 | 		char magic[sizeof(Magic)];
 34 | 		uint64_t typesize;
 35 | 		uint64_t compresseddir;
 36 | 		uint64_t indexsize;
 37 | 		uint64_t fixedoffset;
 38 | 		uint64_t offsetsize;
 39 | 		uint64_t num_rows, num_columns;
 40 | 		uint64_t num_non_zeroes;
 41 | 
 42 | 		CSRIOHeader() = default;
 43 | 
 44 | 
 45 | 		template<typename T>
 46 | 		static uint64_t typeSize()
 47 | 		{
 48 | 			return sizeof(T);
 49 | 		}
 50 | 
 51 | 		template<typename T>
 52 | 		CSRIOHeader(const CSR<T>& mat)
 53 | 		{
 54 | 			for (size_t i = 0; i < sizeof(Magic); ++i)
 55 | 				magic[i] = Magic[i];
 56 | 			typesize = typeSize<T>();
 57 | 			compresseddir = 0;
 58 | 			indexsize = typeSize<uint32_t>();
 59 | 			fixedoffset = 0;
 60 | 			offsetsize = typeSize<uint32_t>();
 61 | 
 62 | 			num_rows = mat.rows;
 63 | 			num_columns = mat.cols;
 64 | 			num_non_zeroes = mat.nnz;
 65 | 		}
 66 | 
 67 | 		bool checkMagic() const
 68 | 		{
 69 | 			for (size_t i = 0; i < sizeof(Magic); ++i)
 70 | 				if (magic[i] != Magic[i])
 71 | 					return false;
 72 | 			return true;
 73 | 		}
 74 | 	};
 75 | 	constexpr char CSRIOHeader::Magic[];
 76 | }
 77 | 
 78 | template<typename T>
 79 | void CSR<T>::alloc(size_t r, size_t c, size_t n)
 80 | {
 81 | 	rows = r;
 82 | 	cols = c;
 83 | 	nnz = n;
 84 | 
 85 | 	//data = std::make_unique<T[]>(n);
 86 | 	//col_ids = std::make_unique<unsigned int[]>(n);
 87 | 	//row_offsets = std::make_unique<unsigned int[]>(r+1);
 88 |     data = new T [n];
 89 |     col_ids = new int [n];
 90 |     row_offsets = new int [r+1];
 91 | }
 92 | 
 93 | template<typename T>
 94 | CSR<T> loadCSR(const char * file)
 95 | {
 96 | 	std::ifstream fstream(file, std::fstream::binary);
 97 | 	if (!fstream.is_open())
 98 | 		throw std::runtime_error(std::string("could not open \"") + file + "\"");
 99 | 
100 | 	CSRIOHeader header;
101 | 	State<T> state;
102 | 	fstream.read(reinterpret_cast<char*>(&header), sizeof(CSRIOHeader));
103 | 	if (!fstream.good())
104 | 		throw std::runtime_error("Could not read CSR header");
105 | 	if (!header.checkMagic())
106 | 		throw std::runtime_error("File does not appear to be a CSR Matrix");
107 | 
108 | 	fstream.read(reinterpret_cast<char*>(&state), sizeof(state));
109 | 	if (!fstream.good())
110 | 		throw std::runtime_error("Could not read CompressedMatrix state");
111 | 	if (header.typesize != CSRIOHeader::typeSize<T>())
112 | 		throw std::runtime_error("File does not contain a CSR matrix with matching type");
113 | 
114 | 	CSR<T> res;
115 | 	res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes);
116 | 
117 | 	fstream.read(reinterpret_cast<char*>(&res.data[0]), res.nnz * sizeof(T));
118 | 	fstream.read(reinterpret_cast<char*>(&res.col_ids[0]), res.nnz * sizeof(unsigned int));
119 | 	fstream.read(reinterpret_cast<char*>(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int));
120 | 
121 | 	if (!fstream.good())
122 | 		throw std::runtime_error("Could not read CSR matrix data");
123 | 
124 | 	return res;
125 | }
126 | 
127 | template<typename T>
128 | void storeCSR(const CSR<T>& mat, const char * file)
129 | {
130 | 	std::ofstream fstream(file, std::fstream::binary);
131 | 	if (!fstream.is_open())
132 | 		throw std::runtime_error(std::string("could not open \"") + file + "\"");
133 | 
134 | 	CSRIOHeader header(mat);
135 | 	State<T> state;
136 | 	fstream.write(reinterpret_cast<char*>(&header), sizeof(CSRIOHeader));
137 | 	fstream.write(reinterpret_cast<const char*>(&state), sizeof(state));
138 | 	fstream.write(reinterpret_cast<char*>(&mat.data[0]), mat.nnz * sizeof(T));
139 | 	fstream.write(reinterpret_cast<char*>(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int));
140 | 	fstream.write(reinterpret_cast<char*>(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int));
141 | 
142 | }
143 | 
144 | template<typename T>
145 | void spmv(DenseVector<T>& res, const CSR<T>& m, const DenseVector<T>& v, bool transpose)
146 | {
147 | 	if (transpose && v.size != m.rows)
148 | 		throw std::runtime_error("SPMV dimensions mismatch");
149 | 	if (!transpose && v.size != m.cols)
150 | 		throw std::runtime_error("SPMV dimensions mismatch");
151 | 
152 | 	size_t outsize = transpose ? m.cols : m.rows;
153 | 	if (res.size < outsize)
154 | 		//res.data = std::make_unique<T[]>(outsize);
155 | 		res.data = new T [outsize];
156 | 	res.size = outsize;
157 | 
158 | 	if (transpose)
159 | 	{
160 | 		std::fill(&res.data[0], &res.data[0] + m.cols, 0);
161 | 		for (size_t i = 0; i < m.rows; ++i)
162 | 		{
163 | 			for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o)
164 | 				res.data[m.col_ids[o]] += m.data[o] * v.data[i];
165 | 		}
166 | 	}
167 | 	else
168 | 	{
169 | 		for (size_t i = 0; i < m.rows; ++i)
170 | 		{
171 | 			T val = 0;
172 | 			for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o)
173 | 				val += m.data[o] * v.data[m.col_ids[o]];
174 | 			res.data[i] = val;
175 | 		}
176 | 	}
177 | }
178 | 
179 | template<typename T>
180 | void convert(CSR<T>& res, const COO<T>& coo)
181 | {
182 | 	struct Entry
183 | 	{
184 | 		unsigned int r, c;
185 | 		T v;
186 | 		bool operator < (const Entry& other)
187 | 		{
188 | 			if (r != other.r) 
189 | 				return r < other.r;
190 | 			return c < other.c;
191 | 		}
192 | 	};
193 | 
194 | 	std::vector<Entry> entries;
195 | 	//std::cout << "coo.nnz" << coo.nnz << std::endl;
196 | 	entries.reserve(coo.nnz);
197 | 	for (size_t i = 0; i < coo.nnz; ++i)
198 | 		entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] });
199 | 	std::sort(std::begin(entries), std::end(entries));
200 | 
201 | 	res.alloc(coo.rows, coo.cols, coo.nnz);
202 | 	std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0);
203 | 	for (size_t i = 0; i < coo.nnz; ++i)
204 | 	{
205 | 		res.data[i] = entries[i].v;
206 | 		res.col_ids[i] = entries[i].c;
207 | 		++res.row_offsets[entries[i].r];
208 | 	}
209 | 
210 | 	unsigned int off = 0;
211 | 	for (size_t i = 0; i < coo.rows; ++i)
212 | 	{
213 | 		unsigned int n = off + res.row_offsets[i];
214 | 		res.row_offsets[i] = off;
215 | 		off = n;
216 | 	}
217 | 	res.row_offsets[coo.rows] = off;
218 | }
219 | 
220 | template<typename T>
221 | CSR<T>::CSR(const CSR<T> &A, int r, int c, int row_start, int col_start){
222 |     assert(r + row_start <= A.rows && "matrix subsect error M");
223 |     assert(c + col_start <= A.cols && "matrix subsect error N");
224 |     int row_end = row_start + r;
225 |     int col_end = col_start + c;
226 |     rows = r;
227 |     cols = c;
228 |     int *row_size = new int [rows];
229 |     memset(row_size, 0, rows*sizeof(int));
230 |     for(int i = row_start; i < row_end; i++){
231 |         for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){
232 |             if(A.col_ids[j]>= col_start && A.col_ids[j] < col_end){
233 |                 row_size[i - row_start]++;
234 |             }
235 |         }
236 |     }
237 |     int nnz = 0;
238 |     for(int i =0; i < rows; i++){
239 |         nnz += row_size[i];
240 |     }
241 |     alloc(rows, cols, nnz);
242 | 
243 |     row_offsets[0] = 0;
244 |     for(int i = 0; i < rows; i++){
245 |         row_offsets[i+1] = row_offsets[i] + row_size[i];
246 |     }
247 |     delete [] row_size;
248 | 
249 |     for(int i = row_start; i < row_end; i++){
250 |         int jj = row_offsets[i - row_start];
251 |         for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){
252 |             if(A.col_ids[j]>= col_start && A.col_ids[j] < col_end){
253 |                 col_ids[jj] = A.col_ids[j] - col_start;
254 |                 data[jj++] = A.data[j];
255 |             }
256 |         }
257 |     }
258 | 
259 | }
260 | template <typename T>
261 | CSR<T>& CSR<T>::operator=(const CSR<T>& src){
262 |     alloc(src.rows, src.cols, src.nnz);
263 | 	rows = src.rows; nnz = src.nnz; cols = src.cols;
264 | 	memcpy(data, src.data, nnz * sizeof(T));
265 | 	memcpy(col_ids, src.col_ids, nnz * sizeof(int));
266 | 	memcpy(row_offsets, src.row_offsets, (rows + 1) * sizeof(int));
267 |     return *this;
268 | }
269 | 
270 | template class CSR<float>;
271 | template class CSR<double>;
272 | 
273 | //template void CSR<float>::alloc(size_t, size_t, size_t);
274 | //template void CSR<double>::alloc(size_t, size_t, size_t);
275 | 
276 | template CSR<float> loadCSR(const char * file);
277 | template CSR<double> loadCSR(const char * file);
278 | 
279 | template void storeCSR(const CSR<float>& mat, const char * file);
280 | template void storeCSR(const CSR<double>& mat, const char * file);
281 | 
282 | template void spmv(DenseVector<float>& res, const CSR<float>& m, const DenseVector<float>& v, bool transpose);
283 | template void spmv(DenseVector<double>& res, const CSR<double>& m, const DenseVector<double>& v, bool transpose);
284 | 
285 | 
286 | template void convert(CSR<float>& res, const COO<float>& coo);
287 | template void convert(CSR<double>& res, const COO<double>& coo);
288 | 


--------------------------------------------------------------------------------
/spECK/source/Config.cpp:
--------------------------------------------------------------------------------
 1 | #include "Config.h"
 2 | Config *Config::_instance = nullptr;
 3 | 
 4 | void Config::init(std::string path)
 5 | {
 6 | 	_instance = new Config(path);
 7 | }
 8 | 
 9 | void Config::init()
10 | {
11 | 	_instance = new Config();
12 | }
13 | 
14 | int Config::getInt(Key key, int fallback)
15 | {
16 | 	if (Instance().overrides.find(key) != Instance().overrides.end())
17 | 		return Instance().overrides[key];
18 | 
19 | 	return Instance().reader.GetInteger("", Instance().keyToString[key], fallback);
20 | }
21 | 
22 | int Config::setInt(Key key, int newVal)
23 | {
24 | 	return Instance().overrides[key] = newVal;
25 | }
26 | 
27 | string Config::getString(Key key, std::string fallback)
28 | {
29 | 	return Instance().reader.Get("", Instance().keyToString[key], fallback);
30 | }
31 | 
32 | bool Config::getBool(Key key, bool fallback)
33 | {
34 | 	return Instance().reader.GetBoolean("", Instance().keyToString[key], fallback);
35 | }
36 | 
37 | float Config::getFloat(Key key, float fallback)
38 | {
39 | 	return (float) Instance().reader.GetReal("", Instance().keyToString[key], fallback);
40 | }


--------------------------------------------------------------------------------
/spECK/source/DataLoader.cpp:
--------------------------------------------------------------------------------
  1 | #include "DataLoader.h"
  2 | 
  3 | #include <iostream>
  4 | #include "COO.h"
  5 | #include <cuSparseMultiply.h>
  6 | 
  7 | template<typename T>
  8 | std::string typeExtension();
  9 | template<>
 10 | std::string typeExtension<float>()
 11 | {
 12 | 	return std::string("");
 13 | }
 14 | template<>
 15 | std::string typeExtension<double>()
 16 | {
 17 | 	return std::string("d_");
 18 | }
 19 | 
 20 | template class DataLoader<float>;
 21 | template class DataLoader<double>;
 22 | 
 23 | template <typename ValueType>
 24 | DataLoader<ValueType>::DataLoader(std::string path, std::string path2) : matrices()
 25 | {
 26 | 	std::string csrPath = path + typeExtension<ValueType>() + ".hicsr";
 27 | 
 28 | 	try
 29 | 	{
 30 | 		//std::cout << "trying to load csr file \"" << csrPath << "\"\n";
 31 | 		matrices.cpuA = loadCSR<ValueType>(csrPath.c_str());
 32 | 		//std::cout << "successfully loaded: \"" << csrPath << "\"\n";
 33 | 	}
 34 | 	catch (std::exception& ex)
 35 | 	{
 36 | 		//std::cout << "could not load csr file:\n\t" << ex.what() << "\n";
 37 | 		try
 38 | 		{
 39 | 			//std::cout << "trying to load mtx file \"" << path << "\"\n";
 40 | 			COO<ValueType> cooMat = loadMTX<ValueType>(path.c_str());
 41 | 			convert(matrices.cpuA, cooMat);
 42 | 			//std::cout << "successfully loaded and converted: \"" << csrPath << "\"\n";
 43 | 		}
 44 | 		catch (std::exception& ex)
 45 | 		{
 46 | 			std::cout << ex.what() << std::endl;
 47 | 			std::cout << "could not load mtx file: \"" << path << "\"\n";
 48 | 			throw "could not load mtx file";
 49 | 		}
 50 | 
 51 | 		try
 52 | 		{
 53 | 			//std::cout << "write csr file for future use in" << csrPath.c_str() << "\n";
 54 | 			//storeCSR(matrices.cpuA, csrPath.c_str());
 55 | 		}
 56 | 		catch (std::exception& ex)
 57 | 		{
 58 | 			std::cout << ex.what() << std::endl;
 59 | 		}
 60 | 	}
 61 | 	
 62 | 	//cuSPARSE::CuSparseTest<ValueType> cuSparse;
 63 | 	
 64 | 	//calculate the transpose if matrix is not square
 65 |     if(path == path2){
 66 | 		convert(matrices.cpuB, matrices.cpuA, 0);
 67 |     }
 68 |     else{
 69 |         try
 70 | 		{
 71 | 			//std::cout << "trying to load mtx file \"" << path << "\"\n";
 72 | 			COO<ValueType> cooMat = loadMTX<ValueType>(path2.c_str());
 73 | 			convert(matrices.cpuB, cooMat);
 74 | 			//std::cout << "successfully loaded and converted: \"" << csrPath << "\"\n";
 75 | 		}
 76 | 		catch (std::exception& ex)
 77 | 		{
 78 | 			std::cout << ex.what() << std::endl;
 79 | 			std::cout << "could not load mtx file: \"" << path << "\"\n";
 80 | 			throw "could not load mtx file";
 81 | 		}
 82 |         if(matrices.cpuA.cols == matrices.cpuB.rows){
 83 |             // do nothing
 84 |         }
 85 |         else if(matrices.cpuA.cols < matrices.cpuB.rows){
 86 |             CSR<ValueType> tmp(matrices.cpuB, matrices.cpuA.cols, matrices.cpuB.cols, 0, 0);
 87 |             matrices.cpuB = tmp;
 88 |         }
 89 |         else{
 90 |             CSR<ValueType> tmp(matrices.cpuA, matrices.cpuA.rows, matrices.cpuB.rows, 0, 0);
 91 |             matrices.cpuA = tmp;
 92 |         }
 93 |     }
 94 | 
 95 | 	//if (matrices.gpuA.rows != matrices.gpuA.cols)
 96 | 	//{
 97 | 	//	cuSparse.Transpose(matrices.gpuA, matrices.gpuB);
 98 | 	//	convert(matrices.cpuB, matrices.gpuB);
 99 | 	//}
100 | 	//else
101 | 	//{
102 | 	//	convert(matrices.gpuB, matrices.cpuA, 0);
103 | 	//	convert(matrices.cpuB, matrices.cpuA, 0);
104 | 	//}
105 | 	convert(matrices.gpuA, matrices.cpuA, 0);
106 | 	convert(matrices.gpuB, matrices.cpuB, 0);
107 | }
108 | 


--------------------------------------------------------------------------------
/spECK/source/Executor.cpp:
--------------------------------------------------------------------------------
  1 | #include "cuda_runtime.h"
  2 | #include "Executor.h"
  3 | #include "Multiply.h"
  4 | #include "DataLoader.h"
  5 | #include <iomanip>
  6 | #include "Config.h"
  7 | #include "Compare.h"
  8 | #include <cuSparseMultiply.h>
  9 | #include "Timings.h"
 10 | #include "spECKConfig.h"
 11 | #include "common.h"
 12 | #include "cuda_common.h"
 13 | 
 14 | template <typename DataType>
 15 | long compt_flop(const CSR<DataType> &A, const CSR<DataType> &B){
 16 | 	int M = A.rows;
 17 | 	long total_flop = 0;
 18 | 	for(int i = 0; i < M; i++){
 19 | 	    for(int j = A.row_offsets[i]; j < A.row_offsets[i+1]; j++){
 20 | 	    	total_flop += B.row_offsets[A.col_ids[j]+1] - B.row_offsets[A.col_ids[j]];
 21 | 	    }
 22 | 	}
 23 | 	return total_flop;
 24 | }
 25 | 
 26 | 
 27 | template <typename ValueType>
 28 | int Executor<ValueType>::run()
 29 | {
 30 | 	iterationsWarmup = Config::getInt(Config::IterationsWarmUp, 1);
 31 | 	iterationsExecution = Config::getInt(Config::IterationsExecution, 10);
 32 | 	//iterationsWarmup = 1;
 33 | 	//iterationsExecution = 1;
 34 | 	DataLoader<ValueType> data(runConfig.filePath, runConfig.filePath2);
 35 | 	//std::cout << runConfig.filePath << std::endl;
 36 | 	auto& matrices = data.matrices;
 37 | 	//std::cout << "Matrix: " << matrices.cpuA.rows << "x" << matrices.cpuA.cols << ": " << matrices.cpuA.nnz << " nonzeros\n";
 38 | 
 39 | 	long total_flops = compt_flop(matrices.cpuA, matrices.cpuB);
 40 | 
 41 | 	dCSR<ValueType> dCsrHiRes, dCsrReference;
 42 | 	Timings timings, warmupTimings, benchTimings;
 43 | 	//bool measureAll = Config::getBool(Config::TrackIndividualTimes, false);
 44 | 	bool measureAll = false;
 45 | 	bool measureCompleteTimes = Config::getBool(Config::TrackCompleteTimes, true);
 46 | 	auto config = spECK::spECKConfig::initialize(0);
 47 | 
 48 | 	//bool compareData = false;
 49 | 	bool compareData = true;
 50 | 
 51 | 	if(Config::getBool(Config::CompareResult))
 52 | 	{
 53 | 		unsigned cuSubdiv_nnz = 0;
 54 | 		cuSPARSE::CuSparseTest<ValueType> cusparse;
 55 | 		cusparse.Multiply(matrices.gpuA, matrices.gpuB, dCsrReference, cuSubdiv_nnz);
 56 | 
 57 | 		if(!compareData)
 58 | 		{
 59 | 			cudaFree(dCsrReference.data);
 60 | 			dCsrReference.data = nullptr;
 61 | 		}
 62 | 	}
 63 | 
 64 | 	// Warmup iterations for multiplication
 65 | 	for (int i = 0; i < iterationsWarmup; ++i)
 66 | 	{
 67 | 		timings = Timings();
 68 | 		timings.measureAll = measureAll;
 69 | 		timings.measureCompleteTime = measureCompleteTimes;
 70 | 		spECK::MultiplyspECK<ValueType, 4, 1024, spECK_DYNAMIC_MEM_PER_BLOCK, spECK_STATIC_MEM_PER_BLOCK>(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings);
 71 | 		warmupTimings += timings;
 72 | 
 73 | 		if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
 74 | 		{
 75 | 			printf("compare data \n");
 76 | 			//if (!spECK::Compare(dCsrReference, dCsrHiRes, false))
 77 | 			if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
 78 | 				printf("Error: Matrix incorrect\n");
 79 | 		}
 80 | 		dCsrHiRes.reset();
 81 | 	}
 82 | 
 83 | 	// Multiplication
 84 | 	for (int i = 0; i < iterationsExecution; ++i)
 85 | 	{
 86 | 		timings = Timings();
 87 | 		timings.measureAll = measureAll;
 88 | 		timings.measureCompleteTime = measureCompleteTimes;
 89 | 		spECK::MultiplyspECK<ValueType, 4, 1024, spECK_DYNAMIC_MEM_PER_BLOCK, spECK_STATIC_MEM_PER_BLOCK>
 90 | 		(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings);
 91 | 		benchTimings += timings;
 92 | 
 93 | //		if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
 94 | //		{
 95 | //			if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
 96 | //				printf("Error: Matrix incorrect\n");
 97 | //		}
 98 | 		dCsrHiRes.reset();
 99 | 	}
100 | 	
101 | 	benchTimings /= iterationsExecution;
102 | 	benchTimings.reg_print(total_flops * 2);
103 | 
104 | 	return 0;
105 | }
106 | 
107 | template <typename ValueType>
108 | int Executor<ValueType>::run_detail()
109 | {
110 | 	iterationsWarmup = Config::getInt(Config::IterationsWarmUp, 1);
111 | 	iterationsExecution = Config::getInt(Config::IterationsExecution, 10);
112 | 	//iterationsWarmup = 1;
113 | 	//iterationsExecution = 1;
114 | 	DataLoader<ValueType> data(runConfig.filePath, runConfig.filePath2);
115 | 	//std::cout << runConfig.filePath << std::endl;
116 | 	auto& matrices = data.matrices;
117 | 	//std::cout << "Matrix: " << matrices.cpuA.rows << "x" << matrices.cpuA.cols << ": " << matrices.cpuA.nnz << " nonzeros\n";
118 | 
119 | 	long total_flops = compt_flop(matrices.cpuA, matrices.cpuB);
120 | 
121 | 	dCSR<ValueType> dCsrHiRes, dCsrReference;
122 | 	Timings timings, warmupTimings, benchTimings;
123 | 	bool measureAll = true;
124 | 	bool measureCompleteTimes = Config::getBool(Config::TrackCompleteTimes, true);
125 | 	auto config = spECK::spECKConfig::initialize(0);
126 | 
127 | 	//bool compareData = false;
128 | 	bool compareData = true;
129 | 
130 | 	if(Config::getBool(Config::CompareResult))
131 | 	{
132 | 		unsigned cuSubdiv_nnz = 0;
133 | 		cuSPARSE::CuSparseTest<ValueType> cusparse;
134 | 		cusparse.Multiply(matrices.gpuA, matrices.gpuB, dCsrReference, cuSubdiv_nnz);
135 | 
136 | 		if(!compareData)
137 | 		{
138 | 			cudaFree(dCsrReference.data);
139 | 			dCsrReference.data = nullptr;
140 | 		}
141 | 	}
142 | 
143 | 	// Warmup iterations for multiplication
144 | 	for (int i = 0; i < iterationsWarmup; ++i)
145 | 	{
146 | 		timings = Timings();
147 | 		timings.measureAll = measureAll;
148 | 		timings.measureCompleteTime = measureCompleteTimes;
149 | 		spECK::MultiplyspECK<ValueType, 4, 1024, spECK_DYNAMIC_MEM_PER_BLOCK, spECK_STATIC_MEM_PER_BLOCK>(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings);
150 | 		warmupTimings += timings;
151 | 
152 | 		if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
153 | 		{
154 | 			printf("compare data \n");
155 | 			//if (!spECK::Compare(dCsrReference, dCsrHiRes, false))
156 | 			if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
157 | 				printf("Error: Matrix incorrect\n");
158 | 		}
159 | 		dCsrHiRes.reset();
160 | 	}
161 | 
162 | 	// Multiplication
163 | 	for (int i = 0; i < iterationsExecution; ++i)
164 | 	{
165 | 		timings = Timings();
166 | 		timings.measureAll = measureAll;
167 | 		timings.measureCompleteTime = measureCompleteTimes;
168 | 		spECK::MultiplyspECK<ValueType, 4, 1024, spECK_DYNAMIC_MEM_PER_BLOCK, spECK_STATIC_MEM_PER_BLOCK>
169 | 		(matrices.gpuA, matrices.gpuB, dCsrHiRes, config, timings);
170 | 		benchTimings += timings;
171 | 
172 | //		if (dCsrHiRes.data != nullptr && dCsrHiRes.col_ids != nullptr && Config::getBool(Config::CompareResult))
173 | //		{
174 | //			if (!spECK::Compare(dCsrReference, dCsrHiRes, compareData))
175 | //				printf("Error: Matrix incorrect\n");
176 | //		}
177 | 		dCsrHiRes.reset();
178 | 	}
179 | 	
180 | 	benchTimings /= iterationsExecution;
181 | 	benchTimings.print(total_flops * 2);
182 | 
183 | 	return 0;
184 | }
185 | 
186 | template class Executor<double>;
187 | 


--------------------------------------------------------------------------------
/spECK/source/GPU/Compare.cu:
--------------------------------------------------------------------------------
 1 | // Global includes
 2 | #include <stdio.h>
 3 | #include <stdint.h>
 4 | typedef unsigned long long int uint64_t;
 5 | 
 6 | // Local includes
 7 | #include "Compare.h"
 8 | #include "common.h"
 9 | 
10 | #define VERIFICATION_TEXT
11 | 
// Compares two CSR matrices row by row; one thread handles one row.
//
// Launch: 1D grid/block with at least `in_rows` threads in total; surplus
// threads exit immediately. No shared memory is used.
//
// in_rows            number of rows to compare
// in_cols            unused
// reference_*        offsets / column indices / values of the reference matrix
// compare_*          offsets / column indices / values of the candidate matrix
// compare_data       when true, values are compared in addition to the pattern
// epsilon            unused; the relative tolerance is fixed at 1 %
// verification       device flag, set to 1 on any mismatch (never cleared here)
//
// A row mismatches when its length differs, when a column index differs, or
// (with compare_data) when a value deviates by more than 1 % relative to the
// candidate value. A candidate value of exactly 0 is a mismatch whenever the
// reference value is non-zero; previously that case was silently accepted.
template <typename DataType>
__global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values,
	const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification)
{
	int tid = threadIdx.x + blockDim.x * blockIdx.x;
	if (tid >= in_rows)
		return;

	uint32_t ref_offset = reference_offset[tid];
	uint32_t comp_offset = compare_offset[tid];
	uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset;
	uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset;

	if (ref_number_entries != comp_number_entries)
	{
#ifdef VERIFICATION_TEXT
		printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries);
#endif
		*verification = 1;
		return;
	}

	// Both rows have the same length from here on.
	const uint32_t num_entries = ref_number_entries;

	for (uint32_t i = 0; i < num_entries; ++i)
	{
		if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i])
		{
#ifdef VERIFICATION_TEXT
			printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries);
#endif
			*verification = 1;
			return;
		}
		if (compare_data)
		{
			const DataType ref_value = reference_values[ref_offset + i];
			const DataType comp_value = compare_values[comp_offset + i];

			// Relative check when a division is possible; with a zero
			// candidate the reference has to be zero as well.
			const bool mismatch = (comp_value != 0)
				? (std::abs(ref_value / comp_value - 1) > 0.01)
				: (ref_value != 0);

			if (mismatch)
			{
#ifdef VERIFICATION_TEXT
				printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u - col %u\n", tid, ref_value, comp_value, i, num_entries, reference_indices[ref_offset + i]);
#endif
				*verification = 1;
				// keep scanning so every differing value of the row is reported
			}
		}
	}

	return;
}
64 | 
namespace spECK {
	// Compares two device CSR matrices with the d_compare kernel.
	//
	// reference_mat  matrix taken as ground truth
	// compare_mat    matrix under test
	// compare_data   when true, values are compared in addition to the
	//                sparsity pattern
	//
	// Returns true only when the kernel reported no mismatch. Returns false
	// when the row counts differ or when any CUDA call fails, so a failed
	// check can never be mistaken for a successful comparison. The device
	// flag used to collect the result is released before returning.
	template <typename DataType>
	bool Compare(const dCSR<DataType>& reference_mat, const dCSR<DataType>& compare_mat, bool compare_data)
	{
		// The kernel indexes both offset arrays with the reference row count;
		// differing row counts would read out of bounds.
		if (reference_mat.rows != compare_mat.rows)
		{
#ifdef VERIFICATION_TEXT
			printf("---------- Row count not identical: (Ref|Comp) : (%llu|%llu)\n",
				static_cast<unsigned long long>(reference_mat.rows), static_cast<unsigned long long>(compare_mat.rows));
#endif
			return false;
		}

		int blockSize(256);
		// rows + 1 keeps the grid non-empty for a matrix without rows.
		int gridSize(divup<int>(reference_mat.rows + 1, blockSize));
		double epsilon = 0.1; // forwarded to the kernel, which currently ignores it

		uint32_t* verification = nullptr;
		if (cudaMalloc(&verification, sizeof(uint32_t)) != cudaSuccess)
			return false;

		// Pessimistic default: stays non-zero if the result cannot be read back.
		uint32_t h_verification = 1;
		bool ok = (cudaMemset(verification, 0, sizeof(uint32_t)) == cudaSuccess);

		if (ok)
		{
			d_compare<DataType><<<gridSize, blockSize>>>(reference_mat.rows, reference_mat.cols,
				reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data,
				compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data,
				compare_data, epsilon, verification);

			// The blocking copy also waits for the kernel to finish.
			ok = (cudaGetLastError() == cudaSuccess)
				&& (cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost) == cudaSuccess);
		}

		cudaFree(verification);
		return ok && (h_verification == 0);
	}

	template bool Compare<float>(const dCSR<float>& reference_mat, const dCSR<float>& compare_mat, bool compare_data);
	template bool Compare<double>(const dCSR<double>& reference_mat, const dCSR<double>& compare_mat, bool compare_data);
}
88 | 


--------------------------------------------------------------------------------
/spECK/source/GPU/Transpose.cu:
--------------------------------------------------------------------------------
  1 | // Global includes
  2 | #include <thrust/device_vector.h>
  3 | #include <stdint.h>
  4 | typedef unsigned long long int uint64_t;
  5 | #include "device_launch_parameters.h"
  6 | 
  7 | // Local includes
  8 | #include "Transpose.h"
  9 | #include "common.h"
 10 | 
 11 | __global__ void d_calulateTransposeDistribution(int in_rows, int in_cols,
 12 | 	const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, uint32_t* output_offset)
 13 | {
 14 | 	int tid = threadIdx.x + blockDim.x * blockIdx.x;
 15 | 	if (tid >= in_rows)
 16 | 		return;
 17 | 
 18 | 	uint32_t offset = input_offset[tid];
 19 | 	uint32_t number_entries = input_offset[tid + 1] - offset;
 20 | 
 21 | 	for (uint32_t i = 0; i < number_entries; ++i)
 22 | 	{
 23 | 		atomicAdd(output_offset + input_indices[offset + i], 1);
 24 | 	}
 25 | 
 26 | 	return;
 27 | }
 28 | 
 29 | template <typename DataType>
 30 | __global__ void d_findPosition(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices,
 31 | 	const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position)
 32 | {
 33 | 	int tid = threadIdx.x + blockDim.x * blockIdx.x;
 34 | 	if (tid >= in_rows)
 35 | 		return;
 36 | 
 37 | 	uint32_t offset = input_offset[tid];
 38 | 	uint32_t number_entries = input_offset[tid + 1] - offset;
 39 | 
 40 | 	for (uint32_t i = 0; i < number_entries; ++i)
 41 | 	{
 42 | 		uint32_t row_index = input_indices[offset + i];
 43 | 		uint32_t insert_position = atomicAdd(helper + row_index, 1);
 44 | 		uint32_t o_offset = output_offset[row_index];
 45 | 		helper_position[o_offset + insert_position] = tid;
 46 | 	}
 47 | 
 48 | 	return;
 49 | }
 50 | 
 51 | template <typename DataType>
 52 | __global__ void d_writeTranspose(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices,
 53 | 	const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position)
 54 | {
 55 | 	int tid = threadIdx.x + blockDim.x * blockIdx.x;
 56 | 	if (tid >= in_rows)
 57 | 		return;
 58 | 
 59 | 	uint32_t offset = input_offset[tid];
 60 | 	uint32_t number_entries = input_offset[tid + 1] - offset;
 61 | 
 62 | 	for (uint32_t i = 0; i < number_entries; ++i)
 63 | 	{
 64 | 		uint32_t row_index = input_indices[offset + i];
 65 | 		uint32_t actual_position(0);
 66 | 		uint32_t entries_output = helper[row_index];
 67 | 		uint32_t o_offset = output_offset[row_index];
 68 | 		for (uint32_t j = 0; j < entries_output; ++j)
 69 | 		{
 70 | 			if (helper_position[o_offset + j] < tid)
 71 | 				++actual_position;
 72 | 		}		
 73 | 		output_indices[o_offset + actual_position] = tid;
 74 | 		output_values[o_offset + actual_position] = input_values[offset + i];
 75 | 	}
 76 | 
 77 | 	return;
 78 | }
 79 | 
 80 | 
 81 | namespace spECK {
 82 | 	template <typename DataType>
 83 | 	void Transpose(const dCSR<DataType>& matIn, dCSR<DataType>& matTransposeOut)
 84 | 	{
 85 | 		int blockSize(256);
 86 | 		int gridSize(divup<int>(matIn.rows + 1, blockSize));
 87 | 
 88 | 		matTransposeOut.alloc(matIn.cols, matIn.rows, matIn.nnz);
 89 | 
 90 | 		// Allocate and set helper resources, Memset output vector
 91 | 		uint32_t* d_helper_pointer, *d_helper_position;
 92 | 		cudaMalloc(&d_helper_pointer, sizeof(uint32_t) * (matTransposeOut.rows + 1));
 93 | 		cudaMalloc(&d_helper_position, sizeof(uint32_t) * (matTransposeOut.nnz));
 94 | 		cudaMemset(d_helper_pointer, 0, sizeof(uint32_t) * (matTransposeOut.rows + 1));
 95 | 		cudaMemset(matTransposeOut.row_offsets, 0, (matTransposeOut.rows + 1) * sizeof(uint32_t));
 96 | 
 97 | 		// Calculate entry distribution
 98 | 		d_calulateTransposeDistribution<<<gridSize , blockSize >>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matTransposeOut.row_offsets);
 99 | 
100 | 		// Prefix sum for new offset vector
101 | 		thrust::device_ptr<uint32_t> th_offset_vector(matTransposeOut.row_offsets);
102 | 		thrust::exclusive_scan(th_offset_vector, th_offset_vector + matTransposeOut.rows + 1, th_offset_vector);
103 | 
104 | 		// Find position for insertion (keeping sort order)
105 | 		d_findPosition<DataType> << <gridSize, blockSize >> > (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position);
106 | 
107 | 		// Write Transpose
108 | 		d_writeTranspose<DataType> << <gridSize, blockSize >> > (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position);
109 | 
110 | 		// Free helper resources
111 | 		cudaFree(d_helper_pointer);
112 | 		cudaFree(d_helper_position);
113 | 
114 | 		return;
115 | 	}
116 | 
117 | 	template void Transpose<float>(const dCSR<float>& matIn, dCSR<float>& matTransposeOut);
118 | 	template void Transpose<double>(const dCSR<double>& matIn, dCSR<double>& matTransposeOut);
119 | }
120 | 


--------------------------------------------------------------------------------
/spECK/source/GPU/common.cu:
--------------------------------------------------------------------------------
1 | #include "common.cuh"


--------------------------------------------------------------------------------
/spECK/source/GPU/memory.cpp:
--------------------------------------------------------------------------------
 1 | #include <CUDATools/error.h>
 2 | #include <CUDATools/memory.h>
 3 | #include <cuda.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | 
 7 | namespace CU
 8 | {
 9 | 	unique_ptr allocMemory(std::size_t size)
10 | 	{
11 | 		CUdeviceptr ptr;
12 | 		cudaMalloc(reinterpret_cast<void**>(&ptr), size);
13 | 		return unique_ptr(ptr);
14 | 	}
15 | 	
16 | 	unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size)
17 | 	{
18 | 		CUdeviceptr ptr;
19 | 		cudaMallocPitch(reinterpret_cast<void**>(&ptr), &pitch, row_size, num_rows);
20 | 		return unique_ptr(ptr);
21 | 	}
22 | 	
23 | 	pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size)
24 | 	{
25 | 		CUdeviceptr ptr;
26 | 		std::size_t pitch;
27 | 		cudaMallocPitch(reinterpret_cast<void**>(&ptr), &pitch, row_size, num_rows);
28 | 		return pitched_memory(unique_ptr(ptr), pitch);
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/spECK/source/GPU/profiler.cu:
--------------------------------------------------------------------------------
  1 | #include "GPU/profiler.cuh"
  2 | 
  3 | #define CHECK_CU_ERROR(err, cufunc)                                     \
  4 |   if (err != CUDA_SUCCESS)                                              \
  5 |     {                                                                   \
  6 |       printf ("%s:%d: error %d for CUDA Driver API function '%s'\n",    \
  7 |               __FILE__, __LINE__, err, cufunc);                         \
  8 |       exit(-1);                                                         \
  9 |     }
 10 | 
 11 | #define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
 12 |   if (err != CUPTI_SUCCESS)                                             \
 13 |     {                                                                   \
 14 |       const char *errstr;                                               \
 15 |       cuptiGetResultString(err, &errstr);                               \
 16 |       printf ("%s:%d:Error %s for CUPTI API function '%s'.\n",          \
 17 |               __FILE__, __LINE__, errstr, cuptifunc);                   \
 18 |       exit(-1);                                                         \
 19 |     }
 20 | 
 21 | 
 22 | // void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
 23 | //                      CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
 24 | // {
 25 | //     static int memTransCount = 0;
 26 | //     uint64_t startTimestamp;
 27 | //     uint64_t endTimestamp;
 28 | //     RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
 29 | //     CUptiResult cuptiErr;
 30 |         
 31 | //     // Data is collected only for the following API
 32 | //     if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) ||
 33 | //         (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000) ||
 34 | //         (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) || 
 35 | //         (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020))  { 
 36 |         
 37 | //         // Set pointer depending on API
 38 | //         if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) ||
 39 | //             (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
 40 | //         {
 41 | //         traceData = traceData + KERNEL;
 42 | //         }
 43 | //         else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) 
 44 | //         traceData = traceData + THREAD_SYNC;
 45 | //         else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)
 46 | //         traceData = traceData + MEMCPY_H2D1 + memTransCount;
 47 | //         size_t freeMem = 0, totalMem = 0;
 48 | //         cudaMemGetInfo(&freeMem, &totalMem);
 49 | //         traceData->currentMemoryUsage = totalMem - freeMem;
 50 |                     
 51 | //         if (cbInfo->callbackSite == CUPTI_API_ENTER) {
 52 | //             // for a kernel launch report the kernel name, otherwise use the API
 53 | //             // function name.
 54 | //             if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 ||
 55 | //                 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)
 56 | //             {
 57 | //                 traceData->functionName = cbInfo->symbolName;
 58 | //             }
 59 | //             else {
 60 | //                 traceData->functionName = cbInfo->functionName;
 61 | //             }
 62 | 
 63 | //             // Store parameters passed to cudaMemcpy
 64 | //             if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
 65 | //                 traceData->memcpy_bytes = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->count;
 66 | //                 traceData->memcpy_kind = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->kind;
 67 | //             }
 68 |                 
 69 | //             // Collect timestamp for API start
 70 | //             cuptiErr = cuptiDeviceGetTimestamp(cbInfo->context, &startTimestamp);
 71 | //             CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp");
 72 |                     
 73 | //             traceData->startTimestamp = startTimestamp;
 74 | //             }
 75 | 
 76 | //             if (cbInfo->callbackSite == CUPTI_API_EXIT) {
 77 | //             // Collect timestamp for API exit
 78 | //             cuptiErr = cuptiDeviceGetTimestamp(cbInfo->context, &endTimestamp);
 79 | //             CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp");
 80 |                     
 81 | //             traceData->endTimestamp = endTimestamp;
 82 |             
 83 | //             // Advance to the next memory transfer operation
 84 | //             if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
 85 | //                 memTransCount++;
 86 | //             }
 87 | //         } 
 88 | //     }
 89 | // }
 90 | 
 91 | void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
 92 |                      CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
 93 | {
 94 |     if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020)
 95 |         return;
 96 | 
 97 |     size_t freeMem = 0, totalMem = 0;
 98 |     cudaMemGetInfo(&freeMem, &totalMem);
 99 | 
100 |     RuntimeApiTrace_t traceData;
101 |     traceData.functionName = cbInfo->functionName;
102 |     traceData.currentMemoryUsage = totalMem - freeMem;
103 | 
104 |     auto &records = *((std::vector<RuntimeApiTrace_t> *) userdata);
105 |     records.push_back(traceData);
106 |         
107 | 
108 |     // printf("current usage=%llu. Entry nr=%llu\n", traceData.currentMemoryUsage, (long long unsigned int) records.size());
109 | }
110 | 
// Starts memory-usage profiling.
//
// Subscribes getTimestampCallback to CUPTI with `records` as its user data
// and enables callbacks for the CUDA runtime API domain. When
// `subtractCurrentMem` is true, the device memory in use at this point is
// stored in `startMem` (it is subtracted from the samples in finalize());
// otherwise `startMem` is 0. A failing CUPTI call terminates the process via
// CHECK_CUPTI_ERROR.
void CuProfiler::initialize(bool subtractCurrentMem) {
    startMem = 0;
    if (subtractCurrentMem) {
        size_t freeMem = 0, totalMem = 0;
        cudaMemGetInfo(&freeMem, &totalMem);
        startMem = totalMem - freeMem;
    }

    CUptiResult cuptierr = cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getTimestampCallback, &records);
    CHECK_CUPTI_ERROR(cuptierr, "cuptiSubscribe");

    cuptierr = cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API);
    CHECK_CUPTI_ERROR(cuptierr, "cuptiEnableDomain");

    initialized = true;
}
140 | 
// Stops profiling and normalises the collected samples.
//
// Does nothing unless initialize() has set `initialized`. Otherwise
// `startMem` is lowered to the smallest recorded usage if that is below the
// stored baseline, the baseline is subtracted from every record, the CUPTI
// subscription is removed and the device is synchronised.
// NOTE(review): `initialized` is not reset here, so a second call would
// subtract the baseline and unsubscribe again - confirm callers invoke this
// only once per initialize().
void CuProfiler::finalize() {
    if (initialized) {
        // The baseline must not exceed any sample, otherwise the unsigned
        // subtraction below would wrap around.
        for (size_t i = 0; i < records.size(); ++i) {
            if (records[i].currentMemoryUsage < startMem)
                startMem = records[i].currentMemoryUsage;
        }

        for (size_t i = 0; i < records.size(); ++i) {
            records[i].currentMemoryUsage -= startMem;
        }

        CUptiResult unsubscribeResult = cuptiUnsubscribe(subscriber);
        CHECK_CUPTI_ERROR(unsubscribeResult, "cuptiUnsubscribe");

        cudaDeviceSynchronize();
    }
}
162 | 
163 | 
164 |     
// Prints the memory usage of every collected record, one "MemUsage:<n>" line
// per record, in recording order.
// (The former per-transfer timestamp table that was kept here as
// commented-out code is not produced.)
void CuProfiler::displayTimestamps()
{
    for (size_t i = 0; i < records.size(); ++i) {
        printf("MemUsage:%llu\n", records[i].currentMemoryUsage);
    }
}


--------------------------------------------------------------------------------
/spECK/source/RunConfig.cpp:
--------------------------------------------------------------------------------
 1 | #include "RunConfig.h"
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | #include <sstream>
 6 | #include "Config.h"
 7 | 
 8 | RunConfig::RunConfig(int argc, char *argv[])
 9 | {
10 |     std::string mat1, mat2;
11 |     mat1 = "can_24";
12 |     mat2 = "can_24";
13 |     if(argc == 2){
14 |         mat1 = argv[1];
15 |         mat2 = argv[1];
16 |     }
17 |     if(argc >= 3){
18 |         mat1 = argv[1];
19 |         mat2 = argv[2];
20 |     }
21 |     std::string mat1_file;
22 |     if(mat1.find("ER") != std::string::npos){
23 |         mat1_file = "../matrix/ER/" + mat1 +".mtx";
24 |     }
25 |     else if(mat1.find("G500") != std::string::npos){
26 |         mat1_file = "../matrix/G500/" + mat1 +".mtx";
27 |     }
28 |     else{
29 |         mat1_file = "../matrix/suite_sparse/" + mat1 + "/" + mat1 +".mtx";
30 |     }
31 |     std::string mat2_file;
32 |     if(mat2.find("ER") != std::string::npos){
33 |         mat2_file = "../matrix/ER/" + mat2 +".mtx";
34 |     }
35 |     else if(mat2.find("G500") != std::string::npos){
36 |         mat2_file = "../matrix/G500/" + mat2 +".mtx";
37 |     }
38 |     else{
39 |         mat2_file = "../matrix/suite_sparse/" + mat2 + "/" + mat2 +".mtx";
40 |     }
41 |     filePath = mat1_file;
42 |     filePath2 = mat2_file;
43 |     mat_name = mat1;
44 |     mat_name2 = mat2;
45 |     //printf("in RunConfig.cpp %s %s\n", filePath.c_str(), filePath2.c_str());
46 |     printf("%s %s ", mat1.c_str(), mat2.c_str());
47 | 	Config::init("config.ini");
48 | }
49 | 
50 | 
51 | RunConfig::~RunConfig()
52 | {
53 | }
54 | 


--------------------------------------------------------------------------------
/spECK/source/cuSparseMultiply.cu:
--------------------------------------------------------------------------------
  1 | #include "cuSparseMultiply.h"
  2 | #include <cuda_runtime.h>
  3 | #include "common.h"
  4 | 
  5 | 
  6 | namespace cuSPARSE {
  7 | 		template<>
  8 | 		cusparseStatus_t CUSPARSEAPI CuSparseTest<float>::cusparseTranspose(cusparseHandle_t handle, int m, int n, int nnz,
  9 | 			const float  *csrSortedVal,	const int *csrSortedRowPtr, const int *csrSortedColInd,
 10 | 			float *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase)
 11 | 		{
 12 |             void *buffer = nullptr;
 13 |             size_t buffer_size = 0;
 14 |             checkCuSparseError(cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
 15 | 				cscSortedColPtr, cscSortedRowInd, CUDA_R_32F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, &buffer_size), "buffer size failed");
 16 |             HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));
 17 | 
 18 |             auto retVal = checkCuSparseError(cusparseCsr2cscEx2(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
 19 | 				cscSortedColPtr, cscSortedRowInd, CUDA_R_32F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, buffer), "transpose failed");
 20 |             HANDLE_ERROR(cudaFree(buffer));
 21 |             return retVal;
 22 | 		}
 23 | 
 24 | 		template<>
 25 | 		cusparseStatus_t CUSPARSEAPI CuSparseTest<double>::cusparseTranspose(cusparseHandle_t handle,  int m, int n, int nnz,
 26 | 			const double  *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd,
 27 | 			double *cscSortedVal, int *cscSortedRowInd, int *cscSortedColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase)
 28 | 		{
 29 |             void *buffer = nullptr;
 30 |             size_t buffer_size = 0;
 31 |             checkCuSparseError(cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
 32 | 				cscSortedColPtr, cscSortedRowInd, CUDA_R_64F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, &buffer_size), "buffer size failed");
 33 |             HANDLE_ERROR(cudaDeviceSynchronize());
 34 |             HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));
 35 | 
 36 |             auto retVal = checkCuSparseError(cusparseCsr2cscEx2(handle, m, n, nnz, csrSortedVal, csrSortedRowPtr, csrSortedColInd, cscSortedVal,
 37 | 				cscSortedColPtr, cscSortedRowInd, CUDA_R_64F, copyValues, idxBase, CUSPARSE_CSR2CSC_ALG1, buffer), "transpose failed");
 38 |             HANDLE_ERROR(cudaFree(buffer));
 39 |             return retVal;
 40 | 		}
 41 | 
 42 | 	template <typename DataType>
 43 | 	float CuSparseTest<DataType>::Multiply(const dCSR<DataType>& A, const dCSR<DataType>& B, dCSR<DataType>& matOut, uint32_t& cusparse_nnz)
 44 | 	{
 45 | 		int nnzC;
 46 | 		int *nnzTotalDevHostPtr = &nnzC;
 47 | 		float duration;
 48 |         DataType alpha = (DataType) 1.0f;
 49 |         DataType beta = (DataType) 0.0f;
 50 | 
 51 | 		cudaEvent_t start, stop;
 52 | 		HANDLE_ERROR(cudaEventCreate(&start));
 53 | 		HANDLE_ERROR(cudaEventCreate(&stop));
 54 | 
 55 | 		// ############################
 56 | 		HANDLE_ERROR(cudaEventRecord(start));
 57 | 		// ############################
 58 | 
 59 |         auto computeType = sizeof(DataType) == 4 ? CUDA_R_32F : CUDA_R_64F;
 60 |         cusparseSpMatDescr_t matA, matB, matC;
 61 |         checkCuSparseError( cusparseCreateCsr(&matA, A.rows, A.cols, A.nnz,
 62 |                                         A.row_offsets, A.col_ids, A.data,
 63 |                                         CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 64 |                                         CUSPARSE_INDEX_BASE_ZERO, computeType), "A failed");
 65 |         checkCuSparseError( cusparseCreateCsr(&matB, B.rows, B.cols, B.nnz,
 66 |                                         B.row_offsets, B.col_ids, B.data,
 67 |                                         CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 68 |                                         CUSPARSE_INDEX_BASE_ZERO, computeType), "B failed");
 69 |         checkCuSparseError( cusparseCreateCsr(&matC, A.rows, B.cols, 0,
 70 |                                         NULL, NULL, NULL,
 71 |                                         CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 72 |                                         CUSPARSE_INDEX_BASE_ZERO, computeType), "C failed");
 73 | 
 74 |         void*  dBuffer1    = NULL, *dBuffer2   = NULL;
 75 |         size_t bufferSize1 = 0,    bufferSize2 = 0;
 76 |         cusparseSpGEMMDescr_t spgemmDesc;
 77 |         checkCuSparseError( cusparseSpGEMM_createDescr(&spgemmDesc), "create description failed");
 78 |         auto opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
 79 |         auto opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
 80 |         // Device memory management: Allocate and copy A, B
 81 |         int   *dC_csrOffsets = nullptr, *dC_columns = nullptr;
 82 |         DataType *dC_values;
 83 | 
 84 |         // ask bufferSize1 bytes for external memory
 85 |         checkCuSparseError(cusparseSpGEMM_workEstimation(handle, opA, opB,
 86 |                                     &alpha, matA, matB, &beta, matC,
 87 |                                     computeType, CUSPARSE_SPGEMM_DEFAULT,
 88 |                                     spgemmDesc, &bufferSize1, 0), "workestimation0 failed");
 89 |         HANDLE_ERROR(cudaMalloc((void**) &dBuffer1, bufferSize1));
 90 |         // inspect the matrices A and B to understand the memory requirement for
 91 |         // the next step
 92 |         checkCuSparseError(cusparseSpGEMM_workEstimation(handle, opA, opB,
 93 |                                     &alpha, matA, matB, &beta, matC,
 94 |                                     computeType, CUSPARSE_SPGEMM_DEFAULT,
 95 |                                     spgemmDesc, &bufferSize1, dBuffer1), "workestimation1 failed");
 96 | 
 97 |         // ask bufferSize2 bytes for external memory
 98 |         checkCuSparseError(cusparseSpGEMM_compute(handle, opA, opB,
 99 |                             &alpha, matA, matB, &beta, matC,
100 |                             computeType, CUSPARSE_SPGEMM_DEFAULT,
101 |                             spgemmDesc, &bufferSize2, NULL), "compute0 failed");
102 |         HANDLE_ERROR(cudaMalloc((void**) &dBuffer2, bufferSize2));
103 | 
104 |         // compute the intermediate product of A * B
105 |         checkCuSparseError(cusparseSpGEMM_compute(handle, opA, opB,
106 |                             &alpha, matA, matB, &beta, matC,
107 |                             computeType, CUSPARSE_SPGEMM_DEFAULT,
108 |                             spgemmDesc, &bufferSize2, dBuffer2), "compute1 failed");
109 |         // get matrix C non-zero entries C_num_nnz1
110 |         int64_t C_num_rows1, C_num_cols1, C_num_nnz1;
111 |          checkCuSparseError(cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_num_nnz1), "get size failed");
112 |         // allocate matrix C
113 |         HANDLE_ERROR(cudaMalloc((void**) &dC_csrOffsets, (C_num_rows1 + 1) * sizeof(int)));
114 |         HANDLE_ERROR(cudaMalloc((void**) &dC_columns, C_num_nnz1 * sizeof(int)));
115 |         HANDLE_ERROR(cudaMalloc((void**) &dC_values,  C_num_nnz1 * sizeof(DataType)));
116 |         // update matC with the new pointers
117 |         checkCuSparseError(cusparseCsrSetPointers(matC, dC_csrOffsets, dC_columns, dC_values), "get pointers failed");
118 | 
119 |         // copy the final products to the matrix C
120 |         checkCuSparseError(cusparseSpGEMM_copy(
121 |             handle, 
122 |             opA, 
123 |             opB,
124 |             &alpha, 
125 |             matA, 
126 |             matB, 
127 |             &beta, 
128 |             matC,
129 |             computeType, 
130 |             CUSPARSE_SPGEMM_DEFAULT, 
131 |             spgemmDesc), 
132 |             "copy failed");
133 | 
134 |         cusparseIndexType_t _rowType, _columnType;
135 |         cusparseIndexBase_t _indexBase;
136 |         cudaDataType _baseOff;
137 |         checkCuSparseError(cusparseCsrGet(matC,
138 |             (int64_t*) &matOut.rows,
139 |             (int64_t*) &matOut.cols,
140 |             (int64_t*) &matOut.nnz,
141 |             (void**) &matOut.row_offsets,
142 |             (void**) &matOut.col_ids,
143 |             (void**) &matOut.data,
144 |             &_rowType,
145 |             &_columnType,
146 |             &_indexBase,
147 |             &_baseOff
148 |         ), "get failed");
149 |         // destroy matrix/vector descriptors
150 |         checkCuSparseError( cusparseSpGEMM_destroyDescr(spgemmDesc), "destroy failed" );
151 |         HANDLE_ERROR(cudaFree(dBuffer1));
152 |         HANDLE_ERROR(cudaFree(dBuffer2));
153 | 
154 | 		// ############################
155 | 		HANDLE_ERROR(cudaDeviceSynchronize());
156 | 		HANDLE_ERROR(cudaEventRecord(stop));
157 | 		HANDLE_ERROR(cudaEventSynchronize(stop));
158 | 		// ############################
159 | 
160 | 		HANDLE_ERROR(cudaEventElapsedTime(&duration, start, stop));
161 |         cusparse_nnz = matOut.nnz;
162 | 
163 | 		return duration;
164 | 	}
165 | 
166 | 	template float CuSparseTest<float>::Multiply(const dCSR<float>& A, const dCSR<float>& B, dCSR<float>& matOut, uint32_t& cusparse_nnz);
167 | 	template float CuSparseTest<double>::Multiply(const dCSR<double>& A, const dCSR<double>& B, dCSR<double>& matOut, uint32_t& cusparse_nnz);
168 | 
169 | 	template <typename DataType>
170 | 	void CuSparseTest<DataType>::Transpose(const dCSR<DataType>& A, dCSR<DataType>& AT)
171 | 	{
172 | 		AT.alloc(A.cols, A.rows, A.nnz);
173 | 
174 | 		checkCuSparseError(cusparseTranspose(handle, A.rows, A.cols, A.nnz,
175 | 			reinterpret_cast<const DataType*>(A.data), reinterpret_cast<const int*>(A.row_offsets), reinterpret_cast<const int*>(A.col_ids),
176 | 			reinterpret_cast<DataType*>(AT.data), reinterpret_cast<int*>(AT.col_ids), reinterpret_cast<int*>(AT.row_offsets),
177 | 			CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO), "transpose failed");
178 | 	}
179 | 
180 | 	template	void CuSparseTest<float>::Transpose(const dCSR<float>& A, dCSR<float>& AT);
181 | 	template	void CuSparseTest<double>::Transpose(const dCSR<double>& A, dCSR<double>& AT);
182 | }
183 | 


--------------------------------------------------------------------------------
/spECK/source/dCSR.cpp:
--------------------------------------------------------------------------------
#include "dCSR.h"
#include "CSR.h"

#include <cstring>
#include <stdexcept>
#include <string>

#include <cuda_runtime.h>
  5 | 
  6 | namespace
  7 | {
  8 | 	template<typename T>
  9 | 	void dealloc(dCSR<T>& mat)
 10 | 	{
 11 | 		if (mat.col_ids != nullptr)
 12 | 			cudaFree(mat.col_ids);
 13 | 		if (mat.data != nullptr)
 14 | 			cudaFree(mat.data);
 15 | 		if (mat.row_offsets != nullptr)
 16 | 			cudaFree(mat.row_offsets);
 17 | 		mat.col_ids = nullptr;
 18 | 		mat.data = nullptr;
 19 | 		mat.row_offsets = nullptr;
 20 | 		mat.nnz = 0;
 21 | 		mat.rows = 0;
 22 | 	}
 23 | }
 24 | 
 25 | template<typename T>
 26 | void dCSR<T>::alloc(size_t r, size_t c, size_t n, bool allocOffsets)
 27 | {
 28 | 	dealloc(*this);
 29 | 	rows = r;
 30 | 	cols = c;
 31 | 	nnz = n;
 32 | 	cudaMalloc(&data, sizeof(T)*n);
 33 | 	cudaMalloc(&col_ids, sizeof(unsigned int)*n);
 34 | 	if (allocOffsets)
 35 | 		cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1));
 36 | }
 37 | template<typename T>
 38 | dCSR<T>::~dCSR()
 39 | {
 40 | 	dealloc(*this);
 41 | }
 42 | 
 43 | template<typename T>
 44 | void dCSR<T>::reset()
 45 | {
 46 | 	dealloc(*this);
 47 | }
 48 | 
 49 | 
 50 | template<typename T>
 51 | void convert(dCSR<T>& dst, const CSR<T>& src, unsigned int padding)
 52 | {
 53 | 	dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding);
 54 | 	dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols;
 55 | 	cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice);
 56 | 	cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice);
 57 | 	cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice);
 58 | 
 59 | 	if (padding)
 60 | 	{
 61 | 		cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T));
 62 | 		cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int));
 63 | 		cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int));
 64 | 	}
 65 | }
 66 | 
 67 | template<typename T>
 68 | void convert(CSR<T>& dst, const dCSR<T>& src, unsigned int padding)
 69 | {
 70 | 	dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding);
 71 | 	dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols;
 72 | 	cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost);
 73 | 	cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost);
 74 | 	cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost);
 75 | }
 76 | 
 77 | template<typename T>
 78 | void convert(dCSR<T>& dst, const dCSR<T>& src, unsigned int padding)
 79 | {
 80 | 	dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding);
 81 | 	dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols;
 82 | 	cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice);
 83 | 	cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice);
 84 | 	cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice);
 85 | }
 86 | 
 87 | template<typename T>
 88 | void convert(CSR<T>& dst, const CSR<T>& src, unsigned int padding)
 89 | {
 90 | 	dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding);
 91 | 	dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols;
 92 | 	memcpy(dst.data, src.data, dst.nnz * sizeof(T));
 93 | 	memcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int));
 94 | 	memcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int));
 95 | }
 96 | 
 97 | template void dCSR<float>::alloc(size_t r, size_t c, size_t n, bool allocOffsets);
 98 | template void dCSR<double>::alloc(size_t r, size_t c, size_t n, bool allocOffsets);
 99 | template void dCSR<uint64_t>::alloc(size_t r, size_t c, size_t n, bool allocOffsets);
100 | 
101 | template dCSR<float>::~dCSR();
102 | template dCSR<double>::~dCSR();
103 | template dCSR<uint64_t>::~dCSR();
104 | 
105 | template void dCSR<float>::reset();
106 | template void dCSR<double>::reset();
107 | template void dCSR<uint64_t>::reset();
108 | 
109 | template void convert(dCSR<float>& dcsr, const CSR<float>& csr, unsigned int);
110 | template void convert(dCSR<double>& dcsr, const CSR<double>& csr, unsigned int);
111 | //template void convert(dCSR<uint64_t>& dcsr, const CSR<double>& csr, unsigned int);
112 | 
113 | template void convert(CSR<float>& csr, const dCSR<float>& dcsr, unsigned int padding);
114 | template void convert(CSR<double>& csr, const dCSR<double>& dcsr, unsigned int padding);
115 | //template void convert(CSR<uint64_t>& csr, const dCSR<double>& dcsr, unsigned int padding);
116 | 
117 | template void convert(dCSR<float>& dcsr, const dCSR<float>& csr, unsigned int);
118 | template void convert(dCSR<double>& dcsr, const dCSR<double>& csr, unsigned int);
119 | //template void convert(dCSR<uint64_t>& dcsr, const dCSR<double>& csr, unsigned int);
120 | 
121 | template void convert(CSR<float>& csr, const CSR<float>& dcsr, unsigned int padding);
122 | template void convert(CSR<double>& csr, const CSR<double>& dcsr, unsigned int padding);
123 | //template void convert(CSR<uint64_t>& csr, const CSR<double>& dcsr, unsigned int padding);
124 | 


--------------------------------------------------------------------------------
/spECK/source/reg_runspECK.cpp:
--------------------------------------------------------------------------------
 1 | #ifdef _WIN32
 2 | #include <intrin.h>
 3 | //surpress crash notification windows (close or debug program window)
 4 | #define WIN32_LEAN_AND_MEAN
 5 | #define NOMINMAX
 6 | #include <windows.h>
 7 | #else
 8 | #include <x86intrin.h>
 9 | #endif
10 | #include <string>
11 | #include "Executor.h"
12 | 
13 | int main(int argc, char *argv[])
14 | {
15 | #ifdef _WIN32
16 | 	//surpress crash notification windows (close or debug program window)
17 | 	SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX);
18 | #endif
19 | 	Executor<double> exe(argc, argv);
20 | 	return exe.run();
21 | }
22 | 


--------------------------------------------------------------------------------
/spECK/source/runspECK.cpp:
--------------------------------------------------------------------------------
 1 | #ifdef _WIN32
 2 | #include <intrin.h>
 3 | //surpress crash notification windows (close or debug program window)
 4 | #define WIN32_LEAN_AND_MEAN
 5 | #define NOMINMAX
 6 | #include <windows.h>
 7 | #else
 8 | #include <x86intrin.h>
 9 | #endif
10 | #include <string>
11 | #include "Executor.h"
12 | 
13 | int main(int argc, char *argv[])
14 | {
15 | #ifdef _WIN32
16 | 	//surpress crash notification windows (close or debug program window)
17 | 	SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX);
18 | #endif
19 | 	Executor<double> exe(argc, argv);
20 | 	return exe.run_detail();
21 | }
22 | 


--------------------------------------------------------------------------------