├── README.md
├── data
    ├── run142.sh
    └── run18.sh
└── src
    ├── Makefile
    ├── common.h
    ├── csr2tile.h
    ├── cuda_fp16.h
    ├── cuda_fp16.hpp
    ├── external
        └── cusparse
        │   ├── Makefile
        │   ├── common.h
        │   ├── main.cu
        │   ├── mmio.h
        │   ├── mmio_highlevel.h
        │   ├── spgemm
        │   ├── spgemm_cusparse.h
        │   ├── spgemm_serialref_esc.h
        │   ├── spgemm_serialref_spa.h
        │   ├── spgemm_serialref_spa_new.h
        │   ├── tranpose.h
        │   ├── utils.h
        │   ├── utils_cuda_matinfo.h
        │   └── utils_cuda_segsort_subfunc
        │       ├── segsort_subfunc_fast_bin.h
        │       ├── segsort_subfunc_kern_copy_unit.h
        │       ├── segsort_subfunc_kern_exch_func.h
        │       ├── segsort_subfunc_kern_mergepath_func.h
        │       ├── segsort_subfunc_kern_selected_kepler.h
        │       ├── segsort_subfunc_kern_selected_pascal.h
        │       └── segsort_subfunc_test_grid_kernel.h
    ├── hash.h
    ├── main.cu
    ├── mmio.h
    ├── mmio_highlevel.h
    ├── nsparse_asm.h
    ├── spgemm-cpu.h
    ├── spgemm_cu.h
    ├── spgemm_nsparse_kernel.h
    ├── spgemm_serialref_spa_new.h
    ├── test
    ├── tile2csr.h
    ├── tilespgemm-cuda.h
    ├── utils.h
    └── utils_cuda_scan.h


/README.md:
--------------------------------------------------------------------------------
 1 | # TileSpGEMM
 2 | 
 3 |  
 4 | 
 5 | **TileSpGEMM** is an open source code that uses a tiled structure to optimize general sparse matrix-matrix multiplication (SpGEMM) on GPUs. 
 6 | 
 7 | 
 8 | -------------------
 9 | ## Paper information
10 | 
11 | Yuyao Niu, Zhengyang Lu, Haonan Ji, Shuhui Song, Zhou Jin, and Weifeng Liu. 2022. TileSpGEMM: A Tiled Algorithm for Parallel Sparse General Matrix-Matrix Multiplication on GPUs. In 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP ’22), 17 pages. DOI:  https://doi.org/10.1145/3503221.3508431
12 | 
13 | ## Contact us
14 | 
15 | If you have any questions about running the code, please contact Yuyao Niu. 
16 | 
17 | E-mail: yuyao.niu.phd@gmail.com
18 | 
19 | ## Introduction
20 | 
21 | General sparse matrix-matrix multiplication(SpGEMM) executes C=AB, where A, B and C are all sparse matrices. TileSpGEMM sparsifies the tiled method in dense general matrix-matrix multiplication (GEMM) and saves each non-empty tile in a sparse form. By this way, the three performance issues of load imbalance, allocating proper size for intermediate products and designing a sparse accumulator can be resolved. Several optimization techniques, such as binary search for set intersection, bit mask operations for symbolic SpGEMM, and an adaptive method for selecting sparse or dense accumulator in on-chip memory, are also developed to improve efficiency. TileSpGEMM provides a version of CUDA on a high parallelism currently. 
22 | 
23 | 
24 | <!-- ## Structure
25 | ```
26 | beidoublas/README     instructions on installation
27 | beidoublas/src        C source code, to be compiled into libbeidoublas.so
28 | beidoublas/test       testing code
29 | beidoublas/Makefile   top-level Makefile that does installation and testing
30 | ``` -->
31 | 
32 | ## Installation
33 | 
34 | <!-- To use this code, you need to modify the Makefile with correct g++ installation path and use make for automatic installation. -->
35 | To better reproduce experiment results, we suggest an NVIDIA GPU with compute capability 8.6.
36 | TileSpGEMM evaluation requires the CUDA GPU driver, the nvcc CUDA compiler, and the cuSPARSE library, all of them are included with the CUDA Toolkit. The artifacts have been tested on Ubuntu 18.04/20.04, and are expected to run correctly under other Linux distributions.
37 | 
38 | ## Execution of TileSpGEMM
39 | Our test programs currently support input files encoded using the matrix market format. All matrix market datasets used in this evaluation are publicly available from the SuiteSparse Matrix Collection. 
40 | 
41 | 1. Set CUDA path in the Makefile
42 | 
43 | 2. The command 'make' generates an executable file 'test' for double precision.
44 | > **make**
45 | 
46 | 3. Run SpGEMM code on matrix data with auto-tuning in double precision. The GPU compilation takes an optional d=<gpu-device, e.g., 0> parameter that specifies the GPU device to run if multiple GPU devices are available at the same time, and another optional aat=<transpose, e.g., 0> parameter that means computing C = A^2 (-aat 0) or C = AA^T (-aat 1)). 
47 | > **$ ./test -d 0 -aat 0 <path/to/dataset/mtx>**
48 | 
49 | ## Output information
50 | 
51 | Lines 1-2 output the input matrix's information including the path of matrix file, The number of rows, columns and nonzeros.
52 | 
53 | Line 3 prints the file loading time (in seconds).
54 | 
55 | Line 4 prints the size of tile used in our TileSpGEMM algorithm.
56 | 
57 | Line 5 prints the number of floating point operations during the multiplication.
58 | 
59 | Line 6 prints the runtime of transforming the input matrix from the CSR format to our tiled data structure (in millisec- onds) (Figure 12 in our paper).
60 | 
61 | Line 7 prints TileSpGEMM data structure's space consump- tion (in million bytes) (Figure 11 in our paper).
62 | 
63 | Lines 8-14 print execution time (in milliseconds) of the three algorithm steps and all memory allocation on CPU and GPU (Figure 10 in our paper).
64 | 
65 | Line 15 prints the number of tiles of the resulting matrix C. 
66 | 
67 | Line 16 prints the number of nonzeros of the resulting matrix C.
68 | 
69 | Line 17 prints TileSpGEMM runtime (in milliseconds) and performance (in GFlOPs) (Figures 6 and 7 in our paper).
70 | 
71 | Line 18 prints the checking result after comparing our output with the one generated by cuSPARSE.
72 | 
73 | ## Release version
74 | Jan 3,2022 Version Alpha
75 | 
76 | 
77 | 
78 | 
79 |  
80 | 
81 | 
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/data/run142.sh:
--------------------------------------------------------------------------------
 1 | input="../datasets/mat-142.csv"
 2 | 
 3 | {
 4 |   read
 5 |   i=1
 6 |   while IFS=',' read -r Group Name rows cols nonzeros
 7 |   do
 8 |     echo "$Group $Name $rows $cols $nonzeros"
 9 | #    echo "../datasets/142mat/$Name/$Name.mtx"
10 | 	echo "/142mat/$Group/$Name/$Name.mtx"
11 | #    ./../TileSpGEMM/test -d 0 -aat 0 ../datasets/142mat/$Name/$Name.mtx
12 | 	./../TileSpGEMM/test -d 0 -aat 0 /home/ppopp22_test/MM/142mat/$Name.mtx
13 |     i=`expr $i + 1`
14 |   done 
15 | } < "$input"
16 | 


--------------------------------------------------------------------------------
/data/run18.sh:
--------------------------------------------------------------------------------
 1 | input="../datasets/mtx18.csv"
 2 | 
 3 | {
 4 |   read
 5 |   i=1
 6 |   while IFS=',' read -r Group Name rows cols nonzeros
 7 |   do
 8 |     echo "$Group $Name $rows $cols $nonzeros"
 9 | #    echo "../datasets/18mat/$Name/$Name.mtx"
10 |     echo "/18mat/$Group/$Name/$Name.mtx"
11 | #    ./../TileSpGEMM/test -d 0 -aat 0 ../datasets/18mat/$Name/$Name.mtx
12 |     ./../TileSpGEMM/test -d 0 -aat 0 /home/ppopp22_test/MM/18mat/$Name.mtx
13 |     i=`expr $i + 1`
14 |   done 
15 | } < "$input"
16 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | #compilers
 2 | CC=nvcc
 3 | 
 4 | #GLOBAL_PARAMETERS
 5 | MAT_VAL_TYPE = double
 6 | VALUE_TYPE = double
 7 | 
 8 | #CUDA_PARAMETERS
 9 | NVCC_FLAGS = -O3 -w -arch=compute_61 -code=sm_86 -gencode=arch=compute_61,code=sm_86
10 | #-gencode=arch=compute_61,code=sm_75
11 | # -m64 -Xptxas -dlcm=cg -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61
12 | #-Xcompiler -Wall -D_FORCE_INLINES -DVERBOSE --expt-extended-lambda -use_fast_math --expt-relaxed-constexpr
13 | 
14 | #ENVIRONMENT_PARAMETERS
15 | CUDA_INSTALL_PATH = /usr/local/cuda-11.4
16 | 
17 | #includes
18 | INCLUDES = -I$(CUDA_INSTALL_PATH)/include
19 | 
20 | #libs
21 | #CLANG_LIBS = -stdlib=libstdc++ -lstdc++
22 | CUDA_LIBS = -L$(CUDA_INSTALL_PATH)/lib64  -lcudart  -lcusparse
23 | LIBS = $(CUDA_LIBS)
24 | 
25 | #options
26 | #OPTIONS = -std=c99
27 | 
28 | make:
29 | 	$(CC) $(NVCC_FLAGS) -Xcompiler -fopenmp -Xcompiler -mfma main.cu -o test $(INCLUDES) $(LIBS) $(OPTIONS) -D VALUE_TYPE=$(VALUE_TYPE)
30 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
  1 | #include <time.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include <math.h>
  6 | #include <mm_malloc.h>
  7 | #include <x86intrin.h>
  8 | #include <immintrin.h>
  9 | #include <nmmintrin.h>
 10 | 
 11 | 
 12 | #include <omp.h>
 13 | 
 14 | #include <sys/time.h>
 15 | #include "cuda_fp16.h"
 16 | 
 17 | #include "utils.h"
 18 | 
 19 | #ifndef MAT_VAL_TYPE
 20 | #define MAT_VAL_TYPE double
 21 | #endif
 22 | 
 23 | #ifndef MAT_PTR_TYPE
 24 | #define MAT_PTR_TYPE int
 25 | #endif
 26 | 
 27 | #define WARP_SIZE 32
 28 | #define WARP_PER_BLOCK 4
 29 | 
 30 | #define HALFWARP_SIZE 16
 31 | #define HALFWARP_PER_BLOCK 8
 32 | 
 33 | #ifndef BLOCK_SIZE
 34 | #define BLOCK_SIZE  16
 35 | #endif
 36 | 
 37 | 
 38 | #define SMEM_TNY_TH 32
 39 | #define SMEM_SML_TH 32 //112
 40 | #define SMEM_LRG_TH 224
 41 | #define SMEM_DNS_TH 256
 42 | 
 43 | #define USE_HALFWARP 1
 44 | #define TILE_PER_WARP 16 // should not be larger than WARPSIZE
 45 | #define TILE_PER_HALFWARP 8 // should not be larger than HALFWARP_SIZE
 46 | 
 47 | //#define LOAD_MASKB_TH 4
 48 | #define VECTORIZE_NNZA_OR_NNZB_TH 8
 49 | 
 50 | #define SMEM_INTERSECTION_TH 16
 51 | #define SMEM_INTERSECTION_LEN 48
 52 | 
 53 | #define USE_GMEM_SPECULATIVE_INTERSECTION 1
 54 | #define GMEM_SPECULATIVE_INTERSECTION 1
 55 | 
 56 | #define SPECULATIVE_INTERSECTION 32
 57 | 
 58 | #define SPA_INT_PER_WARP 512
 59 | #define NUMCOLC_SPA_OR_HASH_TH     SPA_INT_PER_WARP * 32 // SPA_INT_PER_WARP int per warp
 60 | 
 61 | // e.g., INTERSECTION_SPARSE_OR_DNS_TH = 0.2 means when density is higher than 20%, use DNS for intersection
 62 | #define INTERSECTION_SPARSE_OR_DNS_TH 0.2
 63 | #define NNZTOTALA_FAST_TRACK_TH2 192
 64 | 
 65 | #define USE_DNS_THREAD 1
 66 | 
 67 | #define DEBUG 1
 68 | 
 69 | #define REPEAT_NUM 1
 70 | 
 71 | #ifndef TIMING
 72 | #define TIMING 1
 73 | #endif
 74 | 
 75 | #ifndef SPACE
 76 | #define SPACE 1
 77 | #endif
 78 | 
 79 | 
 80 | #ifndef CHECK_RESULT
 81 | #define CHECK_RESULT 1
 82 | #endif
 83 | 
 84 | #define SMEM_TNY_TH 32
 85 | #define SMEM_SML_TH 32 //112
 86 | #define SMEM_LRG_TH 224
 87 | #define SMEM_DNS_TH 256
 88 | 
 89 | #define USE_HALFWARP 1
 90 | #define TILE_PER_WARP 16 // should not be larger than WARPSIZE
 91 | #define TILE_PER_HALFWARP 8 // should not be larger than HALFWARP_SIZE
 92 | 
 93 | #define VECTORIZE_NNZA_OR_NNZB_TH 8
 94 | 
 95 | #define SMEM_INTERSECTION_TH 16
 96 | #define SMEM_INTERSECTION_LEN 48
 97 | 
 98 | #define USE_GMEM_SPECULATIVE_INTERSECTION 1
 99 | #define GMEM_SPECULATIVE_INTERSECTION 1
100 | 
101 | #define SPECULATIVE_INTERSECTION 32
102 | 
103 | #define SPA_INT_PER_WARP 512
104 | #define NUMCOLC_SPA_OR_HASH_TH     SPA_INT_PER_WARP * 32 // SPA_INT_PER_WARP int per warp
105 | 
106 | 
107 | // e.g., INTERSECTION_SPARSE_OR_DNS_TH = 0.2 means when density is higher than 20%, use DNS for intersection
108 | #define INTERSECTION_SPARSE_OR_DNS_TH 0.2
109 | #define NNZTOTALA_FAST_TRACK_TH2 192
110 | 
111 | #define USE_DNS_THREAD 0
112 | #define HASH_SCALE 107
113 | 
114 | #ifndef SMATRIX
115 | #define SMATRIX
116 | typedef struct 
117 | {
118 |     int m;
119 |     int n;
120 |     int nnz;
121 |     int isSymmetric;
122 | 	MAT_VAL_TYPE *value;
123 | 	int *columnindex;
124 | 	MAT_PTR_TYPE *rowpointer;
125 |     int tilem;
126 |     int tilen;
127 |     MAT_PTR_TYPE *tile_ptr;
128 |     int *tile_columnidx;
129 |     int *tile_rowidx;
130 |     int *tile_nnz;
131 |     int numtile;
132 |     MAT_VAL_TYPE *tile_csr_Value;
133 |     unsigned char *tile_csr_Col;
134 |     unsigned char *tile_csr_Ptr;
135 |     unsigned short *mask;
136 |     int *csc_tile_ptr;
137 |     int *csc_tile_rowidx;
138 | }SMatrix;
139 | #endif
140 | 
141 | 


--------------------------------------------------------------------------------
/src/csr2tile.h:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "utils.h"
  3 | 
  4 | /*    STEP1: Calculate the number of non-empty tile of a sparse matrix   */
  5 | /*           Record the offset of tiles in each tile row                 */
  6 | void step1_kernel(SMatrix *matrix)
  7 | 
  8 | {
  9 |     int *rowpointer = matrix->rowpointer;
 10 |     int m = matrix->m;
 11 |     int *columnidx = matrix->columnindex;
 12 |     int tilem = matrix->tilem;
 13 |     int tilen = matrix->tilen;
 14 |     MAT_PTR_TYPE *tile_ptr = matrix->tile_ptr;
 15 | 
 16 |     unsigned thread = omp_get_max_threads();
 17 |     // unsigned thread = matrix->nthreads;
 18 | 
 19 |     char *flag_g = (char *)malloc(thread * tilen * sizeof(char));
 20 | #pragma omp parallel for
 21 |     for (int blki = 0; blki < tilem; blki++)
 22 |     {
 23 |         int thread_id = omp_get_thread_num();
 24 |         char *flag = flag_g + thread_id * tilen;
 25 |         memset(flag, 0, tilen * sizeof(char));
 26 |         int start = blki * BLOCK_SIZE;
 27 |         int end = blki == tilem - 1 ? m : (blki + 1) * BLOCK_SIZE;
 28 |         for (int j = rowpointer[start]; j < rowpointer[end]; j++)
 29 |         {
 30 |             int jc = columnidx[j] / BLOCK_SIZE;
 31 |             if (flag[jc] == 0)
 32 |             {
 33 |                 flag[jc] = 1;
 34 |                 tile_ptr[blki]++;
 35 |             }
 36 |         }
 37 |     }
 38 |     free(flag_g);
 39 | }
 40 | 
 41 | /*   STEP2:  Calculate column and row index of each tile */
 42 | /*           Calculate the number of nonzeros of each tile*/
 43 | void step2_kernel(SMatrix *matrix, unsigned char *tile_csr_ptr)
 44 | 
 45 | {
 46 |     int m = matrix->m;
 47 |     int *rowpointer = matrix->rowpointer;
 48 |     int *columnidx = matrix->columnindex;
 49 | 
 50 |     int tilem = matrix->tilem;
 51 |     int tilen = matrix->tilen;
 52 |     MAT_PTR_TYPE *tile_ptr = matrix->tile_ptr;
 53 |     int *tile_columnidx = matrix->tile_columnidx;
 54 |     int *tile_rowidx = matrix->tile_rowidx;
 55 |     int *tile_nnz = matrix->tile_nnz;
 56 | 
 57 |     unsigned thread = omp_get_max_threads();
 58 |     // unsigned thread = matrix->nthreads;
 59 | 
 60 |     char *col_temp_g = (char *)malloc((thread * tilen) * sizeof(char));
 61 | 
 62 |     int *nnz_temp_g = (int *)malloc((thread * tilen) * sizeof(int));
 63 | 
 64 |     unsigned char *ptr_per_tile_g = (unsigned char *)malloc((thread * tilen * BLOCK_SIZE) * sizeof(unsigned char));
 65 | 
 66 | #pragma omp parallel for
 67 |     for (int blki = 0; blki < tilem; blki++)
 68 |     {
 69 |         int thread_id = omp_get_thread_num();
 70 |         char *col_temp = col_temp_g + thread_id * tilen;
 71 |         memset(col_temp, 0, tilen * sizeof(char));
 72 |         int *nnz_temp = nnz_temp_g + thread_id * tilen;
 73 |         memset(nnz_temp, 0, tilen * sizeof(int));
 74 |         unsigned char *ptr_per_tile = ptr_per_tile_g + thread_id * tilen * BLOCK_SIZE;
 75 |         memset(ptr_per_tile, 0, tilen * BLOCK_SIZE * sizeof(unsigned char));
 76 |         int pre_tile = tile_ptr[blki];
 77 |         int rowlen = blki == tilem - 1 ? m - (tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
 78 |         int start = blki * BLOCK_SIZE;
 79 | 
 80 |         for (int ri = 0; ri < rowlen; ri++)
 81 |         {
 82 |             for (int j = rowpointer[start + ri]; j < rowpointer[start + ri + 1]; j++)
 83 |             {
 84 |                 int jc = columnidx[j] / BLOCK_SIZE;
 85 |                 col_temp[jc] = 1;
 86 |                 nnz_temp[jc]++;
 87 |                 ptr_per_tile[jc * BLOCK_SIZE + ri]++;
 88 |             }
 89 |         }
 90 | 
 91 |         int count = 0;
 92 |         for (int blkj = 0; blkj < tilen; blkj++)
 93 |         {
 94 |             if (col_temp[blkj] == 1)
 95 |             {
 96 |                 tile_columnidx[pre_tile + count] = blkj;
 97 |                 tile_rowidx[pre_tile + count] = blki;
 98 |                 tile_nnz[pre_tile + count] = nnz_temp[blkj];
 99 |                 for (int ri = 0; ri < rowlen; ri++)
100 |                 {
101 |                     tile_csr_ptr[(pre_tile + count) * BLOCK_SIZE + ri] = ptr_per_tile[blkj * BLOCK_SIZE + ri];
102 |                 }
103 |                 count++;
104 |             }
105 |         }
106 |     }
107 |     free(col_temp_g);
108 |     free(nnz_temp_g);
109 |     free(ptr_per_tile_g);
110 | }
111 | 
112 | void step3_kernel(SMatrix *matrix, int nnz_max, int tilecnt_max)
113 | {
114 |     int *rowpointer = matrix->rowpointer;
115 |     int *columnidx = matrix->columnindex;
116 |     MAT_VAL_TYPE *value = matrix->value;
117 |     int m = matrix->m;
118 |     int n = matrix->n;
119 |     int tilem = matrix->tilem;
120 |     int tilen = matrix->tilen;
121 |     MAT_PTR_TYPE *tile_ptr = matrix->tile_ptr;
122 |     int *tile_columnidx = matrix->tile_columnidx;
123 |     int *tile_nnz = matrix->tile_nnz;
124 |     MAT_VAL_TYPE *tile_csr_Val = matrix->tile_csr_Value;
125 |     unsigned char *tile_csr_Col = matrix->tile_csr_Col;
126 |     unsigned char *tile_csr_Ptr = matrix->tile_csr_Ptr;
127 | 
128 |     unsigned short *mask = matrix->mask;
129 | 
130 |     unsigned thread = omp_get_max_threads();
131 |     // unsigned thread = matrix->nthreads;
132 | 
133 |     unsigned char *csr_colidx_temp_g = (unsigned char *)malloc((thread * nnz_max) * sizeof(unsigned char));
134 |     MAT_VAL_TYPE *csr_val_temp_g = (MAT_VAL_TYPE *)malloc((thread * nnz_max) * sizeof(MAT_VAL_TYPE));
135 |     int *tile_count_g = (int *)malloc(thread * tilecnt_max * sizeof(int));
136 | 
137 | #pragma omp parallel for
138 |     for (int blki = 0; blki < tilem; blki++)
139 |     {
140 |         int thread_id = omp_get_thread_num();
141 |         unsigned char *csr_colidx_temp = csr_colidx_temp_g + thread_id * nnz_max;
142 |         MAT_VAL_TYPE *csr_val_temp = csr_val_temp_g + thread_id * nnz_max;
143 |         int *tile_count = tile_count_g + thread_id * tilecnt_max;
144 |         memset(csr_colidx_temp, 0, (nnz_max) * sizeof(unsigned char));
145 |         memset(csr_val_temp, 0, (nnz_max) * sizeof(MAT_VAL_TYPE));
146 |         memset(tile_count, 0, (tilecnt_max) * sizeof(int));
147 |         int tilenum_per_row = tile_ptr[blki + 1] - tile_ptr[blki];
148 |         int rowlen = blki == tilem - 1 ? m - (tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
149 |         int start = blki * BLOCK_SIZE;
150 |         int end = blki == tilem - 1 ? m : (blki + 1) * BLOCK_SIZE;
151 | 
152 |         for (int blkj = rowpointer[start]; blkj < rowpointer[end]; blkj++)
153 |         {
154 |             int jc_temp = columnidx[blkj] / BLOCK_SIZE;
155 |             for (int bi = 0; bi < tilenum_per_row; bi++)
156 |             {
157 |                 int tile_id = tile_ptr[blki] + bi;
158 |                 int jc = tile_columnidx[tile_id];
159 |                 int pre_nnz = tile_nnz[tile_id] - tile_nnz[tile_ptr[blki]];
160 |                 if (jc == jc_temp)
161 |                 {
162 |                     csr_val_temp[pre_nnz + tile_count[bi]] = value[blkj];
163 |                     csr_colidx_temp[pre_nnz + tile_count[bi]] = columnidx[blkj] - jc * BLOCK_SIZE;
164 |                     tile_count[bi]++;
165 |                     break;
166 |                 }
167 |             }
168 |         }
169 |         for (int bi = 0; bi < tilenum_per_row; bi++)
170 |         {
171 |             int tile_id = tile_ptr[blki] + bi;
172 |             int tilennz = tile_nnz[tile_id + 1] - tile_nnz[tile_id];
173 |             int offset = tile_nnz[tile_id];
174 |             int pre_nnz = tile_nnz[tile_id] - tile_nnz[tile_ptr[blki]];
175 | 
176 |             unsigned char *ptr_temp = tile_csr_Ptr + tile_id * BLOCK_SIZE;
177 |             for (int ri = 0; ri < rowlen; ri++)
178 |             {
179 |                 int start = ptr_temp[ri];
180 |                 int stop = ri == rowlen - 1 ? tilennz : ptr_temp[ri + 1];
181 |                 for (int k = start; k < stop; k++)
182 |                 {
183 |                     unsigned char colidx = csr_colidx_temp[pre_nnz + k];
184 |                     tile_csr_Val[offset + k] = csr_val_temp[pre_nnz + k];
185 |                     tile_csr_Col[offset + k] = (ri << 4) + colidx;
186 |                     mask[tile_id * BLOCK_SIZE + ri] |= (0x1 << (BLOCK_SIZE - colidx - 1));
187 |                 }
188 |             }
189 |         }
190 |     }
191 |     free(csr_colidx_temp_g);
192 |     free(csr_val_temp_g);
193 |     free(tile_count_g);
194 | }
195 | 
196 | void csr2tile_row_major(SMatrix *matrix)
197 | {
198 |     int nthreads = omp_get_max_threads();
199 | 
200 |     matrix->numtile = 0;
201 |     matrix->tilem = matrix->m % BLOCK_SIZE == 0 ? matrix->m / BLOCK_SIZE : (matrix->m / BLOCK_SIZE) + 1;
202 |     matrix->tilen = matrix->n % BLOCK_SIZE == 0 ? matrix->n / BLOCK_SIZE : (matrix->n / BLOCK_SIZE) + 1;
203 | 
204 |     matrix->tile_ptr = (MAT_PTR_TYPE *)malloc((matrix->tilem + 1) * sizeof(MAT_PTR_TYPE));
205 |     memset(matrix->tile_ptr, 0, (matrix->tilem + 1) * sizeof(MAT_PTR_TYPE));
206 | 
207 |     step1_kernel(matrix);
208 |     exclusive_scan(matrix->tile_ptr, matrix->tilem + 1);
209 | 
210 |     matrix->numtile = matrix->tile_ptr[matrix->tilem];
211 | 
212 |     matrix->tile_columnidx = (int *)malloc(matrix->numtile * sizeof(int));
213 |     memset(matrix->tile_columnidx, 0, matrix->numtile * sizeof(int));
214 | 
215 |     matrix->tile_rowidx = (int *)malloc(matrix->numtile * sizeof(int));
216 |     memset(matrix->tile_rowidx, 0, matrix->numtile * sizeof(int));
217 | 
218 |     matrix->tile_nnz = (int *)malloc((matrix->numtile + 1) * sizeof(int));
219 |     memset(matrix->tile_nnz, 0, (matrix->numtile + 1) * sizeof(int));
220 | 
221 |     matrix->tile_csr_Ptr = (unsigned char *)malloc((matrix->numtile * BLOCK_SIZE) * sizeof(unsigned char));
222 |     memset(matrix->tile_csr_Ptr, 0, (matrix->numtile * BLOCK_SIZE) * sizeof(unsigned char));
223 | 
224 |     step2_kernel(matrix, matrix->tile_csr_Ptr);
225 | 
226 | #pragma omp parallel for
227 |     for (int blki = 0; blki < matrix->tilem; blki++)
228 |     {
229 |         quick_sort_key(matrix->tile_columnidx + matrix->tile_ptr[blki], matrix->tile_ptr[blki + 1] - matrix->tile_ptr[blki]);
230 |     }
231 | 
232 |     exclusive_scan(matrix->tile_nnz, matrix->numtile + 1);
233 | 
234 |     for (int i = 0; i < matrix->numtile; i++)
235 |     {
236 |         exclusive_scan_char(matrix->tile_csr_Ptr + i * BLOCK_SIZE, BLOCK_SIZE);
237 |     }
238 | 
239 |     matrix->tile_csr_Col = (unsigned char *)malloc(matrix->nnz * sizeof(unsigned char));
240 |     memset(matrix->tile_csr_Col, 0, matrix->nnz * sizeof(unsigned char));
241 | 
242 |     matrix->tile_csr_Value = (MAT_VAL_TYPE *)malloc(matrix->nnz * sizeof(MAT_VAL_TYPE));
243 |     memset(matrix->tile_csr_Value, 0, matrix->nnz * sizeof(MAT_VAL_TYPE));
244 | 
245 |     matrix->mask = (unsigned short *)malloc(matrix->numtile * BLOCK_SIZE * sizeof(unsigned short));
246 |     memset(matrix->mask, 0, matrix->numtile * BLOCK_SIZE * sizeof(unsigned short));
247 | 
248 |     int nnz_max = 0;
249 |     int tilecnt_max = 0;
250 |     for (int blki = 0; blki < matrix->tilem; blki++)
251 |     {
252 |         int start = blki * BLOCK_SIZE;
253 |         int end = blki == matrix->tilem - 1 ? matrix->m : (blki + 1) * BLOCK_SIZE;
254 |         nnz_max = nnz_max < matrix->rowpointer[end] - matrix->rowpointer[start] ? matrix->rowpointer[end] - matrix->rowpointer[start] : nnz_max;
255 |         tilecnt_max = tilecnt_max < matrix->tile_ptr[blki + 1] - matrix->tile_ptr[blki] ? matrix->tile_ptr[blki + 1] - matrix->tile_ptr[blki] : tilecnt_max;
256 |     }
257 | 
258 |     step3_kernel(matrix, nnz_max, tilecnt_max);
259 | }
260 | 
261 | void csr2tile_col_major(SMatrix *matrix)
262 | {
263 |     matrix->numtile = 0;
264 |     matrix->tilem = matrix->m % BLOCK_SIZE == 0 ? matrix->m / BLOCK_SIZE : (matrix->m / BLOCK_SIZE) + 1;
265 |     matrix->tilen = matrix->n % BLOCK_SIZE == 0 ? matrix->n / BLOCK_SIZE : (matrix->n / BLOCK_SIZE) + 1;
266 | 
267 | 	SMatrix *BT = (SMatrix *)malloc(sizeof(SMatrix));
268 | 
269 |     BT->m = matrix->n;
270 |     BT->n = matrix->m;
271 |     BT->nnz = matrix->nnz;
272 |     BT->tilem = matrix->tilen;
273 |     BT->tilen = matrix->tilem;
274 | 
275 |     BT->tile_ptr = (MAT_PTR_TYPE *)malloc((BT->tilem + 1) * sizeof(MAT_PTR_TYPE));
276 |     memset(BT->tile_ptr, 0, (BT->tilem + 1) * sizeof(MAT_PTR_TYPE));
277 | 
278 |     MAT_PTR_TYPE *cscColPtrB = (MAT_PTR_TYPE *)malloc((matrix->n + 1) * sizeof(MAT_PTR_TYPE));
279 |     int *cscRowIdxB = (int *)malloc(matrix->nnz * sizeof(int));
280 |     MAT_VAL_TYPE *cscValB = (MAT_VAL_TYPE *)malloc(matrix->nnz * sizeof(MAT_VAL_TYPE));
281 | 
282 |     matrix_transposition(matrix->m, matrix->n, matrix->nnz, matrix->rowpointer, matrix->columnindex, matrix->value, cscRowIdxB, cscColPtrB, cscValB);
283 | 
284 | 
285 |     BT->value = cscValB;
286 |     BT->columnindex = cscRowIdxB;
287 |     BT->rowpointer = cscColPtrB;
288 | 
289 | 
290 |     step1_kernel(BT);
291 |     exclusive_scan(BT->tile_ptr, BT->tilem + 1);
292 | 
293 |     BT->numtile = BT->tile_ptr[BT->tilem];
294 |     BT->tile_columnidx = (int *)malloc(BT->numtile * sizeof(int));
295 |     memset(BT->tile_columnidx, 0, BT->numtile * sizeof(int));
296 |     BT->tile_rowidx = (int *)malloc(BT->numtile * sizeof(int));
297 |     memset(BT->tile_rowidx, 0, BT->numtile * sizeof(int));
298 |     BT->tile_nnz = (int *)malloc((BT->numtile + 1) * sizeof(int));
299 |     memset(BT->tile_nnz, 0, (BT->numtile + 1) * sizeof(int));
300 |     BT->tile_csr_Ptr = (unsigned char *)malloc((BT->numtile * BLOCK_SIZE) * sizeof(unsigned char));
301 |     memset(BT->tile_csr_Ptr, 0, (BT->numtile * BLOCK_SIZE) * sizeof(unsigned char));
302 |     step2_kernel(BT, BT->tile_csr_Ptr);
303 |     exclusive_scan(BT->tile_nnz, BT->numtile + 1);
304 | 
305 |     // for (int i=0; i < BT->numtile; i ++)
306 |     // {
307 |     //     printf("tileid = %i, col = %i\n", i, BT->tile_columnidx[i]);
308 |     // }
309 | 
310 |     matrix->tile_ptr = (MAT_PTR_TYPE *)malloc((matrix->tilem + 1) * sizeof(MAT_PTR_TYPE));
311 |     memset(matrix->tile_ptr, 0, (matrix->tilem + 1) * sizeof(MAT_PTR_TYPE));
312 | 
313 |     step1_kernel(matrix);
314 |     exclusive_scan(matrix->tile_ptr, matrix->tilem + 1);
315 |     matrix->numtile = matrix->tile_ptr[matrix->tilem];
316 |     matrix->tile_columnidx = (int *)malloc(matrix->numtile * sizeof(int));
317 |     memset(matrix->tile_columnidx, 0, matrix->numtile * sizeof(int));
318 |     matrix->tile_rowidx = (int *)malloc(matrix->numtile * sizeof(int));
319 |     memset(matrix->tile_rowidx, 0, matrix->numtile * sizeof(int));
320 |     // matrix->tile_nnz = (int *)malloc((matrix->numtile + 1) * sizeof(int));
321 |     // memset(matrix->tile_nnz, 0, (matrix->numtile + 1) * sizeof(int));
322 |     // matrix->csc_tile_ptr = (MAT_PTR_TYPE *)malloc((matrix->tilen + 1) * sizeof(MAT_PTR_TYPE));
323 |     // memset(matrix->csc_tile_ptr, 0, (matrix->tilen + 1) * sizeof(MAT_PTR_TYPE));
324 |     // matrix->csc_tile_rowidx = (int *)malloc((matrix->numtile) * sizeof(int));
325 |     // memset(matrix->csc_tile_rowidx, 0, (matrix->numtile) * sizeof(int));
326 | 
327 |     matrix->csc_tile_ptr = BT->tile_ptr;
328 |     matrix->csc_tile_rowidx = BT->tile_columnidx;
329 |     matrix->tile_nnz = BT->tile_nnz;
330 | 
331 |     char *flag = (char *)malloc(matrix->tilen * sizeof(char));
332 | 
333 |     int colid = 0;
334 |     for (int i = 0; i < matrix->tilem; i++)
335 |     {
336 |         memset(flag, 0, matrix->tilen * sizeof(char));
337 |         int start = i * BLOCK_SIZE;
338 |         int end = i == matrix->tilem - 1 ? matrix->m : (i + 1) * BLOCK_SIZE;
339 |         for (int j = matrix->rowpointer[start]; j < matrix->rowpointer[end]; j++)
340 |         {
341 |             int jc = matrix->columnindex[j] / BLOCK_SIZE;
342 |             if (flag[jc] == 0)
343 |             {
344 |                 flag[jc] = 1;
345 |                 matrix->tile_columnidx[colid] = jc;
346 |                 colid++;
347 |             }
348 |         }
349 |     }
350 | 
351 | 
352 | 
353 |     matrix->tile_csr_Ptr = (unsigned char *)malloc((matrix->numtile * BLOCK_SIZE) * sizeof(unsigned char));
354 |     memset(matrix->tile_csr_Ptr, 0, (matrix->numtile * BLOCK_SIZE) * sizeof(unsigned char));
355 | 
356 |     matrix->tile_csr_Col = (unsigned char *)malloc(matrix->nnz * sizeof(unsigned char));
357 |     memset(matrix->tile_csr_Col, 0, matrix->nnz * sizeof(unsigned char));
358 | 
359 |     matrix->tile_csr_Value = (MAT_VAL_TYPE *)malloc(matrix->nnz * sizeof(MAT_VAL_TYPE));
360 |     memset(matrix->tile_csr_Value, 0, matrix->nnz * sizeof(MAT_VAL_TYPE));
361 | 
362 |     matrix->mask = (unsigned short *)malloc(matrix->numtile * BLOCK_SIZE * sizeof(unsigned short));
363 |     memset(matrix->mask, 0, matrix->numtile * BLOCK_SIZE * sizeof(unsigned short));
364 | 
365 | 
366 | #pragma omp parallel for
367 |     for (int blki = 0; blki < matrix->tilem; blki++)
368 |     {
369 |         quick_sort_key(matrix->tile_columnidx + matrix->tile_ptr[blki], matrix->tile_ptr[blki + 1] - matrix->tile_ptr[blki]);
370 |     }
371 | 
372 |     for (int blki = 0; blki < matrix->tilen; blki++)
373 |     {
374 |         int colbnum = matrix->csc_tile_ptr[blki + 1] - matrix->csc_tile_ptr[blki];
375 |         SMatrix *subrowmatrixB_trans = (SMatrix *)malloc(colbnum * sizeof(SMatrix));
376 | 
377 |         int rowlength = blki == matrix->tilen - 1 ? matrix->n - (matrix->tilen - 1) * BLOCK_SIZE : BLOCK_SIZE;
378 | 
379 |         int start = blki * BLOCK_SIZE;
380 |         int end = blki == matrix->tilen - 1 ? matrix->n : (blki + 1) * BLOCK_SIZE;
381 | 
382 |         for (int bi = 0; bi < colbnum; bi++)
383 |         {
384 |             int tile_id = matrix->csc_tile_ptr[blki] + bi;
385 |             int tilennz = matrix->tile_nnz[tile_id + 1] - matrix->tile_nnz[tile_id];
386 |             subrowmatrixB_trans[bi].value = (MAT_VAL_TYPE *)malloc(tilennz * sizeof(MAT_VAL_TYPE));
387 |             subrowmatrixB_trans[bi].columnindex = (int *)malloc(tilennz * sizeof(int));
388 | 
389 |             subrowmatrixB_trans[bi].rowpointer = (MAT_PTR_TYPE *)malloc((rowlength + 1) * sizeof(MAT_PTR_TYPE));
390 |             memset(subrowmatrixB_trans[bi].rowpointer, 0, (rowlength + 1) * sizeof(MAT_PTR_TYPE));
391 |         }
392 | 
393 |         int *num = (int *)malloc((colbnum) * sizeof(int));
394 |         memset(num, 0, (colbnum) * sizeof(int));
395 | 
396 |         for (int ri = 0; ri < rowlength; ri++)
397 |         {
398 |             for (int j = cscColPtrB[start + ri]; j < cscColPtrB[start + ri + 1]; j++)
399 |             {
400 |                 int ki;
401 |                 for (int k = matrix->csc_tile_ptr[blki], ki = 0; k < matrix->csc_tile_ptr[blki + 1], ki < colbnum; k++, ki++)
402 |                 {
403 |                     int kcstart = matrix->csc_tile_rowidx[k] * BLOCK_SIZE;
404 |                     int kcend = matrix->csc_tile_rowidx[k] == (matrix->m - 1) ? matrix->m : (matrix->csc_tile_rowidx[k] + 1) * BLOCK_SIZE;
405 |                     if (cscRowIdxB[j] >= kcstart && cscRowIdxB[j] < kcend)
406 |                     {
407 |                         num[ki]++;
408 |                         subrowmatrixB_trans[ki].value[num[ki] - 1] = cscValB[j];
409 |                         subrowmatrixB_trans[ki].columnindex[num[ki] - 1] = cscRowIdxB[j] - matrix->csc_tile_rowidx[k] * BLOCK_SIZE;
410 |                         break;
411 |                     }
412 |                 }
413 |             }
414 |             for (int bi = 0; bi < colbnum; bi++)
415 |             {
416 |                 subrowmatrixB_trans[bi].rowpointer[ri + 1] = num[bi];
417 |             }
418 |         }
419 |         //transpose submatrix
420 |         SMatrix *subrowmatrixB = (SMatrix *)malloc(colbnum * sizeof(SMatrix));
421 |         for (int bi = 0; bi < colbnum; bi++)
422 |         {
423 |             int tileid = matrix->csc_tile_ptr[blki] + bi;
424 |             int tilennz = matrix->tile_nnz[tileid + 1] - matrix->tile_nnz[tileid];
425 |             int collength = matrix->csc_tile_rowidx[tileid] == matrix->tilem - 1 ? matrix->m - (matrix->tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
426 |             subrowmatrixB[bi].value = (MAT_VAL_TYPE *)malloc((tilennz) * sizeof(MAT_VAL_TYPE));
427 |             subrowmatrixB[bi].columnindex = (int *)malloc((tilennz) * sizeof(int));
428 | 
429 |             subrowmatrixB[bi].rowpointer = (MAT_PTR_TYPE *)malloc((collength + 1) * sizeof(MAT_PTR_TYPE));
430 |             memset(subrowmatrixB[bi].rowpointer, 0, (collength + 1) * sizeof(MAT_PTR_TYPE));
431 |         }
432 |         for (int bi = 0; bi < colbnum; bi++)
433 |         {
434 |             int tileid = matrix->csc_tile_ptr[blki] + bi;
435 |             int tilennz = matrix->tile_nnz[tileid + 1] - matrix->tile_nnz[tileid];
436 |             int collength = matrix->csc_tile_rowidx[tileid] == matrix->tilem - 1 ? matrix->m - (matrix->tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
437 |             matrix_transposition(rowlength, collength, tilennz,
438 |                                  subrowmatrixB_trans[bi].rowpointer, subrowmatrixB_trans[bi].columnindex, subrowmatrixB_trans[bi].value,
439 |                                  subrowmatrixB[bi].columnindex, subrowmatrixB[bi].rowpointer, subrowmatrixB[bi].value);
440 |         }
441 |         for (int bi = 0; bi < colbnum; bi++)
442 |         {
443 |             int tileid = matrix->csc_tile_ptr[blki] + bi;
444 |             int tilennz = matrix->tile_nnz[tileid + 1] - matrix->tile_nnz[tileid];
445 |             int prennz = matrix->tile_nnz[tileid];
446 |             int collength = matrix->csc_tile_rowidx[tileid] == matrix->tilem - 1 ? matrix->m - (matrix->tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
447 |             //CSR val&col
448 |             for (int bri = 0; bri < collength; bri++)
449 |             {
450 |                 for (int k = subrowmatrixB[bi].rowpointer[bri]; k < subrowmatrixB[bi].rowpointer[bri + 1]; k++)
451 |                 {
452 |                     int colidx = subrowmatrixB[bi].columnindex[k];
453 |                     matrix->tile_csr_Value[prennz + k] = subrowmatrixB[bi].value[k];
454 |                     matrix->mask[tileid * BLOCK_SIZE + bri] |= (0x1 << (BLOCK_SIZE - colidx - 1));
455 |                     matrix->tile_csr_Col[prennz + k] = subrowmatrixB[bi].columnindex[k];
456 |                 }
457 |                 matrix->tile_csr_Ptr[tileid * BLOCK_SIZE + bri] = subrowmatrixB[bi].rowpointer[bri];
458 |             }
459 | 
460 |             for (int jid = collength; jid < BLOCK_SIZE; jid++)
461 |             {
462 |                 matrix->tile_csr_Ptr[tileid * BLOCK_SIZE + jid] = subrowmatrixB[bi].rowpointer[collength];
463 |             }
464 |         }
465 |         for (int bi = 0; bi < colbnum; bi++)
466 |         {
467 |             free(subrowmatrixB[bi].value);
468 |             free(subrowmatrixB[bi].columnindex);
469 |             free(subrowmatrixB[bi].rowpointer);
470 |             free(subrowmatrixB_trans[bi].value);
471 |             free(subrowmatrixB_trans[bi].columnindex);
472 |             free(subrowmatrixB_trans[bi].rowpointer);
473 |         }
474 |         free(subrowmatrixB);
475 |         free(subrowmatrixB_trans);
476 |         free(num);
477 |     }
478 | }
479 | 
480 | 
481 | void matrix_destroy(SMatrix *matrix)
482 | {
483 |     free(matrix->tile_ptr);
484 |     free(matrix->tile_columnidx);
485 |     free(matrix->tile_nnz);
486 |     free(matrix->tile_csr_Value);
487 |     free(matrix->tile_csr_Col);
488 |     free(matrix->tile_csr_Ptr);
489 |     free(matrix->mask);
490 | 
491 | 
492 | }
493 | 


--------------------------------------------------------------------------------
/src/external/cusparse/Makefile:
--------------------------------------------------------------------------------
 1 | #compilers
 2 | CC=nvcc
 3 | 
 4 | #GLOBAL_PARAMETERS
 5 | VALUE_TYPE = double
 6 | 
 7 | #CUDA_PARAMETERS
 8 | #-Xptxas -dlcm=cg
 9 | NVCC_FLAGS = -O3 -w -m64 -gencode=arch=compute_80,code=sm_86 -Xptxas -dlcm=cg
10 | #ENVIRONMENT_PARAMETERS
11 | CUDA_INSTALL_PATH = /usr/local/cuda-11.5
12 | SEGSORTMACRO+=-D_ARCH80
13 | 
14 | #includes
15 | INCLUDES = -I$(CUDA_INSTALL_PATH)/include
16 | 
17 | #libs
18 | #CLANG_LIBS = -stdlib=libstdc++ -lstdc++
19 | CUDA_LIBS = -L$(CUDA_INSTALL_PATH)/lib64  -lcudart -lcusparse
20 | LIBS = $(CUDA_LIBS)
21 | 
22 | #options
23 | #OPTIONS = -std=c99
24 | 
25 | make:
26 | 	$(CC) $(NVCC_FLAGS) -Xcompiler -fopenmp -Xcompiler -mavx -Xcompiler -mavx2 -Xcompiler -mfma -lm main.cu -o spgemm $(INCLUDES) $(LIBS) $(OPTIONS) -D VALUE_TYPE=$(VALUE_TYPE) $(SEGSORTMACRO)
27 | 


--------------------------------------------------------------------------------
/src/external/cusparse/common.h:
--------------------------------------------------------------------------------
 1 | #include <time.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <string.h>
 5 | #include <math.h>
 6 | 
 7 | #include <sys/time.h>
 8 | #include <mm_malloc.h>
 9 | #include <x86intrin.h>
10 | #include <immintrin.h>
11 | #include <nmmintrin.h>
12 | #include <omp.h>
13 | 
14 | 
15 | #ifndef VALUE_TYPE
16 | #define VALUE_TYPE double
17 | #endif
18 | 
19 | #ifndef BENCH_REPEAT
20 | #define BENCH_REPEAT 1
21 | #endif
22 | 
23 | #ifndef WARP_SIZE
24 | #define WARP_SIZE   32
25 | #endif
26 | 
27 | #ifndef WARP_PER_BLOCK
28 | #define WARP_PER_BLOCK   16
29 | #endif
30 | 
31 | 


--------------------------------------------------------------------------------
/src/external/cusparse/main.cu:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "mmio_highlevel.h"
  3 | #include "utils.h"
  4 | #include "tranpose.h"
  5 | 
  6 | #include "spgemm_serialref_esc.h"
  7 | #include "spgemm_cusparse.h"
  8 | #include "spgemm_serialref_spa_new.h"
  9 | 
 10 | int main(int argc, char ** argv)
 11 | {
 12 |     // report precision of floating-point
 13 |     printf("---------------------------------------------------------------\n");
 14 |     char  *precision;
 15 |     if (sizeof(VALUE_TYPE) == 4)
 16 |     {
 17 |         precision = (char *)"32-bit Single Precision";
 18 |     }
 19 |     else if (sizeof(VALUE_TYPE) == 8)
 20 |     {
 21 |         precision = (char *)"64-bit Double Precision";
 22 |     }
 23 |     else
 24 |     {
 25 |         printf("Wrong precision. Program exit!\n");
 26 |         return 0;
 27 |     }
 28 | 
 29 |     printf("PRECISION = %s\n", precision);
 30 |     printf("Benchmark REPEAT = %i\n", BENCH_REPEAT);
 31 |     printf("---------------------------------------------------------------\n");
 32 | 
 33 |     int mA, nA, nnzA, isSymmetricA;
 34 |     int *csrRowPtrA;
 35 |     int *csrColIdxA;
 36 |     VALUE_TYPE *csrValA;
 37 | 
 38 |     int mB, nB, nnzB, isSymmetricB;
 39 |     int *csrRowPtrB;
 40 |     int *csrColIdxB;
 41 |     VALUE_TYPE *csrValB;
 42 | 
 43 |     int device_id = 0;
 44 |     bool check_result = 0;
 45 | 
 46 |     // "Usage: ``./spgemm -d 0 -check 0 A.mtx B.mtx'' for AB=C on device 0, no check"
 47 |     int argi = 1;
 48 | 
 49 |     // load device id
 50 |     char *devstr;
 51 |     if(argc > argi)
 52 |     {
 53 |         devstr = argv[argi];
 54 |         argi++;
 55 |     }
 56 | 
 57 |     if (strcmp(devstr, "-d") != 0) return 0;
 58 | 
 59 |     if(argc > argi)
 60 |     {
 61 |         device_id = atoi(argv[argi]);
 62 |         argi++;
 63 |     }
 64 |     printf("device_id = %i\n", device_id);
 65 | 
 66 |     // load device id
 67 |     char *checkstr;
 68 |     if(argc > argi)
 69 |     {
 70 |         checkstr = argv[argi];
 71 |         argi++;
 72 |     }
 73 | 
 74 |     if (strcmp(checkstr, "-check") != 0) return 0;
 75 | 
 76 |     if(argc > argi)
 77 |     {
 78 |         check_result = atoi(argv[argi]);
 79 |         argi++;
 80 |     }
 81 |     printf("check_result = %i\n", check_result);
 82 | 
 83 |     // load matrix A data from file
 84 |     char  *filenameA;
 85 |     if(argc > argi)
 86 |     {
 87 |         filenameA = argv[argi];
 88 |         argi++;
 89 |     }
 90 |     printf("A: -------------- %s --------------\n", filenameA);
 91 | 
 92 |     // load mtx A data to the csr format
 93 |     srand(time(NULL));
 94 |     mmio_info(&mA, &nA, &nnzA, &isSymmetricA, filenameA);
 95 |     csrRowPtrA = (int *)malloc((mA+1) * sizeof(int));
 96 |     csrColIdxA = (int *)malloc(nnzA * sizeof(int));
 97 |     csrValA    = (VALUE_TYPE *)malloc(nnzA * sizeof(VALUE_TYPE));
 98 |     mmio_data(csrRowPtrA, csrColIdxA, csrValA, filenameA);
 99 |     for (int i = 0; i < nnzA; i++) csrValA[i] = ( rand() % 9 ) + 1;
100 |     printf("input matrix A: ( %i, %i ) nnz = %i\n", mA, nA, nnzA);
101 | 
102 |     // keep each column sort
103 |     /*for (int i = 0; i < mA; i++)
104 |     {
105 |         quick_sort_key_val_pair<int, VALUE_TYPE>(&csrColIdxA[csrRowPtrA[i]],
106 |                                                  &csrValA[csrRowPtrA[i]],
107 |                                                  csrRowPtrA[i+1]-csrRowPtrA[i]);
108 |     }*/
109 | 
110 |     // load matrix B data from file
111 |     char  *filenameB;
112 |     if(argc > argi)
113 |     {
114 |         filenameB = argv[argi];
115 |         argi++;
116 |     }
117 |     printf("B: -------------- %s --------------\n", filenameB);
118 | 
119 |     // load mtx B data to the csr format
120 |     mmio_info(&mB, &nB, &nnzB, &isSymmetricB, filenameB);
121 |     csrRowPtrB = (int *)malloc((mB+1) * sizeof(int));
122 |     csrColIdxB = (int *)malloc(nnzB * sizeof(int));
123 |     csrValB    = (VALUE_TYPE *)malloc(nnzB * sizeof(VALUE_TYPE));
124 |     mmio_data(csrRowPtrB, csrColIdxB, csrValB, filenameB);
125 |     for (int i = 0; i < nnzB; i++) csrValB[i] = ( rand() % 9 ) + 1;
126 |     printf("input matrix B: ( %i, %i ) nnz = %i\n", mB, nB, nnzB);
127 |     
128 |     //if (isSymmetricB) {printf("B is symmetric, no need to compute AA. Exit\n"); return 0;}
129 |     //    if (nA != mB) {printf("nA != mB, cannot compute AA. Exit\n"); return 0;}
130 | 
131 |     // keep each column sort
132 |     /*for (int i = 0; i < mB; i++)
133 |     {
134 |         quick_sort_key_val_pair<int, VALUE_TYPE>(&csrColIdxB[csrRowPtrB[i]],
135 |                                                  &csrValB[csrRowPtrB[i]],
136 |                                                  csrRowPtrB[i+1]-csrRowPtrB[i]);
137 |     }*/
138 | 
139 |         int *csrRowPtrBT = (int *)malloc((nB+1) * sizeof(int));
140 |     int *csrColIdxBT = (int *)malloc(nnzB * sizeof(int));
141 |     VALUE_TYPE *csrValBT    = (VALUE_TYPE *)malloc(nnzB * sizeof(VALUE_TYPE));
142 | 
143 |    // matrix_transposition(mB, nB, nnzB, csrRowPtrB, csrColIdxB, csrValB,
144 |    //                      csrColIdxBT, csrRowPtrBT, csrValBT);
145 |    // mB = nA;
146 |    // nB = mA;
147 |    
148 |    
149 |    // free(csrColIdxB);
150 |    // free(csrValB);
151 |    // free(csrRowPtrB);
152 | 
153 |    // csrColIdxB = csrColIdxBT;
154 |    // csrValB = csrValBT;
155 |    // csrRowPtrB = csrRowPtrBT;
156 |     
157 | 
158 |     // calculate bytes and flops consumed
159 |     unsigned long long int nnzCub = 0;
160 |     for (int i = 0; i < nnzA; i++)
161 |     {
162 |         int rowB = csrColIdxA[i];
163 |         nnzCub += csrRowPtrB[rowB + 1] - csrRowPtrB[rowB];
164 |     }
165 |     double flops = 2 * nnzCub; // flop mul-add for each nonzero entry
166 |     printf("SpGEMM flops = %lld.\n", nnzCub);
167 | 
168 |     int mC = mA;
169 |     int nC = nB;
170 |     int nnzC_golden = 0;
171 |     int *csrRowPtrC_golden;
172 |     int *csrColIdxC_golden;
173 |     VALUE_TYPE *csrValC_golden;
174 | 
175 |     struct timeval t1, t2;
176 | 
177 |     // run serial (ESC) SpGEMM as a reference
178 |     if (check_result)
179 |     {
180 |         // printf("--------------------------------------------ESC-SPGEMM-SERIAL--\n");
181 |         printf("--------------------------------------------SPA-SPGEMM-PARALLEL--\n");
182 | 
183 |         mC = mA;
184 |         nC = nB;
185 |         nnzC_golden = 0;
186 | 
187 |         // malloc d_csrRowPtrC
188 |         csrRowPtrC_golden = (int *)malloc((mC+1) * sizeof(int));
189 |         memset(csrRowPtrC_golden, 0, (mC+1) * sizeof(int));
190 | 
191 |         gettimeofday(&t1, NULL);
192 | 
193 |         // spgemm_serialref(csrRowPtrA, csrColIdxA, csrValA, mA, nA, nnzA,
194 |         //                  csrRowPtrB, csrColIdxB, csrValB, mB, nB, nnzB,
195 |         //                  csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden, mC, nC, &nnzC_golden, true);
196 |         spgemm_spa(csrRowPtrA, csrColIdxA, csrValA, mA, nA, nnzA,
197 |                          csrRowPtrB, csrColIdxB, csrValB, mB, nB, nnzB,
198 |                          csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden, mC, nC, &nnzC_golden, 1);
199 | 
200 |         printf("Serial ref nnzC = %i, compression rate is %f\n",
201 |                nnzC_golden, (double)nnzCub/(double)nnzC_golden);
202 |         csrColIdxC_golden = (int *)malloc(nnzC_golden * sizeof(int));
203 |         csrValC_golden    = (VALUE_TYPE *)malloc(nnzC_golden * sizeof(VALUE_TYPE));
204 | 
205 |         double bytes =
206 |                 sizeof(int) * (mA+1) + (sizeof(int) + sizeof(VALUE_TYPE)) * nnzA +       // data loaded from A
207 |                 sizeof(int) * (mB+1) + (sizeof(int) + sizeof(VALUE_TYPE)) * nnzB +       // data loaded from B
208 |                 sizeof(int) * (mC+1) + (sizeof(int) + sizeof(VALUE_TYPE)) * nnzC_golden; // data written back for C
209 | 
210 |         spgemm_spa(csrRowPtrA, csrColIdxA, csrValA, mA, nA, nnzA,
211 |                         csrRowPtrB, csrColIdxB, csrValB, mB, nB, nnzB,
212 |                         csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden, mC, nC, &nnzC_golden, 0);
213 | 
214 |         // spgemm_serialref(csrRowPtrA, csrColIdxA, csrValA, mA, nA, nnzA,
215 |         //                  csrRowPtrB, csrColIdxB, csrValB, mB, nB, nnzB,
216 |         //                  csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden, mC, nC, &nnzC_golden, false);
217 | 
218 |         gettimeofday(&t2, NULL);
219 |         double time_spgemm_serialref = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
220 |         printf("Serial SpGEMM takes %4.2f ms, %4.2f GFlop/s, %4.2f GB/s\n",
221 |                time_spgemm_serialref, (1e-6*flops)/time_spgemm_serialref,
222 |                (1e-6*bytes)/time_spgemm_serialref);
223 |     }
224 | 
225 |     // set device
226 |     cudaSetDevice(device_id);
227 |     cudaDeviceProp deviceProp;
228 |     cudaGetDeviceProperties(&deviceProp, device_id);
229 | 
230 |     printf("---------------------------------------------------------------\n");
231 |     printf("Device [ %i ] %s @ %4.2f MHz\n",
232 |            device_id, deviceProp.name, deviceProp.clockRate * 1e-3f);
233 | 
234 |     // run cuda SpGEMM (using cuSPARSE)
235 |     printf("\n--------------- SpGEMM (using cuSPARSE) ---------------\n");
236 |         unsigned long long int nnzC = 0;
237 |     double compression_rate1 = 0;
238 |         double time_cusparse = 0;
239 |     double gflops_cusparse = 0;
240 |     int flag =0;
241 |     spgemm_cusparse(mA, nA, nnzA, csrRowPtrA, csrColIdxA, csrValA,
242 |                  mB, nB, nnzB, csrRowPtrB, csrColIdxB, csrValB,
243 |                  mC, nC, nnzC_golden, csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden,
244 |                  check_result, nnzCub, &nnzC, &compression_rate1, &time_cusparse, &gflops_cusparse,&flag);
245 |     printf("---------------------------------------------------------------\n");
246 | 
247 |     // write results to text (scv) file
248 |     if (gflops_cusparse > 0 && gflops_cusparse > 0)
249 |     {
250 |     FILE *fout = fopen("results.csv", "a");
251 |     if (fout == NULL)
252 |         printf("Writing results fails.\n");
253 |     fprintf(fout, "%s,%s,%i,%i,%i,%lld,%lld,%f,%f,%f,%i\n",
254 |             filenameA, filenameB, mA, nA, nnzA, nnzCub, nnzC, compression_rate1, time_cusparse, gflops_cusparse,flag);
255 |     fclose(fout);
256 |     }
257 | 
258 |     // done!
259 |     free(csrColIdxA);
260 |     free(csrValA);
261 |     free(csrRowPtrA);
262 | 
263 |     free(csrColIdxB);
264 |     free(csrValB);
265 |     free(csrRowPtrB);
266 | 
267 |     if (check_result)
268 |     {
269 |         free(csrRowPtrC_golden);
270 |         free(csrColIdxC_golden);
271 |         free(csrValC_golden);
272 |     }
273 | 
274 |     return 0;
275 | }
276 | 


--------------------------------------------------------------------------------
/src/external/cusparse/mmio.h:
--------------------------------------------------------------------------------
  1 | /* 
  2 | *   Matrix Market I/O library for ANSI C
  3 | *
  4 | *   See http://math.nist.gov/MatrixMarket for details.
  5 | *
  6 | *
  7 | */
  8 | 
  9 | #include <stdio.h>
 10 | #include <string.h>
 11 | #include <stdlib.h>
 12 | #include <ctype.h>
 13 | 
 14 | #ifndef MM_IO_H
 15 | #define MM_IO_H
 16 | 
 17 | #define MM_MAX_LINE_LENGTH 1025
 18 | #define MatrixMarketBanner "%%MatrixMarket"
 19 | #define MM_MAX_TOKEN_LENGTH 64
 20 | 
 21 | typedef char MM_typecode[4];
 22 | 
 23 | char *mm_typecode_to_str(MM_typecode matcode);
 24 | 
 25 | int mm_read_banner(FILE *f, MM_typecode *matcode);
 26 | int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
 27 | int mm_read_mtx_array_size(FILE *f, int *M, int *N);
 28 | 
 29 | int mm_write_banner(FILE *f, MM_typecode matcode);
 30 | int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
 31 | int mm_write_mtx_array_size(FILE *f, int M, int N);
 32 | 
 33 | 
 34 | /********************* MM_typecode query fucntions ***************************/
 35 | 
 36 | #define mm_is_matrix(typecode)	((typecode)[0]=='M')
 37 | 
 38 | #define mm_is_sparse(typecode)	((typecode)[1]=='C')
 39 | #define mm_is_coordinate(typecode)((typecode)[1]=='C')
 40 | #define mm_is_dense(typecode)	((typecode)[1]=='A')
 41 | #define mm_is_array(typecode)	((typecode)[1]=='A')
 42 | 
 43 | #define mm_is_complex(typecode)	((typecode)[2]=='C')
 44 | #define mm_is_real(typecode)		((typecode)[2]=='R')
 45 | #define mm_is_pattern(typecode)	((typecode)[2]=='P')
 46 | #define mm_is_integer(typecode) ((typecode)[2]=='I')
 47 | 
 48 | #define mm_is_symmetric(typecode)((typecode)[3]=='S')
 49 | #define mm_is_general(typecode)	((typecode)[3]=='G')
 50 | #define mm_is_skew(typecode)	((typecode)[3]=='K')
 51 | #define mm_is_hermitian(typecode)((typecode)[3]=='H')
 52 | 
 53 | int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
 54 | 
 55 | 
 56 | /********************* MM_typecode modify fucntions ***************************/
 57 | 
 58 | #define mm_set_matrix(typecode)	((*typecode)[0]='M')
 59 | #define mm_set_coordinate(typecode)	((*typecode)[1]='C')
 60 | #define mm_set_array(typecode)	((*typecode)[1]='A')
 61 | #define mm_set_dense(typecode)	mm_set_array(typecode)
 62 | #define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
 63 | 
 64 | #define mm_set_complex(typecode)((*typecode)[2]='C')
 65 | #define mm_set_real(typecode)	((*typecode)[2]='R')
 66 | #define mm_set_pattern(typecode)((*typecode)[2]='P')
 67 | #define mm_set_integer(typecode)((*typecode)[2]='I')
 68 | 
 69 | 
 70 | #define mm_set_symmetric(typecode)((*typecode)[3]='S')
 71 | #define mm_set_general(typecode)((*typecode)[3]='G')
 72 | #define mm_set_skew(typecode)	((*typecode)[3]='K')
 73 | #define mm_set_hermitian(typecode)((*typecode)[3]='H')
 74 | 
 75 | #define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
 76 |                                     (*typecode)[2]=' ',(*typecode)[3]='G')
 77 | 
 78 | #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
 79 | 
 80 | 
 81 | /********************* Matrix Market error codes ***************************/
 82 | 
 83 | 
 84 | #define MM_COULD_NOT_READ_FILE	11
 85 | #define MM_PREMATURE_EOF		12
 86 | #define MM_NOT_MTX				13
 87 | #define MM_NO_HEADER			14
 88 | #define MM_UNSUPPORTED_TYPE		15
 89 | #define MM_LINE_TOO_LONG		16
 90 | #define MM_COULD_NOT_WRITE_FILE	17
 91 | 
 92 | 
 93 | /******************** Matrix Market internal definitions ********************
 94 | 
 95 |    MM_matrix_typecode: 4-character sequence
 96 | 
 97 |                     ojbect 		sparse/   	data        storage
 98 |                                 dense     	type        scheme
 99 | 
100 |    string position:	 [0]        [1]			[2]         [3]
101 | 
102 |    Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
103 |                                 A(array)	C(omplex)   H(ermitian)
104 |                                             P(attern)   S(ymmetric)
105 |                                             I(nteger)	K(kew)
106 | 
107 |  ***********************************************************************/
108 | 
109 | #define MM_MTX_STR		"matrix"
110 | #define MM_ARRAY_STR	"array"
111 | #define MM_DENSE_STR	"array"
112 | #define MM_COORDINATE_STR "coordinate"
113 | #define MM_SPARSE_STR	"coordinate"
114 | #define MM_COMPLEX_STR	"complex"
115 | #define MM_REAL_STR		"real"
116 | #define MM_INT_STR		"integer"
117 | #define MM_GENERAL_STR  "general"
118 | #define MM_SYMM_STR		"symmetric"
119 | #define MM_HERM_STR		"hermitian"
120 | #define MM_SKEW_STR		"skew-symmetric"
121 | #define MM_PATTERN_STR  "pattern"
122 | 
123 | 
124 | /*  high level routines */
125 | 
126 | int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
127 |          double val[], MM_typecode matcode);
128 | int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
129 |         double val[], MM_typecode matcode);
130 | int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
131 |             MM_typecode matcode);
132 | 
133 | int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
134 |                 double **val_, int **I_, int **J_);
135 | 
136 | char *mm_strdup(const char *s)
137 | {
138 |     int len = strlen(s);
139 |     char *s2 = (char *) malloc((len+1)*sizeof(char));
140 |     return strcpy(s2, s);
141 | }
142 | 
143 | char  *mm_typecode_to_str(MM_typecode matcode)
144 | {
145 |     char buffer[MM_MAX_LINE_LENGTH];
146 |     char *types[4];
147 |     char *mm_strdup(const char *);
148 |     //int error =0;
149 | 
150 |     /* check for MTX type */
151 |     if (mm_is_matrix(matcode))
152 |         types[0] = (char *)MM_MTX_STR;
153 |     //else
154 |     //    error=1;
155 | 
156 |     /* check for CRD or ARR matrix */
157 |     if (mm_is_sparse(matcode))
158 |         types[1] = (char *)MM_SPARSE_STR;
159 |     else
160 |     if (mm_is_dense(matcode))
161 |         types[1] = (char *)MM_DENSE_STR;
162 |     else
163 |         return NULL;
164 | 
165 |     /* check for element data type */
166 |     if (mm_is_real(matcode))
167 |         types[2] = (char *)MM_REAL_STR;
168 |     else
169 |     if (mm_is_complex(matcode))
170 |         types[2] = (char *)MM_COMPLEX_STR;
171 |     else
172 |     if (mm_is_pattern(matcode))
173 |         types[2] = (char *)MM_PATTERN_STR;
174 |     else
175 |     if (mm_is_integer(matcode))
176 |         types[2] = (char *)MM_INT_STR;
177 |     else
178 |         return NULL;
179 | 
180 | 
181 |     /* check for symmetry type */
182 |     if (mm_is_general(matcode))
183 |         types[3] = (char *)MM_GENERAL_STR;
184 |     else
185 |     if (mm_is_symmetric(matcode))
186 |         types[3] = (char *)MM_SYMM_STR;
187 |     else
188 |     if (mm_is_hermitian(matcode))
189 |         types[3] = (char *)MM_HERM_STR;
190 |     else
191 |     if (mm_is_skew(matcode))
192 |         types[3] = (char *)MM_SKEW_STR;
193 |     else
194 |         return NULL;
195 | 
196 |     sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
197 |     return mm_strdup(buffer);
198 | 
199 | }
200 | 
201 | int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
202 |         double **val, MM_typecode *matcode)
203 | {
204 |     int ret_code;
205 |     FILE *f;
206 | 
207 |     if (strcmp(fname, "stdin") == 0) f=stdin;
208 |     else
209 |     if ((f = fopen(fname, "r")) == NULL)
210 |         return MM_COULD_NOT_READ_FILE;
211 | 
212 | 
213 |     if ((ret_code = mm_read_banner(f, matcode)) != 0)
214 |         return ret_code;
215 | 
216 |     if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
217 |             mm_is_matrix(*matcode)))
218 |         return MM_UNSUPPORTED_TYPE;
219 | 
220 |     if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
221 |         return ret_code;
222 | 
223 | 
224 |     *I = (int *)  malloc(*nz * sizeof(int));
225 |     *J = (int *)  malloc(*nz * sizeof(int));
226 |     *val = NULL;
227 | 
228 |     if (mm_is_complex(*matcode))
229 |     {
230 |         *val = (double *) malloc(*nz * 2 * sizeof(double));
231 |         ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
232 |                 *matcode);
233 |         if (ret_code != 0) return ret_code;
234 |     }
235 |     else if (mm_is_real(*matcode))
236 |     {
237 |         *val = (double *) malloc(*nz * sizeof(double));
238 |         ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
239 |                 *matcode);
240 |         if (ret_code != 0) return ret_code;
241 |     }
242 | 
243 |     else if (mm_is_pattern(*matcode))
244 |     {
245 |         ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
246 |                 *matcode);
247 |         if (ret_code != 0) return ret_code;
248 |     }
249 | 
250 |     if (f != stdin) fclose(f);
251 |     return 0;
252 | }
253 | 
254 | int mm_read_banner(FILE *f, MM_typecode *matcode)
255 | {
256 |     char line[MM_MAX_LINE_LENGTH];
257 |     char banner[MM_MAX_TOKEN_LENGTH];
258 |     char mtx[MM_MAX_TOKEN_LENGTH];
259 |     char crd[MM_MAX_TOKEN_LENGTH];
260 |     char data_type[MM_MAX_TOKEN_LENGTH];
261 |     char storage_scheme[MM_MAX_TOKEN_LENGTH];
262 |     char *p;
263 | 
264 | 
265 |     mm_clear_typecode(matcode);
266 | 
267 |     if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
268 |         return MM_PREMATURE_EOF;
269 | 
270 |     if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
271 |         storage_scheme) != 5)
272 |         return MM_PREMATURE_EOF;
273 | 
274 |     for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
275 |     for (p=crd; *p!='\0'; *p=tolower(*p),p++);
276 |     for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
277 |     for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
278 | 
279 |     /* check for banner */
280 |     if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
281 |         return MM_NO_HEADER;
282 | 
283 |     /* first field should be "mtx" */
284 |     if (strcmp(mtx, MM_MTX_STR) != 0)
285 |         return  MM_UNSUPPORTED_TYPE;
286 |     mm_set_matrix(matcode);
287 | 
288 | 
289 |     /* second field describes whether this is a sparse matrix (in coordinate
290 |             storgae) or a dense array */
291 | 
292 | 
293 |     if (strcmp(crd, MM_SPARSE_STR) == 0)
294 |         mm_set_sparse(matcode);
295 |     else
296 |     if (strcmp(crd, MM_DENSE_STR) == 0)
297 |             mm_set_dense(matcode);
298 |     else
299 |         return MM_UNSUPPORTED_TYPE;
300 | 
301 | 
302 |     /* third field */
303 | 
304 |     if (strcmp(data_type, MM_REAL_STR) == 0)
305 |         mm_set_real(matcode);
306 |     else
307 |     if (strcmp(data_type, MM_COMPLEX_STR) == 0)
308 |         mm_set_complex(matcode);
309 |     else
310 |     if (strcmp(data_type, MM_PATTERN_STR) == 0)
311 |         mm_set_pattern(matcode);
312 |     else
313 |     if (strcmp(data_type, MM_INT_STR) == 0)
314 |         mm_set_integer(matcode);
315 |     else
316 |         return MM_UNSUPPORTED_TYPE;
317 | 
318 | 
319 |     /* fourth field */
320 | 
321 |     if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
322 |         mm_set_general(matcode);
323 |     else
324 |     if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
325 |         mm_set_symmetric(matcode);
326 |     else
327 |     if (strcmp(storage_scheme, MM_HERM_STR) == 0)
328 |         mm_set_hermitian(matcode);
329 |     else
330 |     if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
331 |         mm_set_skew(matcode);
332 |     else
333 |         return MM_UNSUPPORTED_TYPE;
334 | 
335 | 
336 |     return 0;
337 | }
338 | 
339 | int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz)
340 | {
341 |     char line[MM_MAX_LINE_LENGTH];
342 |     int num_items_read;
343 | 
344 |     /* set return null parameter values, in case we exit with errors */
345 |     *M = *N = *nz = 0;
346 | 
347 |     /* now continue scanning until you reach the end-of-comments */
348 |     do
349 |     {
350 |         if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
351 |             return MM_PREMATURE_EOF;
352 |     }while (line[0] == '%');
353 | 
354 |     /* line[] is either blank or has M,N, nz */
355 |     if (sscanf(line, "%d %d %d", M, N, nz) == 3)
356 |         return 0;
357 | 
358 |     else
359 |     do
360 |     {
361 |         num_items_read = fscanf(f, "%d %d %d", M, N, nz);
362 |         if (num_items_read == EOF) return MM_PREMATURE_EOF;
363 |     }
364 |     while (num_items_read != 3);
365 | 
366 |     return 0;
367 | }
368 | 
369 | int mm_read_mtx_array_size(FILE *f, int *M, int *N)
370 | {
371 |     char line[MM_MAX_LINE_LENGTH];
372 |     int num_items_read;
373 |     /* set return null parameter values, in case we exit with errors */
374 |     *M = *N = 0;
375 | 
376 |     /* now continue scanning until you reach the end-of-comments */
377 |     do
378 |     {
379 |         if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
380 |             return MM_PREMATURE_EOF;
381 |     }while (line[0] == '%');
382 | 
383 |     /* line[] is either blank or has M,N, nz */
384 |     if (sscanf(line, "%d %d", M, N) == 2)
385 |         return 0;
386 | 
387 |     else /* we have a blank line */
388 |     do
389 |     {
390 |         num_items_read = fscanf(f, "%d %d", M, N);
391 |         if (num_items_read == EOF) return MM_PREMATURE_EOF;
392 |     }
393 |     while (num_items_read != 2);
394 | 
395 |     return 0;
396 | }
397 | 
398 | int mm_write_banner(FILE *f, MM_typecode matcode)
399 | {
400 |     char *str = mm_typecode_to_str(matcode);
401 |     int ret_code;
402 | 
403 |     ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
404 |     free(str);
405 |     if (ret_code !=2 )
406 |         return MM_COULD_NOT_WRITE_FILE;
407 |     else
408 |         return 0;
409 | }
410 | 
411 | int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
412 | {
413 |     if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
414 |         return MM_COULD_NOT_WRITE_FILE;
415 |     else
416 |         return 0;
417 | }
418 | 
419 | int mm_write_mtx_array_size(FILE *f, int M, int N)
420 | {
421 |     if (fprintf(f, "%d %d\n", M, N) != 2)
422 |         return MM_COULD_NOT_WRITE_FILE;
423 |     else
424 |         return 0;
425 | }
426 | 
427 | 
428 | 
429 | 
430 | int mm_is_valid(MM_typecode matcode)		/* too complex for a macro */
431 | {
432 |     if (!mm_is_matrix(matcode)) return 0;
433 |     if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
434 |     if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
435 |     if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
436 |                 mm_is_skew(matcode))) return 0;
437 |     return 1;
438 | }
439 | 
440 | 
441 | 
442 | 
443 | /*  high level routines */
444 | 
445 | int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
446 |          double val[], MM_typecode matcode)
447 | {
448 |     FILE *f;
449 |     int i;
450 | 
451 |     if (strcmp(fname, "stdout") == 0)
452 |         f = stdout;
453 |     else
454 |     if ((f = fopen(fname, "w")) == NULL)
455 |         return MM_COULD_NOT_WRITE_FILE;
456 | 
457 |     /* print banner followed by typecode */
458 |     fprintf(f, "%s ", MatrixMarketBanner);
459 |     fprintf(f, "%s\n", mm_typecode_to_str(matcode));
460 | 
461 |     /* print matrix sizes and nonzeros */
462 |     fprintf(f, "%d %d %d\n", M, N, nz);
463 | 
464 |     /* print values */
465 |     if (mm_is_pattern(matcode))
466 |         for (i=0; i<nz; i++)
467 |             fprintf(f, "%d %d\n", I[i], J[i]);
468 |     else
469 |     if (mm_is_real(matcode))
470 |         for (i=0; i<nz; i++)
471 |             fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
472 |     else
473 |     if (mm_is_complex(matcode))
474 |         for (i=0; i<nz; i++)
475 |             fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
476 |                         val[2*i+1]);
477 |     else
478 |     {
479 |         if (f != stdout) fclose(f);
480 |         return MM_UNSUPPORTED_TYPE;
481 |     }
482 | 
483 |     if (f !=stdout) fclose(f);
484 | 
485 |     return 0;
486 | }
487 | 
488 | int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
489 |         double val[], MM_typecode matcode)
490 | {
491 |     int i;
492 |     if (mm_is_complex(matcode))
493 |     {
494 |         for (i=0; i<nz; i++)
495 |             if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
496 |                 != 4) return MM_PREMATURE_EOF;
497 |     }
498 |     else if (mm_is_real(matcode))
499 |     {
500 |         for (i=0; i<nz; i++)
501 |         {
502 |             if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
503 |                 != 3) return MM_PREMATURE_EOF;
504 | 
505 |         }
506 |     }
507 | 
508 |     else if (mm_is_pattern(matcode))
509 |     {
510 |         for (i=0; i<nz; i++)
511 |             if (fscanf(f, "%d %d", &I[i], &J[i])
512 |                 != 2) return MM_PREMATURE_EOF;
513 |     }
514 |     else
515 |         return MM_UNSUPPORTED_TYPE;
516 | 
517 |     return 0;
518 | 
519 | }
520 | 
521 | int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *imag,
522 |             MM_typecode matcode)
523 | {
524 |     if (mm_is_complex(matcode))
525 |     {
526 |             if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
527 |                 != 4) return MM_PREMATURE_EOF;
528 |     }
529 |     else if (mm_is_real(matcode))
530 |     {
531 |             if (fscanf(f, "%d %d %lg\n", I, J, real)
532 |                 != 3) return MM_PREMATURE_EOF;
533 | 
534 |     }
535 | 
536 |     else if (mm_is_pattern(matcode))
537 |     {
538 |             if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
539 |     }
540 |     else
541 |         return MM_UNSUPPORTED_TYPE;
542 | 
543 |     return 0;
544 | 
545 | }
546 | 
547 | int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
548 |                 double **val_, int **I_, int **J_)
549 | {
550 |     FILE *f;
551 |     MM_typecode matcode;
552 |     int M, N, nz;
553 |     int i;
554 |     double *val;
555 |     int *I, *J;
556 | 
557 |     if ((f = fopen(fname, "r")) == NULL)
558 |             return -1;
559 | 
560 | 
561 |     if (mm_read_banner(f, &matcode) != 0)
562 |     {
563 |         printf("mm_read_unsymetric: Could not process Matrix Market banner ");
564 |         printf(" in file [%s]\n", fname);
565 |         return -1;
566 |     }
567 | 
568 | 
569 | 
570 |     if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
571 |             mm_is_sparse(matcode)))
572 |     {
573 |         fprintf(stderr, "Sorry, this application does not support ");
574 |         fprintf(stderr, "Market Market type: [%s]\n",
575 |                 mm_typecode_to_str(matcode));
576 |         return -1;
577 |     }
578 | 
579 |     /* find out size of sparse matrix: M, N, nz .... */
580 | 
581 |     if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
582 |     {
583 |         fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
584 |         return -1;
585 |     }
586 | 
587 |     *M_ = M;
588 |     *N_ = N;
589 |     *nz_ = nz;
590 | 
591 |     /* reseve memory for matrices */
592 | 
593 |     I = (int *) malloc(nz * sizeof(int));
594 |     J = (int *) malloc(nz * sizeof(int));
595 |     val = (double *) malloc(nz * sizeof(double));
596 | 
597 |     *val_ = val;
598 |     *I_ = I;
599 |     *J_ = J;
600 | 
601 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
602 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
603 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
604 | 
605 |     for (i=0; i<nz; i++)
606 |     {
607 |         int rt = fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
608 |         I[i]--;  /* adjust from 1-based to 0-based */
609 |         J[i]--;
610 |     }
611 |     fclose(f);
612 | 
613 |     return 0;
614 | }
615 | 
616 | 
617 | 
618 | #endif
619 | 


--------------------------------------------------------------------------------
/src/external/cusparse/mmio_highlevel.h:
--------------------------------------------------------------------------------
  1 | #ifndef _MMIO_HIGHLEVEL_
  2 | #define _MMIO_HIGHLEVEL_
  3 | 
  4 | #include "mmio.h"
  5 | #include "common.h"
  6 | 
  7 | // read matrix infomation from mtx file
  8 | int mmio_info(int *m, int *n, int *nnz, int *isSymmetric, char *filename)
  9 | {
 10 |     int m_tmp, n_tmp, nnz_tmp;
 11 | 
 12 |     int ret_code;
 13 |     MM_typecode matcode;
 14 |     FILE *f;
 15 | 
 16 |     int nnz_mtx_report;
 17 |     int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
 18 | 
 19 |     // load matrix
 20 |     if ((f = fopen(filename, "r")) == NULL)
 21 |         return -1;
 22 | 
 23 |     if (mm_read_banner(f, &matcode) != 0)
 24 |     {
 25 |         printf("Could not process Matrix Market banner.\n");
 26 |         return -2;
 27 |     }
 28 | 
 29 |     if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
 30 |     if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
 31 |     if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
 32 |     if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
 33 | 
 34 |     /* find out size of sparse matrix .... */
 35 |     ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
 36 |     if (ret_code != 0)
 37 |         return -4;
 38 | 
 39 |     if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
 40 |     {
 41 |         isSymmetric_tmp = 1;
 42 |         //printf("input matrix is symmetric = true\n");
 43 |     }
 44 |     else
 45 |     {
 46 |         //printf("input matrix is symmetric = false\n");
 47 |     }
 48 | 
 49 |     int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
 50 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
 51 | 
 52 |     int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
 53 |     int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
 54 |     VALUE_TYPE *csrVal_tmp    = (VALUE_TYPE *)malloc(nnz_mtx_report * sizeof(VALUE_TYPE));
 55 | 
 56 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
 57 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
 58 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
 59 | 
 60 |     for (int i = 0; i < nnz_mtx_report; i++)
 61 |     {
 62 |         int idxi, idxj;
 63 |         double fval, fval_im;
 64 |         int ival;
 65 |         int returnvalue;
 66 | 
 67 |         if (isReal)
 68 |         {
 69 |             returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
 70 |         }
 71 |         else if (isComplex)
 72 |         {
 73 |             returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
 74 |         }
 75 |         else if (isInteger)
 76 |         {
 77 |             returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
 78 |             fval = ival;
 79 |         }
 80 |         else if (isPattern)
 81 |         {
 82 |             returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
 83 |             fval = 1.0;
 84 |         }
 85 | 
 86 |         // adjust from 1-based to 0-based
 87 |         idxi--;
 88 |         idxj--;
 89 | 
 90 |         csrRowPtr_counter[idxi]++;
 91 |         csrRowIdx_tmp[i] = idxi;
 92 |         csrColIdx_tmp[i] = idxj;
 93 |         csrVal_tmp[i] = fval;
 94 |     }
 95 | 
 96 |     if (f != stdin)
 97 |         fclose(f);
 98 | 
 99 |     if (isSymmetric_tmp)
100 |     {
101 |         for (int i = 0; i < nnz_mtx_report; i++)
102 |         {
103 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
104 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
105 |         }
106 |     }
107 | 
108 |     // exclusive scan for csrRowPtr_counter
109 |     int old_val, new_val;
110 | 
111 |     old_val = csrRowPtr_counter[0];
112 |     csrRowPtr_counter[0] = 0;
113 |     for (int i = 1; i <= m_tmp; i++)
114 |     {
115 |         new_val = csrRowPtr_counter[i];
116 |         csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
117 |         old_val = new_val;
118 |     }
119 | 
120 |     nnz_tmp = csrRowPtr_counter[m_tmp];
121 | 
122 |     *m = m_tmp;
123 |     *n = n_tmp;
124 |     *nnz = nnz_tmp;
125 |     *isSymmetric = isSymmetric_tmp;
126 | 
127 |     // free tmp space
128 |     free(csrColIdx_tmp);
129 |     free(csrVal_tmp);
130 |     free(csrRowIdx_tmp);
131 |     free(csrRowPtr_counter);
132 | 
133 |     return 0;
134 | }
135 | 
136 | // read matrix infomation from mtx file
137 | int mmio_data(int *csrRowPtr, int *csrColIdx, VALUE_TYPE *csrVal, char *filename)
138 | {
139 |     int m_tmp, n_tmp, nnz_tmp;
140 | 
141 |     int ret_code;
142 |     MM_typecode matcode;
143 |     FILE *f;
144 | 
145 |     int nnz_mtx_report;
146 |     int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
147 | 
148 |     // load matrix
149 |     if ((f = fopen(filename, "r")) == NULL)
150 |         return -1;
151 | 
152 |     if (mm_read_banner(f, &matcode) != 0)
153 |     {
154 |         printf("Could not process Matrix Market banner.\n");
155 |         return -2;
156 |     }
157 | 
158 |     if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
159 |     if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
160 |     if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
161 |     if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
162 | 
163 |     /* find out size of sparse matrix .... */
164 |     ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
165 |     if (ret_code != 0)
166 |         return -4;
167 | 
168 |     if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
169 |     {
170 |         isSymmetric_tmp = 1;
171 |         //printf("input matrix is symmetric = true\n");
172 |     }
173 |     else
174 |     {
175 |         //printf("input matrix is symmetric = false\n");
176 |     }
177 | 
178 |     int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
179 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
180 | 
181 |     int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
182 |     int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
183 |     VALUE_TYPE *csrVal_tmp    = (VALUE_TYPE *)malloc(nnz_mtx_report * sizeof(VALUE_TYPE));
184 | 
185 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
186 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
187 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
188 | 
189 |     for (int i = 0; i < nnz_mtx_report; i++)
190 |     {
191 |         int idxi, idxj;
192 |         double fval, fval_im;
193 |         int ival;
194 |         int returnvalue;
195 | 
196 |         if (isReal)
197 |         {
198 |             returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
199 |         }
200 |         else if (isComplex)
201 |         {
202 |             returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
203 |         }
204 |         else if (isInteger)
205 |         {
206 |             returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
207 |             fval = ival;
208 |         }
209 |         else if (isPattern)
210 |         {
211 |             returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
212 |             fval = 1.0;
213 |         }
214 | 
215 |         // adjust from 1-based to 0-based
216 |         idxi--;
217 |         idxj--;
218 | 
219 |         csrRowPtr_counter[idxi]++;
220 |         csrRowIdx_tmp[i] = idxi;
221 |         csrColIdx_tmp[i] = idxj;
222 |         csrVal_tmp[i] = fval;
223 |     }
224 | 
225 |     if (f != stdin)
226 |         fclose(f);
227 | 
228 |     if (isSymmetric_tmp)
229 |     {
230 |         for (int i = 0; i < nnz_mtx_report; i++)
231 |         {
232 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
233 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
234 |         }
235 |     }
236 | 
237 |     // exclusive scan for csrRowPtr_counter
238 |     int old_val, new_val;
239 | 
240 |     old_val = csrRowPtr_counter[0];
241 |     csrRowPtr_counter[0] = 0;
242 |     for (int i = 1; i <= m_tmp; i++)
243 |     {
244 |         new_val = csrRowPtr_counter[i];
245 |         csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
246 |         old_val = new_val;
247 |     }
248 | 
249 |     nnz_tmp = csrRowPtr_counter[m_tmp];
250 |     memcpy(csrRowPtr, csrRowPtr_counter, (m_tmp+1) * sizeof(int));
251 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
252 | 
253 |     if (isSymmetric_tmp)
254 |     {
255 |         for (int i = 0; i < nnz_mtx_report; i++)
256 |         {
257 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
258 |             {
259 |                 int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
260 |                 csrColIdx[offset] = csrColIdx_tmp[i];
261 |                 csrVal[offset] = csrVal_tmp[i];
262 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
263 | 
264 |                 offset = csrRowPtr[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
265 |                 csrColIdx[offset] = csrRowIdx_tmp[i];
266 |                 csrVal[offset] = csrVal_tmp[i];
267 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
268 |             }
269 |             else
270 |             {
271 |                 int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
272 |                 csrColIdx[offset] = csrColIdx_tmp[i];
273 |                 csrVal[offset] = csrVal_tmp[i];
274 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
275 |             }
276 |         }
277 |     }
278 |     else
279 |     {
280 |         for (int i = 0; i < nnz_mtx_report; i++)
281 |         {
282 |             int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
283 |             csrColIdx[offset] = csrColIdx_tmp[i];
284 |             csrVal[offset] = csrVal_tmp[i];
285 |             csrRowPtr_counter[csrRowIdx_tmp[i]]++;
286 |         }
287 |     }
288 | 
289 |     // free tmp space
290 |     free(csrColIdx_tmp);
291 |     free(csrVal_tmp);
292 |     free(csrRowIdx_tmp);
293 |     free(csrRowPtr_counter);
294 | 
295 |     return 0;
296 | }
297 | 
298 | #endif
299 | 


--------------------------------------------------------------------------------
/src/external/cusparse/spgemm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperScientificSoftwareLaboratory/TileSpGEMM/fe3a3457cec078fddd73c04f4ffed14edee7fb21/src/external/cusparse/spgemm


--------------------------------------------------------------------------------
/src/external/cusparse/spgemm_cusparse.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_CUDA_CUSPARSE_
  2 | #define _SPGEMM_CUDA_CUSPARSE_
  3 | 
  4 | #include "common.h"
  5 | #include "utils.h"
  6 | #include <cuda_runtime.h>
  7 | #include <cusparse.h>
  8 | 
  9 | //#include "utils_cuda_sort.h"
 10 | //#include "utils_cuda_spgemm_subfunc.h"
 11 | //#include "utils_cuda_scan.h"
 12 | //#include "utils_cuda_segmerge.h"
 13 | //#include "utils_cuda_segsum.h"
 14 | 
 15 | int spgemm_cusparse_executor(cusparseHandle_t handle, cusparseSpMatDescr_t matA,
 16 |                              const int mA,
 17 |                              const int nA,
 18 |                              const int nnzA,
 19 |                              const int *d_csrRowPtrA,
 20 |                              const int *d_csrColIdxA,
 21 |                              const VALUE_TYPE *d_csrValA,
 22 |                              cusparseSpMatDescr_t matB,
 23 |                              const int mB,
 24 |                              const int nB,
 25 |                              const int nnzB,
 26 |                              const int *d_csrRowPtrB,
 27 |                              const int *d_csrColIdxB,
 28 |                              const VALUE_TYPE *d_csrValB,
 29 |                              cusparseSpMatDescr_t matC,
 30 |                              const int mC,
 31 |                              const int nC,
 32 |                              unsigned long long int *nnzC,
 33 |                              int **d_csrRowPtrC,
 34 |                              int **d_csrColIdxC,
 35 |                              VALUE_TYPE **d_csrValC)
 36 | {
 37 |     cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
 38 |     cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
 39 |     cudaDataType computeType = CUDA_R_32F;
 40 |     void *dBuffer1 = NULL, *dBuffer2 = NULL;
 41 |     size_t bufferSize1 = 0, bufferSize2 = 0;
 42 | 
 43 |     float alpha = 1.0f;
 44 |     float beta = 0.0f;
 45 | 
 46 |     cudaMalloc((void **)d_csrRowPtrC, (mC + 1) * sizeof(int));
 47 | 
 48 |     //--------------------------------------------------------------------------
 49 |     // SpGEMM Computation
 50 |     cusparseSpGEMMDescr_t spgemmDesc;
 51 |     cusparseSpGEMM_createDescr(&spgemmDesc);
 52 | 
 53 |     // ask bufferSize1 bytes for external memory
 54 |     cusparseSpGEMM_workEstimation(handle, opA, opB,
 55 |                                   &alpha, matA, matB, &beta, matC,
 56 |                                   computeType, CUSPARSE_SPGEMM_DEFAULT,
 57 |                                   spgemmDesc, &bufferSize1, NULL);
 58 |     cudaMalloc((void **)&dBuffer1, bufferSize1);
 59 |     // inspect the matrices A and B to understand the memory requiremnent for
 60 |     // the next step
 61 |     cusparseSpGEMM_workEstimation(handle, opA, opB,
 62 |                                   &alpha, matA, matB, &beta, matC,
 63 |                                   computeType, CUSPARSE_SPGEMM_DEFAULT,
 64 |                                   spgemmDesc, &bufferSize1, dBuffer1);
 65 | 
 66 |     // ask bufferSize2 bytes for external memory
 67 |     cusparseSpGEMM_compute(handle, opA, opB,
 68 |                            &alpha, matA, matB, &beta, matC,
 69 |                            computeType, CUSPARSE_SPGEMM_DEFAULT,
 70 |                            spgemmDesc, &bufferSize2, NULL);
 71 |     cudaMalloc((void **)&dBuffer2, bufferSize2);
 72 | 
 73 |     // compute the intermediate product of A * B
 74 |     cusparseSpGEMM_compute(handle, opA, opB,
 75 |                            &alpha, matA, matB, &beta, matC,
 76 |                            computeType, CUSPARSE_SPGEMM_DEFAULT,
 77 |                            spgemmDesc, &bufferSize2, dBuffer2);
 78 |     // get matrix C non-zero entries C_num_nnz1
 79 |     int64_t C_num_rows1, C_num_cols1, C_num_nnz1;
 80 |     cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_num_nnz1);
 81 |     // allocate matrix C
 82 |     cudaMalloc((void **)d_csrColIdxC, C_num_nnz1 * sizeof(int));
 83 |     cudaMalloc((void **)d_csrValC, C_num_nnz1 * sizeof(VALUE_TYPE));
 84 |     // update matC with the new pointers
 85 |     cusparseCsrSetPointers(matC, *d_csrRowPtrC, *d_csrColIdxC, *d_csrValC);
 86 | 
 87 |     // copy the final products to the matrix C
 88 |     cusparseSpGEMM_copy(handle, opA, opB,
 89 |                         &alpha, matA, matB, &beta, matC,
 90 |                         computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc);
 91 | 
 92 |     *nnzC = C_num_nnz1;
 93 | 
 94 |     cusparseSpGEMM_destroyDescr(spgemmDesc);
 95 | 
 96 |     return 0;
 97 | }
 98 | 
 99 | int spgemm_cusparse(const int mA,
100 |                     const int nA,
101 |                     const int nnzA,
102 |                     const int *h_csrRowPtrA,
103 |                     const int *h_csrColIdxA,
104 |                     const VALUE_TYPE *h_csrValA,
105 |                     const int mB,
106 |                     const int nB,
107 |                     const int nnzB,
108 |                     const int *h_csrRowPtrB,
109 |                     const int *h_csrColIdxB,
110 |                     const VALUE_TYPE *h_csrValB,
111 |                     const int mC,
112 |                     const int nC,
113 |                     const int nnzC_golden,
114 |                     const int *h_csrRowPtrC_golden,
115 |                     const int *h_csrColIdxC_golden,
116 |                     const VALUE_TYPE *h_csrValC_golden,
117 |                     const bool check_result,
118 |                     unsigned long long int nnzCub,
119 |                     unsigned long long int *nnzC,
120 |                     double *compression_rate,
121 |                     double *time_segmerge,
122 |                     double *gflops_segmerge)
123 | 
124 | {
125 |     // transfer host mem to device mem
126 |     int *d_csrRowPtrA;
127 |     int *d_csrColIdxA;
128 |     VALUE_TYPE *d_csrValA;
129 |     int *d_csrRowPtrB;
130 |     int *d_csrColIdxB;
131 |     VALUE_TYPE *d_csrValB;
132 |     //unsigned long long int nnzC = 0;
133 |     int *d_csrRowPtrC;
134 |     int *d_csrColIdxC;
135 |     VALUE_TYPE *d_csrValC;
136 | 
137 |     // Matrix A in CSR
138 |     cudaMalloc((void **)&d_csrRowPtrA, (mA + 1) * sizeof(int));
139 |     cudaMalloc((void **)&d_csrColIdxA, nnzA * sizeof(int));
140 |     cudaMalloc((void **)&d_csrValA, nnzA * sizeof(VALUE_TYPE));
141 | 
142 |     cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (mA + 1) * sizeof(int), cudaMemcpyHostToDevice);
143 |     cudaMemcpy(d_csrColIdxA, h_csrColIdxA, nnzA * sizeof(int), cudaMemcpyHostToDevice);
144 |     cudaMemcpy(d_csrValA, h_csrValA, nnzA * sizeof(VALUE_TYPE), cudaMemcpyHostToDevice);
145 | 
146 |     // Matrix B in CSR
147 |     cudaMalloc((void **)&d_csrRowPtrB, (mB + 1) * sizeof(int));
148 |     cudaMalloc((void **)&d_csrColIdxB, nnzB * sizeof(int));
149 |     cudaMalloc((void **)&d_csrValB, nnzB * sizeof(VALUE_TYPE));
150 | 
151 |     cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (mB + 1) * sizeof(int), cudaMemcpyHostToDevice);
152 |     cudaMemcpy(d_csrColIdxB, h_csrColIdxB, nnzB * sizeof(int), cudaMemcpyHostToDevice);
153 |     cudaMemcpy(d_csrValB, h_csrValB, nnzB * sizeof(VALUE_TYPE), cudaMemcpyHostToDevice);
154 | 
155 |     //--------------------------------------------------------------------------
156 |     // CUSPARSE APIs
157 |     cusparseHandle_t handle = NULL;
158 |     cusparseSpMatDescr_t matA, matB, matC;
159 | 
160 |     cusparseCreate(&handle);
161 |     // Create sparse matrix A in CSR format
162 |     cusparseCreateCsr(&matA, mA, nA, nnzA,
163 |                       d_csrRowPtrA, d_csrColIdxA, d_csrValA,
164 |                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
165 |                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
166 |     cusparseCreateCsr(&matB, mB, nB, nnzB,
167 |                       d_csrRowPtrB, d_csrColIdxB, d_csrValB,
168 |                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
169 |                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
170 |     cusparseCreateCsr(&matC, mA, nB, 0,
171 |                       NULL, NULL, NULL,
172 |                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
173 |                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
174 |     //--------------------------------------------------------------------------
175 | 
176 |     //  - cuda SpGEMM start!
177 |     printf(" - cuda SpGEMM start! Benchmark runs %i times.\n", BENCH_REPEAT);
178 | 
179 |     if (check_result && BENCH_REPEAT > 1)
180 |     {
181 |         printf("If check_result, Set BENCH_REPEAT to 1.\n");
182 |         return -1;
183 |     }
184 |     //unsigned long long int nnzCub = 0;
185 | 
186 |     struct timeval t1, t2;
187 | 
188 |     cudaDeviceSynchronize();
189 |     gettimeofday(&t1, NULL);
190 | 
191 |     for (int i = 0; i < BENCH_REPEAT; i++)
192 |     {
193 |         spgemm_cusparse_executor(handle, matA, mA, nA, nnzA, d_csrRowPtrA, d_csrColIdxA, d_csrValA,
194 |                                  matB, mB, nB, nnzB, d_csrRowPtrB, d_csrColIdxB, d_csrValB,
195 |                                  matC, mC, nC, nnzC, &d_csrRowPtrC, &d_csrColIdxC, &d_csrValC);
196 | 
197 |         if (check_result != 1 || i != BENCH_REPEAT - 1)
198 |         {
199 |             cudaFree(d_csrRowPtrC);
200 |             cudaFree(d_csrColIdxC);
201 |             cudaFree(d_csrValC);
202 |         }
203 |     }
204 | 
205 |     cudaDeviceSynchronize();
206 |     gettimeofday(&t2, NULL);
207 | 
208 |     printf(" - cuda SpGEMM completed!\n\n");
209 |     double time_cuda_spgemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
210 |     time_cuda_spgemm /= BENCH_REPEAT;
211 |     *time_segmerge = time_cuda_spgemm;
212 |     *compression_rate = (double)nnzCub / (double)*nnzC;
213 |     *gflops_segmerge = 2 * (double)nnzCub / (1e6 * time_cuda_spgemm);
214 |     printf("nnzC = %i, nnzCub = %lld, Compression rate = %4.2f\n",
215 |            *nnzC, nnzCub, *compression_rate);
216 |     printf("CUDA  cuSPARSE SpGEMM runtime is %4.4f ms, GFlops = %4.4f\n",
217 |            time_cuda_spgemm, *gflops_segmerge);
218 | 
219 |     // validate C = AB
220 | 
221 |     if (check_result)
222 |     {
223 |         if (*nnzC <= 0)
224 |         {
225 |             printf("cuSPARSE failed!\n");
226 |             return 0;
227 |         }
228 |         else
229 |         {
230 |             printf("\nValidating results...\n");
231 |             if (*nnzC != nnzC_golden)
232 |             {
233 | 
234 |                 printf("[NOT PASSED] nnzC = %i, nnzC_golden = %i\n", *nnzC, nnzC_golden);
235 |             }
236 |             else
237 |             {
238 |                 printf("[PASSED] nnzC = %i\n", *nnzC);
239 |             }
240 | 
241 |             int *h_csrRowPtrC = (int *)malloc((mC + 1) * sizeof(int));
242 |             int *h_csrColIdxC = (int *)malloc(*nnzC * sizeof(int));
243 |             VALUE_TYPE *h_csrValC = (VALUE_TYPE *)malloc(*nnzC * sizeof(VALUE_TYPE));
244 | 
245 |             cudaMemcpy(h_csrRowPtrC, d_csrRowPtrC, (mC + 1) * sizeof(int), cudaMemcpyDeviceToHost);
246 |             cudaMemcpy(h_csrColIdxC, d_csrColIdxC, *nnzC * sizeof(int), cudaMemcpyDeviceToHost);
247 |             cudaMemcpy(h_csrValC, d_csrValC, *nnzC * sizeof(VALUE_TYPE), cudaMemcpyDeviceToHost);
248 | 
249 |             int errcounter = 0;
250 |             for (int i = 0; i < mC + 1; i++)
251 |             {
252 |                 if (h_csrRowPtrC[i] != h_csrRowPtrC_golden[i])
253 |                 {
254 |                     if (h_csrRowPtrC[i] < 0)
255 |                     {
256 |                         printf("cuSPARSE failed!\n");
257 |                         return 0;
258 |                     }
259 |                     else{
260 |                     errcounter++;}
261 |                 }
262 |             }
263 |             if (errcounter != 0)
264 |             {
265 |                 printf("[NOT PASSED] row_pointer, #err = %i\n", errcounter);
266 |             }
267 |             else
268 |             {
269 |                 printf("[PASSED] row_pointer\n");
270 |             }
271 | 
272 |             /*for (int i = 0; i < mC; i++)
273 |         {
274 |             quick_sort_key_val_pair<int, VALUE_TYPE>(&h_csrColIdxC[h_csrRowPtrC[i]],
275 |                                                      &h_csrValC[h_csrRowPtrC[i]],
276 |                                                      h_csrRowPtrC[i+1]-h_csrRowPtrC[i]);
277 |         }*/
278 | 
279 |             errcounter = 0;
280 |             for (int j = 0; j < *nnzC; j++)
281 |             {
282 |                 if (h_csrColIdxC[j] != h_csrColIdxC_golden[j]) //|| h_csrValC[j] != h_csrValC_golden[j])
283 |                 {
284 |                     //    printf("h_csrColIdxC[j] = %i,  h_csrColIdxC_golden[j] = %i\n",h_csrColIdxC[j] ,h_csrColIdxC_golden[j]);
285 |                     errcounter++;
286 |                 }
287 |             }
288 | 
289 |             if (errcounter != 0)
290 |             {
291 |                 printf("[NOT PASSED] column_index & value, #err = %i (%4.2f%% #nnz)\n",
292 |                        errcounter, 100.0 * (double)errcounter / (double)(*nnzC));
293 |             }
294 |             else
295 |             {
296 |                 printf("[PASSED] column_index & value\n");
297 |             }
298 | 
299 |             free(h_csrRowPtrC);
300 |             free(h_csrColIdxC);
301 |             free(h_csrValC);
302 |         }
303 |     }
304 | 
305 |     cudaFree(d_csrRowPtrA);
306 |     cudaFree(d_csrColIdxA);
307 |     cudaFree(d_csrValA);
308 |     cudaFree(d_csrRowPtrB);
309 |     cudaFree(d_csrColIdxB);
310 |     cudaFree(d_csrValB);
311 | 
312 |     if (check_result)
313 |     {
314 |         cudaFree(d_csrRowPtrC);
315 |         cudaFree(d_csrColIdxC);
316 |         cudaFree(d_csrValC);
317 |     }
318 | 
319 |     cusparseDestroySpMat(matA);
320 |     cusparseDestroySpMat(matB);
321 |     cusparseDestroySpMat(matC);
322 |     cusparseDestroy(handle);
323 | 
324 |     return 0;
325 | }
326 | 
327 | #endif
328 | 


--------------------------------------------------------------------------------
/src/external/cusparse/spgemm_serialref_esc.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_SERIALREF_
  2 | #define _SPGEMM_SERIALREF_
  3 | 
  4 | #include "common.h"
  5 | #include "utils.h"
  6 | 
  7 | int spgemm_serialref(const int           *d_csrRowPtrA,
  8 |                      const int           *d_csrColIdxA,
  9 |                      const VALUE_TYPE    *d_csrValA,
 10 |                      const int            mA,
 11 |                      const int            nA,
 12 |                      const int            nnzA,
 13 |                      const int           *d_csrRowPtrB,
 14 |                      const int           *d_csrColIdxB,
 15 |                      const VALUE_TYPE    *d_csrValB,
 16 |                      const int            mB,
 17 |                      const int            nB,
 18 |                      const int            nnzB,
 19 |                            int           *d_csrRowPtrC,
 20 |                            int           *d_csrColIdxC,
 21 |                            VALUE_TYPE    *d_csrValC,
 22 |                      const int            mC,
 23 |                      const int            nC,
 24 |                            int           *nnzC,
 25 |                      const bool           get_nnzC_only)
 26 | {
 27 |     if (nA != mB)
 28 |     {
 29 |         printf("Cannot multiply matrix A of size %i x %i and matrix B of size %i x %i, return.\n",
 30 |                mA, nA, mB, nB);
 31 |         return -1;
 32 |     }
 33 | 
 34 |     int *d_csrRowPtrCub = (int *)malloc((mC+1) * sizeof(int));
 35 |     memset(d_csrRowPtrCub, 0, (mC+1) * sizeof(int));
 36 |     for (int i = 0; i < mA; i++)
 37 |     {
 38 |         for (int j = d_csrRowPtrA[i]; j < d_csrRowPtrA[i+1]; j++)
 39 |         {
 40 |             int rowB = d_csrColIdxA[j];
 41 |             d_csrRowPtrCub[i] += d_csrRowPtrB[rowB + 1] - d_csrRowPtrB[rowB];
 42 |         }
 43 |     }
 44 | 
 45 |     exclusive_scan(d_csrRowPtrCub, mC+1);
 46 |     int nnzCub = d_csrRowPtrCub[mC];
 47 | 
 48 |     if (get_nnzC_only == true)
 49 |     {
 50 |         //printf("round 1, rid = %i\n", rid);
 51 |         int *d_csrColIdxCub = (int *)malloc(nnzCub * sizeof(int));
 52 |         memset(d_csrColIdxCub, 0, nnzCub * sizeof(int));
 53 |         memset(d_csrRowPtrC, 0, (mC+1) * sizeof(int));
 54 | 
 55 |         for (int rid = 0; rid < mC; rid++)
 56 |         {
 57 |             // collect indices
 58 |             int rsize = d_csrRowPtrCub[rid + 1] - d_csrRowPtrCub[rid];
 59 |             int offset = d_csrRowPtrCub[rid];
 60 |             for (int j = d_csrRowPtrA[rid]; j < d_csrRowPtrA[rid+1]; j++)
 61 |             {
 62 |                 int rowB = d_csrColIdxA[j];
 63 |                 int incr = 0;
 64 |                 for (int k = d_csrRowPtrB[rowB]; k < d_csrRowPtrB[rowB + 1]; k++)
 65 |                 {
 66 |                     d_csrColIdxCub[offset+incr] = d_csrColIdxB[k];
 67 |                     incr++;
 68 |                 }
 69 |                 offset += incr;
 70 |             }
 71 | 
 72 |             // sort
 73 |             quick_sort_key(&d_csrColIdxCub[d_csrRowPtrCub[rid]], rsize);
 74 | 
 75 |             // compress
 76 |             int nnzr = rsize > 0 ? 1 : 0;
 77 |             for (int i = d_csrRowPtrCub[rid]+1; i < d_csrRowPtrCub[rid + 1]; i++)
 78 |             {
 79 |                 nnzr = d_csrColIdxCub[i] == d_csrColIdxCub[i-1] ? nnzr : nnzr+1;
 80 |             }
 81 | 
 82 |             d_csrRowPtrC[rid] = nnzr;
 83 |         }
 84 | 
 85 |         exclusive_scan(d_csrRowPtrC, mC+1);
 86 |         *nnzC = d_csrRowPtrC[mC];
 87 | 
 88 |         //printf("1st round nnzc = %i\n", *nnzC);
 89 | 
 90 |         free(d_csrColIdxCub);
 91 |     }
 92 |     else
 93 |     {
 94 |         //printf("round 2, rid = %i\n", rid);
 95 |         int *d_csrColIdxCub = (int *)malloc(nnzCub * sizeof(int));
 96 |         VALUE_TYPE *d_csrValCub = (VALUE_TYPE *)malloc(nnzCub * sizeof(VALUE_TYPE));
 97 |         bool *d_flagCub = (bool *)malloc(nnzCub * sizeof(bool));
 98 |         memset(d_csrColIdxCub, 0, nnzCub * sizeof(int));
 99 |         memset(d_csrValCub, 0, nnzCub * sizeof(VALUE_TYPE));
100 |         memset(d_flagCub, 0, nnzCub * sizeof(bool));
101 | 
102 |         for (int rid = 0; rid < mC; rid++)
103 |         {
104 |             // collect indices
105 |             int rsize = d_csrRowPtrCub[rid + 1] - d_csrRowPtrCub[rid];
106 |             if (rsize == 0) continue;
107 | 
108 |             int offset = d_csrRowPtrCub[rid];
109 |             for (int j = d_csrRowPtrA[rid]; j < d_csrRowPtrA[rid+1]; j++)
110 |             {
111 |                 int rowB = d_csrColIdxA[j];
112 |                 int val = d_csrValA[j];
113 |                 int incr = 0;
114 |                 for (int k = d_csrRowPtrB[rowB]; k < d_csrRowPtrB[rowB + 1]; k++)
115 |                 {
116 |                     d_csrColIdxCub[offset+incr] = d_csrColIdxB[k];
117 |                     d_csrValCub[offset+incr] = val * d_csrValB[k];
118 |                     incr++;
119 |                 }
120 |                 offset += incr;
121 |             }
122 | 
123 |             // sort
124 |             quick_sort_key_val_pair(&d_csrColIdxCub[d_csrRowPtrCub[rid]],
125 |                                     &d_csrValCub[d_csrRowPtrCub[rid]], rsize);
126 | 
127 |             // compress
128 |             d_flagCub[d_csrRowPtrCub[rid]] = 1;
129 |             for (int i = d_csrRowPtrCub[rid]; i < d_csrRowPtrCub[rid + 1]-1; i++)
130 |             {
131 |                 d_flagCub[1+i] = d_csrColIdxCub[1+i] == d_csrColIdxCub[i] ? 0 : 1;
132 |             }
133 |             segmented_sum<VALUE_TYPE, bool>(&d_csrValCub[d_csrRowPtrCub[rid]], &d_flagCub[d_csrRowPtrCub[rid]], rsize);
134 | 
135 |             int incr = 0;
136 |             for (int i = d_csrRowPtrCub[rid]; i < d_csrRowPtrCub[rid + 1]; i++)
137 |             {
138 |                 if (d_flagCub[i] == 1)
139 |                 {
140 |                     d_csrColIdxC[d_csrRowPtrC[rid] + incr] = d_csrColIdxCub[i];
141 |                     d_csrValC[d_csrRowPtrC[rid] + incr] = d_csrValCub[i];
142 |                     incr++;
143 |                 }
144 |             }
145 |         }
146 | 
147 |         free(d_csrColIdxCub);
148 |         free(d_csrValCub);
149 |     }
150 | 
151 |     free(d_csrRowPtrCub);
152 | 
153 |     return 0;
154 | }
155 | 
156 | 
157 | #endif
158 | 
159 | 
160 | 


--------------------------------------------------------------------------------
/src/external/cusparse/spgemm_serialref_spa.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_SERIALREF_
  2 | #define _SPGEMM_SERIALREF_
  3 | 
  4 | #include "common.h"
  5 | #include "utils.h"
  6 | 
  7 | void compute_dense_row(const int        *d_csrRowPtrA,
  8 |                        const int        *d_csrColIdxA,
  9 |                        const VALUE_TYPE *d_csrValA,
 10 |                        const int        *d_csrRowPtrB,
 11 |                        const int        *d_csrColIdxB,
 12 |                        const VALUE_TYPE *d_csrValB,
 13 |                              int        *d_dense_row_column_flag,
 14 |                              VALUE_TYPE *d_dense_row_value,
 15 |                        const int         rid,
 16 |                        const bool        has_value)
 17 | {
 18 |     for (int rid_a = d_csrRowPtrA[rid]; rid_a < d_csrRowPtrA[rid+1]; rid_a++)
 19 |     {
 20 |         int rid_b = d_csrColIdxA[rid_a];
 21 |         VALUE_TYPE val_a = 0;
 22 |         if (has_value) val_a = d_csrValA[rid_a];
 23 | 
 24 |         for (int cid_b = d_csrRowPtrB[rid_b]; cid_b < d_csrRowPtrB[rid_b+1]; cid_b++)
 25 |         {
 26 |             d_dense_row_column_flag[d_csrColIdxB[cid_b]] = 1;
 27 |             if (has_value) d_dense_row_value[d_csrColIdxB[cid_b]] += val_a * d_csrValB[cid_b];
 28 |         }
 29 |     }
 30 |     return;
 31 | }
 32 | 
 33 | int spgemm_serialref(const int           *d_csrRowPtrA,
 34 |                      const int           *d_csrColIdxA,
 35 |                      const VALUE_TYPE    *d_csrValA,
 36 |                      const int            mA,
 37 |                      const int            nA,
 38 |                      const int            nnzA,
 39 |                      const int           *d_csrRowPtrB,
 40 |                      const int           *d_csrColIdxB,
 41 |                      const VALUE_TYPE    *d_csrValB,
 42 |                      const int            mB,
 43 |                      const int            nB,
 44 |                      const int            nnzB,
 45 |                            int           *d_csrRowPtrC,
 46 |                            int           *d_csrColIdxC,
 47 |                            VALUE_TYPE    *d_csrValC,
 48 |                      const int            mC,
 49 |                      const int            nC,
 50 |                            int           *nnzC,
 51 |                      const bool           get_nnzC_only)
 52 | {
 53 |     if (nA != mB)
 54 |     {
 55 |         printf("Cannot multiply matrix A of size %i x %i and matrix B of size %i x %i, return.\n",
 56 |                mA, nA, mB, nB);
 57 |         return -1;
 58 |     }
 59 | 
 60 |     // malloc column index of a dense row of C
 61 |     int *d_dense_row_column_flag = (int *)malloc(nC * sizeof(int));
 62 |     VALUE_TYPE *d_dense_row_value = (VALUE_TYPE *)malloc(nC * sizeof(VALUE_TYPE));
 63 | 
 64 |     if (get_nnzC_only == true)
 65 |     {
 66 |         for (int rid = 0; rid < mC; rid++)
 67 |         {
 68 |             //printf("round 1, rid = %i\n", rid);
 69 |             memset(d_dense_row_column_flag, 0, nC * sizeof(int));
 70 | 
 71 |             compute_dense_row(d_csrRowPtrA, d_csrColIdxA, d_csrValA,
 72 |                               d_csrRowPtrB, d_csrColIdxB, d_csrValB,
 73 |                               d_dense_row_column_flag, d_dense_row_value,
 74 |                               rid, !get_nnzC_only);
 75 | 
 76 |             int nnzr = 0;
 77 |             for (int cid = 0; cid < nC; cid++)
 78 |             {
 79 |                 if (d_dense_row_column_flag[cid] == 1)
 80 |                 {
 81 |                     nnzr++;
 82 |                 }
 83 |             }
 84 |             d_csrRowPtrC[rid] = nnzr;
 85 |         }
 86 |         exclusive_scan(d_csrRowPtrC, mC+1);
 87 |         *nnzC = d_csrRowPtrC[mC];
 88 |     }
 89 |     else
 90 |     {
 91 |         for (int rid = 0; rid < mC; rid++)
 92 |         {
 93 |             //printf("round 2, rid = %i\n", rid);
 94 |             memset(d_dense_row_column_flag, 0, nC * sizeof(int));
 95 |             memset(d_dense_row_value, 0, nC * sizeof(VALUE_TYPE));
 96 | 
 97 |             compute_dense_row(d_csrRowPtrA, d_csrColIdxA, d_csrValA,
 98 |                               d_csrRowPtrB, d_csrColIdxB, d_csrValB,
 99 |                               d_dense_row_column_flag, d_dense_row_value,
100 |                               rid, !get_nnzC_only);
101 | 
102 |             int nnzr = 0;
103 |             for (int cid = 0; cid < nC; cid++)
104 |             {
105 |                 if (d_dense_row_column_flag[cid] == 1)
106 |                 {
107 |                     d_csrColIdxC[d_csrRowPtrC[rid] + nnzr] = cid;
108 |                     d_csrValC[d_csrRowPtrC[rid] + nnzr] = d_dense_row_value[cid];
109 |                     nnzr++;
110 |                 }
111 |             }
112 |         }
113 |     }
114 | 
115 |     free(d_dense_row_column_flag);
116 |     free(d_dense_row_value);
117 | 
118 |     return 0;
119 | }
120 | 
121 | 
122 | #endif
123 | 
124 | 
125 | 


--------------------------------------------------------------------------------
/src/external/cusparse/spgemm_serialref_spa_new.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_PARALLELREF_NEW_
  2 | #define _SPGEMM_PARALLELREF_NEW_
  3 | 
  4 | #include <stdbool.h>
  5 | #include "common.h"
  6 | #include "utils.h"
  7 | void spgemm_spa( const int           *d_csrRowPtrA,
  8 |                         const int           *d_csrColIdxA,
  9 |                         const VALUE_TYPE    *d_csrValA,
 10 |                         const int            mA,
 11 |                         const int            nA,
 12 |                         const int            nnzA,
 13 |                         const int           *d_csrRowPtrB,
 14 |                         const int           *d_csrColIdxB,
 15 |                         const VALUE_TYPE    *d_csrValB,
 16 |                         const int            mB,
 17 |                         const int            nB,
 18 |                         const int            nnzB,
 19 |                             int           *d_csrRowPtrC,
 20 |                             int           *d_csrColIdxC,
 21 |                            VALUE_TYPE    *d_csrValC,
 22 |                         const int            mC,
 23 |                         const int            nC,
 24 |                             int           *nnzC,
 25 |                         const int           get_nnzC_only)
 26 | {
 27 |     int nthreads = omp_get_max_threads();
 28 | 
 29 |     if (get_nnzC_only ==1 )
 30 |     {
 31 |         unsigned int *flag_g = (unsigned int *)malloc(nthreads * (nB / 32 + 1) * sizeof(unsigned int));
 32 | 
 33 |         #pragma omp parallel for
 34 |         for (int iid=0;iid<mA;iid++)
 35 |         {
 36 |             int thread_id = omp_get_thread_num();
 37 | 
 38 |             unsigned int *flag = flag_g + thread_id * (nB / 32 + 1); //(unsigned int *)malloc((nB/32+1)*sizeof(unsigned int));
 39 |             memset(flag, 0, sizeof(unsigned int) * (nB / 32 + 1));
 40 |             for (int blkj = d_csrRowPtrA[iid]; blkj < d_csrRowPtrA[iid + 1]; blkj++)
 41 |             {
 42 |                 int col = d_csrColIdxA[blkj];
 43 |                 for (int l = d_csrRowPtrB[col]; l < d_csrRowPtrB[col + 1]; l++)
 44 |                 {
 45 |                     const int key = d_csrColIdxB[l];
 46 |                     int ind = key / 32;
 47 |                     flag[ind] |= (1 << (key % 32));
 48 |                 }
 49 |             }
 50 |             //int nnzr_new=0;
 51 |             int nnzr_new1 = 0;
 52 |             for (int i = 0; i < (nB / 32) + 1; i++)
 53 |             {
 54 |                 nnzr_new1 += _mm_popcnt_u32(flag[i]);
 55 |             }
 56 | 
 57 |             d_csrRowPtrC[iid] = nnzr_new1;
 58 |         }
 59 |         exclusive_scan(d_csrRowPtrC, mC +1);
 60 |         *nnzC = d_csrRowPtrC[mC];
 61 |         free(flag_g);
 62 |     }
 63 |     else
 64 |     {
 65 |         unsigned int *flag_g = (unsigned int *)malloc(nthreads * (nB / 32 + 1) * sizeof(unsigned int));
 66 |         #pragma omp parallel for
 67 |         for (int iid=0;iid<mA;iid++)
 68 |         {
 69 |             int thread_id = omp_get_thread_num();
 70 |             unsigned int *flag = flag_g + thread_id * (nB / 32 + 1);
 71 |             memset(flag, 0, sizeof(unsigned int) * (nB / 32 + 1));
 72 |             //   int pos=0;
 73 |             //    int j=bin[iid];
 74 |             for (int blkj = d_csrRowPtrA[iid]; blkj < d_csrRowPtrA[iid + 1]; blkj++)
 75 |             {
 76 |                 int col = d_csrColIdxA[blkj];
 77 |                 for (int l = d_csrRowPtrB[col]; l < d_csrRowPtrB[col + 1]; l++)
 78 |                 {
 79 |                     const int key = d_csrColIdxB[l];
 80 |                     int ind = key / 32;
 81 |                     flag[ind] |= (1 << (key % 32));
 82 |                 }
 83 |             }
 84 | 
 85 |             // int nnzr = d_csrRowPtrC[iid];
 86 |             int nnzr_new = d_csrRowPtrC[iid];
 87 |             for (int i = 0; i < (nB / 32) + 1; i++)
 88 |             {
 89 |                 int count = 0;
 90 |                 while (flag[i])
 91 |                 {
 92 |                     count++;
 93 |                     if ((flag[i] & 1) != 0)
 94 |                     {
 95 |                         d_csrColIdxC[nnzr_new] = (i * 32 + count - 1);
 96 |                         nnzr_new++;
 97 |                     }
 98 |                     flag[i] = flag[i] >> 1;
 99 |                 }
100 |             }
101 |         }
102 |         free(flag_g);
103 |     }
104 | 
105 | }
106 | 
107 | #endif
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/src/external/cusparse/tranpose.h:
--------------------------------------------------------------------------------
 1 | #ifndef _TRANS_
 2 | #define _TRANS_
 3 | 
 4 | #include "common.h"
 5 | 
 6 | void matrix_transposition(const int         m,
 7 |                           const int         n,
 8 |                           const int         nnz,
 9 |                           const int        *csrRowPtr,
10 |                           const int        *csrColIdx,
11 |                           const VALUE_TYPE *csrVal,
12 |                                 int        *cscRowIdx,
13 |                                 int        *cscColPtr,
14 |                                 VALUE_TYPE *cscVal)
15 | {
16 |     // histogram in column pointer
17 |     memset (cscColPtr, 0, sizeof(int) * (n+1));
18 |     for (int i = 0; i < nnz; i++)
19 |     {
20 |         cscColPtr[csrColIdx[i]]++;
21 |     }
22 | 
23 |     // prefix-sum scan to get the column pointer
24 |     exclusive_scan(cscColPtr, n + 1);
25 | 
26 |     int *cscColIncr = (int *)malloc(sizeof(int) * (n+1));
27 |     memcpy (cscColIncr, cscColPtr, sizeof(int) * (n+1));
28 | 
29 |     // insert nnz to csc
30 |     for (int row = 0; row < m; row++)
31 |     {
32 |         for (int j = csrRowPtr[row]; j < csrRowPtr[row+1]; j++)
33 |         {
34 |             int col = csrColIdx[j];
35 | 
36 |             cscRowIdx[cscColIncr[col]] = row;
37 |             cscVal[cscColIncr[col]] = csrVal[j];
38 |             cscColIncr[col]++;
39 |         }
40 |     }
41 | 
42 |     free (cscColIncr);
43 | }
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/external/cusparse/utils.h:
--------------------------------------------------------------------------------
  1 | #ifndef _UTILS_
  2 | #define _UTILS_
  3 | 
  4 | #include "common.h"
  5 | //#include "cusparse.h"
  6 | 
  7 | // print 1D array
  8 | template<typename T>
  9 | void print_1darray(T *input, int length)
 10 | {
 11 |     for (int i = 0; i < length; i++)
 12 |         printf("%lld, ", input[i]);
 13 |     printf("\n");
 14 | }
 15 | /*
 16 | __forceinline__ __device__
 17 | static double atomicAdd(double *addr, double val)
 18 | {
 19 |     double old = *addr, assumed;
 20 |     do
 21 |     {
 22 |         assumed = old;
 23 |         old = __longlong_as_double(
 24 |                     atomicCAS((unsigned long long int*)addr,
 25 |                               __double_as_longlong(assumed),
 26 |                               __double_as_longlong(val+assumed)));
 27 | 
 28 |     }while(assumed != old);
 29 | 
 30 |     return old;
 31 | }*/
 32 | 
 33 | template<typename vT>
 34 | __forceinline__ __device__
 35 | vT sum_32_shfl(vT sum)
 36 | {
 37 | #pragma unroll
 38 |     for(int mask = WARP_SIZE / 2 ; mask > 0 ; mask >>= 1)
 39 |         sum += __shfl_xor_sync(0xffffffff, sum, mask);
 40 |     
 41 |     return sum;
 42 | }
 43 | 
 44 | /*struct assembly_timer {
 45 |     timeval t1, t2;
 46 |     struct timezone tzone;
 47 | 
 48 |     void start() {
 49 |         gettimeofday(&t1, &tzone);
 50 |     }
 51 | 
 52 |     double stop() {
 53 |         gettimeofday(&t2, &tzone);
 54 |         double elapsedTime = 0;
 55 |         elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;      // sec to ms
 56 |         elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;   // us to ms
 57 |         return elapsedTime;
 58 |     }
 59 | };*/
 60 | 
 61 | //void check_cusparse_kernel(cusparseStatus_t cudaerr)
 62 | //{
 63 | //    if (cudaerr != CUSPARSE_STATUS_SUCCESS)
 64 | //        printf("cuda kernel fail, err = %s\n", cudaerr);
 65 | //}
 66 | 
 67 | void swap(int *a , int *b)
 68 | {
 69 |     int tmp = *a;
 70 |     *a = *b;
 71 |     *b = tmp;
 72 | }
 73 | 
 74 | void swap(float *a , float *b)
 75 | {
 76 |     float tmp = *a;
 77 |     *a = *b;
 78 |     *b = tmp;
 79 | }
 80 | 
 81 | void swap(double *a , double *b)
 82 | {
 83 |     double tmp = *a;
 84 |     *a = *b;
 85 |     *b = tmp;
 86 | }
 87 | 
 88 | // quick sort key (child function)
 89 | int partition(int *key, int length, int pivot_index)
 90 | {
 91 |     int i  = 0 ;
 92 |     int small_length = pivot_index;
 93 | 
 94 |     int pivot = key[pivot_index];
 95 |     swap(&key[pivot_index], &key[pivot_index + (length - 1)]);
 96 | 
 97 |     for(; i < length; i++)
 98 |     {
 99 |         if(key[pivot_index+i] < pivot)
100 |         {
101 |             swap(&key[pivot_index+i],  &key[small_length]);
102 |             small_length++;
103 |         }
104 |     }
105 | 
106 |     swap(&key[pivot_index + length - 1],  &key[small_length]);
107 | 
108 |     return small_length;
109 | }
110 | 
111 | // quick sort key (child function)
112 | int partition(double *key, int length, int pivot_index)
113 | {
114 |     int i  = 0 ;
115 |     int small_length = pivot_index;
116 | 
117 |     double pivot = key[pivot_index];
118 |     swap(&key[pivot_index], &key[pivot_index + (length - 1)]);
119 | 
120 |     for(; i < length; i++)
121 |     {
122 |         if(key[pivot_index+i] < pivot)
123 |         {
124 |             swap(&key[pivot_index+i],  &key[small_length]);
125 |             small_length++;
126 |         }
127 |     }
128 | 
129 |     swap(&key[pivot_index + length - 1],  &key[small_length]);
130 | 
131 |     return small_length;
132 | }
133 | 
134 | // quick sort key-value pair (main function)
135 | void quick_sort_key(double *key, int length)
136 | {
137 |     if(length == 0 || length == 1)
138 |         return;
139 | 
140 |     int small_length = partition(key, length, 0) ;
141 |     quick_sort_key(key, small_length);
142 |     quick_sort_key(&key[small_length + 1], length - small_length - 1);
143 | }
144 | 
145 | // quick sort key-value pair (main function)
146 | void quick_sort_key(int *key, int length)
147 | {
148 |     if(length == 0 || length == 1)
149 |         return;
150 | 
151 |     int small_length = partition(key, length, 0) ;
152 |     quick_sort_key(key, small_length);
153 |     quick_sort_key(&key[small_length + 1], length - small_length - 1);
154 | }
155 | 
156 | template<typename T>
157 | void swap(T *a , T *b)
158 | {
159 |     T tmp = *a;
160 |     *a = *b;
161 |     *b = tmp;
162 | }
163 | 
164 | // quick sort key-value pair (child function)
165 | template<typename iT, typename vT>
166 | int partition(iT *key, vT *val, int length, int pivot_index)
167 | {
168 |     int i  = 0 ;
169 |     int small_length = pivot_index;
170 | 
171 |     iT pivot = key[pivot_index];
172 |     swap<iT>(&key[pivot_index], &key[pivot_index + (length - 1)]);
173 |     swap<vT>(&val[pivot_index], &val[pivot_index + (length - 1)]);
174 | 
175 |     for(; i < length; i++)
176 |     {
177 |         if(key[pivot_index+i] < pivot)
178 |         {
179 |             swap<iT>(&key[pivot_index+i],  &key[small_length]);
180 |             swap<vT>(&val[pivot_index+i],&val[small_length]);
181 |             small_length++;
182 |         }
183 |     }
184 | 
185 |     swap<iT>(&key[pivot_index + length - 1],  &key[small_length]);
186 |     swap<vT>(&val[pivot_index + length - 1],&val[small_length]);
187 | 
188 |     return small_length;
189 | }
190 | 
191 | // quick sort key-value pair (main function)
192 | template<typename iT, typename vT>
193 | void quick_sort_key_val_pair(iT *key, vT *val, int length)
194 | {
195 |     if(length == 0 || length == 1)
196 |         return;
197 | 
198 |     int small_length = partition<iT, vT>(key, val, length, 0) ;
199 |     quick_sort_key_val_pair<iT, vT>(key, val, small_length);
200 |     quick_sort_key_val_pair<iT, vT>(&key[small_length + 1], &val[small_length + 1], length - small_length - 1);
201 | }
202 | /*
203 | template<typename iT>
204 | void move_block(iT* first,
205 |                 iT* last,
206 |                 iT* result)
207 | {
208 |     //memcpy(result, first, sizeof(iT) * (last - first));
209 |     while (first != last)
210 |     {
211 |         *result = *first;
212 |         ++result;
213 |         ++first;
214 |     }
215 | }
216 | 
217 | template<typename iT, typename vT>
218 | void serial_merge(iT* key_left_start,
219 |                   iT* key_left_end,
220 |                   iT* key_right_start,
221 |                   iT* key_right_end,
222 |                   iT* key_output,
223 |                   vT* val_left_start,
224 |                   vT* val_left_end,
225 |                   vT* val_right_start,
226 |                   vT* val_right_end,
227 |                   vT* val_output)
228 | {
229 |     while(key_left_start != key_left_end && key_right_start != key_right_end)
230 |     {
231 |         bool which = *key_right_start < *key_left_start;
232 |         //*key_output++ = std::move(which ? *key_right_start++ : *key_left_start++);
233 |         *key_output++ = which ? *key_right_start++ : *key_left_start++;
234 |         *val_output++ = which ? *val_right_start++ : *val_left_start++;
235 |     }
236 | 
237 |     //std::move( key_left_start, key_left_end, key_output );
238 |     move_block<iT>(key_left_start, key_left_end, key_output);
239 |     move_block<vT>(val_left_start, val_left_end, val_output);
240 | 
241 |     //std::move( key_right_start, key_right_end, key_output );
242 |     move_block<iT>(key_right_start, key_right_end, key_output);
243 |     move_block<vT>(val_right_start, val_right_end, val_output);
244 | }
245 | 
246 | // merge sequences [key_left_start,key_left_end) and [key_right_start,key_right_end)
247 | // to output [key_output, key_output+(key_left_end-key_left_start)+(key_right_end-key_right_start))
248 | template<typename iT, typename vT>
249 | void parallel_merge(iT* key_left_start,
250 |                     iT* key_left_end,
251 |                     iT* key_right_start,
252 |                     iT* key_right_end,
253 |                     iT* key_output,
254 |                     vT* val_left_start,
255 |                     vT* val_left_end,
256 |                     vT* val_right_start,
257 |                     vT* val_right_end,
258 |                     vT* val_output)
259 | {
260 |     const size_t MERGE_CUT_OFF = 2000;
261 | 
262 |     if( key_left_end - key_left_start + key_right_end - key_right_start <= MERGE_CUT_OFF)
263 |     {
264 |         serial_merge<iT, vT>(key_left_start, key_left_end, key_right_start, key_right_end, key_output,
265 |                              val_left_start, val_left_end, val_right_start, val_right_end, val_output);
266 |     }
267 |     else
268 |     {
269 |         iT *key_left_middle, *key_right_middle;
270 |         vT *val_left_middle, *val_right_middle;
271 | 
272 |         if(key_left_end - key_left_start < key_right_end - key_right_start)
273 |         {
274 |             key_right_middle = key_right_start + (key_right_end - key_right_start) / 2;
275 |             val_right_middle = val_right_start + (val_right_end - val_right_start) / 2;
276 | 
277 |             key_left_middle = std::upper_bound(key_left_start, key_left_end, *key_right_middle);
278 |             val_left_middle = val_left_start + (key_left_middle - key_left_start);
279 |         }
280 |         else
281 |         {
282 |             key_left_middle = key_left_start + (key_left_end - key_left_start) / 2;
283 |             val_left_middle = val_left_start + (val_left_end - val_left_start) / 2;
284 | 
285 |             key_right_middle = std::lower_bound(key_right_start, key_right_end, *key_left_middle);
286 |             val_right_middle = val_right_start + (key_right_middle - key_right_start);
287 |         }
288 | 
289 |         iT* key_output_middle = key_output + (key_left_middle - key_left_start) + (key_right_middle - key_right_start);
290 |         iT* val_output_middle = val_output + (val_left_middle - val_left_start) + (val_right_middle - val_right_start);
291 | 
292 | #pragma omp task
293 |         parallel_merge<iT, vT>(key_left_start,  key_left_middle, key_right_start,  key_right_middle, key_output,
294 |                                val_left_start,  val_left_middle, val_right_start,  val_right_middle, val_output);
295 |         parallel_merge<iT, vT>(key_left_middle, key_left_end,    key_right_middle, key_right_end,    key_output_middle,
296 |                                val_left_middle, val_left_end,    val_right_middle, val_right_end,    val_output_middle);
297 | #pragma omp taskwait
298 |     }
299 | }
300 | 
301 | // sorts [key_start,key_end).
302 | // key_temp[0:key_end-key_start) is temporary buffer supplied by caller.
303 | // result is in [key_start,key_end) if inplace==true,
304 | // otherwise in key_temp[0:key_end-key_start).
305 | template<typename iT, typename vT>
306 | void parallel_merge_sort(iT* key_start,
307 |                          iT* key_end,
308 |                          iT* key_temp,
309 |                          vT* val_start,
310 |                          vT* val_end,
311 |                          vT* val_temp,
312 |                          bool inplace)
313 | {
314 |     const size_t SORT_CUT_OFF = 500;
315 | 
316 |     if(key_end - key_start <= SORT_CUT_OFF)
317 |     {
318 |         //std::stable_sort(key_start, key_end);
319 |         int list_length = key_end - key_start;
320 |         quick_sort_key_val_pair(key_start, val_start, list_length);
321 | 
322 |         if(!inplace)
323 |         {
324 |             //std::move( key_start, key_end, key_temp );
325 |             move_block<iT>(key_start, key_end, key_temp);
326 |             move_block<vT>(val_start, val_end, val_temp);
327 |         }
328 |     }
329 |     else
330 |     {
331 |         iT* key_middle = key_start + (key_end - key_start) / 2;
332 |         vT* val_middle = val_start + (val_end - val_start) / 2;
333 |         iT* key_temp_middel = key_temp + (key_middle - key_start);
334 |         vT* val_temp_middel = val_temp + (val_middle - val_start);
335 |         iT* key_temp_end = key_temp + (key_end - key_start);
336 |         vT* val_temp_end = val_temp + (val_end - val_start);
337 | 
338 | #pragma omp task
339 |         parallel_merge_sort<iT, vT>(key_start,  key_middle, key_temp,
340 |                                     val_start,  val_middle, val_temp,
341 |                                     !inplace);
342 |         parallel_merge_sort<iT, vT>(key_middle, key_end,    key_temp_middel,
343 |                                     val_middle, val_end,    val_temp_middel,
344 |                                     !inplace);
345 | #pragma omp taskwait
346 |         if(inplace)
347 |             parallel_merge<iT, vT>(key_temp, key_temp_middel, key_temp_middel, key_temp_end, key_start,
348 |                                    val_temp, val_temp_middel, val_temp_middel, val_temp_end, val_start);
349 |         else
350 |             parallel_merge<iT, vT>(key_start, key_middle, key_middle, key_end, key_temp,
351 |                                    val_start, val_middle, val_middle, val_end, val_temp);
352 |    }
353 | }
354 | 
355 | // OpenMP tasks do not run in parallel unless launched inside a thread team.
356 | // This outer wrapper shows how to create the thread team and run the top-level call.
357 | template<typename iT, typename vT>
358 | void do_parallel_merge_sort(iT* key_start,
359 |                             iT* key_end,
360 |                             iT* key_temp,
361 |                             vT* val_start,
362 |                             vT* val_end,
363 |                             vT* val_temp,
364 |                             bool inplace)
365 | {
366 |     // Create a thread team.
367 | #pragma omp parallel
368 |     // Make only one thread do the top-level call.
369 |     // Other threads in team pick up spawned tasks.
370 | #pragma omp single
371 |     {
372 |         parallel_merge_sort<iT, vT>(key_start, key_end, key_temp,
373 |                                     val_start, val_end, val_temp,
374 |                                     inplace);
375 |     }
376 | }
377 | 
378 | // merge sort key-value pair (main function)
379 | template<typename iT, typename vT>
380 | void omp_merge_sort_key_val_pair(iT *key, vT *val, int length)
381 | {
382 |     //quick_sort_key_val_pair<iT, vT>(key, val, length);
383 | 
384 |     if(length == 0 || length == 1)
385 |         return;
386 | 
387 |     // allocate temp space for out-of-place merge sort
388 |     iT *key_temp = (iT *)malloc(length * sizeof(iT));
389 |     vT *val_temp = (vT *)malloc(length * sizeof(vT));
390 | 
391 |     bool inplace = true;
392 |     do_parallel_merge_sort<iT, vT>(&key[0], &key[length], key_temp,
393 |                                    &val[0], &val[length], val_temp,
394 |                                    inplace);
395 | 
396 |     // free temp space
397 |     free(key_temp);
398 |     free(val_temp);
399 | }*/
400 | 
401 | // in-place exclusive scan
402 | template<typename T>
403 | void exclusive_scan(T *input, int length)
404 | {
405 |     if(length == 0 || length == 1)
406 |         return;
407 | 
408 |     T old_val, new_val;
409 | 
410 |     old_val = input[0];
411 |     input[0] = 0;
412 |     for (int i = 1; i < length; i++)
413 |     {
414 |         new_val = input[i];
415 |         input[i] = old_val + input[i-1];
416 |         old_val = new_val;
417 |     }
418 | }
419 | 
420 | // segmented sum
421 | template<typename vT, typename bT>
422 | void segmented_sum(vT *input, bT *bit_flag, int length)
423 | {
424 |     if(length == 0 || length == 1)
425 |         return;
426 | 
427 |     for (int i = 0; i < length; i++)
428 |     {
429 |         if (bit_flag[i])
430 |         {
431 |             int j = i + 1;
432 |             while (!bit_flag[j] && j < length)
433 |             {
434 |                 input[i] += input[j];
435 |                 j++;
436 |             }
437 |         }
438 |     }
439 | }
440 | 
441 | // reduce sum
442 | template<typename T>
443 | T reduce_sum(T *input, int length)
444 | {
445 |     if(length == 0)
446 |         return 0;
447 | 
448 |     T sum = 0;
449 | 
450 |     for (int i = 0; i < length; i++)
451 |     {
452 |         sum += input[i];
453 |     }
454 | 
455 |     return sum;
456 | }
457 | 
458 | #endif
459 | 


--------------------------------------------------------------------------------
/src/external/cusparse/utils_cuda_matinfo.h:
--------------------------------------------------------------------------------
 1 | #ifndef _MATRIX_INFO_UTILS_
 2 | #define _MATRIX_INFO_UTILS_
 3 | 
 4 | #include "common.h"
 5 | #include "utils.h"
 6 | 
 7 | double get_variation(const int *row_ptr,
 8 |                      const int  m)
 9 | {
10 |     int nnz = row_ptr[m];
11 |     double mean, stddev, skewness, variation, variance;
12 | 
13 |     mean       = double(nnz) / m;
14 |     variance = 0.0;
15 |     skewness = 0.0;
16 |     for (int i = 0; i < m; i++)
17 |     {
18 |         int len = row_ptr[i + 1] - row_ptr[i];
19 |         double delta = double(len) - mean;
20 |         variance   += (delta * delta);
21 |         skewness   += (delta * delta * delta);
22 |     }
23 |     variance  = variance / m;
24 |     stddev    = sqrt(variance);
25 |     skewness  = (skewness / m) / pow(stddev, 3.0);
26 |     variation = stddev / mean;
27 | 
28 |     return variation;
29 | }
30 | 
31 | double get_variation_trans(const int *row_ptr,
32 |                            const int *col_idx,
33 |                            const int  m,
34 |                            const int  n)
35 | {
36 |     int nnz = row_ptr[m];
37 |     int *col_ptr = (int *)malloc((n+1)*sizeof(int));
38 |     for (int i = 0; i < n+1; i++) col_ptr[i] = 0;
39 | 
40 |     for(int i = 0; i < nnz; i++)
41 |     {
42 |         int j = col_idx[i];
43 |         col_ptr[j]++;
44 |     }
45 |     exclusive_scan<int>(col_ptr, n+1);
46 | 
47 |     double variation = get_variation(col_ptr, n);
48 |     free(col_ptr);
49 | 
50 |     return variation;
51 | }
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/src/external/cusparse/utils_cuda_segsort_subfunc/segsort_subfunc_fast_bin.h:
--------------------------------------------------------------------------------
  1 | #ifndef _FAST_BIN
  2 | #define _FAST_BIN
  3 | 
  4 | 
  5 | #include <thrust/device_ptr.h>
  6 | #include <thrust/scan.h>
  7 | #include <vector>
  8 | #include <algorithm>
  9 | //#include "fast_utils.h"
 10 | using namespace std;
 11 | 
 12 | // #define BINRULE1
 13 | // #define BINRULE2
 14 | 
 15 | #define SEGBIN_NUM 13
 16 | 
 17 | __global__
 18 | void fast_bin_step1(int *d_bin_counter, const int *d_segs, int length, int n)
 19 | {
 20 |     const int global_id = blockIdx.x * blockDim.x + threadIdx.x; 
 21 |     __shared__ int s_segbin_counter[SEGBIN_NUM];
 22 |     if (threadIdx.x < SEGBIN_NUM)
 23 |         s_segbin_counter[threadIdx.x] = 0;
 24 |     __syncthreads();
 25 | 
 26 |     if (global_id < length)
 27 |     {
 28 |         const int size = ((global_id == length-1)?n:d_segs[global_id+1]) - d_segs[global_id];
 29 | 
 30 |         if (size <= 1)
 31 |             atomicAdd((int *)&s_segbin_counter[0 ], 1);
 32 |         if (1 < size && size <= 2)
 33 |             atomicAdd((int *)&s_segbin_counter[1 ], 1);
 34 |         if (2 < size && size <= 4)
 35 |             atomicAdd((int *)&s_segbin_counter[2 ], 1);
 36 |         if (4 < size && size <= 8)
 37 |             atomicAdd((int *)&s_segbin_counter[3 ], 1);
 38 |         if (8 < size && size <= 16)
 39 |             atomicAdd((int *)&s_segbin_counter[4 ], 1);
 40 |         if (16 < size && size <= 32)
 41 |             atomicAdd((int *)&s_segbin_counter[5 ], 1);
 42 |         if (32 < size && size <= 64)
 43 |             atomicAdd((int *)&s_segbin_counter[6 ], 1);
 44 |         if (64 < size && size <= 128)
 45 |             atomicAdd((int *)&s_segbin_counter[7 ], 1);
 46 |         if (128 < size && size <= 256)
 47 |             atomicAdd((int *)&s_segbin_counter[8 ], 1);
 48 |         if (256 < size && size <= 512)
 49 |             atomicAdd((int *)&s_segbin_counter[9 ], 1);
 50 |         if (512 < size && size <= 1024)
 51 |             atomicAdd((int *)&s_segbin_counter[10], 1);
 52 |         if (1024 < size && size <= 2048)
 53 |             atomicAdd((int *)&s_segbin_counter[11], 1);
 54 |         if (2048 < size)
 55 |             atomicAdd((int *)&s_segbin_counter[12], 1);
 56 |     }
 57 |     __syncthreads();
 58 |     if (threadIdx.x < SEGBIN_NUM)
 59 |         atomicAdd((int *)&d_bin_counter[threadIdx.x], s_segbin_counter[threadIdx.x]);
 60 | }
 61 | 
 62 | template<class T>
 63 | void fast_excl_scan(T *in, const int length)
 64 | {
 65 |     thrust::device_ptr<int> d_array_thrust = thrust::device_pointer_cast(in);
 66 |     thrust::exclusive_scan(d_array_thrust, d_array_thrust + length, d_array_thrust);
 67 | }
 68 | 
 69 | __global__
 70 | void fast_bin_step2(int *d_bin_segs_id, int *d_bin_counter, 
 71 |         const int *d_segs, const int length, const int n)
 72 | {
 73 |     const int global_id = blockIdx.x * blockDim.x + threadIdx.x;
 74 | 
 75 |     if (global_id < length)
 76 |     {
 77 |         const int size = ((global_id == length-1)?n:d_segs[global_id+1]) - d_segs[global_id];
 78 |         int position;
 79 |         if (size <= 1)
 80 |             position = atomicAdd((int *)&d_bin_counter[0 ], 1);
 81 |         else if (size <= 2)                              
 82 |             position = atomicAdd((int *)&d_bin_counter[1 ], 1);
 83 |         else if (size <= 4)                              
 84 |             position = atomicAdd((int *)&d_bin_counter[2 ], 1);
 85 |         else if (size <= 8)                              
 86 |             position = atomicAdd((int *)&d_bin_counter[3 ], 1);
 87 |         else if (size <= 16)                             
 88 |             position = atomicAdd((int *)&d_bin_counter[4 ], 1);
 89 |         else if (size <= 32)                             
 90 |             position = atomicAdd((int *)&d_bin_counter[5 ], 1);
 91 |         else if (size <= 64)                             
 92 |             position = atomicAdd((int *)&d_bin_counter[6 ], 1);
 93 |         else if (size <= 128)                            
 94 |             position = atomicAdd((int *)&d_bin_counter[7 ], 1);
 95 |         else if (size <= 256)                            
 96 |             position = atomicAdd((int *)&d_bin_counter[8 ], 1);
 97 |         else if (size <= 512)                            
 98 |             position = atomicAdd((int *)&d_bin_counter[9 ], 1);
 99 |         else if (size <= 1024)
100 |             position = atomicAdd((int *)&d_bin_counter[10], 1);
101 |         else if (size <= 2048)
102 |             position = atomicAdd((int *)&d_bin_counter[11], 1);
103 |         else
104 |             position = atomicAdd((int *)&d_bin_counter[12], 1);
105 |         d_bin_segs_id[position] = global_id;
106 |     }
107 | 
108 | }
109 | 
110 | void fast_bin_cuda(int *d_bin_segs_id, int *d_bin_counter, const int *d_segs, 
111 |         const int length, const int n, int *h_bin_counter)
112 | {
113 | 
114 |     const int num_threads = 256;
115 |     const int num_blocks = ceil((double)length/(double)num_threads);
116 | 
117 | #ifdef __PROF
118 |     double time0, time1;
119 |     time0 = dtime();
120 | #endif
121 | 
122 |     fast_bin_step1<<< num_blocks, num_threads >>>(d_bin_counter, d_segs, length, n);
123 | 
124 | #ifdef __PROF
125 |     cudaDeviceSynchronize();
126 |     time1 = dtime();
127 |     cout << "time bin_step1(ms): " << time1 - time0 << endl;
128 | #endif
129 | 
130 | 
131 |     // show_me_d(d_bin_counter, SEGBIN_NUM, "bin_counter:");
132 | 
133 | #ifdef __PROF
134 |     time0 = dtime();
135 | #endif
136 | 
137 |     fast_excl_scan(d_bin_counter, SEGBIN_NUM);
138 | 
139 |     cudaMemcpyAsync(h_bin_counter, d_bin_counter, SEGBIN_NUM*sizeof(int), cudaMemcpyDeviceToHost);
140 | #ifdef __PROF
141 |     cudaDeviceSynchronize();
142 |     time1 = dtime();
143 |     cout << "time bin_scan(ms): " << time1 - time0 << endl;
144 | #endif
145 | 
146 |     // show_me_d(d_bin_counter, SEGBIN_NUM, "bin_counter(scan):");
147 | 
148 | #ifdef __PROF
149 |     time0 = dtime();
150 | #endif
151 | 
152 |     fast_bin_step2<<<num_blocks, num_threads>>>(d_bin_segs_id, d_bin_counter, d_segs, length, n);
153 | 
154 | #ifdef __PROF
155 |     cudaDeviceSynchronize();
156 |     time1 = dtime();
157 |     cout << "time bin_step2(ms): " << time1 - time0 << endl;
158 | #endif
159 | 
160 | }
161 | 
162 | 
163 | void fast_bin_cpu(int *ref_bin_segs_id, const int *segs, 
164 |         const int length, const int n)
165 | {
166 |     vector<int> bin_counter(SEGBIN_NUM, 0);
167 |     
168 |     for(int i = 0; i < length; i++)
169 |     {
170 |         const int size = ((i == length-1)?n:segs[i+1]) - segs[i];
171 | 
172 |         if (size <= 1)
173 |             bin_counter[0]++;
174 |         else if (size <= 2)
175 |             bin_counter[1]++;
176 |         else if (size <= 4)
177 |             bin_counter[2]++;
178 |         else if (size <= 8)
179 |             bin_counter[3]++;
180 |         else if (size <= 16)
181 |             bin_counter[4]++;
182 |         else if (size <= 32)
183 |             bin_counter[5]++;
184 |         else if (size <= 64)
185 |             bin_counter[6]++;
186 |         else if (size <= 128)
187 |             bin_counter[7]++;
188 |         else if (size <= 256)
189 |             bin_counter[8]++;
190 |         else if (size <= 512)
191 |             bin_counter[9]++;
192 |         else if (size <= 1024)
193 |             bin_counter[10]++;
194 |         else if (size <= 2048)
195 |             bin_counter[11]++;
196 |         else
197 |             bin_counter[12]++;
198 | 
199 | 
200 |     }
201 | 
202 |     // show_me(&bin_counter[0], SEGBIN_NUM, "bin_counter(cpu):");
203 | 
204 |     int sum = 0;
205 |     for(int i = 0; i < SEGBIN_NUM; i++)
206 |     {
207 |         int tmp = bin_counter[i];
208 |         bin_counter[i] = sum;
209 |         sum += tmp;
210 |     }
211 |     cout << "ratio_sm: " << (double)bin_counter[9]/sum << endl;
212 |     cout << "ratio_md: " << (double)(bin_counter[11]-bin_counter[9])/sum << endl;
213 |     cout << "ratio_lg: " << (double)(sum-bin_counter[SEGBIN_NUM-1])/sum << endl;
214 | 
215 |     // show_me(&bin_counter[0], SEGBIN_NUM, "bin_counter_scan(cpu):");
216 |     
217 |     for(int i = 0; i < length; i++)
218 |     {
219 |         const int size = ((i == length-1)?n:segs[i+1]) - segs[i];
220 |         int position;
221 | 
222 |         if (size <= 1)
223 |             position = bin_counter[0 ]++;
224 |         else if (size <= 2)          
225 |             position = bin_counter[1 ]++;
226 |         else if (size <= 4)          
227 |             position = bin_counter[2 ]++;
228 |         else if (size <= 8)          
229 |             position = bin_counter[3 ]++;
230 |         else if (size <= 16)         
231 |             position = bin_counter[4 ]++;
232 |         else if (size <= 32)         
233 |             position = bin_counter[5 ]++;
234 |         else if (size <= 64)         
235 |             position = bin_counter[6 ]++;
236 |         else if (size <= 128)        
237 |             position = bin_counter[7 ]++;
238 |         else if (size <= 256)        
239 |             position = bin_counter[8 ]++;
240 |         else if (size <= 512)        
241 |             position = bin_counter[9 ]++;
242 |         else if (size <= 1024)
243 |             position = bin_counter[10]++;
244 |         else if (size <= 2048)
245 |             position = bin_counter[11]++;
246 |         else
247 |             position = bin_counter[12]++;
248 | 
249 |         ref_bin_segs_id[position] = i;
250 |     }
251 | 
252 | }
253 | 
254 | 
255 | #endif
256 | 


--------------------------------------------------------------------------------
/src/external/cusparse/utils_cuda_segsort_subfunc/segsort_subfunc_kern_copy_unit.h:
--------------------------------------------------------------------------------
 1 | template<class T>
 2 | __global__
 3 | void gen_copy( 
 4 |     uintT *key, T *val, uintT *keyB, T *valB, int n, int *segs, int *bin, int bin_size, int length) {
 5 | 
 6 |     const int gid = threadIdx.x + blockIdx.x * blockDim.x;
 7 |     const int bin_it = gid;
 8 |     int k;
 9 |     if(bin_it < bin_size) {
10 |         k = segs[bin[bin_it]];
11 |         keyB[k] = key[k];
12 |         valB[k] = val[k];
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/src/external/cusparse/utils_cuda_segsort_subfunc/segsort_subfunc_kern_mergepath_func.h:
--------------------------------------------------------------------------------
 1 | __device__ int find_kth3(uintT* a,
 2 |                           int aCount,
 3 |                           uintT* b,
 4 |                           int bCount,
 5 |                           int diag)
 6 | {
 7 |     int begin = max(0, diag - bCount);
 8 |     int end = min(diag, aCount);
 9 |  
10 |     while(begin < end) {
11 |         int mid = (begin + end)>> 1;
12 |         uintT aKey = a[mid];
13 |         uintT bKey = b[diag - 1 - mid];
14 |         bool pred = aKey <= bKey;
15 |         if(pred) begin = mid + 1;
16 |         else end = mid;
17 |     }
18 |     return begin;
19 | }
20 | 


--------------------------------------------------------------------------------
/src/hash.h:
--------------------------------------------------------------------------------
 1 | #include"common.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | 
 6 | void block_mul( const int *flag, const int *mA, SMatrix *submatrixA,SMatrix *submatrixB,SMatrix *submatrixC,
 7 |                 int blockCid,int *nnzAnum,int *nnzBnum)
 8 | {
 9 |     int *num;
10 |     num=(int*)malloc(((*mA)*SubNum)*sizeof(int));
11 | 	memset(num,0,((*mA)*SubNum)*sizeof(int));
12 | 	for (int colid=0;colid<SubNum;colid++)
13 | 	{
14 | 		int A=(blockCid/SubNum)*SubNum+colid;
15 | 		int B=blockCid%SubNum+colid*SubNum;
16 | 		//printf("A=%d,nnz[A]=%d\n",A,nnzAnum[A]);
17 | 		//printf("B=%d,nnz[B]=%d\n",B,nnzAnum[B]);
18 | 		int j=0;
19 | 		if (nnzAnum[A]!=0&&nnzBnum[B]!=0){
20 | 			for (int i=0;i<(*mA);i++){
21 | 				while(j<submatrixA[A].rowpointer[i+1])
22 | 				{	
23 | 					num[colid*(*mA)+i]+=submatrixB[B].rowpointer[submatrixA[A].columnindex[j]+1]-submatrixB[B].rowpointer[submatrixA[A].columnindex[j]];
24 | 					j++;
25 | 				}
26 | 			//	sum+=num[q*row+i];
27 | 			}
28 | 		}
29 | 	}
30 |     int *rowCub; //calculate tasks for each row in subC
31 | 	rowCub=(int *)malloc((*mA)*sizeof(int));
32 | 	memset(rowCub,0,(*mA));
33 | 	for (int i=0;i<(*mA);i++)
34 | 	{
35 | 		for (int k=0;k<SubNum;k++)
36 | 			rowCub[i]+=num[k*(*mA)+i];
37 | 	}
38 | 
39 | 	
40 | 	for (int iid=0;iid<(*mA);iid++)
41 | 	{
42 | 		hashsize_full_reg=rowCub[iid];
43 | 		int *tmpIdx2D0 = (int *)malloc(hashsize_full_reg* sizeof(int));  //index 
44 | 		
45 | 	}
46 |                     
47 | 
48 | }


--------------------------------------------------------------------------------
/src/main.cu:
--------------------------------------------------------------------------------
  1 | #include"common.h"
  2 | #include"mmio_highlevel.h"
  3 | #include"utils.h"
  4 | #include"utils_cuda_scan.h"
  5 | #include "spgemm_nsparse_kernel.h"
  6 | #include "csr2tile.h"
  7 | #include "tilespgemm-cuda.h"
  8 | #include "spgemm-cpu.h"
  9 | #include "tile2csr.h"
 10 | #include "spgemm_serialref_spa_new.h"
 11 | #include "spgemm_cu.h"
 12 | 
 13 | int main(int argc, char ** argv)
 14 | {
 15 | 
 16 | 	if (argc < 6)
 17 |     {
 18 |         printf("Run the code by './test -d 0 -aat 0 matrix.mtx'.\n");
 19 |         return 0;
 20 |     }
 21 | 	
 22 |     printf("--------------------------------!!!!!!!!------------------------------------\n");
 23 |     
 24 |     int device_id = 0;
 25 |     int aat = 0;
 26 | 
 27 |     // "Usage: ``./test -d 0 -aat 0 A.mtx'' for C=AA  on device 0", or
 28 |     // "Usage: ``./test -d 0 -aat 1 A.mtx'' for C=AAT on device 0"
 29 |     int argi = 1;
 30 | 
 31 |     // load device id
 32 |     char *devstr;
 33 |     if(argc > argi)
 34 |     {
 35 |         devstr = argv[argi];
 36 |         argi++;
 37 |     }
 38 | 
 39 |     if (strcmp(devstr, "-d") != 0) return 0;
 40 | 
 41 |     if(argc > argi)
 42 |     {
 43 |         device_id = atoi(argv[argi]);
 44 |         argi++;
 45 |     }
 46 |     printf("device_id = %i\n", device_id);
 47 |     
 48 |     // set device
 49 |     cudaSetDevice(device_id);
 50 |     cudaDeviceProp deviceProp;
 51 |     cudaGetDeviceProperties(&deviceProp, device_id);
 52 | 
 53 |     // Set aside 50% of L2 cache for persisting accesses 
 54 |     size_t size = min( int(deviceProp.l2CacheSize * 0.80) , deviceProp.persistingL2CacheMaxSize );
 55 |     cudaDeviceSetLimit( cudaLimitPersistingL2CacheSize, size); 
 56 | 
 57 |     printf("---------------------------------------------------------------\n");
 58 |     printf("Device [ %i ] %s @ %4.2f MHz\n",
 59 |            device_id, deviceProp.name, deviceProp.clockRate * 1e-3f);
 60 |            
 61 |     // load AAT flag
 62 |     char *aatstr;
 63 |     if(argc > argi)
 64 |     {
 65 |         aatstr = argv[argi];
 66 |         argi++;
 67 |     }
 68 | 
 69 |     if (strcmp(aatstr, "-aat") != 0) return 0;
 70 | 
 71 |     if(argc > argi)
 72 |     {
 73 |         aat = atoi(argv[argi]);
 74 |         argi++;
 75 |     }
 76 | 
 77 |  	struct timeval t1, t2;
 78 | 	SMatrix *matrixA = (SMatrix *)malloc(sizeof(SMatrix));
 79 | 	SMatrix *matrixB = (SMatrix *)malloc(sizeof(SMatrix));
 80 | 
 81 | 	char  *filename;
 82 |     filename = argv[argi];
 83 |     printf("MAT: -------------- %s --------------\n", filename);
 84 | 
 85 |     // load mtx A data to the csr format
 86 |     gettimeofday(&t1, NULL);
 87 |     mmio_allinone(&matrixA->m, &matrixA->n, &matrixA->nnz, &matrixA->isSymmetric, &matrixA->rowpointer, &matrixA->columnindex, &matrixA->value, filename);
 88 |     gettimeofday(&t2, NULL);
 89 |     double time_loadmat  = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
 90 |     printf("input matrix A: ( %i, %i ) nnz = %i\n loadfile time    = %4.5f sec\n", matrixA->m, matrixA->n, matrixA->nnz, time_loadmat/1000.0);
 91 | 
 92 |     if (!aat &&  matrixA->m != matrixA->n)
 93 |     {
 94 |         printf("matrix squaring must have rowA == colA. Exit.\n");
 95 |         return 0;
 96 |     }
 97 | 
 98 |     printf("the tilesize = %d\n",BLOCK_SIZE);
 99 | 
100 | 	for (int i = 0; i < matrixA->nnz; i++)
101 | 	    matrixA->value[i] = i % 10;
102 | 
103 |     if (aat)
104 |     {
105 |         MAT_PTR_TYPE *cscColPtrA;
106 |         int *cscRowIdxA;
107 |         MAT_VAL_TYPE *cscValA ;
108 |     
109 |         if (matrixA->m == matrixA->n && matrixA->isSymmetric)
110 |         {
111 |            printf("matrix AAT does not do symmetric matrix. Exit.\n");
112 |            return 0;
113 |         }
114 | 
115 |         matrixB->m = matrixA->n ;
116 |         matrixB->n = matrixA->m ;
117 |         matrixB->nnz = matrixA->nnz ;
118 | 
119 |         cscColPtrA = (MAT_PTR_TYPE *)malloc((matrixA->n + 1) * sizeof(MAT_PTR_TYPE));
120 |         cscRowIdxA = (int *)malloc(matrixA->nnz   * sizeof(int));
121 |         cscValA    = (MAT_VAL_TYPE *)malloc(matrixA->nnz  * sizeof(MAT_VAL_TYPE));
122 | 
123 |         // transpose A from csr to csc
124 |         matrix_transposition(matrixA->m, matrixA->n, matrixA->nnz, matrixA->rowpointer, matrixA->columnindex, matrixA->value,cscRowIdxA, cscColPtrA, cscValA);
125 | 
126 |         matrixB->rowpointer = cscColPtrA;
127 |         matrixB->columnindex = cscRowIdxA;
128 |         matrixB->value    = cscValA;
129 | 
130 | 
131 |     }
132 |     else
133 |     {
134 |         matrixB->m = matrixA->m ;
135 |         matrixB->n = matrixA->n ;
136 |         matrixB->nnz = matrixA->nnz ;
137 | 
138 |         matrixB->rowpointer = matrixA->rowpointer;
139 |         matrixB->columnindex = matrixA->columnindex;
140 |         matrixB->value    = matrixA->value;
141 |     }
142 | 
143 |         // calculate bytes and flops consumed
144 |         unsigned long long int nnzCub = 0;
145 |         for (int i = 0; i < matrixA->nnz; i++)
146 |         {
147 |             int rowidx = matrixA->columnindex[i];
148 |             nnzCub += matrixB->rowpointer[rowidx + 1] - matrixB->rowpointer[rowidx];
149 |         }
150 |     
151 |         printf("SpGEMM nnzCub = %lld\n", nnzCub);
152 | 
153 | #if TIMING
154 |         gettimeofday(&t1, NULL);
155 | #endif
156 | 
157 |         csr2tile_row_major(matrixA);
158 | #if TIMING
159 |         gettimeofday(&t2, NULL);
160 |         double time_conversion = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
161 |         printf("CSR to Tile conversion uses %.2f ms\n", time_conversion);
162 | #endif
163 | 
164 | #if SPACE
165 | 
166 | double tile_bytes = (matrixA->tilem + 1) * sizeof(int) + matrixA->numtile * sizeof(int) + (matrixA->numtile + 1) *sizeof(int) +
167 |                 matrixA->nnz * sizeof(MAT_VAL_TYPE) + matrixA->nnz * sizeof(unsigned char) + matrixA->numtile * BLOCK_SIZE * sizeof(unsigned char) +
168 |                 matrixA->numtile * BLOCK_SIZE * sizeof(unsigned short);
169 | 
170 | double mem = tile_bytes/1024/1024;
171 | 
172 | double CSR_bytes = (matrixA->m +1) * sizeof(int) + (matrixA->nnz) * sizeof(int) + matrixA->nnz * sizeof(MAT_VAL_TYPE);
173 | double csr_mem = CSR_bytes /1024/1024;
174 | 
175 | printf("tile space overhead = %.2f MB\n", mem);
176 | 
177 | #endif
178 | 
179 |         csr2tile_col_major(matrixB);
180 | 
181 | 
182 |         int blk_intersec_bitmask_len = ceil((double)matrixA->tilen / 32.0);
183 |         double densityA = (double)matrixA->numtile / ((double)matrixA->tilem*(double)matrixA->tilen);
184 |         double densityB = (double)matrixB->numtile / ((double)matrixB->tilem*(double)matrixB->tilen);
185 | 
186 | 
187 |         long long int lengthA = (long long int) (matrixA->tilem) * (long long int)( blk_intersec_bitmask_len) ;
188 | 
189 |     unsigned int *blk_intersec_bitmask_A = (unsigned int *)malloc(lengthA* sizeof(unsigned int));
190 |     memset(blk_intersec_bitmask_A, 0, lengthA * sizeof(unsigned int));
191 |     for (int i = 0; i < matrixA->tilem; i++)
192 |     {
193 |         for (int j = matrixA->tile_ptr[i]; j < matrixA->tile_ptr[i + 1]; j++)
194 |         {
195 |             int idx = matrixA->tile_columnidx[j];
196 |             unsigned int bitmask = 1;
197 |             bitmask <<=  (31- (idx % 32));
198 |             long long int pos = (long long int)i * (long long int)blk_intersec_bitmask_len + idx / 32;
199 |             blk_intersec_bitmask_A[pos] |= bitmask;
200 |         }
201 |     }
202 | 
203 |     long long int lengthB = (long long int) (matrixB->tilen) * (long long int)(blk_intersec_bitmask_len) ;
204 | 
205 |     unsigned int *blk_intersec_bitmask_B = (unsigned int *)malloc(lengthB * sizeof(unsigned int));
206 |     memset(blk_intersec_bitmask_B, 0, lengthB * sizeof(unsigned int));
207 |     for (int i = 0; i < matrixB->tilen; i++)
208 |     {
209 |         for (int j = matrixB->csc_tile_ptr[i]; j < matrixB->csc_tile_ptr[i+1]; j++)
210 |         {
211 |             int idx = matrixB->csc_tile_rowidx[j];
212 |             unsigned int bitmask = 0x1;
213 |             bitmask <<= (31 - (idx % 32));
214 |             long long int pos = (long long int)i * (long long int )blk_intersec_bitmask_len + idx / 32;
215 |             blk_intersec_bitmask_B[pos] |= bitmask;
216 |         }
217 |     }
218 | 
219 | 
220 |     // generate rowidx of blockA
221 |     int *tile_rowidx_A = (int *)malloc (matrixA->numtile * sizeof(int ) );
222 |     for (int i = 0; i < matrixA->tilem; i++)
223 |     {
224 |         for (int j = matrixA->tile_ptr[i]; j < matrixA->tile_ptr[i+1]; j++)
225 |         {
226 |             tile_rowidx_A[j] = i;
227 |         }
228 |     }
229 | 
230 | 
231 | 
232 | #ifdef DEBUG
233 |     // --------------------------------------------------------------------------------------------------------
234 |     SMatrix *matrixC = (SMatrix *)malloc(sizeof(SMatrix));
235 |     
236 |     struct timeval tv;
237 |     unsigned long long int nnzC_computed;
238 |     double compression_rate = 0;
239 |     double time_tile = 0;
240 |     double gflops_tile = 0;
241 |     double time_step1 =0,time_step2 =0,time_step3 =0,time_malloc=0; 
242 | 
243 | 
244 |     
245 | 
246 |     tilespgemm(matrixA,
247 |                matrixB,
248 |                matrixC,
249 |                blk_intersec_bitmask_A,
250 |                blk_intersec_bitmask_B,
251 |                blk_intersec_bitmask_len,
252 |                densityA,
253 |                densityB,
254 |                nnzCub,
255 |                &nnzC_computed,
256 |                &compression_rate,
257 |                &time_tile,
258 |                &gflops_tile,
259 |                filename,
260 |                &time_step1,&time_step2,&time_step3,&time_malloc);
261 | 
262 | 
263 |     // write results to text (scv) file
264 |     FILE *fout = fopen("../data/results_tile.csv", "a");
265 |     if (fout == NULL)
266 |         printf("Writing results fails.\n");
267 |     fprintf(fout, "%s,%i,%i,%i,%lld,%lld,%f,%f,%f\n",
268 |             filename, matrixA->m, matrixA->n, matrixA->nnz, nnzCub, nnzC_computed, compression_rate, time_tile, gflops_tile);
269 |     fclose(fout);
270 | 
271 |     // write runtime of each step to text (scv) file
272 |     FILE *fout_time = fopen("../data/step_runtime.csv", "a");
273 |     if (fout_time == NULL)
274 |         printf("Writing results fails.\n");
275 |     fprintf(fout_time, "%s,%i,%i,%i,%lld,%lld,%f,%f,%f,%f,%f\n",
276 |                 filename, matrixA->m, matrixA->n, matrixA->nnz, nnzCub, nnzC_computed, compression_rate, time_step1, time_step2,time_step3,time_malloc);
277 |     fclose(fout_time);
278 |     
279 | 
280 | #if SPACE
281 |     // write memory space of CSR and tile format to text (scv) file
282 |     FILE *fout_mem = fopen("../data/mem-cost.csv", "a");
283 |     if (fout_mem == NULL)
284 |         printf("Writing results fails.\n");
285 |     fprintf(fout_mem, "%s,%i,%i,%i,%lld,%lld,%f,%f,%f\n",
286 |                 filename, matrixA->m, matrixA->n, matrixA->nnz, nnzCub, nnzC_computed, compression_rate, csr_mem,mem);
287 |     fclose(fout_mem);
288 | 
289 | #endif
290 | 
291 | #if TIMING
292 | 
293 |     // write preprocessing overhead of CSR and tile format to text (scv) file
294 |     FILE *fout_pre = fopen("../data/preprocessing.csv", "a");
295 |     if (fout_pre == NULL)
296 |         printf("Writing results fails.\n");
297 |     fprintf(fout_pre, "%s,%i,%i,%i,%lld,%lld,%f,%f,%f\n",
298 |                     filename, matrixA->m, matrixA->n, matrixA->nnz, nnzCub, nnzC_computed, compression_rate, time_conversion,time_tile);
299 |     fclose(fout_pre);
300 |     
301 | #endif
302 | 
303 | 
304 | #endif
305 | 
306 | #if CHECK_RESULT
307 | printf("-------------------------------check----------------------------------------\n");
308 | tile2csr(matrixC);
309 |         printf("tile to CSR conversion complete!\n");
310 | 
311 |     unsigned long long int nnzC = 0;
312 |     double compression_rate1 = 0;
313 |     double time_cusparse = 0;
314 |     double gflops_cusparse = 0;
315 |     int flag =0;
316 |     int mC = matrixA->m;
317 |     int nC = matrixB->n;
318 |     int nnzC_golden = matrixC->nnz;
319 |     bool check_result = CHECK_RESULT;
320 | 
321 |     MAT_PTR_TYPE *csrRowPtrC_golden = matrixC->rowpointer;
322 |     int *csrColIdxC_golden = matrixC->columnindex;
323 |     MAT_VAL_TYPE *csrValC_golden = matrixC->value;
324 | 
325 |     spgemm_cu(matrixA->m, matrixA->n, matrixA->nnz, matrixA->rowpointer, matrixA->columnindex, matrixA->value,
326 |               matrixB->m, matrixB->n, matrixB->nnz, matrixB->rowpointer, matrixB->columnindex, matrixB->value,
327 |               mC, nC, nnzC_golden, csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden,
328 |               check_result, nnzCub, &nnzC, &compression_rate1, &time_cusparse, &gflops_cusparse);
329 |     printf("---------------------------------------------------------------\n");
330 | 
331 | #endif
332 |     matrix_destroy(matrixA);
333 |     matrix_destroy(matrixB);
334 | 
335 |     free(matrixA->rowpointer);
336 |     free(matrixA->columnindex);
337 |     free(matrixA->value);
338 | 
339 |     return 0;
340 | 
341 | }


--------------------------------------------------------------------------------
/src/mmio_highlevel.h:
--------------------------------------------------------------------------------
  1 | #ifndef _MMIO_HIGHLEVEL_
  2 | 
  3 | #define _MMIO_HIGHLEVEL_
  4 | 
  5 | 
  6 | 
  7 | #include "mmio.h"
  8 | 
  9 | #include "common.h"
 10 | 
 11 | 
 12 | 
 13 | // read matrix infomation from mtx file
 14 | 
 15 | int mmio_info(int *m, int *n, int *nnz, int *isSymmetric, char *filename)
 16 | 
 17 | {
 18 | 
 19 |     int m_tmp, n_tmp, nnz_tmp;
 20 | 
 21 | 
 22 | 
 23 |     int ret_code;
 24 | 
 25 |     MM_typecode matcode;
 26 | 
 27 |     FILE *f;
 28 | 
 29 | 
 30 | 
 31 |     int nnz_mtx_report;
 32 | 
 33 |     int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
 34 | 
 35 | 
 36 | 
 37 |     // load matrix
 38 | 
 39 |     if ((f = fopen(filename, "r")) == NULL)
 40 | 
 41 |         return -1;
 42 | 
 43 | 
 44 | 
 45 |     if (mm_read_banner(f, &matcode) != 0)
 46 | 
 47 |     {
 48 | 
 49 |         printf("Could not process Matrix Market banner.\n");
 50 | 
 51 |         return -2;
 52 | 
 53 |     }
 54 | 
 55 | 
 56 | 
 57 |     if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
 58 | 
 59 |     if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
 60 | 
 61 |     if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
 62 | 
 63 |     if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
 64 | 
 65 | 
 66 | 
 67 |     /* find out size of sparse matrix .... */
 68 | 
 69 |     ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
 70 | 
 71 |     if (ret_code != 0)
 72 | 
 73 |         return -4;
 74 | 
 75 | 
 76 | 
 77 |     if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
 78 | 
 79 |     {
 80 | 
 81 |         isSymmetric_tmp = 1;
 82 | 
 83 |         //printf("input matrix is symmetric = true\n");
 84 | 
 85 |     }
 86 | 
 87 |     else
 88 | 
 89 |     {
 90 | 
 91 |         //printf("input matrix is symmetric = false\n");
 92 | 
 93 |     }
 94 | 
 95 | 
 96 | 
 97 |     int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
 98 | 
 99 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
100 | 
101 | 
102 | 
103 |     int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
104 | 
105 |     int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
106 | 
107 |     MAT_VAL_TYPE *csrVal_tmp    = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
108 | 
109 | 
110 | 
111 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
112 | 
113 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
114 | 
115 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
116 | 
117 | 
118 | 
119 |     for (int i = 0; i < nnz_mtx_report; i++)
120 | 
121 |     {
122 | 
123 |         int idxi, idxj;
124 | 
125 |         double fval, fval_im;
126 | 
127 |         int ival;
128 | 
129 |         int returnvalue;
130 | 
131 | 
132 | 
133 |         if (isReal)
134 | 
135 |         {
136 | 
137 |             returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
138 | 
139 |         }
140 | 
141 |         else if (isComplex)
142 | 
143 |         {
144 | 
145 |             returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
146 | 
147 |         }
148 | 
149 |         else if (isInteger)
150 | 
151 |         {
152 | 
153 |             returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
154 | 
155 |             fval = ival;
156 | 
157 |         }
158 | 
159 |         else if (isPattern)
160 | 
161 |         {
162 | 
163 |             returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
164 | 
165 |             fval = 1.0;
166 | 
167 |         }
168 | 
169 | 
170 | 
171 |         // adjust from 1-based to 0-based
172 | 
173 |         idxi--;
174 | 
175 |         idxj--;
176 | 
177 | 
178 | 
179 |         csrRowPtr_counter[idxi]++;
180 | 
181 |         csrRowIdx_tmp[i] = idxi;
182 | 
183 |         csrColIdx_tmp[i] = idxj;
184 | 
185 |         csrVal_tmp[i] = fval;
186 | 
187 |     }
188 | 
189 | 
190 | 
191 |     if (f != stdin)
192 | 
193 |         fclose(f);
194 | 
195 | 
196 | 
197 |     if (isSymmetric_tmp)
198 | 
199 |     {
200 | 
201 |         for (int i = 0; i < nnz_mtx_report; i++)
202 | 
203 |         {
204 | 
205 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
206 | 
207 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
208 | 
209 |         }
210 | 
211 |     }
212 | 
213 | 
214 | 
215 |     // exclusive scan for csrRowPtr_counter
216 | 
217 |     int old_val, new_val;
218 | 
219 | 
220 | 
221 |     old_val = csrRowPtr_counter[0];
222 | 
223 |     csrRowPtr_counter[0] = 0;
224 | 
225 |     for (int i = 1; i <= m_tmp; i++)
226 | 
227 |     {
228 | 
229 |         new_val = csrRowPtr_counter[i];
230 | 
231 |         csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
232 | 
233 |         old_val = new_val;
234 | 
235 |     }
236 | 
237 | 
238 | 
239 |     nnz_tmp = csrRowPtr_counter[m_tmp];
240 | 
241 | 
242 | 
243 |     *m = m_tmp;
244 | 
245 |     *n = n_tmp;
246 | 
247 |     *nnz = nnz_tmp;
248 | 
249 |     *isSymmetric = isSymmetric_tmp;
250 | 
251 | 
252 | 
253 |     // free tmp space
254 | 
255 |     free(csrColIdx_tmp);
256 | 
257 |     free(csrVal_tmp);
258 | 
259 |     free(csrRowIdx_tmp);
260 | 
261 |     free(csrRowPtr_counter);
262 | 
263 | 
264 | 
265 |     return 0;
266 | 
267 | }
268 | 
269 | 
270 | 
271 | // read matrix infomation from mtx file
272 | 
273 | int mmio_data(int *csrRowPtr, int *csrColIdx, MAT_VAL_TYPE *csrVal, char *filename)
274 | 
275 | {
276 | 
277 |     int m_tmp, n_tmp, nnz_tmp;
278 | 
279 | 
280 | 
281 |     int ret_code;
282 | 
283 |     MM_typecode matcode;
284 | 
285 |     FILE *f;
286 | 
287 | 
288 | 
289 |     int nnz_mtx_report;
290 | 
291 |     int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
292 | 
293 | 
294 | 
295 |     // load matrix
296 | 
297 |     if ((f = fopen(filename, "r")) == NULL)
298 | 
299 |         return -1;
300 | 
301 | 
302 | 
303 |     if (mm_read_banner(f, &matcode) != 0)
304 | 
305 |     {
306 | 
307 |         printf("Could not process Matrix Market banner.\n");
308 | 
309 |         return -2;
310 | 
311 |     }
312 | 
313 | 
314 | 
315 |     if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
316 | 
317 |     if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
318 | 
319 |     if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
320 | 
321 |     if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
322 | 
323 | 
324 | 
325 |     /* find out size of sparse matrix .... */
326 | 
327 |     ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
328 | 
329 |     if (ret_code != 0)
330 | 
331 |         return -4;
332 | 
333 | 
334 | 
335 |     if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
336 | 
337 |     {
338 | 
339 |         isSymmetric_tmp = 1;
340 | 
341 |         //printf("input matrix is symmetric = true\n");
342 | 
343 |     }
344 | 
345 |     else
346 | 
347 |     {
348 | 
349 |         //printf("input matrix is symmetric = false\n");
350 | 
351 |     }
352 | 
353 | 
354 | 
355 |     int *csrRowPtr_counter = (int *)malloc((m_tmp+1) * sizeof(int));
356 | 
357 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
358 | 
359 | 
360 | 
361 |     int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
362 | 
363 |     int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
364 | 
365 |     MAT_VAL_TYPE *csrVal_tmp    = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
366 | 
367 | 
368 | 
369 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
370 | 
371 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
372 | 
373 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
374 | 
375 | 
376 | 
377 |     for (int i = 0; i < nnz_mtx_report; i++)
378 | 
379 |     {
380 | 
381 |         int idxi, idxj;
382 | 
383 |         double fval, fval_im;
384 | 
385 |         int ival;
386 | 
387 |         int returnvalue;
388 | 
389 | 
390 | 
391 |         if (isReal)
392 | 
393 |         {
394 | 
395 |             returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
396 | 
397 |         }
398 | 
399 |         else if (isComplex)
400 | 
401 |         {
402 | 
403 |             returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
404 | 
405 |         }
406 | 
407 |         else if (isInteger)
408 | 
409 |         {
410 | 
411 |             returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
412 | 
413 |             fval = ival;
414 | 
415 |         }
416 | 
417 |         else if (isPattern)
418 | 
419 |         {
420 | 
421 |             returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
422 | 
423 |             fval = 1.0;
424 | 
425 |         }
426 | 
427 | 
428 | 
429 |         // adjust from 1-based to 0-based
430 | 
431 |         idxi--;
432 | 
433 |         idxj--;
434 | 
435 | 
436 | 
437 |         csrRowPtr_counter[idxi]++;
438 | 
439 |         csrRowIdx_tmp[i] = idxi;
440 | 
441 |         csrColIdx_tmp[i] = idxj;
442 | 
443 |         csrVal_tmp[i] = fval;
444 | 
445 |     }
446 | 
447 | 
448 | 
449 |     if (f != stdin)
450 | 
451 |         fclose(f);
452 | 
453 | 
454 | 
455 |     if (isSymmetric_tmp)
456 | 
457 |     {
458 | 
459 |         for (int i = 0; i < nnz_mtx_report; i++)
460 | 
461 |         {
462 | 
463 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
464 | 
465 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
466 | 
467 |         }
468 | 
469 |     }
470 | 
471 | 
472 | 
473 |     // exclusive scan for csrRowPtr_counter
474 | 
475 |     int old_val, new_val;
476 | 
477 | 
478 | 
479 |     old_val = csrRowPtr_counter[0];
480 | 
481 |     csrRowPtr_counter[0] = 0;
482 | 
483 |     for (int i = 1; i <= m_tmp; i++)
484 | 
485 |     {
486 | 
487 |         new_val = csrRowPtr_counter[i];
488 | 
489 |         csrRowPtr_counter[i] = old_val + csrRowPtr_counter[i-1];
490 | 
491 |         old_val = new_val;
492 | 
493 |     }
494 | 
495 | 
496 | 
497 |     nnz_tmp = csrRowPtr_counter[m_tmp];
498 | 
499 |     memcpy(csrRowPtr, csrRowPtr_counter, (m_tmp+1) * sizeof(int));
500 | 
501 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(int));
502 | 
503 | 
504 | 
505 |     if (isSymmetric_tmp)
506 | 
507 |     {
508 | 
509 |         for (int i = 0; i < nnz_mtx_report; i++)
510 | 
511 |         {
512 | 
513 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
514 | 
515 |             {
516 | 
517 |                 int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
518 | 
519 |                 csrColIdx[offset] = csrColIdx_tmp[i];
520 | 
521 |                 csrVal[offset] = csrVal_tmp[i];
522 | 
523 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
524 | 
525 | 
526 | 
527 |                 offset = csrRowPtr[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
528 | 
529 |                 csrColIdx[offset] = csrRowIdx_tmp[i];
530 | 
531 |                 csrVal[offset] = csrVal_tmp[i];
532 | 
533 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
534 | 
535 |             }
536 | 
537 |             else
538 | 
539 |             {
540 | 
541 |                 int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
542 | 
543 |                 csrColIdx[offset] = csrColIdx_tmp[i];
544 | 
545 |                 csrVal[offset] = csrVal_tmp[i];
546 | 
547 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
548 | 
549 |             }
550 | 
551 |         }
552 | 
553 |     }
554 | 
555 |     else
556 | 
557 |     {
558 | 
559 |         for (int i = 0; i < nnz_mtx_report; i++)
560 | 
561 |         {
562 | 
563 |             int offset = csrRowPtr[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
564 | 
565 |             csrColIdx[offset] = csrColIdx_tmp[i];
566 | 
567 |             csrVal[offset] = csrVal_tmp[i];
568 | 
569 |             csrRowPtr_counter[csrRowIdx_tmp[i]]++;
570 | 
571 |         }
572 | 
573 |     }
574 | 
575 | 
576 | 
577 |     // free tmp space
578 | 
579 |     free(csrColIdx_tmp);
580 | 
581 |     free(csrVal_tmp);
582 | 
583 |     free(csrRowIdx_tmp);
584 | 
585 |     free(csrRowPtr_counter);
586 | 
587 | 
588 | 
589 |     return 0;
590 | 
591 | }
592 | // read matrix infomation from mtx file
593 | int mmio_allinone(int *m, int *n, MAT_PTR_TYPE *nnz, int *isSymmetric, 
594 |                   MAT_PTR_TYPE **csrRowPtr, int **csrColIdx, MAT_VAL_TYPE **csrVal, 
595 |                   char *filename)
596 | {
597 |     int m_tmp, n_tmp;
598 |     MAT_PTR_TYPE nnz_tmp;
599 | 
600 |     int ret_code;
601 |     MM_typecode matcode;
602 |     FILE *f;
603 | 
604 |     MAT_PTR_TYPE nnz_mtx_report;
605 |     int isInteger = 0, isReal = 0, isPattern = 0, isSymmetric_tmp = 0, isComplex = 0;
606 | 
607 |     // load matrix
608 |     if ((f = fopen(filename, "r")) == NULL)
609 |         return -1;
610 | 
611 |     if (mm_read_banner(f, &matcode) != 0)
612 |     {
613 |         printf("Could not process Matrix Market banner.\n");
614 |         return -2;
615 |     }
616 | 
617 |     if ( mm_is_pattern( matcode ) )  { isPattern = 1; /*printf("type = Pattern\n");*/ }
618 |     if ( mm_is_real ( matcode) )     { isReal = 1; /*printf("type = real\n");*/ }
619 |     if ( mm_is_complex( matcode ) )  { isComplex = 1; /*printf("type = real\n");*/ }
620 |     if ( mm_is_integer ( matcode ) ) { isInteger = 1; /*printf("type = integer\n");*/ }
621 | 
622 |     /* find out size of sparse matrix .... */
623 |     ret_code = mm_read_mtx_crd_size(f, &m_tmp, &n_tmp, &nnz_mtx_report);
624 |     if (ret_code != 0)
625 |         return -4;
626 | 
627 |     if ( mm_is_symmetric( matcode ) || mm_is_hermitian( matcode ) )
628 |     {
629 |         isSymmetric_tmp = 1;
630 |         //printf("input matrix is symmetric = true\n");
631 |     }
632 |     else
633 |     {
634 |         //printf("input matrix is symmetric = false\n");
635 |     }
636 | 
637 |     MAT_PTR_TYPE *csrRowPtr_counter = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
638 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
639 | 
640 |     int *csrRowIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
641 |     int *csrColIdx_tmp = (int *)malloc(nnz_mtx_report * sizeof(int));
642 |     MAT_VAL_TYPE *csrVal_tmp    = (MAT_VAL_TYPE *)malloc(nnz_mtx_report * sizeof(MAT_VAL_TYPE));
643 | 
644 |     /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
645 |     /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
646 |     /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
647 | 
648 |     for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
649 |     {
650 |         int idxi, idxj;
651 |         double fval, fval_im;
652 |         int ival;
653 |         int returnvalue;
654 | 
655 |         if (isReal)
656 |         {
657 |             returnvalue = fscanf(f, "%d %d %lg\n", &idxi, &idxj, &fval);
658 |         }
659 |         else if (isComplex)
660 |         {
661 |             returnvalue = fscanf(f, "%d %d %lg %lg\n", &idxi, &idxj, &fval, &fval_im);
662 |         }
663 |         else if (isInteger)
664 |         {
665 |             returnvalue = fscanf(f, "%d %d %d\n", &idxi, &idxj, &ival);
666 |             fval = ival;
667 |         }
668 |         else if (isPattern)
669 |         {
670 |             returnvalue = fscanf(f, "%d %d\n", &idxi, &idxj);
671 |             fval = 1.0;
672 |         }
673 | 
674 |         // adjust from 1-based to 0-based
675 |         idxi--;
676 |         idxj--;
677 |         
678 |         csrRowPtr_counter[idxi]++;
679 |         csrRowIdx_tmp[i] = idxi;
680 |         csrColIdx_tmp[i] = idxj;
681 |         csrVal_tmp[i] = fval;
682 |     }
683 | 
684 |     if (f != stdin)
685 |         fclose(f);
686 | 
687 |     if (isSymmetric_tmp)
688 |     {
689 |         for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
690 |         {
691 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
692 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
693 |         }
694 |     }
695 | 
696 |     // exclusive scan for csrRowPtr_counter
697 |     exclusive_scan(csrRowPtr_counter, m_tmp+1);
698 | 
699 |     MAT_PTR_TYPE *csrRowPtr_alias = (MAT_PTR_TYPE *)malloc((m_tmp+1) * sizeof(MAT_PTR_TYPE));
700 |     nnz_tmp = csrRowPtr_counter[m_tmp];
701 |     int *csrColIdx_alias = (int *)malloc(nnz_tmp * sizeof(int));
702 |     MAT_VAL_TYPE *csrVal_alias    = (MAT_VAL_TYPE *)malloc(nnz_tmp * sizeof(MAT_VAL_TYPE));
703 | 
704 |     memcpy(csrRowPtr_alias, csrRowPtr_counter, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
705 |     memset(csrRowPtr_counter, 0, (m_tmp+1) * sizeof(MAT_PTR_TYPE));
706 | 
707 |     if (isSymmetric_tmp)
708 |     {
709 |         for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
710 |         {
711 |             if (csrRowIdx_tmp[i] != csrColIdx_tmp[i])
712 |             {
713 |                 MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
714 |                 csrColIdx_alias[offset] = csrColIdx_tmp[i];
715 |                 csrVal_alias[offset] = csrVal_tmp[i];
716 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
717 | 
718 |                 offset = csrRowPtr_alias[csrColIdx_tmp[i]] + csrRowPtr_counter[csrColIdx_tmp[i]];
719 |                 csrColIdx_alias[offset] = csrRowIdx_tmp[i];
720 |                 csrVal_alias[offset] = csrVal_tmp[i];
721 |                 csrRowPtr_counter[csrColIdx_tmp[i]]++;
722 |             }
723 |             else
724 |             {
725 |                 MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
726 |                 csrColIdx_alias[offset] = csrColIdx_tmp[i];
727 |                 csrVal_alias[offset] = csrVal_tmp[i];
728 |                 csrRowPtr_counter[csrRowIdx_tmp[i]]++;
729 |             }
730 |         }
731 |     }
732 |     else
733 |     {
734 |         for (MAT_PTR_TYPE i = 0; i < nnz_mtx_report; i++)
735 |         {            
736 |             MAT_PTR_TYPE offset = csrRowPtr_alias[csrRowIdx_tmp[i]] + csrRowPtr_counter[csrRowIdx_tmp[i]];
737 |             csrColIdx_alias[offset] = csrColIdx_tmp[i];
738 |             csrVal_alias[offset] = csrVal_tmp[i];
739 |             csrRowPtr_counter[csrRowIdx_tmp[i]]++;
740 |         }
741 |     }
742 |     
743 |     *m = m_tmp;
744 |     *n = n_tmp;
745 |     *nnz = nnz_tmp;
746 |     *isSymmetric = isSymmetric_tmp;
747 | 
748 |     *csrRowPtr = csrRowPtr_alias;
749 |     *csrColIdx = csrColIdx_alias;
750 |     *csrVal = csrVal_alias;
751 | 
752 |     // free tmp space
753 |     free(csrColIdx_tmp);
754 |     free(csrVal_tmp);
755 |     free(csrRowIdx_tmp);
756 |     free(csrRowPtr_counter);
757 | 
758 |     return 0;
759 | }
760 | 
761 | 
762 | 
763 | #endif
764 | 


--------------------------------------------------------------------------------
/src/nsparse_asm.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_CUDA_NSPARSE_ASM_
  2 | #define _SPGEMM_CUDA_NSPARSE_ASM_
  3 | //#define FLOAT 1
  4 | 
  5 | #define real double
  6 | 
  7 | /*
  8 |  * Inline PTX
  9 |  */
 10 | __device__ __inline__ real ld_gbl_val(const real *val)
 11 | {
 12 |     real return_value;
 13 | 
 14 | // #ifndef HALF
 15 | //        asm("ld.global.cv.f16 %0, [%1];" : "=h"(return_value) : "l"(val));
 16 | #ifdef FLOAT
 17 |     asm("ld.global.cv.f32 %0, [%1];" : "=f"(return_value) : "l"(val));
 18 | #else
 19 |     asm("ld.global.cv.f64 %0, [%1];" : "=d"(return_value) : "l"(val));
 20 | #endif
 21 |   
 22 |     return return_value;
 23 | }
 24 | 
 25 | __device__ __inline__ float2 ld_gbl_float2(const float2 *val)
 26 | {
 27 |     float2 return_value;
 28 | 
 29 |     asm("ld.global.cv.v2.f32 {%0, %1}, [%2];" : "=f"(return_value.x), "=f"(return_value.y) : "l"(val));
 30 |     return return_value;
 31 | }
 32 | 
 33 | __device__ __inline__ float4 ld_gbl_float4(const float4 *val)
 34 | {
 35 |     float4 return_value;
 36 | 
 37 |     asm("ld.global.cv.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(return_value.x), "=f"(return_value.y), "=f"(return_value.z), "=f"(return_value.w) : "l"(val));
 38 |     return return_value;
 39 | }
 40 | 
 41 | __device__ __inline__ short ld_gbl_row(const short *row)
 42 | {
 43 |     short return_value;
 44 |     asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(row));
 45 |     return return_value;
 46 | }
 47 | 
 48 | __device__ __inline__ int ld_gbl_col(const int *col)
 49 | {
 50 |     int return_value;
 51 |     asm("ld.global.cv.s32 %0, [%1];" : "=r"(return_value) : "l"(col));
 52 |     return return_value;
 53 | }
 54 | 
 55 | __device__ __inline__ short ld_gbl_short(const short *col)
 56 | {
 57 |     short return_value;
 58 |     asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
 59 |     return return_value;
 60 | }
 61 | 
 62 | __device__ __inline__ unsigned short ld_gbl_ushort(const unsigned short *col)
 63 | {
 64 |     unsigned short return_value;
 65 |     asm("ld.global.cv.u16 %0, [%1];" : "=h"(return_value) : "l"(col));
 66 |     return return_value;
 67 | }
 68 | 
 69 | __device__ __inline__ unsigned char ld_gbl_uchar(const unsigned char *row)
 70 | {
 71 |     short return_value;
 72 |     asm("ld.global.cv.u8 %0, [%1];" : "=h"(return_value) : "l"(row));
 73 |     return (unsigned char)return_value;
 74 | }
 75 | 
 76 | __device__ __inline__ void st_gbl_ans(const real *ans_gpu, real answer)
 77 | {
 78 | 
 79 | // #ifndef HALF
 80 | //   asm("ld.global.cv.fp16 %0, [%1];" : "=h"(return_value) : "l"(val));
 81 | 
 82 | #ifdef FLOAT
 83 |     asm("st.global.cs.f32 [%0], %1;" :: "l"(ans_gpu) , "f"(answer));
 84 | #else
 85 |     asm("st.global.cs.f64 [%0], %1;" :: "l"(ans_gpu) , "d"(answer));
 86 | #endif
 87 | 
 88 | }
 89 | 
 90 | __device__ __inline__ real ld_gbl_real(const real *val) {
 91 | 
 92 |   real return_value;
 93 | 
 94 | // #ifndef HALF
 95 | //   asm("ld.global.cv.fp16 %0, [%1];" : "=h"(return_value) : "l"(val));
 96 | 
 97 | #ifdef FLOAT
 98 |   asm("ld.global.cv.f32 %0, [%1];" : "=f"(return_value) : "l"(val));
 99 | #else
100 |   asm("ld.global.cv.f64 %0, [%1];" : "=d"(return_value) : "l"(val));
101 | #endif
102 |   
103 |   return return_value;
104 | }
105 | 
106 | __device__ __inline__ int ld_gbl_int32(const int *col) {
107 |   int return_value;
108 |   asm("ld.global.cv.s32 %0, [%1];" : "=r"(return_value) : "l"(col));
109 |   return return_value;
110 | }
111 | 
112 | __device__ __inline__ void atomic_fadd(real *adr, real val)
113 | {
114 | #if __CUDA_ARCH__ >= 600
115 |     atomicAdd(adr, val);
116 | #else
117 | #ifdef FLOAT
118 |     atomicAdd(adr, val);
119 | #elif defined DOUBLE
120 |     unsigned long long int *address_ull = (unsigned long long int *)(adr);
121 |     unsigned long long int old_val = *address_ull;
122 |     unsigned long long int assumed;
123 |     real input = val;
124 |     do {
125 |         assumed = old_val;
126 |         old_val = atomicCAS(address_ull, assumed, __double_as_longlong(input + __longlong_as_double(assumed)));
127 |     } while (assumed != old_val);
128 | #endif
129 | #endif
130 | }
131 | 
132 | #endif
133 | 


--------------------------------------------------------------------------------
/src/spgemm-cpu.h:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | 
  3 | void step1(int *blkrowptrA, int *blkcolidxA, int blkmA, int blknA,
  4 |            int *blkcolptrB, int *blkrowidxB, int blkmB, int blknB,
  5 |            int *blkrowptrC, int *numblkC)
  6 | {
  7 |     struct timeval t1, t2;
  8 | 
  9 |     gettimeofday(&t1, NULL);
 10 | 
 11 |     memset(blkrowptrC, 0, (blkmA + 1) * sizeof(MAT_PTR_TYPE));
 12 | 
 13 |     for (int blki = 0; blki < blkmA; blki++)
 14 |     {
 15 |         int blknnz_C = 0;
 16 |         for (int blkj = 0; blkj < blknB; blkj++)
 17 |         {
 18 |             int posA = blkrowptrA[blki];
 19 |             int posB = blkcolptrB[blkj];
 20 |             int idxA = 0;
 21 |             int idxB = 0;
 22 |             int posa_updated = 1;
 23 |             int posb_updated = 1;
 24 |             int flag = 0;
 25 | 
 26 |             while (posA < blkrowptrA[blki + 1] && posB < blkcolptrB[blkj + 1])
 27 |             {
 28 | 
 29 |                 idxA = posa_updated ? blkcolidxA[posA] : idxA;
 30 |                 idxB = posb_updated ? blkrowidxB[posB] : idxB;
 31 |                 if (idxA == idxB) // do spgemm of this pair
 32 |                 {
 33 |                     flag = 1;
 34 |                     break;
 35 |                 }
 36 |                 else
 37 |                 {
 38 |                     posA = idxA < idxB ? posA + 1 : posA;
 39 |                     posa_updated = idxA < idxB ? 1 : 0;
 40 |                     posB = idxA > idxB ? posB + 1 : posB;
 41 |                     posb_updated = idxA > idxB ? 1 : 0;
 42 |                 }
 43 |             }
 44 |             if (flag == 1)
 45 |             {
 46 |                 blknnz_C++;
 47 |             }
 48 |         }
 49 |         blkrowptrC[blki] = blknnz_C;
 50 |     }
 51 | 
 52 |     exclusive_scan(blkrowptrC, blkmA + 1);
 53 |     *numblkC = blkrowptrC[blkmA];
 54 | }
 55 | 
 56 | void step2(int *blkrowptrA, int *blkcolidxA, int blkmA, int blknA,
 57 |            int *blkcolptrB, int *blkrowidxB, int blkmB, int blknB,
 58 |            int *blkrowptrC, int *blkcolidxC)
 59 | {
 60 |     struct timeval t1, t2;
 61 | 
 62 |     gettimeofday(&t1, NULL);
 63 | 
 64 |     int blkcolcount = 0;
 65 |     for (int blki = 0; blki < blkmA; blki++)
 66 |     {
 67 |         for (int blkj = 0; blkj < blknB; blkj++)
 68 |         {
 69 |             int posA = blkrowptrA[blki];
 70 |             int posB = blkcolptrB[blkj];
 71 |             int idxA = 0;
 72 |             int idxB = 0;
 73 |             int posa_updated = 1;
 74 |             int posb_updated = 1;
 75 |             while (posA < blkrowptrA[blki + 1] && posB < blkcolptrB[blkj + 1])
 76 |             {
 77 | 
 78 |                 idxA = posa_updated ? blkcolidxA[posA] : idxA;
 79 |                 idxB = posb_updated ? blkrowidxB[posB] : idxB;
 80 |                 if (idxA == idxB) // do spgemm of this pair
 81 |                 {
 82 |                     blkcolidxC[blkcolcount] = blkj;
 83 |                     blkcolcount++;
 84 |                     break;
 85 |                 }
 86 |                 else
 87 |                 {
 88 |                     posA = idxA < idxB ? posA + 1 : posA;
 89 |                     posa_updated = idxA < idxB ? 1 : 0;
 90 |                     posB = idxA > idxB ? posB + 1 : posB;
 91 |                     posb_updated = idxA > idxB ? 1 : 0;
 92 |                 }
 93 |             }
 94 |         }
 95 |     }
 96 | }
 97 | 
 98 | // void step3(int *blkrowptrA, int *blkcolidxA, int blkmA, int blknA, int *nnzb_A, int mA,
 99 | //            MAT_VAL_TYPE *blkcsr_Val_A, unsigned char *blkcsr_Col_A, unsigned char *blkcsr_Ptr_A,
100 | //            int *blkcolptrB, int *blkrowidxB, int blkmB, int blknB, int *nnzb_B,
101 | //            MAT_VAL_TYPE *blkcsr_Val_B, unsigned char *blkcsr_Col_B, unsigned char *blkcsr_Ptr_B,
102 | //            int *blkrowptrC, int *blkcolidxC, int *nnzb_C, int *nnzC)
103 | // {
104 | 
105 | //     struct timeval t1, t2;
106 | 
107 | //     gettimeofday(&t1, NULL);
108 | 
109 | //     char *blkc = (char *)malloc((BLOCK_SIZE * BLOCK_SIZE) * sizeof(char));
110 | 
111 | //     for (int blki = 0; blki < blkmA; blki++)
112 | //     {
113 | //         int rowlen = blki == blkmA - 1 ? mA - (blkmA - 1) * BLOCK_SIZE : BLOCK_SIZE;
114 | //         for (int blkj = blkrowptrC[blki]; blkj < blkrowptrC[blki + 1]; blkj++)
115 | //         {
116 | //             int count = 0;
117 | //             int blkccolid = blkcolidxC[blkj];
118 | //             memset(blkc, 0, (BLOCK_SIZE * BLOCK_SIZE) * sizeof(char));
119 | 
120 | //             int posA = blkrowptrA[blki];
121 | //             int posB = blkcolptrB[blkccolid];
122 | //             int idxA = 0;
123 | //             int idxB = 0;
124 | //             int posa_updated = 1;
125 | //             int posb_updated = 1;
126 | //             while (posA < blkrowptrA[blki + 1] && posB < blkcolptrB[blkccolid + 1])
127 | //             {
128 | //                 idxA = posa_updated ? blkcolidxA[posA] : idxA;
129 | //                 idxB = posb_updated ? blkrowidxB[posB] : idxB;
130 | //                 if (idxA == idxB) // do spgemm of this pair
131 | //                 {
132 | //                     for (int ri = 0; ri < BLOCK_SIZE; ri++)
133 | //                     {
134 | //                         if (ri == rowlen)
135 | //                             break;
136 | //                         int stopa = ri == BLOCK_SIZE - 1 ? nnzb_A[posA + 1] - nnzb_A[posA] : blkcsr_Ptr_A[posA * BLOCK_SIZE + ri + 1];
137 | 
138 | //                         for (int i = blkcsr_Ptr_A[posA * BLOCK_SIZE + ri]; i < stopa; i++)
139 | //                         {
140 | //                             int cola = blkcsr_Col_A[nnzb_A[posA] + i];
141 | //                             int stopb = cola == BLOCK_SIZE - 1 ? nnzb_B[posB + 1] - nnzb_B[posB] : blkcsr_Ptr_B[posB * BLOCK_SIZE + cola + 1];
142 | //                             for (int bi = blkcsr_Ptr_B[posB * BLOCK_SIZE + cola]; bi < stopb; bi++)
143 | //                             {
144 | //                                 const int colb = blkcsr_Col_B[nnzb_B[posB] + bi];
145 | //                                 if (blkc[ri * BLOCK_SIZE + colb] == 0)
146 | //                                 {
147 | //                                     blkc[ri * BLOCK_SIZE + colb] = 1;
148 | //                                 }
149 | //                             }
150 | //                         }
151 | //                     }
152 | //                     posA++;
153 | //                     posa_updated = 1;
154 | //                     posB++;
155 | //                     posb_updated = 1;
156 | //                 }
157 | //                 else
158 | //                 {
159 | //                     posA = idxA < idxB ? posA + 1 : posA;
160 | //                     posa_updated = idxA < idxB ? 1 : 0;
161 | //                     posB = idxA > idxB ? posB + 1 : posB;
162 | //                     posb_updated = idxA > idxB ? 1 : 0;
163 | //                 }
164 | //             }
165 | //             for (int ci = 0; ci < BLOCK_SIZE * BLOCK_SIZE; ci++)
166 | //             {
167 | //                 if (blkc[ci] == 1)
168 | //                 {
169 | //                     count++;
170 | //                 }
171 | //             }
172 | //             nnzb_C[blkj] = count;
173 | //         }
174 | //     }
175 | 
176 | //     exclusive_scan(nnzb_C, blkrowptrC[blkmA] + 1);
177 | //     *nnzC = nnzb_C[blkrowptrC[blkmA]];
178 | //         for (int i=0; i< blkrowptrC[blkmA] + 1; i ++)
179 | //     {
180 | //         printf("i= %i, nnz = %i\n", i, nnzb_C[i]);
181 | //     }
182 | 
183 | // }
184 | 
185 | void step3 (int *d_blkrowptrA, int *d_blkcolidxA, int blkmA, int blknA,int *nnzb_A ,int mA,
186 |             MAT_VAL_TYPE *blkcsr_Val_A , unsigned char  *blkcsr_Col_A , unsigned char *blkcsr_Ptr_A ,
187 |             int *d_blkcolptrB, int *d_blkrowidxB, int blkmB, int blknB , int *nnzb_B ,int nB,
188 |             MAT_VAL_TYPE *blkcsr_Val_B , unsigned char  *blkcsr_Col_B , unsigned char *blkcsr_Ptr_B ,
189 |             int *d_blkrowptrC, int *d_blkcolidxC,int *nnzb_C, int *nnzC)
190 | {
191 | 
192 |     struct timeval t1, t2;
193 | 
194 |     gettimeofday(&t1, NULL);
195 | 
196 |     char * blkc = (char *)malloc((BLOCK_SIZE * BLOCK_SIZE) *sizeof(char));
197 | 
198 |     for (int blki =0 ;blki <blkmA ;blki++)
199 |     {
200 |         int rowlen = blki == blkmA -1 ? mA- (blkmA -1 ) * BLOCK_SIZE : BLOCK_SIZE ;
201 |         for (int blkj =d_blkrowptrC[blki]; blkj <d_blkrowptrC[blki + 1]; blkj++)
202 |         {
203 |             int count =0 ;
204 |             int blkccolid = d_blkcolidxC[blkj];
205 |         //    int rowlen = blki == blkmA -1 ? mA- (blkmA -1 ) * BLOCK_SIZE : BLOCK_SIZE ;
206 |            int collen = blkccolid == blknB -1 ? nB - (blknB -1) *BLOCK_SIZE : BLOCK_SIZE ;
207 |             memset (blkc , 0, (BLOCK_SIZE * BLOCK_SIZE) *sizeof(char));
208 | 
209 |             int posA = d_blkrowptrA[blki];
210 |             int posB = d_blkcolptrB[blkccolid];
211 |             int idxA= 0;
212 |             int idxB =0;
213 |             int posa_updated =1;
214 |             int posb_updated =1;
215 |             while (posA < d_blkrowptrA[blki +1] && posB <d_blkcolptrB[blkccolid + 1])
216 |             {
217 |                 idxA = posa_updated ? d_blkcolidxA[posA] : idxA ;
218 |                 idxB = posb_updated ? d_blkrowidxB[posB] : idxB ;
219 |                 if (idxA == idxB)  // do spgemm of this pair
220 |                 {
221 |                 //        printf ("do spgemm of this pair, idxa = %i, idxb = %i\n", idxA, idxB);
222 |                 //for each row of block
223 |                     for (int ri =0;ri <BLOCK_SIZE ;ri ++ )
224 |                     {
225 |                         if (ri == rowlen)
226 |                             break;
227 |                         int stopa = ri == rowlen -1 ? nnzb_A[posA +1] - nnzb_A[posA] : blkcsr_Ptr_A[posA * BLOCK_SIZE + ri + 1] ;
228 |                 
229 |                         for (int i=blkcsr_Ptr_A[ posA * BLOCK_SIZE+ ri];i<stopa;i++)
230 |                         {
231 |                             int cola= blkcsr_Col_A[nnzb_A[posA]+i] ;
232 |                             int stopb = cola == collen -1  ? nnzb_B[posB +1]- nnzb_B[posB] : blkcsr_Ptr_B[posB * BLOCK_SIZE+cola +1] ;
233 |                             for (int bi= blkcsr_Ptr_B[posB * BLOCK_SIZE +cola ];bi< stopb; bi++)
234 |                             {
235 |                                 const int colb = blkcsr_Col_B[nnzb_B[posB] + bi];
236 |                                 if (blkc[ri * BLOCK_SIZE + colb] == 0)
237 |                                 {
238 |                                     blkc[ri * BLOCK_SIZE + colb] = 1;
239 |                                 }
240 |                             }
241 | 
242 |                         }
243 |                     }
244 |                     posA++;
245 |                     posa_updated = 1;
246 |                     posB++;
247 |                     posb_updated = 1;
248 |                 }
249 |                 else 
250 |                 {
251 |                 //    printf ("the smaller index goes forward (before), posa = %i, posb = %i\n", posA, posA);
252 |                     posA = idxA < idxB ? posA + 1 : posA;
253 |                     posa_updated = idxA < idxB ? 1 : 0;
254 |                     posB = idxA > idxB ? posB + 1 : posB;
255 |                     posb_updated = idxA > idxB ? 1 : 0;
256 |                 //    printf ("the smaller index goes forward (after),  posa = %i, posb = %i\n", posA, posA);
257 |                 }
258 | 
259 |             }  
260 |             for (int ci=0;ci< BLOCK_SIZE * BLOCK_SIZE ; ci ++)
261 |             {
262 |                 if (blkc[ci]== 1)
263 |                 {
264 |                     count ++ ;
265 |                 }
266 |             }
267 |             nnzb_C[blkj] = count ;
268 |         //    printf("count = %d\n",count);
269 |         }
270 |     }
271 | 
272 |     exclusive_scan(nnzb_C,d_blkrowptrC[blkmA] + 1);
273 | 
274 |     *nnzC = nnzb_C[d_blkrowptrC[blkmA]];
275 |     printf("nnzc = %i\n", *nnzC);
276 | 
277 |         gettimeofday(&t2, NULL);
278 | 
279 |     double time_kernel = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
280 |     printf("CPU  step3 kernel = %.2f ms\n", time_kernel);
281 | }
282 | 
283 | 
284 | void step4(int *blkrowptrA, int *blkcolidxA, int blkmA, int blknA, int *nnzb_A, int mA,
285 |            MAT_VAL_TYPE *blkcsr_Val_A, unsigned char *blkcsr_Col_A, unsigned char *blkcsr_Ptr_A,
286 |            int *blkcolptrB, int *blkrowidxB, int blkmB, int blknB, int *nnzb_B,
287 |            MAT_VAL_TYPE *blkcsr_Val_B, unsigned char *blkcsr_Col_B, unsigned char *blkcsr_Ptr_B,
288 |            int *blkrowptrC, int *blkcolidxC, int *nnzb_C,
289 |            MAT_VAL_TYPE *blkcsr_Val_C, unsigned char *blkcsr_Col_C, unsigned char *blkcsr_Ptr_C)
290 | 
291 | {
292 |     MAT_VAL_TYPE *blkcval = (MAT_VAL_TYPE *)malloc((BLOCK_SIZE * BLOCK_SIZE) * sizeof(MAT_VAL_TYPE));
293 |     char *blkc = (char *)malloc((BLOCK_SIZE * BLOCK_SIZE) * sizeof(char));
294 |     for (int blki = 0; blki < blkmA; blki++)
295 |     {
296 |         int rowlen = blki == blkmA - 1 ? mA - (blkmA - 1) * BLOCK_SIZE : BLOCK_SIZE;
297 |         for (int blkj = blkrowptrC[blki]; blkj < blkrowptrC[blki + 1]; blkj++)
298 |         {
299 |             int count = 0;
300 |             int blkccolid = blkcolidxC[blkj];
301 |             memset(blkc, 0, (BLOCK_SIZE * BLOCK_SIZE) * sizeof(char));
302 |             memset(blkcval, 0, (BLOCK_SIZE * BLOCK_SIZE) * sizeof(MAT_VAL_TYPE));
303 | 
304 |             int posA = blkrowptrA[blki];
305 |             int posB = blkcolptrB[blkccolid];
306 |             int idxA = 0;
307 |             int idxB = 0;
308 |             int posa_updated = 1;
309 |             int posb_updated = 1;
310 |             while (posA < blkrowptrA[blki + 1] && posB < blkcolptrB[blkccolid + 1])
311 |             {
312 |                 idxA = posa_updated ? blkcolidxA[posA] : idxA;
313 |                 idxB = posb_updated ? blkrowidxB[posB] : idxB;
314 |                 if (idxA == idxB) // do spgemm of this pair
315 |                 {
316 |                     for (int ri = 0; ri < BLOCK_SIZE; ri++)
317 |                     {
318 |                         if (ri == rowlen)
319 |                             break;
320 |                         int stopa = ri == BLOCK_SIZE - 1 ? nnzb_A[posA + 1] - nnzb_A[posA] : blkcsr_Ptr_A[posA * BLOCK_SIZE + ri + 1];
321 | 
322 |                         for (int i = blkcsr_Ptr_A[posA * BLOCK_SIZE + ri]; i < stopa; i++)
323 |                         {
324 |                             int cola = blkcsr_Col_A[nnzb_A[posA] + i];
325 |                             int stopb = cola == BLOCK_SIZE - 1 ? nnzb_B[posB + 1] - nnzb_B[posB] : blkcsr_Ptr_B[posB * BLOCK_SIZE + cola + 1];
326 |                             for (int bi = blkcsr_Ptr_B[posB * BLOCK_SIZE + cola]; bi < stopb; bi++)
327 |                             {
328 |                                 const int colb = blkcsr_Col_B[nnzb_B[posB] + bi];
329 | 
330 |                                 blkcval[ri * BLOCK_SIZE + colb] += blkcsr_Val_A[nnzb_A[posA] + i] * blkcsr_Val_B[nnzb_B[posB] + bi];
331 |                                 if (blkc[ri * BLOCK_SIZE + colb] == 0)
332 |                                 {
333 |                                     blkc[ri * BLOCK_SIZE + colb] = 1;
334 |                                 }
335 |                             }
336 |                         }
337 |                     }
338 |                     posA++;
339 |                     posa_updated = 1;
340 |                     posB++;
341 |                     posb_updated = 1;
342 |                 }
343 |                 else
344 |                 {
345 |                     posA = idxA < idxB ? posA + 1 : posA;
346 |                     posa_updated = idxA < idxB ? 1 : 0;
347 |                     posB = idxA > idxB ? posB + 1 : posB;
348 |                     posb_updated = idxA > idxB ? 1 : 0;
349 |                 }
350 |             }
351 |             for (int ri = 0; ri < BLOCK_SIZE; ri++)
352 |             {
353 |                 for (int ci = 0; ci < BLOCK_SIZE; ci++)
354 |                 {
355 |                     if (blkc[ri * BLOCK_SIZE + ci] == 1)
356 |                     {
357 |                         blkcsr_Val_C[nnzb_C[blkj] + count] = blkcval[ri * BLOCK_SIZE + ci];
358 |                         blkcsr_Col_C[nnzb_C[blkj] + count] = ci;
359 |                         count++;
360 |                     }
361 |                 }
362 |                 if (ri < BLOCK_SIZE - 1)
363 |                     blkcsr_Ptr_C[BLOCK_SIZE * blkj + ri + 1] = count;
364 |             }
365 |         }
366 |     }
367 | }
368 | 
369 | void spgemm_cpu(SMatrix *A,
370 |                 SMatrix *B,
371 |                 SMatrix *C)
372 | {
373 |     int blkmA = A->tilem;
374 |     int blknA = A->tilen;
375 |     int mA = A->m;
376 |     int nA = A->n;
377 |     int nnzA = A->nnz;
378 |     int numblkA = A->numtile;
379 |     int *blkrowptrA = A->tile_ptr;
380 |     int *blkcolidxA = A->tile_columnidx;
381 |     int *nnzb_A = A->tile_nnz;
382 |     MAT_VAL_TYPE *blkcsr_Val_A = A->tile_csr_Value;
383 |     unsigned char *blkcsr_Col_A = A->tile_csr_Col;
384 |     unsigned char *blkcsr_Ptr_A = A->tile_csr_Ptr;
385 | 
386 |     int blkmB = B->tilem;
387 |     int blknB = B->tilen;
388 |     int mB = B->m;
389 |     int nB = B->n;
390 |     int nnzB = B->nnz;
391 |     int numblkB = B->numtile;
392 |     int *blkcolptrB = B->csc_tile_ptr;
393 |     int *blkrowidxB = B->csc_tile_rowidx;
394 |     int *nnzb_B = B->tile_nnz;
395 |     MAT_VAL_TYPE *blkcsr_Val_B = B->tile_csr_Value;
396 |     unsigned char *blkcsr_Col_B = B->tile_csr_Col;
397 |     unsigned char *blkcsr_Ptr_B = B->tile_csr_Ptr;
398 | 
399 |     int *blkrowptrC = (int *)malloc((blkmA + 1) * sizeof(int));
400 |     memset(blkrowptrC, 0, (blkmA + 1) * sizeof(int));
401 |     int numtileC;
402 | 
403 |     step1(blkrowptrA, blkcolidxA, blkmA, blknA,
404 |           blkcolptrB, blkrowidxB, blkmB, blknB,
405 |           blkrowptrC, &numtileC);
406 | 
407 |     int *blkcolidxC = (int *)malloc(numtileC * sizeof(int));
408 |     memset(blkcolidxC, 0, (numtileC) * sizeof(int));
409 | 
410 |     step2(blkrowptrA, blkcolidxA, blkmA, blknA,
411 |           blkcolptrB, blkrowidxB, blkmB, blknB,
412 |           blkrowptrC, blkcolidxC);
413 | 
414 |     int *nnzb_C = (int *)malloc((numtileC + 1) * sizeof(int));
415 |     memset(nnzb_C, 0, (numtileC + 1) * sizeof(int));
416 |     int nnzC =0;
417 |     step3(blkrowptrA, blkcolidxA, blkmA, blknA, nnzb_A, mA,
418 |           blkcsr_Val_A, blkcsr_Col_A, blkcsr_Ptr_A,
419 |           blkcolptrB, blkrowidxB, blkmB, blknB, nnzb_B,nB,
420 |           blkcsr_Val_B, blkcsr_Col_B, blkcsr_Ptr_B,
421 |           blkrowptrC, blkcolidxC, nnzb_C, &nnzC);
422 |     // for (int i=0; i< numtileC + 1; i ++)
423 |     // {
424 |     //     printf("i= %i, nnz = %i\n", i, nnzb_C[i]);
425 |     // }
426 | 
427 | 
428 |     MAT_VAL_TYPE *blkcsr_Val_C = (MAT_VAL_TYPE *)malloc(nnzC * sizeof(MAT_VAL_TYPE));
429 |     memset(blkcsr_Val_C, 0, nnzC * sizeof(MAT_VAL_TYPE));
430 |     unsigned char *blkcsr_Col_C = (unsigned char *)malloc(nnzC * sizeof(unsigned char));
431 |     memset(blkcsr_Col_C, 0, nnzC * sizeof(unsigned char));
432 |     unsigned char *blkcsr_Ptr_C = (unsigned char *)malloc(numtileC * BLOCK_SIZE * sizeof(unsigned char));
433 |     memset(blkcsr_Ptr_C, 0, numtileC * BLOCK_SIZE * sizeof(unsigned char));
434 | 
435 |     step4(blkrowptrA, blkcolidxA, blkmA, blknA, nnzb_A, mA,
436 |           blkcsr_Val_A, blkcsr_Col_A, blkcsr_Ptr_A,
437 |           blkcolptrB, blkrowidxB, blkmB, blknB, nnzb_B,
438 |           blkcsr_Val_B, blkcsr_Col_B, blkcsr_Ptr_B,
439 |           blkrowptrC, blkcolidxC, nnzb_C,
440 |           blkcsr_Val_C, blkcsr_Col_C, blkcsr_Ptr_C);
441 | 
442 |     printf("spgemm-cpu complete\n");
443 | 
444 |     C->tile_csr_Ptr = blkcsr_Ptr_C;
445 |     C->tile_csr_Value = blkcsr_Val_C;
446 |     C->tile_csr_Col = blkcsr_Col_C;
447 |     C->tile_ptr = blkrowptrC;
448 |     C->tile_columnidx = blkcolidxC;
449 |     C->tile_nnz = nnzb_C;
450 |    // C->tile_ptr = blkrowptrC;
451 |    // C->tile_columnidx = blkcolidxC;
452 |     C->nnz = nnzC;
453 |     C->numtile = numtileC;
454 | 
455 | }


--------------------------------------------------------------------------------
/src/spgemm_cu.h:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include <cuda_runtime.h>
 3 | #include "external/cusparse/spgemm_cusparse.h"
 4 | 
 5 | int spgemm_cu (         const int             mA,
 6 |                         const int             nA,
 7 |                         const int             nnzA,
 8 |                         const MAT_PTR_TYPE   *csrRowPtrA,
 9 |                         const int            *csrColIdxA,
10 |                         const MAT_VAL_TYPE   *csrValA,
11 |                         const int             mB,
12 |                         const int             nB,
13 |                         const int             nnzB,
14 |                         const MAT_PTR_TYPE   *csrRowPtrB,
15 |                         const int            *csrColIdxB,
16 |                         const MAT_VAL_TYPE   *csrValB,
17 |                         const int             mC,
18 |                         const int             nC,
19 |                         const MAT_PTR_TYPE    nnzC_golden,
20 |                         const MAT_PTR_TYPE   *csrRowPtrC_golden,
21 |                         const int            *csrColIdxC_golden,
22 |                         const MAT_VAL_TYPE   *csrValC_golden,
23 |                         const bool           check_result,
24 |                         unsigned long long int nnzCub,
25 |                         unsigned long long int *nnzC,
26 |                         double        *compression_rate,
27 |                         double        *time_segmerge,
28 |                         double        *gflops_segmerge )
29 | {
30 |     // run cuda SpGEMM (using cuSPARSE)
31 |     printf("\n--------------- SpGEMM (using cuSPARSE) ---------------\n");
32 |     double compression_rate1 = 0;
33 |     double time_cusparse = 0;
34 |     double gflops_cusparse = 0;
35 |     spgemm_cusparse(mA, nA, nnzA, csrRowPtrA, csrColIdxA, csrValA,
36 |                  mB, nB, nnzB, csrRowPtrB, csrColIdxB, csrValB,
37 |                  mC, nC, nnzC_golden, csrRowPtrC_golden, csrColIdxC_golden, csrValC_golden,
38 |                  check_result, nnzCub, nnzC, &compression_rate1, &time_cusparse, &gflops_cusparse);
39 |     printf("---------------------------------------------------------------\n");
40 | 
41 | 
42 | 
43 | 
44 | 
45 |     return 0;
46 | }
47 | 
48 | 
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/src/spgemm_serialref_spa_new.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPGEMM_PARALLELREF_NEW_
  2 | #define _SPGEMM_PARALLELREF_NEW_
  3 | 
  4 | #include <stdbool.h>
  5 | #include "common.h"
  6 | #include "utils.h"
  7 | void spgemm_spa(        const int           *d_csrRowPtrA,
  8 |                         const int           *d_csrColIdxA,
  9 |                         const MAT_VAL_TYPE    *d_csrValA,
 10 |                         const int            mA,
 11 |                         const int            nA,
 12 |                         const int            nnzA,
 13 |                         const int           *d_csrRowPtrB,
 14 |                         const int           *d_csrColIdxB,
 15 |                         const MAT_VAL_TYPE    *d_csrValB,
 16 |                         const int            mB,
 17 |                         const int            nB,
 18 |                         const int            nnzB,
 19 |                             int           *d_csrRowPtrC,
 20 |                             int           *d_csrColIdxC,
 21 |                            MAT_VAL_TYPE    *d_csrValC,
 22 |                         const int            mC,
 23 |                         const int            nC,
 24 |                             int           *nnzC,
 25 |                         const int           get_nnzC_only)
 26 | {
 27 |     int nthreads = omp_get_max_threads();
 28 | 
 29 |     if (get_nnzC_only ==1 )
 30 |     {
 31 |         unsigned int *flag_g = (unsigned int *)malloc(nthreads * (nB / 32 + 1) * sizeof(unsigned int));
 32 | 
 33 |         #pragma omp parallel for
 34 |         for (int iid=0;iid<mA;iid++)
 35 |         {
 36 |             int thread_id = omp_get_thread_num();
 37 | 
 38 |             unsigned int *flag = flag_g + thread_id * (nB / 32 + 1); //(unsigned int *)malloc((nB/32+1)*sizeof(unsigned int));
 39 |             memset(flag, 0, sizeof(unsigned int) * (nB / 32 + 1));
 40 |             for (int blkj = d_csrRowPtrA[iid]; blkj < d_csrRowPtrA[iid + 1]; blkj++)
 41 |             {
 42 |                 int col = d_csrColIdxA[blkj];
 43 |                 for (int l = d_csrRowPtrB[col]; l < d_csrRowPtrB[col + 1]; l++)
 44 |                 {
 45 |                     const int key = d_csrColIdxB[l];
 46 |                     int ind = key / 32;
 47 |                     flag[ind] |= (1 << (key % 32));
 48 |                 }
 49 |             }
 50 |             //int nnzr_new=0;
 51 |             int nnzr_new1 = 0;
 52 |             for (int i = 0; i < (nB / 32) + 1; i++)
 53 |             {
 54 |                 nnzr_new1 += _mm_popcnt_u32(flag[i]);
 55 |             }
 56 | 
 57 |             d_csrRowPtrC[iid] = nnzr_new1;
 58 |         }
 59 |         exclusive_scan(d_csrRowPtrC, mC +1);
 60 |         *nnzC = d_csrRowPtrC[mC];
 61 |         free(flag_g);
 62 |     }
 63 |     else
 64 |     {
 65 |         unsigned int *flag_g = (unsigned int *)malloc(nthreads * (nB / 32 + 1) * sizeof(unsigned int));
 66 |         #pragma omp parallel for
 67 |         for (int iid=0;iid<mA;iid++)
 68 |         {
 69 |             int thread_id = omp_get_thread_num();
 70 |             unsigned int *flag = flag_g + thread_id * (nB / 32 + 1);
 71 |             memset(flag, 0, sizeof(unsigned int) * (nB / 32 + 1));
 72 |             //   int pos=0;
 73 |             //    int j=bin[iid];
 74 |             for (int blkj = d_csrRowPtrA[iid]; blkj < d_csrRowPtrA[iid + 1]; blkj++)
 75 |             {
 76 |                 int col = d_csrColIdxA[blkj];
 77 |                 for (int l = d_csrRowPtrB[col]; l < d_csrRowPtrB[col + 1]; l++)
 78 |                 {
 79 |                     const int key = d_csrColIdxB[l];
 80 |                     int ind = key / 32;
 81 |                     flag[ind] |= (1 << (key % 32));
 82 |                 }
 83 |             }
 84 | 
 85 |             // int nnzr = d_csrRowPtrC[iid];
 86 |             int nnzr_new = d_csrRowPtrC[iid];
 87 |             for (int i = 0; i < (nB / 32) + 1; i++)
 88 |             {
 89 |                 int count = 0;
 90 |                 while (flag[i])
 91 |                 {
 92 |                     count++;
 93 |                     if ((flag[i] & 1) != 0)
 94 |                     {
 95 |                         d_csrColIdxC[nnzr_new] = (i * 32 + count - 1);
 96 |                         nnzr_new++;
 97 |                     }
 98 |                     flag[i] = flag[i] >> 1;
 99 |                 }
100 |             }
101 |         }
102 |         free(flag_g);
103 |     }
104 | 
105 | }
106 | 
107 | #endif
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/src/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperScientificSoftwareLaboratory/TileSpGEMM/fe3a3457cec078fddd73c04f4ffed14edee7fb21/src/test


--------------------------------------------------------------------------------
/src/tile2csr.h:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #ifndef _TILETOCSR_
  4 | #define _TILETOCSR_
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include "common.h"
  8 | void Tile_csr_to_csr_PTR(unsigned char *Tile_csr_Ptr,
  9 |                          MAT_VAL_TYPE *Tile_csr_Val,
 10 |                          int tilennz,
 11 |                          int tilem,
 12 |                          int m,
 13 |                          int tile_row,
 14 |                          int *csrRowPtr,
 15 |                          int csr_ptr_offset,
 16 |                          int tile_csr_offset,
 17 |                          int tile_csrptr_offset)
 18 | {
 19 | 
 20 |     int rowlen = tile_row == tilem - 1 ? m - (tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
 21 |     for (int i = 0; i < rowlen; i++)
 22 |     {
 23 |         int temp = i == rowlen - 1 ? tilennz : Tile_csr_Ptr[tile_csrptr_offset + i + 1];
 24 |         int cnt = 0;
 25 |         for (int j = Tile_csr_Ptr[tile_csrptr_offset + i]; j < temp; j++)
 26 |         {
 27 |             // if (Tile_csr_Val[tile_csr_offset + j] == 0)
 28 |             //     cnt++;
 29 |         }
 30 |         csrRowPtr[csr_ptr_offset + i] += temp - Tile_csr_Ptr[tile_csrptr_offset + i] - cnt;
 31 |     }
 32 | }
 33 | 
 34 | void Tile_csr_to_csr(unsigned char *Tile_csr_Ptr,
 35 |                      unsigned char *Tile_csr_Col,
 36 |                      MAT_VAL_TYPE *Tile_csr_Val,
 37 |                      int tilennz,
 38 |                      int tilem,
 39 |                      int m,
 40 |                      int tile_row,
 41 |                      int tile_col,
 42 |                      int *csrRowPtr,
 43 |                      int *csrColIdx,
 44 |                      MAT_VAL_TYPE *csrVal,
 45 |                      int csr_ptr_offset,
 46 |                      int tile_csrptr_offset,
 47 |                      int tile_csr_index_offset,
 48 |                      int *row_nnz_offset)
 49 | 
 50 | {
 51 |     int rowlen = tile_row == tilem - 1 ? m - (tilem - 1) * BLOCK_SIZE : BLOCK_SIZE;
 52 |     for (int i = 0; i < rowlen; i++)
 53 |     {
 54 |         int start = Tile_csr_Ptr[tile_csrptr_offset + i];
 55 |         int end = i == rowlen - 1 ? tilennz : Tile_csr_Ptr[tile_csrptr_offset + i + 1];
 56 |         for (int j = start; j < end; j++)
 57 |         {
 58 |             // if (Tile_csr_Val[tile_csr_index_offset + j] != 0)
 59 |             // {
 60 |                 int temp = csrRowPtr[csr_ptr_offset + i] + row_nnz_offset[tile_row * BLOCK_SIZE + i];
 61 |                 csrColIdx[temp] = tile_col * BLOCK_SIZE + Tile_csr_Col[tile_csr_index_offset + j];
 62 |                 csrVal[temp] = Tile_csr_Val[tile_csr_index_offset + j];
 63 |                 row_nnz_offset[tile_row * BLOCK_SIZE + i]++;
 64 |             // }
 65 |         }
 66 |     }
 67 | }
 68 | 
 69 | 
 70 | 
 71 | void tile2csr(SMatrix *matrix)
 72 | {
 73 | 
 74 |     matrix->rowpointer = (MAT_PTR_TYPE *)malloc((matrix->m + 1) *sizeof(MAT_PTR_TYPE));
 75 |     MAT_PTR_TYPE *csrRowPtr = matrix->rowpointer;
 76 |     memset(csrRowPtr, 0, (matrix->m + 1) * sizeof(MAT_PTR_TYPE));
 77 | 
 78 | #pragma omp parallel for
 79 |     for (int i = 0; i < matrix->tilem; i++)
 80 |     {
 81 |         for (int j = matrix->tile_ptr[i]; j < matrix->tile_ptr[i + 1]; j++)
 82 |         {
 83 |             int csr_ptr_offset = i * BLOCK_SIZE;
 84 |             int tilennz = matrix->tile_nnz[j + 1] - matrix->tile_nnz[j];
 85 |             int m = matrix->m;
 86 |             int n = matrix->n;
 87 |             int tilem = matrix->tilem;
 88 |             int tilen = matrix->tilen;
 89 |             int tile_id = j;
 90 |             int tile_row = i;
 91 |             int tile_col = matrix->tile_columnidx[j];
 92 |             int tile_csr_offset = matrix->tile_nnz[j];
 93 |             int tile_csrptr_offset = j * BLOCK_SIZE;
 94 | 
 95 |             Tile_csr_to_csr_PTR(matrix->tile_csr_Ptr, matrix->tile_csr_Value, tilennz, tilem, m, tile_row, csrRowPtr,
 96 |                                 csr_ptr_offset, tile_csr_offset, tile_csrptr_offset);
 97 |         }
 98 |     }
 99 |     exclusive_scan(csrRowPtr, matrix->m + 1);
100 |     
101 |     int nnzc_real = csrRowPtr[matrix->m];
102 |     matrix->nnz = nnzc_real;
103 | 
104 |     matrix->value = (MAT_VAL_TYPE *)malloc(nnzc_real * sizeof(MAT_VAL_TYPE));
105 |     memset(matrix->value, 0, nnzc_real * sizeof(MAT_VAL_TYPE));
106 |     matrix->columnindex = (int *)malloc(nnzc_real * sizeof(int));
107 |     memset(matrix->columnindex, 0, nnzc_real * sizeof(int));
108 | 
109 |     int *csrColIdx = matrix->columnindex;
110 |     MAT_VAL_TYPE *csrVal = matrix->value;
111 | 
112 |     int *row_nnz_offset = (int *)malloc(sizeof(int) * matrix->m);
113 |     memset(row_nnz_offset, 0, sizeof(int) * matrix->m);
114 | 
115 | #pragma omp parallel for
116 |     for (int i = 0; i < matrix->tilem; i++)
117 |     {
118 |         for (int j = matrix->tile_ptr[i]; j < matrix->tile_ptr[i + 1]; j++)
119 |         {
120 |             int csr_ptr_offset = i * BLOCK_SIZE;
121 |             int tilennz = matrix->tile_nnz[j + 1] - matrix->tile_nnz[j];
122 |             int m = matrix->m;
123 |             int n = matrix->n;
124 |             int tilem = matrix->tilem;
125 |             int tilen = matrix->tilen;
126 |             int tile_id = j;
127 |             int tile_row = i;
128 |             int tile_col = matrix->tile_columnidx[j];
129 |             int tile_csr_index_offset = matrix->tile_nnz[j];
130 |             int tile_csrptr_offset = j * BLOCK_SIZE;
131 | 
132 |             Tile_csr_to_csr(matrix->tile_csr_Ptr, matrix->tile_csr_Col, matrix->tile_csr_Value,
133 |                             tilennz, tilem, m, tile_row, tile_col, csrRowPtr, csrColIdx, csrVal,
134 |                             csr_ptr_offset, tile_csrptr_offset, tile_csr_index_offset, row_nnz_offset);
135 |         }
136 |     }
137 | }
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
  1 | #ifndef _UTILS_
  2 | #define _UTILS_
  3 | 
  4 | #include "common.h"
  5 | 
  6 | void binary_search_right_boundary_item_kernel(const MAT_PTR_TYPE *row_pointer, 
  7 |                                               const MAT_PTR_TYPE  key_input, 
  8 |                                               const int        size, 
  9 |                                                     int       *colpos, 
 10 |                                                     MAT_PTR_TYPE *nnzpos)
 11 | {
 12 |     int start = 0;
 13 |     int stop  = size - 1;
 14 |     MAT_PTR_TYPE median;
 15 |     MAT_PTR_TYPE key_median;
 16 | 
 17 |     while (stop >= start)
 18 |     {
 19 |         median = (stop + start) / 2;
 20 | 
 21 |         key_median = row_pointer[median];
 22 | 
 23 |         if (key_input >= key_median)
 24 |             start = median + 1;
 25 |         else
 26 |             stop = median - 1;
 27 |     }
 28 | 
 29 |     *colpos = start - 1;
 30 |     *nnzpos = key_input - row_pointer[*colpos];
 31 | }
 32 | 
 33 | // in-place exclusive scan
 34 | void exclusive_scan(MAT_PTR_TYPE *input, int length)
 35 | {
 36 |     if(length == 0 || length == 1)
 37 |         return;
 38 |     
 39 |     MAT_PTR_TYPE old_val, new_val;
 40 |     
 41 |     old_val = input[0];
 42 |     input[0] = 0;
 43 |     for (int i = 1; i < length; i++)
 44 |     {
 45 |         new_val = input[i];
 46 |         input[i] = old_val + input[i-1];
 47 |         old_val = new_val;
 48 |     }
 49 | }
 50 | 
 51 | 
 52 | // in-place exclusive scan
 53 | void exclusive_scan_char(unsigned char *input, int length)
 54 | {
 55 |     if(length == 0 || length == 1)
 56 |         return;
 57 |     
 58 |     unsigned char old_val, new_val;
 59 |     
 60 |     old_val = input[0];
 61 |     input[0] = 0;
 62 |     for (int i = 1; i < length; i++)
 63 |     {
 64 |         new_val = input[i];
 65 |         input[i] = old_val + input[i-1];
 66 |         old_val = new_val;
 67 |     }
 68 | }
 69 | 
 70 | 
 71 | void swap_key(int *a , int *b)
 72 | {
 73 |     int tmp = *a;
 74 |     *a = *b;
 75 |     *b = tmp;
 76 | }
 77 | 
 78 | void swap_val(MAT_VAL_TYPE *a , MAT_VAL_TYPE *b)
 79 | {
 80 |     MAT_VAL_TYPE tmp = *a;
 81 |     *a = *b;
 82 |     *b = tmp;
 83 | }
 84 | 
 85 | // quick sort key-value pair (child function)
 86 | int partition_key_val_pair(int *key, MAT_VAL_TYPE *val, int length, int pivot_index)
 87 | {
 88 |     int i  = 0 ;
 89 |     int small_length = pivot_index;
 90 | 
 91 |     int pivot = key[pivot_index];
 92 |     swap_key(&key[pivot_index], &key[pivot_index + (length - 1)]);
 93 |     swap_val(&val[pivot_index], &val[pivot_index + (length - 1)]);
 94 | 
 95 |     for(; i < length; i++)
 96 |     {
 97 |         if(key[pivot_index+i] < pivot)
 98 |         {
 99 |             swap_key(&key[pivot_index+i], &key[small_length]);
100 |             swap_val(&val[pivot_index+i], &val[small_length]);
101 |             small_length++;
102 |         }
103 |     }
104 | 
105 |     swap_key(&key[pivot_index + length - 1], &key[small_length]);
106 |     swap_val(&val[pivot_index + length - 1], &val[small_length]);
107 | 
108 |     return small_length;
109 | }
110 | 
111 | // quick sort key-value pair (main function)
112 | void quick_sort_key_val_pair(int *key, MAT_VAL_TYPE *val, int length)
113 | {
114 |     if(length == 0 || length == 1)
115 |         return;
116 | 
117 |     int small_length = partition_key_val_pair(key, val, length, 0) ;
118 |     quick_sort_key_val_pair(key, val, small_length);
119 |     quick_sort_key_val_pair(&key[small_length + 1], &val[small_length + 1], length - small_length - 1);
120 | }
121 | 
122 | // quick sort key (child function)
123 | int partition_key(int *key, int length, int pivot_index)
124 | {
125 |     int i  = 0 ;
126 |     int small_length = pivot_index;
127 | 
128 |     int pivot = key[pivot_index];
129 |     swap_key(&key[pivot_index], &key[pivot_index + (length - 1)]);
130 | 
131 |     for(; i < length; i++)
132 |     {
133 |         if(key[pivot_index+i] < pivot)
134 |         {
135 |             swap_key(&key[pivot_index+i], &key[small_length]);
136 |             small_length++;
137 |         }
138 |     }
139 | 
140 |     swap_key(&key[pivot_index + length - 1], &key[small_length]);
141 | 
142 |     return small_length;
143 | }
144 | 
145 | // quick sort key (main function)
146 | void quick_sort_key(int *key, int length)
147 | {
148 |     if(length == 0 || length == 1)
149 |         return;
150 | 
151 |     int small_length = partition_key(key, length, 0) ;
152 |     quick_sort_key(key, small_length);
153 |     quick_sort_key(&key[small_length + 1], length - small_length - 1);
154 | }
155 | 
156 | 
157 | 
158 | void matrix_transposition(const int           m,
159 |                           const int           n,
160 |                           const MAT_PTR_TYPE     nnz,
161 |                           const MAT_PTR_TYPE    *csrRowPtr,
162 |                           const int          *csrColIdx,
163 |                           const MAT_VAL_TYPE *csrVal,
164 |                                 int          *cscRowIdx,
165 |                                 MAT_PTR_TYPE    *cscColPtr,
166 |                                 MAT_VAL_TYPE *cscVal)
167 | {
168 |     // histogram in column pointer
169 |     memset (cscColPtr, 0, sizeof(MAT_PTR_TYPE) * (n+1));
170 |     for (MAT_PTR_TYPE i = 0; i < nnz; i++)
171 |     {
172 |         cscColPtr[csrColIdx[i]]++;
173 |     }
174 | 
175 |     // prefix-sum scan to get the column pointer
176 |     exclusive_scan(cscColPtr, n + 1);
177 | 
178 |     MAT_PTR_TYPE *cscColIncr = (MAT_PTR_TYPE *)malloc(sizeof(MAT_PTR_TYPE) * (n+1));
179 |     memcpy (cscColIncr, cscColPtr, sizeof(MAT_PTR_TYPE) * (n+1));
180 | 
181 |     // insert nnz to csc
182 |     for (int row = 0; row < m; row++)
183 |     {
184 |         for (MAT_PTR_TYPE j = csrRowPtr[row]; j < csrRowPtr[row+1]; j++)
185 |         {
186 |             int col = csrColIdx[j];
187 | 
188 |             cscRowIdx[cscColIncr[col]] = row;
189 |             cscVal[cscColIncr[col]] = csrVal[j];
190 |             cscColIncr[col]++;
191 |         }
192 |     }
193 | 
194 |     free (cscColIncr);
195 | }
196 | 
197 | #endif
198 | 


--------------------------------------------------------------------------------
/src/utils_cuda_scan.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SCAN_CUDA_UTILS_
  2 | #define _SCAN_CUDA_UTILS_
  3 | 
  4 | #include "common.h"
  5 | #include "utils.h"
  6 | 
  7 | #include <cuda_runtime.h>
  8 | #include <thrust/device_malloc.h>
  9 | #include <thrust/device_free.h>
 10 | #include <thrust/device_vector.h>
 11 | #include <thrust/execution_policy.h>
 12 | #include <thrust/device_ptr.h>
 13 | #include <thrust/scan.h>
 14 | 
 15 | #define ITEM_PER_WARP 4
 16 | #define WARP_PER_BLOCK_SCAN 2
 17 | 
 18 | // inclusive scan
 19 | __forceinline__ __device__
 20 | int scan_32_shfl(      int x,
 21 |                  const int lane_id)
 22 | {
 23 |     int y = __shfl_up_sync(0xffffffff, x, 1);
 24 |     x = lane_id >= 1 ? x + y : x;
 25 |     y = __shfl_up_sync(0xffffffff, x, 2);
 26 |     x = lane_id >= 2 ? x + y : x;
 27 |     y = __shfl_up_sync(0xffffffff, x, 4);
 28 |     x = lane_id >= 4 ? x + y : x;
 29 |     y = __shfl_up_sync(0xffffffff, x, 8);
 30 |     x = lane_id >= 8 ? x + y : x;
 31 |     y = __shfl_up_sync(0xffffffff, x, 16);
 32 |     x = lane_id >= 16 ? x + y : x;
 33 | 
 34 |     return x;
 35 | }
 36 | 
 37 | __forceinline__ __device__
 38 | int scan_16_shfl(      int x,
 39 |                  const int lane_id)
 40 | {
 41 |     int y = __shfl_up_sync(0xffffffff, x, 1);
 42 |     x = lane_id >= 1 ? x + y : x;
 43 |     y = __shfl_up_sync(0xffffffff, x, 2);
 44 |     x = lane_id >= 2 ? x + y : x;
 45 |     y = __shfl_up_sync(0xffffffff, x, 4);
 46 |     x = lane_id >= 4 ? x + y : x;
 47 |     y = __shfl_up_sync(0xffffffff, x, 8);
 48 |     x = lane_id >= 8 ? x + y : x;
 49 |     //y = __shfl_up_sync(0xffffffff, x, 16);
 50 |     //x = lane_id >= 16 ? x + y : x;
 51 | 
 52 |     return x;
 53 | }
 54 | 
 55 | template<typename iT>
 56 | __inline__ __device__
 57 | int exclusive_scan_warp_cuda(       iT  *key,
 58 |                               const  int  size,
 59 |                               const  int  lane_id)
 60 | {
 61 |     const int loop = ceil((float)size/(float)WARP_SIZE);
 62 |     int sum = 0;
 63 | 
 64 |     // all rounds except the last
 65 |     for (int li = 0; li < loop - 1; li++)
 66 |     {
 67 |         const int nid = li * WARP_SIZE + lane_id;
 68 |         const int lb = key[nid];
 69 |         const int lb_scan = scan_32_shfl(lb, lane_id); // this scan is inclusive
 70 |         key[nid] = lb_scan - lb + sum;
 71 |         sum += __shfl_sync(0xffffffff, lb_scan, WARP_SIZE-1); //__syncwarp();// sum of all values
 72 |     }
 73 | 
 74 |     // the last round
 75 |     const int len_processed = (loop - 1) * WARP_SIZE;
 76 |     const int len_last_round = size - len_processed;
 77 |     const int lb = lane_id < len_last_round ? key[len_processed + lane_id] : 0;
 78 |     const int lb_scan = scan_32_shfl(lb, lane_id); // this scan is inclusive
 79 |     if (lane_id < len_last_round)
 80 |         key[len_processed + lane_id] = lb_scan - lb + sum;
 81 |     sum += __shfl_sync(0xffffffff, lb_scan, WARP_SIZE-1); // sum of all values
 82 | 
 83 |     return sum;
 84 | }
 85 | 
 86 | template<typename iT>
 87 | __inline__ __device__
 88 | int exclusive_scan_block_cuda(       iT  *key,
 89 |                                      int *s_warpsync,
 90 |                               const  int  size,
 91 |                               const  int  warp_id,
 92 |                               const  int  warp_num,
 93 |                               const  int  lane_id)
 94 | {
 95 |     const int wnum = ceil((float)size / (float)WARP_SIZE);
 96 |     int lb, lb_scan;
 97 | 
 98 |     for (int wi = warp_id; wi < wnum; wi += warp_num)
 99 |     {
100 |         const int pos = wi * WARP_SIZE + lane_id;
101 |         lb = wi == wnum - 1 ? (pos < size ? key[pos] : 0) : key[pos];
102 |         lb_scan = scan_32_shfl(lb, lane_id); // this scan is inclusive
103 |         if (pos < size) key[pos] = lb_scan - lb;
104 |         if (lane_id == WARP_SIZE-1) s_warpsync[wi] = lb_scan;
105 |     }
106 |     __syncthreads();
107 |     //if (print_tag) printf("step1 key[%i] = %i\n", warp_id*WARP_SIZE+lane_id, key[warp_id*WARP_SIZE+lane_id]);
108 |     //__syncthreads();
109 | 
110 |     if (!warp_id)
111 |     {
112 |         lb = lane_id < wnum ? s_warpsync[lane_id] : 0;
113 |         lb_scan = scan_32_shfl(lb, lane_id); // this scan is inclusive
114 |         if (lane_id < wnum) s_warpsync[lane_id] = lb_scan;
115 |         //s_warpsync[lane_id] = lb_scan - lb;
116 |     }
117 |     __syncthreads();
118 |     //if (print_tag && !warp_id) printf("before s_warpsync[%i] = %i\n", lane_id, s_warpsync[lane_id]);
119 |     //__syncthreads();
120 | 
121 |     const int sum = s_warpsync[wnum-1];
122 |     __syncthreads();
123 | 
124 |     if (!warp_id)
125 |     {
126 |         if (lane_id < wnum) s_warpsync[lane_id] = lb_scan - lb;
127 |     }
128 |     __syncthreads();
129 |     //if (print_tag && !warp_id) printf("after s_warpsync[%i] = %i\n", lane_id, s_warpsync[lane_id]);
130 |     //__syncthreads();
131 | 
132 |     for (int wi = warp_id; wi < wnum; wi += warp_num)
133 |     {
134 |         const int pos = wi * WARP_SIZE + lane_id;
135 |         lb = wi == wnum - 1 ? (pos < size ? key[pos] : 0) : key[pos];
136 |         if (pos < size) key[pos] = lb + s_warpsync[wi];
137 |     }
138 |     //if (print_tag) printf("step 2 key[%i] = %i\n", warp_id*WARP_SIZE+lane_id, key[warp_id*WARP_SIZE+lane_id]);
139 |     //__syncthreads();
140 | 
141 |     return sum;
142 | }
143 | 
144 | __global__
145 | void init_sum_cuda_kernel(int *d_sum, int segnum)
146 | {
147 |     const int global_id = blockIdx.x * blockDim.x + threadIdx.x;
148 |     
149 |     if (global_id == 0)
150 |         d_sum[global_id] = 0;
151 |     //__syncwarp();
152 |     if (global_id != 0 && global_id < segnum)
153 |         d_sum[global_id] = -1;
154 | }
155 | 
156 | __global__
157 | void exclusive_scan_cuda_kernel(int *d_key, int length, int *d_sum, int *d_id_extractor)
158 | {
159 |     //const int global_id = blockIdx.x * blockDim.x + threadIdx.x;
160 |     __shared__ int s_key_block[WARP_PER_BLOCK_SCAN * WARP_SIZE * ITEM_PER_WARP];
161 | 
162 |     // Initialize
163 |     const int local_warp_id = threadIdx.x / WARP_SIZE;
164 |     int *s_key = &s_key_block[local_warp_id * WARP_SIZE * ITEM_PER_WARP];
165 |     const int lane_id = (WARP_SIZE - 1) & threadIdx.x;
166 |     int segid = 0;
167 |     if (!lane_id)
168 |         segid = atomicAdd(d_id_extractor, 1);
169 |     segid = __shfl_sync(0xffffffff, segid, 0);
170 | 
171 |     const int start = (segid * WARP_SIZE * ITEM_PER_WARP) > length ? length : (segid * WARP_SIZE * ITEM_PER_WARP);
172 |     const int stop = ((segid + 1) * WARP_SIZE * ITEM_PER_WARP) > length ? length : ((segid + 1) * WARP_SIZE * ITEM_PER_WARP);
173 | 
174 |     if (start == stop)
175 |         return;
176 | 
177 |     // load to smem
178 |     for (int i = start + lane_id; i < stop; i += WARP_SIZE)
179 |         s_key[i - start] = d_key[i];
180 | //__syncwarp();
181 |     // ex scan on smem
182 |     int sum = exclusive_scan_warp_cuda(s_key, stop - start, lane_id);
183 | //__syncwarp();
184 |     // busy wait
185 |     do {
186 |         __threadfence_block();
187 |     }
188 |     while (d_sum[segid] == -1);
189 | 
190 |     // get incr
191 |     int incr = d_sum[segid]; //segid ? d_sum[segid] : 0;
192 | //__syncwarp();
193 |     if (!lane_id)
194 |         d_sum[segid+1] = incr + sum;
195 | //__syncwarp();
196 |     for (int i = start + lane_id; i < stop; i += WARP_SIZE)
197 |         d_key[i] = s_key[i - start] + incr;
198 | }
199 | 
200 | void exclusive_scan_device_cuda(      int   *d_key,
201 |                                 const int  length)
202 | {
203 |     // struct timeval tv;
204 | /*
205 |     printf("exclusive_scan_device_cuda, size = %i, start\n", length);
206 |     cudaDeviceSynchronize();
207 |     struct timeval t1, t2;
208 |     gettimeofday(&t1, NULL);
209 | */
210 |     //int *h_array1 = (int *)malloc(sizeof(int) * length);
211 |     //cudaMemcpy(h_array1, d_key, length * sizeof(int),   cudaMemcpyDeviceToHost);
212 |     //exclusive_scan<T>(h_array1, length);
213 | 
214 |     int *d_id_extractor;
215 |     cudaMalloc((void **)&d_id_extractor, sizeof(int));
216 |     // gettimeofday(&tv, NULL );
217 |     // time_node[*index] = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
218 |     // cuda_memory_use[*index] = cuda_memory_use[(*index) - 1] + sizeof(int);
219 |     // (*index) += 1;
220 | 
221 |     cudaMemset(d_id_extractor, 0, sizeof(int));
222 | 
223 |     const int segnum = ceil((double)length / (double)(WARP_SIZE * ITEM_PER_WARP));
224 |     int *d_sum;
225 |     cudaMalloc((void **)&d_sum, sizeof(int) * (segnum+1));
226 |     // gettimeofday(&tv, NULL );
227 |     // time_node[*index] = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
228 |     // cuda_memory_use[*index] = cuda_memory_use[(*index) - 1] + sizeof(int) * (segnum+1);
229 |     // (*index) += 1;
230 | 
231 | //printf("segnum = %i\n", segnum);
232 | 
233 |     int num_threads = 64;
234 |     int num_blocks = ceil ((double)(segnum+1) / (double)num_threads);
235 |     init_sum_cuda_kernel<<<num_blocks, num_threads>>>(d_sum, segnum+1);
236 | 
237 |     num_threads = WARP_SIZE * WARP_PER_BLOCK_SCAN;
238 |     num_blocks = ceil ((double)segnum / (double)(num_threads/WARP_SIZE));
239 |     exclusive_scan_cuda_kernel<<<num_blocks, num_threads>>>(d_key, length, d_sum, d_id_extractor);
240 | 
241 |     cudaFree(d_id_extractor);
242 |     // gettimeofday(&tv, NULL );
243 |     // time_node[*index] = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
244 |     // cuda_memory_use[*index] = cuda_memory_use[(*index) - 1] - sizeof(int);
245 |     // (*index) += 1;
246 | 
247 |     cudaFree(d_sum);
248 |     // gettimeofday(&tv, NULL );
249 |     // time_node[*index] = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
250 |     // cuda_memory_use[*index] = cuda_memory_use[(*index) - 1] - sizeof(int) * (segnum+1);
251 |     // (*index) += 1;
252 | 
253 | //    int *h_array2 = (int *)malloc(sizeof(int) * length);
254 | //cudaMemcpy(h_array2, d_key, length * sizeof(int),   cudaMemcpyDeviceToHost);
255 | 
256 | //for (int i = 0; i < length; i++)
257 | //    printf("[%4i] = %i, %i\n", i, h_array1[i], h_array2[i]);
258 | 
259 | /*
260 |     cudaDeviceSynchronize();
261 |     gettimeofday(&t2, NULL);
262 |     double time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
263 |     printf("exclusive_scan_device_cuda, size = %i, RUNTIME = %4.2f ms\n", length, time);
264 | */
265 | }
266 | 
267 | template<typename T>
268 | void exclusive_scan_device_cuda_thrust(      int   *d_array,
269 |                                 const int  length)
270 | {
271 | /*
272 |     printf("exclusive_scan_device_cuda, size = %i, start\n", length);
273 |     cudaDeviceSynchronize();
274 |     struct timeval t1, t2;
275 |     gettimeofday(&t1, NULL);
276 | */
277 | 
278 | //thrust::device_ptr<T> d_array_thrust(d_array);
279 | //thrust::device_ptr<T> d_array_thrust = thrust::device_pointer_cast<T>(d_array);
280 | //    thrust::exclusive_scan(thrust::device, d_array_thrust, d_array_thrust + length, d_array_thrust);
281 | //    thrust::exclusive_scan(d_array, d_array + length, d_array);
282 | 
283 | //thrust::device_vector<T> d_array_thrust  (d_array,   d_array   + length);
284 | //thrust::device_ptr<T> d_array_thrust = thrust::device_pointer_cast<T>(d_array);
285 | //thrust::exclusive_scan(d_array_thrust.begin(), d_array_thrust.end(), d_array_thrust.begin());
286 | thrust::exclusive_scan(thrust::device, d_array, d_array + length, d_array, 0); // in-place scan
287 | 
288 | 
289 | //thrust::device_ptr<T> d_array_thrust(d_array);
290 | //thrust::exclusive_scan(d_array_thrust, d_array_thrust + length, d_array_thrust);
291 | 
292 | //thrust::device_vector<T> d_input = d_array;
293 | //thrust::exclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
294 | 
295 | 
296 | //thrust::device_ptr<T> d_xxx = thrust::device_malloc<T>(length);
297 | //thrust::device_vector<T> d_input = d_xxx;
298 | //thrust::exclusive_scan(d_input.begin(), d_input.end(), d_input.begin());
299 | 
300 | //T *h_array = (T *)malloc(sizeof(T) * length);
301 | 
302 | // this part really works
303 | //    thrust::device_ptr<int> d_array_thrust = thrust::device_pointer_cast(d_array);
304 | //    thrust::exclusive_scan(d_array_thrust, d_array_thrust + length, d_array_thrust);
305 | 
306 | 
307 | //    thrust::device_ptr<T> d_array_thrust = thrust::device_pointer_cast<T>(d_array);
308 | //    thrust::exclusive_scan(d_array_thrust, d_array_thrust + length, d_array_thrust);
309 | 
310 |     //cudaDeviceSynchronize();
311 |     //gettimeofday(&t2, NULL);
312 |     //double time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
313 |     //printf("exclusive_scan_device_cuda, size = %i, RUNTIME = %4.2f ms\n", length, time);
314 | 
315 | /*
316 |     T *h_array = (T *)malloc(sizeof(T) * length);
317 | cudaMemcpy(h_array, d_array, length * sizeof(T),   cudaMemcpyDeviceToHost);
318 | for (int i = 0; i < length; i++)
319 |     printf("array[%i] = %i\n", i, h_array[i]);
320 | 
321 | exclusive_scan<T>(h_array, length);
322 | cudaMemcpy(d_array, h_array, length * sizeof(T),   cudaMemcpyHostToDevice);
323 |     free(h_array);
324 | */
325 | 
326 | /*
327 |     cudaDeviceSynchronize();
328 |     gettimeofday(&t2, NULL);
329 |     double time = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
330 |     printf("exclusive_scan_device_cuda, size = %i, RUNTIME = %4.2f ms\n", length, time);
331 | */
332 | }
333 | 
334 | #endif
335 | 


--------------------------------------------------------------------------------