├── bin └── .gitkeep ├── .gitignore ├── Makefile ├── LICENSE ├── contrib ├── data.txt ├── generate_mtx.py └── generate_mat.py ├── README.md └── src ├── spmv_mkl.cpp ├── utility.hpp └── spmv_spv8.cpp /bin/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/spmv_* 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Compilers 2 | CC = gcc 3 | CXX = g++ 4 | # - Debug 5 | # CFLAGS += -g 6 | # CFLAGS += -fsanitize=address # Check invalid memory access 7 | # - General 8 | CFLAGS += -O2 # Should be disabled when debug 9 | CFLAGS += -std=c++17 -Wall -Wextra -lm 10 | CFLAGS += -march=skylake-avx512 -fopenmp 11 | #CFLAGS += -static 12 | # - MKL 13 | #CFLAGS += -ltbb -lmkl_core -lmkl_tbb_thread -lmkl_intel_lp64 14 | #CFLAGS += -lmkl_core -lmkl_gnu_thread -lmkl_intel_lp64 15 | CFLAGS += -lmkl_core -lmkl_intel_thread -liomp5 -lmkl_intel_lp64 16 | VPATH = src 17 | 18 | all: bin bin/spmv_spv8 bin/spmv_mkl 19 | 20 | bin: 21 | mkdir bin 22 | 23 | bin/spmv_spv8 : spmv_spv8.cpp 24 | $(CXX) $(CFLAGS) $< -o $@ 25 | 26 | bin/spmv_mkl : spmv_mkl.cpp 27 | $(CXX) $(CFLAGS) $< -o $@ 28 | 29 | .PHONY : clean 30 | clean : 31 | -rm bin/* 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 XJTU-IAIR-CAG Research Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /contrib/data.txt: -------------------------------------------------------------------------------- 1 | Scale-Free: 2 | Oregon-2 3 | as-caida 4 | sx-mathoverflow 5 | email-Enron 6 | soc-sign-Slashdot090216 7 | soc-sign-Slashdot090221 8 | dc2 9 | soc-sign-epinions 10 | soc-Slashdot0811 11 | sx-superuser 12 | soc-Slashdot0902 13 | scircuit 14 | connectus 15 | language 16 | NotreDame_actors 17 | com-DBLP 18 | web-Stanford 19 | citationCiteseer 20 | cop20k_A 21 | webbase-1M 22 | IMDB 23 | wiki-Talk 24 | web-Google 25 | com-Youtube 26 | flickr 27 | higgs-twitter 28 | patents 29 | as-Skitter 30 | FullChip 31 | wiki-topcats 32 | mouse_gene 33 | soc-Pokec 34 | coPapersCiteseer 35 | soc-LiveJournal1 36 | hollywood-2009 37 | com-Orkut 38 | --------------------------------- 39 | HPC: 40 | p2p-Gnutella04 41 | p2p-Gnutella25 42 | p2p-Gnutella24 43 | p2p-Gnutella30 44 | de2010 45 | ri2010 46 | vt2010 47 | ut2010 48 | tn2010 49 | mac_econ_fwd500 50 | va2010 51 | ga2010 52 | mc2depi 53 | rma10 54 | fl2010 55 | ASIC_680k 56 | roadNet-PA 57 | amazon0312 58 | cant 59 | pdb1HYS 60 | tx2010 61 | roadNet-CA 62 | consph 63 | mip1 64 | crankseg_2 65 | Si41Ge41H72 66 | Ga41As41H72 67 | rajat31 68 | road_central 69 | ldoor 70 | spal_004 71 | bone010 72 | road_usa 73 | circuit5M 74 | cage15 75 | 76 | -------------------------------------------------------------------------------- /contrib/generate_mtx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import os 4 | import sys 5 | import numpy as np 6 | import scipy 7 | import scipy.io 8 | import scipy.sparse 9 | 10 | DATA_PATH = './data' 11 | 12 | 13 | def output(output_path, M): 14 | np.random.seed(1) 15 | x = np.random.normal(size=(M.shape[1],)) 16 | ans = M.dot(x) 17 | x = list(x) 18 | ans = list(ans) 19 | 20 | data = list(M.data) 21 | colidx = list(M.indices) 22 | rowptr = list(M.indptr) 23 | rowb = rowptr[:-1] 24 | rowe = rowptr[1:] 25 | order = list(range(M.shape[0])) 26 | 27 | os.makedirs(output_path, exist_ok=True) 28 | 29 | with open(output_path + '/info.txt', 'w') as ff: 30 | ff.write(str(len(M.data))) 31 | ff.write('\n') 32 | ff.write(str(M.shape[0])) 33 | ff.write('\n') 34 | ff.write(str(M.shape[1])) 35 | ff.write('\n') 36 | with open(output_path + '/nnz.txt', 'w') as ff: 37 | for i in data: 38 | ff.write(str(i)) 39 | ff.write(' ') 40 | with open(output_path + '/col.txt', 'w') as ff: 41 | for i in colidx: 42 | ff.write(str(i)) 43 | ff.write(' ') 44 | with open(output_path + '/rowb.txt', 'w') as ff: 45 | for i in rowb: 46 | ff.write(str(i)) 47 | ff.write(' ') 48 | with open(output_path + '/rowe.txt', 'w') as ff: 49 | for i in rowe: 50 | ff.write(str(i)) 51 | ff.write(' ') 52 | with open(output_path + '/x.txt', 'w') as ff: 53 | for i in x: 54 | ff.write(str(i)) 55 | ff.write(' ') 56 | with open(output_path + '/order.txt', 'w') as ff: 57 | for i in order: 58 | ff.write(str(i)) 59 | ff.write(' ') 60 | with open(output_path + '/ans.txt', 'w') as ff: 61 | for i in ans: 62 | ff.write(str(i)) 63 | ff.write(' ') 64 | 65 | 66 | def main(): 67 | matrices = [] 68 | print('Loading matrices') 69 | for pathname in glob.glob('matrices/*.mtx'): 70 | name = os.path.splitext(os.path.basename(pathname))[0] 71 | print('Load %s' % name) 72 | matrix = scipy.io.mmread(pathname) 73 | matrix = scipy.sparse.csr_matrix(matrix) 74 | nonzeros = matrix.count_nonzero() 75 | matrices.append((nonzeros, name, matrix)) 76 | 77 | matrices.sort(key=lambda x: x[0]) 78 | 79 | print('Write list.txt') 80 | with open(DATA_PATH + '/list.txt', 'w') as f: 81 | for m in matrices: 82 | f.write(str(m[0]) + '\t' + m[1] + '\n') 83 | 84 | print('Write matrices') 85 | for m in matrices: 86 | print('Write %s' % m[1]) 87 | output(DATA_PATH + '/' + m[1], m[2]) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /contrib/generate_mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import os 4 | import sys 5 | import numpy as np 6 | import scipy 7 | import scipy.io 8 | import scipy.sparse 9 | 10 | DATA_PATH = './data' 11 | 12 | 13 | def output(output_path, M): 14 | np.random.seed(1) 15 | x = np.random.normal(size=(M.shape[1],)) 16 | ans = M.dot(x) 17 | x = list(x) 18 | ans = list(ans) 19 | 20 | data = list(M.data) 21 | colidx = list(M.indices) 22 | rowptr = list(M.indptr) 23 | rowb = rowptr[:-1] 24 | rowe = rowptr[1:] 25 | order = list(range(M.shape[0])) 26 | 27 | os.makedirs(output_path, exist_ok=True) 28 | 29 | with open(output_path + '/info.txt', 'w') as ff: 30 | ff.write(str(len(M.data))) 31 | ff.write('\n') 32 | ff.write(str(M.shape[0])) 33 | ff.write('\n') 34 | ff.write(str(M.shape[1])) 35 | ff.write('\n') 36 | with open(output_path + '/nnz.txt', 'w') as ff: 37 | for i in data: 38 | ff.write(str(i)) 39 | ff.write(' ') 40 | with open(output_path + '/col.txt', 'w') as ff: 41 | for i in colidx: 42 | ff.write(str(i)) 43 | ff.write(' ') 44 | with open(output_path + '/rowb.txt', 'w') as ff: 45 | for i in rowb: 46 | ff.write(str(i)) 47 | ff.write(' ') 48 | with open(output_path + '/rowe.txt', 'w') as ff: 49 | for i in rowe: 50 | ff.write(str(i)) 51 | ff.write(' ') 52 | with open(output_path + '/x.txt', 'w') as ff: 53 | for i in x: 54 | ff.write(str(i)) 55 | ff.write(' ') 56 | with open(output_path + '/order.txt', 'w') as ff: 57 | for i in order: 58 | ff.write(str(i)) 59 | ff.write(' ') 60 | with open(output_path + '/ans.txt', 'w') as ff: 61 | for i in ans: 62 | ff.write(str(i)) 63 | ff.write(' ') 64 | 65 | 66 | def main(): 67 | matrices = [] 68 | print('Loading matrices') 69 | for pathname in glob.glob('matrices/*.mat'): 70 | name = os.path.splitext(os.path.basename(pathname))[0] 71 | print('Load %s' % name) 72 | matrix = scipy.io.loadmat(pathname)['Problem'][0][0]['A'] 73 | matrix = scipy.sparse.csr_matrix(matrix) 74 | nonzeros = matrix.count_nonzero() 75 | matrices.append((nonzeros, name, matrix)) 76 | 77 | matrices.sort(key=lambda x: x[0]) 78 | 79 | print('Write list.txt') 80 | with open(DATA_PATH + '/list.txt', 'w') as f: 81 | for m in matrices: 82 | f.write(str(m[0]) + '\t' + m[1] + '\n') 83 | 84 | print('Write matrices') 85 | for m in matrices: 86 | print('Write %s' % m[1]) 87 | output(DATA_PATH + '/' + m[1], m[2]) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The SpV8 kernel 2 | 3 | SpV8 is a SpMV kernel written in AVX-512. The research goal of SpV8 is to pursue optimal vectorization and regular computation pattern. 4 | 5 | This is the artifact for our paper @ DAC '21: `SpV8: Pursuing Optimal Vectorization and Regular Computation Pattern in SpMV` 6 | 7 | # Usage 8 | 9 | ## Dependencies & Recommend Environment 10 | 11 | 1. Intel Xeon Processor with AVX-512 support 12 | 2. GCC 9 13 | 3. Intel Parallel Studio XE 2020.2 14 | 4. numactl (to set numa affinity) 15 | 5. cpupower (to set frequency, other available tools should also work) 16 | 17 | ## Compilation 18 | 19 | ``` 20 | make 21 | ``` 22 | 23 | ## Run kernel 24 | 25 | Before running, CPU frequency should be fixed to eliminate CPU Turbo effects. 26 | 27 | The binaries have the same parameter list as follows: 28 | 29 | ```bash 30 | bin/spmv_(kernel) [loop_count] [use_optimize?] [thread_count] 31 | ``` 32 | 33 | 1. `loop_count`: Number of iterations of SpMV 34 | 2. `use_optimize?`: Taking effects only on MKL binary. It is used to activate MKL's Inspector-Executor Optimization. 35 | 3. `thread_count`: Number of OpenMP threads 36 | 37 | Example for running SpV8: 38 | 39 | ```bash 40 | numactl --cpunodebind=1 --membind=1 bin/spmv_spv8 1000 1 8 41 | ``` 42 | 43 | ## How to feed matrix to kernel 44 | 45 | In our experiments, we used a custom but straightforward data format to store CSR matrix. And once we execute the binary, it will search for the following data files **under the work directory**: 46 | 47 | ``` 48 | info.txt : Contains number of NNZ, rows, cols 49 | rowb.txt : 0-based NNZ index for each row begin 50 | rowe.txt : 0-based NNZ index for each row end 51 | nnz.txt : NNZ list 52 | col.txt : 0-based column index for each NNZ 53 | x.txt : A random vector x, used for SpMV 54 | ans.txt : Used to check answer 55 | ``` 56 | 57 | Meanwhile, **we also provide two script** in `contrib` for you to generate data files from Matlab Matrix Format or Matrix Market Format. 58 | 59 | ## Dataset 60 | 61 | All the matrices we used in our benchmark are listed in `contrib/data.txt`, and their files are publicly available on [SuiteSparse Matrix Collection](https://sparse.tamu.edu/). 62 | 63 | # Notes 64 | 65 | This repo only contains two kernel for SpV8 and MKL. For other methods like CVR, ESB and CSR5, we reused kernels provided in [puckbee/CVR](https://github.com/puckbee/CVR) and [puckbee/pava](https://github.com/puckbee/pava). These kernels are collected from the original authors. And we only modified their output code to simplify data collection. 66 | 67 | We thank Biwei Xie, the author of CVR, for his kind and informative discussion on running previous methods. 68 | 69 | # License 70 | 71 | The code is licensed with MIT Opensource License. 72 | 73 | But note that, the dataset, Intel MKL and other previous research kernels are copyright by other entities. 74 | -------------------------------------------------------------------------------- /src/spmv_mkl.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "mkl.h" 6 | #include "mkl_spblas.h" 7 | //#include "tbb/task_scheduler_init.h" 8 | #define __USE_GNU 9 | #include "sched.h" 10 | #include "unistd.h" 11 | 12 | // #define LOOP_COUNT 1000 13 | int main(int argc, char *argv[]) { 14 | double *nnz, *x, *y; 15 | int *col, *rowb, *rowe; 16 | int m, n, c, i, r, loop_cnt; 17 | double alpha, beta; 18 | double duration; 19 | double start, end; 20 | sparse_matrix_t A; 21 | struct matrix_descr tt = {SPARSE_MATRIX_TYPE_GENERAL, SPARSE_FILL_MODE_LOWER, 22 | SPARSE_DIAG_NON_UNIT}; 23 | sparse_status_t stat; 24 | int flag = 1; 25 | int thread_num = 8; 26 | 27 | FILE *fp1 = fopen("nnz.txt", "r"); 28 | FILE *fp2 = fopen("col.txt", "r"); 29 | FILE *fp3 = fopen("rowb.txt", "r"); 30 | FILE *fp4 = fopen("rowe.txt", "r"); 31 | FILE *fp5 = fopen("x.txt", "r"); 32 | m = 924886, n = 194085, 33 | c = 194085; // m is number of non-zeros, n is matrix row, c is matrix column 34 | loop_cnt = 5000; 35 | 36 | if (access("info.txt", F_OK) != -1) { 37 | FILE *info = fopen("info.txt", "r"); 38 | fscanf(info, "%d", &m); 39 | fscanf(info, "%d", &n); 40 | fscanf(info, "%d", &c); 41 | fclose(info); 42 | } 43 | 44 | if (argc > 3) { 45 | sscanf(argv[1], "%d", &loop_cnt); 46 | sscanf(argv[2], "%d", &flag); 47 | sscanf(argv[3], "%d", &thread_num); 48 | } 49 | 50 | alpha = 1.0; 51 | beta = 0.0; 52 | nnz = (double *)mkl_malloc(m * sizeof(double), 64); 53 | col = (int *)mkl_malloc(m * sizeof(int), 64); 54 | rowb = (int *)mkl_malloc(n * sizeof(int), 64); 55 | rowe = (int *)mkl_malloc(n * sizeof(int), 64); 56 | x = (double *)mkl_malloc(c * sizeof(double), 64); 57 | y = (double *)mkl_malloc(n * sizeof(double), 64); 58 | 59 | for (i = 0; i < m; i++) { 60 | fscanf(fp1, "%lf", &nnz[i]); 61 | } 62 | for (i = 0; i < m; i++) { 63 | fscanf(fp2, "%d", &col[i]); 64 | } 65 | for (i = 0; i < n; i++) { 66 | fscanf(fp3, "%d", &rowb[i]); 67 | } 68 | for (i = 0; i < n; i++) { 69 | fscanf(fp4, "%d", &rowe[i]); 70 | } 71 | for (i = 0; i < c; i++) { 72 | fscanf(fp5, "%lf", &x[i]); 73 | } 74 | for (i = 0; i < n; i++) { 75 | y[i] = 0.0; 76 | } 77 | 78 | // tbb::task_scheduler_init init(thread_num); 79 | mkl_set_num_threads(thread_num); 80 | mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, n, c, rowb, rowe, col, 81 | nnz); 82 | 83 | if (flag) { 84 | start = dsecnd(); 85 | 86 | mkl_sparse_set_mv_hint(A, SPARSE_OPERATION_NON_TRANSPOSE, tt, 200); 87 | mkl_sparse_optimize(A); 88 | 89 | end = dsecnd(); 90 | duration = (double)(end - start); 91 | } 92 | 93 | for (r = 0; r < 300; r++) { 94 | stat = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, A, tt, x, 95 | beta, y); 96 | } 97 | 98 | start = dsecnd(); 99 | for (r = 0; r < loop_cnt; r++) { 100 | stat = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, A, tt, x, 101 | beta, y); 102 | } 103 | end = dsecnd(); 104 | duration = (double)(end - start) / loop_cnt; 105 | printf("%lf\n", duration * 1000); 106 | 107 | fclose(fp1); 108 | fclose(fp2); 109 | fclose(fp3); 110 | fclose(fp4); 111 | fclose(fp5); 112 | mkl_free(nnz); 113 | mkl_free(col); 114 | mkl_free(rowb); 115 | mkl_free(rowe); 116 | mkl_free(x); 117 | mkl_free(y); 118 | 119 | return 0; 120 | } 121 | -------------------------------------------------------------------------------- /src/utility.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | using namespace std; 6 | 7 | #define always_inline __inline__ __attribute__((always_inline)) 8 | #define likely(x) __builtin_expect(!!(x), 1) 9 | #define unlikely(x) __builtin_expect(!!(x), 0) 10 | 11 | typedef unsigned int u32; 12 | typedef unsigned long long u64; 13 | 14 | struct csr_matrix { 15 | int m, rows, cols; 16 | double *nnz, *x, *y, *ans; 17 | int *col, *rowb, *rowe; 18 | int *tstart; 19 | int *tend; 20 | }; 21 | 22 | void input_matrix(csr_matrix &mat) { 23 | FILE *fp1 = fopen("nnz.txt", "r"); 24 | FILE *fp2 = fopen("col.txt", "r"); 25 | FILE *fp3 = fopen("rowb.txt", "r"); 26 | FILE *fp4 = fopen("rowe.txt", "r"); 27 | FILE *fp5 = fopen("x.txt", "r"); 28 | FILE *fp6 = fopen("ans.txt", "r"); 29 | 30 | mat.nnz = (double *)mkl_malloc(mat.m * sizeof(double), 64); 31 | mat.col = (int *)mkl_malloc(mat.m * sizeof(int), 64); 32 | mat.rowb = (int *)mkl_malloc(mat.rows * sizeof(int), 64); 33 | mat.rowe = (int *)mkl_malloc(mat.rows * sizeof(int), 64); 34 | mat.x = (double *)mkl_malloc(mat.cols * sizeof(double), 64); 35 | mat.y = (double *)mkl_malloc(mat.rows * sizeof(double), 64); 36 | mat.ans = (double *)mkl_malloc(mat.rows * sizeof(double), 64); 37 | 38 | for (int i = 0; i < mat.m; i++) { 39 | fscanf(fp1, "%lf", &mat.nnz[i]); 40 | } 41 | for (int i = 0; i < mat.m; i++) { 42 | fscanf(fp2, "%d", &mat.col[i]); 43 | } 44 | for (int i = 0; i < mat.rows; i++) { 45 | fscanf(fp3, "%d", &mat.rowb[i]); 46 | } 47 | for (int i = 0; i < mat.rows; i++) { 48 | fscanf(fp4, "%d", &mat.rowe[i]); 49 | } 50 | for (int i = 0; i < mat.cols; i++) { 51 | fscanf(fp5, "%lf", &mat.x[i]); 52 | } 53 | for (int i = 0; i < mat.rows; i++) { 54 | mat.y[i] = 0.0; 55 | } 56 | for (int i = 0; i < mat.rows; i++) { 57 | fscanf(fp6, "%lf", &mat.ans[i]); 58 | } 59 | 60 | fclose(fp1); 61 | fclose(fp2); 62 | fclose(fp3); 63 | fclose(fp4); 64 | fclose(fp5); 65 | fclose(fp6); 66 | } 67 | 68 | void destroy_matrix(csr_matrix &mat) { 69 | mkl_free(mat.nnz); 70 | mkl_free(mat.col); 71 | mkl_free(mat.rowb); 72 | mkl_free(mat.rowe); 73 | mkl_free(mat.x); 74 | mkl_free(mat.y); 75 | mkl_free(mat.ans); 76 | } 77 | 78 | bool check_answer(csr_matrix &mat) { 79 | int bad_count = 0; 80 | for (int i = 0; i < mat.rows; i++) { 81 | double yi = mat.y[i]; 82 | double ansi = mat.ans[i]; 83 | if (abs(yi - ansi) > 0.01 * abs(ansi) && !(abs(yi) <= 1e-5 && abs(ansi) <= 1e-5)) { 84 | if (bad_count < 10) 85 | fprintf(stderr, "y[%d] expected %lf got %lf\n", i, mat.ans[i], mat.y[i]); 86 | bad_count++; 87 | } 88 | } 89 | if (bad_count) 90 | fprintf(stderr, "bad_count: %d\n", bad_count); 91 | return bad_count == 0 ? true : false; 92 | } 93 | 94 | csr_matrix apply_order(csr_matrix &mat, vector> &tasks, int copy_oob = true) { 95 | csr_matrix ret; 96 | 97 | ret.m = mat.m; 98 | ret.rows = mat.rows; 99 | ret.cols = mat.cols; 100 | ret.nnz = (double *)mkl_malloc(mat.m * sizeof(double), 64); 101 | ret.col = (int *)mkl_malloc(mat.m * sizeof(int), 64); 102 | ret.rowb = (int *)mkl_malloc(mat.rows * sizeof(int), 64); 103 | ret.rowe = (int *)mkl_malloc(mat.rows * sizeof(int), 64); 104 | if (copy_oob) { 105 | ret.x = (double *)mkl_malloc(mat.cols * sizeof(double), 64); 106 | ret.y = (double *)mkl_malloc(mat.rows * sizeof(double), 64); 107 | ret.ans = (double *)mkl_malloc(mat.rows * sizeof(double), 64); 108 | } 109 | ret.tstart = (int *)mkl_malloc(tasks.size() * sizeof(int), 64); 110 | ret.tend = (int *)mkl_malloc(tasks.size() * sizeof(int), 64); 111 | 112 | if (copy_oob) { 113 | for (int i = 0; i < mat.cols; i++) 114 | ret.x[i] = mat.x[i]; 115 | 116 | for (int i = 0; i < mat.rows; i++) 117 | ret.y[i] = 0; 118 | 119 | for (int i = 0; i < mat.rows; i++) 120 | ret.ans[i] = mat.ans[i]; 121 | } 122 | 123 | int npos = 0, pos = 0; 124 | int start = 0, t = 0; 125 | for (vector &task : tasks) { 126 | ret.tstart[t] = start; 127 | ret.tend[t++] = start + task.size(); 128 | start += task.size(); 129 | for (int row : task) { 130 | int b = mat.rowb[row]; 131 | int e = mat.rowe[row]; 132 | ret.rowb[pos] = npos; 133 | ret.rowe[pos++] = npos + e - b; 134 | for (int i = b; i < e; i++) { 135 | ret.nnz[npos] = mat.nnz[i]; 136 | ret.col[npos++] = mat.col[i]; 137 | } 138 | } 139 | } 140 | 141 | return ret; 142 | } 143 | 144 | -------------------------------------------------------------------------------- /src/spmv_spv8.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "unistd.h" 14 | #include "mkl.h" 15 | #include "mkl_spblas.h" 16 | #include "utility.hpp" 17 | using namespace std; 18 | using namespace chrono; 19 | 20 | struct tr_matrix { 21 | csr_matrix mat; 22 | vector spvv8_len; 23 | vector> tasks; 24 | }; 25 | 26 | tr_matrix tr_reorder(csr_matrix &mat, vector> &tasks) { 27 | tr_matrix tr; 28 | 29 | for (vector &task : tasks) { 30 | unordered_map> buckets; 31 | 32 | for (int r : task) { 33 | int rowlen = mat.rowe[r] - mat.rowb[r]; 34 | buckets[rowlen].push_back(r); 35 | } 36 | 37 | vector keys; 38 | for (auto kv : buckets) { 39 | keys.push_back(kv.first); 40 | } 41 | sort(keys.begin(), keys.end()); 42 | 43 | vector order; 44 | vector remain; 45 | for (int k : keys) { 46 | vector &samelen_task = buckets[k]; 47 | int left = samelen_task.size() % 8; 48 | if (k > 32) 49 | left = samelen_task.size(); 50 | int bulk = samelen_task.size() - left; 51 | order.insert(order.end(), samelen_task.begin(), samelen_task.begin() + bulk); 52 | remain.insert(remain.end(), samelen_task.begin() + bulk, samelen_task.end()); 53 | } 54 | 55 | tr.spvv8_len.push_back(order.size()); 56 | 57 | task.clear(); 58 | task.insert(task.end(), order.begin(), order.end()); 59 | task.insert(task.end(), remain.begin(), remain.end()); 60 | } 61 | 62 | tr.mat = apply_order(mat, tasks); 63 | 64 | int size = tasks.size(); 65 | for (int t = 0; t < size; t++) { 66 | int start = tr.mat.tstart[t]; 67 | int tr_len = tr.spvv8_len[t]; 68 | int p = 0, c = 0; 69 | for (p = start; c < tr_len; c += 8, p += 8) { 70 | int rowlen = tr.mat.rowe[p] - tr.mat.rowb[p]; 71 | int base = tr.mat.rowb[p]; 72 | vector nnz; 73 | vector col; 74 | nnz.insert(nnz.end(), tr.mat.nnz + base, tr.mat.nnz + base + rowlen * 8); 75 | col.insert(col.end(), tr.mat.col + base, tr.mat.col + base + rowlen * 8); 76 | for (int l = 0; l < rowlen; l++) { 77 | for (int r = 0; r < 8; r++) { 78 | tr.mat.nnz[base + l * 8 + r] = nnz[r * rowlen + l]; 79 | tr.mat.col[base + l * 8 + r] = col[r * rowlen + l]; 80 | } 81 | } 82 | } 83 | } 84 | 85 | tr.tasks = tasks; 86 | 87 | return tr; 88 | } 89 | 90 | bool is_banded(csr_matrix &mat, int band_size = -1) { 91 | if (band_size == -1) 92 | band_size = mat.cols / 64; 93 | int band_count = 0; 94 | bool banded = false; 95 | 96 | for (int r = 0; r < mat.rows; r++) { 97 | int rb = mat.rowb[r]; 98 | int re = mat.rowe[r]; 99 | for (int i = rb; i < re; i++) { 100 | int col = mat.col[i]; 101 | if (abs(col - r) <= band_size) 102 | band_count++; 103 | } 104 | } 105 | 106 | if (double(band_count) / mat.m >= 0.3) { 107 | banded = true; 108 | } 109 | 110 | return banded; 111 | } 112 | 113 | 114 | tr_matrix process(csr_matrix &mat, int panel_num) { 115 | vector> tasks(panel_num); 116 | 117 | int pos = 0; 118 | int len = mat.m / panel_num; 119 | int limit = mat.rows - 7; 120 | int i; 121 | int count = 0; 122 | for (i = 0; i < limit; i += 8) { 123 | for (int j = 0; j < 8; j++) { 124 | int rowlen = mat.rowe[i + j] - mat.rowb[i + j]; 125 | if (rowlen > 0) { 126 | tasks[pos].push_back(i + j); 127 | count += rowlen; 128 | } 129 | } 130 | 131 | if (count >= len) { 132 | if (pos + 1 < panel_num) { 133 | pos += 1; 134 | count = 0; 135 | } 136 | } 137 | } 138 | 139 | if (i < mat.rows) { 140 | for (; i < mat.rows; i++) { 141 | tasks[pos].push_back(i); 142 | } 143 | } 144 | 145 | tr_matrix ret = tr_reorder(mat, tasks); 146 | return ret; 147 | } 148 | 149 | 150 | always_inline double avx512_fma_spvv_kernel(int *col, double *nnz, int rowlen, 151 | double *x) { 152 | int limit = rowlen - 7; 153 | int *col_p; 154 | double *nnz_p; 155 | double sum = 0; 156 | __m256i c1; 157 | __m512d v1, v2, s; 158 | s = _mm512_setzero_pd(); 159 | int i; 160 | 161 | for (i = 0; i < limit; i += 8) { 162 | col_p = col + i; 163 | nnz_p = nnz + i; 164 | c1 = _mm256_loadu_si256((const __m256i *) col_p); 165 | v2 = _mm512_i32gather_pd(c1, x, 8); 166 | v1 = _mm512_loadu_pd(nnz_p); 167 | s = _mm512_fmadd_pd(v1, v2, s); 168 | } 169 | 170 | sum += _mm512_reduce_add_pd(s); 171 | for (; i < rowlen; i++) { 172 | sum += nnz[i] * x[col[i]]; 173 | } 174 | 175 | return sum; 176 | } 177 | 178 | always_inline void avx512_spvv8_kernel_tr(const int *rows, int *rowb, int *rowe, 179 | int *col, double *nnz, double *x, 180 | double *y) { 181 | __m256i rs = _mm256_loadu_si256((const __m256i *) rows); 182 | __m512d acc = _mm512_setzero_pd(); 183 | 184 | int rowlen = *rowe - *rowb; 185 | int base = *rowb; 186 | 187 | { 188 | int idx0 = rows[0]; 189 | int idx1 = rows[1]; 190 | int idx2 = rows[2]; 191 | int idx3 = rows[3]; 192 | int idx4 = rows[4]; 193 | int idx5 = rows[5]; 194 | int idx6 = rows[6]; 195 | int idx7 = rows[7]; 196 | 197 | _m_prefetchw(y + idx0); 198 | _m_prefetchw(y + idx1); 199 | _m_prefetchw(y + idx2); 200 | _m_prefetchw(y + idx3); 201 | _m_prefetchw(y + idx4); 202 | _m_prefetchw(y + idx5); 203 | _m_prefetchw(y + idx6); 204 | _m_prefetchw(y + idx7); 205 | } 206 | 207 | for (int c = 0; c < rowlen; c++) { 208 | int offset = base + c * 8; 209 | __m256i cc = _mm256_loadu_si256((const __m256i *) (col + offset)); 210 | __m512d nz = _mm512_loadu_pd(nnz + offset); 211 | __m512d xx = _mm512_i32gather_pd(cc, x, 8); 212 | acc = _mm512_fmadd_pd(nz, xx, acc); 213 | } 214 | 215 | _mm512_i32scatter_pd(y, rs, acc, 8); 216 | } 217 | 218 | void spmv_tr_spvv8_kernel(tr_matrix &tr, int threads) { 219 | int size = tr.tasks.size(); 220 | #pragma omp parallel for num_threads(threads) schedule(dynamic) 221 | for (int tid = 0; tid < size; tid++) { 222 | vector &task = tr.tasks[tid]; 223 | csr_matrix &mat = tr.mat; 224 | int *rows = task.data(); 225 | int start = mat.tstart[tid]; 226 | int end = mat.tend[tid]; 227 | int limit = tr.spvv8_len[tid]; 228 | int p, c; 229 | for (p = start, c = 0; c < limit; p += 8, c += 8) { 230 | avx512_spvv8_kernel_tr(rows + c, mat.rowb + p, mat.rowe + p, mat.col, mat.nnz, mat.x, mat.y); 231 | } 232 | for (; p < end; p++) { 233 | int r = rows[p - start]; 234 | int begin = mat.rowb[p]; 235 | int end = mat.rowe[p]; 236 | int rowlen = end - begin; 237 | _mm_prefetch(mat.y + r, _MM_HINT_ET1); 238 | mat.y[r] = avx512_fma_spvv_kernel(mat.col + begin, mat.nnz + begin, rowlen, mat.x); 239 | } 240 | } 241 | } 242 | 243 | int main(int argc, char *argv[]) { 244 | csr_matrix mat; 245 | int loop_count, thread_count; 246 | 247 | if (access("info.txt", F_OK) != -1) { 248 | FILE *info = fopen("info.txt", "r"); 249 | fscanf(info, "%d", &mat.m); 250 | fscanf(info, "%d", &mat.rows); 251 | fscanf(info, "%d", &mat.cols); 252 | fclose(info); 253 | } else { 254 | fprintf(stderr, "info.txt not found\n"); 255 | return -1; 256 | } 257 | 258 | if (argc > 3) { 259 | sscanf(argv[1], "%d", &loop_count); 260 | sscanf(argv[3], "%d", &thread_count); 261 | } else { 262 | fprintf(stderr, "invalid parameter\n"); 263 | return -2; 264 | } 265 | 266 | input_matrix(mat); 267 | 268 | bool banded = is_banded(mat); 269 | int panel_count = max(thread_count * 4, mat.rows / 2000); 270 | if (banded) { 271 | panel_count = max(thread_count * 4, mat.rows / 10000); 272 | } 273 | tr_matrix tr = process(mat, panel_count); 274 | 275 | // Warm-up 276 | for (int i = 0; i < 300; i++) { 277 | spmv_tr_spvv8_kernel(tr, thread_count); 278 | } 279 | 280 | auto begin = high_resolution_clock::now(); 281 | for (int i = 0; i < loop_count; i++) { 282 | spmv_tr_spvv8_kernel(tr, thread_count); 283 | } 284 | auto end = high_resolution_clock::now(); 285 | auto duration = duration_cast(end - begin); 286 | printf("%lf,", double(duration.count()) / 1000 / loop_count); 287 | 288 | destroy_matrix(mat); 289 | return 0; 290 | } 291 | 292 | --------------------------------------------------------------------------------