├── bin
    └── .gitkeep
├── .gitignore
├── Makefile
├── LICENSE
├── contrib
    ├── data.txt
    ├── generate_mtx.py
    └── generate_mat.py
├── README.md
└── src
    ├── spmv_mkl.cpp
    ├── utility.hpp
    └── spmv_spv8.cpp


/bin/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/spmv_*
2 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Compilers
 2 | CC = gcc
 3 | CXX = g++
 4 | # - Debug
 5 | # CFLAGS += -g
 6 | # CFLAGS += -fsanitize=address # Check invalid memory access
 7 | # - General
 8 | CFLAGS += -O2 # Should be disabled when debug
 9 | CFLAGS += -std=c++17 -Wall -Wextra -lm
10 | CFLAGS += -march=skylake-avx512 -fopenmp
11 | #CFLAGS += -static
12 | # - MKL
13 | #CFLAGS += -ltbb -lmkl_core -lmkl_tbb_thread -lmkl_intel_lp64
14 | #CFLAGS += -lmkl_core -lmkl_gnu_thread -lmkl_intel_lp64
15 | CFLAGS += -lmkl_core -lmkl_intel_thread -liomp5 -lmkl_intel_lp64
16 | VPATH = src
17 | 
18 | all: bin bin/spmv_spv8 bin/spmv_mkl
19 | 
20 | bin:
21 | 	mkdir bin
22 | 
23 | bin/spmv_spv8 : spmv_spv8.cpp
24 | 	$(CXX) $(CFLAGS) $< -o $@
25 | 
26 | bin/spmv_mkl : spmv_mkl.cpp
27 | 	$(CXX) $(CFLAGS) $< -o $@
28 | 
29 | .PHONY : clean
30 | clean :
31 | 	-rm bin/*
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 XJTU-IAIR-CAG Research Group
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/contrib/data.txt:
--------------------------------------------------------------------------------
 1 | Scale-Free：
 2 | Oregon-2
 3 | as-caida
 4 | sx-mathoverflow
 5 | email-Enron
 6 | soc-sign-Slashdot090216
 7 | soc-sign-Slashdot090221
 8 | dc2
 9 | soc-sign-epinions
10 | soc-Slashdot0811
11 | sx-superuser
12 | soc-Slashdot0902
13 | scircuit
14 | connectus
15 | language
16 | NotreDame_actors
17 | com-DBLP
18 | web-Stanford
19 | citationCiteseer
20 | cop20k_A
21 | webbase-1M
22 | IMDB
23 | wiki-Talk
24 | web-Google
25 | com-Youtube
26 | flickr
27 | higgs-twitter
28 | patents
29 | as-Skitter
30 | FullChip
31 | wiki-topcats
32 | mouse_gene
33 | soc-Pokec
34 | coPapersCiteseer
35 | soc-LiveJournal1
36 | hollywood-2009
37 | com-Orkut
38 | ---------------------------------
39 | HPC：
40 | p2p-Gnutella04
41 | p2p-Gnutella25
42 | p2p-Gnutella24
43 | p2p-Gnutella30
44 | de2010
45 | ri2010
46 | vt2010
47 | ut2010
48 | tn2010
49 | mac_econ_fwd500
50 | va2010
51 | ga2010
52 | mc2depi
53 | rma10
54 | fl2010
55 | ASIC_680k
56 | roadNet-PA
57 | amazon0312
58 | cant
59 | pdb1HYS
60 | tx2010
61 | roadNet-CA
62 | consph
63 | mip1
64 | crankseg_2
65 | Si41Ge41H72
66 | Ga41As41H72
67 | rajat31
68 | road_central
69 | ldoor
70 | spal_004
71 | bone010
72 | road_usa
73 | circuit5M
74 | cage15
75 | 
76 | 


--------------------------------------------------------------------------------
/contrib/generate_mtx.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import glob
 3 | import os
 4 | import sys
 5 | import numpy as np
 6 | import scipy
 7 | import scipy.io
 8 | import scipy.sparse
 9 | 
# Root output directory: main() writes list.txt here and passes one
# sub-directory per matrix (DATA_PATH/<matrix name>) to output().
DATA_PATH = './data'
11 | 
12 | 
def output(output_path, M):
  """Write CSR matrix ``M`` and a reference SpMV problem as text files.

  The directory ``output_path`` is created if needed and receives:

  * ``info.txt``  - number of stored values, rows and columns, one per line
  * ``nnz.txt``   - stored values (``M.data``)
  * ``col.txt``   - column index of every stored value (``M.indices``)
  * ``rowb.txt``  - start offset of every row (``M.indptr[:-1]``)
  * ``rowe.txt``  - end offset of every row (``M.indptr[1:]``)
  * ``x.txt``     - input vector drawn from a normal distribution
  * ``order.txt`` - the identity row order ``0 .. rows-1``
  * ``ans.txt``   - the product ``M.dot(x)``

  Except for ``info.txt``, every value is followed by a single space
  (including the last one) and no newline is written.

  Note: reseeds NumPy's global random generator with 1 on every call, so
  each matrix with the same column count gets the same ``x``.
  """
  np.random.seed(1)
  vec = np.random.normal(size=(M.shape[1],))
  product = M.dot(vec)
  row_ptr = M.indptr

  # (file name, values) pairs, listed in the order the files are written.
  dumps = (
      ('nnz.txt', M.data),
      ('col.txt', M.indices),
      ('rowb.txt', row_ptr[:-1]),
      ('rowe.txt', row_ptr[1:]),
      ('x.txt', vec),
      ('order.txt', range(M.shape[0])),
      ('ans.txt', product),
  )

  os.makedirs(output_path, exist_ok=True)

  with open(output_path + '/info.txt', 'w') as info:
    info.write('%s\n%s\n%s\n' % (len(M.data), M.shape[0], M.shape[1]))

  for filename, values in dumps:
    with open(output_path + '/' + filename, 'w') as out:
      out.write(''.join(str(value) + ' ' for value in values))
64 | 
65 | 
def main():
  """Convert every ``matrices/*.mtx`` file into the text format under DATA_PATH.

  Each Matrix Market file is loaded, converted to CSR and remembered together
  with its non-zero count. ``DATA_PATH/list.txt`` then lists
  ``<nonzeros>\\t<name>`` per matrix in ascending non-zero order, and every
  matrix is written to ``DATA_PATH/<name>`` via ``output()``.
  """
  matrices = []
  print('Loading matrices')
  # sorted(): glob order is filesystem dependent; sorting keeps the order of
  # matrices with equal non-zero counts reproducible (list.sort is stable).
  for pathname in sorted(glob.glob('matrices/*.mtx')):
    name = os.path.splitext(os.path.basename(pathname))[0]
    print('Load %s' % name)
    matrix = scipy.sparse.csr_matrix(scipy.io.mmread(pathname))
    matrices.append((matrix.count_nonzero(), name, matrix))

  matrices.sort(key=lambda entry: entry[0])

  # list.txt is written before output() creates any per-matrix directory, so
  # the data root has to exist already; without this a fresh checkout fails
  # with FileNotFoundError.
  os.makedirs(DATA_PATH, exist_ok=True)

  print('Write list.txt')
  with open(DATA_PATH + '/list.txt', 'w') as f:
    for nonzeros, name, _ in matrices:
      f.write(str(nonzeros) + '\t' + name + '\n')

  print('Write matrices')
  for _, name, matrix in matrices:
    print('Write %s' % name)
    output(DATA_PATH + '/' + name, matrix)


if __name__ == '__main__':
  main()
91 | 


--------------------------------------------------------------------------------
/contrib/generate_mat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import glob
 3 | import os
 4 | import sys
 5 | import numpy as np
 6 | import scipy
 7 | import scipy.io
 8 | import scipy.sparse
 9 | 
# Root output directory: main() writes list.txt here and passes one
# sub-directory per matrix (DATA_PATH/<matrix name>) to output().
DATA_PATH = './data'
11 | 
12 | 
def output(output_path, M):
  """Dump CSR matrix ``M`` plus an SpMV input/answer pair into ``output_path``.

  Files written (the directory is created when missing): ``info.txt`` with the
  stored-value count, row count and column count on separate lines, followed
  by ``nnz.txt``, ``col.txt``, ``rowb.txt``, ``rowe.txt``, ``x.txt``,
  ``order.txt`` and ``ans.txt``. In those seven files each value is followed
  by one space and there is no trailing newline.

  ``x`` is drawn from a normal distribution after reseeding NumPy's global
  generator with 1; ``ans`` is ``M.dot(x)``; ``order`` is ``0 .. rows-1``.
  """
  def dump(filename, values):
    # One space after every value, including the last one.
    with open(output_path + '/' + filename, 'w') as out:
      for value in values:
        out.write(str(value) + ' ')

  np.random.seed(1)
  x_vec = np.random.normal(size=(M.shape[1],))
  expected = M.dot(x_vec)
  n_rows, n_cols = M.shape[0], M.shape[1]

  os.makedirs(output_path, exist_ok=True)

  with open(output_path + '/info.txt', 'w') as info:
    for number in (len(M.data), n_rows, n_cols):
      info.write(str(number) + '\n')

  dump('nnz.txt', M.data)
  dump('col.txt', M.indices)
  # indptr[i] / indptr[i + 1] delimit the stored values of row i.
  dump('rowb.txt', M.indptr[:-1])
  dump('rowe.txt', M.indptr[1:])
  dump('x.txt', x_vec)
  dump('order.txt', range(n_rows))
  dump('ans.txt', expected)
64 | 
65 | 
def main():
  """Convert every ``matrices/*.mat`` file into the text format under DATA_PATH.

  Each MATLAB file is loaded with ``scipy.io.loadmat`` and the matrix is taken
  from ``['Problem'][0][0]['A']``, converted to CSR and remembered together
  with its non-zero count. ``DATA_PATH/list.txt`` then lists
  ``<nonzeros>\\t<name>`` per matrix in ascending non-zero order, and every
  matrix is written to ``DATA_PATH/<name>`` via ``output()``.
  """
  matrices = []
  print('Loading matrices')
  # sorted(): glob order is filesystem dependent; sorting keeps the order of
  # matrices with equal non-zero counts reproducible (list.sort is stable).
  for pathname in sorted(glob.glob('matrices/*.mat')):
    name = os.path.splitext(os.path.basename(pathname))[0]
    print('Load %s' % name)
    problem = scipy.io.loadmat(pathname)['Problem'][0][0]['A']
    matrix = scipy.sparse.csr_matrix(problem)
    matrices.append((matrix.count_nonzero(), name, matrix))

  matrices.sort(key=lambda entry: entry[0])

  # list.txt is written before output() creates any per-matrix directory, so
  # the data root has to exist already; without this a fresh checkout fails
  # with FileNotFoundError.
  os.makedirs(DATA_PATH, exist_ok=True)

  print('Write list.txt')
  with open(DATA_PATH + '/list.txt', 'w') as f:
    for nonzeros, name, _ in matrices:
      f.write(str(nonzeros) + '\t' + name + '\n')

  print('Write matrices')
  for _, name, matrix in matrices:
    print('Write %s' % name)
    output(DATA_PATH + '/' + name, matrix)


if __name__ == '__main__':
  main()
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The SpV8 kernel
 2 | 
 3 | SpV8 is an SpMV kernel written with AVX-512. The research goal of SpV8 is to pursue optimal vectorization and a regular computation pattern.
 4 | 
 5 | This is the artifact for our paper @ DAC '21: `SpV8: Pursuing Optimal Vectorization and Regular Computation Pattern in SpMV`
 6 | 
 7 | # Usage
 8 | 
 9 | ## Dependencies & Recommended Environment
10 | 
11 | 1. Intel Xeon Processor with AVX-512 support
12 | 2. GCC 9
13 | 3. Intel Parallel Studio XE 2020.2
14 | 4. numactl (to set numa affinity)
15 | 5. cpupower (to set frequency, other available tools should also work)
16 | 
17 | ## Compilation
18 | 
19 | ```
20 | make
21 | ```
22 | 
23 | ## Run kernel
24 | 
25 | Before running, CPU frequency should be fixed to eliminate CPU Turbo effects.
26 | 
27 | The binaries have the same parameter list as follows:
28 | 
29 | ```bash
30 | bin/spmv_(kernel) [loop_count] [use_optimize?] [thread_count]
31 | ```
32 | 
33 | 1. `loop_count`: Number of iterations of SpMV
34 | 2. `use_optimize?`: Takes effect only in the MKL binary, where a non-zero value activates MKL's Inspector-Executor optimization. The SpV8 binary ignores the value, but the argument must still be present.
35 | 3. `thread_count`: Number of OpenMP threads
36 | 
37 | Example for running SpV8:
38 | 
39 | ```bash
40 | numactl --cpunodebind=1 --membind=1 bin/spmv_spv8 1000 1 8
41 | ```
42 | 
43 | ## How to feed matrix to kernel
44 | 
45 | In our experiments, we used a custom but straightforward text format to store the CSR matrix. When a binary is executed, it looks for the following data files **in the current working directory**:
46 | 
47 | ```
48 | info.txt : Contains number of NNZ, rows, cols
49 | rowb.txt : 0-based NNZ index for each row begin
50 | rowe.txt : 0-based NNZ index for each row end
51 | nnz.txt  : NNZ list
52 | col.txt  : 0-based column index for each NNZ
53 | x.txt    : A random vector x, used for SpMV
54 | ans.txt  : Used to check answer
55 | ```
56 | 
57 | Meanwhile, **we also provide two scripts** in `contrib` that generate these data files from the MATLAB `.mat` format (`generate_mat.py`) or the Matrix Market format (`generate_mtx.py`).
58 | 
59 | ## Dataset
60 | 
61 | All the matrices we used in our benchmark are listed in `contrib/data.txt`, and their files are publicly available on [SuiteSparse Matrix Collection](https://sparse.tamu.edu/).
62 | 
63 | # Notes
64 | 
65 | This repo only contains two kernels: SpV8 and MKL. For other methods such as CVR, ESB and CSR5, we reused the kernels provided in [puckbee/CVR](https://github.com/puckbee/CVR) and [puckbee/pava](https://github.com/puckbee/pava). These kernels were collected from the original authors, and we only modified their output code to simplify data collection.
66 | 
67 | We thank Biwei Xie, the author of CVR, for his kind and informative discussion on running previous methods.
68 | 
69 | # License
70 | 
71 | The code is licensed under the MIT License.
72 | 
73 | Note, however, that the dataset, Intel MKL and the other previous research kernels are copyrighted by their respective owners.
74 | 


--------------------------------------------------------------------------------
/src/spmv_mkl.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <time.h>
  4 | 
  5 | #include "mkl.h"
  6 | #include "mkl_spblas.h"
  7 | //#include "tbb/task_scheduler_init.h"
  8 | #define __USE_GNU
  9 | #include "sched.h"
 10 | #include "unistd.h"
 11 | 
 12 | // #define LOOP_COUNT 1000
 13 | int main(int argc, char *argv[]) {
 14 |   double *nnz, *x, *y;
 15 |   int *col, *rowb, *rowe;
 16 |   int m, n, c, i, r, loop_cnt;
 17 |   double alpha, beta;
 18 |   double duration;
 19 |   double start, end;
 20 |   sparse_matrix_t A;
 21 |   struct matrix_descr tt = {SPARSE_MATRIX_TYPE_GENERAL, SPARSE_FILL_MODE_LOWER,
 22 |                             SPARSE_DIAG_NON_UNIT};
 23 |   sparse_status_t stat;
 24 |   int flag = 1;
 25 |   int thread_num = 8;
 26 | 
 27 |   FILE *fp1 = fopen("nnz.txt", "r");
 28 |   FILE *fp2 = fopen("col.txt", "r");
 29 |   FILE *fp3 = fopen("rowb.txt", "r");
 30 |   FILE *fp4 = fopen("rowe.txt", "r");
 31 |   FILE *fp5 = fopen("x.txt", "r");
 32 |   m = 924886, n = 194085,
 33 |   c = 194085;  // m is number of non-zeros, n is matrix row, c is matrix column
 34 |   loop_cnt = 5000;
 35 | 
 36 |   if (access("info.txt", F_OK) != -1) {
 37 |     FILE *info = fopen("info.txt", "r");
 38 |     fscanf(info, "%d", &m);
 39 |     fscanf(info, "%d", &n);
 40 |     fscanf(info, "%d", &c);
 41 |     fclose(info);
 42 |   }
 43 |  
 44 |   if (argc > 3) {
 45 |     sscanf(argv[1], "%d", &loop_cnt);
 46 |     sscanf(argv[2], "%d", &flag);
 47 |     sscanf(argv[3], "%d", &thread_num);
 48 |   }
 49 | 
 50 |   alpha = 1.0;
 51 |   beta = 0.0;
 52 |   nnz = (double *)mkl_malloc(m * sizeof(double), 64);
 53 |   col = (int *)mkl_malloc(m * sizeof(int), 64);
 54 |   rowb = (int *)mkl_malloc(n * sizeof(int), 64);
 55 |   rowe = (int *)mkl_malloc(n * sizeof(int), 64);
 56 |   x = (double *)mkl_malloc(c * sizeof(double), 64);
 57 |   y = (double *)mkl_malloc(n * sizeof(double), 64);
 58 | 
 59 |   for (i = 0; i < m; i++) {
 60 |     fscanf(fp1, "%lf", &nnz[i]);
 61 |   }
 62 |   for (i = 0; i < m; i++) {
 63 |     fscanf(fp2, "%d", &col[i]);
 64 |   }
 65 |   for (i = 0; i < n; i++) {
 66 |     fscanf(fp3, "%d", &rowb[i]);
 67 |   }
 68 |   for (i = 0; i < n; i++) {
 69 |     fscanf(fp4, "%d", &rowe[i]);
 70 |   }
 71 |   for (i = 0; i < c; i++) {
 72 |     fscanf(fp5, "%lf", &x[i]);
 73 |   }
 74 |   for (i = 0; i < n; i++) {
 75 |     y[i] = 0.0;
 76 |   }
 77 | 
 78 |   // tbb::task_scheduler_init init(thread_num);
 79 |   mkl_set_num_threads(thread_num);
 80 |   mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, n, c, rowb, rowe, col,
 81 |                           nnz);
 82 | 
 83 |   if (flag) {
 84 |     start = dsecnd();
 85 | 
 86 |     mkl_sparse_set_mv_hint(A, SPARSE_OPERATION_NON_TRANSPOSE, tt, 200);
 87 |     mkl_sparse_optimize(A);
 88 | 
 89 |     end = dsecnd();
 90 |     duration = (double)(end - start);
 91 |   }
 92 | 
 93 |   for (r = 0; r < 300; r++) {
 94 |     stat = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, A, tt, x,
 95 |                            beta, y);
 96 |   }
 97 | 
 98 |   start = dsecnd();
 99 |   for (r = 0; r < loop_cnt; r++) {
100 |     stat = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, A, tt, x,
101 |                            beta, y);
102 |   }
103 |   end = dsecnd();
104 |   duration = (double)(end - start) / loop_cnt;
105 |   printf("%lf\n", duration * 1000);
106 | 
107 |   fclose(fp1);
108 |   fclose(fp2);
109 |   fclose(fp3);
110 |   fclose(fp4);
111 |   fclose(fp5);
112 |   mkl_free(nnz);
113 |   mkl_free(col);
114 |   mkl_free(rowb);
115 |   mkl_free(rowe);
116 |   mkl_free(x);
117 |   mkl_free(y);
118 | 
119 |   return 0;
120 | }
121 | 


--------------------------------------------------------------------------------
/src/utility.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
#include <bitset>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>
  5 | using namespace std;
  6 | 
// Function specifier combining __inline__ with the always_inline attribute.
#define always_inline __inline__ __attribute__((always_inline))
// Branch hints: wrap a condition in __builtin_expect with expected value 1 / 0.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

// Short aliases for unsigned integer types.
typedef unsigned int u32;
typedef unsigned long long u64;
 13 | 
 14 | struct csr_matrix {
 15 |   int m, rows, cols;
 16 |   double *nnz, *x, *y, *ans;
 17 |   int *col, *rowb, *rowe;
 18 |   int *tstart;
 19 |   int *tend;
 20 | };
 21 | 
 22 | void input_matrix(csr_matrix &mat) {
 23 |   FILE *fp1 = fopen("nnz.txt", "r");
 24 |   FILE *fp2 = fopen("col.txt", "r");
 25 |   FILE *fp3 = fopen("rowb.txt", "r");
 26 |   FILE *fp4 = fopen("rowe.txt", "r");
 27 |   FILE *fp5 = fopen("x.txt", "r");
 28 |   FILE *fp6 = fopen("ans.txt", "r");
 29 | 
 30 |   mat.nnz = (double *)mkl_malloc(mat.m * sizeof(double), 64);
 31 |   mat.col = (int *)mkl_malloc(mat.m * sizeof(int), 64);
 32 |   mat.rowb = (int *)mkl_malloc(mat.rows * sizeof(int), 64);
 33 |   mat.rowe = (int *)mkl_malloc(mat.rows * sizeof(int), 64);
 34 |   mat.x = (double *)mkl_malloc(mat.cols * sizeof(double), 64);
 35 |   mat.y = (double *)mkl_malloc(mat.rows * sizeof(double), 64);
 36 |   mat.ans = (double *)mkl_malloc(mat.rows * sizeof(double), 64);
 37 | 
 38 |   for (int i = 0; i < mat.m; i++) {
 39 |     fscanf(fp1, "%lf", &mat.nnz[i]);
 40 |   }
 41 |   for (int i = 0; i < mat.m; i++) {
 42 |     fscanf(fp2, "%d", &mat.col[i]);
 43 |   }
 44 |   for (int i = 0; i < mat.rows; i++) {
 45 |     fscanf(fp3, "%d", &mat.rowb[i]);
 46 |   }
 47 |   for (int i = 0; i < mat.rows; i++) {
 48 |     fscanf(fp4, "%d", &mat.rowe[i]);
 49 |   }
 50 |   for (int i = 0; i < mat.cols; i++) {
 51 |     fscanf(fp5, "%lf", &mat.x[i]);
 52 |   }
 53 |   for (int i = 0; i < mat.rows; i++) {
 54 |     mat.y[i] = 0.0;
 55 |   }
 56 |   for (int i = 0; i < mat.rows; i++) {
 57 |     fscanf(fp6, "%lf", &mat.ans[i]);
 58 |   }
 59 | 
 60 |   fclose(fp1);
 61 |   fclose(fp2);
 62 |   fclose(fp3);
 63 |   fclose(fp4);
 64 |   fclose(fp5);
 65 |   fclose(fp6);
 66 | }
 67 | 
 68 | void destroy_matrix(csr_matrix &mat) {
 69 |   mkl_free(mat.nnz);
 70 |   mkl_free(mat.col);
 71 |   mkl_free(mat.rowb);
 72 |   mkl_free(mat.rowe);
 73 |   mkl_free(mat.x);
 74 |   mkl_free(mat.y);
 75 |   mkl_free(mat.ans);
 76 | }
 77 | 
 78 | bool check_answer(csr_matrix &mat) {
 79 |   int bad_count = 0;
 80 |   for (int i = 0; i < mat.rows; i++) {
 81 |     double yi = mat.y[i];
 82 |     double ansi = mat.ans[i];
 83 |     if (abs(yi - ansi) > 0.01 * abs(ansi) && !(abs(yi) <= 1e-5 && abs(ansi) <= 1e-5)) {  
 84 |     	if (bad_count < 10)
 85 |         fprintf(stderr, "y[%d] expected %lf got %lf\n", i, mat.ans[i], mat.y[i]);
 86 |       bad_count++;
 87 |     }
 88 |   }
 89 |   if (bad_count)
 90 |     fprintf(stderr, "bad_count: %d\n", bad_count);
 91 |   return bad_count == 0 ? true : false;
 92 | }
 93 | 
 94 | csr_matrix apply_order(csr_matrix &mat, vector<vector<int>> &tasks, int copy_oob = true) {
 95 |   csr_matrix ret;
 96 | 
 97 |   ret.m = mat.m;
 98 |   ret.rows = mat.rows;
 99 |   ret.cols = mat.cols;
100 |   ret.nnz = (double *)mkl_malloc(mat.m * sizeof(double), 64);
101 |   ret.col = (int *)mkl_malloc(mat.m * sizeof(int), 64);
102 |   ret.rowb = (int *)mkl_malloc(mat.rows * sizeof(int), 64);
103 |   ret.rowe = (int *)mkl_malloc(mat.rows * sizeof(int), 64);
104 |   if (copy_oob) {
105 |     ret.x = (double *)mkl_malloc(mat.cols * sizeof(double), 64);
106 |     ret.y = (double *)mkl_malloc(mat.rows * sizeof(double), 64);
107 |     ret.ans = (double *)mkl_malloc(mat.rows * sizeof(double), 64);
108 |   }
109 |   ret.tstart = (int *)mkl_malloc(tasks.size() * sizeof(int), 64);
110 |   ret.tend = (int *)mkl_malloc(tasks.size() * sizeof(int), 64);
111 | 
112 |   if (copy_oob) {
113 |     for (int i = 0; i < mat.cols; i++)
114 |       ret.x[i] = mat.x[i];
115 | 
116 |     for (int i = 0; i < mat.rows; i++)
117 |       ret.y[i] = 0;
118 | 
119 |     for (int i = 0; i < mat.rows; i++)
120 |       ret.ans[i] = mat.ans[i];
121 |   }
122 | 
123 |   int npos = 0, pos = 0;
124 |   int start = 0, t = 0;
125 |   for (vector<int> &task : tasks) {
126 |     ret.tstart[t] = start;
127 |     ret.tend[t++] = start + task.size();
128 |     start += task.size();
129 |     for (int row : task) {
130 |       int b = mat.rowb[row];
131 |       int e = mat.rowe[row];
132 |       ret.rowb[pos] = npos;
133 |       ret.rowe[pos++] = npos + e - b;
134 |       for (int i = b; i < e; i++) {
135 |         ret.nnz[npos] = mat.nnz[i];
136 |         ret.col[npos++] = mat.col[i];
137 |       }
138 |     }
139 |   }
140 | 
141 |   return ret;
142 | }
143 | 
144 | 


--------------------------------------------------------------------------------
/src/spmv_spv8.cpp:
--------------------------------------------------------------------------------
  1 | #include <immintrin.h>
  2 | #include <algorithm>
  3 | #include <chrono>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <ctime>
  7 | #include <cassert>
  8 | #include <iostream>
  9 | #include <map>
 10 | #include <queue>
 11 | #include <vector>
 12 | #include <unordered_map>
 13 | #include "unistd.h"
 14 | #include "mkl.h"
 15 | #include "mkl_spblas.h"
 16 | #include "utility.hpp"
 17 | using namespace std;
 18 | using namespace chrono;
 19 | 
// Result of tr_reorder: a row-reordered matrix plus the bookkeeping the
// SpMV kernel needs to walk it.
struct tr_matrix {
  // Reordered matrix returned by apply_order; tr_reorder additionally
  // interleaves the values/columns of each 8-row group in place.
  csr_matrix mat;
  // Per task: number of leading rows stored as 8-row groups (tr_reorder only
  // ever adds multiples of 8 rows to this prefix).
  vector<int> spvv8_len;
  // Per task: source row indices, in the reordered order.
  vector<vector<int>> tasks;
};
 25 | 
 26 | tr_matrix tr_reorder(csr_matrix &mat, vector<vector<int>> &tasks) {
 27 |   tr_matrix tr;
 28 | 
 29 |   for (vector<int> &task : tasks) {
 30 |     unordered_map<int, vector<int>> buckets;
 31 | 
 32 |     for (int r : task) {
 33 |       int rowlen = mat.rowe[r] - mat.rowb[r];
 34 |       buckets[rowlen].push_back(r);
 35 |     }
 36 | 
 37 |     vector<int> keys;
 38 |     for (auto kv : buckets) {
 39 |       keys.push_back(kv.first);
 40 |     }
 41 |     sort(keys.begin(), keys.end());
 42 | 
 43 |     vector<int> order;
 44 |     vector<int> remain;
 45 |     for (int k : keys) {
 46 |       vector<int> &samelen_task = buckets[k];
 47 |       int left = samelen_task.size() % 8;
 48 |       if (k > 32)
 49 |         left = samelen_task.size();
 50 |       int bulk = samelen_task.size() - left;
 51 |       order.insert(order.end(), samelen_task.begin(), samelen_task.begin() + bulk);
 52 |       remain.insert(remain.end(), samelen_task.begin() + bulk, samelen_task.end());
 53 |     }
 54 | 
 55 |     tr.spvv8_len.push_back(order.size());
 56 | 
 57 |     task.clear();
 58 |     task.insert(task.end(), order.begin(), order.end());
 59 |     task.insert(task.end(), remain.begin(), remain.end());
 60 |   }
 61 | 
 62 |   tr.mat = apply_order(mat, tasks);
 63 | 
 64 |   int size = tasks.size();
 65 |   for (int t = 0; t < size; t++) {
 66 |     int start = tr.mat.tstart[t];
 67 |     int tr_len = tr.spvv8_len[t];
 68 |     int p = 0, c = 0;
 69 |     for (p = start; c < tr_len; c += 8, p += 8) {
 70 |       int rowlen = tr.mat.rowe[p] - tr.mat.rowb[p];
 71 |       int base = tr.mat.rowb[p];
 72 |       vector<double> nnz;
 73 |       vector<int> col;
 74 |       nnz.insert(nnz.end(), tr.mat.nnz + base, tr.mat.nnz + base + rowlen * 8);
 75 |       col.insert(col.end(), tr.mat.col + base, tr.mat.col + base + rowlen * 8);
 76 |       for (int l = 0; l < rowlen; l++) {
 77 |         for (int r = 0; r < 8; r++) {
 78 |           tr.mat.nnz[base + l * 8 + r] = nnz[r * rowlen + l];
 79 |           tr.mat.col[base + l * 8 + r] = col[r * rowlen + l];
 80 |         }
 81 |       }
 82 |     }
 83 |   }
 84 | 
 85 |   tr.tasks = tasks;
 86 | 
 87 |   return tr;
 88 | }
 89 | 
 90 | bool is_banded(csr_matrix &mat, int band_size = -1) {
 91 |   if (band_size == -1)
 92 |     band_size = mat.cols / 64;
 93 |   int band_count = 0;
 94 |   bool banded = false;
 95 |   
 96 |   for (int r = 0; r < mat.rows; r++) {
 97 |     int rb = mat.rowb[r];
 98 |     int re = mat.rowe[r];
 99 |     for (int i = rb; i < re; i++) {
100 |       int col = mat.col[i];
101 |       if (abs(col - r) <= band_size)
102 |         band_count++;
103 |     }
104 |   }
105 | 
106 |   if (double(band_count) / mat.m >= 0.3) {
107 |     banded = true;
108 |   }
109 | 
110 |   return banded;
111 | }
112 | 
113 | 
114 | tr_matrix process(csr_matrix &mat, int panel_num) {
115 |   vector<vector<int>> tasks(panel_num);
116 | 
117 |   int pos = 0;
118 |   int len = mat.m / panel_num;
119 |   int limit = mat.rows - 7;
120 |   int i;
121 |   int count = 0;
122 |   for (i = 0; i < limit; i += 8) {
123 |     for (int j = 0; j < 8; j++) {
124 |       int rowlen = mat.rowe[i + j] - mat.rowb[i + j];
125 |       if (rowlen > 0) {
126 |         tasks[pos].push_back(i + j);
127 |         count += rowlen;
128 |       }
129 |     }
130 | 
131 |     if (count >= len) {
132 |       if (pos + 1 < panel_num) {
133 |         pos += 1;
134 |         count = 0;
135 |       }
136 |     }
137 |   }
138 | 
139 |   if (i < mat.rows) {
140 |     for (; i < mat.rows; i++) {
141 |       tasks[pos].push_back(i);
142 |     }
143 |   }
144 | 
145 |   tr_matrix ret = tr_reorder(mat, tasks);
146 |   return ret;
147 | }
148 | 
149 | 
150 | always_inline double avx512_fma_spvv_kernel(int *col, double *nnz, int rowlen,
151 |                                         double *x) {
152 |   int limit = rowlen - 7;
153 |   int *col_p;
154 |   double *nnz_p;
155 |   double sum = 0;
156 |   __m256i c1;
157 |   __m512d v1, v2, s;
158 |   s = _mm512_setzero_pd();
159 |   int i;
160 |   
161 |   for (i = 0; i < limit; i += 8) {
162 |     col_p = col + i;
163 |     nnz_p = nnz + i;
164 |     c1 = _mm256_loadu_si256((const __m256i *) col_p);
165 |     v2 = _mm512_i32gather_pd(c1, x, 8);
166 |     v1 = _mm512_loadu_pd(nnz_p);
167 |     s = _mm512_fmadd_pd(v1, v2, s);
168 |   }
169 | 
170 |   sum += _mm512_reduce_add_pd(s);
171 |   for (; i < rowlen; i++) {
172 |     sum += nnz[i] * x[col[i]];
173 |   }
174 | 
175 |   return sum;
176 | }
177 | 
178 | always_inline void avx512_spvv8_kernel_tr(const int *rows, int *rowb, int *rowe,
179 |                                        int *col, double *nnz, double *x,
180 |                                        double *y) {
181 |   __m256i rs = _mm256_loadu_si256((const __m256i *) rows);
182 |   __m512d acc = _mm512_setzero_pd();
183 | 
184 |   int rowlen = *rowe - *rowb;
185 |   int base = *rowb;
186 | 
187 |   {
188 |     int idx0 = rows[0];
189 |     int idx1 = rows[1];
190 |     int idx2 = rows[2];
191 |     int idx3 = rows[3];
192 |     int idx4 = rows[4];
193 |     int idx5 = rows[5];
194 |     int idx6 = rows[6];
195 |     int idx7 = rows[7];
196 | 
197 |     _m_prefetchw(y + idx0);
198 |     _m_prefetchw(y + idx1);
199 |     _m_prefetchw(y + idx2);
200 |     _m_prefetchw(y + idx3);
201 |     _m_prefetchw(y + idx4);
202 |     _m_prefetchw(y + idx5);
203 |     _m_prefetchw(y + idx6);
204 |     _m_prefetchw(y + idx7);
205 |   }
206 |   
207 |   for (int c = 0; c < rowlen; c++) {
208 |     int offset = base + c * 8;
209 |     __m256i cc = _mm256_loadu_si256((const __m256i *) (col + offset));
210 |     __m512d nz = _mm512_loadu_pd(nnz + offset);
211 |     __m512d xx = _mm512_i32gather_pd(cc, x, 8);
212 |     acc = _mm512_fmadd_pd(nz, xx, acc);
213 |   }
214 | 
215 |   _mm512_i32scatter_pd(y, rs, acc, 8);
216 | }
217 | 
218 | void spmv_tr_spvv8_kernel(tr_matrix &tr, int threads) {
219 |   int size = tr.tasks.size();
220 | #pragma omp parallel for num_threads(threads) schedule(dynamic)
221 |   for (int tid = 0; tid < size; tid++) {
222 |     vector<int> &task = tr.tasks[tid];
223 |     csr_matrix &mat = tr.mat;
224 |     int *rows = task.data();
225 |     int start = mat.tstart[tid];
226 |     int end = mat.tend[tid];
227 |     int limit = tr.spvv8_len[tid];
228 |     int p, c;
229 |     for (p = start, c = 0; c < limit; p += 8, c += 8) {
230 |       avx512_spvv8_kernel_tr(rows + c, mat.rowb + p, mat.rowe + p, mat.col, mat.nnz, mat.x, mat.y);
231 |     }
232 |     for (; p < end; p++) {
233 |       int r = rows[p - start];
234 |       int begin = mat.rowb[p];
235 |       int end = mat.rowe[p];
236 |       int rowlen = end - begin;
237 |       _mm_prefetch(mat.y + r, _MM_HINT_ET1);
238 |       mat.y[r] = avx512_fma_spvv_kernel(mat.col + begin, mat.nnz + begin, rowlen, mat.x);
239 |     }
240 |   }
241 | }
242 | 
243 | int main(int argc, char *argv[]) {
244 |   csr_matrix mat;
245 |   int loop_count, thread_count;
246 | 
247 |   if (access("info.txt", F_OK) != -1) {
248 |     FILE *info = fopen("info.txt", "r");
249 |     fscanf(info, "%d", &mat.m);
250 |     fscanf(info, "%d", &mat.rows);
251 |     fscanf(info, "%d", &mat.cols);
252 |     fclose(info);
253 |   } else {
254 |     fprintf(stderr, "info.txt not found\n");
255 |     return -1;
256 |   }
257 | 
258 |   if (argc > 3) {
259 |     sscanf(argv[1], "%d", &loop_count);
260 |     sscanf(argv[3], "%d", &thread_count);
261 |   } else {
262 |     fprintf(stderr, "invalid parameter\n");
263 |     return -2;
264 |   }
265 | 
266 |   input_matrix(mat);
267 | 
268 |   bool banded = is_banded(mat);
269 |   int panel_count = max(thread_count * 4, mat.rows / 2000);
270 |   if (banded) {
271 |     panel_count = max(thread_count * 4, mat.rows / 10000);
272 |   }
273 |   tr_matrix tr = process(mat, panel_count);
274 | 
275 |   // Warm-up
276 |   for (int i = 0; i < 300; i++) {
277 |     spmv_tr_spvv8_kernel(tr, thread_count);
278 |   }
279 | 
280 |   auto begin = high_resolution_clock::now();
281 |   for (int i = 0; i < loop_count; i++) {
282 |     spmv_tr_spvv8_kernel(tr, thread_count);
283 |   }
284 |   auto end = high_resolution_clock::now();
285 |   auto duration = duration_cast<microseconds>(end - begin);
286 |   printf("%lf,", double(duration.count()) / 1000 / loop_count);
287 | 
288 |   destroy_matrix(mat);
289 |   return 0;
290 | }
291 | 
292 | 


--------------------------------------------------------------------------------