├── src ├── bitmap │ ├── input │ ├── test │ ├── Makefile │ ├── input_test │ ├── test.c │ ├── matrix.h │ ├── csr.h │ └── bitmap.h ├── locality_generator │ ├── matrix.out │ └── locality_generator.py ├── matrix_utils │ ├── sort_correct_matrices.sh │ ├── transform_matrix_to_csr.sh │ ├── florida_to_csr.py │ └── sort_edges.py ├── spmm │ ├── timers.h │ ├── Makefile │ ├── zsim_hooks.h │ ├── naive_spmm.c │ ├── gemm_csr.c │ ├── out │ ├── matrix_generator.h │ └── spmm_bitmap.c ├── spmv │ ├── Makefile │ ├── zsim_hooks.h │ ├── spmv_bitmap.c │ └── matrix_generator.h └── storage │ └── storage.py └── README.md /src/bitmap/input: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bitmap/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SMASH/HEAD/src/bitmap/test -------------------------------------------------------------------------------- /src/bitmap/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test: test.c 5 | gcc -O3 -DBCR test.c -o test 6 | 7 | clean: 8 | rm test 9 | -------------------------------------------------------------------------------- /src/bitmap/input_test: -------------------------------------------------------------------------------- 1 | 4 2 | 7 3 | 0 4 | 2 5 | 5 6 | 6 7 | 7 8 | 2 9 | 3 10 | 1 11 | 2 12 | 3 13 | 3 14 | 3 15 | 1.0 16 | 1.0 17 | 1.0 18 | 1.0 19 | 1.0 20 | 1.0 21 | 1.0 22 | -------------------------------------------------------------------------------- /src/locality_generator/matrix.out: -------------------------------------------------------------------------------- 1 | 100 100 2 | 0 0 3 | 0 5 4 | 0 10 5 | 0 15 6 | 0 20 7 | 0 25 8 | 0 30 9 | 0 35 10 | 0 40 11 | 0 45 12 | 0 50 13 | 0 55 14 | 0 60 15 | 0 65 16 | 0 70 17 | 0 75 18 | 0 80 19 | 0 85 20 | 0 90 21 | 0 95 22 | 
-------------------------------------------------------------------------------- /src/matrix_utils/sort_correct_matrices.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | PATH_MATRIX=/mnt/panzer/kanellok/SpMv-XComp/inputs/matrix/ 6 | PATH_SORT=/mnt/panzer/kanellok/SpMv-XComp/inputs/matrix_sort 7 | for matrix in $PATH_MATRIX/*; 8 | do 9 | filename="${matrix##*/}" 10 | python sort_edges.py $matrix $PATH_SORT/$filename 11 | done 12 | -------------------------------------------------------------------------------- /src/matrix_utils/transform_matrix_to_csr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | PATH_MATRIX=/mnt/panzer/kanellok/SpMv-XComp/inputs/matrix_sort/ 6 | PATH_CSR=/mnt/panzer/kanellok/SpMv-XComp/inputs/csr 7 | for matrix in $PATH_MATRIX/*; 8 | do 9 | filename="${matrix##*/}" 10 | python florida_to_csr.py $matrix $PATH_CSR/$filename 11 | done 12 | -------------------------------------------------------------------------------- /src/bitmap/test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bitmap.h" 4 | #include "csr.h" 5 | 6 | 7 | 8 | 9 | 10 | int main(int argc, char *argv[]) 11 | { 12 | 13 | 14 | 15 | 16 | csr matrix_csr; 17 | 18 | //matrix_csr = read_csr(argv[1]); 19 | matrix_csr = csr_generator(atoi(argv[1]),atoi(argv[2])); 20 | smash matrix_smash; 21 | 22 | construct_format(&matrix_smash,2,2,2); 23 | construct_bitmap0_nza(&matrix_smash,&matrix_csr); 24 | construct_bitmap1(&matrix_smash); 25 | construct_bitmap2(&matrix_smash); 26 | 27 | print_bitmaps(&matrix_smash); 28 | matrix_smash.current_register0 = matrix_smash.bitmap0[0]; 29 | matrix_smash.current_register1 = matrix_smash.bitmap1[0]; 30 | matrix_smash.current_register2 = matrix_smash.bitmap2[0]; 31 | 32 | printf("Bitmap 0 bits = %d \n", matrix_smash.bitmap0_bits); 33 | for(int i = 0; i < 
matrix_smash.nza_blocks; i++) 34 | index_bitmaps(&matrix_smash); 35 | } 36 | -------------------------------------------------------------------------------- /src/spmm/timers.h: -------------------------------------------------------------------------------- 1 | #ifndef _TIMERS_H_ 2 | #define _TIMERS_H_ 3 | 4 | #include 5 | #include 6 | /* Stopwatch state: t1 and t2 hold the timestamps taken by timer_start() and timer_stop(); duration accumulates elapsed seconds across start/stop pairs. */ 7 | typedef struct timer_s { 8 | struct timeval t1; 9 | struct timeval t2; 10 | double duration; 11 | } timer_tt; 12 | /* Heap-allocates a timer and zeroes its duration; t1 and t2 are left uninitialized. Release it with timer_free(). NOTE(review): the malloc result is not checked before timer->duration is written. */ 13 | static inline timer_tt *timer_alloc() { 14 | timer_tt *timer; 15 | timer = (timer_tt *)malloc(sizeof(timer_tt)); 16 | timer->duration = 0; 17 | return timer; 18 | } 19 | /* Stores the current time of day in t1. */ 20 | static inline void timer_start(timer_tt *timer) { gettimeofday(&timer->t1, 0); } 21 | /* Stores the current time of day in t2 and adds the t1-to-t2 interval to duration: the difference is formed in integer microseconds, converted to double and divided by 1000000 to give seconds. */ 22 | static inline void timer_stop(timer_tt *timer) { 23 | gettimeofday(&timer->t2, 0); 24 | timer->duration += (double)((timer->t2.tv_sec - timer->t1.tv_sec) * 1000000 + 25 | timer->t2.tv_usec - timer->t1.tv_usec) / 26 | 1000000; 27 | } 28 | /* Returns the accumulated duration in seconds. */ 29 | static inline double timer_report_sec(timer_tt *timer) { 30 | return timer->duration; 31 | } 32 | /* Frees a timer obtained from timer_alloc(). */ 33 | static inline void timer_free(timer_tt *timer) { free(timer); } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/bitmap/matrix.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | /* Dense matrix: array is the element buffer (read_matrix below indexes it as i*rows+j), plus its row and column counts. */ 6 | struct matrix{ 7 | float *array; 8 | int rows; 9 | int columns; 10 | 11 | }; 12 | /* Returns min + (x % n) with n = max - min + 1, redrawing x while x >= RAND_MAX - (RAND_MAX % n), presumably to limit modulo bias. NOTE(review): srand(time(NULL)) runs on every call, so calls made within the same second restart the same rand() sequence. */ 13 | int rangeRandom (int min, int max){ 14 | srand(time(NULL)); // Seed the time 15 | int n = max - min + 1; 16 | int remainder = RAND_MAX % n; 17 | int x; 18 | do{ 19 | x = rand(); 20 | }while (x >= RAND_MAX - remainder); 21 | return min + x % n; 22 | } 23 | 24 | /* Reads a matrix from the text file at path: a header of four integers (rows, columns, nnz, weight flag) followed by nnz entries of row index and column index, each followed by a float value when the weight flag is non-zero. Values are stored at array[i*rows+j]; entries are only stored when the weight flag is non-zero. NOTE(review): as written the function assigns mat.size although struct matrix declares only array, rows and columns, reads into an undeclared variable named value, casts the float buffer allocation to int *, and never checks or closes fd - confirm whether this header is still compiled anywhere. */ 25 | struct matrix read_matrix(char *path) 26 | { 27 | printf("Read matrix %s\n",path); 28 | FILE *fd; 29 | fd = fopen(path,"r"); 30 | 31 | int rows, columns, nnz, weight; 32 | 33 | fscanf(fd,"%d",&rows); 34 | fscanf(fd,"%d",&columns); 35 | 
fscanf(fd,"%d",&nnz); 36 | fscanf(fd,"%d",&weight); 37 | 38 | printf("Rows = %d Columns = %d Non-zero = %d \n",rows,columns,nnz); 39 | 40 | struct matrix mat; 41 | mat.array = (int *)malloc(sizeof(int)*(rows+1)*(rows+1)); 42 | mat.size = rows; 43 | for (int i = 0 ; i < rows*rows; i++) 44 | { 45 | 46 | mat.array[i] = 0 ; 47 | } 48 | 49 | int i,j; 50 | for(int w = 0 ; w < nnz; w++) 51 | { 52 | fscanf(fd,"%d",&i); 53 | fscanf(fd,"%d",&j); 54 | 55 | if(weight){ 56 | fscanf(fd,"%f",&value); 57 | mat.array[i*rows+j] = value; 58 | } 59 | 60 | } 61 | 62 | 63 | printf("Matrix Initialized using 2D array\n"); 64 | 65 | return mat; 66 | 67 | 68 | } -------------------------------------------------------------------------------- /src/spmm/Makefile: -------------------------------------------------------------------------------- 1 | 2 | #CXX := $(RISCV)/bin/riscv64-unknown-elf-g++ 3 | CXX := gcc 4 | CFLAGS = -O3 -DNATIVE 5 | 6 | 7 | INCLUDES = -I../bitmap/ 8 | 9 | # define library paths in addition to /usr/lib 10 | # if I wanted to include libraries not in /usr/lib I'd specify 11 | # their path using -Lpath, something like: 12 | LFLAGS = 13 | 14 | # define any libraries to link into executable: 15 | # if I want to link in libraries (libx.so or libx.a) I use the -llibname 16 | # option, something like (this will link in libmylib.so and libm.so: 17 | LIBS = 18 | #LIBS = 19 | # define the C source files 20 | SRCS = spmm_bitmap.c 21 | 22 | OBJS = $(SRCS:.c=.o) 23 | 24 | # define the executable file 25 | MAIN = spmm_bitmap.e 26 | 27 | # 28 | # The following part of the makefile is generic; it can be used to 29 | # build any executable just by changing the definitions above and by 30 | # deleting dependencies appended to the file from 'make depend' 31 | # 32 | 33 | .PHONY: depend clean 34 | 35 | all: $(MAIN) 36 | 37 | $(MAIN): $(OBJS) 38 | $(CXX) $(CFLAGS) $(INCLUDES) $(OBJS) $(LFLAGS) $(LIBS) -o $(MAIN) 39 | 40 | # this is a suffix replacement rule for building .o's from .c's 41 | # 
it uses automatic variables $<: the name of the prerequisite of 42 | # the rule(a .cpp file) and $@: the name of the target of the rule (a .o file) 43 | # (see the gnu make manual section about automatic variables) 44 | .c.o: 45 | $(CXX) $(CFLAGS) $(INCLUDES) -c $< -o $@ 46 | 47 | clean: 48 | $(RM) $(OBJS) *~ $(MAIN) 49 | 50 | depend: $(SRCS) 51 | makedepend $(INCLUDES) $^ 52 | 53 | -------------------------------------------------------------------------------- /src/spmv/Makefile: -------------------------------------------------------------------------------- 1 | 2 | #CXX := $(RISCV)/bin/riscv64-unknown-elf-g++ 3 | CXX := gcc 4 | CFLAGS = -O3 -DNATIVE 5 | 6 | 7 | INCLUDES = -I../bitmap/ 8 | 9 | # define library paths in addition to /usr/lib 10 | # if I wanted to include libraries not in /usr/lib I'd specify 11 | # their path using -Lpath, something like: 12 | LFLAGS = 13 | 14 | # define any libraries to link into executable: 15 | # if I want to link in libraries (libx.so or libx.a) I use the -llibname 16 | # option, something like (this will link in libmylib.so and libm.so: 17 | LIBS = 18 | #LIBS = 19 | # define the C source files 20 | SRCS = spmv_bitmap.c 21 | #SRCS = pagerank.cpp 22 | 23 | OBJS = $(SRCS:.c=.o) 24 | 25 | # define the executable file 26 | MAIN = spmv_bitmap.e 27 | 28 | # 29 | # The following part of the makefile is generic; it can be used to 30 | # build any executable just by changing the definitions above and by 31 | # deleting dependencies appended to the file from 'make depend' 32 | # 33 | 34 | .PHONY: depend clean 35 | 36 | all: $(MAIN) 37 | 38 | $(MAIN): $(OBJS) 39 | $(CXX) $(CFLAGS) $(INCLUDES) $(OBJS) $(LFLAGS) $(LIBS) -o $(MAIN) 40 | 41 | # this is a suffix replacement rule for building .o's from .c's 42 | # it uses automatic variables $<: the name of the prerequisite of 43 | # the rule(a .cpp file) and $@: the name of the target of the rule (a .o file) 44 | # (see the gnu make manual section about automatic variables) 45 | .c.o: 46 | 
$(CXX) $(CFLAGS) $(INCLUDES) -c $< -o $@ 47 | 48 | clean: 49 | $(RM) $(OBJS) *~ $(MAIN) 50 | 51 | depend: $(SRCS) 52 | makedepend $(INCLUDES) $^ 53 | -------------------------------------------------------------------------------- /src/spmm/zsim_hooks.h: -------------------------------------------------------------------------------- 1 | #ifndef __ZSIM_HOOKS_H__ 2 | #define __ZSIM_HOOKS_H__ 3 | /* Hooks a benchmark calls to signal events (ROI begin/end, heartbeat, work begin/end) through numeric magic op codes. */ 4 | #include 5 | #include 6 | 7 | //Avoid optimizing compilers moving code around this barrier 8 | #define COMPILER_BARRIER() { __asm__ __volatile__("" ::: "memory");} 9 | 10 | //These need to be in sync with the simulator 11 | #define ZSIM_MAGIC_OP_ROI_BEGIN (1025) 12 | #define ZSIM_MAGIC_OP_ROI_END (1026) 13 | #define ZSIM_MAGIC_OP_REGISTER_THREAD (1027) 14 | #define ZSIM_MAGIC_OP_HEARTBEAT (1028) 15 | #define ZSIM_MAGIC_OP_WORK_BEGIN (1029) //ubik 16 | #define ZSIM_MAGIC_OP_WORK_END (1030) //ubik 17 | /* zsim_magic_op(op): on x86-64 it executes xchg %rcx, %rcx with op loaded in rcx, fenced on both sides by COMPILER_BARRIER(); on every other target it is an empty function. The xchg is presumably the marker the simulator watches for - confirm against the simulator. */ 18 | #ifdef __x86_64__ 19 | #define HOOKS_STR "HOOKS" 20 | static inline void zsim_magic_op(uint64_t op) { 21 | COMPILER_BARRIER(); 22 | __asm__ __volatile__("xchg %%rcx, %%rcx;" : : "c"(op)); 23 | COMPILER_BARRIER(); 24 | } 25 | #else 26 | #define HOOKS_STR "NOP-HOOKS" 27 | static inline void zsim_magic_op(uint64_t op) { 28 | //NOP 29 | } 30 | #endif 31 | /* Prints the ROI-begin banner first, then issues ZSIM_MAGIC_OP_ROI_BEGIN. */ 32 | static inline void zsim_roi_begin() { 33 | printf("[" HOOKS_STR "] ROI begin\n"); 34 | zsim_magic_op(ZSIM_MAGIC_OP_ROI_BEGIN); 35 | } 36 | /* Issues ZSIM_MAGIC_OP_ROI_END first, then prints the ROI-end banner, so both printf calls sit outside the begin/end pair. */ 37 | static inline void zsim_roi_end() { 38 | zsim_magic_op(ZSIM_MAGIC_OP_ROI_END); 39 | printf("[" HOOKS_STR "] ROI end\n"); 40 | } 41 | /* Issues ZSIM_MAGIC_OP_HEARTBEAT. */ 42 | static inline void zsim_heartbeat() { 43 | zsim_magic_op(ZSIM_MAGIC_OP_HEARTBEAT); 44 | } 45 | /* Issue ZSIM_MAGIC_OP_WORK_BEGIN and ZSIM_MAGIC_OP_WORK_END respectively. */ 46 | static inline void zsim_work_begin() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_BEGIN); } 47 | static inline void zsim_work_end() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_END); } 48 | 49 | #endif /*__ZSIM_HOOKS_H__*/ 50 | -------------------------------------------------------------------------------- /src/spmv/zsim_hooks.h: 
-------------------------------------------------------------------------------- 1 | #ifndef __ZSIM_HOOKS_H__ 2 | #define __ZSIM_HOOKS_H__ 3 | /* Hooks a benchmark calls to signal events (ROI begin/end, heartbeat, work begin/end) through numeric magic op codes. */ 4 | #include 5 | #include 6 | 7 | //Avoid optimizing compilers moving code around this barrier 8 | #define COMPILER_BARRIER() { __asm__ __volatile__("" ::: "memory");} 9 | 10 | //These need to be in sync with the simulator 11 | #define ZSIM_MAGIC_OP_ROI_BEGIN (1025) 12 | #define ZSIM_MAGIC_OP_ROI_END (1026) 13 | #define ZSIM_MAGIC_OP_REGISTER_THREAD (1027) 14 | #define ZSIM_MAGIC_OP_HEARTBEAT (1028) 15 | #define ZSIM_MAGIC_OP_WORK_BEGIN (1029) //ubik 16 | #define ZSIM_MAGIC_OP_WORK_END (1030) //ubik 17 | /* zsim_magic_op(op): on x86-64 it executes xchg %rcx, %rcx with op loaded in rcx, fenced on both sides by COMPILER_BARRIER(); on every other target it is an empty function. The xchg is presumably the marker the simulator watches for - confirm against the simulator. */ 18 | #ifdef __x86_64__ 19 | #define HOOKS_STR "HOOKS" 20 | static inline void zsim_magic_op(uint64_t op) { 21 | COMPILER_BARRIER(); 22 | __asm__ __volatile__("xchg %%rcx, %%rcx;" : : "c"(op)); 23 | COMPILER_BARRIER(); 24 | } 25 | #else 26 | #define HOOKS_STR "NOP-HOOKS" 27 | static inline void zsim_magic_op(uint64_t op) { 28 | //NOP 29 | } 30 | #endif 31 | /* Prints the ROI-begin banner first, then issues ZSIM_MAGIC_OP_ROI_BEGIN. */ 32 | static inline void zsim_roi_begin() { 33 | printf("[" HOOKS_STR "] ROI begin\n"); 34 | zsim_magic_op(ZSIM_MAGIC_OP_ROI_BEGIN); 35 | } 36 | /* Issues ZSIM_MAGIC_OP_ROI_END first, then prints the ROI-end banner, so both printf calls sit outside the begin/end pair. */ 37 | static inline void zsim_roi_end() { 38 | zsim_magic_op(ZSIM_MAGIC_OP_ROI_END); 39 | printf("[" HOOKS_STR "] ROI end\n"); 40 | } 41 | /* Issues ZSIM_MAGIC_OP_HEARTBEAT. */ 42 | static inline void zsim_heartbeat() { 43 | zsim_magic_op(ZSIM_MAGIC_OP_HEARTBEAT); 44 | } 45 | /* Issue ZSIM_MAGIC_OP_WORK_BEGIN and ZSIM_MAGIC_OP_WORK_END respectively. */ 46 | static inline void zsim_work_begin() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_BEGIN); } 47 | static inline void zsim_work_end() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_END); } 48 | 49 | #endif /*__ZSIM_HOOKS_H__*/ 50 | -------------------------------------------------------------------------------- /src/matrix_utils/florida_to_csr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | fp = open(sys.argv[1],'r') 4 | lines = fp.readlines() 5 | 6 | 7 | 8 | rows = (int)(lines[0].split(' ')[0]) 9 | columns = (int)(lines[0].split(' ')[1]) 10 | nnz = 
(int)(lines[0].split(' ')[2]) 11 | 12 | print(rows) 13 | i=0 14 | no_value = 0 15 | row_ptr = [0 for x in range(rows+1)] 16 | col_ptr = [[] for x in range(rows+1)] 17 | values = [[] for x in range(rows+1)] 18 | for line in lines: 19 | # print (line) 20 | if i != 0: 21 | splits = line.split(' ') 22 | node1 =(int)(line.split(' ')[0]) 23 | node2 =(int)(line.split(' ')[1]) 24 | if(len(splits) == 3): 25 | value =(float)(line.split(' ')[2]) 26 | else: 27 | value = 1 28 | row_ptr[node1] = row_ptr[node1]+1 29 | col_ptr[node1].append(node2) 30 | values[node1].append(value) 31 | i = i + 1 32 | 33 | print (row_ptr[1]) 34 | print (col_ptr[1]) 35 | 36 | fp_csr = open(sys.argv[2],'w') 37 | 38 | 39 | 40 | fp_csr.write((str)(rows)) 41 | fp_csr.write('\n') 42 | fp_csr.write((str)(nnz)) 43 | fp_csr.write('\n') 44 | 45 | acc = 0 46 | for i in range(rows+1): 47 | fp_csr.write((str)(acc)) 48 | fp_csr.write('\n') 49 | acc = acc + row_ptr[i] 50 | 51 | 52 | col_ptr = [inner for outer in col_ptr for inner in outer] 53 | for i in range(nnz): 54 | fp_csr.write((str)(col_ptr[i])) 55 | fp_csr.write('\n') 56 | 57 | values = [inner for outer in values for inner in outer] 58 | for i in range(nnz): 59 | fp_csr.write((str)(values[i])) 60 | fp_csr.write('\n') 61 | 62 | fp.close() 63 | fp_csr.close() 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /src/spmm/naive_spmm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "timers.h" 4 | #include "matrix_generator.h" 5 | #include "zsim_hooks.h" 6 | 7 | 8 | 9 | 10 | __attribute__ ((noinline)) int avoid(uintptr_t matrix, uintptr_t matrix2, uintptr_t vec, uintptr_t vec2, uintptr_t out, uintptr_t out2, uint32_t *bitmap, uint32_t *bitmap2, int bitmap_size, int bitmap2_size,int size) 11 | { 12 | static int w; 13 | w = w + 10; 14 | w = w - 5; 15 | return w; 16 | } 17 | 18 | 19 | 20 | 21 | int main(int argc, char **argv) { 22 | 23 | 24 | 25 
| 26 | char* path_A; 27 | char* path_B; 28 | 29 | if (argc == 3) 30 | { 31 | path_A = argv[1]; 32 | path_B = argv[2]; 33 | } 34 | int value; 35 | int flag; 36 | struct matrix A,B,C; 37 | A = read_matrix(path_A); 38 | B = read_matrix(path_B); 39 | C.array = (float *)malloc(sizeof(float)*A.size*A.size); 40 | 41 | for(int i=0; i nnz ): 27 | del row_col_list[-1] 28 | print row_col_list 29 | return row_col_list 30 | col = col +1 31 | if(col == cols): 32 | col = 0 33 | row = row + 1 34 | if(row == rows and col == cols): 35 | print row_col_list 36 | return row_col_list 37 | 38 | 39 | 40 | def print_matrix_to_file(adjacency_list,rows,cols): 41 | 42 | os.system("touch ./matrix.out") 43 | with open("matrix.out","w") as f: 44 | f.write(str(rows) +" "+str(cols)+'\n') 45 | print(adjacency_list) 46 | for element in adjacency_list: 47 | f.write (str(element[0])+" "+str(element[1])+'\n') 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | rows = (int)(sys.argv[1]) 56 | cols = (int)(sys.argv[2]) 57 | nnz = (int)(sys.argv[3]) 58 | block_size = (int)(sys.argv[4]) 59 | locality = (int)(sys.argv[5]) # Specify number of elements per block 60 | 61 | adj = produce_matrix(rows,cols,nnz,block_size,locality) 62 | print_matrix_to_file(adj,rows,cols) 63 | -------------------------------------------------------------------------------- /src/matrix_utils/sort_edges.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | fp = open(sys.argv[1],'r') 4 | lines = fp.readlines() 5 | 6 | 7 | 8 | rows = (int)(lines[0].split(' ')[0]) 9 | columns = (int)(lines[0].split(' ')[1]) 10 | nnz = (int)(lines[0].split(' ')[2]) 11 | weight = (int)(lines[0].split(' ')[2]) 12 | print(rows) 13 | print (columns) 14 | print (nnz) 15 | i=0 16 | no_value = 0 17 | nodes = [[] for x in range(rows+1)] 18 | values = [[] for x in range(rows+1)] 19 | nnz_correct = 0 20 | for line in lines: 21 | max = 0 22 | #print (line) 23 | if i != 0: 24 | splits = line.split(' ') 25 | node1 
=(int)(line.split(' ')[0]) 26 | #if(node1 > rows+1): 27 | # print ("Error"+str(node1)) 28 | if(node1 > max): 29 | max = node1 30 | node2 =(int)(line.split(' ')[1]) 31 | if(len(splits) == 3): 32 | value =(float)(line.split(' ')[2]) 33 | else: 34 | value = 1 35 | #nodes[node1].append((node2,value)) 36 | nnz_correct = nnz_correct + 1 37 | 38 | i = i + 1 39 | 40 | #print(nodes) 41 | print(nnz_correct) 42 | print(max) 43 | fp_sort = open(sys.argv[2],'w') 44 | 45 | 46 | 47 | fp_sort.write((str)(rows)) 48 | fp_sort.write(' ') 49 | 50 | fp_sort.write((str)(rows)) 51 | fp_sort.write(' ') 52 | 53 | nnz_sanity = 0 54 | for i in range(rows): 55 | for j in range(len(nodes[i])): 56 | nnz_sanity = nnz_sanity + 1 57 | 58 | fp_sort.write((str)(nnz_sanity)) 59 | fp_sort.write(' ') 60 | 61 | fp_sort.write((str)(1)) 62 | fp_sort.write('\n') 63 | 64 | nnz_sanity = 0 65 | for i in range(rows): 66 | for j in range(len(nodes[i])): 67 | fp_sort.write((str)(i)) 68 | fp_sort.write(' ') 69 | fp_sort.write((str)(nodes[i][j][0])) 70 | fp_sort.write(' ') 71 | fp_sort.write((str)(nodes[i][j][1])) 72 | fp_sort.write('\n') 73 | nnz_sanity = nnz_sanity + 1 74 | 75 | 76 | print(nnz_sanity) 77 | fp.close() 78 | fp_sort.close() 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/storage/storage.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | 5 | 6 | 7 | def storage_calculation(file,b2,b1,b0,output): 8 | bitmap2 = {} 9 | bitmap1 = {} 10 | bitmap0 = {} 11 | row_ptr = [] 12 | col_ind = [] 13 | values = [] 14 | with open(file) as f: 15 | lines = f.readlines() 16 | rows = (int)(lines[0]) 17 | print("Rows = "+(str)(rows)) 18 | bitmap2_ratio = (int)(b2) 19 | bitmap1_ratio = (int)(b1) 20 | bitmap0_ratio = (int)(b0) 21 | 22 | bitmap0_size = (rows*rows)/bitmap0_ratio 23 | bitmap1_size = (bitmap0_size)/bitmap1_ratio 24 | bitmap2_size = (bitmap1_size)/bitmap2_ratio 25 | nnz = 
(int)(lines[1]) 26 | index = 2 27 | for i in range(rows+1): 28 | row_ptr.append(int(lines[index])) 29 | index = index + 1 30 | for i in range(nnz): 31 | col_ind.append(int(lines[index])) 32 | index = index + 1 33 | for i in range(nnz): 34 | values.append(float(lines[index])) 35 | index = index + 1 36 | 37 | for i in range(rows): 38 | elements = row_ptr[i+1] - row_ptr[i] 39 | #print elements 40 | for j in range(elements): 41 | row_index = i 42 | column_index = col_ind[row_ptr[i]+j] 43 | #print ("Row = " + str(row_index)) 44 | #print ("Column = " + str(column_index)) 45 | bitmap0[(row_index*rows+column_index)/bitmap0_ratio] = 1 46 | 47 | 48 | for i in range(bitmap0_size): 49 | if(bitmap0.get(i,-1)!=-1): 50 | bitmap1[i/bitmap1_ratio] = 1 51 | 52 | for i in range(bitmap1_size): 53 | if(bitmap1.get(i,-1)!=-1): 54 | bitmap2[i/bitmap2_ratio] = 1 55 | 56 | storage = len(bitmap0.keys())*bitmap0_ratio*4*8 + len(bitmap1.keys())*bitmap1_ratio + len(bitmap2.keys())*bitmap2_ratio + bitmap2_size 57 | storage = (storage*1.0)/(8.0) 58 | #print (row_ptr) 59 | #print (col_ind) 60 | #print (values) 61 | f = open(output,"a+") 62 | f.write(str(b2)+'.'+str(b1)+'.'+str(b0)+'\n') 63 | f.write("Storage:"+str(storage)+'\n') 64 | f.write("Rows:"+str(rows)) 65 | f.write("Bitmap2:"+str(bitmap2_size)+'\n') 66 | f.write("Bitmap1:"+str(len(bitmap2.keys())*bitmap2_ratio)+'\n') 67 | f.write("Bitmap0:"+str(len(bitmap1.keys())*bitmap1_ratio)+'\n') 68 | f.write("CSR Storage:"+str(len(row_ptr)*4+len(col_ind)*4+len(values)*4)+'\n') 69 | f.close() 70 | 71 | 72 | b2 = sys.argv[4] 73 | b1 = sys.argv[3] 74 | b0 = sys.argv[2] 75 | 76 | 77 | storage_calculation('./'+sys.argv[1],b2,b1,b0,sys.argv[1].split('/')[-1]) 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/bitmap/csr.h: -------------------------------------------------------------------------------- 1 | #ifndef CSR_H 2 | #define CSR_H 3 | 
#include 4 | #include 5 | #define true 1 6 | #define false 0 7 | /* Compressed Sparse Row matrix: row_ptr holds size+1 offsets into col_ptr/val, col_ptr the column index of each non-zero, val its value; size is the number of rows. */ 8 | struct csr{ 9 | 10 | int *row_ptr; 11 | int *col_ptr; 12 | float *val; 13 | int size; 14 | }; 15 | 16 | typedef struct csr csr; 17 | 18 | /* Reads a CSR matrix from the text file at path. Expected token order: rows, nnz, then rows+1 row offsets, nnz column indices and nnz float values. The three arrays are allocated with malloc and returned inside the struct; nothing here frees them, so presumably the caller owns them. NOTE(review): the fopen, fscanf and malloc results are not checked and fd is never closed; the locals columns and total_nnz are unused. */ 19 | csr read_csr(char *path) 20 | { 21 | printf("Read matrix %s\n",path); 22 | FILE *fd; 23 | fd = fopen(path,"r"); 24 | 25 | int rows, columns, nnz; 26 | 27 | fscanf(fd,"%d",&rows); 28 | fscanf(fd,"%d",&nnz); 29 | printf("Rows = %d Non-zero = %d \n",rows,nnz); 30 | 31 | csr csr_out; 32 | int total_nnz = 0 ; 33 | 34 | csr_out.row_ptr = (int *) malloc((rows+1)*sizeof(int)); 35 | csr_out.col_ptr = (int *) malloc(nnz*sizeof(int)); 36 | csr_out.val = (float*) malloc(nnz*sizeof(float)); 37 | csr_out.size = rows; 38 | 39 | int row,col; 40 | float value; 41 | for(int w = 0 ; w < rows + 1; w++) 42 | { 43 | fscanf(fd,"%d",&row); 44 | csr_out.row_ptr[w] = row; 45 | } 46 | for(int w= 0 ; w < nnz; w++) 47 | { 48 | fscanf(fd,"%d",&col); 49 | csr_out.col_ptr[w] = col; 50 | } 51 | for(int w= 0 ; w < nnz; w++) 52 | { 53 | fscanf(fd,"%f",&value); 54 | csr_out.val[w] = value; 55 | } 56 | 57 | printf("Finished reading csr %s\n",path); 58 | return csr_out ; 59 | 60 | } 61 | 62 | /* Builds a random CSR matrix with size rows and column indices drawn from [0, size). For each row it draws nnz_row = rand()%max + min, where max and min are nnz plus or minus 10 percent, divided by 100 and scaled by size (nnz is presumably a density percentage - confirm with callers), grows col_ptr and val with realloc, and picks nnz_row distinct random columns, each with value 1. NOTE(review): val is declared as a float pointer but is allocated and reallocated through int pointer casts; rand()%max divides by zero when max is 0; row_ptr[0] is never assigned before row_ptr[i+1] = row_ptr[i] + nnz_row reads it; the distinct-column loop cannot finish if nnz_row exceeds size; each realloc result overwrites the only copy of the pointer; the local temp is unused. */ 63 | csr csr_generator(int size,int nnz ){ 64 | 65 | csr csr_out; 66 | int total_nnz = 0 ; 67 | srand(time(NULL)); 68 | csr_out.row_ptr = (int *) malloc((size+1)*sizeof(int)); 69 | csr_out.col_ptr = (int *) malloc(5*sizeof(int)); 70 | csr_out.val = (int*) malloc(5*sizeof(int)); 71 | 72 | 73 | printf("Create CSR with size %d and sparsity %d\n", size, nnz); 74 | csr_out.size = size; 75 | int index; 76 | int last_nnz = 0; 77 | int nnz_row = 0; 78 | int i = 0; 79 | int j = 0; 80 | int *visited; 81 | int *temp; 82 | int max; 83 | int min; 84 | for( i = 0; i < size; i++) 85 | { 86 | 87 | last_nnz += nnz_row; 88 | 89 | max = (int)((nnz + nnz/10.0)/100.0 * size); 90 | min = (int)((nnz - nnz/10.0)/100.0 * size); 91 | nnz_row = rand()%max + min ; 92 | total_nnz += nnz_row; 93 | printf("Total_nnz = %d \n",total_nnz); 
94 | csr_out.col_ptr = (int *)realloc(csr_out.col_ptr, sizeof(int)*total_nnz); 95 | csr_out.val = (int *)realloc(csr_out.val, sizeof(int)*total_nnz); 96 | visited = (int *)malloc(sizeof(int)*size); 97 | 98 | for(int w =0 ; w < size; w++) visited[w] = 0; 99 | 100 | for ( j=0; j < nnz_row ; j++){ 101 | csr_out.val[last_nnz+j] = 1; 102 | do{ 103 | index = rand()%size; 104 | }while(visited[index]); 105 | 106 | visited[index] = true; 107 | csr_out.col_ptr[last_nnz+j] = index; 108 | printf("Column = %d \n", index); 109 | } 110 | csr_out.row_ptr[i+1] = csr_out.row_ptr[i]+nnz_row; 111 | free(visited); 112 | } 113 | 114 | 115 | return csr_out; 116 | 117 | } 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /src/spmm/gemm_csr.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "timers.h" 4 | #include "matrix_generator.h" 5 | #include "zsim_hooks.h" 6 | 7 | static void sgemm_csr(int M, int N, int K, struct csr A, struct csr B, struct csr C){ 8 | int n, k, m, nnz=0; 9 | int a_nnz_pr, b_nnz_pr; 10 | int nnz_C; 11 | int a_i, b_i; 12 | int c1,c2; 13 | float a, b, sum; 14 | int pointer_A, pointer_B; 15 | c1 = 0; 16 | c2 = 0; 17 | nnz_C = 0; 18 | printf("M = %d, N = %d \n", M, N); 19 | //getchar(); 20 | C.row_ptr = (int*)malloc((M+1)*sizeof(int)); 21 | C.row_ptr[0] = 0; 22 | C.col_ptr = (int*)malloc(M*M*sizeof(int)); 23 | for(m=0; m B.row_ptr[pointer_B] && b_nnz_pr!=1 ){ 66 | 67 | 68 | do { 69 | c++; 70 | pointer_B++; 71 | } 72 | while( (A.col_ptr[pointer_A] > B.row_ptr[pointer_B]) && b_nnz_pr >= c ); 73 | 74 | 75 | } 76 | else if( A.col_ptr[pointer_A] < B.row_ptr[pointer_B] ){ 77 | 78 | pointer_A++; 79 | 80 | } 81 | } 82 | } 83 | 84 | if( sum != 0 ){ 85 | C.row_ptr[m+1]++; 86 | C.col_ptr[nnz_C]=n; 87 | } 88 | } 89 | } 90 | 91 | printf("A had %d non zero rows \n", c1 ); 92 | printf("B had %d non zero rows \n", c2 ); 93 | 94 | } 95 | 96 | 97 | 98 | int 
main(int argc, char **argv) { 99 | 100 | 101 | 102 | 103 | char* path_A; 104 | char* path_B; 105 | 106 | if (argc == 3) 107 | { 108 | path_A = argv[1]; 109 | path_B = argv[2]; 110 | } 111 | 112 | int flag; 113 | printf("Compute SGEMM using csr\n"); 114 | struct csr A,B,C; 115 | 116 | 117 | 118 | A = read_csr(path_A); 119 | 120 | printf("Matrix A is ready"); 121 | printf("B's path: %s \n", path_B); 122 | B = read_csc(path_B); 123 | 124 | zsim_roi_begin(); 125 | sgemm_csr(A.size,A.size,A.size,A,B,C); 126 | zsim_roi_end(); 127 | 128 | } 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SMASH 2 | 3 | Source code of the sparse matrix kernels and utilities used to evaluate the schemes presented in 4 | the MICRO'19 paper: 5 | 6 | >Konstantinos Kanellopoulos, Nandita Vijaykumar, Christina Giannoula, Roknoddin Azizi, Skanda Koppula, Nika Mansouri Ghiasi, Taha Shahroodi, Juan Gomez-Luna, and Onur Mutlu, 7 | [**SMASH: Co-designing Software Compression and Hardware-Accelerated Indexing for Efficient Sparse Matrix Operations**](https://people.inf.ethz.ch/omutlu/pub/SMASH-sparse-matrix-software-hardware-acceleration_micro19.pdf) 8 | Proceedings of the 52nd International Symposium on Microarchitecture (MICRO), Columbus, OH, USA, October 2019. 9 | 10 | Please cite the above work if you make use of the tools provided in this repository. 
11 | 12 | The talk videos (on YouTube) and the slides of the paper are available here: 13 | >[Lightning Talk Video](https://youtu.be/VN0PQ5zgLGg) 14 | 15 | >[Full Talk Video](https://youtu.be/LWYVQ3o_SdU) 16 | 17 | >[Slides(pdf)](https://people.inf.ethz.ch/omutlu/pub/SMASH-sparse-matrix-software-hardware-acceleration_micro19-talk.pdf) 18 | 19 | >[Slides(pptx)](https://people.inf.ethz.ch/omutlu/pub/SMASH-sparse-matrix-software-hardware-acceleration_micro19-talk.pptx) 20 | 21 | 22 | 23 | 24 | ## Using the Hierarchical Bitmap Format 25 | 26 | We provide the source code to construct three-level hierarchical bitmaps under `/src/bitmap/bitmap.h`. 27 | 28 | 29 | ## 1. Initialization 30 | 31 | ### To initialize the bitmaps perform the following steps: 32 | 33 | Read the matrix using the CSR Format. 34 | 35 | ``` 36 | csr matrix_csr = read_csr(path); 37 | ``` 38 | 39 | Construct the Bitmaps and the Non-Zero Values Array. 40 | 41 | ``` 42 | construct_format(&matrix_smash,comp0,comp1,comp2); 43 | construct_bitmap0_nza(&matrix_smash,&matrix_csr); 44 | construct_bitmap1(&matrix_smash); 45 | construct_bitmap2(&matrix_smash); 46 | ``` 47 | 48 | 49 | ## 2. Using the Hierarchical Bitmaps 50 | 51 | The main function to manipulate the bitmaps is: 52 | `index_bitmaps(smash *format)` 53 | 54 | This function uses the following hardware intrinsic to quickly discover the set bits of the bitmaps: 55 | 56 | ``` 57 | uint64_t find_set_bit(smash* format, uint64_t a) 58 | { 59 | .. 60 | uint64_t bit = __builtin_ctzl(a); // Hardware primitive 61 | .. 62 | } 63 | 64 | ``` 65 | 66 | In case your system does not support the aforementioned instruction, we provide an alternative function that sequentially reads the bits of the bitmaps. 67 | 68 | ## 3. SpMV and SpMM 69 | 70 | We provide one SpMV and one SpMM kernel which make use of the Hierarchical Bitmap format. 
You can execute these two applications using the following instructions: 71 | ``` 72 | ./spmm_bitmap.e -f comp0 -s comp1 -t comp2 -i matrix_csr 73 | ./spmv_bitmap.e -f comp0 -s comp1 -t comp2 -i matrix_csr 74 | ``` 75 | You need to specify the compression ratios and the input matrix using the appropriate options. 76 | 77 | SpMM currently supports the square of a matrix. To avoid transposing, we swap the roles of the rows and columns for matrix B. 78 | 79 | 80 | ## 4. Locality Generator 81 | 82 | We provide a Python tool to produce matrices with various localities of sparsity. 83 | 84 | To produce a matrix execute the script in the following way: 85 | 86 | 87 | ``` 88 | python locality_generator.py #num_rows #num_cols #num_of_nonzero_elements #block_size #num_elements/block 89 | ``` 90 | 91 | Using this script you can produce matrices with different discrete localities based on the size of the block. 92 | 93 | 94 | ## 5. Storage Calculation 95 | 96 | To calculate the storage of the matrix execute the following script: 97 | 98 | ``` 99 | python storage.py #input_file #compression_ratio0 #compression_ratio1 #compression_ratio2 100 | ``` 101 | 102 | The storage calculation script can be used to explore the design space of different compression ratios and bitmap levels. 103 | 104 | ## 6. Matrix Utilities 105 | 106 | We provide handy utilities to transform the matrices to the CSR format. We found out that multiple matrices from SNAP and UF contain mistakes (e.g., wrong number of non-zero elements). We provide the script `sort_edges.py` to produce a correct matrix and sort the non-zero elements. The resulting matrix of this script is fed into the `florida_to_csr.py` script to produce the final matrix used by the `read_csr` function. 
107 | 108 | 109 | 110 | ## Contact 111 | 112 | Konstantinos Kanellopoulos (konkanello@gmail.com) 113 | 114 | 115 | -------------------------------------------------------------------------------- /src/spmm/out: -------------------------------------------------------------------------------- 1 | Compression ratio 0 of A 2 2 | Compression ratio 0 of B 2 3 | Compression ratio 1 of A 2 4 | Compression ratio 1 of B 2 5 | Compression ratio 2 of A 2 6 | Compression ratio 2 of B 2 7 | Input path ../bitmap/input_test 8 | Read matrix ../bitmap/input_test 9 | Rows = 4 Non-zero = 7 10 | Finished reading csr ../bitmap/input_test 11 | Read matrix ../bitmap/input_test 12 | Rows = 4 Non-zero = 7 13 | Finished reading csr ../bitmap/input_test 14 | Initialized Compression Ratios 15 | Initialized bitmap0 16 | Row 0 has 2 elements 17 | Element in row 0 and column 2 and block 1 18 | Mask = 0x0000000000000002 19 | Index = 0 20 | Element in row 0 and column 3 and block 1 21 | Mask = 0x0000000000000002 22 | Index = 0 23 | Row 1 has 3 elements 24 | Element in row 1 and column 1 and block 2 25 | Mask = 0x0000000000000004 26 | Index = 0 27 | Element in row 1 and column 2 and block 3 28 | Mask = 0x0000000000000008 29 | Index = 0 30 | Element in row 1 and column 3 and block 3 31 | Mask = 0x0000000000000008 32 | Index = 0 33 | Row 2 has 1 elements 34 | Element in row 2 and column 3 and block 5 35 | Mask = 0x0000000000000020 36 | Index = 0 37 | Row 3 has 1 elements 38 | Element in row 3 and column 3 and block 7 39 | Mask = 0x0000000000000080 40 | Index = 0 41 | 42 | Constructed bitmap0 and nza 43 | 0 1 1 1 0 1 0 1 44 | Set bit 1 of bitmap0 45 | Mask = 0x0000000000000002 46 | Index = 0 47 | Mask = 0x0000000000000001 48 | Index = 0 49 | Set bit 2 of bitmap0 50 | Mask = 0x0000000000000004 51 | Index = 0 52 | Mask = 0x0000000000000002 53 | Index = 0 54 | Mask = 0x0000000000000008 55 | Index = 0 56 | Mask = 0x0000000000000002 57 | Index = 0 58 | Set bit 5 of bitmap0 59 | Mask = 0x0000000000000020 60 
| Index = 0 61 | Mask = 0x0000000000000004 62 | Index = 0 63 | Set bit 7 of bitmap0 64 | Mask = 0x0000000000000080 65 | Index = 0 66 | Mask = 0x0000000000000008 67 | Index = 0 68 | Constructed Bitmap1 69 | 1 1 1 1 70 | Mask = 0x0000000000000001 71 | Index = 0 72 | Mask = 0x0000000000000001 73 | Index = 0 74 | Mask = 0x0000000000000002 75 | Index = 0 76 | Mask = 0x0000000000000001 77 | Index = 0 78 | Mask = 0x0000000000000004 79 | Index = 0 80 | Mask = 0x0000000000000002 81 | Index = 0 82 | Mask = 0x0000000000000008 83 | Index = 0 84 | Mask = 0x0000000000000002 85 | Index = 0 86 | Constructed Bitmap2 87 | 1 1 88 | Initialized Compression Ratios 89 | Initialized bitmap0 90 | Row 0 has 2 elements 91 | Element in row 0 and column 2 and block 1 92 | Mask = 0x0000000000000002 93 | Index = 0 94 | Element in row 0 and column 3 and block 1 95 | Mask = 0x0000000000000002 96 | Index = 0 97 | Row 1 has 3 elements 98 | Element in row 1 and column 1 and block 2 99 | Mask = 0x0000000000000004 100 | Index = 0 101 | Element in row 1 and column 2 and block 3 102 | Mask = 0x0000000000000008 103 | Index = 0 104 | Element in row 1 and column 3 and block 3 105 | Mask = 0x0000000000000008 106 | Index = 0 107 | Row 2 has 1 elements 108 | Element in row 2 and column 3 and block 5 109 | Mask = 0x0000000000000020 110 | Index = 0 111 | Row 3 has 1 elements 112 | Element in row 3 and column 3 and block 7 113 | Mask = 0x0000000000000080 114 | Index = 0 115 | 116 | Constructed bitmap0 and nza 117 | 0 1 1 1 0 1 0 1 118 | Set bit 1 of bitmap0 119 | Mask = 0x0000000000000002 120 | Index = 0 121 | Mask = 0x0000000000000001 122 | Index = 0 123 | Set bit 2 of bitmap0 124 | Mask = 0x0000000000000004 125 | Index = 0 126 | Mask = 0x0000000000000002 127 | Index = 0 128 | Mask = 0x0000000000000008 129 | Index = 0 130 | Mask = 0x0000000000000002 131 | Index = 0 132 | Set bit 5 of bitmap0 133 | Mask = 0x0000000000000020 134 | Index = 0 135 | Mask = 0x0000000000000004 136 | Index = 0 137 | Set bit 7 of 
bitmap0 138 | Mask = 0x0000000000000080 139 | Index = 0 140 | Mask = 0x0000000000000008 141 | Index = 0 142 | Constructed Bitmap1 143 | 1 1 1 1 144 | Mask = 0x0000000000000001 145 | Index = 0 146 | Mask = 0x0000000000000001 147 | Index = 0 148 | Mask = 0x0000000000000002 149 | Index = 0 150 | Mask = 0x0000000000000001 151 | Index = 0 152 | Mask = 0x0000000000000004 153 | Index = 0 154 | Mask = 0x0000000000000002 155 | Index = 0 156 | Mask = 0x0000000000000008 157 | Index = 0 158 | Mask = 0x0000000000000002 159 | Index = 0 160 | Constructed Bitmap2 161 | 1 1 162 | 1 1 163 | 1 1 1 1 164 | 0 1 1 1 0 1 0 1 165 | 1 1 166 | 1 1 1 1 167 | 0 1 1 1 0 1 0 1 168 | NZA blocks A 5 169 | NZA blocks B 5 170 | Move Bitmap B 171 | Found set bit at Bitmap2[0] = 0, current_index2 = 0 172 | Counter2= 0 173 | Found set bit at Bitmap1[0] = 0 174 | Counter1= 0 175 | Register= 0x00000000000000ae 176 | Found set bit at Bitmap0[0] = 1 current_block0 = 0 177 | Counter0= 1 178 | -------------------------------------------------------------------------------- /src/spmv/spmv_bitmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "zsim_hooks.h" 4 | #include 5 | #include "../bitmap/bitmap.h" 6 | #include "../bitmap/csr.h" 7 | #include 8 | #include 9 | #include "zsim_hooks.h" 10 | // Global Variables 11 | 12 | int *x; // dense vector 13 | int *b; // output matrix 14 | smash matrix_smash; //smash matrix format 15 | char *path; 16 | int comp0,comp1,comp2; 17 | int K; 18 | 19 | 20 | __attribute__ ((noinline)) int send_data_to_bmu(smash *format){ // Include bitmap.h in the simulator and catch routine to save smash format in the simulator using PIN_SafeCopy() (In Sniper you can do it using builtin functions). 21 | static int i=1; 22 | i = i+10; 23 | i = i -5; 24 | return i; 25 | } 26 | 27 | __attribute__ ((noinline)) int index_bitmap_zsim(int* i, int* j){ // Catch routine in zsim/sniper to perform the indexing in the simulator. 
Simulator needs to write back the values using PIN_SafeCopy() (In Sniper you can do it using builtin functions) the inverse way. 28 | static int w=5; 29 | w = w+10; 30 | w = w -5; 31 | return w; 32 | } 33 | 34 | 35 | 36 | int parse_opt(int argc, char *argv[]) 37 | { 38 | 39 | fprintf(stderr,"Usage: ./spmv_bitmap -f compression_ratio0 -s compression_ratio1 -t compression_ratio2 -k #iterations\n"); 40 | 41 | int opt; 42 | while((opt = getopt(argc, argv,"f:s:t:i:k:"))){ 43 | if(opt == -1) 44 | break; 45 | switch(opt){ 46 | 47 | case 'f': 48 | matrix_smash.compression_ratio0 = atoi(optarg); 49 | comp0 = atoi(optarg); 50 | printf("Compression ratio 0 %d \n",matrix_smash.compression_ratio0); 51 | break; 52 | case 's': 53 | matrix_smash.compression_ratio1 = atoi(optarg); 54 | comp1 = atoi(optarg); 55 | 56 | printf("Compression ratio 1 %d \n",matrix_smash.compression_ratio1); 57 | break; 58 | case 't': 59 | matrix_smash.compression_ratio2 = atoi(optarg); 60 | comp2 = atoi(optarg); 61 | 62 | printf("Compression ratio 2 %d \n",matrix_smash.compression_ratio2); 63 | break; 64 | case 'i': 65 | path = optarg; 66 | printf("Input path %s \n",path); 67 | break; 68 | case 'k': 69 | K = atoi(optarg); 70 | printf("K = %d \n",K); 71 | break; 72 | default: 73 | return 1; 74 | 75 | 76 | } 77 | } 78 | 79 | 80 | 81 | } 82 | 83 | 84 | void spmv(){ 85 | 86 | int block; 87 | int cur_element = 0; 88 | int i,j; 89 | int bitmap0_contribution; 90 | int bitmap1_contribution; 91 | int bitmap2_contribution; 92 | 93 | 94 | printf("NZA blocks %d\n",matrix_smash.nza_blocks); 95 | 96 | 97 | for(block = 0; block < matrix_smash.nza_blocks; block++){ 98 | 99 | index_bitmaps(&matrix_smash); 100 | 101 | #ifdef SIM 102 | index_bitmap_zsim(&i,&j); // Signal zsim/sniper to receive the indices 103 | #endif 104 | 105 | #ifdef NATIVE 106 | bitmap0_contribution = (matrix_smash.counter0%matrix_smash.compression_ratio1)*matrix_smash.compression_ratio0; // Check the paper to see what are these computations 107 | 
bitmap1_contribution = (matrix_smash.counter1%matrix_smash.compression_ratio2)*matrix_smash.compression_ratio1*matrix_smash.compression_ratio0; 108 | bitmap2_contribution = matrix_smash.counter2*matrix_smash.compression_ratio2*matrix_smash.compression_ratio0*matrix_smash.compression_ratio1; 109 | 110 | i = bitmap0_contribution + bitmap1_contribution + bitmap2_contribution; 111 | j = bitmap0_contribution + bitmap1_contribution + bitmap2_contribution; 112 | 113 | i = i / matrix_smash.columns; 114 | j = j % matrix_smash.columns; 115 | 116 | printf("Row A = %d \n", i); 117 | printf("Col B = %d \n", j); 118 | 119 | #endif 120 | 121 | 122 | for(int e=0; e < matrix_smash.compression_ratio0; e++){ 123 | 124 | b[i] = x[j]*matrix_smash.nza[cur_element]; 125 | cur_element++; 126 | j++; 127 | if(j > matrix_smash.columns){ i++; j=0;} 128 | 129 | } 130 | } 131 | 132 | 133 | } 134 | 135 | int main(int argc, char* argv[]) 136 | { 137 | 138 | 139 | parse_opt(argc,argv); 140 | 141 | csr matrix_csr = read_csr(path); 142 | 143 | construct_format(&matrix_smash,comp0,comp1,comp2); 144 | construct_bitmap0_nza(&matrix_smash,&matrix_csr); 145 | construct_bitmap1(&matrix_smash); 146 | construct_bitmap2(&matrix_smash); 147 | 148 | matrix_smash.current_register0 = matrix_smash.bitmap0[0]; 149 | matrix_smash.current_register1 = matrix_smash.bitmap1[0]; 150 | matrix_smash.current_register2 = matrix_smash.bitmap2[0]; 151 | 152 | 153 | print_bitmaps(&matrix_smash); 154 | 155 | 156 | b = (float*) malloc (sizeof(float)*matrix_smash.rows); 157 | x = (float*) malloc (sizeof(float)*matrix_smash.rows); 158 | 159 | 160 | #ifdef SIM 161 | int value; 162 | zsim_roi_begin(); 163 | value = send_data_to_bmu(&matrix_smash); 164 | COMPILER_BARRIER(); // Compiler barrier to be sure that smash is stored in the simulator. 
165 | #endif 166 | 167 | for(int i = 0; i < matrix_smash.rows ; i++) 168 | { 169 | x[i] = x[i]+1.0; 170 | b[i] = b[i]+1.0; // warmup the caches with the vector and the output to avoid weird caching effects 171 | } 172 | 173 | #ifdef NATIVE 174 | clock_t t; 175 | t = clock(); 176 | #endif 177 | 178 | spmv(); 179 | 180 | #ifdef SIM 181 | zsim_roi_end(); // run at least 10B instructions. 182 | #endif 183 | 184 | #ifdef NATIVE 185 | double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds 186 | printf("SpMV-Time:%f\n", time_taken); 187 | #endif 188 | 189 | 190 | return 0; 191 | 192 | } 193 | -------------------------------------------------------------------------------- /src/spmm/matrix_generator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #define true 0 6 | #define false 1 7 | #include 8 | #include "bitmap.h" 9 | #include "csr.h" 10 | 11 | 12 | int rangeRandom (int min, int max){ 13 | srand(time(NULL)); // Seed the time 14 | int n = max - min + 1; 15 | int remainder = RAND_MAX % n; 16 | int x; 17 | do{ 18 | x = rand(); 19 | }while (x >= RAND_MAX - remainder); 20 | return min + x % n; 21 | } 22 | 23 | struct csr csr_generator(int size,int nnz ){ 24 | 25 | struct csr csr_out; 26 | int total_nnz = 0 ; 27 | srand(time(NULL)); 28 | csr_out.row_ptr = (int *) malloc((size+1)*sizeof(int)); 29 | csr_out.col_ptr = (int *) malloc(5*sizeof(int)); 30 | csr_out.val = (int*) malloc(5*sizeof(int)); 31 | 32 | 33 | printf("Create CSR with size %d and sparsity %d\n", size, nnz); 34 | int index; 35 | int last_nnz = 0; 36 | int nnz_row = 0; 37 | int i = 0; 38 | int j = 0; 39 | int *visited; 40 | int *temp; 41 | int max; 42 | int min; 43 | for( i = 0; i < size; i++) 44 | { 45 | 46 | last_nnz += nnz_row; 47 | 48 | max = (int)((nnz + nnz/10.0)/100.0 * size); 49 | min = (int)((nnz - nnz/10.0)/100.0 * size); 50 | nnz_row = rand()%max + min ; 51 | total_nnz += nnz_row; 52 | //printf("Total_nnz 
= %d \n",total_nnz); 53 | csr_out.col_ptr = (int *)realloc(csr_out.col_ptr, sizeof(int)*total_nnz); 54 | csr_out.val = (int *)realloc(csr_out.val, sizeof(int)*total_nnz); 55 | visited = (int *)malloc(sizeof(int)*size); 56 | 57 | for(int w =0 ; w < size; w++) visited[w] = 0; 58 | 59 | for ( j=0; j < nnz_row ; j++){ 60 | csr_out.val[last_nnz+j] = 1; 61 | do{ 62 | index = rand()%size; 63 | }while(visited[index]); 64 | 65 | visited[index] = true; 66 | csr_out.col_ptr[last_nnz+j] = index; 67 | // printf("Column = %d \n", index); 68 | } 69 | csr_out.row_ptr[i+1] = csr_out.row_ptr[i]+nnz_row; 70 | free(visited); 71 | } 72 | 73 | 74 | return csr_out; 75 | 76 | } 77 | 78 | 79 | 80 | void print_binary(int n ) 81 | { 82 | 83 | while (n) { 84 | if (n & 1) 85 | printf("1"); 86 | else 87 | printf("0"); 88 | 89 | n >>= 1; 90 | } 91 | printf("\n"); 92 | 93 | return; 94 | 95 | } 96 | 97 | 98 | struct matrix read_matrix(char *path) 99 | { 100 | printf("Read matrix %s\n",path); 101 | FILE *fd; 102 | fd = fopen(path,"r"); 103 | 104 | int rows, columns, nnz, weight; 105 | 106 | fscanf(fd,"%d",&rows); 107 | fscanf(fd,"%d",&columns); 108 | fscanf(fd,"%d",&nnz); 109 | fscanf(fd,"%d",&weight); 110 | 111 | printf("Rows = %d Columns = %d Non-zero = %d \n",rows,columns,nnz); 112 | 113 | struct matrix mat; 114 | mat.array = (int *)malloc(sizeof(int)*(rows+1)*(rows+1)); 115 | mat.size = rows; 116 | for (int i = 0 ; i < rows*rows; i++) 117 | { 118 | 119 | mat.array[i] = 0 ; 120 | } 121 | 122 | int cache_lines = ((rows)/16)*rows; 123 | int size_bitmap= cache_lines/32+1; 124 | if(cache_lines % 32 == 0) size_bitmap = cache_lines/32; 125 | else size_bitmap = cache_lines/32+1; 126 | mat.bitmap = (uint32_t*)malloc(sizeof(uint32_t)*size_bitmap); 127 | mat.bitmap_size = size_bitmap; 128 | 129 | 130 | for(int i=0; i size) nnz_rows_loc = size; 322 | int nnz_elements_per_row = ( nnz_elements / nnz_rows_loc); 323 | if(nnz_elements % nnz_rows_loc == 0 ) nnz_elements_per_row = nnz_elements / nnz_rows_loc; 324 
| else nnz_elements_per_row = nnz_elements / nnz_rows_loc+1; 325 | 326 | printf("Non-zero elements: %d\n", nnz_elements); 327 | printf("Non-zero rows: %d\n", nnz_rows); 328 | printf("Non-zero rows locality: %d\n", nnz_rows_loc); 329 | printf("Non-zero elements per row: %d\n", nnz_elements_per_row); 330 | for( i = 0; i < nnz_rows_loc; i++) 331 | { 332 | 333 | 334 | visited = (int *)malloc(sizeof(int)*size); 335 | 336 | for(int w =0 ; w < size; w++) visited[w] = 0; 337 | 338 | for ( j=0; j < nnz_elements_per_row ; j++){ 339 | 340 | do{ 341 | index = rand()%size; 342 | }while(visited[index]); 343 | 344 | visited[index] = true; 345 | mat.array[i*size+j] = 1 ; 346 | cache_line = (i*sizeof(int)*size+j*sizeof(int))/64; 347 | int bitmap_index = cache_line/32; 348 | uint32_t temp = mat.bitmap[cache_line/32]; 349 | mat.bitmap[cache_line/32] = mat.bitmap[cache_line/32] | (uint32_t) ( 1 << ( cache_line%32)); 350 | 351 | //printf("Input to shift = %u, Cache line = %d, Bitmap[%d] before = %d, Bitmap[%d] after = %d \n", (uint32_t)(1 << (cache_line%32)), cache_line, cache_line/32, temp ,cache_line/32, mat.bitmap[cache_line/32]); 352 | 353 | //printf("Cache line of array[%d][%d] = %d Bitmap = %d \n",i,index,(i*sizeof(int)*size+index*sizeof(int))/64, mat.bitmap[cache_line/32]); 354 | 355 | } 356 | 357 | // free(visited); 358 | } 359 | 360 | 361 | free(visited); 362 | 363 | return mat; 364 | 365 | 366 | } 367 | 368 | 369 | 370 | 371 | -------------------------------------------------------------------------------- /src/spmv/matrix_generator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #define true 0 6 | #define false 1 7 | #define CACHE_LINE_SIZE 64 8 | #include 9 | struct matrix{ 10 | float *array; 11 | uint32_t *bitmap; 12 | int bitmap_size; 13 | int size; 14 | 15 | }; 16 | struct csr{ 17 | 18 | int *row_ptr; 19 | int *col_ptr; 20 | float *val; 21 | int size; 22 | }; 23 | 24 | struct 
bitmap_nnz{ 25 | 26 | float *nnz_array; 27 | uint32_t *bitmap; 28 | int bitmap_size; 29 | int nnz_size; 30 | int matrix_size; 31 | 32 | }; 33 | 34 | 35 | int rangeRandom (int min, int max){ 36 | srand(time(NULL)); // Seed the time 37 | int n = max - min + 1; 38 | int remainder = RAND_MAX % n; 39 | int x; 40 | do{ 41 | x = rand(); 42 | }while (x >= RAND_MAX - remainder); 43 | return min + x % n; 44 | } 45 | 46 | struct csr csr_generator(int size,int nnz ){ 47 | 48 | struct csr csr_out; 49 | int total_nnz = 0 ; 50 | srand(time(NULL)); 51 | csr_out.row_ptr = (int *) malloc((size+1)*sizeof(int)); 52 | csr_out.col_ptr = (int *) malloc(5*sizeof(int)); 53 | csr_out.val = (int*) malloc(5*sizeof(int)); 54 | 55 | 56 | printf("Create CSR with size %d and sparsity %d\n", size, nnz); 57 | int index; 58 | int last_nnz = 0; 59 | int nnz_row = 0; 60 | int i = 0; 61 | int j = 0; 62 | int *visited; 63 | int *temp; 64 | int max; 65 | int min; 66 | for( i = 0; i < size; i++) 67 | { 68 | 69 | last_nnz += nnz_row; 70 | 71 | max = (int)((nnz + nnz/10.0)/100.0 * size); 72 | min = (int)((nnz - nnz/10.0)/100.0 * size); 73 | nnz_row = rand()%max + min ; 74 | total_nnz += nnz_row; 75 | //printf("Total_nnz = %d \n",total_nnz); 76 | csr_out.col_ptr = (int *)realloc(csr_out.col_ptr, sizeof(int)*total_nnz); 77 | csr_out.val = (int *)realloc(csr_out.val, sizeof(int)*total_nnz); 78 | visited = (int *)malloc(sizeof(int)*size); 79 | 80 | for(int w =0 ; w < size; w++) visited[w] = 0; 81 | 82 | for ( j=0; j < nnz_row ; j++){ 83 | csr_out.val[last_nnz+j] = 1; 84 | do{ 85 | index = rand()%size; 86 | }while(visited[index]); 87 | 88 | visited[index] = true; 89 | csr_out.col_ptr[last_nnz+j] = index; 90 | // printf("Column = %d \n", index); 91 | } 92 | csr_out.row_ptr[i+1] = csr_out.row_ptr[i]+nnz_row; 93 | free(visited); 94 | } 95 | 96 | 97 | return csr_out; 98 | 99 | } 100 | 101 | 102 | 103 | void print_binary(int n ) 104 | { 105 | 106 | while (n) { 107 | if (n & 1) 108 | printf("1"); 109 | else 110 | 
printf("0"); 111 | 112 | n >>= 1; 113 | } 114 | printf("\n"); 115 | 116 | return; 117 | 118 | } 119 | 120 | struct bitmap_nnz read_matrix_bitmap_nnz(char *path) 121 | { 122 | printf("Read matrix %s\n",path); 123 | FILE *fd; 124 | fd = fopen(path,"r"); 125 | 126 | int rows, columns, nnz, weight; 127 | 128 | fscanf(fd,"%d",&rows); 129 | fscanf(fd,"%d",&columns); 130 | fscanf(fd,"%d",&nnz); 131 | fscanf(fd,"%d",&weight); 132 | 133 | printf("Rows = %d Columns = %d Non-zero = %d \n",rows,columns,nnz); 134 | 135 | struct bitmap_nnz mat; 136 | mat.matrix_size = rows; 137 | 138 | int size_bitmap; 139 | int cache_lines = (rows*rows); 140 | cache_lines = cache_lines/16; 141 | 142 | //if(cache_lines % 32 == 0) size_bitmap = (int)(cache_lines/32); 143 | size_bitmap = cache_lines/32+1; 144 | mat.bitmap = (uint32_t*)malloc(sizeof(uint32_t)*size_bitmap); 145 | printf("Bitmap size = %d \n", size_bitmap); 146 | getchar(); 147 | mat.bitmap_size = size_bitmap; 148 | mat.nnz_array = (float*)malloc(1); 149 | 150 | printf("Lets read the matrix\n"); 151 | for(int i=0; i size) nnz_rows_loc = size; 368 | int nnz_elements_per_row = ( nnz_elements / nnz_rows_loc); 369 | if(nnz_elements % nnz_rows_loc == 0 ) nnz_elements_per_row = nnz_elements / nnz_rows_loc; 370 | else nnz_elements_per_row = nnz_elements / nnz_rows_loc+1; 371 | 372 | printf("Non-zero elements: %d\n", nnz_elements); 373 | printf("Non-zero rows: %d\n", nnz_rows); 374 | printf("Non-zero rows locality: %d\n", nnz_rows_loc); 375 | printf("Non-zero elements per row: %d\n", nnz_elements_per_row); 376 | for( i = 0; i < nnz_rows_loc; i++) 377 | { 378 | 379 | 380 | visited = (int *)malloc(sizeof(int)*size); 381 | 382 | for(int w =0 ; w < size; w++) visited[w] = 0; 383 | 384 | for ( j=0; j < nnz_elements_per_row ; j++){ 385 | 386 | do{ 387 | index = rand()%size; 388 | }while(visited[index]); 389 | 390 | visited[index] = true; 391 | mat.array[i*size+j] = 1 ; 392 | cache_line = (i*sizeof(int)*size+j*sizeof(int))/64; 393 | int 
bitmap_index = cache_line/32; 394 | uint32_t temp = mat.bitmap[cache_line/32]; 395 | mat.bitmap[cache_line/32] = mat.bitmap[cache_line/32] | (uint32_t) ( 1 << ( cache_line%32)); 396 | 397 | //printf("Input to shift = %u, Cache line = %d, Bitmap[%d] before = %d, Bitmap[%d] after = %d \n", (uint32_t)(1 << (cache_line%32)), cache_line, cache_line/32, temp ,cache_line/32, mat.bitmap[cache_line/32]); 398 | 399 | //printf("Cache line of array[%d][%d] = %d Bitmap = %d \n",i,index,(i*sizeof(int)*size+index*sizeof(int))/64, mat.bitmap[cache_line/32]); 400 | 401 | } 402 | 403 | // free(visited); 404 | } 405 | 406 | 407 | free(visited); 408 | 409 | return mat; 410 | 411 | 412 | } 413 | 414 | 415 | 416 | 417 | -------------------------------------------------------------------------------- /src/spmm/spmm_bitmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "zsim_hooks.h" 4 | #include 5 | #include "../bitmap/bitmap.h" 6 | #include "../bitmap/csr.h" 7 | #include 8 | #include 9 | #include "zsim_hooks.h" 10 | 11 | // Global Variables 12 | 13 | int *C; // output matrix 14 | smash matrix_A_smash; //smash matrix format 15 | smash matrix_B_smash; //smash matrix format 16 | char *path; 17 | int K; 18 | int comp0,comp1,comp2; 19 | 20 | 21 | 22 | __attribute__ ((noinline)) int send_data_to_bmu(smash *format){ // Include bitmap.h in the simulator and catch routine to save smash format in the simulator using PIN_SafeCopy() (In Sniper you can do it using builtin functions). 23 | static int i=1; 24 | i = i+10; 25 | i = i -5; 26 | return i; 27 | } 28 | 29 | __attribute__ ((noinline)) int index_bitmap_zsim(int* i, int* j){ // Catch routine in zsim/sniper to perform the indexing in the simulator. Simulator needs to write back the values using PIN_SafeCopy() (In Sniper you can do it using builtin functions) the inverse way. 
30 | static int w=5; 31 | w = w+10; 32 | w = w -5; 33 | return w; 34 | } 35 | 36 | 37 | 38 | int parse_opt(int argc, char *argv[]) 39 | { 40 | 41 | fprintf(stderr,"Usage: ./spmv_bitmap -f compression_ratio0 -s compression_ratio1 -t compression_ratio2 -k #iterations\n"); 42 | 43 | int opt; 44 | while((opt = getopt(argc, argv,"f:s:t:i:k:"))){ 45 | if(opt == -1) 46 | break; 47 | switch(opt){ 48 | 49 | case 'f': 50 | matrix_A_smash.compression_ratio0 = atoi(optarg); 51 | matrix_B_smash.compression_ratio0 = atoi(optarg); 52 | 53 | comp0 = atoi(optarg); 54 | 55 | printf("Compression ratio 0 of A %d \n",matrix_A_smash.compression_ratio0); 56 | printf("Compression ratio 0 of B %d \n",matrix_B_smash.compression_ratio0); 57 | 58 | break; 59 | case 's': 60 | matrix_A_smash.compression_ratio1 = atoi(optarg); 61 | matrix_B_smash.compression_ratio1 = atoi(optarg); 62 | 63 | comp1 = atoi(optarg); 64 | 65 | 66 | printf("Compression ratio 1 of A %d \n",matrix_A_smash.compression_ratio1); 67 | printf("Compression ratio 1 of B %d \n",matrix_B_smash.compression_ratio1); 68 | break; 69 | case 't': 70 | matrix_A_smash.compression_ratio2 = atoi(optarg); 71 | matrix_B_smash.compression_ratio2 = atoi(optarg); 72 | 73 | comp2 = atoi(optarg); 74 | 75 | printf("Compression ratio 2 of A %d \n",matrix_A_smash.compression_ratio2); 76 | printf("Compression ratio 2 of B %d \n",matrix_B_smash.compression_ratio2); 77 | break; 78 | case 'i': 79 | path = optarg; 80 | printf("Input path %s \n",path); 81 | break; 82 | case 'k': 83 | K = atoi(optarg); 84 | printf("K = %d \n",K); 85 | break; 86 | default: 87 | return 1; 88 | 89 | 90 | } 91 | } 92 | 93 | 94 | 95 | } 96 | 97 | 98 | void calculate_indices(int *i, int *j, smash matrix_format){ 99 | 100 | int bitmap0_A_contribution; 101 | int bitmap1_A_contribution; 102 | int bitmap2_A_contribution; 103 | int contrib; 104 | 105 | 106 | bitmap0_A_contribution = (matrix_format.counter0%matrix_format.compression_ratio1)*matrix_format.compression_ratio0; // Check 
the paper to see what are these computations 107 | bitmap1_A_contribution = (matrix_format.counter1%matrix_format.compression_ratio2)*matrix_format.compression_ratio1*matrix_format.compression_ratio0; 108 | bitmap2_A_contribution = matrix_format.counter2*matrix_format.compression_ratio2*matrix_format.compression_ratio0*matrix_format.compression_ratio1; 109 | contrib = bitmap0_A_contribution + bitmap1_A_contribution + bitmap2_A_contribution; 110 | (*i) = contrib / matrix_format.columns; 111 | (*j) = contrib % matrix_format.columns; 112 | 113 | return; 114 | } 115 | 116 | 117 | void spmm(){ 118 | 119 | 120 | #ifdef DEBUG 121 | printf("NZA blocks A %d\n",matrix_A_smash.nza_blocks); 122 | printf("NZA blocks B %d\n",matrix_B_smash.nza_blocks); 123 | 124 | #endif 125 | 126 | 127 | int row_A,col_A,row_B,col_B; 128 | 129 | row_A = 0; 130 | col_A = 0; 131 | 132 | row_B = 0; 133 | col_B = 0; 134 | 135 | int curr_row_A; 136 | int prev_row_A; 137 | 138 | smash start_matrix_A; 139 | smash temp_B_smash; 140 | int nza_start_matrix_A; 141 | 142 | int current_nza_index_A=0; 143 | int current_nza_index_B=0; 144 | 145 | int temp_nza_index_matrix_B; 146 | int result; 147 | start_matrix_A = matrix_A_smash; 148 | 149 | int current_column_B=0; 150 | temp_B_smash = matrix_B_smash; // Init matrix B SMASH 151 | 152 | int B_visited = 0; 153 | int A_visited = 0; 154 | 155 | 156 | while(!B_visited){ // iterate over B's columns 157 | 158 | printf("Move Bitmap B\n"); 159 | int col_changed = 0; 160 | 161 | while(!col_changed){ // Find next column of B 162 | 163 | 164 | #ifdef SIM 165 | index_bitmap_zsim(&row_A,&col_A); // Signal zsim/sniper to receive the indices 166 | #endif 167 | 168 | #ifdef NATIVE 169 | result = index_bitmaps(&matrix_B_smash); 170 | calculate_indices(&col_B,&row_B,matrix_B_smash); 171 | #endif 172 | 173 | if(result == -1) return; 174 | if(col_B != current_column_B) col_changed = 1; 175 | } 176 | 177 | temp_B_smash = matrix_B_smash; 178 | matrix_A_smash = start_matrix_A; 179 | 
180 | 181 | #ifdef SIM 182 | index_bitmap_zsim(&row_B,&col_B); // Signal zsim/sniper to receive the indices 183 | #endif 184 | 185 | #ifdef NATIVE 186 | index_bitmaps(&matrix_A_smash); 187 | calculate_indices(&row_A,&col_A,matrix_A_smash); //Progress Bitmap A 188 | #endif 189 | 190 | 191 | while(!A_visited){ 192 | 193 | 194 | if( col_A < row_B){ // progress A 195 | 196 | 197 | #ifdef SIM 198 | index_bitmap_zsim(&row_A,&col_A); // Signal zsim/sniper to receive the indices 199 | if(row_A == -1 ) A_visited = 1; // We finished A 200 | #endif 201 | 202 | #ifdef NATIVE 203 | result = index_bitmaps(&matrix_A_smash); 204 | if(result == -1 ) A_visited = 1; // We finished A 205 | calculate_indices(&row_A,&col_A,matrix_A_smash); 206 | #endif 207 | 208 | 209 | printf("Progress bitmap A \n"); 210 | printf("Row A = %d \n",row_A); 211 | printf("Col A = %d \n",col_A); 212 | 213 | current_nza_index_A = matrix_A_smash.compression_ratio0*matrix_A_smash.current_block0; 214 | 215 | if(row_A != curr_row_A){ // if we change row in A, roll-back Bitmap B 216 | matrix_B_smash = temp_B_smash; 217 | current_nza_index_B = matrix_B_smash.compression_ratio0*matrix_B_smash.current_block0; // change the nza_index also when rolling back BitmapB 218 | curr_row_A = row_A; 219 | } 220 | } 221 | else if ( col_A > row_B){ 222 | 223 | 224 | #ifdef SIM 225 | index_bitmap_zsim(&row_B,&col_B); // Signal zsim/sniper to receive the indices 226 | if(row_B == -1 ) return; // We finished A 227 | #endif 228 | 229 | #ifdef NATIVE 230 | result = index_bitmaps(&matrix_B_smash); 231 | if(result == -1 ) return; 232 | calculate_indices(&col_B,&row_B,matrix_B_smash); 233 | #endif 234 | 235 | 236 | 237 | printf("Progress bitmap B \n"); 238 | printf("Row B = %d \n",row_B); 239 | printf("Col B = %d \n",col_B); 240 | 241 | if( col_B != current_column_B){ // If we change column, roll-back Bitmap A and store the beginning of B 242 | temp_B_smash = matrix_B_smash; 243 | matrix_A_smash = start_matrix_A; 244 | 
current_nza_index_A = 0; 245 | } 246 | 247 | current_nza_index_B = matrix_B_smash.compression_ratio0*matrix_B_smash.current_block0; 248 | } 249 | else if (col_A == row_B) { 250 | 251 | printf("Multiply blocks \n"); 252 | 253 | for(int e=0; e < matrix_A_smash.compression_ratio0; e++){ // Multiply the blocks assuming that you have the same compression_ratio0 254 | C[row_A*matrix_A_smash.columns+col_B] = matrix_A_smash.nza[current_nza_index_A]*matrix_B_smash.nza[current_nza_index_B]; 255 | current_nza_index_A +=1; 256 | current_nza_index_B +=1; 257 | } 258 | 259 | #ifdef SIM 260 | index_bitmap_zsim(&row_A,&col_A); // Signal zsim/sniper to receive the indices 261 | if(row_A == -1 ) A_visited = 1; // We finished A 262 | #endif 263 | 264 | #ifdef NATIVE 265 | result = index_bitmaps(&matrix_A_smash); 266 | if(result == -1 ){ 267 | printf("Completed A \n"); 268 | A_visited = 1; 269 | } 270 | calculate_indices(&row_A,&col_A,matrix_A_smash); 271 | #endif 272 | 273 | 274 | 275 | 276 | printf("Progress bitmap A after multiplication \n"); 277 | printf("Row A = %d \n",row_A); 278 | printf("Col A = %d \n",col_A); 279 | 280 | current_nza_index_A = matrix_A_smash.compression_ratio0*matrix_A_smash.current_block0; 281 | 282 | if(row_A != curr_row_A){ // if we change row in A, roll-back Bitmap B 283 | matrix_B_smash = temp_B_smash; 284 | current_nza_index_B = matrix_B_smash.compression_ratio0*matrix_B_smash.current_block0; // change the nza_index also when rolling back BitmapB 285 | curr_row_A = row_A; 286 | } 287 | 288 | } 289 | 290 | } 291 | 292 | 293 | 294 | 295 | } 296 | 297 | 298 | return; 299 | 300 | 301 | } 302 | 303 | 304 | 305 | 306 | int main(int argc, char* argv[]) 307 | { 308 | 309 | 310 | parse_opt(argc,argv); 311 | 312 | csr matrix_A_csr = read_csr(path); 313 | csr matrix_B_csr = read_csr(path); 314 | 315 | construct_format(&matrix_A_smash,comp0,comp1,comp2); 316 | construct_bitmap0_nza(&matrix_A_smash,&matrix_A_csr); 317 | construct_bitmap1(&matrix_A_smash); 318 | 
construct_bitmap2(&matrix_A_smash); 319 | 320 | construct_format(&matrix_B_smash,comp0,comp1,comp2); 321 | construct_bitmap0_nza(&matrix_B_smash,&matrix_B_csr); 322 | construct_bitmap1(&matrix_B_smash); 323 | construct_bitmap2(&matrix_B_smash); 324 | 325 | matrix_A_smash.current_register0 = matrix_A_smash.bitmap0[0]; 326 | matrix_A_smash.current_register1 = matrix_A_smash.bitmap1[0]; 327 | matrix_A_smash.current_register2 = matrix_A_smash.bitmap2[0]; 328 | 329 | matrix_B_smash.current_register0 = matrix_B_smash.bitmap0[0]; 330 | matrix_B_smash.current_register1 = matrix_B_smash.bitmap1[0]; 331 | matrix_B_smash.current_register2 = matrix_B_smash.bitmap2[0]; 332 | 333 | print_bitmaps(&matrix_A_smash); 334 | print_bitmaps(&matrix_B_smash); 335 | 336 | 337 | 338 | C = (float*) malloc (sizeof(float)*matrix_A_smash.rows*matrix_A_smash.columns); 339 | 340 | 341 | #ifdef SIM 342 | int value; 343 | zsim_roi_begin(); 344 | value = send_data_to_bmu(&matrix_A_smash); //the matrices are the same, send only one 345 | COMPILER_BARRIER(); // Compiler barrier to be sure that smash is stored in the simulator. 346 | #endif 347 | 348 | for(int i = 0; i < matrix_A_smash.rows ; i++) 349 | { 350 | C[i] = C[i]+1.0; // warmup the caches with the vector and the output to avoid weird caching effects when running with less than 10B instructions 351 | } 352 | 353 | #ifdef NATIVE 354 | clock_t t; 355 | t = clock(); 356 | #endif 357 | 358 | spmm(); 359 | 360 | #ifdef SIM 361 | zsim_roi_end(); // run at least 10B instructions. If you dont, you will have weird issues with cache warmups 362 | // Advice based on personal experience: ZSim's structure is a pain. They did not provide a way to catch routines, copy userspace memory etc. This way HW/SW mechanisms are not easy to design. 363 | // Shoot for Sniper who is more modular. I am re-implementing SMASH there for a follow-up idea and I will open-source it soon. 
364 | #endif 365 | 366 | #ifdef NATIVE 367 | double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds 368 | printf("SpMM-Time:%f\n", time_taken); 369 | #endif 370 | 371 | 372 | return 0; 373 | 374 | } 375 | -------------------------------------------------------------------------------- /src/bitmap/bitmap.h: -------------------------------------------------------------------------------- 1 | #ifndef BITMAP_H 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "csr.h" 11 | 12 | 13 | struct bitmap_3_level{ 14 | 15 | unsigned long *bitmap0; 16 | unsigned long *bitmap1; 17 | unsigned long *bitmap2; 18 | 19 | int counter0; 20 | int counter1; 21 | int counter2; 22 | 23 | int current_index0; 24 | int current_index1; 25 | int current_index2; 26 | 27 | unsigned long current_register0; 28 | unsigned long current_register1; 29 | unsigned long current_register2; 30 | 31 | int current_block0; 32 | int current_block1; 33 | int current_block2; 34 | 35 | int current_bitmap; 36 | 37 | int bitmap0_bits; 38 | int bitmap1_bits; 39 | int bitmap2_bits; 40 | 41 | int bitmap0_blocks; 42 | int bitmap1_blocks; 43 | int bitmap2_blocks; 44 | 45 | float *nza; 46 | int nza_blocks; 47 | 48 | int compression_ratio0; 49 | int compression_ratio1; 50 | int compression_ratio2; 51 | 52 | int rows; 53 | int columns; 54 | }; 55 | 56 | 57 | typedef struct bitmap_3_level smash; 58 | 59 | unsigned long* construct_bitmap(int num_bits){ 60 | 61 | int byteArraySize = (num_bits+63)/64; 62 | unsigned long* byteArray; 63 | 64 | byteArray = (unsigned long*) malloc (byteArraySize*sizeof(unsigned long)); 65 | 66 | if(byteArray == 0) 67 | { 68 | fprintf(stderr,"Bitvector allocation failure \n"); 69 | exit(99); 70 | } 71 | for (int i=0; i < byteArraySize; i++) 72 | byteArray[i]=0x0000000000000000; 73 | 74 | return byteArray; 75 | } 76 | 77 | inline void set_bit(unsigned long* bitmap, int bitnumber){ 78 | 79 | 80 | int index; 81 | int mask; 82 | 83 | index 
= bitnumber >> 6; 84 | mask = bitnumber & 0x000000000000003f; 85 | mask = 0x0000000000000001 << mask; 86 | 87 | #ifdef DEBUG 88 | 89 | printf("Mask = %#018" PRIx64 "\n",mask); 90 | printf("Index = %d\n", index); 91 | 92 | #endif 93 | 94 | 95 | 96 | bitmap[index] |= mask; 97 | 98 | 99 | // printf("%u\t",bitmap[index] |= mask); 100 | 101 | } 102 | 103 | 104 | inline void test_bit(int bitnumber, unsigned long* bitmap){ 105 | 106 | uint64_t index; 107 | uint64_t mask; 108 | 109 | 110 | index = bitnumber >> 6; 111 | mask = bitnumber & 0x000000000000003f; 112 | mask = 0x0000000000000001 << mask; 113 | 114 | //printf("%c", 0 != (bitmap[index]) & mask); 115 | 116 | } 117 | 118 | inline int read_bit(int bitnumber,unsigned long* bitmap) 119 | { 120 | 121 | int index; 122 | int mask; 123 | 124 | index = bitnumber >> 6; 125 | mask = bitnumber & 0x000000000000003f; 126 | mask = 0x0000000000000001 << mask; 127 | 128 | return (0 != ((bitmap[index]) & mask)); 129 | 130 | 131 | } 132 | 133 | void print_bitmaps(smash *format){ 134 | 135 | for(int i=0; i< format->bitmap2_bits; i++) 136 | { 137 | printf("%u\t",read_bit(i,format->bitmap2)); 138 | } 139 | printf("\n"); 140 | for(int i=0; i< format->bitmap1_bits; i++) 141 | { 142 | printf("%u\t",read_bit(i,format->bitmap1)); 143 | } 144 | printf("\n"); 145 | for(int i=0; i< format->bitmap0_bits; i++) 146 | { 147 | printf("%u\t",read_bit(i,format->bitmap0)); 148 | } 149 | printf("\n"); 150 | 151 | return; 152 | 153 | } 154 | 155 | 156 | void construct_bitmap0_nza(smash* format, csr *matrix){ 157 | int current_col=0; 158 | 159 | int previous_block = -1; 160 | int nza_blocks = 0; 161 | 162 | format->rows = matrix->size; 163 | format->columns = matrix->size; 164 | 165 | 166 | int blocks = ((matrix->size)*(matrix->size))/(format->compression_ratio0); 167 | format->bitmap0 = construct_bitmap(blocks); 168 | format->bitmap0_bits = blocks; 169 | 170 | #ifdef DEBUG 171 | printf("Initialized bitmap0 \n"); 172 | #endif 173 | 174 | format->nza = 
(float*)malloc(sizeof(float)); 175 | 176 | for(int i=0; i < matrix->size; i++){ 177 | int row_elements = matrix->row_ptr[i+1]-matrix->row_ptr[i]; 178 | 179 | #ifdef DEBUG 180 | printf("Row %d has %d elements \n",i,row_elements); 181 | #endif 182 | 183 | for (int j=0; jcol_ptr[current_col]; 187 | 188 | int block = (x*matrix->size+y)/(format->compression_ratio0); 189 | 190 | #ifdef DEBUG 191 | printf("Element in row %d and column %d and block %d \n",x,y,block); 192 | #endif 193 | 194 | if(block != previous_block){ 195 | nza_blocks++; 196 | format->nza = (float*)realloc(format->nza,nza_blocks*format->compression_ratio0*sizeof(float)); // We do not care about the actual value 197 | previous_block = block; 198 | 199 | } 200 | 201 | int bitnumber = block; 202 | set_bit(format->bitmap0,bitnumber); 203 | current_col+=1; 204 | } 205 | } 206 | 207 | format->nza_blocks = nza_blocks; 208 | 209 | #ifdef DEBUG 210 | printf("\n"); 211 | printf("Constructed bitmap0 and nza \n"); 212 | 213 | for(int i=0; i< format->bitmap0_bits; i++) 214 | { 215 | printf("%u\t",read_bit(i,format->bitmap0)); 216 | } 217 | printf("\n"); 218 | #endif 219 | 220 | 221 | 222 | } 223 | 224 | 225 | void construct_bitmap1(smash *format){ 226 | 227 | int bitmap1_size; 228 | bitmap1_size = (format->bitmap0_bits/format->compression_ratio1); 229 | format->bitmap1_bits = bitmap1_size; 230 | 231 | format->bitmap1 = construct_bitmap(bitmap1_size); 232 | 233 | unsigned long *new_bitmap0; 234 | new_bitmap0 = (unsigned long*)malloc(sizeof(unsigned long)); 235 | int new_bitmap0_blocks = 0; 236 | 237 | int index; 238 | int block; 239 | int previous_block=-1; 240 | 241 | for(int i = 0; i < format->bitmap0_bits; i++){ 242 | 243 | if(read_bit(i,format->bitmap0)){ 244 | block = i/(format->compression_ratio1); 245 | 246 | if(previous_block != block){ 247 | new_bitmap0_blocks++; 248 | if(new_bitmap0_blocks % 64 ==0) 249 | new_bitmap0 = (unsigned long*)realloc(new_bitmap0,new_bitmap0_blocks*sizeof(unsigned long)); 250 | 251 | 
#ifdef DEBUG 252 | 253 | printf("Set bit %d of bitmap0 \n",(new_bitmap0_blocks-1)*format->compression_ratio0+i%(format->compression_ratio0)); 254 | 255 | #endif 256 | previous_block = block; 257 | } 258 | 259 | set_bit(new_bitmap0,(new_bitmap0_blocks-1)*format->compression_ratio0 + i%(format->compression_ratio0)); 260 | set_bit(format->bitmap1,block); 261 | } 262 | } 263 | 264 | format->bitmap0 = new_bitmap0; 265 | format->bitmap0_blocks = new_bitmap0_blocks; 266 | format->bitmap0_bits = new_bitmap0_blocks*format->compression_ratio1; 267 | 268 | #ifdef DEBUG 269 | printf("Constructed Bitmap1 \n"); 270 | for(int i=0; i< format->bitmap1_bits; i++) 271 | { 272 | printf("%u\t",read_bit(i,format->bitmap1)); 273 | } 274 | printf("\n"); 275 | #endif 276 | 277 | 278 | return; 279 | } 280 | 281 | 282 | void construct_bitmap2(smash *format) 283 | { 284 | 285 | int bitmap2_size; 286 | bitmap2_size = (format->bitmap1_bits/format->compression_ratio2); 287 | format->bitmap2_bits = bitmap2_size; 288 | 289 | format->bitmap2 = construct_bitmap(bitmap2_size); 290 | 291 | unsigned long *new_bitmap1; 292 | new_bitmap1 = (unsigned long*)malloc(sizeof(unsigned long)); 293 | 294 | int new_bitmap1_blocks = 0; 295 | 296 | int index; 297 | int block; 298 | int previous_block=-1; 299 | 300 | for(int i = 0; i < format->bitmap1_bits; i++){ 301 | 302 | if(read_bit(i,format->bitmap1)){ 303 | block = i/(format->compression_ratio2); 304 | 305 | if(previous_block != block){ 306 | new_bitmap1_blocks++; 307 | if(new_bitmap1_blocks % 64 ==0) 308 | new_bitmap1 = (unsigned long*)realloc(new_bitmap1,new_bitmap1_blocks*sizeof(unsigned long)); 309 | previous_block = block; 310 | } 311 | 312 | 313 | set_bit(new_bitmap1,(new_bitmap1_blocks-1)*format->compression_ratio1 + i%(format->compression_ratio1)); 314 | set_bit(format->bitmap2,block); 315 | } 316 | } 317 | 318 | format->bitmap1 = new_bitmap1; 319 | format->bitmap1_blocks = new_bitmap1_blocks; 320 | format->bitmap1_bits = 
new_bitmap1_blocks*format->compression_ratio2; 321 | 322 | 323 | #ifdef DEBUG 324 | 325 | printf("Constructed Bitmap2 \n"); 326 | 327 | for(int i=0; i< format->bitmap2_bits; i++) 328 | { 329 | printf("%u\t",read_bit(i,format->bitmap2)); 330 | } 331 | printf("\n"); 332 | 333 | #endif 334 | 335 | 336 | 337 | 338 | return; 339 | 340 | } 341 | 342 | 343 | void construct_format(smash *format, int comp1, int comp2, int comp3) 344 | { 345 | format->compression_ratio0 = comp1; 346 | format->compression_ratio1 = comp2; 347 | format->compression_ratio2 = comp3; 348 | 349 | format->current_block0 = 0; 350 | format->current_block1 = 0; 351 | 352 | format->counter0 = 0; 353 | format->counter1 = 0; 354 | format->counter2 = 0; 355 | 356 | format->current_index0 = 0; 357 | format->current_index1 = 0; 358 | format->current_index2 = 0; 359 | 360 | format->counter1 = 0; 361 | format->counter2 = 0; 362 | format->current_bitmap = 2; 363 | 364 | #ifdef DEBUG 365 | printf("Initialized Compression Ratios \n"); 366 | #endif 367 | } 368 | 369 | #ifdef NO_BCR 370 | int index_bitmaps(smash *format) 371 | { 372 | 373 | //@kanellok For potential software optimization: It would be ideal if someone writes the x86 assembly directly for this function 374 | 375 | for(;;){ 376 | 377 | outerloop: 378 | 379 | if(format->current_bitmap==0){ 380 | if(format->counter0/format->compression_ratio1 != format->current_block0){ 381 | 382 | format->current_bitmap=1; 383 | format->current_block0 = format->counter0/format->compression_ratio1; 384 | goto outerloop; 385 | } 386 | 387 | if(format->counter0 > format->bitmap0_bits) return -1; 388 | 389 | while(!read_bit(format->counter0,format->bitmap0)){ 390 | 391 | format->counter0++; 392 | if(format->counter0/format->compression_ratio1 != format->current_block0){ 393 | 394 | format->current_bitmap=1; 395 | format->current_block0 = format->counter0/format->compression_ratio1; 396 | goto outerloop; 397 | } 398 | if(format->counter0 > format->bitmap0_bits) return -1; 
399 | } 400 | format->counter0++; 401 | 402 | printf("Bitmap 0, found set bit at %d\n",(format->counter0-1)); 403 | 404 | return (format->counter0-1); 405 | } 406 | else if(format->current_bitmap==1){ 407 | 408 | if(format->counter1/format->compression_ratio1 != format->current_block1){ 409 | 410 | format->current_bitmap=2; 411 | format->current_block1 = format->counter1/format->compression_ratio2; 412 | goto outerloop; 413 | } 414 | 415 | if(format->counter1 > format->bitmap1_bits) return -1; 416 | 417 | while(!read_bit(format->counter1,format->bitmap1)){ 418 | format->counter1++; 419 | if(format->counter1/format->compression_ratio2 != format->current_block1){ 420 | format->current_bitmap=2; 421 | format->current_block1 = format->counter1/format->compression_ratio1; 422 | goto outerloop; 423 | } 424 | if(format->counter1 > format->bitmap1_bits) return -1; 425 | 426 | } 427 | 428 | format->counter1++; 429 | printf("Bitmap 1, found set bit at %d\n",(format->counter1-1)); 430 | 431 | format->current_bitmap=0; 432 | 433 | } 434 | else if(format->current_bitmap==2){ 435 | while(!read_bit(format->counter2,format->bitmap2)){ 436 | format->counter2++; 437 | if(format->counter2 > format->bitmap2_bits) return -1; 438 | } 439 | format->counter2++; 440 | printf("Bitmap 2, found set bit at %d\n",(format->counter2-1)); 441 | 442 | format->current_bitmap=1; 443 | } 444 | 445 | } 446 | } 447 | #endif 448 | 449 | 450 | 451 | 452 | uint64_t find_set_bit(smash* format, uint64_t a) 453 | { 454 | 455 | //@kanellok For potential software optimization: It would be ideal if someone writes the x86 assembly directly for this function 456 | 457 | static_assert( CHAR_BIT * sizeof(unsigned long) == 64, "__builtin_clzll isn't 64-bit operand size"); 458 | uint64_t bit = __builtin_ctzl(a); // BSR 459 | //printf("Found set bit at Bitmap[%d] = %d \n",(format->current_index0),bit); 460 | 461 | // return a ? (1ULL << bit) : 0; // ULL is guaranteed to be at least a 64-bit type 462 | return a ? 
bit : 0; // ULL is guaranteed to be at least a 64-bit type 463 | 464 | } 465 | 466 | 467 | 468 | int index_bitmaps(smash *format) 469 | { 470 | 471 | //@kanellok For future software optimizations: It would be ideal if someone writes the x86 assembly directly for this function 472 | int value; 473 | 474 | for(;;){ 475 | outerloop: 476 | 477 | if(format->current_bitmap==0){ 478 | 479 | if((format->counter0+1) >= format->bitmap0_bits) return -1; 480 | 481 | if(format->current_register0 == 0){ 482 | format->current_index0++; 483 | format->counter0 = format->current_index0*64; 484 | format->current_register0 = format->bitmap0[format->current_index0]; 485 | } 486 | 487 | #ifdef DEBUG 488 | 489 | printf("Register= %#018" PRIx64 "\n",format->current_register0); 490 | 491 | #endif 492 | 493 | value = find_set_bit(format,format->current_register0); 494 | 495 | 496 | 497 | #ifdef DEBUG 498 | 499 | printf("Found set bit at Bitmap%d[%d] = %d current_block0 = %d \n",format->current_bitmap,(format->current_index0),value,format->current_block0); 500 | 501 | #endif 502 | 503 | if((format->current_index0*64+(value))/format->compression_ratio1 != format->current_block0){ 504 | format->current_bitmap=1; 505 | format->current_block0 = (format->current_index0*64+(value))/format->compression_ratio1; 506 | goto outerloop; 507 | } 508 | format->counter0 = format->current_index0*64+(value); 509 | format->current_register0 &= ~(1 << (value)) ; 510 | //if(format->current_register0 == 0){ goto outerloop;} 511 | #ifdef DEBUG 512 | 513 | printf("Counter%d= %d\n",0,format->counter0); 514 | 515 | #endif 516 | 517 | 518 | 519 | 520 | // if(format->counter0 > format->bitmap0_bits) return -1; 521 | 522 | return (format->counter0); 523 | } 524 | 525 | else if(format->current_bitmap==1){ 526 | 527 | if((format->counter1+1) >= format->bitmap1_bits) return -1; 528 | 529 | if(format->current_register1 == 0){ 530 | format->current_index1++; 531 | format->counter1 = format->current_index1*64; 532 | 
format->current_register1 = format->bitmap1[format->current_index1]; 533 | } 534 | // printf("Register= %#018" PRIx64 "\n",format->current_register1); 535 | value = find_set_bit(format,format->current_register1); 536 | 537 | #ifdef DEBUG 538 | 539 | printf("Found set bit at Bitmap%d[%d] = %d \n",format->current_bitmap,format->current_index1,value); 540 | 541 | #endif 542 | 543 | if((format->current_index1*64+(value))/format->compression_ratio2 != format->current_block1){ 544 | format->current_bitmap=2; 545 | format->current_block1 = (format->current_index1*64+(value))/format->compression_ratio2; 546 | goto outerloop; 547 | 548 | } 549 | format->counter1 = format->current_index1*64+(value); 550 | format->current_register1 &= ~(1 << (value)) ; 551 | 552 | #ifdef DEBUG 553 | 554 | printf("Counter%d= %d\n",1,format->counter1); 555 | 556 | #endif 557 | 558 | format->current_bitmap = 0; 559 | } 560 | 561 | else if(format->current_bitmap==2){ 562 | 563 | if((format->counter2+1) >= format->bitmap2_bits) return -1; 564 | 565 | if(format->current_register2 == 0){ 566 | format->current_index2++; 567 | format->counter2 = format->current_index2*64; 568 | format->current_register2 = format->bitmap2[format->current_index1]; 569 | } 570 | // printf("Register= %#018" PRIx64 "\n",format->current_register2); 571 | value = find_set_bit(format,format->current_register2); 572 | 573 | #ifdef DEBUG 574 | 575 | printf("Found set bit at Bitmap%d[%d] = %d, current_index2 = %d \n",format->current_bitmap,(format->current_index2),value,format->current_index2); 576 | 577 | #endif 578 | 579 | format->counter2 = format->current_index2*64+(value); 580 | format->current_register2 &= ~(1 << (value)) ; 581 | 582 | #ifdef DEBUG 583 | 584 | printf("Counter%d= %d\n",2,format->counter2); 585 | 586 | #endif 587 | 588 | format->current_bitmap = 1; 589 | } 590 | 591 | } 592 | 593 | } 594 | 595 | 596 | 597 | 598 | #endif 599 | --------------------------------------------------------------------------------