├── .gitignore ├── readme.md └── gemm-single ├── gemm-blas.c ├── Makefile ├── plot.py ├── test-gemm.c ├── gemm-blocked.s ├── benchmark-test.c ├── benchmark.c ├── gemm-blocked.c ├── outfile └── gemm-naive.c /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Optimize Gemm 2 | 3 | This repo shows how to optimize SGEMM on a single-threaded ARM CPU, a multi-threaded ARM CPU, and an NVIDIA GPU. 4 | 5 | In each subdirectory, run `make` to build the programs. Each build produces benchmark executables that exercise the gemm implementations; see the Makefiles for details. 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /gemm-single/gemm-blas.c: -------------------------------------------------------------------------------- 1 | #define GEMM sgemm_ 2 | extern void GEMM (char*, char*, int*, int*, int*, float*, float*, int*, float*, int*, float*, float*, int*); 3 | 4 | const char* gemm_desc = "Reference gemm."; 5 | 6 | /* This routine performs a gemm operation 7 | * C := C + A * B 8 | * where A, B, and C are lda-by-lda matrices stored in column-major format. 9 | * On exit, A and B maintain their input values. 10 | * This function wraps a call to the BLAS-3 routine GEMM, via the standard FORTRAN interface - hence the reference semantics. */ 11 | void square_gemm (int N, float* A, float* B, float* C) 12 | { 13 | char TRANSA = 'N'; 14 | char TRANSB = 'N'; 15 | int M = N; 16 | int K = N; 17 | float ALPHA = 1.; 18 | float BETA = 1.; 19 | int LDA = N; 20 | int LDB = N; 21 | int LDC = N; 22 | GEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC); 23 | } 24 | -------------------------------------------------------------------------------- /gemm-single/Makefile: -------------------------------------------------------------------------------- 1 | # This Makefile descends from an assignment template that benchmarked against the Intel MKL vendor-tuned implementation with icc. 2 | # As written here it builds with gcc and links against OpenBLAS instead. 3 | # You may experiment with the OPT and FLAGS variables to invoke additional compiler options. 
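# Typical usage, a sketch (the target names match the rules below; output.log is the
# file name plot.py's parse_file() expects):
#   make                                  # builds benchmark-naive, benchmark-blocked, benchmark-blas, benchmark-test
#   ./benchmark-blocked | tee output.log  # prints one "Size: N Gflop/s: ..." line per test size
#   python3 plot.py                       # plots the hard-coded Gflop/s lists; parse_file() regenerates them from output.log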
4 | 5 | CC = gcc 6 | # OPT = -no-multibyte-chars 7 | FLAGS = -O3 -I/usr/include/openblas 8 | # -fopt-info 9 | LDLIBS = -lpthread -lm -llapack -lopenblas -lrt -lcblas -lgptl -lpapi 10 | targets = benchmark-test benchmark-naive benchmark-blocked benchmark-blas 11 | objects = benchmark-test.o benchmark.o gemm-naive.o gemm-blocked.o gemm-blas.o 12 | 13 | 14 | .PHONY : default 15 | default : all 16 | 17 | .PHONY : all 18 | all : clean $(targets) 19 | 20 | benchmark-test : benchmark-test.o gemm-blocked.o 21 | $(CC) -o $@ $^ $(FLAGS) $(LDLIBS) 22 | 23 | benchmark-naive : benchmark.o gemm-naive.o 24 | $(CC) -o $@ $^ $(FLAGS) $(LDLIBS) 25 | benchmark-blocked : benchmark.o gemm-blocked.o 26 | $(CC) -o $@ $^ $(FLAGS) $(LDLIBS) 27 | benchmark-blas : benchmark.o gemm-blas.o 28 | $(CC) -o $@ $^ $(FLAGS) $(LDLIBS) 29 | %.o : %.c 30 | $(CC) -c $(CFLAGS) $(FLAGS) $< 31 | 32 | .PHONY : clean 33 | clean: 34 | rm -f $(targets) $(objects) 35 | -------------------------------------------------------------------------------- /gemm-single/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | import re 4 | 5 | # Measured Gflop/s for each kernel variant at the sizes in N_list (pasted from benchmark runs). 6 | N_list = [127, 128, 129, 255, 256, 257, 383, 384, 385, 511, 512, 513, 639, 640, 641, 767, 768, 769, 895, 896, 897, 1023, 1024, 1025, 1151, 1152, 1153, 1279, 1280, 1281] 7 | final_list = [11.20, 16.40, 14.20, 13.50, 16.30, 15.30, 14.10, 16.10, 15.50, 14.60, 16.10, 15.60, 14.70, 16.00, 15.60, 14.90, 15.90, 15.60, 14.90, 15.80, 15.50, 14.90, 15.60, 15.50, 14.80, 15.50, 15.30, 14.70, 15.40, 15.2] 8 | block_list = [10.90, 16.80, 13.10, 12.60, 13.80, 13.70, 13.10, 16.10, 14.20, 12.30, 11.90, 12.40, 13.60, 16.00, 14.10, 13.40, 14.20, 14.10, 13.70, 15.60, 14.20, 11.40, 11.70, 9.99, 13.70, 15.50, 14.10, 13.70, 14.40, 13.9] 9 | simd_list = [10.60, 15.30, 13.00, 12.00, 12.80, 12.60, 12.20, 12.40, 13.40, 11.50, 5.80, 12.10, 10.70, 12.10, 11.20, 9.68, 5.20, 10.10, 9.13, 11.30, 9.28, 9.47, 3.84, 8.84, 8.76, 5.43, 8.86, 8.73, 3.91, 8.79] 10 | square_reg_list = [5.75, 6.28, 6.44, 6.12, 5.52, 6.53, 6.29, 5.54, 6.32, 6.03, 5.04, 6.08, 5.91, 5.44, 5.75, 5.57, 4.31, 5.34, 5.40, 5.21, 5.06, 5.39, 3.64, 4.85, 5.19, 3.93, 4.93, 5.19, 3.65, 4.85] 11 | square_list = [1.37, 1.34, 1.35, 1.32, 1.02, 1.32, 1.31, 0.98, 1.31, 1.31, 0.40, 1.31, 1.28, 1.00, 1.30, 1.30, 0.45, 1.30, 1.30, 1.03, 1.29, 1.30, 0.31, 1.29, 1.14, 0.61, 1.13, 0.96, 0.32, 0.967] 12 | naive_list = [1.40, 1.38, 1.38, 1.34, 0.99, 1.34, 1.32, 1.00, 1.32, 1.32, 0.41, 1.32, 1.30, 1.00, 1.31, 1.30, 0.45, 1.30, 1.28, 1.02, 1.28, 1.27, 0.31, 1.28, 1.18, 0.68, 1.14, 1.00, 0.35, 0.986] 13 | blas_list = [24.70, 31.00, 30.10, 29.10, 32.90, 32.10, 29.90, 32.60, 32.70, 31.40, 33.20, 33.20, 31.40, 33.00, 33.20, 31.90, 33.30, 33.30, 32.10, 33.40, 33.50, 32.30, 33.30, 33.40, 32.60, 33.50, 33.50, 32.60, 33.40, 33.5] 14 | 15 | 16 | def parse_file():  # scan output.log for "Size: N Gflop/s: X" lines and print them as comma-separated lists 17 | file_path = "./output.log" 18 | size_list = [] 19 | gflop_list = [] 20 | with open(file_path, "r") as in_f: 21 | lines = in_f.readlines() 22 | pattern = re.compile(r'Size:\s+([0-9]+)\s+Gflop/s:\s+([0-9.]+)\s+[^\)]+\)') 23 | for line in lines: 24 | m = pattern.match(line) 25 | if m: 26 | print(m.groups()) 27 | size_num, gflop = m.groups() 28 | size_num = int(size_num) 29 | gflop = float(gflop) 30 | size_list.append(size_num) 31 | gflop_list.append(gflop) 32 | # print(size_list[0], gflop_list[0]) 33 | count = len(size_list) 34 | for i in range(count - 1): 35 | print("%d, " % size_list[i], end='') 36 | print(size_list[-1]) 37 | # print() 38 | for i in 
range(count - 1): 39 | print("%.2f, " % gflop_list[i], end='') 40 | print(gflop_list[-1]) 41 | # plt.plot(size_list, gflop_list) 42 | # plt.show() 43 | 44 | def show_plot(): 45 | plt.plot(N_list, final_list, label="pack", marker='o') 46 | plt.plot(N_list, block_list, label="blocked", marker='v') 47 | plt.plot(N_list, simd_list, label="simd", marker='^') 48 | plt.plot(N_list, square_reg_list, label="4x4 register", marker='<') 49 | plt.plot(N_list, square_list, label="4x4 naive", marker='>') 50 | plt.plot(N_list, naive_list, label="naive", marker='p') 51 | # plt.plot(N_list, blas_list, label="blas", marker='*') 52 | plt.legend(loc="lower right") 53 | plt.ylabel("Gflop/s") 54 | plt.xlabel("Matrix Size N") 55 | plt.show() 56 | 57 | def main(): 58 | show_plot() 59 | # plt.plot(size_list, gflop_list) 60 | # plt.show() 61 | 62 | 63 | if __name__ == '__main__': 64 | main() -------------------------------------------------------------------------------- /gemm-single/test-gemm.c: -------------------------------------------------------------------------------- 1 | extern void square_gemm (int lda, float* A, float* B, float* C); 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | 5 | #define min(x,y) ((x) < (y) ? (x) : (y)) 6 | 7 | static void print_matrix(int row, int col, float *A) { 8 | for(int i = 0; i < row; ++i) { 9 | for(int j = 0; j < col; ++j) { 10 | printf("%0.2f ", A[i + j * row]); 11 | } 12 | printf("\n"); 13 | } 14 | } 15 | void test() { 16 | float A[16] = {1,5,9,13,2,6,10,14,3,7,11,15,4,8,12,16}, B[16] = {1,5,9,13,2,6,10,14,3,7,11,15,4,8,12,16}; 17 | // float A[25] = {1,6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, 4, 9, 14, 19, 24, 5, 10, 15, 20, 25}; 18 | // float B[25] = {1,6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, 4, 9, 14, 19, 24, 5, 10, 15, 20, 25}; 19 | #define TEST_N_LEN 4 20 | print_matrix(TEST_N_LEN, TEST_N_LEN, A); 21 | print_matrix(TEST_N_LEN, TEST_N_LEN, B); 22 | float C[TEST_N_LEN * TEST_N_LEN] = {0}; 23 | square_gemm(TEST_N_LEN, A, B, C); 24 | print_matrix(TEST_N_LEN, TEST_N_LEN, C); 25 | 26 | } 27 | 28 | static void transpose_small_blk(int lda, int M, int N, float *A) { /* swap A[i][j] and A[j][i] within a block (column-major) */ 29 | for(int i = 1; i < M; ++i) { 30 | for (int j = 0; j < i && j < N; ++j) { 31 | int ij_index = i + j * lda, ji_index = j + i * lda; 32 | float tmp = A[ij_index]; 33 | A[ij_index] = A[ji_index]; 34 | A[ji_index] = tmp; 35 | } 36 | } 37 | } 38 | 39 | #define BLOCK_SIZE 2 40 | 41 | static void transpose_m_blk(int lda, float *A) { 42 | for (int i = 0; i < lda; i += BLOCK_SIZE) { 43 | for(int j = 0; j < lda; j += BLOCK_SIZE) { 44 | int M = min (BLOCK_SIZE, lda-i); 45 | int N = min (BLOCK_SIZE, lda-j); 46 | transpose_small_blk(lda, M, N, A + i + j * lda); 47 | 48 | } 49 | } 50 | } 51 | 52 | static float* pack_L_small_blk(int lda, int M, int N, float *A, float *temp_P) { /* copy one block row-by-row (row-major within the block) */ 53 | for(int i = 0; i < M; ++i) { 54 | for (int j = 0; j < N; ++j) { 55 | int ij_index = i + j * lda; 56 | *temp_P = A[ij_index]; 57 | temp_P++; 58 | } 59 | } 60 | return temp_P; 61 | } 62 | 63 | static float* pack_L_block(int lda, float *A) { 64 | float *temp_A = (float*)malloc(sizeof(float) * lda * lda); 65 | float *temp_P = temp_A; 66 | for (int i = 0; i < lda; i += BLOCK_SIZE) { 67 | for(int j = 0; j < lda; j += BLOCK_SIZE) { 68 | int M = min (BLOCK_SIZE, lda-i); 69 | int N = min (BLOCK_SIZE, lda-j); 70 | temp_P = pack_L_small_blk(lda, M, N, A + i + j * lda, temp_P); 71 | 72 | } 73 | } 74 | return temp_A; 75 | } 76 | 77 | static float* pack_R_small_blk(int lda, int M, int N, float *A, float *temp_P) { /* copy one block column-by-column */ 78 | for (int j = 0; j < N; ++j) { 79 | for(int 
i = 0; i < M; ++i) { 80 | int ij_index = i + j * lda; 81 | *temp_P = A[ij_index]; 82 | temp_P++; 83 | } 84 | } 85 | return temp_P; 86 | } 87 | 88 | static float* pack_R_block(int lda, float *A) { 89 | float *temp_A = (float*)malloc(sizeof(float) * lda * lda); 90 | float *temp_P = temp_A; 91 | for(int j = 0; j < lda; j += BLOCK_SIZE) { 92 | for (int i = 0; i < lda; i += BLOCK_SIZE) { 93 | int M = min (BLOCK_SIZE, lda-i); 94 | int N = min (BLOCK_SIZE, lda-j); 95 | temp_P = pack_R_small_blk(lda, M, N, A + i + j * lda, temp_P); 96 | 97 | } 98 | } 99 | return temp_A; 100 | } 101 | 102 | void test_blk() { 103 | float A[16] = {1,5,9,13,2,6,10,14,3,7,11,15,4,8,12,16}; 104 | print_matrix(4,4,A); 105 | // transpose_m_blk(4, A); 106 | float *B = pack_L_block(4, A); 107 | // float *B = pack_R_block(4, A); 108 | print_matrix(4,4, B); 109 | } 110 | 111 | int main() { 112 | // test_blk(); 113 | test(); 114 | } -------------------------------------------------------------------------------- /gemm-single/gemm-blocked.s: -------------------------------------------------------------------------------- 1 | 2 | gemm-blocked.o: file format elf64-littleaarch64 3 | 4 | 5 | Disassembly of section .text: 6 | 7 | 0000000000000000 : 8 | 0: 2a0003eb mov w11, w0 9 | 4: 7100001f cmp w0, #0x0 10 | 8: aa0203ef mov x15, x2 11 | c: 5400092d b.le 130 12 | 10: 51000400 sub w0, w0, #0x1 13 | 14: 531f7967 lsl w7, w11, #1 14 | 18: 11000566 add w6, w11, #0x1 15 | 1c: 121f7800 and w0, w0, #0xfffffffe 16 | 20: 51000968 sub w8, w11, #0x2 17 | 24: aa0103ec mov x12, x1 18 | 28: aa0303ed mov x13, x3 19 | 2c: 4b000108 sub w8, w8, w0 20 | 30: 937e7ce7 sbfiz x7, x7, #2, #32 21 | 34: 93407cc6 sxtw x6, w6 22 | 38: 2a0b03e9 mov w9, w11 23 | 3c: 93407d64 sxtw x4, w11 24 | 40: d280000e mov x14, #0x0 // #0 25 | 44: d503201f nop 26 | 48: 7100013f cmp w9, #0x0 27 | 4c: 5400066d b.le 118 28 | 50: aa0f03ea mov x10, x15 29 | 54: aa0d03e0 mov x0, x13 30 | 58: 2a0b03e5 mov w5, w11 31 | 5c: d503201f nop 32 | 60: aa0a03e2 mov x2, x10 33 | 64: aa0c03e1 mov x1, x12 34 | 68: 2a0b03e3 mov w3, w11 35 | 6c: d503201f nop 36 | 70: 710000bf cmp w5, #0x0 37 | 74: 540003ed b.le f0 38 | 78: bd400021 ldr s1, [x1] 39 | 7c: 7100047f cmp w3, #0x1 40 | 80: bd400042 ldr s2, [x2] 41 | 84: bd400000 ldr s0, [x0] 42 | 88: 1f010040 fmadd s0, s2, s1, s0 43 | 8c: 5400054c b.gt 134 44 | 90: bd000000 str s0, [x0] 45 | 94: 710004bf cmp w5, #0x1 46 | 98: 540000cd b.le b0 47 | 9c: bc647842 ldr s2, [x2, x4, lsl #2] 48 | a0: bd400021 ldr s1, [x1] 49 | a4: bc647800 ldr s0, [x0, x4, lsl #2] 50 | a8: 1f010040 fmadd s0, s2, s1, s0 51 | ac: bc247800 str s0, [x0, x4, lsl #2] 52 | b0: 7100053f cmp w9, #0x1 53 | b4: 540001ed b.le f0 54 | b8: bd400042 ldr s2, [x2] 55 | bc: 7100047f cmp w3, #0x1 56 | c0: bd400421 ldr s1, [x1, #4] 57 | c4: bd400400 ldr s0, [x0, #4] 58 | c8: 1f010040 fmadd s0, s2, s1, s0 59 | cc: 5400050c b.gt 16c 60 | d0: bd000400 str s0, [x0, #4] 61 | d4: 710004bf cmp w5, #0x1 62 | d8: 540000cd b.le f0 63 | dc: bc647841 ldr s1, [x2, x4, lsl #2] 64 | e0: bd400422 ldr s2, [x1, #4] 65 | e4: bc667800 ldr s0, [x0, x6, lsl #2] 66 | e8: 1f010040 fmadd s0, s2, s1, s0 67 | ec: bc267800 str s0, [x0, x6, lsl #2] 68 | f0: 51000863 sub w3, w3, #0x2 69 | f4: 8b070021 add x1, x1, x7 70 | f8: 6b08007f cmp w3, w8 71 | fc: 91002042 add x2, x2, #0x8 72 | 100: 54fffb81 b.ne 70 // b.any 73 | 104: 510008a5 sub w5, w5, #0x2 74 | 108: 8b07014a add x10, x10, x7 75 | 10c: 6b0800bf cmp w5, w8 76 | 110: 8b070000 add x0, x0, x7 77 | 114: 54fffa61 b.ne 60 // b.any 78 | 118: 910009ce add x14, x14, #0x2 79 | 
11c: 51000929 sub w9, w9, #0x2 80 | 120: 6b0e017f cmp w11, w14 81 | 124: 9100218c add x12, x12, #0x8 82 | 128: 910021ad add x13, x13, #0x8 83 | 12c: 54fff8ec b.gt 48 84 | 130: d65f03c0 ret 85 | 134: bc647822 ldr s2, [x1, x4, lsl #2] 86 | 138: 710004bf cmp w5, #0x1 87 | 13c: bd400441 ldr s1, [x2, #4] 88 | 140: 1f010040 fmadd s0, s2, s1, s0 89 | 144: bd000000 str s0, [x0] 90 | 148: 54fffb4d b.le b0 91 | 14c: bc647841 ldr s1, [x2, x4, lsl #2] 92 | 150: bd400023 ldr s3, [x1] 93 | 154: bc647800 ldr s0, [x0, x4, lsl #2] 94 | 158: bc667842 ldr s2, [x2, x6, lsl #2] 95 | 15c: 1f010060 fmadd s0, s3, s1, s0 96 | 160: bc647821 ldr s1, [x1, x4, lsl #2] 97 | 164: 1f010040 fmadd s0, s2, s1, s0 98 | 168: 17ffffd1 b ac 99 | 16c: bc667821 ldr s1, [x1, x6, lsl #2] 100 | 170: 710004bf cmp w5, #0x1 101 | 174: bd400442 ldr s2, [x2, #4] 102 | 178: 1f010040 fmadd s0, s2, s1, s0 103 | 17c: bd000400 str s0, [x0, #4] 104 | 180: 54fffb8d b.le f0 105 | 184: bc647843 ldr s3, [x2, x4, lsl #2] 106 | 188: bd400421 ldr s1, [x1, #4] 107 | 18c: bc667800 ldr s0, [x0, x6, lsl #2] 108 | 190: bc667822 ldr s2, [x1, x6, lsl #2] 109 | 194: 1f010060 fmadd s0, s3, s1, s0 110 | 198: bc667841 ldr s1, [x2, x6, lsl #2] 111 | 19c: 1f010040 fmadd s0, s2, s1, s0 112 | 1a0: 17ffffd3 b ec 113 | -------------------------------------------------------------------------------- /gemm-single/benchmark-test.c: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE 2 | #include <stdio.h> // For: perror, printf 3 | #include <string.h> // For: memset 4 | 5 | #include <float.h> // For: FLT_EPSILON 6 | #include <math.h> // For: fabs 7 | 8 | #ifdef GETTIMEOFDAY 9 | #include <sys/time.h> // For struct timeval, gettimeofday 10 | #else 11 | #include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC 12 | #endif 13 | 14 | /* reference_gemm wraps a call to the BLAS-3 routine GEMM, via the standard FORTRAN interface - hence the reference semantics. 
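 * All arguments are passed by address because of the Fortran calling convention; since BETA is fixed at 1 the call computes C := ALPHA*A*B + C, so reference_gemm(n, -1., ...) in the checks below subtracts A*B from C.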
*/ 15 | #define GEMM sgemm_ 16 | extern void GEMM (char*, char*, int*, int*, int*, float*, float*, int*, float*, int*, float*, float*, int*); 17 | void reference_gemm (int N, float ALPHA, float* A, float* B, float* C) 18 | { 19 | char TRANSA = 'N'; 20 | char TRANSB = 'N'; 21 | int M = N; 22 | int K = N; 23 | float BETA = 1.; 24 | int LDA = N; 25 | int LDB = N; 26 | int LDC = N; 27 | GEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC); 28 | } 29 | 30 | /* Your function must have the following signature: */ 31 | extern const char* gemm_desc; 32 | extern void square_gemm (int, float*, float*, float*); 33 | 34 | double wall_time () 35 | { 36 | #ifdef GETTIMEOFDAY 37 | struct timeval t; 38 | gettimeofday (&t, NULL); 39 | return 1.*t.tv_sec + 1.e-6*t.tv_usec; 40 | #else 41 | struct timespec t; 42 | clock_gettime (CLOCK_MONOTONIC, &t); 43 | return 1.*t.tv_sec + 1.e-9*t.tv_nsec; 44 | #endif 45 | } 46 | 47 | void die (const char* message) 48 | { 49 | perror (message); 50 | exit (EXIT_FAILURE); 51 | } 52 | 53 | void fill (float* p, int n) 54 | { 55 | for (int i = 0; i < n; ++i) 56 | p[i] = 2 * drand48() - 1; // Uniformly distributed over [-1, 1) 57 | } 58 | 59 | void absolute_value (float *p, int n) 60 | { 61 | for (int i = 0; i < n; ++i) 62 | p[i] = fabs (p[i]); 63 | } 64 | 65 | /* The benchmarking program */ 66 | int main (int argc, char **argv) 67 | { 68 | printf ("Description:\t%s\n\n", gemm_desc); 69 | 70 | /* Test sizes should highlight performance dips at multiples of certain powers-of-two */ 71 | 72 | int test_sizes[] = 73 | 74 | /* A representative subset of the full list in benchmark.c, for initial testing. */ 75 | { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257, 76 | 319, 320, 321, 417, 479, 480, 511, 512, 639, 640}; 77 | 78 | int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]); 79 | 80 | /* assume last size is also the largest size */ 81 | int nmax = test_sizes[nsizes-1]; 82 | 83 | /* allocate memory for all problems */ 84 | float* buf = NULL; 85 | buf = (float*) malloc (3 * nmax * nmax * sizeof(float)); 86 | if (buf == NULL) die ("failed to allocate largest problem size"); 87 | 88 | /* For each test size */ 89 | for (int isize = 0; isize < nsizes; ++isize) 90 | { 91 | /* Create and fill 3 random matrices A, B, C */ 92 | int n = test_sizes[isize]; 93 | 94 | float* A = buf + 0; 95 | float* B = A + nmax*nmax; 96 | float* C = B + nmax*nmax; 97 | 98 | fill (A, n*n); 99 | fill (B, n*n); 100 | fill (C, n*n); 101 | 102 | /* Measure performance (in Gflops/s). */ 103 | 104 | /* Time a "sufficiently long" sequence of calls to reduce noise */ 105 | double Gflops_s, seconds = -1.0; 106 | double timeout = 0.1; // "sufficiently long" := at least 1/10 second. 107 | int n_iterations = 0; 108 | for (n_iterations = 1; seconds < timeout; n_iterations *= 2) 109 | { 110 | /* Warm-up */ 111 | square_gemm (n, A, B, C); 112 | 113 | /* Benchmark n_iterations runs of square_gemm */ 114 | seconds = -wall_time(); 115 | for (int it = 0; it < n_iterations; ++it) 116 | square_gemm (n, A, B, C); 117 | seconds += wall_time(); 118 | 119 | /* compute Gflop/s rate */ 120 | Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds; 121 | } 122 | printf ("Size: %d\tGflop/s: %.3g (%d iter, %.3f seconds)\n", n, Gflops_s, n_iterations, seconds); 123 | 124 | /* Ensure that error does not exceed the theoretical error bound. 
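 * With FLT_EPSILON ≈ 1.19e-7, at n = 512 the componentwise tolerance applied below is 3 * FLT_EPSILON * n ≈ 1.8e-4 times the corresponding entry of |A|*|B| - a quick worked number, assuming the [-1, 1) inputs produced by fill().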
*/ 125 | 126 | /* C := A * B, computed with square_gemm */ 127 | memset (C, 0, n * n * sizeof(float)); 128 | square_gemm (n, A, B, C); 129 | /* Do not explicitly check that A and B were unmodified on square_gemm exit 130 | * - if they were, the following will most likely detect it: 131 | * C := C - A * B, computed with reference_gemm */ 132 | reference_gemm(n, -1., A, B, C); 133 | 134 | /* A := |A|, B := |B|, C := |C| */ 135 | absolute_value (A, n * n); 136 | absolute_value (B, n * n); 137 | absolute_value (C, n * n); 138 | 139 | /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_gemm */ 140 | reference_gemm (n, -3.*FLT_EPSILON*n, A, B, C); 141 | 142 | /* If any element in C is positive, then something went wrong in square_gemm */ 143 | for (int i = 0; i < n * n; ++i) 144 | if (C[i] > 0) 145 | die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" ); 146 | } 147 | 148 | free (buf); 149 | 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /gemm-single/benchmark.c: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE 2 | #include <stdio.h> // For: perror, printf 3 | #include <string.h> // For: memset 4 | 5 | #include <float.h> // For: FLT_EPSILON 6 | #include <math.h> // For: fabs 7 | 8 | #include <gptl.h> // For: the (currently commented-out) GPTL instrumentation below 9 | #include <papi.h> // For: PAPI_library_init, PAPI_VER_CURRENT 10 | 11 | #ifdef GETTIMEOFDAY 12 | #include <sys/time.h> // For struct timeval, gettimeofday 13 | #else 14 | #include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC 15 | #endif 16 | 17 | /* reference_gemm wraps a call to the BLAS-3 routine GEMM, via the standard FORTRAN interface - hence the reference semantics. */ 18 | #define GEMM sgemm_ 19 | extern void GEMM (char*, char*, int*, int*, int*, float*, float*, int*, float*, int*, float*, float*, int*); 20 | void reference_gemm (int N, float ALPHA, float* A, float* B, float* C) 21 | { 22 | char TRANSA = 'N'; 23 | char TRANSB = 'N'; 24 | int M = N; 25 | int K = N; 26 | float BETA = 1.; 27 | int LDA = N; 28 | int LDB = N; 29 | int LDC = N; 30 | GEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC); 31 | } 32 | 33 | /* Your function must have the following signature: */ 34 | extern const char* gemm_desc; 35 | extern void square_gemm (int, float*, float*, float*); 36 | 37 | double wall_time () 38 | { 39 | #ifdef GETTIMEOFDAY 40 | struct timeval t; 41 | gettimeofday (&t, NULL); 42 | return 1.*t.tv_sec + 1.e-6*t.tv_usec; 43 | #else 44 | struct timespec t; 45 | clock_gettime (CLOCK_MONOTONIC, &t); 46 | return 1.*t.tv_sec + 1.e-9*t.tv_nsec; 47 | #endif 48 | } 49 | 50 | void die (const char* message) 51 | { 52 | perror (message); 53 | exit (EXIT_FAILURE); 54 | } 55 | 56 | void fill (float* p, int n) 57 | { 58 | for (int i = 0; i < n; ++i) 59 | p[i] = 2 * drand48() - 1; // Uniformly distributed over [-1, 1) 60 | } 61 | 62 | void absolute_value (float *p, int n) 63 | { 64 | for (int i = 0; i < n; ++i) 65 | p[i] = fabs (p[i]); 66 | } 67 | 68 | /* The benchmarking program */ 69 | int main (int argc, char **argv) 70 | { 71 | printf ("Description:\t%s\n\n", gemm_desc); 72 | 73 | /* Test sizes should highlight performance dips at multiples of certain powers-of-two */ 74 | 75 | int test_sizes[] = 76 | 77 | /* Multiples of 32, +/- 1, for final benchmarking. 
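 * (The sizes straddle round numbers deliberately: a power-of-two leading dimension makes matrix columns alias to the same cache sets, which is why the naive and simd curves in plot.py dip so sharply at exactly N = 512 and 1024.)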
*/ 78 | // {31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025}; 79 | 80 | /* A representative subset of the first list, for initial testing (commented out). */ 81 | // { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257, 82 | // 319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 }; 83 | // {16}; 84 | // {31, 32}; 85 | 86 | // { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257, 87 | // 319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769, 1024, 2048 }; 88 | /* Multiples of 128, +/- 1 - the list actually in use: */ 89 | {127,128,129,255,256,257,383,384,385,511,512,513,639,640, 90 | 641,767,768,769,895,896,897,1023,1024,1025,1151,1152,1153,1279,1280,1281}; 91 | 92 | int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]); 93 | 94 | /* assume last size is also the largest size */ 95 | int nmax = test_sizes[nsizes-1]; 96 | 97 | /* allocate memory for all problems */ 98 | float* buf = NULL; 99 | buf = (float*) malloc (3 * nmax * nmax * sizeof(float)); 100 | if (buf == NULL) die ("failed to allocate largest problem size"); 101 | 102 | /* For each test size */ 103 | double res = 0.0; 104 | double count = 0.0; 105 | for (int isize = 0; isize < nsizes; ++isize) 106 | { 107 | /* Create and fill 3 random matrices A, B, C */ 108 | int n = test_sizes[isize]; 109 | 110 | float* A = buf + 0; 111 | float* B = A + nmax*nmax; 112 | float* C = B + nmax*nmax; 113 | 114 | fill (A, n*n); 115 | fill (B, n*n); 116 | fill (C, n*n); 117 | 118 | /* Measure performance (in Gflops/s). */ 119 | 120 | /* Time a "sufficiently long" sequence of calls to reduce noise */ 121 | double Gflops_s, seconds = -1.0; 122 | double timeout = 0.1; // "sufficiently long" := at least 1/10 second. 123 | int n_iterations = 0; 124 | 125 | // int gptlRet; 126 | // int _i; 127 | 128 | // /* Events that we are interested in (see events.txt). 129 | // Kunpeng920 CPU supports up to six PMU counters. 
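// (From these six counters one can derive, e.g., an L1 miss rate of L1D_CACHE_REFILL / L1D_CACHE;
// the captured GPTL report in ./outfile gives 3.77e+08 / 2.03e+10 ≈ 1.9% for the 'gemm' region.)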
130 | // */ 131 | // char *eventnames[] = { 132 | // "CPU_CYCLES", "L1D_CACHE", 133 | // "L1D_CACHE_REFILL", "L2D_CACHE_ACCESS", 134 | // "MEM_STALL_ANYLOAD", "MEM_STALL_ANYSTORE" 135 | // }; 136 | // int eventcodes[sizeof(eventnames) / sizeof(char*)]; 137 | 138 | // PAPI_library_init(PAPI_VER_CURRENT); 139 | // for (_i = 0; _i < sizeof(eventnames) / sizeof(char*); _i++) { 140 | // GPTLevent_name_to_code(eventnames[_i], &eventcodes[_i]); 141 | // GPTLsetoption(eventcodes[_i], 1); 142 | // } 143 | 144 | // GPTLsetoption(GPTLpersec, 0); 145 | // GPTLinitialize(); 146 | // gptlRet = GPTLstart ("gemm"); 147 | 148 | for (n_iterations = 1; seconds < timeout; n_iterations *= 2) 149 | { 150 | /* Warm-up */ 151 | square_gemm (n, A, B, C); 152 | 153 | /* Benchmark n_iterations runs of square_gemm */ 154 | seconds = -wall_time(); 155 | for (int it = 0; it < n_iterations; ++it) 156 | square_gemm (n, A, B, C); 157 | seconds += wall_time(); 158 | 159 | /* compute Gflop/s rate */ 160 | Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds; 161 | } 162 | // gptlRet = GPTLstop("gemm"); 163 | // gptlRet = GPTLpr_file("outfile"); 164 | printf ("Size: %d\tGflop/s: %.3g (%d iter, %.3f seconds)\n", n, Gflops_s, n_iterations, seconds); 165 | res += Gflops_s; 166 | count += 1; 167 | /* Ensure that error does not exceed the theoretical error bound. */ 168 | 169 | /* C := A * B, computed with square_gemm */ 170 | memset (C, 0, n * n * sizeof(float)); 171 | square_gemm (n, A, B, C); 172 | /* Do not explicitly check that A and B were unmodified on square_gemm exit 173 | * - if they were, the following will most likely detect it: 174 | * C := C - A * B, computed with reference_gemm */ 175 | reference_gemm(n, -1., A, B, C); 176 | 177 | /* A := |A|, B := |B|, C := |C| */ 178 | absolute_value (A, n * n); 179 | absolute_value (B, n * n); 180 | absolute_value (C, n * n); 181 | 182 | /* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_gemm */ 183 | reference_gemm (n, -3.*FLT_EPSILON*n, A, B, C); 184 | 185 | /* If any element in C is positive, then something went wrong in square_gemm */ 186 | for (int i = 0; i < n * n; ++i) 187 | if (C[i] > 0) 188 | die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" ); 189 | } 190 | res /= count; 191 | printf("Average %lf\n", res); 192 | 193 | free (buf); 194 | 195 | return 0; 196 | } 197 | -------------------------------------------------------------------------------- /gemm-single/gemm-blocked.c: -------------------------------------------------------------------------------- 1 | #include "arm_neon.h" 2 | #include <stdio.h> // printf, fprintf 3 | #include <stdlib.h> // malloc, free, exit, posix_memalign 4 | #include <string.h> // memcpy, memset 5 | #include <stdint.h> // int64_t 6 | 7 | 8 | const char* gemm_desc = "Simple blocked gemm."; 9 | 10 | #define BLOCK_ROW 64 // cache-block extent in the M (row) dimension 11 | #define BLOCK_COL 128 // cache-block extent in the K dimension 12 | 13 | #define UNROLL_NUM 4 // the register tile is 4x4 14 | 15 | #define SIMD_UNROLL 32 // floats per packed chunk (8 k-steps x 4 lanes) 16 | #define SIMD_UNROLLD4 8 // SIMD_UNROLL / 4: k-steps handled per main-loop iteration 17 | 18 | static void print_matrix(int row, int col, float *A) { 19 | for(int i = 0; i < row; ++i) { 20 | for(int j = 0; j < col; ++j) { 21 | printf("%0.2f ", A[i + j * row]); 22 | } 23 | printf("\n"); 24 | } 25 | } 26 | 27 | #if !defined(BLOCK_SIZE) 28 | #define BLOCK_SIZE 96 29 | #endif 30 | 31 | #define min(a,b) (((a)<(b))?(a):(b)) 32 | 33 | 34 | void dot_mul(int n, int incX, float *A, float *B, float *C) { 35 | for (int i = 0; i < n; ++i) { 36 | *C += A[i * incX] * B[i]; 37 | } 38 | } 39 | 40 | 41 | 42 | #define UNROLL_ROW (UNROLL_NUM / 4) 43 | /* Compute a 4x4 tile of C += A*B from packed operands: A and B each supply 4 consecutive floats per k step; n is the K extent and ldc the leading dimension of C (lda and ldb are unused here). */ 44 | void dot_mul_square(int n, int lda, int ldb, int ldc, float *A, float *B, float *C) { 45 | register 
float32x4_t c_c0, c_c1, c_c2, c_c3, a_c0_i0, a_c0_i1, a_c0_i2, a_c0_i3, b_vi0_0, b_vi1_0, b_vi2_0, b_vi3_0, temp_v0, temp_v3, part1_c0, part1_c1, part1_c2, part1_c3; 46 | 47 | #if SIMD_UNROLL == 32 48 | register float32x4_t part2_c0, part2_c1, part2_c2, part2_c3, part3_c0, part3_c1, part3_c2, part3_c3; 49 | register float32x4_t a_c0_i4, a_c0_i5, b_vi4_0, b_vi5_0; 50 | register float32x4_t a_c0_i6, a_c0_i7, b_vi6_0, b_vi7_0; 51 | #endif 52 | 53 | register float32x4_t zero = {0}; 54 | part1_c0 = zero; part1_c1 = zero; part1_c2 = zero; part1_c3 = zero; 55 | #if SIMD_UNROLL == 32 56 | part2_c0 = zero; part2_c1 = zero; part2_c2 = zero; part2_c3 = zero; 57 | part3_c0 = zero; part3_c1 = zero; part3_c2 = zero; part3_c3 = zero; 58 | #endif 59 | c_c0 = vld1q_f32(C + 0 * ldc); c_c1 = vld1q_f32(C + 1 * ldc); 60 | c_c2 = vld1q_f32(C + 2 * ldc); c_c3 = vld1q_f32(C + 3 * ldc); 61 | 62 | int i; 63 | for (i = 0; i + SIMD_UNROLLD4 <= n; i += SIMD_UNROLLD4) { 64 | a_c0_i0 = vld1q_f32(A + i * 4); 65 | a_c0_i1 = vld1q_f32(A + i * 4 + 4); 66 | b_vi0_0 = vld1q_f32(B + 0); 67 | b_vi1_0 = vld1q_f32(B + 4); 68 | temp_v0 = vfmaq_laneq_f32(c_c0, a_c0_i0, b_vi0_0, 0); 69 | c_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i1, b_vi1_0, 0); 70 | 71 | temp_v3 = vfmaq_laneq_f32(c_c1, a_c0_i1, b_vi1_0, 1); 72 | c_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i0, b_vi0_0, 1); 73 | 74 | temp_v0 = vfmaq_laneq_f32(c_c2, a_c0_i0, b_vi0_0, 2); 75 | c_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i1, b_vi1_0, 2); 76 | 77 | temp_v3 = vfmaq_laneq_f32(c_c3, a_c0_i1, b_vi1_0, 3); 78 | c_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i0, b_vi0_0, 3); 79 | 80 | 81 | a_c0_i2 = vld1q_f32(A + i * 4 + 8); 82 | a_c0_i3 = vld1q_f32(A + i * 4 + 12); 83 | b_vi2_0 = vld1q_f32(B + 8); 84 | b_vi3_0 = vld1q_f32(B + 12); 85 | temp_v0 = vfmaq_laneq_f32(part1_c0, a_c0_i2, b_vi2_0, 0); 86 | part1_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i3, b_vi3_0, 0); 87 | 88 | temp_v3 = vfmaq_laneq_f32(part1_c1, a_c0_i3, b_vi3_0, 1); 89 | part1_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i2, b_vi2_0, 1); 90 | 91 | temp_v0 = vfmaq_laneq_f32(part1_c2, a_c0_i2, b_vi2_0, 2); 92 | part1_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i3, b_vi3_0, 2); 93 | 94 | temp_v3 = vfmaq_laneq_f32(part1_c3, a_c0_i3, b_vi3_0, 3); 95 | part1_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i2, b_vi2_0, 3); 96 | 97 | #if SIMD_UNROLL == 32 98 | a_c0_i4 = vld1q_f32(A + i * 4 + 16); 99 | a_c0_i5 = vld1q_f32(A + i * 4 + 20); 100 | b_vi4_0 = vld1q_f32(B + 16); 101 | b_vi5_0 = vld1q_f32(B + 20); 102 | temp_v0 = vfmaq_laneq_f32(part2_c0, a_c0_i4, b_vi4_0, 0); 103 | part2_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i5, b_vi5_0, 0); 104 | 105 | temp_v3 = vfmaq_laneq_f32(part2_c1, a_c0_i5, b_vi5_0, 1); 106 | part2_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i4, b_vi4_0, 1); 107 | 108 | temp_v0 = vfmaq_laneq_f32(part2_c2, a_c0_i4, b_vi4_0, 2); 109 | part2_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i5, b_vi5_0, 2); 110 | 111 | temp_v3 = vfmaq_laneq_f32(part2_c3, a_c0_i5, b_vi5_0, 3); 112 | part2_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i4, b_vi4_0, 3); 113 | 114 | 115 | a_c0_i6 = vld1q_f32(A + i * 4 + 24); 116 | a_c0_i7 = vld1q_f32(A + i * 4 + 28); 117 | b_vi6_0 = vld1q_f32(B + 24); 118 | b_vi7_0 = vld1q_f32(B + 28); 119 | temp_v0 = vfmaq_laneq_f32(part3_c0, a_c0_i6, b_vi6_0, 0); 120 | part3_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i7, b_vi7_0, 0); 121 | 122 | temp_v3 = vfmaq_laneq_f32(part3_c1, a_c0_i7, b_vi7_0, 1); 123 | part3_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i6, b_vi6_0, 1); 124 | 125 | temp_v0 = vfmaq_laneq_f32(part3_c2, a_c0_i6, b_vi6_0, 2); 126 | part3_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i7, b_vi7_0, 2); 127 | 128 
| temp_v3 = vfmaq_laneq_f32(part3_c3, a_c0_i7, b_vi7_0, 3); 129 | part3_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i6, b_vi6_0, 3); 130 | 131 | #endif 132 | B += SIMD_UNROLL; 133 | } 134 | 135 | #if SIMD_UNROLL == 32 136 | c_c0 = c_c0 + part1_c0 + part2_c0 + part3_c0; 137 | c_c1 = c_c1 + part1_c1 + part2_c1 + part3_c1; 138 | c_c2 = c_c2 + part1_c2 + part2_c2 + part3_c2; 139 | c_c3 = c_c3 + part1_c3 + part2_c3 + part3_c3; 140 | #else 141 | c_c0 = vaddq_f32(part1_c0, c_c0); 142 | c_c1 = vaddq_f32(part1_c1, c_c1); 143 | c_c2 = vaddq_f32(part1_c2, c_c2); 144 | c_c3 = vaddq_f32(part1_c3, c_c3); 145 | #endif 146 | 147 | float32x4_t b_vi0, b_vi1, b_vi2, b_vi3; 148 | for(; i < n; ++i) { 149 | a_c0_i0 = vld1q_f32(A + i * 4); 150 | b_vi0 = vld1q_dup_f32(B); b_vi1 = vld1q_dup_f32(B + 1); 151 | b_vi2 = vld1q_dup_f32(B + 2); b_vi3 = vld1q_dup_f32(B + 3); 152 | B += 4; 153 | c_c0 = vmlaq_f32(c_c0, a_c0_i0, b_vi0); 154 | c_c1 = vmlaq_f32(c_c1, a_c0_i0, b_vi1); 155 | c_c2 = vmlaq_f32(c_c2, a_c0_i0, b_vi2); 156 | c_c3 = vmlaq_f32(c_c3, a_c0_i0, b_vi3); 157 | } 158 | 159 | vst1q_f32(C + 0 * ldc, c_c0); vst1q_f32(C + 1 * ldc, c_c1); 160 | vst1q_f32(C + 2 * ldc, c_c2); vst1q_f32(C + 3 * ldc, c_c3); 161 | } 162 | 163 | static void pack_left_A(int K, int lda, float *A, float *packed_A) { 164 | float *dst = packed_A; 165 | int k; 166 | for (k = 0; k + SIMD_UNROLLD4 <= K; k += SIMD_UNROLLD4) { 167 | float *a0_k0_p = A + k * lda, *a0_k1_p = A + (k + 1) * lda, *a0_k2_p = A + (k + 2) * lda, *a0_k3_p = A + (k + 3) * lda; 168 | #if SIMD_UNROLL == 32 169 | float *a0_k4_p = A + (k + 4) * lda, *a0_k5_p = A + (k + 5) * lda, *a0_k6_p = A + (k + 6) * lda, *a0_k7_p = A + (k + 7) * lda; 170 | #endif 171 | *(dst + 0) = *(a0_k0_p + 0); 172 | *(dst + 1) = *(a0_k0_p + 1); 173 | *(dst + 2) = *(a0_k0_p + 2); 174 | *(dst + 3) = *(a0_k0_p + 3); 175 | *(dst + 4) = *(a0_k1_p + 0); 176 | *(dst + 5) = *(a0_k1_p + 1); 177 | *(dst + 6) = *(a0_k1_p + 2); 178 | *(dst + 7) = *(a0_k1_p + 3); 179 | *(dst + 8) = *(a0_k2_p + 0); 180 | *(dst + 9) = *(a0_k2_p + 1); 181 | *(dst + 10) = *(a0_k2_p + 2); 182 | *(dst + 11) = *(a0_k2_p + 3); 183 | *(dst + 12) = *(a0_k3_p + 0); 184 | *(dst + 13) = *(a0_k3_p + 1); 185 | *(dst + 14) = *(a0_k3_p + 2); 186 | *(dst + 15) = *(a0_k3_p + 3); 187 | #if SIMD_UNROLL == 32 188 | *(dst + 16) = *(a0_k4_p + 0); 189 | *(dst + 17) = *(a0_k4_p + 1); 190 | *(dst + 18) = *(a0_k4_p + 2); 191 | *(dst + 19) = *(a0_k4_p + 3); 192 | *(dst + 20) = *(a0_k5_p + 0); 193 | *(dst + 21) = *(a0_k5_p + 1); 194 | *(dst + 22) = *(a0_k5_p + 2); 195 | *(dst + 23) = *(a0_k5_p + 3); 196 | *(dst + 24) = *(a0_k6_p + 0); 197 | *(dst + 25) = *(a0_k6_p + 1); 198 | *(dst + 26) = *(a0_k6_p + 2); 199 | *(dst + 27) = *(a0_k6_p + 3); 200 | *(dst + 28) = *(a0_k7_p + 0); 201 | *(dst + 29) = *(a0_k7_p + 1); 202 | *(dst + 30) = *(a0_k7_p + 2); 203 | *(dst + 31) = *(a0_k7_p + 3); 204 | #endif 205 | dst += SIMD_UNROLL; 206 | // } 207 | } 208 | 209 | for (; k < K; k++) { 210 | *(dst + 0) = A[0 + k * lda]; 211 | *(dst + 1) = A[1 + k * lda]; 212 | *(dst + 2) = A[2 + k * lda]; 213 | *(dst + 3) = A[3 + k * lda]; 214 | dst += 4; 215 | } 216 | } 217 | 218 | 219 | 220 | static void pack_right_B(int K, int ldb, float *B, float *packed_B) { 221 | float *dst = packed_B; 222 | int k; 223 | for(k = 0; k + SIMD_UNROLLD4 <= K; k += SIMD_UNROLLD4) { 224 | *(dst + 0) = *(B + k + 0 + 0 * ldb); 225 | *(dst + 1) = *(B + k + 0 + 1 * ldb); 226 | *(dst + 2) = *(B + k + 0 + 2 * ldb); 227 | *(dst + 3) = *(B + k + 0 + 3 * ldb); 228 | *(dst + 4) = *(B + k + 1 + 0 * ldb); 229 | *(dst + 5) = *(B 
+ k + 1 + 1 * ldb); 230 | *(dst + 6) = *(B + k + 1 + 2 * ldb); 231 | *(dst + 7) = *(B + k + 1 + 3 * ldb); 232 | *(dst + 8) = *(B + k + 2 + 0 * ldb); 233 | *(dst + 9) = *(B + k + 2 + 1 * ldb); 234 | *(dst + 10) = *(B + k + 2 + 2 * ldb); 235 | *(dst + 11) = *(B + k + 2 + 3 * ldb); 236 | *(dst + 12) = *(B + k + 3 + 0 * ldb); 237 | *(dst + 13) = *(B + k + 3 + 1 * ldb); 238 | *(dst + 14) = *(B + k + 3 + 2 * ldb); 239 | *(dst + 15) = *(B + k + 3 + 3 * ldb); 240 | #if SIMD_UNROLL == 32 241 | *(dst + 16) = *(B + k + 4 + 0 * ldb); 242 | *(dst + 17) = *(B + k + 4 + 1 * ldb); 243 | *(dst + 18) = *(B + k + 4 + 2 * ldb); 244 | *(dst + 19) = *(B + k + 4 + 3 * ldb); 245 | *(dst + 20) = *(B + k + 5 + 0 * ldb); 246 | *(dst + 21) = *(B + k + 5 + 1 * ldb); 247 | *(dst + 22) = *(B + k + 5 + 2 * ldb); 248 | *(dst + 23) = *(B + k + 5 + 3 * ldb); 249 | *(dst + 24) = *(B + k + 6 + 0 * ldb); 250 | *(dst + 25) = *(B + k + 6 + 1 * ldb); 251 | *(dst + 26) = *(B + k + 6 + 2 * ldb); 252 | *(dst + 27) = *(B + k + 6 + 3 * ldb); 253 | *(dst + 28) = *(B + k + 7 + 0 * ldb); 254 | *(dst + 29) = *(B + k + 7 + 1 * ldb); 255 | *(dst + 30) = *(B + k + 7 + 2 * ldb); 256 | *(dst + 31) = *(B + k + 7 + 3 * ldb); 257 | #endif 258 | dst += SIMD_UNROLL; 259 | 260 | } 261 | for (; k < K; ++k) { 262 | *(dst + 0) = *(B + k + 0 * ldb); 263 | *(dst + 1) = *(B + k + 1 * ldb); 264 | *(dst + 2) = *(B + k + 2 * ldb); 265 | *(dst + 3) = *(B + k + 3 * ldb); 266 | dst += 4; 267 | } 268 | } 269 | /* Multiply an M x K block of A by a K x N panel of B into C, one 4x4 register tile at a time. B is packed once per k-block (when should_pack_B is set); each A block is packed on the first j pass. */ 270 | static void do_block(int M, int N, int K, int lda, int ldb, int ldc, float *A, float *B, float *C, float *packed_A, float *packed_B, int should_pack_B) { 271 | int j; 272 | for (j = 0; j + UNROLL_NUM <= N; j += UNROLL_NUM) { 273 | if (should_pack_B) { 274 | pack_right_B(K, ldb, B + j * ldb, packed_B + j * K); 275 | } 276 | int i; 277 | for (i = 0; i + UNROLL_NUM <= M; i += UNROLL_NUM) { 278 | if (j == 0) { 279 | pack_left_A(K, lda, A + i, packed_A + i * K); 280 | } 281 | dot_mul_square(K, SIMD_UNROLL, 1, ldc, packed_A + i * K, packed_B + j * K, C + i + j * ldc); 282 | } 283 | 284 | 285 | } 286 | } 287 | 288 | /* Round m and n up to multiples of UNROLL_NUM; a nonzero return means padding is required. */ 289 | int should_padding(int m, int n, int *new_m_ptr, int *new_n_ptr) { 290 | *new_m_ptr = (m + UNROLL_NUM - 1) / UNROLL_NUM * UNROLL_NUM; 291 | *new_n_ptr = (n + UNROLL_NUM - 1) / UNROLL_NUM * UNROLL_NUM; 292 | return m != *new_m_ptr || n != *new_n_ptr; 293 | } 294 | 295 | static int64_t tempDur0 = 0, tempDur1 = 0; /* leftover timing scratch; currently unused */ 296 | /* Copy the m x n matrix A (leading dimension lda) into a zero-padded, page-aligned new_m x new_n buffer. */ 297 | float *get_padding_matrix(int lda, int m, int n, int new_m, int new_n, const float *A) { 298 | float *new_A = NULL; 299 | int ret = posix_memalign((void**)&new_A, 4096, sizeof(float) * new_m * new_n); 300 | if (ret != 0) { 301 | fprintf(stderr, "Cannot allocate an aligned padding matrix!\n"); 302 | exit(-1); 303 | } 304 | int j; 305 | for (j = 0; j < n; ++j) { 306 | memcpy(new_A + j * new_m, A + j * lda, sizeof(float) * m); 307 | memset(new_A + m + j * new_m, 0, sizeof(float) * (new_m - m)); 308 | } 309 | for (;j < new_n; ++j) { 310 | memset(new_A + j * new_m, 0, sizeof(float) * new_m); 311 | } 312 | return new_A; 313 | } 314 | /* Copy the top-left m x n corner of padding_A back into A. */ 315 | void back_padding(int lda, int m, int n, int new_m, int new_n, float *A, float *padding_A) { 316 | for (int j = 0; j < n; ++j) { 317 | memcpy(A + j * lda, padding_A + j * new_m, sizeof(float) * m); 318 | } 319 | } 320 | 321 | /* C += A * B for column-major n x n matrices: pad all dimensions up to multiples of 4, then pack and sweep K x M cache blocks. */ 322 | 323 | void square_gemm (int n, float* A, float* B, float* C) { 324 | 325 | float *padding_A = A, *padding_B = B, *padding_C = C; 326 | int newM, newN, newK; 327 | int should_pad_A = should_padding(n, n, &newM, &newK); 328 | int should_pad_B = should_padding(n, n, &newK, 
&newN); 329 | int should_pad_C = should_padding(n, n, &newM, &newN); 330 | 331 | 332 | float *packed_A, *packed_B; 333 | int tempRet1 = posix_memalign((void**)&packed_A, 4096, BLOCK_COL * BLOCK_ROW * sizeof(float)); 334 | int tempRet2 = posix_memalign((void**)&packed_B, 4096, BLOCK_COL * newN * sizeof(float)); 335 | 336 | 337 | if (tempRet1 != 0 || tempRet2 != 0) { 338 | fprintf(stderr, "Cannot allocate aligned packing buffers!\n"); 339 | exit(-1); 340 | } 341 | 342 | if (should_pad_A) { 343 | padding_A = get_padding_matrix(n, n, n, newM, newK, A); 344 | } 345 | if (should_pad_B) { 346 | padding_B = get_padding_matrix(n, n, n, newK, newN, B); 347 | } 348 | if (should_pad_C) { 349 | padding_C = get_padding_matrix(n, n, n, newM, newN, C); 350 | } 351 | 352 | /* Sweep K x M cache blocks; each do_block covers the full N extent, and B is packed only on the first i-block of each k-block (i == 0). */ 353 | for (int k = 0; k < newK; k += BLOCK_COL) { 354 | int K = min(newK - k, BLOCK_COL); 355 | for (int i = 0; i < newM; i += BLOCK_ROW) { 356 | int M = min(newM - i, BLOCK_ROW); 357 | int N = newN; 358 | 359 | do_block(M, N, K, newM, newK, newM, padding_A + i + k * newM, padding_B + k, padding_C + i, packed_A, packed_B, i == 0); 360 | } 361 | } 362 | 363 | 364 | free(packed_A); 365 | free(packed_B); 366 | 367 | 368 | if (should_pad_A) 369 | free(padding_A); 370 | if (should_pad_B) 371 | free(padding_B); 372 | 373 | 374 | if (should_pad_C) { 375 | back_padding(n, n, n, newM, newN, C, padding_C); 376 | free(padding_C); 377 | } 378 | 379 | } -------------------------------------------------------------------------------- /gemm-single/outfile: -------------------------------------------------------------------------------- 1 | GPTL version info: 8.0.3 2 | WARNING: GPTLerror was called at least once during the run. 3 | Please examine your output for error messages beginning with GPTL... 4 | GPTL was built without threading 5 | HAVE_LIBMPI was true 6 | ENABLE_PMPI was true 7 | HAVE_PAPI was true 8 | PAPI event multiplexing was OFF 9 | Description of printed events (PAPI and derived): 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | PAPI events enabled (including those required for derived events): 24 | CPU_CYCLES 25 | L1D_CACHE 26 | L1D_CACHE_REFILL 27 | L2D_CACHE_ACCESS 28 | MEM_STALL_ANYLOAD 29 | MEM_STALL_ANYSTORE 30 | 31 | ENABLE_NESTEDOMP was false 32 | Autoprofiling capability was enabled with backtrace 33 | Underlying timing routine was gettimeofday. 
34 | GPTLget_overhead: using hash entry 896=gemm for getentry estimate 35 | Total overhead of 1 GPTL start or GPTLstop call=9.94e-07 seconds 36 | Components are as follows: 37 | Fortran layer: 1.0e-09 = 0.1% of total 38 | Get thread number: 7.0e-09 = 0.7% of total 39 | Generate hash index: 7.0e-09 = 0.7% of total 40 | Find hashtable entry: 1.2e-08 = 1.2% of total 41 | Underlying timing routine: 4.1e-08 = 4.1% of total 42 | Misc start/stop functions: 3.0e-09 = 0.3% of total 43 | Read PAPI counters: 9.2e-07 = 92.9% of total 44 | 45 | Overhead of backtrace (invoked once per auto-instrumented start entry)=4e-06 seconds 46 | NOTE: If GPTL is called from C not Fortran, the 'Fortran layer' overhead is zero 47 | NOTE: For calls to GPTLstart_handle()/GPTLstop_handle(), the 'Generate hash index' overhead is zero 48 | NOTE: For auto-instrumented calls, the cost of generating the hash index plus finding 49 | the hashtable entry is 0.0e+00 not the 1.9e-08 portion taken by GPTLstart 50 | NOTE: Each hash collision roughly doubles the 'Find hashtable entry' cost of that timer 51 | 52 | If overhead stats are printed, they are the columns labeled self_OH and parent_OH 53 | self_OH is estimated as 2X the Fortran layer cost (start+stop) plust the cost of 54 | a single call to the underlying timing routine. 55 | parent_OH is the overhead for the named timer which is subsumed into its parent. 56 | It is estimated as the cost of a single GPTLstart()/GPTLstop() pair. 57 | Print method was most_frequent. 58 | 59 | If a AVG_MPI_BYTES field is present, it is an estimate of the per-call 60 | average number of bytes handled by that process. 61 | If timers beginning with sync_ are present, it means MPI synchronization was turned on. 62 | 63 | If a '%_of' field is present, it is w.r.t. the first timer for thread 0. 64 | If a 'e6_per_sec' field is present, it is in millions of PAPI counts per sec. 65 | 66 | A '*' in column 1 below means the timer had multiple parents, though the values 67 | printed are for all calls. Multiple parent stats appear later in the file in the 68 | section titled 'Multiple parent info' 69 | A '!' in column 1 means the timer is currently ON and the printed timings are only 70 | valid as of the previous GPTLstop. '!' overrides '*' if the region had multiple 71 | parents and was currently ON. 
72 | 73 | Process size=93.312500 MB rss=57.625000 MB 74 | 75 | Stats for thread 0: 76 | Called Recurse Wall max min selfOH parentOH AVG_MPI_BYTES YCLES ACHE ACHE_REF ACHE_ACC TALL_ANY TALL_ANY 77 | gemm 28 - 17.131 8.282 0.208 0.000 0.000 - 4.45e+10 2.03e+10 3.77e+08 9.99e+09 8.88e+09 1583 78 | 79 | Overhead sum = 5.56e-05 wallclock seconds 80 | Total calls = 28 81 | [The column headings above are GPTL's truncations of the enabled PAPI event names: CPU_CYCLES, L1D_CACHE, L1D_CACHE_REFILL, L2D_CACHE_ACCESS, MEM_STALL_ANYLOAD, MEM_STALL_ANYSTORE. The remainder of the file repeats an identical empty block for each of threads 1-63 (Overhead sum = 0 wallclock seconds, Total calls = 0), then a "Same stats sorted by timer for threaded regions" section listing OVERHEAD.000 = 5.56e-05, OVERHEAD.001 through OVERHEAD.063 = 0, and OVERHEAD.SUM = 5.56e-05 wallclock seconds, and finally one empty "long name translations (empty when no auto-instrumentation)" line per thread. Those repeated blocks are omitted here; the source dump is truncated partway through them.]

Total GPTL memory usage = 1113.42 KB
Components:
  Hashmem             = 1047.56 KB
  Regionmem           = 0.32 KB (papimem portion = 0.096 KB)
  Parent/child arrays = 0.008 KB
  Callstackmem        = 65.536 KB

Thread mapping:
GPTLthreadid[0] = 0
[GPTLthreadid[1] through GPTLthreadid[63] elided: all = -1.]
--------------------------------------------------------------------------------
/gemm-single/gemm-naive.c:
--------------------------------------------------------------------------------
1 | #include "arm_neon.h"
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include <stdint.h>
6 | #include <time.h>
7 | 
8 | #define BLOCK_ROW 64
9 | #define BLOCK_COL 128
10 | 
11 | #define min(a,b) (((a)<(b))?(a):(b))
12 | 
13 | const char* gemm_desc = "Naive, three-loop gemm.";
14 | 
15 | static void naive_transpose(int row, int col, float *A) {
16 |     for (int i = 1; i < row; ++i) {
17 |         for (int j = 0; j < i; ++j) {
18 |             int ij_index = i + j * row, ji_index = j + i * row;
19 |             float tmp = A[ij_index];
20 |             A[ij_index] = A[ji_index];
21 |             A[ji_index] = tmp;
22 |         }
23 |     }
24 | }
25 | 
26 | void dot_mul(int n, int incX, float *A, float *B, float *C) {
27 |     for (int i = 0; i < n; ++i) {
28 |         *C += A[i * incX] * B[i];
29 |     }
30 | }
31 | 
32 | #define NANOSECONDS_IN_SECOND 1000000000LL
33 | 
34 | static inline int64_t wall_time_ns ()
35 | {
36 | #ifdef GETTIMEOFDAY /* requires <sys/time.h>; not defined in this build */
37 |     struct timeval t;
38 |     gettimeofday (&t, NULL);
39 |     return t.tv_sec * NANOSECONDS_IN_SECOND + t.tv_usec * 1000LL;
40 | #else
41 |     struct timespec t;
42 |     clock_gettime (CLOCK_MONOTONIC, &t);
43 |     return t.tv_sec * NANOSECONDS_IN_SECOND + t.tv_nsec;
44 | #endif
45 | }
46 | 
47 | #define UNROLL_NUM 4
48 | 
49 | #define SIMD_UNROLL 32
50 | #define SIMD_UNROLLD4 8
51 | 
52 | void dot_mul_unroll(int n, int lda, int ldb, int ldc, float *A, float *B, float *C, float *old_A, float *old_B, float *old_C) {
53 |     // dot_mul(n, lda, A, B, C);
54 |     // dot_mul(n, lda, A, B + ldb, C + ldc);
55 |     // dot_mul(n, lda, A, B + 2 * ldb, C + 2 * ldc);
56 |     // dot_mul(n, lda, A, B + 3 * ldb, C + 3 * ldc);
57 |     register float c00, c01, c02, c03, c04, c05, c06, c07, a0i;
58 |     c00 = 0.0, c01 = 0.0, c02 = 0.0, c03 = 0.0, c04 = 0.0, c05 = 0.0, c06 = 0.0, c07 = 0.0;
59 |     register float *bi0_p, *bi1_p, *bi2_p, *bi3_p, *bi4_p, *bi5_p, *bi6_p, *bi7_p;
60 |     bi0_p = B, bi1_p = B + ldb, bi2_p = B + 2 * ldb, bi3_p = B + 3 * ldb, bi4_p = B + 4 * ldb, bi5_p = B + 5 * ldb, bi6_p = B + 6 * ldb, bi7_p = B + 7 * ldb;
61 | 
62 |     for (int i = 0; i < n; ++i) {
63 |         // for (int x = 0; x < UNROLL_NUM; ++x) {
64 |         //     // if (old_C - C + x * ldc >= n * n) {
65 |         //     //     printf("C too large, index: %d, n: %d\n", old_C - C + x * ldc, n);
66 |         //     //     // return;
67 |         //     // }
68 |         //     // if (old_B - B + i + x * ldb >= n * n) {
69 |         //     //     printf("B too large, index: %d, n: %d\n", old_B - B + i + x * ldb, n);
70 |         //     //     // return;
71 |         //     // }
72 |         //     // if (old_A - A + i * lda >= n * n) {
73 |         //     //     printf("A too large, index: %d, n: %d\n", old_A - A + i * lda, n);
74 |         //     //     // return;
75 |         //     // }
76 |         //     C[x * ldc] += A[i * lda] * B[i + x * ldb];
77 |         // }
78 |         // int ilda = i * lda;
79 |         // C[0] += A[ilda] * B[i];
80 |         // C[ldc] += A[ilda] * B[i + ldb];
81 |         // C[2 * ldc] += A[ilda] * B[i + 2 * ldb];
82 |         // C[3 * ldc] += A[ilda] * B[i + 3 * ldb];
83 |         // C[4 * ldc] += A[ilda] * B[i + 4 * ldb];
84 |         // C[5 * ldc] += A[ilda] * B[i + 5 * ldb];
85 |         // C[6 * ldc] += A[ilda] * B[i + 6 * ldb];
86 |         // C[7 * ldc] += A[ilda] * B[i + 7 * ldb];
87 |         a0i = A[i * lda];
88 |         c00 += a0i * *bi0_p++;
89 |         c01 += a0i * *bi1_p++;
90 |         c02 += a0i * *bi2_p++;
91 |         c03 += a0i * *bi3_p++;
92 |         c04 += a0i * *bi4_p++;
93 |         c05 += a0i * *bi5_p++;
94 |         c06 += a0i * *bi6_p++;
95 |         c07 += a0i * *bi7_p++;
96 |     }
97 |     C[0] += c00;       /* accumulate (C := C + A*B), not overwrite */
98 |     C[ldc] += c01;
99 |     C[2 * ldc] += c02;
100 |     C[3 * ldc] += c03;
101 |     C[4 * ldc] += c04;
102 |     C[5 * ldc] += c05;
103 |     C[6 * ldc] += c06;
104 |     C[7 * ldc] += c07;
105 | }
106 | 
107 | 
108 | #define UNROLL_ROW (UNROLL_NUM / 4)
109 | 
110 | void dot_mul_square(int n, int lda, int ldb, int ldc, float *A, float *B, float *C) {
111 |     // for (int y = 0; y < UNROLL_NUM; ++y) {
112 |     //     for (int x = 0; x < UNROLL_NUM; ++x) {
113 |     //         dot_mul(n, lda, A + y, B + x * ldb, C + y + x * ldc);
114 |     //     }
115 |     // }
116 |     // dot_mul(n, lda, A, B, C);
117 |     // dot_mul(n, lda, A, B + 1 * ldb, C + 1 * ldc);
118 |     // dot_mul(n, lda, A, B + 2 * ldb, C + 2 * ldc);
119 |     // dot_mul(n, lda, A, B + 3 * ldb, C + 3 * ldc);
120 | 
121 |     // dot_mul(n, lda, A + 1, B + 0 * ldb, C + 1 + 0 * ldc);
122 |     // dot_mul(n, lda, A + 1, B + 1 * ldb, C + 1 + 1 * ldc);
123 |     // dot_mul(n, lda, A + 1, B + 2 * ldb, C + 1 + 2 * ldc);
124 |     // dot_mul(n, lda, A + 1, B + 3 * ldb, C + 1 + 3 * ldc);
125 | 
126 |     // dot_mul(n, lda, A + 2, B + 0 * ldb, C + 2 + 0 * ldc);
127 |     // dot_mul(n, lda, A + 2, B + 1 * ldb, C + 2 + 1 * ldc);
128 |     // dot_mul(n, lda, A + 2, B + 2 * ldb, C + 2 + 2 * ldc);
129 |     // dot_mul(n, lda, A + 2, B + 3 * ldb, C + 2 + 3 * ldc);
130 | 
131 |     // dot_mul(n, lda, A + 3, B + 0 * ldb, C + 3 + 0 * ldc);
132 |     // dot_mul(n, lda, A + 3, B + 1
* ldb, C + 3 + 1 * ldc); 133 | // dot_mul(n, lda, A + 3, B + 2 * ldb, C + 3 + 2 * ldc); 134 | // dot_mul(n, lda, A + 3, B + 3 * ldb, C + 3 + 3 * ldc); 135 | 136 | // register float c00 = 0, c01 = 0, c02 = 0, c03 = 0, c10 = 0, c11 = 0, 137 | // c12 = 0, c13 = 0, c20 = 0, c21 = 0, c22 = 0, c23 = 0, c30 = 0, c31 = 0, c32 = 0, c33 = 0; 138 | // register float a0i, a1i, a2i, a3i; 139 | // register float bi0, bi1, bi2, bi3; 140 | // printf("use micro kernel\n"); 141 | // printf("init c_c3: "); 142 | // for(int i = 0; i < 4; ++i) { 143 | // printf("%.2f ", C[3 * ldc + i]); 144 | // } 145 | // printf("\n\n"); 146 | register float32x4_t c_c0, c_c1, c_c2, c_c3, a_c0_i0, a_c0_i1, a_c0_i2, a_c0_i3, b_vi0_0, b_vi1_0, b_vi2_0, b_vi3_0, temp_v0, temp_v3, part1_c0, part1_c1, part1_c2, part1_c3; 147 | 148 | #if SIMD_UNROLL == 32 149 | register float32x4_t part2_c0, part2_c1, part2_c2, part2_c3, part3_c0, part3_c1, part3_c2, part3_c3; 150 | register float32x4_t a_c0_i4, a_c0_i5, b_vi4_0, b_vi5_0; 151 | register float32x4_t a_c0_i6, a_c0_i7, b_vi6_0, b_vi7_0; 152 | #endif 153 | 154 | register float32x4_t zero = {0}; 155 | // float32x4_t c_ci[UNROLL_NUM * UNROLL_ROW], a_rxi[UNROLL_ROW], b_vi[UNROLL_NUM]; 156 | // c_c0 = vmovq_n_f32(0.0), c_c1 = vmovq_n_f32(0.0), c_c2 = vmovq_n_f32(0.0), c_c3 = vmovq_n_f32(0.0); 157 | part1_c0 = zero; part1_c1 = zero; part1_c2 = zero; part1_c3 = zero; 158 | #if SIMD_UNROLL == 32 159 | part2_c0 = zero; part2_c1 = zero; part2_c2 = zero; part2_c3 = zero; 160 | part3_c0 = zero; part3_c1 = zero; part3_c2 = zero; part3_c3 = zero; 161 | #endif 162 | c_c0 = vld1q_f32(C + 0 * ldc); c_c1 = vld1q_f32(C + 1 * ldc); 163 | c_c2 = vld1q_f32(C + 2 * ldc); c_c3 = vld1q_f32(C + 3 * ldc); 164 | 165 | // for (int x = 0; x < UNROLL_ROW; ++x) { 166 | // for (int y = 0; y < UNROLL_NUM; ++y) { 167 | // c_ci[x * UNROLL_NUM + y] = vld1q_f32(C + x * 4 + y * ldc); 168 | // } 169 | // } 170 | // register float *bi0_p, *bi1_p, *bi2_p, *bi3_p; 171 | // float *bix_p[UNROLL_NUM]; 172 | 173 | // for (int x = 0; x < UNROLL_NUM; ++x) { 174 | // bix_p[x] = B + x * ldb; 175 | // } 176 | // bi0_p = B; bi1_p = B + 1 * ldb; bi2_p = B + 2 * ldb; bi3_p = B + 3 * ldb; 177 | // printf("Packed A: "); 178 | // for (int j = 0; j < 20; ++j) { 179 | // printf("%.2f ", A[j]); 180 | // } 181 | // printf("\n\n"); 182 | // printf("\nPacked B: "); 183 | // for (int j = 0; j < 20; ++j) { 184 | // printf("%.2f ", B[j]); 185 | // } 186 | // printf("\n\n"); 187 | int i; 188 | for (i = 0; i + SIMD_UNROLLD4 <= n; i += SIMD_UNROLLD4) { 189 | // int ilda = i * lda; 190 | // for(int y = 0; y < UNROLL_NUM; ++y) { 191 | // for(int x = 0; x < UNROLL_NUM; ++x) { 192 | // C[y + x * ldc] += A[y + ilda] * B[i + x * ldb]; 193 | // } 194 | // } 195 | // for (int x = 0; x < UNROLL_ROW; ++ x) { 196 | // a_rxi[x] = vld1q_f32(A + 4 * x + i * lda); 197 | // } 198 | a_c0_i0 = vld1q_f32(A + i * 4); 199 | a_c0_i1 = vld1q_f32(A + i * 4 + 4); 200 | b_vi0_0 = vld1q_f32(B + 0); 201 | b_vi1_0 = vld1q_f32(B + 4); 202 | temp_v0 = vfmaq_laneq_f32(c_c0, a_c0_i0, b_vi0_0, 0); 203 | // temp_v2 = vmulq_laneq_f32(a_c0_i1, b_vi1_0, 0); 204 | // c_c0 = vaddq_f32(temp_v0, temp_v2); 205 | c_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i1, b_vi1_0, 0); 206 | 207 | temp_v3 = vfmaq_laneq_f32(c_c1, a_c0_i1, b_vi1_0, 1); 208 | // temp_v1 = vmulq_laneq_f32(a_c0_i0, b_vi0_0, 1); 209 | // c_c1 = vaddq_f32(temp_v1, temp_v3); 210 | c_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i0, b_vi0_0, 1); 211 | 212 | temp_v0 = vfmaq_laneq_f32(c_c2, a_c0_i0, b_vi0_0, 2); 213 | // temp_v2 = vmulq_laneq_f32(a_c0_i1, 
b_vi1_0, 2); 214 | // c_c2 = vaddq_f32(temp_v0, temp_v2); 215 | c_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i1, b_vi1_0, 2); 216 | 217 | temp_v3 = vfmaq_laneq_f32(c_c3, a_c0_i1, b_vi1_0, 3); 218 | // temp_v1 = vmulq_laneq_f32(a_c0_i0, b_vi0_0, 3); 219 | // c_c3 = vaddq_f32(temp_v1, temp_v3); 220 | c_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i0, b_vi0_0, 3); 221 | 222 | 223 | a_c0_i2 = vld1q_f32(A + i * 4 + 8); 224 | a_c0_i3 = vld1q_f32(A + i * 4 + 12); 225 | b_vi2_0 = vld1q_f32(B + 8); 226 | b_vi3_0 = vld1q_f32(B + 12); 227 | temp_v0 = vfmaq_laneq_f32(part1_c0, a_c0_i2, b_vi2_0, 0); 228 | // temp_v2 = vmulq_laneq_f32(a_c0_i3, b_vi3_0, 0); 229 | // part1_c0 = vaddq_f32(temp_v0, temp_v2); 230 | part1_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i3, b_vi3_0, 0); 231 | 232 | temp_v3 = vfmaq_laneq_f32(part1_c1, a_c0_i3, b_vi3_0, 1); 233 | // temp_v1 = vmulq_laneq_f32(a_c0_i2, b_vi2_0, 1); 234 | // part1_c1 = vaddq_f32(temp_v1, temp_v3); 235 | part1_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i2, b_vi2_0, 1); 236 | 237 | temp_v0 = vfmaq_laneq_f32(part1_c2, a_c0_i2, b_vi2_0, 2); 238 | // temp_v2 = vmulq_laneq_f32(a_c0_i3, b_vi3_0, 2); 239 | // part1_c2 = vaddq_f32(temp_v0, temp_v2); 240 | part1_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i3, b_vi3_0, 2); 241 | 242 | temp_v3 = vfmaq_laneq_f32(part1_c3, a_c0_i3, b_vi3_0, 3); 243 | // temp_v1 = vmulq_laneq_f32(a_c0_i2, b_vi2_0, 3); 244 | // part1_c3 = vaddq_f32(temp_v1, temp_v3); 245 | part1_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i2, b_vi2_0, 3); 246 | 247 | #if SIMD_UNROLL == 32 248 | a_c0_i4 = vld1q_f32(A + i * 4 + 16); 249 | a_c0_i5 = vld1q_f32(A + i * 4 + 20); 250 | b_vi4_0 = vld1q_f32(B + 16); 251 | b_vi5_0 = vld1q_f32(B + 20); 252 | temp_v0 = vfmaq_laneq_f32(part2_c0, a_c0_i4, b_vi4_0, 0); 253 | part2_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i5, b_vi5_0, 0); 254 | 255 | temp_v3 = vfmaq_laneq_f32(part2_c1, a_c0_i5, b_vi5_0, 1); 256 | part2_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i4, b_vi4_0, 1); 257 | 258 | temp_v0 = vfmaq_laneq_f32(part2_c2, a_c0_i4, b_vi4_0, 2); 259 | part2_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i5, b_vi5_0, 2); 260 | 261 | temp_v3 = vfmaq_laneq_f32(part2_c3, a_c0_i5, b_vi5_0, 3); 262 | part2_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i4, b_vi4_0, 3); 263 | 264 | 265 | a_c0_i6 = vld1q_f32(A + i * 4 + 24); 266 | a_c0_i7 = vld1q_f32(A + i * 4 + 28); 267 | b_vi6_0 = vld1q_f32(B + 24); 268 | b_vi7_0 = vld1q_f32(B + 28); 269 | temp_v0 = vfmaq_laneq_f32(part3_c0, a_c0_i6, b_vi6_0, 0); 270 | part3_c0 = vfmaq_laneq_f32(temp_v0, a_c0_i7, b_vi7_0, 0); 271 | 272 | temp_v3 = vfmaq_laneq_f32(part3_c1, a_c0_i7, b_vi7_0, 1); 273 | part3_c1 = vfmaq_laneq_f32(temp_v3, a_c0_i6, b_vi6_0, 1); 274 | 275 | temp_v0 = vfmaq_laneq_f32(part3_c2, a_c0_i6, b_vi6_0, 2); 276 | part3_c2 = vfmaq_laneq_f32(temp_v0, a_c0_i7, b_vi7_0, 2); 277 | 278 | temp_v3 = vfmaq_laneq_f32(part3_c3, a_c0_i7, b_vi7_0, 3); 279 | part3_c3 = vfmaq_laneq_f32(temp_v3, a_c0_i6, b_vi6_0, 3); 280 | 281 | #endif 282 | 283 | // a_ri = vld1q_f32(A); 284 | // A += UNROLL_NUM; 285 | 286 | 287 | 288 | // b_vi0 = vld1q_dup_f32(B); b_vi1 = vld1q_dup_f32(B + 1); 289 | // b_vi2 = vld1q_dup_f32(B + 2); b_vi3 = vld1q_dup_f32(B + 3); 290 | 291 | // b_vi0 = vld1q_dup_f32(bi0_p); b_vi1 = vld1q_dup_f32(bi1_p); 292 | // b_vi2 = vld1q_dup_f32(bi2_p); b_vi3 = vld1q_dup_f32(bi3_p); 293 | // for(int x = 0; x < UNROLL_NUM; ++x) { 294 | // b_vi[x] = vld1q_dup_f32(bix_p[x]++); 295 | // } 296 | 297 | // a0i = A[i * lda]; a1i = A[1 + i * lda]; a2i = A[2 + i * lda]; a3i = A[3 + i * lda]; 298 | // bi0 = *bi0_p++; bi1 = *bi1_p++; bi2 = *bi2_p++; bi3 = *bi3_p++; 299 | 300 | // 
c_c0 = vaddq_f32(vaddq_f32(vmulq_laneq_f32(a_c0_i0, b_vi0_0, 0), vmulq_laneq_f32(a_c0_i1, b_vi1_0, 0)) , vaddq_f32(vmulq_laneq_f32(a_c0_i2, b_vi2_0, 0), vfmaq_laneq_f32(c_c0, a_c0_i3, b_vi3_0, 0))); 301 | 302 | // c_c1 = vaddq_f32(vaddq_f32(vmulq_laneq_f32(a_c0_i0, b_vi0_0, 1), vmulq_laneq_f32(a_c0_i1, b_vi1_0, 1)) , vaddq_f32(vmulq_laneq_f32(a_c0_i2, b_vi2_0, 1), vfmaq_laneq_f32(c_c1, a_c0_i3, b_vi3_0, 1))); 303 | 304 | // c_c2 = vaddq_f32(vaddq_f32(vmulq_laneq_f32(a_c0_i0, b_vi0_0, 2), vmulq_laneq_f32(a_c0_i1, b_vi1_0, 2)) , vaddq_f32(vmulq_laneq_f32(a_c0_i2, b_vi2_0, 2), vfmaq_laneq_f32(c_c2, a_c0_i3, b_vi3_0, 2))); 305 | 306 | // c_c3 = vaddq_f32(vaddq_f32(vmulq_laneq_f32(a_c0_i0, b_vi0_0, 3), vmulq_laneq_f32(a_c0_i1, b_vi1_0, 3)) , vaddq_f32(vmulq_laneq_f32(a_c0_i2, b_vi2_0, 3), vfmaq_laneq_f32(c_c3, a_c0_i3, b_vi3_0, 3))); 307 | B += SIMD_UNROLL; 308 | // c_c0 = vmlaq_f32(c_c0, a_ri, b_vi0); 309 | // c_c1 = vmlaq_f32(c_c1, a_ri, b_vi1); 310 | // c_c2 = vmlaq_f32(c_c2, a_ri, b_vi2); 311 | // c_c3 = vmlaq_f32(c_c3, a_ri, b_vi3); 312 | 313 | 314 | 315 | // bi0_p += UNROLL_NUM; bi1_p += UNROLL_NUM; bi2_p += UNROLL_NUM; bi3_p += UNROLL_NUM; 316 | 317 | // c00 += a0i * bi0; 318 | // c10 += a1i * bi0; 319 | // c20 += a2i * bi0; 320 | // c30 += a3i * bi0; 321 | 322 | // c01 += a0i * bi1; 323 | // c11 += a1i * bi1; 324 | // c21 += a2i * bi1; 325 | // c31 += a3i * bi1; 326 | 327 | // c02 += a0i * bi2; 328 | // c12 += a1i * bi2; 329 | // c22 += a2i * bi2; 330 | // c32 += a3i * bi2; 331 | 332 | // c03 += a0i * bi3; 333 | // c13 += a1i * bi3; 334 | // c23 += a2i * bi3; 335 | // c33 += a3i * bi3; 336 | 337 | // C[0] += A[i * lda] * B[i]; 338 | // C[ldc] += A[i * lda] * B[i + 1 * ldb]; 339 | // C[2 * ldc] += A[i * lda] * B[i + 2 * ldb]; 340 | // C[3 * ldc] += A[i * lda] * B[i + 3 * ldb]; 341 | 342 | // C[1 + 0 * ldc] += A[1 + i * lda] * B[i + 0 * ldb]; 343 | // C[1 + 1 * ldc] += A[1 + i * lda] * B[i + 1 * ldb]; 344 | // C[1 + 2 * ldc] += A[1 + i * lda] * B[i + 2 * ldb]; 345 | // C[1 + 3 * ldc] += A[1 + i * lda] * B[i + 3 * ldb]; 346 | 347 | // C[2 + 0 * ldc] += A[2 + i * lda] * B[i + 0 * ldb]; 348 | // C[2 + 1 * ldc] += A[2 + i * lda] * B[i + 1 * ldb]; 349 | // C[2 + 2 * ldc] += A[2 + i * lda] * B[i + 2 * ldb]; 350 | // C[2 + 3 * ldc] += A[2 + i * lda] * B[i + 3 * ldb]; 351 | 352 | // C[3 + 0 * ldc] += A[3 + i * lda] * B[i + 0 * ldb]; 353 | // C[3 + 1 * ldc] += A[3 + i * lda] * B[i + 1 * ldb]; 354 | // C[3 + 2 * ldc] += A[3 + i * lda] * B[i + 2 * ldb]; 355 | // C[3 + 3 * ldc] += A[3 + i * lda] * B[i + 3 * ldb]; 356 | } 357 | 358 | #if SIMD_UNROLL == 32 359 | c_c0 = c_c0 + part1_c0 + part2_c0 + part3_c0; 360 | c_c1 = c_c1 + part1_c1 + part2_c1 + part3_c1; 361 | c_c2 = c_c2 + part1_c2 + part2_c2 + part3_c2; 362 | c_c3 = c_c3 + part1_c3 + part2_c3 + part3_c3; 363 | #else 364 | c_c0 = vaddq_f32(part1_c0, c_c0); 365 | c_c1 = vaddq_f32(part1_c1, c_c1); 366 | c_c2 = vaddq_f32(part1_c2, c_c2); 367 | c_c3 = vaddq_f32(part1_c3, c_c3); 368 | #endif 369 | 370 | float32x4_t b_vi0, b_vi1, b_vi2, b_vi3; 371 | for(; i < n; ++i) { 372 | a_c0_i0 = vld1q_f32(A + i * 4); 373 | b_vi0 = vld1q_dup_f32(B); b_vi1 = vld1q_dup_f32(B + 1); 374 | b_vi2 = vld1q_dup_f32(B + 2); b_vi3 = vld1q_dup_f32(B + 3); 375 | B += 4; 376 | c_c0 = vmlaq_f32(c_c0, a_c0_i0, b_vi0); 377 | c_c1 = vmlaq_f32(c_c1, a_c0_i0, b_vi1); 378 | c_c2 = vmlaq_f32(c_c2, a_c0_i0, b_vi2); 379 | c_c3 = vmlaq_f32(c_c3, a_c0_i0, b_vi3); 380 | } 381 | // for (int x = 0; x < UNROLL_ROW; ++x) { 382 | // for (int y = 0; y < UNROLL_NUM; ++y) { 383 | // vst1q_f32(C + 4 
* x + y * ldc, c_ci[x * UNROLL_NUM + y]); 384 | // } 385 | // } 386 | 387 | vst1q_f32(C + 0 * ldc, c_c0); vst1q_f32(C + 1 * ldc, c_c1); 388 | vst1q_f32(C + 2 * ldc, c_c2); vst1q_f32(C + 3 * ldc, c_c3); 389 | // printf("c_c3: "); 390 | // for(int i = 0; i < 4; ++i) { 391 | // printf("%.2f ", C[3 * ldc + i]); 392 | // } 393 | // printf("\n\n"); 394 | 395 | // C[0 + 0 * ldc] += c00; C[0 + 1 * ldc] += c01; C[0 + 2 * ldc] += c02; C[0 + 3 * ldc] += c03; 396 | // C[1 + 0 * ldc] += c10; C[1 + 1 * ldc] += c11; C[1 + 2 * ldc] += c12; C[1 + 3 * ldc] += c13; 397 | // C[2 + 0 * ldc] += c20; C[2 + 1 * ldc] += c21; C[2 + 2 * ldc] += c22; C[2 + 3 * ldc] += c23; 398 | // C[3 + 0 * ldc] += c30; C[3 + 1 * ldc] += c31; C[3 + 2 * ldc] += c32; C[3 + 3 * ldc] += c33; 399 | } 400 | 401 | static void pack_left_A(int K, int lda, float *A, float *packed_A) { 402 | // double *dst = (double*)packed_A; 403 | float *dst = packed_A; 404 | // double *src = (double*)A; 405 | int k; 406 | for (k = 0; k + SIMD_UNROLLD4 <= K; k += SIMD_UNROLLD4) { 407 | // double *a0k_p = (double*)(A + k * lda); 408 | // *dst++ = *a0k_p; 409 | // *dst++ = *(a0k_p+1); 410 | float *a0_k0_p = A + k * lda, *a0_k1_p = A + (k + 1) * lda, *a0_k2_p = A + (k + 2) * lda, *a0_k3_p = A + (k + 3) * lda; 411 | #if SIMD_UNROLL == 32 412 | float *a0_k4_p = A + (k + 4) * lda, *a0_k5_p = A + (k + 5) * lda, *a0_k6_p = A + (k + 6) * lda, *a0_k7_p = A + (k + 7) * lda; 413 | #endif 414 | // for (int x = 0; x < 4; ++ x) { 415 | *(dst + 0) = *(a0_k0_p + 0); 416 | *(dst + 1) = *(a0_k0_p + 1); 417 | *(dst + 2) = *(a0_k0_p + 2); 418 | *(dst + 3) = *(a0_k0_p + 3); 419 | *(dst + 4) = *(a0_k1_p + 0); 420 | *(dst + 5) = *(a0_k1_p + 1); 421 | *(dst + 6) = *(a0_k1_p + 2); 422 | *(dst + 7) = *(a0_k1_p + 3); 423 | *(dst + 8) = *(a0_k2_p + 0); 424 | *(dst + 9) = *(a0_k2_p + 1); 425 | *(dst + 10) = *(a0_k2_p + 2); 426 | *(dst + 11) = *(a0_k2_p + 3); 427 | *(dst + 12) = *(a0_k3_p + 0); 428 | *(dst + 13) = *(a0_k3_p + 1); 429 | *(dst + 14) = *(a0_k3_p + 2); 430 | *(dst + 15) = *(a0_k3_p + 3); 431 | #if SIMD_UNROLL == 32 432 | *(dst + 16) = *(a0_k4_p + 0); 433 | *(dst + 17) = *(a0_k4_p + 1); 434 | *(dst + 18) = *(a0_k4_p + 2); 435 | *(dst + 19) = *(a0_k4_p + 3); 436 | *(dst + 20) = *(a0_k5_p + 0); 437 | *(dst + 21) = *(a0_k5_p + 1); 438 | *(dst + 22) = *(a0_k5_p + 2); 439 | *(dst + 23) = *(a0_k5_p + 3); 440 | *(dst + 24) = *(a0_k6_p + 0); 441 | *(dst + 25) = *(a0_k6_p + 1); 442 | *(dst + 26) = *(a0_k6_p + 2); 443 | *(dst + 27) = *(a0_k6_p + 3); 444 | *(dst + 28) = *(a0_k7_p + 0); 445 | *(dst + 29) = *(a0_k7_p + 1); 446 | *(dst + 30) = *(a0_k7_p + 2); 447 | *(dst + 31) = *(a0_k7_p + 3); 448 | #endif 449 | dst += SIMD_UNROLL; 450 | // } 451 | } 452 | 453 | for (; k < K; k++) { 454 | *(dst + 0) = A[0 + k * lda]; 455 | *(dst + 1) = A[1 + k * lda]; 456 | *(dst + 2) = A[2 + k * lda]; 457 | *(dst + 3) = A[3 + k * lda]; 458 | dst += 4; 459 | } 460 | } 461 | 462 | static void pack_right_B(int K, int ldb, float *B, float *packed_B) { 463 | float *dst = packed_B; 464 | // float *bik_p = B; 465 | // float *bi0_0_p = B + 0, *bi0_1_p = B + 0 + 1 * ldb, * bi0_2_p = B + 0 + 2 * ldb, *bi0_3_p = B + 0 + 3 * ldb; 466 | // float *bi1_0_p = B + 1, *bi1_1_p = B + 1 + 1 * ldb, * bi1_2_p = B + 1 + 2 * ldb, *bi1_3_p = B + 1 + 3 * ldb; 467 | // float *bi2_0_p = B + 2, *bi2_1_p = B + 2 + 1 * ldb, * bi2_2_p = B + 2 + 2 * ldb, *bi2_3_p = B + 2 + 3 * ldb; 468 | // float *bi3_0_p = B + 3, *bi3_1_p = B + 3 + 1 * ldb, * bi3_2_p = B + 3 + 2 * ldb, *bi3_3_p = B + 3 + 3 * ldb; 469 | // float *bix_p[UNROLL_NUM]; 470 | 
// for (int x = 0; x < UNROLL_NUM; ++x) { 471 | // bix_p[x] = B + x * ldb; 472 | // } 473 | int k; 474 | for(k = 0; k + SIMD_UNROLLD4 <= K; k += SIMD_UNROLLD4) { 475 | // for (int x = 0; x < UNROLL_NUM; ++x) { 476 | // *dst++ = *(bik_p + x * ldb); 477 | // } 478 | // bik_p++; 479 | // for (int x = 0; x < UNROLL_NUM; ++x) { 480 | // *dst++ = *bix_p[x]++; 481 | // } 482 | *(dst + 0) = *(B + k + 0 + 0 * ldb); 483 | *(dst + 1) = *(B + k + 0 + 1 * ldb); 484 | *(dst + 2) = *(B + k + 0 + 2 * ldb); 485 | *(dst + 3) = *(B + k + 0 + 3 * ldb); 486 | *(dst + 4) = *(B + k + 1 + 0 * ldb); 487 | *(dst + 5) = *(B + k + 1 + 1 * ldb); 488 | *(dst + 6) = *(B + k + 1 + 2 * ldb); 489 | *(dst + 7) = *(B + k + 1 + 3 * ldb); 490 | *(dst + 8) = *(B + k + 2 + 0 * ldb); 491 | *(dst + 9) = *(B + k + 2 + 1 * ldb); 492 | *(dst + 10) = *(B + k + 2 + 2 * ldb); 493 | *(dst + 11) = *(B + k + 2 + 3 * ldb); 494 | *(dst + 12) = *(B + k + 3 + 0 * ldb); 495 | *(dst + 13) = *(B + k + 3 + 1 * ldb); 496 | *(dst + 14) = *(B + k + 3 + 2 * ldb); 497 | *(dst + 15) = *(B + k + 3 + 3 * ldb); 498 | #if SIMD_UNROLL == 32 499 | *(dst + 16) = *(B + k + 4 + 0 * ldb); 500 | *(dst + 17) = *(B + k + 4 + 1 * ldb); 501 | *(dst + 18) = *(B + k + 4 + 2 * ldb); 502 | *(dst + 19) = *(B + k + 4 + 3 * ldb); 503 | *(dst + 20) = *(B + k + 5 + 0 * ldb); 504 | *(dst + 21) = *(B + k + 5 + 1 * ldb); 505 | *(dst + 22) = *(B + k + 5 + 2 * ldb); 506 | *(dst + 23) = *(B + k + 5 + 3 * ldb); 507 | *(dst + 24) = *(B + k + 6 + 0 * ldb); 508 | *(dst + 25) = *(B + k + 6 + 1 * ldb); 509 | *(dst + 26) = *(B + k + 6 + 2 * ldb); 510 | *(dst + 27) = *(B + k + 6 + 3 * ldb); 511 | *(dst + 28) = *(B + k + 7 + 0 * ldb); 512 | *(dst + 29) = *(B + k + 7 + 1 * ldb); 513 | *(dst + 30) = *(B + k + 7 + 2 * ldb); 514 | *(dst + 31) = *(B + k + 7 + 3 * ldb); 515 | #endif 516 | dst += SIMD_UNROLL; 517 | // *(dst + 0) = *bi0_0_p++; 518 | // *(dst + 1) = *bi1_0_p++; 519 | // *(dst + 2) = *bi2_0_p++; 520 | // *(dst + 3) = *bi3_0_p++; 521 | // *(dst + 4) = *bi0_1_p++; 522 | // *(dst + 5) = *bi1_1_p++; 523 | // *(dst + 6) = *bi2_1_p++; 524 | // *(dst + 7) = *bi3_1_p++; 525 | // *(dst + 8) = *bi0_2_p++; 526 | // *(dst + 9) = *bi1_2_p++; 527 | // *(dst + 10) = *bi2_2_p++; 528 | // *(dst + 11) = *bi3_2_p++; 529 | // *(dst + 12) = *bi0_3_p++; 530 | // *(dst + 13) = *bi1_3_p++; 531 | // *(dst + 14) = *bi2_3_p++; 532 | // *(dst + 15) = *bi3_3_p++; 533 | 534 | } 535 | // printf("in Pack B, k: %d, K: %d, ldb: %d\n", k, K, ldb); 536 | for (; k < K; ++k) { 537 | *(dst + 0) = *(B + k + 0 * ldb); 538 | *(dst + 1) = *(B + k + 1 * ldb); 539 | *(dst + 2) = *(B + k + 2 * ldb); 540 | *(dst + 3) = *(B + k + 3 * ldb); 541 | dst += 4; 542 | } 543 | } 544 | 545 | static void do_block(int M, int N, int K, int lda,int ldb, int ldc, float *A, float *B, float *C, float *packed_A, float *packed_B, int should_pack_B) { 546 | int j; 547 | for (j = 0; j + UNROLL_NUM <= N; j += UNROLL_NUM) { 548 | if (should_pack_B) { 549 | pack_right_B(K, ldb, B + j * ldb, packed_B + j * K); 550 | } 551 | // int jn = j * ldb; 552 | int i; 553 | for (i = 0; i + UNROLL_NUM <= M; i += UNROLL_NUM) { 554 | 555 | // for(int x = 0; x < UNROLL_NUM; ++x) { 556 | // int jxn = jn + x * n, ijxn = jn + x * n + i; 557 | // dot_mul(n, n, A + i, B + jxn, C + ijxn); 558 | // } 559 | // dot_mul(n, n, A + i, B + jn, C + i + jn); 560 | // dot_mul(n, n, A + i, B + jn + n, C + i + jn + n); 561 | // dot_mul(n, n, A + i, B + jn + 2 * n, C + i + jn + 2 * n); 562 | // dot_mul(n, n, A + i, B + jn + 3 * n, C + i + jn + 3 * n); 563 | 564 | // dot_mul_unroll(n, n, 
n, n, A + i, B + j * n, C + i + j * n, A, B, C); 565 | if (j == 0) { 566 | pack_left_A(K, lda, A + i, packed_A + i * K); 567 | } 568 | // dot_mul_square(K, lda, ldb, ldc, A + i, B + j * ldb, C + i + j * ldc); 569 | dot_mul_square(K, SIMD_UNROLL, 1, ldc, packed_A + i * K, packed_B + j * K, C + i + j * ldc); 570 | // dot_mul(n, n, A + i, B + j * n, C + i + j * n); 571 | } 572 | // if (i < M && i + UNROLL_NUM > M) { 573 | // // printf("use addtional do_mul0, i = %d, j = %d\n", i, j); 574 | // for (int temp_j = j; temp_j < j + UNROLL_NUM; ++temp_j) { 575 | // for (int temp_i = i; temp_i < M; ++temp_i) { 576 | // dot_mul(K, lda, A + temp_i, B + temp_j * ldb, C + temp_i + temp_j * ldc); 577 | // } 578 | // } 579 | // } 580 | 581 | 582 | } 583 | // for (; j < N; ++j) { 584 | // // printf("use addtional do_mul1, j = %d\n", j); 585 | // for (int i = 0; i < M; ++i) { 586 | // dot_mul(K, lda, A + i, B + j * ldb, C + i + j * ldc); 587 | // } 588 | // } 589 | } 590 | 591 | static void print_matrix(int row, int col, float *A) { 592 | for(int i = 0; i < row; ++i) { 593 | for(int j = 0; j < col; ++j) { 594 | printf("%0.2f ", A[i + j * row]); 595 | } 596 | printf("\n"); 597 | } 598 | } 599 | 600 | int should_padding(int m, int n, int *new_m_ptr, int *new_n_ptr) { 601 | *new_m_ptr = (m + UNROLL_NUM - 1) / UNROLL_NUM * UNROLL_NUM; 602 | *new_n_ptr = (n + UNROLL_NUM - 1) / UNROLL_NUM * UNROLL_NUM; 603 | return m != *new_m_ptr || n != *new_n_ptr; 604 | } 605 | 606 | static int64_t tempDur0 = 0, tempDur1 = 0; 607 | 608 | float *get_padding_matrix(int lda, int m, int n, int new_m, int new_n, const float *A) { 609 | int64_t tempT0 = wall_time_ns(); 610 | float *new_A = NULL; 611 | int ret = posix_memalign((void**)&new_A, 4096, sizeof(float) * new_m * new_n); 612 | int64_t tempT1 = wall_time_ns(); 613 | if (ret != 0) { 614 | fprintf(stderr, "Can not align malloc padding matrix!\n"); 615 | exit(-1); 616 | } 617 | int j; 618 | for (j = 0; j < n; ++j) { 619 | memcpy(new_A + j * new_m, A + j * lda, sizeof(float) * m); 620 | memset(new_A + m + j * new_m, 0, sizeof(float) * (new_m - m)); 621 | } 622 | for (;j < new_n; ++j) { 623 | memset(new_A + j * new_m, 0, sizeof(float) * new_m); 624 | } 625 | int64_t tempT2 = wall_time_ns(); 626 | tempDur0 += tempT1 - tempT0; 627 | tempDur1 += tempT2 - tempT1; 628 | return new_A; 629 | } 630 | 631 | void back_padding(int lda, int m, int n, int new_m, int new_n, float *A, float *padding_A) { 632 | for (int j = 0; j < n; ++j) { 633 | memcpy(A + j * lda, padding_A + j * new_m, sizeof(float) * m); 634 | } 635 | } 636 | 637 | void square_gemm (int n, float* A, float* B, float* C) { 638 | // posix_memalign 639 | 640 | 641 | // float *packed_B = packed_A + BLOCK_COL * BLOCK_ROW; 642 | tempDur0 = 0; tempDur1 = 0; 643 | int64_t t0 = wall_time_ns(); 644 | 645 | float *padding_A = A, *padding_B = B, *padding_C = C; 646 | int newM, newN, newK; 647 | int should_pad_A = should_padding(n, n, &newM, &newK); 648 | int should_pad_B = should_padding(n, n, &newK, &newN); 649 | int should_pad_C = should_padding(n, n, &newM, &newN); 650 | 651 | // float *packed_A = malloc(BLOCK_COL * BLOCK_ROW * sizeof(float)); 652 | // float *packed_B = malloc(BLOCK_COL * n * sizeof(float)); 653 | 654 | float *packed_A, *packed_B; 655 | int tempRet1 = posix_memalign((void**)&packed_A, 4096, BLOCK_COL * BLOCK_ROW * sizeof(float)); 656 | int tempRet2 = posix_memalign((void**)&packed_B, 4096, BLOCK_COL * newN * sizeof(float)); 657 | 658 | int64_t t1 = wall_time_ns(); 659 | 660 | if (tempRet1 != 0|| tempRet2 != 0) { 661 | 
fprintf(stderr, "Can not align malloc packed pool!\n"); 662 | exit(-1); 663 | } 664 | 665 | if (should_pad_A) { 666 | padding_A = get_padding_matrix(n, n, n, newM, newK, A); 667 | } 668 | if (should_pad_B) { 669 | padding_B = get_padding_matrix(n, n, n, newK, newN, B); 670 | } 671 | if (should_pad_C) { 672 | padding_C= get_padding_matrix(n, n, n, newM, newN, C); 673 | } 674 | 675 | int64_t t2 = wall_time_ns(); 676 | 677 | for (int k = 0; k < newK; k += BLOCK_COL) { 678 | int K = min(newK - k, BLOCK_COL); 679 | for (int i = 0; i < newM; i += BLOCK_ROW) { 680 | int M = min(newM - i, BLOCK_ROW); 681 | int N = newN; 682 | // printf("do block, M: %d, N: %d, K: %d, A(%d, %d), B(%d, %d), C(%d, %d)\n", 683 | // M, N, K, i, k, k, 0, i, 0); 684 | 685 | do_block(M, N, K, newM, newK, newM, padding_A + i + k * newM, padding_B + k, padding_C + i, packed_A, packed_B, i == 0); 686 | // printf("C:\n"); 687 | // print_matrix(n, n, C); 688 | } 689 | } 690 | 691 | int64_t t3 = wall_time_ns(); 692 | 693 | free(packed_A); 694 | free(packed_B); 695 | 696 | int64_t t4 = wall_time_ns(); 697 | 698 | if (should_pad_A) 699 | free(padding_A); 700 | if (should_pad_B) 701 | free(padding_B); 702 | 703 | int64_t t5 = wall_time_ns(); 704 | 705 | if (should_pad_C) { 706 | back_padding(n, n, n, newM, newN, C, padding_C); 707 | free(padding_C); 708 | } 709 | 710 | int64_t t6 = wall_time_ns(); 711 | 712 | double totalDur = (t6 - t0) / (1e+6), dur1 = (t1 - t0) / (1e+6), dur2 = (t2 - t1) / (1e+6), dur3 = (t3 - t2)/ (1e+6), dur4 = (t4 - t3) / (1e+6), dur5 = (t5 - t4) / (1e+6), dur6 = (t6 - t5) / (1e+6); 713 | double dur2_part1 = tempDur0 / (1e+6), dur2_part2 = tempDur1 / (1e+6); 714 | // printf("Total: %.4f ms, dur1: %.4f ms, dur2: %.4f ms = (%.4f + %.4f) ms, dur3: %.4f ms, dur4: %.4f ms, dur5: %.4f ms, dur6: %.4f ms\n", totalDur, dur1, dur2, dur2_part1, dur2_part2, dur3, dur4, dur5, dur6); 715 | 716 | } 717 | 718 | /* This routine performs a gemm operation 719 | * C := C + A * B 720 | * where A, B, and C are lda-by-lda matrices stored in column-major format. 721 | * On exit, A and B maintain their input values. 
*/ 722 | // void square_gemm (int n, float* A, float* B, float* C) 723 | // { 724 | // /* For each row i of A */ 725 | 726 | // // naive_transpose(n, n, A); 727 | // // naive_transpose(n, n, B); 728 | 729 | 730 | // int j; 731 | // for (j = 0; j + UNROLL_NUM <= n; j += UNROLL_NUM) { 732 | // int jn = j * n; 733 | // int i; 734 | // for (i = 0; i + UNROLL_NUM <= n; i += UNROLL_NUM) { 735 | 736 | // // for(int x = 0; x < UNROLL_NUM; ++x) { 737 | // // int jxn = jn + x * n, ijxn = jn + x * n + i; 738 | // // dot_mul(n, n, A + i, B + jxn, C + ijxn); 739 | // // } 740 | // // dot_mul(n, n, A + i, B + jn, C + i + jn); 741 | // // dot_mul(n, n, A + i, B + jn + n, C + i + jn + n); 742 | // // dot_mul(n, n, A + i, B + jn + 2 * n, C + i + jn + 2 * n); 743 | // // dot_mul(n, n, A + i, B + jn + 3 * n, C + i + jn + 3 * n); 744 | 745 | // // dot_mul_unroll(n, n, n, n, A + i, B + j * n, C + i + j * n, A, B, C); 746 | 747 | // dot_mul_square(n, n, n, n, A + i, B + j * n, C + i + j * n); 748 | 749 | // // dot_mul(n, n, A + i, B + j * n, C + i + j * n); 750 | // } 751 | // if (i < n && i + UNROLL_NUM > n) { 752 | // for (int temp_j = j; temp_j < j + UNROLL_NUM; ++temp_j) { 753 | // for (int temp_i = i; temp_i < n; ++temp_i) { 754 | // dot_mul(n, n, A + temp_i, B + temp_j * n, C + temp_i + temp_j * n); 755 | // } 756 | // } 757 | // } 758 | 759 | // } 760 | // for (; j < n; ++j) { 761 | // for (int i = 0; i < n; ++i) { 762 | // dot_mul(n, n, A + i, B + j * n, C + i + j * n); 763 | // } 764 | // } 765 | 766 | // // for (int i = 0; i < n; ++i) { 767 | // // /* For each column j of B */ 768 | // // int in = i * n; 769 | // // for (int j = 0; j < n; ++j) 770 | // // { 771 | // // /* Compute C(i,j) */ 772 | // // // float cij = C[i+j*n]; 773 | // // int jn = j * n; 774 | // // // for( int k = 0; k < n; k++ ) 775 | // // // // cij += A[i+k*n] * B[k+j*n]; 776 | // // // cij += A[k + i * n] * B[k + j * n]; 777 | // // // // cij += A[i + k * n] * B[j + k * n]; 778 | // // // C[i+j*n] = cij; 779 | 780 | // // // float32x4_t buf[UNROLL_NUM] = {0}; 781 | // // // int k; 782 | // // // for (k = 0; k < ((n) & (~7)); k += 4 * UNROLL_NUM) { 783 | // // // int kin = k + in, kjn = k + jn; 784 | // // // for (int x = 0; x < UNROLL_NUM; ++x) { 785 | // // // float32x4_t v1 = vld1q_f32(A + x * 4 + kin); 786 | // // // float32x4_t v2 = vld1q_f32(B + x * 4 + kjn); 787 | // // // // float32x4_t r1 = vmulq_f32(v1, v2); 788 | // // // // buf[x] = vaddq_f32(buf[x], r1); 789 | // // // buf[x] = vmlaq_f32(buf[x], v1, v2); 790 | // // // } 791 | 792 | // // // } 793 | // // // float temp = 0; 794 | // // // for (k; k < n; ++k) { 795 | // // // temp += A[k + in] * B[k + jn]; 796 | // // // } 797 | // // // float res[4]; 798 | 799 | 800 | // // // for (int x = 0; x < UNROLL_NUM; ++x) { 801 | // // // vst1q_f32(res, buf[x]); 802 | // // // temp += res[0] + res[1] + res[2] + res[3]; 803 | // // // } 804 | // // // C[i + j * n] += temp; 805 | 806 | // // // float cij = C[i * n + j]; 807 | // // // for (int k = 0; k < n; ++k) { 808 | // // // cij += A[i * n + k] * B[k * n + j]; 809 | // // // } 810 | // // // C[i * n + j] = cij; 811 | // // } 812 | // // } 813 | // // naive_transpose(n, n, A); 814 | // } 815 | --------------------------------------------------------------------------------
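
For reference, the panel layouts that pack_left_A and pack_right_B build can be summarized without the manual SIMD_UNROLL unrolling. The sketch below is not part of the repo; pack_panels is a hypothetical helper showing the equivalent scalar packing of one 4-row panel of A and one 4-column panel of B from column-major storage, which is why dot_mul_square can walk both packed buffers with unit stride.

/* Hypothetical helper (not in the repo): scalar equivalent of the layouts
 * produced by pack_left_A / pack_right_B:
 *   packed_A[4*k + r] = A[r + k*lda]   -- the 4 rows of column k, contiguous
 *   packed_B[4*k + c] = B[k + c*ldb]   -- the 4 columns of row k, contiguous
 * The NEON microkernel then streams both buffers front to back. */
static void pack_panels(int K, int lda, int ldb,
                        const float *A, const float *B,
                        float *packed_A, float *packed_B) {
    for (int k = 0; k < K; ++k) {
        for (int r = 0; r < 4; ++r)
            packed_A[4 * k + r] = A[r + (size_t)k * lda];
        for (int c = 0; c < 4; ++c)
            packed_B[4 * k + c] = B[k + (size_t)c * ldb];
    }
}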
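The comment block above documents the intended semantics: C := C + A * B on column-major lda-by-lda matrices, with A and B unchanged on exit. The repo's benchmark-test.c / test-gemm.c presumably check this already; the sketch below is an independent, minimal version of such a check. Assumptions are labeled in the code: square_gemm is linked in from one of the gemm-*.c files, ref_gemm and the size 257 are illustrative choices (257 is not a multiple of UNROLL_NUM, so should_padding rounds it up to 260 and the padding path is exercised), and the 1e-2 tolerance is arbitrary.

/* Minimal correctness-check sketch (not from the repo). Link against one of
 * the gemm objects, e.g.: gcc check.c gemm-naive.o -O3 -lm */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

void square_gemm(int n, float *A, float *B, float *C); /* provided by gemm-*.c */

/* Plain three-loop reference with the documented semantics C := C + A * B,
 * column-major, leading dimension n. */
static void ref_gemm(int n, const float *A, const float *B, float *C) {
    for (int j = 0; j < n; ++j)
        for (int k = 0; k < n; ++k) {
            float bkj = B[k + (size_t)j * n];
            for (int i = 0; i < n; ++i)
                C[i + (size_t)j * n] += A[i + (size_t)k * n] * bkj;
        }
}

int main(void) {
    int n = 257; /* illustrative; not a multiple of 4, exercises padding */
    size_t elems = (size_t)n * n;
    float *A = malloc(elems * sizeof(float));
    float *B = malloc(elems * sizeof(float));
    float *C = calloc(elems, sizeof(float));
    float *D = calloc(elems, sizeof(float));
    if (!A || !B || !C || !D) return 1;
    for (size_t i = 0; i < elems; ++i) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
    square_gemm(n, A, B, C); /* implementation under test */
    ref_gemm(n, A, B, D);    /* reference */
    float max_err = 0.0f;
    for (size_t i = 0; i < elems; ++i)
        max_err = fmaxf(max_err, fabsf(C[i] - D[i]));
    printf("n = %d, max abs error = %g\n", n, max_err);
    free(A); free(B); free(C); free(D);
    return max_err < 1e-2f ? 0 : 1; /* arbitrary float tolerance */
}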