├── .gitignore
├── LICENSE
├── README.md
├── data
    ├── plt.py
    ├── result_0.png
    ├── result_1.png
    ├── result_2.png
    ├── result_3.png
    ├── result_4.png
    ├── result_5.png
    ├── result_6.png
    ├── result_7.png
    ├── result_8.png
    ├── result_9.png
    └── result_all.png
├── pics
    ├── GEMM.png
    ├── gemm_block.png
    ├── riscv.gif
    ├── roofline.png
    ├── step0.gif
    ├── step1.gif
    ├── step2.gif
    ├── step3.gif
    ├── step4.gif
    └── step5.gif
├── prepare
    ├── 0.hello_world
    │   ├── Makefile
    │   └── hello_world.c
    ├── 1.memory_copy
    │   ├── Makefile
    │   ├── main.c
    │   └── memcpy.S
    ├── 2.memcpy_bandwidth_test
    │   ├── Makefile
    │   └── mbw.c
    ├── 3.flw_bandwidth_test
    │   ├── Makefile
    │   ├── load_flw.S
    │   └── load_test.c
    ├── 4.vlw_bandwidth_test
    │   ├── Makefile
    │   ├── load_test.c
    │   └── load_vlw.S
    ├── 5.saxpy
    │   ├── Makefile
    │   ├── main.c
    │   └── saxpy.S
    ├── README.md
    └── imgs
    │   └── memory_bandwidth_test.png
└── sgemm
    ├── common
        ├── bl_sgemm.h
        ├── bl_sgemm_ref.c
        ├── bl_sgemm_util.c
        ├── test_bl_sgemm.c
        ├── test_bl_sgemm_packB_4x16.c
        └── test_bl_sgemm_packB_4x4.c
    ├── step0
        ├── Makefile
        └── my_sgemm.c
    ├── step1
        ├── Makefile
        └── my_sgemm.c
    ├── step2
        ├── Makefile
        ├── bl_config.h
        └── my_sgemm.c
    ├── step3
        ├── Makefile
        ├── bl_config.h
        └── my_sgemm.c
    ├── step4
        ├── Makefile
        ├── bl_config.h
        └── my_sgemm.c
    ├── step5
        ├── Makefile
        ├── bl_config.h
        └── my_sgemm.c
    ├── step6
        ├── Makefile
        ├── RvvSgemm4x16.S
        ├── bl_config.h
        ├── my_sgemm.c
        └── run.sh
    ├── step7
        ├── Makefile
        ├── RvvSgemm4x16.S
        ├── bl_config.h
        ├── my_sgemm.c
        └── run.sh
    ├── step8
        ├── Makefile
        ├── RvvSgemm4x16.S
        ├── bl_config.h
        ├── my_sgemm.c
        └── run.sh
    └── step9
        ├── Makefile
        ├── RvvSgemm4x16.S
        ├── bl_config.h
        ├── my_sgemm.c
        └── run.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.d
3 | *.x
4 | *.xlsx
5 | .DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Andy
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # sgemm_riscv
  2 | ![](https://img.shields.io/badge/Sgemm-RISC_V-8B0012)
  3 | [![](https://img.shields.io/badge/license-MIT-blue)](./LICENSE)
  4 | [![](https://img.shields.io/badge/version-1.0-green)](./LICENSE)
  5 | 
  6 | [![](https://img.shields.io/github/forks/Zhao-Dongyu/sgemm_riscv.svg)](https://github.com/Zhao-Dongyu/sgemm_riscv/network) 
  7 | [![](https://img.shields.io/github/stars/Zhao-Dongyu/sgemm_riscv.svg)](https://github.com/Zhao-Dongyu/sgemm_riscv/stargazers)
  8 | [![](https://img.shields.io/github/issues/Zhao-Dongyu/sgemm_riscv.svg)](https://github.com/Zhao-Dongyu/sgemm_riscv/issues)
  9 | 
 10 | ---
 11 | 
 12 | <img src=./pics/riscv.gif width=50% />
 13 | 
 14 | [RISC-V](https://riscv.org/) is an open standard Instruction Set Architecture (ISA) enabling a new era of processor innovation through open collaboration.
 15 | 
 16 | ---
 17 | 
 18 | <img src=./pics/GEMM.png width=70% />
 19 | 
  20 | [GEMM](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3) (General Matrix Multiply) is one of the Basic Linear Algebra Subprograms (BLAS).
 21 | 
 22 | ---
 23 | 
 24 | <img src=./data/result_all.png width=70% />
 25 | 
  26 | This project records the process of optimizing SGEMM (single-precision floating-point General Matrix Multiplication) on the RISC-V platform.
 27 | 
 28 | ---
 29 | 
 30 | To get started, please refer to Section [Usage](#usage)
 31 | 
 32 | Related tutorials are located on the [wiki](https://github.com/Zhao-Dongyu/sgemm_riscv/wiki).
 33 | 
 34 | 
 35 | ## Blislab
 36 | 
 37 | [Blislab](https://github.com/flame/blislab) is an open source teaching project that teaches you step-by-step optimization of matrix multiplication.
 38 | 
  39 | Building on the blislab project, [surez-ok](https://github.com/surez-ok/blislab_riscv) trimmed and optimized it (reduced to the simplest possible code, supporting only x86 or RISC-V Linux), so the project is clearer and easier to get started with.
 40 | 
 41 | ## Project structure
 42 | 
 43 |     .
 44 |     ├── data
 45 |     ├── pics
 46 |     ├── prepare
 47 |     │   ├── 0.hello_world
 48 |     │   ├── 1.memory_copy
 49 |     │   ├── 2.memcpy_bandwidth_test
 50 |     │   ├── 3.flw_bandwidth_test
 51 |     │   ├── 4.vlw_bandwidth_test
 52 |     │   ├── 5.saxpy
 53 |     │   └── imgs
 54 |     └── sgemm
 55 |         ├── common
 56 |         ├── step0
 57 |         ├── step1
 58 |         ├── step2
 59 |         ├── step3
 60 |         ├── step4
 61 |         ├── step5
 62 |         ├── step6
 63 |         ├── step7
 64 |         ├── step8
 65 |         └── step9
 66 | 
 67 | In the `prepare` folder, I compiled some tutorials and demos for hardware performance testing.
 68 | 
 69 | In the `sgemm` folder, `step0` to `step9` are my experiments.
 70 | 
 71 | See [wiki](https://github.com/Zhao-Dongyu/sgemm_riscv/wiki) for more details.
 72 | 
 73 | ## Installation
 74 | 
  75 | You need to download a RISC-V cross-compilation toolchain.
  76 | 
  77 | The development board I use is the Nezha D1; the toolchain can be downloaded from [here](https://xuantie.t-head.cn/community/download?id=4090445921563774976).
 78 | 
 79 | 
 80 | ## Usage<span id = "usage"></span>
 81 | 
 82 | Take `step1` as an example
 83 | 
  84 | > Before building, edit the first few lines of the Makefile and set `CROSS_COMPILE` to the correct cross-compiler.
 85 | 
 86 | ```shell
  87 | $ cd sgemm/step1
 88 | $ make
 89 | $ adb push test_bl_sgemm_step1.x ./.
 90 | $ adb shell './test_bl_sgemm_step1.x'
 91 | ```
 92 | 
  93 | ## Acknowledgement
 94 | 
 95 | - [BLISlab: A Sandbox for Optimizing GEMM](https://github.com/flame/blislab)
 96 | 
 97 |     This project introduced me to how to optimize GEMM
 98 | 
 99 | - [riscv平台优化矩阵乘(基于blislab优化实践)](https://github.com/surez-ok/blislab_riscv)
100 | 
101 |     I conduct experiments and exploration based on this project
102 | 
103 | - Thanks to Mr. Ding for your guidance.
104 | 
105 | ## Support
106 | 
107 | zhaodongyu1024@gmail.com
108 | 
109 | ## License
110 | 
111 | [MIT License](./LICENSE)
112 | 


--------------------------------------------------------------------------------
/data/plt.py:
--------------------------------------------------------------------------------
 1 | # pip install pandas matplotlib openpyxl
 2 | 
 3 | import pandas as pd
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | df = pd.read_excel('sgemm_riscv.xlsx', index_col=0)
 7 | # 绘制多行数据
 8 | plt.plot(df.loc['version 0'], label='version 0')
 9 | plt.plot(df.loc['version 1'], label='version 1')
10 | plt.plot(df.loc['version 2'], label='version 2')
11 | plt.plot(df.loc['version 3'], label='version 3')
12 | plt.plot(df.loc['version 4'], label='version 4')
13 | plt.plot(df.loc['version 5'], label='version 5')
14 | plt.plot(df.loc['version 6'], label='version 6')
15 | plt.plot(df.loc['version 7'], label='version 7')
16 | plt.plot(df.loc['version 8'], label='version 8')
17 | plt.plot(df.loc['version 9'], label='version 9')
18 | 
19 | plt.title("Sgemm on Nezha D1")
20 | plt.xlabel("M=N=K")
21 | plt.ylabel("GFLOPS")
22 | 
23 | plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)
24 | 
25 | plt.tight_layout()
26 | 
27 | plt.savefig('result.png', dpi=300)
28 | 
29 | plt.show()


--------------------------------------------------------------------------------
/data/result_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_0.png


--------------------------------------------------------------------------------
/data/result_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_1.png


--------------------------------------------------------------------------------
/data/result_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_2.png


--------------------------------------------------------------------------------
/data/result_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_3.png


--------------------------------------------------------------------------------
/data/result_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_4.png


--------------------------------------------------------------------------------
/data/result_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_5.png


--------------------------------------------------------------------------------
/data/result_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_6.png


--------------------------------------------------------------------------------
/data/result_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_7.png


--------------------------------------------------------------------------------
/data/result_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_8.png


--------------------------------------------------------------------------------
/data/result_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_9.png


--------------------------------------------------------------------------------
/data/result_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_all.png


--------------------------------------------------------------------------------
/pics/GEMM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/GEMM.png


--------------------------------------------------------------------------------
/pics/gemm_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/gemm_block.png


--------------------------------------------------------------------------------
/pics/riscv.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/riscv.gif


--------------------------------------------------------------------------------
/pics/roofline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/roofline.png


--------------------------------------------------------------------------------
/pics/step0.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step0.gif


--------------------------------------------------------------------------------
/pics/step1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step1.gif


--------------------------------------------------------------------------------
/pics/step2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step2.gif


--------------------------------------------------------------------------------
/pics/step3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step3.gif


--------------------------------------------------------------------------------
/pics/step4.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step4.gif


--------------------------------------------------------------------------------
/pics/step5.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step5.gif


--------------------------------------------------------------------------------
/prepare/0.hello_world/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross toolchain location; override on the command line, e.g. `make CCL=/opt/riscv`.
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 4 | CC := ${CCL}/bin/${CTOOL}gcc
 5 | 
 6 | # clean is an action, not a file to build
 7 | .PHONY: clean
 8 | 
 9 | hello_world:hello_world.c
10 | 	${CC} -o hello_world hello_world.c
11 | 
12 | # -f: do not fail when there is nothing to remove
13 | clean:
14 | 	rm -f hello_world


--------------------------------------------------------------------------------
/prepare/0.hello_world/hello_world.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | /* Toolchain smoke test: write a fixed greeting to stdout and exit with status 0. */
3 | int main(int argc, char const *argv[])
4 | {
5 |     static const char greeting[] = "Hello NeZha\n";
6 |     fputs(greeting, stdout);
7 |     return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/prepare/1.memory_copy/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross toolchain location; override on the command line, e.g. `make CCL=/opt/riscv`.
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 4 | CC := ${CCL}/bin/${CTOOL}gcc
 5 | 
 6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
 7 | 
 8 | # clean is an action, not a file to build
 9 | .PHONY: clean
10 | 
11 | # Link the C driver with the assembly memcpy.
12 | test:main.o memcpy.o
13 | 	${CC} $(C_FLAGS) -o test main.o memcpy.o
14 | 
15 | main.o:main.c
16 | 	${CC} $(C_FLAGS) -c main.c
17 | 
18 | memcpy.o:memcpy.S
19 | 	${CC} $(C_FLAGS) -c memcpy.S
20 | 
21 | # -f: do not fail when there is nothing to remove
22 | clean:
23 | 	rm -f test main.o memcpy.o


--------------------------------------------------------------------------------
/prepare/1.memory_copy/main.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | extern void *memcpy(void* dest, const void* src, size_t n);
 4 | 
 5 | 
 6 | /*
 7 |  * Demo driver: copy ten ints with memcpy (declared extern above) and print
 8 |  * the destination array, tab-separated, followed by a newline.
 9 |  */
10 | int main(int argc, char const *argv[])
11 | {
12 |     enum { COUNT = 10 };
13 |     int source[COUNT] = {0,1,2,3,4,5,6,7,8,9};
14 |     int copied[COUNT] = {0};
15 |     const int *cursor;
16 | 
17 |     printf("Hello NeZha\n");
18 |     memcpy(copied, source, sizeof(source));
19 | 
20 |     /* walk the destination array front to back */
21 |     for(cursor = copied; cursor != copied + COUNT; cursor++) {
22 |         printf("%d\t", *cursor);
23 |     }
24 |     printf("\n");
25 |     return 0;
26 | }


--------------------------------------------------------------------------------
/prepare/1.memory_copy/memcpy.S:
--------------------------------------------------------------------------------
 1 | .text
 2 | .balign 4
 3 | .global memcpy
 4 | # void *memcpy(void* dest, const void* src, size_t n)
 5 | # a0=dest, a1=src, a2=n
 6 | # a0 is never written, so it still holds dest on return; a3, t0 and v0 are scratch.
 7 | memcpy:
 8 |     mv a3, a0           # Work on a copy of dest so a0 stays untouched
 9 | loop:
10 | vsetvli t0, a2, e8      # 8-bit elements; t0 = how many of the a2 remaining bytes this pass handles
11 | vlb.v v0, (a1)          # Load that chunk of bytes from src
12 |     add a1, a1, t0      # Advance src pointer by the chunk size
13 |     sub a2, a2, t0      # Bytes still to copy
14 | vsb.v v0, (a3)          # Store the chunk to dest
15 |     add a3, a3, t0      # Advance dest pointer by the chunk size
16 |     bnez a2, loop       # Repeat until no bytes remain
17 |     ret                 # Return
18 | 


--------------------------------------------------------------------------------
/prepare/2.memcpy_bandwidth_test/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross toolchain location; override on the command line, e.g. `make CCL=/opt/riscv`.
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 4 | CC := ${CCL}/bin/${CTOOL}gcc
 5 | 
 6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
 7 | 
 8 | # clean is an action, not a file to build
 9 | .PHONY: clean
10 | 
11 | mbw:mbw.o
12 | 	${CC} $(C_FLAGS) -o mbw  mbw.o
13 | 
14 | # Explicit rule: the built-in .c -> .o rule uses CFLAGS, so without this
15 | # mbw.o would be compiled without the flags in C_FLAGS.
16 | mbw.o:mbw.c
17 | 	${CC} $(C_FLAGS) -c mbw.c
18 | 
19 | # -f: do not fail when there is nothing to remove
20 | clean:
21 | 	rm -f mbw mbw.o


--------------------------------------------------------------------------------
/prepare/2.memcpy_bandwidth_test/mbw.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * vim: ai ts=4 sts=4 sw=4 cinoptions=>4 expandtab
  3 |  */
  4 | #define _GNU_SOURCE
  5 | 
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <unistd.h>
  9 | #include <errno.h>
 10 | #include <sys/mman.h>
 11 | #include <sys/types.h>
 12 | #include <sys/time.h>
 13 | #include <time.h>
 14 | #include <string.h>
 15 | #include <unistd.h>
 16 | 
 17 | /* how many runs to average by default */
 18 | #define DEFAULT_NR_LOOPS 10
 19 | 
 20 | /* we have 3 tests at the moment */
 21 | #define MAX_TESTS 3
 22 | 
 23 | /* default block size for test 2, in bytes */
 24 | #define DEFAULT_BLOCK_SIZE 262144
 25 | 
 26 | /* test types */
 27 | #define TEST_MEMCPY 0
 28 | #define TEST_DUMB 1
 29 | #define TEST_MCBLOCK 2
 30 | 
 31 | /* version number */
 32 | #define VERSION "1.4"
 33 | 
 34 | /*
 35 |  * MBW memory bandwidth benchmark
 36 |  *
 37 |  * 2006, 2012 Andras.Horvath@gmail.com
 38 |  * 2013 j.m.slocum@gmail.com
 39 |  * (Special thanks to Stephen Pasich)
 40 |  *
 41 |  * http://github.com/raas/mbw
 42 |  *
 43 |  * compile with:
 44 |  *			gcc -O -o mbw mbw.c
 45 |  *
 46 |  * run with eg.:
 47 |  *
 48 |  *			./mbw 300
 49 |  *
 50 |  * or './mbw -h' for help
 51 |  *
 52 |  * watch out for swap usage (or turn off swap)
 53 |  */
 54 | /* print the version banner and the command-line help to stdout */
 55 | void usage()
 56 | {
 57 |     printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
 58 |     fputs("Usage: mbw [options] array_size_in_MiB\n", stdout);
 59 |     fputs("Options:\n", stdout);
 60 |     fputs("	-n: number of runs per test (0 to run forever)\n", stdout);
 61 |     fputs("	-a: Don't display average\n", stdout);
 62 |     printf("	-t%d: memcpy test\n", TEST_MEMCPY);
 63 |     printf("	-t%d: dumb (b[i]=a[i] style) test\n", TEST_DUMB);
 64 |     printf("	-t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
 65 |     printf("	-b <size>: block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
 66 |     fputs("	-q: quiet (print statistics only)\n", stdout);
 67 |     fputs("(will then use two arrays, watch out for swapping)\n", stdout);
 68 |     fputs("'Bandwidth' is amount of data copied over the time this operation took.\n", stdout);
 69 |     fputs("\nThe default is to run all tests available.\n", stdout);
 70 | }
 71 | 
 72 | /* ------------------------------------------------------ */
 73 | 
 74 | /* allocate an array of asize longs and store 0xaa in every element, so as to
 75 |  * force the OS to _really_ allocate it; exits with status 1 if calloc fails */
 76 | long *make_array(unsigned long long asize)
 77 | {
 78 |     long *buf, *cur, *end;
 79 | 
 80 |     buf=calloc(asize, sizeof(long));
 81 |     if(!buf) {
 82 |         perror("Error allocating memory");
 83 |         exit(1);
 84 |     }
 85 | 
 86 |     /* write the pattern into every element so the whole array gets touched */
 87 |     end=buf+asize;
 88 |     for(cur=buf; cur!=end; cur++) {
 89 |         *cur=0xaa;
 90 |     }
 91 | 
 92 |     /* ownership passes to the caller */
 93 |     return buf;
 94 | }
 95 | 
 96 | /* actual benchmark: copy array a into array b once and time the copy */
 97 | /* asize: number of type 'long' elements in test arrays
 98 |  * a, b: source and destination arrays, asize elements each
 99 |  * type: TEST_MEMCPY, TEST_DUMB or TEST_MCBLOCK (any other value copies nothing)
100 |  * block_size: bytes per memcpy call for TEST_MCBLOCK; must be non-zero
101 |  * return value: elapsed time in seconds (0 for an unknown type)
102 |  */
103 | double worker(unsigned long long asize, long *a, long *b, int type, unsigned long long block_size)
104 | {
105 |     unsigned long long t;
106 |     /* zeroed so that an unknown 'type' yields 0 instead of reading garbage */
107 |     struct timeval starttime={0, 0}, endtime={0, 0};
108 |     long long elapsed_us;
109 |     /* array size in bytes */
110 |     unsigned long long array_bytes=asize*sizeof(long);
111 | 
112 |     if(type==TEST_MEMCPY) { /* memcpy test */
113 |         /* timer starts */
114 |         gettimeofday(&starttime, NULL);
115 |         memcpy(b, a, array_bytes);
116 |         /* timer stops */
117 |         gettimeofday(&endtime, NULL);
118 |     } else if(type==TEST_MCBLOCK) { /* memcpy block test */
119 |         char* aa = (char*)a;
120 |         char* bb = (char*)b;
121 |         gettimeofday(&starttime, NULL);
122 |         /* whole blocks first; t counts the bytes still to be copied */
123 |         for (t=array_bytes; t >= block_size; t-=block_size, aa+=block_size){
124 |             bb=(char *) memcpy(bb, aa, block_size) + block_size;
125 |         }
126 |         if(t) { /* trailing partial block */
127 |             memcpy(bb, aa, t);
128 |         }
129 |         gettimeofday(&endtime, NULL);
130 |     } else if(type==TEST_DUMB) { /* dumb test */
131 |         gettimeofday(&starttime, NULL);
132 |         for(t=0; t<asize; t++) {
133 |             b[t]=a[t];
134 |         }
135 |         gettimeofday(&endtime, NULL);
136 |     }
137 | 
138 |     elapsed_us=(long long)(endtime.tv_sec-starttime.tv_sec)*1000000+(endtime.tv_usec-starttime.tv_usec);
139 |     return (double)elapsed_us/1000000;
140 | }
141 | 
142 | /* ------------------------------------------------------ */
143 | 
144 | /* print one result line for a worker run in human-readable terms */
145 | /* te: elapsed time in seconds
146 |  * mt: amount of transferred data in MiB
147 |  * type: see 'worker' above; an unknown value prints no "Method:" column
148 |  *
149 |  * output: method name, te, mt and mt/te (MiB/s), tab-separated, on stdout
150 |  */
151 | void printout(double te, double mt, int type)
152 | {
153 |     const char *method=NULL;
154 | 
155 |     if(type==TEST_MEMCPY) {
156 |         method="Method: MEMCPY\t";
157 |     } else if(type==TEST_DUMB) {
158 |         method="Method: DUMB\t";
159 |     } else if(type==TEST_MCBLOCK) {
160 |         method="Method: MCBLOCK\t";
161 |     }
162 |     if(method) {
163 |         fputs(method, stdout);
164 |     }
165 |     printf("Elapsed: %.5f\t", te);
166 |     printf("MiB: %.5f\t", mt);
167 |     printf("Copy: %.3f MiB/s\n", mt/te);
168 | }
169 | 
170 | /* ------------------------------------------------------ */
171 | /* parse the options, allocate both arrays, then run and report every selected test */
172 | int main(int argc, char **argv)
173 | {
174 |     unsigned int long_size=0;
175 |     double te, te_sum; /* time elapsed */
176 |     unsigned long long asize=0; /* array size (elements in array) */
177 |     int i;
178 |     long *a, *b; /* the two arrays to be copied from/to */
179 |     int o; /* getopt options */
180 |     unsigned long testno;
181 | 
182 |     /* options */
183 | 
184 |     /* how many runs to average? */
185 |     int nr_loops=DEFAULT_NR_LOOPS;
186 |     /* fixed memcpy block size for -t2 */
187 |     unsigned long long block_size=DEFAULT_BLOCK_SIZE;
188 |     /* show average, -a */
189 |     int showavg=1;
190 |     /* what tests to run (-t x) */
191 |     int tests[MAX_TESTS];
192 |     double mt=0; /* MiBytes transferred == array size in MiB */
193 |     int quiet=0; /* suppress extra messages */
194 | 
195 |     tests[0]=0;
196 |     tests[1]=0;
197 |     tests[2]=0;
198 | 
199 |     while((o=getopt(argc, argv, "haqn:t:b:")) != EOF) {
200 |         switch(o) {
201 |             case 'h':
202 |                 usage();
203 |                 exit(1);
204 |                 break;
205 |             case 'a': /* suppress printing average */
206 |                 showavg=0;
207 |                 break;
208 |             case 'n': /* no. loops */
209 |                 nr_loops=strtoul(optarg, (char **)NULL, 10);
210 |                 break;
211 |             case 't': /* test to run */
212 |                 testno=strtoul(optarg, (char **)NULL, 10);
213 |                 if(testno>MAX_TESTS-1) {
214 |                     printf("Error: test number must be between 0 and %d\n", MAX_TESTS-1);
215 |                     exit(1);
216 |                 }
217 |                 tests[testno]=1;
218 |                 break;
219 |             case 'b': /* block size in bytes*/
220 |                 block_size=strtoull(optarg, (char **)NULL, 10);
221 |                 if(0==block_size) { /* block_size is unsigned, so only 0 is invalid */
222 |                     printf("Error: what block size do you mean?\n");
223 |                     exit(1);
224 |                 }
225 |                 break;
226 |             case 'q': /* quiet */
227 |                 quiet=1;
228 |                 break;
229 |             default:
230 |                 break;
231 |         }
232 |     }
233 | 
234 |     /* default is to run all tests if no specific tests were requested */
235 |     if( (tests[0]+tests[1]+tests[2]) == 0) {
236 |         tests[0]=1;
237 |         tests[1]=1;
238 |         tests[2]=1;
239 |     }
240 | 
241 |     if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]) != 1) ) {
242 |         printf("Error: nr_loops can be zero if only one test selected!\n");
243 |         exit(1);
244 |     }
245 | 
246 |     if(optind<argc) {
247 |         mt=strtoul(argv[optind++], (char **)NULL, 10);
248 |     } else {
249 |         printf("Error: no array size given!\n");
250 |         exit(1);
251 |     }
252 | 
253 |     if(0>=mt) {
254 |         printf("Error: array size wrong!\n");
255 |         exit(1);
256 |     }
257 | 
258 |     /* ------------------------------------------------------ */
259 | 
260 |     long_size=sizeof(long); /* the size of long on this platform */
261 |     asize=1024*1024/long_size*mt; /* how many longs then in one array? */
262 | 
263 |     if(asize*long_size < block_size) { /* the array must hold at least one block */
264 |         printf("Error: array size smaller than block size (%llu bytes)!\n", block_size);
265 |         exit(1);
266 |     }
267 | 
268 |     if(!quiet) {
269 |         printf("Long uses %u bytes. ", long_size);
270 |         printf("Allocating 2*%llu elements = %llu bytes of memory.\n", asize, 2*asize*long_size);
271 |         if(tests[2]) {
272 |             printf("Using %llu bytes as blocks for memcpy block copy test.\n", block_size);
273 |         }
274 |     }
275 | 
276 |     a=make_array(asize);
277 |     b=make_array(asize);
278 | 
279 |     /* ------------------------------------------------------ */
280 |     if(!quiet) {
281 |         printf("Getting down to business... Doing %d runs per test.\n", nr_loops);
282 |     }
283 | 
284 |     /* run all tests requested, the proper number of times */
285 |     for(testno=0; testno<MAX_TESTS; testno++) {
286 |         te_sum=0;
287 |         if(tests[testno]) {
288 |             for (i=0; nr_loops==0 || i<nr_loops; i++) {
289 |                 te=worker(asize, a, b, testno, block_size);
290 |                 te_sum+=te;
291 |                 printf("%d\t", i);
292 |                 printout(te, mt, testno);
293 |             }
294 |             if(showavg) { /* with nr_loops==0 the run loop above has no exit condition */
295 |                 printf("AVG\t");
296 |                 printout(te_sum/nr_loops, mt, testno);
297 |             }
298 |         }
299 |     }
300 | 
301 |     free(a);
302 |     free(b);
303 |     return 0;
304 | }
305 | 


--------------------------------------------------------------------------------
/prepare/3.flw_bandwidth_test/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross toolchain location; override on the command line, e.g. `make CCL=/opt/riscv`.
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 4 | CC := ${CCL}/bin/${CTOOL}gcc
 5 | 
 6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
 7 | 
 8 | # clean is an action, not a file to build
 9 | .PHONY: clean
10 | 
11 | # Link the C driver with the assembly load routine.
12 | test:load_test.o load_flw.o
13 | 	${CC} $(C_FLAGS) -o test load_test.o load_flw.o
14 | 
15 | load_test.o:load_test.c
16 | 	${CC} $(C_FLAGS) -c load_test.c
17 | 
18 | load_flw.o:load_flw.S
19 | 	${CC} $(C_FLAGS) -c load_flw.S
20 | 
21 | # -f: do not fail when there is nothing to remove
22 | clean:
23 | 	rm -f test load_test.o load_flw.o


--------------------------------------------------------------------------------
/prepare/3.flw_bandwidth_test/load_flw.S:
--------------------------------------------------------------------------------
  1 | /*************************************************
  2 |  *
  3 |  *  Created by Aidget on 2022/11/30.           
  4 |  *  Copyright © 2022,  developed by Midea AIIC 
  5 |  *
  6 |  *************************************************/
  7 | 
  8 | # void load(size_t mc,         // a0: floats to read per outer pass, consumed 32 at a time
  9 | #           size_t nc,         // a1: number of outer passes
 10 | #           const float* a,    // a2: first address that is read
 11 | #           float* c          // a3: not referenced by this routine
 12 | #                 );
 13 | .global load
 14 | .type load, @function
 15 | 
 16 | #define mc a0
 17 | #define nc a1
 18 | #define ap a2
 19 | #define cp a3
 20 | 
 21 | #define mt       t0
 22 | #define nt       t1
 23 | #define ap0      t2
 24 | 
 25 | load:
 26 |     # mv s3, nc
 27 |     # vsetvli s2, s3, e32, m4
 28 |     mv mt, mc           # mt = countdown for the inner loop
 29 |     mv nt, nc           # nt is written here but never read afterwards
 30 |     mv ap0, ap          # ap0 = running read pointer; it is never rewound between passes
 31 | 
 32 | .start:
 33 | .loop1:
 34 |     mv mt, mc           # reload the inner countdown for this pass
 35 |     addi nc, nc, -1     # one outer pass consumed
 36 |     slti t6, nc, 0      # nc < 0, t6 = 1
 37 |     bnez t6, .end       # all outer passes done
 38 | .loop2:
 39 |     flw ft0, (ap0)      # first of 32 unrolled loads; ft0 is overwritten each time and never read
 40 |     addi ap0, ap0, 4
 41 |     flw ft0, (ap0)
 42 |     addi ap0, ap0, 4
 43 |     flw ft0, (ap0)
 44 |     addi ap0, ap0, 4
 45 |     flw ft0, (ap0)
 46 |     addi ap0, ap0, 4
 47 |     flw ft0, (ap0)
 48 |     addi ap0, ap0, 4
 49 |     flw ft0, (ap0)
 50 |     addi ap0, ap0, 4
 51 |     flw ft0, (ap0)
 52 |     addi ap0, ap0, 4
 53 |     flw ft0, (ap0)
 54 |     addi ap0, ap0, 4
 55 |     flw ft0, (ap0)
 56 |     addi ap0, ap0, 4
 57 |     flw ft0, (ap0)
 58 |     addi ap0, ap0, 4
 59 |     flw ft0, (ap0)
 60 |     addi ap0, ap0, 4
 61 |     flw ft0, (ap0)
 62 |     addi ap0, ap0, 4
 63 |     flw ft0, (ap0)
 64 |     addi ap0, ap0, 4
 65 |     flw ft0, (ap0)
 66 |     addi ap0, ap0, 4
 67 |     flw ft0, (ap0)
 68 |     addi ap0, ap0, 4
 69 |     flw ft0, (ap0)
 70 |     addi ap0, ap0, 4
 71 |     flw ft0, (ap0)
 72 |     addi ap0, ap0, 4
 73 |     flw ft0, (ap0)
 74 |     addi ap0, ap0, 4
 75 |     flw ft0, (ap0)
 76 |     addi ap0, ap0, 4
 77 |     flw ft0, (ap0)
 78 |     addi ap0, ap0, 4
 79 |     flw ft0, (ap0)
 80 |     addi ap0, ap0, 4
 81 |     flw ft0, (ap0)
 82 |     addi ap0, ap0, 4
 83 |     flw ft0, (ap0)
 84 |     addi ap0, ap0, 4
 85 |     flw ft0, (ap0)
 86 |     addi ap0, ap0, 4
 87 |     flw ft0, (ap0)
 88 |     addi ap0, ap0, 4
 89 |     flw ft0, (ap0)
 90 |     addi ap0, ap0, 4
 91 |     flw ft0, (ap0)
 92 |     addi ap0, ap0, 4
 93 |     flw ft0, (ap0)
 94 |     addi ap0, ap0, 4
 95 |     flw ft0, (ap0)
 96 |     addi ap0, ap0, 4
 97 |     flw ft0, (ap0)
 98 |     addi ap0, ap0, 4
 99 |     flw ft0, (ap0)
100 |     addi ap0, ap0, 4
101 |     flw ft0, (ap0)
102 |     addi ap0, ap0, 4
103 | 
104 |     addi mt, mt, -32   # the 32 loads above are accounted for
105 |     slti t6, mt, 1     # mt < 1, t6 = 1
106 |     beqz t6, .loop2    # keep going while mt is still at least 1
107 |     
108 |     j .loop1
109 | 
110 | .end:
111 |     ret
112 | 


--------------------------------------------------------------------------------
/prepare/3.flw_bandwidth_test/load_test.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <sys/time.h>
 3 | #include <stdlib.h>
 4 | #include <inttypes.h>
 5 | #include <stdbool.h>
 6 | 
 7 | // Assembly kernel from load_flw.S. It takes its arguments in a0..a3 and only
 8 | // reads memory; the c pointer is accepted but never dereferenced there.
 9 | extern void load(size_t mc,         // floats read per outer pass, a0
10 |                  size_t nc,         // number of outer passes,    a1
11 |                  const float* a,    // buffer that is read,       a2
12 |                  float* c           // unused by the kernel,      a3
13 |                 );
14 | 
15 | 
16 | // Times test_times calls of load() over an mc * nc float buffer and prints
17 | // the elapsed wall-clock time in milliseconds.
18 | int main(int argc, char const *argv[])
19 | {
20 |     const int warmup_times = 10;
21 |     const int test_times = 1000;
22 |     const size_t nc = 1024;
23 |     const size_t mc = 1024;
24 |     const size_t count = mc * nc;   // floats touched by one load() call
25 | 
26 |     float* a = (float*)malloc(count * sizeof(float));
27 |     float* c = (float*)malloc(count * sizeof(float));
28 |     if (a == NULL || c == NULL) {
29 |         // Without this check a failed allocation is dereferenced below.
30 |         fprintf(stderr, "malloc failed\n");
31 |         free(a);
32 |         free(c);
33 |         return 1;
34 |     }
35 | 
36 |     for (size_t i = 0; i < count; i++) a[i] = (float)i;
37 | 
38 |     struct timeval start;
39 |     struct timeval end;
40 | 
41 |     // warmup
42 |     for (int i = 0; i < warmup_times; i++){
43 |         load(mc, nc, a, c);
44 |     }
45 | 
46 |     gettimeofday(&start,NULL);
47 |     for (int i = 0; i < test_times; i++){
48 |         load(mc, nc, a, c);
49 |     }
50 |     gettimeofday(&end,NULL);
51 | 
52 |     // Elapsed microseconds, kept in double: a float has only 24 mantissa bits
53 |     // and stops resolving single microseconds after about 16.7 s.
54 |     double time_use = (end.tv_sec - start.tv_sec) * 1000000.0
55 |                     + (end.tv_usec - start.tv_usec);
56 |     printf("time_use is %.3fms\n", time_use/1000);
57 | 
58 |     free(a);
59 |     free(c);
60 |     return 0;
61 | }


--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross-compilation settings for the vlw bandwidth test (RISC-V, XuanTie toolchain).
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | # Toolchain root; override with `make CCL=/path/to/toolchain` or a CCL environment variable.
 4 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 5 | CC := ${CCL}/bin/${CTOOL}gcc
 6 | 
 7 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
 8 | 
 9 | # clean never creates a file called "clean"; declare it phony so it always runs.
10 | .PHONY: clean
11 | 
12 | test:load_test.o load_vlw.o
13 | 	${CC} $(C_FLAGS) -o test load_test.o load_vlw.o
14 | 
15 | load_test.o:load_test.c
16 | 	${CC} $(C_FLAGS) -c load_test.c
17 | 
18 | load_vlw.o:load_vlw.S
19 | 	${CC} $(C_FLAGS) -c load_vlw.S
20 | 
21 | # -f keeps clean from failing when the outputs have not been built yet.
22 | clean:
23 | 	rm -f test load_test.o load_vlw.o


--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/load_test.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <sys/time.h>
 3 | #include <stdlib.h>
 4 | #include <inttypes.h>
 5 | #include <stdbool.h>
 6 | 
 7 | // Assembly kernel from load_vlw.S. It takes its arguments in a0..a3 and only
 8 | // reads memory; the c pointer is accepted but never dereferenced there.
 9 | extern void load(size_t mc,         // floats read per outer pass, a0
10 |                  size_t nc,         // number of outer passes,    a1
11 |                  const float* a,    // buffer that is read,       a2
12 |                  float* c           // unused by the kernel,      a3
13 |                 );
14 | 
15 | 
16 | // Times test_times calls of load() over an mc * nc float buffer and prints
17 | // the elapsed wall-clock time in milliseconds.
18 | int main(int argc, char const *argv[])
19 | {
20 |     const int warmup_times = 10;
21 |     const int test_times = 1000;
22 |     const size_t nc = 1024;
23 |     const size_t mc = 1024;
24 |     const size_t count = mc * nc;   // floats touched by one load() call
25 | 
26 |     float* a = (float*)malloc(count * sizeof(float));
27 |     float* c = (float*)malloc(count * sizeof(float));
28 |     if (a == NULL || c == NULL) {
29 |         // Without this check a failed allocation is dereferenced below.
30 |         fprintf(stderr, "malloc failed\n");
31 |         free(a);
32 |         free(c);
33 |         return 1;
34 |     }
35 | 
36 |     for (size_t i = 0; i < count; i++) a[i] = (float)i;
37 | 
38 |     struct timeval start;
39 |     struct timeval end;
40 | 
41 |     // warmup
42 |     for (int i = 0; i < warmup_times; i++){
43 |         load(mc, nc, a, c);
44 |     }
45 | 
46 |     gettimeofday(&start,NULL);
47 |     for (int i = 0; i < test_times; i++){
48 |         load(mc, nc, a, c);
49 |     }
50 |     gettimeofday(&end,NULL);
51 | 
52 |     // Elapsed microseconds, kept in double: a float has only 24 mantissa bits
53 |     // and stops resolving single microseconds after about 16.7 s.
54 |     double time_use = (end.tv_sec - start.tv_sec) * 1000000.0
55 |                     + (end.tv_usec - start.tv_usec);
56 |     printf("time_use is %.3fms\n", time_use/1000);
57 | 
58 |     free(a);
59 |     free(c);
60 |     return 0;
61 | }


--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/load_vlw.S:
--------------------------------------------------------------------------------
 1 | /*************************************************
 2 |  *
 3 |  *  Created by Aidget on 2022/11/30.           
 4 |  *  Copyright © 2022,  developed by Midea AIIC 
 5 |  *
 6 |  *************************************************/
 7 | 
 8 | # void load(size_t mc,        // a0: 32-bit elements read per outer pass
 9 | #           size_t nc,        // a1: number of outer passes
10 | #           const float* a,   // a2: start of the buffer to read
11 | #           float* c          // a3: never referenced by this routine
12 | #           );
13 | #
14 | # Vector-load bandwidth kernel (RVV 0.7.1 syntax): reads the buffer with vlw.v
15 | # and discards the data into the v0 register group.
16 | #
17 | # The pointer step and the element count are derived from the vl that vsetvli
18 | # grants. The previous fixed step of 32 bytes per load (128 elements per 16
19 | # loads) only matches a vl of 8, so other LMUL/VLEN combinations re-read or
20 | # skipped data and could read past the end of the buffer.
21 | #
22 | # NOTE(review): mc is assumed to be a multiple of 16 * vl; the tail is not
23 | # handled - confirm for targets with a larger VLEN.
24 | .global load
25 | .type load, @function
26 | 
27 | #define mc a0
28 | #define nc a1
29 | #define ap a2
30 | #define cp a3
31 | 
32 | #define mt       t0
33 | #define ap0      t2
34 | #define vlen_e   t3
35 | #define step_b   t4
36 | #define chunk_e  t5
37 | 
38 | load:
39 |     vsetvli vlen_e, mc, e32, m8    # vlen_e = vl, elements moved by one vlw.v
40 |     slli step_b, vlen_e, 2         # bytes consumed by one vlw.v (4 bytes each)
41 |     slli chunk_e, vlen_e, 4        # elements consumed by the 16 loads of .loop2
42 |     mv ap0, ap
43 | 
44 | .start:
45 | .loop1:
46 |     mv mt, mc
47 |     addi nc, nc, -1
48 |     slti t6, nc, 0      # nc < 0, t6 = 1
49 |     bnez t6, .end
50 | .loop2:
51 |     .rept 16
52 |     vlw.v v0, (ap0)
53 |     add ap0, ap0, step_b
54 |     .endr
55 | 
56 |     sub mt, mt, chunk_e
57 |     slti t6, mt, 1      # mt < 1, t6 = 1
58 |     beqz t6, .loop2
59 | 
60 |     j .loop1
61 | 
62 | .end:
63 |     ret


--------------------------------------------------------------------------------
/prepare/5.saxpy/Makefile:
--------------------------------------------------------------------------------
 1 | # Cross-compilation settings for the saxpy example (RISC-V, XuanTie toolchain).
 2 | CTOOL := riscv64-unknown-linux-gnu-
 3 | # Toolchain root; override with `make CCL=/path/to/toolchain` or a CCL environment variable.
 4 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 5 | CC := ${CCL}/bin/${CTOOL}gcc
 6 | 
 7 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
 8 | 
 9 | # clean never creates a file called "clean"; declare it phony so it always runs.
10 | .PHONY: clean
11 | 
12 | test:main.o saxpy.o
13 | 	${CC} $(C_FLAGS) -o test  saxpy.o  main.o
14 | 
15 | main.o:main.c
16 | 	${CC} $(C_FLAGS) -c main.c
17 | 
18 | saxpy.o:saxpy.S
19 | 	${CC} $(C_FLAGS) -c saxpy.S
20 | 
21 | # -f keeps clean from failing when the outputs have not been built yet.
22 | clean:
23 | 	rm -f test main.o saxpy.o


--------------------------------------------------------------------------------
/prepare/5.saxpy/main.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <sys/time.h>
 3 | #include <stdlib.h>
 4 | #include<inttypes.h>
 5 | 
 6 | extern void saxpy(size_t n, const float a, const float *x, float *y);
 7 | // # void 
 8 | // # saxpy(size_t n, const float a, const float *x, float *y) 
 9 | // # { 
10 | // #   size_t i; 
11 | // #   for (i=0; i<n; i++) 
12 | // #     y[i] = a * x[i] + y[i]; 
13 | // # } 
14 | 
15 | // Runs the assembly saxpy kernel on a small fixed vector and prints the updated y.
16 | int main(int argc, char const *argv[])
17 | {
18 |     float a = 2.0;
19 |     float x[] = {1,2,3,4,5,6,7,8,9,10,11};
20 |     // y is sized from x so both operands always have the same length.
21 |     float y[sizeof(x) / sizeof(x[0])] = {1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.95,1.97};
22 |     // Take the length from the array itself so the call and the print loop
23 |     // cannot drift apart from the data (it used to be spelled out separately).
24 |     size_t n = sizeof(x) / sizeof(x[0]);
25 | 
26 |     saxpy(n, a, x, y);
27 | 
28 |     for(size_t i = 0; i < n ; i++) {
29 |         printf("%f\t", y[i]);
30 |     }
31 |     printf("\n");
32 |     return 0;
33 | }


--------------------------------------------------------------------------------
/prepare/5.saxpy/saxpy.S:
--------------------------------------------------------------------------------
 1 | .text
 2 | .balign 4
 3 | .global saxpy
 4 | 
 5 | # void 
 6 | # saxpy(size_t n, const float a, const float *x, float *y) 
 7 | # { 
 8 | #   size_t i; 
 9 | #   for (i=0; i<n; i++) 
10 | #     y[i] = a * x[i] + y[i]; 
11 | # } 
12 | # 
13 | # register arguments: 
14 | #     a0      n 
15 | #     fa0     a 
16 | #     a1      x 
17 | #     a2      y 
18 |  
19 | saxpy:                       # loop head: each pass handles the next a4 elements
20 |     vsetvli a4, a0, e32, m8   # a4 = elements granted for this pass (32-bit, LMUL 8)
21 |     vlw.v v0, (a1)            # load that many x values into the v0 group
22 |     sub a0, a0, a4            # a0 = elements still to process
23 |     slli a4, a4, 2            # element count -> byte count (4 bytes per float)
24 |     add a1, a1, a4            # advance the x pointer
25 |     vlw.v v8, (a2)            # load the matching y values into the v8 group
26 |     vfmacc.vf v8, fa0, v0     # v8 += fa0 * v0, i.e. y = a * x + y
27 |     vsw.v v8, (a2)            # store the result back over y
28 |     add a2, a2, a4            # advance the y pointer
29 |     bnez a0, saxpy            # repeat from the top (fresh vsetvli) until a0 == 0
30 |     ret


--------------------------------------------------------------------------------
/prepare/README.md:
--------------------------------------------------------------------------------
  1 | # riscv_op_test
  2 | 
  3 | 测试与优化riscv的算子
  4 | 
  5 | - 运行指令
  6 | 
  7 | ```shell
  8 | make -j8
  9 | 
 10 | adb push test ./.
 11 | 
 12 | adb shell "./test"
 13 | ```
 14 | 
 15 | ## 0.hello world
 16 | 一个简易的helloworld工程，可用于测试交叉编译链是否可用。
 17 | ```shell
 18 | $ make
 19 | $ adb push hello_world ./.
 20 | $ adb shell "./hello_world"
 21 | Hello NeZha
 22 | ```
 23 | ## 1.memory copy
 24 | memcpy的小实验，需要注意`vsetvli`、`load`、`store`的指令v0.7和v1.0是不一样的，目前用的是v0.7。
 25 | ```shell
 26 | $ make
 27 | $ adb push test ./.
 28 | $ adb shell "./test"
 29 | Hello NeZha
 30 | 0       1       2       3       4       5       6       7       8       9
 31 | ```
 32 | | 指令    | v0.7                   | v1.0                           | 备注          |
 33 | | ------- | ---------------------- | ------------------------------ | ------------- |
 34 | | vsetvli | vsetvli t0, a2, e8, m8 | vsetvli t0, a2, e8, m8, ta, ma | Vectors of 8b |
 35 | | load    | vlb.v v0, (a1)         | vle8.v v0, (a1)                | Load bytes    |
 36 | | store   | vsb.v v0, (a3)         | vse8.v v0, (a3)                | Store bytes   |
 37 | 
 38 |     ta   # Tail agnostic
 39 |     tu   # Tail undisturbed
 40 |     ma   # Mask agnostic
 41 |     mu   # Mask undisturbed
 42 | 
 43 | > 在 v0.9 之前，当未在 vsetvli 上指定这些标志时，它们默认为掩码未受干扰/尾部未受干扰
 44 | 
 45 | `vsetvli t0, a2, e8`
 46 | 
 47 | 这个例子中，初见`vsetvli`指令，a2是长度n。
 48 | 
 49 | - 1st，a2 = 10 --> t0 = 8 --> a2 = 2
 50 | - 2nd, a2 = 2 --> t0 = 2 --> a2 = 0 -->ret
 51 | 
 52 | ## 2.memcpy bandwidth test
 53 | 测试内存带宽的小脚本
 54 | ```shell
 55 | $ make
 56 | $ adb push mbw ./.
 57 | $ adb shell "./mbw 100"
 58 | Long uses 8 bytes. Allocating 2*13107200 elements = 209715200 bytes of memory.
 59 | Using 262144 bytes as blocks for memcpy block copy test.
 60 | Getting down to business... Doing 10 runs per test.
 61 | ...
 62 | AVG     Method: MEMCPY  Elapsed: 0.09614        MiB: 100.00000  Copy: 1040.166 MiB/s
 63 | ...
 64 | AVG     Method: DUMB    Elapsed: 0.60301        MiB: 100.00000  Copy: 165.835 MiB/s
 65 | ...
 66 | AVG     Method: MCBLOCK Elapsed: 0.09692        MiB: 100.00000  Copy: 1031.754 MiB/s
 67 | ```
 68 | ## 3.flw bandwidth test
 69 | 使用`flw`测试内存带宽的小脚本
 70 | 
 71 | ```shell
 72 | $ make
 73 | $ adb push test ./.
 74 | $ adb shell "./test"
 75 | time_use is 2686.789ms
 76 | ```
 77 | flw: 4000MB/(2678.298 − 1171.154)ms = 2.592GB/s
 78 | 
 79 | ## 4.vlw bandwidth test
 80 | 使用`vlw`测试内存带宽的小脚本
 81 | 
 82 | ```shell
 83 | $ make
 84 | $ adb push test ./.
 85 | $ adb shell "./test"
 86 | ```
 87 | vlw(m1): 4000MB/(10391.898 − 1273.346)ms = 0.428GB/s 
 88 | 
 89 | vlw(m2): 4000MB/(9922.699 − 641.811)ms = 0.421GB/s 
 90 | 
 91 | vlw(m4): 4000MB/(4002.607 − 327.388)ms = 1.063GB/s 
 92 | 
 93 | vlw(m8): 4000MB/(3829.181 − 166.246)ms = 1.066GB/s
 94 | 
 95 | 综上,哪吒D1的内存带宽测试结果为：
 96 | 
 97 | ![](./imgs/memory_bandwidth_test.png)
 98 | 
 99 | ---
100 | 
101 | ## 5.saxpy
102 | > SAXPY（Scalar Alpha X Plus Y）是一个在 Basic Linear Algebra Subprograms（BLAS）数据包中的函数，并且是一个并行向量处理机（vector processor）中常用的计算操作指令。
103 | 
104 | y=αx+y，其中α是标量，x和y是矢量。
105 | ```shell
106 | $ make
107 | $ adb push test ./.
108 | $ adb shell "./test"
109 | 3.100000        5.200000        7.300000        9.400000        11.500000       13.600000       15.700000       17.799999       19.900000       21.950001 23.969999
110 | ```
111 | `vsetvli a4, a0, e32, m8 `
112 | 
113 | 这个例子中，又见`vsetvli`指令，`vsetvli`使用`m8`参数设置了每条指令处理8个连续的向量寄存器，a0是长度n。
114 | 
115 | n = 11 --> a0 = 11
116 | 
117 | a = 2.0 --> fa0 = 2.0
118 | 
119 | > `vsetvli a4, a0, e32, m8`
120 | 
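121 | a4 = min(11, VLMAX)。为便于逐步说明，下文假设 VLMAX = 8，即 a4 = 8（注意：e32/m8 下 VLMAX = VLEN/32 × 8，在 VLEN = 128 的硬件上为 32，此时 a4 = 11，循环只执行一次）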
122 | 
123 | > `vlw.v v0, (a1) `
124 | 
125 | v0-v7 = x0-x7  next: v0-v7 = x8-...
126 | 
127 | > `sub a0, a0, a4`
128 | 
129 | a0 = a0 - a4 = 11 - 8 = 3
130 | 
131 | > `slli a4, a4, 2 `
132 | 
133 | a4 = a4 << 2 = 8*4 = 32 # float占4个Byte
134 | 
135 | > `add a1, a1, a4 `
136 | 
137 | a1本指向x0，现在指向x8
138 | 
139 | > `vlw.v v8, (a2) `
140 | 
141 | y0-y7 load到 v8-v15
142 | 
143 | > `vfmacc.vf v8, fa0, v0 `
144 | 
145 | (v8-v15) = fa0 * (v0-v7) + (v8-v15)
146 | 
147 | > `vsw.v v8, (a2) `
148 | 
149 | store到`y0`
150 | 
151 | > `add a2, a2, a4 `
152 | 
153 | `a2`本指向`y0`，现在指向`y8`
154 | 
155 | ---
156 | 
157 | OK，我认为进行到这里，一些关于`RISC-V V扩展`的基础知识已经具备，关于板子的内存性能也已经得到，可以开始写 `sgemm` 算子了!


--------------------------------------------------------------------------------
/prepare/imgs/memory_bandwidth_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/prepare/imgs/memory_bandwidth_test.png


--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * bl_sgemm.h
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * this header file contains all function prototypes.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | 
 47 | #ifndef BLISLAB_DGEMM_H
 48 | #define BLISLAB_DGEMM_H
 49 | 
 50 | // Allow C++ users to include this header file in their source code. However,
 51 | // we make the extern "C" conditional on whether we're using a C++ compiler,
 52 | // since regular C compilers don't understand the extern "C" construct.
 53 | #ifdef __cplusplus
 54 | extern "C" {
 55 | #endif
 56 | 
 57 | 
 58 | #include <math.h>
 59 | 
 60 | #include <stdio.h>
 61 | #include <stdlib.h>
 62 | #include <stdbool.h>
 63 | 
 64 | // Determine the target operating system: only Linux is accepted, any other
 65 | // target stops the build at the #error below.
 66 | #if defined(__linux__)
 67 | #define BL_OS_LINUX 1
 68 | #else
 69 | #error "unsupport OS, this only support Linux"
 70 | #endif
 71 | 
 72 | // <sys/time.h> is needed for gettimeofday(); <time.h> for clock_gettime().
 73 | #include <sys/time.h>
 74 | #include <time.h>
 75 | // Alignment in bytes requested from posix_memalign() by bl_malloc_aligned().
 76 | #define GEMM_SIMD_ALIGN_SIZE 32
 77 | // Function-like minimum; the selected argument is evaluated twice.
 78 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
 79 | // Earlier column-major accessors, kept for reference:
 80 | // #define A( i, j )     A[ (j)*lda + (i) ]
 81 | // #define B( i, j )     B[ (j)*ldb + (i) ]
 82 | // #define C( i, j )     C[ (j)*ldc + (i) ]
 83 | // #define C_ref( i, j ) C_ref[ (j)*ldc_ref + (i) ]
 84 | // Row-major accessors: element (i, j) is stored at index i * ld + j.
 85 | #define A( i, j )     A[ (i)*lda + (j) ]
 86 | #define B( i, j )     B[ (i)*ldb + (j) ]
 87 | #define C( i, j )     C[ (i)*ldc + (j) ]
 88 | #define C_ref( i, j ) C_ref[ (i)*ldc_ref + (j) ]
 89 | 
 90 | // SGEMM under test. NOTE(review): presumably C += A * B like bl_sgemm_ref - confirm.
 91 | void bl_sgemm(
 92 |         int    m,
 93 |         int    n,
 94 |         int    k,
 95 |         float *A,
 96 |         int    lda,
 97 |         float *B,
 98 |         int    ldb,
 99 |         float *C,
100 |         int    ldc
101 |         );
102 | 
103 | // Takes extra mr/nr and packA/packB arguments; not defined in the common sources.
104 | void bl_sgemm_pack(
105 |         int    m,
106 |         int    mr,
107 |         int    n,
108 |         int    nr,
109 |         int    k,
110 |         float *A,
111 |         float *packA,
112 |         int    lda,
113 |         float *B,
114 |         float *packB,
115 |         int    ldb,
116 |         float *C,
117 |         int    ldc
118 |         );
119 | 
120 | // Allocates size * m * n bytes aligned to GEMM_SIMD_ALIGN_SIZE; exits on failure.
121 | float *bl_malloc_aligned( int m, int n, int size );
122 | 
123 | // Kept for compatibility; bl_sgemm_util.c defines bl_sgemm_printmatrix instead.
124 | void bl_printmatrix( float *A, int lda, int m, int n );
125 | 
126 | // Prints the m x n matrix A (leading dimension lda), one matrix row per line.
127 | void bl_sgemm_printmatrix( float *A, int lda, int m, int n );
128 | 
129 | // CLOCK_MONOTONIC time in seconds, measured from the second of the first call.
130 | float bl_clock( void );
131 | float bl_clock_helper( void );
132 | 
133 | // Reference SGEMM: XC += XA * XB with row-major operands.
134 | void bl_sgemm_ref(
135 |     int    m,
136 |     int    n,
137 |     int    k,
138 |     float *XA,
139 |     int    lda,
140 |     float *XB,
141 |     int    ldb,
142 |     float *XC,
143 |     int    ldc
144 |     );
145 | 
146 | // End extern "C" construct block.
147 | #ifdef __cplusplus
148 | }
149 | #endif
150 | 
151 | #endif
152 | 
153 | 


--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm_ref.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_sgemm_ref.c
33 |  *
34 |  *
35 |  * Purpose:
36 |  * implement reference mkl using GEMM (optional) in C.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #include <bl_sgemm.h>
47 | 
48 | // Reference SGEMM: XC += XA * XB for row-major m x k, k x n and m x n
49 | // matrices with leading dimensions lda, ldb and ldc.
50 | void bl_sgemm_ref(
51 |         int    m,
52 |         int    n,
53 |         int    k,
54 |         float *XA,
55 |         int    lda,
56 |         float *XB,
57 |         int    ldb,
58 |         float *XC,
59 |         int    ldc
60 |         )
61 | {
62 |     int i, j, p;
63 | 
64 |     // Nothing to accumulate when any dimension is empty.
65 |     if ( m <= 0 || n <= 0 || k <= 0 ) return;
66 | 
67 |     // i-p-j order keeps XB and XC accesses unit-stride; XC must not overlap XA or XB.
68 |     for ( i = 0; i < m; i ++ ) {
69 |         float *c_row = &XC[ i * ldc ];
70 |         for ( p = 0; p < k; p ++ ) {
71 |             const float  a_ip  = XA[ i * lda + p ];   // invariant across the j loop
72 |             const float *b_row = &XB[ p * ldb ];
73 |             for ( j = 0; j < n; j ++ ) c_row[ j ] += a_ip * b_row[ j ];
74 |         }
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm_util.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * bl_sgemm_util.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * Utility routines (Mem allocation, Print, etc.) that will come in handy later.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | #include "bl_sgemm.h"
 47 | 
 48 | /*
 49 |  * Allocate size * m * n bytes aligned to GEMM_SIMD_ALIGN_SIZE.
 50 |  * Prints a message and exits with status 1 when the request is invalid or fails.
 51 |  */
 52 | float *bl_malloc_aligned(
 53 |         int    m,
 54 |         int    n,
 55 |         int    size
 56 |         )
 57 | {
 58 |     void *ptr = NULL;
 59 |     int   err = ( m < 0 || n < 0 || size < 0 );   // negative extents would wrap to huge sizes
 60 | 
 61 |     // Multiply in size_t: the product overflows int for large matrices.
 62 |     if ( !err ) err = posix_memalign( &ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, (size_t)size * (size_t)m * (size_t)n );
 63 |     if ( err ) {
 64 |         printf( "bl_malloc_aligned(): posix_memalign() failures" );
 65 |         exit( 1 );    
 66 |     }
 67 | 
 68 |     return (float *)ptr;
 69 | }
 70 | 
 71 | 
 72 | 
 73 | /*
 74 |  * Print the m x n row-major matrix A (leading dimension lda) to stdout,
 75 |  * one matrix row per line with tab-separated entries.
 76 |  */
 77 | void bl_sgemm_printmatrix(
 78 |         float *A,
 79 |         int    lda,
 80 |         int    m,
 81 |         int    n
 82 |         )
 83 | {
 84 |     int    i, j;
 85 |     for ( i = 0; i < m; i ++ ) {
 86 |         for ( j = 0; j < n; j ++ ) {
 87 |             printf("%lf\t", A[i * lda + j]);   // row-major, matching the A( i, j ) macro
 88 |         }
 89 |         printf("\n");
 90 |     }
 91 | }
 92 | 
 93 | /*
 94 |  * The timer functions are based on the ones copied from BLIS 0.2.0.
 95 |  * The reference second is stored as time_t: converting tv_sec to float
 96 |  * before subtracting is inexact once it exceeds 2^24 (float mantissa width).
 97 |  */
 98 | static time_t gtod_ref_time_sec = 0;
 99 | static bool   gtod_ref_time_set = false;
100 | 
101 | float bl_clock( void )
102 | {
103 |     return bl_clock_helper();
104 | }
105 | 
106 | // --- Begin Linux build definitions -------------------------------------------
107 | 
108 | float bl_clock_helper()
109 | {
110 |     struct timespec ts;
111 | 
112 |     clock_gettime( CLOCK_MONOTONIC, &ts );
113 | 
114 |     // The first call fixes the epoch; a flag is used because tv_sec may itself be 0.
115 |     if ( !gtod_ref_time_set ) {
116 |         gtod_ref_time_sec = ts.tv_sec;
117 |         gtod_ref_time_set = true;
118 |     }
119 | 
120 |     return (float)( (double)( ts.tv_sec - gtod_ref_time_sec ) + ts.tv_nsec * 1.0e-9 );
121 | }
122 | 
123 | // --- End Linux build definitions ---------------------------------------------
124 | 
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * test_bl_sgemm.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * test driver for BLISLAB sgemm routine and reference sgemm routine.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | 
 47 | #include "bl_sgemm.h"
 48 | 
 49 | #define ERROR_TEST
 50 | 
 51 | #define TOLERANCE 1E-2
 52 | // For every row, report the first entry whose values in C and C_ref differ
 53 | // by more than TOLERANCE; the rest of that row is not inspected.
 54 | void computeError(
 55 |         int    ldc,
 56 |         int    ldc_ref,
 57 |         int    m,
 58 |         int    n,
 59 |         float *C,
 60 |         float *C_ref
 61 |         )
 62 | {
 63 |     for ( int row = 0; row < m; row ++ ) {
 64 |         int col = 0;
 65 |         // Walk the row until an out-of-tolerance entry or the end of the row.
 66 |         while ( col < n && !( fabs( C( row, col ) - C_ref( row, col ) ) > TOLERANCE ) ) col ++;
 67 |         if ( col >= n ) continue;   // the whole row is within tolerance
 68 | 
 69 |         printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, C( row, col ), C_ref( row, col ) );
 70 |     }
 71 | }
 72 | 
 73 | /*
 74 |  * Benchmark bl_sgemm against bl_sgemm_ref for one (m, n, k) problem and print
 75 |  * one row: m, n, k, best GFLOPS of bl_sgemm, best GFLOPS of the reference
 76 |  * (0 when ERROR_TEST is not defined).
 77 |  *
 78 |  * The accessor macros and bl_sgemm_ref are row-major, so each leading
 79 |  * dimension is the column count of its matrix.
 80 |  */
 81 | void test_bl_sgemm(
 82 |         int m,
 83 |         int n,
 84 |         int k
 85 |         ) 
 86 | {
 87 |     int    i, j, p;
 88 |     float *A, *B, *C, *C_ref;
 89 |     float  flops;
 90 |     float  bl_sgemm_beg, bl_sgemm_time;
 91 |     int    nrepeats;
 92 |     int    lda, ldb, ldc, ldc_ref;
 93 |     float  bl_sgemm_rectime = 0.0f;
 94 |     float  ref_gflops       = 0.0f;   // stays 0 when the reference run is compiled out
 95 | 
 96 |     // Row-major storage: A is m x k, B is k x n, C and C_ref are m x n.
 97 |     lda     = k;
 98 |     ldb     = n;
 99 |     ldc     = n;
100 |     ldc_ref = n;
101 | 
102 |     A     = (float*)malloc( sizeof(float) * m * k );
103 |     B     = (float*)malloc( sizeof(float) * k * n );
104 |     // Over-allocate C by four rows, mirroring the padding of the original allocation.
105 |     C     = bl_malloc_aligned( m + 4, ldc, sizeof(float) );
106 |     C_ref = (float*)malloc( sizeof(float) * m * n );
107 | 
108 |     if ( A == NULL || B == NULL || C_ref == NULL ) {
109 |         printf( "test_bl_sgemm(): malloc() failures" );
110 |         exit( 1 );
111 |     }
112 | 
113 |     nrepeats = 3;
114 | 
115 |     srand48 (time(NULL));
116 | 
117 |     // Randomly generate points in [ 0, 1 ].
118 |     for ( i = 0; i < m; i ++ ) {
119 |         for ( p = 0; p < k; p ++ ) {
120 |             A( i, p ) = (float)( drand48() );
121 |         }
122 |     }
123 |     for ( p = 0; p < k; p ++ ) {
124 |         for ( j = 0; j < n; j ++ ) {
125 |             B( p, j ) = (float)( drand48() );
126 |         }
127 |     }
128 | 
129 |     for ( i = 0; i < m; i ++ ) {
130 |         for ( j = 0; j < n; j ++ ) {
131 |             C_ref( i, j ) = (float)( 0.0 );
132 |                 C( i, j ) = (float)( 0.0 );
133 |         }
134 |     }
135 | 
136 |     // Time bl_sgemm nrepeats times and keep the fastest run. C is not reset
137 |     // between runs; C_ref receives the same number of reference runs below.
138 |     for ( i = 0; i < nrepeats; i ++ ) {
139 |         bl_sgemm_beg = bl_clock();
140 |         {
141 |             bl_sgemm(
142 |                     m,
143 |                     n,
144 |                     k,
145 |                     A,
146 |                     lda,
147 |                     B,
148 |                     ldb,
149 |                     C,
150 |                     ldc
151 |                     );
152 |         }
153 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
154 | 
155 |         if ( i == 0 || bl_sgemm_time < bl_sgemm_rectime ) {
156 |             bl_sgemm_rectime = bl_sgemm_time;
157 |         }
158 |     }
159 | 
160 |     // Compute overall floating point operations.
161 |     flops = ( m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );
162 | 
163 | #ifdef ERROR_TEST
164 |     {
165 |         float ref_beg, ref_time, ref_rectime = 0.0f;
166 | 
167 |         for ( i = 0; i < nrepeats; i ++ ) {
168 |             ref_beg = bl_clock();
169 |             bl_sgemm_ref( m, n, k, A, lda, B, ldb, C_ref, ldc_ref );
170 |             ref_time = bl_clock() - ref_beg;
171 | 
172 |             if ( i == 0 || ref_time < ref_rectime ) {
173 |                 ref_rectime = ref_time;
174 |             }
175 |         }
176 |         ref_gflops = flops / ref_rectime;
177 | 
178 |         computeError( ldc, ldc_ref, m, n, C, C_ref );
179 |     }
180 | #endif
181 | 
182 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n", 
183 |             m, n, k, flops / bl_sgemm_rectime, ref_gflops );
184 | 
185 |     free( A     );
186 |     free( B     );
187 |     free( C     );
188 |     free( C_ref );
189 | }
190 | 
191 | // Print the header row, then benchmark square problems m = n = k = 16, 20, ..., 800.
192 | int main( int argc, char *argv[] )
193 | {
194 |     int dim = 16;
195 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
196 |     while ( dim <= 800 ) {
197 |         test_bl_sgemm( dim, dim, dim );
198 |         dim += 4;
199 |     }
200 |     return 0;
201 | }


--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm_packB_4x16.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * test_bl_sgemm_packB_4x16.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * test driver for BLISLAB sgemm routine and reference sgemm routine.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | 
 47 | #include "bl_sgemm.h"
 48 | 
 49 | #define ERROR_TEST
 50 | 
 51 | #define TOLERANCE 1E-2
 52 | // Compare C against C_ref entry by entry. For each first index (row) the
 53 | // first second index (col) whose absolute difference exceeds TOLERANCE is
 54 | // printed, and the remainder of that row is skipped.
 55 | void computeError(
 56 |         int    ldc,
 57 |         int    ldc_ref,
 58 |         int    m,
 59 |         int    n,
 60 |         float *C,
 61 |         float *C_ref
 62 |         )
 63 | {
 64 |     for ( int row = 0; row < m; row ++ ) {
 65 |         int col = 0;
 66 |         // Advance to the first out-of-tolerance entry in this row, if any.
 67 |         while ( col < n && !( fabs( C( row, col ) - C_ref( row, col ) ) > TOLERANCE ) ) col ++;
 68 |         if ( col < n )
 69 |             printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, C( row, col ), C_ref( row, col ) );
 70 |     }
 71 | }
 72 | 
 73 | // Pack a kc x nc matrix into panels of nr columns: each panel is kc groups of nr
 74 | // consecutive floats. transpose=false reads src[k * nc + n]; transpose=true reads
 75 | // src[n * kc + k]. When nc is not a multiple of nr the last panel is zero-padded,
 76 | // so dst must hold ceil(nc / nr) * nr * kc floats.
 77 | void PackWeightLayout(float* dst, const float* src, int nc, int kc, int nr, bool transpose) {
 78 |     if (nr <= 0) return;  // a non-positive panel width would never advance the outer loop
 79 |     int index = 0;
 80 |     for (int nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
 81 |         int nr_block_size = (nc - nr_block_start < nr) ? (nc - nr_block_start) : nr;
 82 |         for (int kr = 0; kr < kc; kr++) {
 83 |             for (int off = 0; off < nr; off++) {
 84 |                 int col = nr_block_start + off;
 85 |                 // Padding lanes were previously skipped and left uninitialized.
 86 |                 if (off >= nr_block_size)  dst[index++] = 0.0f;
 87 |                 else if (transpose)        dst[index++] = src[col * kc + kr];
 88 |                 else                       dst[index++] = src[kr * nc + col];
 89 |             }
 90 |         }
 91 |     }
 92 | }
 93 | 
 94 | /*
 95 |  * Benchmark bl_sgemm_pack() on a single (m, n, k) problem and print one result row.
 96 |  *
 97 |  * A and B are filled with drand48() values, C / C_ref are zeroed and B is packed
 98 |  * once into packB with PackWeightLayout() before timing starts. bl_sgemm_pack()
 99 |  * is timed nrepeats times and the fastest run is kept. When ERROR_TEST is defined,
100 |  * bl_sgemm_ref() is timed the same way and both results are compared with
101 |  * computeError().
102 |  *
103 |  * Output columns: m, n, k, GFLOPS of bl_sgemm_pack(), GFLOPS of bl_sgemm_ref()
104 |  * (0.000 when the reference run is compiled out).
105 |  */
106 | void test_bl_sgemm(
107 |         int m,
108 |         int n,
109 |         int k
110 |         )
111 | {
112 |     int    i, j, p;
113 |     float *A, *B, *C, *C_ref, *packA, *packB;
114 |     int    nrepeats;
115 |     int    lda, ldb, ldc, ldc_ref;
116 |     // Timings and rates are kept in double: bl_clock()'s return type is not
117 |     // visible here, and double holds a float or a double sample without loss.
118 |     double flops;
119 |     double bl_sgemm_beg, bl_sgemm_time, bl_sgemm_rectime = 0.0;
120 |     // Stays 0.0 unless the ERROR_TEST section below fills it in, so the
121 |     // final printf never reads an uninitialized value.
122 |     double ref_gflops = 0.0;
123 | 
124 |     int mr = 4;
125 |     int nr = 16;
126 | 
127 |     lda = k;
128 |     ldb = n;
129 |     ldc     = n;
130 |     ldc_ref = n;
131 | 
132 |     // NOTE(review): buffer sizes (the 2x factors, + mr / + nr slack) are unchanged;
133 |     // presumably headroom for the kernel's edge tiles -- confirm against bl_sgemm_pack().
134 |     A      = (float*)malloc( sizeof(float) * m * k *2);
135 |     B      = (float*)malloc( sizeof(float) * k * n );
136 |     // Allocate packing buffers
137 |     packA  = bl_malloc_aligned( m + mr, k, sizeof(float) );
138 |     packB  = bl_malloc_aligned( k*2, n + nr, sizeof(float) );
139 |     C      = bl_malloc_aligned( ldc, n + nr, sizeof(float) );
140 |     C_ref  = (float*)malloc( sizeof(float) * m * n );
141 |     if ( A == NULL || B == NULL || packA == NULL || packB == NULL || C == NULL || C_ref == NULL ) {
142 |         fprintf( stderr, "test_bl_sgemm(): allocation failed for m=%d n=%d k=%d\n", m, n, k );
143 |         free( A ); free( B ); free( C_ref );
144 |         free( packA ); free( packB ); free( C );
145 |         return;
146 |     }
147 | 
148 |     nrepeats = 1;
149 | 
150 |     srand48 (time(NULL));
151 | 
152 |     // Randomly generate points in [ 0, 1 ).
153 |     for ( p = 0; p < k; p ++ ) {
154 |         for ( i = 0; i < m; i ++ ) {
155 |             A( i, p ) = (float)( drand48() );
156 |             // A( i, p ) = (float)( i*m + p );
157 |         }
158 |     }
159 |     for ( j = 0; j < n; j ++ ) {
160 |         for ( p = 0; p < k; p ++ ) {
161 |             B( p, j ) = (float)( drand48() );
162 |             // B( p, j ) = (float)( p*n + j );
163 |         }
164 |     }
165 | 
166 |     for ( j = 0; j < n; j ++ ) {
167 |         for ( i = 0; i < m; i ++ ) {
168 |             C_ref( i, j ) = (float)( 0.0 );
169 |                 C( i, j ) = (float)( 0.0 );
170 |         }
171 |     }
172 | 
173 |     // Pack B once, outside the timed region.
174 |     PackWeightLayout(packB, B, n, k, nr, false);
175 | 
176 |     // printf("[B]\n");
177 |     // for(int i = 0; i < k; i++) {
178 |     //   for(int j = 0; j < n; j++) {
179 |     //     printf("%.1f\t", B[i * n + j]);
180 |     //   }
181 |     //   printf("\n");
182 |     // }
183 | 
184 |     // printf("[packB]\n");
185 |     // for(int i = 0; i < k; i++) {
186 |     //   for(int j = 0; j < nr; j++) {
187 |     //     printf("%.1f\t", packB[i * nr + j]);
188 |     //   }
189 |     //   printf("\n");
190 |     // }
191 | 
192 |     // Overall floating point operations in units of 1e9; m * n is formed in double to avoid int overflow.
193 |     flops = ( (double)m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2.0 * k );
194 | 
195 |     // Time bl_sgemm_pack() and keep the fastest of nrepeats runs.
196 |     for ( i = 0; i < nrepeats; i ++ ) {
197 |         bl_sgemm_beg = bl_clock();
198 |         bl_sgemm_pack( m, mr, n, nr, k,
199 |                        A, packA, lda,
200 |                        B, packB, ldb,
201 |                        C, ldc );
202 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
203 | 
204 |         if ( i == 0 || bl_sgemm_time < bl_sgemm_rectime ) {
205 |             bl_sgemm_rectime = bl_sgemm_time;
206 |         }
207 |     }
208 | 
209 | #ifdef ERROR_TEST
210 |     {
211 |         double ref_beg, ref_time, ref_rectime = 0.0;
212 | 
213 |         // Same timing protocol for the reference implementation.
214 |         for ( i = 0; i < nrepeats; i ++ ) {
215 |             ref_beg = bl_clock();
216 |             bl_sgemm_ref( m, n, k,
217 |                           A, lda,
218 |                           B, ldb,
219 |                           C_ref, ldc_ref );
220 |             ref_time = bl_clock() - ref_beg;
221 | 
222 |             if ( i == 0 || ref_time < ref_rectime ) {
223 |                 ref_rectime = ref_time;
224 |             }
225 |         }
226 |         ref_gflops = flops / ref_rectime;
227 | 
228 |         computeError( ldc, ldc_ref, m, n,
229 |                       C, C_ref );
230 |     }
231 | #endif
232 |     // printf("ref\n");
233 |     // for(int i = 0; i < m; i++) {
234 |     //   for(int j = 0; j < n; j++) {
235 |     //     printf("%.0f\t", C_ref[i * n + j]);
236 |     //   }
237 |     //   printf("\n");
238 |     // }
239 |     // printf("\n\n");
240 | 
241 |     // printf("C\n");
242 |     // for(int i = 0; i < m; i++) {
243 |     //   for(int j = 0; j < n; j++) {
244 |     //     printf("%.0f\t", C[i * n + j]);
245 |     //   }
246 |     //   printf("\n");
247 |     // }
248 |     // printf("\n\n");
249 | 
250 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n", 
251 |             m, n, k, flops / bl_sgemm_rectime, ref_gflops );
252 | 
253 |     free( A     );
254 |     free( packA );
255 |     free( B     );
256 |     free( packB );
257 |     free( C     );
258 |     free( C_ref );
259 | }
260 | 
261 | // Print the header row, then benchmark square problems m = n = k = 16, 20, ..., 800.
262 | int main( int argc, char *argv[] )
263 | {
264 |     int dim = 16;
265 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
266 |     while ( dim <= 800 ) {
267 |         test_bl_sgemm( dim, dim, dim );
268 |         dim += 4;
269 |     }
270 |     return 0;
271 | }


--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm_packB_4x4.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * test_bl_sgemm_packB_4x4.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * test driver for BLISLAB sgemm routine and reference sgemm routine.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | 
 47 | #include "bl_sgemm.h"
 48 | 
 49 | #define ERROR_TEST
 50 | 
 51 | #define TOLERANCE 1E-2
 52 | // Compare C against C_ref entry by entry. For each first index (row) the
 53 | // first second index (col) whose absolute difference exceeds TOLERANCE is
 54 | // printed, and the remainder of that row is skipped.
 55 | void computeError(
 56 |         int    ldc,
 57 |         int    ldc_ref,
 58 |         int    m,
 59 |         int    n,
 60 |         float *C,
 61 |         float *C_ref
 62 |         )
 63 | {
 64 |     for ( int row = 0; row < m; row ++ ) {
 65 |         int col = 0;
 66 |         // Advance to the first out-of-tolerance entry in this row, if any.
 67 |         while ( col < n && !( fabs( C( row, col ) - C_ref( row, col ) ) > TOLERANCE ) ) col ++;
 68 |         if ( col < n )
 69 |             printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, C( row, col ), C_ref( row, col ) );
 70 |     }
 71 | }
 72 | 
 73 | // Pack a kc x nc matrix into panels of nr columns: each panel is kc groups of nr
 74 | // consecutive floats. transpose=false reads src[k * nc + n]; transpose=true reads
 75 | // src[n * kc + k]. When nc is not a multiple of nr the last panel is zero-padded,
 76 | // so dst must hold ceil(nc / nr) * nr * kc floats.
 77 | void PackWeightLayout(float* dst, const float* src, int nc, int kc, int nr, bool transpose) {
 78 |     if (nr <= 0) return;  // a non-positive panel width would never advance the outer loop
 79 |     int index = 0;
 80 |     for (int nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
 81 |         int nr_block_size = (nc - nr_block_start < nr) ? (nc - nr_block_start) : nr;
 82 |         for (int kr = 0; kr < kc; kr++) {
 83 |             for (int off = 0; off < nr; off++) {
 84 |                 int col = nr_block_start + off;
 85 |                 // Padding lanes were previously skipped and left uninitialized.
 86 |                 if (off >= nr_block_size)  dst[index++] = 0.0f;
 87 |                 else if (transpose)        dst[index++] = src[col * kc + kr];
 88 |                 else                       dst[index++] = src[kr * nc + col];
 89 |             }
 90 |         }
 91 |     }
 92 | }
 93 | 
 94 | /*
 95 |  * Benchmark bl_sgemm_pack() on a single (m, n, k) problem and print one result row.
 96 |  *
 97 |  * A and B are filled with drand48() values, C / C_ref are zeroed and B is packed
 98 |  * once into packB with PackWeightLayout() before timing starts. bl_sgemm_pack()
 99 |  * is timed nrepeats times and the fastest run is kept. When ERROR_TEST is defined,
100 |  * bl_sgemm_ref() is timed the same way and both results are compared with
101 |  * computeError().
102 |  *
103 |  * Output columns: m, n, k, GFLOPS of bl_sgemm_pack(), GFLOPS of bl_sgemm_ref()
104 |  * (0.000 when the reference run is compiled out).
105 |  */
106 | void test_bl_sgemm(
107 |         int m,
108 |         int n,
109 |         int k
110 |         )
111 | {
112 |     int    i, j, p;
113 |     float *A, *B, *C, *C_ref, *packA, *packB;
114 |     int    nrepeats;
115 |     int    lda, ldb, ldc, ldc_ref;
116 |     // Timings and rates are kept in double: bl_clock()'s return type is not
117 |     // visible here, and double holds a float or a double sample without loss.
118 |     double flops;
119 |     double bl_sgemm_beg, bl_sgemm_time, bl_sgemm_rectime = 0.0;
120 |     // Stays 0.0 unless the ERROR_TEST section below fills it in, so the
121 |     // final printf never reads an uninitialized value.
122 |     double ref_gflops = 0.0;
123 | 
124 |     int mr = 4;
125 |     int nr = 4;
126 | 
127 |     lda = k;
128 |     ldb = n;
129 |     ldc     = n;
130 |     ldc_ref = n;
131 | 
132 |     // NOTE(review): buffer sizes (the 2x factors, + mr / + nr slack) are unchanged;
133 |     // presumably headroom for the kernel's edge tiles -- confirm against bl_sgemm_pack().
134 |     A      = (float*)malloc( sizeof(float) * m * k *2);
135 |     B      = (float*)malloc( sizeof(float) * k * n );
136 |     // Allocate packing buffers
137 |     packA  = bl_malloc_aligned( m + mr, k, sizeof(float) );
138 |     packB  = bl_malloc_aligned( k*2, n + nr, sizeof(float) );
139 |     C      = bl_malloc_aligned( ldc, n + nr, sizeof(float) );
140 |     C_ref  = (float*)malloc( sizeof(float) * m * n );
141 |     if ( A == NULL || B == NULL || packA == NULL || packB == NULL || C == NULL || C_ref == NULL ) {
142 |         fprintf( stderr, "test_bl_sgemm(): allocation failed for m=%d n=%d k=%d\n", m, n, k );
143 |         free( A ); free( B ); free( C_ref );
144 |         free( packA ); free( packB ); free( C );
145 |         return;
146 |     }
147 | 
148 |     nrepeats = 1;
149 | 
150 |     srand48 (time(NULL));
151 | 
152 |     // Randomly generate points in [ 0, 1 ).
153 |     for ( p = 0; p < k; p ++ ) {
154 |         for ( i = 0; i < m; i ++ ) {
155 |             A( i, p ) = (float)( drand48() );
156 |             // A( i, p ) = (float)( i*m + p );
157 |         }
158 |     }
159 |     for ( j = 0; j < n; j ++ ) {
160 |         for ( p = 0; p < k; p ++ ) {
161 |             B( p, j ) = (float)( drand48() );
162 |             // B( p, j ) = (float)( p*n + j );
163 |         }
164 |     }
165 | 
166 |     for ( j = 0; j < n; j ++ ) {
167 |         for ( i = 0; i < m; i ++ ) {
168 |             C_ref( i, j ) = (float)( 0.0 );
169 |                 C( i, j ) = (float)( 0.0 );
170 |         }
171 |     }
172 | 
173 |     // Pack B once, outside the timed region.
174 |     PackWeightLayout(packB, B, n, k, nr, false);
175 | 
176 |     // printf("[B]\n");
177 |     // for(int i = 0; i < k; i++) {
178 |     //   for(int j = 0; j < n; j++) {
179 |     //     printf("%.1f\t", B[i * n + j]);
180 |     //   }
181 |     //   printf("\n");
182 |     // }
183 | 
184 |     // printf("[packB]\n");
185 |     // for(int i = 0; i < k; i++) {
186 |     //   for(int j = 0; j < nr; j++) {
187 |     //     printf("%.1f\t", packB[i * nr + j]);
188 |     //   }
189 |     //   printf("\n");
190 |     // }
191 | 
192 |     // Overall floating point operations in units of 1e9; m * n is formed in double to avoid int overflow.
193 |     flops = ( (double)m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2.0 * k );
194 | 
195 |     // Time bl_sgemm_pack() and keep the fastest of nrepeats runs.
196 |     for ( i = 0; i < nrepeats; i ++ ) {
197 |         bl_sgemm_beg = bl_clock();
198 |         bl_sgemm_pack( m, mr, n, nr, k,
199 |                        A, packA, lda,
200 |                        B, packB, ldb,
201 |                        C, ldc );
202 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
203 | 
204 |         if ( i == 0 || bl_sgemm_time < bl_sgemm_rectime ) {
205 |             bl_sgemm_rectime = bl_sgemm_time;
206 |         }
207 |     }
208 | 
209 | #ifdef ERROR_TEST
210 |     {
211 |         double ref_beg, ref_time, ref_rectime = 0.0;
212 | 
213 |         // Same timing protocol for the reference implementation.
214 |         for ( i = 0; i < nrepeats; i ++ ) {
215 |             ref_beg = bl_clock();
216 |             bl_sgemm_ref( m, n, k,
217 |                           A, lda,
218 |                           B, ldb,
219 |                           C_ref, ldc_ref );
220 |             ref_time = bl_clock() - ref_beg;
221 | 
222 |             if ( i == 0 || ref_time < ref_rectime ) {
223 |                 ref_rectime = ref_time;
224 |             }
225 |         }
226 |         ref_gflops = flops / ref_rectime;
227 | 
228 |         computeError( ldc, ldc_ref, m, n,
229 |                       C, C_ref );
230 |     }
231 | #endif
232 |     // printf("ref\n");
233 |     // for(int i = 0; i < m; i++) {
234 |     //   for(int j = 0; j < n; j++) {
235 |     //     printf("%.0f\t", C_ref[i * n + j]);
236 |     //   }
237 |     //   printf("\n");
238 |     // }
239 |     // printf("\n\n");
240 | 
241 |     // printf("C\n");
242 |     // for(int i = 0; i < m; i++) {
243 |     //   for(int j = 0; j < n; j++) {
244 |     //     printf("%.0f\t", C[i * n + j]);
245 |     //   }
246 |     //   printf("\n");
247 |     // }
248 |     // printf("\n\n");
249 | 
250 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n", 
251 |             m, n, k, flops / bl_sgemm_rectime, ref_gflops );
252 | 
253 |     free( A     );
254 |     free( packA );
255 |     free( B     );
256 |     free( packB );
257 |     free( C     );
258 |     free( C_ref );
259 | }
260 | 
261 | // Print the header row, then benchmark square problems m = n = k = 16, 20, ..., 800.
262 | int main( int argc, char *argv[] )
263 | {
264 |     int dim = 16;
265 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
266 |     while ( dim <= 800 ) {
267 |         test_bl_sgemm( dim, dim, dim );
268 |         dim += 4;
269 |     }
270 |     return 0;
271 | }


--------------------------------------------------------------------------------
/sgemm/step0/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step0
 2 | 
 3 | # Toolchain location. These use ?= so the environment or the make command line can
 4 | # point at another install; an empty CROSS_COMPILE selects the native (x86) flags below.
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | 
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | 
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | 
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules (compile only, so no linker flags are passed)
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | 
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step0/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_sgemm.c
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this is the main file of blislab sgemm.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 |  
46 | 
47 | #include <bl_sgemm.h>
48 | 
49 | // Naive triple-loop SGEMM: accumulates A( i, p ) * B( p, j ) into C( i, j )
50 | // for every i < m, j < n, p < k.
51 | // Loop order is i, j, p (p innermost).
52 | void bl_sgemm(
53 |     int    m,
54 |     int    n,
55 |     int    k,
56 |     float *A,
57 |     int    lda,
58 |     float *B,
59 |     int    ldb,
60 |     float *C,        // must be aligned
61 |     int    ldc        // ldc must also be aligned
62 | )
63 | {
64 |   // Nothing to compute when any dimension is zero.
65 |   if ( !m || !n || !k ) {
66 |     printf( "bl_sgemm(): early return\n" );
67 |     return;
68 |   }
69 | 
70 |   // A(), B() and C() are indexing macros; presumably they come from bl_sgemm.h.
71 |   for ( int i = 0; i < m; ++i ) {           // outer loop over i
72 |     for ( int j = 0; j < n; ++j ) {         // middle loop over j
73 |       for ( int p = 0; p < k; ++p ) {       // inner loop over p
74 |         C( i, j ) += A( i, p ) * B( p, j );
75 |       }
76 |     }
77 |   }
78 | }
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/sgemm/step1/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step1
 2 | 
 3 | # Toolchain location. These use ?= so the environment or the make command line can
 4 | # point at another install; an empty CROSS_COMPILE selects the native (x86) flags below.
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | 
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | 
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | 
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules (compile only, so no linker flags are passed)
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | 
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step1/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_sgemm.c
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this is the main file of blislab sgemm.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 |  
46 | 
47 | #include <bl_sgemm.h>
48 | 
49 | // Triple-loop SGEMM: accumulates A( i, p ) * B( p, j ) into C( i, j )
50 | // for every i < m, p < k, j < n.
51 | // Loop order is i, p, j (j innermost).
52 | void bl_sgemm(
53 |     int    m,
54 |     int    n,
55 |     int    k,
56 |     float *A,
57 |     int    lda,
58 |     float *B,
59 |     int    ldb,
60 |     float *C,        // must be aligned
61 |     int    ldc        // ldc must also be aligned
62 | )
63 | {
64 |   // Nothing to compute when any dimension is zero.
65 |   if ( !m || !n || !k ) {
66 |     printf( "bl_sgemm(): early return\n" );
67 |     return;
68 |   }
69 | 
70 |   // A(), B() and C() are indexing macros; presumably they come from bl_sgemm.h.
71 |   for ( int i = 0; i < m; ++i ) {           // outer loop over i
72 |     for ( int p = 0; p < k; ++p ) {         // middle loop over p
73 |       for ( int j = 0; j < n; ++j ) {       // inner loop over j
74 |         C( i, j ) += A( i, p ) * B( p, j );
75 |       }
76 |     }
77 |   }
78 | }
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/sgemm/step2/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step2
 2 | 
 3 | # Toolchain defaults (?=): override from the environment or the command line, e.g.
 4 | # CROSS_COMPILE=riscv64-unknown-linux-gnu- (toolchain on PATH) or CROSS_COMPILE= (native).
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | # An empty CROSS_COMPILE selects the x86 flag set; anything else selects the RISC-V one.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | # Object names append .o to the full source name (my_sgemm.c -> my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Each compile also writes a dependency file named after its object plus ".d".
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | # SHAREDLIBBLISLAB is never assigned in this Makefile; it is empty unless set externally.
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step2/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | // Tile size used by my_sgemm.c (the names keep a DGEMM_ prefix although the data is float).
56 | #define DGEMM_MR 4   // step of the i loop in bl_sgemm(): tile extent along the first index of C( i, j )
57 | #define DGEMM_NR 4   // step of the j loop in bl_sgemm(): tile extent along the second index of C( i, j )
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif // BLISLAB_CONFIG_H
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step2/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_sgemm.c
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this is the main file of blislab sgemm.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 |  
46 | 
47 | #include "bl_sgemm.h"
48 | #include "bl_config.h"
49 | /* Adds the k products A( 0, t ) * B( t, 0 ), t = 0..k-1, onto *result in that order. */
50 | void AddDot( int k, float *A, int lda, float *B, int ldb, float *result ) {
51 |   int term;
52 |   // Accumulate straight into the caller's slot, one product at a time.
53 |   for ( term = 0; term < k; ++term )
54 |     *result += A( 0, term ) * B( term, 0 );
55 | }
56 | /* Updates one DGEMM_MR x DGEMM_NR tile of C via AddDot(); the tile is never clipped. */
57 | void AddDot_MRxNR( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
58 | {
59 |   int cell;
60 |   // Walk the tile as one flat index; the second tile index varies slowest.
61 |   for ( cell = 0; cell < DGEMM_MR * DGEMM_NR; cell++ ) {
62 |     const int ir = cell % DGEMM_MR;   // first tile index
63 |     const int jr = cell / DGEMM_MR;   // second tile index
64 |     AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
65 |   }
66 | }
67 | /* bl_sgemm: C += A * B, walking C in DGEMM_MR x DGEMM_NR tiles. Full tiles use
68 |  * AddDot_MRxNR(); a tile clipped by the edge of C (m or n not a multiple of the
69 |  * tile size) is updated per element with AddDot(), so nothing outside the m x n
70 |  * extent is read or written. A zero m, n or k prints a note and returns. */
71 | void bl_sgemm(
72 |     int    m,
73 |     int    n,
74 |     int    k,
75 |     float *A,
76 |     int    lda,
77 |     float *B,
78 |     int    ldb,
79 |     float *C,        // must be aligned
80 |     int    ldc        // ldc must also be aligned
81 | )
82 | {
83 |     int i, j, ir, jr;
84 |     if ( m == 0 || n == 0 || k == 0 ) {            // Early return if possible
85 |         printf( "bl_sgemm(): early return\n" );
86 |         return;
87 |     }
88 |     for ( i = 0; i < m; i += DGEMM_MR ) {          // Start 2-nd loop
89 |       for ( j = 0; j < n; j += DGEMM_NR ) {        // Start 1-st loop
90 |         if ( m - i >= DGEMM_MR && n - j >= DGEMM_NR )   // complete tile
91 |           AddDot_MRxNR( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
92 |         else                                       // clipped tile: stay inside m x n
93 |           for ( jr = j; jr < n && jr - j < DGEMM_NR; jr++ )
94 |             for ( ir = i; ir < m && ir - i < DGEMM_MR; ir++ )
95 |               AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
96 |       }                                            // End   1-st loop
97 |     }                                              // End   2-nd loop
98 | }


--------------------------------------------------------------------------------
/sgemm/step3/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step3
 2 | 
 3 | # Toolchain defaults (?=): override from the environment or the command line, e.g.
 4 | # CROSS_COMPILE=riscv64-unknown-linux-gnu- (toolchain on PATH) or CROSS_COMPILE= (native).
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | # An empty CROSS_COMPILE selects the x86 flag set; anything else selects the RISC-V one.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | # Object names append .o to the full source name (my_sgemm.c -> my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Each compile also writes a dependency file named after its object plus ".d".
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | # SHAREDLIBBLISLAB is never assigned in this Makefile; it is empty unless set externally.
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step3/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | // Tile size used by my_sgemm.c; bl_sgemm() there calls AddDot_4x4_opt() only when both values are 4.
56 | #define DGEMM_MR 4   // step of the i loop in bl_sgemm(): tile extent along the first index of C( i, j )
57 | #define DGEMM_NR 4   // step of the j loop in bl_sgemm(): tile extent along the second index of C( i, j )
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif // BLISLAB_CONFIG_H
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step3/my_sgemm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * bl_sgemm.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * this is the main file of blislab sgemm.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | #include "bl_sgemm.h"
 47 | #include "bl_config.h"
 48 | /* Adds the k products A( 0, t ) * B( t, 0 ), t = 0..k-1, onto *result in that order. */
 49 | void AddDot( int k, float *A, int lda, float *B, int ldb, float *result ) {
 50 |   int term;
 51 |   // Accumulate straight into the caller's slot, one product at a time.
 52 |   for ( term = 0; term < k; ++term )
 53 |     *result += A( 0, term ) * B( term, 0 );
 54 | }
 55 | /* Updates one DGEMM_MR x DGEMM_NR tile of C via AddDot(); the tile is never clipped. */
 56 | void AddDot_MRxNR( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
 57 | {
 58 |   int cell;
 59 |   // Walk the tile as one flat index; the second tile index varies slowest.
 60 |   for ( cell = 0; cell < DGEMM_MR * DGEMM_NR; cell++ ) {
 61 |     const int ir = cell % DGEMM_MR;   // first tile index
 62 |     const int jr = cell / DGEMM_MR;   // second tile index
 63 |     AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
 64 |   }
 65 | }
 66 | /* 4x4 kernel: sums k steps of products in local accumulators, then adds them onto C. */
 67 | void AddDot_4x4_opt( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
 68 | {
 69 |    // One accumulator per element of the 4x4 tile; acc_rc belongs to C( r, c ).
 70 |    float acc_00 = 0.0f, acc_01 = 0.0f, acc_02 = 0.0f, acc_03 = 0.0f;
 71 |    float acc_10 = 0.0f, acc_11 = 0.0f, acc_12 = 0.0f, acc_13 = 0.0f;
 72 |    float acc_20 = 0.0f, acc_21 = 0.0f, acc_22 = 0.0f, acc_23 = 0.0f;
 73 |    float acc_30 = 0.0f, acc_31 = 0.0f, acc_32 = 0.0f, acc_33 = 0.0f;
 74 |    float a0, a1, a2, a3;   // A( 0..3, step )
 75 |    float b0, b1, b2, b3;   // B( step, 0..3 )
 76 |    int step;
 77 | 
 78 |    for ( step = 0; step < k; step++ ) {
 79 |      // Fetch the four A and four B operands of this step once, then reuse them.
 80 |      a0 = A( 0, step );
 81 |      a1 = A( 1, step );
 82 |      a2 = A( 2, step );
 83 |      a3 = A( 3, step );
 84 |      b0 = B( step, 0 );
 85 |      b1 = B( step, 1 );
 86 |      b2 = B( step, 2 );
 87 |      b3 = B( step, 3 );
 88 | 
 89 |      acc_00 += a0 * b0;
 90 |      acc_01 += a0 * b1;
 91 |      acc_02 += a0 * b2;
 92 |      acc_03 += a0 * b3;
 93 |      acc_10 += a1 * b0;
 94 |      acc_11 += a1 * b1;
 95 |      acc_12 += a1 * b2;
 96 |      acc_13 += a1 * b3;
 97 |      acc_20 += a2 * b0;
 98 |      acc_21 += a2 * b1;
 99 |      acc_22 += a2 * b2;
100 |      acc_23 += a2 * b3;
101 |      acc_30 += a3 * b0;
102 |      acc_31 += a3 * b1;
103 |      acc_32 += a3 * b2;
104 |      acc_33 += a3 * b3;
105 |    }
106 |    C( 0, 0 ) += acc_00;
107 |    C( 0, 1 ) += acc_01;
108 |    C( 0, 2 ) += acc_02;
109 |    C( 0, 3 ) += acc_03;
110 |    C( 1, 0 ) += acc_10;
111 |    C( 1, 1 ) += acc_11;
112 |    C( 1, 2 ) += acc_12;
113 |    C( 1, 3 ) += acc_13;
114 |    C( 2, 0 ) += acc_20;
115 |    C( 2, 1 ) += acc_21;
116 |    C( 2, 2 ) += acc_22;
117 |    C( 2, 3 ) += acc_23;
118 |    C( 3, 0 ) += acc_30;
119 |    C( 3, 1 ) += acc_31;
120 |    C( 3, 2 ) += acc_32;
121 |    C( 3, 3 ) += acc_33;
122 | }
123 | /*
124 |  * bl_sgemm: C += A * B, walking C in DGEMM_MR x DGEMM_NR tiles.
125 |  *
126 |  * Complete tiles go to AddDot_4x4_opt() when the tile size is 4x4 and to
127 |  * AddDot_MRxNR() otherwise. A tile clipped by the edge of C (m or n not a
128 |  * multiple of the tile size) is updated element by element with AddDot(), so
129 |  * nothing outside the m x n extent is read or written.
130 |  * A zero m, n or k prints a note and returns without touching C.
131 |  */
132 | void bl_sgemm(
133 |     int    m,
134 |     int    n,
135 |     int    k,
136 |     float *A,
137 |     int    lda,
138 |     float *B,
139 |     int    ldb,
140 |     float *C,        // must be aligned
141 |     int    ldc        // ldc must also be aligned
142 | )
143 | {
144 |     int i, j;
145 |     int ir, jr;
146 | 
147 |     // Early return if possible
148 |     if ( m == 0 || n == 0 || k == 0 ) {
149 |         printf( "bl_sgemm(): early return\n" );
150 |         return;
151 |     }
152 | 
153 |     for ( i = 0; i < m; i += DGEMM_MR ) {          // Start 2-nd loop
154 |       for ( j = 0; j < n; j += DGEMM_NR ) {        // Start 1-st loop
155 |            if ( m - i < DGEMM_MR || n - j < DGEMM_NR ) {   // clipped tile: stay inside m x n
156 |              for ( jr = j; jr < n && jr - j < DGEMM_NR; jr++ )
157 |                for ( ir = i; ir < m && ir - i < DGEMM_MR; ir++ )
158 |                  AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
159 |              continue;
160 |            }
161 |            #if !(DGEMM_MR == 4 && DGEMM_NR == 4)
162 |            AddDot_MRxNR( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
163 |            #else
164 |            AddDot_4x4_opt( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
165 |            #endif
166 |         }                                          // End   1-st loop
167 |     }                                              // End   2-nd loop
168 | }


--------------------------------------------------------------------------------
/sgemm/step4/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step4
 2 | 
 3 | # Toolchain defaults (?=): override from the environment or the command line, e.g.
 4 | # CROSS_COMPILE=riscv64-unknown-linux-gnu- (toolchain on PATH) or CROSS_COMPILE= (native).
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | # An empty CROSS_COMPILE selects the x86 flag set; anything else selects the RISC-V one.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x4.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | # Object names append .o to the full source name (my_sgemm.c -> my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Each compile also writes a dependency file named after its object plus ".d".
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | # SHAREDLIBBLISLAB is never assigned in this Makefile; it is empty unless set externally.
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step4/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | // Loop steps of bl_sgemm_pack() in my_sgemm.c; its kernel AddDot_4x4_opt() is written for a fixed 4x4 tile.
56 | #define DGEMM_MR 4   // step of the i loop: tile extent along the first index of C( i, j )
57 | #define DGEMM_NR 4   // step of the j loop: tile extent along the second index of C( i, j )
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif // BLISLAB_CONFIG_H
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step4/my_sgemm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * bl_sgemm.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * this is the main file of blislab sgemm.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | #include "bl_sgemm.h"
 47 | #include "bl_config.h"
 48 | /* 4x4 kernel with packed B: packB supplies 4 consecutive floats per step; result is added onto C. */
 49 | void AddDot_4x4_opt( int k, float *A, int lda, float *packB, int ldb, float *C, int ldc )
 50 | {
 51 |    // One accumulator per element of the 4x4 tile; acc_rc belongs to C( r, c ).
 52 |    float acc_00 = 0.0f, acc_01 = 0.0f, acc_02 = 0.0f, acc_03 = 0.0f;
 53 |    float acc_10 = 0.0f, acc_11 = 0.0f, acc_12 = 0.0f, acc_13 = 0.0f;
 54 |    float acc_20 = 0.0f, acc_21 = 0.0f, acc_22 = 0.0f, acc_23 = 0.0f;
 55 |    float acc_30 = 0.0f, acc_31 = 0.0f, acc_32 = 0.0f, acc_33 = 0.0f;
 56 |    float a0, a1, a2, a3;   // A( 0..3, step )
 57 |    float b0, b1, b2, b3;   // the four packed B values of the current step
 58 |    const float *quad;      // start of those four values inside packB
 59 |    int step;
 60 | 
 61 |    for ( step = 0; step < k; step++ ) {
 62 |      // Step `step` of the packed buffer begins 4 * step floats into packB.
 63 |      quad = packB + 4 * step;
 64 |      a0 = A( 0, step );
 65 |      a1 = A( 1, step );
 66 |      a2 = A( 2, step );
 67 |      a3 = A( 3, step );
 68 |      b0 = quad[0];
 69 |      b1 = quad[1];
 70 |      b2 = quad[2];
 71 |      b3 = quad[3];
 72 | 
 73 |      acc_00 += a0 * b0;
 74 |      acc_01 += a0 * b1;
 75 |      acc_02 += a0 * b2;
 76 |      acc_03 += a0 * b3;
 77 |      acc_10 += a1 * b0;
 78 |      acc_11 += a1 * b1;
 79 |      acc_12 += a1 * b2;
 80 |      acc_13 += a1 * b3;
 81 |      acc_20 += a2 * b0;
 82 |      acc_21 += a2 * b1;
 83 |      acc_22 += a2 * b2;
 84 |      acc_23 += a2 * b3;
 85 |      acc_30 += a3 * b0;
 86 |      acc_31 += a3 * b1;
 87 |      acc_32 += a3 * b2;
 88 |      acc_33 += a3 * b3;
 89 |    }
 90 | 
 91 |    // Add the finished sums onto the tile of C (ldb is not used by this kernel).
 92 |    C( 0, 0 ) += acc_00;
 93 |    C( 0, 1 ) += acc_01;
 94 |    C( 0, 2 ) += acc_02;
 95 |    C( 0, 3 ) += acc_03;
 96 |    C( 1, 0 ) += acc_10;
 97 |    C( 1, 1 ) += acc_11;
 98 |    C( 1, 2 ) += acc_12;
 99 |    C( 1, 3 ) += acc_13;
100 |    C( 2, 0 ) += acc_20;
101 |    C( 2, 1 ) += acc_21;
102 |    C( 2, 2 ) += acc_22;
103 |    C( 2, 3 ) += acc_23;
104 |    C( 3, 0 ) += acc_30;
105 |    C( 3, 1 ) += acc_31;
106 |    C( 3, 2 ) += acc_32;
107 |    C( 3, 3 ) += acc_33;
108 | }
109 | /* bl_sgemm_pack: C += A * B with B read from the pre-packed buffer packB, one
110 |  * AddDot_4x4_opt() call per tile of C. mr, nr, packA and B are accepted but unused here. */
111 | void bl_sgemm_pack(
112 |     int    m,
113 |     int    mr,
114 |     int    n,
115 |     int    nr,
116 |     int    k,
117 |     float *A,
118 |     float *packA,
119 |     int    lda,
120 |     float *B,
121 |     float *packB,
122 |     int    ldb,
123 |     float *C,        // must be aligned
124 |     int    ldc        // ldc must also be aligned
125 | )
126 | {
127 |     int row0, col0;          // origin of the current tile along the first / second index of C
128 |     float *a_band;           // &A( row0, 0 ): the part of A shared by every tile in this pass
129 | 
130 |     // Nothing to do for an empty problem (the message text names bl_sgemm()).
131 |     if ( !( m != 0 && n != 0 && k != 0 ) ) {
132 |         printf( "bl_sgemm(): early return\n" );
133 |         return;
134 |     }
135 | 
136 |     for ( row0 = 0; row0 < m; row0 += DGEMM_MR ) {
137 |         a_band = &A( row0, 0 );
138 |         for ( col0 = 0; col0 < n; col0 += DGEMM_NR )   // this tile's packed B starts at packB + col0 * k
139 |             AddDot_4x4_opt( k, a_band, lda, packB + col0 * k, ldb, &C( row0, col0 ), ldc );
140 |     }
141 | }
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/sgemm/step5/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step5
 2 | 
 3 | # Toolchain defaults (?=): override from the environment or the command line, e.g.
 4 | # CROSS_COMPILE=riscv64-unknown-linux-gnu- (toolchain on PATH) or CROSS_COMPILE= (native).
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | # An empty CROSS_COMPILE selects the x86 flag set; anything else selects the RISC-V one.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= 
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x4.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | # Object names append .o to the full source name (my_sgemm.c -> my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Each compile also writes a dependency file named after its object plus ".d".
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | # SHAREDLIBBLISLAB is never assigned in this Makefile; it is empty unless set externally.
67 | clean:
68 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/sgemm/step5/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

// Register-block (micro-tile) dimensions: my_sgemm.c steps its row loop by
// DGEMM_MR and its column loop by DGEMM_NR. The DGEMM_ prefix is kept even
// though the code in this step operates on float data.
#define DGEMM_MR 4
#define DGEMM_NR 4

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step5/my_sgemm.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * --------------------------------------------------------------------------
  3 |  * BLISLAB 
  4 |  * --------------------------------------------------------------------------
  5 |  * Copyright (C) 2016, The University of Texas at Austin
  6 |  *
  7 |  * Redistribution and use in source and binary forms, with or without
  8 |  * modification, are permitted provided that the following conditions are
  9 |  * met:
 10 |  *  - Redistributions of source code must retain the above copyright
 11 |  *    notice, this list of conditions and the following disclaimer.
 12 |  *  - Redistributions in binary form must reproduce the above copyright
 13 |  *    notice, this list of conditions and the following disclaimer in the
 14 |  *    documentation and/or other materials provided with the distribution.
 15 |  *  - Neither the name of The University of Texas nor the names of its
 16 |  *    contributors may be used to endorse or promote products derived
 17 |  *    from this software without specific prior written permission.
 18 |  *
 19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  *
 32 |  * bl_sgemm.c
 33 |  *
 34 |  *
 35 |  * Purpose:
 36 |  * this is the main file of blislab sgemm.
 37 |  *
 38 |  * Todo:
 39 |  *
 40 |  *
 41 |  * Modification:
 42 |  *
 43 |  * 
 44 |  * */
 45 | 
 46 | #include "bl_sgemm.h"
 47 | #include "bl_config.h"
 48 | 
 49 | inline void PackInputLayout(float* dst, const float* src, int m, int k, int mr) {
 50 |     int    i, j, p;
 51 | 
 52 |     for ( j = 0; j < m / mr; j ++ ) {
 53 |         for ( i = 0; i < k; i ++ ) {
 54 |             for ( p = 0; p < mr; p ++ ) {
 55 |                 *dst ++ = *(src + p * k + j * mr * k + i);
 56 |             }
 57 |         }
 58 |     }
 59 | }
 60 | 
 61 | 
 62 | void AddDot_4x4_opt( int k, float *packA, int lda, float *packB, int ldb, float *C, int ldc )
 63 | {
 64 |    register float C00, C01, C02, C03, C10, C11, C12, C13, C20, C21, C22, C23, C30, C31, C32, C33;
 65 |    float *packAp, *packBp;
 66 |    int p;
 67 | 
 68 |    C00 = 0.0f;
 69 |    C01 = 0.0f;
 70 |    C02 = 0.0f;
 71 |    C03 = 0.0f;
 72 |    C10 = 0.0f;
 73 |    C11 = 0.0f;
 74 |    C12 = 0.0f;
 75 |    C13 = 0.0f;
 76 |    C20 = 0.0f;
 77 |    C21 = 0.0f;
 78 |    C22 = 0.0f;
 79 |    C23 = 0.0f;
 80 |    C30 = 0.0f;
 81 |    C31 = 0.0f;
 82 |    C32 = 0.0f;
 83 |    C33 = 0.0f;
 84 |    for (p = 0; p < k; p++) {
 85 |      packAp = &packA[p * 4];
 86 |      packBp = &packB[p * 4];
 87 | 
 88 |      C00 += packAp[0] * packBp[0];
 89 |      C01 += packAp[0] * packBp[1];
 90 |      C02 += packAp[0] * packBp[2];
 91 |      C03 += packAp[0] * packBp[3];
 92 |      C10 += packAp[1] * packBp[0];
 93 |      C11 += packAp[1] * packBp[1];
 94 |      C12 += packAp[1] * packBp[2];
 95 |      C13 += packAp[1] * packBp[3];
 96 |      C20 += packAp[2] * packBp[0];
 97 |      C21 += packAp[2] * packBp[1];
 98 |      C22 += packAp[2] * packBp[2];
 99 |      C23 += packAp[2] * packBp[3];
100 |      C30 += packAp[3] * packBp[0];
101 |      C31 += packAp[3] * packBp[1];
102 |      C32 += packAp[3] * packBp[2];
103 |      C33 += packAp[3] * packBp[3];
104 | 
105 |    }
106 |    C(0, 0) += C00;
107 |    C(0, 1) += C01;
108 |    C(0, 2) += C02;
109 |    C(0, 3) += C03;
110 |    C(1, 0) += C10;
111 |    C(1, 1) += C11;
112 |    C(1, 2) += C12;
113 |    C(1, 3) += C13;
114 |    C(2, 0) += C20;
115 |    C(2, 1) += C21;
116 |    C(2, 2) += C22;
117 |    C(2, 3) += C23;
118 |    C(3, 0) += C30;
119 |    C(3, 1) += C31;
120 |    C(3, 2) += C32;
121 |    C(3, 3) += C33;
122 | }
123 | 
124 | 
125 | void bl_sgemm_pack(
126 |     int    m,
127 |     int    mr,
128 |     int    n,
129 |     int    nr,
130 |     int    k,
131 |     float *A,
132 |     float *packA,
133 |     int    lda,
134 |     float *B,
135 |     float *packB,
136 |     int    ldb,
137 |     float *C,        // must be aligned
138 |     int    ldc        // ldc must also be aligned
139 | )
140 | {
141 |     int i, j, p;
142 |     int ir, jr;
143 | 
144 |     // Early return if possible
145 |     if ( m == 0 || n == 0 || k == 0 ) {
146 |         printf( "bl_sgemm(): early return\n" );
147 |         return;
148 |     }
149 | 
150 |     PackInputLayout(packA, A, m, k, mr);
151 | 
152 |     // printf("A\n");
153 |     // for(int i = 0; i < m; i++) {
154 |     //   for(int j = 0; j < m; j++) {
155 |     //     printf("%.0f\t", A[i * m + j]);
156 |     //   }
157 |     //   printf("\n");
158 |     // }
159 |     // printf("\n\n");
160 | 
161 |     // printf("packA\n");
162 |     // for(int i = 0; i < m; i++) {
163 |     //   for(int j = 0; j < m; j++) {
164 |     //     printf("%.0f\t", packA[i * m + j]);
165 |     //   }
166 |     //   printf("\n");
167 |     // }
168 |     // printf("\n\n");
169 | 
170 |     for ( i = 0; i < m; i += DGEMM_MR ) {          // Start 2-nd loop
171 |       for ( j = 0; j < n; j += DGEMM_NR ) {        // Start 1-st loop
172 |         //    AddDot_4x4_opt( k, &A( i, 0 ), lda, &packB[j * k], ldb, &C( i, j ), ldc );
173 |            AddDot_4x4_opt( k, &packA[i * k], lda, &packB[j * k], ldb, &C( i, j ), ldc );
174 |         }                                          // End   1-st loop
175 |     }                                              // End   2-nd loop
176 | }
177 | 
178 | 
179 | 


--------------------------------------------------------------------------------
/sgemm/step6/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step6
 2 | 
 3 | #CROSS_COMPILE ?= riscv64-unknown-linux-gnu-
 4 | # CROSS_COMPILE ?=
 5 | 
 6 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL := riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE := ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | 
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= RvvSgemm4x16.S
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | 
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | 
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | 
66 | %.S.o: %.S
67 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
68 | # ---------------------------------------------------------------------------
69 | 
70 | clean:
71 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/sgemm/step6/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
  1 | # |<-- k=4 -->|
  2 | # +++++++++++++ -           f0, f4, f0, f4
  3 | # +           + |           f1, f5, f1, f5
  4 | # +     A     + mr=4        f2, f6, f2, f6
  5 | # +           + |           f3, f7, f3, f7
  6 | # +++++++++++++ -       
  7 |                        
  8 | 
  9 | # |<--  nr=16  -->|
 10 | # +++++++++++++++++ -       v0 v1 v2 v3
 11 | # +               + |       v4 v5 v6 v7
 12 | # +       B       + k=4     v8,v9,v10,v11     
 13 | # +               + |       v12,v13,v14,v15
 14 | # +++++++++++++++++ -
 15 | 
 16 | 
# |<--  nr=16  -->|
# +++++++++++++++++ -       v16,v17,v18,v19
# +               + |       v20,v21,v22,v23
# +       C       + mr=4    v24,v25,v26,v27
# +               + |       v28,v29,v30,v31
# +++++++++++++++++ -
 23 | #
 24 | 
 25 | # void RvvSgemm4x16(size_t nr,         // nr <= 16, a0
 26 | #                   size_t mr,         // mr <= 4,  a1
 27 | #                   size_t k,          // astride = k*sizeof(float), a2
 28 | #                   const float* a,    // mr * k,   a3
 29 | #                   const float* b,    // k * 16,   a4
 30 | #                   float* c,          // mr * nr,  a5
 31 | #                   size_t c_stride,  // Len(N) * sizeof(float), a6
 32 | #                   const float* bias  // bias, a7
 33 | #                   );
 34 | .global RvvSgemm4x16
 35 | .type RvvSgemm4x16, @function
 36 | 
 37 | #define nr a0
 38 | #define mr a1
 39 | #define k  a2
 40 | #define ap a3
 41 | #define bp a4
 42 | #define cp a5
 43 | #define c_stride a6
 44 | #define bias a7
 45 | 
 46 | #define ap1 t0
 47 | #define ap2 t1
 48 | #define ap3 t2
 49 | #define cp1 t3
 50 | #define cp2 t4
 51 | #define cp3 t5
 52 | #define a_stride s0
 53 | #define bp0      s1
 54 | #define biasp    s2
 55 | #define kt       s3
 56 | #define ap0      s4
 57 | #define cp0      s5
 58 | #define bp1      s6
 59 | #define vl       s7
 60 | #define ap_offset       s10
 61 | #define cp_offset       s11
 62 | 
 63 | #define FRAMESIZE 104
 64 | 
 65 | RvvSgemm4x16:
 66 |     addi sp, sp, -FRAMESIZE # callee update stack pointer
 67 |     sd s0, 96(sp)           # callee saved frame pointer
 68 |     addi s0, sp, FRAMESIZE  # generate new frame pointer
 69 |     sd s1, -16(s0)
 70 |     sd s2, -24(s0)
 71 |     sd s3, -32(s0)
 72 |     sd s4, -40(s0)
 73 |     sd s5, -48(s0)
 74 |     sd s6, -56(s0)
 75 |     sd s7, -64(s0)
 76 |     sd s8, -72(s0)
 77 |     sd s9, -80(s0)
 78 |     sd s10, -88(s0)
 79 |     sd s11, -96(s0)
 80 | 
 81 |     li ap_offset, 0
 82 |     li cp_offset, 0
 83 |     slli a_stride, k, 2     # astride = k * sizeof(float)
 84 |     mv s3, nr
 85 |     vsetvli s2, s3, e32, m4
 86 |     mv ap0, ap
 87 |     mv bp0, bp
 88 |     mv cp0, cp
 89 | .a1_offset:
 90 |     mv ap1, ap0
 91 |     mv cp1, cp0
 92 |     slti t6, mr, 2          # mr < 2
 93 |     bnez t6, .a2_offset
 94 |     add ap1, ap0, a_stride
 95 |     add cp1, cp0, c_stride
 96 | .a2_offset:
 97 |     mv ap2, ap1
 98 |     mv cp2, cp1
 99 |     slti t6, mr, 3          # mr < 3
100 |     bnez t6, .a3_offset
101 |     add ap2, ap1, a_stride
102 |     add cp2, cp1, c_stride
103 | .a3_offset:
104 |     mv ap3, ap2
105 |     mv cp3, cp2
106 |     slti t6, mr, 4          # mr < 4
107 |     bnez t6, .start
108 |     add ap3, ap2, a_stride
109 |     add cp3, cp2, c_stride
110 | 
111 | .start:
112 |     mv biasp, bias
113 |     mv kt, k
114 |     beqz mr, .end
115 | 
116 |     vlw.v v16, (biasp)
117 |     vlw.v v20, (biasp)
118 |     vlw.v v24, (biasp)
119 |     vlw.v v28, (biasp)
120 |     addi biasp, biasp, 64
121 |     slti t6, kt, 4               # kt < 4, t6 = 1
122 |     bnez t6, .k2_tail
123 |     
124 | 
125 |     // flw fs0, 64(ap0)          # pre-load A
126 |     // flw fs1, 64(ap1)          # pre-load A
127 |     // flw fs2, 64(ap2)          # pre-load A
128 |     // flw fs3, 64(ap3)          # pre-load A
129 |     
130 |     // flw fs4, 512(bp0)       # pre-load B
131 |     // flw fs5, 576(bp0)       # pre-load B
132 |     // flw fs6, 640(bp0)       # pre-load B
133 |     // flw fs7, 704(bp0)       # pre-load B
134 | 
135 |     // load 4 row A (A0, A1, A2, A3)
136 |     flw ft0, (ap0)
137 |     addi ap0, ap0, 4
138 |     flw ft1, (ap1)
139 |     addi ap1, ap1, 4
140 |     flw ft2, (ap2)
141 |     addi ap2, ap2, 4
142 |     flw ft3, (ap3)
143 |     addi ap3, ap3, 4
144 |     // load 16 col B(B0, B1, B2, B3)
145 |     vlw.v v0, (bp0)
146 |     addi bp0,bp0,64
147 | 
148 |     addi kt, kt, -4         # Decrement k counter
149 |     slti t6, kt, 4          # kt < 4
150 |     bnez t6, .k4_tail       # jump to k4_tail
151 | 
152 | .k4_main:
153 |     addi kt, kt, -4         # Decrement k counter
154 |     // first group of 16 fma, second group load
155 |     vfmacc.vf v16,  ft0, v0
156 |     vlw.v v4, (bp0)         # b0'->v4
157 |     // flw fs4, 384(bp0)       # pre-load B
158 |     addi bp0,bp0,64
159 |     vfmacc.vf v20,  ft1, v0
160 |     flw ft4, (ap0)          # a0'->ft4
161 |     addi ap0, ap0, 4
162 |     vfmacc.vf v24,  ft2, v0
163 |     flw ft5, (ap1)          # a1'->ft5
164 |     addi ap1, ap1, 4
165 |     vfmacc.vf v28,  ft3, v0
166 |     flw ft6, (ap2)          # a2'->ft6
167 |     addi ap2, ap2, 4
168 |     flw ft7, (ap3)          # a3'->ft7
169 |     addi ap3, ap3, 4
170 |     // second group of 16 fma, third group load
171 |     slti t6, kt, 4              # kt < 4, t6 = 1
172 |     vfmacc.vf v16,  ft4, v4
173 |     vlw.v v8, (bp0)         # b0'->v0
174 |     // flw fs5, 384(bp0)       # pre-load B
175 |     addi bp0,bp0,64
176 |     vfmacc.vf v20,  ft5, v4
177 |     flw ft0, (ap0)          # a0'->ft0
178 |     addi ap0, ap0, 4
179 |     vfmacc.vf v24,  ft6, v4
180 |     flw ft1, (ap1)          # a1'->ft1
181 |     addi ap1, ap1, 4
182 |     vfmacc.vf v28,  ft7, v4
183 |     flw ft2, (ap2)          # a2'->ft2
184 |     addi ap2, ap2, 4
185 |     flw ft3, (ap3)          # a3'->ft3
186 |     addi ap3, ap3, 4
187 |     // third group of 16 fma, fourth group load
188 |     vfmacc.vf v16,  ft0, v8
189 |     vlw.v v12, (bp0)         # b0'->v0
190 |     // flw fs6, 384(bp0)       # pre-load B
191 |     addi bp0,bp0,64
192 |     vfmacc.vf v20,  ft1, v8
193 |     flw ft4, (ap0)          # a0'->ft0
194 |     addi ap0, ap0, 4
195 |     vfmacc.vf v24,  ft2, v8
196 |     flw ft5, (ap1)          # a1'->ft1
197 |     addi ap1, ap1, 4
198 |     vfmacc.vf v28,  ft3, v8
199 |     flw ft6, (ap2)          # a2'->ft2
200 |     addi ap2, ap2, 4
201 |     flw ft7, (ap3)          # a3'->ft3
202 |     addi ap3, ap3, 4
203 |     // fourth group of 16 fma, first group load
204 |     vfmacc.vf v16,  ft4, v12
205 |     vlw.v v0, (bp0)         # b0'->v0
206 |     // flw fs7, 384(bp0)       # pre-load B
207 |     addi bp0,bp0,64
208 |     vfmacc.vf v20,  ft5, v12
209 |     flw ft0, (ap0)          # a0'->ft0
210 |     addi ap0, ap0, 4
211 |     vfmacc.vf v24,  ft6, v12
212 |     flw ft1, (ap1)          # a0'->ft0
213 |     addi ap1, ap1, 4
214 |     vfmacc.vf v28,  ft7, v12
215 |     flw ft2, (ap2)          # a0'->ft0
216 |     addi ap2, ap2, 4
217 |     flw ft3, (ap3)          # a0'->ft0
218 |     addi ap3, ap3, 4
219 | 
220 |     // flw fs0, 64(ap0)          # pre-load A
221 |     // flw fs1, 64(ap1)          # pre-load A
222 |     // flw fs2, 64(ap2)          # pre-load A
223 |     // flw fs3, 64(ap3)          # pre-load A
224 | 
225 |     beqz t6, .k4_main
226 | .k4_tail:
227 |     // first group of 16 fma, second group load
228 |     vfmacc.vf v16,  ft0, v0
229 |     vlw.v v4, (bp0)         # b0'->v4
230 |     addi bp0,bp0,64
231 |     vfmacc.vf v20,  ft1, v0
232 |     flw ft4, (ap0)          # a0'->ft4
233 |     addi ap0, ap0, 4
234 |     vfmacc.vf v24,  ft2, v0
235 |     flw ft5, (ap1)          # a1'->ft5
236 |     addi ap1, ap1, 4
237 |     vfmacc.vf v28,  ft3, v0
238 |     flw ft6, (ap2)          # a2'->ft6
239 |     addi ap2, ap2, 4
240 |     flw ft7, (ap3)          # a3'->ft7
241 |     addi ap3, ap3, 4
242 |     // second group of 16 fma, third group load
243 |     vfmacc.vf v16,  ft4, v4
244 |     vlw.v v8, (bp0)         # b0'->v0
245 |     addi bp0,bp0,64
246 |     vfmacc.vf v20,  ft5, v4
247 |     flw ft0, (ap0)          # a0'->ft0
248 |     addi ap0, ap0, 4
249 |     vfmacc.vf v24,  ft6, v4
250 |     flw ft1, (ap1)          # a1'->ft1
251 |     addi ap1, ap1, 4
252 |     vfmacc.vf v28,  ft7, v4
253 |     flw ft2, (ap2)          # a2'->ft2
254 |     addi ap2, ap2, 4
255 |     flw ft3, (ap3)          # a3'->ft3
256 |     addi ap3, ap3, 4
257 |     // third group of 16 fma, fourth group load
258 |     vfmacc.vf v16,  ft0, v8
259 |     vlw.v v12, (bp0)         # b0'->v0
260 |     addi bp0,bp0,64
261 |     vfmacc.vf v20,  ft1, v8
262 |     flw ft4, (ap0)          # a0'->ft0
263 |     addi ap0, ap0, 4
264 |     vfmacc.vf v24,  ft2, v8
265 |     flw ft5, (ap1)          # a1'->ft1
266 |     addi ap1, ap1, 4
267 |     vfmacc.vf v28,  ft3, v8
268 |     flw ft6, (ap2)          # a2'->ft2
269 |     addi ap2, ap2, 4
270 |     flw ft7, (ap3)          # a3'->ft3
271 |     addi ap3, ap3, 4
272 |     // fourth group of 16 fma, no group load
273 |     vfmacc.vf v16,  ft4, v12
274 |     vfmacc.vf v20,  ft5, v12
275 |     vfmacc.vf v24,  ft6, v12
276 |     vfmacc.vf v28,  ft7, v12
277 | .k2_tail:
278 |     slti t6, kt, 2          # kt < 2
279 |     bnez t6, .k1_tail
280 |     flw ft0, (ap0)
281 |     addi ap0, ap0, 4
282 |     vlw.v v0, (bp0)
283 |     addi bp0,bp0,64
284 |     flw ft1, (ap1)
285 |     addi ap1, ap1, 4
286 |     flw ft2, (ap2)
287 |     addi ap2, ap2, 4
288 |     flw ft3, (ap3)
289 |     addi ap3, ap3, 4
290 |     // first group of 16 fma, second group load
291 |     vfmacc.vf v16,  ft0, v0
292 |     vlw.v v4, (bp0)         # b0'->v4
293 |     addi bp0,bp0,64
294 |     vfmacc.vf v20,  ft1, v0
295 |     flw ft4, (ap0)          # a0'->ft4
296 |     addi ap0, ap0, 4
297 |     vfmacc.vf v24,  ft2, v0
298 |     flw ft5, (ap1)          # a1'->ft5
299 |     addi ap1, ap1, 4
300 |     vfmacc.vf v28,  ft3, v0
301 |     flw ft6, (ap2)          # a2'->ft6
302 |     addi ap2, ap2, 4
303 |     flw ft7, (ap3)          # a3'->ft7
304 |     addi ap3, ap3, 4
305 |     // second group of 16 fma, third group load
306 |     vfmacc.vf v16,  ft4, v4
307 |     vfmacc.vf v20,  ft5, v4
308 |     vfmacc.vf v24,  ft6, v4
309 |     vfmacc.vf v28,  ft7, v4
310 | .k1_tail:
311 |     slti t6, kt, 1          # kt < 1
312 |     bnez t6, .store_tile
313 |     flw ft0, (ap0)
314 |     addi ap0, ap0, 4
315 |     vlw.v v0, (bp0)
316 |     addi bp0,bp0,64
317 |     flw ft1, (ap1)
318 |     addi ap1, ap1, 4
319 |     flw ft2, (ap2)
320 |     addi ap2, ap2, 4
321 |     flw ft3, (ap3)
322 |     addi ap3, ap3, 4
323 |     vfmacc.vf v16,  ft0, v0
324 |     vfmacc.vf v20,  ft1, v0
325 |     vfmacc.vf v24,  ft2, v0
326 |     vfmacc.vf v28,  ft3, v0
327 | .store_tile:
328 |     add cp0, cp0, cp_offset
329 |     vsw.v v16, (cp0)
330 |     addi cp0, cp0, 64
331 | 
332 |     vsw.v v20, (cp1)
333 |     addi cp1, cp1, 64
334 | 
335 |     vsw.v v24, (cp2)
336 |     addi cp2, cp2, 64
337 | 
338 |     vsw.v v28, (cp3)
339 |     addi cp3, cp3, 64
340 |     j .end
341 | 
342 | .end:
343 |     ld s0, 96(sp)
344 |     ld s1, 88(sp)
345 |     ld s2, 80(sp)
346 |     ld s3, 72(sp)
347 |     ld s4, 64(sp)
348 |     ld s5, 56(sp)
349 |     ld s6, 48(sp)
350 |     ld s7, 40(sp)
351 |     ld s8, 32(sp)
352 |     ld s9, 24(sp)
353 |     ld s10, 16(sp)
354 |     ld s11, 8(sp)
355 |     addi sp, sp, FRAMESIZE
356 |     ret
357 | 


--------------------------------------------------------------------------------
/sgemm/step6/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

// Register-block (micro-tile) dimensions: my_sgemm.c steps its row loop by
// DGEMM_MR and its column loop by DGEMM_NR, matching the 4 x 16 tile handled
// by RvvSgemm4x16. The DGEMM_ prefix is kept even though the code in this
// step operates on float data.
#define DGEMM_MR 4
#define DGEMM_NR 16

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step6/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | #include "bl_sgemm.h"
 2 | #include "bl_config.h"
 3 | 
 4 | extern void RvvSgemm4x16( size_t nr,         // nr <= 16
 5 |                           size_t mr,         // mr <= 4
 6 |                           size_t k,          // astride = k*sizeof(float)
 7 |                           const float* a,    // mr * k
 8 |                           const float* b,    // k * 16
 9 |                           float* c,          // mr * nr
10 |                           size_t cn_stride,  // Len(N) * sizeof(float)
11 |                           const float* bias  // bias 
12 |                         );
13 | 
14 | 
15 | void bl_sgemm_pack(
16 |     int    m,
17 |     int    mr,
18 |     int    n,
19 |     int    nr,
20 |     int    k,
21 |     float *A,
22 |     float *packA,
23 |     int    lda,
24 |     float *B,
25 |     float *packB,
26 |     int    ldb,
27 |     float *C,           // must be aligned
28 |     int    ldc          // ldc must also be aligned
29 | )
30 | {
31 |     int i, j, p;
32 |     int ir, jr;
33 | 
34 |     // Early return if possible
35 |     if ( m == 0 || n == 0 || k == 0 ) {
36 |         printf( "bl_sgemm(): early return\n" );
37 |         return;
38 |     }
39 | 
40 |     float bias[800];
41 |     for(int i = 0; i < 800; i++) {bias[i] = 0;}
42 | 
43 |     for ( i = 0; i < m; i += DGEMM_MR ) {       // Start 2-nd loop
44 |         int mb = DGEMM_MR;
45 |         if((m - i) < DGEMM_MR) mb = m - i; 
46 | 
47 |         for ( j = 0; j < n; j += DGEMM_NR ) {   // Start 1-st loop
48 |             int nb = DGEMM_NR;
49 |             if((n - j) < DGEMM_NR) nb = n - j; 
50 | 
51 |             RvvSgemm4x16(   nb,                 // nr <= 16, a0
52 |                             mb,                 // mr <= 4,  a1
53 |                             k,                  // astride = k*sizeof(float), a2
54 |                             &A[i * k],          // mr * k,   a3
55 |                             &packB[j * k],      // k * 16,   a4
56 |                             &C( i, j ),         // mr * nr,  a5
57 |                             n * sizeof(float),  // Len(N) * sizeof(float), a6
58 |                             bias
59 |                             );
60 |         }                                       // End   1-st loop
61 |     }                                           // End   2-nd loop
62 | }
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/sgemm/step6/run.sh:
--------------------------------------------------------------------------------
1 | make &&
2 | adb push test_bl_sgemm_step6.x ./. &&
3 | adb shell "./test_bl_sgemm_step6.x"


--------------------------------------------------------------------------------
/sgemm/step7/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step7
 2 | 
 3 | #CROSS_COMPILE ?= riscv64-unknown-linux-gnu-
 4 | # CROSS_COMPILE ?=
 5 | 
 6 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL := riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE := ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | 
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= RvvSgemm4x16.S
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | 
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | 
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | 
66 | %.S.o: %.S
67 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
68 | # ---------------------------------------------------------------------------
69 | 
70 | clean:
71 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/sgemm/step7/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
  1 | # |<-- k=4 -->|
  2 | # +++++++++++++ -           f0, f4, f0, f4
  3 | # +           + |           f1, f5, f1, f5
  4 | # +     A     + mr=4        f2, f6, f2, f6
  5 | # +           + |           f3, f7, f3, f7
  6 | # +++++++++++++ -       
  7 |                        
  8 | 
  9 | # |<--  nr=16  -->|
 10 | # +++++++++++++++++ -       v0 v1 v2 v3
 11 | # +               + |       v4 v5 v6 v7
 12 | # +       B       + k=4     v8,v9,v10,v11     
 13 | # +               + |       v12,v13,v14,v15
 14 | # +++++++++++++++++ -
 15 | 
 16 | 
 17 | # |<--    nr    -->|
 18 | # +++++++++++++++++ -       
 19 | # +               + |       v16,v17,v18,v19
 20 | # +       C       + mr=4    v20,v21,v22,v23
 21 | # +               + |       v24,v25,v26,v27
 22 | # +++++++++++++++++ -       v28,v29,v30,v31
 23 | #
 24 | 
 25 | # void RvvSgemm4x16(size_t nr,         // nr <= 16, a0
 26 | #                   size_t mr,         // mr <= 4,  a1
 27 | #                   size_t k,          // astride = k*sizeof(float), a2
 28 | #                   const float* a,    // mr * k,   a3
 29 | #                   const float* b,    // k * 16,   a4
 30 | #                   float* c,          // mr * nr,  a5
 31 | #                   size_t c_stride,  // Len(N) * sizeof(float), a6
 32 | #                   const float* bias  // bias, a7
 33 | #                   );
 34 | .global RvvSgemm4x16
 35 | .type RvvSgemm4x16, @function
 36 | // Argument aliases (a0-a7), in the order of the C prototype in the header comment.
 37 | #define nr a0
 38 | #define mr a1
 39 | #define k  a2
 40 | #define ap a3
 41 | #define bp a4
 42 | #define cp a5
 43 | #define c_stride a6
 44 | #define bias a7
 45 | // Working registers. bp1 and vl are never referenced below; ap_offset and cp_offset are only ever set to 0.
 46 | #define ap1 t0
 47 | #define ap2 t1
 48 | #define ap3 t2
 49 | #define cp1 t3
 50 | #define cp2 t4
 51 | #define cp3 t5
 52 | #define a_stride s0
 53 | #define bp0      s1
 54 | #define biasp    s2
 55 | #define kt       s3
 56 | #define ap0      s4
 57 | #define cp0      s5
 58 | #define bp1      s6
 59 | #define vl       s7
 60 | #define ap_offset       s10
 61 | #define cp_offset       s11
 62 | // Frame: s0-s11 (12 * 8 bytes) live at 8(sp)..96(sp). NOTE(review): 104 is not a multiple of 16, which the RISC-V psABI asks of sp - confirm.
 63 | #define FRAMESIZE 104
 64 | 
 65 | RvvSgemm4x16:               # prologue: save s0-s11, then set the vector length and the row-0 pointers
 66 |     addi sp, sp, -FRAMESIZE # reserve the register save area
 67 |     sd s0, 96(sp)           # save s0 before it is reused as the frame pointer
 68 |     addi s0, sp, FRAMESIZE  # s0 = value of sp on entry
 69 |     sd s1, -16(s0)          # s1-s11 go to 88(sp) down to 8(sp), the slots .end reloads
 70 |     sd s2, -24(s0)
 71 |     sd s3, -32(s0)
 72 |     sd s4, -40(s0)
 73 |     sd s5, -48(s0)
 74 |     sd s6, -56(s0)
 75 |     sd s7, -64(s0)
 76 |     sd s8, -72(s0)          # s8 and s9 are saved and restored but not otherwise used
 77 |     sd s9, -80(s0)
 78 |     sd s10, -88(s0)
 79 |     sd s11, -96(s0)
 80 | 
 81 |     li ap_offset, 0         # never read afterwards
 82 |     li cp_offset, 0         # added to cp0 at .store_tile; always 0
 83 |     slli a_stride, k, 2     # a_stride = k * sizeof(float); s0 stops being the frame pointer here
 84 |     mv s3, nr               # scratch copy of nr for vsetvli; s3 is reloaded as kt at .start
 85 |     vsetvli s2, s3, e32, m4 # 32-bit elements, register groups of 4, requested length nr; the result in s2 is overwritten at .start
 86 |     mv ap0, ap
 87 |     mv bp0, bp
 88 |     mv cp0, cp
 89 | .a1_offset:                 # row 1 pointers of A and C; they alias row 0 when mr < 2
 90 |     mv ap1, ap0
 91 |     mv cp1, cp0
 92 |     slti t6, mr, 2          # t6 = (mr < 2)
 93 |     bnez t6, .a2_offset     # no row 1: keep the aliases, the loads and stores below are unconditional
 94 |     add ap1, ap0, a_stride
 95 |     add cp1, cp0, c_stride
 96 | .a2_offset:                 # row 2 pointers; they alias row 1 when mr < 3
 97 |     mv ap2, ap1
 98 |     mv cp2, cp1
 99 |     slti t6, mr, 3          # t6 = (mr < 3)
100 |     bnez t6, .a3_offset
101 |     add ap2, ap1, a_stride
102 |     add cp2, cp1, c_stride
103 | .a3_offset:                 # row 3 pointers; they alias row 2 when mr < 4
104 |     mv ap3, ap2
105 |     mv cp3, cp2
106 |     slti t6, mr, 4          # t6 = (mr < 4)
107 |     bnez t6, .start
108 |     add ap3, ap2, a_stride
109 |     add cp3, cp2, c_stride
110 | 
111 | .start:                     # seed the accumulators and preload the first k step
112 |     mv biasp, bias
113 |     mv kt, k                # kt counts the k steps still to be accounted for
114 |     beqz mr, .end           # no rows: nothing to compute or store
115 | 
116 |     vlw.v v16, (biasp)      # all four accumulator rows start from the same bias vector
117 |     vlw.v v20, (biasp)
118 |     vlw.v v24, (biasp)
119 |     vlw.v v28, (biasp)
120 |     addi biasp, biasp, 64   # biasp is not read again
121 |     slti t6, kt, 4               # t6 = (kt < 4)
122 |     bnez t6, .k2_tail       # fewer than 4 k steps: skip the unrolled-by-4 code
123 |     
124 |     // Disabled touch-loads ahead of the A and B cursors; left commented out in this step.
125 |     // flw fs0, 64(ap0)          # pre-load A
126 |     // flw fs1, 64(ap1)          # pre-load A
127 |     // flw fs2, 64(ap2)          # pre-load A
128 |     // flw fs3, 64(ap3)          # pre-load A
129 |     
130 |     // flw fs4, 512(bp0)       # pre-load B
131 |     // flw fs5, 576(bp0)       # pre-load B
132 |     // flw fs6, 640(bp0)       # pre-load B
133 |     // flw fs7, 704(bp0)       # pre-load B
134 | 
135 |     // first k step: one A element per row into ft0-ft3
136 |     flw ft0, (ap0)
137 |     addi ap0, ap0, 4
138 |     flw ft1, (ap1)
139 |     addi ap1, ap1, 4
140 |     flw ft2, (ap2)
141 |     addi ap2, ap2, 4
142 |     flw ft3, (ap3)
143 |     addi ap3, ap3, 4
144 |     // first k step: one row of packed B into the v0 group (bp0 advances 64 bytes per k step)
145 |     vlw.v v0, (bp0)
146 |     addi bp0,bp0,64
147 | 
148 |     addi kt, kt, -4         # account for the 4 steps that .k4_tail will finish
149 |     slti t6, kt, 4          # t6 = (kt < 4)
150 |     bnez t6, .k4_tail       # fewer than 4 further steps: go straight to the drain
151 | 
152 | .k4_main:                   # steady state: 4 k steps per pass; each step loads the operands of the next one
153 |     addi kt, kt, -4         # Decrement k counter
154 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
155 |     vfmacc.vf v16,  ft0, v0
156 |     vlw.v v4, (bp0)         # next B row -> v4
157 |     // flw fs4, 384(bp0)       # pre-load B
158 |     addi bp0,bp0,64
159 |     vfmacc.vf v20,  ft1, v0
160 |     flw ft4, (ap0)          # next A element, row 0 -> ft4
161 |     addi ap0, ap0, 4
162 |     vfmacc.vf v24,  ft2, v0
163 |     flw ft5, (ap1)          # next A element, row 1 -> ft5
164 |     addi ap1, ap1, 4
165 |     vfmacc.vf v28,  ft3, v0
166 |     flw ft6, (ap2)          # next A element, row 2 -> ft6
167 |     addi ap2, ap2, 4
168 |     flw ft7, (ap3)          # next A element, row 3 -> ft7
169 |     addi ap3, ap3, 4
170 |     // step 2 FMAs on (ft4-ft7, v4); load step 3 operands into (ft0-ft3, v8)
171 |     slti t6, kt, 4              # t6 = (kt < 4): exit flag, tested at the bottom of the pass
172 |     vfmacc.vf v16,  ft4, v4
173 |     vlw.v v8, (bp0)         # next B row -> v8
174 |     // flw fs5, 384(bp0)       # pre-load B
175 |     addi bp0,bp0,64
176 |     vfmacc.vf v20,  ft5, v4
177 |     flw ft0, (ap0)          # row 0 -> ft0
178 |     addi ap0, ap0, 4
179 |     vfmacc.vf v24,  ft6, v4
180 |     flw ft1, (ap1)          # row 1 -> ft1
181 |     addi ap1, ap1, 4
182 |     vfmacc.vf v28,  ft7, v4
183 |     flw ft2, (ap2)          # row 2 -> ft2
184 |     addi ap2, ap2, 4
185 |     flw ft3, (ap3)          # row 3 -> ft3
186 |     addi ap3, ap3, 4
187 |     // step 3 FMAs on (ft0-ft3, v8); load step 4 operands into (ft4-ft7, v12)
188 |     vfmacc.vf v16,  ft0, v8
189 |     vlw.v v12, (bp0)         # next B row -> v12
190 |     // flw fs6, 384(bp0)       # pre-load B
191 |     addi bp0,bp0,64
192 |     vfmacc.vf v20,  ft1, v8
193 |     flw ft4, (ap0)          # row 0 -> ft4
194 |     addi ap0, ap0, 4
195 |     vfmacc.vf v24,  ft2, v8
196 |     flw ft5, (ap1)          # row 1 -> ft5
197 |     addi ap1, ap1, 4
198 |     vfmacc.vf v28,  ft3, v8
199 |     flw ft6, (ap2)          # row 2 -> ft6
200 |     addi ap2, ap2, 4
201 |     flw ft7, (ap3)          # row 3 -> ft7
202 |     addi ap3, ap3, 4
203 |     // step 4 FMAs on (ft4-ft7, v12); load step 1 operands of the next group into (ft0-ft3, v0)
204 |     vfmacc.vf v16,  ft4, v12
205 |     vlw.v v0, (bp0)         # next B row -> v0
206 |     // flw fs7, 384(bp0)       # pre-load B
207 |     addi bp0,bp0,64
208 |     vfmacc.vf v20,  ft5, v12
209 |     flw ft0, (ap0)          # row 0 -> ft0
210 |     addi ap0, ap0, 4
211 |     vfmacc.vf v24,  ft6, v12
212 |     flw ft1, (ap1)          # row 1 -> ft1
213 |     addi ap1, ap1, 4
214 |     vfmacc.vf v28,  ft7, v12
215 |     flw ft2, (ap2)          # row 2 -> ft2
216 |     addi ap2, ap2, 4
217 |     flw ft3, (ap3)          # row 3 -> ft3
218 |     addi ap3, ap3, 4
219 | 
220 |     // flw fs0, 64(ap0)          # pre-load A
221 |     // flw fs1, 64(ap1)          # pre-load A
222 |     // flw fs2, 64(ap2)          # pre-load A
223 |     // flw fs3, 64(ap3)          # pre-load A
224 | 
225 |     beqz t6, .k4_main       # kt >= 4: run another full pass
226 | .k4_tail:                   # drain: finish the group whose first step is already loaded; nothing is loaded for a following group
227 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
228 |     vfmacc.vf v16,  ft0, v0
229 |     vlw.v v4, (bp0)         # next B row -> v4
230 |     addi bp0,bp0,64
231 |     vfmacc.vf v20,  ft1, v0
232 |     flw ft4, (ap0)          # row 0 -> ft4
233 |     addi ap0, ap0, 4
234 |     vfmacc.vf v24,  ft2, v0
235 |     flw ft5, (ap1)          # row 1 -> ft5
236 |     addi ap1, ap1, 4
237 |     vfmacc.vf v28,  ft3, v0
238 |     flw ft6, (ap2)          # row 2 -> ft6
239 |     addi ap2, ap2, 4
240 |     flw ft7, (ap3)          # row 3 -> ft7
241 |     addi ap3, ap3, 4
242 |     // step 2 FMAs on (ft4-ft7, v4); load step 3 operands into (ft0-ft3, v8)
243 |     vfmacc.vf v16,  ft4, v4
244 |     vlw.v v8, (bp0)         # next B row -> v8
245 |     addi bp0,bp0,64
246 |     vfmacc.vf v20,  ft5, v4
247 |     flw ft0, (ap0)          # row 0 -> ft0
248 |     addi ap0, ap0, 4
249 |     vfmacc.vf v24,  ft6, v4
250 |     flw ft1, (ap1)          # row 1 -> ft1
251 |     addi ap1, ap1, 4
252 |     vfmacc.vf v28,  ft7, v4
253 |     flw ft2, (ap2)          # row 2 -> ft2
254 |     addi ap2, ap2, 4
255 |     flw ft3, (ap3)          # row 3 -> ft3
256 |     addi ap3, ap3, 4
257 |     // step 3 FMAs on (ft0-ft3, v8); load step 4 operands into (ft4-ft7, v12)
258 |     vfmacc.vf v16,  ft0, v8
259 |     vlw.v v12, (bp0)         # next B row -> v12
260 |     addi bp0,bp0,64
261 |     vfmacc.vf v20,  ft1, v8
262 |     flw ft4, (ap0)          # row 0 -> ft4
263 |     addi ap0, ap0, 4
264 |     vfmacc.vf v24,  ft2, v8
265 |     flw ft5, (ap1)          # row 1 -> ft5
266 |     addi ap1, ap1, 4
267 |     vfmacc.vf v28,  ft3, v8
268 |     flw ft6, (ap2)          # row 2 -> ft6
269 |     addi ap2, ap2, 4
270 |     flw ft7, (ap3)          # row 3 -> ft7
271 |     addi ap3, ap3, 4
272 |     // step 4 FMAs on (ft4-ft7, v12); no further loads
273 |     vfmacc.vf v16,  ft4, v12
274 |     vfmacc.vf v20,  ft5, v12
275 |     vfmacc.vf v24,  ft6, v12
276 |     vfmacc.vf v28,  ft7, v12
277 | .k2_tail:                   # two leftover k steps, taken when kt >= 2; kt itself is not modified in this section
278 |     slti t6, kt, 2          # t6 = (kt < 2)
279 |     bnez t6, .k1_tail
280 |     flw ft0, (ap0)          # load step 1 operands; nothing is preloaded on entry
281 |     addi ap0, ap0, 4
282 |     vlw.v v0, (bp0)
283 |     addi bp0,bp0,64
284 |     flw ft1, (ap1)
285 |     addi ap1, ap1, 4
286 |     flw ft2, (ap2)
287 |     addi ap2, ap2, 4
288 |     flw ft3, (ap3)
289 |     addi ap3, ap3, 4
290 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
291 |     vfmacc.vf v16,  ft0, v0
292 |     vlw.v v4, (bp0)         # next B row -> v4
293 |     addi bp0,bp0,64
294 |     vfmacc.vf v20,  ft1, v0
295 |     flw ft4, (ap0)          # row 0 -> ft4
296 |     addi ap0, ap0, 4
297 |     vfmacc.vf v24,  ft2, v0
298 |     flw ft5, (ap1)          # row 1 -> ft5
299 |     addi ap1, ap1, 4
300 |     vfmacc.vf v28,  ft3, v0
301 |     flw ft6, (ap2)          # row 2 -> ft6
302 |     addi ap2, ap2, 4
303 |     flw ft7, (ap3)          # row 3 -> ft7
304 |     addi ap3, ap3, 4
305 |     // step 2 FMAs on (ft4-ft7, v4); no further loads
306 |     vfmacc.vf v16,  ft4, v4
307 |     vfmacc.vf v20,  ft5, v4
308 |     vfmacc.vf v24,  ft6, v4
309 |     vfmacc.vf v28,  ft7, v4
310 | .k1_tail:                   # one leftover k step, needed only when the leftover count is odd
311 |     andi t6, kt, 1          # kt is k mod 4 here (.k2_tail leaves it untouched), so bit 0 decides
312 |     beqz t6, .store_tile    # kt == 0 or 2: every k step is already accumulated
313 |     flw ft0, (ap0)          # one A element per row
314 |     addi ap0, ap0, 4
315 |     vlw.v v0, (bp0)         # one row of packed B
316 |     addi bp0,bp0,64
317 |     flw ft1, (ap1)
318 |     addi ap1, ap1, 4
319 |     flw ft2, (ap2)
320 |     addi ap2, ap2, 4
321 |     flw ft3, (ap3)
322 |     addi ap3, ap3, 4
323 |     vfmacc.vf v16,  ft0, v0
324 |     vfmacc.vf v20,  ft1, v0
325 |     vfmacc.vf v24,  ft2, v0
326 |     vfmacc.vf v28,  ft3, v0
327 | .store_tile:                # write the four accumulator rows to C
328 |     add cp0, cp0, cp_offset # cp_offset is 0, so cp0 is unchanged
329 |     vsw.v v16, (cp0)        # row 0
330 |     addi cp0, cp0, 64       # the pointer bumps in this section are not used afterwards
331 | 
332 |     vsw.v v20, (cp1)        # row 1; cp1-cp3 alias an earlier row pointer when mr < 4
333 |     addi cp1, cp1, 64
334 | 
335 |     vsw.v v24, (cp2)        # row 2
336 |     addi cp2, cp2, 64
337 | 
338 |     vsw.v v28, (cp3)        # row 3
339 |     addi cp3, cp3, 64
340 |     j .end                  # redundant: .end follows immediately
341 | 
342 | .end:                       # epilogue: reload s0-s11 from the slots the prologue filled, then pop the frame
343 |     ld s0, 96(sp)
344 |     ld s1, 88(sp)
345 |     ld s2, 80(sp)
346 |     ld s3, 72(sp)
347 |     ld s4, 64(sp)
348 |     ld s5, 56(sp)
349 |     ld s6, 48(sp)
350 |     ld s7, 40(sp)
351 |     ld s8, 32(sp)
352 |     ld s9, 24(sp)
353 |     ld s10, 16(sp)
354 |     ld s11, 8(sp)
355 |     addi sp, sp, FRAMESIZE  # 8(sp)..96(sp) were the save slots; 0(sp) was never written
356 |     ret
357 | 


--------------------------------------------------------------------------------
/sgemm/step7/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | 
56 | #define DGEMM_MR 4    // rows per micro-tile; the RvvSgemm4x16 kernel accepts mr <= 4 (name keeps the DGEMM prefix although the data is float)
57 | #define DGEMM_NR 16   // columns per micro-tile; the RvvSgemm4x16 kernel accepts nr <= 16
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step7/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | #include "bl_sgemm.h"
 2 | #include "bl_config.h"
 3 | 
 4 | extern void RvvSgemm4x16( size_t nr,         // nr <= 16
 5 |                           size_t mr,         // mr <= 4
 6 |                           size_t k,          // astride = k*sizeof(float)
 7 |                           const float* a,    // mr * k
 8 |                           const float* b,    // k * 16
 9 |                           float* c,          // mr * nr
10 |                           size_t cn_stride,  // Len(N) * sizeof(float)
11 |                           const float* bias  // bias 
12 |                         );
13 | 
14 | 
15 | void bl_sgemm_pack(
16 |     int    m,
17 |     int    mr,          // not referenced in this function
18 |     int    n,
19 |     int    nr,          // not referenced in this function
20 |     int    k,
21 |     float *A,
22 |     float *packA,       // not referenced in this function
23 |     int    lda,         // not referenced in this function
24 |     float *B,           // not referenced in this function
25 |     float *packB,
26 |     int    ldb,         // not referenced in this function
27 |     float *C,           // must be aligned
28 |     int    ldc          // ldc must also be aligned
29 | )
30 | {
31 |     // Walk C in DGEMM_MR x DGEMM_NR tiles and let RvvSgemm4x16 compute each one.
32 |     // A is read in place with row stride k; B is read from packB, panel j at packB[j * k].
33 |     int i, j;
34 |     // Early return if possible
35 |     if ( m == 0 || n == 0 || k == 0 ) {
36 |         printf( "bl_sgemm(): early return\n" );
37 |         return;
38 |     }
39 | 
40 |     // All-zero bias vector; the kernel loads it as the starting value of every C row.
41 |     float bias[800] = { 0 };
42 | 
43 |     for ( j = 0; j < n; j += DGEMM_NR ) {       // Start 2nd loop (column panels)
44 |         int nb = DGEMM_NR;
45 |         if((n - j) < DGEMM_NR) nb = n - j;      // partial panel at the right edge
46 | 
47 |         for ( i = 0; i < m; i += DGEMM_MR ) {   // Start 1st loop (row tiles)
48 |             int mb = DGEMM_MR;
49 |             if((m - i) < DGEMM_MR) mb = m - i;  // partial tile at the bottom edge
50 | 
51 |             RvvSgemm4x16(   nb,                 // nr <= 16, a0
52 |                             mb,                 // mr <= 4,  a1
53 |                             k,                  // astride = k*sizeof(float), a2
54 |                             &A[i * k],          // mr * k,   a3
55 |                             &packB[j * k],      // k * 16,   a4
56 |                             &C( i, j ),         // mr * nr,  a5
57 |                             n * sizeof(float),  // C row stride in bytes, a6; NOTE(review): uses n, not ldc - confirm they always match
58 |                             bias                // bias, a7
59 |                             );
60 |         }                                       // End   1st loop
61 |     }                                           // End   2nd loop
62 | }
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/sgemm/step7/run.sh:
--------------------------------------------------------------------------------
1 | make &&                                   # run the default make target; stop here if the build fails
2 | adb push test_bl_sgemm_step7.x ./. &&     # copy the test binary to the adb-connected device
3 | adb shell "./test_bl_sgemm_step7.x"       # execute it on the device


--------------------------------------------------------------------------------
/sgemm/step8/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step8
 2 | 
 3 | # Toolchain selection. CCL, CTOOL and CROSS_COMPILE are defaults only (?=), so the
 4 | # environment or the command line can replace them; an empty CROSS_COMPILE builds natively.
 5 | 
 6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL ?= riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE ?= ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | 
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | 
26 | ASM_SRC= RvvSgemm4x16.S
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | 
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | 
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Compile rules. Objects keep the source suffix (foo.c -> foo.c.o, foo.S -> foo.S.o);
59 | # $(MKDEP_OPT) also writes $@.d for the -include above. Linker flags stay on the link rule.
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
65 | 
66 | %.S.o: %.S
67 | 	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
68 | # ---------------------------------------------------------------------------
69 | # clean: delete objects, dependency files and the executable (the @ hides the command).
70 | clean:
71 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/sgemm/step8/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
  1 | # |<-- k=4 -->|
  2 | # +++++++++++++ -           f0, f4, f0, f4
  3 | # +           + |           f1, f5, f1, f5
  4 | # +     A     + mr=4        f2, f6, f2, f6
  5 | # +           + |           f3, f7, f3, f7
  6 | # +++++++++++++ -       
  7 |                        
  8 | 
  9 | # |<--  nr=16  -->|
 10 | # +++++++++++++++++ -       v0 v1 v2 v3
 11 | # +               + |       v4 v5 v6 v7
 12 | # +       B       + k=4     v8,v9,v10,v11     
 13 | # +               + |       v12,v13,v14,v15
 14 | # +++++++++++++++++ -
 15 | 
 16 | 
 17 | # |<--    nr    -->|
 18 | # +++++++++++++++++ -       
 19 | # +               + |       v16,v17,v18,v19
 20 | # +       C       + mr=4    v20,v21,v22,v23
 21 | # +               + |       v24,v25,v26,v27
 22 | # +++++++++++++++++ -       v28,v29,v30,v31
 23 | #
 24 | 
 25 | # void RvvSgemm4x16(size_t nr,         // nr <= 16, a0
 26 | #                   size_t mr,         // mr <= 4,  a1
 27 | #                   size_t k,          // astride = k*sizeof(float), a2
 28 | #                   const float* a,    // mr * k,   a3
 29 | #                   const float* b,    // k * 16,   a4
 30 | #                   float* c,          // mr * nr,  a5
 31 | #                   size_t c_stride,  // Len(N) * sizeof(float), a6
 32 | #                   const float* bias  // bias, a7
 33 | #                   );
 34 | .global RvvSgemm4x16
 35 | .type RvvSgemm4x16, @function
 36 | // Argument aliases (a0-a7), in the order of the C prototype in the header comment.
 37 | #define nr a0
 38 | #define mr a1
 39 | #define k  a2
 40 | #define ap a3
 41 | #define bp a4
 42 | #define cp a5
 43 | #define c_stride a6
 44 | #define bias a7
 45 | // Working registers. bp1 and vl are never referenced below; ap_offset and cp_offset are only ever set to 0.
 46 | #define ap1 t0
 47 | #define ap2 t1
 48 | #define ap3 t2
 49 | #define cp1 t3
 50 | #define cp2 t4
 51 | #define cp3 t5
 52 | #define a_stride s0
 53 | #define bp0      s1
 54 | #define biasp    s2
 55 | #define kt       s3
 56 | #define ap0      s4
 57 | #define cp0      s5
 58 | #define bp1      s6
 59 | #define vl       s7
 60 | #define ap_offset       s10
 61 | #define cp_offset       s11
 62 | // Frame: s0-s11 (12 * 8 bytes) live at 8(sp)..96(sp). NOTE(review): 104 is not a multiple of 16, which the RISC-V psABI asks of sp - confirm.
 63 | #define FRAMESIZE 104
 64 | 
 65 | RvvSgemm4x16:               # prologue: save s0-s11, then set the vector length and the row-0 pointers
 66 |     addi sp, sp, -FRAMESIZE # reserve the register save area
 67 |     sd s0, 96(sp)           # save s0 before it is reused as the frame pointer
 68 |     addi s0, sp, FRAMESIZE  # s0 = value of sp on entry
 69 |     sd s1, -16(s0)          # s1-s11 go to 88(sp) down to 8(sp), the slots .end reloads
 70 |     sd s2, -24(s0)
 71 |     sd s3, -32(s0)
 72 |     sd s4, -40(s0)
 73 |     sd s5, -48(s0)
 74 |     sd s6, -56(s0)
 75 |     sd s7, -64(s0)
 76 |     sd s8, -72(s0)          # s8 and s9 are saved and restored but not otherwise used
 77 |     sd s9, -80(s0)
 78 |     sd s10, -88(s0)
 79 |     sd s11, -96(s0)
 80 | 
 81 |     li ap_offset, 0         # never read afterwards
 82 |     li cp_offset, 0         # added to cp0 at .store_tile; always 0
 83 |     slli a_stride, k, 2     # a_stride = k * sizeof(float); s0 stops being the frame pointer here
 84 |     mv s3, nr               # scratch copy of nr for vsetvli; s3 is reloaded as kt at .start
 85 |     vsetvli s2, s3, e32, m4 # 32-bit elements, register groups of 4, requested length nr; the result in s2 is overwritten at .start
 86 |     mv ap0, ap
 87 |     mv bp0, bp
 88 |     mv cp0, cp
 89 | .a1_offset:                 # row 1 pointers of A and C; they alias row 0 when mr < 2
 90 |     mv ap1, ap0
 91 |     mv cp1, cp0
 92 |     slti t6, mr, 2          # t6 = (mr < 2)
 93 |     bnez t6, .a2_offset     # no row 1: keep the aliases, the loads and stores below are unconditional
 94 |     add ap1, ap0, a_stride
 95 |     add cp1, cp0, c_stride
 96 | .a2_offset:                 # row 2 pointers; they alias row 1 when mr < 3
 97 |     mv ap2, ap1
 98 |     mv cp2, cp1
 99 |     slti t6, mr, 3          # t6 = (mr < 3)
100 |     bnez t6, .a3_offset
101 |     add ap2, ap1, a_stride
102 |     add cp2, cp1, c_stride
103 | .a3_offset:                 # row 3 pointers; they alias row 2 when mr < 4
104 |     mv ap3, ap2
105 |     mv cp3, cp2
106 |     slti t6, mr, 4          # t6 = (mr < 4)
107 |     bnez t6, .start
108 |     add ap3, ap2, a_stride
109 |     add cp3, cp2, c_stride
110 | 
111 | .start:                     # seed the accumulators and preload the first k step
112 |     mv biasp, bias
113 |     mv kt, k                # kt counts the k steps still to be accounted for
114 |     beqz mr, .end           # no rows: nothing to compute or store
115 | 
116 |     vlw.v v16, (biasp)      # all four accumulator rows start from the same bias vector
117 |     vlw.v v20, (biasp)
118 |     vlw.v v24, (biasp)
119 |     vlw.v v28, (biasp)
120 |     addi biasp, biasp, 64   # biasp is not read again
121 |     slti t6, kt, 4               # t6 = (kt < 4)
122 |     bnez t6, .k2_tail       # fewer than 4 k steps: skip the unrolled-by-4 code
123 |     // Touch-loads: the loaded values are never read. They target caller-saved ft8-ft11 and
124 |     // fa0-fa3, because fs0-fs7 are callee-saved under lp64d and the prologue does not spill them.
125 |     flw ft8, 64(ap0)          # pre-load A
126 |     flw ft9, 64(ap1)          # pre-load A
127 |     flw ft10, 64(ap2)          # pre-load A
128 |     flw ft11, 64(ap3)          # pre-load A
129 |     // NOTE(review): these read 64 bytes past the A cursors and up to 704 bytes past the B cursor - confirm the buffers are padded.
130 |     flw fa0, 512(bp0)       # pre-load B
131 |     flw fa1, 576(bp0)       # pre-load B
132 |     flw fa2, 640(bp0)       # pre-load B
133 |     flw fa3, 704(bp0)       # pre-load B
134 | 
135 |     // first k step: one A element per row into ft0-ft3
136 |     flw ft0, (ap0)
137 |     addi ap0, ap0, 4
138 |     flw ft1, (ap1)
139 |     addi ap1, ap1, 4
140 |     flw ft2, (ap2)
141 |     addi ap2, ap2, 4
142 |     flw ft3, (ap3)
143 |     addi ap3, ap3, 4
144 |     // first k step: one row of packed B into the v0 group (bp0 advances 64 bytes per k step)
145 |     vlw.v v0, (bp0)
146 |     addi bp0,bp0,64
147 | 
148 |     addi kt, kt, -4         # account for the 4 steps that .k4_tail will finish
149 |     slti t6, kt, 4          # t6 = (kt < 4)
150 |     bnez t6, .k4_tail       # fewer than 4 further steps: go straight to the drain
151 | 
152 | .k4_main:                   # steady state: 4 k steps per pass; each step loads the operands of the next one
153 |     addi kt, kt, -4         # Decrement k counter
154 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
155 |     vfmacc.vf v16,  ft0, v0
156 |     vlw.v v4, (bp0)         # next B row -> v4
157 |     flw fa0, 384(bp0)       # pre-load B
158 |     addi bp0,bp0,64
159 |     vfmacc.vf v20,  ft1, v0
160 |     flw ft4, (ap0)          # next A element, row 0 -> ft4
161 |     addi ap0, ap0, 4
162 |     vfmacc.vf v24,  ft2, v0
163 |     flw ft5, (ap1)          # next A element, row 1 -> ft5
164 |     addi ap1, ap1, 4
165 |     vfmacc.vf v28,  ft3, v0
166 |     flw ft6, (ap2)          # next A element, row 2 -> ft6
167 |     addi ap2, ap2, 4
168 |     flw ft7, (ap3)          # next A element, row 3 -> ft7
169 |     addi ap3, ap3, 4
170 |     // step 2 FMAs on (ft4-ft7, v4); load step 3 operands into (ft0-ft3, v8)
171 |     slti t6, kt, 4              # t6 = (kt < 4): exit flag, tested at the bottom of the pass
172 |     vfmacc.vf v16,  ft4, v4
173 |     vlw.v v8, (bp0)         # next B row -> v8
174 |     flw fa1, 384(bp0)       # pre-load B
175 |     addi bp0,bp0,64
176 |     vfmacc.vf v20,  ft5, v4
177 |     flw ft0, (ap0)          # row 0 -> ft0
178 |     addi ap0, ap0, 4
179 |     vfmacc.vf v24,  ft6, v4
180 |     flw ft1, (ap1)          # row 1 -> ft1
181 |     addi ap1, ap1, 4
182 |     vfmacc.vf v28,  ft7, v4
183 |     flw ft2, (ap2)          # row 2 -> ft2
184 |     addi ap2, ap2, 4
185 |     flw ft3, (ap3)          # row 3 -> ft3
186 |     addi ap3, ap3, 4
187 |     // step 3 FMAs on (ft0-ft3, v8); load step 4 operands into (ft4-ft7, v12)
188 |     vfmacc.vf v16,  ft0, v8
189 |     vlw.v v12, (bp0)         # next B row -> v12
190 |     flw fa2, 384(bp0)       # pre-load B
191 |     addi bp0,bp0,64
192 |     vfmacc.vf v20,  ft1, v8
193 |     flw ft4, (ap0)          # row 0 -> ft4
194 |     addi ap0, ap0, 4
195 |     vfmacc.vf v24,  ft2, v8
196 |     flw ft5, (ap1)          # row 1 -> ft5
197 |     addi ap1, ap1, 4
198 |     vfmacc.vf v28,  ft3, v8
199 |     flw ft6, (ap2)          # row 2 -> ft6
200 |     addi ap2, ap2, 4
201 |     flw ft7, (ap3)          # row 3 -> ft7
202 |     addi ap3, ap3, 4
203 |     // step 4 FMAs on (ft4-ft7, v12); load step 1 operands of the next group into (ft0-ft3, v0)
204 |     vfmacc.vf v16,  ft4, v12
205 |     vlw.v v0, (bp0)         # next B row -> v0
206 |     flw fa3, 384(bp0)       # pre-load B
207 |     addi bp0,bp0,64
208 |     vfmacc.vf v20,  ft5, v12
209 |     flw ft0, (ap0)          # row 0 -> ft0
210 |     addi ap0, ap0, 4
211 |     vfmacc.vf v24,  ft6, v12
212 |     flw ft1, (ap1)          # row 1 -> ft1
213 |     addi ap1, ap1, 4
214 |     vfmacc.vf v28,  ft7, v12
215 |     flw ft2, (ap2)          # row 2 -> ft2
216 |     addi ap2, ap2, 4
217 |     flw ft3, (ap3)          # row 3 -> ft3
218 |     addi ap3, ap3, 4
219 | 
220 |     flw ft8, 64(ap0)          # pre-load A
221 |     flw ft9, 64(ap1)          # pre-load A
222 |     flw ft10, 64(ap2)          # pre-load A
223 |     flw ft11, 64(ap3)          # pre-load A
224 | 
225 |     beqz t6, .k4_main       # kt >= 4: run another full pass
226 | .k4_tail:                   # drain: finish the group whose first step is already loaded; nothing is loaded for a following group
227 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
228 |     vfmacc.vf v16,  ft0, v0
229 |     vlw.v v4, (bp0)         # next B row -> v4
230 |     addi bp0,bp0,64
231 |     vfmacc.vf v20,  ft1, v0
232 |     flw ft4, (ap0)          # row 0 -> ft4
233 |     addi ap0, ap0, 4
234 |     vfmacc.vf v24,  ft2, v0
235 |     flw ft5, (ap1)          # row 1 -> ft5
236 |     addi ap1, ap1, 4
237 |     vfmacc.vf v28,  ft3, v0
238 |     flw ft6, (ap2)          # row 2 -> ft6
239 |     addi ap2, ap2, 4
240 |     flw ft7, (ap3)          # row 3 -> ft7
241 |     addi ap3, ap3, 4
242 |     // step 2 FMAs on (ft4-ft7, v4); load step 3 operands into (ft0-ft3, v8)
243 |     vfmacc.vf v16,  ft4, v4
244 |     vlw.v v8, (bp0)         # next B row -> v8
245 |     addi bp0,bp0,64
246 |     vfmacc.vf v20,  ft5, v4
247 |     flw ft0, (ap0)          # row 0 -> ft0
248 |     addi ap0, ap0, 4
249 |     vfmacc.vf v24,  ft6, v4
250 |     flw ft1, (ap1)          # row 1 -> ft1
251 |     addi ap1, ap1, 4
252 |     vfmacc.vf v28,  ft7, v4
253 |     flw ft2, (ap2)          # row 2 -> ft2
254 |     addi ap2, ap2, 4
255 |     flw ft3, (ap3)          # row 3 -> ft3
256 |     addi ap3, ap3, 4
257 |     // step 3 FMAs on (ft0-ft3, v8); load step 4 operands into (ft4-ft7, v12)
258 |     vfmacc.vf v16,  ft0, v8
259 |     vlw.v v12, (bp0)         # next B row -> v12
260 |     addi bp0,bp0,64
261 |     vfmacc.vf v20,  ft1, v8
262 |     flw ft4, (ap0)          # row 0 -> ft4
263 |     addi ap0, ap0, 4
264 |     vfmacc.vf v24,  ft2, v8
265 |     flw ft5, (ap1)          # row 1 -> ft5
266 |     addi ap1, ap1, 4
267 |     vfmacc.vf v28,  ft3, v8
268 |     flw ft6, (ap2)          # row 2 -> ft6
269 |     addi ap2, ap2, 4
270 |     flw ft7, (ap3)          # row 3 -> ft7
271 |     addi ap3, ap3, 4
272 |     // step 4 FMAs on (ft4-ft7, v12); no further loads
273 |     vfmacc.vf v16,  ft4, v12
274 |     vfmacc.vf v20,  ft5, v12
275 |     vfmacc.vf v24,  ft6, v12
276 |     vfmacc.vf v28,  ft7, v12
277 | .k2_tail:                   # two leftover k steps, taken when kt >= 2; kt itself is not modified in this section
278 |     slti t6, kt, 2          # t6 = (kt < 2)
279 |     bnez t6, .k1_tail
280 |     flw ft0, (ap0)          # load step 1 operands; nothing is preloaded on entry
281 |     addi ap0, ap0, 4
282 |     vlw.v v0, (bp0)
283 |     addi bp0,bp0,64
284 |     flw ft1, (ap1)
285 |     addi ap1, ap1, 4
286 |     flw ft2, (ap2)
287 |     addi ap2, ap2, 4
288 |     flw ft3, (ap3)
289 |     addi ap3, ap3, 4
290 |     // step 1 FMAs on (ft0-ft3, v0); load step 2 operands into (ft4-ft7, v4)
291 |     vfmacc.vf v16,  ft0, v0
292 |     vlw.v v4, (bp0)         # next B row -> v4
293 |     addi bp0,bp0,64
294 |     vfmacc.vf v20,  ft1, v0
295 |     flw ft4, (ap0)          # row 0 -> ft4
296 |     addi ap0, ap0, 4
297 |     vfmacc.vf v24,  ft2, v0
298 |     flw ft5, (ap1)          # row 1 -> ft5
299 |     addi ap1, ap1, 4
300 |     vfmacc.vf v28,  ft3, v0
301 |     flw ft6, (ap2)          # row 2 -> ft6
302 |     addi ap2, ap2, 4
303 |     flw ft7, (ap3)          # row 3 -> ft7
304 |     addi ap3, ap3, 4
305 |     // step 2 FMAs on (ft4-ft7, v4); no further loads
306 |     vfmacc.vf v16,  ft4, v4
307 |     vfmacc.vf v20,  ft5, v4
308 |     vfmacc.vf v24,  ft6, v4
309 |     vfmacc.vf v28,  ft7, v4
310 | .k1_tail:                   # one leftover k step, needed only when the leftover count is odd
311 |     andi t6, kt, 1          # kt is k mod 4 here (.k2_tail leaves it untouched), so bit 0 decides
312 |     beqz t6, .store_tile    # kt == 0 or 2: every k step is already accumulated
313 |     flw ft0, (ap0)          # one A element per row
314 |     addi ap0, ap0, 4
315 |     vlw.v v0, (bp0)         # one row of packed B
316 |     addi bp0,bp0,64
317 |     flw ft1, (ap1)
318 |     addi ap1, ap1, 4
319 |     flw ft2, (ap2)
320 |     addi ap2, ap2, 4
321 |     flw ft3, (ap3)
322 |     addi ap3, ap3, 4
323 |     vfmacc.vf v16,  ft0, v0
324 |     vfmacc.vf v20,  ft1, v0
325 |     vfmacc.vf v24,  ft2, v0
326 |     vfmacc.vf v28,  ft3, v0
327 | .store_tile:                # write the four accumulator rows to C
328 |     add cp0, cp0, cp_offset # cp_offset is 0, so cp0 is unchanged
329 |     vsw.v v16, (cp0)        # row 0
330 |     addi cp0, cp0, 64       # the pointer bumps in this section are not used afterwards
331 | 
332 |     vsw.v v20, (cp1)        # row 1; cp1-cp3 alias an earlier row pointer when mr < 4
333 |     addi cp1, cp1, 64
334 | 
335 |     vsw.v v24, (cp2)        # row 2
336 |     addi cp2, cp2, 64
337 | 
338 |     vsw.v v28, (cp3)        # row 3
339 |     addi cp3, cp3, 64
340 |     j .end                  # redundant: .end follows immediately
341 | 
342 | .end:                       # epilogue: reload s0-s11 from the slots the prologue filled, then pop the frame
343 |     ld s0, 96(sp)
344 |     ld s1, 88(sp)
345 |     ld s2, 80(sp)
346 |     ld s3, 72(sp)
347 |     ld s4, 64(sp)
348 |     ld s5, 56(sp)
349 |     ld s6, 48(sp)
350 |     ld s7, 40(sp)
351 |     ld s8, 32(sp)
352 |     ld s9, 24(sp)
353 |     ld s10, 16(sp)
354 |     ld s11, 8(sp)
355 |     addi sp, sp, FRAMESIZE  # 8(sp)..96(sp) were the save slots; 0(sp) was never written
356 |     ret
357 | 


--------------------------------------------------------------------------------
/sgemm/step8/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | 
56 | #define DGEMM_MR 4    // row step of the C tiling loop in my_sgemm.c (rows per kernel call)
57 | #define DGEMM_NR 16   // column step of the C tiling loop in my_sgemm.c (columns per kernel call)
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step8/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | #include "bl_sgemm.h"
 2 | #include "bl_config.h"
 3 | 
 4 | extern void RvvSgemm4x16( size_t nr,         // columns in this tile, nr <= 16
 5 |                           size_t mr,         // rows in this tile, mr <= 4
 6 |                           size_t k,          // inner dimension; astride = k*sizeof(float)
 7 |                           const float* a,    // first A element used by this tile (mr * k values)
 8 |                           const float* b,    // packed B panel for this tile (k * 16 values)
 9 |                           float* c,          // first element of the output tile (mr * nr values)
10 |                           size_t cn_stride,  // Len(N) * sizeof(float), i.e. a stride in bytes
11 |                           const float* bias  // bias values; the caller in this file passes zeros
12 |                         );
13 | 
14 | 
15 | void bl_sgemm_pack(
16 |     int    m,           // rows of C (and of A)
17 |     int    mr,          // not read here; the row tiling uses DGEMM_MR
18 |     int    n,           // columns of C
19 |     int    nr,          // not read here; the column tiling uses DGEMM_NR
20 |     int    k,           // shared (inner) dimension
21 |     float *A,           // handed to the kernel unpacked, starting at A[row * k]
22 |     float *packA,       // not read in this step
23 |     int    lda,         // not read in this step
24 |     float *B,           // not read in this step; the kernel consumes packB
25 |     float *packB,       // packed B; the panel for column j starts at packB[j * k]
26 |     int    ldb,         // not read in this step
27 |     float *C,           // must be aligned
28 |     int    ldc          // ldc must also be aligned
29 | )
30 | {
31 |     // Walk C in DGEMM_MR x DGEMM_NR tiles and run the RVV micro-kernel on each tile.
32 |     int row, col;
33 | 
34 |     // Nothing to compute when any dimension is empty.
35 |     if ( !( m && n && k ) ) {
36 |         printf( "bl_sgemm(): early return\n" );
37 |         return;
38 |     }
39 | 
40 |     // All-zero bias buffer handed to the kernel on every call.
41 |     float bias[800] = { 0 };
42 | 
43 |     for ( col = 0; col < n; col += DGEMM_NR ) {         // column panels of C
44 |         const int left_n = n - col;
45 |         const int nb = ( left_n < DGEMM_NR ) ? left_n : DGEMM_NR;   // clip the last panel
46 | 
47 |         for ( row = 0; row < m; row += DGEMM_MR ) {     // row panels of C
48 |             const int left_m = m - row;
49 |             const int mb = ( left_m < DGEMM_MR ) ? left_m : DGEMM_MR;   // clip the last panel
50 | 
51 |             RvvSgemm4x16(   nb,                 // nr <= 16, a0
52 |                             mb,                 // mr <= 4,  a1
53 |                             k,                  // astride = k*sizeof(float), a2
54 |                             &A[row * k],        // mr * k,   a3
55 |                             &packB[col * k],    // k * 16,   a4
56 |                             &C( row, col ),     // mr * nr,  a5
57 |                             n * sizeof(float),  // Len(N) * sizeof(float), a6
58 |                             bias
59 |                             );
60 |         }                                       // end of row-panel loop
61 |     }                                           // end of column-panel loop
62 | }
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/sgemm/step8/run.sh:
--------------------------------------------------------------------------------
1 | make &&                                   # build; the chain stops here if the build fails
2 | adb push test_bl_sgemm_step8.x ./. &&     # copy the test binary to the adb target path ./.
3 | adb shell "./test_bl_sgemm_step8.x"       # run the binary through adb shell


--------------------------------------------------------------------------------
/sgemm/step9/Makefile:
--------------------------------------------------------------------------------
 1 | TARGET=test_bl_sgemm_step9
 2 | # Cross toolchain: CROSS_COMPILE is assembled from CCL (install prefix) and CTOOL (tool prefix).
 3 | #CROSS_COMPILE ?= riscv64-unknown-linux-gnu-
 4 | # CROSS_COMPILE ?=
 5 | # NOTE(review): CCL is an absolute path on one machine; pass CCL=... on the make command line to change it.
 6 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
 7 | CTOOL := riscv64-unknown-linux-gnu-
 8 | CROSS_COMPILE := ${CCL}/bin/${CTOOL}
 9 | 
10 | CC  = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 | 
13 | COMPILER_OPT_LEVEL=O3
14 | # An empty CROSS_COMPILE selects the native x86 flags; otherwise the RISC-V (v0p7 vector, c906) flags are used.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | 
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | # Sources: the assembly micro-kernel, this step's driver, and the shared test/reference/utility code.
26 | ASM_SRC= RvvSgemm4x16.S
27 | CC_SRC= my_sgemm.c \
28 |         $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
29 |         $(COMMONDIR)/bl_sgemm_ref.c \
30 |         $(COMMONDIR)/bl_sgemm_util.c
31 | 
32 | CPP_SRC=
33 | # Object names are the source names with .o appended (foo.c -> foo.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Every compile also writes a dependency file named after its object with .d appended.
38 | MKDEP_OPT = -MMD -MF $@.d
39 | 
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 | 
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 | 
45 | .PHONY: all clean
46 | 
47 | all: $(BLISLAB_TEST_EXE)
48 | 
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | 
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 | 
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 | 
63 | %.cpp.o: %.cpp
64 | 	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | 
66 | %.S.o: %.S
67 | 	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
68 | # ---------------------------------------------------------------------------
69 | # NOTE(review): SHAREDLIBBLISLAB is not set in this Makefile, so it adds nothing to the rm list below.
70 | clean:
71 | 	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/sgemm/step9/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
  1 | # |<-- k=4 -->|
  2 | # +++++++++++++ -           f0, f4, f0, f4
  3 | # +           + |           f1, f5, f1, f5
  4 | # +     A     + mr=4        f2, f6, f2, f6
  5 | # +           + |           f3, f7, f3, f7
  6 | # +++++++++++++ -       
  7 |                        
  8 | 
  9 | # |<--  nr=16  -->|
 10 | # +++++++++++++++++ -       v0 v1 v2 v3
 11 | # +               + |       v4 v5 v6 v7
 12 | # +       B       + k=4     v8,v9,v10,v11     
 13 | # +               + |       v12,v13,v14,v15
 14 | # +++++++++++++++++ -
 15 | 
 16 | 
 17 | # |<--    nr    -->|
 18 | # +++++++++++++++++ -       
 19 | # +               + |       v16,v17,v18,v19
 20 | # +       C       + mr=4    v20,v21,v22,v23
 21 | # +               + |       v24,v25,v26,v27
 22 | # +++++++++++++++++ -       v28,v29,v30,v31
 23 | #
 24 | 
 25 | # void RvvSgemm4x16(size_t nr,         // nr <= 16, a0
 26 | #                   size_t mr,         // mr <= 4,  a1
 27 | #                   size_t k,          // astride = k*sizeof(float), a2
 28 | #                   const float* a,    // mr * k,   a3
 29 | #                   const float* b,    // k * 16,   a4
 30 | #                   float* c,          // mr * nr,  a5
 31 | #                   size_t c_stride,  // Len(N) * sizeof(float), a6
 32 | #                   const float* bias  // bias, a7
 33 | #                   );
 34 | .global RvvSgemm4x16
 35 | .type RvvSgemm4x16, @function
 36 | 
 37 | #define nr a0
 38 | #define mr a1
 39 | #define k  a2
 40 | #define ap a3
 41 | #define bp a4
 42 | #define cp a5
 43 | #define c_stride a6
 44 | #define bias a7
 45 | # Names above are the eight integer arguments (a0-a7); names below are working registers.
 46 | #define ap0 t0
 47 | # Row pointers into C: cp0 (s5) is row 0, cp1-cp3 (t3-t5) are rows 1-3.
 48 | #define cp1 t3
 49 | #define cp2 t4
 50 | #define cp3 t5
 51 | #define a_stride s0
 52 | #define bp0      s1
 53 | #define biasp    s2
 54 | #define kt       s3
 55 | #define cp0      s5
 56 | #define bp1      s6
 57 | #define vl       s7
 58 | #define ap_offset       s10
 59 | #define cp_offset       s11
 60 | # Stack frame size in bytes; the prologue and epilogue keep s0-s11 at 8(sp) .. 96(sp).
 61 | #define FRAMESIZE 104
 62 | 
 63 | RvvSgemm4x16:               # C tile (4 rows x vl floats) = bias + packed A panel * packed B panel
 64 |     addi sp, sp, -FRAMESIZE # callee update stack pointer
 65 |     sd s0, 96(sp)           # callee saved frame pointer
 66 |     addi s0, sp, FRAMESIZE  # generate new frame pointer
 67 |     sd s1, -16(s0)          # = 88(sp); the slots mirror the ld sequence at .end
 68 |     sd s2, -24(s0)
 69 |     sd s3, -32(s0)
 70 |     sd s4, -40(s0)
 71 |     sd s5, -48(s0)
 72 |     sd s6, -56(s0)
 73 |     sd s7, -64(s0)
 74 |     sd s8, -72(s0)
 75 |     sd s9, -80(s0)
 76 |     sd s10, -88(s0)
 77 |     sd s11, -96(s0)
 78 | 
 79 |     li ap_offset, 0
 80 |     li cp_offset, 0         # stays 0; added to cp0 right before the stores
 81 |     slli a_stride, k, 2     # astride = k * sizeof(float); overwrites s0, the epilogue only uses sp
 82 |     mv s3, nr
 83 |     vsetvli s2, s3, e32, m4 # 32-bit elements, LMUL = 4, requested length = nr
 84 |     mv ap0, ap
 85 |     mv bp0, bp
 86 |     mv cp0, cp
 87 | .a1_offset:                 # build cp1..cp3; a row index >= mr reuses the previous row pointer
 88 |     mv cp1, cp0
 89 |     slti t6, mr, 2          # mr < 2
 90 |     bnez t6, .a2_offset
 91 |     add cp1, cp0, c_stride
 92 | .a2_offset:
 93 |     mv cp2, cp1
 94 |     slti t6, mr, 3          # mr < 3
 95 |     bnez t6, .a3_offset
 96 |     add cp2, cp1, c_stride
 97 | .a3_offset:
 98 |     mv cp3, cp2
 99 |     slti t6, mr, 4          # mr < 4
100 |     bnez t6, .start
101 |     add cp3, cp2, c_stride
102 | 
103 | .start:
104 |     mv biasp, bias
105 |     mv kt, k
106 |     beqz mr, .end           # no rows: nothing to compute or store
107 | 
108 |     vlw.v v16, (biasp)      # all four accumulator rows start from the same bias vector
109 |     vlw.v v20, (biasp)
110 |     vlw.v v24, (biasp)
111 |     vlw.v v28, (biasp)
112 |     addi biasp, biasp, 64
113 |     slti t6, kt, 4               # kt < 4, t6 = 1
114 |     bnez t6, .k2_tail
115 |     // Read-ahead loads: the loaded values are never used. They target fa0-fa7, which
116 |     // are caller-saved, so no callee-saved FP register (fs0-fs11) is modified here.
117 |     flw fa0, 64(ap0)          # pre-load A
118 |     flw fa1, 128(ap0)          # pre-load A
119 |     flw fa2, 192(ap0)          # pre-load A
120 |     flw fa3, 256(ap0)          # pre-load A
121 |     // NOTE(review): these read up to 256 B past ap0 and 704 B past bp0 -- confirm the buffers allow it.
122 |     flw fa4, 512(bp0)       # pre-load B
123 |     flw fa5, 576(bp0)       # pre-load B
124 |     flw fa6, 640(bp0)       # pre-load B
125 |     flw fa7, 704(bp0)       # pre-load B
126 | 
127 |     // load the 4 A values of the first k step (4 consecutive floats, one per row)
128 |     flw ft0, (ap0)
129 |     addi ap0, ap0, 4
130 |     flw ft1, (ap0)
131 |     addi ap0, ap0, 4
132 |     flw ft2, (ap0)
133 |     addi ap0, ap0, 4
134 |     flw ft3, (ap0)
135 |     addi ap0, ap0, 4
136 |     // load the matching row of B; consecutive rows of B are 64 bytes apart
137 |     vlw.v v0, (bp0)
138 |     addi bp0,bp0,64
139 | 
140 |     addi kt, kt, -4         # Decrement k counter
141 |     slti t6, kt, 4          # kt < 4
142 |     bnez t6, .k4_tail       # jump to k4_tail
143 | 
144 | .k4_main:                   # 4 k steps per pass, loads for the next step interleaved with the fma
145 |     addi kt, kt, -4         # Decrement k counter
146 |     // first group of 16 fma, second group load
147 |     vfmacc.vf v16,  ft0, v0
148 |     vlw.v v4, (bp0)         # next B row -> v4
149 |     flw fa4, 384(bp0)       # pre-load B
150 |     addi bp0,bp0,64
151 |     vfmacc.vf v20,  ft1, v0
152 |     flw ft4, (ap0)          # next a0 -> ft4
153 |     addi ap0, ap0, 4
154 |     vfmacc.vf v24,  ft2, v0
155 |     flw ft5, (ap0)          # next a1 -> ft5
156 |     addi ap0, ap0, 4
157 |     vfmacc.vf v28,  ft3, v0
158 |     flw ft6, (ap0)          # next a2 -> ft6
159 |     addi ap0, ap0, 4
160 |     flw ft7, (ap0)          # next a3 -> ft7
161 |     addi ap0, ap0, 4
162 |     // second group of 16 fma, third group load
163 |     slti t6, kt, 4              # kt < 4, t6 = 1 (loop exit flag, tested at the bottom)
164 |     vfmacc.vf v16,  ft4, v4
165 |     vlw.v v8, (bp0)         # next B row -> v8
166 |     flw fa5, 384(bp0)       # pre-load B
167 |     addi bp0,bp0,64
168 |     vfmacc.vf v20,  ft5, v4
169 |     flw ft0, (ap0)          # next a0 -> ft0
170 |     addi ap0, ap0, 4
171 |     vfmacc.vf v24,  ft6, v4
172 |     flw ft1, (ap0)          # next a1 -> ft1
173 |     addi ap0, ap0, 4
174 |     vfmacc.vf v28,  ft7, v4
175 |     flw ft2, (ap0)          # next a2 -> ft2
176 |     addi ap0, ap0, 4
177 |     flw ft3, (ap0)          # next a3 -> ft3
178 |     addi ap0, ap0, 4
179 |     // third group of 16 fma, fourth group load
180 |     vfmacc.vf v16,  ft0, v8
181 |     vlw.v v12, (bp0)         # next B row -> v12
182 |     flw fa6, 384(bp0)       # pre-load B
183 |     addi bp0,bp0,64
184 |     vfmacc.vf v20,  ft1, v8
185 |     flw ft4, (ap0)          # next a0 -> ft4
186 |     addi ap0, ap0, 4
187 |     vfmacc.vf v24,  ft2, v8
188 |     flw ft5, (ap0)          # next a1 -> ft5
189 |     addi ap0, ap0, 4
190 |     vfmacc.vf v28,  ft3, v8
191 |     flw ft6, (ap0)          # next a2 -> ft6
192 |     addi ap0, ap0, 4
193 |     flw ft7, (ap0)          # next a3 -> ft7
194 |     addi ap0, ap0, 4
195 |     // fourth group of 16 fma, first group load
196 |     vfmacc.vf v16,  ft4, v12
197 |     vlw.v v0, (bp0)         # next B row -> v0
198 |     flw fa7, 384(bp0)       # pre-load B
199 |     addi bp0,bp0,64
200 |     vfmacc.vf v20,  ft5, v12
201 |     flw ft0, (ap0)          # next a0 -> ft0
202 |     addi ap0, ap0, 4
203 |     vfmacc.vf v24,  ft6, v12
204 |     flw ft1, (ap0)          # next a1 -> ft1
205 |     addi ap0, ap0, 4
206 |     vfmacc.vf v28,  ft7, v12
207 |     flw ft2, (ap0)          # next a2 -> ft2
208 |     addi ap0, ap0, 4
209 |     flw ft3, (ap0)          # next a3 -> ft3
210 |     addi ap0, ap0, 4
211 | 
212 |     flw fa0, 64(ap0)          # pre-load A
213 |     flw fa1, 128(ap0)          # pre-load A
214 |     flw fa2, 192(ap0)          # pre-load A
215 |     flw fa3, 256(ap0)          # pre-load A
216 | 
217 |     beqz t6, .k4_main       # at least 4 more k steps remain after the group already loaded
218 | .k4_tail:                   # last full group of 4 k steps; its first step is already loaded
219 |     // first group of 16 fma, second group load
220 |     vfmacc.vf v16,  ft0, v0
221 |     vlw.v v4, (bp0)         # next B row -> v4
222 |     addi bp0,bp0,64
223 |     vfmacc.vf v20,  ft1, v0
224 |     flw ft4, (ap0)          # next a0 -> ft4
225 |     addi ap0, ap0, 4
226 |     vfmacc.vf v24,  ft2, v0
227 |     flw ft5, (ap0)          # next a1 -> ft5
228 |     addi ap0, ap0, 4
229 |     vfmacc.vf v28,  ft3, v0
230 |     flw ft6, (ap0)          # next a2 -> ft6
231 |     addi ap0, ap0, 4
232 |     flw ft7, (ap0)          # next a3 -> ft7
233 |     addi ap0, ap0, 4
234 |     // second group of 16 fma, third group load
235 |     vfmacc.vf v16,  ft4, v4
236 |     vlw.v v8, (bp0)         # next B row -> v8
237 |     addi bp0,bp0,64
238 |     vfmacc.vf v20,  ft5, v4
239 |     flw ft0, (ap0)          # next a0 -> ft0
240 |     addi ap0, ap0, 4
241 |     vfmacc.vf v24,  ft6, v4
242 |     flw ft1, (ap0)          # next a1 -> ft1
243 |     addi ap0, ap0, 4
244 |     vfmacc.vf v28,  ft7, v4
245 |     flw ft2, (ap0)          # next a2 -> ft2
246 |     addi ap0, ap0, 4
247 |     flw ft3, (ap0)          # next a3 -> ft3
248 |     addi ap0, ap0, 4
249 |     // third group of 16 fma, fourth group load
250 |     vfmacc.vf v16,  ft0, v8
251 |     vlw.v v12, (bp0)         # next B row -> v12
252 |     addi bp0,bp0,64
253 |     vfmacc.vf v20,  ft1, v8
254 |     flw ft4, (ap0)          # next a0 -> ft4
255 |     addi ap0, ap0, 4
256 |     vfmacc.vf v24,  ft2, v8
257 |     flw ft5, (ap0)          # next a1 -> ft5
258 |     addi ap0, ap0, 4
259 |     vfmacc.vf v28,  ft3, v8
260 |     flw ft6, (ap0)          # next a2 -> ft6
261 |     addi ap0, ap0, 4
262 |     flw ft7, (ap0)          # next a3 -> ft7
263 |     addi ap0, ap0, 4
264 |     // fourth group of 16 fma, no group load
265 |     vfmacc.vf v16,  ft4, v12
266 |     vfmacc.vf v20,  ft5, v12
267 |     vfmacc.vf v24,  ft6, v12
268 |     vfmacc.vf v28,  ft7, v12
269 | .k2_tail:                   # kt is the remainder (0..3) here and is not changed below
270 |     slti t6, kt, 2          # kt < 2
271 |     bnez t6, .k1_tail
272 |     flw ft0, (ap0)
273 |     addi ap0, ap0, 4
274 |     vlw.v v0, (bp0)
275 |     addi bp0,bp0,64
276 |     flw ft1, (ap0)
277 |     addi ap0, ap0, 4
278 |     flw ft2, (ap0)
279 |     addi ap0, ap0, 4
280 |     flw ft3, (ap0)
281 |     addi ap0, ap0, 4
282 |     // first group of 16 fma, second group load
283 |     vfmacc.vf v16,  ft0, v0
284 |     vlw.v v4, (bp0)         # next B row -> v4
285 |     addi bp0,bp0,64
286 |     vfmacc.vf v20,  ft1, v0
287 |     flw ft4, (ap0)          # next a0 -> ft4
288 |     addi ap0, ap0, 4
289 |     vfmacc.vf v24,  ft2, v0
290 |     flw ft5, (ap0)          # next a1 -> ft5
291 |     addi ap0, ap0, 4
292 |     vfmacc.vf v28,  ft3, v0
293 |     flw ft6, (ap0)          # next a2 -> ft6
294 |     addi ap0, ap0, 4
295 |     flw ft7, (ap0)          # next a3 -> ft7
296 |     addi ap0, ap0, 4
297 |     // second group of 16 fma, no further load
298 |     vfmacc.vf v16,  ft4, v4
299 |     vfmacc.vf v20,  ft5, v4
300 |     vfmacc.vf v24,  ft6, v4
301 |     vfmacc.vf v28,  ft7, v4
302 | .k1_tail:
303 |     andi t6, kt, 1          # one step is left only for an odd remainder (1 or 3);
304 |     beqz t6, .store_tile    # a remainder of 2 was fully handled by .k2_tail
305 |     flw ft0, (ap0)
306 |     addi ap0, ap0, 4
307 |     vlw.v v0, (bp0)
308 |     addi bp0,bp0,64
309 |     flw ft1, (ap0)
310 |     addi ap0, ap0, 4
311 |     flw ft2, (ap0)
312 |     addi ap0, ap0, 4
313 |     flw ft3, (ap0)
314 |     addi ap0, ap0, 4
315 |     vfmacc.vf v16,  ft0, v0
316 |     vfmacc.vf v20,  ft1, v0
317 |     vfmacc.vf v24,  ft2, v0
318 |     vfmacc.vf v28,  ft3, v0
319 | .store_tile:                # rows are written last to first
320 |     add cp0, cp0, cp_offset
321 |     vsw.v v28, (cp3)
322 |     addi cp3, cp3, 64
323 |     // For mr < 4 the extra row pointers alias a valid row; the valid row is stored after them.
324 |     vsw.v v24, (cp2)
325 |     addi cp2, cp2, 64
326 | 
327 |     vsw.v v20, (cp1)
328 |     addi cp1, cp1, 64
329 | 
330 |     vsw.v v16, (cp0)
331 |     addi cp0, cp0, 64
332 |     j .end
333 | 
334 | .end:                       # restore s0-s11 from the slots written in the prologue
335 |     ld s0, 96(sp)
336 |     ld s1, 88(sp)
337 |     ld s2, 80(sp)
338 |     ld s3, 72(sp)
339 |     ld s4, 64(sp)
340 |     ld s5, 56(sp)
341 |     ld s6, 48(sp)
342 |     ld s7, 40(sp)
343 |     ld s8, 32(sp)
344 |     ld s9, 24(sp)
345 |     ld s10, 16(sp)
346 |     ld s11, 8(sp)
347 |     addi sp, sp, FRAMESIZE
348 |     ret
349 | 


--------------------------------------------------------------------------------
/sgemm/step9/bl_config.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * --------------------------------------------------------------------------
 3 |  * BLISLAB 
 4 |  * --------------------------------------------------------------------------
 5 |  * Copyright (C) 2016, The University of Texas at Austin
 6 |  *
 7 |  * Redistribution and use in source and binary forms, with or without
 8 |  * modification, are permitted provided that the following conditions are
 9 |  * met:
10 |  *  - Redistributions of source code must retain the above copyright
11 |  *    notice, this list of conditions and the following disclaimer.
12 |  *  - Redistributions in binary form must reproduce the above copyright
13 |  *    notice, this list of conditions and the following disclaimer in the
14 |  *    documentation and/or other materials provided with the distribution.
15 |  *  - Neither the name of The University of Texas nor the names of its
16 |  *    contributors may be used to endorse or promote products derived
17 |  *    from this software without specific prior written permission.
18 |  *
19 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |  *
31 |  *
32 |  * bl_config.h
33 |  *
34 |  *
35 |  * Purpose:
36 |  * this header file contains configuration parameters.
37 |  *
38 |  * Todo:
39 |  *
40 |  *
41 |  * Modification:
42 |  *
43 |  * 
44 |  * */
45 | 
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 | 
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | 
56 | #define DGEMM_MR 4    // row step of the C tiling loop in my_sgemm.c (rows per kernel call)
57 | #define DGEMM_NR 16   // column step of the C tiling loop in my_sgemm.c (columns per kernel call)
58 | 
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif
65 | 
66 | 


--------------------------------------------------------------------------------
/sgemm/step9/my_sgemm.c:
--------------------------------------------------------------------------------
 1 | #include "bl_sgemm.h"
 2 | #include "bl_config.h"
 3 | 
 4 | extern void RvvSgemm4x16( size_t nr,         // columns in this tile, nr <= 16
 5 |                           size_t mr,         // rows in this tile, mr <= 4
 6 |                           size_t k,          // inner dimension; astride = k*sizeof(float)
 7 |                           const float* a,    // packed A panel for this tile (mr * k values)
 8 |                           const float* b,    // packed B panel for this tile (k * 16 values)
 9 |                           float* c,          // first element of the output tile (mr * nr values)
10 |                           size_t cn_stride,  // bytes between consecutive rows of c: Len(N) * sizeof(float)
11 |                           const float* bias  // initial value of every output row; the caller passes zeros
12 |                         );
13 | 
14 | 
15 | static inline void PackInputLayout(float* dst, const float* src, int m, int k, int mr) {
16 |     // Copies src (m rows, row stride k) into dst as m / mr panels of mr rows: inside a panel the
17 |     // mr values of one column are adjacent. Rows past the last full panel (m % mr) are not copied.
18 |     // `static` gives this inline function a definition in this file, so un-inlined calls still link.
19 |     for ( int panel = 0; panel < m / mr; panel ++ ) {
20 |         const float* base = src + panel * mr * k;       // first row of this panel
21 |         for ( int col = 0; col < k; col ++ )
22 |             for ( int p = 0; p < mr; p ++ )
23 |                 *dst ++ = base[ p * k + col ];          // row p, column col of the panel
24 |     }
25 | }
26 | 
27 | 
28 | void bl_sgemm_pack(
29 |     int    m,         // rows of C (and of A)
30 |     int    mr,        // rows per packed panel of A
31 |     int    n,         // columns of C
32 |     int    nr,        // not read here; the column tiling uses DGEMM_NR
33 |     int    k,         // shared (inner) dimension
34 |     float *A,         // source of the packing step
35 |     float *packA,     // receives the packed copy of A, then feeds the kernel
36 |     int    lda,       // not read in this step
37 |     float *B,         // not read in this step; the kernel consumes packB
38 |     float *packB,     // packed B; the panel for column j starts at packB[j * k]
39 |     int    ldb,       // not read in this step
40 |     float *C,         // must be aligned
41 |     int    ldc        // ldc must also be aligned
42 | )
43 | {
44 |     // Pack A once, then sweep C tile by tile with the RVV micro-kernel.
45 |     int row, col;
46 | 
47 |     // Nothing to compute when any dimension is empty.
48 |     if ( !( m && n && k ) ) {
49 |         printf( "bl_sgemm(): early return\n" );
50 |         return;
51 |     }
52 |     // Interleave A into panels of mr rows; only the m / mr full panels are written.
53 |     PackInputLayout(packA, A, m, k, mr);
54 |     // NOTE(review): the tiling below steps by DGEMM_MR; presumably mr must equal it -- confirm.
55 |     // printf("[A]\n");
56 |     // for(int i = 0; i < m; i++) {
57 |     //   for(int j = 0; j < k; j++) {
58 |     //     printf("%.0f\t", A[i * k + j]);
59 |     //   }
60 |     //   printf("\n");
61 |     // }
62 |     // printf("[packA]\n");
63 |     // for(int i = 0; i < m; i++) {
64 |     //   for(int j = 0; j < k; j++) {
65 |     //     printf("%.0f\t", packA[i * k + j]);
66 |     //   }
67 |     //   printf("\n");
68 |     // }
69 | 
70 |     // All-zero bias buffer handed to the kernel on every call.
71 |     float bias[800] = { 0 };
72 | 
73 |     for ( col = 0; col < n; col += DGEMM_NR ) {         // column panels of C
74 |         const int left_n = n - col;
75 |         const int nb = ( left_n < DGEMM_NR ) ? left_n : DGEMM_NR;   // clip the last panel
76 | 
77 |         for ( row = 0; row < m; row += DGEMM_MR ) {     // row panels of C
78 |             const int left_m = m - row;
79 |             const int mb = ( left_m < DGEMM_MR ) ? left_m : DGEMM_MR;   // clip the last panel
80 | 
81 |             RvvSgemm4x16(   nb,                 // nr <= 16, a0
82 |                             mb,                 // mr <= 4,  a1
83 |                             k,                  // astride = k*sizeof(float), a2
84 |                             &packA[row * k],    // mr * k,   a3
85 |                             &packB[col * k],    // k * 16,   a4
86 |                             &C( row, col ),     // mr * nr,  a5
87 |                             n * sizeof(float),  // Len(N) * sizeof(float), a6
88 |                             bias
89 |                             );
90 |         }                                       // end of row-panel loop
91 |     }                                           // end of column-panel loop
92 | }
93 | 


--------------------------------------------------------------------------------
/sgemm/step9/run.sh:
--------------------------------------------------------------------------------
1 | make &&                                   # build; the chain stops here if the build fails
2 | adb push test_bl_sgemm_step9.x ./. &&     # copy the test binary to the adb target path ./.
3 | adb shell "./test_bl_sgemm_step9.x"       # run the binary through adb shell


--------------------------------------------------------------------------------