├── .gitignore
├── LICENSE
├── README.md
├── data
├── plt.py
├── result_0.png
├── result_1.png
├── result_2.png
├── result_3.png
├── result_4.png
├── result_5.png
├── result_6.png
├── result_7.png
├── result_8.png
├── result_9.png
└── result_all.png
├── pics
├── GEMM.png
├── gemm_block.png
├── riscv.gif
├── roofline.png
├── step0.gif
├── step1.gif
├── step2.gif
├── step3.gif
├── step4.gif
└── step5.gif
├── prepare
├── 0.hello_world
│ ├── Makefile
│ └── hello_world.c
├── 1.memory_copy
│ ├── Makefile
│ ├── main.c
│ └── memcpy.S
├── 2.memcpy_bandwidth_test
│ ├── Makefile
│ └── mbw.c
├── 3.flw_bandwidth_test
│ ├── Makefile
│ ├── load_flw.S
│ └── load_test.c
├── 4.vlw_bandwidth_test
│ ├── Makefile
│ ├── load_test.c
│ └── load_vlw.S
├── 5.saxpy
│ ├── Makefile
│ ├── main.c
│ └── saxpy.S
├── README.md
└── imgs
│ └── memory_bandwidth_test.png
└── sgemm
├── common
├── bl_sgemm.h
├── bl_sgemm_ref.c
├── bl_sgemm_util.c
├── test_bl_sgemm.c
├── test_bl_sgemm_packB_4x16.c
└── test_bl_sgemm_packB_4x4.c
├── step0
├── Makefile
└── my_sgemm.c
├── step1
├── Makefile
└── my_sgemm.c
├── step2
├── Makefile
├── bl_config.h
└── my_sgemm.c
├── step3
├── Makefile
├── bl_config.h
└── my_sgemm.c
├── step4
├── Makefile
├── bl_config.h
└── my_sgemm.c
├── step5
├── Makefile
├── bl_config.h
└── my_sgemm.c
├── step6
├── Makefile
├── RvvSgemm4x16.S
├── bl_config.h
├── my_sgemm.c
└── run.sh
├── step7
├── Makefile
├── RvvSgemm4x16.S
├── bl_config.h
├── my_sgemm.c
└── run.sh
├── step8
├── Makefile
├── RvvSgemm4x16.S
├── bl_config.h
├── my_sgemm.c
└── run.sh
└── step9
├── Makefile
├── RvvSgemm4x16.S
├── bl_config.h
├── my_sgemm.c
└── run.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.d
3 | *.x
4 | *.xlsx
5 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Andy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sgemm_riscv
2 | 
3 | [](./LICENSE)
4 | [](./LICENSE)
5 |
6 | [](https://github.com/Zhao-Dongyu/sgemm_riscv/network)
7 | [](https://github.com/Zhao-Dongyu/sgemm_riscv/stargazers)
8 | [](https://github.com/Zhao-Dongyu/sgemm_riscv/issues)
9 |
10 | ---
11 |
12 |
13 |
14 | [RISC-V](https://riscv.org/) is an open standard Instruction Set Architecture (ISA) enabling a new era of processor innovation through open collaboration.
15 |
16 | ---
17 |
18 |
19 |
20 | [GEMM](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3) General matrix multiply, one of the Basic Linear Algebra Subprograms.
21 |
22 | ---
23 |
24 |
25 |
26 | This project records the process of optimizing SGEMM (single-precision floating point General Matrix Multiplication) on the riscv platform.
27 |
28 | ---
29 |
30 | To get started, please refer to Section [Usage](#usage)
31 |
32 | Related tutorials are located on the [wiki](https://github.com/Zhao-Dongyu/sgemm_riscv/wiki).
33 |
34 |
35 | ## Blislab
36 |
37 | [Blislab](https://github.com/flame/blislab) is an open source teaching project that teaches you step-by-step optimization of matrix multiplication.
38 |
39 | Building on the blislab project, [surez-ok](https://github.com/surez-ok/blislab_riscv) made some deletions and optimizations (the code is stripped down to the simplest form and only supports x86 or RISC-V Linux), so the project is clearer and easier to get started with.
40 |
41 | ## Project structure
42 |
43 | .
44 | ├── data
45 | ├── pics
46 | ├── prepare
47 | │ ├── 0.hello_world
48 | │ ├── 1.memory_copy
49 | │ ├── 2.memcpy_bandwidth_test
50 | │ ├── 3.flw_bandwidth_test
51 | │ ├── 4.vlw_bandwidth_test
52 | │ ├── 5.saxpy
53 | │ └── imgs
54 | └── sgemm
55 | ├── common
56 | ├── step0
57 | ├── step1
58 | ├── step2
59 | ├── step3
60 | ├── step4
61 | ├── step5
62 | ├── step6
63 | ├── step7
64 | ├── step8
65 | └── step9
66 |
67 | In the `prepare` folder, I compiled some tutorials and demos for hardware performance testing.
68 |
69 | In the `sgemm` folder, `step0` to `step9` are my experiments.
70 |
71 | See [wiki](https://github.com/Zhao-Dongyu/sgemm_riscv/wiki) for more details.
72 |
73 | ## Installation
74 |
75 | You need to download a RISC-V cross-compilation toolchain.
76 |
77 | The development board I use is the Nezha D1; the toolchain for it can be downloaded from [here](https://xuantie.t-head.cn/community/download?id=4090445921563774976).
78 |
79 |
80 | ## Usage
81 |
82 | Take `step1` as an example
83 |
84 | > You need to modify the Makefile and configure CROSS_COMPILE in the first few lines of the Makefile as the correct cross-compiler
85 |
86 | ```shell
87 | $ cd step1
88 | $ make
89 | $ adb push test_bl_sgemm_step1.x ./.
90 | $ adb shell './test_bl_sgemm_step1.x'
91 | ```
92 |
93 | ## Acknowledgement
94 |
95 | - [BLISlab: A Sandbox for Optimizing GEMM](https://github.com/flame/blislab)
96 |
97 | This project introduced me to how to optimize GEMM
98 |
99 | - [riscv平台优化矩阵乘(基于blislab优化实践)](https://github.com/surez-ok/blislab_riscv)
100 |
101 | I conduct experiments and exploration based on this project
102 |
103 | - Thanks to Mr. Ding for your guidance.
104 |
105 | ## Support
106 |
107 | zhaodongyu1024@gmail.com
108 |
109 | ## License
110 |
111 | [MIT License](./LICENSE)
112 |
--------------------------------------------------------------------------------
/data/plt.py:
--------------------------------------------------------------------------------
1 | # pip install pandas matplotlib openpyxl
2 |
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 |
6 | df = pd.read_excel('sgemm_riscv.xlsx', index_col=0)
7 | # 绘制多行数据
8 | plt.plot(df.loc['version 0'], label='version 0')
9 | plt.plot(df.loc['version 1'], label='version 1')
10 | plt.plot(df.loc['version 2'], label='version 2')
11 | plt.plot(df.loc['version 3'], label='version 3')
12 | plt.plot(df.loc['version 4'], label='version 4')
13 | plt.plot(df.loc['version 5'], label='version 5')
14 | plt.plot(df.loc['version 6'], label='version 6')
15 | plt.plot(df.loc['version 7'], label='version 7')
16 | plt.plot(df.loc['version 8'], label='version 8')
17 | plt.plot(df.loc['version 9'], label='version 9')
18 |
19 | plt.title("Sgemm on Nezha D1")
20 | plt.xlabel("M=N=K")
21 | plt.ylabel("GFLOPS")
22 |
23 | plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)
24 |
25 | plt.tight_layout()
26 |
27 | plt.savefig('result.png', dpi=300)
28 |
29 | plt.show()
--------------------------------------------------------------------------------
/data/result_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_0.png
--------------------------------------------------------------------------------
/data/result_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_1.png
--------------------------------------------------------------------------------
/data/result_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_2.png
--------------------------------------------------------------------------------
/data/result_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_3.png
--------------------------------------------------------------------------------
/data/result_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_4.png
--------------------------------------------------------------------------------
/data/result_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_5.png
--------------------------------------------------------------------------------
/data/result_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_6.png
--------------------------------------------------------------------------------
/data/result_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_7.png
--------------------------------------------------------------------------------
/data/result_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_8.png
--------------------------------------------------------------------------------
/data/result_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_9.png
--------------------------------------------------------------------------------
/data/result_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/data/result_all.png
--------------------------------------------------------------------------------
/pics/GEMM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/GEMM.png
--------------------------------------------------------------------------------
/pics/gemm_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/gemm_block.png
--------------------------------------------------------------------------------
/pics/riscv.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/riscv.gif
--------------------------------------------------------------------------------
/pics/roofline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/roofline.png
--------------------------------------------------------------------------------
/pics/step0.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step0.gif
--------------------------------------------------------------------------------
/pics/step1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step1.gif
--------------------------------------------------------------------------------
/pics/step2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step2.gif
--------------------------------------------------------------------------------
/pics/step3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step3.gif
--------------------------------------------------------------------------------
/pics/step4.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step4.gif
--------------------------------------------------------------------------------
/pics/step5.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/pics/step5.gif
--------------------------------------------------------------------------------
/prepare/0.hello_world/Makefile:
--------------------------------------------------------------------------------
1 | # Cross-toolchain prefix and install location; adjust CCL for your machine.
2 | CTOOL := riscv64-unknown-linux-gnu-
3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
4 | CC := ${CCL}/bin/${CTOOL}gcc
5 |
6 | # clean produces no file, so it must always run when requested.
7 | .PHONY: clean
8 |
9 | hello_world:hello_world.c
10 | 	${CC} -o hello_world hello_world.c
11 |
12 | # -f keeps clean from failing when the binary was never built.
13 | clean:
14 | 	rm -f hello_world
15 |
--------------------------------------------------------------------------------
/prepare/0.hello_world/hello_world.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | /* Toolchain/board smoke test: print a greeting and exit with status 0. */
4 | int main(int argc, char const *argv[])
5 | {
6 |     printf("Hello NeZha\n");
7 |     return 0;
8 | }
9 |
--------------------------------------------------------------------------------
/prepare/1.memory_copy/Makefile:
--------------------------------------------------------------------------------
1 | # Cross-toolchain prefix and install location; adjust CCL for your machine.
2 | CTOOL := riscv64-unknown-linux-gnu-
3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
4 | CC := ${CCL}/bin/${CTOOL}gcc
5 |
6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
7 |
8 | # clean produces no file, so it must always run when requested.
9 | .PHONY: clean
10 |
11 | test:main.o memcpy.o
12 | 	${CC} $(C_FLAGS) -o test main.o memcpy.o
13 |
14 | main.o:main.c
15 | 	${CC} $(C_FLAGS) -c main.c
16 |
17 | memcpy.o:memcpy.S
18 | 	${CC} $(C_FLAGS) -c memcpy.S
19 |
20 | # -f keeps clean from failing when some outputs were never built.
21 | clean:
22 | 	rm -f test main.o memcpy.o
--------------------------------------------------------------------------------
/prepare/1.memory_copy/main.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | /* Declared by hand so the call is meant to bind to the memcpy object linked
4 |  * next to main.o (the Makefile links memcpy.o built from memcpy.S). */
5 | extern void *memcpy(void* dest, const void* src, size_t n);
6 |
7 | /* Copy a 10-int array with memcpy and print the destination, tab separated. */
8 | int main(int argc, char const *argv[])
9 | {
10 |     int array_a[10] = {0,1,2,3,4,5,6,7,8,9};
11 |     int array_b[10] = {0}; /* {0} instead of {}: empty braces are not valid before C23 */
12 |     printf("Hello NeZha\n");
13 |     memcpy(array_b, array_a, 10 * sizeof(int));
14 |
15 |     for(int i = 0; i < 10; i ++) {
16 |         printf("%d\t", array_b[i]);
17 |     }
18 |     printf("\n");
19 |     return 0;
20 | }
--------------------------------------------------------------------------------
/prepare/1.memory_copy/memcpy.S:
--------------------------------------------------------------------------------
1 | .text
2 | .balign 4
3 | .global memcpy
4 | # void *memcpy(void* dest, const void* src, size_t n)
5 | # a0=dest, a1=src, a2=n
6 | # a0 is never written, so dest is still in a0 at ret; a3 is the moving store pointer.
7 | memcpy:
8 | mv a3, a0 # Working copy of dest
9 | loop:
10 | vsetvli t0, a2, e8 # 8-bit elements; t0 = element (byte) count granted for this pass
11 | vlb.v v0, (a1) # Load that many bytes from src
12 | add a1, a1, t0 # Advance src
13 | sub a2, a2, t0 # Remaining byte count
14 | vsb.v v0, (a3) # Store the bytes to dest
15 | add a3, a3, t0 # Advance dest
16 | bnez a2, loop # Repeat until nothing remains
17 | ret # Return
18 |
--------------------------------------------------------------------------------
/prepare/2.memcpy_bandwidth_test/Makefile:
--------------------------------------------------------------------------------
1 | # Cross-toolchain prefix and install location; adjust CCL for your machine.
2 | CTOOL := riscv64-unknown-linux-gnu-
3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
4 | CC := ${CCL}/bin/${CTOOL}gcc
5 |
6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
7 |
8 | # clean produces no file, so it must always run when requested.
9 | .PHONY: clean
10 |
11 | mbw:mbw.o
12 | 	${CC} $(C_FLAGS) -o mbw mbw.o
13 |
14 | # Explicit compile rule: make's built-in %.o rule uses CFLAGS, so without
15 | # this rule mbw.o would be compiled without C_FLAGS.
16 | mbw.o:mbw.c
17 | 	${CC} $(C_FLAGS) -c mbw.c
18 |
19 | # -f keeps clean from failing when some outputs were never built.
20 | clean:
21 | 	rm -f mbw mbw.o
--------------------------------------------------------------------------------
/prepare/2.memcpy_bandwidth_test/mbw.c:
--------------------------------------------------------------------------------
1 | /*
2 | * vim: ai ts=4 sts=4 sw=4 cinoptions=>4 expandtab
3 | */
4 | #define _GNU_SOURCE
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 |
17 | /* how many runs to average by default */
18 | #define DEFAULT_NR_LOOPS 10
19 |
20 | /* we have 3 tests at the moment */
21 | #define MAX_TESTS 3
22 |
23 | /* default block size for test 2, in bytes */
24 | #define DEFAULT_BLOCK_SIZE 262144
25 |
26 | /* test types */
27 | #define TEST_MEMCPY 0
28 | #define TEST_DUMB 1
29 | #define TEST_MCBLOCK 2
30 |
31 | /* version number */
32 | #define VERSION "1.4"
33 |
34 | /*
35 | * MBW memory bandwidth benchmark
36 | *
37 | * 2006, 2012 Andras.Horvath@gmail.com
38 | * 2013 j.m.slocum@gmail.com
39 | * (Special thanks to Stephen Pasich)
40 | *
41 | * http://github.com/raas/mbw
42 | *
43 | * compile with:
44 | * gcc -O -o mbw mbw.c
45 | *
46 | * run with eg.:
47 | *
48 | * ./mbw 300
49 | *
50 | * or './mbw -h' for help
51 | *
52 | * watch out for swap usage (or turn off swap)
53 | */
54 | /* Print the mbw version banner and the command-line option summary to stdout. */
55 | void usage()
56 | {
57 | printf("mbw memory benchmark v%s, https://github.com/raas/mbw\n", VERSION);
58 | printf("Usage: mbw [options] array_size_in_MiB\n");
59 | printf("Options:\n");
60 | printf(" -n: number of runs per test (0 to run forever)\n");
61 | printf(" -a: Don't display average\n");
62 | printf(" -t%d: memcpy test\n", TEST_MEMCPY);
63 | printf(" -t%d: dumb (b[i]=a[i] style) test\n", TEST_DUMB);
64 | printf(" -t%d: memcpy test with fixed block size\n", TEST_MCBLOCK);
65 | printf(" -b : block size in bytes for -t2 (default: %d)\n", DEFAULT_BLOCK_SIZE);
66 | printf(" -q: quiet (print statistics only)\n");
67 | printf("(will then use two arrays, watch out for swapping)\n");
68 | printf("'Bandwidth' is amount of data copied over the time this operation took.\n");
69 | printf("\nThe default is to run all tests available.\n");
70 | }
71 |
72 | /* ------------------------------------------------------ */
73 |
74 | /* allocate a test array and fill it with data
75 | * so as to force Linux to _really_ allocate it */
76 | long *make_array(unsigned long long asize)
77 | {
78 | unsigned long long t;
79 | unsigned int long_size=sizeof(long);
80 | long *a;
81 |
82 | a=calloc(asize, long_size);
83 |
84 | if(NULL==a) {
85 | perror("Error allocating memory");
86 | exit(1);
87 | }
88 |
89 | /* make sure both arrays are allocated, fill with pattern */
90 | for(t=0; t= block_size; t-=block_size, aa+=block_size){
123 | bb=(char *) memcpy(bb, aa, block_size) + block_size;
124 | }
125 | if(t) {
126 | bb=(char *) memcpy(bb, aa, t) + t;
127 | }
128 | gettimeofday(&endtime, NULL);
129 | } else if(type==TEST_DUMB) { /* dumb test */
130 | gettimeofday(&starttime, NULL);
131 | for(t=0; tMAX_TESTS-1) {
214 | printf("Error: test number must be between 0 and %d\n", MAX_TESTS-1);
215 | exit(1);
216 | }
217 | tests[testno]=1;
218 | break;
219 | case 'b': /* block size in bytes*/
220 | block_size=strtoull(optarg, (char **)NULL, 10);
221 | if(0>=block_size) {
222 | printf("Error: what block size do you mean?\n");
223 | exit(1);
224 | }
225 | break;
226 | case 'q': /* quiet */
227 | quiet=1;
228 | break;
229 | default:
230 | break;
231 | }
232 | }
233 |
234 | /* default is to run all tests if no specific tests were requested */
235 | if( (tests[0]+tests[1]+tests[2]) == 0) {
236 | tests[0]=1;
237 | tests[1]=1;
238 | tests[2]=1;
239 | }
240 |
241 | if( nr_loops==0 && ((tests[0]+tests[1]+tests[2]) != 1) ) {
242 | printf("Error: nr_loops can be zero if only one test selected!\n");
243 | exit(1);
244 | }
245 |
246 | if(optind=mt) {
254 | printf("Error: array size wrong!\n");
255 | exit(1);
256 | }
257 |
258 | /* ------------------------------------------------------ */
259 |
260 | long_size=sizeof(long); /* the size of long on this platform */
261 | asize=1024*1024/long_size*mt; /* how many longs then in one array? */
262 |
263 | if(asize*long_size < block_size) {
264 | printf("Error: array size larger than block size (%llu bytes)!\n", block_size);
265 | exit(1);
266 | }
267 |
268 | if(!quiet) {
269 | printf("Long uses %d bytes. ", long_size);
270 | printf("Allocating 2*%lld elements = %lld bytes of memory.\n", asize, 2*asize*long_size);
271 | if(tests[2]) {
272 | printf("Using %lld bytes as blocks for memcpy block copy test.\n", block_size);
273 | }
274 | }
275 |
276 | a=make_array(asize);
277 | b=make_array(asize);
278 |
279 | /* ------------------------------------------------------ */
280 | if(!quiet) {
281 | printf("Getting down to business... Doing %d runs per test.\n", nr_loops);
282 | }
283 |
284 | /* run all tests requested, the proper number of times */
285 | for(testno=0; testno
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | /*
8 |  * Load kernel implemented in assembly outside this file.
9 |  * Under the standard RISC-V calling convention the four arguments arrive
10 |  * in a0, a1, a2 and a3, in declaration order.
11 |  * NOTE(review): how the kernel interprets mc/nc is not visible here - confirm
12 |  * against the assembly source.
13 |  */
14 | extern void load(size_t mc,      // a0
15 |                  size_t nc,      // a1
16 |                  const float* a, // a2: buffer of mc * nc floats
17 |                  float* c        // a3: second buffer of mc * nc floats
18 |                  );
19 |
20 |
21 | /*
22 |  * Benchmark driver: run load() warmup_times times untimed, then time
23 |  * test_times calls with gettimeofday() and print the total in milliseconds.
24 |  * Returns 0 on success, 1 if a buffer could not be allocated.
25 |  */
26 | int main(int argc, char const *argv[])
27 | {
28 |     int warmup_times = 10;
29 |     int test_times = 1000;
30 |     int nc = 1024;
31 |     int mc = 1024;
32 |
33 |     float* a = (float*)malloc(mc * nc * sizeof(float));
34 |     float* c = (float*)malloc(mc * nc * sizeof(float));
35 |     if (a == NULL || c == NULL) {
36 |         /* Bail out instead of handing a NULL pointer to the kernel. */
37 |         perror("malloc");
38 |         free(a);
39 |         free(c);
40 |         return 1;
41 |     }
42 |
43 |     for(int i = 0; i < nc * mc; i++) a[i] = i;
44 |
45 |     struct timeval start;
46 |     struct timeval end;
47 |
48 |     // warmup
49 |     for (int i = 0; i < warmup_times; i++){
50 |         load(mc, nc, a, c);
51 |     }
52 |
53 |     gettimeofday(&start,NULL);
54 |     for (int i = 0; i < test_times; i++){
55 |         load(mc, nc, a, c);
56 |     }
57 |     gettimeofday(&end,NULL);
58 |
59 |     /* Elapsed microseconds; double keeps full precision where float would round. */
60 |     double time_use = (end.tv_sec - start.tv_sec) * 1000000.0
61 |                     + (end.tv_usec - start.tv_usec);
62 |     printf("time_use is %.3fms\n", time_use/1000);
63 |
64 |     free(a);
65 |     free(c);
66 |     return 0;
67 | }
68 |
--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/Makefile:
--------------------------------------------------------------------------------
1 | # Cross-toolchain prefix and install location; adjust CCL for your machine.
2 | CTOOL := riscv64-unknown-linux-gnu-
3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
4 | CC := ${CCL}/bin/${CTOOL}gcc
5 |
6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
7 |
8 | # clean produces no file, so it must always run when requested.
9 | .PHONY: clean
10 |
11 | test:load_test.o load_vlw.o
12 | 	${CC} $(C_FLAGS) -o test load_test.o load_vlw.o
13 |
14 | load_test.o:load_test.c
15 | 	${CC} $(C_FLAGS) -c load_test.c
16 |
17 | load_vlw.o:load_vlw.S
18 | 	${CC} $(C_FLAGS) -c load_vlw.S
19 |
20 | # -f keeps clean from failing when some outputs were never built.
21 | clean:
22 | 	rm -f test load_test.o load_vlw.o
--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/load_test.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | /*
8 |  * Vector-load kernel implemented in load_vlw.S.
9 |  * Register assignment follows that file's #define lines (mc=a0, nc=a1,
10 |  * ap=a2, cp=a3), which matches the standard RISC-V calling convention.
11 |  */
12 | extern void load(size_t mc,      // a0: inner-loop budget, reloaded every outer pass, reduced by 128 per inner iteration
13 |                  size_t nc,      // a1: number of outer passes
14 |                  const float* a, // a2: source buffer the kernel walks through
15 |                  float* c        // a3: not referenced by load_vlw.S
16 |                  );
17 |
18 |
19 | /*
20 |  * Benchmark driver: run load() warmup_times times untimed, then time
21 |  * test_times calls with gettimeofday() and print the total in milliseconds.
22 |  * Returns 0 on success, 1 if a buffer could not be allocated.
23 |  */
24 | int main(int argc, char const *argv[])
25 | {
26 |     int warmup_times = 10;
27 |     int test_times = 1000;
28 |     int nc = 1024;
29 |     int mc = 1024;
30 |
31 |     float* a = (float*)malloc(mc * nc * sizeof(float));
32 |     float* c = (float*)malloc(mc * nc * sizeof(float));
33 |     if (a == NULL || c == NULL) {
34 |         /* Bail out instead of handing a NULL pointer to the kernel. */
35 |         perror("malloc");
36 |         free(a);
37 |         free(c);
38 |         return 1;
39 |     }
40 |
41 |     for(int i = 0; i < nc * mc; i++) a[i] = i;
42 |
43 |     struct timeval start;
44 |     struct timeval end;
45 |
46 |     // warmup
47 |     for (int i = 0; i < warmup_times; i++){
48 |         load(mc, nc, a, c);
49 |     }
50 |
51 |     gettimeofday(&start,NULL);
52 |     for (int i = 0; i < test_times; i++){
53 |         load(mc, nc, a, c);
54 |     }
55 |     gettimeofday(&end,NULL);
56 |
57 |     /* Elapsed microseconds; double keeps full precision where float would round. */
58 |     double time_use = (end.tv_sec - start.tv_sec) * 1000000.0
59 |                     + (end.tv_usec - start.tv_usec);
60 |     printf("time_use is %.3fms\n", time_use/1000);
61 |
62 |     free(a);
63 |     free(c);
64 |     return 0;
65 | }
66 |
--------------------------------------------------------------------------------
/prepare/4.vlw_bandwidth_test/load_vlw.S:
--------------------------------------------------------------------------------
1 | /*************************************************
2 | *
3 | * Created by Aidget on 2022/11/30.
4 | * Copyright © 2022, developed by Midea AIIC
5 | *
6 | *************************************************/
7 |
8 | # void load(size_t mc, // a0: inner-loop budget, reloaded every loop1 pass, reduced by 128 per loop2 iteration
9 | # size_t nc, // a1: number of loop1 passes, decremented in place
10 | # const float* a, // a2: source buffer
11 | # float* c // a3: never referenced by this routine
12 | # );
13 | .global load
14 | .type load, @function
15 |
16 | #define mc a0
17 | #define nc a1
18 | #define ap a2
19 | #define cp a3
20 |
21 | #define mt t0
22 | #define nt t1
23 | #define ap0 t2
24 |
25 | load:
26 | mv t0, nc
27 | vsetvli t1, t0, e32, m8 # 32-bit elements, LMUL=8; the granted vl is written to t1
28 | mv mt, mc
29 | mv nt, nc # overwrites t1 (the vl result); nt is not read afterwards
30 | mv ap0, ap # ap0 walks the source and is not reset between loop1 passes
31 |
32 | .start:
33 | .loop1:
34 | mv mt, mc
35 | addi nc, nc, -1
36 | slti t6, nc, 0 # t6 = 1 once nc has gone negative
37 | bnez t6, .end
38 | .loop2:
39 | vlw.v v0, (ap0) # 16 unrolled loads follow, all into v0 (results are discarded)
40 | addi ap0, ap0, 32 # NOTE(review): step is 32 bytes; vlw.v may read more than that depending on vl - confirm overlap is intended
41 | vlw.v v0, (ap0)
42 | addi ap0, ap0, 32
43 | vlw.v v0, (ap0)
44 | addi ap0, ap0, 32
45 | vlw.v v0, (ap0)
46 | addi ap0, ap0, 32
47 | vlw.v v0, (ap0)
48 | addi ap0, ap0, 32
49 | vlw.v v0, (ap0)
50 | addi ap0, ap0, 32
51 | vlw.v v0, (ap0)
52 | addi ap0, ap0, 32
53 | vlw.v v0, (ap0)
54 | addi ap0, ap0, 32
55 |
56 | vlw.v v0, (ap0)
57 | addi ap0, ap0, 32
58 | vlw.v v0, (ap0)
59 | addi ap0, ap0, 32
60 | vlw.v v0, (ap0)
61 | addi ap0, ap0, 32
62 | vlw.v v0, (ap0)
63 | addi ap0, ap0, 32
64 | vlw.v v0, (ap0)
65 | addi ap0, ap0, 32
66 | vlw.v v0, (ap0)
67 | addi ap0, ap0, 32
68 | vlw.v v0, (ap0)
69 | addi ap0, ap0, 32
70 | vlw.v v0, (ap0)
71 | addi ap0, ap0, 32
72 |
73 | addi mt, mt, -128 # 16 loads x 32 bytes = 512 bytes = 128 floats per loop2 iteration
74 | slti t6, mt, 1 # t6 = 1 when mt < 1
75 | beqz t6, .loop2
76 |
77 | j .loop1
78 |
79 | .end:
80 | ret
81 |
--------------------------------------------------------------------------------
/prepare/5.saxpy/Makefile:
--------------------------------------------------------------------------------
1 | # Cross-toolchain prefix and install location; adjust CCL for your machine.
2 | CTOOL := riscv64-unknown-linux-gnu-
3 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
4 | CC := ${CCL}/bin/${CTOOL}gcc
5 |
6 | C_FLAGS := -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
7 |
8 | # clean produces no file, so it must always run when requested.
9 | .PHONY: clean
10 |
11 | test:main.o saxpy.o
12 | 	${CC} $(C_FLAGS) -o test saxpy.o main.o
13 |
14 | main.o:main.c
15 | 	${CC} $(C_FLAGS) -c main.c
16 |
17 | saxpy.o:saxpy.S
18 | 	${CC} $(C_FLAGS) -c saxpy.S
19 |
20 | # -f keeps clean from failing when some outputs were never built.
21 | clean:
22 | 	rm -f test main.o saxpy.o
--------------------------------------------------------------------------------
/prepare/5.saxpy/main.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | extern void saxpy(size_t n, const float a, const float *x, float *y);
7 | // # void
8 | // # saxpy(size_t n, const float a, const float *x, float *y)
9 | // # {
10 | // # size_t i;
11 | // # for (i=0; i 在 v0.9 之前,当未在 vsetvli 上指定这些标志时,它们默认为掩码未受干扰/尾部未受干扰
44 |
45 | `vsetvli t0, a2, e8`
46 |
47 | 这个例子中,初见`vsetvli`指令,a2是长度n。
48 |
49 | - 1st,a2 = 10 --> t0 = 8 --> a2 = 2
50 | - 2nd, a2 = 2 --> t0 = 2 --> a2 = 0 -->ret
51 |
52 | ## 2.memcpy bandwidth test
53 | 测试内存带宽的小脚本
54 | ```shell
55 | $ make
56 | $ adb push mbw ./.
57 | $ adb shell "./mbw 100"
58 | Long uses 8 bytes. Allocating 2*13107200 elements = 209715200 bytes of memory.
59 | Using 262144 bytes as blocks for memcpy block copy test.
60 | Getting down to business... Doing 10 runs per test.
61 | ...
62 | AVG Method: MEMCPY Elapsed: 0.09614 MiB: 100.00000 Copy: 1040.166 MiB/s
63 | ...
64 | AVG Method: DUMB Elapsed: 0.60301 MiB: 100.00000 Copy: 165.835 MiB/s
65 | ...
66 | AVG Method: MCBLOCK Elapsed: 0.09692 MiB: 100.00000 Copy: 1031.754 MiB/s
67 | ```
68 | ## 3.flw bandwidth test
69 | 使用`flw`测试内存带宽的小脚本
70 |
71 | ```shell
72 | $ make
73 | $ adb push test ./.
74 | $ adb shell "./test"
75 | time_use is 2686.789ms
76 | ```
77 | flw: 4000MB/(2678.298 − 1171.154)ms = 2.592GB/s
78 |
79 | ## 4.vlw bandwidth test
80 | 使用`vlw`测试内存带宽的小脚本
81 |
82 | ```shell
83 | $ make
84 | $ adb push test ./.
85 | $ adb shell "./test"
86 | ```
87 | vlw(m1): 4000MB/(10391.898 − 1273.346)ms = 0.428GB/s
88 |
89 | vlw(m2): 4000MB/(9922.699 − 641.811)ms = 0.421GB/s
90 |
91 | vlw(m4): 4000MB/(4002.607 − 327.388)ms = 1.063GB/s
92 |
93 | vlw(m8): 4000MB/(3829.181 − 166.246)ms = 1.066GB/s
94 |
95 | 综上,哪吒D1的内存带宽测试结果为:
96 |
97 | ![memory bandwidth test](imgs/memory_bandwidth_test.png)
98 |
99 | ---
100 |
101 | ## 5.saxpy
102 | > SAXPY(Scalar Alpha X Plus Y)是一个在 Basic Linear Algebra Subprograms(BLAS)数据包中的函数,并且是一个并行向量处理机(vector processor)中常用的计算操作指令。
103 |
104 | y=αx+y,其中α是标量,x和y矢量。
105 | ```shell
106 | $ make
107 | $ adb push test ./.
108 | $ adb shell "./test"
109 | 3.100000 5.200000 7.300000 9.400000 11.500000 13.600000 15.700000 17.799999 19.900000 21.950001 23.969999
110 | ```
111 | `vsetvli a4, a0, e32, m8 `
112 |
113 | 这个例子中,又见`vsetvli`指令,`vsetvli`使用`m8`参数设置了每条指令处理8个连续的向量寄存器,a0是长度n。
114 |
115 | n = 11 --> a0 = 11
116 |
117 | a = 2.0 --> fa0 = 2.0
118 |
119 | > `vsetvli a4, a0, e32, m8`
120 |
121 | a4 = min(11,8) = 8
122 |
123 | > `vlw.v v0, (a1) `
124 |
125 | v0-v7 = x0-x7 next: v0-v7 = x8-...
126 |
127 | > `sub a0, a0, a4`
128 |
129 | a0 = a0 - a4 = 11 - 8 = 3
130 |
131 | > `slli a4, a4, 2 `
132 |
133 | a4 = a4 << 2 = 8*4 = 32 # float占4个Byte
134 |
135 | > `add a1, a1, a4 `
136 |
137 | a1本指向x0,现在指向x8
138 |
139 | > `vlw.v v8, (a2) `
140 |
141 | y0-y7 load到 v8-v15
142 |
143 | > `vfmacc.vf v8, fa0, v0 `
144 |
145 | (v8-v15) = fa0 * (v0-v7) + (v8-v15)
146 |
147 | > `vsw.v v8, (a2) `
148 |
149 | store到`y0`
150 |
151 | > `add a2, a2, a4 `
152 |
153 | `a2`本指向`y0`,现在指向`y8`
154 |
155 | ---
156 |
157 | OK,我认为进行到这里,一些关于`RISC-V V扩展`的基础知识已经具备,关于板子的内存性能也已经得到,可以开始写 `sgemm` 算子了!
--------------------------------------------------------------------------------
/prepare/imgs/memory_bandwidth_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhao-Dongyu/sgemm_riscv/b18a85b7a1eb9a5faa810755e1a6d632f136d98a/prepare/imgs/memory_bandwidth_test.png
--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains all function prototypes.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #ifndef BLISLAB_DGEMM_H
48 | #define BLISLAB_DGEMM_H
49 |
50 | // Allow C++ users to include this header file in their source code. However,
51 | // we make the extern "C" conditional on whether we're using a C++ compiler,
52 | // since regular C compilers don't understand the extern "C" construct.
53 | #ifdef __cplusplus
54 | extern "C" {
55 | #endif
56 |
57 |
58 | #include
59 |
60 | #include
61 | #include
62 | #include
63 |
64 | // Determine the target operating system
65 |
66 | #if defined(__linux__)
67 | #define BL_OS_LINUX 1
68 | #else
69 | #error "unsupport OS, this only support Linux"
70 | #endif
71 |
72 | // gettimeofday() needs this.
73 | #include
74 | #include
75 | // Alignment value that bl_malloc_aligned() passes to posix_memalign().
76 | #define GEMM_SIMD_ALIGN_SIZE 32
77 | // Smaller of two values; being a macro, it may evaluate an argument more than once.
78 | #define min( i, j ) ( (i)<(j) ? (i): (j) )
79 | // Disabled accessors: element (i, j) stored at [ j*ld + i ].
80 | // #define A( i, j ) A[ (j)*lda + (i) ]
81 | // #define B( i, j ) B[ (j)*ldb + (i) ]
82 | // #define C( i, j ) C[ (j)*ldc + (i) ]
83 | // #define C_ref( i, j ) C_ref[ (j)*ldc_ref + (i) ]
84 | // Active accessors: element (i, j) stored at [ i*ld + j ]; lda/ldb/ldc/ldc_ref must be in scope where they are used.
85 | #define A( i, j ) A[ (i)*lda + (j) ]
86 | #define B( i, j ) B[ (i)*ldb + (j) ]
87 | #define C( i, j ) C[ (i)*ldc + (j) ]
88 | #define C_ref( i, j ) C_ref[ (i)*ldc_ref + (j) ]
89 | // GEMM routine under test; only declared here. NOTE(review): presumably C += A * B like bl_sgemm_ref -- confirm in my_sgemm.c.
90 | void bl_sgemm(
91 | int m,
92 | int n,
93 | int k,
94 | float *A,
95 | int lda,
96 | float *B,
97 | int ldb,
98 | float *C,
99 | int ldc
100 | );
101 | // Variant that additionally receives the tile sizes mr/nr and the packA/packB buffers; only declared here.
102 | void bl_sgemm_pack(
103 | int m,
104 | int mr,
105 | int n,
106 | int nr,
107 | int k,
108 | float *A,
109 | float *packA,
110 | int lda,
111 | float *B,
112 | float *packB,
113 | int ldb,
114 | float *C,
115 | int ldc
116 | );
117 | // Returns a buffer of size*m*n bytes aligned to GEMM_SIMD_ALIGN_SIZE (defined in bl_sgemm_util.c).
118 | float *bl_malloc_aligned(
119 | int m,
120 | int n,
121 | int size
122 | );
123 | // NOTE(review): bl_sgemm_util.c defines bl_sgemm_printmatrix, not bl_printmatrix -- no definition of this name is visible.
124 | void bl_printmatrix(
125 | float *A,
126 | int lda,
127 | int m,
128 | int n
129 | );
130 | // Timers: bl_clock() forwards to bl_clock_helper(), which reports seconds from a reference captured on its first call.
131 | float bl_clock( void );
132 | float bl_clock_helper();
133 | // Reference implementation (bl_sgemm_ref.c): XC += XA * XB with [ i*ld + j ] indexing.
134 | void bl_sgemm_ref(
135 | int m,
136 | int n,
137 | int k,
138 | float *XA,
139 | int lda,
140 | float *XB,
141 | int ldb,
142 | float *XC,
143 | int ldc
144 | );
145 |
146 | // End extern "C" construct block.
147 | #ifdef __cplusplus
148 | }
149 | #endif
150 |
151 | #endif
152 |
153 |
--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm_ref.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm_ref.c
33 | *
34 | *
35 | * Purpose:
36 | * reference (naive triple-loop) SGEMM implementation in C, used to check the optimized kernels.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #include
47 | // Reference SGEMM on row-major data: XC (m x n) += XA (m x k) * XB (k x n).
48 | void bl_sgemm_ref(
49 |         int    m,     // rows of XA and XC
50 |         int    n,     // columns of XB and XC
51 |         int    k,     // columns of XA, rows of XB
52 |         float *XA,
53 |         int    lda,   // element stride between consecutive rows of XA
54 |         float *XB,
55 |         int    ldb,   // element stride between consecutive rows of XB
56 |         float *XC,    // accumulated into, not overwritten
57 |         int    ldc    // element stride between consecutive rows of XC
58 |         )
59 | {
60 |     // Loop indices.
61 |     int i, j, p;
62 |
63 |     // An empty dimension means there is nothing to accumulate,
64 |     // so XC is left untouched.
65 |     if ( m == 0 || n == 0 || k == 0 ) return;
66 |
67 |     // i-p-j order: the innermost loop walks rows of XB and XC contiguously.
68 |     for ( i = 0; i < m; i ++ ) {
69 |         for ( p = 0; p < k; p ++ ) {
70 |             for ( j = 0; j < n; j ++ ) {
71 |                 XC[ i * ldc + j ] += XA[ i * lda + p ] * XB[ p * ldb + j ];
72 |             }
73 |         }
74 |     }
75 | }
76 |
77 |
--------------------------------------------------------------------------------
/sgemm/common/bl_sgemm_util.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm_util.c
33 | *
34 | *
35 | * Purpose:
36 | * Utility routines (Mem allocation, Print, etc.) that will come in handy later.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #include "bl_sgemm.h"
47 |
48 | /*
49 |  * Allocate size*m*n bytes aligned to GEMM_SIMD_ALIGN_SIZE and return them as float*.
50 |  * Prints a message and exits the process if posix_memalign() fails; release with free().
51 |  */
52 | float *bl_malloc_aligned(
53 |         int    m,
54 |         int    n,
55 |         int    size
56 |         )
57 | {
58 |     float *ptr;
59 |     int    err;
60 |     // Widen each factor first so the byte count is not computed (and overflowed) in int.
61 |     err = posix_memalign( (void**)&ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, (size_t)size * (size_t)m * (size_t)n );
62 |
63 |     if ( err ) {
64 |         printf( "bl_malloc_aligned(): posix_memalign() failures" );
65 |         exit( 1 );
66 |     }
67 |
68 |     return ptr;
69 | }
70 |
71 |
72 |
73 | /*
74 |  * Print the m x n matrix A (leading dimension lda) to stdout, one matrix row per
75 |  * output line, tab-separated. Indexing is row-major, matching the A()/B()/C() macros.
76 |  */
77 | void bl_sgemm_printmatrix(
78 |         float *A,
79 |         int    lda,
80 |         int    m,
81 |         int    n
82 |         )
83 | {
84 |     int i, j;
85 |     for ( i = 0; i < m; i ++ ) {
86 |         for ( j = 0; j < n; j ++ ) {
87 |             printf("%lf\t", A[i * lda + j]);
88 |         }
89 |         printf("\n");
90 |     }
91 | }
92 |
93 | /*
94 |  * The timer functions are copied directly from BLIS 0.2.0
95 |  * (intermediate arithmetic widened to double; see bl_clock_helper).
96 |  */
97 | static double gtod_ref_time_sec = 0.0;   // tv_sec captured by the first bl_clock_helper() call
98 |
99 | float bl_clock( void )
100 | {
101 |     return bl_clock_helper();   // thin public wrapper around the timer below
102 | }
103 |
104 | // --- Begin Linux build definitions -------------------------------------------
105 | // Seconds on CLOCK_MONOTONIC, measured from the reference second stored above.
106 | float bl_clock_helper()
107 | {
108 |     double the_time, norm_sec;
109 |     struct timespec ts;
110 |
111 |     clock_gettime( CLOCK_MONOTONIC, &ts );
112 |     // The first call fixes the reference; the value 0.0 doubles as the "unset" marker.
113 |     if ( gtod_ref_time_sec == 0.0 )
114 |         gtod_ref_time_sec = ( double ) ts.tv_sec;
115 |     // double holds tv_sec exactly, whereas float drops whole seconds above 2^24.
116 |     norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec;
117 |
118 |     the_time = norm_sec + ts.tv_nsec * 1.0e-9;
119 |     // Narrowed only here because the declared return type is float.
120 |     return ( float ) the_time;
121 | }
122 |
123 | // --- End Linux build definitions ---------------------------------------------
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * test_bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * test driver for BLISLAB sgemm routine and reference sgemm routine.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include "bl_sgemm.h"
48 |
49 | #define ERROR_TEST
50 |
51 | #define TOLERANCE 1E-2
52 | void computeError(
53 |         int    ldc,
54 |         int    ldc_ref,
55 |         int    m,
56 |         int    n,
57 |         float *C,
58 |         float *C_ref
59 |         )
60 | {
61 |     // Report the first entry of each row where C and C_ref differ by more than TOLERANCE.
62 |     for ( int row = 0; row < m; row ++ ) {
63 |         for ( int col = 0; col < n; col ++ ) {
64 |             float got = C[ row * ldc + col ], want = C_ref[ row * ldc_ref + col ];
65 |             if ( !( fabs( got - want ) > TOLERANCE ) ) continue;
66 |             printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, got, want );
67 |             break;   // at most one report per row
68 |         }
69 |     }
70 |
71 | }
72 | // Benchmark bl_sgemm against bl_sgemm_ref for one (m, n, k) size and print the GFLOPS of both.
73 | void test_bl_sgemm(
74 |         int m,
75 |         int n,
76 |         int k
77 |         )
78 | {
79 |     int    i, j, p;
80 |     float *A, *B, *C, *C_ref;
81 |     float  flops;
82 |     float  ref_beg, ref_time, bl_sgemm_beg, bl_sgemm_time;
83 |     int    nrepeats;
84 |     int    lda, ldb, ldc, ldc_ref;
85 |     float  ref_rectime = 0.0f, bl_sgemm_rectime = 0.0f;
86 |
87 |     A = (float*)malloc( sizeof(float) * m * k );
88 |     B = (float*)malloc( sizeof(float) * k * n );
89 |     // The accessor macros index [ i*ld + j ], so each leading dimension is a column count.
90 |     lda = k;
91 |     ldb = n;
92 |     ldc = n;
93 |     ldc_ref = n;
94 |     C = bl_malloc_aligned( m, ldc + 4, sizeof(float) );
95 |     C_ref = (float*)malloc( sizeof(float) * m * n );
96 |     if ( !A || !B || !C_ref ) { printf( "test_bl_sgemm(): malloc() failure\n" ); exit( 1 ); }
97 |     nrepeats = 3;
98 |
99 |     srand48 (time(NULL));
100 |
101 |     // Randomly generate entries in [ 0, 1 ).
102 |     for ( p = 0; p < k; p ++ ) {
103 |         for ( i = 0; i < m; i ++ ) {
104 |             A( i, p ) = (float)( drand48() );
105 |         }
106 |     }
107 |     for ( j = 0; j < n; j ++ ) {
108 |         for ( p = 0; p < k; p ++ ) {
109 |             B( p, j ) = (float)( drand48() );
110 |         }
111 |     }
112 |     // Both result matrices start from zero.
113 |     for ( j = 0; j < n; j ++ ) {
114 |         for ( i = 0; i < m; i ++ ) {
115 |             C_ref( i, j ) = (float)( 0.0 );
116 |             C( i, j ) = (float)( 0.0 );
117 |         }
118 |     }
119 |     // Time the routine under test and keep the fastest of nrepeats runs.
120 |     for ( i = 0; i < nrepeats; i ++ ) {
121 |         bl_sgemm_beg = bl_clock();
122 |         {
123 |             bl_sgemm(
124 |                     m,
125 |                     n,
126 |                     k,
127 |                     A,
128 |                     lda,
129 |                     B,
130 |                     ldb,
131 |                     C,
132 |                     ldc
133 |                     );
134 |         }
135 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
136 |
137 |         if ( i == 0 ) {
138 |             bl_sgemm_rectime = bl_sgemm_time;
139 |         } else {
140 |             bl_sgemm_rectime = bl_sgemm_time < bl_sgemm_rectime ? bl_sgemm_time : bl_sgemm_rectime;
141 |         }
142 |     }
143 |
144 | #ifdef ERROR_TEST
145 |     for ( i = 0; i < nrepeats; i ++ ) {
146 |         ref_beg = bl_clock();
147 |         {
148 |             bl_sgemm_ref(
149 |                     m,
150 |                     n,
151 |                     k,
152 |                     A,
153 |                     lda,
154 |                     B,
155 |                     ldb,
156 |                     C_ref,
157 |                     ldc_ref
158 |                     );
159 |         }
160 |         ref_time = bl_clock() - ref_beg;
161 |
162 |         if ( i == 0 ) {
163 |             ref_rectime = ref_time;
164 |         } else {
165 |             ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime;
166 |         }
167 |     }
168 |
169 |     computeError(
170 |             ldc,
171 |             ldc_ref,
172 |             m,
173 |             n,
174 |             C,
175 |             C_ref
176 |             );
177 | #endif
178 |
179 |     // Work in GFLOP: 2*m*n*k floating point operations, scaled by 1e9.
180 |     flops = ( (double)m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );
181 |
182 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n",
183 |             m, n, k, flops / bl_sgemm_rectime, flops / ref_rectime );
184 |
185 |     free( A );
186 |     free( B );
187 |     free( C );
188 |     free( C_ref );
189 | }
190 |
191 | int main( int argc, char *argv[] )
192 | {
193 |     // Emit the column header, then one result row per square size m = n = k.
194 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
195 |     for ( int dim = 16; dim <= 800; dim += 4 )
196 |         test_bl_sgemm( dim, dim, dim );
197 |
198 |     return 0;
199 | }
200 |
201 |
--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm_packB_4x16.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * test_bl_sgemm_packB_4x16.c
33 | *
34 | *
35 | * Purpose:
36 | * test driver for BLISLAB sgemm routine and reference sgemm routine.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include "bl_sgemm.h"
48 |
49 | #define ERROR_TEST
50 |
51 | #define TOLERANCE 1E-2
52 | void computeError(
53 |         int    ldc,
54 |         int    ldc_ref,
55 |         int    m,
56 |         int    n,
57 |         float *C,
58 |         float *C_ref
59 |         )
60 | {
61 |     // Report the first entry of each row where C and C_ref differ by more than TOLERANCE.
62 |     for ( int row = 0; row < m; row ++ ) {
63 |         for ( int col = 0; col < n; col ++ ) {
64 |             float got = C[ row * ldc + col ], want = C_ref[ row * ldc_ref + col ];
65 |             if ( !( fabs( got - want ) > TOLERANCE ) ) continue;
66 |             printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, got, want );
67 |             break;   // at most one report per row
68 |         }
69 |     }
70 |
71 | }
72 | // Pack src into panels of nr columns: dst is ordered by panel, then by k, then by column in the panel.
73 | void PackWeightLayout(float* dst, const float* src, int nc, int kc, int nr, bool transpose) {
74 |     // src is indexed as [k * nc + n], or as [n * kc + k] when transpose is true.
75 |     const int x_size = transpose ? kc : nc;
76 |     int index = 0;
77 |     for (int nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
78 |         // The last panel may hold fewer than nr real columns.
79 |         const int nr_block_size = (nc - nr_block_start) < nr ? (nc - nr_block_start) : nr;
80 |         for (int kr_block_start = 0; kr_block_start < kc; kr_block_start++) {
81 |             for (int nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
82 |                 if (nr_block_offset >= nr_block_size) {
83 |                     // Zero the padding lanes instead of leaving them uninitialized.
84 |                     dst[index++] = 0.0f;
85 |                     continue;
86 |                 }
87 |                 int x_idx = transpose ? kr_block_start : (nr_block_start + nr_block_offset);
88 |                 int y_idx = transpose ? (nr_block_start + nr_block_offset) : kr_block_start;
89 |                 dst[index++] = src[y_idx * x_size + x_idx];
90 |             }
91 |         }
92 |     }
93 | }
93 | // Benchmark bl_sgemm_pack (given a pre-packed B) against bl_sgemm_ref for one (m, n, k) size.
94 | void test_bl_sgemm(
95 |         int m,
96 |         int n,
97 |         int k
98 |         )
99 | {
100 |     int    i, j, p;
101 |     float *A, *B, *C, *C_ref, *packA, *packB;
102 |     float  flops;
103 |     float  ref_beg, ref_time, bl_sgemm_beg, bl_sgemm_time;
104 |     int    nrepeats;
105 |     int    lda, ldb, ldc, ldc_ref;
106 |     float  ref_rectime = 0.0f, bl_sgemm_rectime = 0.0f;
107 |
108 |     int mr = 4;
109 |     int nr = 16;
110 |
111 |     A = (float*)malloc( sizeof(float) * m * k *2);   // NOTE(review): 2x over-allocation, reason not visible here
112 |     B = (float*)malloc( sizeof(float) * k * n );
113 |     // Allocate packing buffers
114 |     packA = bl_malloc_aligned( m + mr, k, sizeof(float) );
115 |     packB = bl_malloc_aligned( k*2, n + nr, sizeof(float) );
116 |
117 |     // The accessor macros index [ i*ld + j ], so each leading dimension is a column count.
118 |     lda = k;
119 |     ldb = n;
120 |     ldc = n;
121 |     ldc_ref = n;
122 |     C = bl_malloc_aligned( m + mr, ldc + nr, sizeof(float) );   // m rows (plus slack), not ldc rows
123 |     C_ref = (float*)malloc( sizeof(float) * m * n );
124 |     if ( !A || !B || !C_ref ) { printf( "test_bl_sgemm(): malloc() failure\n" ); exit( 1 ); }
125 |     nrepeats = 1;
126 |
127 |     srand48 (time(NULL));
128 |
129 |     // Randomly generate entries in [ 0, 1 ).
130 |     for ( p = 0; p < k; p ++ ) {
131 |         for ( i = 0; i < m; i ++ ) {
132 |             A( i, p ) = (float)( drand48() );
133 |             // A( i, p ) = (float)( i*m + p );
134 |         }
135 |     }
136 |     for ( j = 0; j < n; j ++ ) {
137 |         for ( p = 0; p < k; p ++ ) {
138 |             B( p, j ) = (float)( drand48() );
139 |             // B( p, j ) = (float)( p*n + j );
140 |         }
141 |     }
142 |     // Both result matrices start from zero.
143 |     for ( j = 0; j < n; j ++ ) {
144 |         for ( i = 0; i < m; i ++ ) {
145 |             C_ref( i, j ) = (float)( 0.0 );
146 |             C( i, j ) = (float)( 0.0 );
147 |         }
148 |     }
149 |     // B is packed once here, outside the timed region.
150 |     PackWeightLayout(packB, B, n, k, nr, false);
151 |
152 |     // printf("[B]\n");
153 |     // for(int i = 0; i < k; i++) {
154 |     //     for(int j = 0; j < n; j++) {
155 |     //         printf("%.1f\t", B[i * n + j]);
156 |     //     }
157 |     //     printf("\n");
158 |     // }
159 |
160 |     // printf("[packB]\n");
161 |     // for(int i = 0; i < k; i++) {
162 |     //     for(int j = 0; j < nr; j++) {
163 |     //         printf("%.1f\t", packB[i * nr + j]);
164 |     //     }
165 |     //     printf("\n");
166 |     // }
167 |
168 |     for ( i = 0; i < nrepeats; i ++ ) {
169 |         bl_sgemm_beg = bl_clock();
170 |         {
171 |             bl_sgemm_pack(
172 |                     m,
173 |                     mr,
174 |                     n,
175 |                     nr,
176 |                     k,
177 |                     A,
178 |                     packA,
179 |                     lda,
180 |                     B,
181 |                     packB,
182 |                     ldb,
183 |                     C,
184 |                     ldc
185 |                     );
186 |         }
187 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
188 |
189 |         if ( i == 0 ) {
190 |             bl_sgemm_rectime = bl_sgemm_time;
191 |         } else {
192 |             bl_sgemm_rectime = bl_sgemm_time < bl_sgemm_rectime ? bl_sgemm_time : bl_sgemm_rectime;
193 |         }
194 |     }
195 |
196 | #ifdef ERROR_TEST
197 |     for ( i = 0; i < nrepeats; i ++ ) {
198 |         ref_beg = bl_clock();
199 |         {
200 |             bl_sgemm_ref(
201 |                     m,
202 |                     n,
203 |                     k,
204 |                     A,
205 |                     lda,
206 |                     B,
207 |                     ldb,
208 |                     C_ref,
209 |                     ldc_ref
210 |                     );
211 |         }
212 |         ref_time = bl_clock() - ref_beg;
213 |
214 |         if ( i == 0 ) {
215 |             ref_rectime = ref_time;
216 |         } else {
217 |             ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime;
218 |         }
219 |     }
220 |
221 |     computeError(
222 |             ldc,
223 |             ldc_ref,
224 |             m,
225 |             n,
226 |             C,
227 |             C_ref
228 |             );
229 | #endif
230 |     // printf("ref\n");
231 |     // for(int i = 0; i < m; i++) {
232 |     //     for(int j = 0; j < n; j++) {
233 |     //         printf("%.0f\t", C_ref[i * n + j]);
234 |     //     }
235 |     //     printf("\n");
236 |     // }
237 |     // printf("\n\n");
238 |
239 |     // printf("C\n");
240 |     // for(int i = 0; i < m; i++) {
241 |     //     for(int j = 0; j < n; j++) {
242 |     //         printf("%.0f\t", C[i * n + j]);
243 |     //     }
244 |     //     printf("\n");
245 |     // }
246 |     // printf("\n\n");
247 |
248 |     // Work in GFLOP: 2*m*n*k floating point operations, scaled by 1e9.
249 |     flops = ( (double)m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );
250 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n",
251 |             m, n, k, flops / bl_sgemm_rectime, flops / ref_rectime );
252 |
253 |     free( A );
254 |     free( packA );
255 |     free( B );
256 |     free( packB );
257 |     free( C );
258 |     free( C_ref );
259 | }
260 |
261 | int main( int argc, char *argv[] )
262 | {
263 |     // Emit the column header, then one result row per square size m = n = k.
264 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
265 |     for ( int dim = 16; dim <= 800; dim += 4 )
266 |         test_bl_sgemm( dim, dim, dim );
267 |
268 |     return 0;
269 | }
270 |
271 |
--------------------------------------------------------------------------------
/sgemm/common/test_bl_sgemm_packB_4x4.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * test_bl_sgemm_packB_4x4.c
33 | *
34 | *
35 | * Purpose:
36 | * test driver for BLISLAB sgemm routine and reference sgemm routine.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include "bl_sgemm.h"
48 |
49 | #define ERROR_TEST
50 |
51 | #define TOLERANCE 1E-2
52 | void computeError(
53 |         int    ldc,
54 |         int    ldc_ref,
55 |         int    m,
56 |         int    n,
57 |         float *C,
58 |         float *C_ref
59 |         )
60 | {
61 |     // Report the first entry of each row where C and C_ref differ by more than TOLERANCE.
62 |     for ( int row = 0; row < m; row ++ ) {
63 |         for ( int col = 0; col < n; col ++ ) {
64 |             float got = C[ row * ldc + col ], want = C_ref[ row * ldc_ref + col ];
65 |             if ( !( fabs( got - want ) > TOLERANCE ) ) continue;
66 |             printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", row, col, got, want );
67 |             break;   // at most one report per row
68 |         }
69 |     }
70 |
71 | }
72 | // Pack src into panels of nr columns: dst is ordered by panel, then by k, then by column in the panel.
73 | void PackWeightLayout(float* dst, const float* src, int nc, int kc, int nr, bool transpose) {
74 |     // src is indexed as [k * nc + n], or as [n * kc + k] when transpose is true.
75 |     const int x_size = transpose ? kc : nc;
76 |     int index = 0;
77 |     for (int nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
78 |         // The last panel may hold fewer than nr real columns.
79 |         const int nr_block_size = (nc - nr_block_start) < nr ? (nc - nr_block_start) : nr;
80 |         for (int kr_block_start = 0; kr_block_start < kc; kr_block_start++) {
81 |             for (int nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
82 |                 if (nr_block_offset >= nr_block_size) {
83 |                     // Zero the padding lanes instead of leaving them uninitialized.
84 |                     dst[index++] = 0.0f;
85 |                     continue;
86 |                 }
87 |                 int x_idx = transpose ? kr_block_start : (nr_block_start + nr_block_offset);
88 |                 int y_idx = transpose ? (nr_block_start + nr_block_offset) : kr_block_start;
89 |                 dst[index++] = src[y_idx * x_size + x_idx];
90 |             }
91 |         }
92 |     }
93 | }
93 | // Benchmark bl_sgemm_pack (given a pre-packed B) against bl_sgemm_ref for one (m, n, k) size.
94 | void test_bl_sgemm(
95 |         int m,
96 |         int n,
97 |         int k
98 |         )
99 | {
100 |     int    i, j, p;
101 |     float *A, *B, *C, *C_ref, *packA, *packB;
102 |     float  flops;
103 |     float  ref_beg, ref_time, bl_sgemm_beg, bl_sgemm_time;
104 |     int    nrepeats;
105 |     int    lda, ldb, ldc, ldc_ref;
106 |     float  ref_rectime = 0.0f, bl_sgemm_rectime = 0.0f;
107 |
108 |     int mr = 4;
109 |     int nr = 4;
110 |
111 |     A = (float*)malloc( sizeof(float) * m * k *2);   // NOTE(review): 2x over-allocation, reason not visible here
112 |     B = (float*)malloc( sizeof(float) * k * n );
113 |     // Allocate packing buffers
114 |     packA = bl_malloc_aligned( m + mr, k, sizeof(float) );
115 |     packB = bl_malloc_aligned( k*2, n + nr, sizeof(float) );
116 |
117 |     // The accessor macros index [ i*ld + j ], so each leading dimension is a column count.
118 |     lda = k;
119 |     ldb = n;
120 |     ldc = n;
121 |     ldc_ref = n;
122 |     C = bl_malloc_aligned( m + mr, ldc + nr, sizeof(float) );   // m rows (plus slack), not ldc rows
123 |     C_ref = (float*)malloc( sizeof(float) * m * n );
124 |     if ( !A || !B || !C_ref ) { printf( "test_bl_sgemm(): malloc() failure\n" ); exit( 1 ); }
125 |     nrepeats = 1;
126 |
127 |     srand48 (time(NULL));
128 |
129 |     // Randomly generate entries in [ 0, 1 ).
130 |     for ( p = 0; p < k; p ++ ) {
131 |         for ( i = 0; i < m; i ++ ) {
132 |             A( i, p ) = (float)( drand48() );
133 |             // A( i, p ) = (float)( i*m + p );
134 |         }
135 |     }
136 |     for ( j = 0; j < n; j ++ ) {
137 |         for ( p = 0; p < k; p ++ ) {
138 |             B( p, j ) = (float)( drand48() );
139 |             // B( p, j ) = (float)( p*n + j );
140 |         }
141 |     }
142 |     // Both result matrices start from zero.
143 |     for ( j = 0; j < n; j ++ ) {
144 |         for ( i = 0; i < m; i ++ ) {
145 |             C_ref( i, j ) = (float)( 0.0 );
146 |             C( i, j ) = (float)( 0.0 );
147 |         }
148 |     }
149 |     // B is packed once here, outside the timed region.
150 |     PackWeightLayout(packB, B, n, k, nr, false);
151 |
152 |     // printf("[B]\n");
153 |     // for(int i = 0; i < k; i++) {
154 |     //     for(int j = 0; j < n; j++) {
155 |     //         printf("%.1f\t", B[i * n + j]);
156 |     //     }
157 |     //     printf("\n");
158 |     // }
159 |
160 |     // printf("[packB]\n");
161 |     // for(int i = 0; i < k; i++) {
162 |     //     for(int j = 0; j < nr; j++) {
163 |     //         printf("%.1f\t", packB[i * nr + j]);
164 |     //     }
165 |     //     printf("\n");
166 |     // }
167 |
168 |     for ( i = 0; i < nrepeats; i ++ ) {
169 |         bl_sgemm_beg = bl_clock();
170 |         {
171 |             bl_sgemm_pack(
172 |                     m,
173 |                     mr,
174 |                     n,
175 |                     nr,
176 |                     k,
177 |                     A,
178 |                     packA,
179 |                     lda,
180 |                     B,
181 |                     packB,
182 |                     ldb,
183 |                     C,
184 |                     ldc
185 |                     );
186 |         }
187 |         bl_sgemm_time = bl_clock() - bl_sgemm_beg;
188 |
189 |         if ( i == 0 ) {
190 |             bl_sgemm_rectime = bl_sgemm_time;
191 |         } else {
192 |             bl_sgemm_rectime = bl_sgemm_time < bl_sgemm_rectime ? bl_sgemm_time : bl_sgemm_rectime;
193 |         }
194 |     }
195 |
196 | #ifdef ERROR_TEST
197 |     for ( i = 0; i < nrepeats; i ++ ) {
198 |         ref_beg = bl_clock();
199 |         {
200 |             bl_sgemm_ref(
201 |                     m,
202 |                     n,
203 |                     k,
204 |                     A,
205 |                     lda,
206 |                     B,
207 |                     ldb,
208 |                     C_ref,
209 |                     ldc_ref
210 |                     );
211 |         }
212 |         ref_time = bl_clock() - ref_beg;
213 |
214 |         if ( i == 0 ) {
215 |             ref_rectime = ref_time;
216 |         } else {
217 |             ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime;
218 |         }
219 |     }
220 |
221 |     computeError(
222 |             ldc,
223 |             ldc_ref,
224 |             m,
225 |             n,
226 |             C,
227 |             C_ref
228 |             );
229 | #endif
230 |     // printf("ref\n");
231 |     // for(int i = 0; i < m; i++) {
232 |     //     for(int j = 0; j < n; j++) {
233 |     //         printf("%.0f\t", C_ref[i * n + j]);
234 |     //     }
235 |     //     printf("\n");
236 |     // }
237 |     // printf("\n\n");
238 |
239 |     // printf("C\n");
240 |     // for(int i = 0; i < m; i++) {
241 |     //     for(int j = 0; j < n; j++) {
242 |     //         printf("%.0f\t", C[i * n + j]);
243 |     //     }
244 |     //     printf("\n");
245 |     // }
246 |     // printf("\n\n");
247 |
248 |     // Work in GFLOP: 2*m*n*k floating point operations, scaled by 1e9.
249 |     flops = ( (double)m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );
250 |     printf( "%5d\t %5d\t %5d\t %5.3lf\t %5.3lf\n",
251 |             m, n, k, flops / bl_sgemm_rectime, flops / ref_rectime );
252 |
253 |     free( A );
254 |     free( packA );
255 |     free( B );
256 |     free( packB );
257 |     free( C );
258 |     free( C_ref );
259 | }
260 |
261 | int main( int argc, char *argv[] )
262 | {
263 |     // Emit the column header, then one result row per square size m = n = k.
264 |     printf("%%m\t%%n\t%%k\t%%MY_GFLOPS\t%%REF_GFLOPS\n");
265 |     for ( int dim = 16; dim <= 800; dim += 4 )
266 |         test_bl_sgemm( dim, dim, dim );
267 |
268 |     return 0;
269 | }
270 |
271 |
--------------------------------------------------------------------------------
/sgemm/step0/Makefile:
--------------------------------------------------------------------------------
1 | TARGET=test_bl_sgemm_step0
2 | # Toolchain: CCL (install prefix) and CTOOL (tool prefix) may be set in the environment or on the make command line.
3 | #CROSS_COMPILE ?= riscv64-unknown-linux-gnu-
4 | # CROSS_COMPILE ?=
5 | # Pass CROSS_COMPILE= (empty) on the command line to take the native x86 branch below.
6 | CCL ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
7 | CTOOL ?= riscv64-unknown-linux-gnu-
8 | CROSS_COMPILE := ${CCL}/bin/${CTOOL}
9 |
10 | CC = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 |
13 | COMPILER_OPT_LEVEL=O3
14 | # Flags depend on whether a cross prefix is set: empty -> native AVX2 build, otherwise RISC-V with the 0.7 vector extension.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 | # Headers shared by every step are found through COMMONDIR.
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | # Sources: this step's kernel plus the shared test driver, reference GEMM and utilities.
26 | ASM_SRC=
27 | CC_SRC= my_sgemm.c \
28 | $(COMMONDIR)/test_bl_sgemm.c \
29 | $(COMMONDIR)/bl_sgemm_ref.c \
30 | $(COMMONDIR)/bl_sgemm_util.c
31 |
32 | CPP_SRC=
33 | # Object names append .o to the full source name (e.g. my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Ask the compiler to write a dependency file beside each object (object name plus .d).
38 | MKDEP_OPT = -MMD -MF $@.d
39 |
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 |
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 |
45 | .PHONY: all clean
46 |
47 | all: $(BLISLAB_TEST_EXE)
48 |
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 |
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | $(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 |
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | $(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 |
63 | %.cpp.o: %.cpp
64 | $(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 | # ---------------------------------------------------------------------------
66 | # NOTE(review): SHAREDLIBBLISLAB is not defined in this Makefile, so it expands to nothing in the rule below.
67 | clean:
68 | @rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step0/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include <bl_sgemm.h>
48 |
49 | void bl_sgemm(
50 | int m,
51 | int n,
52 | int k,
53 | float *A,
54 | int lda,
55 | float *B,
56 | int ldb,
57 | float *C, // must be aligned
58 | int ldc // ldc must also be aligned
59 | )
60 | {
61 | int i, j, p;
62 |
63 | // Early return if possible
64 | if ( m == 0 || n == 0 || k == 0 ) {
65 | printf( "bl_sgemm(): early return\n" );
66 | return;
67 | }
68 |
69 | for ( i = 0; i < m; i ++ ) { // Start 2-th loop
70 | for ( j = 0; j < n; j ++ ) { // Start 1-nd loop
71 | for ( p = 0; p < k; p ++ ) { // Start 0-st loop
72 |
73 | C( i, j ) += A( i, p ) * B( p, j ); //Each operand is a MACRO defined in bl_sgemm() function.
74 |
75 | } // End 0-th loop
76 | } // End 1-st loop
77 | } // End 2-nd loop
78 | }
79 |
80 |
81 |
--------------------------------------------------------------------------------
/sgemm/step1/Makefile:
--------------------------------------------------------------------------------
# Builds the step1 SGEMM benchmark executable, $(TARGET).x.
TARGET=test_bl_sgemm_step1

# ---------------------------------------------------------------------------
# Toolchain selection
# ---------------------------------------------------------------------------
# All three variables use ?= so they can be overridden from the environment
# or the make command line without editing this file, for example:
#   make CCL=/opt/xuantie-gcc     # toolchain installed somewhere else
#   make CROSS_COMPILE=           # empty prefix -> native build with host gcc
# When nothing is overridden the values are the same as before.
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

# An empty CROSS_COMPILE selects the native flag set, anything else selects
# the RISC-V flag set.
ifeq ($(CROSS_COMPILE),)
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
LDLIBS = -lpthread -lm
else
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
LDLIBS = -lm
endif

# Shared headers and test/reference/utility sources live in ../common.
COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC=
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Object names are "<source>.o" (e.g. my_sgemm.c.o); dependency files are
# "<object>.d", which matches the -MF $@.d option below.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# Pull in the generated header dependencies, except when only cleaning.
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

# SHAREDLIBBLISLAB is not set in this file, so it expands to nothing here.
clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step1/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include <bl_sgemm.h>
48 |
49 | void bl_sgemm(
50 | int m,
51 | int n,
52 | int k,
53 | float *A,
54 | int lda,
55 | float *B,
56 | int ldb,
57 | float *C, // must be aligned
58 | int ldc // ldc must also be aligned
59 | )
60 | {
61 | int i, j, p;
62 |
63 | // Early return if possible
64 | if ( m == 0 || n == 0 || k == 0 ) {
65 | printf( "bl_sgemm(): early return\n" );
66 | return;
67 | }
68 |
69 | for ( i = 0; i < m; i ++ ) { // Start 2-th loop
70 | for ( p = 0; p < k; p ++ ) { // Start 1-st loop
71 | for ( j = 0; j < n; j ++ ) { // Start 0-nd loop
72 |
73 | C( i, j ) += A( i, p ) * B( p, j ); //Each operand is a MACRO defined in bl_sgemm() function.
74 |
75 | } // End 0-th loop
76 | } // End 1-st loop
77 | } // End 2-nd loop
78 | }
79 |
80 |
81 |
--------------------------------------------------------------------------------
/sgemm/step2/Makefile:
--------------------------------------------------------------------------------
# Builds the step2 SGEMM benchmark executable, $(TARGET).x.
TARGET=test_bl_sgemm_step2

# ---------------------------------------------------------------------------
# Toolchain selection
# ---------------------------------------------------------------------------
# All three variables use ?= so they can be overridden from the environment
# or the make command line without editing this file, for example:
#   make CCL=/opt/xuantie-gcc     # toolchain installed somewhere else
#   make CROSS_COMPILE=           # empty prefix -> native build with host gcc
# When nothing is overridden the values are the same as before.
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

# An empty CROSS_COMPILE selects the native flag set, anything else selects
# the RISC-V flag set.
ifeq ($(CROSS_COMPILE),)
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
LDLIBS = -lpthread -lm
else
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
LDLIBS = -lm
endif

# Shared headers and test/reference/utility sources live in ../common.
COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC=
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Object names are "<source>.o" (e.g. my_sgemm.c.o); dependency files are
# "<object>.d", which matches the -MF $@.d option below.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# Pull in the generated header dependencies, except when only cleaning.
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

# SHAREDLIBBLISLAB is not set in this file, so it expands to nothing here.
clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step2/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

// Tile extents used by this step's my_sgemm.c: bl_sgemm() advances its i loop
// by DGEMM_MR and its j loop by DGEMM_NR, and AddDot_MRxNR() covers a
// DGEMM_MR x DGEMM_NR tile of C.
//
// Both default to 4.  Each can be overridden at build time (for example with
// -DDGEMM_MR=8 in CFLAGS) without editing this header.
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR 4
#endif

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step2/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 |
47 | #include "bl_sgemm.h"
48 | #include "bl_config.h"
49 |
50 | void AddDot( int k, float *A, int lda, float *B, int ldb, float *result ) {
51 | int p;
52 | for ( p = 0; p < k; p++ ) {
53 | *result += A( 0, p ) * B( p, 0 );
54 | }
55 | }
56 |
57 | void AddDot_MRxNR( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
58 | {
59 | int ir, jr;
60 | int p;
61 | for ( jr = 0; jr < DGEMM_NR; jr++ ) {
62 | for ( ir = 0; ir < DGEMM_MR; ir++ ) {
63 | AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
64 | }
65 | }
66 | }
67 |
68 |
69 | void bl_sgemm(
70 | int m,
71 | int n,
72 | int k,
73 | float *A,
74 | int lda,
75 | float *B,
76 | int ldb,
77 | float *C, // must be aligned
78 | int ldc // ldc must also be aligned
79 | )
80 | {
81 | int i, j, p;
82 | int ir, jr;
83 |
84 | // Early return if possible
85 | if ( m == 0 || n == 0 || k == 0 ) {
86 | printf( "bl_sgemm(): early return\n" );
87 | return;
88 | }
89 |
90 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 2-nd loop
91 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 1-st loop
92 | AddDot_MRxNR( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
93 | } // End 1-st loop
94 | } // End 2-nd loop
95 | }
96 |
97 |
98 |
--------------------------------------------------------------------------------
/sgemm/step3/Makefile:
--------------------------------------------------------------------------------
# Builds the step3 SGEMM benchmark executable, $(TARGET).x.
TARGET=test_bl_sgemm_step3

# ---------------------------------------------------------------------------
# Toolchain selection
# ---------------------------------------------------------------------------
# All three variables use ?= so they can be overridden from the environment
# or the make command line without editing this file, for example:
#   make CCL=/opt/xuantie-gcc     # toolchain installed somewhere else
#   make CROSS_COMPILE=           # empty prefix -> native build with host gcc
# When nothing is overridden the values are the same as before.
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

# An empty CROSS_COMPILE selects the native flag set, anything else selects
# the RISC-V flag set.
ifeq ($(CROSS_COMPILE),)
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
LDLIBS = -lpthread -lm
else
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
LDLIBS = -lm
endif

# Shared headers and test/reference/utility sources live in ../common.
COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC=
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Object names are "<source>.o" (e.g. my_sgemm.c.o); dependency files are
# "<object>.d", which matches the -MF $@.d option below.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# Pull in the generated header dependencies, except when only cleaning.
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

# SHAREDLIBBLISLAB is not set in this file, so it expands to nothing here.
clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step3/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

// Tile extents used by this step's my_sgemm.c: bl_sgemm() advances its i loop
// by DGEMM_MR and its j loop by DGEMM_NR.  When both are 4 the hand-unrolled
// AddDot_4x4_opt() kernel is selected at compile time; any other combination
// selects the generic AddDot_MRxNR() kernel.
//
// Both default to 4.  Each can be overridden at build time (for example with
// -DDGEMM_MR=8 in CFLAGS) without editing this header.
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR 4
#endif

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step3/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #include "bl_sgemm.h"
47 | #include "bl_config.h"
48 |
49 | void AddDot( int k, float *A, int lda, float *B, int ldb, float *result ) {
50 | int p;
51 | for ( p = 0; p < k; p++ ) {
52 | *result += A( 0, p ) * B( p, 0 );
53 | }
54 | }
55 |
56 | void AddDot_MRxNR( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
57 | {
58 | int ir, jr;
59 | int p;
60 | for ( jr = 0; jr < DGEMM_NR; jr++ ) {
61 | for ( ir = 0; ir < DGEMM_MR; ir++ ) {
62 | AddDot( k, &A( ir, 0 ), lda, &B( 0, jr ), ldb, &C( ir, jr ) );
63 | }
64 | }
65 | }
66 |
67 | void AddDot_4x4_opt( int k, float *A, int lda, float *B, int ldb, float *C, int ldc )
68 | {
69 | register float C00, C01, C02, C03, C10, C11, C12, C13, C20, C21, C22, C23, C30, C31, C32, C33;
70 | int p;
71 |
72 | C00 = 0.0f;
73 | C01 = 0.0f;
74 | C02 = 0.0f;
75 | C03 = 0.0f;
76 | C10 = 0.0f;
77 | C11 = 0.0f;
78 | C12 = 0.0f;
79 | C13 = 0.0f;
80 | C20 = 0.0f;
81 | C21 = 0.0f;
82 | C22 = 0.0f;
83 | C23 = 0.0f;
84 | C30 = 0.0f;
85 | C31 = 0.0f;
86 | C32 = 0.0f;
87 | C33 = 0.0f;
88 | for (p = 0; p < k; p++) {
89 | C00 += A(0, p) * B(p, 0);
90 | C01 += A(0, p) * B(p, 1);
91 | C02 += A(0, p) * B(p, 2);
92 | C03 += A(0, p) * B(p, 3);
93 | C10 += A(1, p) * B(p, 0);
94 | C11 += A(1, p) * B(p, 1);
95 | C12 += A(1, p) * B(p, 2);
96 | C13 += A(1, p) * B(p, 3);
97 | C20 += A(2, p) * B(p, 0);
98 | C21 += A(2, p) * B(p, 1);
99 | C22 += A(2, p) * B(p, 2);
100 | C23 += A(2, p) * B(p, 3);
101 | C30 += A(3, p) * B(p, 0);
102 | C31 += A(3, p) * B(p, 1);
103 | C32 += A(3, p) * B(p, 2);
104 | C33 += A(3, p) * B(p, 3);
105 | }
106 | C(0, 0) += C00;
107 | C(0, 1) += C01;
108 | C(0, 2) += C02;
109 | C(0, 3) += C03;
110 | C(1, 0) += C10;
111 | C(1, 1) += C11;
112 | C(1, 2) += C12;
113 | C(1, 3) += C13;
114 | C(2, 0) += C20;
115 | C(2, 1) += C21;
116 | C(2, 2) += C22;
117 | C(2, 3) += C23;
118 | C(3, 0) += C30;
119 | C(3, 1) += C31;
120 | C(3, 2) += C32;
121 | C(3, 3) += C33;
122 | }
123 |
124 | void bl_sgemm(
125 | int m,
126 | int n,
127 | int k,
128 | float *A,
129 | int lda,
130 | float *B,
131 | int ldb,
132 | float *C, // must be aligned
133 | int ldc // ldc must also be aligned
134 | )
135 | {
136 | int i, j, p;
137 | int ir, jr;
138 |
139 | // Early return if possible
140 | if ( m == 0 || n == 0 || k == 0 ) {
141 | printf( "bl_sgemm(): early return\n" );
142 | return;
143 | }
144 |
145 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 2-nd loop
146 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 1-st loop
147 | #if !(DGEMM_MR == 4 && DGEMM_NR == 4)
148 | AddDot_MRxNR( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
149 | #else
150 | AddDot_4x4_opt( k, &A( i, 0 ), lda, &B( 0, j ), ldb, &C( i, j ), ldc );
151 | #endif
152 | } // End 1-st loop
153 | } // End 2-nd loop
154 | }
155 |
156 |
157 |
--------------------------------------------------------------------------------
/sgemm/step4/Makefile:
--------------------------------------------------------------------------------
# Builds the step4 SGEMM benchmark executable, $(TARGET).x.
TARGET=test_bl_sgemm_step4

# ---------------------------------------------------------------------------
# Toolchain selection
# ---------------------------------------------------------------------------
# All three variables use ?= so they can be overridden from the environment
# or the make command line without editing this file, for example:
#   make CCL=/opt/xuantie-gcc     # toolchain installed somewhere else
#   make CROSS_COMPILE=           # empty prefix -> native build with host gcc
# When nothing is overridden the values are the same as before.
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

# An empty CROSS_COMPILE selects the native flag set, anything else selects
# the RISC-V flag set.
ifeq ($(CROSS_COMPILE),)
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
LDLIBS = -lpthread -lm
else
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
LDLIBS = -lm
endif

# Shared headers and test/reference/utility sources live in ../common.
# This step links the packB 4x4 variant of the test driver.
COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC=
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm_packB_4x4.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Object names are "<source>.o" (e.g. my_sgemm.c.o); dependency files are
# "<object>.d", which matches the -MF $@.d option below.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# Pull in the generated header dependencies, except when only cleaning.
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

# SHAREDLIBBLISLAB is not set in this file, so it expands to nothing here.
clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step4/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

// Tile extent configuration for this step; both values default to 4.
// Each can be overridden at build time (for example with -DDGEMM_MR=8 in
// CFLAGS) without editing this header.
// NOTE(review): confirm that this step's my_sgemm.c and its packB test driver
// support sizes other than 4x4 before overriding.
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR 4
#endif

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step4/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #include "bl_sgemm.h"
47 | #include "bl_config.h"
48 |
49 | void AddDot_4x4_opt( int k, float *A, int lda, float *packB, int ldb, float *C, int ldc )
50 | {
51 | register float C00, C01, C02, C03, C10, C11, C12, C13, C20, C21, C22, C23, C30, C31, C32, C33;
52 | float *packBp;
53 | int p;
54 |
55 | C00 = 0.0f;
56 | C01 = 0.0f;
57 | C02 = 0.0f;
58 | C03 = 0.0f;
59 | C10 = 0.0f;
60 | C11 = 0.0f;
61 | C12 = 0.0f;
62 | C13 = 0.0f;
63 | C20 = 0.0f;
64 | C21 = 0.0f;
65 | C22 = 0.0f;
66 | C23 = 0.0f;
67 | C30 = 0.0f;
68 | C31 = 0.0f;
69 | C32 = 0.0f;
70 | C33 = 0.0f;
71 | for (p = 0; p < k; p++) {
72 | packBp = &packB[p * 4];
73 |
74 | C00 += A(0, p+0) * packBp[0];
75 | C01 += A(0, p+0) * packBp[1];
76 | C02 += A(0, p+0) * packBp[2];
77 | C03 += A(0, p+0) * packBp[3];
78 | C10 += A(1, p+0) * packBp[0];
79 | C11 += A(1, p+0) * packBp[1];
80 | C12 += A(1, p+0) * packBp[2];
81 | C13 += A(1, p+0) * packBp[3];
82 | C20 += A(2, p+0) * packBp[0];
83 | C21 += A(2, p+0) * packBp[1];
84 | C22 += A(2, p+0) * packBp[2];
85 | C23 += A(2, p+0) * packBp[3];
86 | C30 += A(3, p+0) * packBp[0];
87 | C31 += A(3, p+0) * packBp[1];
88 | C32 += A(3, p+0) * packBp[2];
89 | C33 += A(3, p+0) * packBp[3];
90 |
91 | }
92 | C(0, 0) += C00;
93 | C(0, 1) += C01;
94 | C(0, 2) += C02;
95 | C(0, 3) += C03;
96 | C(1, 0) += C10;
97 | C(1, 1) += C11;
98 | C(1, 2) += C12;
99 | C(1, 3) += C13;
100 | C(2, 0) += C20;
101 | C(2, 1) += C21;
102 | C(2, 2) += C22;
103 | C(2, 3) += C23;
104 | C(3, 0) += C30;
105 | C(3, 1) += C31;
106 | C(3, 2) += C32;
107 | C(3, 3) += C33;
108 | }
109 |
110 |
111 | void bl_sgemm_pack(
112 | int m,
113 | int mr,
114 | int n,
115 | int nr,
116 | int k,
117 | float *A,
118 | float *packA,
119 | int lda,
120 | float *B,
121 | float *packB,
122 | int ldb,
123 | float *C, // must be aligned
124 | int ldc // ldc must also be aligned
125 | )
126 | {
127 | int i, j, p;
128 | int ir, jr;
129 |
130 | // Early return if possible
131 | if ( m == 0 || n == 0 || k == 0 ) {
132 | printf( "bl_sgemm(): early return\n" );
133 | return;
134 | }
135 |
136 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 2-nd loop
137 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 1-st loop
138 | AddDot_4x4_opt( k, &A( i, 0 ), lda, &packB[j * k], ldb, &C( i, j ), ldc );
139 | } // End 1-st loop
140 | } // End 2-nd loop
141 | }
142 |
143 |
144 |
--------------------------------------------------------------------------------
/sgemm/step5/Makefile:
--------------------------------------------------------------------------------
TARGET=test_bl_sgemm_step5

# ---------------------------------------------------------------------------
# Toolchain selection
#
# CCL           root directory of the cross toolchain
# CTOOL         tool prefix inside $(CCL)/bin
# CROSS_COMPILE full prefix put in front of gcc / g++
#
# All three use ?= so the defaults below stay what they were, but they can be
# overridden from the environment or the command line, e.g.
#     make CCL=/opt/xuantie
#     make CROSS_COMPILE=          (native build, takes the first branch below)
# ---------------------------------------------------------------------------
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

ifeq ($(CROSS_COMPILE),)
    # Native build.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
    LDLIBS = -lpthread -lm
else
    # Cross build for the RISC-V target.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
    LDLIBS = -lm
endif

COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC=
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm_packB_4x4.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Objects are named <source>.o (foo.c -> foo.c.o) and their dependency files
# <object>.d, which is what the pattern rules and MKDEP_OPT below produce.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# include dependency files of application
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
69 |
70 |
71 |
--------------------------------------------------------------------------------
/sgemm/step5/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

/*
 * The header only defines preprocessor constants, but it is still wrapped
 * in extern "C" when compiled as C++ so that it can be included from C++
 * translation units exactly like the other BLISLAB headers.
 */
#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Micro-kernel tile size: my_sgemm.c advances its row loop by DGEMM_MR and
 * its column loop by DGEMM_NR.  The DGEMM_ prefix is kept as-is even though
 * the kernels in this directory operate on float.
 */
#define DGEMM_MR 4
#define DGEMM_NR 4

#if defined(__cplusplus)
}
#endif

#endif /* BLISLAB_CONFIG_H */
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step5/my_sgemm.c:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_sgemm.c
33 | *
34 | *
35 | * Purpose:
36 | * this is the main file of blislab sgemm.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #include "bl_sgemm.h"
47 | #include "bl_config.h"
48 |
/*
 * Repack a row-major matrix into panels of mr rows.
 *
 *   dst  destination buffer; receives (m / mr) * mr * k floats
 *   src  source matrix, m rows of k contiguous floats each
 *   m    number of rows in src
 *   k    number of columns in src
 *   mr   panel height
 *
 * Panel j covers source rows j*mr .. j*mr + mr - 1.  Inside a panel the data
 * is stored column by column: for every column i the mr values of that
 * column are written next to each other, so element (row p, column i) of
 * panel j lands at dst[j*mr*k + i*mr + p].
 *
 * Rows beyond the last full panel (m % mr of them) are not copied.
 * A non-positive mr is treated as "nothing to pack".
 *
 * The function is static inline: a plain file-scope `inline` definition
 * provides no external definition in C99 and later, so any call the
 * compiler chose not to inline (for instance at -O0) would fail to link.
 */
static inline void PackInputLayout(float* dst, const float* src, int m, int k, int mr) {
    if ( mr <= 0 ) {
        return;     /* avoids the division by zero below */
    }

    const int panels = m / mr;

    for ( int panel = 0; panel < panels; ++panel ) {
        const int first_row = panel * mr;
        for ( int col = 0; col < k; ++col ) {
            for ( int r = 0; r < mr; ++r ) {
                *dst++ = src[(first_row + r) * k + col];
            }
        }
    }
}
60 |
61 |
62 | void AddDot_4x4_opt( int k, float *packA, int lda, float *packB, int ldb, float *C, int ldc )
63 | {
64 | register float C00, C01, C02, C03, C10, C11, C12, C13, C20, C21, C22, C23, C30, C31, C32, C33;
65 | float *packAp, *packBp;
66 | int p;
67 |
68 | C00 = 0.0f;
69 | C01 = 0.0f;
70 | C02 = 0.0f;
71 | C03 = 0.0f;
72 | C10 = 0.0f;
73 | C11 = 0.0f;
74 | C12 = 0.0f;
75 | C13 = 0.0f;
76 | C20 = 0.0f;
77 | C21 = 0.0f;
78 | C22 = 0.0f;
79 | C23 = 0.0f;
80 | C30 = 0.0f;
81 | C31 = 0.0f;
82 | C32 = 0.0f;
83 | C33 = 0.0f;
84 | for (p = 0; p < k; p++) {
85 | packAp = &packA[p * 4];
86 | packBp = &packB[p * 4];
87 |
88 | C00 += packAp[0] * packBp[0];
89 | C01 += packAp[0] * packBp[1];
90 | C02 += packAp[0] * packBp[2];
91 | C03 += packAp[0] * packBp[3];
92 | C10 += packAp[1] * packBp[0];
93 | C11 += packAp[1] * packBp[1];
94 | C12 += packAp[1] * packBp[2];
95 | C13 += packAp[1] * packBp[3];
96 | C20 += packAp[2] * packBp[0];
97 | C21 += packAp[2] * packBp[1];
98 | C22 += packAp[2] * packBp[2];
99 | C23 += packAp[2] * packBp[3];
100 | C30 += packAp[3] * packBp[0];
101 | C31 += packAp[3] * packBp[1];
102 | C32 += packAp[3] * packBp[2];
103 | C33 += packAp[3] * packBp[3];
104 |
105 | }
106 | C(0, 0) += C00;
107 | C(0, 1) += C01;
108 | C(0, 2) += C02;
109 | C(0, 3) += C03;
110 | C(1, 0) += C10;
111 | C(1, 1) += C11;
112 | C(1, 2) += C12;
113 | C(1, 3) += C13;
114 | C(2, 0) += C20;
115 | C(2, 1) += C21;
116 | C(2, 2) += C22;
117 | C(2, 3) += C23;
118 | C(3, 0) += C30;
119 | C(3, 1) += C31;
120 | C(3, 2) += C32;
121 | C(3, 3) += C33;
122 | }
123 |
124 |
125 | void bl_sgemm_pack(
126 | int m,
127 | int mr,
128 | int n,
129 | int nr,
130 | int k,
131 | float *A,
132 | float *packA,
133 | int lda,
134 | float *B,
135 | float *packB,
136 | int ldb,
137 | float *C, // must be aligned
138 | int ldc // ldc must also be aligned
139 | )
140 | {
141 | int i, j, p;
142 | int ir, jr;
143 |
144 | // Early return if possible
145 | if ( m == 0 || n == 0 || k == 0 ) {
146 | printf( "bl_sgemm(): early return\n" );
147 | return;
148 | }
149 |
150 | PackInputLayout(packA, A, m, k, mr);
151 |
152 | // printf("A\n");
153 | // for(int i = 0; i < m; i++) {
154 | // for(int j = 0; j < m; j++) {
155 | // printf("%.0f\t", A[i * m + j]);
156 | // }
157 | // printf("\n");
158 | // }
159 | // printf("\n\n");
160 |
161 | // printf("packA\n");
162 | // for(int i = 0; i < m; i++) {
163 | // for(int j = 0; j < m; j++) {
164 | // printf("%.0f\t", packA[i * m + j]);
165 | // }
166 | // printf("\n");
167 | // }
168 | // printf("\n\n");
169 |
170 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 2-nd loop
171 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 1-st loop
172 | // AddDot_4x4_opt( k, &A( i, 0 ), lda, &packB[j * k], ldb, &C( i, j ), ldc );
173 | AddDot_4x4_opt( k, &packA[i * k], lda, &packB[j * k], ldb, &C( i, j ), ldc );
174 | } // End 1-st loop
175 | } // End 2-nd loop
176 | }
177 |
178 |
179 |
--------------------------------------------------------------------------------
/sgemm/step6/Makefile:
--------------------------------------------------------------------------------
TARGET=test_bl_sgemm_step6

# ---------------------------------------------------------------------------
# Toolchain selection
#
# CCL           root directory of the cross toolchain
# CTOOL         tool prefix inside $(CCL)/bin
# CROSS_COMPILE full prefix put in front of gcc / g++
#
# All three use ?= so the defaults below stay what they were, but they can be
# overridden from the environment or the command line, e.g.
#     make CCL=/opt/xuantie
#     make CROSS_COMPILE=          (native build, takes the first branch below)
# ---------------------------------------------------------------------------
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

ifeq ($(CROSS_COMPILE),)
    # Native build.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
    LDLIBS = -lpthread -lm
else
    # Cross build for the RISC-V target.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
    LDLIBS = -lm
endif

COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC= RvvSgemm4x16.S
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Objects are named <source>.o (foo.c -> foo.c.o) and their dependency files
# <object>.d, which is what the pattern rules and MKDEP_OPT below produce.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# include dependency files of application
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.S.o: %.S
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/sgemm/step6/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// RvvSgemm4x16 -- 4 x 16 single-precision GEMM micro-kernel
//
// For every row r < mr the kernel computes
//
//     C[r][0..nr) = bias[0..nr) + sum over p < k of A[r][p] * B[p][0..nr)
//
// and stores the result over the C tile (C is written, not accumulated into).
//
// Register plan.  The vector unit is configured for 32-bit elements with
// LMUL = 4, so every vector name below is the base of a 4-register group.
//
//   A  (mr x k, rows k floats apart)    one scalar per row and k step
//        row 0 : ft0 / ft4      the two scalar sets alternate from one
//        row 1 : ft1 / ft5      k step to the next so the next step can be
//        row 2 : ft2 / ft6      loaded while the current one is consumed
//        row 3 : ft3 / ft7
//
//   B  (k x 16, 64 bytes per k step)    one vector group per k step
//        v0 (v0-v3), v4 (v4-v7), v8 (v8-v11), v12 (v12-v15), in rotation
//
//   C  (mr x nr accumulators)           one vector group per row
//        row 0 : v16 (v16-v19)
//        row 1 : v20 (v20-v23)
//        row 2 : v24 (v24-v27)
//        row 3 : v28 (v28-v31)
//
// void RvvSgemm4x16(size_t nr,         // nr <= 16,                   a0
//                   size_t mr,         // mr <= 4,                    a1
//                   size_t k,          // astride = k*sizeof(float),  a2
//                   const float* a,    // mr * k,                     a3
//                   const float* b,    // k * 16,                     a4
//                   float* c,          // mr * nr,                    a5
//                   size_t c_stride,   // Len(N) * sizeof(float),     a6
//                   const float* bias  // bias,                       a7
//                  );
//
// When mr < 4 the pointers of the missing rows are left equal to the last
// valid row, so those lanes recompute and re-store that row instead of
// touching memory outside the tile.
// ---------------------------------------------------------------------------
    .global     RvvSgemm4x16
    .type       RvvSgemm4x16, @function

// Arguments
#define nr          a0
#define mr          a1
#define k           a2
#define ap          a3
#define bp          a4
#define cp          a5
#define c_stride    a6
#define bias        a7

// Working registers.  a_stride aliases s0, which is only used as a frame
// pointer while the callee-saved registers are being spilled.
#define ap1         t0
#define ap2         t1
#define ap3         t2
#define cp1         t3
#define cp2         t4
#define cp3         t5
#define a_stride    s0
#define bp0         s1
#define biasp       s2
#define kt          s3
#define ap0         s4
#define cp0         s5
#define bp1         s6
#define vl          s7
#define ap_offset   s10
#define cp_offset   s11

// NOTE(review): 104 is not a multiple of 16, while the RISC-V psABI asks for
// a 16-byte aligned sp.  This routine calls nothing, so it has no visible
// effect here -- confirm before adding calls or changing the frame layout.
#define FRAMESIZE   104

RvvSgemm4x16:
    // ---- prologue: spill s0..s11 -----------------------------------------
    addi    sp, sp, -FRAMESIZE          # callee update stack pointer
    sd      s0, 96(sp)                  # callee saved frame pointer
    addi    s0, sp, FRAMESIZE           # generate new frame pointer
    sd      s1, -16(s0)
    sd      s2, -24(s0)
    sd      s3, -32(s0)
    sd      s4, -40(s0)
    sd      s5, -48(s0)
    sd      s6, -56(s0)
    sd      s7, -64(s0)
    sd      s8, -72(s0)
    sd      s9, -80(s0)
    sd      s10, -88(s0)
    sd      s11, -96(s0)

    // ---- setup -----------------------------------------------------------
    li      ap_offset, 0
    li      cp_offset, 0
    slli    a_stride, k, 2              # astride = k * sizeof(float)
    mv      s3, nr
    vsetvli s2, s3, e32, m4             # request vl = nr, 32-bit elements, LMUL 4
    mv      ap0, ap
    mv      bp0, bp
    mv      cp0, cp

    // Row pointers 1..3: advance by one row only when that row exists.
.a1_offset:
    mv      ap1, ap0
    mv      cp1, cp0
    slti    t6, mr, 2                   # mr < 2
    bnez    t6, .a2_offset
    add     ap1, ap0, a_stride
    add     cp1, cp0, c_stride
.a2_offset:
    mv      ap2, ap1
    mv      cp2, cp1
    slti    t6, mr, 3                   # mr < 3
    bnez    t6, .a3_offset
    add     ap2, ap1, a_stride
    add     cp2, cp1, c_stride
.a3_offset:
    mv      ap3, ap2
    mv      cp3, cp2
    slti    t6, mr, 4                   # mr < 4
    bnez    t6, .start
    add     ap3, ap2, a_stride
    add     cp3, cp2, c_stride

.start:
    mv      biasp, bias
    mv      kt, k
    beqz    mr, .end

    // Every accumulator row starts from the same bias vector.
    vlw.v   v16, (biasp)
    vlw.v   v20, (biasp)
    vlw.v   v24, (biasp)
    vlw.v   v28, (biasp)
    addi    biasp, biasp, 64
    slti    t6, kt, 4                   # kt < 4, t6 = 1
    bnez    t6, .k2_tail


    // flw   fs0, 64(ap0)               # pre-load A
    // flw   fs1, 64(ap1)               # pre-load A
    // flw   fs2, 64(ap2)               # pre-load A
    // flw   fs3, 64(ap3)               # pre-load A

    // flw   fs4, 512(bp0)              # pre-load B
    // flw   fs5, 576(bp0)              # pre-load B
    // flw   fs6, 640(bp0)              # pre-load B
    // flw   fs7, 704(bp0)              # pre-load B

    // Software pipeline start-up: load k step 0.
    // load 4 row A (A0, A1, A2, A3)
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    flw     ft1, (ap1)
    addi    ap1, ap1, 4
    flw     ft2, (ap2)
    addi    ap2, ap2, 4
    flw     ft3, (ap3)
    addi    ap3, ap3, 4
    // load 16 col B(B0, B1, B2, B3)
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64

    // kt now counts the steps left AFTER the group of four that is in flight.
    addi    kt, kt, -4                  # Decrement k counter
    slti    t6, kt, 4                   # kt < 4
    bnez    t6, .k4_tail                # jump to k4_tail

    // ---- main loop: four k steps per trip, the next trip's first step is
    //      loaded by the fourth group -------------------------------------
.k4_main:
    addi    kt, kt, -4                  # Decrement k counter
    // first group of 16 fma, second group load
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)                   # next B row -> v4
    // flw   fs4, 384(bp0)              # pre-load B
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)                  # next A row 0 -> ft4
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap1)                  # next A row 1 -> ft5
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap2)                  # next A row 2 -> ft6
    addi    ap2, ap2, 4
    flw     ft7, (ap3)                  # next A row 3 -> ft7
    addi    ap3, ap3, 4
    // second group of 16 fma, third group load
    slti    t6, kt, 4                   # kt < 4, t6 = 1 (tested at the loop end)
    vfmacc.vf v16, ft4, v4
    vlw.v   v8, (bp0)                   # next B row -> v8
    // flw   fs5, 384(bp0)              # pre-load B
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v4
    flw     ft0, (ap0)                  # next A row 0 -> ft0
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v4
    flw     ft1, (ap1)                  # next A row 1 -> ft1
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft7, v4
    flw     ft2, (ap2)                  # next A row 2 -> ft2
    addi    ap2, ap2, 4
    flw     ft3, (ap3)                  # next A row 3 -> ft3
    addi    ap3, ap3, 4
    // third group of 16 fma, fourth group load
    vfmacc.vf v16, ft0, v8
    vlw.v   v12, (bp0)                  # next B row -> v12
    // flw   fs6, 384(bp0)              # pre-load B
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v8
    flw     ft4, (ap0)                  # next A row 0 -> ft4
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v8
    flw     ft5, (ap1)                  # next A row 1 -> ft5
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft3, v8
    flw     ft6, (ap2)                  # next A row 2 -> ft6
    addi    ap2, ap2, 4
    flw     ft7, (ap3)                  # next A row 3 -> ft7
    addi    ap3, ap3, 4
    // fourth group of 16 fma, first group load
    vfmacc.vf v16, ft4, v12
    vlw.v   v0, (bp0)                   # next B row -> v0
    // flw   fs7, 384(bp0)              # pre-load B
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v12
    flw     ft0, (ap0)                  # next A row 0 -> ft0
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v12
    flw     ft1, (ap1)                  # next A row 1 -> ft1
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft7, v12
    flw     ft2, (ap2)                  # next A row 2 -> ft2
    addi    ap2, ap2, 4
    flw     ft3, (ap3)                  # next A row 3 -> ft3
    addi    ap3, ap3, 4

    // flw   fs0, 64(ap0)               # pre-load A
    // flw   fs1, 64(ap1)               # pre-load A
    // flw   fs2, 64(ap2)               # pre-load A
    // flw   fs3, 64(ap3)               # pre-load A

    beqz    t6, .k4_main

    // ---- drain: the last full group of four, nothing loaded past it ------
.k4_tail:
    // first group of 16 fma, second group load
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)                   # next B row -> v4
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)                  # next A row 0 -> ft4
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap1)                  # next A row 1 -> ft5
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap2)                  # next A row 2 -> ft6
    addi    ap2, ap2, 4
    flw     ft7, (ap3)                  # next A row 3 -> ft7
    addi    ap3, ap3, 4
    // second group of 16 fma, third group load
    vfmacc.vf v16, ft4, v4
    vlw.v   v8, (bp0)                   # next B row -> v8
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v4
    flw     ft0, (ap0)                  # next A row 0 -> ft0
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v4
    flw     ft1, (ap1)                  # next A row 1 -> ft1
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft7, v4
    flw     ft2, (ap2)                  # next A row 2 -> ft2
    addi    ap2, ap2, 4
    flw     ft3, (ap3)                  # next A row 3 -> ft3
    addi    ap3, ap3, 4
    // third group of 16 fma, fourth group load
    vfmacc.vf v16, ft0, v8
    vlw.v   v12, (bp0)                  # next B row -> v12
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v8
    flw     ft4, (ap0)                  # next A row 0 -> ft4
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v8
    flw     ft5, (ap1)                  # next A row 1 -> ft5
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft3, v8
    flw     ft6, (ap2)                  # next A row 2 -> ft6
    addi    ap2, ap2, 4
    flw     ft7, (ap3)                  # next A row 3 -> ft7
    addi    ap3, ap3, 4
    // fourth group of 16 fma, no group load
    vfmacc.vf v16, ft4, v12
    vfmacc.vf v20, ft5, v12
    vfmacc.vf v24, ft6, v12
    vfmacc.vf v28, ft7, v12

    // ---- remainder: two k steps ------------------------------------------
.k2_tail:
    slti    t6, kt, 2                   # kt < 2
    bnez    t6, .k1_tail
    // Account for the two steps handled here.  Without this, a remainder of
    // exactly two would still satisfy kt >= 1 below and .k1_tail would
    // process one step too many, reading past the end of A and B.
    addi    kt, kt, -2
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64
    flw     ft1, (ap1)
    addi    ap1, ap1, 4
    flw     ft2, (ap2)
    addi    ap2, ap2, 4
    flw     ft3, (ap3)
    addi    ap3, ap3, 4
    // first group of 16 fma, second group load
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)                   # next B row -> v4
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)                  # next A row 0 -> ft4
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap1)                  # next A row 1 -> ft5
    addi    ap1, ap1, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap2)                  # next A row 2 -> ft6
    addi    ap2, ap2, 4
    flw     ft7, (ap3)                  # next A row 3 -> ft7
    addi    ap3, ap3, 4
    // second group of 16 fma, no group load
    vfmacc.vf v16, ft4, v4
    vfmacc.vf v20, ft5, v4
    vfmacc.vf v24, ft6, v4
    vfmacc.vf v28, ft7, v4

    // ---- remainder: one k step -------------------------------------------
.k1_tail:
    slti    t6, kt, 1                   # kt < 1
    bnez    t6, .store_tile
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64
    flw     ft1, (ap1)
    addi    ap1, ap1, 4
    flw     ft2, (ap2)
    addi    ap2, ap2, 4
    flw     ft3, (ap3)
    addi    ap3, ap3, 4
    vfmacc.vf v16, ft0, v0
    vfmacc.vf v20, ft1, v0
    vfmacc.vf v24, ft2, v0
    vfmacc.vf v28, ft3, v0

    // ---- write the four accumulator rows to C ----------------------------
.store_tile:
    add     cp0, cp0, cp_offset
    vsw.v   v16, (cp0)
    addi    cp0, cp0, 64

    vsw.v   v20, (cp1)
    addi    cp1, cp1, 64

    vsw.v   v24, (cp2)
    addi    cp2, cp2, 64

    vsw.v   v28, (cp3)
    addi    cp3, cp3, 64
    j       .end

    // ---- epilogue: restore s0..s11 ---------------------------------------
.end:
    ld      s0, 96(sp)
    ld      s1, 88(sp)
    ld      s2, 80(sp)
    ld      s3, 72(sp)
    ld      s4, 64(sp)
    ld      s5, 56(sp)
    ld      s6, 48(sp)
    ld      s7, 40(sp)
    ld      s8, 32(sp)
    ld      s9, 24(sp)
    ld      s10, 16(sp)
    ld      s11, 8(sp)
    addi    sp, sp, FRAMESIZE
    ret
357 |
--------------------------------------------------------------------------------
/sgemm/step6/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

/*
 * The header only defines preprocessor constants, but it is still wrapped
 * in extern "C" when compiled as C++ so that it can be included from C++
 * translation units exactly like the other BLISLAB headers.
 */
#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Micro-kernel tile size: my_sgemm.c advances its row loop by DGEMM_MR and
 * its column loop by DGEMM_NR, matching the 4 x 16 tile of RvvSgemm4x16.
 * The DGEMM_ prefix is kept as-is even though the kernel operates on float.
 */
#define DGEMM_MR 4
#define DGEMM_NR 16

#if defined(__cplusplus)
}
#endif

#endif /* BLISLAB_CONFIG_H */
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step6/my_sgemm.c:
--------------------------------------------------------------------------------
1 | #include "bl_sgemm.h"
2 | #include "bl_config.h"
3 |
4 | extern void RvvSgemm4x16( size_t nr, // nr <= 16
5 | size_t mr, // mr <= 4
6 | size_t k, // astride = k*sizeof(float)
7 | const float* a, // mr * k
8 | const float* b, // k * 16
9 | float* c, // mr * nr
10 | size_t cn_stride, // Len(N) * sizeof(float)
11 | const float* bias // bias
12 | );
13 |
14 |
15 | void bl_sgemm_pack(
16 | int m,
17 | int mr,
18 | int n,
19 | int nr,
20 | int k,
21 | float *A,
22 | float *packA,
23 | int lda,
24 | float *B,
25 | float *packB,
26 | int ldb,
27 | float *C, // must be aligned
28 | int ldc // ldc must also be aligned
29 | )
30 | {
31 | int i, j, p;
32 | int ir, jr;
33 |
34 | // Early return if possible
35 | if ( m == 0 || n == 0 || k == 0 ) {
36 | printf( "bl_sgemm(): early return\n" );
37 | return;
38 | }
39 |
40 | float bias[800];
41 | for(int i = 0; i < 800; i++) {bias[i] = 0;}
42 |
43 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 2-nd loop
44 | int mb = DGEMM_MR;
45 | if((m - i) < DGEMM_MR) mb = m - i;
46 |
47 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 1-st loop
48 | int nb = DGEMM_NR;
49 | if((n - j) < DGEMM_NR) nb = n - j;
50 |
51 | RvvSgemm4x16( nb, // nr <= 16, a0
52 | mb, // mr <= 4, a1
53 | k, // astride = k*sizeof(float), a2
54 | &A[i * k], // mr * k, a3
55 | &packB[j * k], // k * 16, a4
56 | &C( i, j ), // mr * nr, a5
57 | n * sizeof(float), // Len(N) * sizeof(float), a6
58 | bias
59 | );
60 | } // End 1-st loop
61 | } // End 2-nd loop
62 | }
63 |
64 |
65 |
--------------------------------------------------------------------------------
/sgemm/step6/run.sh:
--------------------------------------------------------------------------------
# Build the step6 test program, copy it to the device reachable through adb
# and run it there.  Each stage runs only if the previous one succeeded.
exe=test_bl_sgemm_step6.x

make &&
    adb push "$exe" ./. &&
    adb shell "./$exe"
--------------------------------------------------------------------------------
/sgemm/step7/Makefile:
--------------------------------------------------------------------------------
TARGET=test_bl_sgemm_step7

# ---------------------------------------------------------------------------
# Toolchain selection
#
# CCL           root directory of the cross toolchain
# CTOOL         tool prefix inside $(CCL)/bin
# CROSS_COMPILE full prefix put in front of gcc / g++
#
# All three use ?= so the defaults below stay what they were, but they can be
# overridden from the environment or the command line, e.g.
#     make CCL=/opt/xuantie
#     make CROSS_COMPILE=          (native build, takes the first branch below)
# ---------------------------------------------------------------------------
CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

ifeq ($(CROSS_COMPILE),)
    # Native build.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
    LDLIBS = -lpthread -lm
else
    # Cross build for the RISC-V target.
    CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
    LDLIBS = -lm
endif

COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC= RvvSgemm4x16.S
CC_SRC= my_sgemm.c \
        $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
        $(COMMONDIR)/bl_sgemm_ref.c \
        $(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Objects are named <source>.o (foo.c -> foo.c.o) and their dependency files
# <object>.d, which is what the pattern rules and MKDEP_OPT below produce.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# include dependency files of application
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object files compiling rules
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)

%.S.o: %.S
	$(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
# ---------------------------------------------------------------------------

clean:
	@rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/sgemm/step7/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
1 | # |<-- k=4 -->|
2 | # +++++++++++++ - f0, f4, f0, f4
3 | # + + | f1, f5, f1, f5
4 | # + A + mr=4 f2, f6, f2, f6
5 | # + + | f3, f7, f3, f7
6 | # +++++++++++++ -
7 |
8 |
9 | # |<-- nr=16 -->|
10 | # +++++++++++++++++ - v0 v1 v2 v3
11 | # + + | v4 v5 v6 v7
12 | # + B + k=4 v8,v9,v10,v11
13 | # + + | v12,v13,v14,v15
14 | # +++++++++++++++++ -
15 |
16 |
17 | # |<-- nr -->|
18 | # +++++++++++++++++ -
19 | # + + | v16,v17,v18,v19
20 | # + C + mr=4 v20,v21,v22,v23
21 | # + + | v24,v25,v26,v27
22 | # +++++++++++++++++ - v28,v29,v30,v31
23 | #
24 |
25 | # void RvvSgemm4x16(size_t nr, // nr <= 16, a0
26 | # size_t mr, // mr <= 4, a1
27 | # size_t k, // astride = k*sizeof(float), a2
28 | # const float* a, // mr * k, a3
29 | # const float* b, // k * 16, a4
30 | # float* c, // mr * nr, a5
31 | # size_t c_stride, // Len(N) * sizeof(float), a6
32 | # const float* bias // bias, a7
33 | # );
34 | .global RvvSgemm4x16
35 | .type RvvSgemm4x16, @function
36 |
37 | #define nr a0
38 | #define mr a1
39 | #define k a2
40 | #define ap a3
41 | #define bp a4
42 | #define cp a5
43 | #define c_stride a6
44 | #define bias a7
45 |
46 | #define ap1 t0
47 | #define ap2 t1
48 | #define ap3 t2
49 | #define cp1 t3
50 | #define cp2 t4
51 | #define cp3 t5
52 | #define a_stride s0
53 | #define bp0 s1
54 | #define biasp s2
55 | #define kt s3
56 | #define ap0 s4
57 | #define cp0 s5
58 | #define bp1 s6
59 | #define vl s7
60 | #define ap_offset s10
61 | #define cp_offset s11
62 |
63 | #define FRAMESIZE 112
64 | // FRAMESIZE: 96 bytes for s0-s11 plus padding so that sp stays 16-byte aligned (RISC-V psABI).
65 | RvvSgemm4x16:
66 | addi sp, sp, -FRAMESIZE # reserve the register save area
67 | sd s0, 96(sp) # save callee-saved s0..s11 at the sp offsets the epilogue reloads
68 | // s0 is not kept as a frame pointer: it is reused as a_stride below.
69 | sd s1, 88(sp)
70 | sd s2, 80(sp)
71 | sd s3, 72(sp)
72 | sd s4, 64(sp)
73 | sd s5, 56(sp)
74 | sd s6, 48(sp)
75 | sd s7, 40(sp)
76 | sd s8, 32(sp)
77 | sd s9, 24(sp)
78 | sd s10, 16(sp)
79 | sd s11, 8(sp)
80 |
81 | li ap_offset, 0
82 | li cp_offset, 0
83 | slli a_stride, k, 2 # astride = k * sizeof(float)
84 | mv s3, nr
85 | vsetvli s2, s3, e32, m4 # request nr 32-bit elements per 4-register group
86 | mv ap0, ap
87 | mv bp0, bp
88 | mv cp0, cp
89 | .a1_offset:
90 | mv ap1, ap0 # default: row 1 aliases row 0 (used when mr < 2)
91 | mv cp1, cp0
92 | slti t6, mr, 2 # mr < 2
93 | bnez t6, .a2_offset
94 | add ap1, ap0, a_stride # row 1 of A
95 | add cp1, cp0, c_stride # row 1 of C
96 | .a2_offset:
97 | mv ap2, ap1 # default: row 2 aliases row 1 (used when mr < 3)
98 | mv cp2, cp1
99 | slti t6, mr, 3 # mr < 3
100 | bnez t6, .a3_offset
101 | add ap2, ap1, a_stride # row 2 of A
102 | add cp2, cp1, c_stride # row 2 of C
103 | .a3_offset:
104 | mv ap3, ap2 # default: row 3 aliases row 2 (used when mr < 4)
105 | mv cp3, cp2
106 | slti t6, mr, 4 # mr < 4
107 | bnez t6, .start
108 | add ap3, ap2, a_stride # row 3 of A
109 | add cp3, cp2, c_stride # row 3 of C
110 |
111 | .start:
112 | mv biasp, bias
113 | mv kt, k # kt = number of k-steps still to do
114 | beqz mr, .end # nothing to compute for an empty tile
115 |
116 | vlw.v v16, (biasp) # all four accumulator rows start from the same bias values
117 | vlw.v v20, (biasp)
118 | vlw.v v24, (biasp)
119 | vlw.v v28, (biasp)
120 | addi biasp, biasp, 64
121 | slti t6, kt, 4 # kt < 4, t6 = 1
122 | bnez t6, .k2_tail # fewer than 4 k-steps in total: only the remainder code runs
123 |
124 |
125 | // flw fs0, 64(ap0) # pre-load A
126 | // flw fs1, 64(ap1) # pre-load A
127 | // flw fs2, 64(ap2) # pre-load A
128 | // flw fs3, 64(ap3) # pre-load A
129 |
130 | // flw fs4, 512(bp0) # pre-load B
131 | // flw fs5, 576(bp0) # pre-load B
132 | // flw fs6, 640(bp0) # pre-load B
133 | // flw fs7, 704(bp0) # pre-load B
134 |
135 | // load one element from each of the 4 rows of A (first k-step)
136 | flw ft0, (ap0)
137 | addi ap0, ap0, 4
138 | flw ft1, (ap1)
139 | addi ap1, ap1, 4
140 | flw ft2, (ap2)
141 | addi ap2, ap2, 4
142 | flw ft3, (ap3)
143 | addi ap3, ap3, 4
144 | // load one row of the B panel (first k-step); panel rows are 64 bytes apart
145 | vlw.v v0, (bp0)
146 | addi bp0,bp0,64
147 |
148 | addi kt, kt, -4 # Decrement k counter
149 | slti t6, kt, 4 # kt < 4
150 | bnez t6, .k4_tail # jump to k4_tail
151 |
152 | .k4_main:
153 | addi kt, kt, -4 # Decrement k counter
154 | // k-step 1 of 4: accumulate with ft0-ft3 / v0 while loading step 2 operands
155 | vfmacc.vf v16, ft0, v0
156 | vlw.v v4, (bp0) # next B row -> v4
157 | // flw fs4, 384(bp0) # pre-load B
158 | addi bp0,bp0,64
159 | vfmacc.vf v20, ft1, v0
160 | flw ft4, (ap0) # next A element, row 0 -> ft4
161 | addi ap0, ap0, 4
162 | vfmacc.vf v24, ft2, v0
163 | flw ft5, (ap1) # next A element, row 1 -> ft5
164 | addi ap1, ap1, 4
165 | vfmacc.vf v28, ft3, v0
166 | flw ft6, (ap2) # next A element, row 2 -> ft6
167 | addi ap2, ap2, 4
168 | flw ft7, (ap3) # next A element, row 3 -> ft7
169 | addi ap3, ap3, 4
170 | // k-step 2 of 4: accumulate with ft4-ft7 / v4 while loading step 3 operands
171 | slti t6, kt, 4 # kt < 4, t6 = 1 (loop exit condition, tested at the bottom)
172 | vfmacc.vf v16, ft4, v4
173 | vlw.v v8, (bp0) # next B row -> v8
174 | // flw fs5, 384(bp0) # pre-load B
175 | addi bp0,bp0,64
176 | vfmacc.vf v20, ft5, v4
177 | flw ft0, (ap0) # next A element, row 0 -> ft0
178 | addi ap0, ap0, 4
179 | vfmacc.vf v24, ft6, v4
180 | flw ft1, (ap1) # next A element, row 1 -> ft1
181 | addi ap1, ap1, 4
182 | vfmacc.vf v28, ft7, v4
183 | flw ft2, (ap2) # next A element, row 2 -> ft2
184 | addi ap2, ap2, 4
185 | flw ft3, (ap3) # next A element, row 3 -> ft3
186 | addi ap3, ap3, 4
187 | // k-step 3 of 4: accumulate with ft0-ft3 / v8 while loading step 4 operands
188 | vfmacc.vf v16, ft0, v8
189 | vlw.v v12, (bp0) # next B row -> v12
190 | // flw fs6, 384(bp0) # pre-load B
191 | addi bp0,bp0,64
192 | vfmacc.vf v20, ft1, v8
193 | flw ft4, (ap0) # next A element, row 0 -> ft4
194 | addi ap0, ap0, 4
195 | vfmacc.vf v24, ft2, v8
196 | flw ft5, (ap1) # next A element, row 1 -> ft5
197 | addi ap1, ap1, 4
198 | vfmacc.vf v28, ft3, v8
199 | flw ft6, (ap2) # next A element, row 2 -> ft6
200 | addi ap2, ap2, 4
201 | flw ft7, (ap3) # next A element, row 3 -> ft7
202 | addi ap3, ap3, 4
203 | // k-step 4 of 4: accumulate with ft4-ft7 / v12 while loading the next iteration's step 1 operands
204 | vfmacc.vf v16, ft4, v12
205 | vlw.v v0, (bp0) # next B row -> v0
206 | // flw fs7, 384(bp0) # pre-load B
207 | addi bp0,bp0,64
208 | vfmacc.vf v20, ft5, v12
209 | flw ft0, (ap0) # next A element, row 0 -> ft0
210 | addi ap0, ap0, 4
211 | vfmacc.vf v24, ft6, v12
212 | flw ft1, (ap1) # next A element, row 1 -> ft1
213 | addi ap1, ap1, 4
214 | vfmacc.vf v28, ft7, v12
215 | flw ft2, (ap2) # next A element, row 2 -> ft2
216 | addi ap2, ap2, 4
217 | flw ft3, (ap3) # next A element, row 3 -> ft3
218 | addi ap3, ap3, 4
219 |
220 | // flw fs0, 64(ap0) # pre-load A
221 | // flw fs1, 64(ap1) # pre-load A
222 | // flw fs2, 64(ap2) # pre-load A
223 | // flw fs3, 64(ap3) # pre-load A
224 |
225 | beqz t6, .k4_main # at least 4 more k-steps remain: run another full iteration
226 | .k4_tail:
227 | // last full group, k-step 1 of 4: operands were loaded before entry; load step 2 operands
228 | vfmacc.vf v16, ft0, v0
229 | vlw.v v4, (bp0) # next B row -> v4
230 | addi bp0,bp0,64
231 | vfmacc.vf v20, ft1, v0
232 | flw ft4, (ap0) # next A element, row 0 -> ft4
233 | addi ap0, ap0, 4
234 | vfmacc.vf v24, ft2, v0
235 | flw ft5, (ap1) # next A element, row 1 -> ft5
236 | addi ap1, ap1, 4
237 | vfmacc.vf v28, ft3, v0
238 | flw ft6, (ap2) # next A element, row 2 -> ft6
239 | addi ap2, ap2, 4
240 | flw ft7, (ap3) # next A element, row 3 -> ft7
241 | addi ap3, ap3, 4
242 | // k-step 2 of 4: accumulate with ft4-ft7 / v4 while loading step 3 operands
243 | vfmacc.vf v16, ft4, v4
244 | vlw.v v8, (bp0) # next B row -> v8
245 | addi bp0,bp0,64
246 | vfmacc.vf v20, ft5, v4
247 | flw ft0, (ap0) # next A element, row 0 -> ft0
248 | addi ap0, ap0, 4
249 | vfmacc.vf v24, ft6, v4
250 | flw ft1, (ap1) # next A element, row 1 -> ft1
251 | addi ap1, ap1, 4
252 | vfmacc.vf v28, ft7, v4
253 | flw ft2, (ap2) # next A element, row 2 -> ft2
254 | addi ap2, ap2, 4
255 | flw ft3, (ap3) # next A element, row 3 -> ft3
256 | addi ap3, ap3, 4
257 | // k-step 3 of 4: accumulate with ft0-ft3 / v8 while loading step 4 operands
258 | vfmacc.vf v16, ft0, v8
259 | vlw.v v12, (bp0) # next B row -> v12
260 | addi bp0,bp0,64
261 | vfmacc.vf v20, ft1, v8
262 | flw ft4, (ap0) # next A element, row 0 -> ft4
263 | addi ap0, ap0, 4
264 | vfmacc.vf v24, ft2, v8
265 | flw ft5, (ap1) # next A element, row 1 -> ft5
266 | addi ap1, ap1, 4
267 | vfmacc.vf v28, ft3, v8
268 | flw ft6, (ap2) # next A element, row 2 -> ft6
269 | addi ap2, ap2, 4
270 | flw ft7, (ap3) # next A element, row 3 -> ft7
271 | addi ap3, ap3, 4
272 | // k-step 4 of 4: accumulate only, nothing further is loaded ahead
273 | vfmacc.vf v16, ft4, v12
274 | vfmacc.vf v20, ft5, v12
275 | vfmacc.vf v24, ft6, v12
276 | vfmacc.vf v28, ft7, v12
277 | .k2_tail:
278 | andi t6, kt, 2 # kt is the remainder 0..3 here; bit 1 set means two k-steps are left
279 | beqz t6, .k1_tail # bit test (kt is never decremented here) so k1_tail cannot add a third step
280 | flw ft0, (ap0)
281 | addi ap0, ap0, 4
282 | vlw.v v0, (bp0)
283 | addi bp0,bp0,64
284 | flw ft1, (ap1)
285 | addi ap1, ap1, 4
286 | flw ft2, (ap2)
287 | addi ap2, ap2, 4
288 | flw ft3, (ap3)
289 | addi ap3, ap3, 4
290 | // remainder k-step 1 of 2: accumulate while loading the second step's operands
291 | vfmacc.vf v16, ft0, v0
292 | vlw.v v4, (bp0) # next B row -> v4
293 | addi bp0,bp0,64
294 | vfmacc.vf v20, ft1, v0
295 | flw ft4, (ap0) # next A element, row 0 -> ft4
296 | addi ap0, ap0, 4
297 | vfmacc.vf v24, ft2, v0
298 | flw ft5, (ap1) # next A element, row 1 -> ft5
299 | addi ap1, ap1, 4
300 | vfmacc.vf v28, ft3, v0
301 | flw ft6, (ap2) # next A element, row 2 -> ft6
302 | addi ap2, ap2, 4
303 | flw ft7, (ap3) # next A element, row 3 -> ft7
304 | addi ap3, ap3, 4
305 | // remainder k-step 2 of 2: accumulate only
306 | vfmacc.vf v16, ft4, v4
307 | vfmacc.vf v20, ft5, v4
308 | vfmacc.vf v24, ft6, v4
309 | vfmacc.vf v28, ft7, v4
310 | .k1_tail:
311 | andi t6, kt, 1 # bit 0 of the remainder set means one k-step is left
312 | beqz t6, .store_tile
313 | flw ft0, (ap0)
314 | addi ap0, ap0, 4
315 | vlw.v v0, (bp0)
316 | addi bp0,bp0,64
317 | flw ft1, (ap1)
318 | addi ap1, ap1, 4
319 | flw ft2, (ap2)
320 | addi ap2, ap2, 4
321 | flw ft3, (ap3)
322 | addi ap3, ap3, 4
323 | vfmacc.vf v16, ft0, v0
324 | vfmacc.vf v20, ft1, v0
325 | vfmacc.vf v24, ft2, v0
326 | vfmacc.vf v28, ft3, v0
327 | .store_tile:
328 | add cp0, cp0, cp_offset # cp_offset is always 0 in this kernel
329 | vsw.v v16, (cp0) # write accumulator row 0 to C
330 | addi cp0, cp0, 64
331 |
332 | vsw.v v20, (cp1) # row 1 (same address as row 0 when mr < 2)
333 | addi cp1, cp1, 64
334 |
335 | vsw.v v24, (cp2) # row 2 (aliases the previous row when mr < 3)
336 | addi cp2, cp2, 64
337 |
338 | vsw.v v28, (cp3) # row 3 (aliases the previous row when mr < 4)
339 | addi cp3, cp3, 64
340 | j .end
341 |
342 | .end:
343 | ld s0, 96(sp) # restore callee-saved registers
344 | ld s1, 88(sp)
345 | ld s2, 80(sp)
346 | ld s3, 72(sp)
347 | ld s4, 64(sp)
348 | ld s5, 56(sp)
349 | ld s6, 48(sp)
350 | ld s7, 40(sp)
351 | ld s8, 32(sp)
352 | ld s9, 24(sp)
353 | ld s10, 16(sp)
354 | ld s11, 8(sp)
355 | addi sp, sp, FRAMESIZE
356 | ret
357 |
--------------------------------------------------------------------------------
/sgemm/step7/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 |
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | // Tile shape: bl_sgemm_pack in my_sgemm.c steps over C in DGEMM_MR-row by DGEMM_NR-column tiles.
56 | #define DGEMM_MR 4
57 | #define DGEMM_NR 16
58 |
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 |
64 | #endif
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step7/my_sgemm.c:
--------------------------------------------------------------------------------
1 | #include "bl_sgemm.h"
2 | #include "bl_config.h"
3 |
4 | extern void RvvSgemm4x16( size_t nr, // columns in this tile, nr <= 16
5 | size_t mr, // rows in this tile, mr <= 4
6 | size_t k, // number of k-steps; astride = k*sizeof(float)
7 | const float* a, // first of mr rows of A, each k floats long
8 | const float* b, // packed B panel: k rows of 16 floats
9 | float* c, // top-left element of the mr * nr output tile
10 | size_t cn_stride, // byte distance between rows of C: Len(N) * sizeof(float)
11 | const float* bias // initial values loaded into every accumulator row
12 | );
13 |
14 |
15 | void bl_sgemm_pack( // Walks C in DGEMM_MR x DGEMM_NR tiles and calls RvvSgemm4x16 once per tile.
16 | int m, // number of rows of C to compute
17 | int mr, // not referenced in this function (tile height comes from DGEMM_MR)
18 | int n, // number of columns of C to compute
19 | int nr, // not referenced in this function (tile width comes from DGEMM_NR)
20 | int k, // shared dimension; also used as the row length of A when indexing
21 | float *A, // read directly: the tile starting at row i uses &A[i * k]
22 | float *packA, // not referenced in this function
23 | int lda, // not referenced in this function (A is indexed with k)
24 | float *B, // not referenced in this function (only packB is read)
25 | float *packB, // packed B: the panel for column block j starts at &packB[j * k]
26 | int ldb, // not referenced in this function
27 | float *C, // must be aligned
28 | int ldc // ldc must also be aligned; presumably consumed by the C(i, j) macro - confirm in bl_sgemm.h
29 | )
30 | {
31 | int i, j; // origin (row, column) of the current tile of C
32 |
33 |
34 | // Early return if possible
35 | if ( m == 0 || n == 0 || k == 0 ) {
36 | printf( "bl_sgemm(): early return\n" );
37 | return;
38 | }
39 |
40 | // All-zero bias row. The kernel loads at most nr <= DGEMM_NR floats from it per call.
41 | static const float bias[DGEMM_NR] = { 0 };
42 |
43 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 2-nd loop: column blocks of C
44 | int nb = DGEMM_NR;
45 | if((n - j) < DGEMM_NR) nb = n - j; // partial tile at the right edge
46 |
47 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 1-st loop: row blocks of C
48 | int mb = DGEMM_MR;
49 | if((m - i) < DGEMM_MR) mb = m - i; // partial tile at the bottom edge
50 |
51 | RvvSgemm4x16( nb, // nr <= 16, a0
52 | mb, // mr <= 4, a1
53 | k, // astride = k*sizeof(float), a2
54 | &A[i * k], // mr * k, a3
55 | &packB[j * k], // k * 16, a4
56 | &C( i, j ), // mr * nr, a5
57 | n * sizeof(float), // Len(N) * sizeof(float), a6; NOTE(review): uses n, not ldc - confirm they match
58 | bias
59 | );
60 | } // End 1-st loop
61 | } // End 2-nd loop
62 | }
63 |
64 |
65 |
--------------------------------------------------------------------------------
/sgemm/step7/run.sh:
--------------------------------------------------------------------------------
1 | make && # build the step7 test binary; the && chain stops at the first failing command
2 | adb push test_bl_sgemm_step7.x ./. && # copy the binary to the adb-connected device
3 | adb shell "./test_bl_sgemm_step7.x" # run it on the device
--------------------------------------------------------------------------------
/sgemm/step8/Makefile:
--------------------------------------------------------------------------------
1 | TARGET=test_bl_sgemm_step8
2 |
3 | #CROSS_COMPILE ?= riscv64-unknown-linux-gnu-
4 | # CROSS_COMPILE ?=
5 | # Toolchain location is hard-coded below; override CCL / CROSS_COMPILE on the make command line to use another one.
6 | CCL := /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
7 | CTOOL := riscv64-unknown-linux-gnu-
8 | CROSS_COMPILE := ${CCL}/bin/${CTOOL}
9 |
10 | CC = $(CROSS_COMPILE)gcc
11 | CXX = $(CROSS_COMPILE)g++
12 |
13 | COMPILER_OPT_LEVEL=O3
14 | # The x86 branch is only taken when CROSS_COMPILE is empty (e.g. `make CROSS_COMPILE=`); otherwise the RISC-V flags apply.
15 | ifeq ($(CROSS_COMPILE),)
16 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
17 | LDLIBS = -lpthread -lm
18 | else
19 | CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
20 | LDLIBS = -lm
21 | endif
22 |
23 | COMMONDIR = ../common
24 | CFLAGS += -I$(COMMONDIR)
25 | # Sources: the assembly micro-kernel, the local driver, and the shared test/reference/utility code.
26 | ASM_SRC= RvvSgemm4x16.S
27 | CC_SRC= my_sgemm.c \
28 | $(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
29 | $(COMMONDIR)/bl_sgemm_ref.c \
30 | $(COMMONDIR)/bl_sgemm_util.c
31 |
32 | CPP_SRC=
33 | # Object names are formed by appending .o to each source path (e.g. my_sgemm.c.o).
34 | ALL_ASM_OBJS := $(ASM_SRC:=.o)
35 | ALL_C_OBJS := $(CC_SRC:=.o)
36 | ALL_CXX_OBJS := $(CPP_SRC:=.o)
37 | # Each compile also writes a dependency file named <object>.d.
38 | MKDEP_OPT = -MMD -MF $@.d
39 |
40 | ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
41 | ALL_DEPS := $(ALL_OBJS:=.d)
42 |
43 | BLISLAB_TEST_EXE=$(TARGET:=.x)
44 |
45 | .PHONY: all clean
46 |
47 | all: $(BLISLAB_TEST_EXE)
48 |
49 | # include dependency files of application
50 | ifneq ($(MAKECMDGOALS),clean)
51 | -include $(ALL_DEPS)
52 | endif
53 | # Link all objects into the test executable.
54 | $(BLISLAB_TEST_EXE): $(ALL_OBJS)
55 | $(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)
56 |
57 | # ---------------------------------------------------------------------------
58 | # Object files compiling rules
59 | # ---------------------------------------------------------------------------
60 | %.c.o: %.c
61 | $(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
62 |
63 | %.cpp.o: %.cpp
64 | $(CXX) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
65 |
66 | %.S.o: %.S
67 | $(CC) $(CFLAGS) -c $< -o $@ $(LDFLAGS) $(MKDEP_OPT)
68 | # ---------------------------------------------------------------------------
69 | # NOTE(review): SHAREDLIBBLISLAB is not defined in this Makefile, so it expands to nothing in the clean rule.
70 | clean:
71 | @rm -rf $(ALL_OBJS) $(SHAREDLIBBLISLAB) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/sgemm/step8/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
1 | # |<-- k=4 -->|
2 | # +++++++++++++ - f0, f4, f0, f4
3 | # + + | f1, f5, f1, f5
4 | # + A + mr=4 f2, f6, f2, f6
5 | # + + | f3, f7, f3, f7
6 | # +++++++++++++ -
7 |
8 |
9 | # |<-- nr=16 -->|
10 | # +++++++++++++++++ - v0 v1 v2 v3
11 | # + + | v4 v5 v6 v7
12 | # + B + k=4 v8,v9,v10,v11
13 | # + + | v12,v13,v14,v15
14 | # +++++++++++++++++ -
15 |
16 |
17 | # |<-- nr -->|
18 | # +++++++++++++++++ -
19 | # + + | v16,v17,v18,v19
20 | # + C + mr=4 v20,v21,v22,v23
21 | # + + | v24,v25,v26,v27
22 | # +++++++++++++++++ - v28,v29,v30,v31
23 | #
24 |
25 | # void RvvSgemm4x16(size_t nr, // nr <= 16, a0
26 | # size_t mr, // mr <= 4, a1
27 | # size_t k, // astride = k*sizeof(float), a2
28 | # const float* a, // mr * k, a3
29 | # const float* b, // k * 16, a4
30 | # float* c, // mr * nr, a5
31 | # size_t c_stride, // Len(N) * sizeof(float), a6
32 | # const float* bias // bias, a7
33 | # );
34 | .global RvvSgemm4x16
35 | .type RvvSgemm4x16, @function
36 |
37 | #define nr a0
38 | #define mr a1
39 | #define k a2
40 | #define ap a3
41 | #define bp a4
42 | #define cp a5
43 | #define c_stride a6
44 | #define bias a7
45 |
46 | #define ap1 t0
47 | #define ap2 t1
48 | #define ap3 t2
49 | #define cp1 t3
50 | #define cp2 t4
51 | #define cp3 t5
52 | #define a_stride s0
53 | #define bp0 s1
54 | #define biasp s2
55 | #define kt s3
56 | #define ap0 s4
57 | #define cp0 s5
58 | #define bp1 s6
59 | #define vl s7
60 | #define ap_offset s10
61 | #define cp_offset s11
62 |
63 | #define FRAMESIZE 112
64 | // FRAMESIZE: 96 bytes for s0-s11 plus padding so that sp stays 16-byte aligned (RISC-V psABI).
65 | RvvSgemm4x16:
66 | addi sp, sp, -FRAMESIZE # reserve the register save area
67 | sd s0, 96(sp) # save callee-saved s0..s11 at the sp offsets the epilogue reloads
68 | // s0 is not kept as a frame pointer: it is reused as a_stride below.
69 | sd s1, 88(sp)
70 | sd s2, 80(sp)
71 | sd s3, 72(sp)
72 | sd s4, 64(sp)
73 | sd s5, 56(sp)
74 | sd s6, 48(sp)
75 | sd s7, 40(sp)
76 | sd s8, 32(sp)
77 | sd s9, 24(sp)
78 | sd s10, 16(sp)
79 | sd s11, 8(sp)
80 |
81 | li ap_offset, 0
82 | li cp_offset, 0
83 | slli a_stride, k, 2 # astride = k * sizeof(float)
84 | mv s3, nr
85 | vsetvli s2, s3, e32, m4 # request nr 32-bit elements per 4-register group
86 | mv ap0, ap
87 | mv bp0, bp
88 | mv cp0, cp
89 | .a1_offset:
90 | mv ap1, ap0 # default: row 1 aliases row 0 (used when mr < 2)
91 | mv cp1, cp0
92 | slti t6, mr, 2 # mr < 2
93 | bnez t6, .a2_offset
94 | add ap1, ap0, a_stride # row 1 of A
95 | add cp1, cp0, c_stride # row 1 of C
96 | .a2_offset:
97 | mv ap2, ap1 # default: row 2 aliases row 1 (used when mr < 3)
98 | mv cp2, cp1
99 | slti t6, mr, 3 # mr < 3
100 | bnez t6, .a3_offset
101 | add ap2, ap1, a_stride # row 2 of A
102 | add cp2, cp1, c_stride # row 2 of C
103 | .a3_offset:
104 | mv ap3, ap2 # default: row 3 aliases row 2 (used when mr < 4)
105 | mv cp3, cp2
106 | slti t6, mr, 4 # mr < 4
107 | bnez t6, .start
108 | add ap3, ap2, a_stride # row 3 of A
109 | add cp3, cp2, c_stride # row 3 of C
110 |
111 | .start:
112 | mv biasp, bias
113 | mv kt, k # kt = number of k-steps still to do
114 | beqz mr, .end # nothing to compute for an empty tile
115 |
116 | vlw.v v16, (biasp) # all four accumulator rows start from the same bias values
117 | vlw.v v20, (biasp)
118 | vlw.v v24, (biasp)
119 | vlw.v v28, (biasp)
120 | addi biasp, biasp, 64
121 | slti t6, kt, 4 # kt < 4, t6 = 1
122 | bnez t6, .k2_tail # fewer than 4 k-steps in total: only the remainder code runs
123 |
124 | // Pre-loads touch data ahead of use; results go to caller-saved FP registers and are never read.
125 | flw ft8, 64(ap0) # pre-load A
126 | flw ft9, 64(ap1) # pre-load A
127 | flw ft10, 64(ap2) # pre-load A
128 | flw ft11, 64(ap3) # pre-load A
129 | // NOTE(review): these look-ahead loads can address bytes past the end of A / the B panel - confirm buffer slack.
130 | flw fa0, 512(bp0) # pre-load B
131 | flw fa1, 576(bp0) # pre-load B
132 | flw fa2, 640(bp0) # pre-load B
133 | flw fa3, 704(bp0) # pre-load B
134 |
135 | // load one element from each of the 4 rows of A (first k-step)
136 | flw ft0, (ap0)
137 | addi ap0, ap0, 4
138 | flw ft1, (ap1)
139 | addi ap1, ap1, 4
140 | flw ft2, (ap2)
141 | addi ap2, ap2, 4
142 | flw ft3, (ap3)
143 | addi ap3, ap3, 4
144 | // load one row of the B panel (first k-step); panel rows are 64 bytes apart
145 | vlw.v v0, (bp0)
146 | addi bp0,bp0,64
147 |
148 | addi kt, kt, -4 # Decrement k counter
149 | slti t6, kt, 4 # kt < 4
150 | bnez t6, .k4_tail # jump to k4_tail
151 |
152 | .k4_main:
153 | addi kt, kt, -4 # Decrement k counter
154 | // k-step 1 of 4: accumulate with ft0-ft3 / v0 while loading step 2 operands
155 | vfmacc.vf v16, ft0, v0
156 | vlw.v v4, (bp0) # next B row -> v4
157 | flw fa0, 384(bp0) # pre-load B
158 | addi bp0,bp0,64
159 | vfmacc.vf v20, ft1, v0
160 | flw ft4, (ap0) # next A element, row 0 -> ft4
161 | addi ap0, ap0, 4
162 | vfmacc.vf v24, ft2, v0
163 | flw ft5, (ap1) # next A element, row 1 -> ft5
164 | addi ap1, ap1, 4
165 | vfmacc.vf v28, ft3, v0
166 | flw ft6, (ap2) # next A element, row 2 -> ft6
167 | addi ap2, ap2, 4
168 | flw ft7, (ap3) # next A element, row 3 -> ft7
169 | addi ap3, ap3, 4
170 | // k-step 2 of 4: accumulate with ft4-ft7 / v4 while loading step 3 operands
171 | slti t6, kt, 4 # kt < 4, t6 = 1 (loop exit condition, tested at the bottom)
172 | vfmacc.vf v16, ft4, v4
173 | vlw.v v8, (bp0) # next B row -> v8
174 | flw fa1, 384(bp0) # pre-load B
175 | addi bp0,bp0,64
176 | vfmacc.vf v20, ft5, v4
177 | flw ft0, (ap0) # next A element, row 0 -> ft0
178 | addi ap0, ap0, 4
179 | vfmacc.vf v24, ft6, v4
180 | flw ft1, (ap1) # next A element, row 1 -> ft1
181 | addi ap1, ap1, 4
182 | vfmacc.vf v28, ft7, v4
183 | flw ft2, (ap2) # next A element, row 2 -> ft2
184 | addi ap2, ap2, 4
185 | flw ft3, (ap3) # next A element, row 3 -> ft3
186 | addi ap3, ap3, 4
187 | // k-step 3 of 4: accumulate with ft0-ft3 / v8 while loading step 4 operands
188 | vfmacc.vf v16, ft0, v8
189 | vlw.v v12, (bp0) # next B row -> v12
190 | flw fa2, 384(bp0) # pre-load B
191 | addi bp0,bp0,64
192 | vfmacc.vf v20, ft1, v8
193 | flw ft4, (ap0) # next A element, row 0 -> ft4
194 | addi ap0, ap0, 4
195 | vfmacc.vf v24, ft2, v8
196 | flw ft5, (ap1) # next A element, row 1 -> ft5
197 | addi ap1, ap1, 4
198 | vfmacc.vf v28, ft3, v8
199 | flw ft6, (ap2) # next A element, row 2 -> ft6
200 | addi ap2, ap2, 4
201 | flw ft7, (ap3) # next A element, row 3 -> ft7
202 | addi ap3, ap3, 4
203 | // k-step 4 of 4: accumulate with ft4-ft7 / v12 while loading the next iteration's step 1 operands
204 | vfmacc.vf v16, ft4, v12
205 | vlw.v v0, (bp0) # next B row -> v0
206 | flw fa3, 384(bp0) # pre-load B
207 | addi bp0,bp0,64
208 | vfmacc.vf v20, ft5, v12
209 | flw ft0, (ap0) # next A element, row 0 -> ft0
210 | addi ap0, ap0, 4
211 | vfmacc.vf v24, ft6, v12
212 | flw ft1, (ap1) # next A element, row 1 -> ft1
213 | addi ap1, ap1, 4
214 | vfmacc.vf v28, ft7, v12
215 | flw ft2, (ap2) # next A element, row 2 -> ft2
216 | addi ap2, ap2, 4
217 | flw ft3, (ap3) # next A element, row 3 -> ft3
218 | addi ap3, ap3, 4
219 |
220 | flw ft8, 64(ap0) # pre-load A
221 | flw ft9, 64(ap1) # pre-load A
222 | flw ft10, 64(ap2) # pre-load A
223 | flw ft11, 64(ap3) # pre-load A
224 |
225 | beqz t6, .k4_main # at least 4 more k-steps remain: run another full iteration
226 | .k4_tail:
227 | // last full group, k-step 1 of 4: operands were loaded before entry; load step 2 operands
228 | vfmacc.vf v16, ft0, v0
229 | vlw.v v4, (bp0) # next B row -> v4
230 | addi bp0,bp0,64
231 | vfmacc.vf v20, ft1, v0
232 | flw ft4, (ap0) # next A element, row 0 -> ft4
233 | addi ap0, ap0, 4
234 | vfmacc.vf v24, ft2, v0
235 | flw ft5, (ap1) # next A element, row 1 -> ft5
236 | addi ap1, ap1, 4
237 | vfmacc.vf v28, ft3, v0
238 | flw ft6, (ap2) # next A element, row 2 -> ft6
239 | addi ap2, ap2, 4
240 | flw ft7, (ap3) # next A element, row 3 -> ft7
241 | addi ap3, ap3, 4
242 | // k-step 2 of 4: accumulate with ft4-ft7 / v4 while loading step 3 operands
243 | vfmacc.vf v16, ft4, v4
244 | vlw.v v8, (bp0) # next B row -> v8
245 | addi bp0,bp0,64
246 | vfmacc.vf v20, ft5, v4
247 | flw ft0, (ap0) # next A element, row 0 -> ft0
248 | addi ap0, ap0, 4
249 | vfmacc.vf v24, ft6, v4
250 | flw ft1, (ap1) # next A element, row 1 -> ft1
251 | addi ap1, ap1, 4
252 | vfmacc.vf v28, ft7, v4
253 | flw ft2, (ap2) # next A element, row 2 -> ft2
254 | addi ap2, ap2, 4
255 | flw ft3, (ap3) # next A element, row 3 -> ft3
256 | addi ap3, ap3, 4
257 | // k-step 3 of 4: accumulate with ft0-ft3 / v8 while loading step 4 operands
258 | vfmacc.vf v16, ft0, v8
259 | vlw.v v12, (bp0) # next B row -> v12
260 | addi bp0,bp0,64
261 | vfmacc.vf v20, ft1, v8
262 | flw ft4, (ap0) # next A element, row 0 -> ft4
263 | addi ap0, ap0, 4
264 | vfmacc.vf v24, ft2, v8
265 | flw ft5, (ap1) # next A element, row 1 -> ft5
266 | addi ap1, ap1, 4
267 | vfmacc.vf v28, ft3, v8
268 | flw ft6, (ap2) # next A element, row 2 -> ft6
269 | addi ap2, ap2, 4
270 | flw ft7, (ap3) # next A element, row 3 -> ft7
271 | addi ap3, ap3, 4
272 | // k-step 4 of 4: accumulate only, nothing further is loaded ahead
273 | vfmacc.vf v16, ft4, v12
274 | vfmacc.vf v20, ft5, v12
275 | vfmacc.vf v24, ft6, v12
276 | vfmacc.vf v28, ft7, v12
277 | .k2_tail:
278 | andi t6, kt, 2 # kt is the remainder 0..3 here; bit 1 set means two k-steps are left
279 | beqz t6, .k1_tail # bit test (kt is never decremented here) so k1_tail cannot add a third step
280 | flw ft0, (ap0)
281 | addi ap0, ap0, 4
282 | vlw.v v0, (bp0)
283 | addi bp0,bp0,64
284 | flw ft1, (ap1)
285 | addi ap1, ap1, 4
286 | flw ft2, (ap2)
287 | addi ap2, ap2, 4
288 | flw ft3, (ap3)
289 | addi ap3, ap3, 4
290 | // remainder k-step 1 of 2: accumulate while loading the second step's operands
291 | vfmacc.vf v16, ft0, v0
292 | vlw.v v4, (bp0) # next B row -> v4
293 | addi bp0,bp0,64
294 | vfmacc.vf v20, ft1, v0
295 | flw ft4, (ap0) # next A element, row 0 -> ft4
296 | addi ap0, ap0, 4
297 | vfmacc.vf v24, ft2, v0
298 | flw ft5, (ap1) # next A element, row 1 -> ft5
299 | addi ap1, ap1, 4
300 | vfmacc.vf v28, ft3, v0
301 | flw ft6, (ap2) # next A element, row 2 -> ft6
302 | addi ap2, ap2, 4
303 | flw ft7, (ap3) # next A element, row 3 -> ft7
304 | addi ap3, ap3, 4
305 | // remainder k-step 2 of 2: accumulate only
306 | vfmacc.vf v16, ft4, v4
307 | vfmacc.vf v20, ft5, v4
308 | vfmacc.vf v24, ft6, v4
309 | vfmacc.vf v28, ft7, v4
310 | .k1_tail:
311 | andi t6, kt, 1 # bit 0 of the remainder set means one k-step is left
312 | beqz t6, .store_tile
313 | flw ft0, (ap0)
314 | addi ap0, ap0, 4
315 | vlw.v v0, (bp0)
316 | addi bp0,bp0,64
317 | flw ft1, (ap1)
318 | addi ap1, ap1, 4
319 | flw ft2, (ap2)
320 | addi ap2, ap2, 4
321 | flw ft3, (ap3)
322 | addi ap3, ap3, 4
323 | vfmacc.vf v16, ft0, v0
324 | vfmacc.vf v20, ft1, v0
325 | vfmacc.vf v24, ft2, v0
326 | vfmacc.vf v28, ft3, v0
327 | .store_tile:
328 | add cp0, cp0, cp_offset # cp_offset is always 0 in this kernel
329 | vsw.v v16, (cp0) # write accumulator row 0 to C
330 | addi cp0, cp0, 64
331 |
332 | vsw.v v20, (cp1) # row 1 (same address as row 0 when mr < 2)
333 | addi cp1, cp1, 64
334 |
335 | vsw.v v24, (cp2) # row 2 (aliases the previous row when mr < 3)
336 | addi cp2, cp2, 64
337 |
338 | vsw.v v28, (cp3) # row 3 (aliases the previous row when mr < 4)
339 | addi cp3, cp3, 64
340 | j .end
341 |
342 | .end:
343 | ld s0, 96(sp) # restore callee-saved registers
344 | ld s1, 88(sp)
345 | ld s2, 80(sp)
346 | ld s3, 72(sp)
347 | ld s4, 64(sp)
348 | ld s5, 56(sp)
349 | ld s6, 48(sp)
350 | ld s7, 40(sp)
351 | ld s8, 32(sp)
352 | ld s9, 24(sp)
353 | ld s10, 16(sp)
354 | ld s11, 8(sp)
355 | addi sp, sp, FRAMESIZE
356 | ret
357 |
--------------------------------------------------------------------------------
/sgemm/step8/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
46 | #ifndef BLISLAB_CONFIG_H
47 | #define BLISLAB_CONFIG_H
48 |
49 | // Allow C++ users to include this header file in their source code. However,
50 | // we make the extern "C" conditional on whether we're using a C++ compiler,
51 | // since regular C compilers don't understand the extern "C" construct.
52 | #ifdef __cplusplus
53 | extern "C" {
54 | #endif
55 | // Tile shape: bl_sgemm_pack in my_sgemm.c steps over C in DGEMM_MR-row by DGEMM_NR-column tiles.
56 | #define DGEMM_MR 4
57 | #define DGEMM_NR 16
58 |
59 | // End extern "C" construct block.
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 |
64 | #endif
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step8/my_sgemm.c:
--------------------------------------------------------------------------------
1 | #include "bl_sgemm.h"
2 | #include "bl_config.h"
3 |
4 | extern void RvvSgemm4x16( size_t nr, // columns in this tile, nr <= 16
5 | size_t mr, // rows in this tile, mr <= 4
6 | size_t k, // number of k-steps; astride = k*sizeof(float)
7 | const float* a, // first of mr rows of A, each k floats long
8 | const float* b, // packed B panel: k rows of 16 floats
9 | float* c, // top-left element of the mr * nr output tile
10 | size_t cn_stride, // byte distance between rows of C: Len(N) * sizeof(float)
11 | const float* bias // initial values loaded into every accumulator row
12 | );
13 |
14 |
15 | void bl_sgemm_pack( // Walks C in DGEMM_MR x DGEMM_NR tiles and calls RvvSgemm4x16 once per tile.
16 | int m, // number of rows of C to compute
17 | int mr, // not referenced in this function (tile height comes from DGEMM_MR)
18 | int n, // number of columns of C to compute
19 | int nr, // not referenced in this function (tile width comes from DGEMM_NR)
20 | int k, // shared dimension; also used as the row length of A when indexing
21 | float *A, // read directly: the tile starting at row i uses &A[i * k]
22 | float *packA, // not referenced in this function
23 | int lda, // not referenced in this function (A is indexed with k)
24 | float *B, // not referenced in this function (only packB is read)
25 | float *packB, // packed B: the panel for column block j starts at &packB[j * k]
26 | int ldb, // not referenced in this function
27 | float *C, // must be aligned
28 | int ldc // ldc must also be aligned; presumably consumed by the C(i, j) macro - confirm in bl_sgemm.h
29 | )
30 | {
31 | int i, j; // origin (row, column) of the current tile of C
32 |
33 |
34 | // Early return if possible
35 | if ( m == 0 || n == 0 || k == 0 ) {
36 | printf( "bl_sgemm(): early return\n" );
37 | return;
38 | }
39 |
40 | // All-zero bias row. The kernel loads at most nr <= DGEMM_NR floats from it per call.
41 | static const float bias[DGEMM_NR] = { 0 };
42 |
43 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 2-nd loop: column blocks of C
44 | int nb = DGEMM_NR;
45 | if((n - j) < DGEMM_NR) nb = n - j; // partial tile at the right edge
46 |
47 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 1-st loop: row blocks of C
48 | int mb = DGEMM_MR;
49 | if((m - i) < DGEMM_MR) mb = m - i; // partial tile at the bottom edge
50 |
51 | RvvSgemm4x16( nb, // nr <= 16, a0
52 | mb, // mr <= 4, a1
53 | k, // astride = k*sizeof(float), a2
54 | &A[i * k], // mr * k, a3
55 | &packB[j * k], // k * 16, a4
56 | &C( i, j ), // mr * nr, a5
57 | n * sizeof(float), // Len(N) * sizeof(float), a6; NOTE(review): uses n, not ldc - confirm they match
58 | bias
59 | );
60 | } // End 1-st loop
61 | } // End 2-nd loop
62 | }
63 |
64 |
65 |
--------------------------------------------------------------------------------
/sgemm/step8/run.sh:
--------------------------------------------------------------------------------
# Build the step8 test binary, push it to the adb-connected device, then run
# it there. Each stage runs only if the previous one succeeded.
make \
  && adb push test_bl_sgemm_step8.x ./. \
  && adb shell "./test_bl_sgemm_step8.x"
--------------------------------------------------------------------------------
/sgemm/step9/Makefile:
--------------------------------------------------------------------------------
# Build the step9 SGEMM test driver: $(TARGET).x
#
# The toolchain location is only a default and can be overridden without
# editing this file, from the environment or the command line:
#   make CCL=/opt/Xuantie-900-gcc        # toolchain installed elsewhere
#   make CROSS_COMPILE=                  # native build with the host gcc
TARGET=test_bl_sgemm_step9

CCL           ?= /home/zhaodongyu/toolchain/riscv/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
CTOOL         ?= riscv64-unknown-linux-gnu-
CROSS_COMPILE ?= $(CCL)/bin/$(CTOOL)

CC  = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++

COMPILER_OPT_LEVEL=O3

# An empty CROSS_COMPILE selects the native flags; anything else selects the
# RISC-V (RVV 0.7 / C906) flags.
ifeq ($(CROSS_COMPILE),)
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=core-avx2 -fPIC -static
LDLIBS = -lpthread -lm
else
CFLAGS = -$(COMPILER_OPT_LEVEL) -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -static
LDLIBS = -lm
endif

COMMONDIR = ../common
CFLAGS += -I$(COMMONDIR)

ASM_SRC= RvvSgemm4x16.S
CC_SRC= my_sgemm.c \
	$(COMMONDIR)/test_bl_sgemm_packB_4x16.c \
	$(COMMONDIR)/bl_sgemm_ref.c \
	$(COMMONDIR)/bl_sgemm_util.c

CPP_SRC=

# Objects keep the source suffix (foo.c -> foo.c.o) so the pattern rules below
# can tell C, C++ and assembly sources apart.
ALL_ASM_OBJS := $(ASM_SRC:=.o)
ALL_C_OBJS   := $(CC_SRC:=.o)
ALL_CXX_OBJS := $(CPP_SRC:=.o)

# Emit a make-format dependency file next to every object.
MKDEP_OPT = -MMD -MF $@.d

ALL_OBJS += $(ALL_ASM_OBJS) $(ALL_C_OBJS) $(ALL_CXX_OBJS)
ALL_DEPS := $(ALL_OBJS:=.d)

BLISLAB_TEST_EXE=$(TARGET:=.x)

.PHONY: all clean

all: $(BLISLAB_TEST_EXE)

# Pull in the generated dependency files, except when only cleaning.
ifneq ($(MAKECMDGOALS),clean)
-include $(ALL_DEPS)
endif

$(BLISLAB_TEST_EXE): $(ALL_OBJS)
	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# ---------------------------------------------------------------------------
# Object file compile rules (compile only, so no linker flags here)
# ---------------------------------------------------------------------------
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)

%.cpp.o: %.cpp
	$(CXX) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)

%.S.o: %.S
	$(CC) $(CFLAGS) -c $< -o $@ $(MKDEP_OPT)
# ---------------------------------------------------------------------------

clean:
	@rm -f $(ALL_OBJS) $(BLISLAB_TEST_EXE) $(ALL_DEPS)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/sgemm/step9/RvvSgemm4x16.S:
--------------------------------------------------------------------------------
/*
 * RvvSgemm4x16 -- single-precision GEMM micro-kernel for one C tile of up to
 * 4 rows x 16 columns, written with RVV 0.7.1 mnemonics (vlw.v / vsw.v).
 *
 *   void RvvSgemm4x16(size_t nr,            a0  columns to produce (requested vl)
 *                     size_t mr,            a1  rows to store, 0..4
 *                     size_t k,             a2  depth (number of k steps)
 *                     const float* a,       a3  A operand
 *                     const float* b,       a4  B operand
 *                     float* c,             a5  first row of the C tile
 *                     size_t c_stride,      a6  bytes between C rows
 *                     const float* bias);   a7  bias vector
 *
 * Operand layout as consumed by the code below:
 *
 *   A   one group of 4 consecutive floats per k step (rows 0..3 of that
 *       k column); the pointer advances 16 bytes per step.
 *
 *              |<--  k  -->|
 *              +++++++++++++  -    row 0 -> ft0 / ft4
 *              +     A     +  |    row 1 -> ft1 / ft5
 *              +           + mr=4  row 2 -> ft2 / ft6
 *              +++++++++++++  -    row 3 -> ft3 / ft7
 *
 *   B   one 64-byte row (16 floats) per k step; the four k steps of an
 *       unrolled iteration live in the register groups v0, v4, v8, v12.
 *
 *   C   row r (r < mr) is stored at c + r * c_stride. The accumulators are
 *       the register groups v16 (row 0), v20 (row 1), v24 (row 2) and
 *       v28 (row 3), each initialised from the bias vector.
 *
 * Only caller-saved registers are used (t0-t6, ft0-ft7, fa0-fa7, v0-v31),
 * so the kernel needs no stack frame and leaves sp untouched.
 *
 * NOTE(review): all four A values of a k step are loaded even when mr < 4,
 *   so the A operand is presumably padded to 4 rows per step -- confirm
 *   against the packing routine.
 * NOTE(review): the touch loads below read up to 256 bytes beyond the current
 *   A position and up to 704 bytes beyond the current B position; both
 *   buffers must be readable that far -- confirm against the allocator.
 * NOTE(review): a full 16-column tile assumes VLMAX for e32/m4 is at least
 *   16; the code itself only requests vl = nr.
 */
    .text
    .global RvvSgemm4x16
    .type   RvvSgemm4x16, @function

// Incoming arguments.
#define nr        a0
#define mr        a1
#define k         a2
#define ap        a3
#define bp        a4
#define cp        a5
#define c_stride  a6
#define bias      a7

// Working registers (all caller-saved).
#define ap0       t0      // running A pointer
#define kt        t1      // k steps still to do
#define bp0       t2      // running B pointer
#define cp1       t3      // C row 1
#define cp2       t4      // C row 2
#define cp3       t5      // C row 3
#define cp0       cp      // C row 0 (the incoming pointer itself)

RvvSgemm4x16:
    beqz    mr, .Lend               // no rows requested: nothing to do

    vsetvli t6, nr, e32, m4         // vl = min(nr, VLMAX), 32-bit elements, LMUL 4
    mv      ap0, ap
    mv      bp0, bp
    mv      kt, k

    // Row pointers. A row that does not exist (r >= mr) aliases the last
    // valid row, so every store below targets memory inside the tile.
    mv      cp1, cp0
    slti    t6, mr, 2               // mr < 2
    bnez    t6, .Lrow2
    add     cp1, cp0, c_stride
.Lrow2:
    mv      cp2, cp1
    slti    t6, mr, 3               // mr < 3
    bnez    t6, .Lrow3
    add     cp2, cp1, c_stride
.Lrow3:
    mv      cp3, cp2
    slti    t6, mr, 4               // mr < 4
    bnez    t6, .Lacc_init
    add     cp3, cp2, c_stride

.Lacc_init:
    // Every accumulator row starts from the same bias vector.
    vlw.v   v16, (bias)
    vlw.v   v20, (bias)
    vlw.v   v24, (bias)
    vlw.v   v28, (bias)

    slti    t6, kt, 4               // fewer than 4 k steps: tails only
    bnez    t6, .Lk2_tail

    // Touch loads: bring upcoming A / B cache lines in early. The loaded
    // values are discarded; fa0-fa7 are caller-saved scratch here.
    flw     fa0, 64(ap0)
    flw     fa1, 128(ap0)
    flw     fa2, 192(ap0)
    flw     fa3, 256(ap0)

    flw     fa4, 512(bp0)
    flw     fa5, 576(bp0)
    flw     fa6, 640(bp0)
    flw     fa7, 704(bp0)

    // Prime the software pipeline with k step 0: four A values, one B row.
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64

    addi    kt, kt, -4              // the primed group of 4 is accounted for
    slti    t6, kt, 4               // no further full group: drain directly
    bnez    t6, .Lk4_tail

.Lk4_main:
    // Steady state: 4 k steps per iteration. Each FMA group overlaps with
    // the loads for the following k step; the last group reloads step 0 of
    // the next iteration.
    addi    kt, kt, -4
    // k step 0 FMAs, k step 1 loads
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)
    flw     fa4, 384(bp0)           // touch B ahead
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap0)
    addi    ap0, ap0, 4
    flw     ft7, (ap0)
    addi    ap0, ap0, 4
    // k step 1 FMAs, k step 2 loads
    slti    t6, kt, 4               // t6 = 1 when this is the last full iteration
    vfmacc.vf v16, ft4, v4
    vlw.v   v8, (bp0)
    flw     fa5, 384(bp0)           // touch B ahead
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v4
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v4
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft7, v4
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4
    // k step 2 FMAs, k step 3 loads
    vfmacc.vf v16, ft0, v8
    vlw.v   v12, (bp0)
    flw     fa6, 384(bp0)           // touch B ahead
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v8
    flw     ft4, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v8
    flw     ft5, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft3, v8
    flw     ft6, (ap0)
    addi    ap0, ap0, 4
    flw     ft7, (ap0)
    addi    ap0, ap0, 4
    // k step 3 FMAs, loads for k step 0 of the next group
    vfmacc.vf v16, ft4, v12
    vlw.v   v0, (bp0)
    flw     fa7, 384(bp0)           // touch B ahead
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v12
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v12
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft7, v12
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4

    flw     fa0, 64(ap0)            // touch A ahead
    flw     fa1, 128(ap0)
    flw     fa2, 192(ap0)
    flw     fa3, 256(ap0)

    beqz    t6, .Lk4_main

.Lk4_tail:
    // Drain: the last primed group of 4 k steps, without loading a next group.
    // k step 0 FMAs, k step 1 loads
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap0)
    addi    ap0, ap0, 4
    flw     ft7, (ap0)
    addi    ap0, ap0, 4
    // k step 1 FMAs, k step 2 loads
    vfmacc.vf v16, ft4, v4
    vlw.v   v8, (bp0)
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft5, v4
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft6, v4
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft7, v4
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4
    // k step 2 FMAs, k step 3 loads
    vfmacc.vf v16, ft0, v8
    vlw.v   v12, (bp0)
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v8
    flw     ft4, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v8
    flw     ft5, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft3, v8
    flw     ft6, (ap0)
    addi    ap0, ap0, 4
    flw     ft7, (ap0)
    addi    ap0, ap0, 4
    // k step 3 FMAs, nothing left to load
    vfmacc.vf v16, ft4, v12
    vfmacc.vf v20, ft5, v12
    vfmacc.vf v24, ft6, v12
    vfmacc.vf v28, ft7, v12

.Lk2_tail:
    // Here kt holds the 0..3 k steps that remain.
    slti    t6, kt, 2               // kt < 2
    bnez    t6, .Lk1_tail
    addi    kt, kt, -2              // consume these two steps so the 1-step
                                    // tail only runs when one step is left
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4
    // first k step FMAs, second k step loads
    vfmacc.vf v16, ft0, v0
    vlw.v   v4, (bp0)
    addi    bp0, bp0, 64
    vfmacc.vf v20, ft1, v0
    flw     ft4, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v24, ft2, v0
    flw     ft5, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v28, ft3, v0
    flw     ft6, (ap0)
    addi    ap0, ap0, 4
    flw     ft7, (ap0)
    addi    ap0, ap0, 4
    // second k step FMAs
    vfmacc.vf v16, ft4, v4
    vfmacc.vf v20, ft5, v4
    vfmacc.vf v24, ft6, v4
    vfmacc.vf v28, ft7, v4

.Lk1_tail:
    slti    t6, kt, 1               // kt < 1
    bnez    t6, .Lstore_tile
    flw     ft0, (ap0)
    addi    ap0, ap0, 4
    vlw.v   v0, (bp0)
    addi    bp0, bp0, 64
    flw     ft1, (ap0)
    addi    ap0, ap0, 4
    flw     ft2, (ap0)
    addi    ap0, ap0, 4
    flw     ft3, (ap0)
    addi    ap0, ap0, 4
    vfmacc.vf v16, ft0, v0
    vfmacc.vf v20, ft1, v0
    vfmacc.vf v24, ft2, v0
    vfmacc.vf v28, ft3, v0

.Lstore_tile:
    // Store the highest row first. When mr < 4 the missing rows alias the
    // last valid row, so that row's own accumulator must be written last.
    vsw.v   v28, (cp3)
    vsw.v   v24, (cp2)
    vsw.v   v20, (cp1)
    vsw.v   v16, (cp0)

.Lend:
    ret
    .size   RvvSgemm4x16, .-RvvSgemm4x16
349 |
--------------------------------------------------------------------------------
/sgemm/step9/bl_config.h:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------
3 | * BLISLAB
4 | * --------------------------------------------------------------------------
5 | * Copyright (C) 2016, The University of Texas at Austin
6 | *
7 | * Redistribution and use in source and binary forms, with or without
8 | * modification, are permitted provided that the following conditions are
9 | * met:
10 | * - Redistributions of source code must retain the above copyright
11 | * notice, this list of conditions and the following disclaimer.
12 | * - Redistributions in binary form must reproduce the above copyright
13 | * notice, this list of conditions and the following disclaimer in the
14 | * documentation and/or other materials provided with the distribution.
15 | * - Neither the name of The University of Texas nor the names of its
16 | * contributors may be used to endorse or promote products derived
17 | * from this software without specific prior written permission.
18 | *
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | *
32 | * bl_config.h
33 | *
34 | *
35 | * Purpose:
36 | * this header file contains configuration parameters.
37 | *
38 | * Todo:
39 | *
40 | *
41 | * Modification:
42 | *
43 | *
44 | * */
45 |
#ifndef BLISLAB_CONFIG_H
#define BLISLAB_CONFIG_H

/*
 * C++ translation units get the contents wrapped in extern "C"; the wrapper
 * is keyed on __cplusplus because a plain C compiler does not accept the
 * construct.
 */
#ifdef __cplusplus
extern "C" {
#endif

/* Micro-kernel tile shape: columns and rows of C handled per kernel call. */
#define DGEMM_NR 16
#define DGEMM_MR 4

#ifdef __cplusplus
}   /* extern "C" */
#endif

#endif /* BLISLAB_CONFIG_H */
65 |
66 |
--------------------------------------------------------------------------------
/sgemm/step9/my_sgemm.c:
--------------------------------------------------------------------------------
1 | #include "bl_sgemm.h"
2 | #include "bl_config.h"
3 |
/*
 * Micro-kernel entry point; the definition lives outside this file.
 *
 *   nr        - tile width,  nr <= 16
 *   mr        - tile height, mr <= 4
 *   k         - depth; the A stride is k * sizeof(float)
 *   a         - A operand, mr * k floats
 *   b         - B operand, k * 16 floats
 *   c         - output tile, mr * nr floats
 *   cn_stride - Len(N) * sizeof(float)
 *   bias      - bias values
 */
extern void RvvSgemm4x16( size_t nr, size_t mr, size_t k,
                          const float* a, const float* b, float* c,
                          size_t cn_stride, const float* bias );
13 |
14 |
/*
 * Repack a row-major m x k matrix into panels of mr rows.
 *
 * Panel j covers source rows j*mr .. j*mr + mr - 1. Within a panel the data
 * is written column by column: for each column i, the mr values
 * src[(j*mr + p)*k + i], p = 0 .. mr-1, are stored contiguously.
 *
 *   dst - destination, receives (m / mr) * mr * k floats
 *   src - source matrix, row-major with row length k
 *   m   - number of source rows
 *   k   - number of source columns
 *   mr  - rows per panel
 *
 * Only the m / mr complete panels are written; the trailing m % mr rows are
 * not copied. Nothing is written when mr <= 0 (this also avoids dividing by
 * zero).
 *
 * Declared static: a plain C99 `inline` definition provides no out-of-line
 * copy, so any call the compiler does not inline (e.g. at -O0) would fail to
 * link.
 */
static inline void PackInputLayout(float* dst, const float* src, int m, int k, int mr) {
    if ( mr <= 0 ) {
        return;
    }

    const int panels = m / mr;

    for ( int panel = 0; panel < panels; panel++ ) {
        // First element of this panel's top row; size_t keeps the offset
        // from overflowing int on large matrices.
        const float* panel_src = src + (size_t)panel * (size_t)mr * (size_t)k;

        for ( int col = 0; col < k; col++ ) {
            for ( int row = 0; row < mr; row++ ) {
                *dst++ = panel_src[ (size_t)row * (size_t)k + (size_t)col ];
            }
        }
    }
}
26 |
27 |
28 | void bl_sgemm_pack(
29 | int m,
30 | int mr,
31 | int n,
32 | int nr,
33 | int k,
34 | float *A,
35 | float *packA,
36 | int lda,
37 | float *B,
38 | float *packB,
39 | int ldb,
40 | float *C, // must be aligned
41 | int ldc // ldc must also be aligned
42 | )
43 | {
44 | int i, j, p;
45 | int ir, jr;
46 |
47 | // Early return if possible
48 | if ( m == 0 || n == 0 || k == 0 ) {
49 | printf( "bl_sgemm(): early return\n" );
50 | return;
51 | }
52 |
53 | PackInputLayout(packA, A, m, k, mr);
54 |
55 | // printf("[A]\n");
56 | // for(int i = 0; i < m; i++) {
57 | // for(int j = 0; j < k; j++) {
58 | // printf("%.0f\t", A[i * k + j]);
59 | // }
60 | // printf("\n");
61 | // }
62 | // printf("[packA]\n");
63 | // for(int i = 0; i < m; i++) {
64 | // for(int j = 0; j < k; j++) {
65 | // printf("%.0f\t", packA[i * k + j]);
66 | // }
67 | // printf("\n");
68 | // }
69 |
70 | float bias[800] = {0};
71 | for(int i = 0; i < 800; i++) {bias[i] = 0;}
72 |
73 | for ( j = 0; j < n; j += DGEMM_NR ) { // Start 2-st loop
74 | int nb = DGEMM_NR;
75 | if((n - j) < DGEMM_NR) nb = n - j;
76 |
77 | for ( i = 0; i < m; i += DGEMM_MR ) { // Start 1-nd loop
78 | int mb = DGEMM_MR;
79 | if((m - i) < DGEMM_MR) mb = m - i;
80 |
81 | RvvSgemm4x16( nb, // nr <= 16, a0
82 | mb, // mr <= 4, a1
83 | k, // astride = k*sizeof(float), a2
84 | &packA[i * k], // mr * k, a3
85 | &packB[j * k], // k * 16, a4
86 | &C( i, j ), // mr * nr, a5
87 | n * sizeof(float), // Len(N) * sizeof(float), a6
88 | bias
89 | );
90 | } // End 1-st loop
91 | } // End 2-nd loop
92 | }
93 |
--------------------------------------------------------------------------------
/sgemm/step9/run.sh:
--------------------------------------------------------------------------------
# Build the step9 test binary, push it to the adb-connected device, then run
# it there. Each stage runs only if the previous one succeeded.
make \
  && adb push test_bl_sgemm_step9.x ./. \
  && adb shell "./test_bl_sgemm_step9.x"
--------------------------------------------------------------------------------