├── LICENSE
├── Makefile
├── README.md
├── bench
│   ├── bench.sh
│   └── smem
│       ├── LDS
│       │   ├── Makefile
│       │   ├── lds128.sass
│       │   ├── lds32.sass
│       │   ├── lds64.sass
│       │   ├── lds64_opt3.sass
│       │   └── main.cu
│       └── STS
│           ├── Makefile
│           ├── main.cu
│           ├── sts.sass
│           ├── sts128.sass
│           ├── sts128_0.sass
│           ├── sts32.sass
│           ├── sts64.sass
│           ├── sts64_2bank_conflict.sass
│           ├── sts64_broadcast.sass
│           └── sts64_opt3.sass
└── src
    ├── FX_m2.cu
    ├── ampere
    │   ├── convolutionForward_32x64x8.cu
    │   ├── convolutionForward_32x64x8_baseline.cu
    │   ├── store_and_transform_output_baseline.cuh
    │   ├── store_and_transform_output_optLDS64.cuh
    │   ├── store_and_transform_output_optSTS64.cuh
    │   └── store_and_transform_output_optSTS64_compact.cuh
    ├── config.hpp
    ├── convolutionForward_32x64x8.cu
    ├── convolutionForward_32x64x8_baseline.cu
    ├── openCNN_winograd.cu
    ├── outer_product.cuh
    ├── outer_product_suffle.cuh
    ├── store_and_transform_output_baseline.cuh
    ├── store_and_transform_output_optLDS64.cuh
    ├── store_and_transform_output_optSTS64.cuh
    └── store_and_transform_output_optSTS64_compact.cuh
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Target GPU architecture: 75 = Turing. For Ampere use 86.
  | ARCH = 75
2 | NAME = wgrad
3 | OUT = OPTSTS64
4 | #MODE = PROF
5 | #LBR = OPENCNN
6 |
7 | all:
8 | nvcc src/openCNN_winograd.cu -lcudnn -m64 -arch=compute_$(ARCH) -code=sm_$(ARCH) -o $(NAME) -D$(OUT)
9 |
10 | clean:
11 | rm -f $(NAME)
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenCNN
2 | A Winograd minimal filtering algorithm implementation in CUDA
3 | ## Requirements
4 | - CUDA Toolkit
5 | - cuDNN
6 | - CMake
7 |
8 | ## Build the project
9 | ```
10 | git clone https://github.com/UDC-GAC/openCNN.git
11 | cd openCNN
12 | ```
13 | The GPU architecture code (the `ARCH` variable in the Makefile) must be set before compiling:
14 |
15 | ```
16 | make
17 | ```
18 |
19 | Compile-time macros are defined for testing purposes. They can be set in the Makefile according to the following values:
20 | ```
21 | $OUT (builds a specific output storage and transform version):
22 | - BASE: baseline layout
23 | - OPTSTS64 (default): optSTS64 layout
24 | - OPTSTS64_CMP: optSTS64_compact layout
25 | - OPTLDS64: optLDS64 layout
26 | ```
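  | With GNU make, these variables can also be overridden on the command line instead of editing the Makefile. For example, a build of the optLDS64 variant for an Ampere GPU would be:
  | ```
  | make ARCH=86 OUT=OPTLDS64
  | ```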
27 | ## Run examples
28 | (Recommended before time measurement) Lock the clocks:
29 | ```
30 | sudo nvidia-smi -i 0 -pm 1
31 | sudo nvidia-smi -lgc 1750 -i 0
32 | ```
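  | The clock value (1750 MHz above) is device-specific. To restore the default clock behaviour afterwards, reset the locked clocks (assuming a driver version that supports `-rgc`):
  | ```
  | sudo nvidia-smi -rgc -i 0
  | ```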
33 | 1. OpenCNN benchmark
34 | ```
35 | cd bench
36 | ./bench.sh
37 | ```
38 | 2. Instruction-level microbenchmarking (requires a Turing device). TuringAs (https://github.com/daadaada/turingas) must be installed to run the instruction-level microbenchmarks.
39 | ```
40 | cd bench/smem/STS
41 | make
42 | ./test
43 | ```
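  | The Makefiles invoke TuringAs as a Python module (`python -m turingas.main`), so the package must be importable. For example, cloning it and installing with pip should work:
  | ```
  | git clone https://github.com/daadaada/turingas.git
  | cd turingas
  | pip install .
  | ```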
44 |
45 | ## Citation
46 | If you find this tool helpful, please cite:
47 | ```
48 | @Article{math9172033,
49 | AUTHOR = {Castro, Roberto L. and Andrade, Diego and Fraguela, Basilio B.},
50 | TITLE = {OpenCNN: A Winograd Minimal Filtering Algorithm Implementation in CUDA},
51 | JOURNAL = {Mathematics},
52 | VOLUME = {9},
53 | YEAR = {2021},
54 | NUMBER = {17},
55 | ARTICLE-NUMBER = {2033},
56 | URL = {https://www.mdpi.com/2227-7390/9/17/2033},
57 | ISSN = {2227-7390},
58 | DOI = {10.3390/math9172033}
59 | }
60 | ```
61 | ## License
62 | Apache-2.0 License
63 |
64 | -- Roberto López Castro
65 |
--------------------------------------------------------------------------------
/bench/bench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
  | echo "in_n,in_c,in_h,filt_k,filt_w,openCNN_sec,openCNN_flops,cuDNN_sec,cuDNN_flops"
2 |
3 | for n in 32 64 96 128;
4 | do
5 | ../wgrad $n 64 56 56 64 64 3 3
6 | done
7 |
8 | for n in 32 64 96 128;
9 | do
10 | ../wgrad $n 128 28 28 128 128 3 3
11 | done
12 |
13 | for n in 32 64 96 128;
14 | do
15 | ../wgrad $n 256 14 14 256 256 3 3
16 | done
17 |
18 | for n in 32 64 96 128;
19 | do
20 | ../wgrad $n 512 7 7 512 512 3 3
21 | done
22 |
23 |
--------------------------------------------------------------------------------
/bench/smem/LDS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python -m turingas.main -i lds32.sass -o lds32.cubin
3 | python -m turingas.main -i lds64.sass -o lds64.cubin
4 | python -m turingas.main -i lds128.sass -o lds128.cubin
5 | python -m turingas.main -i lds64_opt3.sass -o lds64_opt3.cubin
6 | nvcc -arch=sm_75 main.cu -lcuda -o test
7 |
8 | clean:
9 | rm -f lds32.cubin test lds64.cubin lds64_opt3.cubin lds128.cubin
10 |
--------------------------------------------------------------------------------
/bench/smem/LDS/lds128.sass:
--------------------------------------------------------------------------------
1 |
2 | output, 8
3 |
4 |
5 |
6 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 |
8 |
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 |
13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ;
14 | --:-:-:-:5 MOV iter, RZ;
15 |
16 |
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 |
19 | LOOP:
20 |
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 | if i == 64:
25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 | out.append(f'--:-:-:-:1 LDS.128 tmp, [offset];')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 |
30 |
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 |
35 |
36 |
37 | --:-:-:-:2 EXIT;
38 |
--------------------------------------------------------------------------------
/bench/smem/LDS/lds32.sass:
--------------------------------------------------------------------------------
1 |
2 | output, 8
3 |
4 |
5 |
6 | 0-16 ~ output0, output1, tid, offset, target, start, end, iter
7 |
8 |
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 |
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 |
17 | LOOP:
18 |
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 | if i == 64:
23 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 | out.append(f'--:-:-:-:1 LDS target, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 |
28 |
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 |
33 |
34 |
35 |
36 | --:-:-:-:2 EXIT;
37 |
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64.sass:
--------------------------------------------------------------------------------
1 |
2 | output, 8
3 |
4 |
5 |
6 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 |
8 |
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 |
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 |
17 | LOOP:
18 |
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 | if i == 64:
23 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 | out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 |
28 |
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 |
33 |
34 |
35 |
36 | --:-:-:-:2 EXIT;
37 |
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64_opt3.sass:
--------------------------------------------------------------------------------
1 |
2 | output, 8
3 |
4 |
5 |
6 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, aux, aux2, tmp
7 |
8 |
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 |
13 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
14 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
15 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16
16 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT;
17 |
18 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
19 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; # tid/16*8 + tid%16
20 |
21 | --:-:-:-:5 MOV iter, RZ;
22 |
23 | --:-:-:-:2 @P0 BRA JMP;
24 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ;
25 |
26 | JMP:
27 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ;
28 |
29 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
30 |
31 | LOOP:
32 |
33 | out = []
34 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
35 | for i in range(128):
36 | if i == 64:
37 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
38 | out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
39 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
40 | out_ = '\n'.join(out) + '\n'
41 |
42 |
43 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
44 | --:-:-:-:5 IADD3 end, end, -start, RZ;
45 | --:-:-:-:2 STG.E.GPU [output0], end;
46 |
47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
49 | ##--:-:-:-:2 STG.E.GPU [output0], aux2;
50 | #--:-:-:-:2 STG.E.GPU [output0], aux2;
51 |
52 |
53 | --:-:-:-:2 EXIT;
54 |
--------------------------------------------------------------------------------
/bench/smem/LDS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstdlib>
3 | #include <cstring>
  | #include <cuda.h>
4 |
5 | char* concat(const char *s1, const char *s2)
6 | {
7 | char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
8 | // in real code you would check for errors in malloc here
9 | strcpy(result, s1);
10 | strcat(result, s2);
11 | return result;
12 | }
13 |
14 | void run(char * name, int size){
15 | char * file_name = concat(name, ".cubin");
16 |
17 | int *output;
18 | cudaMalloc((void**)&output, sizeof(int)*32);
19 |
20 | CUmodule module;
21 | CUfunction kernel;
22 |
23 | cuModuleLoad(&module, file_name);
24 | cuModuleGetFunction(&kernel, module, "kern");
25 |
26 | void * args[1] = {&output};
27 | cuLaunchKernel(kernel, 1, 1, 1,
28 | 32, 1, 1,
29 | 32*sizeof(float)*size, 0, args, 0);
30 |
31 | int *output_h = (int*)malloc(sizeof(int)*32);
32 |
33 | cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
34 |
35 | printf("%s took %d clocks.\n", name, output_h[0]);
36 | printf("Each instruction takes %.2f clocks.\n", (float)output_h[0]/(128.0*128.0)); // 128 loop iterations x 128 LDS instructions per thread
37 | printf("Throughput %.2f bytes/cycle.\n\n", ((double)32*128*128*4*size)/output_h[0]); // total bytes loaded by the 32 threads, divided by elapsed clocks
38 |
39 | cudaFree(output);
40 | free(output_h);
41 | }
42 |
43 | int main(){
44 | run("lds32", 1);
45 | printf("\n");
46 | run("lds64", 2);
47 | printf("\n");
48 | run("lds128", 4);
49 |
50 | printf("\n");
51 | run("lds64_opt3", 2);
52 | return 0;
53 | }
54 |
--------------------------------------------------------------------------------
/bench/smem/STS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python -m turingas.main -i sts32.sass -o sts32.cubin
3 | python -m turingas.main -i sts64.sass -o sts64.cubin
4 | python -m turingas.main -i sts128.sass -o sts128.cubin
5 | python -m turingas.main -i sts128_0.sass -o sts128_0.cubin
6 | python -m turingas.main -i sts64_2bank_conflict.sass -o sts64_2bank_conflict.cubin
7 | python -m turingas.main -i sts64_broadcast.sass -o sts64_broadcast.cubin
8 | python -m turingas.main -i sts64_opt3.sass -o sts64_opt3.cubin
9 | nvcc -arch=sm_75 main.cu -lcuda -o test
10 |
11 | clean:
12 | rm -f sts32.cubin sts64.cubin sts128.cubin sts128_0.cubin sts64_2bank_conflict.cubin sts64_broadcast.cubin sts64_opt3.cubin test
13 |
--------------------------------------------------------------------------------
/bench/smem/STS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cstdlib>
3 | #include <cstring>
  | #include <cuda.h>
4 |
5 | #define ITERS 32768
6 |
7 | char* concat(const char *s1, const char *s2)
8 | {
9 | char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
10 | // in real code you would check for errors in malloc here
11 | strcpy(result, s1);
12 | strcat(result, s2);
13 | return result;
14 | }
15 |
16 | #define CUDA_SAFE_CALL( call) { \
17 | cudaError err = call; \
18 | if( cudaSuccess != err) { \
19 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
20 | __FILE__, __LINE__, cudaGetErrorString( err) ); \
21 | exit(EXIT_FAILURE); \
22 | } }
23 |
24 | void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop){
25 | CUDA_SAFE_CALL( cudaEventCreate(start) );
26 | CUDA_SAFE_CALL( cudaEventCreate(stop) );
27 | CUDA_SAFE_CALL( cudaEventRecord(*start, 0) );
28 | }
29 |
30 | float finalizeEvents(cudaEvent_t start, cudaEvent_t stop){
31 | CUDA_SAFE_CALL( cudaGetLastError() );
32 | CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );
33 | CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
34 | float kernel_time;
35 | CUDA_SAFE_CALL( cudaEventElapsedTime(&kernel_time, start, stop) );
36 | CUDA_SAFE_CALL( cudaEventDestroy(start) );
37 | CUDA_SAFE_CALL( cudaEventDestroy(stop) );
38 | return kernel_time;
39 | }
40 |
41 | template <typename T>
42 | void run(char * name, T scal, int type_size, int threads){
43 | char * file_name = concat(name, ".cubin");
44 |
45 | int *output;
46 | cudaMalloc((void**)&output, sizeof(int)*32);
47 | cudaMemset(output, 0, 32*sizeof(int));
48 |
49 | CUmodule module;
50 | CUfunction kernel;
51 |
52 | cuModuleLoad(&module, file_name);
53 | cuModuleGetFunction(&kernel, module, "kern");
54 |
55 | int blk_size = 32;
56 | int total_blks = 1;//size/blk_size;
57 | int sh_mem_size = blk_size*sizeof(float)*type_size;
58 | void * args[2] = {&scal, &output};
59 |
60 | //cudaEvent_t start, stop;
61 | //initializeEvents(&start, &stop);
62 | cuLaunchKernel(kernel, total_blks, 1, 1,
63 | blk_size, 1, 1,
64 | sh_mem_size, 0, args, 0);
65 | //float krn_time_shmem_32b = finalizeEvents(start, stop);
66 |
67 | int *output_h = (int*)malloc(sizeof(int)*32);
68 |
69 | cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
70 |
71 | /*for(int i=0; i<32; i++){
72 | printf("%d ", output_h[i]);
73 | }printf("\n");*/
74 |
75 | printf("%s took %d clocks \n", name, output_h[0]);
76 | double clocks_instr = (float)output_h[0]/(128.0*128.0); // 128 loop iterations x 128 STS instructions per thread
77 | printf("Each instruction takes %.2f clocks.\n", clocks_instr);
78 | printf("Throughput %.2f bytes/cycle.\n\n", ((double)threads*128*128*type_size*4)/output_h[0]); // total bytes stored by the active threads, divided by elapsed clocks
79 |
80 | cudaFree(output);
81 | free(output_h);
82 | }
83 |
84 | int main(){
85 | float scal = 4;
86 | run("sts32", scal, 1, 32);
87 | printf("\n");
88 | float2 scal2;
89 | scal2.x = 4; scal2.y = 4;
90 | run("sts64", scal2, 2, 32);
91 | printf("\n");
92 | float4 scal4;
93 | scal4.x = 4; scal4.y = 4;
94 | scal4.z = 4; scal4.w = 4;
95 | // No thread-divergence
96 | run("sts128_0", scal4, 4, 32);
97 |
98 | printf("\n");
99 | // Only half of the threads store data
100 | run("sts128", scal4, 4, 16);
101 |
102 | /* run2_aux("sts64_2bank_conflict");
103 | printf("\n");
104 | run2_aux("sts64_broadcast");
105 | printf("\n");
106 | run2_aux("sts64_opt3");
107 | printf("\n"); */
108 |
109 | return 0;
110 | }
111 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 6, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 |
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 |
19 | LOOP:
20 |
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(1024):
24 | if i == 64:
25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 |
30 |
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 |
35 |
36 |
37 |
38 | --:-:-:-:2 EXIT;
39 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts128.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 |
14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
16 | 04:-:1:-:6 IMAD aux, aux, -1, tid;
17 |
18 | --:-:-:-:1 ISETP.GT.AND P0, PT, aux, 8, PT;
19 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ;
20 | --:-:-:-:2 MOV tmp, scal;
21 | --:-:-:-:5 MOV iter, RZ;
22 |
23 | --:-:-:-:2 @P0 EXIT;
24 |
25 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
26 |
27 | LOOP:
28 |
29 | out = []
30 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
31 | for i in range(128):
32 | if i == 64:
33 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
34 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;')
35 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
36 | out_ = '\n'.join(out) + '\n'
37 |
38 |
39 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
40 | --:-:-:-:5 IADD3 end, end, -start, RZ;
41 | --:-:-:-:2 STG.E.GPU [output0], end;
42 |
43 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
44 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
45 | #--:-:-:-:2 STG.E.GPU [output0], iter;
46 |
47 |
48 |
49 | --:-:-:-:2 EXIT;
50 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts128_0.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 |
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 |
19 | LOOP:
20 |
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 | if i == 64:
25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 |
30 |
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 |
35 |
36 |
37 |
38 | --:-:-:-:2 EXIT;
--------------------------------------------------------------------------------
/bench/smem/STS/sts32.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 |
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 |
19 | LOOP:
20 |
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 | if i == 64:
25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 | out.append(f'--:-:-:-:1 STS.32 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 |
30 |
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 |
35 |
36 | --:-:-:-:2 EXIT;
37 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts64.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 |
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 |
19 | LOOP:
20 |
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 | if i == 64:
25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 |
30 |
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 |
35 |
36 |
37 |
38 | --:-:-:-:2 EXIT;
39 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts64_2bank_conflict.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 |
14 | 04:-:1:-:6 IMAD aux2, tid, 2, RZ;
15 |
16 | 02:-:-:-:6 SHF.R aux, aux2, 5, RZ;
17 | 02:-:-:-:6 SHF.L aux, aux, 5, RZ;
18 | 04:-:1:-:6 IMAD aux, aux, -1, aux2;
19 |
20 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
21 | 04:-:1:-:6 IMAD aux, aux2, 32, aux; # tid/16*32 + (2*tid)%32
22 |
23 | --:-:-:-:2 MOV tmp, scal;
24 | --:-:-:-:5 MOV iter, RZ;
25 |
26 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ;
27 |
28 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
29 |
30 | LOOP:
31 |
32 | out = []
33 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
34 | for i in range(128):
35 | if i == 64:
36 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
37 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;')
38 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
39 | out_ = '\n'.join(out) + '\n'
40 |
41 |
42 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
43 | --:-:-:-:5 IADD3 end, end, -start, RZ;
44 | --:-:-:-:2 STG.E.GPU [output0], end;
45 |
46 | #--:-:-:-:5 MOV aux2, aux;
47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
49 | #--:-:-:-:2 STG.E.GPU [output0], aux2;
50 |
51 | --:-:-:-:2 EXIT;
52 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts64_broadcast.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 |
14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16
17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT;
18 |
19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; # tid/16*8 + tid%16
21 |
22 | --:-:-:-:2 MOV tmp, scal;
23 | --:-:-:-:5 MOV iter, RZ;
24 |
25 | --:-:-:-:2 @P0 BRA JMP;
26 | --:-:-:-:5 IADD3 aux, aux, -8, RZ;
27 |
28 | JMP:
29 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ;
30 |
31 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
32 |
33 | LOOP:
34 |
35 | out = []
36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
37 | for i in range(128):
38 | if i == 64:
39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;')
41 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
42 | out_ = '\n'.join(out) + '\n'
43 |
44 |
45 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
46 | --:-:-:-:5 IADD3 end, end, -start, RZ;
47 | --:-:-:-:2 STG.E.GPU [output0], end;
48 |
49 | #--:-:-:-:5 MOV aux2, aux;
50 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
51 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
52 | #--:-:-:-:2 STG.E.GPU [output0], aux;
53 |
54 |
55 | --:-:-:-:2 EXIT;
56 |
--------------------------------------------------------------------------------
/bench/smem/STS/sts64_opt3.sass:
--------------------------------------------------------------------------------
1 |
2 | scal, 8
3 | output, 8
4 |
5 |
6 |
7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2
8 |
9 |
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 |
14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16
17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT;
18 |
19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; # tid/16*8 + tid%16
21 |
22 | --:-:-:-:2 MOV tmp, scal;
23 | --:-:-:-:5 MOV iter, RZ;
24 |
25 | --:-:-:-:2 @P0 BRA JMP;
26 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ;
27 |
28 | JMP:
29 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ;
30 |
31 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
32 |
33 | LOOP:
34 |
35 | out = []
36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
37 | for i in range(128):
38 | if i == 64:
39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;')
41 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
42 | out_ = '\n'.join(out) + '\n'
43 |
44 |
45 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
46 | --:-:-:-:5 IADD3 end, end, -start, RZ;
47 | --:-:-:-:2 STG.E.GPU [output0], end;
48 |
49 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
50 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
51 | ##--:-:-:-:2 STG.E.GPU [output0], aux2;
52 | #--:-:-:-:2 STG.E.GPU [output0], aux2;
53 |
54 |
55 | --:-:-:-:2 EXIT;
56 |
--------------------------------------------------------------------------------
/src/FX_m2.cu:
--------------------------------------------------------------------------------
1 |
2 | // Copyright 2021 Roberto Lopez Castro
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 |
16 |
17 | #ifndef _FX_
18 | #define _FX_
19 | extern "C"
20 | {
21 |
22 | // Set of functions per row in Gw product
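   | // For reference: assuming the standard F(2x2, 3x3) Winograd construction,
   | // the filter transform computed here is G w G^T, with
   | //          [ 1    0    0   ]
   | //     G =  [ 1/2  1/2  1/2 ]
   | //          [ 1/2 -1/2  1/2 ]
   | //          [ 0    0    1   ]
   | // f_rowN returns element N of a column of G*w; f_colN applies the same
   | // combinations along the rows of G*w to form (G*w)*G^T.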
23 | __device__ float f_row1(float *Gw, int j){
24 | return Gw[j];
25 | }
26 | __device__ float f_row2(float *Gw, int j){
27 | return 0.5*(Gw[j] + Gw[6+j] + Gw[3+j]);
28 | }
29 | __device__ float f_row3(float *Gw, int j){
30 | return 0.5*(Gw[j] + Gw[6+j] - Gw[3+j]);
31 | }
32 | __device__ float f_row4(float *Gw, int j){
33 | return Gw[6+j];
34 | }
35 | // Set of functions per column in GwGt product
36 | __device__ float f_col1(float *Gw, int j){
37 | return Gw[j];
38 | }
39 | __device__ float f_col2(float *Gw, int j){
40 | return 0.5*(Gw[j] + Gw[j+2] + Gw[j+1]);
41 | }
42 | __device__ float f_col3(float *Gw, int j){
43 | return 0.5*(Gw[j] + Gw[j+2] - Gw[j+1]);
44 | }
45 | __device__ float f_col4(float *Gw, int j){
46 | return Gw[j+2];
47 | }
48 |
49 | typedef float(*pointFunction_t)(float *, int);
50 |
51 | __global__ void FX(float *pInputs, float *pOutputs, int filt_k,
52 | int filt_c, int filt_h, int filt_w, int alpha){
53 | int Inx = threadIdx.x, Iny = threadIdx.y;
54 | int TileX = blockIdx.x, TileY = blockIdx.y;
55 |
56 | int c_glb_offset = filt_k*filt_h*filt_w;
57 | int c_kernel = TileY*BC*c_glb_offset + TileX*BK + Iny*c_glb_offset + Inx;
58 | int c_glb_offset_s = filt_k*4*4;
59 | int c_kernel_s = TileY*BC*c_glb_offset_s + TileX*BK + Iny*c_glb_offset_s + Inx;
60 |
61 | float Gw[21]; //9+12. In registers
62 | float *Gw_buffer = Gw+9;
63 |
64 | pointFunction_t func1[4] = {f_row1, f_row2, f_row3, f_row4};
65 | pointFunction_t func2[4] = {f_col1, f_col2, f_col3, f_col4};
66 |
67 | for(int bk=0; bk>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha);
273 |
274 | #ifdef OPTSTS64_CMP
275 | smem_size = 65536; // 64 KB
276 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
277 | #endif
278 |
279 | Winograd_kernel<<>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w);
280 |
281 | return cudaGetLastError();
282 | }
283 |
284 | }
285 | #endif
286 |
--------------------------------------------------------------------------------
/src/ampere/convolutionForward_32x64x8_baseline.cu:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "../FX_m2.cu"
17 | #include "store_and_transform_output_baseline.cuh"
18 | #include "../outer_product.cuh"
19 |
20 | #ifdef _noWALL_
21 | typedef struct rusage resnfo;
22 | typedef struct _timenfo {
23 | double time;
24 | double systime;
25 | } timenfo;
26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample))
27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \
28 | t.time + t.systime, t.time, t.systime);
29 | #else
30 | typedef struct timeval resnfo;
31 | typedef double timenfo;
32 | #define timestamp(sample) gettimeofday((sample), 0)
33 | #define printtime(t) printf("%15f s ", t);
34 | #endif
35 |
36 | #ifndef _WINOGRAD_
37 | #define _WINOGRAD_
38 | extern "C"
39 | {
40 |
41 |
42 | #define d(input, i, j) ( input[(i<<2) + (j)] )
43 |
44 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w,
45 | int tiles_dim, int in_c, int in_n, int tile_size,
46 | int tiles_2d_dim, int tile_2d_s){
47 |
48 | float workspace[3];
49 |
50 | #pragma unroll
51 | for(int j=0; j<4; j++){
52 | workspace[0] = Btd[j];
53 | workspace[1] = Btd[j+4];
54 | workspace[2] = Btd[j+8];
55 |
56 | Btd[j] = workspace[0] - workspace[2];
57 | Btd[j+4] = workspace[1] + workspace[2];
58 | Btd[j+8] = workspace[2] - workspace[1];
59 | Btd[j+12] = workspace[1] - Btd[j+12];
60 | }
61 |
62 | int c_offset = BN*BC;
63 | int c_tensor = threadIdx.y*BN + threadIdx.x;
64 |
65 | #pragma unroll
66 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread
67 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2);
68 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2);
69 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1);
70 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3);
71 | }
72 |
73 | }
74 |
75 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
76 | int filt_c, int filt_k){
77 |
78 | int c_tensor_s = threadIdx.y*BK + threadIdx.x;
79 | int c_offset_s = BK*BC;
80 |
81 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread
82 | for(int i=0; i<4; i++){
83 | #pragma unroll
84 | for(int j=0; j<4; j++){
85 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j];
86 | }
87 | }
88 |
89 | c_tensor_s += BN;
90 | }
91 |
92 | }
93 |
94 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k){
95 |
96 | int c_tensor = blockIdx.z*BK + (threadIdx.y*filt_k<<4) + threadIdx.x; // Iny*filt_k*4*4
97 |
98 | int acumm;
99 | #pragma unroll
100 | for(int i=0; i<4; i++){
101 | acumm = (i*filt_k<<2);
102 | #pragma unroll
103 | for(int j=0; j<4; j++){
104 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor];
105 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN];
106 | }
107 | }
108 | }
109 |
110 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int tiles_dim, short mask){
111 |
112 | int c_tensor = (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*in_w*2 + blockIdx.x*BN + threadIdx.y*(in_n*in_h*in_w) + (threadIdx.x/in_n)*2*in_n + (threadIdx.x%in_n) - (in_n*in_w+in_n);
113 | int acumm,x;
114 | //short x1,x2;
115 |
116 | if(mask==0xFFFF){
117 | #pragma unroll
118 | for(int i=0; i<4; i++){
119 | acumm = i*in_n*in_w;
120 | #pragma unroll
121 | for(int j=0; j<4; j++){
122 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor];
123 | }
124 | }
125 |
126 | } else {
127 | for(int i=0; i<4; i++){
128 | acumm = i*in_n*in_w;
129 | #pragma unroll
130 | for(int j=0; j<4; j++){
131 | x = (i<<2) + j;
132 | tile[x] = 0;
133 | if(mask&(1<>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha);
267 |
268 | Winograd_kernel<<>>(k, Ww, C,
269 | tiles_dim, in_c, in_n, in_h, in_w, tile_size,
270 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s,
271 | out_h, out_w);
272 |
273 | return cudaGetLastError();
274 | }
275 |
276 | }
277 | #endif
278 |
--------------------------------------------------------------------------------
/src/ampere/store_and_transform_output_baseline.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "../config.hpp"
16 |
17 | #ifndef _OUTPUT_KERNEL_PPoPP_
18 | #define _OUTPUT_KERNEL_PPoPP_
19 | extern "C"
20 | {
21 |
22 | __device__ void transform_output_tile(float *pOutputs, float *C_tile, float *At, int out_h, int out_w,
23 | int tiles_dim, int round, int in_n, int offset, int out_thread[][4], short mask, int c_tensor, int c_glb_offset){
24 |
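   | // For reference: assuming the standard F(2x2, 3x3) inverse transform
   | // A^T (C) A, the 2x4 matrix used here is
   | //     A^T = [ 1  1  1  0 ]
   | //           [ 0  1 -1 -1 ]
   | // The loop below applies A^T down the columns of two 4x4 accumulator
   | // tiles; the masked global-memory writes then apply it along the rows.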
25 | for(int j=0; j<4; j++){
26 | At[j] = C_tile[j] + C_tile[4+j] + C_tile[8+j];
27 | At[j+8] = C_tile[j+16] + C_tile[4+j+16] + C_tile[8+j+16];
28 |
29 | At[4+j] = C_tile[4+j] - C_tile[8+j] - C_tile[12+j];
30 | At[4+j+8] = C_tile[4+j+16] - C_tile[8+j+16] - C_tile[12+j+16];
31 | }
32 |
33 | int idx = out_thread[round][threadIdx.y%4] + threadIdx.y/4 + offset;
34 | c_tensor += idx*c_glb_offset;
35 | int x, x1;
36 |
37 | for(int i=0; i<2; i++){
38 | x = i*4;
39 | //x1 = i*(in_n*(tiles_dim-1) + in_n/2)*2;
40 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2)*2;
41 | if(mask&(1<<(i*2))){
42 | pOutputs[c_tensor+ x1] = At[x] + At[x+1] + At[x+2];
43 | pOutputs[c_tensor+2*c_glb_offset+x1] = At[x+8] + At[x+1+8] + At[x+2+8];
44 | }
45 |
46 | if(mask&(1<<(i*2+1))){
47 | pOutputs[c_tensor+x1+in_n] = At[x+1] - At[x+2] - At[x+3];
48 | pOutputs[c_tensor+2*c_glb_offset+x1+in_n] = At[x+1+8] - At[x+2+8] - At[x+3+8];
49 | }
50 | }
51 |
52 | }
53 |
54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, int out_thread[][4], int access_s_out[][16], short mask){
55 |
56 | float4 *output_smem = (float4 *) shared_mem;
57 | float4 *accumulator = (float4 *) acumm_smem;
58 |
59 | float *C_tile = (float*) input_frag_mem;
60 | float *At = (float*) filter_frag_mem;
61 |
62 | mask = 0x000F;
63 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
64 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005;
65 |
66 | // output transpose step
67 | int t=0;
68 | int acumm1, acumm2;
69 |
70 | acumm1 = access_s_out[0][threadIdx.x%8 + (threadIdx.x/16)*8];
71 | acumm2 = access_s_out[1][threadIdx.x%8 + (threadIdx.x/16)*8];
72 |
73 | int offset = BN_p*4;
74 | int init = (threadIdx.y/4)*BN_p*16*4 + (threadIdx.y%4)*40 + threadIdx.x;
75 | int acumm3 = threadIdx.y * BN_p;
76 | int acumm4 = BN_p*8*2;
77 |
78 | int idx = acumm3;
79 | int idx2 = idx + BN_p*8;
80 |
81 | float* out = (float *) output_smem;
82 |
83 | int c_glb_offset = in_n*out_h*out_w;
84 | int c_tensor = blockIdx.z*in_n*out_h*out_w*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + threadIdx.x;
85 |
86 | //#pragma unroll
87 | for(int round=0; round<4; round++){
88 |
89 | //transformation step
90 | if ( ((!round || round==1) && (threadIdx.x&15)<8) || ((round==2 || round==3) && (threadIdx.x&15)>7) ){
91 |
92 | #pragma unroll
93 | for(int i=0; i<4; i+=2){
94 |
95 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0
96 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1);
97 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1
98 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t);
99 |
100 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t);
101 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t);
102 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t);
103 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t);
104 |
105 | t+=4;
106 | }
107 | }
108 | __syncthreads();
109 |
110 | for(int i=0; i<16; i++){
111 | C_tile[i] = out[init + i*offset];
112 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset];
113 | }
114 |
115 | // transform output tiles
116 | transform_output_tile(C, C_tile, At, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset);
117 |
118 |
119 | __syncthreads();
120 | }
121 |
122 | }
123 |
124 | }
125 | #endif
126 |
--------------------------------------------------------------------------------
/src/ampere/store_and_transform_output_optLDS64.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "../config.hpp"
16 |
17 | #ifndef _OUTPUT_KERNEL_OPT3_
18 | #define _OUTPUT_KERNEL_OPT3_
19 | extern "C"
20 | {
21 |
22 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w)
23 | {
24 |
25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2;
26 | int x, x1;
27 |
28 | for(int j=0; j<4; j++){
29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
31 |
32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
34 | }
35 |
36 | for(int i=0; i<2; i++){
37 | x = i*4;
38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2);
39 | if(mask&(1<<(i*2))){
40 |
41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x;
42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y;
43 | }
44 | if(mask&(1<<(i*2+1))){
45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x;
46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y;
47 | }
48 | }
49 |
50 | }
51 |
52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem,
53 | float *C, int out_h, int out_w,
54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){
55 |
56 | float2 *output_smem = (float2 *) shared_mem;
57 | float2 *accumulator = (float2 *) acumm_smem;
58 | float2 *C_out = (float2*)C;
59 |
60 | float2 *C_tile = (float2*) input_frag_mem;
61 | float2 *At = (float2*) filter_frag_mem;
62 |
63 | mask = 0x000F;
64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005;
66 |
67 | // output transpose step
68 | int t=0;
69 | int acumm1, acumm2;
70 | // For transposing
71 | //acumm1 = access_s_out[threadIdx.x]; //* 4
72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8;
73 | acumm2 = acumm1+4;
74 |
75 | int acumm4 = BN_p*8*2 ; //*4
76 | int idx = threadIdx.y * BN_p;
77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
78 |
79 | // For transforming
80 | int offset = BN_p; //*2/2
81 | int init = (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2);
82 | init += (threadIdx.x/16)*8 + ((threadIdx.x/8)%2)*16 + (threadIdx.x%8); //40=(8+2)*4, 4 blocks/buffer
83 |
84 |
85 | int c_glb_offset = in_n*out_h*out_w;
86 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + ((threadIdx.x/8)%2)*2 + (threadIdx.x%8)*2*2 + ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset;
87 | c_tensor/=2;
88 |
89 | #pragma unroll
90 | for(int round=0; round<4; round++){
91 |
92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t);
93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t
94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2);
95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1
96 |
97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32);
98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16
99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34);
100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17
101 |
102 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4);
103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2
104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6);
105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3
106 |
107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36);
108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18
109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38);
110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19
111 |
112 | t+=8;
113 |
114 | __syncthreads();
115 |
116 |
117 | for(int i=0; i<16; i++){
118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4
119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4
120 | }
121 |
122 | // transform output tiles
123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w);
124 |
125 | __syncthreads();
126 |
127 | }
128 | }
129 |
130 | }
131 | #endif
132 |
--------------------------------------------------------------------------------
/src/ampere/store_and_transform_output_optSTS64.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "../config.hpp"
16 |
17 | #ifndef _OUTPUT_KERNEL_OPT1_
18 | #define _OUTPUT_KERNEL_OPT1_
19 | extern "C"
20 | {
21 |
22 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w)
23 | {
24 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2;
25 | int x, x1;
26 |
27 | #pragma unroll
28 | for(int j=0; j<4; j++){
29 |
30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
32 |
33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
35 | }
36 |
37 |
38 | #pragma unroll
39 | for(int i=0; i<2; i++){
40 | x = i*4;
41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2);
42 | if(mask&(1<<(i*2))){
43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x;
44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y;
45 | }
46 |
47 | if(mask&(1<<(i*2+1))){
48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x;
49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y;
50 | }
51 | }
52 | }
53 |
54 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){
55 |
56 | float2 *output_smem = (float2 *) shared_mem;
57 | float2 *accumulator = (float2 *) acumm_smem;
58 | float2 *C_out = (float2*)C;
59 |
60 | float2 *C_tile = (float2*) input_frag_mem;
61 | float2 *At = (float2*) filter_frag_mem;
62 |
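// mask flags which of the 2x2 output pixels of this tile lie inside the
// image: for odd output sizes, bottom-border tiles keep only their top row
// (bits 0-1) and right-border tiles only their left column (bits 0 and 2).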
63 | mask = 0x000F;
64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0x0005;
66 |
67 | // output transpose step
68 | int t=0;
69 | int acumm1, acumm2;
70 | // For transposing
71 | //acumm1 = access_s_out[Inx]; //* 4
72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8;
73 | acumm2 = acumm1+4;
74 |
75 | int acumm4 = BN_p*16 ; //*4
76 | int idx = threadIdx.y * BN_p;
77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
78 |
79 | // For transforming
80 | int offset = BN_p *2; //*2/2
81 | int init = ( (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2) ) *2 + threadIdx.x;
82 |
83 | int c_glb_offset = in_n*out_h*out_w;
84 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+
85 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset;
86 | c_tensor/=2;
87 |
88 | #pragma unroll
89 | for(int round=0; round<4; round++){
90 |
91 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t);
92 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t
93 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2);
94 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1
95 |
96 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32);
97 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16
98 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34);
99 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17
100 |
101 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4);
102 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2
103 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6);
104 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3
105 |
106 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36);
107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18
108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38);
109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19
110 |
111 | t+=8;
112 |
113 | __syncthreads();
114 |
115 |
116 | for(int i=0; i<16; i++){
117 | C_tile[i].x = shared_mem[i*offset + init];
118 | C_tile[i].y = shared_mem[i*offset + init + 32];
119 | }
120 |
121 | // transform output tiles
122 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w);
123 | __syncthreads();
124 | }
125 | }
126 |
127 | }
128 | #endif
129 |
--------------------------------------------------------------------------------
/src/ampere/store_and_transform_output_optSTS64_compact.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "../config.hpp"
16 |
17 | #ifndef _OUTPUT_KERNEL_OPT1_
18 | #define _OUTPUT_KERNEL_OPT1_
19 | extern "C"
20 | {
21 |
22 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At,
23 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset,
24 | short mask, int out_w){
25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2;
26 | int x, x1;
27 |
28 |
29 | #pragma unroll
30 | for(int j=0; j<4; j++){
31 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
32 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
35 | }
36 |
37 | x = in_n/2;
38 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x;
39 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y;
40 |
41 | if(mask&0x2){
42 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x;
43 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y;
44 | }
45 |
46 | //x1 = in_n*(tiles_dim-1) + x;
47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x;
48 | if(mask&0x4){
49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x;
50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y;
51 | }
52 |
53 | if(mask&0x8){
54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x;
55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y;
56 | }
57 | }
58 |
59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){
60 |
61 | float2 *output_smem = (float2 *) shared_mem;
62 | float2 *accumulator = (float2 *) acumm_smem;
63 | float2 *C_out = (float2*)C;
64 |
65 | float2 *C_tile = (float2*) input_frag_mem;
66 | float2 *At = (float2*) filter_frag_mem;
67 |
68 | mask = 0x000F;
69 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
70 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0x0005;
71 |
72 | // output transpose step
73 | int t,j;
74 | int acumm1, acumm2;
75 | // For transposing
76 | t = threadIdx.x%8/2;
77 | acumm1 = t*18 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8;
78 | acumm2 = acumm1+4;
79 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16;
80 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16;
81 | t=0;
82 |
83 | int acumm4 = BN_p*16 ; //*4
84 | int idx = threadIdx.y * BN_p;
85 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
86 |
87 | // For transforming
88 | int offset = BN_p *2; //*2/2
89 |
90 | int init = (threadIdx.y%4)*(16+2)*2 + threadIdx.x;
91 | init = init - init/((threadIdx.y%4+1)*32)*32 + threadIdx.y%4*32;
92 | init += (threadIdx.y/4)*BN_p*16*2;
93 |
94 | int c_glb_offset = in_n*out_h*out_w;
95 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+
96 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset;
97 | c_tensor/=2;
98 |
99 | // k=0, block 0
100 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator);
101 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1);
102 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2);
103 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3);
104 |
105 | // K=1, block 0
106 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4);
107 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5);
108 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6);
109 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7);
110 |
111 | // k=0, block 1
112 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32);
113 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33);
114 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34);
115 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35);
116 |
117 | // K=1, block 1
118 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36);
119 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37);
120 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38);
121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39);
122 |
123 | j=0; t+=8;
124 |
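// Software-pipelined version of the 4-round store: the smem buffer holds
// two 16-row blocks and j toggles between 0 and 2, so each iteration
// transforms the block written in the previous round while the next
// round's accumulators are stored into the other block; the code after the
// loop drains the final block.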
125 | #pragma unroll
126 | for(int round=0; round<3; round++){
127 |
128 | __syncthreads();
129 |
130 | int disp = j/2*(BN_p*2*16)*2;
131 | #pragma unroll
132 | for(int i=0; i<16; i++){
133 | C_tile[i].x = shared_mem[disp + i*offset + init];
134 | C_tile[i].y = shared_mem[disp + i*offset + init + 32];
135 | }
136 |
137 | // transform output tiles
138 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w);
139 |
140 | j = 2 - j; //switch between 0 and 2
141 |
142 | // k=0, block 0
143 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t);
144 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1);
145 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2);
146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3);
147 |
148 | // K=1, block 0
149 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4);
150 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5);
151 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6);
152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7);
153 |
154 | // k=0, block 1
155 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32);
156 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33);
157 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34);
158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35);
159 |
160 | // K=1, block 1
161 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36);
162 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37);
163 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38);
164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39);
165 |
166 | t+=8;
167 |
168 | }
169 |
170 | __syncthreads();
171 |
172 | int disp = j/2*(BN_p*2*16)*2;
173 | #pragma unroll
174 | for(int i=0; i<16; i++){
175 | C_tile[i].x = shared_mem[disp + i*offset + init];
176 | C_tile[i].y = shared_mem[disp + i*offset + init + 32];
177 | }
178 | // transform output tiles
179 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w);
180 | }
181 |
182 | }
183 | #endif
184 |
--------------------------------------------------------------------------------
/src/config.hpp:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #ifndef COMMON_INCLUDE_FILE
17 | #define COMMON_INCLUDE_FILE
18 |
19 | #define BC 8
20 | #define BN 32
21 | #define BK 64
22 | ///////////////////// For Non-Fused version
23 | #define BC_GEMM 8
24 | #define BN_GEMM 128
25 | #define BK_GEMM 128
26 | ///////////////////// For Non-Fused version
27 |
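// BN_p is the row pitch of the shared-memory buffer used by the output
// store, padded a few elements wider than the data each row holds (40 for
// the 32-wide baseline, 138 for the optSTS64 store) so that consecutive
// rows start in different banks; the compact variant keeps the unpadded
// pitch of 128 and avoids conflicts by swizzling indices instead.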
28 | #ifdef OPTSTS64_CMP
29 | #define BN_p 128
30 | #elif BASE
31 | #define BN_p 40
32 | #else
33 | #define BN_p 138
34 | #endif
35 |
36 | #define N 128 // values: 32,64,96,128
37 | #define C_in 256 // values: 64,128,256,512
38 | #define W 14 // values: 56,28,14,7
39 |
40 | #define K 256 // values: 64,128,256,512
41 | #define R 3 // values: 3
42 |
43 | #define PAD_H 1
44 | #define PAD_W 1
45 | #define STR_H 1
46 | #define STR_W 1
47 | #define DIL_H 1
48 | #define DIL_W 1
49 |
50 | #define M 2 // values: 2
51 |
52 | __constant__ int access_f_s[2][32];
53 | __constant__ int access_s[2][32];
54 | #ifndef BASE
55 | __constant__ int access_s_out[32];
56 | __constant__ int out_thread[2][4][4];
57 | __constant__ int out_sgemm[32];
58 | __constant__ int exhange[32];
59 | #else
60 | __constant__ int access_s_out[2][16];
61 | __constant__ int out_thread[4][4];
62 | #endif
63 |
64 |
65 | // access_f_s
66 | const int aux[2][32] = {
67 | {0,0,1,1,2,2,3,3,4,4,5,5,6,6,
68 | 7,7,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7},
69 | {8,8,9,9,10,10,11,11,12,12,13,13,
70 | 14,14,15,15,8,8,9,9,10,10,11,11,12,12,
71 | 13,13,14,14,15,15}
72 | };
73 | // access_s
74 | const int aux2[2][32] = {
75 | {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,
76 | 3,2,3,2,3,2,3,2,3,2,3,2,3,2,3},
77 | {4,5,4,5,4,5,4,5,4,5,4,
78 | 5,4,5,4,5,6,7,6,7,6,7,6,7,
79 | 6,7,6,7,6,7,6,7}
80 | };
81 |
82 | #ifndef BASE
83 | // access_s_out
84 | const int aux3[32] = {
85 | 0,1,34,35,68,69,102,103, // first quarter
86 | 8,9,42,43,76,77,110,111, // second quarter
87 | 2,3,36,37,70,71,104,105, // third quarter
88 | 10,11,44,45,78,79,112,113 // fourth quarter
89 | };
90 | // out_thread
91 | const int aux4[2][4][4] = { {{0,4,8,12}, {2,6,10,14},
92 | {32,36,40,44}, {34,38,42,46} },
93 | {{16,20,24,28}, {18,22,26,30},
94 | {48,52,56,60}, {50,54,58,62}}};
95 | // out_sgemm
96 | const int aux5[32] = { 0,1,8,9,16,17,24,25,
97 | 32,33,40,41,48,49,56,57,
98 | 2,3,10,11,18,19,26,27,
99 | 34,35,42,43,50,51,58,59
100 | };
101 | // exhange
102 | const int aux6[32] = {
103 | 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,
104 | 18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29
105 | };
106 |
107 | #else
108 | const int aux3[2][16] = {
109 | {0,1,10,11,20,21,30,31, 2,3,12,13,22,23,32,33},
110 | {4,5,14,15,24,25,34,35, 6,7,16,17,26,27,36,37}
111 | };
112 | const int aux4[4][4] = { {0,4,8,12}, {32,36,40,44}, {16,20,24,28}, {48,52,56,60} };
113 | #endif
114 |
115 | #endif
116 |
--------------------------------------------------------------------------------
/src/convolutionForward_32x64x8.cu:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "FX_m2.cu"
17 |
18 | #ifdef OPTLDS64
19 | #include "store_and_transform_output_optLDS64.cuh"
20 | #include "outer_product.cuh"
21 | #elif OPTSTS64_CMP
22 | #include "store_and_transform_output_optSTS64_compact.cuh"
23 | #include "outer_product_suffle.cuh"
24 | #else
25 | #include "store_and_transform_output_optSTS64.cuh"
26 | #include "outer_product_suffle.cuh"
27 | #endif
28 |
29 | #ifdef _noWALL_
30 | typedef struct rusage resnfo;
31 | typedef struct _timenfo {
32 | double time;
33 | double systime;
34 | } timenfo;
35 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample))
36 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \
37 | t.time + t.systime, t.time, t.systime);
38 | #else
39 | typedef struct timeval resnfo;
40 | typedef double timenfo;
41 | #define timestamp(sample) gettimeofday((sample), 0)
42 | #define printtime(t) printf("%15f s ", t);
43 | #endif
44 |
45 | #ifndef _WINOGRAD_
46 | #define _WINOGRAD_
47 | extern "C"
48 | {
49 |
50 |
51 | #define d(input, i, j) ( input[(i<<2) + (j)] )
52 |
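// Input transform for F(2x2, 3x3): computes Bt * d * B for the 4x4 tile d
// held in Btd, with
//   Bt = | 1  0 -1  0 |
//        | 0  1  1  0 |
//        | 0 -1  1  0 |
//        | 0  1  0 -1 |
// The j-loop applies Bt down each column in place; the i-loop then combines
// the columns of each row (the multiplication by B) while scattering the
// result to shared memory with stride c_offset.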
53 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w,
54 | int tiles_dim, int in_c, int in_n, int tile_size,
55 | int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY){
56 |
57 | float workspace[3];
58 |
59 | #pragma unroll
60 | for(int j=0; j<4; j++){
61 | workspace[0] = Btd[j];
62 | workspace[1] = Btd[j+4];
63 | workspace[2] = Btd[j+8];
64 |
65 | Btd[j] = workspace[0] - workspace[2];
66 | Btd[j+4] = workspace[1] + workspace[2];
67 | Btd[j+8] = workspace[2] - workspace[1];
68 | Btd[j+12] = workspace[1] - Btd[j+12];
69 | }
70 |
71 | int c_offset = BN*BC;
72 | int c_tensor = Iny*BN + Inx;
73 |
74 | #pragma unroll
75 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread
76 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2);
77 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2);
78 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1);
79 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3);
80 | }
81 |
82 | }
83 |
84 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs,
85 | int filt_c, int filt_k, int Inx, int Iny){
86 |
87 | int c_tensor_s = Iny*BK + Inx;
88 | int c_offset_s = BK*BC;
89 |
90 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread
91 | for(int i=0; i<4; i++){
92 | for(int j=0; j<4; j++){
93 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j];
94 | }
95 | }
96 |
97 | c_tensor_s += BN;
98 | }
99 |
100 | }
101 |
102 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles,
103 | int filt_k, int Inx, int Iny, int TileZ){
104 |
105 | int c_tensor = TileZ*BK + (Iny*filt_k<<4) + Inx;
106 |
107 | int acumm;
108 | #pragma unroll
109 | for(int i=0; i<4; i++){
110 | acumm = (i*filt_k<<2);
111 | for(int j=0; j<4; j++){
112 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor];
113 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN];
114 | }
115 | }
116 | }
117 |
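// mask marks which of the 16 elements of this thread's 4x4 input tile fall
// inside the image; masked-out elements are zeroed, implementing the
// convolution's implicit zero padding. The mask==0xFFFF fast path copies
// the whole tile without per-element tests.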
118 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w,
119 | int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask){
120 |
121 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n);
122 | int acumm,x;
123 |
124 | if(mask==0xFFFF){
125 |
126 | for(int i=0; i<4; i++){
127 | acumm = i*in_n*in_w;
128 | #pragma unroll
129 | for(int j=0; j<4; j++){
130 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor];
131 | }
132 | }
133 |
134 | } else {
135 |
136 | for(int i=0; i<4; i++){
137 | acumm = i*in_n*in_w;
138 | #pragma unroll
139 | for(int j=0; j<4; j++){
140 | x = (i<<2) + j;
141 | tile[x] = 0;
142 | if(mask&(1<<x))
143 | tile[x] = pInputs[acumm + j*in_n + c_tensor];

[... lines 144-273 of this file are missing from the dump ...]

274 | FX<<<...>>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha);
275 |
276 | #ifdef OPTSTS64_CMP
277 | smem_size = 65536; // 64 KB
278 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
279 | #endif
280 |
281 | Winograd_kernel<<<...>>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w); // launch configuration stripped in this dump
282 |
283 | return cudaGetLastError();
284 | }
285 |
286 | }
287 | #endif
288 |
--------------------------------------------------------------------------------
/src/convolutionForward_32x64x8_baseline.cu:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "FX_m2.cu"
17 | #include "store_and_transform_output_baseline.cuh"
18 | #include "outer_product.cuh"
19 |
20 | #ifdef _noWALL_
21 | typedef struct rusage resnfo;
22 | typedef struct _timenfo {
23 | double time;
24 | double systime;
25 | } timenfo;
26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample))
27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \
28 | t.time + t.systime, t.time, t.systime);
29 | #else
30 | typedef struct timeval resnfo;
31 | typedef double timenfo;
32 | #define timestamp(sample) gettimeofday((sample), 0)
33 | #define printtime(t) printf("%15f s ", t);
34 | #endif
35 |
36 | #ifndef _WINOGRAD_
37 | #define _WINOGRAD_
38 | extern "C"
39 | {
40 |
41 |
42 | #define d(input, i, j) ( input[(i<<2) + (j)] )
43 |
44 | __device__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, int tiles_dim, int in_c, int in_n, int tile_size, int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY)
45 | {
46 |
47 | float workspace[4];
48 |
49 | for(int j=0; j<4; j++){
50 | workspace[0] = Btd[j];
51 | workspace[1] = Btd[j+4];
52 | workspace[2] = Btd[j+8];
53 | workspace[3] = Btd[j+12];
54 |
55 | Btd[j] = workspace[0] - workspace[2];
56 | Btd[j+4] = workspace[1] + workspace[2];
57 | Btd[j+8] = workspace[2] - workspace[1];
58 | Btd[j+12] = workspace[1] - workspace[3];
59 | }
60 |
61 | int c_offset = BN*BC;
62 | int c_tensor = Iny*BN + Inx;
63 |
64 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread
65 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2);
66 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2);
67 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1);
68 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3);
69 | }
70 | }
71 |
72 | __device__ void load_filter_tile(float *tiles, float *pOutputs, int filt_c, int filt_k, int Inx, int Iny)
73 | {
74 |
75 | int c_tensor_s = Iny*BK + Inx;
76 | int c_offset_s = BK*BC;
77 |
78 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread
79 | for(int i=0; i<4; i++){
80 | for(int j=0; j<4; j++){
81 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j];
82 | }
83 | }
84 |
85 | c_tensor_s += BN;
86 | }
87 |
88 | }
89 |
90 | __device__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k, int Inx, int Iny, int TileZ)
91 | {
92 |
93 | int c_tensor = TileZ*BK + (Iny*filt_k<<4) + Inx;
94 |
95 | int acumm;
96 | #pragma unroll
97 | for(int i=0; i<4; i++){
98 | acumm = (i*filt_k<<2);
99 | for(int j=0; j<4; j++){
100 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor];
101 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN];
102 | }
103 | }
104 | }
105 |
106 | __device__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask)
107 | {
108 |
109 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n);
110 | int acumm;
111 |
112 | //#pragma unroll
113 | for(int i=0; i<4; i++){
114 | acumm = i*in_n*in_w;
115 | for(int j=0; j<4; j++){
116 | if(mask&(1<<((i<<2) + j)))
117 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor];
118 | else
119 | tile[(i<<2) + j] = 0;
120 | }
121 | }
122 | }
123 |
124 | __device__ void __inline__ prefetch_filter_frag(float4 *filter_frag, float4 *B_frag, int f_frag_offset, int Inx, int offset1, int offset2)
125 | {
126 |
127 | *((float4*) (filter_frag)) = *(B_frag + offset1);
128 | *((float4*) (filter_frag + 1)) = *(B_frag + offset2);
129 |
130 | *((float4*) (filter_frag + 2)) = *(B_frag + f_frag_offset + offset1);
131 | *((float4*) (filter_frag + 3)) = *(B_frag + f_frag_offset + offset2);
132 | }
133 |
134 | __device__ void __inline__ prefetch_input_frag(float4* input_frag, float4 *A_frag, int frag_offset, int Inx, int offset1, int offset2)
135 | {
136 |
137 | *((float4*) (input_frag)) = *(A_frag + offset1); //ld_shared(A_frag + offset1);
138 | *((float4*) (input_frag + 1)) = *(A_frag + offset2);
139 |
140 | *((float4*) (input_frag + 2)) = *(A_frag + frag_offset + offset1);
141 | *((float4*) (input_frag + 3)) = *(A_frag + frag_offset + offset2); //3=2+1
142 | }
143 |
144 | __global__ void Winograd_kernel(float *A, float *B, float *C,
145 | int tiles_dim, int in_c, int in_n, int in_h, int in_w,
146 | int tile_size, int filt_k, int filt_c,
147 | int tiles_2d_dim, int out_c, int out_n,
148 | int tile_2d_s, int out_h, int out_w){
149 |
150 | extern __shared__ float shared_mem[];
151 | float *input_smem = (float*)shared_mem;
152 | float *filter_smem = (float*)&shared_mem[16*BC*BN];
153 |
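// Border mask for this block's 4x4 input tiles: clear the top row for the
// first tile row, the left column for the first tile column, and one or two
// bottom rows / right columns at the far edges depending on input parity.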
154 | short m = 0xFFFF;
155 | if((blockIdx.y/tiles_dim)==0) m&=0xFFF0;
156 | if((blockIdx.y/tiles_dim)==(tiles_dim-1)) m &= (!(in_w%2))?(0x0FFF):(0x00FF);
157 | if(!((blockIdx.y+1)%tiles_dim)) m &= (!(in_w%2))?(0x7777):(0x3333);
158 | if(!((blockIdx.y)%tiles_dim)) m&=0xeeee;
159 |
160 | float img_tile[16]; // Prefetch input from GMEM
161 | float filter_tile[32]; // Prefetch filter from GMEM
162 |
163 | float4 input_frag_mem[8]; //2*2(2*8/4) Data to do Outer Product + prefetch f. SMEM (double_buffer)
164 | float4 filter_frag_mem[8]; //2*2 Data to do Outer Product + prefetch f. SMEM (double_buffer)
165 | float4 accumulator[2][16] = {0.0f}; // Accumulators
166 |
167 | float4 *A_frag; // Input data pointer
168 | int frag_offset = 2* (BC*BN); // (2=8/4) SMEM input read offset
169 |
170 | float4 *B_frag; // Filter data pointer
171 | int f_frag_offset = 2* (BC*BK); // (2=8/4) SMEM filter read offset
172 |
173 | float4 *input_frag = (float4*) input_frag_mem;
174 | float4 *filter_frag = (float4*) filter_frag_mem;
175 |
176 | float4 *swap;
177 |
178 | prefetch_input_tile(A, img_tile, in_h, in_w, in_n, threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, tiles_dim, m);
179 | prefetch_filter_tile(B, filter_tile, filt_k, threadIdx.x, threadIdx.y, blockIdx.z);
180 |
181 | float4 *input_frag_buffer = (float4*) (input_frag+4);
182 | float4 *filter_frag_buffer = (float4*) (filter_frag+4);
183 |
184 | // Mainloop - iterates over the entire K dimension - not unrolled
185 | for(int iter=0; iter<in_c; iter+=BC){

[... lines 186-246 of this file are missing from the dump ...]

247 | FX<<<...>>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha);
248 |
249 | Winograd_kernel<<<...>>>(k, Ww, C, // launch configuration stripped in this dump
250 | tiles_dim, in_c, in_n, in_h, in_w, tile_size,
251 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s,
252 | out_h, out_w);
253 |
254 | return cudaGetLastError();
255 | }
256 |
257 | }
258 | #endif
--------------------------------------------------------------------------------
/src/openCNN_winograd.cu:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include <iostream>       // NOTE: header names on lines 16-30 were stripped by the dump; reconstructed from usage
17 | #include <iomanip>
18 | #include <vector>
19 | #include <cstdlib>
20 | #include <cstdio>
21 | #include <cmath>
22 | #include <cstring>
23 |
24 | #include <sys/time.h>
25 | #include <sys/resource.h>
26 | #include <cuda.h>
27 | #include <cuda_runtime.h>
28 | #include <curand_kernel.h>
29 |
30 | #include <cudnn.h>
31 |
32 | #include "config.hpp"
33 |
34 | #ifdef BASE
35 | #if __CUDA_ARCH__ < 800
36 | #include "convolutionForward_32x64x8_baseline.cu"
37 | #else
38 | #include "ampere/convolutionForward_32x64x8_baseline.cu"
39 | #endif
40 | #else
41 | #if __CUDA_ARCH__ < 800
42 | #include "convolutionForward_32x64x8.cu"
43 | #else
44 | #include "ampere/convolutionForward_32x64x8.cu"
45 | #endif
46 | #endif
47 |
48 | /*
49 | Helpers for measuring elapsed time:
50 |
51 | resnfo: datatype abstracting the resource-usage sample being taken
52 | timenfo: datatype abstracting the time metric in use
53 |
54 | timestamp: abstracts the function used to take a time sample
55 |
56 | printtime: abstracts the function used to print the time
57 |
58 | void myElapsedtime(resnfo start, resnfo end, timenfo *t): computes the
59 | elapsed time between two samples
60 | */
61 |
62 | #ifdef _noWALL_
63 | typedef struct rusage resnfo;
64 | typedef struct _timenfo {
65 | double time;
66 | double systime;
67 | } timenfo;
68 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample))
69 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \
70 | t.time + t.systime, t.time, t.systime);
71 | #else
72 | typedef struct timeval resnfo;
73 | typedef double timenfo;
74 | #define timestamp(sample) gettimeofday((sample), 0)
75 | #define printtime(t) printf("%15f s ", t);
76 | #endif
77 |
78 | void myElapsedtime(resnfo start, resnfo end, timenfo *t)
79 | {
80 | #ifdef _noWALL_
81 | t->time = (end.ru_utime.tv_sec + (end.ru_utime.tv_usec * 1E-6))
82 | - (start.ru_utime.tv_sec + (start.ru_utime.tv_usec * 1E-6));
83 | t->systime = (end.ru_stime.tv_sec + (end.ru_stime.tv_usec * 1E-6))
84 | - (start.ru_stime.tv_sec + (start.ru_stime.tv_usec * 1E-6));
85 | #else
86 | *t = (end.tv_sec + (end.tv_usec * 1E-6))
87 | - (start.tv_sec + (start.tv_usec * 1E-6));
88 | #endif /*_noWALL_*/
89 | }
90 |
91 | #define CUDA_CALL(f) { \
92 | cudaError_t err = (f); \
93 | if (err != cudaSuccess) { \
94 | std::cout \
95 | << " Error occurred: " << err << std::endl; \
96 | std::exit(1); \
97 | } \
98 | }
99 |
100 | #define CUDNN_CALL(f) { \
101 | cudnnStatus_t err = (f); \
102 | if (err != CUDNN_STATUS_SUCCESS) { \
103 | printf(" Error occurred: \n"); \
104 | std::exit(1); \
105 | } \
106 | }
107 |
108 | #define OPENCNN_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
109 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
110 | {
111 | if (code != cudaSuccess)
112 | {
113 | fprintf(stderr,"Error occurred: %s %s %d\n", cudaGetErrorString(code), file, line);
114 | if (abort) exit(code);
115 | }
116 | }
117 |
118 | void tflops(int in_n, int in_w, int in_h, int in_c, int filt_w, int filt_h, int filt_k, int pad, int str,
119 | int out_w, int out_h, float ms)
120 | {
121 |
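// Rough TFLOPS estimate: L counts the FLOPs of a direct 3x3 convolution on
// the padded input; the 2.25 = 36/16 divisor accounts for Winograd
// F(2x2,3x3) using 16 multiplies per 2x2 output tile where the direct
// method uses 36.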
122 | double L = (double) 2.0*in_n*in_c*(in_h+2*PAD_H)*(in_w+2*PAD_W)*filt_k*3.0*3.0;
123 |
124 | printf("%.3f,%.2f", ms, L/(2.25 * ms * 1e9) );
125 | }
126 |
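// Both init kernels below fill their buffer with uniform random values via
// cuRAND; the constant k passed to dev_const is ignored.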
127 | __global__ void dev_const(float *px, float k, int n) {
128 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
129 |
130 | curandState state;
131 | curand_init(clock64(), tid, 0, &state);
132 |
133 | if (tid < n)
134 | px[tid] = curand_uniform(&state);
135 | }
136 |
137 | __global__ void dev_iota(float *px, int n) {
138 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
139 |
140 | curandState state;
141 | curand_init(clock64(), tid, 0, &state);
142 |
143 | if (tid < n)
144 | px[tid] = curand_uniform(&state);
145 | }
146 |
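// Copies a tensor from cuDNN's NCHW layout to the CHWN layout (batch
// innermost) used by the openCNN kernels; the indexing assumes one block
// per (n, w, h) coordinate and one thread per channel.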
147 | __global__ void data_cpy(float *px, float *py,
148 | int in_w, int in_h, int in_c, int in_n) {
149 | int tid = blockIdx.y + blockIdx.z*in_w + threadIdx.x*in_h*in_w + blockIdx.x*in_h*in_w*in_c;
150 | int id = blockIdx.x + blockIdx.y*in_n + blockIdx.z*in_n*in_w + threadIdx.x*in_n*in_h*in_w;
151 |
152 | px[id] = py[tid];
153 | }
154 |
155 | void print(const float *data, int n, int c, int h, int w) {
156 | std::vector<float> buffer(1 << 20);
157 | CUDA_CALL(cudaMemcpy(
158 | buffer.data(), data,
159 | n * c * h * w * sizeof(float),
160 | cudaMemcpyDeviceToHost));
161 | int a = 0;
162 | for (int i = 0; i < n; ++i) {
163 | for (int j = 0; j < c; ++j) {
164 | std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
165 | for (int k = 0; k < h; ++k) {
166 | for (int l = 0; l < w; ++l) {
167 | std::cout << std::setw(12) << std::right << buffer[a];
168 | ++a;
169 | }
170 | std::cout << std::endl << std::endl;
171 | }
172 | }
173 | }
174 | std::cout << std::endl;
175 | }
176 |
177 | void output_checker(float* A, float* B, int n, int len, int channel, int shift) {
178 | int error_cnt = 0, i, j, k, m;
179 | float max_error = 0;
180 | for(k = 0; k < channel; k++){
181 | for (i = 0; i < len; i++) {
182 | for (j = 0; j < len; j++) {
183 | for (m = 0; m < n; m++) {
184 | float diff = fabs(
185 | A[k*len*len*n + i*len*n + j*n + m] -
186 | B[m*len*len*channel + k*len*len + i*len + j]);
187 | if (diff > 1){ //1e-4
188 | error_cnt++;
189 | printf("h:%d, w:%d, n:%d, c:%d -> %f vs %f : +- %f\n", i, j, m, k,
190 | A[k*len*len*n + i*len*n + j*n + m],
191 | B[m*len*len*channel + k*len*len + i*len + j], diff);
192 | std::exit(1);
193 | }
194 | if (diff > max_error)
195 | max_error = diff;
196 | }
197 | }
198 | }
199 | }
200 | printf("[max_error: %f][error_cnt: %d] of %d\n", max_error, error_cnt, n*len*len*channel*shift);
201 | }
202 |
203 |
204 | cudaError_t convolutionForward(float *k, int in_h, int in_w, float *w, int out_h,
205 | int out_w, int out_n, int out_c, float *C, float *Ww,
206 | const unsigned int n,
207 | int tiles_dim, int in_n, int tile_size, int elems_dim,
208 | int in_c, int filt_k, int filt_c, int filt_h, int filt_w,
209 | int alpha, int m){
210 | cudaError_t out;
211 |
212 | if(BN==32 && BK==64 && BC==8){
213 | out = convolutionForward_32x64x8(k, in_h, in_w, w, out_h, out_w, out_n, out_c, C, Ww, n, tiles_dim, in_n, tile_size, in_c, filt_k, filt_c, filt_h, filt_w, alpha, m);
214 | } else {
215 | std::cout << "Configuration not supported yet" << std::endl;
216 | }
217 |
218 | return out;
219 | }
220 |
221 | cudaError_t init_data(float *in_data, float *in_data_open, float *filt_data, float *filt_data_open, int in_w, int in_h, int in_c, int in_n, int filt_w, int filt_h, int filt_c, int filt_k, int tile_size){
222 |
223 | int n = in_n*in_c*in_h*in_w;
224 | int blk_size = 256;
225 |
226 | dim3 dimBlock(blk_size);
227 | dim3 dimGrid((n + dimBlock.x -1)/dimBlock.x);
228 |
229 | dev_iota<<<dimGrid, dimBlock>>>(in_data, n);
230 | data_cpy<<<dim3(in_n, in_w, in_h), in_c>>>(in_data_open, in_data, in_w, in_h, in_c, in_n); // launch shape reconstructed from data_cpy's indexing
231 |
232 | n = filt_k*filt_c*filt_h*filt_w;
233 | dim3 dimGrid_f = dim3((n + dimBlock.x -1)/dimBlock.x);
234 | dev_const<<<dimGrid_f, dimBlock>>>(filt_data, 1.f, n);
235 | data_cpy<<<dim3(filt_k, filt_w, filt_h), filt_c>>>(filt_data_open, filt_data, filt_w, filt_h, filt_c, filt_k); // launch shape reconstructed from data_cpy's indexing
236 |
237 | return cudaGetLastError();
238 | }
239 |
240 |
241 | int main(int argc, char *argv[]) {
242 |
243 |
244 | // ========== Set ImageBatch, filter, convolution and output parameters ========== //
245 | // ImageBatch
246 | const int in_n = (argc > 1)?atoi (argv[1]):N; // Number of images
247 | const int in_c = (argc > 2)?atoi (argv[2]):C_in; // Number of feature maps per image
248 | const int in_h = (argc > 3)?atoi (argv[3]):W; // Height of each feature map
249 | const int in_w = (argc > 4)?atoi (argv[4]):W; // Width of each feature map
250 |
251 | // Filter
252 | const int filt_k = (argc > 5)?atoi (argv[5]):K;
253 | const int filt_c = (argc > 6)?atoi (argv[6]):C_in;
254 | const int filt_h = (argc > 7)?atoi (argv[7]):R;
255 | const int filt_w = (argc > 8)?atoi (argv[8]):R;
256 |
257 | std::cout << in_n << "," << in_c << "," << in_h << "," << filt_k << "," << filt_h << ",";
258 |
259 | // Convolution config
260 | const int pad_h = PAD_H; // Zero-padding height
261 | const int pad_w = PAD_W; // Zero-padding width
262 | const int str_h = STR_H; // Vertical filter stride
263 | const int str_w = STR_W; // Horizontal filter stride
264 | const int dil_h = DIL_H; // Filter height dilation
265 | const int dil_w = DIL_W; // Filter width dilation
266 |
267 |
268 | // Output
269 | int out_n; // Number of outputs
270 | int out_c; // Number of feature maps per output
271 | int out_h; // Height of each feature map
272 | int out_w; // Width of each feature map
273 |
274 | /*
275 | ####################################################################
276 | ======================= openCNN preparation =======================
277 | ####################################################################
278 | */
279 | // Winograd config
280 | const int m = M;
281 | const int r = filt_h;
282 | const int tile_size = m+r-1; // alpha value
283 | int elems_dim;
284 | int tiles_dim;
285 |
286 | if(m==2){
287 | tiles_dim = ceil(ceil((double)(in_w+2)/2)-1);
288 | elems_dim = tiles_dim*4;
289 | } else {
290 | std::cout << "Configuration not supported yet" << std::endl;
291 | exit(0);
292 | }
293 |
294 | // Output
295 | out_n = in_n; // Number of outputs
296 | out_c = filt_k; // Number of feature maps per output
297 | out_h = in_h; // Height of each feature map
298 | out_w = in_w; // Width of each feature map
299 |
300 | float *in_data_open;
301 | float *filt_data_open, *workspace;
302 |
303 | // ImageBatch openCNN
304 | OPENCNN_CALL(cudaMalloc(
305 | &in_data_open, in_n * in_c * in_h * in_w * sizeof(float)));
306 | // Filter openCNN
307 | OPENCNN_CALL(cudaMalloc(
308 | &filt_data_open, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
309 | // Filter transformation
310 | OPENCNN_CALL(cudaMalloc(
311 | &workspace, filt_k * filt_c * tile_size * tile_size * sizeof(float)));
312 |
313 | // Output openCNN
314 | float *out_data;
315 | OPENCNN_CALL(cudaMalloc(
316 | &out_data, out_n * out_c * out_h * out_w * sizeof(float)));
317 |
318 | // =================== openCNN layouts =================== //
319 | cudaMemcpyToSymbol(access_f_s, aux, 64*sizeof(int));
320 | cudaMemcpyToSymbol(access_s, aux2, 64*sizeof(int));
321 | #ifndef BASE
322 | #if defined(OPTLDS64)
323 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int));
324 | cudaMemcpyToSymbol(out_thread, aux4, 32*sizeof(int));
325 | cudaMemcpyToSymbol(out_sgemm, aux5, 32*sizeof(int));
326 | cudaMemcpyToSymbol(exhange, aux6, 32*sizeof(int));
327 | #endif
328 | #else
329 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int));
330 | cudaMemcpyToSymbol(out_thread, aux4, 16*sizeof(int));
331 | #endif
332 |
333 | /*
334 | ####################################################################
335 | ====================== cuDNN preparation ======================
336 | ####################################################################
337 | */
338 |
339 | float *in_data, *filt_data;
340 |
341 | // ImageBatch cuDNN
342 | CUDA_CALL(cudaMalloc(
343 | &in_data, in_n * in_c * in_h * in_w * sizeof(float)));
344 | // Filter cuDNN
345 | CUDA_CALL(cudaMalloc(
346 | &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
347 |
348 | // =================== Set descriptors =================== //
349 | cudnnHandle_t cudnn;
350 | CUDNN_CALL(cudnnCreate(&cudnn));
351 |
352 | // Input image Descriptors
353 | cudnnTensorDescriptor_t in_desc;
354 | CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
355 | CUDNN_CALL(cudnnSetTensor4dDescriptor(
356 | in_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT,
357 | in_n, in_c, in_h, in_w));
358 |
359 | // Filter Descriptors
360 | cudnnFilterDescriptor_t filt_desc;
361 | CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
362 | CUDNN_CALL(cudnnSetFilter4dDescriptor(
363 | filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/,
364 | filt_k, filt_c, filt_h, filt_w));
365 |
366 | // Convolution Descriptors
367 | cudnnConvolutionDescriptor_t conv_desc;
368 | CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
369 | CUDNN_CALL(cudnnSetConvolution2dDescriptor(
370 | conv_desc,
371 | pad_h, pad_w, str_h, str_w, dil_h, dil_w,
372 | CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); //CUDNN_CONVOLUTION
373 |
374 |
375 | // =================== Query output layout =================== //
376 | CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
377 | conv_desc, in_desc, filt_desc,
378 | &out_n, &out_c, &out_h, &out_w));
379 |
380 | // =================== Set and allocate output tensor descriptor ===================//
381 | cudnnTensorDescriptor_t out_desc;
382 | CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
383 | CUDNN_CALL(cudnnSetTensor4dDescriptor(
384 | out_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT,
385 | out_n, out_c, out_h, out_w));
386 |
387 | float *out_data_cudnn;
388 | CUDA_CALL(cudaMalloc(
389 | &out_data_cudnn, out_n * out_c * out_h * out_w * sizeof(float)));
390 |
391 | // =================== Query convolution forward algorithm =================== //
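// 6 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD: pin cuDNN to its Winograd
// kernel instead of querying for the fastest algorithm, so the comparison
// is like-for-like.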
392 | cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)6;
393 |
394 | // =================== Query workspace and allocate =================== //
395 | size_t ws_size;
396 | CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
397 | cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));
398 |
399 | float *ws_data;
400 | CUDA_CALL(cudaMalloc(&ws_data, ws_size));
401 |
402 | // =================== Launch convolution on cuDNN =================== //
403 | float alpha = 1.f;
404 | float beta = 0.f;
405 |
406 | /*
407 | ####################################################################
408 | ============================= Init data =============================
409 | ####################################################################
410 | */
411 |
412 | OPENCNN_CALL(init_data(in_data, in_data_open, filt_data, filt_data_open, in_w, in_h, in_c, in_n,
413 | filt_w, filt_h, filt_c, filt_k, tile_size));
414 |
415 | /*
416 | ####################################################################
417 | ============================= Execution =============================
418 | ####################################################################
419 | */
420 | CUevent hStart, hStop;
421 | float ms;
422 | OPENCNN_CALL( cudaEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT
423 | OPENCNN_CALL( cudaEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) );
424 |
425 | // Loop of executions
426 | int iterations = 100;
427 |
428 | // Performs warmup operation
429 | OPENCNN_CALL(convolutionForward(in_data_open, in_h, in_w, filt_data_open, out_h, out_w, out_n, out_c, out_data, workspace,
430 | out_c*out_n*out_h*out_w,
431 | tiles_dim, in_n, tile_size, elems_dim, in_c, filt_k, filt_c, filt_h, filt_w, tile_size, m));
432 |
433 | // ============================= openCNN exec =============================
434 | cudaDeviceSynchronize();
435 | ( cudaEventRecord( hStart, NULL ) );
436 | for(int iter=0; iter<iterations; iter++){

[... the remainder of this file, /src/outer_product.cuh, /src/outer_product_suffle.cuh, and the first 88 lines of the next file are missing from the dump ...]

--------------------------------------------------------------------------------
/src/store_and_transform_output_baseline.cuh:
--------------------------------------------------------------------------------
88 | ...7) ){
89 |
90 | #pragma unroll
91 | for(int i=0; i<4; i+=2){
92 |
93 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0
94 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1);
95 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1
96 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t);
97 |
98 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t);
99 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t);
100 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t);
101 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t);
102 |
103 | t+=4;
104 | }
105 | }
106 |
107 | __syncthreads();
108 |
109 | for(int i=0; i<16; i++){
110 | C_tile[i] = out[init + i*offset];
111 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset];
112 | }
113 |
114 | // transform output tiles
115 | transform_output_tile(C, C_tile, At, Inx, Iny, TileX, TileY, TileZ, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset);
116 |
117 |
118 | __syncthreads();
119 | }
120 |
121 | }
122 |
123 | }
124 | #endif
--------------------------------------------------------------------------------
/src/store_and_transform_output_optLDS64.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "config.hpp"
17 |
18 | #ifndef _OUTPUT_KERNEL_OPT3_
19 | #define _OUTPUT_KERNEL_OPT3_
20 | extern "C"
21 | {
22 |
23 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w)
24 | {
25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2;
26 | int x, x1;
27 |
28 | for(int j=0; j<4; j++){
29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
31 |
32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
34 | }
35 |
36 | for(int i=0; i<2; i++){
37 | x = i*4;
38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2);
39 | if(mask&(1<<(i*2))){
40 |
41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x;
42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y;
43 | }
44 | if(mask&(1<<(i*2+1))){
45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x;
46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y;
47 | }
48 | }
49 |
50 | }
51 |
52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny,
53 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w,
54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){
55 |
56 | float2 *output_smem = (float2 *) shared_mem;
57 | float2 *accumulator = (float2 *) acumm_smem;
58 | float2 *C_out = (float2*)C;
59 |
60 | float2 *C_tile = (float2*) input_frag_mem;
61 | float2 *At = (float2*) filter_frag_mem;
62 |
63 | mask = 0x000F;
64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0x0005;
66 |
67 | // output transpose step
68 | int t=0;
69 | int acumm1, acumm2;
70 | // For transposing
71 | //acumm1 = access_s_out[Inx]; //* 4
72 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8;
73 | acumm2 = acumm1+4;
74 |
75 | int acumm4 = BN_p*8*2 ; //*4
76 | int idx = Iny * BN_p;
77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
78 |
79 | // For transforming
80 | int offset = BN_p; //*2/2
81 | int init = (Iny/4)*BN_p*16 + (Iny%4)*(32+2);
82 | init += (Inx/16)*8 + ((Inx/8)%2)*16 + (Inx%8); //40=(8+2)*4, 4 blocks/buffer
83 |
84 |
85 | int c_glb_offset = in_n*out_h*out_w;
86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + ((Inx/8)%2)*2 + (Inx%8)*2*2 + ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset;
87 | c_tensor/=2;
88 |
89 | #pragma unroll
90 | for(int round=0; round<4; round++){
91 |
92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t);
93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t
94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2);
95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1
96 |
97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32);
98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16
99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34);
100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17
101 |
102 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4);
103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2
104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6);
105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3
106 |
107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36);
108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18
109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38);
110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19
111 |
112 | t+=8;
113 |
114 | __syncthreads();
115 |
116 |
117 | for(int i=0; i<16; i++){
118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4
119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4
120 | }
121 |
122 | // transform output tiles
123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w);
124 |
125 | __syncthreads();
126 |
127 | }
128 | }
129 |
130 | }
131 | #endif
--------------------------------------------------------------------------------
/src/store_and_transform_output_optSTS64.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "config.hpp"
17 |
18 | #ifndef _OUTPUT_KERNEL_OPT1_
19 | #define _OUTPUT_KERNEL_OPT1_
20 | extern "C"
21 | {
22 |
23 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At,
24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w){
25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2;
26 | int x, x1;
27 |
28 | #pragma unroll
29 | for(int j=0; j<4; j++){
30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
32 |
33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
35 | }
36 |
37 |
38 | #pragma unroll
39 | for(int i=0; i<2; i++){
40 | x = i*4;
41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2);
42 | if(mask&(1<<(i*2))){
43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x;
44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y;
45 | }
46 |
47 | if(mask&(1<<(i*2+1))){
48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x;
49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y;
50 | }
51 | }
52 | }
53 |
54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny,
55 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w,
56 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){
57 |
58 | float2 *output_smem = (float2 *) shared_mem;
59 | float2 *accumulator = (float2 *) acumm_smem;
60 | float2 *C_out = (float2*)C;
61 |
62 | float2 *C_tile = (float2*) input_frag_mem;
63 | float2 *At = (float2*) filter_frag_mem;
64 |
65 | mask = 0x000F;
66 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003;
67 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0x0005;
68 |
69 | // output transpose step
70 | int t=0;
71 | int acumm1, acumm2;
72 | // For transposing
73 | //acumm1 = access_s_out[Inx]; //* 4
74 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8;
75 | acumm2 = acumm1+4;
76 |
77 | int acumm4 = BN_p*16 ; //*4
78 | int idx = Iny * BN_p;
79 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
80 |
81 | // For transforming
82 | int offset = BN_p *2; //*2/2
83 | int init = ( (Iny/4)*BN_p*16 + (Iny%4)*(32+2) ) *2 + Inx;
84 |
85 | int c_glb_offset = in_n*out_h*out_w;
86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+
87 | ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset;
88 | c_tensor/=2;
89 |
90 | #pragma unroll
91 | for(int round=0; round<4; round++){
92 |
93 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t);
94 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t
95 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2);
96 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1
97 |
98 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32);
99 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16
100 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34);
101 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17
102 |
103 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4);
104 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2
105 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6);
106 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3
107 |
108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36);
109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18
110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38);
111 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19
112 |
113 | t+=8;
114 |
115 | __syncthreads();
116 |
117 |
118 | for(int i=0; i<16; i++){
119 | C_tile[i].x = shared_mem[i*offset + init];
120 | C_tile[i].y = shared_mem[i*offset + init + 32];
121 | }
122 |
123 | // transform output tiles
124 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w);
125 | __syncthreads();
126 | }
127 | }
128 |
129 | }
130 | #endif
--------------------------------------------------------------------------------
/src/store_and_transform_output_optSTS64_compact.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2021 Roberto Lopez Castro
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 |
16 | #include "config.hpp"
17 |
18 | #ifndef _OUTPUT_KERNEL_OPT1_
19 | #define _OUTPUT_KERNEL_OPT1_
20 | extern "C"
21 | {
22 |
23 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At,
24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset,
25 | short mask, int out_w){
26 | c_tensor += ((round/2)*32 + (round%2)*2)*c_glb_offset/2; // advance to this round's output channels (float2 units)
27 | int x, x1;
28 |
29 |
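  // rows: At = A^T * M with A^T = [[1,1,1,0],[0,1,-1,-1]] (Winograd F(2x2,3x3))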
30 | #pragma unroll
31 | for(int j=0; j<4; j++){
32 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x;
33 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y;
34 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x;
35 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y;
36 | }
37 |
38 | x = in_n/2; // one output pixel to the right, in float2 units (pixel stride = in_n floats)
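  // columns: combine At with the columns of A ([1,1,1,0] and [0,1,-1,-1]);
  // mask bits gate the four pixels of the 2x2 output tile at odd borders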
39 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x;
40 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y;
41 |
42 | if(mask&0x2){
43 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x;
44 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y;
45 | }
46 |
47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x; // one full output row (out_w pixels) in float2 units
48 | if(mask&0x4){
49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x;
50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y;
51 | }
52 |
53 | if(mask&0x8){
54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x;
55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y;
56 | }
57 | }
58 |
59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny,
60 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w,
61 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem,
62 | short mask){
63 |
64 | float2 *output_smem = (float2 *) shared_mem;
65 | float2 *accumulator = (float2 *) acumm_smem;
66 | float2 *C_out = (float2*)C;
67 |
68 | float2 *C_tile = (float2*) input_frag_mem;
69 | float2 *At = (float2*) filter_frag_mem;
70 |
71 | mask = 0x000F; // by default write all four pixels of the 2x2 output tile
72 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; // last tile row, odd output size: drop the bottom pixels
73 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0x0005; // last tile column, odd output size: drop the right pixels
74 |
75 | // output transpose step
76 | int t,j;
77 | int acumm1, acumm2;
78 | // For transposing: compact variant packs sub-rows at an 18-element stride
79 | t = Inx%8/2;
80 | acumm1 = t*18 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8;
81 | acumm2 = acumm1+4;
82 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16;
83 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16;
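  // the divide/multiply pair wraps any offset that spills past the (t+1)*16
  // boundary back by 16, then adds t*16 to place it in sub-row t's segment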
84 | t=0;
85 |
86 | int acumm4 = BN_p*16 ; //*4
87 | int idx = Iny * BN_p;
88 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2
89 |
90 | // For transforming
91 | int offset = BN_p*2; // row stride in floats
92 |
93 | int init = (Iny%4)*(16+2)*2 + Inx;
94 | init = init - init/((Iny%4+1)*32)*32 + Iny%4*32;
95 | init += (Iny/4)*BN_p*16*2;
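  // read offsets use the same wrap trick: fold past each 32-wide boundary,
  // shift into sub-row Iny%4, then add the row-block displacement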
96 |
97 | int c_glb_offset = in_n*out_h*out_w; // one output channel; layout is [K][H][W][N], batch innermost
98 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+
99 |                ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset;
100 | c_tensor/=2; // global output is addressed as float2, so halve the float index
101 |
102 | // k=0, block 0
103 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator);
104 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1);
105 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2);
106 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3);
107 |
108 | // k=1, block 0
109 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4);
110 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5);
111 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6);
112 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7);
113 |
114 | // k=0, block 1
115 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32);
116 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33);
117 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34);
118 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35);
119 |
120 | // k=1, block 1
121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36);
122 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37);
123 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38);
124 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39);
125 |
126 | j=0; t+=8; // j selects the active smem buffer (0 or 2); t walks the accumulator fragments
127 |
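  // Pipelined epilogue: the prologue above filled buffer 0. Each round
  // transforms the buffer written previously while scattering the next 8
  // fragments into the other buffer (double buffering in shared memory);
  // (round/2)*2+j/2 recovers the logical round index 0..3 expected by
  // transform_output_tile, and the tail below drains the final buffer.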
128 | #pragma unroll
129 | for(int round=0; round<3; round++){
130 |
131 | __syncthreads();
132 |
133 | int disp = j/2*(BN_p*2*16)*2; // float offset of the smem buffer to read this round
134 | #pragma unroll
135 | for(int i=0; i<16; i++){
136 | C_tile[i].x = shared_mem[disp + i*offset + init];
137 | C_tile[i].y = shared_mem[disp + i*offset + init + 32];
138 | }
139 |
140 | // transform output tiles
141 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w);
142 |
143 | j = 2 - j; // toggle between buffer offsets 0 and 2
144 |
145 | // k=0, block 0
146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t);
147 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1);
148 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2);
149 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3);
150 |
151 | // k=1, block 0
152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4);
153 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5);
154 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6);
155 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7);
156 |
157 | // k=0, block 1
158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32);
159 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33);
160 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34);
161 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35);
162 |
163 | // k=1, block 1
164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36);
165 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37);
166 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38);
167 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39);
168 |
169 | t+=8;
170 |
171 | }
172 |
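  // drain: transform the buffer written in the last loop iteration (logical round 3)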
173 | __syncthreads();
174 |
175 | int disp = j/2*(BN_p*2*16)*2; // float offset of the last-written smem buffer
176 | #pragma unroll
177 | for(int i=0; i<16; i++){
178 | C_tile[i].x = shared_mem[disp + i*offset + init];
179 | C_tile[i].y = shared_mem[disp + i*offset + init + 32];
180 | }
181 | // transform output tiles
182 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w);
183 | }
184 |
185 | }
186 | #endif
--------------------------------------------------------------------------------