├── LICENSE ├── Makefile ├── README.md ├── bench ├── bench.sh └── smem │ ├── LDS │ ├── Makefile │ ├── lds128.sass │ ├── lds32.sass │ ├── lds64.sass │ ├── lds64_opt3.sass │ └── main.cu │ └── STS │ ├── Makefile │ ├── main.cu │ ├── sts.sass │ ├── sts128.sass │ ├── sts128_0.sass │ ├── sts32.sass │ ├── sts64.sass │ ├── sts64_2bank_conflict.sass │ ├── sts64_broadcast.sass │ └── sts64_opt3.sass └── src ├── FX_m2.cu ├── ampere ├── convolutionForward_32x64x8.cu ├── convolutionForward_32x64x8_baseline.cu ├── store_and_transform_output_baseline.cuh ├── store_and_transform_output_optLDS64.cuh ├── store_and_transform_output_optSTS64.cuh └── store_and_transform_output_optSTS64_compact.cuh ├── config.hpp ├── convolutionForward_32x64x8.cu ├── convolutionForward_32x64x8_baseline.cu ├── openCNN_winograd.cu ├── outer_product.cuh ├── outer_product_suffle.cuh ├── store_and_transform_output_baseline.cuh ├── store_and_transform_output_optLDS64.cuh ├── store_and_transform_output_optSTS64.cuh └── store_and_transform_output_optSTS64_compact.cuh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ARCH = 75 # modify this. 
Ampere=86
2 | NAME = wgrad
3 | OUT = OPTSTS64
4 | #MODE = PROF
5 | #LBR = OPENCNN
6 | 
7 | all:
8 | 	nvcc src/openCNN_winograd.cu -lcudnn -m64 -arch=compute_$(ARCH) -code=sm_$(ARCH) -o $(NAME) -D$(OUT)
9 | 
10 | clean:
11 | 	rm $(NAME)
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenCNN
2 | A Winograd minimal filtering algorithm implementation in CUDA
3 | ## Requirements
4 | - CUDA Toolkit
5 | - cuDNN
6 | - CMake
7 | 
8 | ## Build the project
9 | ```
10 | git clone https://github.com/UDC-GAC/openCNN.git
11 | cd openCNN
12 | ```
13 | The GPU architecture code must be specified inside the Makefile before compiling.
14 | 
15 | ```
16 | make
17 | ```
18 | 
19 | Compile-time macros have been defined for testing purposes. They can be specified in the Makefile according to the following values:
20 | ```
21 | $OUT (builds a specific output storage and transform version):
22 | - BASE: baseline layout
23 | - OPTSTS64 (default): optSTS64 layout
24 | - OPTSTS64_CMP: optSTS64_compact layout
25 | - OPTLDS64: optLDS64 layout
26 | ```
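For reference, a sketch of how these `-D$(OUT)` values are expected to gate the `store_and_transform_output_*.cuh` headers at compile time. Only `OPTSTS64_CMP` appears verbatim in `src/convolutionForward_32x64x8.cu`; the other spellings are inferred from this list, so treat the exact guards as an assumption:

```
// Hypothetical compile-time dispatch for -D$(OUT); not copied from the
// sources, but consistent with the macro names above and with the
// #ifdef OPTSTS64_CMP that is visible in src/convolutionForward_32x64x8.cu.
#if defined(BASE)
  #include "store_and_transform_output_baseline.cuh"
#elif defined(OPTLDS64)
  #include "store_and_transform_output_optLDS64.cuh"
#elif defined(OPTSTS64_CMP)
  #include "store_and_transform_output_optSTS64_compact.cuh"
#else  // OPTSTS64, the default layout
  #include "store_and_transform_output_optSTS64.cuh"
#endif
```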
27 | ## Run examples
28 | (Recommended before time measurement) Lock the clocks:
29 | ```
30 | sudo nvidia-smi -i 0 -pm 1
31 | sudo nvidia-smi -lgc 1750 -i 0
32 | ```
33 | 1. OpenCNN benchmark
34 | ```
35 | cd bench
36 | ./bench.sh
37 | ```
38 | 2. Instruction-level microbenchmarking (requires a Turing device). TuringAs (https://github.com/daadaada/turingas) must be installed to run the instruction-level microbenchmarks.
39 | ```
40 | cd bench/smem/STS
41 | make
42 | ./test
43 | ```
44 | 
45 | ## Citation
46 | If you find this tool helpful, please cite:
47 | ```
48 | @Article{math9172033,
49 | AUTHOR = {Castro, Roberto L. and Andrade, Diego and Fraguela, Basilio B.},
50 | TITLE = {OpenCNN: A Winograd Minimal Filtering Algorithm Implementation in CUDA},
51 | JOURNAL = {Mathematics},
52 | VOLUME = {9},
53 | YEAR = {2021},
54 | NUMBER = {17},
55 | ARTICLE-NUMBER = {2033},
56 | URL = {https://www.mdpi.com/2227-7390/9/17/2033},
57 | ISSN = {2227-7390},
58 | DOI = {10.3390/math9172033}
59 | }
60 | ```
61 | ## License
62 | Apache-2.0 License
63 | 
64 | -- Roberto López Castro
65 | 
--------------------------------------------------------------------------------
/bench/bench.sh:
--------------------------------------------------------------------------------
1 | echo "in_n,in_c,in_h,filt_k,filt_w,openCNN_sec,openCNN_flops,cuDNN_sec,cuDNN_flops"
2 | 
3 | for n in 32 64 96 128;
4 | do
5 | 	../wgrad $n 64 56 56 64 64 3 3
6 | done
7 | 
8 | for n in 32 64 96 128;
9 | do
10 | 	../wgrad $n 128 28 28 128 128 3 3
11 | done
12 | 
13 | for n in 32 64 96 128;
14 | do
15 | 	../wgrad $n 256 14 14 256 256 3 3
16 | done
17 | 
18 | for n in 32 64 96 128;
19 | do
20 | 	../wgrad $n 512 7 7 512 512 3 3
21 | done
22 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python -m turingas.main -i lds32.sass -o lds32.cubin
3 | 	python -m turingas.main -i lds64.sass -o lds64.cubin
4 | 	python -m turingas.main -i lds128.sass -o lds128.cubin
5 | 	python -m turingas.main -i lds64_opt3.sass -o lds64_opt3.cubin
6 | 	nvcc -arch=sm_75 main.cu -lcuda -o test
7 | 
8 | clean:
9 | 	rm lds32.cubin test lds64.cubin lds64_opt3.cubin lds128.cubin
10 | 
--------------------------------------------------------------------------------
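The four lds*.sass kernels that follow share one skeleton: derive a per-lane shared-memory byte offset, read SR_CLOCKLO, run 128 iterations of 128 back-to-back LDS instructions, read the clock again, and store the delta for main.cu to report. A rough CUDA-level analogue of that skeleton is sketched below; the kernel name and indexing are invented here, and nvcc may reschedule or widen the accesses, which is exactly why the real benchmarks are written in raw SASS:

```
#include <cstdio>

// Illustrative CUDA counterpart (not part of the repo) of the lds*.sass
// measurement loop: one warp hammers shared memory and measures elapsed SM
// cycles with clock64(), the compiler-visible equivalent of CS2R SR_CLOCKLO.
__global__ void lds32_like(long long *out) {   // out must hold two values
    __shared__ float smem[32];
    smem[threadIdx.x] = (float)threadIdx.x;
    __syncwarp();

    float acc = 0.f;
    long long start = clock64();
    for (int iter = 0; iter < 128; ++iter) {   // 128 outer iterations...
        #pragma unroll
        for (int i = 0; i < 128; ++i)          // ...of 128 32-bit loads each
            acc += smem[(threadIdx.x + i) & 31];
    }
    long long stop = clock64();

    if (threadIdx.x == 0) out[0] = stop - start;  // clocks, as main.cu prints
    if (acc == -1.f) out[1] = 1;  // unreachable: keeps the loads live
}
```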
/bench/smem/LDS/lds128.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 
13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ;
14 | --:-:-:-:5 MOV iter, RZ;
15 | 
16 | 
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 | 
19 | LOOP:
20 | 
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 |     if i == 64:
25 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 |     out.append(f'--:-:-:-:1 LDS.128 tmp, [offset];')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 | 
30 | 
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 | 
35 | 
36 | 
37 | --:-:-:-:2 EXIT;
38 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds32.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-16 ~ output0, output1, tid, offset, target, start, end, iter
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 | 
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 | 
17 | LOOP:
18 | 
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 |     if i == 64:
23 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 |     out.append(f'--:-:-:-:1 LDS target, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 | 
28 | 
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 | 
33 | 
34 | 
35 | 
36 | --:-:-:-:2 EXIT;
37 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 | 
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 | 
17 | LOOP:
18 | 
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 |     if i == 64:
23 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 |     out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 | 
28 | 
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 | 
33 | 
34 | 
35 | 
36 | --:-:-:-:2 EXIT;
37 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64_opt3.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, aux, aux2, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 
13 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
14 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
15 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16
16 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT;
17 | 
18 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
19 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*8 + tid%16
20 | 
21 | --:-:-:-:5 MOV iter, RZ;
22 | 
23 | --:-:-:-:2 @P0 BRA JMP;
24 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ;
25 | 
26 | JMP:
27 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ;
28 | 
29 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
30 | 
31 | LOOP:
32 | 
33 | out = []
34 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
35 | for i in range(128):
36 |     if i == 64:
37 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
38 |     out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
39 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
40 | out_ = '\n'.join(out) + '\n'
41 | 
42 | 
43 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
44 | --:-:-:-:5 IADD3 end, end, -start, RZ;
45 | --:-:-:-:2 STG.E.GPU [output0], end;
46 | 
47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
49 | ##--:-:-:-:2 STG.E.GPU [output0], aux2;
50 | #--:-:-:-:2 STG.E.GPU [output0], aux2;
51 | 
52 | 
53 | --:-:-:-:2 EXIT;
54 | 
--------------------------------------------------------------------------------
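The offset arithmetic in lds64_opt3.sass (and in its STS twin, sts64_opt3.sass) is easier to check in plain C. The host-side sketch below is not part of the repo; it reproduces the SASS address computation and prints the 8-byte word each lane touches. Lanes 0-7 pair with lanes 16-23 (and 8-15 with 24-31) on complementary halves of the 32 banks, the conflict-free arrangement for 64-bit shared-memory accesses that are issued in two phases:

```
#include <stdio.h>

// Host-side reproduction (illustrative only) of the *_opt3 address math:
// every lane lands on a distinct 8-byte word of shared memory.
int main(void) {
    for (int tid = 0; tid < 32; ++tid) {
        int aux  = tid % 16;              // IMAD aux, aux, -1, tid
        int aux2 = (tid / 16) * 8 + aux;  // IMAD aux2, aux2, 8, aux
        if (aux >= 8)                     // path not skipped by @P0 BRA JMP
            aux2 += 8;                    // IADD3 aux2, aux2, 8, RZ
        printf("lane %2d -> word %2d (byte offset %3d)\n",
               tid, aux2, aux2 << 3);     // SHF.L offset, aux2, 3, RZ
    }
    return 0;
}
```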
/bench/smem/LDS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cuda.h>    // CUDA driver API: CUmodule, cuModuleLoad, cuLaunchKernel
2 | #include <stdio.h>
3 | #include <string.h>
4 | 
5 | char* concat(const char *s1, const char *s2)
6 | {
7 | 	char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
8 | 	// in real code you would check for errors in malloc here
9 | 	strcpy(result, s1);
10 | 	strcat(result, s2);
11 | 	return result;
12 | }
13 | 
14 | void run(char * name, int size){
15 | 	char * file_name = concat(name, ".cubin");
16 | 
17 | 	int *output;
18 | 	cudaMalloc((void**)&output, sizeof(int)*32);
19 | 
20 | 	CUmodule module;
21 | 	CUfunction kernel;
22 | 
23 | 	cuModuleLoad(&module, file_name);
24 | 	cuModuleGetFunction(&kernel, module, "kern");
25 | 
26 | 	void * args[1] = {&output};
27 | 	cuLaunchKernel(kernel, 1, 1, 1,
28 | 		32, 1, 1,
29 | 		32*sizeof(float)*size, 0, args, 0);
30 | 
31 | 	int *output_h = (int*)malloc(sizeof(int)*32);
32 | 
33 | 	cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
34 | 
35 | 	printf("%s took %d clocks.\n", name, output_h[0]);
36 | 	printf("Each instruction takes %.2f clocks.\n", (float)output_h[0]/(128.0*128.0)); // workload of a thread: 128 iterations x 128 loads
37 | 	printf("Throughput %.2f bytes/cycle.\n\n", ((double)32*128*128*4*size)/output_h[0]); // bytes loaded by the warp / elapsed clocks
38 | 
39 | 	cudaFree(output);
40 | 	free(output_h);
41 | }
42 | 
43 | int main(){
44 | 	run("lds32", 1);
45 | 	printf("\n");
46 | 	run("lds64", 2);
47 | 	printf("\n");
48 | 	run("lds128", 4);
49 | 
50 | 	printf("\n");
51 | 	run("lds64_opt3", 2);
52 | 	return 0;
53 | }
54 | 
--------------------------------------------------------------------------------
/bench/smem/STS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python -m turingas.main -i sts32.sass -o sts32.cubin
3 | 	python -m turingas.main -i sts64.sass -o sts64.cubin
4 | 	python -m turingas.main -i sts128.sass -o sts128.cubin
5 | 	python -m turingas.main -i sts128_0.sass -o sts128_0.cubin
6 | 	python -m turingas.main -i sts64_2bank_conflict.sass -o sts64_2bank_conflict.cubin
7 | 	python -m turingas.main -i sts64_broadcast.sass -o sts64_broadcast.cubin
8 | 	python -m turingas.main -i sts64_opt3.sass -o sts64_opt3.cubin
9 | 	nvcc -arch=sm_75 main.cu -lcuda -o test
10 | 
11 | clean:
12 | 	rm sts32.cubin sts64.cubin sts128.cubin sts128_0.cubin sts64_2bank_conflict.cubin sts64_broadcast.cubin sts64_opt3.cubin test
13 | 
--------------------------------------------------------------------------------
/bench/smem/STS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cuda.h>    // CUDA driver API: CUmodule, cuModuleLoad, cuLaunchKernel
2 | #include <stdio.h>
3 | #include <string.h>
4 | 
5 | #define ITERS 32768
6 | 
7 | char* concat(const char *s1, const char *s2)
8 | {
9 | 	char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
10 | 	// in real code you would check for errors in malloc here
11 | 	strcpy(result, s1);
12 | 	strcat(result, s2);
13 | 	return result;
14 | }
15 | 
16 | #define CUDA_SAFE_CALL( call) { \
17 | 	cudaError err = call; \
18 | 	if( cudaSuccess != err) { \
19 | 		fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
20 | 			__FILE__, __LINE__, cudaGetErrorString( err) ); \
21 | 		exit(EXIT_FAILURE); \
22 | } }
23 | 
24 | void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop){
25 | 	CUDA_SAFE_CALL( cudaEventCreate(start) );
26 | 	CUDA_SAFE_CALL( cudaEventCreate(stop) );
27 | 	CUDA_SAFE_CALL( cudaEventRecord(*start, 0) );
28 | }
29 | 
30 | float finalizeEvents(cudaEvent_t start, cudaEvent_t stop){
31 | 	CUDA_SAFE_CALL( cudaGetLastError() );
32 | 	CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );
33 | 	CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
34 | 	float kernel_time;
35 | 	CUDA_SAFE_CALL( cudaEventElapsedTime(&kernel_time, start, stop) );
36 | 	CUDA_SAFE_CALL( cudaEventDestroy(start) );
37 | 	CUDA_SAFE_CALL( cudaEventDestroy(stop) );
38 | 	return kernel_time;
39 | }
40 | 
41 | template <typename T>
42 | void run(char * name, T scal, int type_size, int threads){
43 | 	char * file_name = concat(name, ".cubin");
44 | 
45 | 	int *output;
46 | 	cudaMalloc((void**)&output, sizeof(int)*32);
47 | 	cudaMemset(output, 0, 32*sizeof(int));
48 | 
49 | 	CUmodule module;
50 | 	CUfunction kernel;
51 | 
52 | 	cuModuleLoad(&module, file_name);
53 | 	cuModuleGetFunction(&kernel, module, "kern");
54 | 
55 | 	int blk_size = 32;
56 | 	int total_blks = 1;//size/blk_size;
57 | 	int sh_mem_size = blk_size*sizeof(float)*type_size;
58 | 	void * args[2] = {&scal, &output};
59 | 
60 | 	//cudaEvent_t start, stop;
61 | 	//initializeEvents(&start, &stop);
62 | 	cuLaunchKernel(kernel, total_blks, 1, 1,
63 | 		blk_size, 1, 1,
64 | 		sh_mem_size, 0, args, 0);
65 | 	//float krn_time_shmem_32b = finalizeEvents(start, stop);
66 | 
67 | 	int *output_h = (int*)malloc(sizeof(int)*32);
68 | 
69 | 	cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
70 | 
71 | 	/*for(int i=0; i<32; i++){
72 | 		printf("%d ", output_h[i]);
73 | 	}printf("\n");*/
74 | 
75 | 	printf("%s took %d clocks.\n", name, output_h[0]);
76 | 	double clocks_instr = (float)output_h[0]/(128.0*128.0); // workload of a thread
77 | 	printf("Each instruction takes %.2f clocks.\n", clocks_instr);
78 | 	printf("Throughput %.2f bytes/cycle.\n\n", ((double)threads*128*128*type_size*4)/output_h[0]); // bytes stored by the participating threads / elapsed clocks
79 | 
80 | 	cudaFree(output);
81 | 	free(output_h);
82 | }
83 | 
84 | int main(){
85 | 	float scal = 4;
86 | 	run("sts32", scal, 1, 32);
87 | 	printf("\n");
88 | 	float2 scal2;
89 | 	scal2.x = 4; scal2.y = 4;
90 | 	run("sts64", scal2, 2, 32);
91 | 	printf("\n");
92 | 	float4 scal4;
93 | 	scal4.x = 4; scal4.y = 4;
94 | 	scal4.z = 4; scal4.w = 4;
95 | 	// No thread-divergence
96 | 	run("sts128_0", scal4, 4, 32);
97 | 
98 | 	printf("\n");
99 | 	// Only half of the threads store data
100 | 	run("sts128", scal4, 4, 16);
101 | 
102 | 	/* run2_aux("sts64_2bank_conflict");
103 | 	printf("\n");
104 | 	run2_aux("sts64_broadcast");
105 | 	printf("\n");
106 | 	run2_aux("sts64_opt3");
107 | 	printf("\n"); */
108 | 
109 | 	return 0;
110 | }
111 | 
--------------------------------------------------------------------------------
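cuModuleLoad, cuModuleGetFunction, and cuLaunchKernel return CUresult codes that both harnesses discard; if a .cubin is missing (TuringAs was not run) or targets the wrong SM, the printed clock counts are garbage. A minimal checking sketch one could wrap around those calls follows; CU_CHECK is a name invented here, not repo code:

```
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper (not in the repo): abort with a readable message when
// a CUDA driver API call fails, e.g. a missing or mis-targeted .cubin.
#define CU_CHECK(call)                                            \
    do {                                                          \
        CUresult res_ = (call);                                   \
        if (res_ != CUDA_SUCCESS) {                               \
            const char *msg_ = "unknown error";                   \
            cuGetErrorString(res_, &msg_);                        \
            fprintf(stderr, "%s:%d: %s -> %s\n",                  \
                    __FILE__, __LINE__, #call, msg_);             \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Usage inside run():
//   CU_CHECK(cuModuleLoad(&module, file_name));
//   CU_CHECK(cuModuleGetFunction(&kernel, module, "kern"));
```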
/bench/smem/STS/sts.sass:
--------------------------------------------------------------------------------
1 | 
2 | scal, 8
3 | output, 8
4 | 
5 | 
6 | 
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 | 
9 | 
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 6, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 | 
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 | 
19 | LOOP:
20 | 
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(1024):
24 |     if i == 64:
25 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 |     out.append(f'--:-:-:-:1 STS.128 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 | 
30 | 
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 | 
35 | 
36 | 
37 | 
38 | --:-:-:-:2 EXIT;
39 | 
--------------------------------------------------------------------------------
/bench/smem/STS/sts128.sass:
--------------------------------------------------------------------------------
1 | 
2 | scal, 8
3 | output, 8
4 | 
5 | 
6 | 
7 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux
8 | 
9 | 
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 
14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
16 | 04:-:1:-:6 IMAD aux, aux, -1, tid;
17 | 
18 | --:-:-:-:1 ISETP.GT.AND P0, PT, aux, 8, PT;
19 | 02:-:-:-:6 SHF.L
offset, tid, 4, RZ; 20 | --:-:-:-:2 MOV tmp, scal; 21 | --:-:-:-:5 MOV iter, RZ; 22 | 23 | --:-:-:-:2 @P0 EXIT; 24 | 25 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 26 | 27 | LOOP: 28 | 29 | out = [] 30 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 31 | for i in range(128): 32 | if i == 64: 33 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 34 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;') 35 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 36 | out_ = '\n'.join(out) + '\n' 37 | 38 | 39 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 40 | --:-:-:-:5 IADD3 end, end, -start, RZ; 41 | --:-:-:-:2 STG.E.GPU [output0], end; 42 | 43 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 44 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 45 | #--:-:-:-:2 STG.E.GPU [output0], iter; 46 | 47 | 48 | 49 | --:-:-:-:2 EXIT; 50 | -------------------------------------------------------------------------------- /bench/smem/STS/sts128_0.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | 37 | 38 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /bench/smem/STS/sts32.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.32 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | --:-:-:-:2 EXIT; 37 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | 
LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | 37 | 38 | --:-:-:-:2 EXIT; 39 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_2bank_conflict.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 04:-:1:-:6 IMAD aux2, tid, 2, RZ; 15 | 16 | 02:-:-:-:6 SHF.R aux, aux2, 5, RZ; 17 | 02:-:-:-:6 SHF.L aux, aux, 5, RZ; 18 | 04:-:1:-:6 IMAD aux, aux, -1, aux2; 19 | 20 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 21 | 04:-:1:-:6 IMAD aux, aux2, 32, aux; #tid/16*tid%16 22 | 23 | --:-:-:-:2 MOV tmp, scal; 24 | --:-:-:-:5 MOV iter, RZ; 25 | 26 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ; 27 | 28 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 29 | 30 | LOOP: 31 | 32 | out = [] 33 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 34 | for i in range(128): 35 | if i == 64: 36 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 37 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 38 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 39 | out_ = '\n'.join(out) + '\n' 40 | 41 | 42 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 43 | --:-:-:-:5 IADD3 end, end, -start, RZ; 44 | --:-:-:-:2 STG.E.GPU [output0], end; 45 | 46 | #--:-:-:-:5 MOV aux2, aux; 47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 49 | #--:-:-:-:2 STG.E.GPU [output0], aux2; 50 | 51 | --:-:-:-:2 EXIT; 52 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_broadcast.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ; 15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ; 16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16 17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT; 18 | 19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*tid%16 21 | 22 | --:-:-:-:2 MOV tmp, scal; 23 | --:-:-:-:5 MOV iter, RZ; 24 | 25 | --:-:-:-:2 @P0 BRA JMP; 26 | --:-:-:-:5 IADD3 aux, aux, -8, RZ; 27 | 28 | JMP: 29 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ; 30 | 31 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 32 | 33 | LOOP: 34 | 35 | out = [] 36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 37 | for i in range(128): 38 | if i == 64: 39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 41 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 42 | out_ = '\n'.join(out) + '\n' 43 | 44 | 45 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 46 | --:-:-:-:5 IADD3 end, end, -start, RZ; 47 | --:-:-:-:2 STG.E.GPU [output0], end; 48 | 49 | #--:-:-:-:5 MOV aux2, aux; 50 | #02:-:-:-:6 SHF.L aux, 
tid, 2, RZ; 51 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 52 | #--:-:-:-:2 STG.E.GPU [output0], aux; 53 | 54 | 55 | --:-:-:-:2 EXIT; 56 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_opt3.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ; 15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ; 16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16 17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT; 18 | 19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*tid%16 21 | 22 | --:-:-:-:2 MOV tmp, scal; 23 | --:-:-:-:5 MOV iter, RZ; 24 | 25 | --:-:-:-:2 @P0 BRA JMP; 26 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ; 27 | 28 | JMP: 29 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ; 30 | 31 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 32 | 33 | LOOP: 34 | 35 | out = [] 36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 37 | for i in range(128): 38 | if i == 64: 39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 41 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 42 | out_ = '\n'.join(out) + '\n' 43 | 44 | 45 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 46 | --:-:-:-:5 IADD3 end, end, -start, RZ; 47 | --:-:-:-:2 STG.E.GPU [output0], end; 48 | 49 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 50 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 51 | ##--:-:-:-:2 STG.E.GPU [output0], aux2; 52 | #--:-:-:-:2 STG.E.GPU [output0], aux2; 53 | 54 | 55 | --:-:-:-:2 EXIT; 56 | -------------------------------------------------------------------------------- /src/FX_m2.cu: -------------------------------------------------------------------------------- 1 | 2 | // Copyright 2021 Roberto Lopez Castro 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
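// The f_row*/f_col* helpers below implement, row by row and column by
// column, the F(2x2, 3x3) Winograd filter-transform matrix
//
//         [  1     0     0  ]
//     G = [ 1/2   1/2   1/2 ]
//         [ 1/2  -1/2   1/2 ]
//         [  0     0     1  ]
//
// so FX computes G w G^T for each 3x3 filter w, producing a 4x4 tile
// (e.g. f_row2 evaluates row 2 of G against column j of w).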
15 | 16 | 17 | #ifndef _FX_ 18 | #define _FX_ 19 | extern "C" 20 | { 21 | 22 | // Set of functions per row in Gw product 23 | __device__ float f_row1(float *Gw, int j){ 24 | return Gw[j]; 25 | } 26 | __device__ float f_row2(float *Gw, int j){ 27 | return 0.5*(Gw[j] + Gw[6+j] + Gw[3+j]); 28 | } 29 | __device__ float f_row3(float *Gw, int j){ 30 | return 0.5*(Gw[j] + Gw[6+j] - Gw[3+j]); 31 | } 32 | __device__ float f_row4(float *Gw, int j){ 33 | return Gw[6+j]; 34 | } 35 | // Set of functions per column in GwGt product 36 | __device__ float f_col1(float *Gw, int j){ 37 | return Gw[j]; 38 | } 39 | __device__ float f_col2(float *Gw, int j){ 40 | return 0.5*(Gw[j] + Gw[j+2] + Gw[j+1]); 41 | } 42 | __device__ float f_col3(float *Gw, int j){ 43 | return 0.5*(Gw[j] + Gw[j+2] - Gw[j+1]); 44 | } 45 | __device__ float f_col4(float *Gw, int j){ 46 | return Gw[j+2]; 47 | } 48 | 49 | typedef float(*pointFunction_t)(float *, int); 50 | 51 | __global__ void FX(float *pInputs, float *pOutputs, int filt_k, 52 | int filt_c, int filt_h, int filt_w, int alpha){ 53 | int Inx = threadIdx.x, Iny = threadIdx.y; 54 | int TileX = blockIdx.x, TileY = blockIdx.y; 55 | 56 | int c_glb_offset = filt_k*filt_h*filt_w; 57 | int c_kernel = TileY*BC*c_glb_offset + TileX*BK + Iny*c_glb_offset + Inx; 58 | int c_glb_offset_s = filt_k*4*4; 59 | int c_kernel_s = TileY*BC*c_glb_offset_s + TileX*BK + Iny*c_glb_offset_s + Inx; 60 | 61 | float Gw[21]; //9+12. In registers 62 | float *Gw_buffer = Gw+9; 63 | 64 | pointFunction_t func1[4] = {f_row1, f_row2, f_row3, f_row4}; 65 | pointFunction_t func2[4] = {f_col1, f_col2, f_col3, f_col4}; 66 | 67 | for(int bk=0; bk>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 273 | 274 | #ifdef OPTSTS64_CMP 275 | smem_size = 65536; // 64 KB 276 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); 277 | #endif 278 | 279 | Winograd_kernel<<>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w); 280 | 281 | return cudaGetLastError(); 282 | } 283 | 284 | } 285 | #endif 286 | -------------------------------------------------------------------------------- /src/ampere/convolutionForward_32x64x8_baseline.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
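// This kernel follows the canonical F(2x2, 3x3) pipeline:
// load_and_transform_input_tile applies the input transform B^T d B, with
//
//           [ 1   0  -1   0 ]
//     B^T = [ 0   1   1   0 ]
//           [ 0  -1   1   0 ]
//           [ 0   1   0  -1 ]
//
// applied along both dimensions of the 4x4 input tile d; the outer-product
// stage multiplies transformed input and filter tiles element-wise, and the
// included store_and_transform_output_*.cuh applies the output transform.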
14 | 15 | 16 | #include "../FX_m2.cu" 17 | #include "store_and_transform_output_baseline.cuh" 18 | #include "../outer_product.cuh" 19 | 20 | #ifdef _noWALL_ 21 | typedef struct rusage resnfo; 22 | typedef struct _timenfo { 23 | double time; 24 | double systime; 25 | } timenfo; 26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 28 | t.time + t.systime, t.time, t.systime); 29 | #else 30 | typedef struct timeval resnfo; 31 | typedef double timenfo; 32 | #define timestamp(sample) gettimeofday((sample), 0) 33 | #define printtime(t) printf("%15f s ", t); 34 | #endif 35 | 36 | #ifndef _WINOGRAD_ 37 | #define _WINOGRAD_ 38 | extern "C" 39 | { 40 | 41 | 42 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 43 | 44 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, 45 | int tiles_dim, int in_c, int in_n, int tile_size, 46 | int tiles_2d_dim, int tile_2d_s){ 47 | 48 | float workspace[3]; 49 | 50 | #pragma unroll 51 | for(int j=0; j<4; j++){ 52 | workspace[0] = Btd[j]; 53 | workspace[1] = Btd[j+4]; 54 | workspace[2] = Btd[j+8]; 55 | 56 | Btd[j] = workspace[0] - workspace[2]; 57 | Btd[j+4] = workspace[1] + workspace[2]; 58 | Btd[j+8] = workspace[2] - workspace[1]; 59 | Btd[j+12] = workspace[1] - Btd[j+12]; 60 | } 61 | 62 | int c_offset = BN*BC; 63 | int c_tensor = threadIdx.y*BN + threadIdx.x; 64 | 65 | #pragma unroll 66 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 67 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 68 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 69 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 70 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 71 | } 72 | 73 | } 74 | 75 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs, 76 | int filt_c, int filt_k){ 77 | 78 | int c_tensor_s = threadIdx.y*BK + threadIdx.x; 79 | int c_offset_s = BK*BC; 80 | 81 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 82 | for(int i=0; i<4; i++){ 83 | #pragma unroll 84 | for(int j=0; j<4; j++){ 85 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 86 | } 87 | } 88 | 89 | c_tensor_s += BN; 90 | } 91 | 92 | } 93 | 94 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k){ 95 | 96 | int c_tensor = blockIdx.z*BK + (threadIdx.y*filt_k<<4) + threadIdx.x; // Iny*filt_k*4*4 97 | 98 | int acumm; 99 | #pragma unroll 100 | for(int i=0; i<4; i++){ 101 | acumm = (i*filt_k<<2); 102 | #pragma unroll 103 | for(int j=0; j<4; j++){ 104 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 105 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 106 | } 107 | } 108 | } 109 | 110 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int tiles_dim, short mask){ 111 | 112 | int c_tensor = (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*in_w*2 + blockIdx.x*BN + threadIdx.y*(in_n*in_h*in_w) + (threadIdx.x/in_n)*2*in_n + (threadIdx.x%in_n) - (in_n*in_w+in_n); 113 | int acumm,x; 114 | //short x1,x2; 115 | 116 | if(mask==0xFFFF){ 117 | #pragma unroll 118 | for(int i=0; i<4; i++){ 119 | acumm = i*in_n*in_w; 120 | #pragma unroll 121 | for(int j=0; j<4; j++){ 122 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 123 | } 124 | } 125 | 126 | } else { 127 | for(int i=0; i<4; i++){ 128 | 
acumm = i*in_n*in_w; 129 | #pragma unroll 130 | for(int j=0; j<4; j++){ 131 | x = (i<<2) + j; 132 | tile[x] = 0; 133 | if(mask&(1<>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 267 | 268 | Winograd_kernel<<>>(k, Ww, C, 269 | tiles_dim, in_c, in_n, in_h, in_w, tile_size, 270 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, 271 | out_h, out_w); 272 | 273 | return cudaGetLastError(); 274 | } 275 | 276 | } 277 | #endif 278 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_baseline.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_PPoPP_ 18 | #define _OUTPUT_KERNEL_PPoPP_ 19 | extern "C" 20 | { 21 | 22 | __device__ void transform_output_tile(float *pOutputs, float *C_tile, float *At, int out_h, int out_w, 23 | int tiles_dim, int round, int in_n, int offset, int out_thread[][4], short mask, int c_tensor, int c_glb_offset){ 24 | 25 | for(int j=0; j<4; j++){ 26 | At[j] = C_tile[j] + C_tile[4+j] + C_tile[8+j]; 27 | At[j+8] = C_tile[j+16] + C_tile[4+j+16] + C_tile[8+j+16]; 28 | 29 | At[4+j] = C_tile[4+j] - C_tile[8+j] - C_tile[12+j]; 30 | At[4+j+8] = C_tile[4+j+16] - C_tile[8+j+16] - C_tile[12+j+16]; 31 | } 32 | 33 | int idx = out_thread[round][threadIdx.y%4] + threadIdx.y/4 + offset; 34 | c_tensor += idx*c_glb_offset; 35 | int x, x1; 36 | 37 | for(int i=0; i<2; i++){ 38 | x = i*4; 39 | //x1 = i*(in_n*(tiles_dim-1) + in_n/2)*2; 40 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2)*2; 41 | if(mask&(1<<(i*2))){ 42 | pOutputs[c_tensor+ x1] = At[x] + At[x+1] + At[x+2]; 43 | pOutputs[c_tensor+2*c_glb_offset+x1] = At[x+8] + At[x+1+8] + At[x+2+8]; 44 | } 45 | 46 | if(mask&(1<<(i*2+1))){ 47 | pOutputs[c_tensor+x1+in_n] = At[x+1] - At[x+2] - At[x+3]; 48 | pOutputs[c_tensor+2*c_glb_offset+x1+in_n] = At[x+1+8] - At[x+2+8] - At[x+3+8]; 49 | } 50 | } 51 | 52 | } 53 | 54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, int out_thread[][4], int access_s_out[][16], short mask){ 55 | 56 | float4 *output_smem = (float4 *) shared_mem; 57 | float4 *accumulator = (float4 *) acumm_smem; 58 | 59 | float *C_tile = (float*) input_frag_mem; 60 | float *At = (float*) filter_frag_mem; 61 | 62 | mask = 0x000F; 63 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 64 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 65 | 66 | // output transpose step 67 | int t=0; 68 | int acumm1, acumm2; 69 | 70 | acumm1 = access_s_out[0][threadIdx.x%8 + (threadIdx.x/16)*8]; 71 | acumm2 = access_s_out[1][threadIdx.x%8 + (threadIdx.x/16)*8]; 72 | 73 | int offset = BN_p*4; 74 | int init = (threadIdx.y/4)*BN_p*16*4 + (threadIdx.y%4)*40 + threadIdx.x; 
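// init is this thread's base element in the staging buffer: BN_p (see
// config.hpp) is the padded row pitch, presumably chosen so the 16 strided
// reads below, at stride offset = BN_p*4 floats, avoid shared-memory bank
// conflicts.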
75 | int acumm3 = threadIdx.y * BN_p; 76 | int acumm4 = BN_p*8*2; 77 | 78 | int idx = acumm3; 79 | int idx2 = idx + BN_p*8; 80 | 81 | float* out = (float *) output_smem; 82 | 83 | int c_glb_offset = in_n*out_h*out_w; 84 | int c_tensor = blockIdx.z*in_n*out_h*out_w*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + threadIdx.x; 85 | 86 | //#pragma unroll 87 | for(int round=0; round<4; round++){ 88 | 89 | //transformation step 90 | if ( ((!round || round==1) && (threadIdx.x&15)<8) || ((round==2 || round==3) && (threadIdx.x&15)>7) ){ 91 | 92 | #pragma unroll 93 | for(int i=0; i<4; i+=2){ 94 | 95 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0 96 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1); 97 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1 98 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t); 99 | 100 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t); 101 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t); 102 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t); 103 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t); 104 | 105 | t+=4; 106 | } 107 | } 108 | __syncthreads(); 109 | 110 | for(int i=0; i<16; i++){ 111 | C_tile[i] = out[init + i*offset]; 112 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset]; 113 | } 114 | 115 | // transform output tiles 116 | transform_output_tile(C, C_tile, At, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset); 117 | 118 | 119 | __syncthreads(); 120 | } 121 | 122 | } 123 | 124 | } 125 | #endif 126 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optLDS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
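// All store_and_transform_output_* variants finish with the F(2x2, 3x3)
// output transform A^T X A, where
//
//     A^T = [ 1   1   1   0 ]
//           [ 0   1  -1  -1 ]
//
// transform_output_tile below evaluates it on two output channels at once
// (the .x/.y lanes of each float2); the variants differ in how the 4x4
// accumulator tiles are staged through shared memory first.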
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT3_ 18 | #define _OUTPUT_KERNEL_OPT3_ 19 | extern "C" 20 | { 21 | 22 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 23 | { 24 | 25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2; 26 | int x, x1; 27 | 28 | for(int j=0; j<4; j++){ 29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 31 | 32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 34 | } 35 | 36 | for(int i=0; i<2; i++){ 37 | x = i*4; 38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 39 | if(mask&(1<<(i*2))){ 40 | 41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 43 | } 44 | if(mask&(1<<(i*2+1))){ 45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 47 | } 48 | } 49 | 50 | } 51 | 52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, 53 | float *C, int out_h, int out_w, 54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[threadIdx.x]; //* 4 72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*8*2 ; //*4 76 | int idx = threadIdx.y * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p; //*2/2 81 | int init = (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2); 82 | init += (threadIdx.x/16)*8 + ((threadIdx.x/8)%2)*16 + (threadIdx.x%8); //40=(8+2)*4, 4 blocks/buffer 83 | 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + ((threadIdx.x/8)%2)*2 + (threadIdx.x%8)*2*2 + ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 87 | c_tensor/=2; 88 | 89 | #pragma unroll 90 | for(int round=0; round<4; round++){ 91 | 92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 96 | 97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 101 | 102 | *( (float2*) (output_smem + idx + 
acumm4 + acumm1) ) = *(accumulator+t+4); 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 106 | 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 111 | 112 | t+=8; 113 | 114 | __syncthreads(); 115 | 116 | 117 | for(int i=0; i<16; i++){ 118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4 119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4 120 | } 121 | 122 | // transform output tiles 123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w); 124 | 125 | __syncthreads(); 126 | 127 | } 128 | } 129 | 130 | } 131 | #endif 132 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optSTS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
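// optSTS64: accumulators are drained to shared memory as 64-bit (float2)
// stores, with the acumm1/acumm2 swizzle computed in store_output_tile
// apparently arranged so the stores issue without bank conflicts (the
// pattern measured by bench/smem/STS/sts64*.sass); four barrier-separated
// rounds then read the staged tiles back for the output transform.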
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT1_ 18 | #define _OUTPUT_KERNEL_OPT1_ 19 | extern "C" 20 | { 21 | 22 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 23 | { 24 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 25 | int x, x1; 26 | 27 | #pragma unroll 28 | for(int j=0; j<4; j++){ 29 | 30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 32 | 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | 38 | #pragma unroll 39 | for(int i=0; i<2; i++){ 40 | x = i*4; 41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 42 | if(mask&(1<<(i*2))){ 43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 45 | } 46 | 47 | if(mask&(1<<(i*2+1))){ 48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 50 | } 51 | } 52 | } 53 | 54 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[Inx]; //* 4 72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*16 ; //*4 76 | int idx = threadIdx.y * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p *2; //*2/2 81 | int init = ( (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2) ) *2 + threadIdx.x; 82 | 83 | int c_glb_offset = in_n*out_h*out_w; 84 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+ 85 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 86 | c_tensor/=2; 87 | 88 | #pragma unroll 89 | for(int round=0; round<4; round++){ 90 | 91 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 92 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 93 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 94 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 95 | 96 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 97 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 98 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 99 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 100 | 101 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 102 | *( (float2*) (output_smem + 
idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 103 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 105 | 106 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 110 | 111 | t+=8; 112 | 113 | __syncthreads(); 114 | 115 | 116 | for(int i=0; i<16; i++){ 117 | C_tile[i].x = shared_mem[i*offset + init]; 118 | C_tile[i].y = shared_mem[i*offset + init + 32]; 119 | } 120 | 121 | // transform output tiles 122 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w); 123 | __syncthreads(); 124 | } 125 | } 126 | 127 | } 128 | #endif 129 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optSTS64_compact.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT1_ 18 | #define _OUTPUT_KERNEL_OPT1_ 19 | extern "C" 20 | { 21 | 22 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 23 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, 24 | short mask, int out_w){ 25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 26 | int x, x1; 27 | 28 | 29 | #pragma unroll 30 | for(int j=0; j<4; j++){ 31 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 32 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | x = in_n/2; 38 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x; 39 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y; 40 | 41 | if(mask&0x2){ 42 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x; 43 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y; 44 | } 45 | 46 | //x1 = in_n*(tiles_dim-1) + x; 47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x; 48 | if(mask&0x4){ 49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x; 50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y; 51 | } 52 | 53 | if(mask&0x8){ 54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x; 55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y; 56 | } 57 | } 58 | 59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 60 | 61 | float2 *output_smem = (float2 *) shared_mem; 62 | float2 *accumulator = (float2 *) acumm_smem; 63 | float2 *C_out = (float2*)C; 64 | 65 | float2 *C_tile = (float2*) input_frag_mem; 66 | float2 *At = (float2*) filter_frag_mem; 67 | 68 | mask = 0x000F; 69 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 70 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 71 | 72 | // output transpose step 73 | int t,j; 74 | int acumm1, acumm2; 75 | // For transposing 76 | t = threadIdx.x%8/2; 77 | acumm1 = t*18 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 78 | acumm2 = acumm1+4; 79 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16; 80 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16; 81 | t=0; 82 | 83 | int acumm4 = BN_p*16 ; //*4 84 | int idx = threadIdx.y * BN_p; 85 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 86 | 87 | // For transformating 88 | int offset = BN_p *2; //*2/2 89 | 90 | int init = (threadIdx.y%4)*(16+2)*2 + threadIdx.x; 91 | init = init - init/((threadIdx.y%4+1)*32)*32 + threadIdx.y%4*32; 92 | init += (threadIdx.y/4)*BN_p*16*2; 93 | 94 | int c_glb_offset = in_n*out_h*out_w; 95 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+ 96 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 97 | c_tensor/=2; 98 | 99 | // k=0, block 0 100 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator); 101 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1); 102 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2); 103 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3); 104 | 105 | // K=1, block 0 106 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4); 107 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5); 108 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6); 109 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7); 110 | 111 | // k=0, block 1 112 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32); 113 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33); 114 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34); 115 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35); 116 | 117 | // K=1, block 1 118 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36); 119 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37); 120 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38); 121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39); 122 | 123 | j=0; t+=8; 124 | 125 | #pragma unroll 126 | for(int round=0; round<3; round++){ 127 | 128 | __syncthreads(); 129 | 130 | int disp = j/2*(BN_p*2*16)*2; 131 | #pragma unroll 132 | for(int i=0; i<16; i++){ 133 | C_tile[i].x = shared_mem[disp + i*offset + init]; 134 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 135 | } 136 | 137 | // transform output tiles 138 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 139 | 140 | j = 2 - j; //switch between 0 and 2 141 | 142 | // k=0, block 0 143 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t); 144 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1); 145 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2); 146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3); 147 | 148 | // K=1, block 0 149 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4); 150 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5); 151 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6); 152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7); 153 | 154 | // k=0, block 1 155 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32); 156 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33); 157 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34); 158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35); 159 | 160 | // K=1, block 1 161 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36); 162 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37); 163 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38); 164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39); 165 | 166 | t+=8; 167 | 168 | } 169 | 170 | __syncthreads(); 171 | 172 | int disp = j/2*(BN_p*2*16)*2; 173 | #pragma unroll 174 | for(int i=0; i<16; i++){ 175 | C_tile[i].x = shared_mem[disp + i*offset + init]; 176 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 177 | } 178 | // transform output tiles 179 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 180 | } 181 | 182 | } 183 | #endif 184 | -------------------------------------------------------------------------------- /src/config.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | #ifndef COMMON_INCLUDE_FILE 17 | #define COMMON_INCLUDE_FILE 18 | 19 | #define BC 8 20 | #define BN 32 21 | #define BK 64 22 | ///////////////////// For Non-Fused version 23 | #define BC_GEMM 8 24 | #define BN_GEMM 128 25 | #define BK_GEMM 128 26 | ///////////////////// For Non-Fused version 27 | 28 | #ifdef OPTSTS64_CMP 29 | #define BN_p 128 30 | #elif BASE 31 | #define BN_p 40 32 | #else 33 | #define BN_p 138 34 | #endif 35 | 36 | #define N 128 // values: 32,64,96,128 37 | #define C_in 256 // values: 64,128,256,512 38 | #define W 14 // values: 56,28,14,7 39 | 40 | #define K 256 // values: 64,128,256,512 41 | #define R 3 // values: 3 42 | 43 | #define PAD_H 1 44 | #define PAD_W 1 45 | #define STR_H 1 46 | #define STR_W 1 47 | #define DIL_H 1 48 | #define DIL_W 1 49 | 50 | #define M 2 // values: 2 51 | 52 | __constant__ int access_f_s[2][32]; 53 | __constant__ int access_s[2][32]; 54 | #ifndef BASE 55 | __constant__ int access_s_out[32]; 56 | __constant__ int out_thread[2][4][4]; 57 | __constant__ int out_sgemm[32]; 58 | __constant__ int exhange[32]; 59 | #else 60 | __constant__ int access_s_out[2][16]; 61 | __constant__ int out_thread[4][4]; 62 | #endif 63 | 64 | 65 | // access_f_s 66 | const int aux[2][32] = { 67 | {0,0,1,1,2,2,3,3,4,4,5,5,6,6, 68 | 7,7,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7}, 69 | {8,8,9,9,10,10,11,11,12,12,13,13, 70 | 14,14,15,15,8,8,9,9,10,10,11,11,12,12, 71 | 13,13,14,14,15,15} 72 | }; 73 | // access_s 74 | const int aux2[2][32] = { 75 | {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,2, 76 | 3,2,3,2,3,2,3,2,3,2,3,2,3,2,3}, 77 | {4,5,4,5,4,5,4,5,4,5,4, 78 | 5,4,5,4,5,6,7,6,7,6,7,6,7, 79 | 6,7,6,7,6,7,6,7} 80 | }; 81 | 82 | #ifndef BASE 83 | // access_s_out 84 | const int aux3[32] = { 85 | 0,1,34,35,68,69,102,103, // first quarter 86 | 8,9,42,43,76,77,110,111, // second quarter 87 | 2,3,36,37,70,71,104,105, // third quarter 88 | 10,11,44,45,78,79,112,113 // fourth quarter 89 | }; 90 | // out_thread 91 | const int aux4[2][4][4] = { {{0,4,8,12}, {2,6,10,14}, 92 | {32,36,40,44}, {34,38,42,46} }, 93 | {{16,20,24,28}, {18,22,26,30}, 94 | {48,52,56,60}, {50,54,58,62}}}; 95 | // out_sgemm 96 | const int aux5[32] = { 0,1,8,9,16,17,24,25, 97 | 32,33,40,41,48,49,56,57, 98 | 2,3,10,11,18,19,26,27, 99 | 34,35,42,43,50,51,58,59 100 | }; 101 | // exhange 102 | const int aux6[32] = { 103 | 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13, 104 | 18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29 105 | }; 106 | 107 | #else 108 | const int aux3[2][16] = { 109 | {0,1,10,11,20,21,30,31, 2,3,12,13,22,23,32,33}, 110 | {4,5,14,15,24,25,34,35, 6,7,16,17,26,27,36,37} 111 | }; 112 | const int aux4[4][4] = { {0,4,8,12}, {32,36,40,44}, {16,20,24,28}, {48,52,56,60} }; 113 | #endif 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- 
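A quick worked example of what these macros imply for the default configuration (a hedged sketch; the snippet is illustrative and not part of the repository): with M=2 and R=3 the Winograd tile is alpha = M+R-1 = 4, a W=14 feature map yields 7 output tiles per spatial dimension, and the fused kernel's shared-memory footprint follows from BN/BK/BC as 16*BC*BN floats of transformed input plus 16*BC*BK floats of transformed filters (16 KiB + 32 KiB for the values above).

    // sizes_sketch.cu -- illustrative only; constants mirror config.hpp defaults
    #include <cstdio>
    int main() {
      const int m = 2, r = 3;
      const int alpha = m + r - 1;                 // 4x4 input tile, 2x2 output tile
      const int W = 14, BN = 32, BK = 64, BC = 8;
      const int tiles_dim = (W + m - 1) / m;       // 7 tiles per spatial dimension
      const size_t in_smem   = (size_t)alpha * alpha * BC * BN * sizeof(float);
      const size_t filt_smem = (size_t)alpha * alpha * BC * BK * sizeof(float);
      printf("alpha=%d tiles_dim=%d smem=%zu+%zu B\n", alpha, tiles_dim, in_smem, filt_smem);
      return 0;
    }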
/src/convolutionForward_32x64x8.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | #include "FX_m2.cu" 17 | 18 | #ifdef OPTLDS64 19 | #include "store_and_transform_output_optLDS64.cuh" 20 | #include "outer_product.cuh" 21 | #elif OPTSTS64_CMP 22 | #include "store_and_transform_output_optSTS64_compact.cuh" 23 | #include "outer_product_suffle.cuh" 24 | #else 25 | #include "store_and_transform_output_optSTS64.cuh" 26 | #include "outer_product_suffle.cuh" 27 | #endif 28 | 29 | #ifdef _noWALL_ 30 | typedef struct rusage resnfo; 31 | typedef struct _timenfo { 32 | double time; 33 | double systime; 34 | } timenfo; 35 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 36 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 37 | t.time + t.systime, t.time, t.systime); 38 | #else 39 | typedef struct timeval resnfo; 40 | typedef double timenfo; 41 | #define timestamp(sample) gettimeofday((sample), 0) 42 | #define printtime(t) printf("%15f s ", t); 43 | #endif 44 | 45 | #ifndef _WINOGRAD_ 46 | #define _WINOGRAD_ 47 | extern "C" 48 | { 49 | 50 | 51 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 52 | 53 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, 54 | int tiles_dim, int in_c, int in_n, int tile_size, 55 | int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY){ 56 | 57 | float workspace[3]; 58 | 59 | #pragma unroll 60 | for(int j=0; j<4; j++){ 61 | workspace[0] = Btd[j]; 62 | workspace[1] = Btd[j+4]; 63 | workspace[2] = Btd[j+8]; 64 | 65 | Btd[j] = workspace[0] - workspace[2]; 66 | Btd[j+4] = workspace[1] + workspace[2]; 67 | Btd[j+8] = workspace[2] - workspace[1]; 68 | Btd[j+12] = workspace[1] - Btd[j+12]; 69 | } 70 | 71 | int c_offset = BN*BC; 72 | int c_tensor = Iny*BN + Inx; 73 | 74 | #pragma unroll 75 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 76 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 77 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 78 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 79 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 80 | } 81 | 82 | } 83 | 84 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs, 85 | int filt_c, int filt_k, int Inx, int Iny){ 86 | 87 | int c_tensor_s = Iny*BK + Inx; 88 | int c_offset_s = BK*BC; 89 | 90 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 91 | for(int i=0; i<4; i++){ 92 | for(int j=0; j<4; j++){ 93 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 94 | } 95 | } 96 | 97 | c_tensor_s += BN; 98 | } 99 | 100 | } 101 | 102 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles, 103 | int filt_k, int Inx, int Iny, int TileZ){ 104 | 105 | int c_tensor 
= TileZ*BK + (Iny*filt_k<<4) + Inx; 106 | 107 | int acumm; 108 | #pragma unroll 109 | for(int i=0; i<4; i++){ 110 | acumm = (i*filt_k<<2); 111 | for(int j=0; j<4; j++){ 112 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 113 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 114 | } 115 | } 116 | } 117 | 118 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, 119 | int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask){ 120 | 121 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n); 122 | int acumm,x; 123 | 124 | if(mask==0xFFFF){ 125 | 126 | for(int i=0; i<4; i++){ 127 | acumm = i*in_n*in_w; 128 | #pragma unroll 129 | for(int j=0; j<4; j++){ 130 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 131 | } 132 | } 133 | 134 | } else { 135 | 136 | for(int i=0; i<4; i++){ 137 | acumm = i*in_n*in_w; 138 | #pragma unroll 139 | for(int j=0; j<4; j++){ 140 | x = (i<<2) + j; 141 | tile[x] = 0; 142 | if(mask&(1<>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 275 | 276 | #ifdef OPTSTS64_CMP 277 | smem_size = 65536; // 64 KB 278 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); 279 | #endif 280 | 281 | Winograd_kernel<<>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w); 282 | 283 | return cudaGetLastError(); 284 | } 285 | 286 | } 287 | #endif 288 | -------------------------------------------------------------------------------- /src/convolutionForward_32x64x8_baseline.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
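// Reader's sketch of this file's control flow (inferred from the code below;
// hedged, not authoritative): Winograd_kernel fuses the whole F(2x2, 3x3)
// pipeline. Each thread prefetches one 4x4 input tile and two filter tiles
// that were pre-transformed by the separate FX kernel; the B^T*d*B input
// transform is written to shared memory; a BN x BK outer-product GEMM then
// sweeps the channel dimension in BC-deep slices, double-buffering the
// register fragments; finally store_output_tile applies the A^T*M*A output
// transform and writes the 2x2 results to global memory.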
14 | 15 | 16 | #include "FX_m2.cu" 17 | #include "store_and_transform_output_baseline.cuh" 18 | #include "outer_product.cuh" 19 | 20 | #ifdef _noWALL_ 21 | typedef struct rusage resnfo; 22 | typedef struct _timenfo { 23 | double time; 24 | double systime; 25 | } timenfo; 26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 28 | t.time + t.systime, t.time, t.systime); 29 | #else 30 | typedef struct timeval resnfo; 31 | typedef double timenfo; 32 | #define timestamp(sample) gettimeofday((sample), 0) 33 | #define printtime(t) printf("%15f s ", t); 34 | #endif 35 | 36 | #ifndef _WINOGRAD_ 37 | #define _WINOGRAD_ 38 | extern "C" 39 | { 40 | 41 | 42 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 43 | 44 | __device__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, int tiles_dim, int in_c, int in_n, int tile_size, int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY) 45 | { 46 | 47 | float workspace[4]; 48 | 49 | for(int j=0; j<4; j++){ 50 | workspace[0] = Btd[j]; 51 | workspace[1] = Btd[j+4]; 52 | workspace[2] = Btd[j+8]; 53 | workspace[3] = Btd[j+12]; 54 | 55 | Btd[j] = workspace[0] - workspace[2]; 56 | Btd[j+4] = workspace[1] + workspace[2]; 57 | Btd[j+8] = workspace[2] - workspace[1]; 58 | Btd[j+12] = workspace[1] - workspace[3]; 59 | } 60 | 61 | int c_offset = BN*BC; 62 | int c_tensor = Iny*BN + Inx; 63 | 64 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 65 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 66 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 67 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 68 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 69 | } 70 | } 71 | 72 | __device__ void load_filter_tile(float *tiles, float *pOutputs, int filt_c, int filt_k, int Inx, int Iny) 73 | { 74 | 75 | int c_tensor_s = Iny*BK + Inx; 76 | int c_offset_s = BK*BC; 77 | 78 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 79 | for(int i=0; i<4; i++){ 80 | for(int j=0; j<4; j++){ 81 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 82 | } 83 | } 84 | 85 | c_tensor_s += BN; 86 | } 87 | 88 | } 89 | 90 | __device__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k, int Inx, int Iny, int TileZ) 91 | { 92 | 93 | int c_tensor = TileZ*BK + (Iny*filt_k<<4) + Inx; 94 | 95 | int acumm; 96 | #pragma unroll 97 | for(int i=0; i<4; i++){ 98 | acumm = (i*filt_k<<2); 99 | for(int j=0; j<4; j++){ 100 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 101 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 102 | } 103 | } 104 | } 105 | 106 | __device__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask) 107 | { 108 | 109 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n); 110 | int acumm; 111 | 112 | //#pragma unroll 113 | for(int i=0; i<4; i++){ 114 | acumm = i*in_n*in_w; 115 | for(int j=0; j<4; j++){ 116 | if(mask&(1<<((i<<2) + j))) 117 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 118 | else 119 | tile[(i<<2) + j] = 0; 120 | } 121 | } 122 | } 123 | 124 | __device__ void __inline__ prefetch_filter_frag(float4 *filter_frag, float4 *B_frag, int f_frag_offset, int Inx, int offset1, int 
offset2) 125 | { 126 | 127 | *((float4*) (filter_frag)) = *(B_frag + offset1); 128 | *((float4*) (filter_frag + 1)) = *(B_frag + offset2); 129 | 130 | *((float4*) (filter_frag + 2)) = *(B_frag + f_frag_offset + offset1); 131 | *((float4*) (filter_frag + 3)) = *(B_frag + f_frag_offset + offset2); 132 | } 133 | 134 | __device__ void __inline__ prefetch_input_frag(float4* input_frag, float4 *A_frag, int frag_offset, int Inx, int offset1, int offset2) 135 | { 136 | 137 | *((float4*) (input_frag)) = *(A_frag + offset1); //ld_shared(A_frag + offset1); 138 | *((float4*) (input_frag + 1)) = *(A_frag + offset2); 139 | 140 | *((float4*) (input_frag + 2)) = *(A_frag + frag_offset + offset1); 141 | *((float4*) (input_frag + 3)) = *(A_frag + frag_offset + offset2); //3=2+1 142 | } 143 | 144 | __global__ void Winograd_kernel(float *A, float *B, float *C, 145 | int tiles_dim, int in_c, int in_n, int in_h, int in_w, 146 | int tile_size, int filt_k, int filt_c, 147 | int tiles_2d_dim, int out_c, int out_n, 148 | int tile_2d_s, int out_h, int out_w){ 149 | 150 | extern __shared__ float shared_mem[]; 151 | float *input_smem = (float*)shared_mem; 152 | float *filter_smem = (float*)&shared_mem[16*BC*BN]; 153 | 154 | short m = 0xFFFF; 155 | if((blockIdx.y/tiles_dim)==0) m&=0xFFF0; 156 | if((blockIdx.y/tiles_dim)==(tiles_dim-1)) m &= (!(in_w%2))?(0x0FFF):(0x00FF); 157 | if(!((blockIdx.y+1)%tiles_dim)) m &= (!(in_w%2))?(0x7777):(0x3333); 158 | if(!((blockIdx.y)%tiles_dim)) m&=0xeeee; 159 | 160 | float img_tile[16]; // Prefetch input from GMEM 161 | float filter_tile[32]; // Prefetch filter from GMEM 162 | 163 | float4 input_frag_mem[8]; //2*2(2*8/4) Data to do Outer Product + prefetch f. SMEM (double_buffer) 164 | float4 filter_frag_mem[8]; //2*2 Data to do Outer Product + prefetch f. SMEM (double_buffer) 165 | float4 accumulator[2][16] = {0.0f}; // Accumulators 166 | 167 | float4 *A_frag; // Input data pointer 168 | int frag_offset = 2* (BC*BN); // (2=8/4) SMEM input read offset 169 | 170 | float4 *B_frag; // Filter data pointer 171 | int f_frag_offset = 2* (BC*BK); // (2=8/4) SMEM filter read offset 172 | 173 | float4 *input_frag = (float4*) input_frag_mem; 174 | float4 *filter_frag = (float4*) filter_frag_mem; 175 | 176 | float4 *swap; 177 | 178 | prefetch_input_tile(A, img_tile, in_h, in_w, in_n, threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, tiles_dim, m); 179 | prefetch_filter_tile(B, filter_tile, filt_k, threadIdx.x, threadIdx.y, blockIdx.z); 180 | 181 | float4 *input_frag_buffer = (float4*) (input_frag+4); 182 | float4 *filter_frag_buffer = (float4*) (filter_frag+4); 183 | 184 | // Mainloop - iterates over the entire K dimension - not unrolled 185 | for(int iter=0; iter>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 248 | 249 | Winograd_kernel<<>>(k, Ww, C, 250 | tiles_dim, in_c, in_n, in_h, in_w, tile_size, 251 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, 252 | out_h, out_w); 253 | 254 | return cudaGetLastError(); 255 | } 256 | 257 | } 258 | #endif -------------------------------------------------------------------------------- /src/openCNN_winograd.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // NOTE: the bracketed header names below were lost in this dump; this set is reconstructed from usage and may differ from the original list. 16 | #include <cstdio> 17 | #include <cstdlib> 18 | #include <cmath> 19 | #include <iostream> 20 | #include <iomanip> 21 | #include <vector> 22 | #include <sys/time.h> 23 | 24 | #include <sys/resource.h> 25 | #include <cuda.h> 26 | #include <cuda_runtime.h> 27 | #include <curand.h> 28 | #include <curand_kernel.h> 29 | 30 | #include <cudnn.h> 31 | 32 | #include "config.hpp" 33 | 34 | #ifdef BASE 35 | #if __CUDA_ARCH__ < 800 36 | #include "convolutionForward_32x64x8_baseline.cu" 37 | #else 38 | #include "ampere/convolutionForward_32x64x8_baseline.cu" 39 | #endif 40 | #else 41 | #if __CUDA_ARCH__ < 800 42 | #include "convolutionForward_32x64x8.cu" 43 | #else 44 | #include "ampere/convolutionForward_32x64x8.cu" 45 | #endif 46 | #endif 47 | 48 | /* 49 | In order to measure the elapsed time: 50 | 51 | resnfo: datatype defined to abstract the metric of the resources to use 52 | timenfo: datatype defined to abstract the time metric to use 53 | 54 | timestamp: it abstracts the function used to take the time 55 | 56 | printtime: it abstracts the function used to print the time 57 | 58 | void myElapsedtime(resnfo start, resnfo end, timenfo *t): function to obtain the 59 | time between two measures 60 | */ 61 | 62 | #ifdef _noWALL_ 63 | typedef struct rusage resnfo; 64 | typedef struct _timenfo { 65 | double time; 66 | double systime; 67 | } timenfo; 68 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 69 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 70 | t.time + t.systime, t.time, t.systime); 71 | #else 72 | typedef struct timeval resnfo; 73 | typedef double timenfo; 74 | #define timestamp(sample) gettimeofday((sample), 0) 75 | #define printtime(t) printf("%15f s ", t); 76 | #endif 77 | 78 | void myElapsedtime(resnfo start, resnfo end, timenfo *t) 79 | { 80 | #ifdef _noWALL_ 81 | t->time = (end.ru_utime.tv_sec + (end.ru_utime.tv_usec * 1E-6)) 82 | - (start.ru_utime.tv_sec + (start.ru_utime.tv_usec * 1E-6)); 83 | t->systime = (end.ru_stime.tv_sec + (end.ru_stime.tv_usec * 1E-6)) 84 | - (start.ru_stime.tv_sec + (start.ru_stime.tv_usec * 1E-6)); 85 | #else 86 | *t = (end.tv_sec + (end.tv_usec * 1E-6)) 87 | - (start.tv_sec + (start.tv_usec * 1E-6)); 88 | #endif /*_noWALL_*/ 89 | } 90 | 91 | #define CUDA_CALL(f) { \ 92 | cudaError_t err = (f); \ 93 | if (err != cudaSuccess) { \ 94 | std::cout \ 95 | << "    Error occurred: " << err << std::endl; \ 96 | std::exit(1); \ 97 | } \ 98 | } 99 | 100 | #define CUDNN_CALL(f) { \ 101 | cudnnStatus_t err = (f); \ 102 | if (err != CUDNN_STATUS_SUCCESS) { \ 103 | printf("    Error occurred: \n"); \ 104 | std::exit(1); \ 105 | } \ 106 | } 107 | 108 | #define OPENCNN_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 109 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 110 | { 111 | if (code != cudaSuccess) 112 | { 113 | fprintf(stderr,"Error occurred: %s %s %d\n", cudaGetErrorString(code), file, line); 114 | if (abort) exit(code); 115 | } 116 | } 117 | 118 | void tflops(int in_n, int in_w, int in_h, int in_c, int filt_w, int filt_h, int filt_k, int pad, int str, 119 | int out_w, int out_h, float ms) 120 | { 121 | 122 | double L = (double) 2.0*in_n*in_c*(in_h+2*PAD_H)*(in_w+2*PAD_W)*filt_k*3.0*3.0; 123 |
124 | printf("%.3f,%.2f", ms, L/(2.25 * ms * 1e9) ); 125 | } 126 | 127 | __global__ void dev_const(float *px, float k, int n) { 128 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 129 | 130 | curandState state; 131 | curand_init(clock64(), tid, 0, &state); 132 | 133 | if (tid < n) 134 | px[tid] = curand_uniform(&state); 135 | } 136 | 137 | __global__ void dev_iota(float *px, int n) { 138 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 139 | 140 | curandState state; 141 | curand_init(clock64(), tid, 0, &state); 142 | 143 | if (tid < n) 144 | px[tid] = curand_uniform(&state); 145 | } 146 | 147 | __global__ void data_cpy(float *px, float *py, 148 | int in_w, int in_h, int in_c, int in_n) { 149 | int tid = blockIdx.y + blockIdx.z*in_w + threadIdx.x*in_h*in_w + blockIdx.x*in_h*in_w*in_c; 150 | int id = blockIdx.x + blockIdx.y*in_n + blockIdx.z*in_n*in_w + threadIdx.x*in_n*in_h*in_w; 151 | 152 | px[id] = py[tid]; 153 | } 154 | 155 | void print(const float *data, int n, int c, int h, int w) { 156 | std::vector<float> buffer(1 << 20); 157 | CUDA_CALL(cudaMemcpy( 158 | buffer.data(), data, 159 | n * c * h * w * sizeof(float), 160 | cudaMemcpyDeviceToHost)); 161 | int a = 0; 162 | for (int i = 0; i < n; ++i) { 163 | for (int j = 0; j < c; ++j) { 164 | std::cout << "n=" << i << ", c=" << j << ":" << std::endl; 165 | for (int k = 0; k < h; ++k) { 166 | for (int l = 0; l < w; ++l) { 167 | std::cout << std::setw(12) << std::right << buffer[a]; 168 | ++a; 169 | } 170 | std::cout << std::endl << std::endl; 171 | } 172 | } 173 | } 174 | std::cout << std::endl; 175 | } 176 | 177 | void output_checker(float* A, float* B, int n, int len, int channel, int shift) { 178 | int error_cnt = 0, i, j, k, m; 179 | float max_error = 0; 180 | for(k = 0; k < channel; k++){ 181 | for (i = 0; i < len; i++) { 182 | for (j = 0; j < len; j++) { 183 | for (m = 0; m < n; m++) { 184 | float diff = fabs( 185 | A[k*len*len*n + i*len*n + j*n + m] - 186 | B[m*len*len*channel + k*len*len + i*len + j]); 187 | if (diff > 1){ //1e-4 188 | error_cnt++; 189 | printf("h:%d, w:%d, n:%d, c:%d -> %f vs %f : +- %f\n", i, j, m, k, 190 | A[k*len*len*n + i*len*n + j*n + m], 191 | B[m*len*len*channel + k*len*len + i*len + j], diff); 192 | std::exit(1); 193 | } 194 | if (diff > max_error) 195 | max_error = diff; 196 | } 197 | } 198 | } 199 | } 200 | printf("[max_error: %f][error_cnt: %d] of %d\n", max_error, error_cnt, n*len*len*channel*shift); 201 | } 202 | 203 | 204 | cudaError_t convolutionForward(float *k, int in_h, int in_w, float *w, int out_h, 205 | int out_w, int out_n, int out_c, float *C, float *Ww, 206 | const unsigned int n, 207 | int tiles_dim, int in_n, int tile_size, int elems_dim, 208 | int in_c, int filt_k, int filt_c, int filt_h, int filt_w, 209 | int alpha, int m){ 210 | cudaError_t out; 211 | 212 | if(BN==32 && BK==64 && BC==8){ 213 | out = convolutionForward_32x64x8(k, in_h, in_w, w, out_h, out_w, out_n, out_c, C, Ww, n, tiles_dim, in_n, tile_size, in_c, filt_k, filt_c, filt_h, filt_w, alpha, m); 214 | } else { 215 | std::cout << "Configuration not supported yet" << std::endl; 216 | } 217 | 218 | return out; 219 | } 220 | 221 | cudaError_t init_data(float *in_data, float *in_data_open, float *filt_data, float *filt_data_open, int in_w, int in_h, int in_c, int in_n, int filt_w, int filt_h, int filt_c, int filt_k, int tile_size){ 222 | 223 | int n = in_n*in_c*in_h*in_w; 224 | int blk_size = 256; 225 | 226 | dim3 dimBlock(blk_size); 227 | dim3 dimGrid((n + dimBlock.x -1)/dimBlock.x); 228 | 229 | dev_iota<<<dimGrid, dimBlock>>>(in_data, n);
230 | data_cpy<<<dim3(in_n, in_w, in_h), in_c>>>(in_data_open, in_data, in_w, in_h, in_c, in_n); // launch shape inferred from data_cpy's index arithmetic; the original config was lost in extraction 231 | 232 | n = filt_k*filt_c*filt_h*filt_w; 233 | dim3 dimGrid_f = dim3((n + dimBlock.x -1)/dimBlock.x); 234 | dev_const<<<dimGrid_f, dimBlock>>>(filt_data, 1.f, n); 235 | data_cpy<<<dim3(filt_k, filt_w, filt_h), filt_c>>>(filt_data_open, filt_data, filt_w, filt_h, filt_c, filt_k); // launch shape inferred as above 236 | 237 | return cudaGetLastError(); 238 | } 239 | 240 | 241 | int main(int argc, char *argv[]) { 242 | 243 | 244 | // ========== Set ImageBatch, filter, convolution and output parameters ========== // 245 | // ImageBatch 246 | const int in_n = (argc > 1)?atoi (argv[1]):N; // Number of images 247 | const int in_c = (argc > 2)?atoi (argv[2]):C_in; // Number of feature maps per image 248 | const int in_h = (argc > 3)?atoi (argv[3]):W; // Height of each feature map 249 | const int in_w = (argc > 4)?atoi (argv[4]):W; // Width of each feature map 250 | 251 | // Filter 252 | const int filt_k = (argc > 5)?atoi (argv[5]):K; 253 | const int filt_c = (argc > 6)?atoi (argv[6]):C_in; 254 | const int filt_h = (argc > 7)?atoi (argv[7]):R; 255 | const int filt_w = (argc > 8)?atoi (argv[8]):R; 256 | 257 | std::cout << in_n << "," << in_c << "," << in_h << "," << filt_k << "," << filt_h << ","; 258 | 259 | // Convolution config 260 | const int pad_h = PAD_H; // Zero-padding height 261 | const int pad_w = PAD_W; // Zero-padding width 262 | const int str_h = STR_H; // Vertical filter stride 263 | const int str_w = STR_W; // Horizontal filter stride 264 | const int dil_h = DIL_H; // Filter height dilation 265 | const int dil_w = DIL_W; // Filter width dilation 266 | 267 | 268 | // Output 269 | int out_n; // Number of outputs 270 | int out_c; // Number of feature maps per output 271 | int out_h; // Height of each feature map 272 | int out_w; // Width of each feature map 273 | 274 | /* 275 | #################################################################### 276 | ======================= openCNN preparation ======================= 277 | #################################################################### 278 | */ 279 | // Winograd config 280 | const int m = M; 281 | const int r = filt_h; 282 | const int tile_size = m+r-1; // alpha value 283 | int elems_dim; 284 | int tiles_dim; 285 | 286 | if(m==2){ 287 | tiles_dim = ceil(ceil((double)(in_w+2)/2)-1); 288 | elems_dim = tiles_dim*4; 289 | } else { 290 | std::cout << "Configuration not supported yet" << std::endl; 291 | exit(0); 292 | } 293 | 294 | // Output 295 | out_n = in_n; // Number of outputs 296 | out_c = filt_k; // Number of feature maps per output 297 | out_h = in_h; // Height of each feature map 298 | out_w = in_w; // Width of each feature map 299 | 300 | float *in_data_open; 301 | float *filt_data_open, *workspace; 302 | 303 | // ImageBatch openCNN 304 | OPENCNN_CALL(cudaMalloc( 305 | &in_data_open, in_n * in_c * in_h * in_w * sizeof(float))); 306 | // Filter openCNN 307 | OPENCNN_CALL(cudaMalloc( 308 | &filt_data_open, filt_k * filt_c * filt_h * filt_w * sizeof(float))); 309 | // Filter transformation 310 | OPENCNN_CALL(cudaMalloc( 311 | &workspace, filt_k * filt_c * tile_size * tile_size * sizeof(float))); 312 | 313 | // Output openCNN 314 | float *out_data; 315 | OPENCNN_CALL(cudaMalloc( 316 | &out_data, out_n * out_c * out_h * out_w * sizeof(float))); 317 | 318 | // =================== openCNN layouts =================== // 319 | cudaMemcpyToSymbol(access_f_s, aux, 64*sizeof(int)); 320 | cudaMemcpyToSymbol(access_s, aux2, 64*sizeof(int)); 321 | #ifndef BASE 322 | #if defined(OPTLDS64) 323 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int)); 324 | 
cudaMemcpyToSymbol(out_thread, aux4, 32*sizeof(int)); 325 | cudaMemcpyToSymbol(out_sgemm, aux5, 32*sizeof(int)); 326 | cudaMemcpyToSymbol(exhange, aux6, 32*sizeof(int)); 327 | #endif 328 | #else 329 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int)); 330 | cudaMemcpyToSymbol(out_thread, aux4, 16*sizeof(int)); 331 | #endif 332 | 333 | /* 334 | #################################################################### 335 | ====================== cuDNN preparation ====================== 336 | #################################################################### 337 | */ 338 | 339 | float *in_data, *filt_data; 340 | 341 | // ImageBatch cuDNN 342 | CUDA_CALL(cudaMalloc( 343 | &in_data, in_n * in_c * in_h * in_w * sizeof(float))); 344 | // Filter cuDNN 345 | CUDA_CALL(cudaMalloc( 346 | &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float))); 347 | 348 | // =================== Set descriptors =================== // 349 | cudnnHandle_t cudnn; 350 | CUDNN_CALL(cudnnCreate(&cudnn)); 351 | 352 | // Input image Descriptors 353 | cudnnTensorDescriptor_t in_desc; 354 | CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); 355 | CUDNN_CALL(cudnnSetTensor4dDescriptor( 356 | in_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT, 357 | in_n, in_c, in_h, in_w)); 358 | 359 | // Filter Descriptors 360 | cudnnFilterDescriptor_t filt_desc; 361 | CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); 362 | CUDNN_CALL(cudnnSetFilter4dDescriptor( 363 | filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, 364 | filt_k, filt_c, filt_h, filt_w)); 365 | 366 | // Convolution Descriptors 367 | cudnnConvolutionDescriptor_t conv_desc; 368 | CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); 369 | CUDNN_CALL(cudnnSetConvolution2dDescriptor( 370 | conv_desc, 371 | pad_h, pad_w, str_h, str_w, dil_h, dil_w, 372 | CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); //CUDNN_CONVOLUTION 373 | 374 | 375 | // =================== Query output layout =================== // 376 | CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( 377 | conv_desc, in_desc, filt_desc, 378 | &out_n, &out_c, &out_h, &out_w)); 379 | 380 | // =================== Set and allocate output tensor descriptor ===================// 381 | cudnnTensorDescriptor_t out_desc; 382 | CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); 383 | CUDNN_CALL(cudnnSetTensor4dDescriptor( 384 | out_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT, 385 | out_n, out_c, out_h, out_w)); 386 | 387 | float *out_data_cudnn; 388 | CUDA_CALL(cudaMalloc( 389 | &out_data_cudnn, out_n * out_c * out_h * out_w * sizeof(float))); 390 | 391 | // =================== Query convolution forward algorithm =================== // 392 | cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)6; 393 | 394 | // =================== Query workspace and allocate =================== // 395 | size_t ws_size; 396 | CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( 397 | cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); 398 | 399 | float *ws_data; 400 | CUDA_CALL(cudaMalloc(&ws_data, ws_size)); 401 | 402 | // =================== Launch convolution on cuDNN =================== // 403 | float alpha = 1.f; 404 | float beta = 0.f; 405 | 406 | /* 407 | #################################################################### 408 | ============================= Init data ============================= 409 | #################################################################### 410 | */ 411 | 412 | OPENCNN_CALL(init_data(in_data, in_data_open, filt_data, 
filt_data_open, in_w, in_h, in_c, in_n, 413 | filt_w, filt_h, filt_c, filt_k, tile_size)); 414 | 415 | /* 416 | #################################################################### 417 | ============================= Execution ============================= 418 | #################################################################### 419 | */ 420 | CUevent hStart, hStop; 421 | float ms; 422 | OPENCNN_CALL( cudaEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT 423 | OPENCNN_CALL( cudaEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); 424 | 425 | // Loop of executions 426 | int iterations = 100; 427 | 428 | // Performs warmup operation 429 | OPENCNN_CALL(convolutionForward(in_data_open, in_h, in_w, filt_data_open, out_h, out_w, out_n, out_c, out_data, workspace, 430 | out_c*out_n*out_h*out_w, 431 | tiles_dim, in_n, tile_size, elems_dim, in_c, filt_k, filt_c, filt_h, filt_w, tile_size, m)); 432 | 433 | // ============================= openCNN exec ============================= 434 | cudaDeviceSynchronize(); 435 | ( cudaEventRecord( hStart, NULL ) ); 436 | for(int iter=0; iter7) ){ 89 | 90 | #pragma unroll 91 | for(int i=0; i<4; i+=2){ 92 | 93 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0 94 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1); 95 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1 96 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t); 97 | 98 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t); 99 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t); 100 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t); 101 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t); 102 | 103 | t+=4; 104 | } 105 | } 106 | 107 | __syncthreads(); 108 | 109 | for(int i=0; i<16; i++){ 110 | C_tile[i] = out[init + i*offset]; 111 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset]; 112 | } 113 | 114 | // transform output tiles 115 | transform_output_tile(C, C_tile, At, Inx, Iny, TileX, TileY, TileZ, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset); 116 | 117 | 118 | __syncthreads(); 119 | } 120 | 121 | } 122 | 123 | } 124 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optLDS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
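// Reader's note on the "LDS64" idea (inferred from the code below; hedged):
// this variant reads the transposed accumulators back from shared memory as
// float2, so each C_tile element comes from a single 64-bit shared load
// (LDS.64) instead of two 32-bit loads. Compare the read loop in this file,
//
//   C_tile[i].x = output_smem[i*offset + init].x;
//   C_tile[i].y = output_smem[i*offset + init].y;   // same float2 slot
//
// with the optSTS64 variant, which gathers .x and .y from float addresses 32
// apart. The init/offset index math here is therefore in float2 units
// (offset = BN_p) rather than float units (offset = BN_p*2).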
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT3_ 19 | #define _OUTPUT_KERNEL_OPT3_ 20 | extern "C" 21 | { 22 | 23 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 24 | { 25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2; 26 | int x, x1; 27 | 28 | for(int j=0; j<4; j++){ 29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 31 | 32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 34 | } 35 | 36 | for(int i=0; i<2; i++){ 37 | x = i*4; 38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 39 | if(mask&(1<<(i*2))){ 40 | 41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 43 | } 44 | if(mask&(1<<(i*2+1))){ 45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 47 | } 48 | } 49 | 50 | } 51 | 52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 53 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[Inx]; //* 4 72 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*8*2 ; //*4 76 | int idx = Iny * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p; //*2/2 81 | int init = (Iny/4)*BN_p*16 + (Iny%4)*(32+2); 82 | init += (Inx/16)*8 + ((Inx/8)%2)*16 + (Inx%8); //40=(8+2)*4, 4 blocks/buffer 83 | 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + ((Inx/8)%2)*2 + (Inx%8)*2*2 + ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 87 | c_tensor/=2; 88 | 89 | #pragma unroll 90 | for(int round=0; round<4; round++){ 91 | 92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 96 | 97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 101 | 102 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 
16) ) = *(accumulator+t+5); // float 4, t+2 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 106 | 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 111 | 112 | t+=8; 113 | 114 | __syncthreads(); 115 | 116 | 117 | for(int i=0; i<16; i++){ 118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4 119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4 120 | } 121 | 122 | // transform output tiles 123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w); 124 | 125 | __syncthreads(); 126 | 127 | } 128 | } 129 | 130 | } 131 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optSTS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT1_ 19 | #define _OUTPUT_KERNEL_OPT1_ 20 | extern "C" 21 | { 22 | 23 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w){ 25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 26 | int x, x1; 27 | 28 | #pragma unroll 29 | for(int j=0; j<4; j++){ 30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 32 | 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | 38 | #pragma unroll 39 | for(int i=0; i<2; i++){ 40 | x = i*4; 41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 42 | if(mask&(1<<(i*2))){ 43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 45 | } 46 | 47 | if(mask&(1<<(i*2+1))){ 48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 50 | } 51 | } 52 | } 53 | 54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 55 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 56 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 57 | 58 | float2 *output_smem = (float2 *) shared_mem; 59 | float2 *accumulator = (float2 *) acumm_smem; 60 | float2 *C_out = (float2*)C; 61 | 62 | float2 *C_tile = (float2*) input_frag_mem; 63 | float2 *At = (float2*) filter_frag_mem; 64 | 65 | mask = 0x000F; 66 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 67 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 68 | 69 | // output transpose step 70 | int t=0; 71 | int acumm1, acumm2; 72 | // For transposing 73 | //acumm1 = access_s_out[Inx]; //* 4 74 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 75 | acumm2 = acumm1+4; 76 | 77 | int acumm4 = BN_p*16 ; //*4 78 | int idx = Iny * BN_p; 79 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 80 | 81 | // For transformating 82 | int offset = BN_p *2; //*2/2 83 | int init = ( (Iny/4)*BN_p*16 + (Iny%4)*(32+2) ) *2 + Inx; 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+ 87 | ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 88 | c_tensor/=2; 89 | 90 | #pragma unroll 91 | for(int round=0; round<4; round++){ 92 | 93 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 94 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 95 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 96 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 97 | 98 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 99 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 100 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 101 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 102 | 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 104 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 105 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 106 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 107 | 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 111 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 112 | 113 | t+=8; 114 | 115 | __syncthreads(); 116 | 117 | 118 | for(int i=0; i<16; i++){ 119 | C_tile[i].x = shared_mem[i*offset + init]; 120 | C_tile[i].y = shared_mem[i*offset + init + 32]; 121 | } 122 | 123 | // transform output tiles 124 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w); 125 | __syncthreads(); 126 | } 127 | } 128 | 129 | } 130 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optSTS64_compact.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
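// Reader's note (inferred; hedged): the "compact" variant drops the padded
// row pitch used elsewhere (BN_p is 128 under OPTSTS64_CMP vs 138 otherwise,
// see config.hpp) and instead swizzles the store/load indices arithmetically
// (the acumm1/acumm2/init adjustments below), presumably to sidestep
// shared-memory bank conflicts without padding. It also restructures the four
// rounds into a software pipeline: one store block runs up front, then each
// iteration transforms one buffer pair while the accumulators for the next
// round are stored into the other pair, toggling j between 0 and 2. Four
// BN_p*16 float2 blocks are live at once (4 * 128 * 16 * 8 B = 64 KB), which
// is why the launcher raises cudaFuncAttributeMaxDynamicSharedMemorySize to
// 65536 before the kernel launch.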
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT1_ 19 | #define _OUTPUT_KERNEL_OPT1_ 20 | extern "C" 21 | { 22 | 23 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, 25 | short mask, int out_w){ 26 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 27 | int x, x1; 28 | 29 | 30 | #pragma unroll 31 | for(int j=0; j<4; j++){ 32 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 33 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 34 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 35 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 36 | } 37 | 38 | x = in_n/2; 39 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x; 40 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y; 41 | 42 | if(mask&0x2){ 43 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x; 44 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y; 45 | } 46 | 47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x; 48 | if(mask&0x4){ 49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x; 50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y; 51 | } 52 | 53 | if(mask&0x8){ 54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x; 55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y; 56 | } 57 | } 58 | 59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 60 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 61 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, 62 | short mask){ 63 | 64 | float2 *output_smem = (float2 *) shared_mem; 65 | float2 *accumulator = (float2 *) acumm_smem; 66 | float2 *C_out = (float2*)C; 67 | 68 | float2 *C_tile = (float2*) input_frag_mem; 69 | float2 *At = (float2*) filter_frag_mem; 70 | 71 | mask = 0x000F; 72 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 73 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 74 | 75 | // output transpose step 76 | int t,j; 77 | int acumm1, acumm2; 78 | // For transposing 79 | t = Inx%8/2; 80 | acumm1 = t*18 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 81 | acumm2 = acumm1+4; 82 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16; 83 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16; 84 | t=0; 85 | 86 | int acumm4 = BN_p*16 ; //*4 87 | int idx = Iny * BN_p; 88 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 89 | 90 | // For transformating 91 | int offset = BN_p *2; //*2/2 92 | 93 | int init = (Iny%4)*(16+2)*2 + Inx; 94 | init = init - init/((Iny%4+1)*32)*32 + Iny%4*32; 95 | init += (Iny/4)*BN_p*16*2; 96 | 97 | int c_glb_offset = in_n*out_h*out_w; 98 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+ 99 | ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 100 | c_tensor/=2; 101 | 102 | // k=0, block 0 103 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator); 104 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1); 105 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2); 106 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3); 107 | 108 | // K=1, block 0 109 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4); 110 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5); 111 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6); 112 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7); 113 | 114 | // k=0, block 1 115 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32); 116 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33); 117 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34); 118 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35); 119 | 120 | // K=1, block 1 121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36); 122 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37); 123 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38); 124 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39); 125 | 126 | j=0; t+=8; 127 | 128 | #pragma unroll 129 | for(int round=0; round<3; round++){ 130 | 131 | __syncthreads(); 132 | 133 | int disp = j/2*(BN_p*2*16)*2; 134 | #pragma unroll 135 | for(int i=0; i<16; i++){ 136 | C_tile[i].x = shared_mem[disp + i*offset + init]; 137 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 138 | } 139 | 140 | // transform output tiles 141 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 142 | 143 | j = 2 - j; //switch between 0 and 2 144 | 145 | // k=0, block 0 146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t); 147 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1); 148 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2); 149 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3); 150 | 151 | // K=1, block 0 152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4); 153 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5); 154 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6); 155 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7); 156 | 157 | // k=0, block 1 158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32); 159 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33); 160 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34); 161 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35); 162 | 163 | // K=1, block 1 164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36); 165 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37); 166 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38); 167 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39); 168 | 169 | t+=8; 170 | 171 | } 172 | 173 | __syncthreads(); 174 | 175 | int disp = j/2*(BN_p*2*16)*2; 176 | #pragma unroll 177 | for(int i=0; i<16; i++){ 178 | C_tile[i].x = shared_mem[disp + i*offset + init]; 179 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 180 | } 181 | // transform output tiles 182 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 183 | } 184 | 185 | } 186 | #endif --------------------------------------------------------------------------------
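The compact variant above is essentially a two-stage software pipeline over shared memory: store, synchronize, then consume one buffer while refilling the other. A stand-alone sketch of that pattern (illustrative only; the kernel name, sizes, and the trivial "transform" are assumptions, not from openCNN):

    __global__ void pingpong_demo(const float *in, float *out, int rounds) {
      extern __shared__ float buf[];      // 2 * blockDim.x floats, passed at launch
      const int half = blockDim.x;        // size of one buffer half
      int j = 0;                          // half that will be consumed next

      buf[threadIdx.x] = in[threadIdx.x]; // prologue: fill half 0 before the loop

      for (int r = 0; r < rounds; r++) {
        __syncthreads();                               // half j is fully written
        float v = buf[j * half + threadIdx.x];         // consume half j ...
        j = 1 - j;                                     // ... flip (cf. "j = 2 - j" above)
        if (r + 1 < rounds)                            // ... and refill the idle half
          buf[j * half + threadIdx.x] = in[(r + 1) * blockDim.x + threadIdx.x];
        out[r * blockDim.x + threadIdx.x] = v + 1.f;   // stand-in for the real transform
      }
    }
    // launch sketch: pingpong_demo<<<1, 128, 2 * 128 * sizeof(float)>>>(d_in, d_out, 4);

The barrier sits before the read rather than after the write, so the stores for round r+1 overlap the transform of round r, which is the same overlap the compact kernel extracts from its four-block buffer.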