├── LICENSE ├── Makefile ├── README.md ├── bench ├── bench.sh └── smem │ ├── LDS │ ├── Makefile │ ├── lds128.sass │ ├── lds32.sass │ ├── lds64.sass │ ├── lds64_opt3.sass │ └── main.cu │ └── STS │ ├── Makefile │ ├── main.cu │ ├── sts.sass │ ├── sts128.sass │ ├── sts128_0.sass │ ├── sts32.sass │ ├── sts64.sass │ ├── sts64_2bank_conflict.sass │ ├── sts64_broadcast.sass │ └── sts64_opt3.sass └── src ├── FX_m2.cu ├── ampere ├── convolutionForward_32x64x8.cu ├── convolutionForward_32x64x8_baseline.cu ├── store_and_transform_output_baseline.cuh ├── store_and_transform_output_optLDS64.cuh ├── store_and_transform_output_optSTS64.cuh └── store_and_transform_output_optSTS64_compact.cuh ├── config.hpp ├── convolutionForward_32x64x8.cu ├── convolutionForward_32x64x8_baseline.cu ├── openCNN_winograd.cu ├── outer_product.cuh ├── outer_product_suffle.cuh ├── store_and_transform_output_baseline.cuh ├── store_and_transform_output_optLDS64.cuh ├── store_and_transform_output_optSTS64.cuh └── store_and_transform_output_optSTS64_compact.cuh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ARCH = 75 # modify this. 
Ampere=86
2 | NAME = wgrad
3 | OUT = OPTSTS64
4 | #MODE = PROF
5 | #LBR = OPENCNN
6 | 
7 | all:
8 | 	nvcc src/openCNN_winograd.cu -lcudnn -m64 -arch=compute_$(ARCH) -code=sm_$(ARCH) -o $(NAME) -D$(OUT)
9 | 
10 | clean:
11 | 	rm $(NAME)
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenCNN
2 | A Winograd minimal filtering algorithm implementation in CUDA
3 | ## Requirements
4 | - CUDA Toolkit
5 | - cuDNN
6 | - CMake
7 | 
8 | ## Build the project
9 | ```
10 | git clone https://github.com/UDC-GAC/openCNN.git
11 | cd openCNN
12 | ```
13 | The GPU architecture code must be specified inside the Makefile before compiling.
14 | 
15 | ```
16 | make
17 | ```
18 | 
19 | Compile-time macros have been defined for testing purposes. They can be specified in the Makefile according to the following values:
20 | ```
21 | $OUT (builds a specific output storage and transform version):
22 | - BASE: baseline layout
23 | - OPTSTS64 (default): optSTS64 layout
24 | - OPTSTS64_CMP: optSTS64_compact layout
25 | - OPTLDS64: optLDS64 layout
26 | ```
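For reference, a sketch of how these `-D$(OUT)` values are expected to gate the `store_and_transform_output_*.cuh` headers at compile time. Only `OPTSTS64_CMP` appears verbatim in `src/convolutionForward_32x64x8.cu`; the other spellings are inferred from this list, so treat the exact guards as an assumption:

```
// Hypothetical compile-time dispatch for -D$(OUT); not copied from the
// sources, but consistent with the macro names above and with the
// #ifdef OPTSTS64_CMP that is visible in src/convolutionForward_32x64x8.cu.
#if defined(BASE)
  #include "store_and_transform_output_baseline.cuh"
#elif defined(OPTLDS64)
  #include "store_and_transform_output_optLDS64.cuh"
#elif defined(OPTSTS64_CMP)
  #include "store_and_transform_output_optSTS64_compact.cuh"
#else  // OPTSTS64, the default layout
  #include "store_and_transform_output_optSTS64.cuh"
#endif
```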
27 | ## Run examples
28 | (Recommended before time measurement) Lock the clocks:
29 | ```
30 | sudo nvidia-smi -i 0 -pm 1
31 | sudo nvidia-smi -lgc 1750 -i 0
32 | ```
33 | 1. OpenCNN benchmark
34 | ```
35 | cd bench
36 | ./bench.sh
37 | ```
38 | 2. Instruction-level microbenchmarking (requires a Turing device). TuringAs (https://github.com/daadaada/turingas) must be installed to run the instruction-level microbenchmarks.
39 | ```
40 | cd bench/smem/STS
41 | make
42 | ./test
43 | ```
44 | 
45 | ## Citation
46 | If you find this tool helpful, please cite:
47 | ```
48 | @Article{math9172033,
49 | AUTHOR = {Castro, Roberto L. and Andrade, Diego and Fraguela, Basilio B.},
50 | TITLE = {OpenCNN: A Winograd Minimal Filtering Algorithm Implementation in CUDA},
51 | JOURNAL = {Mathematics},
52 | VOLUME = {9},
53 | YEAR = {2021},
54 | NUMBER = {17},
55 | ARTICLE-NUMBER = {2033},
56 | URL = {https://www.mdpi.com/2227-7390/9/17/2033},
57 | ISSN = {2227-7390},
58 | DOI = {10.3390/math9172033}
59 | }
60 | ```
61 | ## License
62 | Apache-2.0 License
63 | 
64 | -- Roberto López Castro
65 | 
--------------------------------------------------------------------------------
/bench/bench.sh:
--------------------------------------------------------------------------------
1 | echo "in_n,in_c,in_h,filt_k,filt_w,openCNN_sec,openCNN_flops,cuDNN_sec,cuDNN_flops"
2 | 
3 | for n in 32 64 96 128;
4 | do
5 | 	../wgrad $n 64 56 56 64 64 3 3
6 | done
7 | 
8 | for n in 32 64 96 128;
9 | do
10 | 	../wgrad $n 128 28 28 128 128 3 3
11 | done
12 | 
13 | for n in 32 64 96 128;
14 | do
15 | 	../wgrad $n 256 14 14 256 256 3 3
16 | done
17 | 
18 | for n in 32 64 96 128;
19 | do
20 | 	../wgrad $n 512 7 7 512 512 3 3
21 | done
22 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python -m turingas.main -i lds32.sass -o lds32.cubin
3 | 	python -m turingas.main -i lds64.sass -o lds64.cubin
4 | 	python -m turingas.main -i lds128.sass -o lds128.cubin
5 | 	python -m turingas.main -i lds64_opt3.sass -o lds64_opt3.cubin
6 | 	nvcc -arch=sm_75 main.cu -lcuda -o test
7 | 
8 | clean:
9 | 	rm lds32.cubin test lds64.cubin lds64_opt3.cubin lds128.cubin
10 | 
--------------------------------------------------------------------------------
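The four lds*.sass kernels that follow share one skeleton: derive a per-lane shared-memory byte offset, read SR_CLOCKLO, run 128 iterations of 128 back-to-back LDS instructions, read the clock again, and store the delta for main.cu to report. A rough CUDA-level analogue of that skeleton is sketched below; the kernel name and indexing are invented here, and nvcc may reschedule or widen the accesses, which is exactly why the real benchmarks are written in raw SASS:

```
#include <cstdio>

// Illustrative CUDA counterpart (not part of the repo) of the lds*.sass
// measurement loop: one warp hammers shared memory and measures elapsed SM
// cycles with clock64(), the compiler-visible equivalent of CS2R SR_CLOCKLO.
__global__ void lds32_like(long long *out) {   // out must hold two values
    __shared__ float smem[32];
    smem[threadIdx.x] = (float)threadIdx.x;
    __syncwarp();

    float acc = 0.f;
    long long start = clock64();
    for (int iter = 0; iter < 128; ++iter) {   // 128 outer iterations...
        #pragma unroll
        for (int i = 0; i < 128; ++i)          // ...of 128 32-bit loads each
            acc += smem[(threadIdx.x + i) & 31];
    }
    long long stop = clock64();

    if (threadIdx.x == 0) out[0] = stop - start;  // clocks, as main.cu prints
    if (acc == -1.f) out[1] = 1;  // unreachable: keeps the loads live
}
```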
/bench/smem/LDS/lds128.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 
13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ;
14 | --:-:-:-:5 MOV iter, RZ;
15 | 
16 | 
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 | 
19 | LOOP:
20 | 
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(128):
24 |     if i == 64:
25 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 |     out.append(f'--:-:-:-:1 LDS.128 tmp, [offset];')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 | 
30 | 
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 | 
35 | 
36 | 
37 | --:-:-:-:2 EXIT;
38 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds32.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-16 ~ output0, output1, tid, offset, target, start, end, iter
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 | 
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 | 
17 | LOOP:
18 | 
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 |     if i == 64:
23 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 |     out.append(f'--:-:-:-:1 LDS target, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 | 
28 | 
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 | 
33 | 
34 | 
35 | 
36 | --:-:-:-:2 EXIT;
37 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ;
13 | --:-:-:-:5 MOV iter, RZ;
14 | 
15 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
16 | 
17 | LOOP:
18 | 
19 | out = []
20 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
21 | for i in range(128):
22 |     if i == 64:
23 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
24 |     out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
25 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
26 | out_ = '\n'.join(out) + '\n'
27 | 
28 | 
29 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
30 | --:-:-:-:5 IADD3 end, end, -start, RZ;
31 | --:-:-:-:2 STG.E.GPU [output0], end;
32 | 
33 | 
34 | 
35 | 
36 | --:-:-:-:2 EXIT;
37 | 
--------------------------------------------------------------------------------
/bench/smem/LDS/lds64_opt3.sass:
--------------------------------------------------------------------------------
1 | 
2 | output, 8
3 | 
4 | 
5 | 
6 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, aux, aux2, tmp
7 | 
8 | 
9 | --:-:1:-:5 S2R tid, SR_TID.X;
10 | --:-:-:-:2 MOV output0, output[0];
11 | --:-:-:-:5 MOV output1, output[1];
12 | 
13 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
14 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
15 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16
16 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT;
17 | 
18 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ;
19 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*8 + tid%16
20 | 
21 | --:-:-:-:5 MOV iter, RZ;
22 | 
23 | --:-:-:-:2 @P0 BRA JMP;
24 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ;
25 | 
26 | JMP:
27 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ;
28 | 
29 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
30 | 
31 | LOOP:
32 | 
33 | out = []
34 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
35 | for i in range(128):
36 |     if i == 64:
37 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
38 |     out.append(f'--:-:-:-:1 LDS.64 tmp, [offset];')
39 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
40 | out_ = '\n'.join(out) + '\n'
41 | 
42 | 
43 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
44 | --:-:-:-:5 IADD3 end, end, -start, RZ;
45 | --:-:-:-:2 STG.E.GPU [output0], end;
46 | 
47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ;
48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ;
49 | ##--:-:-:-:2 STG.E.GPU [output0], aux2;
50 | #--:-:-:-:2 STG.E.GPU [output0], aux2;
51 | 
52 | 
53 | --:-:-:-:2 EXIT;
54 | 
--------------------------------------------------------------------------------
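The offset arithmetic in lds64_opt3.sass (and in its STS twin, sts64_opt3.sass) is easier to check in plain C. The host-side sketch below is not part of the repo; it reproduces the SASS address computation and prints the 8-byte word each lane touches. Lanes 0-7 pair with lanes 16-23 (and 8-15 with 24-31) on complementary halves of the 32 banks, the conflict-free arrangement for 64-bit shared-memory accesses that are issued in two phases:

```
#include <stdio.h>

// Host-side reproduction (illustrative only) of the *_opt3 address math:
// every lane lands on a distinct 8-byte word of shared memory.
int main(void) {
    for (int tid = 0; tid < 32; ++tid) {
        int aux  = tid % 16;              // IMAD aux, aux, -1, tid
        int aux2 = (tid / 16) * 8 + aux;  // IMAD aux2, aux2, 8, aux
        if (aux >= 8)                     // path not skipped by @P0 BRA JMP
            aux2 += 8;                    // IADD3 aux2, aux2, 8, RZ
        printf("lane %2d -> word %2d (byte offset %3d)\n",
               tid, aux2, aux2 << 3);     // SHF.L offset, aux2, 3, RZ
    }
    return 0;
}
```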
/bench/smem/LDS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cuda.h>    // CUDA driver API: CUmodule, cuModuleLoad, cuLaunchKernel
2 | #include <stdio.h>
3 | #include <string.h>
4 | 
5 | char* concat(const char *s1, const char *s2)
6 | {
7 | 	char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
8 | 	// in real code you would check for errors in malloc here
9 | 	strcpy(result, s1);
10 | 	strcat(result, s2);
11 | 	return result;
12 | }
13 | 
14 | void run(char * name, int size){
15 | 	char * file_name = concat(name, ".cubin");
16 | 
17 | 	int *output;
18 | 	cudaMalloc((void**)&output, sizeof(int)*32);
19 | 
20 | 	CUmodule module;
21 | 	CUfunction kernel;
22 | 
23 | 	cuModuleLoad(&module, file_name);
24 | 	cuModuleGetFunction(&kernel, module, "kern");
25 | 
26 | 	void * args[1] = {&output};
27 | 	cuLaunchKernel(kernel, 1, 1, 1,
28 | 		32, 1, 1,
29 | 		32*sizeof(float)*size, 0, args, 0);
30 | 
31 | 	int *output_h = (int*)malloc(sizeof(int)*32);
32 | 
33 | 	cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
34 | 
35 | 	printf("%s took %d clocks.\n", name, output_h[0]);
36 | 	printf("Each instruction takes %.2f clocks.\n", (float)output_h[0]/(128.0*128.0)); // workload of a thread: 128 iterations x 128 loads
37 | 	printf("Throughput %.2f bytes/cycle.\n\n", ((double)32*128*128*4*size)/output_h[0]); // bytes loaded by the warp / elapsed clocks
38 | 
39 | 	cudaFree(output);
40 | 	free(output_h);
41 | }
42 | 
43 | int main(){
44 | 	run("lds32", 1);
45 | 	printf("\n");
46 | 	run("lds64", 2);
47 | 	printf("\n");
48 | 	run("lds128", 4);
49 | 
50 | 	printf("\n");
51 | 	run("lds64_opt3", 2);
52 | 	return 0;
53 | }
54 | 
--------------------------------------------------------------------------------
/bench/smem/STS/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python -m turingas.main -i sts32.sass -o sts32.cubin
3 | 	python -m turingas.main -i sts64.sass -o sts64.cubin
4 | 	python -m turingas.main -i sts128.sass -o sts128.cubin
5 | 	python -m turingas.main -i sts128_0.sass -o sts128_0.cubin
6 | 	python -m turingas.main -i sts64_2bank_conflict.sass -o sts64_2bank_conflict.cubin
7 | 	python -m turingas.main -i sts64_broadcast.sass -o sts64_broadcast.cubin
8 | 	python -m turingas.main -i sts64_opt3.sass -o sts64_opt3.cubin
9 | 	nvcc -arch=sm_75 main.cu -lcuda -o test
10 | 
11 | clean:
12 | 	rm sts32.cubin sts64.cubin sts128.cubin sts128_0.cubin sts64_2bank_conflict.cubin sts64_broadcast.cubin sts64_opt3.cubin test
13 | 
--------------------------------------------------------------------------------
/bench/smem/STS/main.cu:
--------------------------------------------------------------------------------
1 | #include <cuda.h>    // CUDA driver API: CUmodule, cuModuleLoad, cuLaunchKernel
2 | #include <stdio.h>
3 | #include <string.h>
4 | 
5 | #define ITERS 32768
6 | 
7 | char* concat(const char *s1, const char *s2)
8 | {
9 | 	char *result = (char*)malloc(strlen(s1) + strlen(s2) + 1); // +1 for the null-terminator
10 | 	// in real code you would check for errors in malloc here
11 | 	strcpy(result, s1);
12 | 	strcat(result, s2);
13 | 	return result;
14 | }
15 | 
16 | #define CUDA_SAFE_CALL( call) { \
17 | 	cudaError err = call; \
18 | 	if( cudaSuccess != err) { \
19 | 		fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
20 | 			__FILE__, __LINE__, cudaGetErrorString( err) ); \
21 | 		exit(EXIT_FAILURE); \
22 | } }
23 | 
24 | void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop){
25 | 	CUDA_SAFE_CALL( cudaEventCreate(start) );
26 | 	CUDA_SAFE_CALL( cudaEventCreate(stop) );
27 | 	CUDA_SAFE_CALL( cudaEventRecord(*start, 0) );
28 | }
29 | 
30 | float finalizeEvents(cudaEvent_t start, cudaEvent_t stop){
31 | 	CUDA_SAFE_CALL( cudaGetLastError() );
32 | 	CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );
33 | 	CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
34 | 	float kernel_time;
35 | 	CUDA_SAFE_CALL( cudaEventElapsedTime(&kernel_time, start, stop) );
36 | 	CUDA_SAFE_CALL( cudaEventDestroy(start) );
37 | 	CUDA_SAFE_CALL( cudaEventDestroy(stop) );
38 | 	return kernel_time;
39 | }
40 | 
41 | template <typename T>
42 | void run(char * name, T scal, int type_size, int threads){
43 | 	char * file_name = concat(name, ".cubin");
44 | 
45 | 	int *output;
46 | 	cudaMalloc((void**)&output, sizeof(int)*32);
47 | 	cudaMemset(output, 0, 32*sizeof(int));
48 | 
49 | 	CUmodule module;
50 | 	CUfunction kernel;
51 | 
52 | 	cuModuleLoad(&module, file_name);
53 | 	cuModuleGetFunction(&kernel, module, "kern");
54 | 
55 | 	int blk_size = 32;
56 | 	int total_blks = 1;//size/blk_size;
57 | 	int sh_mem_size = blk_size*sizeof(float)*type_size;
58 | 	void * args[2] = {&scal, &output};
59 | 
60 | 	//cudaEvent_t start, stop;
61 | 	//initializeEvents(&start, &stop);
62 | 	cuLaunchKernel(kernel, total_blks, 1, 1,
63 | 		blk_size, 1, 1,
64 | 		sh_mem_size, 0, args, 0);
65 | 	//float krn_time_shmem_32b = finalizeEvents(start, stop);
66 | 
67 | 	int *output_h = (int*)malloc(sizeof(int)*32);
68 | 
69 | 	cudaMemcpy(output_h, output, sizeof(int)*32, cudaMemcpyDeviceToHost);
70 | 
71 | 	/*for(int i=0; i<32; i++){
72 | 		printf("%d ", output_h[i]);
73 | 	}printf("\n");*/
74 | 
75 | 	printf("%s took %d clocks.\n", name, output_h[0]);
76 | 	double clocks_instr = (float)output_h[0]/(128.0*128.0); // workload of a thread
77 | 	printf("Each instruction takes %.2f clocks.\n", clocks_instr);
78 | 	printf("Throughput %.2f bytes/cycle.\n\n", ((double)threads*128*128*type_size*4)/output_h[0]); // bytes stored by the participating threads / elapsed clocks
79 | 
80 | 	cudaFree(output);
81 | 	free(output_h);
82 | }
83 | 
84 | int main(){
85 | 	float scal = 4;
86 | 	run("sts32", scal, 1, 32);
87 | 	printf("\n");
88 | 	float2 scal2;
89 | 	scal2.x = 4; scal2.y = 4;
90 | 	run("sts64", scal2, 2, 32);
91 | 	printf("\n");
92 | 	float4 scal4;
93 | 	scal4.x = 4; scal4.y = 4;
94 | 	scal4.z = 4; scal4.w = 4;
95 | 	// No thread-divergence
96 | 	run("sts128_0", scal4, 4, 32);
97 | 
98 | 	printf("\n");
99 | 	// Only half of the threads store data
100 | 	run("sts128", scal4, 4, 16);
101 | 
102 | 	/* run2_aux("sts64_2bank_conflict");
103 | 	printf("\n");
104 | 	run2_aux("sts64_broadcast");
105 | 	printf("\n");
106 | 	run2_aux("sts64_opt3");
107 | 	printf("\n"); */
108 | 
109 | 	return 0;
110 | }
111 | 
--------------------------------------------------------------------------------
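cuModuleLoad, cuModuleGetFunction, and cuLaunchKernel return CUresult codes that both harnesses discard; if a .cubin is missing (TuringAs was not run) or targets the wrong SM, the printed clock counts are garbage. A minimal checking sketch one could wrap around those calls follows; CU_CHECK is a name invented here, not repo code:

```
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper (not in the repo): abort with a readable message when
// a CUDA driver API call fails, e.g. a missing or mis-targeted .cubin.
#define CU_CHECK(call)                                            \
    do {                                                          \
        CUresult res_ = (call);                                   \
        if (res_ != CUDA_SUCCESS) {                               \
            const char *msg_ = "unknown error";                   \
            cuGetErrorString(res_, &msg_);                        \
            fprintf(stderr, "%s:%d: %s -> %s\n",                  \
                    __FILE__, __LINE__, #call, msg_);             \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Usage inside run():
//   CU_CHECK(cuModuleLoad(&module, file_name));
//   CU_CHECK(cuModuleGetFunction(&kernel, module, "kern"));
```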
/bench/smem/STS/sts.sass:
--------------------------------------------------------------------------------
1 | 
2 | scal, 8
3 | output, 8
4 | 
5 | 
6 | 
7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp
8 | 
9 | 
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 02:-:-:-:6 SHF.L offset, tid, 6, RZ;
14 | --:-:-:-:2 MOV tmp, scal;
15 | --:-:-:-:5 MOV iter, RZ;
16 | 
17 | --:-:-:-:1 CS2R start, SR_CLOCKLO;
18 | 
19 | LOOP:
20 | 
21 | out = []
22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;')
23 | for i in range(1024):
24 |     if i == 64:
25 |         out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;')
26 |     out.append(f'--:-:-:-:1 STS.128 [offset], tmp;')
27 | out.append('--:-:-:-:2 @P0 BRA LOOP;')
28 | out_ = '\n'.join(out) + '\n'
29 | 
30 | 
31 | --:-:-:-:5 CS2R end, SR_CLOCKLO;
32 | --:-:-:-:5 IADD3 end, end, -start, RZ;
33 | --:-:-:-:2 STG.E.GPU [output0], end;
34 | 
35 | 
36 | 
37 | 
38 | --:-:-:-:2 EXIT;
39 | 
--------------------------------------------------------------------------------
/bench/smem/STS/sts128.sass:
--------------------------------------------------------------------------------
1 | 
2 | scal, 8
3 | output, 8
4 | 
5 | 
6 | 
7 | 0-18 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux
8 | 
9 | 
10 | --:-:1:-:5 S2R tid, SR_TID.X;
11 | --:-:-:-:2 MOV output0, output[0];
12 | --:-:-:-:5 MOV output1, output[1];
13 | 
14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ;
15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ;
16 | 04:-:1:-:6 IMAD aux, aux, -1, tid;
17 | 
18 | --:-:-:-:1 ISETP.GT.AND P0, PT, aux, 8, PT;
19 | 02:-:-:-:6 SHF.L
offset, tid, 4, RZ; 20 | --:-:-:-:2 MOV tmp, scal; 21 | --:-:-:-:5 MOV iter, RZ; 22 | 23 | --:-:-:-:2 @P0 EXIT; 24 | 25 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 26 | 27 | LOOP: 28 | 29 | out = [] 30 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 31 | for i in range(128): 32 | if i == 64: 33 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 34 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;') 35 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 36 | out_ = '\n'.join(out) + '\n' 37 | 38 | 39 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 40 | --:-:-:-:5 IADD3 end, end, -start, RZ; 41 | --:-:-:-:2 STG.E.GPU [output0], end; 42 | 43 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 44 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 45 | #--:-:-:-:2 STG.E.GPU [output0], iter; 46 | 47 | 48 | 49 | --:-:-:-:2 EXIT; 50 | -------------------------------------------------------------------------------- /bench/smem/STS/sts128_0.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 4, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.128 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | 37 | 38 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /bench/smem/STS/sts32.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 2, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.32 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | --:-:-:-:2 EXIT; 37 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-17 ~ output0, output1, tid, offset, target, start, end, iter, tmp 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 02:-:-:-:6 SHF.L offset, tid, 3, RZ; 14 | --:-:-:-:2 MOV tmp, scal; 15 | --:-:-:-:5 MOV iter, RZ; 16 | 17 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 18 | 19 | 
LOOP: 20 | 21 | out = [] 22 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 23 | for i in range(128): 24 | if i == 64: 25 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 26 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 27 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 28 | out_ = '\n'.join(out) + '\n' 29 | 30 | 31 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 32 | --:-:-:-:5 IADD3 end, end, -start, RZ; 33 | --:-:-:-:2 STG.E.GPU [output0], end; 34 | 35 | 36 | 37 | 38 | --:-:-:-:2 EXIT; 39 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_2bank_conflict.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 04:-:1:-:6 IMAD aux2, tid, 2, RZ; 15 | 16 | 02:-:-:-:6 SHF.R aux, aux2, 5, RZ; 17 | 02:-:-:-:6 SHF.L aux, aux, 5, RZ; 18 | 04:-:1:-:6 IMAD aux, aux, -1, aux2; 19 | 20 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 21 | 04:-:1:-:6 IMAD aux, aux2, 32, aux; #tid/16*tid%16 22 | 23 | --:-:-:-:2 MOV tmp, scal; 24 | --:-:-:-:5 MOV iter, RZ; 25 | 26 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ; 27 | 28 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 29 | 30 | LOOP: 31 | 32 | out = [] 33 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 34 | for i in range(128): 35 | if i == 64: 36 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 37 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 38 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 39 | out_ = '\n'.join(out) + '\n' 40 | 41 | 42 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 43 | --:-:-:-:5 IADD3 end, end, -start, RZ; 44 | --:-:-:-:2 STG.E.GPU [output0], end; 45 | 46 | #--:-:-:-:5 MOV aux2, aux; 47 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 48 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 49 | #--:-:-:-:2 STG.E.GPU [output0], aux2; 50 | 51 | --:-:-:-:2 EXIT; 52 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_broadcast.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ; 15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ; 16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16 17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT; 18 | 19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*tid%16 21 | 22 | --:-:-:-:2 MOV tmp, scal; 23 | --:-:-:-:5 MOV iter, RZ; 24 | 25 | --:-:-:-:2 @P0 BRA JMP; 26 | --:-:-:-:5 IADD3 aux, aux, -8, RZ; 27 | 28 | JMP: 29 | 02:-:-:-:6 SHF.L offset, aux, 3, RZ; 30 | 31 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 32 | 33 | LOOP: 34 | 35 | out = [] 36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 37 | for i in range(128): 38 | if i == 64: 39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 41 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 42 | out_ = '\n'.join(out) + '\n' 43 | 44 | 45 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 46 | --:-:-:-:5 IADD3 end, end, -start, RZ; 47 | --:-:-:-:2 STG.E.GPU [output0], end; 48 | 49 | #--:-:-:-:5 MOV aux2, aux; 50 | #02:-:-:-:6 SHF.L aux, 
tid, 2, RZ; 51 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 52 | #--:-:-:-:2 STG.E.GPU [output0], aux; 53 | 54 | 55 | --:-:-:-:2 EXIT; 56 | -------------------------------------------------------------------------------- /bench/smem/STS/sts64_opt3.sass: -------------------------------------------------------------------------------- 1 | 2 | scal, 8 3 | output, 8 4 | 5 | 6 | 7 | 0-19 ~ output0, output1, tid, offset, target, start, end, iter, tmp, aux, aux2 8 | 9 | 10 | --:-:1:-:5 S2R tid, SR_TID.X; 11 | --:-:-:-:2 MOV output0, output[0]; 12 | --:-:-:-:5 MOV output1, output[1]; 13 | 14 | 02:-:-:-:6 SHF.R aux, tid, 4, RZ; 15 | 02:-:-:-:6 SHF.L aux, aux, 4, RZ; 16 | 04:-:1:-:6 IMAD aux, aux, -1, tid; #tid%16 17 | --:-:-:-:1 ISETP.LT.AND P0, PT, aux, 8, PT; 18 | 19 | 02:-:-:-:6 SHF.R aux2, tid, 4, RZ; 20 | 04:-:1:-:6 IMAD aux2, aux2, 8, aux; #tid/16*tid%16 21 | 22 | --:-:-:-:2 MOV tmp, scal; 23 | --:-:-:-:5 MOV iter, RZ; 24 | 25 | --:-:-:-:2 @P0 BRA JMP; 26 | --:-:-:-:5 IADD3 aux2, aux2, 8, RZ; 27 | 28 | JMP: 29 | 02:-:-:-:6 SHF.L offset, aux2, 3, RZ; 30 | 31 | --:-:-:-:1 CS2R start, SR_CLOCKLO; 32 | 33 | LOOP: 34 | 35 | out = [] 36 | out.append('--:-:-:-:1 IADD3 iter, iter, 1, RZ;') 37 | for i in range(128): 38 | if i == 64: 39 | out.append(f'--:-:-:-:1 ISETP.LT.AND P0, PT, iter, 128, PT;') 40 | out.append(f'--:-:-:-:1 STS.64 [offset], tmp;') 41 | out.append('--:-:-:-:2 @P0 BRA LOOP;') 42 | out_ = '\n'.join(out) + '\n' 43 | 44 | 45 | --:-:-:-:5 CS2R end, SR_CLOCKLO; 46 | --:-:-:-:5 IADD3 end, end, -start, RZ; 47 | --:-:-:-:2 STG.E.GPU [output0], end; 48 | 49 | #02:-:-:-:6 SHF.L aux, tid, 2, RZ; 50 | #--:-:-:-:5 IADD3 output0, output0, aux, RZ; 51 | ##--:-:-:-:2 STG.E.GPU [output0], aux2; 52 | #--:-:-:-:2 STG.E.GPU [output0], aux2; 53 | 54 | 55 | --:-:-:-:2 EXIT; 56 | -------------------------------------------------------------------------------- /src/FX_m2.cu: -------------------------------------------------------------------------------- 1 | 2 | // Copyright 2021 Roberto Lopez Castro 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
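// The f_row*/f_col* helpers below implement, row by row and column by
// column, the F(2x2, 3x3) Winograd filter-transform matrix
//
//         [  1     0     0  ]
//     G = [ 1/2   1/2   1/2 ]
//         [ 1/2  -1/2   1/2 ]
//         [  0     0     1  ]
//
// so FX computes G w G^T for each 3x3 filter w, producing a 4x4 tile
// (e.g. f_row2 evaluates row 2 of G against column j of w).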
15 | 16 | 17 | #ifndef _FX_ 18 | #define _FX_ 19 | extern "C" 20 | { 21 | 22 | // Set of functions per row in Gw product 23 | __device__ float f_row1(float *Gw, int j){ 24 | return Gw[j]; 25 | } 26 | __device__ float f_row2(float *Gw, int j){ 27 | return 0.5*(Gw[j] + Gw[6+j] + Gw[3+j]); 28 | } 29 | __device__ float f_row3(float *Gw, int j){ 30 | return 0.5*(Gw[j] + Gw[6+j] - Gw[3+j]); 31 | } 32 | __device__ float f_row4(float *Gw, int j){ 33 | return Gw[6+j]; 34 | } 35 | // Set of functions per column in GwGt product 36 | __device__ float f_col1(float *Gw, int j){ 37 | return Gw[j]; 38 | } 39 | __device__ float f_col2(float *Gw, int j){ 40 | return 0.5*(Gw[j] + Gw[j+2] + Gw[j+1]); 41 | } 42 | __device__ float f_col3(float *Gw, int j){ 43 | return 0.5*(Gw[j] + Gw[j+2] - Gw[j+1]); 44 | } 45 | __device__ float f_col4(float *Gw, int j){ 46 | return Gw[j+2]; 47 | } 48 | 49 | typedef float(*pointFunction_t)(float *, int); 50 | 51 | __global__ void FX(float *pInputs, float *pOutputs, int filt_k, 52 | int filt_c, int filt_h, int filt_w, int alpha){ 53 | int Inx = threadIdx.x, Iny = threadIdx.y; 54 | int TileX = blockIdx.x, TileY = blockIdx.y; 55 | 56 | int c_glb_offset = filt_k*filt_h*filt_w; 57 | int c_kernel = TileY*BC*c_glb_offset + TileX*BK + Iny*c_glb_offset + Inx; 58 | int c_glb_offset_s = filt_k*4*4; 59 | int c_kernel_s = TileY*BC*c_glb_offset_s + TileX*BK + Iny*c_glb_offset_s + Inx; 60 | 61 | float Gw[21]; //9+12. In registers 62 | float *Gw_buffer = Gw+9; 63 | 64 | pointFunction_t func1[4] = {f_row1, f_row2, f_row3, f_row4}; 65 | pointFunction_t func2[4] = {f_col1, f_col2, f_col3, f_col4}; 66 | 67 | for(int bk=0; bk>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 273 | 274 | #ifdef OPTSTS64_CMP 275 | smem_size = 65536; // 64 KB 276 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); 277 | #endif 278 | 279 | Winograd_kernel<<>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w); 280 | 281 | return cudaGetLastError(); 282 | } 283 | 284 | } 285 | #endif 286 | -------------------------------------------------------------------------------- /src/ampere/convolutionForward_32x64x8_baseline.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
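// This kernel follows the canonical F(2x2, 3x3) pipeline:
// load_and_transform_input_tile applies the input transform B^T d B, with
//
//           [ 1   0  -1   0 ]
//     B^T = [ 0   1   1   0 ]
//           [ 0  -1   1   0 ]
//           [ 0   1   0  -1 ]
//
// applied along both dimensions of the 4x4 input tile d; the outer-product
// stage multiplies transformed input and filter tiles element-wise, and the
// included store_and_transform_output_*.cuh applies the output transform.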
14 | 15 | 16 | #include "../FX_m2.cu" 17 | #include "store_and_transform_output_baseline.cuh" 18 | #include "../outer_product.cuh" 19 | 20 | #ifdef _noWALL_ 21 | typedef struct rusage resnfo; 22 | typedef struct _timenfo { 23 | double time; 24 | double systime; 25 | } timenfo; 26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 28 | t.time + t.systime, t.time, t.systime); 29 | #else 30 | typedef struct timeval resnfo; 31 | typedef double timenfo; 32 | #define timestamp(sample) gettimeofday((sample), 0) 33 | #define printtime(t) printf("%15f s ", t); 34 | #endif 35 | 36 | #ifndef _WINOGRAD_ 37 | #define _WINOGRAD_ 38 | extern "C" 39 | { 40 | 41 | 42 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 43 | 44 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, 45 | int tiles_dim, int in_c, int in_n, int tile_size, 46 | int tiles_2d_dim, int tile_2d_s){ 47 | 48 | float workspace[3]; 49 | 50 | #pragma unroll 51 | for(int j=0; j<4; j++){ 52 | workspace[0] = Btd[j]; 53 | workspace[1] = Btd[j+4]; 54 | workspace[2] = Btd[j+8]; 55 | 56 | Btd[j] = workspace[0] - workspace[2]; 57 | Btd[j+4] = workspace[1] + workspace[2]; 58 | Btd[j+8] = workspace[2] - workspace[1]; 59 | Btd[j+12] = workspace[1] - Btd[j+12]; 60 | } 61 | 62 | int c_offset = BN*BC; 63 | int c_tensor = threadIdx.y*BN + threadIdx.x; 64 | 65 | #pragma unroll 66 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 67 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 68 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 69 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 70 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 71 | } 72 | 73 | } 74 | 75 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs, 76 | int filt_c, int filt_k){ 77 | 78 | int c_tensor_s = threadIdx.y*BK + threadIdx.x; 79 | int c_offset_s = BK*BC; 80 | 81 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 82 | for(int i=0; i<4; i++){ 83 | #pragma unroll 84 | for(int j=0; j<4; j++){ 85 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 86 | } 87 | } 88 | 89 | c_tensor_s += BN; 90 | } 91 | 92 | } 93 | 94 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k){ 95 | 96 | int c_tensor = blockIdx.z*BK + (threadIdx.y*filt_k<<4) + threadIdx.x; // Iny*filt_k*4*4 97 | 98 | int acumm; 99 | #pragma unroll 100 | for(int i=0; i<4; i++){ 101 | acumm = (i*filt_k<<2); 102 | #pragma unroll 103 | for(int j=0; j<4; j++){ 104 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 105 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 106 | } 107 | } 108 | } 109 | 110 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int tiles_dim, short mask){ 111 | 112 | int c_tensor = (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*in_w*2 + blockIdx.x*BN + threadIdx.y*(in_n*in_h*in_w) + (threadIdx.x/in_n)*2*in_n + (threadIdx.x%in_n) - (in_n*in_w+in_n); 113 | int acumm,x; 114 | //short x1,x2; 115 | 116 | if(mask==0xFFFF){ 117 | #pragma unroll 118 | for(int i=0; i<4; i++){ 119 | acumm = i*in_n*in_w; 120 | #pragma unroll 121 | for(int j=0; j<4; j++){ 122 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 123 | } 124 | } 125 | 126 | } else { 127 | for(int i=0; i<4; i++){ 128 | 
acumm = i*in_n*in_w; 129 | #pragma unroll 130 | for(int j=0; j<4; j++){ 131 | x = (i<<2) + j; 132 | tile[x] = 0; 133 | if(mask&(1<>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 267 | 268 | Winograd_kernel<<>>(k, Ww, C, 269 | tiles_dim, in_c, in_n, in_h, in_w, tile_size, 270 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, 271 | out_h, out_w); 272 | 273 | return cudaGetLastError(); 274 | } 275 | 276 | } 277 | #endif 278 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_baseline.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_PPoPP_ 18 | #define _OUTPUT_KERNEL_PPoPP_ 19 | extern "C" 20 | { 21 | 22 | __device__ void transform_output_tile(float *pOutputs, float *C_tile, float *At, int out_h, int out_w, 23 | int tiles_dim, int round, int in_n, int offset, int out_thread[][4], short mask, int c_tensor, int c_glb_offset){ 24 | 25 | for(int j=0; j<4; j++){ 26 | At[j] = C_tile[j] + C_tile[4+j] + C_tile[8+j]; 27 | At[j+8] = C_tile[j+16] + C_tile[4+j+16] + C_tile[8+j+16]; 28 | 29 | At[4+j] = C_tile[4+j] - C_tile[8+j] - C_tile[12+j]; 30 | At[4+j+8] = C_tile[4+j+16] - C_tile[8+j+16] - C_tile[12+j+16]; 31 | } 32 | 33 | int idx = out_thread[round][threadIdx.y%4] + threadIdx.y/4 + offset; 34 | c_tensor += idx*c_glb_offset; 35 | int x, x1; 36 | 37 | for(int i=0; i<2; i++){ 38 | x = i*4; 39 | //x1 = i*(in_n*(tiles_dim-1) + in_n/2)*2; 40 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2)*2; 41 | if(mask&(1<<(i*2))){ 42 | pOutputs[c_tensor+ x1] = At[x] + At[x+1] + At[x+2]; 43 | pOutputs[c_tensor+2*c_glb_offset+x1] = At[x+8] + At[x+1+8] + At[x+2+8]; 44 | } 45 | 46 | if(mask&(1<<(i*2+1))){ 47 | pOutputs[c_tensor+x1+in_n] = At[x+1] - At[x+2] - At[x+3]; 48 | pOutputs[c_tensor+2*c_glb_offset+x1+in_n] = At[x+1+8] - At[x+2+8] - At[x+3+8]; 49 | } 50 | } 51 | 52 | } 53 | 54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, int out_thread[][4], int access_s_out[][16], short mask){ 55 | 56 | float4 *output_smem = (float4 *) shared_mem; 57 | float4 *accumulator = (float4 *) acumm_smem; 58 | 59 | float *C_tile = (float*) input_frag_mem; 60 | float *At = (float*) filter_frag_mem; 61 | 62 | mask = 0x000F; 63 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 64 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 65 | 66 | // output transpose step 67 | int t=0; 68 | int acumm1, acumm2; 69 | 70 | acumm1 = access_s_out[0][threadIdx.x%8 + (threadIdx.x/16)*8]; 71 | acumm2 = access_s_out[1][threadIdx.x%8 + (threadIdx.x/16)*8]; 72 | 73 | int offset = BN_p*4; 74 | int init = (threadIdx.y/4)*BN_p*16*4 + (threadIdx.y%4)*40 + threadIdx.x; 
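// init is this thread's base element in the staging buffer: BN_p (see
// config.hpp) is the padded row pitch, presumably chosen so the 16 strided
// reads below, at stride offset = BN_p*4 floats, avoid shared-memory bank
// conflicts.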
75 | int acumm3 = threadIdx.y * BN_p; 76 | int acumm4 = BN_p*8*2; 77 | 78 | int idx = acumm3; 79 | int idx2 = idx + BN_p*8; 80 | 81 | float* out = (float *) output_smem; 82 | 83 | int c_glb_offset = in_n*out_h*out_w; 84 | int c_tensor = blockIdx.z*in_n*out_h*out_w*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + threadIdx.x; 85 | 86 | //#pragma unroll 87 | for(int round=0; round<4; round++){ 88 | 89 | //transformation step 90 | if ( ((!round || round==1) && (threadIdx.x&15)<8) || ((round==2 || round==3) && (threadIdx.x&15)>7) ){ 91 | 92 | #pragma unroll 93 | for(int i=0; i<4; i+=2){ 94 | 95 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0 96 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1); 97 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1 98 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t); 99 | 100 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t); 101 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t); 102 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t); 103 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t); 104 | 105 | t+=4; 106 | } 107 | } 108 | __syncthreads(); 109 | 110 | for(int i=0; i<16; i++){ 111 | C_tile[i] = out[init + i*offset]; 112 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset]; 113 | } 114 | 115 | // transform output tiles 116 | transform_output_tile(C, C_tile, At, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset); 117 | 118 | 119 | __syncthreads(); 120 | } 121 | 122 | } 123 | 124 | } 125 | #endif 126 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optLDS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
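// All store_and_transform_output_* variants finish with the F(2x2, 3x3)
// output transform A^T X A, where
//
//     A^T = [ 1   1   1   0 ]
//           [ 0   1  -1  -1 ]
//
// transform_output_tile below evaluates it on two output channels at once
// (the .x/.y lanes of each float2); the variants differ in how the 4x4
// accumulator tiles are staged through shared memory first.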
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT3_ 18 | #define _OUTPUT_KERNEL_OPT3_ 19 | extern "C" 20 | { 21 | 22 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 23 | { 24 | 25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2; 26 | int x, x1; 27 | 28 | for(int j=0; j<4; j++){ 29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 31 | 32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 34 | } 35 | 36 | for(int i=0; i<2; i++){ 37 | x = i*4; 38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 39 | if(mask&(1<<(i*2))){ 40 | 41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 43 | } 44 | if(mask&(1<<(i*2+1))){ 45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 47 | } 48 | } 49 | 50 | } 51 | 52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, 53 | float *C, int out_h, int out_w, 54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[threadIdx.x]; //* 4 72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*8*2 ; //*4 76 | int idx = threadIdx.y * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p; //*2/2 81 | int init = (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2); 82 | init += (threadIdx.x/16)*8 + ((threadIdx.x/8)%2)*16 + (threadIdx.x%8); //40=(8+2)*4, 4 blocks/buffer 83 | 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + ((threadIdx.x/8)%2)*2 + (threadIdx.x%8)*2*2 + ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 87 | c_tensor/=2; 88 | 89 | #pragma unroll 90 | for(int round=0; round<4; round++){ 91 | 92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 96 | 97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 101 | 102 | *( (float2*) (output_smem + idx + 
acumm4 + acumm1) ) = *(accumulator+t+4); 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 106 | 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 111 | 112 | t+=8; 113 | 114 | __syncthreads(); 115 | 116 | 117 | for(int i=0; i<16; i++){ 118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4 119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4 120 | } 121 | 122 | // transform output tiles 123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w); 124 | 125 | __syncthreads(); 126 | 127 | } 128 | } 129 | 130 | } 131 | #endif 132 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optSTS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
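// optSTS64: accumulators are drained to shared memory as 64-bit (float2)
// stores, with the acumm1/acumm2 swizzle computed in store_output_tile
// apparently arranged so the stores issue without bank conflicts (the
// pattern measured by bench/smem/STS/sts64*.sass); four barrier-separated
// rounds then read the staged tiles back for the output transform.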
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT1_ 18 | #define _OUTPUT_KERNEL_OPT1_ 19 | extern "C" 20 | { 21 | 22 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 23 | { 24 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 25 | int x, x1; 26 | 27 | #pragma unroll 28 | for(int j=0; j<4; j++){ 29 | 30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 32 | 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | 38 | #pragma unroll 39 | for(int i=0; i<2; i++){ 40 | x = i*4; 41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 42 | if(mask&(1<<(i*2))){ 43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 45 | } 46 | 47 | if(mask&(1<<(i*2+1))){ 48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 50 | } 51 | } 52 | } 53 | 54 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[Inx]; //* 4 72 | acumm1 = ((threadIdx.x%8)/2)*34 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*16 ; //*4 76 | int idx = threadIdx.y * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p *2; //*2/2 81 | int init = ( (threadIdx.y/4)*BN_p*16 + (threadIdx.y%4)*(32+2) ) *2 + threadIdx.x; 82 | 83 | int c_glb_offset = in_n*out_h*out_w; 84 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+ 85 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 86 | c_tensor/=2; 87 | 88 | #pragma unroll 89 | for(int round=0; round<4; round++){ 90 | 91 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 92 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 93 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 94 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 95 | 96 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 97 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 98 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 99 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 100 | 101 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 102 | *( (float2*) (output_smem + 
idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 103 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 105 | 106 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 110 | 111 | t+=8; 112 | 113 | __syncthreads(); 114 | 115 | 116 | for(int i=0; i<16; i++){ 117 | C_tile[i].x = shared_mem[i*offset + init]; 118 | C_tile[i].y = shared_mem[i*offset + init + 32]; 119 | } 120 | 121 | // transform output tiles 122 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w); 123 | __syncthreads(); 124 | } 125 | } 126 | 127 | } 128 | #endif 129 | -------------------------------------------------------------------------------- /src/ampere/store_and_transform_output_optSTS64_compact.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "../config.hpp" 16 | 17 | #ifndef _OUTPUT_KERNEL_OPT1_ 18 | #define _OUTPUT_KERNEL_OPT1_ 19 | extern "C" 20 | { 21 | 22 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 23 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, 24 | short mask, int out_w){ 25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 26 | int x, x1; 27 | 28 | 29 | #pragma unroll 30 | for(int j=0; j<4; j++){ 31 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 32 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | x = in_n/2; 38 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x; 39 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y; 40 | 41 | if(mask&0x2){ 42 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x; 43 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y; 44 | } 45 | 46 | //x1 = in_n*(tiles_dim-1) + x; 47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x; 48 | if(mask&0x4){ 49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x; 50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y; 51 | } 52 | 53 | if(mask&0x8){ 54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x; 55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y; 56 | } 57 | } 58 | 59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, float *C, int out_h, int out_w, int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 60 | 61 | float2 *output_smem = (float2 *) shared_mem; 62 | float2 *accumulator = (float2 *) acumm_smem; 63 | float2 *C_out = (float2*)C; 64 | 65 | float2 *C_tile = (float2*) input_frag_mem; 66 | float2 *At = (float2*) filter_frag_mem; 67 | 68 | mask = 0x000F; 69 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 70 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 71 | 72 | // output transpose step 73 | int t,j; 74 | int acumm1, acumm2; 75 | // For transposing 76 | t = threadIdx.x%8/2; 77 | acumm1 = t*18 + threadIdx.x%2 + (threadIdx.x/16)*2 + ((threadIdx.x/8)%2)*8; 78 | acumm2 = acumm1+4; 79 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16; 80 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16; 81 | t=0; 82 | 83 | int acumm4 = BN_p*16 ; //*4 84 | int idx = threadIdx.y * BN_p; 85 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 86 | 87 | // For transformating 88 | int offset = BN_p *2; //*2/2 89 | 90 | int init = (threadIdx.y%4)*(16+2)*2 + threadIdx.x; 91 | init = init - init/((threadIdx.y%4+1)*32)*32 + threadIdx.y%4*32; 92 | init += (threadIdx.y/4)*BN_p*16*2; 93 | 94 | int c_glb_offset = in_n*out_h*out_w; 95 | int c_tensor = blockIdx.z*c_glb_offset*BK + (blockIdx.y%tiles_dim)*in_n*2 + (blockIdx.y/tiles_dim)*in_n*out_w*2 + blockIdx.x*BN + (threadIdx.x%16)*2+ 96 | ((threadIdx.x/16)*16 + (threadIdx.y%4)*4 + threadIdx.y/4)*c_glb_offset; 97 | c_tensor/=2; 98 | 99 | // k=0, block 0 100 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator); 101 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1); 102 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2); 103 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3); 104 | 105 | // K=1, block 0 106 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4); 107 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5); 108 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6); 109 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7); 110 | 111 | // k=0, block 1 112 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32); 113 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33); 114 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34); 115 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35); 116 | 117 | // K=1, block 1 118 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36); 119 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37); 120 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38); 121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39); 122 | 123 | j=0; t+=8; 124 | 125 | #pragma unroll 126 | for(int round=0; round<3; round++){ 127 | 128 | __syncthreads(); 129 | 130 | int disp = j/2*(BN_p*2*16)*2; 131 | #pragma unroll 132 | for(int i=0; i<16; i++){ 133 | C_tile[i].x = shared_mem[disp + i*offset + init]; 134 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 135 | } 136 | 137 | // transform output tiles 138 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 139 | 140 | j = 2 - j; //switch between 0 and 2 141 | 142 | // k=0, block 0 143 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t); 144 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1); 145 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2); 146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3); 147 | 148 | // K=1, block 0 149 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4); 150 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5); 151 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6); 152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7); 153 | 154 | // k=0, block 1 155 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32); 156 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33); 157 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34); 158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35); 159 | 160 | // K=1, block 1 161 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36); 162 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37); 163 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38); 164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39); 165 | 166 | t+=8; 167 | 168 | } 169 | 170 | __syncthreads(); 171 | 172 | int disp = j/2*(BN_p*2*16)*2; 173 | #pragma unroll 174 | for(int i=0; i<16; i++){ 175 | C_tile[i].x = shared_mem[disp + i*offset + init]; 176 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 177 | } 178 | // transform output tiles 179 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 180 | } 181 | 182 | } 183 | #endif 184 | -------------------------------------------------------------------------------- /src/config.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | #ifndef COMMON_INCLUDE_FILE 17 | #define COMMON_INCLUDE_FILE 18 | 19 | #define BC 8 20 | #define BN 32 21 | #define BK 64 22 | ///////////////////// For Non-Fused version 23 | #define BC_GEMM 8 24 | #define BN_GEMM 128 25 | #define BK_GEMM 128 26 | ///////////////////// For Non-Fused version 27 | 28 | #ifdef OPTSTS64_CMP 29 | #define BN_p 128 30 | #elif BASE 31 | #define BN_p 40 32 | #else 33 | #define BN_p 138 34 | #endif 35 | 36 | #define N 128 // values: 32,64,96,128 37 | #define C_in 256 // values: 64,128,256,512 38 | #define W 14 // values: 56,28,14,7 39 | 40 | #define K 256 // values: 64,128,256,512 41 | #define R 3 // values: 3 42 | 43 | #define PAD_H 1 44 | #define PAD_W 1 45 | #define STR_H 1 46 | #define STR_W 1 47 | #define DIL_H 1 48 | #define DIL_W 1 49 | 50 | #define M 2 // values: 2 51 | 52 | __constant__ int access_f_s[2][32]; 53 | __constant__ int access_s[2][32]; 54 | #ifndef BASE 55 | __constant__ int access_s_out[32]; 56 | __constant__ int out_thread[2][4][4]; 57 | __constant__ int out_sgemm[32]; 58 | __constant__ int exhange[32]; 59 | #else 60 | __constant__ int access_s_out[2][16]; 61 | __constant__ int out_thread[4][4]; 62 | #endif 63 | 64 | 65 | // access_f_s 66 | const int aux[2][32] = { 67 | {0,0,1,1,2,2,3,3,4,4,5,5,6,6, 68 | 7,7,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7}, 69 | {8,8,9,9,10,10,11,11,12,12,13,13, 70 | 14,14,15,15,8,8,9,9,10,10,11,11,12,12, 71 | 13,13,14,14,15,15} 72 | }; 73 | // access_s 74 | const int aux2[2][32] = { 75 | {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,2, 76 | 3,2,3,2,3,2,3,2,3,2,3,2,3,2,3}, 77 | {4,5,4,5,4,5,4,5,4,5,4, 78 | 5,4,5,4,5,6,7,6,7,6,7,6,7, 79 | 6,7,6,7,6,7,6,7} 80 | }; 81 | 82 | #ifndef BASE 83 | // access_s_out 84 | const int aux3[32] = { 85 | 0,1,34,35,68,69,102,103, // first quarter 86 | 8,9,42,43,76,77,110,111, // second quarter 87 | 2,3,36,37,70,71,104,105, // third quarter 88 | 10,11,44,45,78,79,112,113 // fourth quarter 89 | }; 90 | // out_thread 91 | const int aux4[2][4][4] = { {{0,4,8,12}, {2,6,10,14}, 92 | {32,36,40,44}, {34,38,42,46} }, 93 | {{16,20,24,28}, {18,22,26,30}, 94 | {48,52,56,60}, {50,54,58,62}}}; 95 | // out_sgemm 96 | const int aux5[32] = { 0,1,8,9,16,17,24,25, 97 | 32,33,40,41,48,49,56,57, 98 | 2,3,10,11,18,19,26,27, 99 | 34,35,42,43,50,51,58,59 100 | }; 101 | // exhange 102 | const int aux6[32] = { 103 | 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13, 104 | 18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29 105 | }; 106 | 107 | #else 108 | const int aux3[2][16] = { 109 | {0,1,10,11,20,21,30,31, 2,3,12,13,22,23,32,33}, 110 | {4,5,14,15,24,25,34,35, 6,7,16,17,26,27,36,37} 111 | }; 112 | const int aux4[4][4] = { {0,4,8,12}, {32,36,40,44}, {16,20,24,28}, {48,52,56,60} }; 113 | #endif 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- 
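A quick worked example of what these macros imply for the default configuration (a hedged sketch; the snippet is illustrative and not part of the repository): with M=2 and R=3 the Winograd tile is alpha = M+R-1 = 4, a W=14 feature map yields 7 output tiles per spatial dimension, and the fused kernel's shared-memory footprint follows from BN/BK/BC as 16*BC*BN floats of transformed input plus 16*BC*BK floats of transformed filters (16 KiB + 32 KiB for the values above).

    // sizes_sketch.cu -- illustrative only; constants mirror config.hpp defaults
    #include <cstdio>
    int main() {
      const int m = 2, r = 3;
      const int alpha = m + r - 1;                 // 4x4 input tile, 2x2 output tile
      const int W = 14, BN = 32, BK = 64, BC = 8;
      const int tiles_dim = (W + m - 1) / m;       // 7 tiles per spatial dimension
      const size_t in_smem   = (size_t)alpha * alpha * BC * BN * sizeof(float);
      const size_t filt_smem = (size_t)alpha * alpha * BC * BK * sizeof(float);
      printf("alpha=%d tiles_dim=%d smem=%zu+%zu B\n", alpha, tiles_dim, in_smem, filt_smem);
      return 0;
    }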
/src/convolutionForward_32x64x8.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | #include "FX_m2.cu" 17 | 18 | #ifdef OPTLDS64 19 | #include "store_and_transform_output_optLDS64.cuh" 20 | #include "outer_product.cuh" 21 | #elif OPTSTS64_CMP 22 | #include "store_and_transform_output_optSTS64_compact.cuh" 23 | #include "outer_product_suffle.cuh" 24 | #else 25 | #include "store_and_transform_output_optSTS64.cuh" 26 | #include "outer_product_suffle.cuh" 27 | #endif 28 | 29 | #ifdef _noWALL_ 30 | typedef struct rusage resnfo; 31 | typedef struct _timenfo { 32 | double time; 33 | double systime; 34 | } timenfo; 35 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 36 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 37 | t.time + t.systime, t.time, t.systime); 38 | #else 39 | typedef struct timeval resnfo; 40 | typedef double timenfo; 41 | #define timestamp(sample) gettimeofday((sample), 0) 42 | #define printtime(t) printf("%15f s ", t); 43 | #endif 44 | 45 | #ifndef _WINOGRAD_ 46 | #define _WINOGRAD_ 47 | extern "C" 48 | { 49 | 50 | 51 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 52 | 53 | __device__ __forceinline__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, 54 | int tiles_dim, int in_c, int in_n, int tile_size, 55 | int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY){ 56 | 57 | float workspace[3]; 58 | 59 | #pragma unroll 60 | for(int j=0; j<4; j++){ 61 | workspace[0] = Btd[j]; 62 | workspace[1] = Btd[j+4]; 63 | workspace[2] = Btd[j+8]; 64 | 65 | Btd[j] = workspace[0] - workspace[2]; 66 | Btd[j+4] = workspace[1] + workspace[2]; 67 | Btd[j+8] = workspace[2] - workspace[1]; 68 | Btd[j+12] = workspace[1] - Btd[j+12]; 69 | } 70 | 71 | int c_offset = BN*BC; 72 | int c_tensor = Iny*BN + Inx; 73 | 74 | #pragma unroll 75 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 76 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 77 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 78 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 79 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 80 | } 81 | 82 | } 83 | 84 | __device__ __forceinline__ void load_filter_tile(float *tiles, float *pOutputs, 85 | int filt_c, int filt_k, int Inx, int Iny){ 86 | 87 | int c_tensor_s = Iny*BK + Inx; 88 | int c_offset_s = BK*BC; 89 | 90 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 91 | for(int i=0; i<4; i++){ 92 | for(int j=0; j<4; j++){ 93 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 94 | } 95 | } 96 | 97 | c_tensor_s += BN; 98 | } 99 | 100 | } 101 | 102 | __device__ __forceinline__ void prefetch_filter_tile(float *pInputs, float *tiles, 103 | int filt_k, int Inx, int Iny, int TileZ){ 104 | 105 | int c_tensor 
= TileZ*BK + (Iny*filt_k<<4) + Inx; 106 | 107 | int acumm; 108 | #pragma unroll 109 | for(int i=0; i<4; i++){ 110 | acumm = (i*filt_k<<2); 111 | for(int j=0; j<4; j++){ 112 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 113 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 114 | } 115 | } 116 | } 117 | 118 | __device__ __forceinline__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, 119 | int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask){ 120 | 121 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n); 122 | int acumm,x; 123 | 124 | if(mask==0xFFFF){ 125 | 126 | for(int i=0; i<4; i++){ 127 | acumm = i*in_n*in_w; 128 | #pragma unroll 129 | for(int j=0; j<4; j++){ 130 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 131 | } 132 | } 133 | 134 | } else { 135 | 136 | for(int i=0; i<4; i++){ 137 | acumm = i*in_n*in_w; 138 | #pragma unroll 139 | for(int j=0; j<4; j++){ 140 | x = (i<<2) + j; 141 | tile[x] = 0; 142 | if(mask&(1<>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 275 | 276 | #ifdef OPTSTS64_CMP 277 | smem_size = 65536; // 64 KB 278 | cudaFuncSetAttribute(Winograd_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); 279 | #endif 280 | 281 | Winograd_kernel<<>>(k, Ww, C, tiles_dim, in_c, in_n, in_h, in_w, tile_size, filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, out_h, out_w); 282 | 283 | return cudaGetLastError(); 284 | } 285 | 286 | } 287 | #endif 288 | -------------------------------------------------------------------------------- /src/convolutionForward_32x64x8_baseline.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
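// Reader's sketch of this file's control flow (inferred from the code below;
// hedged, not authoritative): Winograd_kernel fuses the whole F(2x2, 3x3)
// pipeline. Each thread prefetches one 4x4 input tile and two filter tiles
// that were pre-transformed by the separate FX kernel; the B^T*d*B input
// transform is written to shared memory; a BN x BK outer-product GEMM then
// sweeps the channel dimension in BC-deep slices, double-buffering the
// register fragments; finally store_output_tile applies the A^T*M*A output
// transform and writes the 2x2 results to global memory.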
14 | 15 | 16 | #include "FX_m2.cu" 17 | #include "store_and_transform_output_baseline.cuh" 18 | #include "outer_product.cuh" 19 | 20 | #ifdef _noWALL_ 21 | typedef struct rusage resnfo; 22 | typedef struct _timenfo { 23 | double time; 24 | double systime; 25 | } timenfo; 26 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 27 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 28 | t.time + t.systime, t.time, t.systime); 29 | #else 30 | typedef struct timeval resnfo; 31 | typedef double timenfo; 32 | #define timestamp(sample) gettimeofday((sample), 0) 33 | #define printtime(t) printf("%15f s ", t); 34 | #endif 35 | 36 | #ifndef _WINOGRAD_ 37 | #define _WINOGRAD_ 38 | extern "C" 39 | { 40 | 41 | 42 | #define d(input, i, j) ( input[(i<<2) + (j)] ) 43 | 44 | __device__ void load_and_transform_input_tile(float *Btd, float *pOutputs, int in_h, int in_w, int tiles_dim, int in_c, int in_n, int tile_size, int tiles_2d_dim, int tile_2d_s, int Inx, int Iny, int TileX, int TileY) 45 | { 46 | 47 | float workspace[4]; 48 | 49 | for(int j=0; j<4; j++){ 50 | workspace[0] = Btd[j]; 51 | workspace[1] = Btd[j+4]; 52 | workspace[2] = Btd[j+8]; 53 | workspace[3] = Btd[j+12]; 54 | 55 | Btd[j] = workspace[0] - workspace[2]; 56 | Btd[j+4] = workspace[1] + workspace[2]; 57 | Btd[j+8] = workspace[2] - workspace[1]; 58 | Btd[j+12] = workspace[1] - workspace[3]; 59 | } 60 | 61 | int c_offset = BN*BC; 62 | int c_tensor = Iny*BN + Inx; 63 | 64 | for(int i=0; i<4; i++){ // prefetch 1 input tile/thread 65 | pOutputs[c_tensor+i*c_offset*4] = d(Btd, i, 0) - d(Btd, i, 2); 66 | pOutputs[c_tensor+i*c_offset*4+c_offset] = d(Btd, i, 1) + d(Btd, i, 2); 67 | pOutputs[c_tensor+i*c_offset*4+2*c_offset] = d(Btd, i, 2) - d(Btd, i, 1); 68 | pOutputs[c_tensor+i*c_offset*4+3*c_offset] = d(Btd, i, 1) - d(Btd, i, 3); 69 | } 70 | } 71 | 72 | __device__ void load_filter_tile(float *tiles, float *pOutputs, int filt_c, int filt_k, int Inx, int Iny) 73 | { 74 | 75 | int c_tensor_s = Iny*BK + Inx; 76 | int c_offset_s = BK*BC; 77 | 78 | for(int k=0; k<2; k++){ // prefetch 2 filter tiles/thread 79 | for(int i=0; i<4; i++){ 80 | for(int j=0; j<4; j++){ 81 | pOutputs[c_tensor_s + i*c_offset_s*4 + j*c_offset_s] = tiles[k*16 + i*4 + j]; 82 | } 83 | } 84 | 85 | c_tensor_s += BN; 86 | } 87 | 88 | } 89 | 90 | __device__ void prefetch_filter_tile(float *pInputs, float *tiles, int filt_k, int Inx, int Iny, int TileZ) 91 | { 92 | 93 | int c_tensor = TileZ*BK + (Iny*filt_k<<4) + Inx; 94 | 95 | int acumm; 96 | #pragma unroll 97 | for(int i=0; i<4; i++){ 98 | acumm = (i*filt_k<<2); 99 | for(int j=0; j<4; j++){ 100 | tiles[(i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor]; 101 | tiles[16 + (i<<2) + j] = pInputs[acumm + j*filt_k + c_tensor+BN]; 102 | } 103 | } 104 | } 105 | 106 | __device__ void prefetch_input_tile(float *pInputs, float *tile, int in_h, int in_w, int in_n, int Inx, int Iny, int TileX, int TileY, int tiles_dim, short mask) 107 | { 108 | 109 | int c_tensor = (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*in_w*2 + TileX*BN + Iny*(in_n*in_h*in_w) + (Inx/in_n)*2*in_n + (Inx%in_n) - (in_n*in_w+in_n); 110 | int acumm; 111 | 112 | //#pragma unroll 113 | for(int i=0; i<4; i++){ 114 | acumm = i*in_n*in_w; 115 | for(int j=0; j<4; j++){ 116 | if(mask&(1<<((i<<2) + j))) 117 | tile[(i<<2) + j] = pInputs[acumm + j*in_n + c_tensor]; 118 | else 119 | tile[(i<<2) + j] = 0; 120 | } 121 | } 122 | } 123 | 124 | __device__ void __inline__ prefetch_filter_frag(float4 *filter_frag, float4 *B_frag, int f_frag_offset, int Inx, int offset1, int 
offset2) 125 | { 126 | 127 | *((float4*) (filter_frag)) = *(B_frag + offset1); 128 | *((float4*) (filter_frag + 1)) = *(B_frag + offset2); 129 | 130 | *((float4*) (filter_frag + 2)) = *(B_frag + f_frag_offset + offset1); 131 | *((float4*) (filter_frag + 3)) = *(B_frag + f_frag_offset + offset2); 132 | } 133 | 134 | __device__ void __inline__ prefetch_input_frag(float4* input_frag, float4 *A_frag, int frag_offset, int Inx, int offset1, int offset2) 135 | { 136 | 137 | *((float4*) (input_frag)) = *(A_frag + offset1); //ld_shared(A_frag + offset1); 138 | *((float4*) (input_frag + 1)) = *(A_frag + offset2); 139 | 140 | *((float4*) (input_frag + 2)) = *(A_frag + frag_offset + offset1); 141 | *((float4*) (input_frag + 3)) = *(A_frag + frag_offset + offset2); //3=2+1 142 | } 143 | 144 | __global__ void Winograd_kernel(float *A, float *B, float *C, 145 | int tiles_dim, int in_c, int in_n, int in_h, int in_w, 146 | int tile_size, int filt_k, int filt_c, 147 | int tiles_2d_dim, int out_c, int out_n, 148 | int tile_2d_s, int out_h, int out_w){ 149 | 150 | extern __shared__ float shared_mem[]; 151 | float *input_smem = (float*)shared_mem; 152 | float *filter_smem = (float*)&shared_mem[16*BC*BN]; 153 | 154 | short m = 0xFFFF; 155 | if((blockIdx.y/tiles_dim)==0) m&=0xFFF0; 156 | if((blockIdx.y/tiles_dim)==(tiles_dim-1)) m &= (!(in_w%2))?(0x0FFF):(0x00FF); 157 | if(!((blockIdx.y+1)%tiles_dim)) m &= (!(in_w%2))?(0x7777):(0x3333); 158 | if(!((blockIdx.y)%tiles_dim)) m&=0xeeee; 159 | 160 | float img_tile[16]; // Prefetch input from GMEM 161 | float filter_tile[32]; // Prefetch filter from GMEM 162 | 163 | float4 input_frag_mem[8]; //2*2(2*8/4) Data to do Outer Product + prefetch f. SMEM (double_buffer) 164 | float4 filter_frag_mem[8]; //2*2 Data to do Outer Product + prefetch f. SMEM (double_buffer) 165 | float4 accumulator[2][16] = {0.0f}; // Accumulators 166 | 167 | float4 *A_frag; // Input data pointer 168 | int frag_offset = 2* (BC*BN); // (2=8/4) SMEM input read offset 169 | 170 | float4 *B_frag; // Filter data pointer 171 | int f_frag_offset = 2* (BC*BK); // (2=8/4) SMEM filter read offset 172 | 173 | float4 *input_frag = (float4*) input_frag_mem; 174 | float4 *filter_frag = (float4*) filter_frag_mem; 175 | 176 | float4 *swap; 177 | 178 | prefetch_input_tile(A, img_tile, in_h, in_w, in_n, threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, tiles_dim, m); 179 | prefetch_filter_tile(B, filter_tile, filt_k, threadIdx.x, threadIdx.y, blockIdx.z); 180 | 181 | float4 *input_frag_buffer = (float4*) (input_frag+4); 182 | float4 *filter_frag_buffer = (float4*) (filter_frag+4); 183 | 184 | // Mainloop - iterates over the entire K dimension - not unrolled 185 | for(int iter=0; iter>>(w, Ww, filt_k, filt_c, filt_h, filt_w, alpha); 248 | 249 | Winograd_kernel<<>>(k, Ww, C, 250 | tiles_dim, in_c, in_n, in_h, in_w, tile_size, 251 | filt_k, filt_c, tiles_2d_dim, out_c, out_n, tile_2d_s, 252 | out_h, out_w); 253 | 254 | return cudaGetLastError(); 255 | } 256 | 257 | } 258 | #endif -------------------------------------------------------------------------------- /src/openCNN_winograd.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // NOTE: the bracketed header names below were lost in this dump; this set is reconstructed from usage and may differ from the original list. 16 | #include <cstdio> 17 | #include <cstdlib> 18 | #include <cmath> 19 | #include <iostream> 20 | #include <iomanip> 21 | #include <vector> 22 | #include <sys/time.h> 23 | 24 | #include <sys/resource.h> 25 | #include <cuda.h> 26 | #include <cuda_runtime.h> 27 | #include <curand.h> 28 | #include <curand_kernel.h> 29 | 30 | #include <cudnn.h> 31 | 32 | #include "config.hpp" 33 | 34 | #ifdef BASE 35 | #if __CUDA_ARCH__ < 800 36 | #include "convolutionForward_32x64x8_baseline.cu" 37 | #else 38 | #include "ampere/convolutionForward_32x64x8_baseline.cu" 39 | #endif 40 | #else 41 | #if __CUDA_ARCH__ < 800 42 | #include "convolutionForward_32x64x8.cu" 43 | #else 44 | #include "ampere/convolutionForward_32x64x8.cu" 45 | #endif 46 | #endif 47 | 48 | /* 49 | In order to measure the elapsed time: 50 | 51 | resnfo: datatype defined to abstract the metric of the resources to use 52 | timenfo: datatype defined to abstract the time metric to use 53 | 54 | timestamp: it abstracts the function used to take the time 55 | 56 | printtime: it abstracts the function used to print the time 57 | 58 | void myElapsedtime(resnfo start, resnfo end, timenfo *t): function to obtain the 59 | time between two measures 60 | */ 61 | 62 | #ifdef _noWALL_ 63 | typedef struct rusage resnfo; 64 | typedef struct _timenfo { 65 | double time; 66 | double systime; 67 | } timenfo; 68 | #define timestamp(sample) getrusage(RUSAGE_SELF, (sample)) 69 | #define printtime(t) printf("%15f s (%f user + %f sys) ", \ 70 | t.time + t.systime, t.time, t.systime); 71 | #else 72 | typedef struct timeval resnfo; 73 | typedef double timenfo; 74 | #define timestamp(sample) gettimeofday((sample), 0) 75 | #define printtime(t) printf("%15f s ", t); 76 | #endif 77 | 78 | void myElapsedtime(resnfo start, resnfo end, timenfo *t) 79 | { 80 | #ifdef _noWALL_ 81 | t->time = (end.ru_utime.tv_sec + (end.ru_utime.tv_usec * 1E-6)) 82 | - (start.ru_utime.tv_sec + (start.ru_utime.tv_usec * 1E-6)); 83 | t->systime = (end.ru_stime.tv_sec + (end.ru_stime.tv_usec * 1E-6)) 84 | - (start.ru_stime.tv_sec + (start.ru_stime.tv_usec * 1E-6)); 85 | #else 86 | *t = (end.tv_sec + (end.tv_usec * 1E-6)) 87 | - (start.tv_sec + (start.tv_usec * 1E-6)); 88 | #endif /*_noWALL_*/ 89 | } 90 | 91 | #define CUDA_CALL(f) { \ 92 | cudaError_t err = (f); \ 93 | if (err != cudaSuccess) { \ 94 | std::cout \ 95 | << "    Error occurred: " << err << std::endl; \ 96 | std::exit(1); \ 97 | } \ 98 | } 99 | 100 | #define CUDNN_CALL(f) { \ 101 | cudnnStatus_t err = (f); \ 102 | if (err != CUDNN_STATUS_SUCCESS) { \ 103 | printf("    Error occurred: \n"); \ 104 | std::exit(1); \ 105 | } \ 106 | } 107 | 108 | #define OPENCNN_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 109 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 110 | { 111 | if (code != cudaSuccess) 112 | { 113 | fprintf(stderr,"Error occurred: %s %s %d\n", cudaGetErrorString(code), file, line); 114 | if (abort) exit(code); 115 | } 116 | } 117 | 118 | void tflops(int in_n, int in_w, int in_h, int in_c, int filt_w, int filt_h, int filt_k, int pad, int str, 119 | int out_w, int out_h, float ms) 120 | { 121 | 122 | double L = (double) 2.0*in_n*in_c*(in_h+2*PAD_H)*(in_w+2*PAD_W)*filt_k*3.0*3.0; 123 |
124 | printf("%.3f,%.2f", ms, L/(2.25 * ms * 1e9) ); 125 | } 126 | 127 | __global__ void dev_const(float *px, float k, int n) { 128 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 129 | 130 | curandState state; 131 | curand_init(clock64(), tid, 0, &state); 132 | 133 | if (tid < n) 134 | px[tid] = curand_uniform(&state); 135 | } 136 | 137 | __global__ void dev_iota(float *px, int n) { 138 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 139 | 140 | curandState state; 141 | curand_init(clock64(), tid, 0, &state); 142 | 143 | if (tid < n) 144 | px[tid] = curand_uniform(&state); 145 | } 146 | 147 | __global__ void data_cpy(float *px, float *py, 148 | int in_w, int in_h, int in_c, int in_n) { 149 | int tid = blockIdx.y + blockIdx.z*in_w + threadIdx.x*in_h*in_w + blockIdx.x*in_h*in_w*in_c; 150 | int id = blockIdx.x + blockIdx.y*in_n + blockIdx.z*in_n*in_w + threadIdx.x*in_n*in_h*in_w; 151 | 152 | px[id] = py[tid]; 153 | } 154 | 155 | void print(const float *data, int n, int c, int h, int w) { 156 | std::vector<float> buffer(1 << 20); 157 | CUDA_CALL(cudaMemcpy( 158 | buffer.data(), data, 159 | n * c * h * w * sizeof(float), 160 | cudaMemcpyDeviceToHost)); 161 | int a = 0; 162 | for (int i = 0; i < n; ++i) { 163 | for (int j = 0; j < c; ++j) { 164 | std::cout << "n=" << i << ", c=" << j << ":" << std::endl; 165 | for (int k = 0; k < h; ++k) { 166 | for (int l = 0; l < w; ++l) { 167 | std::cout << std::setw(12) << std::right << buffer[a]; 168 | ++a; 169 | } 170 | std::cout << std::endl << std::endl; 171 | } 172 | } 173 | } 174 | std::cout << std::endl; 175 | } 176 | 177 | void output_checker(float* A, float* B, int n, int len, int channel, int shift) { 178 | int error_cnt = 0, i, j, k, m; 179 | float max_error = 0; 180 | for(k = 0; k < channel; k++){ 181 | for (i = 0; i < len; i++) { 182 | for (j = 0; j < len; j++) { 183 | for (m = 0; m < n; m++) { 184 | float diff = fabs( 185 | A[k*len*len*n + i*len*n + j*n + m] - 186 | B[m*len*len*channel + k*len*len + i*len + j]); 187 | if (diff > 1){ //1e-4 188 | error_cnt++; 189 | printf("h:%d, w:%d, n:%d, c:%d -> %f vs %f : +- %f\n", i, j, m, k, 190 | A[k*len*len*n + i*len*n + j*n + m], 191 | B[m*len*len*channel + k*len*len + i*len + j], diff); 192 | std::exit(1); 193 | } 194 | if (diff > max_error) 195 | max_error = diff; 196 | } 197 | } 198 | } 199 | } 200 | printf("[max_error: %f][error_cnt: %d] of %d\n", max_error, error_cnt, n*len*len*channel*shift); 201 | } 202 | 203 | 204 | cudaError_t convolutionForward(float *k, int in_h, int in_w, float *w, int out_h, 205 | int out_w, int out_n, int out_c, float *C, float *Ww, 206 | const unsigned int n, 207 | int tiles_dim, int in_n, int tile_size, int elems_dim, 208 | int in_c, int filt_k, int filt_c, int filt_h, int filt_w, 209 | int alpha, int m){ 210 | cudaError_t out; 211 | 212 | if(BN==32 && BK==64 && BC==8){ 213 | out = convolutionForward_32x64x8(k, in_h, in_w, w, out_h, out_w, out_n, out_c, C, Ww, n, tiles_dim, in_n, tile_size, in_c, filt_k, filt_c, filt_h, filt_w, alpha, m); 214 | } else { 215 | std::cout << "Configuration not supported yet" << std::endl; 216 | } 217 | 218 | return out; 219 | } 220 | 221 | cudaError_t init_data(float *in_data, float *in_data_open, float *filt_data, float *filt_data_open, int in_w, int in_h, int in_c, int in_n, int filt_w, int filt_h, int filt_c, int filt_k, int tile_size){ 222 | 223 | int n = in_n*in_c*in_h*in_w; 224 | int blk_size = 256; 225 | 226 | dim3 dimBlock(blk_size); 227 | dim3 dimGrid((n + dimBlock.x -1)/dimBlock.x); 228 | 229 | dev_iota<<<dimGrid, dimBlock>>>(in_data, n);
230 | data_cpy<<<dim3(in_n, in_w, in_h), in_c>>>(in_data_open, in_data, in_w, in_h, in_c, in_n); // launch shape inferred from data_cpy's index arithmetic; the original config was lost in extraction 231 | 232 | n = filt_k*filt_c*filt_h*filt_w; 233 | dim3 dimGrid_f = dim3((n + dimBlock.x -1)/dimBlock.x); 234 | dev_const<<<dimGrid_f, dimBlock>>>(filt_data, 1.f, n); 235 | data_cpy<<<dim3(filt_k, filt_w, filt_h), filt_c>>>(filt_data_open, filt_data, filt_w, filt_h, filt_c, filt_k); // launch shape inferred as above 236 | 237 | return cudaGetLastError(); 238 | } 239 | 240 | 241 | int main(int argc, char *argv[]) { 242 | 243 | 244 | // ========== Set ImageBatch, filter, convolution and output parameters ========== // 245 | // ImageBatch 246 | const int in_n = (argc > 1)?atoi (argv[1]):N; // Number of images 247 | const int in_c = (argc > 2)?atoi (argv[2]):C_in; // Number of feature maps per image 248 | const int in_h = (argc > 3)?atoi (argv[3]):W; // Height of each feature map 249 | const int in_w = (argc > 4)?atoi (argv[4]):W; // Width of each feature map 250 | 251 | // Filter 252 | const int filt_k = (argc > 5)?atoi (argv[5]):K; 253 | const int filt_c = (argc > 6)?atoi (argv[6]):C_in; 254 | const int filt_h = (argc > 7)?atoi (argv[7]):R; 255 | const int filt_w = (argc > 8)?atoi (argv[8]):R; 256 | 257 | std::cout << in_n << "," << in_c << "," << in_h << "," << filt_k << "," << filt_h << ","; 258 | 259 | // Convolution config 260 | const int pad_h = PAD_H; // Zero-padding height 261 | const int pad_w = PAD_W; // Zero-padding width 262 | const int str_h = STR_H; // Vertical filter stride 263 | const int str_w = STR_W; // Horizontal filter stride 264 | const int dil_h = DIL_H; // Filter height dilation 265 | const int dil_w = DIL_W; // Filter width dilation 266 | 267 | 268 | // Output 269 | int out_n; // Number of outputs 270 | int out_c; // Number of feature maps per output 271 | int out_h; // Height of each feature map 272 | int out_w; // Width of each feature map 273 | 274 | /* 275 | #################################################################### 276 | ======================= openCNN preparation ======================= 277 | #################################################################### 278 | */ 279 | // Winograd config 280 | const int m = M; 281 | const int r = filt_h; 282 | const int tile_size = m+r-1; // alpha value 283 | int elems_dim; 284 | int tiles_dim; 285 | 286 | if(m==2){ 287 | tiles_dim = ceil(ceil((double)(in_w+2)/2)-1); 288 | elems_dim = tiles_dim*4; 289 | } else { 290 | std::cout << "Configuration not supported yet" << std::endl; 291 | exit(0); 292 | } 293 | 294 | // Output 295 | out_n = in_n; // Number of outputs 296 | out_c = filt_k; // Number of feature maps per output 297 | out_h = in_h; // Height of each feature map 298 | out_w = in_w; // Width of each feature map 299 | 300 | float *in_data_open; 301 | float *filt_data_open, *workspace; 302 | 303 | // ImageBatch openCNN 304 | OPENCNN_CALL(cudaMalloc( 305 | &in_data_open, in_n * in_c * in_h * in_w * sizeof(float))); 306 | // Filter openCNN 307 | OPENCNN_CALL(cudaMalloc( 308 | &filt_data_open, filt_k * filt_c * filt_h * filt_w * sizeof(float))); 309 | // Filter transformation 310 | OPENCNN_CALL(cudaMalloc( 311 | &workspace, filt_k * filt_c * tile_size * tile_size * sizeof(float))); 312 | 313 | // Output openCNN 314 | float *out_data; 315 | OPENCNN_CALL(cudaMalloc( 316 | &out_data, out_n * out_c * out_h * out_w * sizeof(float))); 317 | 318 | // =================== openCNN layouts =================== // 319 | cudaMemcpyToSymbol(access_f_s, aux, 64*sizeof(int)); 320 | cudaMemcpyToSymbol(access_s, aux2, 64*sizeof(int)); 321 | #ifndef BASE 322 | #if defined(OPTLDS64) 323 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int)); 324 | 
cudaMemcpyToSymbol(out_thread, aux4, 32*sizeof(int)); 325 | cudaMemcpyToSymbol(out_sgemm, aux5, 32*sizeof(int)); 326 | cudaMemcpyToSymbol(exhange, aux6, 32*sizeof(int)); 327 | #endif 328 | #else 329 | cudaMemcpyToSymbol(access_s_out, aux3, 32*sizeof(int)); 330 | cudaMemcpyToSymbol(out_thread, aux4, 16*sizeof(int)); 331 | #endif 332 | 333 | /* 334 | #################################################################### 335 | ====================== cuDNN preparation ====================== 336 | #################################################################### 337 | */ 338 | 339 | float *in_data, *filt_data; 340 | 341 | // ImageBatch cuDNN 342 | CUDA_CALL(cudaMalloc( 343 | &in_data, in_n * in_c * in_h * in_w * sizeof(float))); 344 | // Filter cuDNN 345 | CUDA_CALL(cudaMalloc( 346 | &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float))); 347 | 348 | // =================== Set descriptors =================== // 349 | cudnnHandle_t cudnn; 350 | CUDNN_CALL(cudnnCreate(&cudnn)); 351 | 352 | // Input image Descriptors 353 | cudnnTensorDescriptor_t in_desc; 354 | CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); 355 | CUDNN_CALL(cudnnSetTensor4dDescriptor( 356 | in_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT, 357 | in_n, in_c, in_h, in_w)); 358 | 359 | // Filter Descriptors 360 | cudnnFilterDescriptor_t filt_desc; 361 | CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); 362 | CUDNN_CALL(cudnnSetFilter4dDescriptor( 363 | filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, 364 | filt_k, filt_c, filt_h, filt_w)); 365 | 366 | // Convolution Descriptors 367 | cudnnConvolutionDescriptor_t conv_desc; 368 | CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); 369 | CUDNN_CALL(cudnnSetConvolution2dDescriptor( 370 | conv_desc, 371 | pad_h, pad_w, str_h, str_w, dil_h, dil_w, 372 | CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); //CUDNN_CONVOLUTION 373 | 374 | 375 | // =================== Query output layout =================== // 376 | CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( 377 | conv_desc, in_desc, filt_desc, 378 | &out_n, &out_c, &out_h, &out_w)); 379 | 380 | // =================== Set and allocate output tensor descriptor ===================// 381 | cudnnTensorDescriptor_t out_desc; 382 | CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); 383 | CUDNN_CALL(cudnnSetTensor4dDescriptor( 384 | out_desc, CUDNN_TENSOR_NCHW/*CUDNN_TENSOR_NHWC*/, CUDNN_DATA_FLOAT, 385 | out_n, out_c, out_h, out_w)); 386 | 387 | float *out_data_cudnn; 388 | CUDA_CALL(cudaMalloc( 389 | &out_data_cudnn, out_n * out_c * out_h * out_w * sizeof(float))); 390 | 391 | // =================== Query convolution forward algorithm =================== // 392 | cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)6; 393 | 394 | // =================== Query workspace and allocate =================== // 395 | size_t ws_size; 396 | CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( 397 | cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); 398 | 399 | float *ws_data; 400 | CUDA_CALL(cudaMalloc(&ws_data, ws_size)); 401 | 402 | // =================== Launch convolution on cuDNN =================== // 403 | float alpha = 1.f; 404 | float beta = 0.f; 405 | 406 | /* 407 | #################################################################### 408 | ============================= Init data ============================= 409 | #################################################################### 410 | */ 411 | 412 | OPENCNN_CALL(init_data(in_data, in_data_open, filt_data, 
filt_data_open, in_w, in_h, in_c, in_n, 413 | filt_w, filt_h, filt_c, filt_k, tile_size)); 414 | 415 | /* 416 | #################################################################### 417 | ============================= Execution ============================= 418 | #################################################################### 419 | */ 420 | CUevent hStart, hStop; 421 | float ms; 422 | OPENCNN_CALL( cudaEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT 423 | OPENCNN_CALL( cudaEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); 424 | 425 | // Loop of executions 426 | int iterations = 100; 427 | 428 | // Performs warmup operation 429 | OPENCNN_CALL(convolutionForward(in_data_open, in_h, in_w, filt_data_open, out_h, out_w, out_n, out_c, out_data, workspace, 430 | out_c*out_n*out_h*out_w, 431 | tiles_dim, in_n, tile_size, elems_dim, in_c, filt_k, filt_c, filt_h, filt_w, tile_size, m)); 432 | 433 | // ============================= openCNN exec ============================= 434 | cudaDeviceSynchronize(); 435 | ( cudaEventRecord( hStart, NULL ) ); 436 | for(int iter=0; iter7) ){ 89 | 90 | #pragma unroll 91 | for(int i=0; i<4; i+=2){ 92 | 93 | *( (float4*) (output_smem + idx+i*acumm4 + acumm1) ) = *(accumulator+t); // k=0 94 | *( (float4*) (output_smem + idx+i*acumm4 + acumm2) ) = *(accumulator+t+1); 95 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm1) ) = *(accumulator+2+t); // k=1 96 | *( (float4*) (output_smem + idx+(i+1)*acumm4 + acumm2) ) = *(accumulator+3+t); 97 | 98 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm1) ) = *(accumulator+16+t); 99 | *( (float4*) (output_smem + idx2+i*acumm4 + acumm2) ) = *(accumulator+17+t); 100 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm1) ) = *(accumulator+18+t); 101 | *( (float4*) (output_smem + idx2+(i+1)*acumm4 + acumm2) ) = *(accumulator+19+t); 102 | 103 | t+=4; 104 | } 105 | } 106 | 107 | __syncthreads(); 108 | 109 | for(int i=0; i<16; i++){ 110 | C_tile[i] = out[init + i*offset]; 111 | C_tile[i+16] = out[init + 2*BN_p*16*4 + i*offset]; 112 | } 113 | 114 | // transform output tiles 115 | transform_output_tile(C, C_tile, At, Inx, Iny, TileX, TileY, TileZ, out_h, out_w, tiles_dim, round, in_n, 0, out_thread, mask, c_tensor, c_glb_offset); 116 | 117 | 118 | __syncthreads(); 119 | } 120 | 121 | } 122 | 123 | } 124 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optLDS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
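// Reader's note on the "LDS64" idea (inferred from the code below; hedged):
// this variant reads the transposed accumulators back from shared memory as
// float2, so each C_tile element comes from a single 64-bit shared load
// (LDS.64) instead of two 32-bit loads. Compare the read loop in this file,
//
//   C_tile[i].x = output_smem[i*offset + init].x;
//   C_tile[i].y = output_smem[i*offset + init].y;   // same float2 slot
//
// with the optSTS64 variant, which gathers .x and .y from float addresses 32
// apart. The init/offset index math here is therefore in float2 units
// (offset = BN_p) rather than float units (offset = BN_p*2).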
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT3_ 19 | #define _OUTPUT_KERNEL_OPT3_ 20 | extern "C" 21 | { 22 | 23 | __device__ void __inline__ transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w) 24 | { 25 | c_tensor += ( (round/2)*32 + (round%2)*2 )*c_glb_offset/2; 26 | int x, x1; 27 | 28 | for(int j=0; j<4; j++){ 29 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 30 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 31 | 32 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 33 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 34 | } 35 | 36 | for(int i=0; i<2; i++){ 37 | x = i*4; 38 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 39 | if(mask&(1<<(i*2))){ 40 | 41 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 42 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 43 | } 44 | if(mask&(1<<(i*2+1))){ 45 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 46 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 47 | } 48 | } 49 | 50 | } 51 | 52 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 53 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 54 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 55 | 56 | float2 *output_smem = (float2 *) shared_mem; 57 | float2 *accumulator = (float2 *) acumm_smem; 58 | float2 *C_out = (float2*)C; 59 | 60 | float2 *C_tile = (float2*) input_frag_mem; 61 | float2 *At = (float2*) filter_frag_mem; 62 | 63 | mask = 0x000F; 64 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 65 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 66 | 67 | // output transpose step 68 | int t=0; 69 | int acumm1, acumm2; 70 | // For transposing 71 | //acumm1 = access_s_out[Inx]; //* 4 72 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 73 | acumm2 = acumm1+4; 74 | 75 | int acumm4 = BN_p*8*2 ; //*4 76 | int idx = Iny * BN_p; 77 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 78 | 79 | // For transformating 80 | int offset = BN_p; //*2/2 81 | int init = (Iny/4)*BN_p*16 + (Iny%4)*(32+2); 82 | init += (Inx/16)*8 + ((Inx/8)%2)*16 + (Inx%8); //40=(8+2)*4, 4 blocks/buffer 83 | 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + ((Inx/8)%2)*2 + (Inx%8)*2*2 + ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 87 | c_tensor/=2; 88 | 89 | #pragma unroll 90 | for(int round=0; round<4; round++){ 91 | 92 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 93 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 94 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 95 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 96 | 97 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 98 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 99 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 100 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 101 | 102 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 
16) ) = *(accumulator+t+5); // float 4, t+2 104 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 105 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 106 | 107 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 111 | 112 | t+=8; 113 | 114 | __syncthreads(); 115 | 116 | 117 | for(int i=0; i<16; i++){ 118 | C_tile[i].x = output_smem[i*offset + init].x; //16*4 119 | C_tile[i].y = output_smem[i*offset + init].y; //16*4 120 | } 121 | 122 | // transform output tiles 123 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask , out_w); 124 | 125 | __syncthreads(); 126 | 127 | } 128 | } 129 | 130 | } 131 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optSTS64.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT1_ 19 | #define _OUTPUT_KERNEL_OPT1_ 20 | extern "C" 21 | { 22 | 23 | __device__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, short mask, int out_w){ 25 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 26 | int x, x1; 27 | 28 | #pragma unroll 29 | for(int j=0; j<4; j++){ 30 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 31 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 32 | 33 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 34 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 35 | } 36 | 37 | 38 | #pragma unroll 39 | for(int i=0; i<2; i++){ 40 | x = i*4; 41 | x1 = i*(in_n*(tiles_dim-(out_w%2)) + (out_w%2)*in_n/2); 42 | if(mask&(1<<(i*2))){ 43 | pOutputs[x1 + c_tensor].x = At[x].x + At[x+1].x + At[x+2].x; 44 | pOutputs[x1 + c_tensor].y = At[x].y + At[x+1].y + At[x+2].y; 45 | } 46 | 47 | if(mask&(1<<(i*2+1))){ 48 | pOutputs[x1 + in_n/2 + c_tensor].x = At[x+1].x - At[x+2].x - At[x+3].x; 49 | pOutputs[x1 + in_n/2 + c_tensor].y = At[x+1].y - At[x+2].y - At[x+3].y; 50 | } 51 | } 52 | } 53 | 54 | __device__ __inline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 55 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 56 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, short mask){ 57 | 58 | float2 *output_smem = (float2 *) shared_mem; 59 | float2 *accumulator = (float2 *) acumm_smem; 60 | float2 *C_out = (float2*)C; 61 | 62 | float2 *C_tile = (float2*) input_frag_mem; 63 | float2 *At = (float2*) filter_frag_mem; 64 | 65 | mask = 0x000F; 66 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 67 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 68 | 69 | // output transpose step 70 | int t=0; 71 | int acumm1, acumm2; 72 | // For transposing 73 | //acumm1 = access_s_out[Inx]; //* 4 74 | acumm1 = ((Inx%8)/2)*34 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 75 | acumm2 = acumm1+4; 76 | 77 | int acumm4 = BN_p*16 ; //*4 78 | int idx = Iny * BN_p; 79 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 80 | 81 | // For transformating 82 | int offset = BN_p *2; //*2/2 83 | int init = ( (Iny/4)*BN_p*16 + (Iny%4)*(32+2) ) *2 + Inx; 84 | 85 | int c_glb_offset = in_n*out_h*out_w; 86 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+ 87 | ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 88 | c_tensor/=2; 89 | 90 | #pragma unroll 91 | for(int round=0; round<4; round++){ 92 | 93 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator+t); 94 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+t+1); // float 4, t 95 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+t+2); 96 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+t+3); // float 4, t+1 97 | 98 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+t+32); 99 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+t+33); // float 4, t+16 100 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+t+34); 101 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+t+35); // float 4, t+17 102 | 103 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+t+4); 104 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+t+5); // float 4, t+2 105 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+t+6); 106 | *( (float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+t+7); // float 4, t+3 107 | 108 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+t+36); 109 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+t+37); // float 4, t+18 110 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+t+38); 111 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+t+39); // float 4, t+19 112 | 113 | t+=8; 114 | 115 | __syncthreads(); 116 | 117 | 118 | for(int i=0; i<16; i++){ 119 | C_tile[i].x = shared_mem[i*offset + init]; 120 | C_tile[i].y = shared_mem[i*offset + init + 32]; 121 | } 122 | 123 | // transform output tiles 124 | transform_output_tile(C_out, C_tile, At, tiles_dim, round, in_n, c_tensor, c_glb_offset, mask, out_w); 125 | __syncthreads(); 126 | } 127 | } 128 | 129 | } 130 | #endif -------------------------------------------------------------------------------- /src/store_and_transform_output_optSTS64_compact.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Roberto Lopez Castro 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
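// Reader's note (inferred; hedged): the "compact" variant drops the padded
// row pitch used elsewhere (BN_p is 128 under OPTSTS64_CMP vs 138 otherwise,
// see config.hpp) and instead swizzles the store/load indices arithmetically
// (the acumm1/acumm2/init adjustments below), presumably to sidestep
// shared-memory bank conflicts without padding. It also restructures the four
// rounds into a software pipeline: one store block runs up front, then each
// iteration transforms one buffer pair while the accumulators for the next
// round are stored into the other pair, toggling j between 0 and 2. Four
// BN_p*16 float2 blocks are live at once (4 * 128 * 16 * 8 B = 64 KB), which
// is why the launcher raises cudaFuncAttributeMaxDynamicSharedMemorySize to
// 65536 before the kernel launch.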
14 | 15 | 16 | #include "config.hpp" 17 | 18 | #ifndef _OUTPUT_KERNEL_OPT1_ 19 | #define _OUTPUT_KERNEL_OPT1_ 20 | extern "C" 21 | { 22 | 23 | __device__ __forceinline__ void transform_output_tile(float2 *pOutputs, float2 *C_tile, float2 *At, 24 | int tiles_dim, int round, int in_n, int c_tensor, int c_glb_offset, 25 | short mask, int out_w){ 26 | c_tensor += (((round)/2)*32 + ((round)%2)*2)*c_glb_offset/2; 27 | int x, x1; 28 | 29 | 30 | #pragma unroll 31 | for(int j=0; j<4; j++){ 32 | At[j].x = C_tile[j].x + C_tile[4+j].x + C_tile[8+j].x; 33 | At[j].y = C_tile[j].y + C_tile[4+j].y + C_tile[8+j].y; 34 | At[4+j].x = C_tile[4+j].x - C_tile[8+j].x - C_tile[12+j].x; 35 | At[4+j].y = C_tile[4+j].y - C_tile[8+j].y - C_tile[12+j].y; 36 | } 37 | 38 | x = in_n/2; 39 | pOutputs[c_tensor].x = At[0].x + At[1].x + At[2].x; 40 | pOutputs[c_tensor].y = At[0].y + At[1].y + At[2].y; 41 | 42 | if(mask&0x2){ 43 | pOutputs[x + c_tensor].x = At[1].x - At[2].x - At[3].x; 44 | pOutputs[x + c_tensor].y = At[1].y - At[2].y - At[3].y; 45 | } 46 | 47 | x1 = in_n*(tiles_dim-(out_w%2)) + (out_w%2)*x; 48 | if(mask&0x4){ 49 | pOutputs[x1 + c_tensor].x = At[4].x + At[5].x + At[6].x; 50 | pOutputs[x1 + c_tensor].y = At[4].y + At[5].y + At[6].y; 51 | } 52 | 53 | if(mask&0x8){ 54 | pOutputs[x1 + x + c_tensor].x = At[5].x - At[6].x - At[7].x; 55 | pOutputs[x1 + x + c_tensor].y = At[5].y - At[6].y - At[7].y; 56 | } 57 | } 58 | 59 | __device__ __forceinline__ void store_output_tile(float4 acumm_smem[][16], float *shared_mem, int Inx, int Iny, 60 | float *C, int TileX, int TileY, int TileZ, int out_h, int out_w, 61 | int tiles_dim, int in_n, float4 *input_frag_mem, float4* filter_frag_mem, 62 | short mask){ 63 | 64 | float2 *output_smem = (float2 *) shared_mem; 65 | float2 *accumulator = (float2 *) acumm_smem; 66 | float2 *C_out = (float2*)C; 67 | 68 | float2 *C_tile = (float2*) input_frag_mem; 69 | float2 *At = (float2*) filter_frag_mem; 70 | 71 | mask = 0x000F; 72 | if((blockIdx.y/tiles_dim)==(tiles_dim-1) && out_w%2) mask&=0x0003; 73 | if(!((blockIdx.y+1)%tiles_dim) && out_w%2) mask&=0X0005; 74 | 75 | // output transpose step 76 | int t,j; 77 | int acumm1, acumm2; 78 | // For transposing 79 | t = Inx%8/2; 80 | acumm1 = t*18 + Inx%2 + (Inx/16)*2 + ((Inx/8)%2)*8; 81 | acumm2 = acumm1+4; 82 | acumm1 = acumm1 - acumm1/((t+1)*16)*16 + t*16; 83 | acumm2 = acumm2 - acumm2/((t+1)*16)*16 + t*16; 84 | t=0; 85 | 86 | int acumm4 = BN_p*16 ; //*4 87 | int idx = Iny * BN_p; 88 | int idx2 = idx + BN_p*8; //(BN_p*2 *8)/2 89 | 90 | // For transformating 91 | int offset = BN_p *2; //*2/2 92 | 93 | int init = (Iny%4)*(16+2)*2 + Inx; 94 | init = init - init/((Iny%4+1)*32)*32 + Iny%4*32; 95 | init += (Iny/4)*BN_p*16*2; 96 | 97 | int c_glb_offset = in_n*out_h*out_w; 98 | int c_tensor = TileZ*c_glb_offset*BK + (TileY%tiles_dim)*in_n*2 + (TileY/tiles_dim)*in_n*out_w*2 + TileX*BN + (Inx%16)*2+ 99 | ((Inx/16)*16 + (Iny%4)*4 + Iny/4)*c_glb_offset; 100 | c_tensor/=2; 101 | 102 | // k=0, block 0 103 | *( (float2*) (output_smem + idx + acumm1) ) = *(accumulator); 104 | *( (float2*) (output_smem + idx + acumm1 + 16) ) = *(accumulator+1); 105 | *( (float2*) (output_smem + idx + acumm2) ) = *(accumulator+2); 106 | *( (float2*) (output_smem + idx + acumm2 + 16) ) = *(accumulator+3); 107 | 108 | // K=1, block 0 109 | *( (float2*) (output_smem + idx + acumm4 + acumm1) ) = *(accumulator+4); 110 | *( (float2*) (output_smem + idx + acumm4 + acumm1 + 16) ) = *(accumulator+5); 111 | *( (float2*) (output_smem + idx + acumm4 + acumm2) ) = *(accumulator+6); 112 | *( 
(float2*) (output_smem + idx + acumm4 + acumm2 + 16) ) = *(accumulator+7); 113 | 114 | // k=0, block 1 115 | *( (float2*) (output_smem + idx2 + acumm1) ) = *(accumulator+32); 116 | *( (float2*) (output_smem + idx2 + acumm1 + 16) ) = *(accumulator+33); 117 | *( (float2*) (output_smem + idx2 + acumm2) ) = *(accumulator+34); 118 | *( (float2*) (output_smem + idx2 + acumm2 + 16) ) = *(accumulator+35); 119 | 120 | // K=1, block 1 121 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1) ) = *(accumulator+36); 122 | *( (float2*) (output_smem + idx2 + acumm4 + acumm1 + 16) ) = *(accumulator+37); 123 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2) ) = *(accumulator+38); 124 | *( (float2*) (output_smem + idx2 + acumm4 + acumm2 + 16) ) = *(accumulator+39); 125 | 126 | j=0; t+=8; 127 | 128 | #pragma unroll 129 | for(int round=0; round<3; round++){ 130 | 131 | __syncthreads(); 132 | 133 | int disp = j/2*(BN_p*2*16)*2; 134 | #pragma unroll 135 | for(int i=0; i<16; i++){ 136 | C_tile[i].x = shared_mem[disp + i*offset + init]; 137 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 138 | } 139 | 140 | // transform output tiles 141 | transform_output_tile(C_out, C_tile, At, tiles_dim, (round/2)*2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 142 | 143 | j = 2 - j; //switch between 0 and 2 144 | 145 | // k=0, block 0 146 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1) ) = *(accumulator+t); 147 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+1); 148 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2) ) = *(accumulator+t+2); 149 | *( (float2*) (output_smem + idx + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+3); 150 | 151 | // K=1, block 0 152 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+4); 153 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+5); 154 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+6); 155 | *( (float2*) (output_smem + idx + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+7); 156 | 157 | // k=0, block 1 158 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1) ) = *(accumulator+t+32); 159 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm1 + 16) ) = *(accumulator+t+33); 160 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2) ) = *(accumulator+t+34); 161 | *( (float2*) (output_smem + idx2 + (j)*acumm4 + acumm2 + 16) ) = *(accumulator+t+35); 162 | 163 | // K=1, block 1 164 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1) ) = *(accumulator+t+36); 165 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm1 + 16) ) = *(accumulator+t+37); 166 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2) ) = *(accumulator+t+38); 167 | *( (float2*) (output_smem + idx2 + (j+1)*acumm4 + acumm2 + 16) ) = *(accumulator+t+39); 168 | 169 | t+=8; 170 | 171 | } 172 | 173 | __syncthreads(); 174 | 175 | int disp = j/2*(BN_p*2*16)*2; 176 | #pragma unroll 177 | for(int i=0; i<16; i++){ 178 | C_tile[i].x = shared_mem[disp + i*offset + init]; 179 | C_tile[i].y = shared_mem[disp + i*offset + init + 32]; 180 | } 181 | // transform output tiles 182 | transform_output_tile(C_out, C_tile, At, tiles_dim, 2+j/2, in_n, c_tensor, c_glb_offset, mask, out_w); 183 | } 184 | 185 | } 186 | #endif --------------------------------------------------------------------------------
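The compact variant above is essentially a two-stage software pipeline over shared memory: store, synchronize, then consume one buffer while refilling the other. A stand-alone sketch of that pattern (illustrative only; the kernel name, sizes, and the trivial "transform" are assumptions, not from openCNN):

    __global__ void pingpong_demo(const float *in, float *out, int rounds) {
      extern __shared__ float buf[];      // 2 * blockDim.x floats, passed at launch
      const int half = blockDim.x;        // size of one buffer half
      int j = 0;                          // half that will be consumed next

      buf[threadIdx.x] = in[threadIdx.x]; // prologue: fill half 0 before the loop

      for (int r = 0; r < rounds; r++) {
        __syncthreads();                               // half j is fully written
        float v = buf[j * half + threadIdx.x];         // consume half j ...
        j = 1 - j;                                     // ... flip (cf. "j = 2 - j" above)
        if (r + 1 < rounds)                            // ... and refill the idle half
          buf[j * half + threadIdx.x] = in[(r + 1) * blockDim.x + threadIdx.x];
        out[r * blockDim.x + threadIdx.x] = v + 1.f;   // stand-in for the real transform
      }
    }
    // launch sketch: pingpong_demo<<<1, 128, 2 * 128 * sizeof(float)>>>(d_in, d_out, 4);

The barrier sits before the read rather than after the write, so the stores for round r+1 overlap the transform of round r, which is the same overlap the compact kernel extracts from its four-block buffer.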