├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data ├── cifar10.json └── mnist.json ├── include ├── acts.h ├── cuacts.h ├── cuesp.h ├── culayers.h ├── cumem.h ├── cunn.h ├── cuptens.h ├── cutens.h ├── esp.h ├── handlers.h ├── handlers │ ├── cifar.h │ └── mnist.h ├── kernels.h ├── layers.h ├── layers │ ├── bnorm.h │ ├── conv.h │ ├── cubnorm.h │ ├── cuconv.h │ ├── cudense.h │ ├── cuinput.h │ ├── cupconv.h │ ├── cupdense.h │ ├── cupinput.h │ ├── cupool.h │ ├── dense.h │ ├── input.h │ └── pool.h ├── nn.h ├── nn │ ├── cnn.h │ ├── cucnn.h │ ├── cumlp.h │ ├── cupcnn.h │ ├── cupmlp.h │ └── mlp.h ├── params.h ├── tens.h ├── timez.h ├── util.cuh └── util.h ├── readEspresso.py ├── src ├── CMakeLists.txt ├── activations │ ├── psignl.cu │ ├── relul.c │ ├── signl.c │ ├── signl.cu │ └── softmaxl.c ├── cnn.c ├── cnn.cu ├── cumem.cu ├── handlers │ ├── cifar.c │ └── mnist.c ├── kernels │ ├── bnorm.cu │ ├── bnorm.cuh │ ├── bp.cu │ ├── bp.cuh │ ├── copy.cu │ ├── copy.cuh │ ├── lower.cu │ ├── lower.cuh │ ├── norm.cu │ ├── norm.cuh │ ├── pack.cu │ ├── pack.cuh │ ├── pad.cu │ ├── pad.cuh │ ├── pgemm.cu │ ├── pgemm.cuh │ ├── pgemv.cu │ ├── pgemv.cuh │ ├── pool.cu │ ├── pool.cuh │ ├── set.cu │ ├── set.cuh │ ├── sgemm.cu │ ├── sgemm.cuh │ ├── sgemv.cu │ ├── sgemv.cuh │ ├── sign.cu │ ├── sign.cuh │ ├── tch.cu │ └── tch.cuh ├── layers │ ├── bnorml.c │ ├── bnorml.cu │ ├── convl.c │ ├── convl.cu │ ├── densel.c │ ├── densel.cu │ ├── inputl.c │ ├── inputl.cu │ ├── pconvl.cu │ ├── pdensel.cu │ ├── pinputl.cu │ ├── pooll.c │ └── pooll.cu ├── mlp.c ├── mlp.cu ├── params.c ├── pcnn.cu ├── pmlp.cu ├── ptens.cu ├── scratch.c ├── tens.c ├── tens.cu └── timez.c ├── test ├── CMakeLists.txt └── cup_mnist.cu └── toEspresso.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | cifar_test 3 | cifar_params 4 | mnist_test 5 | mnist_params 6 | mnist_lab 7 | *.npz 8 | *.dat 9 | *.esp 10 | *.o 11 | *~ 12 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(espresso) 3 | 4 | find_package(BLAS REQUIRED) 5 | find_package(CUDA) 6 | 7 | option(CMAKE_EXPORT_COMPILE_COMMANDS "" 1) 8 | 9 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_50) 10 | 11 | include_directories("${CMAKE_SOURCE_DIR}/include") 12 | include_directories("${CMAKE_SOURCE_DIR}/src/kernels") 13 | include_directories("${CMAKE_SOURCE_DIR}/include/layers") 14 | 15 | add_subdirectory("src") 16 | add_subdirectory("test") 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Fabrizio Pedersoli 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Espresso 2 | Efficient forward propagation for BCNNs 3 | 4 | [https://arxiv.org/pdf/1705.07175] 5 | 6 | -------------------------------------------------------------------------------- /data/cifar10.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"type": "ndense", "val": 3}, 3 | {"type": "nbnorm", "val": 9}, 4 | {"type": "nconv", "val": 6}, 5 | {"type": "npool", "val": 3}, 6 | 7 | {"type": "input", "dim": [32, 32, 3]}, 8 | 9 | {"type": "conv", "dim": [1, 128, 3, 3, 3, 1, 1]}, 10 | {"type": "bnorm", "dim": 128}, 11 | {"type": "conv", "dim": [1, 128, 3, 3, 128, 1, 1]}, 12 | {"type": "pool", "dim": [2, 2, 2, 2]}, 13 | {"type": "bnorm", "dim": 128}, 14 | 15 | {"type": "conv", "dim": [1, 256, 3, 3, 128, 1, 1]}, 16 | {"type": "bnorm", "dim": 256}, 17 | {"type": "conv", "dim": [1, 256, 3, 3, 
256, 1, 1]}, 18 | {"type": "pool", "dim": [2, 2, 2, 2]}, 19 | {"type": "bnorm", "dim": 256}, 20 | 21 | {"type": "conv", "dim": [1, 512, 3, 3, 256, 1, 1]}, 22 | {"type": "bnorm", "dim": 512}, 23 | {"type": "conv", "dim": [1, 512, 3, 3, 512, 1, 1]}, 24 | {"type": "pool", "dim": [2, 2, 2, 2]}, 25 | {"type": "bnorm", "dim": 512}, 26 | 27 | {"type": "dense", "dim": [1024, 8192]}, 28 | {"type": "bnorm", "dim": 1024}, 29 | {"type": "dense", "dim": [1024, 1024]}, 30 | {"type": "bnorm", "dim": 1024}, 31 | 32 | {"type": "dense", "dim": [10, 1024]}, 33 | {"type": "bnorm", "dim": 10} 34 | ] 35 | -------------------------------------------------------------------------------- /data/mnist.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"type": "ndense", "val": 4}, 3 | {"type": "nbnorm", "val": 4}, 4 | 5 | {"type": "input", "dim": [28, 28, 1]}, 6 | 7 | {"type": "dense", "dim": [4096, 784]}, 8 | {"type": "bnorm", "dim": 4096}, 9 | {"type": "dense", "dim": [4096, 4096]}, 10 | {"type": "bnorm", "dim": 4096}, 11 | {"type": "dense", "dim": [4096, 4096]}, 12 | {"type": "bnorm", "dim": 4096}, 13 | {"type": "dense", "dim": [10, 4096]}, 14 | {"type": "bnorm", "dim": 10} 15 | ] 16 | -------------------------------------------------------------------------------- /include/acts.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_ACTIVATIONS_H 2 | #define ESP_ACTIVATIONS_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | void signAct_forward(ftens *t); 11 | void signAct_backward(ftens *dout); 12 | void reluAct_forward(ftens *t); 13 | void reluAct_backward(ftens *dout); 14 | void softmaxAct_forward(ftens *t); 15 | void softmaxAct_backward(ftens *dout); 16 | 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | 22 | 23 | #endif /* ESP_ACTIVATIONS_H */ 24 | -------------------------------------------------------------------------------- /include/cuacts.h: 
-------------------------------------------------------------------------------- 1 | #ifndef ESP_CUACT_H 2 | #define ESP_CUACT_H 3 | 4 | #include "cuptens.h" 5 | 6 | 7 | void cusignAct_forward(cuftens *t); 8 | void cusignAct_backward(cuftens *dout); 9 | void cupsignAct_forward(cuftens *src, cuptens *out); 10 | 11 | 12 | 13 | #endif /* ESP_CUACT_H */ 14 | -------------------------------------------------------------------------------- /include/cuesp.h: -------------------------------------------------------------------------------- 1 | #ifndef CUESP_H 2 | #define CUESP_H 3 | 4 | #include "util.cuh" 5 | #include "cutens.h" 6 | #include "cuptens.h" 7 | #include "cumem.h" 8 | #include "culayers.h" 9 | #include "cunn.h" 10 | #include "cuacts.h" 11 | #include "handlers.h" 12 | 13 | 14 | #endif /* CUESP_H */ 15 | -------------------------------------------------------------------------------- /include/culayers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CULAYERS_H 2 | #define ESP_CULAYERS_H 3 | 4 | #include "layers/cuinput.h" 5 | #include "layers/cuconv.h" 6 | #include "layers/cudense.h" 7 | #include "layers/cupool.h" 8 | #include "layers/cubnorm.h" 9 | 10 | #include "layers/cupinput.h" 11 | #include "layers/cupdense.h" 12 | #include "layers/cupconv.h" 13 | 14 | 15 | #endif /* ESP_CULAYERS_H */ 16 | -------------------------------------------------------------------------------- /include/cumem.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUMEM_H 2 | #define ESP_CUMEM_H 3 | 4 | #include 5 | 6 | #define CUFMEM _cufmem() 7 | #define CUPMEM _cupmem() 8 | 9 | 10 | typedef struct { 11 | int total, used; 12 | float *base; 13 | float *curr; 14 | } cufmem; 15 | 16 | 17 | typedef struct { 18 | int total, used; 19 | uint64_t *base; 20 | uint64_t *curr; 21 | } cupmem; 22 | 23 | 24 | int _cufmem(); 25 | int _cupmem(); 26 | 27 | void cufmem_alloc(int len); 28 | float 
*cufmem_reserve(int len); 29 | void cufmem_free(); 30 | void cufmem_reset(); 31 | 32 | void cupmem_alloc(int len); 33 | uint64_t *cupmem_reserve(int len); 34 | void cupmem_free(); 35 | void cupmem_reset(); 36 | 37 | 38 | #endif /* ESP_CUMEM_H */ 39 | -------------------------------------------------------------------------------- /include/cunn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUNN_H 2 | #define ESP_CUNN_H 3 | 4 | #include "nn/cumlp.h" 5 | #include "nn/cucnn.h" 6 | #include "nn/cupmlp.h" 7 | #include "nn/cupcnn.h" 8 | 9 | #endif /* ESP_CUNN_H */ 10 | -------------------------------------------------------------------------------- /include/cuptens.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPTENS_H 2 | #define ESP_CUPTENS_H 3 | 4 | #include 5 | #include "cumem.h" 6 | #include "tens.h" 7 | #include "cutens.h" 8 | 9 | 10 | typedef struct { 11 | int D, M, N, L; 12 | int X, p, MNL; 13 | int bytes; 14 | uint64_t *data; 15 | } cuptens; 16 | 17 | 18 | cuptens cuptens_empty(int D, int M, int N, int L); 19 | cuptens cuptens_init(int D, int M, int N, int L); 20 | cuptens cuptens_pad(cuptens *src, int p); 21 | cuptens cuptens_from_cupmem(int D, int M, int N, int L); 22 | cuptens cuptens_lower_init(cuptens *src, int W, int H, int Sx, int Sy); 23 | cuptens cuptens_lower_cupmem(cuptens *src, int W, int H, int Sx, int Sy); 24 | 25 | cuptens cuptens_convert(ftens *t); 26 | cuptens cuptens_convert(cuftens *t); 27 | void cuptens_convert(cuftens *src, cuptens *dst); 28 | void cuptens_free(cuptens *pt); 29 | uint64_t *cuptens_dump(cuptens *t); 30 | void cuptens_print_shape(cuptens *t); 31 | void cuptens_print(cuptens *t); 32 | void cuptens_print(cuptens *t, const char *fmt); 33 | void cuptens_print_ch(cuptens *t, int w, int k, const char *fmt); 34 | 35 | #endif /* ESP_CUPTENS_H */ 36 | 
-------------------------------------------------------------------------------- /include/cutens.h: -------------------------------------------------------------------------------- 1 | #ifndef EPS_CUTENS_H 2 | #define EPS_CUTENS_H 3 | 4 | #include "util.cuh" 5 | #include "cumem.h" 6 | #include "tens.h" 7 | 8 | 9 | typedef struct { 10 | int D, M, N, L, MNL; 11 | int bytes; 12 | float *data; 13 | } cuftens; 14 | 15 | 16 | #define CUFTENS_INIT(t,D,M,N,L) ( \ 17 | t.D=D, t.M=M, t.N=N, t.L=L, t.MNL=M*N*L, \ 18 | t.bytes=BYTES(float, D*M*N*L), \ 19 | t.data=NULL) 20 | 21 | #define DIM_CHECK(D,M,N,L) \ 22 | cuASSERT(D>0 && M>0 && N>0 && N>0 && L>0, \ 23 | "err: cuftens invalid size\n") 24 | 25 | 26 | cuftens cuftens_empty(int D, int M, int N, int L); 27 | cuftens cuftens_init(int D, int M, int N, int L); 28 | cuftens cuftens_from_cufmem(int D, int M, int N, int L); 29 | cuftens cuftens_lower_init(cuftens *t, int W, int H, int Sx, int Sy); 30 | cuftens cuftens_lower_cufmem(cuftens *t, int W, int H, int Sx, int Sy); 31 | cuftens cuftens_zeros(int D, int M, int N, int L); 32 | cuftens cuftens_ones(int D, int M, int N, int L); 33 | cuftens cuftens_rand(int D, int M, int N, int L); 34 | cuftens cuftens_rand(int D, int M, int N, int L, 35 | float min, float max); 36 | 37 | void cuftens_round_up(cuftens *src, cuftens *dst); 38 | cuftens cuftens_round_up(ftens *t, int n); 39 | cuftens cuftens_round_up(cuftens *t, int n); 40 | cuftens cuftens_round_up_cufmem(cuftens *t, int n); 41 | cuftens cuftens_copy(cuftens *t); 42 | void cuftens_copy(cuftens *src, cuftens *dst); 43 | cuftens cuftens_convert(ftens *t); 44 | 45 | void cuftens_pad(cuftens *src, cuftens *dst, int p); 46 | cuftens cuftens_pad(cuftens *t, int p); 47 | 48 | ftens cuftens_dump(cuftens *t); 49 | 50 | void cuftens_free(cuftens *t); 51 | void cuftens_reshape(cuftens *t, int D, int M, int N, int L); 52 | void cuftens_print_shape(cuftens *t); 53 | void cuftens_print(cuftens *t); 54 | void cuftens_print(cuftens *t, const 
char *fmt); 55 | void cuftens_print_ch(cuftens *t, int b, int ch, int I, int J, 56 | const char *fmt); 57 | 58 | static inline 59 | int cuftens_len(cuftens *t) {return t->bytes/sizeof(float);} 60 | 61 | 62 | #endif /* EPS_CUTENS_H */ 63 | -------------------------------------------------------------------------------- /include/esp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_H 2 | #define ESP_H 3 | 4 | #include "tens.h" 5 | #include "layers.h" 6 | #include "acts.h" 7 | #include "mlp.h" 8 | #include "cnn.h" 9 | 10 | #endif /* ESP_H */ 11 | -------------------------------------------------------------------------------- /include/handlers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_HANDLERS_H 2 | #define ESP_HANDLERS_H 3 | 4 | #include "handlers/mnist.h" 5 | #include "handlers/cifar.h" 6 | 7 | #endif /* ESP_HANDLERS_H */ 8 | -------------------------------------------------------------------------------- /include/handlers/cifar.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CIFAR_H 2 | #define ESP_CIFAR_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void cifar10_load_Xy(const char *tf, int start, int num, 9 | ftens *X, ftens *y); 10 | 11 | #ifdef __cplusplus 12 | } 13 | #endif 14 | 15 | #endif /* ESP_CIFAR_H */ 16 | -------------------------------------------------------------------------------- /include/handlers/mnist.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_MNIST_H 2 | #define ESP_MNIST_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | 11 | void mnist_load_X(const char *tf, int start, int num, 12 | ftens *X); 13 | 14 | void mnist_load_y(const char *lf, int start, int num, 15 | ftens *y); 16 | 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | 22 | #endif /* ESP_MNIST_H */ 23 | 
-------------------------------------------------------------------------------- /include/kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_KERNELS_H 2 | #define ESP_KERNELS_H 3 | 4 | #include 5 | #include "cuptens.h" 6 | 7 | 8 | void cuset(cuftens *t, float v); 9 | void cucopy(cuftens *src, cuftens *dst); 10 | void cupad(cuftens *src, cuftens *dst, int p); 11 | void cupad(cuptens *src, cuptens *dst, int p); 12 | void cupack(cuftens *src, cuptens *dst); 13 | void cutch(cuftens *src, cuftens *dst); 14 | void cusign(cuftens *a); 15 | void cunorm(cuftens *t); 16 | 17 | void cubp_split_pack(cuftens *src, cuptens *dst); 18 | void cubp_merge(cuftens *src, cuftens *dst, cuftens *fix, float norm); 19 | 20 | void culower (cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy); 21 | void cuplower (cuptens *src, cuptens *dst, int W, int H, int Sx, int Sy); 22 | void cumaxpool(cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy); 23 | 24 | void cubnorm(cuftens *mean, cuftens *istd, 25 | cuftens *beta, cuftens *gamma, 26 | cuftens *in); 27 | 28 | void sgemv(cuftens *a, cuftens *b, cuftens *c); 29 | void sgemm(cuftens *a, cuftens *b, cuftens *c); 30 | 31 | void sgemm(int M, int N, int K, 32 | const float * __restrict__ A, int lda, 33 | const float * __restrict__ B, int ldb, 34 | float * __restrict__ C, int ldc); 35 | 36 | void pgemv(int m, int n, 37 | const uint64_t * __restrict__ A, 38 | const uint64_t * __restrict__ x, 39 | float * __restrict__ y); 40 | 41 | void pgemm(int M, int N, int K, 42 | const uint64_t * __restrict__ A, 43 | const uint64_t * __restrict__ B, 44 | float * __restrict__ C); 45 | 46 | void pgemm_init(int M, int N, int K, 47 | const uint64_t * __restrict__ A, 48 | const uint64_t * __restrict__ B, 49 | float * __restrict__ C); 50 | 51 | void pgemm_init(int M, int N, int K, 52 | const uint64_t * __restrict__ a, int lda, 53 | const uint64_t * __restrict__ b, int ldb, 54 | float * __restrict__ c, 
int ldc); 55 | 56 | void pgemm_init_rev(int M, int N, int K, 57 | const uint64_t * __restrict__ A, 58 | const uint64_t * __restrict__ B, 59 | float * __restrict__ C); 60 | 61 | void pgemm32(int M, int N, int K, 62 | const uint32_t * __restrict__ A, 63 | const uint32_t * __restrict__ B, 64 | float * __restrict__ C); 65 | 66 | 67 | 68 | #endif /* ESP_KERNELS_H */ 69 | -------------------------------------------------------------------------------- /include/layers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_LAYERS_H 2 | #define ESP_LAYERS_H 3 | 4 | #include "layers/input.h" 5 | #include "layers/conv.h" 6 | #include "layers/dense.h" 7 | #include "layers/pool.h" 8 | #include "layers/bnorm.h" 9 | 10 | #endif /* ESP_LAYERS_H */ 11 | -------------------------------------------------------------------------------- /include/layers/bnorm.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_BNORM_H 2 | #define ESP_BNORM_H 3 | 4 | #include "tens.h" 5 | 6 | #define BNORML_INIT(bnl) { \ 7 | bnl.N=0; bnl.ug=0; \ 8 | bnl.mean. data=NULL; bnl.istd .data=NULL; \ 9 | bnl.gmean.data=NULL; bnl.gistd .data=NULL; \ 10 | bnl.beta .data=NULL; bnl.gamma .data=NULL; \ 11 | bnl.dbeta.data=NULL; bnl.dgamma.data=NULL; \ 12 | bnl.in. data=NULL; bnl.tmp. 
data=NULL; \ 13 | } 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | typedef struct { 21 | int N, ug; 22 | ftens mean, istd, gmean, gistd; 23 | ftens gamma, beta, dgamma, dbeta; 24 | ftens in, tmp; 25 | } bnormLayer; 26 | 27 | 28 | bnormLayer bnormLayer_init(int use_global); 29 | void bnormLayer_free(bnormLayer *bnl); 30 | void bnormLayer_forward(ftens *t, bnormLayer *bnl, int save); 31 | void bnormLayer_backward(ftens *dt, bnormLayer *bnl); 32 | void bnormLayer_update(bnormLayer *bnl); 33 | void bnormLayer_set(ftens *mean, ftens *istd, 34 | ftens *gamma, ftens *beta, bnormLayer *bnl); 35 | 36 | 37 | #ifdef __cplusplus 38 | } 39 | #endif 40 | 41 | #endif /* ESP_BNORM_H */ 42 | -------------------------------------------------------------------------------- /include/layers/conv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CONV_H 2 | #define ESP_CONV_H 3 | 4 | #include "tens.h" 5 | 6 | #define CONVL_INIT(cl) { \ 7 | cl.D=0; cl.M=0; cl.N=0; cl.L=0; \ 8 | cl.Sm=0; cl.Sn=0; cl.p=0; \ 9 | cl.W.data=NULL; cl.b.data=NULL; cl.out.data=NULL; \ 10 | cl.dW.data=NULL; cl.db.data=NULL; cl.in.data=NULL; \ 11 | } 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | typedef struct { 18 | int D, M, N, L, Sm, Sn, p; 19 | ftens W, b, out; 20 | ftens dW, db, in; 21 | } convLayer; 22 | 23 | convLayer convLayer_init(int Sm, int Sn, int p); 24 | void convLayer_print_shape(convLayer *cl); 25 | void convLayer_free(convLayer *cl); 26 | void convLayer_set(ftens *W, convLayer *cl); 27 | void convLayer_forward(ftens *t, convLayer *cl, int save); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif /* ESP_CONV_H */ 34 | -------------------------------------------------------------------------------- /include/layers/cubnorm.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUBNORN_H 2 | #define ESP_CUBNORN_H 3 | 4 | #include "cutens.h" 5 | 
#include "bnorm.h" 6 | 7 | 8 | typedef struct { 9 | int N, ug; 10 | cuftens mean, istd, gmean, gistd; 11 | cuftens gamma, beta, dgamma, dbeta; 12 | cuftens in, tmp; 13 | } cubnormLayer; 14 | 15 | 16 | cubnormLayer cubnormLayer_init(int use_global); 17 | void cubnormLayer_convert(bnormLayer *src, cubnormLayer *dst); 18 | void cubnormLayer_free(cubnormLayer *bnl); 19 | void cubnormLayer_forward(cuftens *t, cubnormLayer *bnl, int save); 20 | void cubnormLayer_backward(cuftens *dt, cubnormLayer *bnl); 21 | void cubnormLayer_update(cubnormLayer *bnl); 22 | void cubnormLayer_set(ftens *mean, ftens *istd, 23 | ftens *gamma, ftens *beta, 24 | cubnormLayer *bnl); 25 | 26 | 27 | #endif /* ESP_CUBNORN_H */ 28 | -------------------------------------------------------------------------------- /include/layers/cuconv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUCONV_H 2 | #define ESP_CUCONV_H 3 | 4 | #include "cutens.h" 5 | #include "conv.h" 6 | 7 | 8 | typedef struct { 9 | int D, M, N, L, Sm, Sn, p; 10 | cuftens W, b, out; 11 | cuftens dW, db, in; 12 | } cuconvLayer; 13 | 14 | 15 | cuconvLayer cuconvLayer_init(int Sm, int Sn, int p); 16 | void cuconvLayer_free(cuconvLayer *cl); 17 | void cuftens_print(cuftens *t); 18 | void cuconvLayer_print_shape(cuconvLayer *cl); 19 | void cuconvLayer_convert(convLayer *src, cuconvLayer *dst); 20 | void cuconvLayer_set(ftens *W, cuconvLayer *cl); 21 | cuftens cuconvLayer_pad_input(cuftens *t, int p); 22 | cuftens cuconvLayer_lower_input(cuftens *t, cuconvLayer *cl); 23 | void cuconvLayer_forward(cuftens *t, cuconvLayer *cl, int save); 24 | void cuconvLayer_backward(cuftens *dout, cuconvLayer *cl); 25 | 26 | 27 | #endif /* ESP_CUCONV_H */ 28 | -------------------------------------------------------------------------------- /include/layers/cudense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUDENSE_H 2 | #define ESP_CUDENSE_H 3 | 4 | 
#include "cutens.h" 5 | #include "dense.h" 6 | 7 | 8 | typedef struct { 9 | int M, N; 10 | cuftens W, b, out; 11 | cuftens dW, db, in; 12 | } cudenseLayer; 13 | 14 | 15 | cudenseLayer cudenseLayer_init(int M, int N); 16 | void cudenseLayer_convert(denseLayer *src, cudenseLayer *dst); 17 | void cudenseLayer_free(cudenseLayer *dl); 18 | void cudenseLayer_print_size(cudenseLayer *dl); 19 | void cudenseLayer_set(ftens *W, cudenseLayer *dl); 20 | void cudenseLayer_forward(cuftens *t, cudenseLayer *dl, int save); 21 | void cudenseLayer_backward(cuftens *dt, cudenseLayer *dl); 22 | 23 | 24 | #endif /* ESP_CUDENSE_H */ 25 | -------------------------------------------------------------------------------- /include/layers/cuinput.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUINPUT_H 2 | #define ESP_CUINPUT_H 3 | 4 | #include "tens.h" 5 | #include "cutens.h" 6 | 7 | 8 | typedef struct { 9 | cuftens out; 10 | } cuinputLayer; 11 | 12 | 13 | void cuinputLayer_forward(ftens *t, cuinputLayer *il, int norm); 14 | void cuinputLayer_free(cuinputLayer *il); 15 | 16 | 17 | #endif /* ESP_CUINPUT_H */ 18 | -------------------------------------------------------------------------------- /include/layers/cupconv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPCONV_H 2 | #define ESP_CUPCONV_H 3 | 4 | #include "conv.h" 5 | #include "cuptens.h" 6 | 7 | 8 | #define CUPCONVL_INIT(cl) \ 9 | (cl.D=0, cl.M=0, cl.N=0, cl.L=0, \ 10 | cl.Sm=0, cl.Sn=0, cl.p=0, \ 11 | cl.W. data=NULL, cl.pout.data=NULL, \ 12 | cl.dW. data=NULL, cl.out .data=NULL, \ 13 | cl.fix.data=NULL, cl.bfix.data=NULL, \ 14 | cl.in. 
data=NULL) 15 | 16 | 17 | typedef struct { 18 | int D, M, N, L, Sm, Sn, p; 19 | cuptens W, pout; 20 | cuftens dW, out, in; 21 | cuftens fix, bfix; 22 | } cupconvLayer; 23 | 24 | 25 | cupconvLayer cupconvLayer_init(); 26 | cupconvLayer cupconvLayer_init(int Sm, int Sn, int p); 27 | void cupconvLayer_convert(convLayer *src, cupconvLayer *dst, int fix); 28 | void cupconvLayer_set(ftens *W, cupconvLayer *cl, int fix); 29 | void cupconvLayer_forward(cuptens *t, cupconvLayer *cl); 30 | void cupconvLayer_forward_initial(cuftens *t, cupconvLayer *cl, float norm); 31 | void cupconvLayer_free(cupconvLayer *cl); 32 | 33 | 34 | 35 | #endif /* ESP_CUPCONV_H */ 36 | -------------------------------------------------------------------------------- /include/layers/cupdense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPDENSE_H 2 | #define ESP_CUPDENSE_H 3 | 4 | #include "cuptens.h" 5 | #include "dense.h" 6 | 7 | 8 | #define CUPDENSEL_INIT(dl) \ 9 | (dl.M=0, dl.N=0, \ 10 | dl.W. data=NULL, dl.in. 
data=NULL, dl.pout.data=NULL, \ 11 | dl.dW.data=NULL, dl.out.data=NULL, dl.fix.data=NULL) 12 | 13 | 14 | typedef struct { 15 | int M, N; 16 | cuptens W, in, pout; 17 | cuftens dW, out, fix; 18 | } cupdenseLayer; 19 | 20 | 21 | cupdenseLayer cupdenseLayer_init(); 22 | void cupdenseLayer_convert(denseLayer *src, cupdenseLayer *dst, int fix); 23 | void cupdenseLayer_free(cupdenseLayer *dl); 24 | void cupdenseLayer_print_size(cupdenseLayer *dl); 25 | void cupdenseLayer_set(ftens *W, cupdenseLayer *dl, int fix); 26 | void cupdenseLayer_forward(cuptens *t, cupdenseLayer *dl, int save); 27 | void cupdenseLayer_forward_initial(cuftens *t, cupdenseLayer *dl, float norm); 28 | 29 | 30 | #endif /* ESP_CUPDENSE_H */ 31 | -------------------------------------------------------------------------------- /include/layers/cupinput.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPINPUT_H 2 | #define ESP_CUPINPUT_H 3 | 4 | /* #include "cuptens.h" */ 5 | 6 | 7 | /* typedef struct { */ 8 | /* cuptens out; */ 9 | /* } cupinputLayer; */ 10 | 11 | 12 | /* cupinputLayer cupinputLayer_init(); */ 13 | /* void cupinputLayer_forward(ftens *t, cupinputLayer *il); */ 14 | /* void cupinputLayer_free(cupinputLayer *il); */ 15 | 16 | 17 | #endif /* ESP_CUPINPUT_H */ 18 | -------------------------------------------------------------------------------- /include/layers/cupool.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPOOL_H 2 | #define ESP_CUPOOL_H 3 | 4 | #include "pool.h" 5 | #include "cutens.h" 6 | 7 | 8 | typedef struct { 9 | int M, N, Sm, Sn; 10 | pool_t op; 11 | cuftens out, mask; 12 | } cupoolLayer; 13 | 14 | 15 | cupoolLayer cupoolLayer_init(int M, int N, int Sm, int Sn); 16 | void cupoolLayer_free(cupoolLayer *pl); 17 | void cupoolLayer_convert(poolLayer *src, cupoolLayer *dst); 18 | void cupoolLayer_forward(cuftens *t, cupoolLayer *pl); 19 | void cupoolLayer_backward(cuftens *dt, 
cupoolLayer *pl); 20 | void cupoolLayer_print(cupoolLayer *pl); 21 | 22 | 23 | #endif /* ESP_CUPOOL_H */ 24 | -------------------------------------------------------------------------------- /include/layers/dense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_DENSE_H 2 | #define ESP_DENSE_H 3 | 4 | #include "tens.h" 5 | 6 | 7 | #define DENSEL_INIT(dl) ( \ 8 | dl.M=0, dl.N=0, \ 9 | dl.W .data=NULL, dl.dW .data=NULL, \ 10 | dl.b .data=NULL, dl.db .data=NULL, \ 11 | dl.in.data=NULL, dl.out.data=NULL) 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | typedef struct { 19 | int M, N; 20 | ftens W, b, dW, db; 21 | ftens in, out; 22 | } denseLayer; 23 | 24 | denseLayer denseLayer_init(int M, int N); 25 | void denseLayer_print_shape(denseLayer *dl); 26 | void denseLayer_free(denseLayer *dl); 27 | void denseLayer_set(ftens *W, denseLayer *dl); 28 | void denseLayer_forward(ftens *t, denseLayer *dl, int cpy); 29 | void denseLayer_backward(ftens *dt, denseLayer *dl); 30 | 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | 36 | #endif /* ESP_DENSE_H */ 37 | -------------------------------------------------------------------------------- /include/layers/input.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_INPUT_H 2 | #define ESP_INPUT_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | ftens out; 12 | } inputLayer; 13 | 14 | void inputLayer_load(ftens *in, inputLayer *il); 15 | void inputLayer_free(inputLayer *il); 16 | void inputLayer_forward(inputLayer *il); 17 | void inputLayer_pad(inputLayer *il, int p); 18 | 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif /* ESP_INPUT_H */ 25 | -------------------------------------------------------------------------------- /include/layers/pool.h: -------------------------------------------------------------------------------- 1 | 
#ifndef ESP_POOL_H 2 | #define ESP_POOL_H 3 | 4 | #include "tens.h" 5 | 6 | #define POOLL_INIT(pl) \ 7 | (pl.M=0, pl.N=0, pl.Sm=0, pl.Sn=0, pl.op=MAX, \ 8 | pl.out.data=NULL, pl.mask.data=NULL) 9 | 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | typedef enum {MAX, AVG} pool_t; 16 | 17 | typedef struct { 18 | int M, N, Sm, Sn; pool_t op; 19 | ftens out, mask; 20 | } poolLayer; 21 | 22 | 23 | poolLayer poolLayer_init(int M, int N, int Sm, int Sn); 24 | void poolLayer_free(poolLayer *pl); 25 | void poolLayer_forward(ftens *t, poolLayer *pl); 26 | void poolLayer_backward(ftens *dout, poolLayer *pl); 27 | 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | 34 | #endif /* ESP_POOL_H */ 35 | -------------------------------------------------------------------------------- /include/nn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_NN_H 2 | #define ESP_NN_H 3 | 4 | #include "nn/mlp.h" 5 | #include "nn/cnn.h" 6 | 7 | 8 | #endif /* ESP_NN_H */ 9 | -------------------------------------------------------------------------------- /include/nn/cnn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CNN_H 2 | #define ESP_CNN_H 3 | 4 | #include "tens.h" 5 | #include "layers.h" 6 | 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | typedef struct { 13 | int Ncl, Npl, Ndl, Nbnl; 14 | inputLayer il; 15 | convLayer *cl; 16 | poolLayer *pl; 17 | denseLayer *dl; 18 | bnormLayer *bnl; 19 | } cnn; 20 | 21 | cnn cnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 22 | cnn cnn_load(const char *esp, int bin, int rev); 23 | void cnn_free(cnn *net); 24 | void cnn_print(cnn *net); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* ESP_CNN_H */ 32 | -------------------------------------------------------------------------------- /include/nn/cucnn.h: -------------------------------------------------------------------------------- 1 | #ifndef 
ESP_CUMLP_H 2 | #define ESP_CUMLP_H 3 | 4 | #include "culayers.h" 5 | #include "cnn.h" 6 | 7 | 8 | typedef struct { 9 | int Ncl, Npl, Ndl, Nbnl; 10 | cuinputLayer il; 11 | cuconvLayer *cl; 12 | cupoolLayer *pl; 13 | cudenseLayer *dl; 14 | cubnormLayer *bnl; 15 | } cucnn; 16 | 17 | 18 | cucnn cucnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 19 | void cucnn_print(cucnn *net); 20 | void cucnn_free(cucnn *net); 21 | cucnn cucnn_convert(cnn *net); 22 | 23 | 24 | #endif /* ESP_CUMLP_H */ 25 | -------------------------------------------------------------------------------- /include/nn/cumlp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUMLP_H 2 | #define ESP_CUMLP_H 3 | 4 | #include "mlp.h" 5 | #include "culayers.h" 6 | 7 | typedef struct { 8 | int Ndl, Nbnl; 9 | cuinputLayer il; 10 | cudenseLayer *dl; 11 | cubnormLayer *bnl; 12 | } cumlp; 13 | 14 | 15 | cumlp cumlp_init(int Ndl, int Nbnl); 16 | cumlp cumlp_convert(mlp *net); 17 | 18 | void cumlp_print(cumlp *net); 19 | void cumlp_free(cumlp *net); 20 | 21 | 22 | 23 | #endif /* ESP_CUMLP_H */ 24 | -------------------------------------------------------------------------------- /include/nn/cupcnn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPCNN_H 2 | #define ESP_CUPCNN_H 3 | 4 | #include "culayers.h" 5 | #include "cnn.h" 6 | 7 | 8 | typedef struct { 9 | int Ncl, Npl, Ndl, Nbnl; 10 | cuinputLayer il; 11 | cupconvLayer *cl; 12 | cupoolLayer *pl; 13 | cupdenseLayer *dl; 14 | cubnormLayer *bnl; 15 | } cupcnn; 16 | 17 | 18 | cupcnn cupcnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 19 | cupcnn cupcnn_convert(cnn *nn); 20 | void cupcnn_print(cupcnn *nn); 21 | void cupcnn_free(cupcnn *nn); 22 | 23 | 24 | #endif /* ESP_CUPCNN_H */ 25 | -------------------------------------------------------------------------------- /include/nn/cupmlp.h: -------------------------------------------------------------------------------- 1 | 
#ifndef ESP_PMLP_H 2 | #define ESP_PMLP_H 3 | 4 | #include "mlp.h" 5 | #include "culayers.h" 6 | 7 | 8 | typedef struct { 9 | int Ndl, Nbnl; 10 | cuinputLayer il; 11 | cupdenseLayer *dl; 12 | cubnormLayer *bnl; 13 | } cupmlp; 14 | 15 | 16 | cupmlp cupmlp_init(int Ndl, int Nbnl); 17 | cupmlp cupmlp_convert(mlp *nn); 18 | 19 | void cupmlp_print(cupmlp *nn); 20 | void cupmlp_free(cupmlp *nn); 21 | 22 | 23 | 24 | #endif /* ESP_PMLP_H */ 25 | -------------------------------------------------------------------------------- /include/nn/mlp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_MLP_H 2 | #define ESP_MLP_H 3 | 4 | #include "layers.h" 5 | 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | typedef struct { 12 | int Ndl, Nbnl; 13 | inputLayer il; 14 | denseLayer *dl; 15 | bnormLayer *bnl; 16 | } mlp; 17 | 18 | 19 | mlp mlp_init(int Ndl, int Nbnl); 20 | mlp mlp_load(const char *esp, int bin); 21 | void mlp_free(mlp *net); 22 | void mlp_print(mlp *net); 23 | 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | 30 | #endif /* ESP_MLP_H */ 31 | -------------------------------------------------------------------------------- /include/params.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_PARAMS_H 2 | #define ESP_PARAMS_H 3 | 4 | #include "layers.h" 5 | 6 | #define INPUTL 0 7 | #define CONVL 3 8 | #define POOLL 4 9 | #define DENSEL 1 10 | #define BNORML 2 11 | #define LNUM (1<<4) 12 | #define LDAT (2<<4) 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | void load_denseLayer(denseLayer *dl, FILE * const pf, int bin); 19 | void load_bnormLayer(bnormLayer *bnl, FILE * const pf); 20 | void load_convLayer(convLayer *cl, FILE * const pf, 21 | int bin, int rev); 22 | void load_poolLayer(poolLayer *pl, FILE * const pf); 23 | 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | 30 | #endif /* ESP_PARAMS_H */ 31 | 
-------------------------------------------------------------------------------- /include/tens.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_TENS_H 2 | #define ESP_TENS_H 3 | 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | int D, M, N, L, MNL; 12 | int bytes; 13 | float *data; 14 | } ftens; 15 | 16 | 17 | ftens ftens_init(int D, int M, int N, int L); 18 | ftens ftens_zeros(int D, int M, int N, int L); 19 | ftens ftens_ones(int D, int M, int N, int L); 20 | ftens ftens_rand(int D, int M, int N, int L); 21 | ftens ftens_rand_range(int D, int M, int N, int L, 22 | float min, float max); 23 | 24 | ftens ftens_copy(ftens *in); 25 | ftens ftens_copy_pad(ftens *t, int p); 26 | 27 | ftens ftens_from_ptr(int D, int M, int N, int L, float *ptr); 28 | ftens ftens_from_file(int D, int M, int N, int L, FILE *pf); 29 | 30 | ftens ftens_copy_tch(ftens *a); 31 | void ftens_tch(ftens *a, ftens *b); 32 | void ftens_clear(ftens *t); 33 | void ftens_reshape(ftens *t, int D, int M, int N, int L); 34 | void ftens_pad(ftens *src, ftens *dst, int p); 35 | void ftens_maxpool(ftens *src, ftens *dst, int W, int H, 36 | int Sx, int Sy); 37 | 38 | void ftens_lower(ftens *src, ftens *dst, 39 | int W, int H, int Sx, int Sy); 40 | 41 | void ftens_sign(ftens *t); 42 | void ftens_free(ftens *t); 43 | void ftens_print_shape(ftens *t); 44 | void ftens_print(ftens *t, const char *fmt); 45 | void ftens_print_ch(ftens *t, int w, int k, int I, int J, 46 | const char *fmt); 47 | 48 | static inline 49 | int ftens_len(ftens *t) {return t->bytes/sizeof(float);} 50 | 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif /* ESP_TENS_H */ 57 | -------------------------------------------------------------------------------- /include/timez.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_TIME_H 2 | #define ESP_TIME_H 3 | 4 | #include 5 | 6 | #define 
TIME_START() time_record(&aaat1) 7 | #define TIME_STOP() time_record(&aaat2) 8 | #define TIME_STOP_PRINT() TIME_STOP(); elapsed_time(aaat1, aaat2) 9 | #define TIME_STOP_SAVE(t) TIME_STOP(); t = elapsed_time(aaat1, aaat2) 10 | 11 | 12 | typedef struct timeval myTime; 13 | myTime aaat1, aaat2; 14 | 15 | void time_record(myTime *time); 16 | long elapsed_time(myTime start, myTime stop); 17 | 18 | #endif /* ESP_TIME_H */ 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /include/util.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | 6 | #define FPTR d_fscratch 7 | #define PPTR d_pscratch 8 | #define FPTR_INC(x, len) d_fscratch ? x + len : NULL 9 | #define PPTR_INC(x, len) d_pscratch ? x + len : NULL 10 | 11 | #define cuHtoD cudaMemcpyHostToDevice 12 | #define cuDtoH cudaMemcpyDeviceToHost 13 | #define cuDtoD cudaMemcpyDeviceToDevice 14 | 15 | #ifndef cuD 16 | #define cuD(x) ((x).d_data) 17 | #endif 18 | 19 | #define cuASSERT(exp, msg) \ 20 | if (!(exp)) { \ 21 | fprintf(stderr, msg); \ 22 | exit(-1); \ 23 | } 24 | 25 | #define CUDA_SAFE_CALL(call) \ 26 | { \ 27 | cudaError_t err = call; \ 28 | if (cudaSuccess != err) { \ 29 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 30 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } 34 | 35 | 36 | #define CHECK_LAUNCH_ERROR() \ 37 | { \ 38 | cudaError_t err = cudaGetLastError(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | err = cudaThreadSynchronize(); \ 45 | if (cudaSuccess != err) { \ 46 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 47 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 48 | exit(EXIT_FAILURE); \ 49 | } \ 50 | } 51 | 
#ifndef ESP_UTIL_H
#define ESP_UTIL_H

/* NOTE(review): the stdlib include names were lost in the dump this was
 * recovered from; the set below covers every name used in this header
 * (printf/fprintf/fread, malloc/calloc/exit, uint8_t, ceilf, assert). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include <assert.h>


#ifdef __cplusplus
extern "C" {
#endif

/* Row-major indexing helpers. */
#define ID2(i, j, N) ((i)*(N) + (j))
#define ID3(i, j, k, N, L) (ID2(i,j,N)*(L) + (k))
#define IDX(i, j, k, N, L) (((i)*N + (j))*(L) + (k))

#define SET(x, y) (x).y = y
#define PSET(x, y) (x)->y = y

#define PAD(x, p) ((x) + ((p)<<1))
#define CEIL(x, y) (((x) + (y) - 1) / (y))
#define ROUND_UP(x, y) (CEIL(x, y) * (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* Output length of a valid convolution/pool with window y and stride z. */
#define OUT_LEN(x, y, z) (ceilf(((x)-(y)+1)/(float)z))

#define SP printf(" ");
#define NL printf("\n");
#define SEP printf(" | ");
#define D(t) (t.data)
#define LEN(t) (t.bytes/sizeof(float))
#define FOR(x,y,n) for (int x=y; x<(n); x++)
/* BUG FIX: these previously passed sizeof(int) / sizeof(uint8_t) as the
 * `type` argument of FREAD, which expands to sizeof(sizeof(int)) ==
 * sizeof(size_t) inside FREAD and therefore read 8 bytes per int on
 * LP64 platforms.  FREAD expects the bare type name. */
#define READ_INT(x, pf) FREAD(x, int, 1, pf, "asd")
#define READ_UINT8(x, pf) FREAD(x, uint8_t, 1, pf, "asd")
#define MALLOC(type, num) (type *) malloc((num) * sizeof(type))
#define BYTES(type, num) ((num) * sizeof(type))
#define CALLOC(type, num) (type *) calloc(num, sizeof(type))

/* Read `num` items of `type` from pf into x; abort with msg on short read. */
#define FREAD(x, type, num, pf, msg) \
    if (fread(x, sizeof(type), num, pf) != (size_t)(num)) { \
        fprintf(stderr, msg); \
        exit(2); \
    }

#ifdef NDEBUG
/* BUG FIX: the NDEBUG branch referenced undefined EXP/MSG instead of the
 * macro parameters.  assert() is a no-op under NDEBUG anyway, so this is
 * only a compile-hygiene fix. */
#define ASSERT(exp, msg) assert((exp) && (msg))

#else
#define ASSERT(exp, msg) \
    if (!(exp)) { \
        fprintf(stderr, msg); \
        exit(1); \
    }
#endif


#ifdef __cplusplus
}
#endif

#endif /* ESP_UTIL_H */
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Usage: readEspresso.py <infile>

Options:
    -h --help
"""
# NOTE(review): the original docopt argument name was lost in extraction;
# <infile> is a reconstruction — the Usage string and the args[...] key
# below just have to agree.

import os
import sys
import struct
import numpy as np

# Layer-type tags and record-kind flags; must match include/params.h
# (INPUTL/DENSEL/BNORML/CONVL/POOLL and LNUM/LDAT).
INPUT = 0
DENSE = 1
BNORM = 2
CONV = 3
POOL = 4
NUM = 1 << 4
DATA = 2 << 4


def pop_string(buf, n):
    """Split *buf* into (first n bytes, remainder)."""
    return buf[:n], buf[n:]


def uint8_read(buf):
    """Pop one unsigned byte; return (value, remainder)."""
    head, rest = pop_string(buf, 1)
    return struct.unpack('B', head)[0], rest


def int32_read(buf):
    """Pop one native int32; return (value, remainder)."""
    head, rest = pop_string(buf, 4)
    return struct.unpack('i', head)[0], rest


def float_read(buf):
    """Pop one float32; return (value, remainder).

    BUG FIX: the original body referenced undefined globals ``string``
    and ``i`` and raised NameError when called.
    """
    head, rest = pop_string(buf, 4)
    return struct.unpack('f', head)[0], rest


def main(infile):
    """Parse an .esp parameter dump and return the list of numpy arrays.

    Prints a one-line summary per record as it goes.
    """
    assert os.path.exists(infile)
    parameters = []
    with open(infile, 'rb') as f:
        data = f.read()

    while data:
        caz, data = uint8_read(data)

        if caz == DENSE | NUM:
            num, data = int32_read(data)
            print('ndense %d' % num)

        elif caz == BNORM | NUM:
            num, data = int32_read(data)
            print('nbnorm %d' % num)

        elif caz == POOL | NUM:
            num, data = int32_read(data)
            print('npool %d' % num)

        elif caz == CONV | NUM:
            num, data = int32_read(data)
            print('nconv %d' % num)

        elif caz == INPUT | DATA:
            # Input layers carry no payload in current dumps.
            pass

        elif caz == DENSE | DATA:
            # M x N weight matrix followed by M biases.
            caz, data = pop_string(data, 4 * 2)
            M, N = struct.unpack('2i', caz)
            print('dense %d %d' % (M, N))
            caz, data = pop_string(data, 4 * M * N)
            parameters.append(
                np.frombuffer(caz, dtype=np.float32).reshape((M, N)))
            caz, data = pop_string(data, 4 * M)
            parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == BNORM | DATA:
            # Four length-`dim` vectors: mean, istd, gamma, beta.
            dim, data = int32_read(data)
            print('bnorm %d' % dim)
            for _ in range(4):
                caz, data = pop_string(data, 4 * dim)
                parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == CONV | DATA:
            caz, data = pop_string(data, 4 * 7)
            pad, nfil, M, N, nch, Sx, Sy = struct.unpack('7i', caz)
            print('conv %d %d %d %d %d %d %d'
                  % (pad, nfil, M, N, nch, Sx, Sy))
            caz, data = pop_string(data, 4 * M * N * nch * nfil)
            parameters.append(
                np.frombuffer(caz, dtype=np.float32)
                  .reshape((nfil, nch, M, N)))
            caz, data = pop_string(data, 4 * nfil)
            parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == POOL | DATA:
            # Pool layers carry only their 4-int geometry, no weights.
            dim, data = pop_string(data, 4 * 4)
            dim = struct.unpack('4i', dim)
            print('pool %d %d %d %d' % tuple(dim))

        else:
            print('caz')

    return parameters


if __name__ == '__main__':
    from docopt import docopt

    args = docopt(__doc__)
    params = main(args['<infile>'])
    import pdb; pdb.set_trace()
cupack(src, out); 11 | } 12 | -------------------------------------------------------------------------------- /src/activations/relul.c: -------------------------------------------------------------------------------- 1 | #include "tens.h" 2 | #include "util.h" 3 | 4 | 5 | void relu_forward(ftens *t) 6 | { 7 | const int len = ftens_len(t); 8 | for (int i=0; i < len; i++) 9 | t->data[i] = MAX(0, t->data[i]); 10 | } 11 | 12 | void relu_backward(ftens *dout) 13 | { 14 | fprintf(stderr, "not implemeted yer\n"); 15 | exit(-4); 16 | } 17 | -------------------------------------------------------------------------------- /src/activations/signl.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | void signAct_forward(ftens *t) 5 | { 6 | ftens_sign(t); 7 | //const int len = ftens_len(t); 8 | //for (int i=0; i < len; i++) 9 | //t->data[i] = 2.0f * (t->data[i] > 0.0f) - 1.0f; 10 | } 11 | 12 | 13 | void signAct_backward(ftens *t) 14 | { 15 | fprintf(stderr, "not implemeted yet\n"); 16 | exit(-4); 17 | } 18 | -------------------------------------------------------------------------------- /src/activations/signl.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "kernels.h" 3 | 4 | 5 | void cusignAct_forward(cuftens *t) 6 | { 7 | cusign(t); 8 | } 9 | 10 | 11 | void cusignAct_backward(cuftens *dout) 12 | { 13 | fprintf(stderr, "not implemented yet\n"); 14 | exit(-4); 15 | } 16 | -------------------------------------------------------------------------------- /src/activations/softmaxl.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | void softmax_forward(ftens *t) 5 | { 6 | fprintf(stderr, "not implemeted yet\n"); 7 | exit(-4); 8 | } 9 | 10 | 11 | void softmax_backward(ftens *dout) 12 | { 13 | fprintf(stderr, "not implemeted yet\n"); 14 | exit(-4); 15 | } 
16 | -------------------------------------------------------------------------------- /src/cnn.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers.h" 3 | #include "params.h" 4 | #include "nn/cnn.h" 5 | 6 | 7 | cnn cnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 8 | { 9 | cnn out = {Ncl, Npl, Ndl, Nbnl}; 10 | out.cl = Ncl ? MALLOC(convLayer, Ncl) : NULL; 11 | out.pl = Npl ? MALLOC(poolLayer, Npl) : NULL; 12 | out.dl = Ndl ? MALLOC(denseLayer, Ndl) : NULL; 13 | out.bnl = Nbnl ? MALLOC(bnormLayer, Nbnl) : NULL; 14 | for (int i=0; iNcl, net->Npl, net->Ndl, net->Nbnl); 65 | } 66 | 67 | void cnn_free(cnn *net) 68 | { 69 | inputLayer_free(&net->il); 70 | for (int i=0; iNcl; i++) convLayer_free(net->cl + i); 71 | for (int i=0; iNpl; i++) poolLayer_free(net->pl + i); 72 | for (int i=0; iNdl; i++) denseLayer_free(net->dl + i); 73 | for (int i=0; iNbnl; i++) bnormLayer_free(net->bnl + i); 74 | } 75 | -------------------------------------------------------------------------------- /src/cnn.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "culayers.h" 3 | #include "nn/cucnn.h" 4 | 5 | 6 | cucnn cucnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 7 | { 8 | cucnn out = {Ncl, Npl, Ndl, Nbnl}; 9 | out.cl = Ncl ? MALLOC(cuconvLayer, Ncl) : NULL; 10 | out.pl = Npl ? MALLOC(cupoolLayer, Npl) : NULL; 11 | out.dl = Ndl ? MALLOC(cudenseLayer, Ndl) : NULL; 12 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 13 | for (int i=0; iil); 23 | for (int i=0; iNcl; i++) cuconvLayer_free(net->cl + i); 24 | for (int i=0; iNpl; i++) cupoolLayer_free(net->pl + i); 25 | for (int i=0; iNdl; i++) cudenseLayer_free(net->dl + i); 26 | for (int i=0; iNbnl; i++) cubnormLayer_free(net->bnl + i); 27 | } 28 | 29 | 30 | cucnn cucnn_convert(cnn *net) 31 | { 32 | const int Ncl=net->Ndl, Npl=net->Npl; 33 | const int Ndl=net->Ndl, Nbnl=net->Nbnl; 34 | cucnn out = cucnn_init(Ncl, Npl, Ndl, Nbnl); 35 | for (int i=0; icl[i], &out.cl[i]); 37 | 38 | for (int i=0; ipl[i], &out.pl[i]); 40 | 41 | for (int i=0; idl[i], &out.dl[i]); 43 | 44 | for (int i=0; ibnl[i], &out.bnl[i]); 46 | 47 | return out; 48 | } 49 | 50 | 51 | void cucnn_print(cucnn *net) 52 | { 53 | printf("CUCNN: Ncl=%d Npl=%d Ndl=%d Nbnl=%d\n", 54 | net->Ncl, net->Npl, net->Ndl, net->Nbnl); 55 | } 56 | -------------------------------------------------------------------------------- /src/cumem.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cumem.h" 3 | 4 | #define PTR_INIT(x) (x.total=0, x.used=0, x.base=NULL, x.curr=NULL) 5 | 6 | cufmem fptr = {0, 0, NULL, NULL}; 7 | cupmem pptr = {0, 0, NULL, NULL}; 8 | 9 | 10 | int _cufmem() {return fptr.base != NULL;} 11 | int _cupmem() {return pptr.base != NULL;} 12 | 13 | 14 | void cufmem_alloc(int bytes) 15 | { 16 | fptr.total = bytes; 17 | fptr.used = 0; 18 | cudaMalloc(&fptr.base, bytes); 19 | cuASSERT(fptr.base, "err:\n"); 20 | fptr.curr = fptr.base; 21 | } 22 | 23 | float *cufmem_reserve(int bytes) 24 | { 25 | float *out = fptr.curr; 26 | cuASSERT(fptr.base, "err: cufmem not init \n"); 27 | cuASSERT(fptr.used + bytes < fptr.total, "err: out of cufmem\n"); 28 | fptr.used += bytes; 29 | fptr.curr += (bytes/sizeof(float)); 30 | return out; 31 | } 32 | 33 | void cufmem_free() 34 | { 35 | if (fptr.base) cudaFree(fptr.base); 36 | PTR_INIT(fptr); 37 | } 38 | 39 | void cufmem_reset() 40 | { 41 | 
fptr.used = 0; 42 | fptr.curr = fptr.base; 43 | } 44 | 45 | void cupmem_alloc(int bytes) 46 | { 47 | pptr.total = bytes; 48 | pptr.used = 0; 49 | cudaMalloc(&pptr.base, bytes); 50 | cuASSERT(pptr.base, "err:\n"); 51 | pptr.curr = pptr.base; 52 | } 53 | 54 | uint64_t *cupmem_reserve(int bytes) 55 | { 56 | uint64_t *out = pptr.curr; 57 | cuASSERT(pptr.base, "err: cumem not init \n"); 58 | cuASSERT(pptr.used + bytes < pptr.total, "err: out of cupmem\n"); 59 | pptr.used += bytes; 60 | pptr.curr += (bytes/sizeof(uint64_t)); 61 | return out; 62 | } 63 | 64 | void cupmem_free() 65 | { 66 | if (pptr.base) cudaFree(pptr.base); 67 | PTR_INIT(pptr); 68 | } 69 | 70 | void cupmem_reset() 71 | { 72 | pptr.used = 0; 73 | pptr.curr = pptr.base; 74 | } 75 | -------------------------------------------------------------------------------- /src/handlers/cifar.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | #define W 32 5 | #define H 32 6 | #define L 3 7 | #define TRAIN_IMG 50000 8 | #define TEST_IMG 10000 9 | 10 | 11 | void cifar10_load_Xy(const char *tf, int start, int num, 12 | ftens *X, ftens *y) 13 | { 14 | ASSERT(start + num < TEST_IMG, "err: cifar num\n"); 15 | ASSERT(X->MNL == W*H*L, "err: input shape\n"); 16 | uint8_t X_buff[W*H*L]; 17 | uint8_t y_buff; 18 | FILE *pf = fopen(tf, "rb"); 19 | ASSERT(pf, "err: fopen \n"); 20 | ftens tmpX = ftens_init(num, L, W, H); 21 | ftens_clear(y); 22 | fseek(pf, (W*H*L+1)*start, SEEK_SET); 23 | for (int i=0; i < num; i++) { 24 | float *outX = tmpX.data + i * tmpX.MNL; 25 | float *outy = y->data + i * y->MNL; 26 | fread(&y_buff, sizeof(uint8_t), 1, pf); 27 | fread(X_buff, sizeof(uint8_t), W*H*L, pf); 28 | outy[y_buff] = 1.0f; 29 | for (int i=0; i < W*H*L; i++) 30 | outX[i] = (float) X_buff[i]; 31 | } 32 | 33 | ftens_tch(&tmpX, X); 34 | ftens_free(&tmpX); 35 | } 36 | 37 | #undef W 38 | #undef H 39 | #undef L 40 | 
-------------------------------------------------------------------------------- /src/handlers/mnist.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | #include "tens.h" 6 | 7 | #define W 28 8 | #define H 28 9 | #define C 1 10 | #define IMG_B W*H 11 | #define LAB_B 1 12 | #define TRAIN_IMG 6000 13 | #define TRAIN_HB 16 14 | #define TEST_HB 8 15 | 16 | 17 | void mnist_load_X(const char *tf, int start, int num, 18 | ftens *X) 19 | { 20 | ASSERT(start + num < TRAIN_IMG, "mnist err img"); 21 | uint8_t buff[W*H]; 22 | FILE *pf = fopen(tf, "rb"); ASSERT(pf, "mnist err tf"); 23 | fseek(pf, TRAIN_HB + start * IMG_B, SEEK_SET); 24 | for (int i=0; i < num; i++) { 25 | float *out = X->data + i*X->MNL; 26 | FREAD(buff, uint8_t, IMG_B, pf, "mnist err read"); 27 | for (int i=0; i < W*H; i++) 28 | out[i] = (float) buff[i]; 29 | } 30 | fclose(pf); 31 | } 32 | 33 | 34 | void mnist_load_y(const char *lf, int start, int num, ftens *y) 35 | { 36 | ASSERT(start + num < TRAIN_IMG, "mnist err lab\n"); 37 | float *out = y->data; uint8_t buff; 38 | FILE *pf = fopen(lf, "rb"); ASSERT(pf, "mnist err rl\n"); 39 | fseek(pf, TEST_HB + start * LAB_B, SEEK_SET); 40 | for (int i=0; i < num; i++) { 41 | memset(out, 0, sizeof(float) * 10); 42 | fread(&buff, sizeof(uint8_t), 1, pf); 43 | out[buff] = 1.0f; 44 | out += y->MNL; 45 | } 46 | } 47 | 48 | 49 | #undef W 50 | #undef H 51 | #undef C 52 | #undef HEADER_BYTES 53 | #undef IMG_B 54 | #undef LAB_B 55 | #undef TRAIN_IMG 56 | #undef TRAIN_HB 57 | #undef TEST_HB 58 | -------------------------------------------------------------------------------- /src/kernels/bnorm.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "bnorm.cuh" 3 | #include "cutens.h" 4 | 5 | void cubnorm (cuftens *mean, cuftens *istd, 6 | cuftens *beta, cuftens *gamma, 7 | cuftens *in) 8 | { 9 | const int TS=32; 10 | const int 
M=in->N, N=in->N, L=in->L; 11 | const int len = cuftens_len(in); 12 | 13 | ker_bnorm <<>> 14 | (mean->data, istd->data, 15 | beta->data, gamma->data, len, L > 1 ? L : M*N, 16 | in->data); 17 | } 18 | -------------------------------------------------------------------------------- /src/kernels/bnorm.cuh: -------------------------------------------------------------------------------- 1 | static __global__ 2 | void ker_bnorm (const float * __restrict__ mean, 3 | const float * __restrict__ istd, 4 | const float * __restrict__ beta, 5 | const float * __restrict__ gamma, 6 | const int len, const int N, 7 | float * __restrict__ dst) 8 | { 9 | int i=threadIdx.x + blockIdx.x * blockDim.x; 10 | 11 | if (i >= len) return; 12 | 13 | dst[i] = (dst[i] - mean[i%N]) * istd[i%N] * gamma[i%N] + beta[i%N]; 14 | } 15 | -------------------------------------------------------------------------------- /src/kernels/bp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "bp.cuh" 3 | #include "cuptens.h" 4 | 5 | 6 | void cubp_split_pack(cuftens *src, cuptens *dst) 7 | { 8 | int D=src->D, Ns=src->MNL, Nd=dst->X*2; 9 | 10 | dim3 grid(D, CEIL(Ns, 32)); 11 | dim3 block(1, 32); 12 | 13 | ker_bpsplit <8> <<>> 14 | (src->data, (uint32_t *)dst->data, Ns, Nd); 15 | 16 | } 17 | 18 | void cubp_merge(cuftens *src, cuftens *dst, cuftens *fix, float norm) 19 | { 20 | const int D=src->D, N=src->MNL/8; 21 | 22 | dim3 grid(D, CEIL(N, 32)); 23 | dim3 block(8, 32); 24 | 25 | ker_bpmerge <8> <<>> 26 | (src->data, dst->data, fix->data, norm, N, fix->N); 27 | } 28 | -------------------------------------------------------------------------------- /src/kernels/bp.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | 4 | template static __global__ 5 | void ker_bpsplit(const float *a, uint32_t *b, int Ns, int Nd) 6 | { 7 | int w=threadIdx.x, W=w + blockIdx.x * blockDim.x; 8 | int 
j=threadIdx.y, J=j + blockIdx.y * blockDim.y; 9 | 10 | __shared__ uint32_t sm[32]; 11 | __shared__ uint32_t c[B]; 12 | 13 | if (J >= Ns) return; 14 | 15 | sm[j] = (uint32_t) a[W*Ns + J]; 16 | 17 | __syncthreads(); 18 | 19 | #pragma unroll 20 | for (int i = 0; i < B; i++) { 21 | c[i] = __ballot((sm[j] & (1 << i))); 22 | } 23 | 24 | __syncthreads(); 25 | 26 | if (j == 0) { 27 | uint32_t *ptr = b + W*B*Nd; 28 | for (int i=0; i < B; i++) { 29 | ptr[i*Nd + blockIdx.y] = c[i]; 30 | } 31 | } 32 | } 33 | 34 | template static __global__ 35 | void ker_bpmerge(const float *a, float *b, float *fix, float norm, 36 | int N, int N2) 37 | { 38 | int i=threadIdx.x, I=i + blockIdx.x*blockDim.x; 39 | int j=threadIdx.y, J=j + blockIdx.y*blockDim.y; 40 | 41 | __shared__ float sm[B]; 42 | 43 | if (J >= N) return; 44 | 45 | sm[i] = a[I*N + J]; 46 | __syncthreads(); 47 | 48 | if (i == 0) { 49 | int id = blockIdx.x * N + J; 50 | float c = 0.0f; 51 | 52 | #pragma unroll 53 | for (int i=0; i < B; i++) 54 | c += sm[i] * (1< static __global__ 62 | // void ker_bpsplit(const float * __restrict__ a, 63 | // uint32_t * __restrict__ b, 64 | // const int N) 65 | // { 66 | // int j=threadIdx.x, J=j + blockIdx.x*blockDim.x; 67 | 68 | // __shared__ uint32_t p[32]; 69 | // __shared__ uint32_t c[B]; 70 | 71 | // p[j] = (uint32_t) a[J]; 72 | 73 | // __syncthreads(); 74 | 75 | // #pragma unroll 76 | // for (int i=0; i < B; i++) 77 | // c[i] = __ballot((p[j] & (1<>5] = c[i]; 83 | // } 84 | // } 85 | 86 | // template static __global__ 87 | // void ker_bpmerge(int D, int M, int N, 88 | // const float * __restrict__ src, 89 | // float * __restrict__ dst, 90 | // const float * __restrict__ b, 91 | // float norm) 92 | // { 93 | // int i=threadIdx.x, I=i + blockIdx.x*blockDim.x; 94 | // int j=threadIdx.y, J=j + blockIdx.y*blockDim.y; 95 | 96 | // if (I>=D*M || J>=N) return; 97 | 98 | // __shared__ float sm[B][32]; 99 | // sm[i][j] = src[ID2(I,J,N)]; 100 | 101 | // __syncthreads(); 102 | 103 | // if (i == 0) { 104 
| // float c = 0.0f; 105 | // #pragma unroll 106 | // for(int k=0; k < B; k++) 107 | // c += sm[k][j] * (float)(1<D*src->M, Ns=src->N, Ls=src->L; 8 | int Md=dst->D*dst->M, Nd=dst->N, Ld=dst->L; 9 | 10 | if (Ls > 1 && Ld > 1) { 11 | int TS = 8; 12 | dim3 grid(CEIL(Ls,TS), CEIL(Ns,TS), CEIL(Ms,TS)); 13 | dim3 block(TS, TS, TS); 14 | 15 | ker_copy3D <<>> 16 | (src->data, dst->data, Ms, Ns, Ls, Md, Nd, Ld); 17 | } else { 18 | int TS = 16; 19 | dim3 grid(CEIL(Ns, TS), CEIL(Ms, TS)); 20 | dim3 block(TS, TS); 21 | 22 | ker_copy2D <<>> 23 | (src->data, dst->data, Ms, Ns, Md, Nd); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/copy.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | __global__ static 4 | void ker_copy2D (const float *src, float *dst, 5 | const int Ms, const int Ns, 6 | const int Md, const int Nd) 7 | { 8 | int j=threadIdx.x + blockIdx.x*blockDim.x; 9 | int i=threadIdx.y + blockIdx.y*blockDim.y; 10 | 11 | if (i>=Ms || j>=Ns) return; 12 | 13 | dst[ID2(i,j,Nd)] = src[ID2(i,j,Ns)]; 14 | } 15 | 16 | 17 | __global__ static 18 | void ker_copy3D (const float *src, float *dst, 19 | const int Ms, const int Ns, const int Ls, 20 | const int Md, const int Nd, const int Ld) 21 | { 22 | int k=threadIdx.x + blockIdx.x * blockDim.x; 23 | int j=threadIdx.y + blockIdx.y * blockDim.y; 24 | int i=threadIdx.z + blockIdx.z * blockDim.z; 25 | 26 | if (i>=Ms || j>=Ns || k>=Ls) return; 27 | 28 | dst[ID3(i,j,k,Nd,Ld)] = src[ID3(i,j,k,Ns,Ls)]; 29 | } 30 | -------------------------------------------------------------------------------- /src/kernels/lower.cu: -------------------------------------------------------------------------------- 1 | #include "lower.cuh" 2 | #include "cuptens.h" 3 | 4 | #define KER_SETUP(type, TM, TN, TL) \ 5 | int bytes = (TM+H-1) * (TN+W-1)* TL * sizeof(type); \ 6 | dim3 grid(CEIL(L, TL), CEIL(Nd, TN), CEIL(Md, TM)); \ 7 | dim3 
block(TL, TN, TM) 8 | 9 | 10 | 11 | void culower(cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy) 12 | { 13 | const int D=src->D, L=src->L; 14 | const int Ms=src->M, Ns=src->N; 15 | const int Md=dst->M, Nd=dst->N; 16 | 17 | cuASSERT(D==dst->D, "err: lower shape\n"); 18 | 19 | const int TM=8, TN=8, TL=8; 20 | KER_SETUP(float, TM, TN, TL); 21 | for (int w=0; w < D; w++) { 22 | float *s = src->data + w * src->MNL; 23 | float *d = dst->data + w * dst->MNL; 24 | ker_lower <<>> 25 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 26 | } 27 | } 28 | 29 | 30 | void cuplower(cuptens *src, cuptens *dst, int W, int H, int Sx, int Sy) 31 | { 32 | const int D=src->D, L=src->X; 33 | const int Ms=src->M, Ns=src->N; 34 | const int Md=dst->M, Nd=dst->N; 35 | 36 | cuASSERT(D==dst->D, "err: lower shape\n"); 37 | 38 | const int TM=16, TN=16, TL=4; 39 | KER_SETUP(uint64_t, TM, TN, TL); 40 | 41 | for (int w=0; w < D; w++) { 42 | uint64_t *s = src->data + w * src->MNL; 43 | uint64_t *d = dst->data + w * dst->MNL; 44 | ker_plower <<>> 45 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 46 | } 47 | } 48 | 49 | 50 | #undef KER_SETUP 51 | -------------------------------------------------------------------------------- /src/kernels/lower.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | 4 | #define SM_FILL(sm) { \ 5 | sm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; \ 6 | if (i static __global__ 25 | void ker_lower(const float * __restrict__ src, 26 | float * __restrict__ dst, 27 | int Ms, int Ns, int Md, int Nd, int L, 28 | int W, int H, int Sx, int Sy) 29 | { 30 | int k=threadIdx.x, K=k+blockIdx.x * blockDim.x; 31 | int j=threadIdx.y, J=j+blockIdx.y * blockDim.y; 32 | int i=threadIdx.z, I=i+blockIdx.z * blockDim.z; 33 | const int X=TN+W-1, Z=TL, Ld=W*H*L; 34 | 35 | if (I>=Ms || J>=Ns || K>=L) return; 36 | 37 | extern __shared__ float sm[]; 38 | 39 | SM_FILL(sm); 40 | 41 | // sm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; 42 | // if (i=Md 
|| J/Sx>=Nd || K>=L) return; 55 | 56 | if ((I % Sy) == 0 && (J % Sx) == 0) { 57 | DST_FILL(sm); 58 | // int l=0, R=(I/Sy*Nd + J/Sx)*Ld; 59 | // for (int y=0; y < H; y++) 60 | // for (int x=0; x < W; x++) { 61 | // dst[R+l*L+K] = sm[ID3(i+y,j+x,k,X,Z)]; 62 | // l++; 63 | // } 64 | } 65 | } 66 | 67 | 68 | template static __global__ 69 | void ker_plower(const uint64_t * __restrict__ src, 70 | uint64_t * __restrict__ dst, 71 | int Ms, int Ns, int Md, int Nd, int L, 72 | int W, int H, int Sx, int Sy) 73 | { 74 | int k=threadIdx.x, K=k+blockIdx.x * blockDim.x; 75 | int j=threadIdx.y, J=j+blockIdx.y * blockDim.y; 76 | int i=threadIdx.z, I=i+blockIdx.z * blockDim.z; 77 | const int X=TN+W-1, Z=TL, Ld=W*H*L; 78 | 79 | if (I>=Ms || J>=Ns || K>=L) return; 80 | 81 | extern __shared__ uint64_t psm[]; 82 | 83 | SM_FILL(psm); 84 | 85 | // psm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; 86 | // if (i=Md || J/Sx>=Nd || K>=L) return; 99 | 100 | if ((I % Sy) == 0 && (J % Sx) == 0) { 101 | DST_FILL(psm); 102 | // int l=0, R=(I/Sy*Nd + J/Sx)*Ld; 103 | // for (int y=0; y < H; y++) 104 | // for (int x=0; x < W; x++) { 105 | // dst[R+l*L+K] = psm[ID3(i+y,j+x,k,X,Z)]; 106 | // l++; 107 | // } 108 | } 109 | } 110 | 111 | #undef SM_FILL 112 | #undef DST_FILL 113 | -------------------------------------------------------------------------------- /src/kernels/norm.cu: -------------------------------------------------------------------------------- 1 | #include "norm.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cunorm (cuftens *t) 6 | { 7 | const int BS=32; 8 | const int len = t->bytes/sizeof(float); 9 | 10 | ker_norm <<>> (t->data, len); 11 | } 12 | -------------------------------------------------------------------------------- /src/kernels/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | static __global__ 4 | void ker_norm(float * __restrict__ src, const int len) 5 | { 6 | int i=threadIdx.x + blockIdx.x * blockDim.x; 7 | if 
(i >= len) return; 8 | src[i] = 2.0f * src[i] / 255.0f - 1.0f; 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/pack.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "pack.cuh" 3 | #include "cuptens.h" 4 | 5 | // void cupack(const float *a, uint32_t *b, int M, int N, int L) 6 | // { 7 | // ker_pack <<>> (a, b); 8 | // } 9 | 10 | 11 | void cupack(cuftens *src, cuptens *dst) 12 | { 13 | const int len = cuftens_len(src); 14 | ker_pack <<>> 15 | (src->data, (uint32_t *)dst->data); 16 | } 17 | -------------------------------------------------------------------------------- /src/kernels/pack.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static __global__ 4 | void ker_pack(const float * __restrict__ a, 5 | uint32_t * __restrict__ b) 6 | { 7 | int i=threadIdx.x + blockIdx.x*blockDim.x; 8 | b[i>>5] = __ballot((uint32_t)(a[i] > 0.0f)); 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/pad.cu: -------------------------------------------------------------------------------- 1 | #include "pad.cuh" 2 | #include "cuptens.h" 3 | 4 | 5 | void cupad(cuftens *src, cuftens *dst, int p) 6 | { 7 | const int D=src->D, L=src->L; 8 | const int Ms=src->M, Ns=src->N; 9 | const int Md=dst->M, Nd=dst->N; 10 | 11 | cuASSERT(D == dst->D && 12 | L == dst->L && 13 | Md >= Ms && 14 | Nd >= Ns, "err: pad dim\n"); 15 | 16 | cudaMemset(dst->data, 0, dst->bytes); 17 | 18 | cupad_template 19 | (src->data, dst->data, p, D, L, Ms, Ns, Md, Nd); 20 | 21 | // if (L == 1) { 22 | // const int BS = 16; 23 | // dim3 grid(CEIL(Ns, BS), CEIL(Ms, BS)); 24 | // dim3 block(BS, BS); 25 | // for (int w=0; w < D; w++) { 26 | // float *s = src->data + w * src->MNL; 27 | // float *d = dst->data + w * dst->MNL; 28 | // ker_pad2D <<>> 29 | // (s, d, p, Ms, Ns, Md, Nd); 30 | // } 31 | 
// } else { 32 | // const int BS = 8; 33 | // dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 34 | // dim3 block(BS, BS, BS); 35 | // for (int w=0; w < D; w++) { 36 | // float *s = src->data + w * src->MNL; 37 | // float *d = dst->data + w * dst->MNL; 38 | // ker_pad3D <<>> 39 | // (s, d, p, Ms, Ns, Md, Nd, L); 40 | // } 41 | // } 42 | } 43 | 44 | 45 | void cupad(cuptens *src, cuptens *dst, int p) 46 | { 47 | const int Ms=src->M, Ns=src->N; 48 | const int Md=dst->M, Nd=dst->N; 49 | const int L=src->X, D=src->D; 50 | 51 | cuASSERT(L==dst->X && D==dst->D, "err: pad dim\n"); 52 | 53 | cudaMemset(dst->data, 0, dst->bytes); 54 | 55 | cupad_template 56 | (src->data, dst->data, p, D, L, Ms, Ns, Md, Nd); 57 | 58 | // const int BS = 8; 59 | // dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 60 | // dim3 block(BS, BS, BS); 61 | 62 | // for (int w=0; w < D; w++) { 63 | // uint64_t *s = src->data + w * src->MNL; 64 | // uint64_t *d = dst->data + w * dst->MNL; 65 | // ker_pad3D <<>> 66 | // (s, d, p, Ms, Ns, Md, Nd, L); 67 | // } 68 | } 69 | -------------------------------------------------------------------------------- /src/kernels/pad.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | template static __global__ 4 | void ker_pad2D(const T * __restrict__ src, 5 | T * __restrict__ dst, int p, 6 | int Ms, int Ns, int Md, int Nd) 7 | { 8 | int j=threadIdx.x + blockIdx.x*blockDim.x; 9 | int i=threadIdx.y + blockIdx.y*blockDim.y; 10 | 11 | if (i>=Ms || j>=Ns) return; 12 | 13 | dst[ID2(i+p,j+p,Nd)] = src[ID2(i,j,Ns)]; 14 | } 15 | 16 | 17 | template static __global__ 18 | void ker_pad3D(const T * __restrict__ src, 19 | T * __restrict__ dst, int p, 20 | int Ms, int Ns, int Md, int Nd, int L) 21 | { 22 | int k=threadIdx.x + blockIdx.x*blockDim.x; 23 | int j=threadIdx.y + blockIdx.y*blockDim.y; 24 | int i=threadIdx.z + blockIdx.z*blockDim.z; 25 | 26 | if (i>=Ms || j>=Ns || k>=L) return; 27 | 28 | 
dst[ID3(i+p,j+p,k,Nd,L)] = src[ID3(i,j,k,Ns,L)]; 29 | } 30 | 31 | 32 | template 33 | void cupad_template(T *src, T *dst, int p, int D, int L, 34 | int Ms, int Ns, int Md, int Nd) 35 | { 36 | if (L == 1) { 37 | const int BS = 16; 38 | dim3 grid(CEIL(Ns, BS), CEIL(Ms, BS)); 39 | dim3 block(BS, BS); 40 | for (int w=0; w < D; w++) { 41 | T *s = src + w * Ms*Ns*L; 42 | T *d = dst + w * Md*Nd*L; 43 | ker_pad2D <<>> 44 | (s, d, p, Ms, Ns, Md, Nd); 45 | } 46 | } else { 47 | const int BS = 8; 48 | dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 49 | dim3 block(BS, BS, BS); 50 | for (int w=0; w < D; w++) { 51 | T *s = src + w * Ms*Ns*L; 52 | T *d = dst + w * Md*Nd*L; 53 | ker_pad3D <<>> 54 | (s, d, p, Ms, Ns, Md, Nd, L); 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/kernels/pgemm.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "pgemm.cuh" 3 | 4 | 5 | template 8 | 9 | static void 10 | pgemm_template(int M, int N, int K, 11 | const uint64_t * __restrict__ A, int LDA, 12 | const uint64_t * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | size_t offsA=0, offsB=0; 16 | offsA /= sizeof(A[0]); 17 | offsB /= sizeof(B[0]); 18 | 19 | dim3 dimBlock(DIM_Y, DIM_X); 20 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 21 | 22 | switch (INIT) { 23 | case 0: 24 | pgemm_kernel 27 | <<>> 28 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 29 | break; 30 | 31 | case 1: 32 | pgemm_kernel_init 35 | <<>> 36 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 37 | break; 38 | 39 | case 2: 40 | pgemm_kernel_init_rev 43 | <<>> 44 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 45 | break; 46 | } 47 | } 48 | 49 | 50 | void pgemm(int M, int N, int K, 51 | const uint64_t * __restrict__ A, 52 | const uint64_t * __restrict__ B, 53 | float * __restrict__ C) 54 | { 55 | pgemm_template 56 | <0, 16,16, 16,16,16, 16,16, 16,16> 57 | (M, N, 
K, A, K, B, K, C, N); 58 | } 59 | 60 | 61 | void pgemm_init(int M, int N, int K, 62 | const uint64_t * __restrict__ A, 63 | const uint64_t * __restrict__ B, 64 | float * __restrict__ C) 65 | { 66 | pgemm_template 67 | <1, 16,16, 16,16,16, 16,16, 16,16> 68 | (M, N, K, A, K, B, K, C, N); 69 | } 70 | 71 | 72 | void pgemm_init(int M, int N, int K, 73 | const uint64_t * __restrict__ a, int lda, 74 | const uint64_t * __restrict__ b, int ldb, 75 | float * __restrict__ c, int ldc) 76 | { 77 | pgemm_template <1, 16, 16, 16,16,16, 16,16, 16,16> 78 | (M, N, K, a, lda, b, ldb, c, ldc); 79 | } 80 | 81 | 82 | void pgemm_init_rev(const int M, const int N, const int K, 83 | const uint64_t * __restrict__ A, 84 | const uint64_t * __restrict__ B, 85 | float * __restrict__ C) 86 | { 87 | pgemm_template 88 | <2, 16,16, 16,16,16, 16,16, 16,16> 89 | (M, N, K, A, K, B, K, C, N); 90 | } 91 | 92 | /////////////////////////////////////////////////////// 93 | template 96 | 97 | static void 98 | pgemm32_template(int M, int N, int K, 99 | const uint32_t * __restrict__ A, int LDA, 100 | const uint32_t * __restrict__ B, int LDB, 101 | float * __restrict__ C, int LDC) 102 | { 103 | size_t offsA=0, offsB=0; 104 | offsA /= sizeof(A[0]); 105 | offsB /= sizeof(B[0]); 106 | 107 | dim3 dimBlock(DIM_Y, DIM_X); 108 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 109 | 110 | pgemm32_kernel 113 | 114 | <<>> 115 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 116 | } 117 | 118 | 119 | void pgemm32(int M, int N, int K, 120 | const uint32_t * __restrict__ A, 121 | const uint32_t * __restrict__ B, 122 | float * __restrict__ C) 123 | { 124 | pgemm32_template 125 | <0, 16,16, 16,16,16, 16,16, 16,16> 126 | (M, N, K, A, K, B, K, C, N); 127 | } 128 | -------------------------------------------------------------------------------- /src/kernels/pgemm.cuh: -------------------------------------------------------------------------------- 1 | #define fetch(A, m, n, bound) offs_##A[MIN((m)*LD##A+n, bound)] 2 | 3 | 4 | 
__forceinline__ __device__ 5 | long long int int2_as_longlong (int2 a) 6 | { 7 | long long int res; 8 | asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y)); 9 | return res; 10 | } 11 | 12 | 13 | // __forceinline__ __device__ 14 | // void xcaz(int init, int *c, uint64_t a, uint64_t b) 15 | // { 16 | // switch (init) { 17 | // case 0: *c += __popcll(a ^ b); break; 18 | // case 1: *c += __popcll(a & b) - __popcll((a ^ b) & b); break; 19 | // case 2: *c += __popcll(a & b) - __popcll((a ^ b) & a); break; 20 | // } 21 | // } 22 | 23 | 24 | template 29 | 30 | static __global__ 31 | void pgemm_kernel(const int M, const int N, const int K, 32 | const uint64_t * __restrict__ A, const int LDA, 33 | const uint64_t * __restrict__ B, const int LDB, 34 | float * __restrict__ C, const int LDC, 35 | int offsA, int offsB) 36 | { 37 | int blx=blockIdx.y, bly=blockIdx.x; 38 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 39 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 40 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 41 | 42 | int rC[THR_M][THR_N]; 43 | uint64_t rA[THR_M], rB[THR_N]; 44 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 45 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 46 | 47 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 48 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 49 | 50 | 51 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 52 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 53 | 54 | #ifdef TEX1D 55 | int coord_A = offsA + cazA; 56 | int coord_B = offsB + cazB; 57 | #else 58 | const uint64_t *offs_A = A + cazA; 59 | const uint64_t *offs_B = B + cazB; 60 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 61 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 62 | #endif 63 | 64 | #undef cazA 65 | #undef cazB 66 | 67 | int m, n, k, kk; 68 | 69 | #pragma unroll 70 | for (m=0; m 191 | 192 | static __global__ 193 | void pgemm_kernel_init(const int M, const int N, const int K, 194 | const uint64_t * __restrict__ A, const int LDA, 195 | const uint64_t * __restrict__ B, const int 
LDB, 196 | float * __restrict__ C, const int LDC, 197 | int offsA, int offsB) 198 | { 199 | int blx=blockIdx.y, bly=blockIdx.x; 200 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 201 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 202 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 203 | 204 | int rC[THR_M][THR_N]; 205 | uint64_t rA[THR_M], rB[THR_N]; 206 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 207 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 208 | 209 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 210 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 211 | 212 | 213 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 214 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 215 | 216 | const uint64_t *offs_A = A + cazA; 217 | const uint64_t *offs_B = B + cazB; 218 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 219 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 220 | 221 | #undef cazA 222 | #undef cazB 223 | 224 | int m, n, k, kk; 225 | 226 | #pragma unroll 227 | for (m=0; m 342 | 343 | static __global__ 344 | void pgemm_kernel_init_rev(const int M, const int N, const int K, 345 | const uint64_t * __restrict__ A, const int LDA, 346 | const uint64_t * __restrict__ B, const int LDB, 347 | float * __restrict__ C, const int LDC, 348 | int offsA, int offsB) 349 | { 350 | int blx=blockIdx.y, bly=blockIdx.x; 351 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 352 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 353 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 354 | 355 | int rC[THR_M][THR_N]; 356 | uint64_t rA[THR_M], rB[THR_N]; 357 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 358 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 359 | 360 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 361 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 362 | 363 | 364 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 365 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 366 | 367 | const uint64_t *offs_A = A + cazA; 368 | const uint64_t *offs_B = B + cazB; 369 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 370 | ptrdiff_t 
boundB = (LDB*(N-1)+K) - cazB - 1; 371 | 372 | #undef cazA 373 | #undef cazB 374 | 375 | int m, n, k, kk; 376 | 377 | #pragma unroll 378 | for (m=0; m 487 | 488 | static __global__ 489 | void pgemm32_kernel(const int M, const int N, const int K, 490 | const uint32_t * __restrict__ A, const int LDA, 491 | const uint32_t * __restrict__ B, const int LDB, 492 | float * __restrict__ C, const int LDC, 493 | int offsA, int offsB) 494 | { 495 | int blx=blockIdx.y, bly=blockIdx.x; 496 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 497 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 498 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 499 | 500 | int rC[THR_M][THR_N]; 501 | uint32_t rA[THR_M], rB[THR_N]; 502 | uint32_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 503 | uint32_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 504 | 505 | __shared__ uint32_t sA[BLK_M][BLK_K+1]; 506 | __shared__ uint32_t sB[BLK_N][BLK_K+1]; 507 | 508 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 509 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 510 | 511 | const uint32_t *offs_A = A + cazA; 512 | const uint32_t *offs_B = B + cazB; 513 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 514 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 515 | 516 | #undef cazA 517 | #undef cazB 518 | 519 | int m, n, k, kk; 520 | 521 | #pragma unroll 522 | for (m=0; m 5 | 6 | static 7 | void pgemv_template(const int m, const int n, 8 | const uint64_t * __restrict__ A, int lda, 9 | const uint64_t * __restrict__ x, 10 | float * __restrict__ y) 11 | { 12 | dim3 grid(CEIL(m, TS), 1); 13 | dim3 threads(DIM_X, DIM_Y); 14 | 15 | pgemv_kernel 16 | <<>> 17 | (m, n, A, lda, x, y); 18 | 19 | } 20 | 21 | 22 | void pgemv(const int m, const int n, 23 | const uint64_t * __restrict__ A, 24 | const uint64_t * __restrict__ x, 25 | float * __restrict__ y) 26 | { 27 | pgemv_template <128, 1, 128> 28 | (m, n, A, n, x, y); 29 | } 30 | -------------------------------------------------------------------------------- /src/kernels/pgemv.cuh: 
-------------------------------------------------------------------------------- 1 | template 2 | 3 | __global__ 4 | void pgemv_kernel(const int m, const int n, 5 | const uint64_t * __restrict__ A, int lda, 6 | const uint64_t * __restrict__ x, 7 | float * __restrict__ y) 8 | { 9 | if (m <= 0 || n <= 0) return; 10 | 11 | int nt = blockDim.x * blockDim.y * blockDim.z; 12 | 13 | if (DIM_X * DIM_Y != nt) return; 14 | 15 | int tid = threadIdx.x + threadIdx.y * blockDim.x; 16 | int tx = tid % DIM_X, ty = tid / DIM_X; 17 | int ind = blockIdx.x * TS + tx; 18 | 19 | __shared__ int sdata[DIM_X * DIM_Y]; 20 | 21 | int st = blockIdx.x * TS; 22 | int ed = MIN(st + TS, ROUND_UP(m, DIM_X)); 23 | int iters = (ed - st)/DIM_X; 24 | 25 | for (int i=0; i < iters; i++) { 26 | if (ind < m ) A += ind*lda; 27 | int res = 0; 28 | if (ind < m ) { 29 | for (int col=ty; col < n; col += DIM_Y) 30 | res += __popcll(A[col] ^ x[col]); 31 | } 32 | 33 | sdata[ty + tx * DIM_Y] = res; 34 | 35 | __syncthreads(); 36 | 37 | if (ty == 0 && ind < m) { 38 | for (int i=1; i < DIM_Y; i++) 39 | sdata[tx * DIM_Y] += sdata[i + tx * DIM_Y]; 40 | } 41 | 42 | if (ty == 0 && ind < m) 43 | y[ind] = (lda<<6) - (sdata[tx * DIM_Y]<<1); 44 | 45 | __syncthreads(); 46 | 47 | if (ind < m) A -= ind*lda; 48 | ind += DIM_X; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/kernels/pool.cu: -------------------------------------------------------------------------------- 1 | #include "pool.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cumaxpool(cuftens *src, cuftens *dst, 6 | int W, int H, int Sx, int Sy) 7 | { 8 | 9 | int D=src->D, L=src->L; 10 | int Ms=src->M, Ns=src->N; 11 | int Md=dst->M, Nd=dst->N; 12 | 13 | cuASSERT(L == dst->L && D == dst->D, "err: cupool shape\n"); 14 | 15 | int TS = 16; 16 | dim3 grid(CEIL(L, TS), CEIL(Nd, W), CEIL(Md, H)); 17 | dim3 block(TS, W, H); 18 | 19 | for (int w = 0; w < D; w++) { 20 | float *s = src->data + w * src->MNL; 21 | float *d = 
dst->data + w * dst->MNL; 22 | ker_maxpool <<>> 23 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/pool.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.cuh" 3 | 4 | static __global__ 5 | void ker_maxpool (const float * __restrict__ src, 6 | float * __restrict__ dst, 7 | const int Ms, const int Ns, 8 | const int Md, const int Nd, const int L, 9 | const int W, const int H, 10 | const int Sx, const int Sy) 11 | { 12 | int k=threadIdx.x + blockIdx.x * blockDim.x; 13 | int j=threadIdx.y + blockIdx.y * blockDim.y; 14 | int i=threadIdx.z + blockIdx.z * blockDim.z; 15 | 16 | int I=i*Sy, J=j*Sx; 17 | 18 | if (i >= Md || j >= Nd || k >= L) return; 19 | 20 | float val, max=FLT_MIN; 21 | for (int y=0; y < H; y++) 22 | for (int x=0; x < W; x++) { 23 | val = src[ID3(I+y,J+x,k,Ns,L)]; 24 | if (val > max) max = val; 25 | } 26 | 27 | dst[ID3(i,j,k,Nd,L)] = max; 28 | } 29 | -------------------------------------------------------------------------------- /src/kernels/set.cu: -------------------------------------------------------------------------------- 1 | #include "set.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | // void cuset2D(cuftens *t, const float v) 6 | // { 7 | // const int BS=32; 8 | // const int M=t->M, N=t->N; 9 | 10 | // dim3 grid(CEIL(N, BS), CEIL(M, BS)); 11 | // dim3 block(BS, BS); 12 | 13 | // ker_set2D <<>> (t->data, v, M, N); 14 | // } 15 | 16 | 17 | void cuset(cuftens *t, const float v) 18 | { 19 | const int M=t->D*t->M, N=t->N, L=t->L; 20 | 21 | if (L > 1) { 22 | const int BS = 8; 23 | dim3 grid(CEIL(L, BS), CEIL(N, BS), CEIL(M, BS)); 24 | dim3 block(BS, BS, BS); 25 | 26 | ker_set3D <<>> 27 | (t->data, v, M, N, L); 28 | 29 | } else { 30 | const int BS = 16; 31 | dim3 grid(CEIL(N, BS), CEIL(M, BS)); 32 | dim3 block(BS, BS); 33 | 34 | ker_set2D <<>> 35 | (t->data, v, M, N); 36 | 37 | } 38 | } 
39 | -------------------------------------------------------------------------------- /src/kernels/set.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | static __global__ 4 | void ker_set2D(float *dst, const float v, int M, int N) 5 | { 6 | int j=threadIdx.x + blockIdx.x * blockDim.x; 7 | int i=threadIdx.y + blockIdx.y * blockDim.y; 8 | 9 | if (i>=M || j>=N) return; 10 | 11 | dst[ID2(i,j,N)] = v; 12 | } 13 | 14 | 15 | static __global__ 16 | void ker_set3D(float *dst, const float v, int M, int N, int L) 17 | { 18 | int k=threadIdx.x + blockIdx.x * blockDim.x; 19 | int j=threadIdx.y + blockIdx.y * blockDim.y; 20 | int i=threadIdx.z + blockIdx.z * blockDim.z; 21 | 22 | if (i>=M || j>=N || k>=L) return; 23 | 24 | dst[ID3(i,j,k,N,L)] = v; 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/sgemm.cu: -------------------------------------------------------------------------------- 1 | #include "cutens.h" 2 | #include "util.cuh" 3 | #include "sgemm.cuh" 4 | 5 | 6 | template 8 | 9 | static void 10 | sgemm_template(int M, int N, int K, 11 | const float * __restrict__ A, int LDA, 12 | const float * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | dim3 dimBlock(DIM_Y, DIM_X); 16 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 17 | 18 | ker_sgemm 21 | 22 | <<>> 23 | (M, N, K, A, LDA, B, LDB, C, LDC); 24 | } 25 | 26 | 27 | void sgemm(int M, int N, int K, 28 | const float * __restrict__ A, int lda, 29 | const float * __restrict__ B, int ldb, 30 | float * __restrict__ C, int ldc) 31 | { 32 | sgemm_template <16,16, 96,96,16, 32,8, 32,8> 33 | (M, N, K, A, K, B, K, C, N); 34 | } 35 | 36 | 37 | void sgemm(cuftens *a, cuftens *b, cuftens *c) 38 | { 39 | const int D=a->D, M=a->M, N=b->M, K=a->N; 40 | cuASSERT(a->N == b->N, "err: shape\n"); 41 | sgemm_template <16,16, 96,96,16, 32,8, 32,8> 42 | (D*M, N, K, a->data, K, b->data, K, c->data, N); 
43 | } 44 | -------------------------------------------------------------------------------- /src/kernels/sgemm.cuh: -------------------------------------------------------------------------------- 1 | #define fetch(A, m, n, bound) offs_##A[MIN((m)*LD##A+n, bound)] 2 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 3 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 4 | 5 | template 8 | 9 | static __global__ 10 | void ker_sgemm(int M, int N, int K, 11 | const float * __restrict__ A, int LDA, 12 | const float * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | int blx=blockIdx.y, bly=blockIdx.x; 16 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 17 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 18 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 19 | 20 | float rC[THR_M][THR_N], rA[THR_M], rB[THR_N]; 21 | float ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 22 | float rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 23 | 24 | __shared__ float sA[BLK_M][BLK_K+1]; 25 | __shared__ float sB[BLK_N][BLK_K+1]; 26 | 27 | const float *offs_A = A + cazA; 28 | const float *offs_B = B + cazB; 29 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 30 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 31 | 32 | int m, n, k, kk; 33 | #pragma unroll 34 | for (m=0; m 6 | 7 | void sgemv_template(const int m, const int n, 8 | const float * __restrict__ A, int lda, 9 | const float * __restrict__ x, 10 | float * __restrict__ y) 11 | { 12 | dim3 grid(CEIL(m, TS), 1); 13 | dim3 block(DIM_X, DIM_Y); 14 | 15 | ker_sgemv 16 | <<>> 17 | (m, n, A, lda, x, y); 18 | } 19 | 20 | 21 | void sgemv (cuftens *a, cuftens *b, cuftens *c) 22 | { 23 | const int M=a->M, N=a->N; 24 | cuASSERT(b->M==1 && b->N==N, "err: sgemv shape\n"); 25 | sgemv_template <256, 1, 256> 26 | (M, N, a->data, N, b->data, c->data); 27 | } 28 | -------------------------------------------------------------------------------- /src/kernels/sgemv.cuh: -------------------------------------------------------------------------------- 1 | #include 
"util.cuh" 2 | 3 | template 4 | 5 | static __global__ 6 | void ker_sgemv(const int m, const int n, 7 | const float * __restrict__ A, int lda, 8 | const float * __restrict__ x, 9 | float * __restrict__ y) 10 | { 11 | if (m <= 0 || n <= 0) return; 12 | 13 | int nt = blockDim.x * blockDim.y * blockDim.z; 14 | 15 | if (DIM_X * DIM_Y != nt) return; 16 | 17 | int tid = threadIdx.x + threadIdx.y * blockDim.x; 18 | int tx = tid % DIM_X, ty = tid / DIM_X; 19 | int ind = blockIdx.x * TS + tx; 20 | 21 | __shared__ float sdata[DIM_X * DIM_Y]; 22 | 23 | int st = blockIdx.x * TS; 24 | int ed = MIN(st + TS, ROUND_UP(m, DIM_X)); 25 | int iters = (ed - st)/DIM_X; 26 | 27 | for (int i=0; i < iters; i++) { 28 | if (ind < m ) A += ind*lda; 29 | 30 | float res = 0.0; 31 | if (ind < m ) { 32 | for (int col=ty; col < n; col += DIM_Y) 33 | res += A[col] * x[col]; 34 | } 35 | 36 | sdata[ty + tx * DIM_Y] = res; 37 | 38 | __syncthreads(); 39 | 40 | if (ty == 0 && ind < m) { 41 | #pragma unroll 42 | for (int i=1; i < DIM_Y; i++) 43 | sdata[tx * DIM_Y] += sdata[i + tx * DIM_Y]; 44 | } 45 | 46 | if (ty == 0 && ind < m) 47 | y[ind] = sdata[tx * DIM_Y]; 48 | 49 | __syncthreads(); 50 | 51 | if (ind < m) A -= ind*lda; 52 | 53 | ind += DIM_X; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/kernels/sign.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cutens.h" 3 | #include "sign.cuh" 4 | 5 | 6 | void cusign(cuftens *a) 7 | { 8 | const int BS=32; 9 | const int len = cuftens_len(a); 10 | ker_sign <<>> (a->data, len); 11 | } 12 | -------------------------------------------------------------------------------- /src/kernels/sign.cuh: -------------------------------------------------------------------------------- 1 | static __global__ 2 | void ker_sign (float *src, const int len) 3 | { 4 | int i=threadIdx.x + blockIdx.x*blockDim.x; 5 | 6 | if (i >= len) return; 7 | 8 | src[i] 
= 2.0f * (src[i] > 0.0f) - 1.0f; 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/tch.cu: -------------------------------------------------------------------------------- 1 | #include "tch.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cutch(cuftens *src, cuftens *dst) 6 | { 7 | int M=src->M, N=src->N, L=src->L; 8 | cuASSERT(src->MNL == dst->MNL && src->D == dst->D, 9 | "err: cuth shape\n"); 10 | 11 | int TS=8; 12 | dim3 blocks(CEIL(L, TS), CEIL(N, TS), CEIL(M, TS)); 13 | dim3 threads(TS, TS, TS); 14 | 15 | for (int w = 0; w < src->D; w++) { 16 | float *s = src->data + w * src->MNL; 17 | float *d = dst->data + w * dst->MNL; 18 | ker_tch <<>> 19 | (s, d, M, N, L); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/kernels/tch.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | static __global__ 4 | void ker_tch (const float * __restrict__ src, 5 | float * __restrict__ dst, 6 | const int M, const int N, const int L) 7 | { 8 | int k=threadIdx.x + blockIdx.x * blockDim.x; 9 | int j=threadIdx.y + blockIdx.y * blockDim.y; 10 | int i=threadIdx.z + blockIdx.z * blockDim.z; 11 | 12 | if (i >= M || j >= N || k >= L) return; 13 | 14 | dst[ID3(j,k,i,L,M)] = src[ID3(i,j,k,N,L)]; 15 | } 16 | -------------------------------------------------------------------------------- /src/layers/bnorml.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers/bnorm.h" 3 | 4 | 5 | bnormLayer bnormLayer_init(int use_global) 6 | { 7 | bnormLayer bnl; BNORML_INIT(bnl); bnl.ug = use_global; 8 | return bnl; 9 | } 10 | 11 | void bnormLayer_free(bnormLayer *bnl) 12 | { 13 | ftens_free(&bnl->mean); ftens_free(&bnl->istd); 14 | ftens_free(&bnl->gmean); ftens_free(&bnl->gistd); 15 | ftens_free(&bnl->beta); ftens_free(&bnl->gamma); 16 | ftens_free(&bnl->dbeta); 
ftens_free(&bnl->dgamma);
17 | ftens_free(&bnl->tmp); ftens_free(&bnl->in);
18 | }
19 |
20 | void bnormLayer_print_shape(bnormLayer *bnl)
21 | {
22 | printf("bnorm: %d %d\n", bnl->N, bnl->ug);
23 | }
24 |
25 | void bnormLayer_set(ftens *mean, ftens *istd,
26 | ftens *gamma, ftens *beta, bnormLayer *bnl)
27 | {
28 | const int N=ftens_len(mean);
29 | ASSERT(N == ftens_len(istd) &&
30 | N == ftens_len(beta) &&
31 | N == ftens_len(gamma), "err: bnorm shape\n");
32 |
33 | bnormLayer_free(bnl);
34 | bnl->N = N;
35 | bnl->mean = ftens_copy(mean);
36 | bnl->istd = ftens_copy(istd);
37 | bnl->beta = ftens_copy(beta);
38 | bnl->gamma = ftens_copy(gamma);
39 | }
40 |
41 |
42 | static
43 | void bnorm(const float *mean, const float *istd,
44 | const float *beta, const float *gamma,
45 | const int len, const int N,
46 | float *in)
47 | {
48 | for (int i=0; i < len; i++)
49 | in[i] = ((in[i] - mean[i%N]) *
50 | (istd[i%N] * gamma[i%N]) + /* FIX: standard batchnorm is y = gamma*(x-mean)*istd + beta; was '-'. NOTE(review): confirm the CUDA cubnorm kernel and the .esp exporter agree on the sign of beta */
51 | (beta[i%N]));
52 | }
53 |
54 | void bnormLayer_forward(ftens *t, bnormLayer *bnl, int save)
55 | {
56 | const int D=t->D, M=t->M, N=t->N, L=t->L;
57 | const int asd = L>1 ?
L : N*M; 58 | ASSERT(asd == bnl->N, "err: bnorm shape\n") 59 | 60 | if (save) { 61 | if (!bnl->in.data) bnl->in=ftens_init(D, M, N, L); 62 | memcpy(bnl->in.data, t->data, t->bytes); 63 | } 64 | 65 | if (bnl->ug) { 66 | // compute curr mean, istd 67 | // moving avg -> update globals 68 | fprintf(stderr, "not implemented\n"); 69 | exit(-3); 70 | } 71 | 72 | float *in = t->data; 73 | float *mean = bnl->mean.data; 74 | float *istd = bnl->istd.data; 75 | float *beta = bnl->beta.data; 76 | float *gamma = bnl->gamma.data; 77 | 78 | bnorm(mean, istd, beta, gamma, D*M*N*L, asd, in); 79 | } 80 | 81 | 82 | void bnormLayer_backward(ftens *dt, bnormLayer *bnl) 83 | { 84 | fprintf(stderr, "not implemented\n"); 85 | exit(-2); 86 | } 87 | 88 | 89 | void bnormLayer_update(bnormLayer *bnl) 90 | { 91 | fprintf(stderr, "not implemented\n"); 92 | exit(-2); 93 | } 94 | -------------------------------------------------------------------------------- /src/layers/bnorml.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "layers/cubnorm.h" 3 | #include "kernels.h" 4 | 5 | 6 | cubnormLayer cubnormLayer_init(int use_global) 7 | { 8 | cubnormLayer bnl; BNORML_INIT(bnl); 9 | bnl.ug = use_global; 10 | return bnl; 11 | } 12 | 13 | 14 | void cubnormLayer_free(cubnormLayer *bnl) 15 | { 16 | cuftens_free(&bnl->mean); cuftens_free(&bnl->istd); 17 | cuftens_free(&bnl->gmean); cuftens_free(&bnl->gistd); 18 | cuftens_free(&bnl->beta); cuftens_free(&bnl->gamma); 19 | cuftens_free(&bnl->dbeta); cuftens_free(&bnl->dgamma); 20 | cuftens_free(&bnl->tmp); cuftens_free(&bnl->in); 21 | } 22 | 23 | 24 | void cubnormLayer_print_shape(cubnormLayer *bnl) 25 | { 26 | printf("cubnorm: N=%d ug=%d\n", bnl->N, bnl->ug); 27 | } 28 | 29 | 30 | void cubnormLayer_convert(bnormLayer *src, cubnormLayer *dst) 31 | { 32 | cubnormLayer_set(&src->mean, &src->istd, &src->gamma, 33 | &src->beta, dst); 34 | } 35 | 36 | 37 | void cubnormLayer_set(ftens *mean, 
ftens *istd, 38 | ftens *gamma, ftens *beta, 39 | cubnormLayer *bnl) 40 | { 41 | const int N=ftens_len(mean); 42 | cuASSERT(N == ftens_len(istd) && 43 | N == ftens_len(beta) && 44 | N == ftens_len(gamma), "err: cubnorm shape\n"); 45 | 46 | cubnormLayer_free(bnl); 47 | bnl->N = N; 48 | bnl->mean = cuftens_convert(mean); 49 | bnl->istd = cuftens_convert(istd); 50 | bnl->beta = cuftens_convert(beta); 51 | bnl->gamma = cuftens_convert(gamma); 52 | } 53 | 54 | 55 | void cubnormLayer_forward(cuftens *t, cubnormLayer *bnl, int save) 56 | { 57 | const int D=t->D, M=t->M, N=t->N, L=t->L; 58 | if (save) { 59 | if (!bnl->in.data) bnl->in=cuftens_init(D, M, N, L); 60 | cudaMemcpy(bnl->in.data, t->data, t->bytes, cuDtoD); 61 | } 62 | 63 | if (bnl->ug) { 64 | // compute bath mean, istd 65 | // moving avg -> update globals 66 | fprintf(stderr, "not implemented yet\n"); 67 | exit(-3); 68 | } 69 | 70 | cubnorm(&bnl->mean, &bnl->istd, &bnl->beta, &bnl->gamma, t); 71 | } 72 | 73 | 74 | void cubnormLayer_backward(cuftens *dout, cubnormLayer *bnl) 75 | { 76 | fprintf(stderr, "not implemented yet\n"); 77 | exit(-2); 78 | } 79 | 80 | 81 | 82 | void cubnormLayer_update(cubnormLayer *bnl) 83 | { 84 | fprintf(stderr, "not implemented yet\n"); 85 | exit(-2); 86 | } 87 | -------------------------------------------------------------------------------- /src/layers/convl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | #include "conv.h" 4 | 5 | extern float *scratch; 6 | 7 | 8 | convLayer convLayer_init(int Sm, int Sn, int p) 9 | { 10 | convLayer cl; CONVL_INIT(cl); 11 | cl.Sm=Sm; cl.Sn=Sn; cl.p=p; 12 | return cl; 13 | } 14 | 15 | 16 | void convLayer_free(convLayer *cl) 17 | { 18 | ftens_free(&cl->W); ftens_free(&cl->b); 19 | ftens_free(&cl->dW); ftens_free(&cl->db); 20 | ftens_free(&cl->out); ftens_free(&cl->in); 21 | } 22 | 23 | 24 | void convLayer_print_shape(convLayer *cl) 25 | { 26 | printf("conv: D=%d M=%d N=%d L=%d 
Sm=%d Sn=%d p=%d\n", 27 | cl->D, cl->M, cl->N, cl->L, cl->Sm, cl->Sn, cl->p); 28 | } 29 | 30 | 31 | void convLayer_set(ftens *W, convLayer *cl) 32 | { 33 | int D=W->D, M=W->M, N=W->N, L=W->L; 34 | ftens_free(&cl->W); 35 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 36 | cl->W = ftens_copy(W); 37 | } 38 | 39 | 40 | void convLayer_copy_input(ftens *t, convLayer *cl) 41 | { 42 | if (!cl->in.data) 43 | cl->in=ftens_init(t->D, t->M, t->N, t->L); 44 | memcpy(cl->in.data, t->data, t->bytes); 45 | } 46 | 47 | 48 | ftens convLayer_pad_input(ftens *t, float *scr, 49 | int *M, int *N, int p) 50 | { 51 | ftens tp; const int D=t->D, L=t->L; 52 | *M=PAD(*M, p); *N=PAD(*N, p); 53 | if (!scratch) tp=ftens_copy_pad(t, p); 54 | else { 55 | tp = ftens_from_ptr(D, *M, *N, L, scr); 56 | ftens_pad(t, &tp, p); 57 | scr += (*M)*(*N)*L*D; 58 | } 59 | 60 | return tp; 61 | } 62 | 63 | 64 | void convLayer_forward(ftens *t, convLayer *cl, int save) 65 | { 66 | float *scr = scratch; ftens tp, tmp; 67 | int D=t->D, Ms=t->M, Ns=t->N, Ls=t->L; 68 | int F=cl->D, W=cl->M, H=cl->N, L=cl->L; 69 | int p=cl->p, Sy=cl->Sm, Sx=cl->Sn; 70 | ASSERT(t->L == cl->L, "err: conv shape\n"); 71 | 72 | if (save) convLayer_copy_input(t, cl); 73 | if (p) tp = convLayer_pad_input(t, scr, &Ms, &Ns, p); 74 | 75 | // lower 76 | const int Md = OUT_LEN(Ms, H, Sy); 77 | const int Nd = OUT_LEN(Ns, W, Sx); 78 | const int Ld = W*H*L; 79 | if (!scratch) tmp=ftens_init(D, Md, Nd, Ld); 80 | else tmp=ftens_from_ptr(D, Md, Nd, Ld, scr); 81 | 82 | ftens_lower(p ? 
&tp : t, &tmp, W, H, Sx, Sy); 83 | 84 | // mat mul 85 | if (!cl->out.data) cl->out=ftens_init(D, Md, Nd, F); 86 | int M=Md*Nd, N=F, K=cl->W.MNL; 87 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 88 | M, N, K, 1, tmp.data, K, cl->W.data, K, 89 | 0, cl->out.data, N); 90 | 91 | 92 | if (!scratch) ftens_free(&tmp); 93 | if (!scratch && p) ftens_free(&tp); 94 | } 95 | 96 | 97 | void convLayer_backward(ftens *dout, convLayer *cl) 98 | { 99 | exit(-2); 100 | } 101 | 102 | 103 | void convLayer_update(convLayer *cl) 104 | { 105 | exit(-3); 106 | } 107 | -------------------------------------------------------------------------------- /src/layers/convl.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuconv.h" 3 | #include "kernels.h" 4 | 5 | 6 | extern cufmem fptr; 7 | 8 | 9 | cuconvLayer cuconvLayer_init(int Sm, int Sn, int p) 10 | { 11 | cuconvLayer cl; CONVL_INIT(cl); 12 | cl.Sm=Sm; cl.Sn=Sn; cl.p=p; 13 | return cl; 14 | } 15 | 16 | void cuconvLayer_free(cuconvLayer *cl) 17 | { 18 | cuftens_free(&cl->W); cuftens_free(&cl->b); 19 | cuftens_free(&cl->dW); cuftens_free(&cl->db); 20 | cuftens_free(&cl->out); cuftens_free(&cl->in); 21 | } 22 | 23 | void cuconvLayer_set(ftens *W, cuconvLayer *cl) 24 | { 25 | int D=W->D, M=W->M, N=W->N, L=W->L; 26 | cuftens_free(&cl->W); 27 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 28 | cl->W = cuftens_init(D, M, N, L); 29 | cudaMemcpy(cl->W.data, W->data, W->bytes, cuHtoD); 30 | } 31 | 32 | void cuconvLayer_convert(convLayer *src, cuconvLayer *dst) 33 | { 34 | cuconvLayer_set(&src->W, dst); 35 | } 36 | 37 | void cuconvLayer_copy_input(cuftens *t, cuconvLayer *cl) 38 | { 39 | int D=t->D, M=t->M, N=t->N, L=t->L; 40 | if (!cl->in.data) cl->in = cuftens_init(D, M, N, L); 41 | cudaMemcpy(cl->in.data, t->data, t->bytes, cuDtoD); 42 | } 43 | 44 | cuftens cuconvLayer_pad_input(cuftens *t, int p) 45 | { 46 | cuftens tp; 47 | int M = PAD(t->M, p); 48 | int N = PAD(t->N, p); 49 
| if (!CUFMEM) tp = cuftens_pad(t, p); 50 | else {tp = cuftens_from_cufmem(t->D, M, N, t->L); 51 | cuftens_pad(t, &tp, p);} 52 | return tp; 53 | } 54 | 55 | cuftens cuconvLayer_lower_input(cuftens *t, cuconvLayer *cl) 56 | { 57 | int W=cl->N, H=cl->M, Sx=cl->Sn, Sy=cl->Sm; 58 | cuftens tl = (CUFMEM ? 59 | cuftens_lower_cufmem(t, W, H, Sx, Sy) : 60 | cuftens_lower_init(t, W, H, Sx, Sy)); 61 | 62 | culower(t, &tl, W, H, Sx, Sy); 63 | 64 | return tl; 65 | } 66 | 67 | void cuconvLayer_forward(cuftens *t, cuconvLayer *cl, int save) 68 | { 69 | cufmem_reset(); 70 | 71 | int D=t-> D, M=t->M, N=t->N, L=t->L; 72 | int F=cl->D, W=cl->N, H=cl->M; 73 | int p=cl->p, Sx=cl->Sn, Sy=cl->Sm; 74 | 75 | cuASSERT(t->L == cl->L, "err: cuconv shape\n"); 76 | 77 | cuftens tp = !p ? *t : cuconvLayer_pad_input(t, p); 78 | cuftens tl = cuconvLayer_lower_input(&tp, cl); 79 | 80 | M = tl.M; N = tl.N; 81 | if (!cl->out.data) cl->out=cuftens_init(D, M, N, F); 82 | 83 | M=D*M*N; N=F; int K=W*H*L; 84 | sgemm(M, N, K, tl.data, K, cl->W.data, K, cl->out.data, N); 85 | 86 | if (!CUFMEM) cuftens_free(&tl); 87 | if (!CUFMEM && p) cuftens_free(&tp); 88 | } 89 | 90 | 91 | void cuconvLayer_backward(cuftens *dout, cuconvLayer *cl) 92 | { 93 | exit(-2); 94 | } 95 | 96 | 97 | void cuconvLayer_print_shape(cuconvLayer *cl) 98 | { 99 | printf("cuconv: D=%d M=%d N=%d L=%d Sm=%d Sn=%d p=%d\n", 100 | cl->D, cl->M, cl->N, cl->L, cl->Sm, cl->Sn, cl->p); 101 | } 102 | -------------------------------------------------------------------------------- /src/layers/densel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | #include "dense.h" 6 | 7 | 8 | denseLayer denseLayer_init(int M, int N) 9 | { 10 | denseLayer dl; DENSEL_INIT(dl); dl.M=M; dl.N=N; 11 | return dl; 12 | } 13 | 14 | 15 | void denseLayer_free(denseLayer *dl) 16 | { 17 | ftens_free(&dl->W); ftens_free(&dl->b); 18 | ftens_free(&dl->dW); ftens_free(&dl->db); 19 | 
ftens_free(&dl->out); ftens_free(&dl->in); 20 | } 21 | 22 | 23 | void denseLayer_print_shape(denseLayer *dl) 24 | { 25 | printf("dense: %d %d\n", dl->M, dl->N); 26 | } 27 | 28 | 29 | void denseLayer_set(ftens *W, denseLayer *dl) 30 | { 31 | const int M=W->M, N=W->N; 32 | ASSERT(W->D==1 && W->L==1, "err: dense shape\n"); 33 | ftens_free(&dl->W); 34 | dl->M = M; dl->N = N; 35 | dl->W = ftens_copy(W); 36 | } 37 | 38 | 39 | void denseLayer_forward(ftens *t, denseLayer *dl, int save) 40 | { 41 | const int D=t->D, M=dl->M, N=dl->N; 42 | ASSERT(t->MNL == dl->N, "err: dense shape\n"); 43 | 44 | if (save) { 45 | int M=t->M, N=t->N, L=t->L; 46 | if (!dl->in.data) dl->in = ftens_init(D,M,N,L); 47 | memcpy(dl->in.data, t->data, t->bytes); 48 | } 49 | 50 | if (!dl->out.data) dl->out = ftens_init(D, 1, M, 1); 51 | const float *a=dl->W.data; 52 | const float *b=t->data; 53 | float *c=dl->out.data; 54 | 55 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 56 | D, M, N, 1, b, N, a, N, 0, c, M); 57 | 58 | } 59 | 60 | 61 | void denseLayer_backward(ftens *dout, denseLayer *dl) 62 | { 63 | fprintf(stderr, "not implemented yet\n"); 64 | exit(-2); 65 | } 66 | -------------------------------------------------------------------------------- /src/layers/densel.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cudense.h" 3 | #include "kernels.h" 4 | 5 | 6 | cudenseLayer cudenseLayer_init(int M, int N) 7 | { 8 | cudenseLayer dl; DENSEL_INIT(dl); dl.M=M; dl.N=N; 9 | return dl; 10 | } 11 | 12 | void cudenseLayer_free(cudenseLayer *dl) 13 | { 14 | cuftens_free(&dl->W); cuftens_free(&dl->b); 15 | cuftens_free(&dl->dW); cuftens_free(&dl->db); 16 | cuftens_free(&dl->out); cuftens_free(&dl->in); 17 | } 18 | 19 | void cudenseLayer_convert(denseLayer *src, cudenseLayer *dst) 20 | { 21 | cudenseLayer_set(&src->W, dst); 22 | } 23 | 24 | void cudenseLayer_set(ftens *W, cudenseLayer *dl) 25 | { 26 | int M=W->M, N=W->N; 27 | 
cudenseLayer_free(dl);
28 | dl->M=M; dl->N=N; dl->W=cuftens_init(1, M, N, 1);
29 | cudaMemcpy(dl->W.data, W->data, W->bytes, cuHtoD);
30 | }
31 |
32 | void cudenseLayer_copy_input(cuftens *t, cudenseLayer *dl)
33 | {
34 | if (!dl->in.data)
35 | dl->in = cuftens_init(t->D, t->M, t->N, t->L);
36 | cudaMemcpy(dl->in.data, t->data, t->bytes, cuDtoD); /* FIX: t is a device tensor (cuftens); was cuHtoD — cf. cuconvLayer_copy_input and cupdenseLayer_copy_input, which use cuDtoD */
37 | }
38 |
39 | void cudenseLayer_forward(cuftens *t, cudenseLayer *dl, int save)
40 | {
41 | int D=t->D, M=t->M, N=t->N;
42 | cuftens_reshape(t, D, 1, t->MNL, 1);
43 | cuASSERT(t->MNL == dl->N, "err: cudense shape\n");
44 |
45 | if (save) cudenseLayer_copy_input(t, dl);
46 | if (!dl->out.data) dl->out=cuftens_init(D, 1, dl->M, 1);
47 |
48 | if (D == 1) sgemv(&dl->W, t, &dl->out);
49 | else sgemm(M, 1, N, t->data, N, dl->W.data, N,
50 | dl->out.data, 1);
51 | }
52 |
53 |
54 | void cudenseLayer_backward(cuftens *dt, cudenseLayer *dl)
55 | {
56 | fprintf(stderr, "err: dense bprop not implemented yet\n");
57 | exit(-2);
58 | }
59 |
60 |
61 | void cudenseLayer_print_size(cudenseLayer *dl)
62 | {
63 | printf("cudense: %d %d\n", dl->M, dl->N);
64 | }
65 | -------------------------------------------------------------------------------- /src/layers/inputl.c: -------------------------------------------------------------------------------- 1 | #include "util.h"
2 | #include "input.h"
3 |
4 |
5 | void inputLayer_load(ftens *t, inputLayer *il)
6 | {
7 | il->out = ftens_copy(t);
8 | }
9 |
10 |
11 | void inputLayer_free(inputLayer *il)
12 | {
13 | ftens_free(&il->out);
14 | }
15 |
16 |
17 | void inputLayer_forward(inputLayer *il)
18 | {
19 | if (!il->out.data) {
20 | fprintf(stderr, "err: in null\n");
21 | exit(-1);
22 | }
23 |
24 | float *ptr = il->out.data;
25 | const int len = ftens_len(&il->out);
26 | for (int i=0; i < len; i++)
27 | ptr[i] = 2.0f * ptr[i]/255.0f - 1.0f;
28 | }
29 |
30 |
31 | /* void inputLayer_pad(inputLayer *il, const int p) */
32 | /* { */
33 | /* ftens tmp = ftens_copy_pad(&il->out, p); */
34 | /*
ftens_free(&il->out); */ 35 | /* il->out = tmp; */ 36 | /* } */ 37 | -------------------------------------------------------------------------------- /src/layers/inputl.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuinput.h" 3 | #include "kernels.h" 4 | 5 | 6 | void cuinputLayer_forward(ftens *t, cuinputLayer *il, int norm) 7 | { 8 | int D=t->D, M=t->M, N=t->N, L=t->L; 9 | 10 | if (!il->out.data) il->out=cuftens_init(D, M, N, L); 11 | cudaMemcpy(il->out.data, t->data, t->bytes, cuHtoD); 12 | 13 | if (norm) cunorm(&il->out); 14 | } 15 | 16 | void cuinputLayer_free(cuinputLayer *il) 17 | { 18 | cuftens_free(&il->out); 19 | } 20 | -------------------------------------------------------------------------------- /src/layers/pconvl.cu: -------------------------------------------------------------------------------- 1 | #include "cumem.h" 2 | #include "layers/cuconv.h" 3 | #include "layers/cupconv.h" 4 | #include "kernels.h" 5 | 6 | 7 | cupconvLayer cupconvLayer_init(int Sm, int Sn, int p) 8 | { 9 | cupconvLayer out; 10 | CUPCONVL_INIT(out); 11 | out.Sm=Sm; out.Sn=Sn; out.p=p; 12 | return out; 13 | } 14 | 15 | void cupconvLayer_free(cupconvLayer *cl) 16 | { 17 | cuptens_free(&cl->W); cuptens_free(&cl->pout); 18 | cuftens_free(&cl->dW); cuftens_free(&cl->out); 19 | cuftens_free(&cl->in); cuftens_free(&cl->fix); 20 | cuftens_free(&cl->bfix); 21 | } 22 | 23 | void cupconvLayer_set(ftens *W, cupconvLayer *cl, int fix) 24 | { 25 | int D=W->D, M=W->M, N=W->N, L=W->L; 26 | cupconvLayer_free(cl); 27 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 28 | cl->W = cuptens_convert(W); 29 | if (fix) { 30 | ftens tmp = ftens_init(1, 1, D, 1); 31 | for (int i = 0; i < D; i++) { 32 | float *s = W->data + i * W->MNL; ; 33 | tmp.data[i] = 0.0f; 34 | for (int j = 0; j < W->MNL; j++) 35 | tmp.data[i] += s[j]; 36 | } 37 | if (!cl->fix.data) cl->fix = cuftens_init(1, 1, D, 1); 38 | cudaMemcpy(cl->fix.data, tmp.data, tmp.bytes, 
cuHtoD); 39 | ftens_free(&tmp); 40 | } 41 | } 42 | 43 | void cupconvLayer_convert(convLayer *src, cupconvLayer *dst, int fix) 44 | { 45 | dst->Sm=src->Sm; dst->Sn=src->Sn; dst->p=src->p; 46 | cupconvLayer_set(&src->W, dst, fix); 47 | } 48 | 49 | cuptens cupconvLayer_pad_input(cuptens *t, int p) 50 | { 51 | cuptens tp; 52 | int M = PAD(t->M, p); 53 | int N = PAD(t->N, p); 54 | if (CUPMEM) { 55 | tp = cuptens_from_cupmem(t->D, M, N, t->L); 56 | cupad(t, &tp, p); 57 | } else 58 | tp = cuptens_pad(t, p); 59 | 60 | return tp; 61 | } 62 | 63 | cuptens cupconvLayer_lower_input(cuptens *t, cupconvLayer *cl) 64 | { 65 | int W=cl->N, H=cl->M, Sx=cl->Sn, Sy=cl->Sm; 66 | cuptens tl = (CUPMEM ? 67 | cuptens_lower_cupmem(t, W, H, Sx, Sy) : 68 | cuptens_lower_init(t, W, H, Sx, Sy)); 69 | 70 | cuplower(t, &tl, W, H, Sx, Sy); 71 | 72 | return tl; 73 | } 74 | 75 | void cupconvLayer_forward(cuptens *t, cupconvLayer *cl) 76 | { 77 | cupmem_reset(); 78 | 79 | int D=t->D, M=t->M, N=t->N, L=t->X; 80 | int F=cl->D, p=cl->p, W=cl->N, H=cl->M; 81 | cuASSERT(L == cl->W.X, "err: cupconv forward\n"); 82 | 83 | cuptens tp = !p ? *t : cupconvLayer_pad_input(t, p); 84 | cuptens tl = cupconvLayer_lower_input(&tp, cl); 85 | 86 | M=tl.M; N=tl.N; 87 | 88 | if (!cl->out.data) cl->out = cuftens_init(D, M, N, F); 89 | 90 | pgemm(D*M*N, F, W*H*L, tl.data, cl->W.data, cl->out.data); 91 | 92 | if(!CUPMEM) cuptens_free(&tl); 93 | if(!CUPMEM && p) cuptens_free(&tp); 94 | } 95 | 96 | void cupconvLayer_forward_initial(cuftens *t, cupconvLayer *cl, 97 | float norm) 98 | { 99 | cufmem_reset(); 100 | cupmem_reset(); 101 | 102 | cuftens tmp = (!CUFMEM ? 103 | cuftens_round_up(t, 64) : 104 | cuftens_round_up_cufmem(t, 64)); 105 | 106 | int D=t->D, M=tmp.M, N=tmp.N, L=tmp.L; 107 | int F=cl->D, p=cl->p, W=cl->N, H=cl->M; 108 | 109 | cuftens tp = !p ? 
tmp : cuconvLayer_pad_input(&tmp, p); 110 | cuftens tl = cuconvLayer_lower_input(&tp, (cuconvLayer*)cl); 111 | 112 | M=tl.M; N=tl.N; L=tl.L; 113 | 114 | cuptens qwe = (!CUPMEM ? 115 | cuptens_init(D, 8, M*N*L, 1) : 116 | cuptens_from_cupmem(D, 8, M*N*L, 1)); 117 | 118 | 119 | cubp_split_pack(&tl, &qwe); 120 | 121 | cufmem_reset(); 122 | cuftens tmp2 = (!CUFMEM ? 123 | cuftens_init(D, M*8, N, F) : 124 | cuftens_from_cufmem(D, M*8, N, F)); 125 | 126 | pgemm_init_rev(D*M*N*8, F, cl->W.MNL, qwe.data, 127 | cl->W.data, tmp2.data); 128 | 129 | if (!cl->out.data) cl->out = cuftens_init(D, M, N, F); 130 | cubp_merge(&tmp2, &cl->out, &cl->fix, norm); 131 | 132 | if (!CUFMEM) {cuftens_free(&tmp); cuftens_free(&tl);} 133 | if (!CUPMEM) cuptens_free(&qwe); 134 | if (!CUFMEM && p) cuftens_free(&tp); 135 | } 136 | 137 | void cupconvLayer_print(cupconvLayer *cl) 138 | { 139 | printf("cupconvLayer: \n"); 140 | } 141 | -------------------------------------------------------------------------------- /src/layers/pdensel.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "kernels.h" 3 | #include "layers/cupdense.h" 4 | 5 | 6 | cupdenseLayer cupdenseLayer_init() 7 | { 8 | cupdenseLayer dl; 9 | CUPDENSEL_INIT(dl); 10 | return dl; 11 | } 12 | 13 | void cupdenseLayer_free(cupdenseLayer *dl) 14 | { 15 | cuptens_free(&dl->W); cuptens_free(&dl->pout); 16 | cuptens_free(&dl->in); cuftens_free(&dl->out); 17 | cuftens_free(&dl->dW); cuftens_free(&dl->fix); 18 | } 19 | 20 | void cupdenseLayer_set(ftens *W, cupdenseLayer *dl, int fix) 21 | { 22 | int M = W->M, N = W->N; 23 | cupdenseLayer_free(dl); 24 | dl->M=M; dl->N=N; dl->W=cuptens_convert(W); 25 | 26 | if (fix) { 27 | ftens tmp = ftens_init(1, 1, M, 1); 28 | for (int i=0; idata[ID2(i,j,N)]; 32 | } 33 | dl->fix = cuftens_convert(&tmp); 34 | ftens_free(&tmp); 35 | } 36 | } 37 | 38 | void cupdenseLayer_convert(denseLayer *src, cupdenseLayer *dst, int fix) 39 | { 40 | 
cupdenseLayer_set(&src->W, dst, fix); 41 | } 42 | 43 | static 44 | void cupdenseLayer_copy_input(cuptens *t, cupdenseLayer *dl) 45 | { 46 | int D=t->D, M=t->M, N=t->N, L=t->L; 47 | if (!dl->in.data) dl->in = cuptens_init(D, M, N, L); 48 | cudaMemcpy(dl->in.data, t->data, t->bytes, cuDtoD); 49 | } 50 | 51 | void cupdenseLayer_forward(cuptens *t, cupdenseLayer *dl, int save) 52 | { 53 | int D=t->D, M=dl->M, N=dl->W.X; 54 | cuASSERT(t->MNL == dl->N/64, "err: cupdense shape\n"); 55 | 56 | if (save) cupdenseLayer_copy_input(t, dl); 57 | 58 | if (!dl->out.data) dl->out = cuftens_init(D, 1, M, 1); 59 | if (D == 1) pgemv(M, N, dl->W.data, t->data, dl->out.data); 60 | else pgemm(D, M, N, t->data, dl->W.data, dl->out.data); 61 | } 62 | 63 | static 64 | cuptens cupdenseLayer_bpsplit_input(cuftens *t, cupdenseLayer *dl) 65 | { 66 | int D=t->D, N=t->MNL, ru=N & 63; 67 | cuftens tmp; tmp.data=NULL; 68 | if (ru) { 69 | if (CUFMEM) { 70 | int asd = ROUND_UP(N, 64); 71 | tmp = cuftens_from_cufmem(D, 1, asd, 1); 72 | cudaMemset(tmp.data, 0, tmp.bytes); 73 | cucopy(t, &tmp); 74 | } 75 | else { 76 | tmp = cuftens_round_up(t, 64); 77 | } 78 | } 79 | 80 | cuptens out = (CUPMEM ? 81 | cuptens_from_cupmem(D, 8, N, 1) : 82 | cuptens_init(D, 8, N, 1)); 83 | 84 | cubp_split_pack(ru ? &tmp : t, &out); 85 | 86 | if (ru && !CUFMEM) cuftens_free(&tmp); 87 | return out; 88 | } 89 | 90 | 91 | void cupdenseLayer_forward_initial(cuftens *t, cupdenseLayer *dl, 92 | float norm) 93 | { 94 | int D=t->D, M=dl->M, N=8, K=dl->W.X; 95 | 96 | cufmem_reset(); 97 | cupmem_reset(); 98 | 99 | cuftens_reshape(t, t->D, 1, t->MNL, 1); 100 | cuptens asd = cupdenseLayer_bpsplit_input(t, dl); 101 | cuftens tmp = (CUFMEM ? 
102 | cuftens_from_cufmem(D, N, M, 1) : 103 | cuftens_init(D, N, M, 1)); 104 | 105 | pgemm_init_rev(D*N, M, K, asd.data, dl->W.data, tmp.data); 106 | 107 | if (!dl->out.data) dl->out = cuftens_init(D, 1, M, 1); 108 | 109 | cubp_merge(&tmp, &dl->out, &dl->fix, norm); 110 | 111 | if (!CUPMEM) cuptens_free(&asd); 112 | if (!CUFMEM) cuftens_free(&tmp); 113 | } 114 | 115 | 116 | void cupdenseLayer_backward(cuptens *dout, cupdenseLayer *dl) 117 | { 118 | fprintf(stderr, "err: cupdensel bprop not yet implemented\n"); 119 | exit(-2); 120 | } 121 | 122 | void cupdenseLayer_update(cupdenseLayer *dl) 123 | { 124 | fprintf(stderr, "err: cupdensel bprop not yet implemented\n"); 125 | exit(-2); 126 | } 127 | -------------------------------------------------------------------------------- /src/layers/pinputl.cu: -------------------------------------------------------------------------------- 1 | #include "layers/cupinput.h" 2 | 3 | 4 | // cupinputLayer cupinputLayer_init() 5 | // { 6 | // cupinputLayer il; il.out.data=NULL; 7 | // return il; 8 | // } 9 | 10 | // void cupinputLayer_forward(ftens *t, cupinputLayer *il) 11 | // { 12 | // int D=t->D, M=t->M, N=t->N, L=t->L; cuftens tmp; 13 | // if (!CUFMEM) 14 | // tmp = cuftens_convert(t); 15 | // else { 16 | // tmp = cuftens_from_cufmem(D, M, N, L); 17 | // cudaMemcpy(tmp.data, t->data, t->bytes, cuHtoD); 18 | // } 19 | 20 | // if (!il->out.data) il->out = cuptens_init(D, M, N, L); 21 | // cuptens_convert(&tmp, &il->out); 22 | 23 | // if(!d_fscratch) cuftens_free(&tmp); 24 | // } 25 | 26 | // void cupinputLayer_free(cupinputLayer *il) 27 | // { 28 | // cuptens_free(&il->out); 29 | // } 30 | -------------------------------------------------------------------------------- /src/layers/pooll.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers/pool.h" 3 | 4 | 5 | poolLayer poolLayer_init(int M, int N, int Sm, int Sn) 6 | { 7 | poolLayer pl = {M, N, Sm, Sn, MAX}; 
8 | pl.out. data = NULL; 9 | pl.mask.data = NULL; 10 | return pl; 11 | } 12 | 13 | 14 | void poolLayer_free(poolLayer *pl) 15 | { 16 | ftens_free(&pl->out); 17 | ftens_free(&pl->mask); 18 | } 19 | 20 | 21 | void poolLayer_forward(ftens *t, poolLayer *pl) 22 | { 23 | const int W=pl->M, H=pl->N, Sy=pl->Sm, Sx=pl->Sn; 24 | const int D=t->D, L=t->L, Ms=t->M, Ns=t->N; 25 | const int Md=OUT_LEN(Ms, H, Sy); 26 | const int Nd=OUT_LEN(Ns, W, Sx); 27 | 28 | if (!pl->out.data) pl->out=ftens_init(D, Md, Nd, L); 29 | 30 | if (pl->op == MAX) 31 | ftens_maxpool(t, &pl->out, W, H, Sx, Sy); 32 | else 33 | exit(-3); 34 | } 35 | 36 | 37 | void poolLayer_backward(ftens *dout, poolLayer *pl) 38 | { 39 | exit(-2); 40 | } 41 | -------------------------------------------------------------------------------- /src/layers/pooll.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "kernels.h" 3 | #include "layers/cupool.h" 4 | 5 | 6 | cupoolLayer cupoolLayer_init(int M, int N, int Sm, int Sn) 7 | { 8 | cupoolLayer pl = {M, N, Sm, Sn, MAX}; 9 | pl.out. 
data = NULL;
10 | pl.mask.data = NULL;
11 | return pl;
12 | }
13 |
14 | void cupoolLayer_free(cupoolLayer *pl)
15 | {
16 | cuftens_free(&pl->out);
17 | cuftens_free(&pl->mask);
18 | }
19 |
20 | void cupoolLayer_convert(poolLayer *src, cupoolLayer *dst)
21 | {
22 | dst->M = src->M; /* FIX: was src->N (copy-paste bug) — broke non-square pooling windows */
23 | dst->N = src->N;
24 | dst->Sm = src->Sm;
25 | dst->Sn = src->Sn;
26 | dst->op = src->op;
27 | }
28 |
29 | void cupoolLayer_forward(cuftens *t, cupoolLayer *pl)
30 | {
31 | int W=pl->M, H=pl->N, Sy=pl->Sm, Sx=pl->Sn;
32 | int M = OUT_LEN(t->M, H, Sy);
33 | int N = OUT_LEN(t->N, W, Sx);
34 |
35 | if (!pl->out.data)
36 | pl->out = cuftens_init(t->D, M, N, t->L);
37 |
38 | cuASSERT(pl->op == MAX, "err: pool type not impl\n");
39 | cumaxpool(t, &pl->out, W, H, Sx, Sy);
40 | }
41 |
42 |
43 | void cupoolLayer_backward(cuftens *dt, cupoolLayer *pl)
44 | {
45 | exit(-2);
46 | }
47 |
48 |
49 | void cupoolLayer_print(cupoolLayer *pl)
50 | {
51 | printf("cupool: %d %d %d %d\n", pl->M, pl->N, pl->Sm, pl->Sn);
52 | }
53 | -------------------------------------------------------------------------------- /src/mlp.c: -------------------------------------------------------------------------------- 1 | #include "util.h"
2 | #include "params.h"
3 | #include "nn/mlp.h"
4 |
5 |
6 | mlp mlp_init(int Ndl, int Nbnl)
7 | {
8 | mlp out = {Ndl, Nbnl};
9 | out.dl = Ndl ? MALLOC(denseLayer, Ndl) : NULL;
10 | out.bnl = Nbnl ?
MALLOC(bnormLayer, Nbnl) : NULL; 11 | for (int i=0; iil); 19 | for (int i=0; iNdl; i++) denseLayer_free(&net->dl[i]); 20 | for (int i=0; iNbnl; i++) bnormLayer_free(&net->bnl[i]); 21 | } 22 | 23 | mlp mlp_load(const char *esp, int bin) 24 | { 25 | mlp out; 26 | int Ndl; denseLayer *dl; 27 | int Nbnl; bnormLayer *bnl; 28 | 29 | FILE *pf = fopen(esp, "rb"); 30 | ASSERT(pf, "err: esp fopen\n"); 31 | 32 | int val; 33 | while ((val = fgetc(pf)) != EOF) { 34 | switch (val) { 35 | case DENSEL|LNUM: fread(&Ndl, sizeof(int), 1, pf); break; 36 | case BNORML|LNUM: fread(&Nbnl, sizeof(int), 1, pf); break; 37 | 38 | case INPUTL|LDAT: 39 | out = mlp_init(Ndl, Nbnl); 40 | dl=out.dl; bnl=out.bnl; 41 | break; 42 | 43 | case DENSEL|LDAT: load_denseLayer(dl, pf, bin); dl++; break; 44 | case BNORML|LDAT: load_bnormLayer(bnl, pf); bnl++; break; 45 | break; 46 | 47 | default: 48 | fprintf(stderr, "err: mlp loader\n"); 49 | exit(-3); 50 | } 51 | } 52 | 53 | fclose(pf); 54 | return out; 55 | } 56 | 57 | void mlp_print(mlp *net) 58 | { 59 | printf("mlp: Ndl=%d Nbnl=%d\n", net->Ndl, net->Nbnl); 60 | } 61 | -------------------------------------------------------------------------------- /src/mlp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cumlp.h" 3 | 4 | 5 | 6 | cumlp cumlp_init(int Ndl, int Nbnl) 7 | { 8 | cumlp out = {Ndl, Nbnl}; 9 | out.dl = Ndl ? MALLOC(cudenseLayer, Ndl) : NULL; 10 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 11 | for (int i=0; iil); 20 | for (int i=0; iNdl; i++) cudenseLayer_free(&net->dl[i]); 21 | for (int i=0; iNbnl; i++) cubnormLayer_free(&net->bnl[i]); 22 | } 23 | 24 | 25 | cumlp cumlp_convert(mlp *net) 26 | { 27 | cumlp out = cumlp_init(net->Ndl, net->Nbnl); 28 | for (int i=0; iNdl; i++) 29 | cudenseLayer_convert(&net->dl[i], &out.dl[i]); 30 | 31 | for (int i=0; iNbnl; i++) 32 | cubnormLayer_convert(&net->bnl[i], &out.bnl[i]); 33 | 34 | return out; 35 | } 36 | 37 | 38 | void cumlp_print(cumlp *net) 39 | { 40 | printf("cumlp: Ndl=%d Nbnl=%d\n", net->Ndl, net->Nbnl); 41 | } 42 | -------------------------------------------------------------------------------- /src/params.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers.h" 3 | #include "nn/mlp.h" 4 | #include "nn/cnn.h" 5 | 6 | 7 | static inline 8 | void reverse(float *v, const int len) 9 | { 10 | float t; int j=len-1; 11 | for (int i = 0; i < len/2; i++) { 12 | t = v[i]; v[i] = v[j]; v[j] = t; 13 | j--; 14 | } 15 | } 16 | 17 | 18 | void load_denseLayer(denseLayer *dl, FILE * const pf, int bin) 19 | { 20 | int M; fread(&M, sizeof(int), 1, pf); 21 | int N; fread(&N, sizeof(int), 1, pf); 22 | printf("dense: %d %d\n", M, N); 23 | ftens W = ftens_from_file(1, M, N, 1, pf); 24 | ftens b = ftens_from_file(1, 1, M, 1, pf); 25 | if (bin) ftens_sign(&W); 26 | denseLayer_set(&W, dl); 27 | ftens_free(&W); 28 | ftens_free(&b); 29 | } 30 | 31 | 32 | void load_bnormLayer(bnormLayer *bnl, FILE *pf) 33 | { 34 | int N; fread(&N, sizeof(int), 1, pf); 35 | printf("bnorm: %d\n", N); 36 | ftens beta = ftens_from_file(1, 1, N, 1, pf); 37 | ftens gamma = ftens_from_file(1, 1, N, 1, pf); 38 | ftens mean = ftens_from_file(1, 1, N, 1, pf); 39 | ftens istd = ftens_from_file(1, 1, N, 1, pf); 40 | bnormLayer_set(&mean, &istd, &beta, &gamma, bnl); 41 | ftens_free(&mean); ftens_free(&istd); 42 | ftens_free(&beta); ftens_free(&gamma); 43 
| } 44 | 45 | void load_convLayer(convLayer *cl, FILE *pf, int bin, int rev) 46 | { 47 | int a[7]; fread(a, sizeof(int), 7, pf); 48 | int p=a[0], D=a[1], M=a[2], N=a[3], L=a[4]; 49 | printf("conv: \n"); 50 | ftens fil = ftens_from_file(D, L, M, N, pf); 51 | if (bin) ftens_sign(&fil); 52 | if (rev) 53 | for (int w=0; wSm=a[5]; cl->Sn=a[6]; cl->p=a[0]; 59 | convLayer_set(&asd, cl); 60 | ftens_free(&fil); 61 | ftens_free(&asd); 62 | ftens_free(&b); 63 | } 64 | 65 | 66 | void load_poolLayer(poolLayer *pl, FILE *pf) 67 | { 68 | int M, N, Sm, Sn; 69 | printf("pool: \n"); 70 | fread(&M, sizeof(int), 1, pf); 71 | fread(&N, sizeof(int), 1, pf); 72 | fread(&Sm, sizeof(int), 1, pf); 73 | fread(&Sn, sizeof(int), 1, pf); 74 | pl->M=M; pl->N=N; pl->Sm=Sm; pl->Sn=Sn; 75 | } 76 | -------------------------------------------------------------------------------- /src/pcnn.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cupcnn.h" 3 | 4 | 5 | cupcnn cupcnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 6 | { 7 | cupcnn out = {Ncl, Npl, Ndl, Nbnl}; 8 | out.cl = Ncl ? MALLOC(cupconvLayer, Ncl) : NULL; 9 | out.pl = Npl ? MALLOC(cupoolLayer, Npl) : NULL; 10 | out.dl = Ndl ? MALLOC(cupdenseLayer, Ndl) : NULL; 11 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 12 | for (int i=0; iil); 22 | for (int i=0; iNcl; i++) cupconvLayer_free(net->cl + i); 23 | for (int i=0; iNpl; i++) cupoolLayer_free(net->pl + i); 24 | for (int i=0; iNdl; i++) cupdenseLayer_free(net->dl + i); 25 | for (int i=0; iNbnl; i++) cubnormLayer_free(net->bnl + i); 26 | } 27 | 28 | cupcnn cupcnn_convert(cnn *net) 29 | { 30 | int Ncl=net->Ncl, Npl=net->Npl; 31 | int Ndl=net->Ndl, Nbnl=net->Nbnl; 32 | 33 | cupcnn out = cupcnn_init(Ncl, Npl, Ndl, Nbnl); 34 | 35 | for (int i=0; icl[i], &out.cl[i], i==0); 37 | 38 | for (int i=0; ipl[i], &out.pl[i]); 40 | 41 | for (int i=0; idl[i], &out.dl[i], i==0); 43 | 44 | for (int i=0; ibnl[i], &out.bnl[i]); 46 | 47 | return out; 48 | } 49 | 50 | void cupcnn_print(cupcnn *net) 51 | { 52 | printf("CUPCNN: Ncl=%d Npl=%d Ndl=%d Nbnl=%d\n", 53 | net->Ncl, net->Npl, net->Ndl, net->Nbnl); 54 | } 55 | -------------------------------------------------------------------------------- /src/pmlp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cupmlp.h" 3 | 4 | 5 | cupmlp cupmlp_init(int Ndl, int Nbnl) 6 | { 7 | cupmlp out = {Ndl, Nbnl}; 8 | out.dl = Ndl ? MALLOC(cupdenseLayer, Ndl) : NULL; 9 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 10 | for (int i=0; i < Ndl; i++) CUPDENSEL_INIT(out.dl[i]); 11 | for (int i=0; i < Nbnl; i++) BNORML_INIT(out.bnl[i]); 12 | return out; 13 | } 14 | 15 | void cupmlp_free(cupmlp *nn) 16 | { 17 | cuinputLayer_free(&nn->il); 18 | for (int i=0; i < nn->Ndl; i++) cupdenseLayer_free(&nn->dl[i]); 19 | for (int i=0; i < nn->Nbnl; i++) cubnormLayer_free(&nn->bnl[i]); 20 | } 21 | 22 | cupmlp cupmlp_convert(mlp *nn) 23 | { 24 | cupmlp out = cupmlp_init(nn->Ndl, nn->Nbnl); 25 | for (int i=0; i < nn->Ndl; i++) 26 | cupdenseLayer_convert(&nn->dl[i], &out.dl[i], i==0); 27 | 28 | for (int i=0; i < nn->Nbnl; i++) 29 | cubnormLayer_convert(&nn->bnl[i], &out.bnl[i]); 30 | 31 | return out; 32 | } 33 | 34 | void cupmlp_print(cupmlp *nn) 35 | { 36 | printf("cupmlp: Npdl=%d Nbnl=%d\n", nn->Ndl, nn->Nbnl); 37 | } 38 | -------------------------------------------------------------------------------- /src/ptens.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuptens.h" 3 | #include "kernels.h" 4 | 5 | 6 | cuptens cuptens_empty(int D, int M, int N, int L) 7 | { 8 | DIM_CHECK(D, M, N, L); 9 | cuptens out = {D, M, N, L}; 10 | out.X = L>1 ? CEIL(L, 64) : CEIL(N, 64); 11 | out.p = (L>1 ? L : N) - out.X+1; 12 | out.MNL = (L>1 ? 
M * N : M) * out.X; 13 | out.bytes = BYTES(uint64_t, D * out.MNL); 14 | out.data = NULL; 15 | return out; 16 | } 17 | 18 | cuptens cuptens_init(int D, int M, int N, int L) 19 | { 20 | cuptens out = cuptens_empty(D, M, N, L); 21 | cudaMalloc(&out.data, out.bytes); 22 | cuASSERT(out.data, "err: cuptens malloc\n"); 23 | //cudaMemset(&(out.data), 0, out.bytes); 24 | return out; 25 | } 26 | 27 | 28 | cuptens cuptens_from_cupmem(int D, int M, int N, int L) 29 | { 30 | cuptens out = cuptens_empty(D, M, N, L); 31 | out.data = cupmem_reserve(out.bytes); 32 | return out; 33 | } 34 | 35 | cuptens cuptens_pad(cuptens *src, int p) 36 | { 37 | cuASSERT(src->data && src->L > 1, "err: L>1 for pad\n"); 38 | int D=src->D, M=src->M, N=src->N, L=src->L; 39 | cuptens out = cuptens_init(D, PAD(M,p), PAD(N,p), L); 40 | cupad(src, &out, p); 41 | return out; 42 | } 43 | 44 | cuptens cuptens_lower_init(cuptens *src, int W, int H, int Sx, int Sy) 45 | { 46 | cuASSERT(src->data && src->L > 1, "err\n"); 47 | int D=src->D, L=src->X; 48 | int Md = OUT_LEN(src->M, H, Sy); 49 | int Nd = OUT_LEN(src->N, W, Sx); 50 | return cuptens_init(D, Md, Nd, W*H*L*64); 51 | } 52 | 53 | cuptens cuptens_lower_cupmem(cuptens *src, 54 | int W, int H, int Sx, int Sy) 55 | { 56 | cuASSERT(src->data && src->L > 1, "err\n"); 57 | int D=src->D, L=src->X; 58 | int Md = OUT_LEN(src->M, H, Sy); 59 | int Nd = OUT_LEN(src->N, W, Sx); 60 | cuptens out = cuptens_empty(D, Md, Nd, W*H*L*64); 61 | out.data = cupmem_reserve(out.bytes); 62 | return out; 63 | } 64 | 65 | void cuptens_free(cuptens *t) 66 | { 67 | if (t->data) {cudaFree(t->data); t->data=NULL;} 68 | } 69 | 70 | cuptens cuptens_convert(ftens *t) 71 | { 72 | cuASSERT(t->data, "err\n"); 73 | cuftens tmp = cuftens_convert(t); 74 | cuptens out = cuptens_convert(&tmp); 75 | cuftens_free(&tmp); 76 | return out; 77 | } 78 | 79 | cuptens cuptens_convert(cuftens *t) 80 | { 81 | cuftens tmp = cuftens_round_up(t, 64); 82 | cuptens out = cuptens_init(tmp.D, tmp.M, tmp.N, tmp.L); 
83 | cupack(&tmp, &out); 84 | cuftens_free(&tmp); 85 | return out; 86 | } 87 | 88 | void cuptens_convert(cuftens *src, cuptens *dst) 89 | { 90 | cupack(src, dst); 91 | } 92 | 93 | uint64_t *cuptens_dump(cuptens *t) 94 | { 95 | uint64_t *out = MALLOC(uint64_t, t->bytes); 96 | cuASSERT(out, "err: dump malloc\n"); 97 | cudaMemcpy(out, t->data, t->bytes, cuDtoH); 98 | return out; 99 | } 100 | 101 | // void cuptens_print_ch(cuptens *t, int w, int k, const char *fmt) 102 | // { 103 | // int D=t->D, M=t->M, N=t->N, L=t->L, X=t->X; 104 | 105 | // if (!t->data) {printf("err: cuptens null\n"); return;} 106 | 107 | // uint64_t *a = cuptens_dump(t); 108 | // const uint64_t *b = a + w*t->MNL; 109 | 110 | // for (int i=0; i < M; i++) { 111 | // for (int j=0; j < N; j++) { 112 | // int v, p, o; 113 | // p=k>>6; o=k&63; 114 | // v = (b[ID3(i,j,p,N,X)] >> o) &1; 115 | // printf(fmt, 2*v -1); 116 | // } 117 | // NL; 118 | // } 119 | // free(a); 120 | // } 121 | 122 | void cuptens_print(cuptens *t) 123 | { 124 | cuptens_print(t, "%2d"); 125 | } 126 | 127 | void cuptens_print(cuptens *t, const char *fmt) 128 | { 129 | int D=t->D, M=t->M, N=t->N, L=t->L, X=t->X; 130 | 131 | if (!t->data) {printf("err: cuptens null\n"); return;} 132 | 133 | uint64_t *a = cuptens_dump(t); 134 | 135 | for (int w=0; w < D; w++) { 136 | const uint64_t *b = a + w*t->MNL; 137 | for (int i=0; i < M; i++) { 138 | for (int k=0; k < L; k++) { 139 | for (int j=0; j < N; j++) { 140 | int v, p, o; 141 | if (L == 1) { 142 | p=j>>6; o=j&63; 143 | v = (b[ID2(i,p,X)] >> o) &1; 144 | } else { 145 | p=k>>6; o=k&63; 146 | v = (b[ID3(i,j,p,N,X)] >> o) &1; 147 | } 148 | 149 | printf(fmt, 2*v -1); 150 | 151 | } SEP; } NL; } NL; } 152 | free(a); 153 | } 154 | 155 | void cuptens_print_shape(cuptens *t) 156 | { 157 | printf("cuptens: %d %d %d %d %d\n", t->D, t->M, t->N, t->L, t->X); 158 | } 159 | -------------------------------------------------------------------------------- /src/scratch.c: 
-------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | float *scratch; 4 | 5 | 6 | void scratch_alloc(int len) 7 | { 8 | if (len) scratch = MALLOC(float, len); 9 | else scratch = NULL; 10 | } 11 | 12 | void scratch_free() 13 | { 14 | if (scratch) {free(scratch); scratch=NULL;} 15 | } 16 | -------------------------------------------------------------------------------- /src/tens.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "tens.h" 5 | #include "util.h" 6 | 7 | 8 | ftens ftens_init(int D, int M, int N, int L) 9 | { 10 | ftens t = {D, M, N, L, M*N*L, BYTES(float, D*M*N*L)}; 11 | t.data = MALLOC(float, D*M*N*L); 12 | ASSERT(t.data, "err: ftens malloc"); 13 | return t; 14 | } 15 | 16 | ftens ftens_from_ptr(int D, int M, int N, int L, float *ptr) 17 | { 18 | ftens t = {D, M, N, L, M*N*L, BYTES(float, D*M*N*L)}; 19 | ASSERT(ptr, "err: NULL ptr\n"); 20 | t.data = ptr; 21 | return t; 22 | } 23 | 24 | void ftens_print_shape(ftens *t) 25 | { 26 | printf("ftens: %d %d %d %d\n", t->D, t->M, t->N, t->L); 27 | } 28 | 29 | void ftens_free(ftens *t) 30 | { 31 | if (t->data) {free(t->data); t->data=NULL;} 32 | } 33 | 34 | ftens ftens_copy(ftens *t) 35 | { 36 | const int D=t->D, M=t->M, N=t->N, L=t->L; 37 | ftens out = ftens_init(D, M, N, L); 38 | ASSERT(t->data, "err: null tens\n"); 39 | memcpy(out.data, t->data, t->bytes); 40 | return out; 41 | } 42 | 43 | ftens ftens_from_file(int D, int M, int N, int L, FILE *pf) 44 | { 45 | ftens out = ftens_init(D, M, N, L); 46 | fread(out.data, sizeof(float), D*M*N*L, pf); 47 | return out; 48 | } 49 | 50 | void ftens_reshape(ftens *t, int D, int M, int N, int L) 51 | { 52 | const int len = ftens_len(t); 53 | ASSERT(len== D*M*N*L, "err: ftens reshape\n"); 54 | t->D=D; t->M=M; t->N=N; t->L=L; 55 | } 56 | 57 | 58 | void ftens_clear(ftens *t) {memset(t->data, 0, t->bytes);} 59 | 60 | ftens ftens_zeros(int D, 
int M, int N, int L) 61 | { 62 | ftens t = ftens_init(D, M, N, L); 63 | memset(t.data, 0, t.bytes); 64 | return t; 65 | } 66 | 67 | ftens ftens_ones(int D, int M, int N, int L) 68 | { 69 | ftens t = ftens_init(D, M, N, L); 70 | for (int i=0; i < LEN(t); i++) 71 | D(t)[i] = 1.0f; 72 | return t; 73 | } 74 | 75 | ftens ftens_rand(int D, int M, int N, int L) 76 | { 77 | ftens t = ftens_init(D, M, N, L); 78 | for (int i=0; i < LEN(t); i++) 79 | D(t)[i] = (rand() % 255) - 128.0f; 80 | return t; 81 | } 82 | 83 | 84 | void ftens_sign(ftens *t) 85 | { 86 | for (int i=0; i < t->bytes/sizeof(float); i++) 87 | t->data[i] = 2.0f * (t->data[i] > 0.0f) - 1.0f; 88 | } 89 | 90 | ftens ftens_rand_range(int D, int M, int N, int L, 91 | float min, float max) 92 | { 93 | ftens t = ftens_init(D, M, N, L); 94 | for (int i=0; i < ftens_len(&t); i++) 95 | t.data[i] = ((max-min)*rand())/RAND_MAX + min; 96 | return t; 97 | } 98 | 99 | ftens ftens_copy_tch(ftens *a) 100 | { 101 | const int M=a->M, N=a->N, L=a->L, D=a->D; 102 | ftens b = ftens_init(D, N, L, M); 103 | for (int w=0; wdata + w*a->MNL; 105 | float *dst = b. data + w*b. 
MNL; 106 | for (int i=0; iM, N=a->N, L=a->L, D=a->D; 118 | for (int w=0; wdata + w*a->MNL; 120 | float *dst = b->data + w*b->MNL; 121 | for (int i=0; iD; 133 | const int Ms=src->M, Ns=src->N, Ls=src->L; 134 | const int Md=dst->M, Nd=dst->N, Ld=src->L; 135 | ASSERT(Ls == Ld && dst->D == D, "err: lowering shape\n"); 136 | float *d = dst->data; int n=0; 137 | for (int w=0; w < D; w++) { 138 | float *s = src->data + w*src->MNL; 139 | for (int i=0; iD, L =src->L; 154 | const int Ms=src->M, Ns=src->N; 155 | const int Md=dst->M, Nd=dst->N; 156 | ASSERT(D==dst->D && L==dst->L, "err: pool shape\n"); 157 | float *d=dst->data; int n=0; 158 | for (int w=0; w < D; w++) { 159 | float *s=src->data + w*src->MNL; 160 | for (int i=0; i < Ms; i+=Sy) 161 | for (int j=0; j < Ns; j+=Sx) 162 | for (int k=0; k < L; k++) { 163 | float v, max=FLT_MIN; 164 | for (int y=0; y max) max = v; 168 | } 169 | d[n++] = max; 170 | } 171 | } 172 | } 173 | 174 | 175 | ftens ftens_copy_pad(ftens *t, int p) 176 | { 177 | const int Ms=t->M, Ns=t->N, L=t->L, D=t->D; 178 | const int Md=PAD(Ms,p), Nd=PAD(Ns,p); 179 | ftens out = ftens_zeros(D, Md, Nd, L); 180 | float *pin = t->data; 181 | float *pout = out.data; 182 | for (int w=0; w < D; w++) { 183 | for (int i=0; i < Ms; i++) 184 | for (int j=0; j < Ns; j++) 185 | for (int k=0; k < L; k++) 186 | pout[ID3(i+p,j+p,k,Nd,L)] = 187 | pin[ID3(i,j,k,Ns,L)]; 188 | pin += t->MNL; 189 | pout += out.MNL; 190 | } 191 | return out; 192 | } 193 | 194 | void ftens_pad(ftens *src, ftens *dst, int p) 195 | { 196 | const int D=src->D, L=src->L; 197 | const int Ms=src->M, Ns=src->N; 198 | const int Md=dst->M, Nd=dst->N; 199 | ASSERT(D==dst->D && L==dst->L, "err: pad shape\n"); 200 | float *s = src->data; 201 | float *d = dst->data; 202 | memset(d, 0, dst->bytes); 203 | for (int w=0; w < D; w++) { 204 | for (int i=0; i < Ms; i++) 205 | for (int j=0; j < Ns; j++) 206 | for (int k=0; k < L; k++) 207 | d[ID3(i+p,j+p,k,Nd,L)] = 208 | s[ID3(i,j,k,Ns,L)]; 209 | s += src->MNL; 210 | 
d += dst->MNL; 211 | } 212 | } 213 | 214 | void ftens_print(ftens *t, const char *fmt) 215 | { 216 | if (!t->data) {printf("ftens NULL\n"); return;} 217 | const int M=t->M, N=t->N, L=t->L, D=t->D; 218 | float *ptr = t->data; 219 | for (int w=0; w < D; w++) { 220 | for (int i=0; i < M; i++) { 221 | for (int k=0; k < L; k++) { 222 | for (int j=0; j < N; j++) { 223 | float v = ptr[ID3(i,j,k,N,L)]; 224 | printf(fmt, v); 225 | } printf(" | "); 226 | } NL; 227 | } 228 | ptr += t->MNL; NL; 229 | } 230 | NL; 231 | } 232 | 233 | void ftens_print_ch(ftens *t, int w, int k, int I, int J, 234 | const char *fmt) 235 | { 236 | if (!t->data) {printf("ftens NULL\n"); return;} 237 | const int D=t->D, M=t->M, N=t->N, L=t->L; 238 | ASSERT(w < D, "err: print\n"); 239 | float *ptr = t->data + w * t->MNL; 240 | for (int i=0; i < MIN(M, (unsigned)I); i++) { 241 | for (int j=0; j < MIN(N,(unsigned)J); j++) { 242 | printf(fmt, ptr[ID3(i,j,k,N,L)]); 243 | } 244 | NL; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/tens.cu: -------------------------------------------------------------------------------- 1 | #include "cutens.h" 2 | #include "kernels.h" 3 | 4 | 5 | cuftens cuftens_empty(int D, int M, int N, int L) 6 | { 7 | cuftens t; 8 | DIM_CHECK(D, M, N, L); 9 | CUFTENS_INIT(t, D, M, N, L); 10 | return t; 11 | } 12 | 13 | cuftens cuftens_init(int D, int M, int N, int L) 14 | { 15 | cuftens t = cuftens_empty(D, M, N, L); 16 | cudaMalloc(&t.data, t.bytes); 17 | cuASSERT(t.data, "err: cuftens cuMalloc\n"); 18 | return t; 19 | } 20 | 21 | cuftens cuftens_lower_init(cuftens *t, int W, int H, int Sx, int Sy) 22 | { 23 | int M = OUT_LEN(t->M, H, Sy); 24 | int N = OUT_LEN(t->N, W, Sx); 25 | return cuftens_init(t->D, M, N, W*H*t->L); 26 | } 27 | 28 | cuftens cuftens_lower_cufmem(cuftens *t, int W, int H, int Sx, int Sy) 29 | { 30 | int M = OUT_LEN(t->M, H, Sy); 31 | int N = OUT_LEN(t->N, W, Sx); 32 | cuftens out = cuftens_empty(t->D, M, N, 
W*H*t->L); 33 | out.data = cufmem_reserve(out.bytes); 34 | return out; 35 | } 36 | 37 | void cuftens_free(cuftens *t) 38 | { 39 | if (t->data) {cudaFree(t->data); t->data=NULL;} 40 | } 41 | 42 | cuftens cuftens_zeros(int D, int M, int N, int L) 43 | { 44 | cuftens t = cuftens_init(D, M, N, L); 45 | cudaMemset(t.data, 0, t.bytes); 46 | return t; 47 | } 48 | 49 | cuftens cuftens_ones(int D, int M, int N, int L) 50 | { 51 | cuftens t = cuftens_init(D, M, N, L); 52 | cuset(&t, 1.0); 53 | return t; 54 | } 55 | 56 | cuftens cuftens_rand(int D, int M, int N, int L) 57 | { 58 | cuftens t = cuftens_init(D, M, N, L); 59 | ftens tmp = ftens_rand(D, M, N, L); 60 | cudaMemcpy(t.data, tmp.data, t.bytes, cuHtoD); 61 | ftens_free(&tmp); 62 | return t; 63 | } 64 | 65 | cuftens cuftens_rand(int D, int M, int N, int L, float min, float max) 66 | { 67 | cuftens t = cuftens_init(D, M, N, L); 68 | ftens tmp = ftens_rand_range(D, M, N, L, min, max); 69 | cudaMemcpy(t.data, tmp.data, t.bytes, cuHtoD); 70 | ftens_free(&tmp); 71 | return t; 72 | } 73 | 74 | cuftens cuftens_from_cufmem(int D, int M, int N, int L) 75 | { 76 | cuftens t = cuftens_empty(D, M, N, L); 77 | t.data = cufmem_reserve(t.bytes); 78 | return t; 79 | } 80 | 81 | cuftens cuftens_convert(ftens *t) 82 | { 83 | cuftens out = cuftens_init(t->D, t->M, t->N, t->L); 84 | cudaMemcpy(out.data, t->data, t->bytes, cuHtoD); 85 | return out; 86 | } 87 | 88 | void cuftens_reshape(cuftens *t, int D, int M, int N, int L) 89 | { 90 | const int len = cuftens_len(t); 91 | cuASSERT(len == D*M*N*L, "err: cuftens reshape\n"); 92 | t->D=D; t->M=M; t->N=N; t->L=L; 93 | } 94 | 95 | cuftens cuftens_copy(cuftens *t) 96 | { 97 | cuASSERT(t->data, "err\n"); 98 | cuftens out = cuftens_init(t->D, t->M, t->N, t->L); 99 | cudaMemcpy(out.data, t->data, t->bytes, cuDtoD); 100 | return out; 101 | } 102 | 103 | void cuftens_copy(cuftens *src, cuftens *dst) 104 | { 105 | cuASSERT(src->data && dst->data, "err\n"); 106 | cucopy(src, dst); 107 | } 108 | 109 | 
void cuftens_pad(cuftens *src, cuftens *dst, int p) 110 | { 111 | cuASSERT(dst->data && dst->data, "err\n"); 112 | cupad(src, dst, p); 113 | } 114 | 115 | cuftens cuftens_pad(cuftens *t, const int p) 116 | { 117 | const int D=t->D, M=t->M, N=t->N, L=t->L; 118 | cuftens out = cuftens_zeros(D, PAD(M,p), PAD(N,p), L); 119 | cupad(t, &out, p); 120 | return out; 121 | } 122 | 123 | cuftens cuftens_round_up(ftens *t, int n) 124 | { 125 | cuftens tmp = cuftens_convert(t); 126 | cuftens out = cuftens_round_up(&tmp, n); 127 | cuftens_free(&tmp); 128 | return out; 129 | } 130 | 131 | cuftens cuftens_round_up(cuftens *t, int n) 132 | { 133 | int D=t->D, M=t->M, N=t->N, L=t->L; 134 | if (L > 1) L = ROUND_UP(L, n); 135 | else N = ROUND_UP(N, n); 136 | cuftens out = cuftens_zeros(D, M, N, L); 137 | cucopy(t, &out); 138 | return out; 139 | } 140 | 141 | cuftens cuftens_round_up_cufmem(cuftens *t, int n) 142 | { 143 | int D=t->D, M=t->M, N=t->N, L=t->L; 144 | if (L > 1) L = ROUND_UP(L, n); 145 | else N = ROUND_UP(N, n); 146 | cuftens out = cuftens_from_cufmem(D, M, N, L); 147 | cudaMemset(out.data, 0, out.bytes); 148 | cucopy(t, &out); 149 | return out; 150 | } 151 | 152 | ftens cuftens_dump(cuftens *t) 153 | { 154 | int D=t->D, M=t->M, N=t->N, L=t->L; 155 | cuASSERT(t->data, "err\n"); 156 | ftens out = ftens_init(D, M, N, L); 157 | cudaMemcpy(out.data, t->data, t->bytes, cuDtoH); 158 | return out; 159 | } 160 | 161 | void cuftens_print(cuftens *t) 162 | { 163 | cuftens_print(t, "%.2f "); 164 | } 165 | 166 | void cuftens_print(cuftens *t, const char *fmt) 167 | { 168 | if (!t->data) {printf("tens null\n"); return;} 169 | ftens tmp = cuftens_dump(t); 170 | ftens_print(&tmp, fmt); 171 | ftens_free(&tmp); 172 | } 173 | 174 | void cuftens_print_ch(cuftens *t, int b, int ch, int I, int J, 175 | const char *fmt) 176 | { 177 | ftens tmp = cuftens_dump(t); 178 | ftens_print_ch(&tmp, b, ch, I, J, fmt); 179 | ftens_free(&tmp); 180 | } 181 | 182 | void cuftens_print_shape(cuftens *t) 183 | { 
184 | printf("cuftens: %d %d %d %d\n", t->D, t->M, t->N, t->L); 185 | } 186 | -------------------------------------------------------------------------------- /src/timez.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "timez.h" 3 | 4 | 5 | void time_record(myTime *time) 6 | { 7 | gettimeofday(time, NULL); 8 | } 9 | 10 | 11 | long elapsed_time(myTime start, myTime stop) 12 | { 13 | long sec = stop.tv_sec - start.tv_sec; 14 | long usec = stop.tv_usec - start.tv_usec; 15 | usec += (sec * 1e6); 16 | printf("%ld us\t", usec); 17 | return usec; 18 | } 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # test 2 | set(ESP_LIBS espresso m ${BLAS_LIBRARIES}) 3 | set(CUESP_LIBS cuespresso ${CUDA_LIBRARIES}) 4 | 5 | add_executable(t1 t1.c) 6 | target_link_libraries(t1 ${ESP_LIBS}) 7 | 8 | add_executable(mnist mnist.c) 9 | target_link_libraries(mnist ${ESP_LIBS}) 10 | 11 | add_executable(cifar cifar.c) 12 | target_link_libraries(cifar ${ESP_LIBS}) 13 | 14 | if (CUDA_FOUND) 15 | cuda_add_executable(cut1 t1.cu) 16 | target_link_libraries(cut1 ${ESP_LIBS} ${CUESP_LIBS}) 17 | 18 | cuda_add_executable(cup_mnist cup_mnist.cu) 19 | target_link_libraries(cup_mnist ${ESP_LIBS} ${CUESP_LIBS}) 20 | endif () 21 | -------------------------------------------------------------------------------- /test/cup_mnist.cu: -------------------------------------------------------------------------------- 1 | #include "cuesp.h" 2 | 3 | 4 | int main(int argc, char *argv[]) 5 | { 6 | cufmem_alloc(BYTES(float, 4096*28*28)); 7 | cupmem_alloc(BYTES(uint64_t, 4096*28*28/64)); 8 | 9 | mlp tmp = mlp_load("mnist_params", 1); 10 | cupmlp nn = cupmlp_convert(&tmp); 11 | mlp_free(&tmp); 12 | 13 | // batch size = 1 14 | ftens img = ftens_init(1, 28, 28, 1); 15 | ftens lab = ftens_init(1, 1, 10, 1); 16 
| mnist_load_X("mnist_test", 0, 1, &img); 17 | mnist_load_y("mnist_lab", 0, 1, &lab); 18 | 19 | // forward of 1 image 20 | cuinputLayer *il = &nn.il; 21 | cupdenseLayer *dl = nn.dl; 22 | cubnormLayer *bnl = nn.bnl; 23 | 24 | cuinputLayer_forward(&img, il, 0); 25 | 26 | cupdenseLayer_forward_initial(&il->out, dl, 128); 27 | cubnormLayer_forward(&dl->out, bnl, 0); 28 | cupsignAct_forward(&dl->out, &dl->pout); 29 | 30 | dl++; bnl++; 31 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 32 | cubnormLayer_forward(&dl->out, bnl, 0); 33 | cupsignAct_forward(&dl->out, &dl->pout); 34 | 35 | dl++; bnl++; 36 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 37 | cubnormLayer_forward(&dl->out, bnl, 0); 38 | cupsignAct_forward(&dl->out, &dl->pout); 39 | 40 | dl++; bnl++; 41 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 42 | cubnormLayer_forward(&dl->out, bnl, 0); 43 | 44 | cudaDeviceSynchronize(); 45 | cuftens_print(&dl->out); 46 | 47 | cupmlp_free(&nn); 48 | cufmem_free(); 49 | cupmem_free(); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /toEspresso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """" 5 | Usage: toEspresso.py 6 | 7 | Options: 8 | -h --help show this message 9 | """ 10 | 11 | import os 12 | import numpy as np 13 | import struct 14 | import json 15 | 16 | INPUT = 0 17 | DENSE = 1 18 | BNORM = 2 19 | CONV = 3 20 | POOL = 4 21 | NUM = 1<<4 22 | DATA = 2<<4 23 | 24 | 25 | def read_param(params, n): 26 | return params['arr_%d' % n].astype(np.float32) 27 | 28 | 29 | def main(data_file, desc_file, out_file): 30 | assert(os.path.exists(desc_file) and os.path.exists(data_file)) 31 | desc = json.load(open(desc_file, 'r')) 32 | params, n = np.load(data_file), 0 33 | with open(out_file, 'wb') as out: 34 | for elem in desc: 35 | tmp = '' 36 | 37 | if elem['type'] == "ndense": 38 | N = elem['val'] 39 | print 'ndense: 
%d' % N 40 | tmp = struct.pack('B', DENSE | NUM) + \ 41 | struct.pack('i', N) 42 | 43 | elif elem['type'] == "nbnorm": 44 | N = elem['val'] 45 | print 'nbnorm: %d' % N 46 | tmp = struct.pack('B', BNORM | NUM) + \ 47 | struct.pack('i', N) 48 | 49 | elif elem['type'] == "nconv": 50 | N = elem['val'] 51 | print 'nconv: %d' % N 52 | tmp = struct.pack('B', CONV | NUM) + \ 53 | struct.pack('i', N) 54 | 55 | elif elem['type'] == "npool": 56 | N = elem['val'] 57 | print 'npool: %d' % N 58 | tmp = struct.pack('B', POOL | NUM) + \ 59 | struct.pack('i', N) 60 | 61 | elif elem['type'] == 'input': 62 | tmp = struct.pack('B', INPUT | DATA) 63 | dim = elem['dim'] 64 | print 'input: %d %d %d' % tuple(dim) 65 | #tmp += struct.pack('3i', *dim) 66 | 67 | elif elem['type'] == 'dense': 68 | M, N = elem['dim'] 69 | print 'dense: %d %d' % (M, N) 70 | tmp = struct.pack('B', DENSE | DATA) + \ 71 | struct.pack('2i', M, N) 72 | W = read_param(params, n).T 73 | b = read_param(params, n + 1) 74 | tmp += W.tostring('C') + b.tostring('C') 75 | n += 2 76 | 77 | elif elem['type'] == 'bnorm': 78 | N = elem['dim'] 79 | print 'bnorm: %d' % N 80 | tmp = struct.pack('B', BNORM | DATA) + \ 81 | struct.pack('i', N) 82 | for i in range(4): 83 | tmp += read_param(params, n+i).tostring('C') 84 | n += 4 85 | 86 | elif elem['type'] == 'conv': 87 | dim = elem['dim'] 88 | print 'conv: %d %d %d %d %d %d %d' % tuple(dim) 89 | tmp = struct.pack('B', CONV | DATA) + \ 90 | struct.pack('7i', *dim) 91 | H = read_param(params, n) 92 | b = read_param(params, n + 1) 93 | tmp += H.tostring('C') 94 | tmp += b.tostring('C') 95 | n += 2 96 | 97 | elif elem['type'] == 'pool': 98 | dim = elem['dim'] 99 | print 'max pool: %d %d %d %d' % tuple(dim) 100 | tmp = struct.pack('B', POOL | DATA) + \ 101 | struct.pack('4i', *dim) 102 | else: 103 | pass 104 | 105 | print len(tmp) 106 | out.write(tmp) 107 | 108 | 109 | if __name__ == '__main__': 110 | from docopt import docopt 111 | 112 | args = docopt(__doc__) 113 | data, desc, out = 
args[''], args[''], args[''] 114 | 115 | main(data, desc, out) 116 | --------------------------------------------------------------------------------