├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data ├── cifar10.json └── mnist.json ├── include ├── acts.h ├── cuacts.h ├── cuesp.h ├── culayers.h ├── cumem.h ├── cunn.h ├── cuptens.h ├── cutens.h ├── esp.h ├── handlers.h ├── handlers │ ├── cifar.h │ └── mnist.h ├── kernels.h ├── layers.h ├── layers │ ├── bnorm.h │ ├── conv.h │ ├── cubnorm.h │ ├── cuconv.h │ ├── cudense.h │ ├── cuinput.h │ ├── cupconv.h │ ├── cupdense.h │ ├── cupinput.h │ ├── cupool.h │ ├── dense.h │ ├── input.h │ └── pool.h ├── nn.h ├── nn │ ├── cnn.h │ ├── cucnn.h │ ├── cumlp.h │ ├── cupcnn.h │ ├── cupmlp.h │ └── mlp.h ├── params.h ├── tens.h ├── timez.h ├── util.cuh └── util.h ├── readEspresso.py ├── src ├── CMakeLists.txt ├── activations │ ├── psignl.cu │ ├── relul.c │ ├── signl.c │ ├── signl.cu │ └── softmaxl.c ├── cnn.c ├── cnn.cu ├── cumem.cu ├── handlers │ ├── cifar.c │ └── mnist.c ├── kernels │ ├── bnorm.cu │ ├── bnorm.cuh │ ├── bp.cu │ ├── bp.cuh │ ├── copy.cu │ ├── copy.cuh │ ├── lower.cu │ ├── lower.cuh │ ├── norm.cu │ ├── norm.cuh │ ├── pack.cu │ ├── pack.cuh │ ├── pad.cu │ ├── pad.cuh │ ├── pgemm.cu │ ├── pgemm.cuh │ ├── pgemv.cu │ ├── pgemv.cuh │ ├── pool.cu │ ├── pool.cuh │ ├── set.cu │ ├── set.cuh │ ├── sgemm.cu │ ├── sgemm.cuh │ ├── sgemv.cu │ ├── sgemv.cuh │ ├── sign.cu │ ├── sign.cuh │ ├── tch.cu │ └── tch.cuh ├── layers │ ├── bnorml.c │ ├── bnorml.cu │ ├── convl.c │ ├── convl.cu │ ├── densel.c │ ├── densel.cu │ ├── inputl.c │ ├── inputl.cu │ ├── pconvl.cu │ ├── pdensel.cu │ ├── pinputl.cu │ ├── pooll.c │ └── pooll.cu ├── mlp.c ├── mlp.cu ├── params.c ├── pcnn.cu ├── pmlp.cu ├── ptens.cu ├── scratch.c ├── tens.c ├── tens.cu └── timez.c ├── test ├── CMakeLists.txt └── cup_mnist.cu └── toEspresso.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | cifar_test 3 | cifar_params 4 | mnist_test 5 | mnist_params 6 | mnist_lab 7 | *.npz 8 | *.dat 9 | *.esp 10 | *.o 11 | *~ 12 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(espresso) 3 | 4 | find_package(BLAS REQUIRED) 5 | find_package(CUDA) 6 | 7 | option(CMAKE_EXPORT_COMPILE_COMMANDS "" 1) 8 | 9 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_50) 10 | 11 | include_directories("${CMAKE_SOURCE_DIR}/include") 12 | include_directories("${CMAKE_SOURCE_DIR}/src/kernels") 13 | include_directories("${CMAKE_SOURCE_DIR}/include/layers") 14 | 15 | add_subdirectory("src") 16 | add_subdirectory("test") 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Fabrizio Pedersoli 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Espresso 2 | Efficient forward propagation for BCNNs 3 | 4 | [https://arxiv.org/pdf/1705.07175] 5 | 6 | -------------------------------------------------------------------------------- /data/cifar10.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"type": "ndense", "val": 3}, 3 | {"type": "nbnorm", "val": 9}, 4 | {"type": "nconv", "val": 6}, 5 | {"type": "npool", "val": 3}, 6 | 7 | {"type": "input", "dim": [32, 32, 3]}, 8 | 9 | {"type": "conv", "dim": [1, 128, 3, 3, 3, 1, 1]}, 10 | {"type": "bnorm", "dim": 128}, 11 | {"type": "conv", "dim": [1, 128, 3, 3, 128, 1, 1]}, 12 | {"type": "pool", "dim": [2, 2, 2, 2]}, 13 | {"type": "bnorm", "dim": 128}, 14 | 15 | {"type": "conv", "dim": [1, 256, 3, 3, 128, 1, 1]}, 16 | {"type": "bnorm", "dim": 256}, 17 | {"type": "conv", "dim": [1, 256, 3, 3, 
256, 1, 1]}, 18 | {"type": "pool", "dim": [2, 2, 2, 2]}, 19 | {"type": "bnorm", "dim": 256}, 20 | 21 | {"type": "conv", "dim": [1, 512, 3, 3, 256, 1, 1]}, 22 | {"type": "bnorm", "dim": 512}, 23 | {"type": "conv", "dim": [1, 512, 3, 3, 512, 1, 1]}, 24 | {"type": "pool", "dim": [2, 2, 2, 2]}, 25 | {"type": "bnorm", "dim": 512}, 26 | 27 | {"type": "dense", "dim": [1024, 8192]}, 28 | {"type": "bnorm", "dim": 1024}, 29 | {"type": "dense", "dim": [1024, 1024]}, 30 | {"type": "bnorm", "dim": 1024}, 31 | 32 | {"type": "dense", "dim": [10, 1024]}, 33 | {"type": "bnorm", "dim": 10} 34 | ] 35 | -------------------------------------------------------------------------------- /data/mnist.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"type": "ndense", "val": 4}, 3 | {"type": "nbnorm", "val": 4}, 4 | 5 | {"type": "input", "dim": [28, 28, 1]}, 6 | 7 | {"type": "dense", "dim": [4096, 784]}, 8 | {"type": "bnorm", "dim": 4096}, 9 | {"type": "dense", "dim": [4096, 4096]}, 10 | {"type": "bnorm", "dim": 4096}, 11 | {"type": "dense", "dim": [4096, 4096]}, 12 | {"type": "bnorm", "dim": 4096}, 13 | {"type": "dense", "dim": [10, 4096]}, 14 | {"type": "bnorm", "dim": 10} 15 | ] 16 | -------------------------------------------------------------------------------- /include/acts.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_ACTIVATIONS_H 2 | #define ESP_ACTIVATIONS_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | void signAct_forward(ftens *t); 11 | void signAct_backward(ftens *dout); 12 | void reluAct_forward(ftens *t); 13 | void reluAct_backward(ftens *dout); 14 | void softmaxAct_forward(ftens *t); 15 | void softmaxAct_backward(ftens *dout); 16 | 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | 22 | 23 | #endif /* ESP_ACTIVATIONS_H */ 24 | -------------------------------------------------------------------------------- /include/cuacts.h: 
-------------------------------------------------------------------------------- 1 | #ifndef ESP_CUACT_H 2 | #define ESP_CUACT_H 3 | 4 | #include "cuptens.h" 5 | 6 | 7 | void cusignAct_forward(cuftens *t); 8 | void cusignAct_backward(cuftens *dout); 9 | void cupsignAct_forward(cuftens *src, cuptens *out); 10 | 11 | 12 | 13 | #endif /* ESP_CUACT_H */ 14 | -------------------------------------------------------------------------------- /include/cuesp.h: -------------------------------------------------------------------------------- 1 | #ifndef CUESP_H 2 | #define CUESP_H 3 | 4 | #include "util.cuh" 5 | #include "cutens.h" 6 | #include "cuptens.h" 7 | #include "cumem.h" 8 | #include "culayers.h" 9 | #include "cunn.h" 10 | #include "cuacts.h" 11 | #include "handlers.h" 12 | 13 | 14 | #endif /* CUESP_H */ 15 | -------------------------------------------------------------------------------- /include/culayers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CULAYERS_H 2 | #define ESP_CULAYERS_H 3 | 4 | #include "layers/cuinput.h" 5 | #include "layers/cuconv.h" 6 | #include "layers/cudense.h" 7 | #include "layers/cupool.h" 8 | #include "layers/cubnorm.h" 9 | 10 | #include "layers/cupinput.h" 11 | #include "layers/cupdense.h" 12 | #include "layers/cupconv.h" 13 | 14 | 15 | #endif /* ESP_CULAYERS_H */ 16 | -------------------------------------------------------------------------------- /include/cumem.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUMEM_H 2 | #define ESP_CUMEM_H 3 | 4 | #include 5 | 6 | #define CUFMEM _cufmem() 7 | #define CUPMEM _cupmem() 8 | 9 | 10 | typedef struct { 11 | int total, used; 12 | float *base; 13 | float *curr; 14 | } cufmem; 15 | 16 | 17 | typedef struct { 18 | int total, used; 19 | uint64_t *base; 20 | uint64_t *curr; 21 | } cupmem; 22 | 23 | 24 | int _cufmem(); 25 | int _cupmem(); 26 | 27 | void cufmem_alloc(int len); 28 | float 
*cufmem_reserve(int len); 29 | void cufmem_free(); 30 | void cufmem_reset(); 31 | 32 | void cupmem_alloc(int len); 33 | uint64_t *cupmem_reserve(int len); 34 | void cupmem_free(); 35 | void cupmem_reset(); 36 | 37 | 38 | #endif /* ESP_CUMEM_H */ 39 | -------------------------------------------------------------------------------- /include/cunn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUNN_H 2 | #define ESP_CUNN_H 3 | 4 | #include "nn/cumlp.h" 5 | #include "nn/cucnn.h" 6 | #include "nn/cupmlp.h" 7 | #include "nn/cupcnn.h" 8 | 9 | #endif /* ESP_CUNN_H */ 10 | -------------------------------------------------------------------------------- /include/cuptens.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPTENS_H 2 | #define ESP_CUPTENS_H 3 | 4 | #include 5 | #include "cumem.h" 6 | #include "tens.h" 7 | #include "cutens.h" 8 | 9 | 10 | typedef struct { 11 | int D, M, N, L; 12 | int X, p, MNL; 13 | int bytes; 14 | uint64_t *data; 15 | } cuptens; 16 | 17 | 18 | cuptens cuptens_empty(int D, int M, int N, int L); 19 | cuptens cuptens_init(int D, int M, int N, int L); 20 | cuptens cuptens_pad(cuptens *src, int p); 21 | cuptens cuptens_from_cupmem(int D, int M, int N, int L); 22 | cuptens cuptens_lower_init(cuptens *src, int W, int H, int Sx, int Sy); 23 | cuptens cuptens_lower_cupmem(cuptens *src, int W, int H, int Sx, int Sy); 24 | 25 | cuptens cuptens_convert(ftens *t); 26 | cuptens cuptens_convert(cuftens *t); 27 | void cuptens_convert(cuftens *src, cuptens *dst); 28 | void cuptens_free(cuptens *pt); 29 | uint64_t *cuptens_dump(cuptens *t); 30 | void cuptens_print_shape(cuptens *t); 31 | void cuptens_print(cuptens *t); 32 | void cuptens_print(cuptens *t, const char *fmt); 33 | void cuptens_print_ch(cuptens *t, int w, int k, const char *fmt); 34 | 35 | #endif /* ESP_CUPTENS_H */ 36 | 
-------------------------------------------------------------------------------- /include/cutens.h: -------------------------------------------------------------------------------- 1 | #ifndef EPS_CUTENS_H 2 | #define EPS_CUTENS_H 3 | 4 | #include "util.cuh" 5 | #include "cumem.h" 6 | #include "tens.h" 7 | 8 | 9 | typedef struct { 10 | int D, M, N, L, MNL; 11 | int bytes; 12 | float *data; 13 | } cuftens; 14 | 15 | 16 | #define CUFTENS_INIT(t,D,M,N,L) ( \ 17 | t.D=D, t.M=M, t.N=N, t.L=L, t.MNL=M*N*L, \ 18 | t.bytes=BYTES(float, D*M*N*L), \ 19 | t.data=NULL) 20 | 21 | #define DIM_CHECK(D,M,N,L) \ 22 | cuASSERT(D>0 && M>0 && N>0 && N>0 && L>0, \ 23 | "err: cuftens invalid size\n") 24 | 25 | 26 | cuftens cuftens_empty(int D, int M, int N, int L); 27 | cuftens cuftens_init(int D, int M, int N, int L); 28 | cuftens cuftens_from_cufmem(int D, int M, int N, int L); 29 | cuftens cuftens_lower_init(cuftens *t, int W, int H, int Sx, int Sy); 30 | cuftens cuftens_lower_cufmem(cuftens *t, int W, int H, int Sx, int Sy); 31 | cuftens cuftens_zeros(int D, int M, int N, int L); 32 | cuftens cuftens_ones(int D, int M, int N, int L); 33 | cuftens cuftens_rand(int D, int M, int N, int L); 34 | cuftens cuftens_rand(int D, int M, int N, int L, 35 | float min, float max); 36 | 37 | void cuftens_round_up(cuftens *src, cuftens *dst); 38 | cuftens cuftens_round_up(ftens *t, int n); 39 | cuftens cuftens_round_up(cuftens *t, int n); 40 | cuftens cuftens_round_up_cufmem(cuftens *t, int n); 41 | cuftens cuftens_copy(cuftens *t); 42 | void cuftens_copy(cuftens *src, cuftens *dst); 43 | cuftens cuftens_convert(ftens *t); 44 | 45 | void cuftens_pad(cuftens *src, cuftens *dst, int p); 46 | cuftens cuftens_pad(cuftens *t, int p); 47 | 48 | ftens cuftens_dump(cuftens *t); 49 | 50 | void cuftens_free(cuftens *t); 51 | void cuftens_reshape(cuftens *t, int D, int M, int N, int L); 52 | void cuftens_print_shape(cuftens *t); 53 | void cuftens_print(cuftens *t); 54 | void cuftens_print(cuftens *t, const 
char *fmt); 55 | void cuftens_print_ch(cuftens *t, int b, int ch, int I, int J, 56 | const char *fmt); 57 | 58 | static inline 59 | int cuftens_len(cuftens *t) {return t->bytes/sizeof(float);} 60 | 61 | 62 | #endif /* EPS_CUTENS_H */ 63 | -------------------------------------------------------------------------------- /include/esp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_H 2 | #define ESP_H 3 | 4 | #include "tens.h" 5 | #include "layers.h" 6 | #include "acts.h" 7 | #include "mlp.h" 8 | #include "cnn.h" 9 | 10 | #endif /* ESP_H */ 11 | -------------------------------------------------------------------------------- /include/handlers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_HANDLERS_H 2 | #define ESP_HANDLERS_H 3 | 4 | #include "handlers/mnist.h" 5 | #include "handlers/cifar.h" 6 | 7 | #endif /* ESP_HANDLERS_H */ 8 | -------------------------------------------------------------------------------- /include/handlers/cifar.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CIFAR_H 2 | #define ESP_CIFAR_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void cifar10_load_Xy(const char *tf, int start, int num, 9 | ftens *X, ftens *y); 10 | 11 | #ifdef __cplusplus 12 | } 13 | #endif 14 | 15 | #endif /* ESP_CIFAR_H */ 16 | -------------------------------------------------------------------------------- /include/handlers/mnist.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_MNIST_H 2 | #define ESP_MNIST_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | 11 | void mnist_load_X(const char *tf, int start, int num, 12 | ftens *X); 13 | 14 | void mnist_load_y(const char *lf, int start, int num, 15 | ftens *y); 16 | 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | 22 | #endif /* ESP_MNIST_H */ 23 | 
-------------------------------------------------------------------------------- /include/kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_KERNELS_H 2 | #define ESP_KERNELS_H 3 | 4 | #include 5 | #include "cuptens.h" 6 | 7 | 8 | void cuset(cuftens *t, float v); 9 | void cucopy(cuftens *src, cuftens *dst); 10 | void cupad(cuftens *src, cuftens *dst, int p); 11 | void cupad(cuptens *src, cuptens *dst, int p); 12 | void cupack(cuftens *src, cuptens *dst); 13 | void cutch(cuftens *src, cuftens *dst); 14 | void cusign(cuftens *a); 15 | void cunorm(cuftens *t); 16 | 17 | void cubp_split_pack(cuftens *src, cuptens *dst); 18 | void cubp_merge(cuftens *src, cuftens *dst, cuftens *fix, float norm); 19 | 20 | void culower (cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy); 21 | void cuplower (cuptens *src, cuptens *dst, int W, int H, int Sx, int Sy); 22 | void cumaxpool(cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy); 23 | 24 | void cubnorm(cuftens *mean, cuftens *istd, 25 | cuftens *beta, cuftens *gamma, 26 | cuftens *in); 27 | 28 | void sgemv(cuftens *a, cuftens *b, cuftens *c); 29 | void sgemm(cuftens *a, cuftens *b, cuftens *c); 30 | 31 | void sgemm(int M, int N, int K, 32 | const float * __restrict__ A, int lda, 33 | const float * __restrict__ B, int ldb, 34 | float * __restrict__ C, int ldc); 35 | 36 | void pgemv(int m, int n, 37 | const uint64_t * __restrict__ A, 38 | const uint64_t * __restrict__ x, 39 | float * __restrict__ y); 40 | 41 | void pgemm(int M, int N, int K, 42 | const uint64_t * __restrict__ A, 43 | const uint64_t * __restrict__ B, 44 | float * __restrict__ C); 45 | 46 | void pgemm_init(int M, int N, int K, 47 | const uint64_t * __restrict__ A, 48 | const uint64_t * __restrict__ B, 49 | float * __restrict__ C); 50 | 51 | void pgemm_init(int M, int N, int K, 52 | const uint64_t * __restrict__ a, int lda, 53 | const uint64_t * __restrict__ b, int ldb, 54 | float * __restrict__ c, 
int ldc); 55 | 56 | void pgemm_init_rev(int M, int N, int K, 57 | const uint64_t * __restrict__ A, 58 | const uint64_t * __restrict__ B, 59 | float * __restrict__ C); 60 | 61 | void pgemm32(int M, int N, int K, 62 | const uint32_t * __restrict__ A, 63 | const uint32_t * __restrict__ B, 64 | float * __restrict__ C); 65 | 66 | 67 | 68 | #endif /* ESP_KERNELS_H */ 69 | -------------------------------------------------------------------------------- /include/layers.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_LAYERS_H 2 | #define ESP_LAYERS_H 3 | 4 | #include "layers/input.h" 5 | #include "layers/conv.h" 6 | #include "layers/dense.h" 7 | #include "layers/pool.h" 8 | #include "layers/bnorm.h" 9 | 10 | #endif /* ESP_LAYERS_H */ 11 | -------------------------------------------------------------------------------- /include/layers/bnorm.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_BNORM_H 2 | #define ESP_BNORM_H 3 | 4 | #include "tens.h" 5 | 6 | #define BNORML_INIT(bnl) { \ 7 | bnl.N=0; bnl.ug=0; \ 8 | bnl.mean. data=NULL; bnl.istd .data=NULL; \ 9 | bnl.gmean.data=NULL; bnl.gistd .data=NULL; \ 10 | bnl.beta .data=NULL; bnl.gamma .data=NULL; \ 11 | bnl.dbeta.data=NULL; bnl.dgamma.data=NULL; \ 12 | bnl.in. data=NULL; bnl.tmp. 
data=NULL; \ 13 | } 14 | 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | typedef struct { 21 | int N, ug; 22 | ftens mean, istd, gmean, gistd; 23 | ftens gamma, beta, dgamma, dbeta; 24 | ftens in, tmp; 25 | } bnormLayer; 26 | 27 | 28 | bnormLayer bnormLayer_init(int use_global); 29 | void bnormLayer_free(bnormLayer *bnl); 30 | void bnormLayer_forward(ftens *t, bnormLayer *bnl, int save); 31 | void bnormLayer_backward(ftens *dt, bnormLayer *bnl); 32 | void bnormLayer_update(bnormLayer *bnl); 33 | void bnormLayer_set(ftens *mean, ftens *istd, 34 | ftens *gamma, ftens *beta, bnormLayer *bnl); 35 | 36 | 37 | #ifdef __cplusplus 38 | } 39 | #endif 40 | 41 | #endif /* ESP_BNORM_H */ 42 | -------------------------------------------------------------------------------- /include/layers/conv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CONV_H 2 | #define ESP_CONV_H 3 | 4 | #include "tens.h" 5 | 6 | #define CONVL_INIT(cl) { \ 7 | cl.D=0; cl.M=0; cl.N=0; cl.L=0; \ 8 | cl.Sm=0; cl.Sn=0; cl.p=0; \ 9 | cl.W.data=NULL; cl.b.data=NULL; cl.out.data=NULL; \ 10 | cl.dW.data=NULL; cl.db.data=NULL; cl.in.data=NULL; \ 11 | } 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | typedef struct { 18 | int D, M, N, L, Sm, Sn, p; 19 | ftens W, b, out; 20 | ftens dW, db, in; 21 | } convLayer; 22 | 23 | convLayer convLayer_init(int Sm, int Sn, int p); 24 | void convLayer_print_shape(convLayer *cl); 25 | void convLayer_free(convLayer *cl); 26 | void convLayer_set(ftens *W, convLayer *cl); 27 | void convLayer_forward(ftens *t, convLayer *cl, int save); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif /* ESP_CONV_H */ 34 | -------------------------------------------------------------------------------- /include/layers/cubnorm.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUBNORN_H 2 | #define ESP_CUBNORN_H 3 | 4 | #include "cutens.h" 5 | 
#include "bnorm.h" 6 | 7 | 8 | typedef struct { 9 | int N, ug; 10 | cuftens mean, istd, gmean, gistd; 11 | cuftens gamma, beta, dgamma, dbeta; 12 | cuftens in, tmp; 13 | } cubnormLayer; 14 | 15 | 16 | cubnormLayer cubnormLayer_init(int use_global); 17 | void cubnormLayer_convert(bnormLayer *src, cubnormLayer *dst); 18 | void cubnormLayer_free(cubnormLayer *bnl); 19 | void cubnormLayer_forward(cuftens *t, cubnormLayer *bnl, int save); 20 | void cubnormLayer_backward(cuftens *dt, cubnormLayer *bnl); 21 | void cubnormLayer_update(cubnormLayer *bnl); 22 | void cubnormLayer_set(ftens *mean, ftens *istd, 23 | ftens *gamma, ftens *beta, 24 | cubnormLayer *bnl); 25 | 26 | 27 | #endif /* ESP_CUBNORN_H */ 28 | -------------------------------------------------------------------------------- /include/layers/cuconv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUCONV_H 2 | #define ESP_CUCONV_H 3 | 4 | #include "cutens.h" 5 | #include "conv.h" 6 | 7 | 8 | typedef struct { 9 | int D, M, N, L, Sm, Sn, p; 10 | cuftens W, b, out; 11 | cuftens dW, db, in; 12 | } cuconvLayer; 13 | 14 | 15 | cuconvLayer cuconvLayer_init(int Sm, int Sn, int p); 16 | void cuconvLayer_free(cuconvLayer *cl); 17 | void cuftens_print(cuftens *t); 18 | void cuconvLayer_print_shape(cuconvLayer *cl); 19 | void cuconvLayer_convert(convLayer *src, cuconvLayer *dst); 20 | void cuconvLayer_set(ftens *W, cuconvLayer *cl); 21 | cuftens cuconvLayer_pad_input(cuftens *t, int p); 22 | cuftens cuconvLayer_lower_input(cuftens *t, cuconvLayer *cl); 23 | void cuconvLayer_forward(cuftens *t, cuconvLayer *cl, int save); 24 | void cuconvLayer_backward(cuftens *dout, cuconvLayer *cl); 25 | 26 | 27 | #endif /* ESP_CUCONV_H */ 28 | -------------------------------------------------------------------------------- /include/layers/cudense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUDENSE_H 2 | #define ESP_CUDENSE_H 3 | 4 | 
#include "cutens.h" 5 | #include "dense.h" 6 | 7 | 8 | typedef struct { 9 | int M, N; 10 | cuftens W, b, out; 11 | cuftens dW, db, in; 12 | } cudenseLayer; 13 | 14 | 15 | cudenseLayer cudenseLayer_init(int M, int N); 16 | void cudenseLayer_convert(denseLayer *src, cudenseLayer *dst); 17 | void cudenseLayer_free(cudenseLayer *dl); 18 | void cudenseLayer_print_size(cudenseLayer *dl); 19 | void cudenseLayer_set(ftens *W, cudenseLayer *dl); 20 | void cudenseLayer_forward(cuftens *t, cudenseLayer *dl, int save); 21 | void cudenseLayer_backward(cuftens *dt, cudenseLayer *dl); 22 | 23 | 24 | #endif /* ESP_CUDENSE_H */ 25 | -------------------------------------------------------------------------------- /include/layers/cuinput.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUINPUT_H 2 | #define ESP_CUINPUT_H 3 | 4 | #include "tens.h" 5 | #include "cutens.h" 6 | 7 | 8 | typedef struct { 9 | cuftens out; 10 | } cuinputLayer; 11 | 12 | 13 | void cuinputLayer_forward(ftens *t, cuinputLayer *il, int norm); 14 | void cuinputLayer_free(cuinputLayer *il); 15 | 16 | 17 | #endif /* ESP_CUINPUT_H */ 18 | -------------------------------------------------------------------------------- /include/layers/cupconv.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPCONV_H 2 | #define ESP_CUPCONV_H 3 | 4 | #include "conv.h" 5 | #include "cuptens.h" 6 | 7 | 8 | #define CUPCONVL_INIT(cl) \ 9 | (cl.D=0, cl.M=0, cl.N=0, cl.L=0, \ 10 | cl.Sm=0, cl.Sn=0, cl.p=0, \ 11 | cl.W. data=NULL, cl.pout.data=NULL, \ 12 | cl.dW. data=NULL, cl.out .data=NULL, \ 13 | cl.fix.data=NULL, cl.bfix.data=NULL, \ 14 | cl.in. 
data=NULL) 15 | 16 | 17 | typedef struct { 18 | int D, M, N, L, Sm, Sn, p; 19 | cuptens W, pout; 20 | cuftens dW, out, in; 21 | cuftens fix, bfix; 22 | } cupconvLayer; 23 | 24 | 25 | cupconvLayer cupconvLayer_init(); 26 | cupconvLayer cupconvLayer_init(int Sm, int Sn, int p); 27 | void cupconvLayer_convert(convLayer *src, cupconvLayer *dst, int fix); 28 | void cupconvLayer_set(ftens *W, cupconvLayer *cl, int fix); 29 | void cupconvLayer_forward(cuptens *t, cupconvLayer *cl); 30 | void cupconvLayer_forward_initial(cuftens *t, cupconvLayer *cl, float norm); 31 | void cupconvLayer_free(cupconvLayer *cl); 32 | 33 | 34 | 35 | #endif /* ESP_CUPCONV_H */ 36 | -------------------------------------------------------------------------------- /include/layers/cupdense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPDENSE_H 2 | #define ESP_CUPDENSE_H 3 | 4 | #include "cuptens.h" 5 | #include "dense.h" 6 | 7 | 8 | #define CUPDENSEL_INIT(dl) \ 9 | (dl.M=0, dl.N=0, \ 10 | dl.W. data=NULL, dl.in. 
data=NULL, dl.pout.data=NULL, \ 11 | dl.dW.data=NULL, dl.out.data=NULL, dl.fix.data=NULL) 12 | 13 | 14 | typedef struct { 15 | int M, N; 16 | cuptens W, in, pout; 17 | cuftens dW, out, fix; 18 | } cupdenseLayer; 19 | 20 | 21 | cupdenseLayer cupdenseLayer_init(); 22 | void cupdenseLayer_convert(denseLayer *src, cupdenseLayer *dst, int fix); 23 | void cupdenseLayer_free(cupdenseLayer *dl); 24 | void cupdenseLayer_print_size(cupdenseLayer *dl); 25 | void cupdenseLayer_set(ftens *W, cupdenseLayer *dl, int fix); 26 | void cupdenseLayer_forward(cuptens *t, cupdenseLayer *dl, int save); 27 | void cupdenseLayer_forward_initial(cuftens *t, cupdenseLayer *dl, float norm); 28 | 29 | 30 | #endif /* ESP_CUPDENSE_H */ 31 | -------------------------------------------------------------------------------- /include/layers/cupinput.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPINPUT_H 2 | #define ESP_CUPINPUT_H 3 | 4 | /* #include "cuptens.h" */ 5 | 6 | 7 | /* typedef struct { */ 8 | /* cuptens out; */ 9 | /* } cupinputLayer; */ 10 | 11 | 12 | /* cupinputLayer cupinputLayer_init(); */ 13 | /* void cupinputLayer_forward(ftens *t, cupinputLayer *il); */ 14 | /* void cupinputLayer_free(cupinputLayer *il); */ 15 | 16 | 17 | #endif /* ESP_CUPINPUT_H */ 18 | -------------------------------------------------------------------------------- /include/layers/cupool.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPOOL_H 2 | #define ESP_CUPOOL_H 3 | 4 | #include "pool.h" 5 | #include "cutens.h" 6 | 7 | 8 | typedef struct { 9 | int M, N, Sm, Sn; 10 | pool_t op; 11 | cuftens out, mask; 12 | } cupoolLayer; 13 | 14 | 15 | cupoolLayer cupoolLayer_init(int M, int N, int Sm, int Sn); 16 | void cupoolLayer_free(cupoolLayer *pl); 17 | void cupoolLayer_convert(poolLayer *src, cupoolLayer *dst); 18 | void cupoolLayer_forward(cuftens *t, cupoolLayer *pl); 19 | void cupoolLayer_backward(cuftens *dt, 
cupoolLayer *pl); 20 | void cupoolLayer_print(cupoolLayer *pl); 21 | 22 | 23 | #endif /* ESP_CUPOOL_H */ 24 | -------------------------------------------------------------------------------- /include/layers/dense.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_DENSE_H 2 | #define ESP_DENSE_H 3 | 4 | #include "tens.h" 5 | 6 | 7 | #define DENSEL_INIT(dl) ( \ 8 | dl.M=0, dl.N=0, \ 9 | dl.W .data=NULL, dl.dW .data=NULL, \ 10 | dl.b .data=NULL, dl.db .data=NULL, \ 11 | dl.in.data=NULL, dl.out.data=NULL) 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | typedef struct { 19 | int M, N; 20 | ftens W, b, dW, db; 21 | ftens in, out; 22 | } denseLayer; 23 | 24 | denseLayer denseLayer_init(int M, int N); 25 | void denseLayer_print_shape(denseLayer *dl); 26 | void denseLayer_free(denseLayer *dl); 27 | void denseLayer_set(ftens *W, denseLayer *dl); 28 | void denseLayer_forward(ftens *t, denseLayer *dl, int cpy); 29 | void denseLayer_backward(ftens *dt, denseLayer *dl); 30 | 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | 36 | #endif /* ESP_DENSE_H */ 37 | -------------------------------------------------------------------------------- /include/layers/input.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_INPUT_H 2 | #define ESP_INPUT_H 3 | 4 | #include "tens.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | ftens out; 12 | } inputLayer; 13 | 14 | void inputLayer_load(ftens *in, inputLayer *il); 15 | void inputLayer_free(inputLayer *il); 16 | void inputLayer_forward(inputLayer *il); 17 | void inputLayer_pad(inputLayer *il, int p); 18 | 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif /* ESP_INPUT_H */ 25 | -------------------------------------------------------------------------------- /include/layers/pool.h: -------------------------------------------------------------------------------- 1 | 
#ifndef ESP_POOL_H 2 | #define ESP_POOL_H 3 | 4 | #include "tens.h" 5 | 6 | #define POOLL_INIT(pl) \ 7 | (pl.M=0, pl.N=0, pl.Sm=0, pl.Sn=0, pl.op=MAX, \ 8 | pl.out.data=NULL, pl.mask.data=NULL) 9 | 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | typedef enum {MAX, AVG} pool_t; 16 | 17 | typedef struct { 18 | int M, N, Sm, Sn; pool_t op; 19 | ftens out, mask; 20 | } poolLayer; 21 | 22 | 23 | poolLayer poolLayer_init(int M, int N, int Sm, int Sn); 24 | void poolLayer_free(poolLayer *pl); 25 | void poolLayer_forward(ftens *t, poolLayer *pl); 26 | void poolLayer_backward(ftens *dout, poolLayer *pl); 27 | 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | 34 | #endif /* ESP_POOL_H */ 35 | -------------------------------------------------------------------------------- /include/nn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_NN_H 2 | #define ESP_NN_H 3 | 4 | #include "nn/mlp.h" 5 | #include "nn/cnn.h" 6 | 7 | 8 | #endif /* ESP_NN_H */ 9 | -------------------------------------------------------------------------------- /include/nn/cnn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CNN_H 2 | #define ESP_CNN_H 3 | 4 | #include "tens.h" 5 | #include "layers.h" 6 | 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | typedef struct { 13 | int Ncl, Npl, Ndl, Nbnl; 14 | inputLayer il; 15 | convLayer *cl; 16 | poolLayer *pl; 17 | denseLayer *dl; 18 | bnormLayer *bnl; 19 | } cnn; 20 | 21 | cnn cnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 22 | cnn cnn_load(const char *esp, int bin, int rev); 23 | void cnn_free(cnn *net); 24 | void cnn_print(cnn *net); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* ESP_CNN_H */ 32 | -------------------------------------------------------------------------------- /include/nn/cucnn.h: -------------------------------------------------------------------------------- 1 | #ifndef 
ESP_CUMLP_H 2 | #define ESP_CUMLP_H 3 | 4 | #include "culayers.h" 5 | #include "cnn.h" 6 | 7 | 8 | typedef struct { 9 | int Ncl, Npl, Ndl, Nbnl; 10 | cuinputLayer il; 11 | cuconvLayer *cl; 12 | cupoolLayer *pl; 13 | cudenseLayer *dl; 14 | cubnormLayer *bnl; 15 | } cucnn; 16 | 17 | 18 | cucnn cucnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 19 | void cucnn_print(cucnn *net); 20 | void cucnn_free(cucnn *net); 21 | cucnn cucnn_convert(cnn *net); 22 | 23 | 24 | #endif /* ESP_CUMLP_H */ 25 | -------------------------------------------------------------------------------- /include/nn/cumlp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUMLP_H 2 | #define ESP_CUMLP_H 3 | 4 | #include "mlp.h" 5 | #include "culayers.h" 6 | 7 | typedef struct { 8 | int Ndl, Nbnl; 9 | cuinputLayer il; 10 | cudenseLayer *dl; 11 | cubnormLayer *bnl; 12 | } cumlp; 13 | 14 | 15 | cumlp cumlp_init(int Ndl, int Nbnl); 16 | cumlp cumlp_convert(mlp *net); 17 | 18 | void cumlp_print(cumlp *net); 19 | void cumlp_free(cumlp *net); 20 | 21 | 22 | 23 | #endif /* ESP_CUMLP_H */ 24 | -------------------------------------------------------------------------------- /include/nn/cupcnn.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_CUPCNN_H 2 | #define ESP_CUPCNN_H 3 | 4 | #include "culayers.h" 5 | #include "cnn.h" 6 | 7 | 8 | typedef struct { 9 | int Ncl, Npl, Ndl, Nbnl; 10 | cuinputLayer il; 11 | cupconvLayer *cl; 12 | cupoolLayer *pl; 13 | cupdenseLayer *dl; 14 | cubnormLayer *bnl; 15 | } cupcnn; 16 | 17 | 18 | cupcnn cupcnn_init(int Ncl, int Npl, int Ndl, int Nbnl); 19 | cupcnn cupcnn_convert(cnn *nn); 20 | void cupcnn_print(cupcnn *nn); 21 | void cupcnn_free(cupcnn *nn); 22 | 23 | 24 | #endif /* ESP_CUPCNN_H */ 25 | -------------------------------------------------------------------------------- /include/nn/cupmlp.h: -------------------------------------------------------------------------------- 1 | 
#ifndef ESP_PMLP_H 2 | #define ESP_PMLP_H 3 | 4 | #include "mlp.h" 5 | #include "culayers.h" 6 | 7 | 8 | typedef struct { 9 | int Ndl, Nbnl; 10 | cuinputLayer il; 11 | cupdenseLayer *dl; 12 | cubnormLayer *bnl; 13 | } cupmlp; 14 | 15 | 16 | cupmlp cupmlp_init(int Ndl, int Nbnl); 17 | cupmlp cupmlp_convert(mlp *nn); 18 | 19 | void cupmlp_print(cupmlp *nn); 20 | void cupmlp_free(cupmlp *nn); 21 | 22 | 23 | 24 | #endif /* ESP_PMLP_H */ 25 | -------------------------------------------------------------------------------- /include/nn/mlp.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_MLP_H 2 | #define ESP_MLP_H 3 | 4 | #include "layers.h" 5 | 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | typedef struct { 12 | int Ndl, Nbnl; 13 | inputLayer il; 14 | denseLayer *dl; 15 | bnormLayer *bnl; 16 | } mlp; 17 | 18 | 19 | mlp mlp_init(int Ndl, int Nbnl); 20 | mlp mlp_load(const char *esp, int bin); 21 | void mlp_free(mlp *net); 22 | void mlp_print(mlp *net); 23 | 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | 30 | #endif /* ESP_MLP_H */ 31 | -------------------------------------------------------------------------------- /include/params.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_PARAMS_H 2 | #define ESP_PARAMS_H 3 | 4 | #include "layers.h" 5 | 6 | #define INPUTL 0 7 | #define CONVL 3 8 | #define POOLL 4 9 | #define DENSEL 1 10 | #define BNORML 2 11 | #define LNUM (1<<4) 12 | #define LDAT (2<<4) 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | void load_denseLayer(denseLayer *dl, FILE * const pf, int bin); 19 | void load_bnormLayer(bnormLayer *bnl, FILE * const pf); 20 | void load_convLayer(convLayer *cl, FILE * const pf, 21 | int bin, int rev); 22 | void load_poolLayer(poolLayer *pl, FILE * const pf); 23 | 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | 30 | #endif /* ESP_PARAMS_H */ 31 | 
-------------------------------------------------------------------------------- /include/tens.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_TENS_H 2 | #define ESP_TENS_H 3 | 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | int D, M, N, L, MNL; 12 | int bytes; 13 | float *data; 14 | } ftens; 15 | 16 | 17 | ftens ftens_init(int D, int M, int N, int L); 18 | ftens ftens_zeros(int D, int M, int N, int L); 19 | ftens ftens_ones(int D, int M, int N, int L); 20 | ftens ftens_rand(int D, int M, int N, int L); 21 | ftens ftens_rand_range(int D, int M, int N, int L, 22 | float min, float max); 23 | 24 | ftens ftens_copy(ftens *in); 25 | ftens ftens_copy_pad(ftens *t, int p); 26 | 27 | ftens ftens_from_ptr(int D, int M, int N, int L, float *ptr); 28 | ftens ftens_from_file(int D, int M, int N, int L, FILE *pf); 29 | 30 | ftens ftens_copy_tch(ftens *a); 31 | void ftens_tch(ftens *a, ftens *b); 32 | void ftens_clear(ftens *t); 33 | void ftens_reshape(ftens *t, int D, int M, int N, int L); 34 | void ftens_pad(ftens *src, ftens *dst, int p); 35 | void ftens_maxpool(ftens *src, ftens *dst, int W, int H, 36 | int Sx, int Sy); 37 | 38 | void ftens_lower(ftens *src, ftens *dst, 39 | int W, int H, int Sx, int Sy); 40 | 41 | void ftens_sign(ftens *t); 42 | void ftens_free(ftens *t); 43 | void ftens_print_shape(ftens *t); 44 | void ftens_print(ftens *t, const char *fmt); 45 | void ftens_print_ch(ftens *t, int w, int k, int I, int J, 46 | const char *fmt); 47 | 48 | static inline 49 | int ftens_len(ftens *t) {return t->bytes/sizeof(float);} 50 | 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif /* ESP_TENS_H */ 57 | -------------------------------------------------------------------------------- /include/timez.h: -------------------------------------------------------------------------------- 1 | #ifndef ESP_TIME_H 2 | #define ESP_TIME_H 3 | 4 | #include 5 | 6 | #define 
TIME_START() time_record(&aaat1) 7 | #define TIME_STOP() time_record(&aaat2) 8 | #define TIME_STOP_PRINT() TIME_STOP(); elapsed_time(aaat1, aaat2) 9 | #define TIME_STOP_SAVE(t) TIME_STOP(); t = elapsed_time(aaat1, aaat2) 10 | 11 | 12 | typedef struct timeval myTime; 13 | myTime aaat1, aaat2; 14 | 15 | void time_record(myTime *time); 16 | long elapsed_time(myTime start, myTime stop); 17 | 18 | #endif /* ESP_TIME_H */ 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /include/util.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | 6 | #define FPTR d_fscratch 7 | #define PPTR d_pscratch 8 | #define FPTR_INC(x, len) d_fscratch ? x + len : NULL 9 | #define PPTR_INC(x, len) d_pscratch ? x + len : NULL 10 | 11 | #define cuHtoD cudaMemcpyHostToDevice 12 | #define cuDtoH cudaMemcpyDeviceToHost 13 | #define cuDtoD cudaMemcpyDeviceToDevice 14 | 15 | #ifndef cuD 16 | #define cuD(x) ((x).d_data) 17 | #endif 18 | 19 | #define cuASSERT(exp, msg) \ 20 | if (!(exp)) { \ 21 | fprintf(stderr, msg); \ 22 | exit(-1); \ 23 | } 24 | 25 | #define CUDA_SAFE_CALL(call) \ 26 | { \ 27 | cudaError_t err = call; \ 28 | if (cudaSuccess != err) { \ 29 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 30 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } 34 | 35 | 36 | #define CHECK_LAUNCH_ERROR() \ 37 | { \ 38 | cudaError_t err = cudaGetLastError(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | err = cudaThreadSynchronize(); \ 45 | if (cudaSuccess != err) { \ 46 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 47 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 48 | exit(EXIT_FAILURE); \ 49 | } \ 50 | } 51 | 
#ifndef ESP_UTIL_H
#define ESP_UTIL_H

/* NOTE(review): the stdlib include names were lost in the dump this was
 * recovered from; the set below covers every name used in this header
 * (printf/fprintf/fread, malloc/calloc/exit, uint8_t, ceilf, assert). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include <assert.h>


#ifdef __cplusplus
extern "C" {
#endif

/* Row-major indexing helpers. */
#define ID2(i, j, N) ((i)*(N) + (j))
#define ID3(i, j, k, N, L) (ID2(i,j,N)*(L) + (k))
#define IDX(i, j, k, N, L) (((i)*N + (j))*(L) + (k))

#define SET(x, y) (x).y = y
#define PSET(x, y) (x)->y = y

#define PAD(x, p) ((x) + ((p)<<1))
#define CEIL(x, y) (((x) + (y) - 1) / (y))
#define ROUND_UP(x, y) (CEIL(x, y) * (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* Output length of a valid convolution/pool with window y and stride z. */
#define OUT_LEN(x, y, z) (ceilf(((x)-(y)+1)/(float)z))

#define SP printf(" ");
#define NL printf("\n");
#define SEP printf(" | ");
#define D(t) (t.data)
#define LEN(t) (t.bytes/sizeof(float))
#define FOR(x,y,n) for (int x=y; x<(n); x++)
/* BUG FIX: these previously passed sizeof(int) / sizeof(uint8_t) as the
 * `type` argument of FREAD, which expands to sizeof(sizeof(int)) ==
 * sizeof(size_t) inside FREAD and therefore read 8 bytes per int on
 * LP64 platforms.  FREAD expects the bare type name. */
#define READ_INT(x, pf) FREAD(x, int, 1, pf, "asd")
#define READ_UINT8(x, pf) FREAD(x, uint8_t, 1, pf, "asd")
#define MALLOC(type, num) (type *) malloc((num) * sizeof(type))
#define BYTES(type, num) ((num) * sizeof(type))
#define CALLOC(type, num) (type *) calloc(num, sizeof(type))

/* Read `num` items of `type` from pf into x; abort with msg on short read. */
#define FREAD(x, type, num, pf, msg) \
    if (fread(x, sizeof(type), num, pf) != (size_t)(num)) { \
        fprintf(stderr, msg); \
        exit(2); \
    }

#ifdef NDEBUG
/* BUG FIX: the NDEBUG branch referenced undefined EXP/MSG instead of the
 * macro parameters.  assert() is a no-op under NDEBUG anyway, so this is
 * only a compile-hygiene fix. */
#define ASSERT(exp, msg) assert((exp) && (msg))

#else
#define ASSERT(exp, msg) \
    if (!(exp)) { \
        fprintf(stderr, msg); \
        exit(1); \
    }
#endif


#ifdef __cplusplus
}
#endif

#endif /* ESP_UTIL_H */
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Usage: readEspresso.py <infile>

Options:
    -h --help
"""
# NOTE(review): the original docopt argument name was lost in extraction;
# <infile> is a reconstruction — the Usage string and the args[...] key
# below just have to agree.

import os
import sys
import struct
import numpy as np

# Layer-type tags and record-kind flags; must match include/params.h
# (INPUTL/DENSEL/BNORML/CONVL/POOLL and LNUM/LDAT).
INPUT = 0
DENSE = 1
BNORM = 2
CONV = 3
POOL = 4
NUM = 1 << 4
DATA = 2 << 4


def pop_string(buf, n):
    """Split *buf* into (first n bytes, remainder)."""
    return buf[:n], buf[n:]


def uint8_read(buf):
    """Pop one unsigned byte; return (value, remainder)."""
    head, rest = pop_string(buf, 1)
    return struct.unpack('B', head)[0], rest


def int32_read(buf):
    """Pop one native int32; return (value, remainder)."""
    head, rest = pop_string(buf, 4)
    return struct.unpack('i', head)[0], rest


def float_read(buf):
    """Pop one float32; return (value, remainder).

    BUG FIX: the original body referenced undefined globals ``string``
    and ``i`` and raised NameError when called.
    """
    head, rest = pop_string(buf, 4)
    return struct.unpack('f', head)[0], rest


def main(infile):
    """Parse an .esp parameter dump and return the list of numpy arrays.

    Prints a one-line summary per record as it goes.
    """
    assert os.path.exists(infile)
    parameters = []
    with open(infile, 'rb') as f:
        data = f.read()

    while data:
        caz, data = uint8_read(data)

        if caz == DENSE | NUM:
            num, data = int32_read(data)
            print('ndense %d' % num)

        elif caz == BNORM | NUM:
            num, data = int32_read(data)
            print('nbnorm %d' % num)

        elif caz == POOL | NUM:
            num, data = int32_read(data)
            print('npool %d' % num)

        elif caz == CONV | NUM:
            num, data = int32_read(data)
            print('nconv %d' % num)

        elif caz == INPUT | DATA:
            # Input layers carry no payload in current dumps.
            pass

        elif caz == DENSE | DATA:
            # M x N weight matrix followed by M biases.
            caz, data = pop_string(data, 4 * 2)
            M, N = struct.unpack('2i', caz)
            print('dense %d %d' % (M, N))
            caz, data = pop_string(data, 4 * M * N)
            parameters.append(
                np.frombuffer(caz, dtype=np.float32).reshape((M, N)))
            caz, data = pop_string(data, 4 * M)
            parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == BNORM | DATA:
            # Four length-`dim` vectors: mean, istd, gamma, beta.
            dim, data = int32_read(data)
            print('bnorm %d' % dim)
            for _ in range(4):
                caz, data = pop_string(data, 4 * dim)
                parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == CONV | DATA:
            caz, data = pop_string(data, 4 * 7)
            pad, nfil, M, N, nch, Sx, Sy = struct.unpack('7i', caz)
            print('conv %d %d %d %d %d %d %d'
                  % (pad, nfil, M, N, nch, Sx, Sy))
            caz, data = pop_string(data, 4 * M * N * nch * nfil)
            parameters.append(
                np.frombuffer(caz, dtype=np.float32)
                  .reshape((nfil, nch, M, N)))
            caz, data = pop_string(data, 4 * nfil)
            parameters.append(np.frombuffer(caz, dtype=np.float32))

        elif caz == POOL | DATA:
            # Pool layers carry only their 4-int geometry, no weights.
            dim, data = pop_string(data, 4 * 4)
            dim = struct.unpack('4i', dim)
            print('pool %d %d %d %d' % tuple(dim))

        else:
            print('caz')

    return parameters


if __name__ == '__main__':
    from docopt import docopt

    args = docopt(__doc__)
    params = main(args['<infile>'])
    import pdb; pdb.set_trace()
cupack(src, out); 11 | } 12 | -------------------------------------------------------------------------------- /src/activations/relul.c: -------------------------------------------------------------------------------- 1 | #include "tens.h" 2 | #include "util.h" 3 | 4 | 5 | void relu_forward(ftens *t) 6 | { 7 | const int len = ftens_len(t); 8 | for (int i=0; i < len; i++) 9 | t->data[i] = MAX(0, t->data[i]); 10 | } 11 | 12 | void relu_backward(ftens *dout) 13 | { 14 | fprintf(stderr, "not implemeted yer\n"); 15 | exit(-4); 16 | } 17 | -------------------------------------------------------------------------------- /src/activations/signl.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | void signAct_forward(ftens *t) 5 | { 6 | ftens_sign(t); 7 | //const int len = ftens_len(t); 8 | //for (int i=0; i < len; i++) 9 | //t->data[i] = 2.0f * (t->data[i] > 0.0f) - 1.0f; 10 | } 11 | 12 | 13 | void signAct_backward(ftens *t) 14 | { 15 | fprintf(stderr, "not implemeted yet\n"); 16 | exit(-4); 17 | } 18 | -------------------------------------------------------------------------------- /src/activations/signl.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "kernels.h" 3 | 4 | 5 | void cusignAct_forward(cuftens *t) 6 | { 7 | cusign(t); 8 | } 9 | 10 | 11 | void cusignAct_backward(cuftens *dout) 12 | { 13 | fprintf(stderr, "not implemented yet\n"); 14 | exit(-4); 15 | } 16 | -------------------------------------------------------------------------------- /src/activations/softmaxl.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | void softmax_forward(ftens *t) 5 | { 6 | fprintf(stderr, "not implemeted yet\n"); 7 | exit(-4); 8 | } 9 | 10 | 11 | void softmax_backward(ftens *dout) 12 | { 13 | fprintf(stderr, "not implemeted yet\n"); 14 | exit(-4); 15 | } 
16 | -------------------------------------------------------------------------------- /src/cnn.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers.h" 3 | #include "params.h" 4 | #include "nn/cnn.h" 5 | 6 | 7 | cnn cnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 8 | { 9 | cnn out = {Ncl, Npl, Ndl, Nbnl}; 10 | out.cl = Ncl ? MALLOC(convLayer, Ncl) : NULL; 11 | out.pl = Npl ? MALLOC(poolLayer, Npl) : NULL; 12 | out.dl = Ndl ? MALLOC(denseLayer, Ndl) : NULL; 13 | out.bnl = Nbnl ? MALLOC(bnormLayer, Nbnl) : NULL; 14 | for (int i=0; iNcl, net->Npl, net->Ndl, net->Nbnl); 65 | } 66 | 67 | void cnn_free(cnn *net) 68 | { 69 | inputLayer_free(&net->il); 70 | for (int i=0; iNcl; i++) convLayer_free(net->cl + i); 71 | for (int i=0; iNpl; i++) poolLayer_free(net->pl + i); 72 | for (int i=0; iNdl; i++) denseLayer_free(net->dl + i); 73 | for (int i=0; iNbnl; i++) bnormLayer_free(net->bnl + i); 74 | } 75 | -------------------------------------------------------------------------------- /src/cnn.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "culayers.h" 3 | #include "nn/cucnn.h" 4 | 5 | 6 | cucnn cucnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 7 | { 8 | cucnn out = {Ncl, Npl, Ndl, Nbnl}; 9 | out.cl = Ncl ? MALLOC(cuconvLayer, Ncl) : NULL; 10 | out.pl = Npl ? MALLOC(cupoolLayer, Npl) : NULL; 11 | out.dl = Ndl ? MALLOC(cudenseLayer, Ndl) : NULL; 12 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 13 | for (int i=0; iil); 23 | for (int i=0; iNcl; i++) cuconvLayer_free(net->cl + i); 24 | for (int i=0; iNpl; i++) cupoolLayer_free(net->pl + i); 25 | for (int i=0; iNdl; i++) cudenseLayer_free(net->dl + i); 26 | for (int i=0; iNbnl; i++) cubnormLayer_free(net->bnl + i); 27 | } 28 | 29 | 30 | cucnn cucnn_convert(cnn *net) 31 | { 32 | const int Ncl=net->Ndl, Npl=net->Npl; 33 | const int Ndl=net->Ndl, Nbnl=net->Nbnl; 34 | cucnn out = cucnn_init(Ncl, Npl, Ndl, Nbnl); 35 | for (int i=0; icl[i], &out.cl[i]); 37 | 38 | for (int i=0; ipl[i], &out.pl[i]); 40 | 41 | for (int i=0; idl[i], &out.dl[i]); 43 | 44 | for (int i=0; ibnl[i], &out.bnl[i]); 46 | 47 | return out; 48 | } 49 | 50 | 51 | void cucnn_print(cucnn *net) 52 | { 53 | printf("CUCNN: Ncl=%d Npl=%d Ndl=%d Nbnl=%d\n", 54 | net->Ncl, net->Npl, net->Ndl, net->Nbnl); 55 | } 56 | -------------------------------------------------------------------------------- /src/cumem.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cumem.h" 3 | 4 | #define PTR_INIT(x) (x.total=0, x.used=0, x.base=NULL, x.curr=NULL) 5 | 6 | cufmem fptr = {0, 0, NULL, NULL}; 7 | cupmem pptr = {0, 0, NULL, NULL}; 8 | 9 | 10 | int _cufmem() {return fptr.base != NULL;} 11 | int _cupmem() {return pptr.base != NULL;} 12 | 13 | 14 | void cufmem_alloc(int bytes) 15 | { 16 | fptr.total = bytes; 17 | fptr.used = 0; 18 | cudaMalloc(&fptr.base, bytes); 19 | cuASSERT(fptr.base, "err:\n"); 20 | fptr.curr = fptr.base; 21 | } 22 | 23 | float *cufmem_reserve(int bytes) 24 | { 25 | float *out = fptr.curr; 26 | cuASSERT(fptr.base, "err: cufmem not init \n"); 27 | cuASSERT(fptr.used + bytes < fptr.total, "err: out of cufmem\n"); 28 | fptr.used += bytes; 29 | fptr.curr += (bytes/sizeof(float)); 30 | return out; 31 | } 32 | 33 | void cufmem_free() 34 | { 35 | if (fptr.base) cudaFree(fptr.base); 36 | PTR_INIT(fptr); 37 | } 38 | 39 | void cufmem_reset() 40 | { 41 | 
fptr.used = 0; 42 | fptr.curr = fptr.base; 43 | } 44 | 45 | void cupmem_alloc(int bytes) 46 | { 47 | pptr.total = bytes; 48 | pptr.used = 0; 49 | cudaMalloc(&pptr.base, bytes); 50 | cuASSERT(pptr.base, "err:\n"); 51 | pptr.curr = pptr.base; 52 | } 53 | 54 | uint64_t *cupmem_reserve(int bytes) 55 | { 56 | uint64_t *out = pptr.curr; 57 | cuASSERT(pptr.base, "err: cumem not init \n"); 58 | cuASSERT(pptr.used + bytes < pptr.total, "err: out of cupmem\n"); 59 | pptr.used += bytes; 60 | pptr.curr += (bytes/sizeof(uint64_t)); 61 | return out; 62 | } 63 | 64 | void cupmem_free() 65 | { 66 | if (pptr.base) cudaFree(pptr.base); 67 | PTR_INIT(pptr); 68 | } 69 | 70 | void cupmem_reset() 71 | { 72 | pptr.used = 0; 73 | pptr.curr = pptr.base; 74 | } 75 | -------------------------------------------------------------------------------- /src/handlers/cifar.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "tens.h" 3 | 4 | #define W 32 5 | #define H 32 6 | #define L 3 7 | #define TRAIN_IMG 50000 8 | #define TEST_IMG 10000 9 | 10 | 11 | void cifar10_load_Xy(const char *tf, int start, int num, 12 | ftens *X, ftens *y) 13 | { 14 | ASSERT(start + num < TEST_IMG, "err: cifar num\n"); 15 | ASSERT(X->MNL == W*H*L, "err: input shape\n"); 16 | uint8_t X_buff[W*H*L]; 17 | uint8_t y_buff; 18 | FILE *pf = fopen(tf, "rb"); 19 | ASSERT(pf, "err: fopen \n"); 20 | ftens tmpX = ftens_init(num, L, W, H); 21 | ftens_clear(y); 22 | fseek(pf, (W*H*L+1)*start, SEEK_SET); 23 | for (int i=0; i < num; i++) { 24 | float *outX = tmpX.data + i * tmpX.MNL; 25 | float *outy = y->data + i * y->MNL; 26 | fread(&y_buff, sizeof(uint8_t), 1, pf); 27 | fread(X_buff, sizeof(uint8_t), W*H*L, pf); 28 | outy[y_buff] = 1.0f; 29 | for (int i=0; i < W*H*L; i++) 30 | outX[i] = (float) X_buff[i]; 31 | } 32 | 33 | ftens_tch(&tmpX, X); 34 | ftens_free(&tmpX); 35 | } 36 | 37 | #undef W 38 | #undef H 39 | #undef L 40 | 
-------------------------------------------------------------------------------- /src/handlers/mnist.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | #include "tens.h" 6 | 7 | #define W 28 8 | #define H 28 9 | #define C 1 10 | #define IMG_B W*H 11 | #define LAB_B 1 12 | #define TRAIN_IMG 6000 13 | #define TRAIN_HB 16 14 | #define TEST_HB 8 15 | 16 | 17 | void mnist_load_X(const char *tf, int start, int num, 18 | ftens *X) 19 | { 20 | ASSERT(start + num < TRAIN_IMG, "mnist err img"); 21 | uint8_t buff[W*H]; 22 | FILE *pf = fopen(tf, "rb"); ASSERT(pf, "mnist err tf"); 23 | fseek(pf, TRAIN_HB + start * IMG_B, SEEK_SET); 24 | for (int i=0; i < num; i++) { 25 | float *out = X->data + i*X->MNL; 26 | FREAD(buff, uint8_t, IMG_B, pf, "mnist err read"); 27 | for (int i=0; i < W*H; i++) 28 | out[i] = (float) buff[i]; 29 | } 30 | fclose(pf); 31 | } 32 | 33 | 34 | void mnist_load_y(const char *lf, int start, int num, ftens *y) 35 | { 36 | ASSERT(start + num < TRAIN_IMG, "mnist err lab\n"); 37 | float *out = y->data; uint8_t buff; 38 | FILE *pf = fopen(lf, "rb"); ASSERT(pf, "mnist err rl\n"); 39 | fseek(pf, TEST_HB + start * LAB_B, SEEK_SET); 40 | for (int i=0; i < num; i++) { 41 | memset(out, 0, sizeof(float) * 10); 42 | fread(&buff, sizeof(uint8_t), 1, pf); 43 | out[buff] = 1.0f; 44 | out += y->MNL; 45 | } 46 | } 47 | 48 | 49 | #undef W 50 | #undef H 51 | #undef C 52 | #undef HEADER_BYTES 53 | #undef IMG_B 54 | #undef LAB_B 55 | #undef TRAIN_IMG 56 | #undef TRAIN_HB 57 | #undef TEST_HB 58 | -------------------------------------------------------------------------------- /src/kernels/bnorm.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "bnorm.cuh" 3 | #include "cutens.h" 4 | 5 | void cubnorm (cuftens *mean, cuftens *istd, 6 | cuftens *beta, cuftens *gamma, 7 | cuftens *in) 8 | { 9 | const int TS=32; 10 | const int 
M=in->N, N=in->N, L=in->L; 11 | const int len = cuftens_len(in); 12 | 13 | ker_bnorm <<>> 14 | (mean->data, istd->data, 15 | beta->data, gamma->data, len, L > 1 ? L : M*N, 16 | in->data); 17 | } 18 | -------------------------------------------------------------------------------- /src/kernels/bnorm.cuh: -------------------------------------------------------------------------------- 1 | static __global__ 2 | void ker_bnorm (const float * __restrict__ mean, 3 | const float * __restrict__ istd, 4 | const float * __restrict__ beta, 5 | const float * __restrict__ gamma, 6 | const int len, const int N, 7 | float * __restrict__ dst) 8 | { 9 | int i=threadIdx.x + blockIdx.x * blockDim.x; 10 | 11 | if (i >= len) return; 12 | 13 | dst[i] = (dst[i] - mean[i%N]) * istd[i%N] * gamma[i%N] + beta[i%N]; 14 | } 15 | -------------------------------------------------------------------------------- /src/kernels/bp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "bp.cuh" 3 | #include "cuptens.h" 4 | 5 | 6 | void cubp_split_pack(cuftens *src, cuptens *dst) 7 | { 8 | int D=src->D, Ns=src->MNL, Nd=dst->X*2; 9 | 10 | dim3 grid(D, CEIL(Ns, 32)); 11 | dim3 block(1, 32); 12 | 13 | ker_bpsplit <8> <<>> 14 | (src->data, (uint32_t *)dst->data, Ns, Nd); 15 | 16 | } 17 | 18 | void cubp_merge(cuftens *src, cuftens *dst, cuftens *fix, float norm) 19 | { 20 | const int D=src->D, N=src->MNL/8; 21 | 22 | dim3 grid(D, CEIL(N, 32)); 23 | dim3 block(8, 32); 24 | 25 | ker_bpmerge <8> <<>> 26 | (src->data, dst->data, fix->data, norm, N, fix->N); 27 | } 28 | -------------------------------------------------------------------------------- /src/kernels/bp.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | 4 | template static __global__ 5 | void ker_bpsplit(const float *a, uint32_t *b, int Ns, int Nd) 6 | { 7 | int w=threadIdx.x, W=w + blockIdx.x * blockDim.x; 8 | int 
j=threadIdx.y, J=j + blockIdx.y * blockDim.y; 9 | 10 | __shared__ uint32_t sm[32]; 11 | __shared__ uint32_t c[B]; 12 | 13 | if (J >= Ns) return; 14 | 15 | sm[j] = (uint32_t) a[W*Ns + J]; 16 | 17 | __syncthreads(); 18 | 19 | #pragma unroll 20 | for (int i = 0; i < B; i++) { 21 | c[i] = __ballot((sm[j] & (1 << i))); 22 | } 23 | 24 | __syncthreads(); 25 | 26 | if (j == 0) { 27 | uint32_t *ptr = b + W*B*Nd; 28 | for (int i=0; i < B; i++) { 29 | ptr[i*Nd + blockIdx.y] = c[i]; 30 | } 31 | } 32 | } 33 | 34 | template static __global__ 35 | void ker_bpmerge(const float *a, float *b, float *fix, float norm, 36 | int N, int N2) 37 | { 38 | int i=threadIdx.x, I=i + blockIdx.x*blockDim.x; 39 | int j=threadIdx.y, J=j + blockIdx.y*blockDim.y; 40 | 41 | __shared__ float sm[B]; 42 | 43 | if (J >= N) return; 44 | 45 | sm[i] = a[I*N + J]; 46 | __syncthreads(); 47 | 48 | if (i == 0) { 49 | int id = blockIdx.x * N + J; 50 | float c = 0.0f; 51 | 52 | #pragma unroll 53 | for (int i=0; i < B; i++) 54 | c += sm[i] * (1< static __global__ 62 | // void ker_bpsplit(const float * __restrict__ a, 63 | // uint32_t * __restrict__ b, 64 | // const int N) 65 | // { 66 | // int j=threadIdx.x, J=j + blockIdx.x*blockDim.x; 67 | 68 | // __shared__ uint32_t p[32]; 69 | // __shared__ uint32_t c[B]; 70 | 71 | // p[j] = (uint32_t) a[J]; 72 | 73 | // __syncthreads(); 74 | 75 | // #pragma unroll 76 | // for (int i=0; i < B; i++) 77 | // c[i] = __ballot((p[j] & (1<>5] = c[i]; 83 | // } 84 | // } 85 | 86 | // template static __global__ 87 | // void ker_bpmerge(int D, int M, int N, 88 | // const float * __restrict__ src, 89 | // float * __restrict__ dst, 90 | // const float * __restrict__ b, 91 | // float norm) 92 | // { 93 | // int i=threadIdx.x, I=i + blockIdx.x*blockDim.x; 94 | // int j=threadIdx.y, J=j + blockIdx.y*blockDim.y; 95 | 96 | // if (I>=D*M || J>=N) return; 97 | 98 | // __shared__ float sm[B][32]; 99 | // sm[i][j] = src[ID2(I,J,N)]; 100 | 101 | // __syncthreads(); 102 | 103 | // if (i == 0) { 104 
| // float c = 0.0f; 105 | // #pragma unroll 106 | // for(int k=0; k < B; k++) 107 | // c += sm[k][j] * (float)(1<D*src->M, Ns=src->N, Ls=src->L; 8 | int Md=dst->D*dst->M, Nd=dst->N, Ld=dst->L; 9 | 10 | if (Ls > 1 && Ld > 1) { 11 | int TS = 8; 12 | dim3 grid(CEIL(Ls,TS), CEIL(Ns,TS), CEIL(Ms,TS)); 13 | dim3 block(TS, TS, TS); 14 | 15 | ker_copy3D <<>> 16 | (src->data, dst->data, Ms, Ns, Ls, Md, Nd, Ld); 17 | } else { 18 | int TS = 16; 19 | dim3 grid(CEIL(Ns, TS), CEIL(Ms, TS)); 20 | dim3 block(TS, TS); 21 | 22 | ker_copy2D <<>> 23 | (src->data, dst->data, Ms, Ns, Md, Nd); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/copy.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | __global__ static 4 | void ker_copy2D (const float *src, float *dst, 5 | const int Ms, const int Ns, 6 | const int Md, const int Nd) 7 | { 8 | int j=threadIdx.x + blockIdx.x*blockDim.x; 9 | int i=threadIdx.y + blockIdx.y*blockDim.y; 10 | 11 | if (i>=Ms || j>=Ns) return; 12 | 13 | dst[ID2(i,j,Nd)] = src[ID2(i,j,Ns)]; 14 | } 15 | 16 | 17 | __global__ static 18 | void ker_copy3D (const float *src, float *dst, 19 | const int Ms, const int Ns, const int Ls, 20 | const int Md, const int Nd, const int Ld) 21 | { 22 | int k=threadIdx.x + blockIdx.x * blockDim.x; 23 | int j=threadIdx.y + blockIdx.y * blockDim.y; 24 | int i=threadIdx.z + blockIdx.z * blockDim.z; 25 | 26 | if (i>=Ms || j>=Ns || k>=Ls) return; 27 | 28 | dst[ID3(i,j,k,Nd,Ld)] = src[ID3(i,j,k,Ns,Ls)]; 29 | } 30 | -------------------------------------------------------------------------------- /src/kernels/lower.cu: -------------------------------------------------------------------------------- 1 | #include "lower.cuh" 2 | #include "cuptens.h" 3 | 4 | #define KER_SETUP(type, TM, TN, TL) \ 5 | int bytes = (TM+H-1) * (TN+W-1)* TL * sizeof(type); \ 6 | dim3 grid(CEIL(L, TL), CEIL(Nd, TN), CEIL(Md, TM)); \ 7 | dim3 
block(TL, TN, TM) 8 | 9 | 10 | 11 | void culower(cuftens *src, cuftens *dst, int W, int H, int Sx, int Sy) 12 | { 13 | const int D=src->D, L=src->L; 14 | const int Ms=src->M, Ns=src->N; 15 | const int Md=dst->M, Nd=dst->N; 16 | 17 | cuASSERT(D==dst->D, "err: lower shape\n"); 18 | 19 | const int TM=8, TN=8, TL=8; 20 | KER_SETUP(float, TM, TN, TL); 21 | for (int w=0; w < D; w++) { 22 | float *s = src->data + w * src->MNL; 23 | float *d = dst->data + w * dst->MNL; 24 | ker_lower <<>> 25 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 26 | } 27 | } 28 | 29 | 30 | void cuplower(cuptens *src, cuptens *dst, int W, int H, int Sx, int Sy) 31 | { 32 | const int D=src->D, L=src->X; 33 | const int Ms=src->M, Ns=src->N; 34 | const int Md=dst->M, Nd=dst->N; 35 | 36 | cuASSERT(D==dst->D, "err: lower shape\n"); 37 | 38 | const int TM=16, TN=16, TL=4; 39 | KER_SETUP(uint64_t, TM, TN, TL); 40 | 41 | for (int w=0; w < D; w++) { 42 | uint64_t *s = src->data + w * src->MNL; 43 | uint64_t *d = dst->data + w * dst->MNL; 44 | ker_plower <<>> 45 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 46 | } 47 | } 48 | 49 | 50 | #undef KER_SETUP 51 | -------------------------------------------------------------------------------- /src/kernels/lower.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | 4 | #define SM_FILL(sm) { \ 5 | sm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; \ 6 | if (i static __global__ 25 | void ker_lower(const float * __restrict__ src, 26 | float * __restrict__ dst, 27 | int Ms, int Ns, int Md, int Nd, int L, 28 | int W, int H, int Sx, int Sy) 29 | { 30 | int k=threadIdx.x, K=k+blockIdx.x * blockDim.x; 31 | int j=threadIdx.y, J=j+blockIdx.y * blockDim.y; 32 | int i=threadIdx.z, I=i+blockIdx.z * blockDim.z; 33 | const int X=TN+W-1, Z=TL, Ld=W*H*L; 34 | 35 | if (I>=Ms || J>=Ns || K>=L) return; 36 | 37 | extern __shared__ float sm[]; 38 | 39 | SM_FILL(sm); 40 | 41 | // sm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; 42 | // if (i=Md 
|| J/Sx>=Nd || K>=L) return; 55 | 56 | if ((I % Sy) == 0 && (J % Sx) == 0) { 57 | DST_FILL(sm); 58 | // int l=0, R=(I/Sy*Nd + J/Sx)*Ld; 59 | // for (int y=0; y < H; y++) 60 | // for (int x=0; x < W; x++) { 61 | // dst[R+l*L+K] = sm[ID3(i+y,j+x,k,X,Z)]; 62 | // l++; 63 | // } 64 | } 65 | } 66 | 67 | 68 | template static __global__ 69 | void ker_plower(const uint64_t * __restrict__ src, 70 | uint64_t * __restrict__ dst, 71 | int Ms, int Ns, int Md, int Nd, int L, 72 | int W, int H, int Sx, int Sy) 73 | { 74 | int k=threadIdx.x, K=k+blockIdx.x * blockDim.x; 75 | int j=threadIdx.y, J=j+blockIdx.y * blockDim.y; 76 | int i=threadIdx.z, I=i+blockIdx.z * blockDim.z; 77 | const int X=TN+W-1, Z=TL, Ld=W*H*L; 78 | 79 | if (I>=Ms || J>=Ns || K>=L) return; 80 | 81 | extern __shared__ uint64_t psm[]; 82 | 83 | SM_FILL(psm); 84 | 85 | // psm[ID3(i,j,k,X,Z)] = src[ID3(I,J,K,Ns,L)]; 86 | // if (i=Md || J/Sx>=Nd || K>=L) return; 99 | 100 | if ((I % Sy) == 0 && (J % Sx) == 0) { 101 | DST_FILL(psm); 102 | // int l=0, R=(I/Sy*Nd + J/Sx)*Ld; 103 | // for (int y=0; y < H; y++) 104 | // for (int x=0; x < W; x++) { 105 | // dst[R+l*L+K] = psm[ID3(i+y,j+x,k,X,Z)]; 106 | // l++; 107 | // } 108 | } 109 | } 110 | 111 | #undef SM_FILL 112 | #undef DST_FILL 113 | -------------------------------------------------------------------------------- /src/kernels/norm.cu: -------------------------------------------------------------------------------- 1 | #include "norm.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cunorm (cuftens *t) 6 | { 7 | const int BS=32; 8 | const int len = t->bytes/sizeof(float); 9 | 10 | ker_norm <<>> (t->data, len); 11 | } 12 | -------------------------------------------------------------------------------- /src/kernels/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | static __global__ 4 | void ker_norm(float * __restrict__ src, const int len) 5 | { 6 | int i=threadIdx.x + blockIdx.x * blockDim.x; 7 | if 
(i >= len) return; 8 | src[i] = 2.0f * src[i] / 255.0f - 1.0f; 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/pack.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "pack.cuh" 3 | #include "cuptens.h" 4 | 5 | // void cupack(const float *a, uint32_t *b, int M, int N, int L) 6 | // { 7 | // ker_pack <<>> (a, b); 8 | // } 9 | 10 | 11 | void cupack(cuftens *src, cuptens *dst) 12 | { 13 | const int len = cuftens_len(src); 14 | ker_pack <<>> 15 | (src->data, (uint32_t *)dst->data); 16 | } 17 | -------------------------------------------------------------------------------- /src/kernels/pack.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static __global__ 4 | void ker_pack(const float * __restrict__ a, 5 | uint32_t * __restrict__ b) 6 | { 7 | int i=threadIdx.x + blockIdx.x*blockDim.x; 8 | b[i>>5] = __ballot((uint32_t)(a[i] > 0.0f)); 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/pad.cu: -------------------------------------------------------------------------------- 1 | #include "pad.cuh" 2 | #include "cuptens.h" 3 | 4 | 5 | void cupad(cuftens *src, cuftens *dst, int p) 6 | { 7 | const int D=src->D, L=src->L; 8 | const int Ms=src->M, Ns=src->N; 9 | const int Md=dst->M, Nd=dst->N; 10 | 11 | cuASSERT(D == dst->D && 12 | L == dst->L && 13 | Md >= Ms && 14 | Nd >= Ns, "err: pad dim\n"); 15 | 16 | cudaMemset(dst->data, 0, dst->bytes); 17 | 18 | cupad_template 19 | (src->data, dst->data, p, D, L, Ms, Ns, Md, Nd); 20 | 21 | // if (L == 1) { 22 | // const int BS = 16; 23 | // dim3 grid(CEIL(Ns, BS), CEIL(Ms, BS)); 24 | // dim3 block(BS, BS); 25 | // for (int w=0; w < D; w++) { 26 | // float *s = src->data + w * src->MNL; 27 | // float *d = dst->data + w * dst->MNL; 28 | // ker_pad2D <<>> 29 | // (s, d, p, Ms, Ns, Md, Nd); 30 | // } 31 | 
// } else { 32 | // const int BS = 8; 33 | // dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 34 | // dim3 block(BS, BS, BS); 35 | // for (int w=0; w < D; w++) { 36 | // float *s = src->data + w * src->MNL; 37 | // float *d = dst->data + w * dst->MNL; 38 | // ker_pad3D <<>> 39 | // (s, d, p, Ms, Ns, Md, Nd, L); 40 | // } 41 | // } 42 | } 43 | 44 | 45 | void cupad(cuptens *src, cuptens *dst, int p) 46 | { 47 | const int Ms=src->M, Ns=src->N; 48 | const int Md=dst->M, Nd=dst->N; 49 | const int L=src->X, D=src->D; 50 | 51 | cuASSERT(L==dst->X && D==dst->D, "err: pad dim\n"); 52 | 53 | cudaMemset(dst->data, 0, dst->bytes); 54 | 55 | cupad_template 56 | (src->data, dst->data, p, D, L, Ms, Ns, Md, Nd); 57 | 58 | // const int BS = 8; 59 | // dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 60 | // dim3 block(BS, BS, BS); 61 | 62 | // for (int w=0; w < D; w++) { 63 | // uint64_t *s = src->data + w * src->MNL; 64 | // uint64_t *d = dst->data + w * dst->MNL; 65 | // ker_pad3D <<>> 66 | // (s, d, p, Ms, Ns, Md, Nd, L); 67 | // } 68 | } 69 | -------------------------------------------------------------------------------- /src/kernels/pad.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | template static __global__ 4 | void ker_pad2D(const T * __restrict__ src, 5 | T * __restrict__ dst, int p, 6 | int Ms, int Ns, int Md, int Nd) 7 | { 8 | int j=threadIdx.x + blockIdx.x*blockDim.x; 9 | int i=threadIdx.y + blockIdx.y*blockDim.y; 10 | 11 | if (i>=Ms || j>=Ns) return; 12 | 13 | dst[ID2(i+p,j+p,Nd)] = src[ID2(i,j,Ns)]; 14 | } 15 | 16 | 17 | template static __global__ 18 | void ker_pad3D(const T * __restrict__ src, 19 | T * __restrict__ dst, int p, 20 | int Ms, int Ns, int Md, int Nd, int L) 21 | { 22 | int k=threadIdx.x + blockIdx.x*blockDim.x; 23 | int j=threadIdx.y + blockIdx.y*blockDim.y; 24 | int i=threadIdx.z + blockIdx.z*blockDim.z; 25 | 26 | if (i>=Ms || j>=Ns || k>=L) return; 27 | 28 | 
dst[ID3(i+p,j+p,k,Nd,L)] = src[ID3(i,j,k,Ns,L)]; 29 | } 30 | 31 | 32 | template 33 | void cupad_template(T *src, T *dst, int p, int D, int L, 34 | int Ms, int Ns, int Md, int Nd) 35 | { 36 | if (L == 1) { 37 | const int BS = 16; 38 | dim3 grid(CEIL(Ns, BS), CEIL(Ms, BS)); 39 | dim3 block(BS, BS); 40 | for (int w=0; w < D; w++) { 41 | T *s = src + w * Ms*Ns*L; 42 | T *d = dst + w * Md*Nd*L; 43 | ker_pad2D <<>> 44 | (s, d, p, Ms, Ns, Md, Nd); 45 | } 46 | } else { 47 | const int BS = 8; 48 | dim3 grid(CEIL(L, BS), CEIL(Ns, BS), CEIL(Ms, BS)); 49 | dim3 block(BS, BS, BS); 50 | for (int w=0; w < D; w++) { 51 | T *s = src + w * Ms*Ns*L; 52 | T *d = dst + w * Md*Nd*L; 53 | ker_pad3D <<>> 54 | (s, d, p, Ms, Ns, Md, Nd, L); 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/kernels/pgemm.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "pgemm.cuh" 3 | 4 | 5 | template 8 | 9 | static void 10 | pgemm_template(int M, int N, int K, 11 | const uint64_t * __restrict__ A, int LDA, 12 | const uint64_t * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | size_t offsA=0, offsB=0; 16 | offsA /= sizeof(A[0]); 17 | offsB /= sizeof(B[0]); 18 | 19 | dim3 dimBlock(DIM_Y, DIM_X); 20 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 21 | 22 | switch (INIT) { 23 | case 0: 24 | pgemm_kernel 27 | <<>> 28 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 29 | break; 30 | 31 | case 1: 32 | pgemm_kernel_init 35 | <<>> 36 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 37 | break; 38 | 39 | case 2: 40 | pgemm_kernel_init_rev 43 | <<>> 44 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 45 | break; 46 | } 47 | } 48 | 49 | 50 | void pgemm(int M, int N, int K, 51 | const uint64_t * __restrict__ A, 52 | const uint64_t * __restrict__ B, 53 | float * __restrict__ C) 54 | { 55 | pgemm_template 56 | <0, 16,16, 16,16,16, 16,16, 16,16> 57 | (M, N, 
K, A, K, B, K, C, N); 58 | } 59 | 60 | 61 | void pgemm_init(int M, int N, int K, 62 | const uint64_t * __restrict__ A, 63 | const uint64_t * __restrict__ B, 64 | float * __restrict__ C) 65 | { 66 | pgemm_template 67 | <1, 16,16, 16,16,16, 16,16, 16,16> 68 | (M, N, K, A, K, B, K, C, N); 69 | } 70 | 71 | 72 | void pgemm_init(int M, int N, int K, 73 | const uint64_t * __restrict__ a, int lda, 74 | const uint64_t * __restrict__ b, int ldb, 75 | float * __restrict__ c, int ldc) 76 | { 77 | pgemm_template <1, 16, 16, 16,16,16, 16,16, 16,16> 78 | (M, N, K, a, lda, b, ldb, c, ldc); 79 | } 80 | 81 | 82 | void pgemm_init_rev(const int M, const int N, const int K, 83 | const uint64_t * __restrict__ A, 84 | const uint64_t * __restrict__ B, 85 | float * __restrict__ C) 86 | { 87 | pgemm_template 88 | <2, 16,16, 16,16,16, 16,16, 16,16> 89 | (M, N, K, A, K, B, K, C, N); 90 | } 91 | 92 | /////////////////////////////////////////////////////// 93 | template 96 | 97 | static void 98 | pgemm32_template(int M, int N, int K, 99 | const uint32_t * __restrict__ A, int LDA, 100 | const uint32_t * __restrict__ B, int LDB, 101 | float * __restrict__ C, int LDC) 102 | { 103 | size_t offsA=0, offsB=0; 104 | offsA /= sizeof(A[0]); 105 | offsB /= sizeof(B[0]); 106 | 107 | dim3 dimBlock(DIM_Y, DIM_X); 108 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 109 | 110 | pgemm32_kernel 113 | 114 | <<>> 115 | (M, N, K, A, LDA, B, LDB, C, LDC, offsA, offsB); 116 | } 117 | 118 | 119 | void pgemm32(int M, int N, int K, 120 | const uint32_t * __restrict__ A, 121 | const uint32_t * __restrict__ B, 122 | float * __restrict__ C) 123 | { 124 | pgemm32_template 125 | <0, 16,16, 16,16,16, 16,16, 16,16> 126 | (M, N, K, A, K, B, K, C, N); 127 | } 128 | -------------------------------------------------------------------------------- /src/kernels/pgemm.cuh: -------------------------------------------------------------------------------- 1 | #define fetch(A, m, n, bound) offs_##A[MIN((m)*LD##A+n, bound)] 2 | 3 | 4 | 
__forceinline__ __device__ 5 | long long int int2_as_longlong (int2 a) 6 | { 7 | long long int res; 8 | asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y)); 9 | return res; 10 | } 11 | 12 | 13 | // __forceinline__ __device__ 14 | // void xcaz(int init, int *c, uint64_t a, uint64_t b) 15 | // { 16 | // switch (init) { 17 | // case 0: *c += __popcll(a ^ b); break; 18 | // case 1: *c += __popcll(a & b) - __popcll((a ^ b) & b); break; 19 | // case 2: *c += __popcll(a & b) - __popcll((a ^ b) & a); break; 20 | // } 21 | // } 22 | 23 | 24 | template 29 | 30 | static __global__ 31 | void pgemm_kernel(const int M, const int N, const int K, 32 | const uint64_t * __restrict__ A, const int LDA, 33 | const uint64_t * __restrict__ B, const int LDB, 34 | float * __restrict__ C, const int LDC, 35 | int offsA, int offsB) 36 | { 37 | int blx=blockIdx.y, bly=blockIdx.x; 38 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 39 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 40 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 41 | 42 | int rC[THR_M][THR_N]; 43 | uint64_t rA[THR_M], rB[THR_N]; 44 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 45 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 46 | 47 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 48 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 49 | 50 | 51 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 52 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 53 | 54 | #ifdef TEX1D 55 | int coord_A = offsA + cazA; 56 | int coord_B = offsB + cazB; 57 | #else 58 | const uint64_t *offs_A = A + cazA; 59 | const uint64_t *offs_B = B + cazB; 60 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 61 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 62 | #endif 63 | 64 | #undef cazA 65 | #undef cazB 66 | 67 | int m, n, k, kk; 68 | 69 | #pragma unroll 70 | for (m=0; m 191 | 192 | static __global__ 193 | void pgemm_kernel_init(const int M, const int N, const int K, 194 | const uint64_t * __restrict__ A, const int LDA, 195 | const uint64_t * __restrict__ B, const int 
LDB, 196 | float * __restrict__ C, const int LDC, 197 | int offsA, int offsB) 198 | { 199 | int blx=blockIdx.y, bly=blockIdx.x; 200 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 201 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 202 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 203 | 204 | int rC[THR_M][THR_N]; 205 | uint64_t rA[THR_M], rB[THR_N]; 206 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 207 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 208 | 209 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 210 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 211 | 212 | 213 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 214 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 215 | 216 | const uint64_t *offs_A = A + cazA; 217 | const uint64_t *offs_B = B + cazB; 218 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 219 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 220 | 221 | #undef cazA 222 | #undef cazB 223 | 224 | int m, n, k, kk; 225 | 226 | #pragma unroll 227 | for (m=0; m 342 | 343 | static __global__ 344 | void pgemm_kernel_init_rev(const int M, const int N, const int K, 345 | const uint64_t * __restrict__ A, const int LDA, 346 | const uint64_t * __restrict__ B, const int LDB, 347 | float * __restrict__ C, const int LDC, 348 | int offsA, int offsB) 349 | { 350 | int blx=blockIdx.y, bly=blockIdx.x; 351 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 352 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 353 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 354 | 355 | int rC[THR_M][THR_N]; 356 | uint64_t rA[THR_M], rB[THR_N]; 357 | uint64_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 358 | uint64_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 359 | 360 | __shared__ uint64_t sA[BLK_M][BLK_K+1]; 361 | __shared__ uint64_t sB[BLK_N][BLK_K+1]; 362 | 363 | 364 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 365 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 366 | 367 | const uint64_t *offs_A = A + cazA; 368 | const uint64_t *offs_B = B + cazB; 369 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 370 | ptrdiff_t 
boundB = (LDB*(N-1)+K) - cazB - 1; 371 | 372 | #undef cazA 373 | #undef cazB 374 | 375 | int m, n, k, kk; 376 | 377 | #pragma unroll 378 | for (m=0; m 487 | 488 | static __global__ 489 | void pgemm32_kernel(const int M, const int N, const int K, 490 | const uint32_t * __restrict__ A, const int LDA, 491 | const uint32_t * __restrict__ B, const int LDB, 492 | float * __restrict__ C, const int LDC, 493 | int offsA, int offsB) 494 | { 495 | int blx=blockIdx.y, bly=blockIdx.x; 496 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 497 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 498 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 499 | 500 | int rC[THR_M][THR_N]; 501 | uint32_t rA[THR_M], rB[THR_N]; 502 | uint32_t ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 503 | uint32_t rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 504 | 505 | __shared__ uint32_t sA[BLK_M][BLK_K+1]; 506 | __shared__ uint32_t sB[BLK_N][BLK_K+1]; 507 | 508 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 509 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 510 | 511 | const uint32_t *offs_A = A + cazA; 512 | const uint32_t *offs_B = B + cazB; 513 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 514 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 515 | 516 | #undef cazA 517 | #undef cazB 518 | 519 | int m, n, k, kk; 520 | 521 | #pragma unroll 522 | for (m=0; m 5 | 6 | static 7 | void pgemv_template(const int m, const int n, 8 | const uint64_t * __restrict__ A, int lda, 9 | const uint64_t * __restrict__ x, 10 | float * __restrict__ y) 11 | { 12 | dim3 grid(CEIL(m, TS), 1); 13 | dim3 threads(DIM_X, DIM_Y); 14 | 15 | pgemv_kernel 16 | <<>> 17 | (m, n, A, lda, x, y); 18 | 19 | } 20 | 21 | 22 | void pgemv(const int m, const int n, 23 | const uint64_t * __restrict__ A, 24 | const uint64_t * __restrict__ x, 25 | float * __restrict__ y) 26 | { 27 | pgemv_template <128, 1, 128> 28 | (m, n, A, n, x, y); 29 | } 30 | -------------------------------------------------------------------------------- /src/kernels/pgemv.cuh: 
-------------------------------------------------------------------------------- 1 | template 2 | 3 | __global__ 4 | void pgemv_kernel(const int m, const int n, 5 | const uint64_t * __restrict__ A, int lda, 6 | const uint64_t * __restrict__ x, 7 | float * __restrict__ y) 8 | { 9 | if (m <= 0 || n <= 0) return; 10 | 11 | int nt = blockDim.x * blockDim.y * blockDim.z; 12 | 13 | if (DIM_X * DIM_Y != nt) return; 14 | 15 | int tid = threadIdx.x + threadIdx.y * blockDim.x; 16 | int tx = tid % DIM_X, ty = tid / DIM_X; 17 | int ind = blockIdx.x * TS + tx; 18 | 19 | __shared__ int sdata[DIM_X * DIM_Y]; 20 | 21 | int st = blockIdx.x * TS; 22 | int ed = MIN(st + TS, ROUND_UP(m, DIM_X)); 23 | int iters = (ed - st)/DIM_X; 24 | 25 | for (int i=0; i < iters; i++) { 26 | if (ind < m ) A += ind*lda; 27 | int res = 0; 28 | if (ind < m ) { 29 | for (int col=ty; col < n; col += DIM_Y) 30 | res += __popcll(A[col] ^ x[col]); 31 | } 32 | 33 | sdata[ty + tx * DIM_Y] = res; 34 | 35 | __syncthreads(); 36 | 37 | if (ty == 0 && ind < m) { 38 | for (int i=1; i < DIM_Y; i++) 39 | sdata[tx * DIM_Y] += sdata[i + tx * DIM_Y]; 40 | } 41 | 42 | if (ty == 0 && ind < m) 43 | y[ind] = (lda<<6) - (sdata[tx * DIM_Y]<<1); 44 | 45 | __syncthreads(); 46 | 47 | if (ind < m) A -= ind*lda; 48 | ind += DIM_X; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/kernels/pool.cu: -------------------------------------------------------------------------------- 1 | #include "pool.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cumaxpool(cuftens *src, cuftens *dst, 6 | int W, int H, int Sx, int Sy) 7 | { 8 | 9 | int D=src->D, L=src->L; 10 | int Ms=src->M, Ns=src->N; 11 | int Md=dst->M, Nd=dst->N; 12 | 13 | cuASSERT(L == dst->L && D == dst->D, "err: cupool shape\n"); 14 | 15 | int TS = 16; 16 | dim3 grid(CEIL(L, TS), CEIL(Nd, W), CEIL(Md, H)); 17 | dim3 block(TS, W, H); 18 | 19 | for (int w = 0; w < D; w++) { 20 | float *s = src->data + w * src->MNL; 21 | float *d = 
dst->data + w * dst->MNL; 22 | ker_maxpool <<>> 23 | (s, d, Ms, Ns, Md, Nd, L, W, H, Sx, Sy); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/pool.cuh: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.cuh" 3 | 4 | static __global__ 5 | void ker_maxpool (const float * __restrict__ src, 6 | float * __restrict__ dst, 7 | const int Ms, const int Ns, 8 | const int Md, const int Nd, const int L, 9 | const int W, const int H, 10 | const int Sx, const int Sy) 11 | { 12 | int k=threadIdx.x + blockIdx.x * blockDim.x; 13 | int j=threadIdx.y + blockIdx.y * blockDim.y; 14 | int i=threadIdx.z + blockIdx.z * blockDim.z; 15 | 16 | int I=i*Sy, J=j*Sx; 17 | 18 | if (i >= Md || j >= Nd || k >= L) return; 19 | 20 | float val, max=FLT_MIN; 21 | for (int y=0; y < H; y++) 22 | for (int x=0; x < W; x++) { 23 | val = src[ID3(I+y,J+x,k,Ns,L)]; 24 | if (val > max) max = val; 25 | } 26 | 27 | dst[ID3(i,j,k,Nd,L)] = max; 28 | } 29 | -------------------------------------------------------------------------------- /src/kernels/set.cu: -------------------------------------------------------------------------------- 1 | #include "set.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | // void cuset2D(cuftens *t, const float v) 6 | // { 7 | // const int BS=32; 8 | // const int M=t->M, N=t->N; 9 | 10 | // dim3 grid(CEIL(N, BS), CEIL(M, BS)); 11 | // dim3 block(BS, BS); 12 | 13 | // ker_set2D <<>> (t->data, v, M, N); 14 | // } 15 | 16 | 17 | void cuset(cuftens *t, const float v) 18 | { 19 | const int M=t->D*t->M, N=t->N, L=t->L; 20 | 21 | if (L > 1) { 22 | const int BS = 8; 23 | dim3 grid(CEIL(L, BS), CEIL(N, BS), CEIL(M, BS)); 24 | dim3 block(BS, BS, BS); 25 | 26 | ker_set3D <<>> 27 | (t->data, v, M, N, L); 28 | 29 | } else { 30 | const int BS = 16; 31 | dim3 grid(CEIL(N, BS), CEIL(M, BS)); 32 | dim3 block(BS, BS); 33 | 34 | ker_set2D <<>> 35 | (t->data, v, M, N); 36 | 37 | } 38 | } 
39 | -------------------------------------------------------------------------------- /src/kernels/set.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | static __global__ 4 | void ker_set2D(float *dst, const float v, int M, int N) 5 | { 6 | int j=threadIdx.x + blockIdx.x * blockDim.x; 7 | int i=threadIdx.y + blockIdx.y * blockDim.y; 8 | 9 | if (i>=M || j>=N) return; 10 | 11 | dst[ID2(i,j,N)] = v; 12 | } 13 | 14 | 15 | static __global__ 16 | void ker_set3D(float *dst, const float v, int M, int N, int L) 17 | { 18 | int k=threadIdx.x + blockIdx.x * blockDim.x; 19 | int j=threadIdx.y + blockIdx.y * blockDim.y; 20 | int i=threadIdx.z + blockIdx.z * blockDim.z; 21 | 22 | if (i>=M || j>=N || k>=L) return; 23 | 24 | dst[ID3(i,j,k,N,L)] = v; 25 | } 26 | -------------------------------------------------------------------------------- /src/kernels/sgemm.cu: -------------------------------------------------------------------------------- 1 | #include "cutens.h" 2 | #include "util.cuh" 3 | #include "sgemm.cuh" 4 | 5 | 6 | template 8 | 9 | static void 10 | sgemm_template(int M, int N, int K, 11 | const float * __restrict__ A, int LDA, 12 | const float * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | dim3 dimBlock(DIM_Y, DIM_X); 16 | dim3 dimGrid(CEIL(N, BLK_N), CEIL(M, BLK_M)); 17 | 18 | ker_sgemm 21 | 22 | <<>> 23 | (M, N, K, A, LDA, B, LDB, C, LDC); 24 | } 25 | 26 | 27 | void sgemm(int M, int N, int K, 28 | const float * __restrict__ A, int lda, 29 | const float * __restrict__ B, int ldb, 30 | float * __restrict__ C, int ldc) 31 | { 32 | sgemm_template <16,16, 96,96,16, 32,8, 32,8> 33 | (M, N, K, A, K, B, K, C, N); 34 | } 35 | 36 | 37 | void sgemm(cuftens *a, cuftens *b, cuftens *c) 38 | { 39 | const int D=a->D, M=a->M, N=b->M, K=a->N; 40 | cuASSERT(a->N == b->N, "err: shape\n"); 41 | sgemm_template <16,16, 96,96,16, 32,8, 32,8> 42 | (D*M, N, K, a->data, K, b->data, K, c->data, N); 
43 | } 44 | -------------------------------------------------------------------------------- /src/kernels/sgemm.cuh: -------------------------------------------------------------------------------- 1 | #define fetch(A, m, n, bound) offs_##A[MIN((m)*LD##A+n, bound)] 2 | #define cazA (blx*BLK_M*LDA + idxA*LDA + idyA) 3 | #define cazB (bly*BLK_N*LDB + idxB*LDB + idyB) 4 | 5 | template 8 | 9 | static __global__ 10 | void ker_sgemm(int M, int N, int K, 11 | const float * __restrict__ A, int LDA, 12 | const float * __restrict__ B, int LDB, 13 | float * __restrict__ C, int LDC) 14 | { 15 | int blx=blockIdx.y, bly=blockIdx.x; 16 | int idx=threadIdx.y, idy=threadIdx.x, idt=idx*DIM_Y+idy; 17 | int idxA=idt/DIM_YA, idyA=idt % DIM_YA; 18 | int idxB=idt/DIM_YB, idyB=idt % DIM_YB; 19 | 20 | float rC[THR_M][THR_N], rA[THR_M], rB[THR_N]; 21 | float ra[BLK_M/DIM_XA][BLK_K/DIM_YA]; 22 | float rb[BLK_N/DIM_XB][BLK_K/DIM_YB]; 23 | 24 | __shared__ float sA[BLK_M][BLK_K+1]; 25 | __shared__ float sB[BLK_N][BLK_K+1]; 26 | 27 | const float *offs_A = A + cazA; 28 | const float *offs_B = B + cazB; 29 | ptrdiff_t boundA = (LDA*(M-1)+K) - cazA - 1; 30 | ptrdiff_t boundB = (LDB*(N-1)+K) - cazB - 1; 31 | 32 | int m, n, k, kk; 33 | #pragma unroll 34 | for (m=0; m 6 | 7 | void sgemv_template(const int m, const int n, 8 | const float * __restrict__ A, int lda, 9 | const float * __restrict__ x, 10 | float * __restrict__ y) 11 | { 12 | dim3 grid(CEIL(m, TS), 1); 13 | dim3 block(DIM_X, DIM_Y); 14 | 15 | ker_sgemv 16 | <<>> 17 | (m, n, A, lda, x, y); 18 | } 19 | 20 | 21 | void sgemv (cuftens *a, cuftens *b, cuftens *c) 22 | { 23 | const int M=a->M, N=a->N; 24 | cuASSERT(b->M==1 && b->N==N, "err: sgemv shape\n"); 25 | sgemv_template <256, 1, 256> 26 | (M, N, a->data, N, b->data, c->data); 27 | } 28 | -------------------------------------------------------------------------------- /src/kernels/sgemv.cuh: -------------------------------------------------------------------------------- 1 | #include 
"util.cuh" 2 | 3 | template 4 | 5 | static __global__ 6 | void ker_sgemv(const int m, const int n, 7 | const float * __restrict__ A, int lda, 8 | const float * __restrict__ x, 9 | float * __restrict__ y) 10 | { 11 | if (m <= 0 || n <= 0) return; 12 | 13 | int nt = blockDim.x * blockDim.y * blockDim.z; 14 | 15 | if (DIM_X * DIM_Y != nt) return; 16 | 17 | int tid = threadIdx.x + threadIdx.y * blockDim.x; 18 | int tx = tid % DIM_X, ty = tid / DIM_X; 19 | int ind = blockIdx.x * TS + tx; 20 | 21 | __shared__ float sdata[DIM_X * DIM_Y]; 22 | 23 | int st = blockIdx.x * TS; 24 | int ed = MIN(st + TS, ROUND_UP(m, DIM_X)); 25 | int iters = (ed - st)/DIM_X; 26 | 27 | for (int i=0; i < iters; i++) { 28 | if (ind < m ) A += ind*lda; 29 | 30 | float res = 0.0; 31 | if (ind < m ) { 32 | for (int col=ty; col < n; col += DIM_Y) 33 | res += A[col] * x[col]; 34 | } 35 | 36 | sdata[ty + tx * DIM_Y] = res; 37 | 38 | __syncthreads(); 39 | 40 | if (ty == 0 && ind < m) { 41 | #pragma unroll 42 | for (int i=1; i < DIM_Y; i++) 43 | sdata[tx * DIM_Y] += sdata[i + tx * DIM_Y]; 44 | } 45 | 46 | if (ty == 0 && ind < m) 47 | y[ind] = sdata[tx * DIM_Y]; 48 | 49 | __syncthreads(); 50 | 51 | if (ind < m) A -= ind*lda; 52 | 53 | ind += DIM_X; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/kernels/sign.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cutens.h" 3 | #include "sign.cuh" 4 | 5 | 6 | void cusign(cuftens *a) 7 | { 8 | const int BS=32; 9 | const int len = cuftens_len(a); 10 | ker_sign <<>> (a->data, len); 11 | } 12 | -------------------------------------------------------------------------------- /src/kernels/sign.cuh: -------------------------------------------------------------------------------- 1 | static __global__ 2 | void ker_sign (float *src, const int len) 3 | { 4 | int i=threadIdx.x + blockIdx.x*blockDim.x; 5 | 6 | if (i >= len) return; 7 | 8 | src[i] 
= 2.0f * (src[i] > 0.0f) - 1.0f; 9 | } 10 | -------------------------------------------------------------------------------- /src/kernels/tch.cu: -------------------------------------------------------------------------------- 1 | #include "tch.cuh" 2 | #include "cutens.h" 3 | 4 | 5 | void cutch(cuftens *src, cuftens *dst) 6 | { 7 | int M=src->M, N=src->N, L=src->L; 8 | cuASSERT(src->MNL == dst->MNL && src->D == dst->D, 9 | "err: cuth shape\n"); 10 | 11 | int TS=8; 12 | dim3 blocks(CEIL(L, TS), CEIL(N, TS), CEIL(M, TS)); 13 | dim3 threads(TS, TS, TS); 14 | 15 | for (int w = 0; w < src->D; w++) { 16 | float *s = src->data + w * src->MNL; 17 | float *d = dst->data + w * dst->MNL; 18 | ker_tch <<>> 19 | (s, d, M, N, L); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/kernels/tch.cuh: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | 3 | static __global__ 4 | void ker_tch (const float * __restrict__ src, 5 | float * __restrict__ dst, 6 | const int M, const int N, const int L) 7 | { 8 | int k=threadIdx.x + blockIdx.x * blockDim.x; 9 | int j=threadIdx.y + blockIdx.y * blockDim.y; 10 | int i=threadIdx.z + blockIdx.z * blockDim.z; 11 | 12 | if (i >= M || j >= N || k >= L) return; 13 | 14 | dst[ID3(j,k,i,L,M)] = src[ID3(i,j,k,N,L)]; 15 | } 16 | -------------------------------------------------------------------------------- /src/layers/bnorml.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers/bnorm.h" 3 | 4 | 5 | bnormLayer bnormLayer_init(int use_global) 6 | { 7 | bnormLayer bnl; BNORML_INIT(bnl); bnl.ug = use_global; 8 | return bnl; 9 | } 10 | 11 | void bnormLayer_free(bnormLayer *bnl) 12 | { 13 | ftens_free(&bnl->mean); ftens_free(&bnl->istd); 14 | ftens_free(&bnl->gmean); ftens_free(&bnl->gistd); 15 | ftens_free(&bnl->beta); ftens_free(&bnl->gamma); 16 | ftens_free(&bnl->dbeta); 
ftens_free(&bnl->dgamma);
17 | ftens_free(&bnl->tmp); ftens_free(&bnl->in);
18 | }
19 |
20 | void bnormLayer_print_shape(bnormLayer *bnl)
21 | {
22 | printf("bnorm: %d %d\n", bnl->N, bnl->ug);
23 | }
24 |
25 | void bnormLayer_set(ftens *mean, ftens *istd,
26 | ftens *gamma, ftens *beta, bnormLayer *bnl)
27 | {
28 | const int N=ftens_len(mean);
29 | ASSERT(N == ftens_len(istd) &&
30 | N == ftens_len(beta) &&
31 | N == ftens_len(gamma), "err: bnorm shape\n");
32 |
33 | bnormLayer_free(bnl);
34 | bnl->N = N;
35 | bnl->mean = ftens_copy(mean);
36 | bnl->istd = ftens_copy(istd);
37 | bnl->beta = ftens_copy(beta);
38 | bnl->gamma = ftens_copy(gamma);
39 | }
40 |
41 |
42 | static
43 | void bnorm(const float *mean, const float *istd,
44 | const float *beta, const float *gamma,
45 | const int len, const int N,
46 | float *in)
47 | {
48 | for (int i=0; i < len; i++)
49 | in[i] = ((in[i] - mean[i%N]) *
50 | (istd[i%N] * gamma[i%N]) + /* FIX: standard batchnorm is y = gamma*(x-mean)*istd + beta; was '-'. NOTE(review): confirm the CUDA cubnorm kernel and the .esp exporter agree on the sign of beta */
51 | (beta[i%N]));
52 | }
53 |
54 | void bnormLayer_forward(ftens *t, bnormLayer *bnl, int save)
55 | {
56 | const int D=t->D, M=t->M, N=t->N, L=t->L;
57 | const int asd = L>1 ?
L : N*M; 58 | ASSERT(asd == bnl->N, "err: bnorm shape\n") 59 | 60 | if (save) { 61 | if (!bnl->in.data) bnl->in=ftens_init(D, M, N, L); 62 | memcpy(bnl->in.data, t->data, t->bytes); 63 | } 64 | 65 | if (bnl->ug) { 66 | // compute curr mean, istd 67 | // moving avg -> update globals 68 | fprintf(stderr, "not implemented\n"); 69 | exit(-3); 70 | } 71 | 72 | float *in = t->data; 73 | float *mean = bnl->mean.data; 74 | float *istd = bnl->istd.data; 75 | float *beta = bnl->beta.data; 76 | float *gamma = bnl->gamma.data; 77 | 78 | bnorm(mean, istd, beta, gamma, D*M*N*L, asd, in); 79 | } 80 | 81 | 82 | void bnormLayer_backward(ftens *dt, bnormLayer *bnl) 83 | { 84 | fprintf(stderr, "not implemented\n"); 85 | exit(-2); 86 | } 87 | 88 | 89 | void bnormLayer_update(bnormLayer *bnl) 90 | { 91 | fprintf(stderr, "not implemented\n"); 92 | exit(-2); 93 | } 94 | -------------------------------------------------------------------------------- /src/layers/bnorml.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "layers/cubnorm.h" 3 | #include "kernels.h" 4 | 5 | 6 | cubnormLayer cubnormLayer_init(int use_global) 7 | { 8 | cubnormLayer bnl; BNORML_INIT(bnl); 9 | bnl.ug = use_global; 10 | return bnl; 11 | } 12 | 13 | 14 | void cubnormLayer_free(cubnormLayer *bnl) 15 | { 16 | cuftens_free(&bnl->mean); cuftens_free(&bnl->istd); 17 | cuftens_free(&bnl->gmean); cuftens_free(&bnl->gistd); 18 | cuftens_free(&bnl->beta); cuftens_free(&bnl->gamma); 19 | cuftens_free(&bnl->dbeta); cuftens_free(&bnl->dgamma); 20 | cuftens_free(&bnl->tmp); cuftens_free(&bnl->in); 21 | } 22 | 23 | 24 | void cubnormLayer_print_shape(cubnormLayer *bnl) 25 | { 26 | printf("cubnorm: N=%d ug=%d\n", bnl->N, bnl->ug); 27 | } 28 | 29 | 30 | void cubnormLayer_convert(bnormLayer *src, cubnormLayer *dst) 31 | { 32 | cubnormLayer_set(&src->mean, &src->istd, &src->gamma, 33 | &src->beta, dst); 34 | } 35 | 36 | 37 | void cubnormLayer_set(ftens *mean, 
ftens *istd, 38 | ftens *gamma, ftens *beta, 39 | cubnormLayer *bnl) 40 | { 41 | const int N=ftens_len(mean); 42 | cuASSERT(N == ftens_len(istd) && 43 | N == ftens_len(beta) && 44 | N == ftens_len(gamma), "err: cubnorm shape\n"); 45 | 46 | cubnormLayer_free(bnl); 47 | bnl->N = N; 48 | bnl->mean = cuftens_convert(mean); 49 | bnl->istd = cuftens_convert(istd); 50 | bnl->beta = cuftens_convert(beta); 51 | bnl->gamma = cuftens_convert(gamma); 52 | } 53 | 54 | 55 | void cubnormLayer_forward(cuftens *t, cubnormLayer *bnl, int save) 56 | { 57 | const int D=t->D, M=t->M, N=t->N, L=t->L; 58 | if (save) { 59 | if (!bnl->in.data) bnl->in=cuftens_init(D, M, N, L); 60 | cudaMemcpy(bnl->in.data, t->data, t->bytes, cuDtoD); 61 | } 62 | 63 | if (bnl->ug) { 64 | // compute bath mean, istd 65 | // moving avg -> update globals 66 | fprintf(stderr, "not implemented yet\n"); 67 | exit(-3); 68 | } 69 | 70 | cubnorm(&bnl->mean, &bnl->istd, &bnl->beta, &bnl->gamma, t); 71 | } 72 | 73 | 74 | void cubnormLayer_backward(cuftens *dout, cubnormLayer *bnl) 75 | { 76 | fprintf(stderr, "not implemented yet\n"); 77 | exit(-2); 78 | } 79 | 80 | 81 | 82 | void cubnormLayer_update(cubnormLayer *bnl) 83 | { 84 | fprintf(stderr, "not implemented yet\n"); 85 | exit(-2); 86 | } 87 | -------------------------------------------------------------------------------- /src/layers/convl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | #include "conv.h" 4 | 5 | extern float *scratch; 6 | 7 | 8 | convLayer convLayer_init(int Sm, int Sn, int p) 9 | { 10 | convLayer cl; CONVL_INIT(cl); 11 | cl.Sm=Sm; cl.Sn=Sn; cl.p=p; 12 | return cl; 13 | } 14 | 15 | 16 | void convLayer_free(convLayer *cl) 17 | { 18 | ftens_free(&cl->W); ftens_free(&cl->b); 19 | ftens_free(&cl->dW); ftens_free(&cl->db); 20 | ftens_free(&cl->out); ftens_free(&cl->in); 21 | } 22 | 23 | 24 | void convLayer_print_shape(convLayer *cl) 25 | { 26 | printf("conv: D=%d M=%d N=%d L=%d 
Sm=%d Sn=%d p=%d\n", 27 | cl->D, cl->M, cl->N, cl->L, cl->Sm, cl->Sn, cl->p); 28 | } 29 | 30 | 31 | void convLayer_set(ftens *W, convLayer *cl) 32 | { 33 | int D=W->D, M=W->M, N=W->N, L=W->L; 34 | ftens_free(&cl->W); 35 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 36 | cl->W = ftens_copy(W); 37 | } 38 | 39 | 40 | void convLayer_copy_input(ftens *t, convLayer *cl) 41 | { 42 | if (!cl->in.data) 43 | cl->in=ftens_init(t->D, t->M, t->N, t->L); 44 | memcpy(cl->in.data, t->data, t->bytes); 45 | } 46 | 47 | 48 | ftens convLayer_pad_input(ftens *t, float *scr, 49 | int *M, int *N, int p) 50 | { 51 | ftens tp; const int D=t->D, L=t->L; 52 | *M=PAD(*M, p); *N=PAD(*N, p); 53 | if (!scratch) tp=ftens_copy_pad(t, p); 54 | else { 55 | tp = ftens_from_ptr(D, *M, *N, L, scr); 56 | ftens_pad(t, &tp, p); 57 | scr += (*M)*(*N)*L*D; 58 | } 59 | 60 | return tp; 61 | } 62 | 63 | 64 | void convLayer_forward(ftens *t, convLayer *cl, int save) 65 | { 66 | float *scr = scratch; ftens tp, tmp; 67 | int D=t->D, Ms=t->M, Ns=t->N, Ls=t->L; 68 | int F=cl->D, W=cl->M, H=cl->N, L=cl->L; 69 | int p=cl->p, Sy=cl->Sm, Sx=cl->Sn; 70 | ASSERT(t->L == cl->L, "err: conv shape\n"); 71 | 72 | if (save) convLayer_copy_input(t, cl); 73 | if (p) tp = convLayer_pad_input(t, scr, &Ms, &Ns, p); 74 | 75 | // lower 76 | const int Md = OUT_LEN(Ms, H, Sy); 77 | const int Nd = OUT_LEN(Ns, W, Sx); 78 | const int Ld = W*H*L; 79 | if (!scratch) tmp=ftens_init(D, Md, Nd, Ld); 80 | else tmp=ftens_from_ptr(D, Md, Nd, Ld, scr); 81 | 82 | ftens_lower(p ? 
&tp : t, &tmp, W, H, Sx, Sy); 83 | 84 | // mat mul 85 | if (!cl->out.data) cl->out=ftens_init(D, Md, Nd, F); 86 | int M=Md*Nd, N=F, K=cl->W.MNL; 87 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 88 | M, N, K, 1, tmp.data, K, cl->W.data, K, 89 | 0, cl->out.data, N); 90 | 91 | 92 | if (!scratch) ftens_free(&tmp); 93 | if (!scratch && p) ftens_free(&tp); 94 | } 95 | 96 | 97 | void convLayer_backward(ftens *dout, convLayer *cl) 98 | { 99 | exit(-2); 100 | } 101 | 102 | 103 | void convLayer_update(convLayer *cl) 104 | { 105 | exit(-3); 106 | } 107 | -------------------------------------------------------------------------------- /src/layers/convl.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuconv.h" 3 | #include "kernels.h" 4 | 5 | 6 | extern cufmem fptr; 7 | 8 | 9 | cuconvLayer cuconvLayer_init(int Sm, int Sn, int p) 10 | { 11 | cuconvLayer cl; CONVL_INIT(cl); 12 | cl.Sm=Sm; cl.Sn=Sn; cl.p=p; 13 | return cl; 14 | } 15 | 16 | void cuconvLayer_free(cuconvLayer *cl) 17 | { 18 | cuftens_free(&cl->W); cuftens_free(&cl->b); 19 | cuftens_free(&cl->dW); cuftens_free(&cl->db); 20 | cuftens_free(&cl->out); cuftens_free(&cl->in); 21 | } 22 | 23 | void cuconvLayer_set(ftens *W, cuconvLayer *cl) 24 | { 25 | int D=W->D, M=W->M, N=W->N, L=W->L; 26 | cuftens_free(&cl->W); 27 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 28 | cl->W = cuftens_init(D, M, N, L); 29 | cudaMemcpy(cl->W.data, W->data, W->bytes, cuHtoD); 30 | } 31 | 32 | void cuconvLayer_convert(convLayer *src, cuconvLayer *dst) 33 | { 34 | cuconvLayer_set(&src->W, dst); 35 | } 36 | 37 | void cuconvLayer_copy_input(cuftens *t, cuconvLayer *cl) 38 | { 39 | int D=t->D, M=t->M, N=t->N, L=t->L; 40 | if (!cl->in.data) cl->in = cuftens_init(D, M, N, L); 41 | cudaMemcpy(cl->in.data, t->data, t->bytes, cuDtoD); 42 | } 43 | 44 | cuftens cuconvLayer_pad_input(cuftens *t, int p) 45 | { 46 | cuftens tp; 47 | int M = PAD(t->M, p); 48 | int N = PAD(t->N, p); 49 
| if (!CUFMEM) tp = cuftens_pad(t, p); 50 | else {tp = cuftens_from_cufmem(t->D, M, N, t->L); 51 | cuftens_pad(t, &tp, p);} 52 | return tp; 53 | } 54 | 55 | cuftens cuconvLayer_lower_input(cuftens *t, cuconvLayer *cl) 56 | { 57 | int W=cl->N, H=cl->M, Sx=cl->Sn, Sy=cl->Sm; 58 | cuftens tl = (CUFMEM ? 59 | cuftens_lower_cufmem(t, W, H, Sx, Sy) : 60 | cuftens_lower_init(t, W, H, Sx, Sy)); 61 | 62 | culower(t, &tl, W, H, Sx, Sy); 63 | 64 | return tl; 65 | } 66 | 67 | void cuconvLayer_forward(cuftens *t, cuconvLayer *cl, int save) 68 | { 69 | cufmem_reset(); 70 | 71 | int D=t-> D, M=t->M, N=t->N, L=t->L; 72 | int F=cl->D, W=cl->N, H=cl->M; 73 | int p=cl->p, Sx=cl->Sn, Sy=cl->Sm; 74 | 75 | cuASSERT(t->L == cl->L, "err: cuconv shape\n"); 76 | 77 | cuftens tp = !p ? *t : cuconvLayer_pad_input(t, p); 78 | cuftens tl = cuconvLayer_lower_input(&tp, cl); 79 | 80 | M = tl.M; N = tl.N; 81 | if (!cl->out.data) cl->out=cuftens_init(D, M, N, F); 82 | 83 | M=D*M*N; N=F; int K=W*H*L; 84 | sgemm(M, N, K, tl.data, K, cl->W.data, K, cl->out.data, N); 85 | 86 | if (!CUFMEM) cuftens_free(&tl); 87 | if (!CUFMEM && p) cuftens_free(&tp); 88 | } 89 | 90 | 91 | void cuconvLayer_backward(cuftens *dout, cuconvLayer *cl) 92 | { 93 | exit(-2); 94 | } 95 | 96 | 97 | void cuconvLayer_print_shape(cuconvLayer *cl) 98 | { 99 | printf("cuconv: D=%d M=%d N=%d L=%d Sm=%d Sn=%d p=%d\n", 100 | cl->D, cl->M, cl->N, cl->L, cl->Sm, cl->Sn, cl->p); 101 | } 102 | -------------------------------------------------------------------------------- /src/layers/densel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "util.h" 5 | #include "dense.h" 6 | 7 | 8 | denseLayer denseLayer_init(int M, int N) 9 | { 10 | denseLayer dl; DENSEL_INIT(dl); dl.M=M; dl.N=N; 11 | return dl; 12 | } 13 | 14 | 15 | void denseLayer_free(denseLayer *dl) 16 | { 17 | ftens_free(&dl->W); ftens_free(&dl->b); 18 | ftens_free(&dl->dW); ftens_free(&dl->db); 19 | 
ftens_free(&dl->out); ftens_free(&dl->in); 20 | } 21 | 22 | 23 | void denseLayer_print_shape(denseLayer *dl) 24 | { 25 | printf("dense: %d %d\n", dl->M, dl->N); 26 | } 27 | 28 | 29 | void denseLayer_set(ftens *W, denseLayer *dl) 30 | { 31 | const int M=W->M, N=W->N; 32 | ASSERT(W->D==1 && W->L==1, "err: dense shape\n"); 33 | ftens_free(&dl->W); 34 | dl->M = M; dl->N = N; 35 | dl->W = ftens_copy(W); 36 | } 37 | 38 | 39 | void denseLayer_forward(ftens *t, denseLayer *dl, int save) 40 | { 41 | const int D=t->D, M=dl->M, N=dl->N; 42 | ASSERT(t->MNL == dl->N, "err: dense shape\n"); 43 | 44 | if (save) { 45 | int M=t->M, N=t->N, L=t->L; 46 | if (!dl->in.data) dl->in = ftens_init(D,M,N,L); 47 | memcpy(dl->in.data, t->data, t->bytes); 48 | } 49 | 50 | if (!dl->out.data) dl->out = ftens_init(D, 1, M, 1); 51 | const float *a=dl->W.data; 52 | const float *b=t->data; 53 | float *c=dl->out.data; 54 | 55 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 56 | D, M, N, 1, b, N, a, N, 0, c, M); 57 | 58 | } 59 | 60 | 61 | void denseLayer_backward(ftens *dout, denseLayer *dl) 62 | { 63 | fprintf(stderr, "not implemented yet\n"); 64 | exit(-2); 65 | } 66 | -------------------------------------------------------------------------------- /src/layers/densel.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cudense.h" 3 | #include "kernels.h" 4 | 5 | 6 | cudenseLayer cudenseLayer_init(int M, int N) 7 | { 8 | cudenseLayer dl; DENSEL_INIT(dl); dl.M=M; dl.N=N; 9 | return dl; 10 | } 11 | 12 | void cudenseLayer_free(cudenseLayer *dl) 13 | { 14 | cuftens_free(&dl->W); cuftens_free(&dl->b); 15 | cuftens_free(&dl->dW); cuftens_free(&dl->db); 16 | cuftens_free(&dl->out); cuftens_free(&dl->in); 17 | } 18 | 19 | void cudenseLayer_convert(denseLayer *src, cudenseLayer *dst) 20 | { 21 | cudenseLayer_set(&src->W, dst); 22 | } 23 | 24 | void cudenseLayer_set(ftens *W, cudenseLayer *dl) 25 | { 26 | int M=W->M, N=W->N; 27 | 
cudenseLayer_free(dl);
28 | dl->M=M; dl->N=N; dl->W=cuftens_init(1, M, N, 1);
29 | cudaMemcpy(dl->W.data, W->data, W->bytes, cuHtoD);
30 | }
31 |
32 | void cudenseLayer_copy_input(cuftens *t, cudenseLayer *dl)
33 | {
34 | if (!dl->in.data)
35 | dl->in = cuftens_init(t->D, t->M, t->N, t->L);
36 | cudaMemcpy(dl->in.data, t->data, t->bytes, cuDtoD); /* FIX: t is a device tensor (cuftens); was cuHtoD — cf. cuconvLayer_copy_input and cupdenseLayer_copy_input, which use cuDtoD */
37 | }
38 |
39 | void cudenseLayer_forward(cuftens *t, cudenseLayer *dl, int save)
40 | {
41 | int D=t->D, M=t->M, N=t->N;
42 | cuftens_reshape(t, D, 1, t->MNL, 1);
43 | cuASSERT(t->MNL == dl->N, "err: cudense shape\n");
44 |
45 | if (save) cudenseLayer_copy_input(t, dl);
46 | if (!dl->out.data) dl->out=cuftens_init(D, 1, dl->M, 1);
47 |
48 | if (D == 1) sgemv(&dl->W, t, &dl->out);
49 | else sgemm(M, 1, N, t->data, N, dl->W.data, N,
50 | dl->out.data, 1);
51 | }
52 |
53 |
54 | void cudenseLayer_backward(cuftens *dt, cudenseLayer *dl)
55 | {
56 | fprintf(stderr, "err: dense bprop not implemented yet\n");
57 | exit(-2);
58 | }
59 |
60 |
61 | void cudenseLayer_print_size(cudenseLayer *dl)
62 | {
63 | printf("cudense: %d %d\n", dl->M, dl->N);
64 | }
65 | -------------------------------------------------------------------------------- /src/layers/inputl.c: -------------------------------------------------------------------------------- 1 | #include "util.h"
2 | #include "input.h"
3 |
4 |
5 | void inputLayer_load(ftens *t, inputLayer *il)
6 | {
7 | il->out = ftens_copy(t);
8 | }
9 |
10 |
11 | void inputLayer_free(inputLayer *il)
12 | {
13 | ftens_free(&il->out);
14 | }
15 |
16 |
17 | void inputLayer_forward(inputLayer *il)
18 | {
19 | if (!il->out.data) {
20 | fprintf(stderr, "err: in null\n");
21 | exit(-1);
22 | }
23 |
24 | float *ptr = il->out.data;
25 | const int len = ftens_len(&il->out);
26 | for (int i=0; i < len; i++)
27 | ptr[i] = 2.0f * ptr[i]/255.0f - 1.0f;
28 | }
29 |
30 |
31 | /* void inputLayer_pad(inputLayer *il, const int p) */
32 | /* { */
33 | /* ftens tmp = ftens_copy_pad(&il->out, p); */
34 | /*
ftens_free(&il->out); */ 35 | /* il->out = tmp; */ 36 | /* } */ 37 | -------------------------------------------------------------------------------- /src/layers/inputl.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuinput.h" 3 | #include "kernels.h" 4 | 5 | 6 | void cuinputLayer_forward(ftens *t, cuinputLayer *il, int norm) 7 | { 8 | int D=t->D, M=t->M, N=t->N, L=t->L; 9 | 10 | if (!il->out.data) il->out=cuftens_init(D, M, N, L); 11 | cudaMemcpy(il->out.data, t->data, t->bytes, cuHtoD); 12 | 13 | if (norm) cunorm(&il->out); 14 | } 15 | 16 | void cuinputLayer_free(cuinputLayer *il) 17 | { 18 | cuftens_free(&il->out); 19 | } 20 | -------------------------------------------------------------------------------- /src/layers/pconvl.cu: -------------------------------------------------------------------------------- 1 | #include "cumem.h" 2 | #include "layers/cuconv.h" 3 | #include "layers/cupconv.h" 4 | #include "kernels.h" 5 | 6 | 7 | cupconvLayer cupconvLayer_init(int Sm, int Sn, int p) 8 | { 9 | cupconvLayer out; 10 | CUPCONVL_INIT(out); 11 | out.Sm=Sm; out.Sn=Sn; out.p=p; 12 | return out; 13 | } 14 | 15 | void cupconvLayer_free(cupconvLayer *cl) 16 | { 17 | cuptens_free(&cl->W); cuptens_free(&cl->pout); 18 | cuftens_free(&cl->dW); cuftens_free(&cl->out); 19 | cuftens_free(&cl->in); cuftens_free(&cl->fix); 20 | cuftens_free(&cl->bfix); 21 | } 22 | 23 | void cupconvLayer_set(ftens *W, cupconvLayer *cl, int fix) 24 | { 25 | int D=W->D, M=W->M, N=W->N, L=W->L; 26 | cupconvLayer_free(cl); 27 | cl->D=D; cl->M=M; cl->N=N; cl->L=L; 28 | cl->W = cuptens_convert(W); 29 | if (fix) { 30 | ftens tmp = ftens_init(1, 1, D, 1); 31 | for (int i = 0; i < D; i++) { 32 | float *s = W->data + i * W->MNL; ; 33 | tmp.data[i] = 0.0f; 34 | for (int j = 0; j < W->MNL; j++) 35 | tmp.data[i] += s[j]; 36 | } 37 | if (!cl->fix.data) cl->fix = cuftens_init(1, 1, D, 1); 38 | cudaMemcpy(cl->fix.data, tmp.data, tmp.bytes, 
cuHtoD); 39 | ftens_free(&tmp); 40 | } 41 | } 42 | 43 | void cupconvLayer_convert(convLayer *src, cupconvLayer *dst, int fix) 44 | { 45 | dst->Sm=src->Sm; dst->Sn=src->Sn; dst->p=src->p; 46 | cupconvLayer_set(&src->W, dst, fix); 47 | } 48 | 49 | cuptens cupconvLayer_pad_input(cuptens *t, int p) 50 | { 51 | cuptens tp; 52 | int M = PAD(t->M, p); 53 | int N = PAD(t->N, p); 54 | if (CUPMEM) { 55 | tp = cuptens_from_cupmem(t->D, M, N, t->L); 56 | cupad(t, &tp, p); 57 | } else 58 | tp = cuptens_pad(t, p); 59 | 60 | return tp; 61 | } 62 | 63 | cuptens cupconvLayer_lower_input(cuptens *t, cupconvLayer *cl) 64 | { 65 | int W=cl->N, H=cl->M, Sx=cl->Sn, Sy=cl->Sm; 66 | cuptens tl = (CUPMEM ? 67 | cuptens_lower_cupmem(t, W, H, Sx, Sy) : 68 | cuptens_lower_init(t, W, H, Sx, Sy)); 69 | 70 | cuplower(t, &tl, W, H, Sx, Sy); 71 | 72 | return tl; 73 | } 74 | 75 | void cupconvLayer_forward(cuptens *t, cupconvLayer *cl) 76 | { 77 | cupmem_reset(); 78 | 79 | int D=t->D, M=t->M, N=t->N, L=t->X; 80 | int F=cl->D, p=cl->p, W=cl->N, H=cl->M; 81 | cuASSERT(L == cl->W.X, "err: cupconv forward\n"); 82 | 83 | cuptens tp = !p ? *t : cupconvLayer_pad_input(t, p); 84 | cuptens tl = cupconvLayer_lower_input(&tp, cl); 85 | 86 | M=tl.M; N=tl.N; 87 | 88 | if (!cl->out.data) cl->out = cuftens_init(D, M, N, F); 89 | 90 | pgemm(D*M*N, F, W*H*L, tl.data, cl->W.data, cl->out.data); 91 | 92 | if(!CUPMEM) cuptens_free(&tl); 93 | if(!CUPMEM && p) cuptens_free(&tp); 94 | } 95 | 96 | void cupconvLayer_forward_initial(cuftens *t, cupconvLayer *cl, 97 | float norm) 98 | { 99 | cufmem_reset(); 100 | cupmem_reset(); 101 | 102 | cuftens tmp = (!CUFMEM ? 103 | cuftens_round_up(t, 64) : 104 | cuftens_round_up_cufmem(t, 64)); 105 | 106 | int D=t->D, M=tmp.M, N=tmp.N, L=tmp.L; 107 | int F=cl->D, p=cl->p, W=cl->N, H=cl->M; 108 | 109 | cuftens tp = !p ? 
tmp : cuconvLayer_pad_input(&tmp, p); 110 | cuftens tl = cuconvLayer_lower_input(&tp, (cuconvLayer*)cl); 111 | 112 | M=tl.M; N=tl.N; L=tl.L; 113 | 114 | cuptens qwe = (!CUPMEM ? 115 | cuptens_init(D, 8, M*N*L, 1) : 116 | cuptens_from_cupmem(D, 8, M*N*L, 1)); 117 | 118 | 119 | cubp_split_pack(&tl, &qwe); 120 | 121 | cufmem_reset(); 122 | cuftens tmp2 = (!CUFMEM ? 123 | cuftens_init(D, M*8, N, F) : 124 | cuftens_from_cufmem(D, M*8, N, F)); 125 | 126 | pgemm_init_rev(D*M*N*8, F, cl->W.MNL, qwe.data, 127 | cl->W.data, tmp2.data); 128 | 129 | if (!cl->out.data) cl->out = cuftens_init(D, M, N, F); 130 | cubp_merge(&tmp2, &cl->out, &cl->fix, norm); 131 | 132 | if (!CUFMEM) {cuftens_free(&tmp); cuftens_free(&tl);} 133 | if (!CUPMEM) cuptens_free(&qwe); 134 | if (!CUFMEM && p) cuftens_free(&tp); 135 | } 136 | 137 | void cupconvLayer_print(cupconvLayer *cl) 138 | { 139 | printf("cupconvLayer: \n"); 140 | } 141 | -------------------------------------------------------------------------------- /src/layers/pdensel.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "kernels.h" 3 | #include "layers/cupdense.h" 4 | 5 | 6 | cupdenseLayer cupdenseLayer_init() 7 | { 8 | cupdenseLayer dl; 9 | CUPDENSEL_INIT(dl); 10 | return dl; 11 | } 12 | 13 | void cupdenseLayer_free(cupdenseLayer *dl) 14 | { 15 | cuptens_free(&dl->W); cuptens_free(&dl->pout); 16 | cuptens_free(&dl->in); cuftens_free(&dl->out); 17 | cuftens_free(&dl->dW); cuftens_free(&dl->fix); 18 | } 19 | 20 | void cupdenseLayer_set(ftens *W, cupdenseLayer *dl, int fix) 21 | { 22 | int M = W->M, N = W->N; 23 | cupdenseLayer_free(dl); 24 | dl->M=M; dl->N=N; dl->W=cuptens_convert(W); 25 | 26 | if (fix) { 27 | ftens tmp = ftens_init(1, 1, M, 1); 28 | for (int i=0; idata[ID2(i,j,N)]; 32 | } 33 | dl->fix = cuftens_convert(&tmp); 34 | ftens_free(&tmp); 35 | } 36 | } 37 | 38 | void cupdenseLayer_convert(denseLayer *src, cupdenseLayer *dst, int fix) 39 | { 40 | 
cupdenseLayer_set(&src->W, dst, fix); 41 | } 42 | 43 | static 44 | void cupdenseLayer_copy_input(cuptens *t, cupdenseLayer *dl) 45 | { 46 | int D=t->D, M=t->M, N=t->N, L=t->L; 47 | if (!dl->in.data) dl->in = cuptens_init(D, M, N, L); 48 | cudaMemcpy(dl->in.data, t->data, t->bytes, cuDtoD); 49 | } 50 | 51 | void cupdenseLayer_forward(cuptens *t, cupdenseLayer *dl, int save) 52 | { 53 | int D=t->D, M=dl->M, N=dl->W.X; 54 | cuASSERT(t->MNL == dl->N/64, "err: cupdense shape\n"); 55 | 56 | if (save) cupdenseLayer_copy_input(t, dl); 57 | 58 | if (!dl->out.data) dl->out = cuftens_init(D, 1, M, 1); 59 | if (D == 1) pgemv(M, N, dl->W.data, t->data, dl->out.data); 60 | else pgemm(D, M, N, t->data, dl->W.data, dl->out.data); 61 | } 62 | 63 | static 64 | cuptens cupdenseLayer_bpsplit_input(cuftens *t, cupdenseLayer *dl) 65 | { 66 | int D=t->D, N=t->MNL, ru=N & 63; 67 | cuftens tmp; tmp.data=NULL; 68 | if (ru) { 69 | if (CUFMEM) { 70 | int asd = ROUND_UP(N, 64); 71 | tmp = cuftens_from_cufmem(D, 1, asd, 1); 72 | cudaMemset(tmp.data, 0, tmp.bytes); 73 | cucopy(t, &tmp); 74 | } 75 | else { 76 | tmp = cuftens_round_up(t, 64); 77 | } 78 | } 79 | 80 | cuptens out = (CUPMEM ? 81 | cuptens_from_cupmem(D, 8, N, 1) : 82 | cuptens_init(D, 8, N, 1)); 83 | 84 | cubp_split_pack(ru ? &tmp : t, &out); 85 | 86 | if (ru && !CUFMEM) cuftens_free(&tmp); 87 | return out; 88 | } 89 | 90 | 91 | void cupdenseLayer_forward_initial(cuftens *t, cupdenseLayer *dl, 92 | float norm) 93 | { 94 | int D=t->D, M=dl->M, N=8, K=dl->W.X; 95 | 96 | cufmem_reset(); 97 | cupmem_reset(); 98 | 99 | cuftens_reshape(t, t->D, 1, t->MNL, 1); 100 | cuptens asd = cupdenseLayer_bpsplit_input(t, dl); 101 | cuftens tmp = (CUFMEM ? 
102 | cuftens_from_cufmem(D, N, M, 1) : 103 | cuftens_init(D, N, M, 1)); 104 | 105 | pgemm_init_rev(D*N, M, K, asd.data, dl->W.data, tmp.data); 106 | 107 | if (!dl->out.data) dl->out = cuftens_init(D, 1, M, 1); 108 | 109 | cubp_merge(&tmp, &dl->out, &dl->fix, norm); 110 | 111 | if (!CUPMEM) cuptens_free(&asd); 112 | if (!CUFMEM) cuftens_free(&tmp); 113 | } 114 | 115 | 116 | void cupdenseLayer_backward(cuptens *dout, cupdenseLayer *dl) 117 | { 118 | fprintf(stderr, "err: cupdensel bprop not yet implemented\n"); 119 | exit(-2); 120 | } 121 | 122 | void cupdenseLayer_update(cupdenseLayer *dl) 123 | { 124 | fprintf(stderr, "err: cupdensel bprop not yet implemented\n"); 125 | exit(-2); 126 | } 127 | -------------------------------------------------------------------------------- /src/layers/pinputl.cu: -------------------------------------------------------------------------------- 1 | #include "layers/cupinput.h" 2 | 3 | 4 | // cupinputLayer cupinputLayer_init() 5 | // { 6 | // cupinputLayer il; il.out.data=NULL; 7 | // return il; 8 | // } 9 | 10 | // void cupinputLayer_forward(ftens *t, cupinputLayer *il) 11 | // { 12 | // int D=t->D, M=t->M, N=t->N, L=t->L; cuftens tmp; 13 | // if (!CUFMEM) 14 | // tmp = cuftens_convert(t); 15 | // else { 16 | // tmp = cuftens_from_cufmem(D, M, N, L); 17 | // cudaMemcpy(tmp.data, t->data, t->bytes, cuHtoD); 18 | // } 19 | 20 | // if (!il->out.data) il->out = cuptens_init(D, M, N, L); 21 | // cuptens_convert(&tmp, &il->out); 22 | 23 | // if(!d_fscratch) cuftens_free(&tmp); 24 | // } 25 | 26 | // void cupinputLayer_free(cupinputLayer *il) 27 | // { 28 | // cuptens_free(&il->out); 29 | // } 30 | -------------------------------------------------------------------------------- /src/layers/pooll.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers/pool.h" 3 | 4 | 5 | poolLayer poolLayer_init(int M, int N, int Sm, int Sn) 6 | { 7 | poolLayer pl = {M, N, Sm, Sn, MAX}; 
8 | pl.out. data = NULL; 9 | pl.mask.data = NULL; 10 | return pl; 11 | } 12 | 13 | 14 | void poolLayer_free(poolLayer *pl) 15 | { 16 | ftens_free(&pl->out); 17 | ftens_free(&pl->mask); 18 | } 19 | 20 | 21 | void poolLayer_forward(ftens *t, poolLayer *pl) 22 | { 23 | const int W=pl->M, H=pl->N, Sy=pl->Sm, Sx=pl->Sn; 24 | const int D=t->D, L=t->L, Ms=t->M, Ns=t->N; 25 | const int Md=OUT_LEN(Ms, H, Sy); 26 | const int Nd=OUT_LEN(Ns, W, Sx); 27 | 28 | if (!pl->out.data) pl->out=ftens_init(D, Md, Nd, L); 29 | 30 | if (pl->op == MAX) 31 | ftens_maxpool(t, &pl->out, W, H, Sx, Sy); 32 | else 33 | exit(-3); 34 | } 35 | 36 | 37 | void poolLayer_backward(ftens *dout, poolLayer *pl) 38 | { 39 | exit(-2); 40 | } 41 | -------------------------------------------------------------------------------- /src/layers/pooll.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "kernels.h" 3 | #include "layers/cupool.h" 4 | 5 | 6 | cupoolLayer cupoolLayer_init(int M, int N, int Sm, int Sn) 7 | { 8 | cupoolLayer pl = {M, N, Sm, Sn, MAX}; 9 | pl.out. 
data = NULL;
10 | pl.mask.data = NULL;
11 | return pl;
12 | }
13 |
14 | void cupoolLayer_free(cupoolLayer *pl)
15 | {
16 | cuftens_free(&pl->out);
17 | cuftens_free(&pl->mask);
18 | }
19 |
20 | void cupoolLayer_convert(poolLayer *src, cupoolLayer *dst)
21 | {
22 | dst->M = src->M; /* FIX: was src->N (copy-paste bug) — broke non-square pooling windows */
23 | dst->N = src->N;
24 | dst->Sm = src->Sm;
25 | dst->Sn = src->Sn;
26 | dst->op = src->op;
27 | }
28 |
29 | void cupoolLayer_forward(cuftens *t, cupoolLayer *pl)
30 | {
31 | int W=pl->M, H=pl->N, Sy=pl->Sm, Sx=pl->Sn;
32 | int M = OUT_LEN(t->M, H, Sy);
33 | int N = OUT_LEN(t->N, W, Sx);
34 |
35 | if (!pl->out.data)
36 | pl->out = cuftens_init(t->D, M, N, t->L);
37 |
38 | cuASSERT(pl->op == MAX, "err: pool type not impl\n");
39 | cumaxpool(t, &pl->out, W, H, Sx, Sy);
40 | }
41 |
42 |
43 | void cupoolLayer_backward(cuftens *dt, cupoolLayer *pl)
44 | {
45 | exit(-2);
46 | }
47 |
48 |
49 | void cupoolLayer_print(cupoolLayer *pl)
50 | {
51 | printf("cupool: %d %d %d %d\n", pl->M, pl->N, pl->Sm, pl->Sn);
52 | }
53 | -------------------------------------------------------------------------------- /src/mlp.c: -------------------------------------------------------------------------------- 1 | #include "util.h"
2 | #include "params.h"
3 | #include "nn/mlp.h"
4 |
5 |
6 | mlp mlp_init(int Ndl, int Nbnl)
7 | {
8 | mlp out = {Ndl, Nbnl};
9 | out.dl = Ndl ? MALLOC(denseLayer, Ndl) : NULL;
10 | out.bnl = Nbnl ?
MALLOC(bnormLayer, Nbnl) : NULL; 11 | for (int i=0; iil); 19 | for (int i=0; iNdl; i++) denseLayer_free(&net->dl[i]); 20 | for (int i=0; iNbnl; i++) bnormLayer_free(&net->bnl[i]); 21 | } 22 | 23 | mlp mlp_load(const char *esp, int bin) 24 | { 25 | mlp out; 26 | int Ndl; denseLayer *dl; 27 | int Nbnl; bnormLayer *bnl; 28 | 29 | FILE *pf = fopen(esp, "rb"); 30 | ASSERT(pf, "err: esp fopen\n"); 31 | 32 | int val; 33 | while ((val = fgetc(pf)) != EOF) { 34 | switch (val) { 35 | case DENSEL|LNUM: fread(&Ndl, sizeof(int), 1, pf); break; 36 | case BNORML|LNUM: fread(&Nbnl, sizeof(int), 1, pf); break; 37 | 38 | case INPUTL|LDAT: 39 | out = mlp_init(Ndl, Nbnl); 40 | dl=out.dl; bnl=out.bnl; 41 | break; 42 | 43 | case DENSEL|LDAT: load_denseLayer(dl, pf, bin); dl++; break; 44 | case BNORML|LDAT: load_bnormLayer(bnl, pf); bnl++; break; 45 | break; 46 | 47 | default: 48 | fprintf(stderr, "err: mlp loader\n"); 49 | exit(-3); 50 | } 51 | } 52 | 53 | fclose(pf); 54 | return out; 55 | } 56 | 57 | void mlp_print(mlp *net) 58 | { 59 | printf("mlp: Ndl=%d Nbnl=%d\n", net->Ndl, net->Nbnl); 60 | } 61 | -------------------------------------------------------------------------------- /src/mlp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cumlp.h" 3 | 4 | 5 | 6 | cumlp cumlp_init(int Ndl, int Nbnl) 7 | { 8 | cumlp out = {Ndl, Nbnl}; 9 | out.dl = Ndl ? MALLOC(cudenseLayer, Ndl) : NULL; 10 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 11 | for (int i=0; iil); 20 | for (int i=0; iNdl; i++) cudenseLayer_free(&net->dl[i]); 21 | for (int i=0; iNbnl; i++) cubnormLayer_free(&net->bnl[i]); 22 | } 23 | 24 | 25 | cumlp cumlp_convert(mlp *net) 26 | { 27 | cumlp out = cumlp_init(net->Ndl, net->Nbnl); 28 | for (int i=0; iNdl; i++) 29 | cudenseLayer_convert(&net->dl[i], &out.dl[i]); 30 | 31 | for (int i=0; iNbnl; i++) 32 | cubnormLayer_convert(&net->bnl[i], &out.bnl[i]); 33 | 34 | return out; 35 | } 36 | 37 | 38 | void cumlp_print(cumlp *net) 39 | { 40 | printf("cumlp: Ndl=%d Nbnl=%d\n", net->Ndl, net->Nbnl); 41 | } 42 | -------------------------------------------------------------------------------- /src/params.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "layers.h" 3 | #include "nn/mlp.h" 4 | #include "nn/cnn.h" 5 | 6 | 7 | static inline 8 | void reverse(float *v, const int len) 9 | { 10 | float t; int j=len-1; 11 | for (int i = 0; i < len/2; i++) { 12 | t = v[i]; v[i] = v[j]; v[j] = t; 13 | j--; 14 | } 15 | } 16 | 17 | 18 | void load_denseLayer(denseLayer *dl, FILE * const pf, int bin) 19 | { 20 | int M; fread(&M, sizeof(int), 1, pf); 21 | int N; fread(&N, sizeof(int), 1, pf); 22 | printf("dense: %d %d\n", M, N); 23 | ftens W = ftens_from_file(1, M, N, 1, pf); 24 | ftens b = ftens_from_file(1, 1, M, 1, pf); 25 | if (bin) ftens_sign(&W); 26 | denseLayer_set(&W, dl); 27 | ftens_free(&W); 28 | ftens_free(&b); 29 | } 30 | 31 | 32 | void load_bnormLayer(bnormLayer *bnl, FILE *pf) 33 | { 34 | int N; fread(&N, sizeof(int), 1, pf); 35 | printf("bnorm: %d\n", N); 36 | ftens beta = ftens_from_file(1, 1, N, 1, pf); 37 | ftens gamma = ftens_from_file(1, 1, N, 1, pf); 38 | ftens mean = ftens_from_file(1, 1, N, 1, pf); 39 | ftens istd = ftens_from_file(1, 1, N, 1, pf); 40 | bnormLayer_set(&mean, &istd, &beta, &gamma, bnl); 41 | ftens_free(&mean); ftens_free(&istd); 42 | ftens_free(&beta); ftens_free(&gamma); 43 
| } 44 | 45 | void load_convLayer(convLayer *cl, FILE *pf, int bin, int rev) 46 | { 47 | int a[7]; fread(a, sizeof(int), 7, pf); 48 | int p=a[0], D=a[1], M=a[2], N=a[3], L=a[4]; 49 | printf("conv: \n"); 50 | ftens fil = ftens_from_file(D, L, M, N, pf); 51 | if (bin) ftens_sign(&fil); 52 | if (rev) 53 | for (int w=0; wSm=a[5]; cl->Sn=a[6]; cl->p=a[0]; 59 | convLayer_set(&asd, cl); 60 | ftens_free(&fil); 61 | ftens_free(&asd); 62 | ftens_free(&b); 63 | } 64 | 65 | 66 | void load_poolLayer(poolLayer *pl, FILE *pf) 67 | { 68 | int M, N, Sm, Sn; 69 | printf("pool: \n"); 70 | fread(&M, sizeof(int), 1, pf); 71 | fread(&N, sizeof(int), 1, pf); 72 | fread(&Sm, sizeof(int), 1, pf); 73 | fread(&Sn, sizeof(int), 1, pf); 74 | pl->M=M; pl->N=N; pl->Sm=Sm; pl->Sn=Sn; 75 | } 76 | -------------------------------------------------------------------------------- /src/pcnn.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cupcnn.h" 3 | 4 | 5 | cupcnn cupcnn_init(int Ncl, int Npl, int Ndl, int Nbnl) 6 | { 7 | cupcnn out = {Ncl, Npl, Ndl, Nbnl}; 8 | out.cl = Ncl ? MALLOC(cupconvLayer, Ncl) : NULL; 9 | out.pl = Npl ? MALLOC(cupoolLayer, Npl) : NULL; 10 | out.dl = Ndl ? MALLOC(cupdenseLayer, Ndl) : NULL; 11 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 12 | for (int i=0; iil); 22 | for (int i=0; iNcl; i++) cupconvLayer_free(net->cl + i); 23 | for (int i=0; iNpl; i++) cupoolLayer_free(net->pl + i); 24 | for (int i=0; iNdl; i++) cupdenseLayer_free(net->dl + i); 25 | for (int i=0; iNbnl; i++) cubnormLayer_free(net->bnl + i); 26 | } 27 | 28 | cupcnn cupcnn_convert(cnn *net) 29 | { 30 | int Ncl=net->Ncl, Npl=net->Npl; 31 | int Ndl=net->Ndl, Nbnl=net->Nbnl; 32 | 33 | cupcnn out = cupcnn_init(Ncl, Npl, Ndl, Nbnl); 34 | 35 | for (int i=0; icl[i], &out.cl[i], i==0); 37 | 38 | for (int i=0; ipl[i], &out.pl[i]); 40 | 41 | for (int i=0; idl[i], &out.dl[i], i==0); 43 | 44 | for (int i=0; ibnl[i], &out.bnl[i]); 46 | 47 | return out; 48 | } 49 | 50 | void cupcnn_print(cupcnn *net) 51 | { 52 | printf("CUPCNN: Ncl=%d Npl=%d Ndl=%d Nbnl=%d\n", 53 | net->Ncl, net->Npl, net->Ndl, net->Nbnl); 54 | } 55 | -------------------------------------------------------------------------------- /src/pmlp.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "nn/cupmlp.h" 3 | 4 | 5 | cupmlp cupmlp_init(int Ndl, int Nbnl) 6 | { 7 | cupmlp out = {Ndl, Nbnl}; 8 | out.dl = Ndl ? MALLOC(cupdenseLayer, Ndl) : NULL; 9 | out.bnl = Nbnl ? 
MALLOC(cubnormLayer, Nbnl) : NULL; 10 | for (int i=0; i < Ndl; i++) CUPDENSEL_INIT(out.dl[i]); 11 | for (int i=0; i < Nbnl; i++) BNORML_INIT(out.bnl[i]); 12 | return out; 13 | } 14 | 15 | void cupmlp_free(cupmlp *nn) 16 | { 17 | cuinputLayer_free(&nn->il); 18 | for (int i=0; i < nn->Ndl; i++) cupdenseLayer_free(&nn->dl[i]); 19 | for (int i=0; i < nn->Nbnl; i++) cubnormLayer_free(&nn->bnl[i]); 20 | } 21 | 22 | cupmlp cupmlp_convert(mlp *nn) 23 | { 24 | cupmlp out = cupmlp_init(nn->Ndl, nn->Nbnl); 25 | for (int i=0; i < nn->Ndl; i++) 26 | cupdenseLayer_convert(&nn->dl[i], &out.dl[i], i==0); 27 | 28 | for (int i=0; i < nn->Nbnl; i++) 29 | cubnormLayer_convert(&nn->bnl[i], &out.bnl[i]); 30 | 31 | return out; 32 | } 33 | 34 | void cupmlp_print(cupmlp *nn) 35 | { 36 | printf("cupmlp: Npdl=%d Nbnl=%d\n", nn->Ndl, nn->Nbnl); 37 | } 38 | -------------------------------------------------------------------------------- /src/ptens.cu: -------------------------------------------------------------------------------- 1 | #include "util.cuh" 2 | #include "cuptens.h" 3 | #include "kernels.h" 4 | 5 | 6 | cuptens cuptens_empty(int D, int M, int N, int L) 7 | { 8 | DIM_CHECK(D, M, N, L); 9 | cuptens out = {D, M, N, L}; 10 | out.X = L>1 ? CEIL(L, 64) : CEIL(N, 64); 11 | out.p = (L>1 ? L : N) - out.X+1; 12 | out.MNL = (L>1 ? 
M * N : M) * out.X; 13 | out.bytes = BYTES(uint64_t, D * out.MNL); 14 | out.data = NULL; 15 | return out; 16 | } 17 | 18 | cuptens cuptens_init(int D, int M, int N, int L) 19 | { 20 | cuptens out = cuptens_empty(D, M, N, L); 21 | cudaMalloc(&out.data, out.bytes); 22 | cuASSERT(out.data, "err: cuptens malloc\n"); 23 | //cudaMemset(&(out.data), 0, out.bytes); 24 | return out; 25 | } 26 | 27 | 28 | cuptens cuptens_from_cupmem(int D, int M, int N, int L) 29 | { 30 | cuptens out = cuptens_empty(D, M, N, L); 31 | out.data = cupmem_reserve(out.bytes); 32 | return out; 33 | } 34 | 35 | cuptens cuptens_pad(cuptens *src, int p) 36 | { 37 | cuASSERT(src->data && src->L > 1, "err: L>1 for pad\n"); 38 | int D=src->D, M=src->M, N=src->N, L=src->L; 39 | cuptens out = cuptens_init(D, PAD(M,p), PAD(N,p), L); 40 | cupad(src, &out, p); 41 | return out; 42 | } 43 | 44 | cuptens cuptens_lower_init(cuptens *src, int W, int H, int Sx, int Sy) 45 | { 46 | cuASSERT(src->data && src->L > 1, "err\n"); 47 | int D=src->D, L=src->X; 48 | int Md = OUT_LEN(src->M, H, Sy); 49 | int Nd = OUT_LEN(src->N, W, Sx); 50 | return cuptens_init(D, Md, Nd, W*H*L*64); 51 | } 52 | 53 | cuptens cuptens_lower_cupmem(cuptens *src, 54 | int W, int H, int Sx, int Sy) 55 | { 56 | cuASSERT(src->data && src->L > 1, "err\n"); 57 | int D=src->D, L=src->X; 58 | int Md = OUT_LEN(src->M, H, Sy); 59 | int Nd = OUT_LEN(src->N, W, Sx); 60 | cuptens out = cuptens_empty(D, Md, Nd, W*H*L*64); 61 | out.data = cupmem_reserve(out.bytes); 62 | return out; 63 | } 64 | 65 | void cuptens_free(cuptens *t) 66 | { 67 | if (t->data) {cudaFree(t->data); t->data=NULL;} 68 | } 69 | 70 | cuptens cuptens_convert(ftens *t) 71 | { 72 | cuASSERT(t->data, "err\n"); 73 | cuftens tmp = cuftens_convert(t); 74 | cuptens out = cuptens_convert(&tmp); 75 | cuftens_free(&tmp); 76 | return out; 77 | } 78 | 79 | cuptens cuptens_convert(cuftens *t) 80 | { 81 | cuftens tmp = cuftens_round_up(t, 64); 82 | cuptens out = cuptens_init(tmp.D, tmp.M, tmp.N, tmp.L); 
83 | cupack(&tmp, &out); 84 | cuftens_free(&tmp); 85 | return out; 86 | } 87 | 88 | void cuptens_convert(cuftens *src, cuptens *dst) 89 | { 90 | cupack(src, dst); 91 | } 92 | 93 | uint64_t *cuptens_dump(cuptens *t) 94 | { 95 | uint64_t *out = MALLOC(uint64_t, t->bytes); 96 | cuASSERT(out, "err: dump malloc\n"); 97 | cudaMemcpy(out, t->data, t->bytes, cuDtoH); 98 | return out; 99 | } 100 | 101 | // void cuptens_print_ch(cuptens *t, int w, int k, const char *fmt) 102 | // { 103 | // int D=t->D, M=t->M, N=t->N, L=t->L, X=t->X; 104 | 105 | // if (!t->data) {printf("err: cuptens null\n"); return;} 106 | 107 | // uint64_t *a = cuptens_dump(t); 108 | // const uint64_t *b = a + w*t->MNL; 109 | 110 | // for (int i=0; i < M; i++) { 111 | // for (int j=0; j < N; j++) { 112 | // int v, p, o; 113 | // p=k>>6; o=k&63; 114 | // v = (b[ID3(i,j,p,N,X)] >> o) &1; 115 | // printf(fmt, 2*v -1); 116 | // } 117 | // NL; 118 | // } 119 | // free(a); 120 | // } 121 | 122 | void cuptens_print(cuptens *t) 123 | { 124 | cuptens_print(t, "%2d"); 125 | } 126 | 127 | void cuptens_print(cuptens *t, const char *fmt) 128 | { 129 | int D=t->D, M=t->M, N=t->N, L=t->L, X=t->X; 130 | 131 | if (!t->data) {printf("err: cuptens null\n"); return;} 132 | 133 | uint64_t *a = cuptens_dump(t); 134 | 135 | for (int w=0; w < D; w++) { 136 | const uint64_t *b = a + w*t->MNL; 137 | for (int i=0; i < M; i++) { 138 | for (int k=0; k < L; k++) { 139 | for (int j=0; j < N; j++) { 140 | int v, p, o; 141 | if (L == 1) { 142 | p=j>>6; o=j&63; 143 | v = (b[ID2(i,p,X)] >> o) &1; 144 | } else { 145 | p=k>>6; o=k&63; 146 | v = (b[ID3(i,j,p,N,X)] >> o) &1; 147 | } 148 | 149 | printf(fmt, 2*v -1); 150 | 151 | } SEP; } NL; } NL; } 152 | free(a); 153 | } 154 | 155 | void cuptens_print_shape(cuptens *t) 156 | { 157 | printf("cuptens: %d %d %d %d %d\n", t->D, t->M, t->N, t->L, t->X); 158 | } 159 | -------------------------------------------------------------------------------- /src/scratch.c: 
-------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | float *scratch; 4 | 5 | 6 | void scratch_alloc(int len) 7 | { 8 | if (len) scratch = MALLOC(float, len); 9 | else scratch = NULL; 10 | } 11 | 12 | void scratch_free() 13 | { 14 | if (scratch) {free(scratch); scratch=NULL;} 15 | } 16 | -------------------------------------------------------------------------------- /src/tens.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "tens.h" 5 | #include "util.h" 6 | 7 | 8 | ftens ftens_init(int D, int M, int N, int L) 9 | { 10 | ftens t = {D, M, N, L, M*N*L, BYTES(float, D*M*N*L)}; 11 | t.data = MALLOC(float, D*M*N*L); 12 | ASSERT(t.data, "err: ftens malloc"); 13 | return t; 14 | } 15 | 16 | ftens ftens_from_ptr(int D, int M, int N, int L, float *ptr) 17 | { 18 | ftens t = {D, M, N, L, M*N*L, BYTES(float, D*M*N*L)}; 19 | ASSERT(ptr, "err: NULL ptr\n"); 20 | t.data = ptr; 21 | return t; 22 | } 23 | 24 | void ftens_print_shape(ftens *t) 25 | { 26 | printf("ftens: %d %d %d %d\n", t->D, t->M, t->N, t->L); 27 | } 28 | 29 | void ftens_free(ftens *t) 30 | { 31 | if (t->data) {free(t->data); t->data=NULL;} 32 | } 33 | 34 | ftens ftens_copy(ftens *t) 35 | { 36 | const int D=t->D, M=t->M, N=t->N, L=t->L; 37 | ftens out = ftens_init(D, M, N, L); 38 | ASSERT(t->data, "err: null tens\n"); 39 | memcpy(out.data, t->data, t->bytes); 40 | return out; 41 | } 42 | 43 | ftens ftens_from_file(int D, int M, int N, int L, FILE *pf) 44 | { 45 | ftens out = ftens_init(D, M, N, L); 46 | fread(out.data, sizeof(float), D*M*N*L, pf); 47 | return out; 48 | } 49 | 50 | void ftens_reshape(ftens *t, int D, int M, int N, int L) 51 | { 52 | const int len = ftens_len(t); 53 | ASSERT(len== D*M*N*L, "err: ftens reshape\n"); 54 | t->D=D; t->M=M; t->N=N; t->L=L; 55 | } 56 | 57 | 58 | void ftens_clear(ftens *t) {memset(t->data, 0, t->bytes);} 59 | 60 | ftens ftens_zeros(int D, 
int M, int N, int L) 61 | { 62 | ftens t = ftens_init(D, M, N, L); 63 | memset(t.data, 0, t.bytes); 64 | return t; 65 | } 66 | 67 | ftens ftens_ones(int D, int M, int N, int L) 68 | { 69 | ftens t = ftens_init(D, M, N, L); 70 | for (int i=0; i < LEN(t); i++) 71 | D(t)[i] = 1.0f; 72 | return t; 73 | } 74 | 75 | ftens ftens_rand(int D, int M, int N, int L) 76 | { 77 | ftens t = ftens_init(D, M, N, L); 78 | for (int i=0; i < LEN(t); i++) 79 | D(t)[i] = (rand() % 255) - 128.0f; 80 | return t; 81 | } 82 | 83 | 84 | void ftens_sign(ftens *t) 85 | { 86 | for (int i=0; i < t->bytes/sizeof(float); i++) 87 | t->data[i] = 2.0f * (t->data[i] > 0.0f) - 1.0f; 88 | } 89 | 90 | ftens ftens_rand_range(int D, int M, int N, int L, 91 | float min, float max) 92 | { 93 | ftens t = ftens_init(D, M, N, L); 94 | for (int i=0; i < ftens_len(&t); i++) 95 | t.data[i] = ((max-min)*rand())/RAND_MAX + min; 96 | return t; 97 | } 98 | 99 | ftens ftens_copy_tch(ftens *a) 100 | { 101 | const int M=a->M, N=a->N, L=a->L, D=a->D; 102 | ftens b = ftens_init(D, N, L, M); 103 | for (int w=0; wdata + w*a->MNL; 105 | float *dst = b. data + w*b. 
MNL; 106 | for (int i=0; iM, N=a->N, L=a->L, D=a->D; 118 | for (int w=0; wdata + w*a->MNL; 120 | float *dst = b->data + w*b->MNL; 121 | for (int i=0; iD; 133 | const int Ms=src->M, Ns=src->N, Ls=src->L; 134 | const int Md=dst->M, Nd=dst->N, Ld=src->L; 135 | ASSERT(Ls == Ld && dst->D == D, "err: lowering shape\n"); 136 | float *d = dst->data; int n=0; 137 | for (int w=0; w < D; w++) { 138 | float *s = src->data + w*src->MNL; 139 | for (int i=0; iD, L =src->L; 154 | const int Ms=src->M, Ns=src->N; 155 | const int Md=dst->M, Nd=dst->N; 156 | ASSERT(D==dst->D && L==dst->L, "err: pool shape\n"); 157 | float *d=dst->data; int n=0; 158 | for (int w=0; w < D; w++) { 159 | float *s=src->data + w*src->MNL; 160 | for (int i=0; i < Ms; i+=Sy) 161 | for (int j=0; j < Ns; j+=Sx) 162 | for (int k=0; k < L; k++) { 163 | float v, max=FLT_MIN; 164 | for (int y=0; y max) max = v; 168 | } 169 | d[n++] = max; 170 | } 171 | } 172 | } 173 | 174 | 175 | ftens ftens_copy_pad(ftens *t, int p) 176 | { 177 | const int Ms=t->M, Ns=t->N, L=t->L, D=t->D; 178 | const int Md=PAD(Ms,p), Nd=PAD(Ns,p); 179 | ftens out = ftens_zeros(D, Md, Nd, L); 180 | float *pin = t->data; 181 | float *pout = out.data; 182 | for (int w=0; w < D; w++) { 183 | for (int i=0; i < Ms; i++) 184 | for (int j=0; j < Ns; j++) 185 | for (int k=0; k < L; k++) 186 | pout[ID3(i+p,j+p,k,Nd,L)] = 187 | pin[ID3(i,j,k,Ns,L)]; 188 | pin += t->MNL; 189 | pout += out.MNL; 190 | } 191 | return out; 192 | } 193 | 194 | void ftens_pad(ftens *src, ftens *dst, int p) 195 | { 196 | const int D=src->D, L=src->L; 197 | const int Ms=src->M, Ns=src->N; 198 | const int Md=dst->M, Nd=dst->N; 199 | ASSERT(D==dst->D && L==dst->L, "err: pad shape\n"); 200 | float *s = src->data; 201 | float *d = dst->data; 202 | memset(d, 0, dst->bytes); 203 | for (int w=0; w < D; w++) { 204 | for (int i=0; i < Ms; i++) 205 | for (int j=0; j < Ns; j++) 206 | for (int k=0; k < L; k++) 207 | d[ID3(i+p,j+p,k,Nd,L)] = 208 | s[ID3(i,j,k,Ns,L)]; 209 | s += src->MNL; 210 | 
d += dst->MNL; 211 | } 212 | } 213 | 214 | void ftens_print(ftens *t, const char *fmt) 215 | { 216 | if (!t->data) {printf("ftens NULL\n"); return;} 217 | const int M=t->M, N=t->N, L=t->L, D=t->D; 218 | float *ptr = t->data; 219 | for (int w=0; w < D; w++) { 220 | for (int i=0; i < M; i++) { 221 | for (int k=0; k < L; k++) { 222 | for (int j=0; j < N; j++) { 223 | float v = ptr[ID3(i,j,k,N,L)]; 224 | printf(fmt, v); 225 | } printf(" | "); 226 | } NL; 227 | } 228 | ptr += t->MNL; NL; 229 | } 230 | NL; 231 | } 232 | 233 | void ftens_print_ch(ftens *t, int w, int k, int I, int J, 234 | const char *fmt) 235 | { 236 | if (!t->data) {printf("ftens NULL\n"); return;} 237 | const int D=t->D, M=t->M, N=t->N, L=t->L; 238 | ASSERT(w < D, "err: print\n"); 239 | float *ptr = t->data + w * t->MNL; 240 | for (int i=0; i < MIN(M, (unsigned)I); i++) { 241 | for (int j=0; j < MIN(N,(unsigned)J); j++) { 242 | printf(fmt, ptr[ID3(i,j,k,N,L)]); 243 | } 244 | NL; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/tens.cu: -------------------------------------------------------------------------------- 1 | #include "cutens.h" 2 | #include "kernels.h" 3 | 4 | 5 | cuftens cuftens_empty(int D, int M, int N, int L) 6 | { 7 | cuftens t; 8 | DIM_CHECK(D, M, N, L); 9 | CUFTENS_INIT(t, D, M, N, L); 10 | return t; 11 | } 12 | 13 | cuftens cuftens_init(int D, int M, int N, int L) 14 | { 15 | cuftens t = cuftens_empty(D, M, N, L); 16 | cudaMalloc(&t.data, t.bytes); 17 | cuASSERT(t.data, "err: cuftens cuMalloc\n"); 18 | return t; 19 | } 20 | 21 | cuftens cuftens_lower_init(cuftens *t, int W, int H, int Sx, int Sy) 22 | { 23 | int M = OUT_LEN(t->M, H, Sy); 24 | int N = OUT_LEN(t->N, W, Sx); 25 | return cuftens_init(t->D, M, N, W*H*t->L); 26 | } 27 | 28 | cuftens cuftens_lower_cufmem(cuftens *t, int W, int H, int Sx, int Sy) 29 | { 30 | int M = OUT_LEN(t->M, H, Sy); 31 | int N = OUT_LEN(t->N, W, Sx); 32 | cuftens out = cuftens_empty(t->D, M, N, 
W*H*t->L); 33 | out.data = cufmem_reserve(out.bytes); 34 | return out; 35 | } 36 | 37 | void cuftens_free(cuftens *t) 38 | { 39 | if (t->data) {cudaFree(t->data); t->data=NULL;} 40 | } 41 | 42 | cuftens cuftens_zeros(int D, int M, int N, int L) 43 | { 44 | cuftens t = cuftens_init(D, M, N, L); 45 | cudaMemset(t.data, 0, t.bytes); 46 | return t; 47 | } 48 | 49 | cuftens cuftens_ones(int D, int M, int N, int L) 50 | { 51 | cuftens t = cuftens_init(D, M, N, L); 52 | cuset(&t, 1.0); 53 | return t; 54 | } 55 | 56 | cuftens cuftens_rand(int D, int M, int N, int L) 57 | { 58 | cuftens t = cuftens_init(D, M, N, L); 59 | ftens tmp = ftens_rand(D, M, N, L); 60 | cudaMemcpy(t.data, tmp.data, t.bytes, cuHtoD); 61 | ftens_free(&tmp); 62 | return t; 63 | } 64 | 65 | cuftens cuftens_rand(int D, int M, int N, int L, float min, float max) 66 | { 67 | cuftens t = cuftens_init(D, M, N, L); 68 | ftens tmp = ftens_rand_range(D, M, N, L, min, max); 69 | cudaMemcpy(t.data, tmp.data, t.bytes, cuHtoD); 70 | ftens_free(&tmp); 71 | return t; 72 | } 73 | 74 | cuftens cuftens_from_cufmem(int D, int M, int N, int L) 75 | { 76 | cuftens t = cuftens_empty(D, M, N, L); 77 | t.data = cufmem_reserve(t.bytes); 78 | return t; 79 | } 80 | 81 | cuftens cuftens_convert(ftens *t) 82 | { 83 | cuftens out = cuftens_init(t->D, t->M, t->N, t->L); 84 | cudaMemcpy(out.data, t->data, t->bytes, cuHtoD); 85 | return out; 86 | } 87 | 88 | void cuftens_reshape(cuftens *t, int D, int M, int N, int L) 89 | { 90 | const int len = cuftens_len(t); 91 | cuASSERT(len == D*M*N*L, "err: cuftens reshape\n"); 92 | t->D=D; t->M=M; t->N=N; t->L=L; 93 | } 94 | 95 | cuftens cuftens_copy(cuftens *t) 96 | { 97 | cuASSERT(t->data, "err\n"); 98 | cuftens out = cuftens_init(t->D, t->M, t->N, t->L); 99 | cudaMemcpy(out.data, t->data, t->bytes, cuDtoD); 100 | return out; 101 | } 102 | 103 | void cuftens_copy(cuftens *src, cuftens *dst) 104 | { 105 | cuASSERT(src->data && dst->data, "err\n"); 106 | cucopy(src, dst); 107 | } 108 | 109 | 
void cuftens_pad(cuftens *src, cuftens *dst, int p) 110 | { 111 | cuASSERT(dst->data && dst->data, "err\n"); 112 | cupad(src, dst, p); 113 | } 114 | 115 | cuftens cuftens_pad(cuftens *t, const int p) 116 | { 117 | const int D=t->D, M=t->M, N=t->N, L=t->L; 118 | cuftens out = cuftens_zeros(D, PAD(M,p), PAD(N,p), L); 119 | cupad(t, &out, p); 120 | return out; 121 | } 122 | 123 | cuftens cuftens_round_up(ftens *t, int n) 124 | { 125 | cuftens tmp = cuftens_convert(t); 126 | cuftens out = cuftens_round_up(&tmp, n); 127 | cuftens_free(&tmp); 128 | return out; 129 | } 130 | 131 | cuftens cuftens_round_up(cuftens *t, int n) 132 | { 133 | int D=t->D, M=t->M, N=t->N, L=t->L; 134 | if (L > 1) L = ROUND_UP(L, n); 135 | else N = ROUND_UP(N, n); 136 | cuftens out = cuftens_zeros(D, M, N, L); 137 | cucopy(t, &out); 138 | return out; 139 | } 140 | 141 | cuftens cuftens_round_up_cufmem(cuftens *t, int n) 142 | { 143 | int D=t->D, M=t->M, N=t->N, L=t->L; 144 | if (L > 1) L = ROUND_UP(L, n); 145 | else N = ROUND_UP(N, n); 146 | cuftens out = cuftens_from_cufmem(D, M, N, L); 147 | cudaMemset(out.data, 0, out.bytes); 148 | cucopy(t, &out); 149 | return out; 150 | } 151 | 152 | ftens cuftens_dump(cuftens *t) 153 | { 154 | int D=t->D, M=t->M, N=t->N, L=t->L; 155 | cuASSERT(t->data, "err\n"); 156 | ftens out = ftens_init(D, M, N, L); 157 | cudaMemcpy(out.data, t->data, t->bytes, cuDtoH); 158 | return out; 159 | } 160 | 161 | void cuftens_print(cuftens *t) 162 | { 163 | cuftens_print(t, "%.2f "); 164 | } 165 | 166 | void cuftens_print(cuftens *t, const char *fmt) 167 | { 168 | if (!t->data) {printf("tens null\n"); return;} 169 | ftens tmp = cuftens_dump(t); 170 | ftens_print(&tmp, fmt); 171 | ftens_free(&tmp); 172 | } 173 | 174 | void cuftens_print_ch(cuftens *t, int b, int ch, int I, int J, 175 | const char *fmt) 176 | { 177 | ftens tmp = cuftens_dump(t); 178 | ftens_print_ch(&tmp, b, ch, I, J, fmt); 179 | ftens_free(&tmp); 180 | } 181 | 182 | void cuftens_print_shape(cuftens *t) 183 | { 
184 | printf("cuftens: %d %d %d %d\n", t->D, t->M, t->N, t->L); 185 | } 186 | -------------------------------------------------------------------------------- /src/timez.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "timez.h" 3 | 4 | 5 | void time_record(myTime *time) 6 | { 7 | gettimeofday(time, NULL); 8 | } 9 | 10 | 11 | long elapsed_time(myTime start, myTime stop) 12 | { 13 | long sec = stop.tv_sec - start.tv_sec; 14 | long usec = stop.tv_usec - start.tv_usec; 15 | usec += (sec * 1e6); 16 | printf("%ld us\t", usec); 17 | return usec; 18 | } 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # test 2 | set(ESP_LIBS espresso m ${BLAS_LIBRARIES}) 3 | set(CUESP_LIBS cuespresso ${CUDA_LIBRARIES}) 4 | 5 | add_executable(t1 t1.c) 6 | target_link_libraries(t1 ${ESP_LIBS}) 7 | 8 | add_executable(mnist mnist.c) 9 | target_link_libraries(mnist ${ESP_LIBS}) 10 | 11 | add_executable(cifar cifar.c) 12 | target_link_libraries(cifar ${ESP_LIBS}) 13 | 14 | if (CUDA_FOUND) 15 | cuda_add_executable(cut1 t1.cu) 16 | target_link_libraries(cut1 ${ESP_LIBS} ${CUESP_LIBS}) 17 | 18 | cuda_add_executable(cup_mnist cup_mnist.cu) 19 | target_link_libraries(cup_mnist ${ESP_LIBS} ${CUESP_LIBS}) 20 | endif () 21 | -------------------------------------------------------------------------------- /test/cup_mnist.cu: -------------------------------------------------------------------------------- 1 | #include "cuesp.h" 2 | 3 | 4 | int main(int argc, char *argv[]) 5 | { 6 | cufmem_alloc(BYTES(float, 4096*28*28)); 7 | cupmem_alloc(BYTES(uint64_t, 4096*28*28/64)); 8 | 9 | mlp tmp = mlp_load("mnist_params", 1); 10 | cupmlp nn = cupmlp_convert(&tmp); 11 | mlp_free(&tmp); 12 | 13 | // batch size = 1 14 | ftens img = ftens_init(1, 28, 28, 1); 15 | ftens lab = ftens_init(1, 1, 10, 1); 16 
| mnist_load_X("mnist_test", 0, 1, &img); 17 | mnist_load_y("mnist_lab", 0, 1, &lab); 18 | 19 | // forward of 1 image 20 | cuinputLayer *il = &nn.il; 21 | cupdenseLayer *dl = nn.dl; 22 | cubnormLayer *bnl = nn.bnl; 23 | 24 | cuinputLayer_forward(&img, il, 0); 25 | 26 | cupdenseLayer_forward_initial(&il->out, dl, 128); 27 | cubnormLayer_forward(&dl->out, bnl, 0); 28 | cupsignAct_forward(&dl->out, &dl->pout); 29 | 30 | dl++; bnl++; 31 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 32 | cubnormLayer_forward(&dl->out, bnl, 0); 33 | cupsignAct_forward(&dl->out, &dl->pout); 34 | 35 | dl++; bnl++; 36 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 37 | cubnormLayer_forward(&dl->out, bnl, 0); 38 | cupsignAct_forward(&dl->out, &dl->pout); 39 | 40 | dl++; bnl++; 41 | cupdenseLayer_forward(&(dl-1)->pout, dl, 0); 42 | cubnormLayer_forward(&dl->out, bnl, 0); 43 | 44 | cudaDeviceSynchronize(); 45 | cuftens_print(&dl->out); 46 | 47 | cupmlp_free(&nn); 48 | cufmem_free(); 49 | cupmem_free(); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /toEspresso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """" 5 | Usage: toEspresso.py 6 | 7 | Options: 8 | -h --help show this message 9 | """ 10 | 11 | import os 12 | import numpy as np 13 | import struct 14 | import json 15 | 16 | INPUT = 0 17 | DENSE = 1 18 | BNORM = 2 19 | CONV = 3 20 | POOL = 4 21 | NUM = 1<<4 22 | DATA = 2<<4 23 | 24 | 25 | def read_param(params, n): 26 | return params['arr_%d' % n].astype(np.float32) 27 | 28 | 29 | def main(data_file, desc_file, out_file): 30 | assert(os.path.exists(desc_file) and os.path.exists(data_file)) 31 | desc = json.load(open(desc_file, 'r')) 32 | params, n = np.load(data_file), 0 33 | with open(out_file, 'wb') as out: 34 | for elem in desc: 35 | tmp = '' 36 | 37 | if elem['type'] == "ndense": 38 | N = elem['val'] 39 | print 'ndense: 
%d' % N 40 | tmp = struct.pack('B', DENSE | NUM) + \ 41 | struct.pack('i', N) 42 | 43 | elif elem['type'] == "nbnorm": 44 | N = elem['val'] 45 | print 'nbnorm: %d' % N 46 | tmp = struct.pack('B', BNORM | NUM) + \ 47 | struct.pack('i', N) 48 | 49 | elif elem['type'] == "nconv": 50 | N = elem['val'] 51 | print 'nconv: %d' % N 52 | tmp = struct.pack('B', CONV | NUM) + \ 53 | struct.pack('i', N) 54 | 55 | elif elem['type'] == "npool": 56 | N = elem['val'] 57 | print 'npool: %d' % N 58 | tmp = struct.pack('B', POOL | NUM) + \ 59 | struct.pack('i', N) 60 | 61 | elif elem['type'] == 'input': 62 | tmp = struct.pack('B', INPUT | DATA) 63 | dim = elem['dim'] 64 | print 'input: %d %d %d' % tuple(dim) 65 | #tmp += struct.pack('3i', *dim) 66 | 67 | elif elem['type'] == 'dense': 68 | M, N = elem['dim'] 69 | print 'dense: %d %d' % (M, N) 70 | tmp = struct.pack('B', DENSE | DATA) + \ 71 | struct.pack('2i', M, N) 72 | W = read_param(params, n).T 73 | b = read_param(params, n + 1) 74 | tmp += W.tostring('C') + b.tostring('C') 75 | n += 2 76 | 77 | elif elem['type'] == 'bnorm': 78 | N = elem['dim'] 79 | print 'bnorm: %d' % N 80 | tmp = struct.pack('B', BNORM | DATA) + \ 81 | struct.pack('i', N) 82 | for i in range(4): 83 | tmp += read_param(params, n+i).tostring('C') 84 | n += 4 85 | 86 | elif elem['type'] == 'conv': 87 | dim = elem['dim'] 88 | print 'conv: %d %d %d %d %d %d %d' % tuple(dim) 89 | tmp = struct.pack('B', CONV | DATA) + \ 90 | struct.pack('7i', *dim) 91 | H = read_param(params, n) 92 | b = read_param(params, n + 1) 93 | tmp += H.tostring('C') 94 | tmp += b.tostring('C') 95 | n += 2 96 | 97 | elif elem['type'] == 'pool': 98 | dim = elem['dim'] 99 | print 'max pool: %d %d %d %d' % tuple(dim) 100 | tmp = struct.pack('B', POOL | DATA) + \ 101 | struct.pack('4i', *dim) 102 | else: 103 | pass 104 | 105 | print len(tmp) 106 | out.write(tmp) 107 | 108 | 109 | if __name__ == '__main__': 110 | from docopt import docopt 111 | 112 | args = docopt(__doc__) 113 | data, desc, out = 
args[''], args[''], args[''] 114 | 115 | main(data, desc, out) 116 | --------------------------------------------------------------------------------