├── .editorconfig
├── Bender.yml
├── LICENSE.hw
├── LICENSE.sw
├── README.md
├── rtl
    ├── accumulator
    │   ├── ne16_accumulator_normquant.sv
    │   ├── ne16_accumulator_scm.sv
    │   ├── ne16_accumulator_scm_test_wrap.sv
    │   ├── ne16_normquant.sv
    │   ├── ne16_normquant_bias.sv
    │   ├── ne16_normquant_multiplier.sv
    │   └── ne16_normquant_shifter.sv
    ├── array
    │   ├── ne16_binconv_array.sv
    │   ├── ne16_binconv_block.sv
    │   ├── ne16_binconv_column.sv
    │   └── ne16_scale.sv
    ├── ctrl
    │   ├── ne16_ctrl.sv
    │   └── ne16_ctrl_fsm.sv
    ├── input_buffer
    │   ├── ne16_input_buffer.sv
    │   ├── ne16_input_buffer_scm.sv
    │   └── ne16_input_buffer_scm_test_wrap.sv
    ├── ne16_engine.sv
    ├── ne16_package.sv
    ├── ne16_streamer.sv
    ├── ne16_top.sv
    └── ne16_top_wrap.sv
├── src_files.yml
└── ucode
    ├── code.yml
    ├── code_dw.yml
    ├── uloop_check.py
    ├── uloop_check_dw.py
    ├── uloop_common.py
    ├── uloop_compile.py
    ├── uloop_compile_dw.py
    └── uloop_run.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # top-most EditorConfig file
 2 | root = true
 3 | 
 4 | # Unix-style newlines with a newline ending every file
 5 | [*]
 6 | end_of_line = lf
 7 | insert_final_newline = true
 8 | trim_trailing_whitespace = true
 9 | max_line_length = 100
10 | # 2 space indentation for HDL sources (no spaces in the brace list: they match literally)
11 | [*.{sv,svh,v,vhd}]
12 | indent_style = space
13 | indent_size = 2
14 | 


--------------------------------------------------------------------------------
/Bender.yml:
--------------------------------------------------------------------------------
 1 | package:
 2 |   name: ne16
 3 |   authors:
 4 |     - "Francesco Conti <francesco.conti@greenwaves-technologies.com>"
 5 | 
 6 | dependencies:
 7 |   hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.6 }
 8 |   hci:         { git: "https://github.com/pulp-platform/hci.git", version: 1.0.6 }
 9 |   hwpe-ctrl:   { git: "https://github.com/pulp-platform/hwpe-ctrl.git", version: 1.6 }
10 | 
11 | sources:
12 |   - rtl/ne16_package.sv
13 |   - rtl/accumulator/ne16_accumulator_scm_test_wrap.sv
14 |   - rtl/input_buffer/ne16_input_buffer_scm_test_wrap.sv
15 |   - rtl/accumulator/ne16_accumulator_scm.sv
16 |   - rtl/accumulator/ne16_accumulator_normquant.sv
17 |   - rtl/accumulator/ne16_normquant.sv
18 |   - rtl/accumulator/ne16_normquant_shifter.sv
19 |   - rtl/accumulator/ne16_normquant_bias.sv
20 |   - rtl/accumulator/ne16_normquant_multiplier.sv
21 |   - rtl/input_buffer/ne16_input_buffer_scm.sv
22 |   - rtl/input_buffer/ne16_input_buffer.sv
23 |   - rtl/array/ne16_scale.sv
24 |   - rtl/array/ne16_binconv_block.sv
25 |   - rtl/array/ne16_binconv_column.sv
26 |   - rtl/array/ne16_binconv_array.sv
27 |   - rtl/ctrl/ne16_ctrl_fsm.sv
28 |   - rtl/ctrl/ne16_ctrl.sv
29 |   - rtl/ne16_engine.sv
30 |   - rtl/ne16_streamer.sv
31 |   - rtl/ne16_top.sv
32 |   - rtl/ne16_top_wrap.sv


--------------------------------------------------------------------------------
/LICENSE.hw:
--------------------------------------------------------------------------------
  1 | SOLDERPAD HARDWARE LICENSE version 0.51
  2 | 
  3 | This license is based closely on the Apache License Version 2.0, but is not
  4 | approved or endorsed by the Apache Foundation. A copy of the non-modified
  5 | Apache License 2.0 can be found at http://www.apache.org/licenses/LICENSE-2.0.
  6 | 
  7 | As this license is not currently OSI or FSF approved, the Licensor permits any
  8 | Work licensed under this License, at the option of the Licensee, to be treated
  9 | as licensed under the Apache License Version 2.0 (which is so approved).
 10 | 
 11 | This License is licensed under the terms of this License and in particular
 12 | clause 7 below (Disclaimer of Warranties) applies in relation to its use.
 13 | 
 14 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 15 | 
 16 | 1. Definitions.
 17 | 
 18 | “License” shall mean the terms and conditions for use, reproduction, and
 19 | distribution as defined by Sections 1 through 9 of this document.
 20 | 
 21 | “Licensor” shall mean the Rights owner or entity authorized by the Rights owner
 22 | that is granting the License.
 23 | 
 24 | “Legal Entity” shall mean the union of the acting entity and all other entities
 25 | that control, are controlled by, or are under common control with that entity.
 26 | For the purposes of this definition, “control” means (i) the power, direct or
 27 | indirect, to cause the direction or management of such entity, whether by
 28 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
 29 | outstanding shares, or (iii) beneficial ownership of such entity.
 30 | 
 31 | “You” (or “Your”) shall mean an individual or Legal Entity exercising
 32 | permissions granted by this License.
 33 | 
 34 | “Rights” means copyright and any similar right including design right (whether
 35 | registered or unregistered), semiconductor topography (mask) rights and
 36 | database rights (but excluding Patents and Trademarks).
 37 | 
 38 | “Source” form shall mean the preferred form for making modifications, including
 39 | but not limited to source code, net lists, board layouts, CAD files,
 40 | documentation source, and configuration files.
 41 | 
 42 | “Object” form shall mean any form resulting from mechanical transformation or
 43 | translation of a Source form, including but not limited to compiled object
 44 | code, generated documentation, the instantiation of a hardware design and
 45 | conversions to other media types, including intermediate forms such as
 46 | bytecodes, FPGA bitstreams, artwork and semiconductor topographies (mask
 47 | works).
 48 | 
 49 | “Work” shall mean the work of authorship, whether in Source form or other
 50 | Object form, made available under the License, as indicated by a Rights notice
 51 | that is included in or attached to the work (an example is provided in the
 52 | Appendix below).
 53 | 
 54 | “Derivative Works” shall mean any work, whether in Source or Object form, that
 55 | is based on (or derived from) the Work and for which the editorial revisions,
 56 | annotations, elaborations, or other modifications represent, as a whole, an
 57 | original work of authorship. For the purposes of this License, Derivative Works
 58 | shall not include works that remain separable from, or merely link (or bind by
 59 | name) or physically connect to or interoperate with the interfaces of, the Work
 60 | and Derivative Works thereof.
 61 | 
 62 | “Contribution” shall mean any design or work of authorship, including the
 63 | original version of the Work and any modifications or additions to that Work or
 64 | Derivative Works thereof, that is intentionally submitted to Licensor for
 65 | inclusion in the Work by the Rights owner or by an individual or Legal Entity
 66 | authorized to submit on behalf of the Rights owner. For the purposes of this
 67 | definition, “submitted” means any form of electronic, verbal, or written
 68 | communication sent to the Licensor or its representatives, including but not
 69 | limited to communication on electronic mailing lists, source code control
 70 | systems, and issue tracking systems that are managed by, or on behalf of, the
 71 | Licensor for the purpose of discussing and improving the Work, but excluding
 72 | communication that is conspicuously marked or otherwise designated in writing
 73 | by the Rights owner as “Not a Contribution.”
 74 | 
 75 | “Contributor” shall mean Licensor and any individual or Legal Entity on behalf
 76 | of whom a Contribution has been received by Licensor and subsequently
 77 | incorporated within the Work.
 78 | 
 79 | 2. Grant of License. Subject to the terms and conditions of this License, each
 80 | Contributor hereby grants to You a perpetual, worldwide, non-exclusive,
 81 | no-charge, royalty-free, irrevocable license under the Rights to reproduce,
 82 | prepare Derivative Works of, publicly display, publicly perform, sublicense,
 83 | and distribute the Work and such Derivative Works in Source or Object form and
 84 | do anything in relation to the Work as if the Rights did not exist.
 85 | 
 86 | 3. Grant of Patent License. Subject to the terms and conditions of this
 87 | License, each Contributor hereby grants to You a perpetual, worldwide,
 88 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this
 89 | section) patent license to make, have made, use, offer to sell, sell, import,
 90 | and otherwise transfer the Work, where such license applies only to those
 91 | patent claims licensable by such Contributor that are necessarily infringed by
 92 | their Contribution(s) alone or by combination of their Contribution(s) with the
 93 | Work to which such Contribution(s) was submitted. If You institute patent
 94 | litigation against any entity (including a cross-claim or counterclaim in a
 95 | lawsuit) alleging that the Work or a Contribution incorporated within the Work
 96 | constitutes direct or contributory patent infringement, then any patent
 97 | licenses granted to You under this License for that Work shall terminate as of
 98 | the date such litigation is filed.
 99 | 
100 | 4. Redistribution. You may reproduce and distribute copies of the Work or
101 | Derivative Works thereof in any medium, with or without modifications, and in
102 | Source or Object form, provided that You meet the following conditions:
103 | 
104 |     You must give any other recipients of the Work or Derivative Works a copy
105 |     of this License; and
106 | 
107 |     You must cause any modified files to carry prominent notices stating that
108 |     You changed the files; and
109 | 
110 |     You must retain, in the Source form of any Derivative Works that You
111 |     distribute, all copyright, patent, trademark, and attribution notices from
112 |     the Source form of the Work, excluding those notices that do not pertain to
113 |     any part of the Derivative Works; and
114 | 
115 |     If the Work includes a “NOTICE” text file as part of its distribution, then
116 |     any Derivative Works that You distribute must include a readable copy of
117 |     the attribution notices contained within such NOTICE file, excluding those
118 |     notices that do not pertain to any part of the Derivative Works, in at
119 |     least one of the following places: within a NOTICE text file distributed as
120 |     part of the Derivative Works; within the Source form or documentation, if
121 |     provided along with the Derivative Works; or, within a display generated by
122 |     the Derivative Works, if and wherever such third-party notices normally
123 |     appear. The contents of the NOTICE file are for informational purposes only
124 |     and do not modify the License. You may add Your own attribution notices
125 |     within Derivative Works that You distribute, alongside or as an addendum to
126 |     the NOTICE text from the Work, provided that such additional attribution
127 |     notices cannot be construed as modifying the License. You may add Your own
128 |     copyright statement to Your modifications and may provide additional or
129 |     different license terms and conditions for use, reproduction, or
130 |     distribution of Your modifications, or for any such Derivative Works as a
131 |     whole, provided Your use, reproduction, and distribution of the Work
132 |     otherwise complies with the conditions stated in this License.
133 | 
134 | 5. Submission of Contributions. Unless You explicitly state otherwise, any
135 | Contribution intentionally submitted for inclusion in the Work by You to the
136 | Licensor shall be under the terms and conditions of this License, without any
137 | additional terms or conditions. Notwithstanding the above, nothing herein shall
138 | supersede or modify the terms of any separate license agreement you may have
139 | executed with Licensor regarding such Contributions.
140 | 
141 | 6. Trademarks. This License does not grant permission to use the trade names,
142 | trademarks, service marks, or product names of the Licensor, except as required
143 | for reasonable and customary use in describing the origin of the Work and
144 | reproducing the content of the NOTICE file.
145 | 
146 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in
147 | writing, Licensor provides the Work (and each Contributor provides its
148 | Contributions) on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
149 | KIND, either express or implied, including, without limitation, any warranties
150 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 | PARTICULAR PURPOSE. You are solely responsible for determining the
152 | appropriateness of using or redistributing the Work and assume any risks
153 | associated with Your exercise of permissions under this License.
154 | 
155 | 8. Limitation of Liability. In no event and under no legal theory, whether in
156 | tort (including negligence), contract, or otherwise, unless required by
157 | applicable law (such as deliberate and grossly negligent acts) or agreed to in
158 | writing, shall any Contributor be liable to You for damages, including any
159 | direct, indirect, special, incidental, or consequential damages of any
160 | character arising as a result of this License or out of the use or inability to
161 | use the Work (including but not limited to damages for loss of goodwill, work
162 | stoppage, computer failure or malfunction, or any and all other commercial
163 | damages or losses), even if such Contributor has been advised of the
164 | possibility of such damages.
165 | 
166 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or
167 | Derivative Works thereof, You may choose to offer, and charge a fee for,
168 | acceptance of support, warranty, indemnity, or other liability obligations
169 | and/or rights consistent with this License. However, in accepting such
170 | obligations, You may act only on Your own behalf and on Your sole
171 | responsibility, not on behalf of any other Contributor, and only if You agree
172 | to indemnify, defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason of your
174 | accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 


--------------------------------------------------------------------------------
/LICENSE.sw:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Neural Engine 16-channels
 2 | The Neural Engine 16-channels (NE16) is a Deep Neural Network accelerator which uses Hardware Processing Engine (HWPE) concepts [1] and is designed to be integrated in a PULPOpen cluster configuration in combination with the Heterogeneous Cluster Interconnect (HCI). It makes use of the open-source IPs 'hci', 'hwpe-ctrl', and 'hwpe-stream'.
 3 | 
 4 | In general, the NE16 has built-in HW support for the following features:
 5 | 
 6 | - Filters: 1x1, 3x3, depthwise, linear
 7 | - Batch normalization
 8 | - ReLU
 9 | - Activation input bits: 8,16
10 | - Weight bits: 2,3,4,5,6,7,8
11 | - Activation output bits: 8,16,32
12 | - Nr of input channels: arbitrary
13 | - Nr of output channels: arbitrary
14 | 
15 | The NE16 is a direct derivative of the Reconfigurable Binary Engine (RBE) design https://github.com/pulp-platform/rbe by Gianna Paulin (ETH Zürich) and Francesco Conti (University of Bologna).
16 | 
17 | ## Contributors
18 |  - Francesco Conti, University of Bologna and GreenWaves Technologies (*f.conti@unibo.it*)
19 | 
20 | ## Acknowledgement
21 | The development of NE16 has been funded by GreenWaves Technologies, SAS.
22 | 
23 | # License
24 | This repository makes use of two licenses:
25 | - for all *software*: Apache License Version 2.0
26 | - for all *hardware*: Solderpad Hardware License Version 0.51
27 | 
28 | For further information have a look at the license files: `LICENSE.hw`, `LICENSE.sw`
29 | 
30 | # References
31 | [1] F. Conti, P. Schiavone, and L. Benini. "XNOR neural engine: A hardware accelerator IP for 21.6-fJ/op binary neural network inference." IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 37.11 (2018): 2940-2951.
32 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_accumulator_scm.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_accumulator_scm.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | module ne16_accumulator_scm // latch-based accumulator storage with single-word, wide and broadcast writes
 23 | #(
 24 |   parameter int unsigned ADDR_WIDTH   = 5,
 25 |   parameter int unsigned DATA_WIDTH   = 32,
 26 |   parameter int unsigned NUM_WORDS    = 2**ADDR_WIDTH,
 27 |   parameter int unsigned WIDTH_FACTOR = 4 // number of words covered by one wide access
 28 | )
 29 | (
 30 |   input  logic                               clk_i,
 31 |   input  logic                               rst_ni,
 32 |   input  logic                               clear_i, // synchronous clear of sampling registers; also zeroes every stored word
 33 |   input  logic                               test_mode_i, // forwarded to the test_en_i pin of all clock-gating cells
 34 |   input  logic [WIDTH_FACTOR-1:0]            wide_enable_i, // per-lane write mask in wide mode; any bit set also enables the wide read lanes
 35 | 
 36 |   // Read port
 37 |   input  logic                               re_i,
 38 |   input  logic [ADDR_WIDTH-1:0]              raddr_i,
 39 |   output logic [DATA_WIDTH-1:0]              rdata_o, // low DATA_WIDTH bits of rdata_wide_o
 40 |   output logic [WIDTH_FACTOR*DATA_WIDTH-1:0] rdata_wide_o,
 41 | 
 42 |   // Write port
 43 |   input  logic                               we_i,
 44 |   input  logic                               we_all_i, // broadcast: enables the write clock of every word
 45 |   input  logic [ADDR_WIDTH-1:0]              waddr_i,
 46 |   input  logic [DATA_WIDTH-1:0]              wdata_i,
 47 |   input  logic [WIDTH_FACTOR*DATA_WIDTH-1:0] wdata_wide_i,
 48 | 
 49 |   output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators_o // direct view of all stored words
 50 | );
 51 | 
 52 |   // Storage array, per-word write enables (one-hot, mask or all-ones) and per-word gated clocks
 53 |   logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators;
 54 |   logic [NUM_WORDS-1:0]  waddr_onehot;
 55 |   logic [NUM_WORDS-1:0]  clk_we;
 56 |   // Registered read data (all lanes) and registered write data (one entry per wide lane)
 57 |   logic [WIDTH_FACTOR*DATA_WIDTH-1:0]      rdata_q;
 58 |   logic [WIDTH_FACTOR-1:0][DATA_WIDTH-1:0] wdata_q;
 59 | 
 60 |   logic clk_gated; // driven by i_cg_we_global below; not referenced anywhere else in this module
 61 | 
 62 |   // ========================================================================
 63 |   // CLK GATE
 64 |   // ========================================================================
 65 |   cluster_clock_gating i_cg_we_global
 66 |   (
 67 |     .clk_o     ( clk_gated      ),
 68 |     .en_i      ( we_i | clear_i ),
 69 |     .test_en_i ( test_mode_i    ),
 70 |     .clk_i     ( clk_i          )
 71 |   );
 72 | 
 73 |   // ========================================================================
 74 |   // WDATA SAMPLING
 75 |   // ========================================================================
 76 |   // wdata_d[ii] is the value lane ii will store; it is registered into wdata_q[ii] before reaching the latches
 77 |   logic [WIDTH_FACTOR-1:0][DATA_WIDTH-1:0] wdata_d;
 78 |   generate
 79 | 
 80 |     for(genvar ii=0; ii<WIDTH_FACTOR; ii++) begin
 81 | 
 82 |       localparam ii_rem2 = ii % 2;
 83 |       localparam ii_rem4 = ii % 4;
 84 |       localparam ii_rem8 = ii % 8;
 85 |       // mask = one lane -> slice 0; aligned pair -> slice ii%2; aligned group of 4 -> slice ii%4; 8'hff -> slice ii%8; any other mask -> wdata_i. NOTE(review): wide_enable_i is WIDTH_FACTOR bits wide, so patterns with bits above that width can never match.
 86 |       assign wdata_d[ii] = (wide_enable_i == 8'h1) || (wide_enable_i == 8'h2) || (wide_enable_i == 8'h4) || (wide_enable_i == 8'h8) || (wide_enable_i == 8'h10) || (wide_enable_i == 8'h20) || (wide_enable_i == 8'h40) || (wide_enable_i == 8'h80) ? wdata_wide_i[DATA_WIDTH-1:0] :
 87 |                            (wide_enable_i == 8'h3) || (wide_enable_i == 8'hc) || (wide_enable_i == 8'h30) || (wide_enable_i == 8'hc0) ? wdata_wide_i[(ii_rem2+1)*DATA_WIDTH-1:ii_rem2*DATA_WIDTH] :
 88 |                            (wide_enable_i == 8'hf) || (wide_enable_i == 8'hf0) ? wdata_wide_i[(ii_rem4+1)*DATA_WIDTH-1:ii_rem4*DATA_WIDTH] :
 89 |                            (wide_enable_i == 8'hff) ? wdata_wide_i[(ii_rem8+1)*DATA_WIDTH-1:ii_rem8*DATA_WIDTH] : wdata_i;
 90 |       // sample the selected write data on any write (single-word/wide via we_i, or broadcast via we_all_i)
 91 |       always_ff @(posedge clk_i or negedge rst_ni)
 92 |       begin
 93 |         if(~rst_ni)
 94 |           wdata_q[ii] <= '0;
 95 |         else if(clear_i)
 96 |           wdata_q[ii] <= '0;
 97 |         else if(we_i | we_all_i)
 98 |           wdata_q[ii] <= wdata_d[ii];
 99 |       end
100 |     end
101 |   endgenerate
102 | 
103 |   // ========================================================================
104 |   // SCM (LATCHES)
105 |   // ========================================================================
106 | 
107 |   // registered read: lane 0 captures word raddr_i whenever re_i is set
108 |   generate
109 |     always_ff @(posedge clk_i or negedge rst_ni)
110 |     begin
111 |       if(~rst_ni)
112 |         rdata_q[DATA_WIDTH-1:0] <= '0;
113 |       else if(clear_i)
114 |         rdata_q[DATA_WIDTH-1:0] <= '0;
115 |       else if(re_i) begin
116 |         rdata_q[DATA_WIDTH-1:0] <= accumulators[raddr_i];
117 |       end
118 |     end
119 |     for(genvar ii=1; ii<WIDTH_FACTOR; ii++) begin
120 |       // lane ii captures word raddr_i+ii (sum truncated to ADDR_WIDTH bits), only while some wide_enable_i bit is set
121 |       logic [ADDR_WIDTH-1:0] raddr_wide;
122 |       assign raddr_wide = raddr_i + ii;
123 | 
124 |       always_ff @(posedge clk_i or negedge rst_ni)
125 |       begin
126 |         if(~rst_ni)
127 |           rdata_q[(ii+1)*DATA_WIDTH-1:ii*DATA_WIDTH] <= '0;
128 |         else if(clear_i)
129 |           rdata_q[(ii+1)*DATA_WIDTH-1:ii*DATA_WIDTH] <= '0;
130 |         else if(re_i & (|wide_enable_i)) begin
131 |           rdata_q[(ii+1)*DATA_WIDTH-1:ii*DATA_WIDTH] <= accumulators[raddr_wide];
132 |         end
133 |       end
134 |     end
135 |   endgenerate
136 | 
137 |   assign rdata_o      = rdata_q[DATA_WIDTH-1:0];
138 |   assign rdata_wide_o = rdata_q;
139 | 
140 |   // decode
141 | 
142 |   // three modes:
143 |   //  - broadcast: all waddr_onehot are enabled
144 |   //  - normal:    the waddr_onehot is the decoded version of waddr_i
145 |   //  - wide:      decoded on wide words, masked by wide_enable
146 | 
147 |   logic [NUM_WORDS-1:0] waddr_decoded_normal;
148 |   logic [NUM_WORDS-1:0] waddr_decoded_wide;
149 | 
150 |   generate
151 |     for(genvar ii=0; ii<NUM_WORDS/WIDTH_FACTOR; ii++) begin : WADDR_DECODE_ITER_1
152 |       // idx_ii: address of the first word of wide group ii
153 |       logic [ADDR_WIDTH-1:0] idx_ii;
154 |       assign idx_ii = ii*WIDTH_FACTOR;
155 | 
156 |       for(genvar jj=0; jj<WIDTH_FACTOR; jj++) begin : WADDR_DECODE_ITER_0
157 |         // idx_ii_jj: address of word jj inside group ii
158 |         logic [ADDR_WIDTH-1:0] idx_ii_jj;
159 |         assign idx_ii_jj = ii*WIDTH_FACTOR+jj;
160 | 
161 |         always_comb
162 |         begin : waddr_decoding
163 |           if((we_i==1'b1) && (waddr_i == idx_ii_jj))
164 |             waddr_decoded_normal[ii*WIDTH_FACTOR+jj] = 1'b1;
165 |           else
166 |             waddr_decoded_normal[ii*WIDTH_FACTOR+jj] = 1'b0;
167 |         end
168 | 
169 |       end
170 |       // wide decode: a group is selected only when waddr_i equals its base address; its enables are then wide_enable_i
171 |       assign waddr_decoded_wide[(ii+1)*WIDTH_FACTOR-1:ii*WIDTH_FACTOR] = (we_i==1'b1) && (waddr_i == idx_ii) ? wide_enable_i : '0;
172 | 
173 |     end
174 |   endgenerate
175 |   // priority: clear / broadcast (all words) > wide (any wide_enable_i bit set) > normal one-hot
176 |   assign waddr_onehot = clear_i | we_all_i ? '1 :
177 |                         (|(wide_enable_i)) ? waddr_decoded_wide :
178 |                                              waddr_decoded_normal;
179 | 
180 |   // generate one clock-gating cell for each register element
181 |   generate
182 |     for(genvar ii=0; ii<NUM_WORDS; ii++) begin : CG_CELL_WORD_ITER
183 | 
184 |       cluster_clock_gating i_cg
185 |       (
186 |         .clk_o     ( clk_we[ii]       ),
187 |         .en_i      ( waddr_onehot[ii] ),
188 |         .test_en_i ( test_mode_i      ),
189 |         .clk_i     ( clk_i            )
190 |       );
191 | 
192 |     end
193 |   endgenerate
194 | 
195 |   generate
196 | 
197 |     for(genvar ii=0; ii<NUM_WORDS/WIDTH_FACTOR; ii++) begin : LATCH_ITER_1
198 |       for(genvar jj=0; jj<WIDTH_FACTOR; jj++) begin : LATCH_ITER_0
199 |         // word (ii,jj) is transparent while its gated clock is high; it loads zero under clear_i, else lane jj of wdata_q
200 |         always_latch
201 |         begin : latch_wdata
202 |           if( clk_we[ii*WIDTH_FACTOR+jj] ) begin
203 |             accumulators[ii*WIDTH_FACTOR+jj] = clear_i ? '0 : wdata_q[jj];
204 |           end
205 |         end
206 | 
207 |       end
208 |     end
209 | 
210 |   endgenerate
211 | 
212 |   assign accumulators_o = accumulators;
213 | 
214 | endmodule // ne16_accumulator_scm
215 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_accumulator_scm_test_wrap.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_accumulator_scm_test_wrap.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | module ne16_accumulator_scm_test_wrap // wraps ne16_accumulator_scm and muxes its single-word ports between functional and BIST access
 23 | #(
 24 |   parameter int unsigned ADDR_WIDTH   = 5,
 25 |   parameter int unsigned DATA_WIDTH   = 32,
 26 |   parameter int unsigned NUM_WORDS    = 2**ADDR_WIDTH,
 27 |   parameter int unsigned WIDTH_FACTOR = 4
 28 | )
 29 | (
 30 |   input  logic                               clk_i,
 31 |   input  logic                               rst_ni,
 32 |   input  logic                               clear_i,
 33 |   input  logic                               test_mode_i,
 34 |   input  logic [WIDTH_FACTOR-1:0]            wide_enable_i,
 35 | 
 36 |   // Read port
 37 |   input  logic                               re_i,
 38 |   input  logic [ADDR_WIDTH-1:0]              raddr_i,
 39 |   output logic [DATA_WIDTH-1:0]              rdata_o,
 40 |   output logic [WIDTH_FACTOR*DATA_WIDTH-1:0] rdata_wide_o,
 41 | 
 42 |   // Write port
 43 |   input  logic                               we_i,
 44 |   input  logic                               we_all_i,
 45 |   input  logic [ADDR_WIDTH-1:0]              waddr_i,
 46 |   input  logic [DATA_WIDTH-1:0]              wdata_i,
 47 |   input  logic [WIDTH_FACTOR*DATA_WIDTH-1:0] wdata_wide_i,
 48 | 
 49 |   output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators_o,
 50 | 
 51 |   // BIST ENABLE
 52 |   input  logic                                BIST, // 1: the *_T ports below drive the memory; 0: the functional ports do
 53 |   //BIST ports
 54 |   input  logic                                CSN_T, // access happens only when low (see mux below)
 55 |   input  logic                                WEN_T, // low = write, high = read (see mux below)
 56 |   input  logic [ADDR_WIDTH-1:0]               A_T,
 57 |   input  logic [DATA_WIDTH-1:0]               D_T,
 58 |   output logic [DATA_WIDTH-1:0]               Q_T
 59 | );
 60 |    // signals actually driven into the SCM instance, selected by BIST
 61 |    logic                         clear_muxed;
 62 | 
 63 |    logic                         ReadEnable_muxed;
 64 |    logic [ADDR_WIDTH-1:0]        ReadAddr_muxed;
 65 | 
 66 |    logic                         WriteEnable_muxed;
 67 |    logic                         WriteEnable_all_muxed;
 68 |    logic [ADDR_WIDTH-1:0]        WriteAddr_muxed;
 69 |    logic [DATA_WIDTH-1:0]        WriteData_muxed;
 70 |    // functional / BIST multiplexer
 71 |    always_comb
 72 |    begin
 73 |       if(BIST)
 74 |       begin
 75 |          ReadEnable_muxed  = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b1)); // BIST read
 76 |          ReadAddr_muxed    = A_T;
 77 |          clear_muxed       = 1'b0;
 78 |          // clear and broadcast write are forced inactive in BIST mode
 79 |          WriteEnable_all_muxed = 1'b0;
 80 |          WriteEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b0)); // BIST write
 81 |          WriteAddr_muxed   = A_T;
 82 |          WriteData_muxed   = D_T;
 83 |       end
 84 |       else
 85 |       begin
 86 |          ReadEnable_muxed  = re_i;
 87 |          ReadAddr_muxed    = raddr_i;
 88 |          clear_muxed       = clear_i;
 89 | 
 90 |          WriteEnable_all_muxed = we_all_i;
 91 |          WriteEnable_muxed     = we_i;
 92 |          WriteAddr_muxed       = waddr_i;
 93 |          WriteData_muxed       = wdata_i;
 94 |       end
 95 |    end
 96 |    // BIST read data is taken from the single-word read output
 97 |    assign Q_T = rdata_o;
 98 | 
 99 |   ne16_accumulator_scm
100 |   #(
101 |     .ADDR_WIDTH   ( ADDR_WIDTH    ), //= 5,
102 |     .DATA_WIDTH   ( DATA_WIDTH    ), //= 32,
103 |     .NUM_WORDS    ( NUM_WORDS     ), //= 2**ADDR_WIDTH,
104 |     .WIDTH_FACTOR ( WIDTH_FACTOR  )  //= 4
105 |   )
106 |   ne16_accumulator_scm_i
107 |   (
108 |     .clk_i            ( clk_i                 ),
109 |     .rst_ni           ( rst_ni                ),
110 |     .clear_i          ( clear_muxed           ),
111 |     .test_mode_i      ( test_mode_i           ),
112 |     .wide_enable_i    ( wide_enable_i         ), // not muxed: comes from the functional port even when BIST is set
113 | 
114 |     // Read port
115 |     .re_i             ( ReadEnable_muxed      ),
116 |     .raddr_i          ( ReadAddr_muxed        ),
117 |     .rdata_o          ( rdata_o               ),
118 |     .rdata_wide_o     ( rdata_wide_o          ),
119 | 
120 |     // Write port
121 |     .we_i             ( WriteEnable_muxed     ),
122 |     .we_all_i         ( WriteEnable_all_muxed ),
123 |     .waddr_i          ( WriteAddr_muxed       ),
124 |     .wdata_i          ( WriteData_muxed       ),
125 |     .wdata_wide_i     ( wdata_wide_i          ), // not muxed: comes from the functional port even when BIST is set
126 | 
127 |     .accumulators_o   ( accumulators_o        )
128 |   );
129 | 
130 | endmodule // ne16_accumulator_scm_test_wrap
131 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_normquant.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_normquant.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | 
 24 | module ne16_normquant #( // multiplies accumulator words by norm_mult_i slices, recombines the products per norm_mode, then shifts/saturates each lane
 25 |   parameter int unsigned NMULT = 4,
 26 |   parameter int unsigned NMS = ne16_package::NORM_MULT_SIZE,
 27 |   parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE,
 28 |   parameter int unsigned INT = 48,
 29 |   parameter int unsigned QNT = 32,
 30 |   parameter int unsigned PIPE = 1,
 31 |   parameter int unsigned OUTPUT_REGISTER = 0
 32 | ) (
 33 |   // global signals
 34 |   input  logic                          clk_i,
 35 |   input  logic                          rst_ni,
 36 |   input  logic                          test_mode_i,
 37 |   // local clear
 38 |   input  logic                          clear_i,
 39 |   // normalization parameters
 40 |   input  logic unsigned [NMULT*NMS-1:0] norm_mult_i,
 41 |   input  logic unsigned [NMULT*8-1:0]   shift_i,
 42 |   // accumulation
 43 |   input  logic signed   [NMULT*ACC-1:0] accumulator_i,
 44 |   output logic signed   [NMULT*ACC-1:0] accumulator_o,
 45 |   // control channel
 46 |   input  ne16_package::ctrl_normquant_t  ctrl_i,
 47 |   output ne16_package::flags_normquant_t [NMULT-1:0] flags_o // NOTE(review): never driven inside this module
 48 | );
 49 |   // raw multiplier outputs, their INT-bit extensions, and the per-mode recombinations
 50 |   logic signed [NMULT-1  :0][NMS+ACC-1:0]  product;
 51 |   logic signed [NMULT-1  :0][INT-1:0]  product_48b;
 52 |   logic signed [NMULT-1  :0][INT-1:0] product_8b;
 53 |   logic signed [NMULT/2-1:0][INT-1:0] product_16b;
 54 |   logic signed              [INT-1:0] product_32b;
 55 |   logic signed [NMULT-1  :0][INT-1:0] product_to_shift;
 56 |   logic signed [NMULT-1  :0][INT-1:0] rounding; // declared but not used in this module
 57 | 
 58 |   generate
 59 |     for(genvar ii=0; ii<NMULT; ii++) begin : mult_gen
 60 | 
 61 |       localparam ii_div2 = ii / 2;
 62 | 
 63 |       logic sign_bit;
 64 |       logic signed [NMS:0]   norm_mult_signed;
 65 |       logic        [ACC-1:0] accumulator_selected;
 66 |       // accumulator word fed to multiplier ii: word ii (8B), word ii/2 (16B), word 0 (32B), zero otherwise; slices are hard-wired to 32 bits
 67 |       assign accumulator_selected = (ctrl_i.norm_mode == NE16_MODE_8B)  ? accumulator_i [(ii+1)*32-1:ii*32] :
 68 |                                     (ctrl_i.norm_mode == NE16_MODE_16B) ? accumulator_i [(ii_div2+1)*32-1:ii_div2*32] :
 69 |                                     (ctrl_i.norm_mode == NE16_MODE_32B) ? accumulator_i [32-1:0] : '0;
 70 |       // every lane takes its sign from the topmost bit of the whole norm_mult_i vector
 71 |       assign sign_bit = norm_mult_i[NMULT*NMS-1]; // sign is used only in WEIGHTOFFS
 72 |       // multiplier operand: slice ii of norm_mult_i with one extra MSB, set only when norm_signed and sign_bit are both 1
 73 |       assign norm_mult_signed = {ctrl_i.norm_signed & sign_bit, norm_mult_i[(ii+1)*NMS-1:ii*NMS]};
 74 |       ne16_normquant_multiplier #(
 75 |         .NMS  ( NMS  ),
 76 |         .ACC  ( ACC  ),
 77 |         .PIPE ( PIPE )
 78 |       ) i_multiplier (
 79 |         .clk_i              ( clk_i                ),
 80 |         .rst_ni             ( rst_ni               ),
 81 |         .test_mode_i        ( test_mode_i          ),
 82 |         .clear_i            ( clear_i              ),
 83 |         .enable_i           ( 1'b1                 ),
 84 |         .norm_mult_signed_i ( norm_mult_signed     ),
 85 |         .accumulator_i      ( accumulator_selected ),
 86 |         .product_o          ( product [ii]         )
 87 |       );
 88 |     end
 89 | 
 90 |     // FIXME hardwired params (the lines below index products 0..3 and use 48-bit zero padding, i.e. they assume NMULT=4 and INT=48)
 91 |     assign product_48b[0] = $signed(product[0]);
 92 |     assign product_48b[1] = $signed(product[1]);
 93 |     assign product_48b[2] = $signed(product[2]);
 94 |     assign product_48b[3] = $signed(product[3]);
 95 |     assign product_32b    = $signed(product_48b[0] + (product_48b[1] <<< 8) + (product_48b[2] <<< 16) + (product_48b[3] <<< 24)); // four partial products at 8-bit offsets
 96 |     assign product_16b[0] = $signed(product_48b[0] + (product_48b[1] <<< 8)); // lanes 0/1 combined
 97 |     assign product_16b[1] = $signed(product_48b[2] + (product_48b[3] <<< 8)); // lanes 2/3 combined
 98 |     assign product_8b[0]  = $signed(product[0]);
 99 |     assign product_8b[1]  = $signed(product[1]);
100 |     assign product_8b[2]  = $signed(product[2]);
101 |     assign product_8b[3]  = $signed(product[3]);
102 |     assign product_to_shift = (ctrl_i.norm_mode == NE16_MODE_8B)  ? product_8b : // one result per lane
103 |                               (ctrl_i.norm_mode == NE16_MODE_16B) ? { 48'b0, 48'b0, product_16b } : // two results, upper lanes zero
104 |                               (ctrl_i.norm_mode == NE16_MODE_32B) ? { 48'b0, 48'b0, 48'b0, product_32b } : '0; // one result in lane 0
105 |     // one shifter/saturator per lane; each output is placed in a hard-wired 32-bit slice of accumulator_o
106 |     for(genvar ii=0; ii<NMULT; ii++) begin : shift_sat_gen
107 | 
108 |       logic [31:0] accumulator_loc;
109 |       ne16_normquant_shifter #(
110 |         .ACC             ( ACC             ),
111 |         .INT             ( INT             ),
112 |         .OUTPUT_REGISTER ( OUTPUT_REGISTER )
113 |       ) i_shifter (
114 |         .clk_i         ( clk_i                    ),
115 |         .rst_ni        ( rst_ni                   ),
116 |         .test_mode_i   ( test_mode_i              ),
117 |         .clear_i       ( clear_i                  ),
118 |         .data_i        ( product_to_shift[ii]     ),
119 |         .shift_i       ( shift_i[(ii+1)*8-1:ii*8] ),
120 |         .accumulator_o ( accumulator_loc          ),
121 |         .ctrl_i        ( ctrl_i                   )
122 |       );
123 |       assign accumulator_o[(ii+1)*32-1:ii*32] = accumulator_loc;
124 | 
125 |     end
126 |   endgenerate
127 | 
128 | endmodule // ne16_normquant
129 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_normquant_bias.sv:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ne16_normquant_bias.sv
 3 |  *
 4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
 5 |  *
 6 |  * Copyright and related rights are licensed under the Solderpad Hardware
 7 |  * License, Version 0.51 (the "License"); you may not use this file except in
 8 |  * compliance with the License.  You may obtain a copy of the License at
 9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
10 |  * or agreed to in writing, software, hardware and materials distributed under
11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 |  * specific language governing permissions and limitations under the License.
14 |  */
15 | 
16 | /*
17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
18 |  *                 Francesco Conti <f.conti@unibo.it>
19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
20 |  */
21 | 
22 | import ne16_package::*;
23 | 
24 | module ne16_normquant_bias #( // adds norm_bias_i to each accumulator lane, then passes the sum through a shifter/saturator
25 |   parameter int unsigned NADD = 8,
26 |   parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE,
27 |   parameter int unsigned QNT = 32,
28 |   parameter int unsigned OUTPUT_REGISTER = 0
29 | ) (
30 |   // global signals
31 |   input  logic                         clk_i,
32 |   input  logic                         rst_ni,
33 |   input  logic                         test_mode_i,
34 |   // local clear
35 |   input  logic                         clear_i,
36 |   // normalization parameters
37 |   input  logic unsigned [NADD*ACC-1:0] norm_bias_i,
38 |   input  logic unsigned [NADD*8-1:0]   shift_i,
39 |   // accumulation
40 |   input  logic signed   [NADD*ACC-1:0] accumulator_i,
41 |   output logic signed   [NADD*ACC-1:0] accumulator_o,
42 |   // control channel
43 |   input  ne16_package::ctrl_normquant_t  ctrl_i
44 | );
45 | 
46 |   generate
47 | 
48 |     logic        [NADD-1:0][ACC-1:0] biased_data;
49 |     // per-lane bias addition; the sum is kept at ACC bits, so any carry out of the top bit is dropped
50 |     for(genvar ii=0; ii<NADD; ii++) begin : biased_data_gen
51 |       assign biased_data[ii] = norm_bias_i[(ii+1)*ACC-1:ii*ACC] + accumulator_i[(ii+1)*ACC-1:ii*ACC];
52 |     end
53 |     // one shifter/saturator per lane; each output is placed in a hard-wired 32-bit slice of accumulator_o
54 |     for(genvar ii=0; ii<NADD; ii++) begin : shift_sat_gen
55 | 
56 |       logic [31:0] accumulator_loc;
57 |       ne16_normquant_shifter #(
58 |         .ACC             ( ACC             ),
59 |         .INT             ( ACC             ), // shifter input width equals the accumulator width here
60 |         .OUTPUT_REGISTER ( OUTPUT_REGISTER )
61 |       ) i_shifter (
62 |         .clk_i         ( clk_i                    ),
63 |         .rst_ni        ( rst_ni                   ),
64 |         .test_mode_i   ( test_mode_i              ),
65 |         .clear_i       ( clear_i                  ),
66 |         .data_i        ( biased_data[ii]          ),
67 |         .shift_i       ( shift_i[(ii+1)*8-1:ii*8] ),
68 |         .accumulator_o ( accumulator_loc          ),
69 |         .ctrl_i        ( ctrl_i                   )
70 |       );
71 |       assign accumulator_o[(ii+1)*32-1:ii*32] = accumulator_loc;
72 | 
73 |     end
74 |   endgenerate
75 | 
76 | endmodule // ne16_normquant_bias
77 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_normquant_multiplier.sv:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ne16_normquant_multiplier.sv
 3 |  *
 4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
 5 |  *
 6 |  * Copyright and related rights are licensed under the Solderpad Hardware
 7 |  * License, Version 0.51 (the "License"); you may not use this file except in
 8 |  * compliance with the License.  You may obtain a copy of the License at
 9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
10 |  * or agreed to in writing, software, hardware and materials distributed under
11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 |  * specific language governing permissions and limitations under the License.
14 |  */
15 | 
16 | /*
17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
18 |  *                 Francesco Conti <f.conti@unibo.it>
19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
20 |  */
21 | 
22 | import ne16_package::*;
23 | 
24 | module ne16_normquant_multiplier #( // multiplies two signed operands; the result is optionally registered (PIPE == 1)
25 |   parameter int unsigned NMS = ne16_package::NORM_MULT_SIZE,
26 |   parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE,
27 |   parameter int unsigned PIPE = 0
28 | ) (
29 |   input  logic                      clk_i,
30 |   input  logic                      rst_ni,
31 |   input  logic                      test_mode_i, // not used inside this module
32 |   input  logic                      clear_i, // synchronous clear of the pipeline register
33 |   input  logic                      enable_i, // load enable of the pipeline register
34 |   input  logic signed [NMS:0]       norm_mult_signed_i,
35 |   input  logic signed [ACC-1:0]     accumulator_i,
36 |   output logic signed [NMS+ACC-1:0] product_o
37 | );
38 |   // product_d: combinational product kept at NMS+ACC bits; product_q: its registered copy
39 |   logic [NMS+ACC-1:0] product_d, product_q;
40 |   assign product_d = norm_mult_signed_i * accumulator_i;
41 | 
42 |   generate
43 |     // PIPE == 1: output comes from the register; any other value: output is purely combinational
44 |     if(PIPE == 1) begin : pipe_gen
45 |       always_ff@(posedge clk_i or negedge rst_ni)
46 |       begin
47 |         if(~rst_ni) begin
48 |           product_q <= '0;
49 |         end
50 |         else if(clear_i) begin
51 |           product_q <= '0;
52 |         end
53 |         else if(enable_i) begin
54 |           product_q <= product_d;
55 |         end
56 |       end
57 |       assign product_o = product_q;
58 |     end
59 |     else begin : no_pipe_gen
60 |       assign product_o = product_d;
61 |     end
62 | 
63 |   endgenerate
64 | 
65 | endmodule // ne16_normquant_multiplier
66 | 


--------------------------------------------------------------------------------
/rtl/accumulator/ne16_normquant_shifter.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_normquant_shifter.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | 
 24 | module ne16_normquant_shifter #( // right-shifts data_i, then truncates or saturates it to the width selected by ctrl_i.quant_mode
 25 |   parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE,
 26 |   parameter int unsigned INT = 33,
 27 |   parameter int unsigned OUTPUT_REGISTER = 0
 28 | ) (
 29 |   input  logic                           clk_i,
 30 |   input  logic                           rst_ni,
 31 |   input  logic                           test_mode_i, // not used inside this module
 32 |   input  logic                           clear_i,
 33 |   input  logic unsigned [INT-1:0]        data_i, // declared unsigned, but cast with $signed() before shifting
 34 |   input  logic unsigned [7:0]            shift_i,
 35 |   output logic signed   [ACC-1:0]        accumulator_o,
 36 |   input  ne16_package::ctrl_normquant_t  ctrl_i
 37 | );
 38 | 
 39 |   logic [INT-1:0] shifted;
 40 |   logic signed [INT-1:0] rounding; // assigned below but never consumed in this module
 41 |   logic [ACC-1:0] accumulator_d;
 42 |   logic [ACC-1:0] accumulator_q;
 43 |   logic [5:0] right_shift;
 44 |   // only the low 6 bits of shift_i are kept
 45 |   assign right_shift = shift_i;
 46 | 
 47 |   assign rounding = 1 <<< (right_shift-1);
 48 |   assign shifted = ~ctrl_i.use_shifting ? $signed(data_i) : // shifting disabled: pass data_i through
 49 |                                           $signed(data_i) >>> right_shift; // shifting enabled: >>> on the signed-cast input
 50 |   // overflow detectors: bits of shifted (resp. ~shifted) above the output range; the in-range low bits are zeroed below
 51 |   logic [INT-2:0] sat_big_or_shifted;
 52 |   logic [INT-2:0] sat_big_nand_shifted;
 53 | 
 54 |   always_comb
 55 |   begin
 56 |     sat_big_or_shifted   =  shifted[INT-2:0];
 57 |     sat_big_nand_shifted = ~shifted[INT-2:0];
 58 |     if(ctrl_i.relu) begin // relu: the full 8 / 16 output bits count as in-range
 59 |       if(ctrl_i.quant_mode == NE16_MODE_8B) begin
 60 |         sat_big_or_shifted [7:0] = '0;
 61 |       end
 62 |       else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
 63 |         sat_big_or_shifted [15:0] = '0;
 64 |       end
 65 |       else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
 66 |         sat_big_or_shifted = '0; // whole detector cleared: the sat+ condition can never be true in this mode
 67 |       end
 68 |     end
 69 |     else begin // no relu: one bit fewer is in-range (7 / 15 / 31 low bits)
 70 |       if(ctrl_i.quant_mode == NE16_MODE_8B) begin
 71 |         sat_big_or_shifted  [6:0] = '0;
 72 |         sat_big_nand_shifted[6:0] = '0;
 73 |       end
 74 |       else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
 75 |         sat_big_or_shifted  [14:0] = '0;
 76 |         sat_big_nand_shifted[14:0] = '0;
 77 |       end
 78 |       else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
 79 |         sat_big_or_shifted  [30:0] = '0;
 80 |         sat_big_nand_shifted[30:0] = '0;
 81 |       end
 82 |     end
 83 |   end
 84 | 
 85 |   always_comb
 86 |   begin
 87 |     // default: low 8 / 16 / ACC bits of shifted, with all remaining output bits zero
 88 |     accumulator_d = '0;
 89 |     if(ctrl_i.quant_mode == NE16_MODE_8B) begin
 90 |       accumulator_d[7:0] = shifted[7:0];
 91 |     end
 92 |     else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
 93 |       accumulator_d[15:0] = shifted[15:0];
 94 |     end
 95 |     else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
 96 |       accumulator_d = shifted[ACC-1:0];
 97 |     end
 98 |     // saturation overrides; evaluated only when shifting is enabled
 99 |     if(ctrl_i.use_shifting) begin
100 |       if(ctrl_i.relu) begin
101 |         if(shifted[INT-1])
102 |           accumulator_d = '0; // neg or sat- with relu active
103 |         else if(~shifted[INT-1] & (|(sat_big_or_shifted))) begin
104 |           accumulator_d = '1; // sat+
105 |         end
106 |       end
107 |       else begin
108 |         if (shifted[INT-1] & (|(sat_big_nand_shifted))) begin // negative and out of range: only the bit at the quant_mode sign position is set
109 |           accumulator_d = '0;
110 |           if(ctrl_i.quant_mode == NE16_MODE_8B) begin
111 |             accumulator_d[7] = 1'b1; // sat-
112 |           end
113 |           else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
114 |             accumulator_d[15] = 1'b1; // sat-
115 |           end
116 |           else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
117 |             accumulator_d[31] = 1'b1; // sat-
118 |           end
119 |         end
120 |         else if(~shifted[INT-1] & (|(sat_big_or_shifted))) begin // positive and out of range: all ones except the bit at the quant_mode sign position
121 |           accumulator_d = '1; // sat+
122 |           if(ctrl_i.quant_mode == NE16_MODE_32B) begin
123 |             accumulator_d[31] = 1'b0; // sat+
124 |           end
125 |           else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
126 |             accumulator_d[15] = 1'b0; // sat+
127 |           end
128 |           else if(ctrl_i.quant_mode == NE16_MODE_8B) begin
129 |             accumulator_d[7] = 1'b0; // sat+
130 |           end
131 |         end
132 |       end
133 |     end
134 | 
135 |   end
136 |   // optional output register: loaded when ctrl_i.start is set, cleared synchronously by clear_i
137 |   if(OUTPUT_REGISTER) begin : output_register_gen
138 | 
139 |     always_ff @(posedge clk_i or negedge rst_ni)
140 |     begin
141 |       if(~rst_ni) begin
142 |         accumulator_q <= '0;
143 |       end
144 |       else if(clear_i) begin
145 |         accumulator_q <= '0;
146 |       end
147 |       else if(ctrl_i.start) begin
148 |         accumulator_q <= accumulator_d;
149 |       end
150 |     end
151 | 
152 |   end
153 |   else begin : no_output_register_gen
154 |     assign accumulator_q = accumulator_d;
155 |   end
156 | 
157 |   assign accumulator_o = accumulator_q;
158 | 
159 | endmodule // ne16_normquant_shifter
160 | 


--------------------------------------------------------------------------------
/rtl/array/ne16_binconv_block.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_binconv_block.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | import ne16_package::*;
 22 | 
 23 | module ne16_binconv_block #(
 24 |   parameter int unsigned BLOCK_SIZE = NE16_BLOCK_SIZE,           // number of SoP's per BinConv block (default 4)
 25 |   parameter int unsigned TP_IN      = NE16_TP_IN,                // number of input elements processed per cycle
 26 |   parameter int unsigned PIPELINE   = 1                          // 1: register the reduction result before scaling; otherwise combinational
 27 | ) (
 28 |   // global signals
 29 |   input  logic                   clk_i,
 30 |   input  logic                   rst_ni,
 31 |   input  logic                   test_mode_i,
 32 |   // local enable & clear
 33 |   input  logic                   enable_i,
 34 |   input  logic                   clear_i,
 35 |   // input activation stream + handshake
 36 |   hwpe_stream_intf_stream.sink   activation_i [BLOCK_SIZE-1:0],
 37 |   // input weight stream + handshake
 38 |   hwpe_stream_intf_stream.sink   weight_i,
 39 |   // output features + handshake
 40 |   hwpe_stream_intf_stream.source block_pres_o,
 41 |   // control channel
 42 |   input  ctrl_binconv_block_t    ctrl_i,
 43 |   output flags_binconv_block_t   flags_o
 44 | );
 45 |   // BinConv block: BLOCK_SIZE activations are each gated by one weight bit, summed, left-shifted by ne16_scale and registered onto block_pres_o.
 46 |   logic clk_gated; // clk_i gated by (enable_i | clear_i); clocks the data registers and i_binconv_scale
 47 |   cluster_clock_gating i_hier_block_gate (
 48 |     .clk_i     ( clk_i              ),
 49 |     .en_i      ( enable_i | clear_i ),
 50 |     .test_en_i ( test_mode_i        ),
 51 |     .clk_o     ( clk_gated          )
 52 |   );
 53 | 
 54 |   ///////////////////////////////////////////
 55 |   // Local Params, Interfaces, and Signals //
 56 |   ///////////////////////////////////////////
 57 | 
 58 |   // internal weight interface
 59 |   hwpe_stream_intf_stream #(
 60 |     .DATA_WIDTH ( 1 )
 61 | `ifndef SYNTHESIS
 62 |     ,
 63 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 64 |     .BYPASS_VDR_ASSERT( 1'b1  )
 65 | `endif
 66 |   ) weight_int [BLOCK_SIZE-1:0] (
 67 |     .clk ( clk_i )
 68 |   );
 69 | 
 70 |   // BinConv result interface
 71 |   hwpe_stream_intf_stream #(
 72 |     .DATA_WIDTH ( NE16_QA_IN )
 73 | `ifndef SYNTHESIS
 74 |     ,
 75 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 76 |     .BYPASS_VDR_ASSERT( 1'b1  )
 77 | `endif
 78 |   ) popcount [BLOCK_SIZE-1:0] (
 79 |     .clk ( clk_i )
 80 |   );
 81 |   // block-level sum before scaling (input of i_binconv_scale)
 82 |   hwpe_stream_intf_stream #(
 83 |     .DATA_WIDTH ( NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT )
 84 | `ifndef SYNTHESIS
 85 |     ,
 86 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 87 |     .BYPASS_VDR_ASSERT( 1'b1  )
 88 | `endif
 89 |   ) pres_nonscaled (
 90 |     .clk ( clk_i )
 91 |   );
 92 |   // block-level sum after scaling (output of i_binconv_scale)
 93 |   hwpe_stream_intf_stream #(
 94 |     .DATA_WIDTH ( NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT+8 )
 95 | `ifndef SYNTHESIS
 96 |     ,
 97 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 98 |     .BYPASS_VDR_ASSERT( 1'b1  )
 99 | `endif
100 |   ) pres (
101 |     .clk ( clk_i )
102 |   );
103 | 
104 |   logic clear_int; // clear_i | ctrl_i.clear
105 |   // reduction signals: odd-index ("hi") and even-index ("lo") partial sums, their combination (d/q) and its valid (d/q)
106 |   logic [NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT-1:0] binconv_block_pres_nonscaled_d, binconv_block_pres_nonscaled_q;
107 |   logic [NE16_QA_IN+$clog2(BLOCK_SIZE)-2:0] binconv_block_pres_nonscaled_hi_d;
108 |   logic [NE16_QA_IN+$clog2(BLOCK_SIZE)-2:0] binconv_block_pres_nonscaled_lo_d;
109 |   logic                                     binconv_block_pres_nonscaled_valid_d, binconv_block_pres_nonscaled_valid_q;
110 |   // registered scaled result and its valid flag (they drive block_pres_o)
111 |   logic [NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE)-1:0] binconv_block_pres_q;
112 |   logic                                                     binconv_block_pres_valid_q;
113 |   // scale control: combinational value and the copy aligned with binconv_block_pres_nonscaled_q
114 |   ctrl_scale_t scale_ctrl;
115 |   ctrl_scale_t scale_ctrl_q;
116 |   // flat copy of popcount[ii].data, used by the reduction loops below
117 |   logic [BLOCK_SIZE-1:0] [NE16_QA_IN-1:0]     popcount_data;
118 | 
119 |   assign clear_int = clear_i | ctrl_i.clear;
120 | 
121 |   ///////////////////////////////
122 |   // BinConv and Scale Modules //
123 |   ///////////////////////////////
124 |   // iterate over all BLOCK_SIZE BinConvs in a single block
125 | 
126 |   generate
127 | 
128 |     for(genvar ii=0; ii<BLOCK_SIZE; ii+=1) begin : sop_gen
129 | 
130 |       localparam ii_div2 = ii/2; // weight bit index used in mode_16 (two adjacent SoPs share one weight bit)
131 |       // weight bit: forced to 1 and always valid when ctrl_i.weight_offset is set
132 |       assign weight_int[ii].data  = ctrl_i.weight_offset ? 1    : ctrl_i.mode_16 ? weight_i.data[ii_div2] : weight_i.data[ii];
133 |       assign weight_int[ii].valid = ctrl_i.weight_offset ? 1'b1 : weight_i.valid;
134 |       assign weight_int[ii].strb  = weight_i.strb;
135 |       // result valid: full input handshake when weight_offset is 0; otherwise depthwise / block_cnt==0 special cases
136 |       assign popcount[ii].valid = (ctrl_i.weight_offset==1'b0)                  ? activation_i[ii].valid & activation_i[ii].ready & weight_int[ii].valid & weight_int[ii].ready :
137 |                                   (ctrl_i.filter_mode==NE16_FILTER_MODE_3X3_DW) ? activation_i[ii].valid & activation_i[ii].ready & ~ctrl_i.invalidate :
138 |                                   (ctrl_i.block_cnt=='0)                        ? activation_i[ii].valid : '0;
139 |       assign popcount[ii].strb  = '1;
140 | 
141 |       // 1x8bit "multipliers" (i.e., simple multiplexers)
142 |       assign popcount[ii].data  = ctrl_i.enable_mac[ii] & weight_int[ii].data ? activation_i[ii].data : '0;
143 | 
144 |       // ========================================================================
145 |       // INPUT STREAMER HANDSHAKING
146 |       // ========================================================================
147 |       // a stream that is valid while the other one is not gets ready=0 (it waits); otherwise ready follows popcount[ii].ready
148 |       always_comb
149 |       begin : ready_propagation
150 |         case({activation_i[ii].valid, weight_int[ii].valid})
151 |           2'b00 : begin
152 |             activation_i[ii].ready = popcount[ii].ready;
153 |             weight_int[ii].ready   = popcount[ii].ready;
154 |           end
155 |           2'b01 : begin
156 |             activation_i[ii].ready = popcount[ii].ready;
157 |             weight_int[ii].ready   = 1'b0;
158 |           end
159 |           2'b10 : begin
160 |             activation_i[ii].ready = 1'b0;
161 |             weight_int[ii].ready   = popcount[ii].ready;
162 |           end
163 |           2'b11 : begin
164 |             activation_i[ii].ready = popcount[ii].ready;
165 |             weight_int[ii].ready   = popcount[ii].ready;
166 |           end
167 |         endcase
168 |       end
169 | 
170 |       assign popcount_data[ii] = popcount[ii].data;
171 |       assign popcount[ii].ready = pres_nonscaled.ready;
172 | 
173 |     end // sop_gen
174 |     // optional pipeline stage between the reduction and the scaler
175 |     if (PIPELINE ==1 ) begin : pipe_stage_gen
176 |       // data and scale control: gated clock, loaded only while enable_i is set
177 |       always_ff @(posedge clk_gated or negedge rst_ni)
178 |       begin
179 |         if(~rst_ni) begin
180 |           binconv_block_pres_nonscaled_q       <= '0;
181 |           scale_ctrl_q <= '0;
182 |         end
183 |         else if(clear_int) begin
184 |           binconv_block_pres_nonscaled_q       <= '0;
185 |           scale_ctrl_q <= '0;
186 |         end
187 |         else if(enable_i) begin
188 |           binconv_block_pres_nonscaled_q       <= binconv_block_pres_nonscaled_d;
189 |           scale_ctrl_q <= scale_ctrl;
190 |         end
191 |       end
192 |       // valid flag: ungated clock, loaded every cycle
193 |       always_ff @(posedge clk_i or negedge rst_ni)
194 |       begin
195 |         if(~rst_ni) begin
196 |           binconv_block_pres_nonscaled_valid_q <= '0;
197 |         end
198 |         else if(clear_int) begin
199 |           binconv_block_pres_nonscaled_valid_q <= '0;
200 |         end
201 |         else begin
202 |           binconv_block_pres_nonscaled_valid_q <= binconv_block_pres_nonscaled_valid_d;
203 |         end
204 |       end
205 | 
206 |     end
207 |     else begin
208 |       // PIPELINE != 1: the reduction result feeds the scaler combinationally
209 |       assign binconv_block_pres_nonscaled_q = binconv_block_pres_nonscaled_d;
210 |       assign binconv_block_pres_nonscaled_valid_q = binconv_block_pres_nonscaled_valid_d;
211 |       assign scale_ctrl_q = scale_ctrl;
212 | 
213 |     end
214 | 
215 |   endgenerate
216 | 
217 | 
218 |   //////////////////////////////////
219 |   // Block-level reduction (mode_16: hi sum weighted by 2^8 plus lo sum; otherwise hi sum plus lo sum)
220 |   //////////////////////////////////
221 |   always_comb // hi: sum of the odd-index SoP outputs
222 |   begin
223 |     binconv_block_pres_nonscaled_hi_d = '0;
224 |     for(int i=1; i<BLOCK_SIZE; i+=2) begin
225 |       binconv_block_pres_nonscaled_hi_d += popcount_data[i];
226 |     end
227 |   end
228 |   always_comb // lo: sum of the even-index SoP outputs
229 |   begin
230 |     binconv_block_pres_nonscaled_lo_d = '0;
231 |     for(int i=0; i<BLOCK_SIZE; i+=2) begin
232 |       binconv_block_pres_nonscaled_lo_d += popcount_data[i];
233 |     end
234 |   end
235 |   assign binconv_block_pres_nonscaled_d = ctrl_i.mode_16 ? {binconv_block_pres_nonscaled_hi_d, 8'b0} + binconv_block_pres_nonscaled_lo_d :
236 |                                                            binconv_block_pres_nonscaled_hi_d         + binconv_block_pres_nonscaled_lo_d;
237 |   // the valid of the reduction follows SoP 0 only
238 |   assign binconv_block_pres_nonscaled_valid_d = popcount[0].valid;
239 | 
240 |   assign pres_nonscaled.strb  = '1;
241 |   assign pres_nonscaled.data  = binconv_block_pres_nonscaled_q;
242 |   assign pres_nonscaled.valid = binconv_block_pres_nonscaled_valid_q;
243 | 
244 |   //////////////////////////////////
245 |   // Scaling factor
246 |   //////////////////////////////////
247 |   ne16_scale #(
248 |     .INP_ACC     ( NE16_QA_IN + $clog2(BLOCK_SIZE) + NE16_QA_16BIT     ),
249 |     .OUT_ACC     ( NE16_QA_IN + $clog2(BLOCK_SIZE) + NE16_QA_16BIT + 8 ),
250 |     .N_SHIFTS    ( 8                                                   )
251 |   ) i_binconv_scale (
252 |     .clk_i       ( clk_gated      ),
253 |     .rst_ni      ( rst_ni         ),
254 |     .test_mode_i ( test_mode_i    ),
255 |     .data_i      ( pres_nonscaled ),
256 |     .data_o      ( pres           ),
257 |     .ctrl_i      ( scale_ctrl_q   ),
258 |     .flags_o     (                )
259 |   );
260 | 
261 |   assign flags_o = '0; // FIXME
262 | 
263 |   ////////////////////////
264 |   // Output Assignments //
265 |   ////////////////////////
266 |   assign weight_i.ready = weight_int[0].ready; // the ready of SoP 0 stands for all SoPs
267 | 
268 |   assign block_pres_o.valid = binconv_block_pres_valid_q;
269 |   assign block_pres_o.data  = binconv_block_pres_q;
270 |   assign block_pres_o.strb  = '1; // constant all-ones like pres_nonscaled.strb; ne16_binconv_column samples block_pres[0].strb
271 |   assign pres.ready = block_pres_o.ready;
272 | 
273 |   ///////////////
274 |   // Registers //
275 |   ///////////////
276 |   // registers for block results (gated clock, loaded on a pres handshake)
277 |   always_ff @(posedge clk_gated or negedge rst_ni)
278 |     begin
279 |       if(~rst_ni)
280 |         binconv_block_pres_q <= '0;
281 |       else if(clear_int)
282 |         binconv_block_pres_q <= '0;
283 |       else if(pres.valid & pres.ready)
284 |         binconv_block_pres_q <= pres.data;
285 |     end
286 | 
287 |   // registers for output valid signal (ungated clock, loaded whenever pres.ready is set)
288 |   always_ff @(posedge clk_i or negedge rst_ni)
289 |     begin
290 |       if(~rst_ni)
291 |         binconv_block_pres_valid_q <= '0;
292 |       else if(clear_int)
293 |         binconv_block_pres_valid_q <= '0;
294 |       else if(pres.ready)
295 |         binconv_block_pres_valid_q <= pres.valid;
296 |     end
297 |   // shift selection: 0 for depthwise weight-offset; ctrl_i.scale_shift for non-linear 1x1; ctrl_i.block_cnt otherwise
298 |   generate
299 |     assign scale_ctrl.shift_sel = (ctrl_i.filter_mode == NE16_FILTER_MODE_3X3_DW & ctrl_i.weight_offset) ? '0 :
300 |                                   ~ctrl_i.mode_linear & (ctrl_i.filter_mode == NE16_FILTER_MODE_1X1)     ? ctrl_i.scale_shift :
301 |                                                                                                            ctrl_i.block_cnt;
302 |     assign scale_ctrl.invert = 1'b0;
303 |   endgenerate
304 | 
305 | endmodule // ne16_binconv_block
306 | 


--------------------------------------------------------------------------------
/rtl/array/ne16_binconv_column.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_binconv_column.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | 
 24 | module ne16_binconv_column #(
 25 |   parameter int unsigned COLUMN_SIZE      = NE16_COLUMN_SIZE,         // number of BinConv blocks per column (default 9)
 26 |   parameter int unsigned BLOCK_SIZE       = NE16_BLOCK_SIZE,          // number of SoP's per BinConv block (default 4)
 27 |   parameter int unsigned BC_COLBLOCK_SIZE = COLUMN_SIZE*BLOCK_SIZE,   // number of activation streams entering the column
 28 |   parameter int unsigned TP_IN            = NE16_TP_IN                   // number of input elements processed per cycle
 29 | ) (
 30 |   // global signals
 31 |   input  logic                   clk_i,
 32 |   input  logic                   rst_ni,
 33 |   input  logic                   test_mode_i,
 34 |   // local enable & clear
 35 |   input  logic                   enable_i,
 36 |   input  logic                   clear_i,
 37 |   // input activation stream + handshake
 38 |   hwpe_stream_intf_stream.sink   activation_i  [BC_COLBLOCK_SIZE-1:0],
 39 |   // input weight stream + handshake
 40 |   hwpe_stream_intf_stream.sink   weight_i      [COLUMN_SIZE-1:0],
 41 |   // output features + handshake
 42 |   hwpe_stream_intf_stream.source column_pres_o,
 43 |   // control channel
 44 |   input  ctrl_binconv_column_t   ctrl_i,
 45 |   output flags_binconv_column_t  flags_o
 46 | );
 47 |   // Column of COLUMN_SIZE ne16_binconv_block instances: their results are summed and registered to drive column_pres_o.
 48 |   ///////////////////////////////////////////
 49 |   // Local Params, Interfaces, and Signals //
 50 |   ///////////////////////////////////////////
 51 | 
 52 |   localparam BLOCK_PRES_SIZE  = NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE);
 53 |   localparam COLUMN_PRES_SIZE = BLOCK_PRES_SIZE+$clog2(COLUMN_SIZE);
 54 |   // one result stream per block
 55 |   hwpe_stream_intf_stream #(
 56 |     .DATA_WIDTH ( BLOCK_PRES_SIZE )
 57 |   `ifndef SYNTHESIS
 58 |     ,
 59 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 60 |     .BYPASS_VDR_ASSERT( 1'b1  )
 61 |   `endif
 62 |   ) block_pres [COLUMN_SIZE-1:0] (
 63 |     .clk ( clk_i )
 64 |   );
 65 |   // column sum with matching valid and strobe (d: combinational, q: registered)
 66 |   logic signed [COLUMN_PRES_SIZE-1:0]   binconv_column_pres_d, binconv_column_pres_q;
 67 |   logic                                 binconv_column_pres_valid_d, binconv_column_pres_valid_q;
 68 |   logic        [COLUMN_PRES_SIZE/8-1:0] binconv_column_pres_strb_d, binconv_column_pres_strb_q;
 69 |   // block results, zeroed for blocks whose enable_block bit is clear
 70 |   logic signed [COLUMN_SIZE-1:0][BLOCK_PRES_SIZE-1:0] block_pres_data;
 71 | 
 72 |   ///////////////////
 73 |   // Block Modules //
 74 |   ///////////////////
 75 |   generate
 76 |     for(genvar ii=0; ii<COLUMN_SIZE; ii++) begin : block_gen
 77 |       // per-block control: the shared ctrl_i.ctrl_block with scale_shift overridden by the block index
 78 |       ctrl_binconv_block_t ctrl_block;
 79 | 
 80 |       always_comb
 81 |       begin
 82 |         ctrl_block = ctrl_i.ctrl_block;
 83 |         ctrl_block.scale_shift = ii; // used for 1x1
 84 |       end
 85 |       // block ii consumes activation streams [ii*BLOCK_SIZE +: BLOCK_SIZE] and weight stream ii
 86 |       ne16_binconv_block #(
 87 |         .BLOCK_SIZE ( BLOCK_SIZE ),
 88 |         .TP_IN      ( TP_IN      )
 89 |       ) i_block (
 90 |         .clk_i        ( clk_i                                            ),
 91 |         .rst_ni       ( rst_ni                                           ),
 92 |         .test_mode_i  ( test_mode_i                                      ),
 93 |         .enable_i     ( ctrl_i.enable_block[ii]                          ),
 94 |         .clear_i      ( clear_i                                          ),
 95 |         .activation_i ( activation_i [(ii+1)*BLOCK_SIZE-1:ii*BLOCK_SIZE] ),
 96 |         .weight_i     ( weight_i [ii]                                    ),
 97 |         .block_pres_o ( block_pres [ii]                                  ),
 98 |         .ctrl_i       ( ctrl_block                                       ),
 99 |         .flags_o      ( flags_o.flags_block[ii]                          )
100 |       );
101 |       // a disabled block contributes 0 to the column sum
102 |       assign block_pres_data[ii] = ctrl_i.enable_block[ii] ? block_pres[ii].data : '0;
103 | 
104 |     end // block_gen
105 |   endgenerate
106 | 
107 | 
108 |   ///////////////////////////////////
109 |   // Computation of Column Results //
110 |   ///////////////////////////////////
111 |   // each block result is reinterpreted as signed at BLOCK_PRES_SIZE bits, then accumulated into the column sum
112 |   always_comb
113 |   begin
114 |     binconv_column_pres_d = '0;
115 |     for(int i=0; i<COLUMN_SIZE; i++) begin
116 |       binconv_column_pres_d += BLOCK_PRES_SIZE'(signed'(block_pres_data[i]));
117 |     end
118 |   end
119 |   assign binconv_column_pres_valid_d = block_pres[0].valid; // valid and strobe are taken from block 0 only
120 |   assign binconv_column_pres_strb_d  = block_pres[0].strb;
121 | 
122 | 
123 |   ////////////////////////
124 |   // Output Assignments //
125 |   ////////////////////////
126 |   // while enable_i is low the column outputs ctrl_i.padding_value instead of the registered sum
127 |   assign column_pres_o.valid = binconv_column_pres_valid_q;
128 |   assign column_pres_o.strb  = binconv_column_pres_strb_q;
129 |   assign column_pres_o.data  = enable_i ? binconv_column_pres_q : ctrl_i.padding_value[COLUMN_PRES_SIZE-1:0];
130 |   // the ready of the output stream is broadcast to every block
131 |   generate
132 |     for(genvar ii=0; ii<COLUMN_SIZE; ii++) begin : ready_prop_gen
133 |       assign block_pres[ii].ready = column_pres_o.ready;
134 |     end // ready_prop_gen
135 |   endgenerate
136 | 
137 | 
138 |   ///////////////
139 |   // Registers //
140 |   ///////////////
141 | 
142 |   // registers for column results (loaded on a block-0 handshake while enable_i is set)
143 |   always_ff @(posedge clk_i or negedge rst_ni)
144 |   begin
145 |     if(~rst_ni)
146 |       binconv_column_pres_q <= '0;
147 |     else if(clear_i)
148 |       binconv_column_pres_q <= '0;
149 |     else if(enable_i & block_pres[0].valid & block_pres[0].ready)
150 |       binconv_column_pres_q <= binconv_column_pres_d;
151 |   end
152 | 
153 |   // registers for output valid and strobe signals (loaded whenever block 0 sees ready)
154 |   always_ff @(posedge clk_i or negedge rst_ni)
155 |   begin
156 |     if(~rst_ni) begin
157 |       binconv_column_pres_valid_q <= '0;
158 |       binconv_column_pres_strb_q  <= '0;
159 |     end
160 |     else if(clear_i) begin
161 |       binconv_column_pres_valid_q <= '0;
162 |       binconv_column_pres_strb_q  <= '0;
163 |     end
164 |     else if(block_pres[0].ready) begin
165 |       binconv_column_pres_valid_q <= binconv_column_pres_valid_d;
166 |       binconv_column_pres_strb_q  <= binconv_column_pres_strb_d;
167 |     end
168 | 
169 |   end
170 | 
171 | endmodule // ne16_binconv_column
172 | 


--------------------------------------------------------------------------------
/rtl/array/ne16_scale.sv:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ne16_scale.sv
 3 |  *
 4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
 5 |  *
 6 |  * Copyright and related rights are licensed under the Solderpad Hardware
 7 |  * License, Version 0.51 (the "License"); you may not use this file except in
 8 |  * compliance with the License.  You may obtain a copy of the License at
 9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
10 |  * or agreed to in writing, software, hardware and materials distributed under
11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 |  * specific language governing permissions and limitations under the License.
14 |  */
15 | 
16 | /*
17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
18 |  *                 Francesco Conti <f.conti@unibo.it>
19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
20 |  */
21 | 
22 | import ne16_package::*;
23 | 
24 | module ne16_scale #(
25 |   parameter int unsigned INP_ACC  =  8, // input bitwidth
26 |   parameter int unsigned OUT_ACC  = 16, // output bitwidth
27 |   parameter int unsigned N_SHIFTS =  8  // number of multiplexed shifts
28 | ) (
29 |   // global signals
30 |   input logic                    clk_i,
31 |   input logic                    rst_ni,
32 |   input logic                    test_mode_i,
33 |   // local enable & clear
34 |   // input  logic                enable_i,
35 |   // input  logic                clear_i,
36 |   // input data
37 |   hwpe_stream_intf_stream.sink   data_i,
38 |   // output data
39 |   hwpe_stream_intf_stream.source data_o,
40 |   // control channel
41 |   input ctrl_scale_t             ctrl_i,
42 |   output flags_scale_t           flags_o
43 | );
44 | 
45 |   // Combinational scaler: the INP_ACC-bit input word is zero-extended to
46 |   // OUT_ACC bits, left-shifted by ctrl_i.shift_sel (one of N_SHIFTS amounts)
47 |   // and negated when ctrl_i.invert is set. clk_i, rst_ni and test_mode_i are
48 |   // not referenced by any logic in this module.
49 | 
50 |   logic        [INP_ACC-1:0] data_in;                  // input word at INP_ACC bits
51 |   logic        [OUT_ACC-1:0] data_ext;                 // input word zero-extended to OUT_ACC bits
52 |   logic        [OUT_ACC-1:0] shift_lut [N_SHIFTS-1:0]; // data_ext << k for k = 0 .. N_SHIFTS-1
53 |   logic        [OUT_ACC-1:0] data_shifted;             // entry of shift_lut picked by shift_sel
54 |   logic signed [OUT_ACC-1:0] data_negated;             // arithmetic negation of data_shifted
55 | 
56 |   // the stream handshake and the strobe pass straight through
57 |   assign data_o.valid = data_i.valid;
58 |   assign data_o.strb  = data_i.strb;
59 |   assign data_i.ready = data_o.ready;
60 | 
61 |   // local copy of the input word
62 |   assign data_in = data_i.data;
63 | 
64 |   // zero-extension: clear everything, then overlay the input on the low bits
65 |   always_comb
66 |   begin
67 |     data_ext = '0;
68 |     data_ext[INP_ACC-1:0] = data_in;
69 |   end
70 | 
71 |   // one statically shifted copy of the extended word per shift amount
72 |   generate
73 |     for(genvar kk=0; kk<N_SHIFTS; kk++) begin : shift_gen
74 |       assign shift_lut[kk] = data_ext << kk;
75 |     end
76 |   endgenerate
77 | 
78 |   // shift selection followed by the optional negation
79 |   assign data_shifted = shift_lut[ctrl_i.shift_sel];
80 |   assign data_negated = -data_shifted;
81 |   assign data_o.data  = ctrl_i.invert ? data_negated : data_shifted;
82 | 
83 |   // the flags mirror the shift selection currently applied
84 |   assign flags_o.shift_sel = ctrl_i.shift_sel;
85 | 
86 | endmodule // ne16_scale
87 | 


--------------------------------------------------------------------------------
/rtl/ctrl/ne16_ctrl_fsm.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_ctrl_fsm.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | import hwpe_ctrl_package::*;
 24 | import hci_package::*;
 25 | 
 26 | module ne16_ctrl_fsm (
 27 |   // global signals
 28 |   input  logic             clk_i,
 29 |   input  logic             rst_ni,
 30 |   input  logic             test_mode_i,
 31 |   input  logic             clear_i,
 32 |   input  logic             start_i,
 33 |   // ctrl & flags
 34 |   input  flags_engine_t    flags_engine_i,
 35 |   input  flags_streamer_t  flags_streamer_i,
 36 |   input  config_ne16_t     config_i,
 37 |   output state_ne16_t      state_o,
 38 |   output logic             state_change_o,
 39 |   input  logic             uloop_ready_i,
 40 |   output index_ne16_t      index_o,
 41 |   output base_addr_ne16_t  base_addr_o
 42 | );
 43 | 
 44 |   /* signal declarations */
 45 |   state_ne16_t state_d, state_q;
 46 |   logic state_change_d, state_change_q;
 47 | 
 48 |   ctrl_uloop_t       ctrl_uloop;
 49 |   flags_uloop_t      flags_uloop;
 50 |   uloop_code_t       code_uloop;
 51 |   logic [17:0][31:0] ro_reg;
 52 | 
 53 |   index_ne16_t     index_d, index_q;
 54 |   index_update_ne16_t index_update_d, index_update_q;
 55 |   base_addr_ne16_t base_addr_d, base_addr_q;
 56 |   logic streamin_en;
 57 | 
 58 |   /* finite state machine */
 59 |   always_ff @(posedge clk_i or negedge rst_ni)
 60 |   begin : fsm_sequential
 61 |     if(~rst_ni) begin
 62 |       state_q <= IDLE;
 63 |       state_change_q <= '0;
 64 |     end
 65 |     else if(clear_i) begin
 66 |       state_q <= IDLE;
 67 |       state_change_q <= '0;
 68 |     end
 69 |     else begin
 70 |       state_q <= state_d;
 71 |       state_change_q <= state_change_d;
 72 |     end
 73 |   end
 74 | 
 75 |   always_comb
 76 |   begin: fsm_next_state
 77 |     state_d = state_q;
 78 |     state_change_d = 1'b0;
 79 | 
 80 |     case(state_q)
 81 | 
 82 |       IDLE: begin
 83 |         if(start_i) begin
 84 |           state_d = LOAD;
 85 |           state_change_d = 1'b1;
 86 |         end
 87 |       end
 88 | 
 89 |       LOAD: begin
 90 |         if(flags_engine_i.flags_input_buffer.state == IB_EXTRACT) begin
 91 |           state_d = WEIGHTOFFS;
 92 |           state_change_d = 1'b1;
 93 |         end
 94 |       end
 95 | 
 96 |       WEIGHTOFFS: begin
 97 |         if(flags_engine_i.flags_accumulator[8].state == AQ_ACCUM_DONE) begin
 98 |           if(streamin_en) begin
 99 |             state_d = STREAMIN;
100 |             state_change_d = 1'b1;
101 |           end
102 |           else begin
103 |             state_d = MATRIXVEC;
104 |             state_change_d = 1'b1;
105 |           end
106 |         end
107 |       end
108 | 
109 |       STREAMIN: begin
110 |         if(flags_engine_i.flags_accumulator[8].state == AQ_STREAMIN_DONE) begin
111 |           state_d = MATRIXVEC;
112 |           state_change_d = 1'b1;
113 |         end
114 |       end
115 | 
116 |       MATRIXVEC: begin
117 |         if(flags_engine_i.flags_accumulator[8].state == AQ_ACCUM_DONE) begin
118 |           if(~uloop_ready_i) begin
119 |             state_d = UPDATEIDX_WAIT;
120 |             state_change_d = 1'b1;
121 |           end
122 |           else begin
123 |             state_d = UPDATEIDX;
124 |             state_change_d = 1'b1;
125 |           end
126 |         end
127 |       end
128 | 
129 |       NORMQUANT_SHIFT: begin
130 |         if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT) begin
131 |           state_d = NORMQUANT;
132 |           state_change_d = 1'b1;
133 |         end
134 |       end
135 | 
136 |       NORMQUANT: begin
137 |         if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_BIAS) begin
138 |           state_d = NORMQUANT_BIAS;
139 |           state_change_d = 1'b1;
140 |         end
141 |         else if(~config_i.norm_option_bias & flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_DONE) begin
142 |           state_d = STREAMOUT;
143 |           state_change_d = 1'b1;
144 |         end
145 |       end
146 | 
147 |       NORMQUANT_BIAS: begin
148 |         if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_DONE) begin
149 |           state_d = STREAMOUT;
150 |           state_change_d = 1'b1;
151 |         end
152 |       end
153 | 
154 |       STREAMOUT: begin
155 |         if(flags_engine_i.flags_accumulator[8].state == AQ_STREAMOUT_DONE) begin
156 |           if(flags_uloop.done) begin
157 |             state_d = DONE;
158 |             state_change_d = 1'b1;
159 |           end
160 |           else begin
161 |             state_d = STREAMOUT_DONE;
162 |             state_change_d = 1'b1;
163 |           end
164 |         end
165 |       end
166 | 
167 |       STREAMOUT_DONE: begin
168 |         if(flags_streamer_i.tcdm_fifo_empty) begin
169 |           state_d = LOAD;
170 |           state_change_d = 1'b1;
171 |         end
172 |       end
173 | 
174 |       UPDATEIDX_WAIT: begin
175 |         if(uloop_ready_i) begin
176 |           state_d = UPDATEIDX;
177 |           state_change_d = 1'b1;
178 |         end
179 |       end
180 | 
181 |       UPDATEIDX: begin
182 |         if(flags_uloop.valid) begin
183 |           if((config_i.filter_mode != NE16_FILTER_MODE_3X3_DW) && (flags_uloop.idx_update == 4'b0001) && (~flags_uloop.done)) begin
184 |             state_d = LOAD;
185 |             state_change_d = 1'b1;
186 |           end
187 |           else if(~config_i.streamout_quant) begin
188 |             state_d = STREAMOUT;
189 |             state_change_d = 1'b1;
190 |           end
191 |           else if(config_i.norm_option_shift) begin
192 |             state_d = NORMQUANT_SHIFT;
193 |             state_change_d = 1'b1;
194 |           end
195 |           else begin
196 |             state_d = NORMQUANT;
197 |             state_change_d = 1'b1;
198 |           end
199 |         end
200 |       end
201 | 
202 |       DONE: begin
203 |         state_d = IDLE;
204 |         state_change_d = 1'b1;
205 |       end
206 | 
207 |     endcase
208 |   end
209 | 
210 |   /* uloop instantiation */
211 |   always_comb
212 |   begin
213 |     code_uloop = '0;
214 |     code_uloop.code     = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? ULOOP_CODE_DEPTHWISE   : ULOOP_CODE_NORMAL;
215 |     code_uloop.loops    = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? ULOOP_LOOPS_DEPTHWISE  : ULOOP_LOOPS_NORMAL;
216 |     code_uloop.range[0] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_wo : config_i.subtile_nb_ki;
217 |     code_uloop.range[1] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_ho : config_i.subtile_nb_wo;
218 |     code_uloop.range[2] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_ko : config_i.subtile_nb_ho;
219 |     code_uloop.range[3] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? 1                      : config_i.subtile_nb_ko;
220 |   end
221 | 
222 |   assign ctrl_uloop.enable = (state_q == UPDATEIDX) & ~flags_uloop.valid;
223 |   assign ctrl_uloop.clear  = (state_q == IDLE);
224 |   assign ctrl_uloop.ready  = config_i.filter_mode == NE16_FILTER_MODE_1X1 ? 1'b1 : uloop_ready_i;
225 | 
226 |   hwpe_ctrl_uloop #(
227 |     .LENGTH    ( 32 ),
228 |     .NB_LOOPS  ( 4  ),
229 |     .NB_RO_REG ( 18 ),
230 |     .NB_REG    ( 4  ),
231 |     .REG_WIDTH ( 32 ),
232 |     .CNT_WIDTH ( 16 ),
233 |     .SHADOWED  ( 1  )
234 | `ifndef SYNTHESIS
235 |     ,
236 |     .DEBUG_DISPLAY ( 0 )
237 | `endif
238 |   ) i_uloop (
239 |     .clk_i            ( clk_i                      ),
240 |     .rst_ni           ( rst_ni                     ),
241 |     .test_mode_i      ( test_mode_i                ),
242 |     .clear_i          ( clear_i | ctrl_uloop.clear ),
243 |     .ctrl_i           ( ctrl_uloop                 ),
244 |     .flags_o          ( flags_uloop                ),
245 |     .uloop_code_i     ( code_uloop                 ),
246 |     .registers_read_i ( ro_reg                     )
247 |   );
248 | 
249 |   assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KOM_ITER]       = config_i.uloop_iter.weights_kom_iter;
250 |   assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KIM_ITER]       = config_i.uloop_iter.weights_kim_iter;
251 |   assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KOM_RESET_ITER] = config_i.uloop_iter.weights_kom_reset_iter;
252 |   assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KIM_RESET_ITER] = config_i.uloop_iter.weights_kim_reset_iter;
253 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_KIM_ITER]        = config_i.uloop_iter.infeat_kim_iter;
254 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_WOM_ITER]        = config_i.uloop_iter.infeat_wom_iter;
255 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_HOM_ITER]        = config_i.uloop_iter.infeat_hom_iter;
256 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_KIM_RESET_ITER]  = config_i.uloop_iter.infeat_kim_reset_iter;
257 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_WOM_RESET_ITER]  = config_i.uloop_iter.infeat_wom_reset_iter;
258 |   assign ro_reg[NE16_ULOOP_RO_INFEAT_HOM_RESET_ITER]  = config_i.uloop_iter.infeat_hom_reset_iter;
259 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_WOM_ITER]       = config_i.uloop_iter.outfeat_wom_iter;
260 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_HOM_ITER]       = config_i.uloop_iter.outfeat_hom_iter;
261 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_KOM_ITER]       = config_i.uloop_iter.outfeat_kom_iter;
262 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_WOM_RESET_ITER] = config_i.uloop_iter.outfeat_wom_reset_iter;
263 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_HOM_RESET_ITER] = config_i.uloop_iter.outfeat_hom_reset_iter;
264 |   assign ro_reg[NE16_ULOOP_RO_OUTFEAT_KOM_RESET_ITER] = config_i.uloop_iter.outfeat_kom_reset_iter;
265 |   assign ro_reg[NE16_ULOOP_RO_SCALE_KOM_ITER]         = config_i.uloop_iter.scale_kom_iter;
266 |   assign ro_reg[NE16_ULOOP_RO_ZERO]                   = '0;
267 | 
268 |   /* index registers */
269 |   logic index_sample_en;
270 |   assign index_sample_en = ((state_d == WEIGHTOFFS & config_i.filter_mode==NE16_FILTER_MODE_3X3_DW) || state_d == LOAD || state_d == STREAMOUT_DONE) & state_change_d;
271 |   always_ff @(posedge clk_i or negedge rst_ni)
272 |   begin
273 |     if(~rst_ni) begin
274 |       index_q        <= '0;
275 |       index_update_q <= '0;
276 |       base_addr_q    <= '0;
277 |     end
278 |     else if(clear_i) begin
279 |       index_q        <= '0;
280 |       index_update_q <= '0;
281 |       base_addr_q    <= '0;
282 |     end
283 |     else if(index_sample_en) begin // commit indeces when loading
284 |       index_q        <= index_d;
285 |       index_update_q <= index_update_d;
286 |       base_addr_q    <= base_addr_d;
287 |     end
288 |   end
289 | 
290 |   /* FSM output binding */
291 |   assign state_o        = state_d;
292 |   assign state_change_o = state_change_d;
293 | 
294 |   assign index_d.k_out_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[2] : flags_uloop.idx[3];
295 |   assign index_d.i_major     = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[1] : flags_uloop.idx[2];
296 |   assign index_d.j_major     = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[0] : flags_uloop.idx[1];
297 |   assign index_d.k_in_major  = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[2] : flags_uloop.idx[0];
298 | 
299 |   assign index_update_d.k_out_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[2] : flags_uloop.idx_update[3];
300 |   assign index_update_d.i_major     = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[1] : flags_uloop.idx_update[2];
301 |   assign index_update_d.j_major     = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[0] : flags_uloop.idx_update[1];
302 |   assign index_update_d.k_in_major  = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[2] : flags_uloop.idx_update[0];
303 | 
304 |   assign base_addr_d.weights = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_W];
305 |   assign base_addr_d.infeat  = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_X];
306 |   assign base_addr_d.outfeat = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_Y];
307 |   assign base_addr_d.scale   = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_S];
308 | 
309 |   assign index_o     = index_sample_en ? index_d     : index_q;
310 |   assign base_addr_o = index_sample_en ? base_addr_d : base_addr_q;
311 | 
312 |   assign streamin_en = config_i.streamin & ((index_update_d.k_out_major | index_update_d.i_major | index_update_d.j_major) | (index_q.k_out_major=='0 & index_q.k_in_major=='0 & index_q.i_major=='0 & index_q.j_major=='0));
313 | 
314 | endmodule // ne16_ctrl_fsm
315 | 


--------------------------------------------------------------------------------
/rtl/input_buffer/ne16_input_buffer.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_input_buffer.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | 
 24 | module ne16_input_buffer #(
 25 |   parameter int unsigned INPUT_BUF_SIZE = 400,             // number of feat_o output streams (one element each)
 26 |   parameter int unsigned BLOCK_SIZE     = NE16_BLOCK_SIZE, // number of feat_i input streams = elements per buffer word
 27 |   parameter int unsigned DW             = NE16_QA_IN       // element width; NOTE(review): data slices below hard-code 8 bits
 28 | ) (
 29 |   // global signals
 30 |   input  logic                   clk_i,
 31 |   input  logic                   rst_ni,
 32 |   input  logic                   test_mode_i,
 33 | 
 34 |   // local enable and clear
 35 |   input  logic                   enable_i,
 36 |   input  logic                   clear_i,
 37 | 
 38 |   // control channel
 39 |   input  ctrl_input_buffer_t     ctrl_i,
 40 |   output flags_input_buffer_t    flags_o,
 41 | 
 42 |   // input / output streams
 43 |   hwpe_stream_intf_stream.sink   feat_i [BLOCK_SIZE-1:0],
 44 |   hwpe_stream_intf_stream.source feat_o [INPUT_BUF_SIZE-1:0]
 45 | );
 46 |   // Buffer of NW words of DS bits: filled one word per write (address = vlen_cnt), exposed fully in parallel on feat_o.
 47 |   localparam NW = INPUT_BUF_SIZE/BLOCK_SIZE; // number of buffer words
 48 |   localparam AW = $clog2(NW);                // word address width
 49 |   localparam DS = DW*BLOCK_SIZE;             // bits per buffer word
 50 | 
 51 |   // Standard-cell memory based feature register
 52 |   logic                  scm_re;
 53 |   logic [AW-1:0]         scm_raddr;
 54 |   logic                  scm_we;
 55 |   logic                  scm_we_all;
 56 |   logic [AW-1:0]         scm_waddr;
 57 |   logic [DS-1:0]         scm_wdata;
 58 |   logic [NW-1:0][DS-1:0] scm_input_buffer;
 59 | 
 60 |   // Finite-state machine + counters
 61 |   state_input_buffer_t fsm_cs, fsm_ns;
 62 |   logic                vlen_cnt_clr, vlen_cnt_gl_en, vlen_cnt_en;
 63 |   logic [AW-1:0] vlen_cnt;
 64 |   logic [AW-1:0] vlen_cnt_next;
 65 | 
 66 |   ne16_input_buffer_scm_test_wrap #(
 67 |     .ADDR_WIDTH ( AW ),
 68 |     .DATA_WIDTH ( DS ),
 69 |     .NUM_WORDS  ( NW )
 70 |   ) i_input_buffer_scm (
 71 |     .clk_i          ( clk_i            ),
 72 |     .rst_ni         ( rst_ni           ),
 73 |     .clear_i        ( clear_i          ),
 74 |     .test_mode_i    ( test_mode_i      ),
 75 |     .re_i           ( scm_re           ),
 76 |     .raddr_i        ( scm_raddr        ),
 77 |     .rdata_o        (                  ), // registered read port is unused here (scm_re is tied to 0)
 78 |     .we_i           ( scm_we           ),
 79 |     .we_all_i       ( scm_we_all       ),
 80 |     .waddr_i        ( scm_waddr        ),
 81 |     .wdata_i        ( scm_wdata        ),
 82 |     .input_buffer_o ( scm_input_buffer ),
 83 |     .BIST           ( 1'b0             ), // test inputs tied off instead of floating: BIST=0 selects the functional ports
 84 |     .CSN_T          ( 1'b1             ), // inactive (the wrapper treats CSN_T==0 as selected)
 85 |     .WEN_T          ( 1'b1             ),
 86 |     .A_T            ( '0               ),
 87 |     .D_T            ( '0               ),
 88 |     .Q_T            (                  )
 89 |   );
 90 | 
 91 |   // this mask is used to load only 9 pixels instead of 25 in 1x1 mode (see ne16_ctrl for other masks)
 92 |   logic [24:0] mask_1x1;
 93 |   logic [4:0]  mask_1x1_s;
 94 |   assign mask_1x1_s = (1 << 3) - 1; // 5'b00111
 95 |   always_comb
 96 |   begin
 97 |     mask_1x1 = '1;
 98 |     mask_1x1 &= {5{mask_1x1_s}}; // keep the 3 low positions of each group of 5
 99 |     mask_1x1 &= {{5{mask_1x1_s[4]}}, {5{mask_1x1_s[3]}}, {5{mask_1x1_s[2]}}, {5{mask_1x1_s[1]}}, {5{mask_1x1_s[0]}}}; // keep groups 0..2 only
100 |   end
101 |   // NOTE: in 1x1 mode, words masked out by mask_1x1 are written while ready stays low, i.e. without popping feat_i.
102 |   // implicit padding --> comes from incomplete subtiles in the spatial dimensions --> always padded with 0
103 |   // explicit padding --> requested through the padding register --> padded with config.padding_value
104 |   // priority: implicit padding --> explicit padding --> normal feature
105 |   assign scm_we     = feat_i[0].valid & (feat_i[0].ready | (ctrl_i.filter_mode == NE16_FILTER_MODE_1X1 ? ~mask_1x1[vlen_cnt] : 1'b0));
106 |   assign scm_we_all = '0;
107 |   assign scm_waddr  = vlen_cnt;
108 |   generate
109 |     for(genvar ii=0; ii<BLOCK_SIZE/2; ii++) begin : scm_wdata_gen
110 |       assign scm_wdata[(2*ii+1)*8-1:(2*ii)  *8] = ctrl_i.enable_implicit_padding[vlen_cnt] ? '0 : ctrl_i.enable_explicit_padding[vlen_cnt] ? ctrl_i.explicit_padding_value_lo: feat_i[2*ii].data;
111 |       assign scm_wdata[(2*ii+2)*8-1:(2*ii+1)*8] = ctrl_i.enable_implicit_padding[vlen_cnt] ? '0 : ctrl_i.enable_explicit_padding[vlen_cnt] ? ctrl_i.explicit_padding_value_hi : feat_i[2*ii+1].data;
112 |     end
113 |   endgenerate
114 |   assign scm_re    = '0;
115 |   assign scm_raddr = '0;
116 | 
117 |   generate
118 |     for(genvar ii=0; ii<INPUT_BUF_SIZE/BLOCK_SIZE; ii++) begin : input_buf_output_gen_outer
119 |       for(genvar jj=0; jj<BLOCK_SIZE; jj++) begin : input_buf_output_gen_inner
120 |         localparam int unsigned ii_jj = ii*BLOCK_SIZE+jj; // flat feat_o index of element jj in word ii
121 |         assign feat_o[ii_jj].data = scm_input_buffer[ii][(jj+1)*8-1:jj*8];
122 |         assign feat_o[ii_jj].strb = '1;
123 |       end
124 |     end
125 |   endgenerate
126 | 
127 |   /* valid/ready broadcast */
128 |   generate
129 |     for(genvar ii=1; ii<BLOCK_SIZE; ii++) begin : broadcast_ready_gen
130 |       assign feat_i[ii].ready = feat_i[0].ready;
131 |     end
132 |     for(genvar ii=1; ii<INPUT_BUF_SIZE; ii++) begin : broadcast_valid_gen
133 |       assign feat_o[ii].valid = feat_o[0].valid;
134 |     end
135 |   endgenerate
136 | 
137 |   /* control */
138 | 
139 |   // finite-state machine + buffer virtual length counter
140 |   always_ff @(posedge clk_i or negedge rst_ni)
141 |   begin : fsm_seq
142 |     if(~rst_ni)
143 |       fsm_cs <= IB_IDLE;
144 |     else if(clear_i)
145 |       fsm_cs <= IB_IDLE;
146 |     else if(enable_i)
147 |       fsm_cs <= fsm_ns;
148 |   end
149 | 
150 | 
151 |   always_comb
152 |   begin : fsm_comb
153 |     fsm_ns          = fsm_cs;
154 |     feat_i[0].ready = 1'b0;
155 |     feat_o[0].valid = 1'b0;
156 |     vlen_cnt_clr    = 1'b1;
157 |     vlen_cnt_gl_en  = 1'b0;
158 | 
159 |     case (fsm_cs)
160 |       // in IB_IDLE state, wait for a IB_LOAD / IB_EXTRACT command
161 |       IB_IDLE: begin
162 |         if(ctrl_i.goto_load)
163 |           fsm_ns = IB_LOAD;
164 |         else if(ctrl_i.goto_extract)
165 |           fsm_ns = IB_EXTRACT;
166 |       end
167 | 
168 |       // in IB_LOAD state, raise the ready for the stream hs until the buffer virtual length vlen has been reached
169 |       IB_LOAD: begin
170 |         feat_i[0].ready = ctrl_i.filter_mode == NE16_FILTER_MODE_1X1 ? mask_1x1[vlen_cnt] : 1'b1;
171 |         vlen_cnt_gl_en = 1'b1;
172 |         vlen_cnt_clr = 1'b0;
173 |         if(scm_we && ({1'b0, vlen_cnt} == ctrl_i.load_len-1)) begin
174 |           fsm_ns = IB_EXTRACT; // NOTE(review): the original comment said an intermediate IB_IDLE state is needed
175 |                                // (latch-based register), but the code goes straight to IB_EXTRACT -- confirm intent
176 |           vlen_cnt_clr = 1'b1; // last word written: restart the count for the next load
177 |         end
178 |       end
179 | 
180 |       // in IB_EXTRACT state, raise the valid for the feat hs until the buffer virtual length vlen has been reached
181 |       IB_EXTRACT: begin
182 |         feat_o[0].valid = 1'b1;
183 |         vlen_cnt_gl_en = 1'b0;
184 |         vlen_cnt_clr = 1'b1;
185 |         if(ctrl_i.goto_idle) begin
186 |           fsm_ns = IB_IDLE;
187 |         end
188 |       end
189 | 
190 |       default : begin
191 |         if(ctrl_i.goto_load)
192 |           fsm_ns = IB_LOAD;
193 |         else if(ctrl_i.goto_extract)
194 |           fsm_ns = IB_EXTRACT;
195 |       end
196 | 
197 |     endcase
198 |   end
199 | 
200 |   // virtual length counter: counts words written to the SCM (scm_we) while in IB_LOAD; it holds its value otherwise
201 |   assign vlen_cnt_en = scm_we;
202 |   assign vlen_cnt_next = (vlen_cnt_clr == 1'b0) ? vlen_cnt + 1 : '0;
203 |   always_ff @(posedge clk_i or negedge rst_ni)
204 |   begin : vlen_counter
205 |     if(~rst_ni)      vlen_cnt <= '0;
206 |     else if(clear_i) vlen_cnt <= '0; // soft clear sends fsm_cs to IB_IDLE: restart the count so no stale address survives
207 |     else if (enable_i & vlen_cnt_gl_en) begin
208 |       if (vlen_cnt_clr == 1'b1)
209 |         vlen_cnt <= '0;
210 |       else if(vlen_cnt_en)
211 |         vlen_cnt <= vlen_cnt_next;
212 |     end
213 |   end
214 | 
215 |   assign flags_o.state = fsm_cs;
216 | 
217 | `ifndef SYNTHESIS
218 |   logic [NW-1:0][BLOCK_SIZE-1:0][DW-1:0] scm_input_buffer_reorder; // simulation-only view: [word][element][bit]
219 |   assign scm_input_buffer_reorder = scm_input_buffer;
220 | `endif
221 | 
222 | endmodule // ne16_input_buffer
223 | 


--------------------------------------------------------------------------------
/rtl/input_buffer/ne16_input_buffer_scm.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_input_buffer_scm.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | module ne16_input_buffer_scm
 23 | #(
 24 |   parameter int unsigned ADDR_WIDTH   = 5,   // width of raddr_i / waddr_i
 25 |   parameter int unsigned DATA_WIDTH   = 128, // bits per stored word
 26 |   parameter int unsigned NUM_WORDS    = 25   // number of stored words
 27 | )
 28 | (
 29 |   input  logic                                 clk_i,
 30 |   input  logic                                 rst_ni,
 31 |   input  logic                                 clear_i,
 32 |   input  logic                                 test_mode_i,
 33 | 
 34 |   // Read port
 35 |   input  logic                                 re_i,
 36 |   input  logic [ADDR_WIDTH-1:0]                raddr_i,
 37 |   output logic [DATA_WIDTH-1:0]                rdata_o,
 38 | 
 39 |   // Write port
 40 |   input  logic                                 we_i,
 41 |   input  logic                                 we_all_i,
 42 |   input  logic [ADDR_WIDTH-1:0]                waddr_i,
 43 |   input  logic [DATA_WIDTH-1:0]                wdata_i,
 44 | 
 45 |   output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] input_buffer_o
 46 | );
 47 |   // Latch-based storage of NUM_WORDS words; besides the registered read port, all words are output on input_buffer_o.
 48 |   // Storage array, one-hot write select and one gated clock per word
 49 |   logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] buffer;
 50 |   logic [NUM_WORDS-1:0]  waddr_onehot;
 51 |   logic [NUM_WORDS-1:0]  clk_we;
 52 | 
 53 |   logic [DATA_WIDTH-1:0] rdata_q;
 54 |   logic [DATA_WIDTH-1:0] wdata_q;
 55 | 
 56 |   logic clk_gated; // NOTE(review): driven by i_cg_we_global below but not read anywhere in this module
 57 | 
 58 |   // ========================================================================
 59 |   // CLK GATE
 60 |   // ========================================================================
 61 |   cluster_clock_gating i_cg_we_global
 62 |   (
 63 |     .clk_o     ( clk_gated      ),
 64 |     .en_i      ( we_i | clear_i ),
 65 |     .test_en_i ( test_mode_i    ),
 66 |     .clk_i     ( clk_i          )
 67 |   );
 68 | 
 69 |   // ========================================================================
 70 |   // WDATA SAMPLING
 71 |   // ========================================================================
 72 |   always_ff @(posedge clk_i or negedge rst_ni) // write data is registered on clk_i; the latches below store wdata_q
 73 |   begin
 74 |     if(~rst_ni)
 75 |       wdata_q <= '0;
 76 |     else if(clear_i)
 77 |       wdata_q <= '0;
 78 |     else if(we_i)
 79 |       wdata_q <= wdata_i;
 80 |   end
 81 | 
 82 |   // ========================================================================
 83 |   // SCM (LATCHES)
 84 |   // ========================================================================
 85 | 
 86 |   // registered read port: sample buffer[raddr_i] into rdata_q when re_i is set
 87 |   always_ff @(posedge clk_i or negedge rst_ni)
 88 |   begin
 89 |     if(~rst_ni)
 90 |       rdata_q[DATA_WIDTH-1:0] <= '0;
 91 |     else if(clear_i)
 92 |       rdata_q[DATA_WIDTH-1:0] <= '0;
 93 |     else if(re_i) begin
 94 |       rdata_q[DATA_WIDTH-1:0] <= buffer[raddr_i];
 95 |     end
 96 |   end
 97 | 
 98 |   assign rdata_o = rdata_q[DATA_WIDTH-1:0];
 99 | 
100 |   // write decode: word ii is enabled by an addressed write, by a write-all, or by a clear
101 |   generate
102 |     for(genvar ii=0; ii<NUM_WORDS; ii++) begin : WADDR_DECODE
103 | 
104 |       always_comb
105 |       begin : waddr_decoding
106 |         if((we_i==1'b1) && (waddr_i == ii))
107 |           waddr_onehot[ii] = 1'b1;
108 |         else if(we_all_i==1'b1)
109 |           waddr_onehot[ii] = 1'b1;
110 |         else
111 |           waddr_onehot[ii] = clear_i; // a clear enables every word so that all latches can be zeroed
112 |       end
113 | 
114 |     end
115 |   endgenerate
116 | 
117 |   // generate one clock-gating cell for each register element
118 |   generate
119 |     for(genvar ii=0; ii<NUM_WORDS; ii++) begin : CG_CELL_WORD_ITER
120 | 
121 |       cluster_clock_gating i_cg
122 |       (
123 |         .clk_o     ( clk_we[ii]       ),
124 |         .en_i      ( waddr_onehot[ii] ),
125 |         .test_en_i ( test_mode_i      ),
126 |         .clk_i     ( clk_i            )
127 |       );
128 | 
129 |     end
130 |   endgenerate
131 | 
132 |   generate
133 |     // one latch per word, transparent while its gated clock clk_we[ii] is high
134 |     for(genvar ii=0; ii<NUM_WORDS; ii++) begin : LATCH
135 | 
136 |       always_latch
137 |       begin : latch_wdata
138 |         if( clk_we[ii] ) begin
139 |           buffer[ii] = clear_i ? '0 : wdata_q; // NOTE(review): clear_i is read while the latch is open -- presumably stable then; confirm
140 |         end
141 |       end
142 | 
143 |     end
144 | 
145 |   endgenerate
146 | 
147 |   assign input_buffer_o = buffer;
148 | 
149 | endmodule // ne16_input_buffer_scm
150 | 


--------------------------------------------------------------------------------
/rtl/input_buffer/ne16_input_buffer_scm_test_wrap.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_input_buffer_scm_test_wrap.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | module ne16_input_buffer_scm_test_wrap
 23 | #(
 24 |   parameter int unsigned ADDR_WIDTH   = 5,   // forwarded unchanged to ne16_input_buffer_scm
 25 |   parameter int unsigned DATA_WIDTH   = 128, // forwarded unchanged to ne16_input_buffer_scm
 26 |   parameter int unsigned NUM_WORDS    = 25   // forwarded unchanged to ne16_input_buffer_scm
 27 | )
 28 | (
 29 |   input  logic                                 clk_i,
 30 |   input  logic                                 rst_ni,
 31 |   input  logic                                 clear_i,
 32 |   input  logic                                 test_mode_i,
 33 | 
 34 |   // Read port
 35 |   input  logic                                 re_i,
 36 |   input  logic [ADDR_WIDTH-1:0]                raddr_i,
 37 |   output logic [DATA_WIDTH-1:0]                rdata_o,
 38 | 
 39 |   // Write port
 40 |   input  logic                                 we_i,
 41 |   input  logic                                 we_all_i,
 42 |   input  logic [ADDR_WIDTH-1:0]                waddr_i,
 43 |   input  logic [DATA_WIDTH-1:0]                wdata_i,
 44 | 
 45 |   output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] input_buffer_o,
 46 | 
 47 |   // BIST ENABLE
 48 |   input  logic                                BIST,
 49 | 
 50 |   //BIST ports
 51 |   input  logic                                CSN_T,
 52 |   input  logic                                WEN_T,
 53 |   input  logic [ADDR_WIDTH-1:0]               A_T,
 54 |   input  logic [DATA_WIDTH-1:0]               D_T,
 55 |   output logic [DATA_WIDTH-1:0]               Q_T
 56 | );
 57 |   // Test wrapper around ne16_input_buffer_scm: with BIST set, the memory is driven from the *_T test ports
 58 |   // instead of the functional read/write ports. Q_T always mirrors rdata_o.
 59 |    logic                         ReadEnable_muxed;
 60 |    logic [ADDR_WIDTH-1:0]        ReadAddr_muxed;
 61 | 
 62 |    logic                         WriteEnable_all_muxed;
 63 |    logic                         WriteEnable_muxed;
 64 |    logic [ADDR_WIDTH-1:0]        WriteAddr_muxed;
 65 |    logic [DATA_WIDTH-1:0]        WriteData_muxed;
 66 | 
 67 |    logic                         clear_muxed;
 68 | 
 69 |    always_comb // NOTE: if/else on purpose -- an X/Z BIST evaluates as false and selects the functional branch
 70 |    begin
 71 |       if(BIST)
 72 |       begin
 73 |          ReadEnable_muxed  = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b1)); // selected (CSN_T low) and WEN_T high -> read
 74 |          ReadAddr_muxed    = A_T;
 75 | 
 76 |          WriteEnable_all_muxed = 1'b0; // write-all is not available in test mode
 77 |          WriteEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b0)); // selected (CSN_T low) and WEN_T low -> write
 78 |          WriteAddr_muxed   = A_T;
 79 |          WriteData_muxed   = D_T;
 80 | 
 81 |          clear_muxed       = 1'b0; // the functional clear is masked while BIST is active
 82 |       end
 83 |       else
 84 |       begin
 85 |          ReadEnable_muxed  = re_i;
 86 |          ReadAddr_muxed    = raddr_i;
 87 | 
 88 |          WriteEnable_muxed     = we_i;
 89 |          WriteEnable_all_muxed = we_all_i;
 90 |          WriteAddr_muxed       = waddr_i;
 91 |          WriteData_muxed       = wdata_i;
 92 | 
 93 |          clear_muxed           = clear_i;
 94 |       end
 95 |    end
 96 | 
 97 |     assign Q_T = rdata_o; // test read data is the same registered read port seen by the functional side
 98 | 
 99 | 
100 |     ne16_input_buffer_scm
101 |     #(
102 |       .ADDR_WIDTH     ( ADDR_WIDTH ),
103 |       .DATA_WIDTH     ( DATA_WIDTH ),
104 |       .NUM_WORDS      ( NUM_WORDS  )
105 |     )
106 |     ne16_input_buffer_scm_i
107 |     (
108 |       .clk_i          ( clk_i                 ),
109 |       .rst_ni         ( rst_ni                ),
110 |       .clear_i        ( clear_muxed           ),
111 |       .test_mode_i    ( test_mode_i           ),
112 | 
113 |       // Read port
114 |       .re_i           ( ReadEnable_muxed      ),
115 |       .raddr_i        ( ReadAddr_muxed        ),
116 |       .rdata_o        ( rdata_o               ),
117 | 
118 |       // Write port
119 |       .we_i           ( WriteEnable_muxed     ),
120 |       .we_all_i       ( WriteEnable_all_muxed ),
121 |       .waddr_i        ( WriteAddr_muxed       ),
122 |       .wdata_i        ( WriteData_muxed       ),
123 | 
124 |       .input_buffer_o ( input_buffer_o        )
125 |     );
126 | 
127 | endmodule : ne16_input_buffer_scm_test_wrap
128 | 


--------------------------------------------------------------------------------
/rtl/ne16_engine.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_engine.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | 
 24 | module ne16_engine #(
 25 |   parameter int unsigned COLUMN_SIZE    = NE16_COLUMN_SIZE, // number of BinConv blocks per column (default 9)
 26 |   parameter int unsigned NR_COLUMN      = NE16_COLUMN_SIZE, // number of BinConv columns (default 9 -- same of size of BinConv columns!)
 27 |   parameter int unsigned BLOCK_SIZE     = NE16_BLOCK_SIZE,  // number of SoP's per BinConv block (default 4)
 28 |   parameter int unsigned INPUT_BUF_SIZE = 32*BLOCK_SIZE,    // TODO FIXME
 29 |   parameter int unsigned TP_IN          = NE16_TP_IN,       // number of input elements processed per cycle
 30 |   parameter int unsigned TP_OUT         = NE16_TP_OUT
 31 | ) (
 32 |   // global signals
 33 |   input  logic                   clk_i,
 34 |   input  logic                   rst_ni,
 35 |   input  logic                   test_mode_i,
 36 |   // local enable & clear
 37 |   input  logic                   enable_i,
 38 |   input  logic                   clear_i,
 39 |   // input streams + handshake
 40 |   hwpe_stream_intf_stream.sink   load_in,
 41 |   hwpe_stream_intf_stream.sink   load_weight,
 42 |   hwpe_stream_intf_stream.sink   load_norm,
 43 |   hwpe_stream_intf_stream.sink   load_streamin,
 44 |   hwpe_stream_intf_stream.source store_out,
 45 |   input  ctrl_engine_t           ctrl_i,
 46 |   output flags_engine_t          flags_o
 47 | );
 48 | 
 49 |   /* Local Params, Interfaces, and Signals */
 50 |   localparam BLOCK_PRES_SIZE  = NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE);
 51 |   localparam COLUMN_PRES_SIZE = BLOCK_PRES_SIZE+$clog2(COLUMN_SIZE);
 52 | 
 53 |   logic                      all_norm_ready;
 54 |   logic [NE16_NR_COLUMN-1:0] all_norm_ready_tree;
 55 | 
 56 |   hwpe_stream_intf_stream #(
 57 |     .DATA_WIDTH ( NE16_QA_IN )
 58 | `ifndef SYNTHESIS
 59 |     ,
 60 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 61 |     .BYPASS_VDR_ASSERT( 1'b1  )
 62 | `endif
 63 |   ) load_in_blocks [BLOCK_SIZE-1:0] (
 64 |     .clk ( clk_i )
 65 |   );
 66 | 
 67 |   hwpe_stream_intf_stream #(
 68 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
 69 | `ifndef SYNTHESIS
 70 |     ,
 71 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 72 |     .BYPASS_VDR_ASSERT( 1'b1  )
 73 | `endif
 74 |   ) load_weight_fifo (
 75 |     .clk ( clk_i )
 76 |   );
 77 | 
 78 |   hwpe_stream_intf_stream #(
 79 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
 80 | `ifndef SYNTHESIS
 81 |     ,
 82 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 83 |     .BYPASS_VDR_ASSERT( 1'b1  )
 84 | `endif
 85 |   ) load_weight_fifo_demuxed [1:0] (
 86 |     .clk ( clk_i )
 87 |   );
 88 | 
 89 |   hwpe_stream_intf_stream #(
 90 |     .DATA_WIDTH ( TP_IN )
 91 | `ifndef SYNTHESIS
 92 |     ,
 93 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 94 |     .BYPASS_VDR_ASSERT( 1'b1  )
 95 | `endif
 96 |   ) load_weight_rows_mode8 [15:0] (
 97 |     .clk ( clk_i )
 98 |   );
 99 | 
100 |   hwpe_stream_intf_stream #(
101 |     .DATA_WIDTH ( TP_IN/2 )
102 | `ifndef SYNTHESIS
103 |     ,
104 |     .BYPASS_VCR_ASSERT( 1'b1  ),
105 |     .BYPASS_VDR_ASSERT( 1'b1  )
106 | `endif
107 |   ) load_weight_rows_mode16_8bit [31:0] (
108 |     .clk ( clk_i )
109 |   );
110 | 
111 |   hwpe_stream_intf_stream #(
112 |     .DATA_WIDTH ( TP_IN )
113 | `ifndef SYNTHESIS
114 |     ,
115 |     .BYPASS_VCR_ASSERT( 1'b1  ),
116 |     .BYPASS_VDR_ASSERT( 1'b1  )
117 | `endif
118 |   ) load_weight_rows_mode16 [31:0] (
119 |     .clk ( clk_i )
120 |   );
121 | 
122 |   hwpe_stream_intf_stream #(
123 |     .DATA_WIDTH ( TP_IN )
124 | `ifndef SYNTHESIS
125 |     ,
126 |     .BYPASS_VCR_ASSERT( 1'b1  ),
127 |     .BYPASS_VDR_ASSERT( 1'b1  )
128 | `endif
129 |   ) load_weight_rows_conv [COLUMN_SIZE-1:0] (
130 |     .clk ( clk_i )
131 |   );
132 | 
133 |   hwpe_stream_intf_stream #(
134 |     .DATA_WIDTH ( TP_IN )
135 | `ifndef SYNTHESIS
136 |     ,
137 |     .BYPASS_VCR_ASSERT( 1'b1  ),
138 |     .BYPASS_VDR_ASSERT( 1'b1  )
139 | `endif
140 |   ) load_weight_rows_linear [31:0] (
141 |     .clk ( clk_i )
142 |   );
143 | 
144 |   hwpe_stream_intf_stream #(
145 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
146 | `ifndef SYNTHESIS
147 |     ,
148 |     .BYPASS_VCR_ASSERT( 1'b1  ),
149 |     .BYPASS_VDR_ASSERT( 1'b1  )
150 | `endif
151 |   ) store_out_cols [NR_COLUMN-1:0] (
152 |     .clk ( clk_i )
153 |   );
154 | 
155 |   hwpe_stream_intf_stream #(
156 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
157 | `ifndef SYNTHESIS
158 |     ,
159 |     .BYPASS_VCR_ASSERT( 1'b1  ),
160 |     .BYPASS_VDR_ASSERT( 1'b1  )
161 | `endif
162 |   ) load_streamin_cols [NR_COLUMN-1:0] (
163 |     .clk ( clk_i )
164 |   );
165 | 
166 |   hwpe_stream_intf_stream #(
167 |     .DATA_WIDTH ( NE16_QA_IN )
168 | `ifndef SYNTHESIS
169 |     ,
170 |     .BYPASS_VCR_ASSERT( 1'b1  ),
171 |     .BYPASS_VDR_ASSERT( 1'b1  )
172 | `endif
173 |   ) in_from_buf [INPUT_BUF_SIZE-1:0] (
174 |     .clk ( clk_i )
175 |   );
176 | 
177 |   hwpe_stream_intf_stream #(
178 |     .DATA_WIDTH ( COLUMN_PRES_SIZE )
179 | `ifndef SYNTHESIS
180 |     ,
181 |     .BYPASS_VCR_ASSERT( 1'b1  ),
182 |     .BYPASS_VDR_ASSERT( 1'b1  )
183 | `endif
184 |   ) pres [NR_COLUMN-1:0] (
185 |     .clk ( clk_i )
186 |   );
187 | 
188 |   hwpe_stream_intf_stream #(
189 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
190 | `ifndef SYNTHESIS
191 |     ,
192 |     .BYPASS_VCR_ASSERT( 1'b1  ),
193 |     .BYPASS_VDR_ASSERT( 1'b1  )
194 | `endif
195 |   ) norm [NR_COLUMN-1:0] (
196 |     .clk ( clk_i )
197 |   );
198 | 
199 |   hwpe_stream_intf_stream #(
200 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
201 | `ifndef SYNTHESIS
202 |     ,
203 |     .BYPASS_VCR_ASSERT( 1'b1  ),
204 |     .BYPASS_VDR_ASSERT( 1'b1  )
205 | `endif
206 |   ) load_norm_fifo (
207 |     .clk ( clk_i )
208 |   );
209 | 
210 |   hwpe_stream_intf_stream #(
211 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
212 | `ifndef SYNTHESIS
213 |     ,
214 |     .BYPASS_VCR_ASSERT( 1'b1  ),
215 |     .BYPASS_VDR_ASSERT( 1'b1  )
216 | `endif
217 |   ) load_streamin_fifo (
218 |     .clk ( clk_i )
219 |   );
220 | 
221 |   // Infeat data from the input buffer is split into BLOCK_SIZE blocks of NE16_QA_IN bits each (16 x 8 bits in the diagram)
222 |   //
223 |   //            load_in[128b]
224 |   //                 ||
225 |   //                 \/
226 |   //         +-----------------+
227 |   //         |hwpe_stream_split|
228 |   //         +-----------------+
229 |   //                 ||
230 |   //                 \/
231 |   //        load_in_blocks[15:0][8b]
232 | 
233 |   hwpe_stream_split #(
234 |     .NB_OUT_STREAMS ( BLOCK_SIZE            ),
235 |     .DATA_WIDTH_IN  ( NE16_QA_IN*BLOCK_SIZE )
236 |   ) i_split_load_in_blocks (
237 |     .clk_i   ( clk_i          ),
238 |     .rst_ni  ( rst_ni         ),
239 |     .clear_i ( clear_i        ),
240 |     .push_i  ( load_in        ),
241 |     .pop_o   ( load_in_blocks )
242 |   );
243 |   
244 |   // The following diagram explains the way that the weight stream is split in order to
245 |   // support the various CONV modes and the LINEAR mode at 16 and 8 bits.
246 |   // 
247 |   //                               load_weight[256b]
248 |   //                                      ||
249 |   //                                      \/
250 |   //                                    |____|
251 |   //                                    |____| hwpe_stream_fifo
252 |   //                                      ||
253 |   //                                      \/
254 |   //                             load_weight_fifo[256b]
255 |   //                                      ||
256 |   //                                      \/
257 |   //                          /------------------------\
258 |   // ctrl_i.mode16 ------->  /__________________________\
259 |   //                          || 0                  1 ||
260 |   //                          \/                      \/
261 |   //       load_weight_fifo_demuxed[0][256b] load_weight_fifo_demuxed[1][256b]
262 |   //                          ||                      ||
263 |   //                          \/                      \/
264 |   //                 +-----------------+      +-----------------+
265 |   //                 |hwpe_stream_split|      |hwpe_stream_split|
266 |   //                 +-----------------+      +-----------------+
267 |   //                          ||                      ||
268 |   //                          ||                      \/
269 |   //                          ||                load_weight_rows_mode16_8bit[31:0][8b]
270 |   //                          ||                      ||
271 |   //                          ||                      \/
272 |   //                          ||              +-----------------+
273 |   //                          ||              |   zero-extend   |
274 |   //                          ||              +-----------------+
275 |   //                          ||                      ||
276 |   //                          \/                      \/
277 |   //       load_weight_rows_mode8[15:0][16b]    load_weight_rows_mode16[31:0][16b]
278 |   //
279 |   //
280 |   // Convolutional modes actually use only 144 of the 256bits of memory interface:
281 |   //       load_weight_rows_mode8[15:0][16b]    load_weight_rows_mode16[31:0][16b]
282 |   //                          || [8:0]                || [8:0]
283 |   //                          \/ 0                  1 \/
284 |   //                         \--------------------------/
285 |   // ctrl_i.mode16 ---------->\________________________/
286 |   //                                      ||
287 |   //                                      \/
288 |   //                          load_weight_rows_conv[8:0][16b]
289 |   //
290 |   // 
291 |   // Linear mode uses 256 bits of bandwidth in both 8 and 16 bit modes -- with 16 bit mode using 2x the number of MACs
292 |   //
293 |   //       load_weight_rows_mode8[15:0][16b]    load_weight_rows_mode16[31:0][16b]                     256b zeros              load_weight_rows_mode16[31:0][16b]  
294 |   //                          ||                      || [15:0]                                              ||                      || [31:16]           
295 |   //                          \/ 0                  1 \/                                                     \/ 0                  1 \/                            
296 |   //                         \--------------------------/                                                   \--------------------------/                           
297 |   // ctrl_i.mode16 ---------->\________________________/                            ctrl_i.mode16 ---------->\________________________/                            
298 |   //                                      ||                                                                             ||                                        
299 |   //                                      \/                                                                             \/                                        
300 |   //                          load_weight_rows_linear[15:0][16b]                                             load_weight_rows_linear[31:0][16b]                    
301 | 
302 |   hwpe_stream_fifo #(
303 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH ),
304 |     .FIFO_DEPTH ( 2                  )
305 |   ) i_fifo_load_weight (
306 |     .clk_i   ( clk_i            ),
307 |     .rst_ni  ( rst_ni           ),
308 |     .clear_i ( clear_i          ),
309 |     .flags_o (                  ),
310 |     .push_i  ( load_weight      ),
311 |     .pop_o   ( load_weight_fifo )
312 |   );
313 | 
314 |   hwpe_stream_demux_static #(
315 |     .NB_OUT_STREAMS ( 2 )
316 |   ) i_fifo_load_weight_fifo_demux (
317 |     .clk_i   ( clk_i                    ),
318 |     .rst_ni  ( rst_ni                   ),
319 |     .clear_i ( clear_i                  ),
320 |     .sel_i   ( ctrl_i.mode_16           ),
321 |     .push_i  ( load_weight_fifo         ),
322 |     .pop_o   ( load_weight_fifo_demuxed )
323 |   );
324 | 
325 |   hwpe_stream_split #(
326 |     .NB_OUT_STREAMS ( 16                 ),
327 |     .DATA_WIDTH_IN  ( NE16_MEM_BANDWIDTH )
328 |   ) i_split_load_weight_rows_mode8 (
329 |     .clk_i   ( clk_i                       ),
330 |     .rst_ni  ( rst_ni                      ),
331 |     .clear_i ( clear_i                     ),
332 |     .push_i  ( load_weight_fifo_demuxed[0] ),
333 |     .pop_o   ( load_weight_rows_mode8      )
334 |   );
335 | 
336 |   hwpe_stream_split #(
337 |     .NB_OUT_STREAMS ( 32                 ),
338 |     .DATA_WIDTH_IN  ( NE16_MEM_BANDWIDTH )
339 |   ) i_split_load_weight_rows_mode16 (
340 |     .clk_i   ( clk_i                        ),
341 |     .rst_ni  ( rst_ni                       ),
342 |     .clear_i ( clear_i                      ),
343 |     .push_i  ( load_weight_fifo_demuxed[1]  ),
344 |     .pop_o   ( load_weight_rows_mode16_8bit )
345 |   );
346 | 
347 |   generate
348 |     // Adapt the 32 8-bit lanes to 16 bits by zero-extension; valid/strb are taken from lane 0.
349 |     for(genvar ii=0; ii<32; ii++) begin: load_weight_rows_mode16_adapt_gen
350 |       assign load_weight_rows_mode16[ii].data  = { 8'b0, load_weight_rows_mode16_8bit[ii].data };
351 |       assign load_weight_rows_mode16[ii].valid = load_weight_rows_mode16_8bit[0].valid;
352 |       assign load_weight_rows_mode16[ii].strb  = load_weight_rows_mode16_8bit[0].strb;
353 |       assign load_weight_rows_mode16_8bit[ii].ready = load_weight_rows_mode16[ii].ready;
354 |     end
355 |     // Only lane 0 of each consumer-side stream is sampled for the ready handshake.
356 |     logic ready_conv, ready_linear;
357 |     assign ready_conv = load_weight_rows_conv[0].ready;
358 |     assign ready_linear = load_weight_rows_linear[0].ready;
359 |     // CONV weights: first COLUMN_SIZE lanes of the path selected by ctrl_i.mode_16.
360 |     for(genvar ii=0; ii<COLUMN_SIZE; ii++) begin: load_weight_rows_mode16_conv_mux_gen
361 |       assign load_weight_rows_conv[ii].data  = ctrl_i.mode_16 ? load_weight_rows_mode16[ii].data : load_weight_rows_mode8[ii].data;
362 |       assign load_weight_rows_conv[ii].valid = ctrl_i.mode_16 ? load_weight_rows_mode16[0].valid : load_weight_rows_mode8[0].valid;
363 |       assign load_weight_rows_conv[ii].strb  = '1;
364 |     end
365 |     // LINEAR weights, lanes 0-15: taken from the path selected by ctrl_i.mode_16.
366 |     for(genvar ii=0; ii<16; ii++) begin: load_weight_rows_mode16_linear_mux_lo_gen
367 |       assign load_weight_rows_linear[ii].data  = ctrl_i.mode_16 ? load_weight_rows_mode16[ii].data : load_weight_rows_mode8[ii].data;
368 |       assign load_weight_rows_linear[ii].valid = ctrl_i.mode_16 ? load_weight_rows_mode16[0].valid : load_weight_rows_mode8[0].valid;
369 |       assign load_weight_rows_linear[ii].strb  = '1;
370 |     end
371 |     // LINEAR weights, lanes 16-31: driven only in 16-bit mode, otherwise zero data and no valid.
372 |     for(genvar ii=16; ii<32; ii++) begin: load_weight_rows_mode16_linear_mux_hi_gen
373 |       assign load_weight_rows_linear[ii].data  = ctrl_i.mode_16 ? load_weight_rows_mode16[ii].data : '0;
374 |       assign load_weight_rows_linear[ii].valid = ctrl_i.mode_16 ? load_weight_rows_mode16[0].valid : '0;
375 |       assign load_weight_rows_linear[ii].strb  = '1;
376 |     end
377 |     // Ready (conv OR linear) goes back only to the split selected by ctrl_i.mode_16; other gets 0.
378 |     for(genvar ii=0; ii<16; ii++) begin: load_weight_rows_lo_ready_prop_gen
379 |       assign load_weight_rows_mode16[ii].ready =  ctrl_i.mode_16 ? ready_conv | ready_linear : 1'b0;
380 |       assign load_weight_rows_mode8[ii].ready  = ~ctrl_i.mode_16 ? ready_conv | ready_linear : 1'b0;
381 |     end
382 |     // Lanes 16-31 exist only on the mode16 path (the mode8 split has 16 output streams).
383 |     for(genvar ii=16; ii<32; ii++) begin: load_weight_rows_hi_ready_prop_gen
384 |       assign load_weight_rows_mode16[ii].ready =  ctrl_i.mode_16 ? ready_conv | ready_linear : 1'b0;
385 |     end
386 | 
387 |   endgenerate
388 | 
389 |   // Streamout data from the column accumulators is serialized one column after the other
390 |   //
391 |   //        store_out_cols[8:0][256b]
392 |   //                 ||
393 |   //                 \/
394 |   //       +---------------------+
395 |   //       |hwpe_stream_serialize|
396 |   //       +---------------------+
397 |   //                 ||
398 |   //                 \/
399 |   //           store_out[256b]
400 | 
401 |   hwpe_stream_serialize #(
402 |     .NB_IN_STREAMS ( NR_COLUMN          ),
403 |     .DATA_WIDTH    ( NE16_MEM_BANDWIDTH )
404 |   ) i_serialize_store_out (
405 |     .clk_i   ( clk_i                 ),
406 |     .rst_ni  ( rst_ni                ),
407 |     .clear_i ( clear_i               ),
408 |     .ctrl_i  ( ctrl_i.ctrl_serialize ),
409 |     .push_i  ( store_out_cols        ),
410 |     .pop_o   ( store_out             )
411 |   );
412 | 
413 |   // Streamin data going into the column accumulators comes per column and is deserialized
414 |   //
415 |   //          load_streamin[256b]
416 |   //                 ||
417 |   //                 \/
418 |   //               |____|
419 |   //               |____| hwpe_stream_fifo
420 |   //                 ||
421 |   //                 \/
422 |   //        load_streamin_fifo[256b]
423 |   //                 ||
424 |   //                 \/
425 |   //      +-----------------------+
426 |   //      |hwpe_stream_deserialize|
427 |   //      +-----------------------+
428 |   //                 ||
429 |   //                 \/
430 |   //           load_streamin_cols[8:0][256b]
431 | 
432 |   hwpe_stream_fifo #(
433 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH ),
434 |     .FIFO_DEPTH ( 2                  )
435 |   ) i_fifo_load_streamin (
436 |     .clk_i   ( clk_i              ),
437 |     .rst_ni  ( rst_ni             ),
438 |     .clear_i ( clear_i            ),
439 |     .flags_o (                    ),
440 |     .push_i  ( load_streamin      ),
441 |     .pop_o   ( load_streamin_fifo )
442 |   );
443 | 
444 |   hwpe_stream_deserialize #(
445 |     .NB_OUT_STREAMS ( NR_COLUMN          ),
446 |     .DATA_WIDTH     ( NE16_MEM_BANDWIDTH )
447 |   ) i_deserialize_load_streamin (
448 |     .clk_i   ( clk_i                      ),
449 |     .rst_ni  ( rst_ni                     ),
450 |     .clear_i ( clear_i | ctrl_i.clear_des ),
451 |     .ctrl_i  ( ctrl_i.ctrl_serialize      ),
452 |     .push_i  ( load_streamin_fifo         ),
453 |     .pop_o   ( load_streamin_cols         )
454 |   );
455 | 
456 |   // The same norm stream, coming simply from a FIFO, is shared between all columns.
457 |   //
458 |   //          load_norm[256b]
459 |   //                 ||
460 |   //                 \/
461 |   //               |____|
462 |   //               |____| hwpe_stream_fifo
463 |   //                 ||
464 |   //                 \/
465 |   //          load_norm_fifo[256b]
466 |   //                 || copy 9x
467 |   //                 \/
468 |   //             norm[8:0][256b]
469 | 
470 |   // enqueue norm stream
471 |   hwpe_stream_fifo #(
472 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH ),
473 |     .FIFO_DEPTH ( 2                  )
474 |   ) i_fifo_load_norm (
475 |     .clk_i   ( clk_i          ),
476 |     .rst_ni  ( rst_ni         ),
477 |     .clear_i ( clear_i        ),
478 |     .flags_o (                ),
479 |     .push_i  ( load_norm      ),
480 |     .pop_o   ( load_norm_fifo )
481 |   );
482 | 
483 |   // duplicate norm stream
484 |   generate
485 |     for(genvar ii=0; ii<NE16_NR_COLUMN; ii++) begin // NOTE(review): other loops use NR_COLUMN; confirm equal
486 |       assign all_norm_ready_tree[ii] = norm[ii].ready;      // collect the per-column ready
487 |       assign norm[ii].data           = load_norm_fifo.data;  // same FIFO output is broadcast to every column
488 |       assign norm[ii].valid          = load_norm_fifo.valid;
489 |       assign norm[ii].strb           = load_norm_fifo.strb;
490 |     end
491 | 
492 |     assign all_norm_ready = &(all_norm_ready_tree); // AND-reduce: high only when every column is ready
493 |     assign load_norm_fifo.ready = all_norm_ready;   // the FIFO is popped only when all columns accept
494 |   endgenerate
495 | 
496 |   /* Input Buffer */
497 |   ne16_input_buffer #(
498 |     .INPUT_BUF_SIZE ( INPUT_BUF_SIZE ),
499 |     .BLOCK_SIZE     ( BLOCK_SIZE     ),
500 |     .DW             ( NE16_QA_IN     )
501 |   ) i_input_buffer (
502 |     .clk_i       ( clk_i                      ),
503 |     .rst_ni      ( rst_ni                     ),
504 |     .test_mode_i ( test_mode_i                ),
505 |     .enable_i    ( enable_i                   ),
506 |     .clear_i     ( clear_i                    ),
507 |     .ctrl_i      ( ctrl_i.ctrl_input_buffer   ),
508 |     .flags_o     ( flags_o.flags_input_buffer ),
509 |     .feat_i      ( load_in_blocks             ),
510 |     .feat_o      ( in_from_buf                )
511 |   );
512 | 
513 |   /* BinConv Array */
514 |   ne16_binconv_array #(
515 |     .COLUMN_SIZE    ( COLUMN_SIZE    ),
516 |     .NR_COLUMN      ( NR_COLUMN      ),
517 |     .NR_ACTIVATIONS ( INPUT_BUF_SIZE ),
518 |     .BLOCK_SIZE     ( BLOCK_SIZE     ),
519 |     .TP_IN          ( TP_IN          )
520 |   ) i_binconv_array (
521 |     .clk_i             ( clk_i                          ),
522 |     .rst_ni            ( rst_ni                         ),
523 |     .test_mode_i       ( test_mode_i                    ),
524 |     .enable_i          ( enable_i                       ),
525 |     .clear_i           ( clear_i                        ),
526 |     .activation_i      ( in_from_buf                    ),
527 |     .weight_conv_i     ( load_weight_rows_conv          ),
528 |     .weight_linear_i   ( load_weight_rows_linear        ),
529 |     .pres_o            ( pres                           ),
530 |     .ctrl_i            ( ctrl_i.ctrl_binconv_array      ),
531 |     .flags_o           ( flags_o.flags_binconv_array    )
532 |   );
533 | 
534 |   /* Accumulators + Normalization/Quantization */
535 |   generate
536 |     for (genvar ii=0; ii<NR_COLUMN; ii++) begin : accumulator_gen
537 |       // Column-local control: shared ctrl_accumulator, but enable_streamout = enable_accumulator[ii].
538 |       ctrl_aq_t ctrl_accumulator;
539 |       always_comb
540 |       begin
541 |         ctrl_accumulator = ctrl_i.ctrl_accumulator;
542 |         ctrl_accumulator.enable_streamout = ctrl_i.enable_accumulator[ii];
543 |       end
544 |       // One accumulator/normquant per column, fed by pres[ii], norm[ii] and load_streamin_cols[ii].
545 |       ne16_accumulator_normquant #(
546 |         .TP  ( TP_IN  ),
547 |         .AP  ( TP_OUT ),
548 |         .ACC ( 32     )
549 |       ) i_accumulator (
550 |         .clk_i       ( clk_i                          ),
551 |         .rst_ni      ( rst_ni                         ),
552 |         .test_mode_i ( test_mode_i                    ),
553 |         .enable_i    ( enable_i                       ),
554 |         .clear_i     ( clear_i                        ),
555 |         .conv_i      ( pres [ii]                      ),
556 |         .norm_i      ( norm [ii]                      ),
557 |         .streamin_i  ( load_streamin_cols [ii]        ),
558 |         .conv_o      ( store_out_cols [ii]            ),
559 |         .ctrl_i      ( ctrl_accumulator               ),
560 |         .flags_o     ( flags_o.flags_accumulator [ii] )
561 |       );
562 | 
563 |     end // accumulator_gen
564 |   endgenerate
565 | 
566 | endmodule // ne16_engine
567 | 


--------------------------------------------------------------------------------
/rtl/ne16_package.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_package.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | package ne16_package;
 23 | 
 24 |   // ========================================================================
 25 |   // PULP contents
 26 |   // ========================================================================
 27 | 
 28 |   parameter int NR_HWPE_REG   = 11;
 29 |   parameter int NR_HCI_REG    = 1;
 30 |   parameter int NR_UCODE_REG  = 12;
 31 | 
 32 |   // general PULP environment parameters including clusters etc
 33 |   // default number of cores
 34 |   parameter int NR_CORES = 9;
 35 | 
 36 |   // number of contexts
 37 |   parameter int NR_CONTEXT = 1;
 38 | 
 39 |   // default id width
 40 |   parameter int ID_WIDTH = 16;
 41 | 
 42 |   // number of registers
 43 |   parameter int NR_IO_REGS      = NR_HWPE_REG + NR_UCODE_REG; // 11 + 12 = 23
 44 |   parameter int NR_GENERIC_REGS = NR_HCI_REG;                 // 1
 45 | 
 46 |   // Maximum weight exponent offset (limits MAC bitwidths)
 47 |   // parameter int N2_MAX = 64;
 48 | 
 49 |   // ========================================================================
 50 |   // CTRL Registers
 51 |   // ========================================================================
 52 | 
 53 |   // ctrl counter bit-widths
 54 |   parameter int SPATIAL_CNT_SIZE   = 16;
 55 |   parameter int FILTER_CNT_SIZE    =  5;
 56 |   parameter int FEAT_CNT_SIZE      = 12;
 57 |   parameter int QUANT_CNT_SIZE     =  8;
 58 |   parameter int NB_ACC_CNT_SIZE    =  8;
 59 | 
 60 |   // ========================================================================
 61 |   // BANDWIDTH related types
 62 |   parameter int NE16_MEM_BANDWIDTH_EXT = 288; // bits (9 ports x 32 bits)
 63 |   parameter int NE16_MEM_BANDWIDTH     = 256; // bits (8 ports x 32 bits) -- this is after realignment
 64 |   parameter int NE16_STREAM_BANDWIDTH  = 160; // bits (5 x 32 bits) -- NOTE(review): comment said 9 ports; confirm
 65 | 
 66 |   // ========================================================================
 67 |   // BINCONV related types
 68 |   // Throughput parameter for a single BinConv module
 69 |   parameter int NE16_TP_IN = 16;
 70 |   parameter int NE16_QA_IN = 8;
 71 |   parameter int NE16_QA_OUT = 8;
 72 |   parameter int NE16_TP_OUT = 32;
 73 |   parameter int NE16_QA_16BIT = 8; // overhead in 16-bit mode
 74 | 
 75 |   // number of 1x8-bit multipliers per BinConv block
 76 |   parameter int NE16_BLOCK_SIZE = 16;
 77 | 
 78 |   // architectural parameters of NE16
 79 |   parameter int NE16_INPUT_BUFFER_SIZE = 32;
 80 | 
 81 |   // number of binary BinConv blocks per BinConv column
 82 |   parameter int NE16_COLUMN_SIZE = 9;
 83 | 
 84 |   // number of BinConv columns per BinConv array
 85 |   parameter int NE16_NR_COLUMN = 9;
 86 | 
 87 |   // number of shift cycles
 88 |   parameter int NE16_SHIFT_CYCLES = 2;
 89 | 
 90 |   // ========================================================================
 91 |   // ACCUMULATOR module related types
 92 |   // number of bits used in vlen_cnt
 93 |   parameter int NE16_ACCUM_SIZE = 32;
 94 |   parameter int VLEN_CNT_SIZE           = 16;
 95 | 
 96 |   // (batch-)normalization parameters
 97 |   parameter int unsigned NORM_MULT_SIZE = 8;
 98 | 
 99 |   // ========================================================================
100 |   // FEAT_BUFFER related types
101 |   // ========================================================================
102 |   typedef struct packed {
103 |     logic                     goto_load;
104 |     logic                     goto_extract;
105 |     logic                     goto_idle;
106 |     logic [VLEN_CNT_SIZE-1:0] load_len;
107 |     logic [NE16_INPUT_BUFFER_SIZE-1:0] enable_implicit_padding;
108 |     logic [NE16_INPUT_BUFFER_SIZE-1:0] enable_explicit_padding;
109 |     logic [NE16_QA_IN-1:0]    explicit_padding_value_hi;
110 |     logic [NE16_QA_IN-1:0]    explicit_padding_value_lo;
111 |     logic [1:0]               filter_mode;
112 |   } ctrl_input_buffer_t;
113 | 
114 |   typedef enum {
115 |     IB_IDLE, IB_LOAD, IB_EXTRACT
116 |   } state_input_buffer_t;
117 | 
118 |   typedef struct packed {
119 |     state_input_buffer_t state;
120 |   } flags_input_buffer_t;
121 | 
122 | 
123 |   // ========================================================================
124 |   // SIGN_BUFFER related types
125 |   // ========================================================================
126 |   typedef struct packed {
127 |     logic                     goto_load;
128 |     logic                     goto_extract;
129 |     logic [VLEN_CNT_SIZE-1:0] i_vlen;       // virtual buffer length
130 |     logic [VLEN_CNT_SIZE-1:0] o_vlen;
131 |   } ctrl_sign_buf_t;
132 | 
133 |   typedef enum {
134 |     SR_IDLE, SR_LOAD, SR_EXTRACT
135 |   } state_sign_buf_t;
136 | 
137 |   typedef struct packed {
138 |     state_sign_buf_t state;
139 |   } flags_sign_buf_t;
140 | 
141 | 
142 |   // ========================================================================
143 |   // SOP related types
144 |   // ========================================================================
145 |   typedef struct packed {
146 |     logic                    operation_sel; // 1:xnor, 0: and
147 |     logic [NE16_TP_IN-1:0]   inactive_mask;
148 |     logic                    clear;
149 |   } ctrl_sop_t;
150 | 
151 | 
152 |   // ========================================================================
153 |   // Accumulator Quantizor related types
154 |   // ========================================================================
155 | 
156 |   typedef struct packed {
157 |     logic                            start;
158 |     logic                            relu;
159 |     logic [4:0]                      right_shift;
160 |     logic [1:0]                      norm_mode;
161 |     logic [1:0]                      quant_mode;
162 |     logic                            norm_signed;
163 |     logic                            use_rounding;
164 |     logic                            use_shifting;
165 |   } ctrl_normquant_t;
166 | 
167 |   typedef struct packed {
168 |     logic ready;
169 |   } flags_normquant_t;
170 | 
171 |   parameter logic[1:0] NE16_MODE_8B  = 2'b00;
172 |   parameter logic[1:0] NE16_MODE_16B = 2'b01;
173 |   parameter logic[1:0] NE16_MODE_32B = 2'b10;
174 | 
175 |   parameter logic[1:0] NE16_FILTER_MODE_LINEAR = 2'b11;
176 |   parameter logic[1:0] NE16_FILTER_MODE_1X1    = 2'b10;
177 |   parameter logic[1:0] NE16_FILTER_MODE_3X3_DW = 2'b01;
178 |   parameter logic[1:0] NE16_FILTER_MODE_3X3    = 2'b00;
179 | 
180 |   typedef struct packed {
181 |     logic [       VLEN_CNT_SIZE-1:0] full_accumulation_len;   // nr of accumulations
182 |     logic [       VLEN_CNT_SIZE-1:0] streamout_len;
183 |     logic [       VLEN_CNT_SIZE-1:0] scale_len;
184 |     logic [       VLEN_CNT_SIZE-1:0] bias_len;
185 |     logic                            clear;
186 |     logic                            clear_offset;
187 |     logic                            goto_normquant;
188 |     logic                            goto_accum;
189 |     logic                            goto_streamin;
190 |     logic                            goto_streamout;
191 |     logic                            goto_idle;
192 |     logic                            sample_shift;
193 |     logic [1:0]                      quant_mode;   // 00: 8 bits, 01: 16 bits (reserved for future usage), 10: 32 bits (see NE16_MODE_*)
194 |     logic [1:0]                      norm_mode;    // 00: 8 bits, 01: 16 bits, 10: 32 bits (see NE16_MODE_*)
195 |     ctrl_normquant_t                 ctrl_normquant;
196 |     logic        norm_option_bias;
197 |     logic        norm_option_shift;
198 |     logic weight_offset;
199 |     logic [31:0] weight_offset_scale;
200 |     logic [$clog2(QUANT_CNT_SIZE):0] qw;       // weights quantization
201 |     logic enable_streamout; // overridden per column in ne16_engine from ctrl_engine_t.enable_accumulator
202 |     logic depthwise;
203 |   } ctrl_aq_t;
204 | 
205 |   typedef enum {
206 |     AQ_IDLE, AQ_ACCUM, AQ_NORMQUANT_SHIFT, AQ_NORMQUANT, AQ_NORMQUANT_TOBIAS, AQ_NORMQUANT_BIAS, AQ_STREAMIN, AQ_STREAMOUT, AQ_ACCUM_DONE, AQ_NORMQUANT_DONE, AQ_STREAMIN_DONE, AQ_STREAMOUT_DONE
207 |   } state_aq_t;
208 | 
209 |   typedef struct packed {
210 |     state_aq_t  state;
211 |     logic       addr_cnt_en_q;
212 |   } flags_aq_t;
213 | 
214 |   // ========================================================================
215 |   // SCALE related types
216 |   // ========================================================================
217 | 
218 |   parameter int unsigned MAX_SHIFT = 16;
219 |   typedef struct packed {
220 |     logic [$clog2(MAX_SHIFT):0] shift_sel;
221 |     logic                       invert;
222 |   } ctrl_scale_t;
223 | 
224 |   typedef struct packed {
225 |     logic [$clog2(MAX_SHIFT):0] shift_sel;
226 |   } flags_scale_t;
227 | 
228 |   // ========================================================================
229 |   // BINCONV_BLOCK related types
230 |   // ========================================================================
231 | 
232 |   typedef struct packed {
233 |     logic [$clog2(QUANT_CNT_SIZE):0] qw;
234 |     logic [1:0] filter_mode;              // filter size
235 |     logic [$clog2(8):0] scale_shift;
236 |     logic                weight_offset;
237 |     logic                clear;
238 |     logic [NE16_BLOCK_SIZE-1:0] enable_mac;
239 |     logic [$clog2(NE16_QA_IN):0] block_cnt;
240 |     logic invalidate;
241 |     logic        mode_16;
242 |     logic        mode_linear;
243 |   } ctrl_binconv_block_t;
244 | 
245 |   typedef struct packed {
246 |     flags_scale_t [NE16_BLOCK_SIZE-1:0]  flags_scale;
247 |   } flags_binconv_block_t;
248 | 
249 |   // ========================================================================
250 |   // BINCONV_COLUMN related types
251 |   // ========================================================================
252 | 
253 |   typedef struct packed {
254 |     ctrl_binconv_block_t ctrl_block;
255 |     logic [NE16_COLUMN_SIZE-1:0] enable_block;
256 |     logic [NE16_NR_COLUMN-1:0][NE16_COLUMN_SIZE-1:0] enable_block_linear;
257 |     logic [31:0] padding_value;
258 |   } ctrl_binconv_column_t;
259 | 
260 |   typedef struct packed {
261 |     flags_binconv_block_t [NE16_COLUMN_SIZE-1:0] flags_block;
262 |   } flags_binconv_column_t;
263 | 
264 |   // ========================================================================
265 |   // BINCONV_ARRAY related types
266 |   // ========================================================================
267 | 
268 |   typedef struct packed {
269 |     ctrl_binconv_column_t ctrl_column;
270 |     logic [1:0]  filter_mode;
271 |     logic [NE16_NR_COLUMN-1:0] enable_column;
272 |     logic weight_offset;
273 |     logic [$clog2(NE16_TP_IN):0] depthwise_len;
274 |     logic mode_16;
275 |     logic mode_linear;
276 |   } ctrl_binconv_array_t;
277 | 
278 |   typedef struct packed {
279 |     flags_binconv_column_t [NE16_NR_COLUMN-1:0] flags_column;
280 |   } flags_binconv_array_t;
281 | 
282 |   // ========================================================================
283 |   // ENGINE related types
284 |   // ========================================================================
285 | 
286 |   typedef struct packed {
287 |     ctrl_input_buffer_t  ctrl_input_buffer;
288 |     ctrl_binconv_array_t ctrl_binconv_array;
289 |     ctrl_aq_t            ctrl_accumulator;
290 |     hwpe_stream_package::ctrl_serdes_t ctrl_serialize;
291 |     logic [NE16_NR_COLUMN-1:0] enable_accumulator;
292 |     logic clear_des;
293 |     logic mode_16;
294 |     logic mode_linear;
295 |   } ctrl_engine_t;
296 | 
297 |   typedef struct packed {
298 |     flags_input_buffer_t                  flags_input_buffer;
299 |     flags_aq_t       [NE16_NR_COLUMN-1:0] flags_accumulator;
300 |     flags_binconv_array_t                 flags_binconv_array;
301 |   } flags_engine_t;
302 | 
303 |   // ========================================================================
304 |   // URISCY CTRL related types
305 |   // ========================================================================
306 | 
307 |   typedef struct packed {
308 |     logic start;
309 |   } ctrl_ctrlmult_t;
310 | 
311 |   typedef struct packed {
312 |     logic valid;
313 |   } flags_ctrlmult_t;
314 | 
315 | 
316 |   // ========================================================================
317 |   // STREAMER related types
318 |   // ========================================================================
319 | 
320 |   typedef enum { LD_FEAT_SEL, LD_WEIGHT_SEL, LD_NORM_SEL, LD_STREAMIN_SEL } ld_which_mux_sel_t;
321 |   parameter logic LD_SEL = 1'b0;
322 |   parameter logic ST_SEL = 1'b1;
323 | 
324 |   typedef struct packed {
325 |     ld_which_mux_sel_t               ld_which_mux_sel;
326 |     logic                            ld_st_mux_sel;
327 |     logic                            clear_fifo;
328 |     logic                            clear_source;
329 |     logic                            clear_sink;
330 |     hci_package::hci_streamer_ctrl_t feat_source_ctrl;
331 |     hci_package::hci_streamer_ctrl_t weight_source_ctrl;
332 |     hci_package::hci_streamer_ctrl_t norm_source_ctrl;
333 |     hci_package::hci_streamer_ctrl_t conv_sink_ctrl;
334 |     hci_package::hci_streamer_ctrl_t streamin_source_ctrl;
335 |   } ctrl_streamer_t;
336 | 
337 |   typedef struct packed {
338 |     hci_package::hci_streamer_flags_t feat_source_flags;
339 |     hci_package::hci_streamer_flags_t weight_source_flags;
340 |     hci_package::hci_streamer_flags_t norm_source_flags;
341 |     hci_package::hci_streamer_flags_t conv_sink_flags;
342 |     logic tcdm_fifo_empty;
343 |   } flags_streamer_t;
344 | 
345 | 
346 |   // ========================================================================
347 |   // CTRL FSM related types
348 |   // ========================================================================
349 | 
350 |   typedef enum {
351 |     IDLE, STREAMIN, LOAD, WEIGHTOFFS, MATRIXVEC, NORMQUANT, NORMQUANT_BIAS, NORMQUANT_SHIFT, STREAMOUT, STREAMOUT_DONE, UPDATEIDX, UPDATEIDX_WAIT, DONE
352 |   } state_ne16_t; // FIXME --> move NORMQUANT to SCALE
353 | 
354 |   typedef struct packed {
355 |     logic [31:0] weights_kom_iter;
356 |     logic [31:0] weights_kim_iter;
357 |     logic [31:0] weights_kom_reset_iter;
358 |     logic [31:0] weights_kim_reset_iter;
359 |     logic [31:0] infeat_kim_iter;
360 |     logic [31:0] infeat_wom_iter;
361 |     logic [31:0] infeat_hom_iter;
362 |     logic [31:0] infeat_kim_reset_iter;
363 |     logic [31:0] infeat_wom_reset_iter;
364 |     logic [31:0] infeat_hom_reset_iter;
365 |     logic [31:0] outfeat_wom_iter;
366 |     logic [31:0] outfeat_hom_iter;
367 |     logic [31:0] outfeat_kom_iter;
368 |     logic [31:0] outfeat_wom_reset_iter;
369 |     logic [31:0] outfeat_hom_reset_iter;
370 |     logic [31:0] outfeat_kom_reset_iter;
371 |     logic [31:0] scale_kom_iter;
372 |   } uloop_iter_ne16_t;
373 | 
374 |   typedef struct packed { // job configuration record; the "register ..." notes below are the original author's register names (the decoding from NE16_REG_* words into these fields is not shown in this part of the file)
375 |     logic [31:0] weights_ptr;
376 |     logic [31:0] infeat_ptr;
377 |     logic [31:0] outfeat_ptr;
378 |     logic [31:0] scale_ptr;
379 |     logic [31:0] scale_shift_ptr;
380 |     logic [31:0] scale_bias_ptr;
381 |     logic [15:0] subtile_nb_ko; // register n_tiles_k_out
382 |     logic [15:0] subtile_rem_ko; // register k_out_rest
383 |     logic [15:0] subtile_nb_ki; // register n_tiles_k_in
384 |     logic [15:0] subtile_rem_ki; // register k_in_rest
385 |     logic [15:0] subtile_nb_ho; // register n_tiles_h_out
386 |     logic [15:0] subtile_rem_ho; // register h_out_rest
387 |     logic [15:0] subtile_nb_wo; // register n_tiles_w_out
388 |     logic [15:0] subtile_rem_wo; // register w_out_rest
389 |     logic [15:0] subtile_rem_hi; // register h_in_rest
390 |     logic [15:0] subtile_rem_wi; // register w_in_rest
391 |     logic [31:0] infeat_d0_stride; // register x_word_stride
392 |     logic [31:0] infeat_d1_stride; // register x_line_stride
393 |     logic [31:0] infeat_d2_stride; // register x_block_stride
394 |     logic [31:0] weights_d0_stride; // register W_word_stride
395 |     logic [31:0] weights_d1_stride; // register W_line_stride
396 |     logic [31:0] weights_d2_stride; // register W_block_stride
397 |     logic [31:0] outfeat_d0_stride; // register y_word_stride
398 |     logic [31:0] outfeat_d1_stride; // register y_line_stride
399 |     logic [31:0] outfeat_d2_stride; // register y_block_stride
400 |     logic [3:0]  padding_top;
401 |     logic [3:0]  padding_right;
402 |     logic [3:0]  padding_bottom;
403 |     logic [3:0]  padding_left;
404 |     logic [15:0] padding_value;
405 |     logic        norm_option_bias;
406 |     logic        norm_option_shift;
407 |     logic [31:0] weight_offset_scale;
408 |     logic [7:0]  filter_mask_top;
409 |     logic [7:0]  filter_mask_right;
410 |     logic [7:0]  filter_mask_bottom;
411 |     logic [7:0]  filter_mask_left;
412 |     logic [1:0]  filter_mode;
413 |     logic [1:0]  norm_mode;
414 |     logic [1:0]  quant_mode;
415 |     logic        relu;
416 |     logic        streamin;
417 |     logic        streamout_quant;
418 |     logic        mode_16;
419 |     logic        mode_linear;
420 |     logic        mode_strided;
421 |     logic [3:0]  weight_bits;
422 |     logic        use_rounding;
423 |     logic [4:0]  shift_reqnt;
424 |     uloop_iter_ne16_t uloop_iter; // nested record of uloop increments (see uloop_iter_ne16_t)
425 |   } config_ne16_t;
426 | 
427 |   typedef struct packed { // four 16-bit indices, named after the loop levels used in ucode/code.yml
428 |     logic [15:0] k_out_major;
429 |     logic [15:0] i_major;
430 |     logic [15:0] j_major;
431 |     logic [15:0] k_in_major;
432 |   } index_ne16_t;
433 | 
434 |   typedef struct packed { // one bit per field of index_ne16_t (presumably "this index is being updated" flags -- confirm against the controller)
435 |     logic k_out_major;
436 |     logic i_major;
437 |     logic j_major;
438 |     logic k_in_major;
439 |   } index_update_ne16_t;
440 | 
441 |   typedef struct packed { // four 32-bit base addresses, listed in the same order as NE16_ULOOP_BASE_ADDR_W/X/Y/S (0..3)
442 |     logic [31:0] weights;
443 |     logic [31:0] infeat;
444 |     logic [31:0] outfeat;
445 |     logic [31:0] scale;
446 |   } base_addr_ne16_t;
447 | 
448 |   parameter int unsigned NE16_ULOOP_BASE_ADDR_W            = 0; // base-address indices 0..3: same values as mnemonics base_addr_W/x/y/s in ucode/code.yml
449 |   parameter int unsigned NE16_ULOOP_BASE_ADDR_X            = 1;
450 |   parameter int unsigned NE16_ULOOP_BASE_ADDR_Y            = 2;
451 |   parameter int unsigned NE16_ULOOP_BASE_ADDR_S            = 3;
452 |   parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KOM_ITER  = 4  - 4; // NE16_ULOOP_RO_*: the ucode/code.yml mnemonic value minus 4 (the four base-address indices come first)
453 |   parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KIM_ITER  = 5  - 4;
454 |   parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KOM_RESET_ITER = 6  - 4;
455 |   parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KIM_RESET_ITER = 7  - 4;
456 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_KIM_ITER   = 8  - 4;
457 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_WOM_ITER   = 9  - 4;
458 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_HOM_ITER   = 10 - 4;
459 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_KIM_RESET_ITER  = 11 - 4;
460 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_WOM_RESET_ITER  = 12 - 4;
461 |   parameter int unsigned NE16_ULOOP_RO_INFEAT_HOM_RESET_ITER  = 13 - 4;
462 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_WOM_ITER  = 14 - 4;
463 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_HOM_ITER  = 15 - 4;
464 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_KOM_ITER  = 16 - 4;
465 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_WOM_RESET_ITER = 17 - 4;
466 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_HOM_RESET_ITER = 18 - 4;
467 |   parameter int unsigned NE16_ULOOP_RO_OUTFEAT_KOM_RESET_ITER = 19 - 4;
468 |   parameter int unsigned NE16_ULOOP_RO_SCALE_KOM_ITER         = 20 - 4;
469 |   parameter int unsigned NE16_ULOOP_RO_ZERO                   = 21 - 4; // corresponds to mnemonic "zero" (21) in ucode/code.yml; has no field in uloop_iter_ne16_t
470 | 
471 |   // indices (0..23) of the job registers; implemented with dual-context hwpe regs:
472 |   parameter int NE16_REG_WEIGHTS_PTR       = 0;  // Weights pointer: pointer to Weights tensor in memory (d3=Ko, d2=Fy, d1=Fx, d0=Ki).
473 |   parameter int NE16_REG_INFEAT_PTR        = 1;  // InFeat pointer: pointer to InFeat tensor in memory (d2=Hi, d1=Wi, d0=Ki).
474 |   parameter int NE16_REG_OUTFEAT_PTR       = 2;  // OutFeat pointer: pointer to OutFeat tensor in memory (d2=Ho, d1=Wo, d0=Ko).
475 |   parameter int NE16_REG_SCALE_PTR         = 3;  // Scale pointer: pointer to Scale parameters in memory (d0=Ko).
476 |   parameter int NE16_REG_SCALE_SHIFT_PTR   = 4;  // ScaleShift pointer: pointer to ScaleShift parameters in memory (d0=Ko).
477 |   parameter int NE16_REG_SCALE_BIAS_PTR    = 5;  // ScaleBias pointer: pointer to ScaleBias parameters in memory (d0=Ko).
478 |   parameter int NE16_REG_INFEAT_D0_STRIDE  = 6;  // InFeat d0 stride
479 |   parameter int NE16_REG_INFEAT_D1_STRIDE  = 7;  // InFeat d1 stride
480 |   parameter int NE16_REG_INFEAT_D2_STRIDE  = 8;  // InFeat d2 stride
481 |   parameter int NE16_REG_OUTFEAT_D0_STRIDE = 9;  // OutFeat d0 stride
482 |   parameter int NE16_REG_OUTFEAT_D1_STRIDE = 10; // OutFeat d1 stride
483 |   parameter int NE16_REG_OUTFEAT_D2_STRIDE = 11; // OutFeat d2 stride
484 |   parameter int NE16_REG_WEIGHTS_D0_STRIDE = 12; // Weights d0 stride
485 |   parameter int NE16_REG_WEIGHTS_D1_STRIDE = 13; // Weights d1 stride
486 |   parameter int NE16_REG_WEIGHTS_D2_STRIDE = 14; // Weights d2 stride (may be removable)
487 |   parameter int NE16_REG_SUBTILE_REM0      = 15; // Subtile Remainder 0: [31:16] Ko, [15:0] Ki.
488 |   parameter int NE16_REG_SUBTILE_REM1      = 16; // Subtile Remainder 1: [31:16] Ho, [15:0] Wo.
489 |   parameter int NE16_REG_SUBTILE_REM2      = 17; // Subtile Remainder 2: [31:16] Hi, [15:0] Wi.
490 |   parameter int NE16_REG_SUBTILE_NB0       = 18; // Subtile Number 0: [31:16] Ko, [15:0] Ki.
491 |   parameter int NE16_REG_SUBTILE_NB1       = 19; // Subtile Number 1: [31:16] Ho, [15:0] Wo.
492 |   parameter int NE16_REG_PADDING           = 20; // Padding
493 |   parameter int NE16_REG_WEIGHT_OFFSET     = 21; // Weight offset factor
494 |   parameter int NE16_REG_FILTER_MASK       = 22; // Filter masking: [31:24] top, [23:16] right, [15:8] bottom, [7:0] left.
495 |   parameter int NE16_REG_CONFIG0           = 23; // Config 0:  [31:16] Reserved (striding, dilation?) [15] weight_offseting [14] streamin [13:12] normalization bits (00=8, 01=16, 10=32), [11] rounding (0=round, 1=do not round), [10:7] padding flag (top/right/bottom/left)  [6:5] filter mode (11=linear, 10=1x1, 01=3x3 depthwise, 00=3x3)  [4] streamout / quantization, [3] reserved (16 bits?), [2:0] weight bits minus 1.
496 | 
497 |   // normal uloop microcode, generated by ucode/uloop_compile.py (the sized literals below have fewer digits than the declared width, so the upper bits are zero-filled)
498 |   parameter logic[351:0] ULOOP_CODE_NORMAL     = 352'h04748a101215c078a30b22942d89f0aa15c078a30b22742985701e14405;
499 |   parameter logic[53:0]  ULOOP_LOOPS_NORMAL    = 54'b011001001001100110000100100000000010;
500 |   // depthwise uloop microcode, generated by ucode/uloop_compile_dw.py (same zero-filling of the upper bits applies)
501 |   parameter logic[351:0] ULOOP_CODE_DEPTHWISE  = 352'h0420863a4288a101228c2c8a50b627c2a8a30b227429;
502 |   parameter logic[53:0]  ULOOP_LOOPS_DEPTHWISE = 54'b011100010001101000000100100000000010;
503 | 
504 |   // mapping of weights in linear layers: 81 entries, written here as 9 rows of 9; the first 8 entries of rows 0-3 hold 0..31, all other entries are -1 (presumably "no mapping" -- confirm against the user of this table)
505 |   parameter int NE16_LINEAR_MAP[0:80] = {
506 |     0,  1,  2,  3,  4,  5,  6,  7,  -1,
507 |     8,  9,  10, 11, 12, 13, 14, 15, -1,
508 |     16, 17, 18, 19, 20, 21, 22, 23, -1,
509 |     24, 25, 26, 27, 28, 29, 30, 31, -1,
510 |     -1, -1, -1, -1, -1, -1, -1, -1, -1,
511 |     -1, -1, -1, -1, -1, -1, -1, -1, -1,
512 |     -1, -1, -1, -1, -1, -1, -1, -1, -1,
513 |     -1, -1, -1, -1, -1, -1, -1, -1, -1,
514 |     -1, -1, -1, -1, -1, -1, -1, -1, -1
515 |   };
516 | 
517 | endpackage
518 | 


--------------------------------------------------------------------------------
/rtl/ne16_streamer.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_streamer.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | import hwpe_stream_package::*;
 24 | import hci_package::*;
 25 | 
 26 | module ne16_streamer #( // NE16 load/store streamer: one shared load source and one store sink share a single TCDM port
 27 |   parameter int unsigned TCDM_FIFO_DEPTH = 2, // > 0: place an hci_core_fifo of this depth after the load/store mux; 0: no FIFO
 28 |   parameter int unsigned BW = NE16_MEM_BANDWIDTH_EXT // bandwidth (not referenced in the module body, which uses NE16_MEM_BANDWIDTH_EXT directly)
 29 | ) (
 30 |   // global signals
 31 |   input  logic                   clk_i,
 32 |   input  logic                   rst_ni,
 33 |   input  logic                   test_mode_i,
 34 |   // local enable & clear (enable_i is not referenced in the module body)
 35 |   input  logic                   enable_i,
 36 |   input  logic                   clear_i,
 37 |   // input feat stream + handshake
 38 |   hwpe_stream_intf_stream.source feat_o,
 39 |   // input weight stream + handshake
 40 |   hwpe_stream_intf_stream.source weight_o,
 41 |   // input norm stream + handshake
 42 |   hwpe_stream_intf_stream.source norm_o,
 43 |   // input streamin stream + handshake
 44 |   hwpe_stream_intf_stream.source streamin_o,
 45 |   // output features + handshake
 46 |   hwpe_stream_intf_stream.sink   conv_i,
 47 |   // TCDM ports
 48 |   hci_core_intf.master           tcdm,
 49 |   // control channel
 50 |   input  ctrl_streamer_t         ctrl_i,
 51 |   output flags_streamer_t        flags_o
 52 | );
 53 | 
 54 |   // NE16_MEM_BANDWIDTH parameter: number of bits per tile.
 55 |   // control and flags of the shared load source, and flags of the optional TCDM FIFO
 56 |   hci_streamer_ctrl_t  all_source_ctrl;
 57 |   hci_streamer_flags_t all_source_flags;
 58 |   flags_fifo_t tcdm_fifo_flags;
 59 |   // stream produced by the shared load source (before demultiplexing)
 60 |   hwpe_stream_intf_stream #(
 61 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
 62 | `ifndef SYNTHESIS
 63 |     ,
 64 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 65 |     .BYPASS_VDR_ASSERT( 1'b1  )
 66 | `endif
 67 |   ) all_source (
 68 |     .clk ( clk_i )
 69 |   );
 70 |   // the four demultiplexed load streams: [0] feat, [1] weight, [2] norm, [3] streamin
 71 |   hwpe_stream_intf_stream #(
 72 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH )
 73 | `ifndef SYNTHESIS
 74 |     ,
 75 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 76 |     .BYPASS_VDR_ASSERT( 1'b1  )
 77 | `endif
 78 |   ) virt_source[3:0] (
 79 |     .clk ( clk_i )
 80 |   );
 81 |   // TCDM-side interfaces of the load source ([0]) and of the store sink ([1])
 82 |   hci_core_intf #(
 83 |     .DW ( NE16_MEM_BANDWIDTH_EXT )
 84 |   ) virt_tcdm [1:0] (
 85 |     .clk ( clk_i )
 86 |   );
 87 |   // load/store mux output when the FIFO is instantiated (i.e. the FIFO input)
 88 |   hci_core_intf #(
 89 |     .DW ( NE16_MEM_BANDWIDTH_EXT )
 90 |   ) tcdm_prefifo (
 91 |     .clk ( clk_i )
 92 |   );
 93 |   // input of the r_valid filter: the FIFO output, or the mux output when there is no FIFO
 94 |   hci_core_intf #(
 95 |     .DW ( NE16_MEM_BANDWIDTH_EXT )
 96 |   ) tcdm_prefilter (
 97 |     .clk ( clk_i )
 98 |   );
 99 |   // shared load source: enabled while ctrl_i.ld_st_mux_sel is 0
100 |   hci_core_source #(
101 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH_EXT )
102 |   ) i_all_source (
103 |     .clk_i       ( clk_i                         ),
104 |     .rst_ni      ( rst_ni                        ),
105 |     .test_mode_i ( test_mode_i                   ),
106 |     .clear_i     ( clear_i | ctrl_i.clear_source ),
107 |     .enable_i    ( ~ctrl_i.ld_st_mux_sel         ),
108 |     .tcdm        ( virt_tcdm [0]                 ),
109 |     .stream      ( all_source                    ),
110 |     .ctrl_i      ( all_source_ctrl               ),
111 |     .flags_o     ( all_source_flags              )
112 |   );
113 |   // store sink for conv_i: enabled while ctrl_i.ld_st_mux_sel is 1
114 |   hci_core_sink #(
115 |     .DATA_WIDTH ( NE16_MEM_BANDWIDTH_EXT )
116 |   ) i_sink (
117 |     .clk_i       ( clk_i                       ),
118 |     .rst_ni      ( rst_ni                      ),
119 |     .test_mode_i ( test_mode_i                 ),
120 |     .clear_i     ( clear_i | ctrl_i.clear_sink ),
121 |     .enable_i    ( ctrl_i.ld_st_mux_sel        ),
122 |     .tcdm        ( virt_tcdm [1]               ),
123 |     .stream      ( conv_i                      ),
124 |     .ctrl_i      ( ctrl_i.conv_sink_ctrl       ),
125 |     .flags_o     ( flags_o.conv_sink_flags     )
126 |   );
127 |   // static load/store mux (select = ctrl_i.ld_st_mux_sel), followed by a FIFO only when TCDM_FIFO_DEPTH > 0
128 |   generate
129 |     if(TCDM_FIFO_DEPTH > 0) begin : use_fifo_gen
130 |       hci_core_mux_static #(
131 |         .NB_CHAN (2),
132 |         .DW ( NE16_MEM_BANDWIDTH_EXT )
133 |       ) i_ld_st_mux_static (
134 |         .clk_i   ( clk_i                ),
135 |         .rst_ni  ( rst_ni               ),
136 |         .clear_i ( clear_i              ),
137 |         .sel_i   ( ctrl_i.ld_st_mux_sel ),
138 |         .in      ( virt_tcdm            ),
139 |         .out     ( tcdm_prefifo         )
140 |       );
141 | 
142 |       hci_core_fifo #(
143 |         .FIFO_DEPTH ( TCDM_FIFO_DEPTH        ),
144 |         .DW         ( NE16_MEM_BANDWIDTH_EXT ),
145 |         .AW         ( 32                     ),
146 |         .OW         (  1                     )
147 |       ) i_tcdm_fifo (
148 |         .clk_i       ( clk_i                       ),
149 |         .rst_ni      ( rst_ni                      ),
150 |         .clear_i     ( clear_i | ctrl_i.clear_fifo ),
151 |         .flags_o     ( tcdm_fifo_flags             ),
152 |         .tcdm_slave  ( tcdm_prefifo                ),
153 |         .tcdm_master ( tcdm_prefilter              )
154 |       );
155 |     end
156 |     else begin : dont_use_fifo_gen
157 |       hci_core_mux_static #(
158 |         .NB_CHAN (2),
159 |         .DW ( NE16_MEM_BANDWIDTH_EXT )
160 |       ) i_ld_st_mux_static (
161 |         .clk_i   ( clk_i                ),
162 |         .rst_ni  ( rst_ni               ),
163 |         .clear_i ( clear_i              ),
164 |         .sel_i   ( ctrl_i.ld_st_mux_sel ),
165 |         .in      ( virt_tcdm            ),
166 |         .out     ( tcdm_prefilter       )
167 |       );
168 |       assign tcdm_fifo_flags.empty = 1'b1; // no FIFO: report it as always empty (only the .empty field is driven in this branch)
169 |     end
170 |   endgenerate
171 |   // hci_core_r_valid_filter instance (always enabled) between the internal interface and the external tcdm port
172 |   hci_core_r_valid_filter i_tcdm_filter (
173 |     .clk_i       ( clk_i                ),
174 |     .rst_ni      ( rst_ni               ),
175 |     .clear_i     ( clear_i              ),
176 |     .enable_i    ( 1'b1                 ),
177 |     .tcdm_slave  ( tcdm_prefilter       ),
178 |     .tcdm_master ( tcdm                 )
179 |   );
180 |   // pick the source control record matching the stream being loaded; it stays all-zero if no selector value matches
181 |   always_comb
182 |   begin : ld_which_ctrl_mux
183 |     all_source_ctrl = '0;
184 |     if(ctrl_i.ld_which_mux_sel == LD_FEAT_SEL)
185 |       all_source_ctrl = ctrl_i.feat_source_ctrl;
186 |     else if(ctrl_i.ld_which_mux_sel == LD_WEIGHT_SEL)
187 |       all_source_ctrl = ctrl_i.weight_source_ctrl;
188 |     else if(ctrl_i.ld_which_mux_sel == LD_NORM_SEL)
189 |       all_source_ctrl = ctrl_i.norm_source_ctrl;
190 |     else if(ctrl_i.ld_which_mux_sel == LD_STREAMIN_SEL)
191 |       all_source_ctrl = ctrl_i.streamin_source_ctrl;
192 |   end
193 |   // the single source's flags are reported identically as feat, norm and weight flags (no streamin-specific flag is assigned here)
194 |   assign flags_o.feat_source_flags = all_source_flags;
195 |   assign flags_o.norm_source_flags = all_source_flags;
196 |   assign flags_o.weight_source_flags = all_source_flags;
197 |   assign flags_o.tcdm_fifo_empty = tcdm_fifo_flags.empty;
198 |   // re-encode the load selector as a 2-bit demux select; any value other than feat/weight/norm maps to 2'b11
199 |   logic [1:0] ld_which_mux_sel;
200 |   assign ld_which_mux_sel = (ctrl_i.ld_which_mux_sel == LD_FEAT_SEL)   ? 2'b00 :
201 |                             (ctrl_i.ld_which_mux_sel == LD_WEIGHT_SEL) ? 2'b01 :
202 |                             (ctrl_i.ld_which_mux_sel == LD_NORM_SEL)   ? 2'b10 :
203 |                                                                          2'b11; // LD_STREAMIN_SEL
204 |   // steer the shared source stream to one of the four virt_source streams
205 |   hwpe_stream_demux_static #(
206 |     .NB_OUT_STREAMS ( 4 )
207 |   ) i_all_source_demux (
208 |     .clk_i   ( clk_i            ),
209 |     .rst_ni  ( rst_ni           ),
210 |     .clear_i ( clear_i          ),
211 |     .sel_i   ( ld_which_mux_sel ),
212 |     .push_i  ( all_source       ),
213 |     .pop_o   ( virt_source      )
214 |   );
215 |   // connect the demultiplexed streams to the module's output stream ports
216 |   hwpe_stream_assign i_assign_feat     ( .push_i (virt_source[0]), .pop_o ( feat_o     ) );
217 |   hwpe_stream_assign i_assign_weight   ( .push_i (virt_source[1]), .pop_o ( weight_o   ) );
218 |   hwpe_stream_assign i_assign_norm     ( .push_i (virt_source[2]), .pop_o ( norm_o     ) );
219 |   hwpe_stream_assign i_assign_streamin ( .push_i (virt_source[3]), .pop_o ( streamin_o ) );
220 | 
221 | endmodule // ne16_streamer
222 | 


--------------------------------------------------------------------------------
/rtl/ne16_top.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_top.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | import hwpe_ctrl_package::*;
 24 | import hci_package::*;
 25 | 
 26 | module ne16_top #( // NE16 top level: instantiates and connects ne16_engine, ne16_streamer and ne16_ctrl
 27 |   parameter int unsigned TP_IN     = NE16_TP_IN, // number of input elements processed per cycle
 28 |   parameter int unsigned TP_OUT    = NE16_TP_OUT, // number of output elements processed per cycle
 29 |   parameter int unsigned CNT       = VLEN_CNT_SIZE,         // counter size
 30 |   parameter int unsigned ID        = ID_WIDTH,
 31 |   parameter int unsigned BW        = NE16_MEM_BANDWIDTH_EXT,             // NE16_MEM_BANDWIDTH
 32 |   parameter int unsigned DW        = NE16_STREAM_BANDWIDTH,
 33 |   // NOTE: TP_IN, TP_OUT, CNT, BW and N_CONTEXT are not referenced in this module body; only DW, ID and N_CORES are
 34 |   parameter int unsigned N_CORES   = NR_CORES,
 35 |   parameter int unsigned N_CONTEXT = NR_CONTEXT
 36 | ) (
 37 |   // global signals
 38 |   input  logic                                  clk_i,
 39 |   input  logic                                  rst_ni,
 40 |   input  logic                                  test_mode_i,
 41 |   // events
 42 |   output logic [N_CORES-1:0][REGFILE_N_EVT-1:0] evt_o,
 43 |   output logic                                  busy_o,
 44 |   // tcdm master ports
 45 |   hci_core_intf.master                          tcdm,
 46 |   // periph slave port
 47 |   hwpe_ctrl_intf_periph.slave                   periph
 48 | );
 49 | 
 50 |   // signals
 51 |   logic enable;
 52 |   logic clear;
 53 |   // control/flag records exchanged between ne16_ctrl and the streamer / engine
 54 |   ctrl_streamer_t  streamer_ctrl;
 55 |   flags_streamer_t streamer_flags;
 56 |   ctrl_engine_t    engine_ctrl;
 57 |   flags_engine_t   engine_flags;
 58 |   // streams between streamer and engine: feat is DW bits wide, the other four are NE16_MEM_BANDWIDTH bits wide
 59 |   hwpe_stream_intf_stream #(
 60 |     .DATA_WIDTH(DW)
 61 | `ifndef SYNTHESIS
 62 |     ,
 63 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 64 |     .BYPASS_VDR_ASSERT( 1'b1  )
 65 | `endif
 66 |   ) feat   (.clk(clk_i));
 67 | 
 68 |   hwpe_stream_intf_stream #(
 69 |     .DATA_WIDTH(NE16_MEM_BANDWIDTH)
 70 | `ifndef SYNTHESIS
 71 |     ,
 72 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 73 |     .BYPASS_VDR_ASSERT( 1'b1  )
 74 | `endif
 75 |   ) weight (.clk(clk_i));
 76 | 
 77 |   hwpe_stream_intf_stream #(
 78 |     .DATA_WIDTH(NE16_MEM_BANDWIDTH)
 79 | `ifndef SYNTHESIS
 80 |     ,
 81 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 82 |     .BYPASS_VDR_ASSERT( 1'b1  )
 83 | `endif
 84 |   ) norm   (.clk(clk_i));
 85 | 
 86 |   hwpe_stream_intf_stream #(
 87 |     .DATA_WIDTH(NE16_MEM_BANDWIDTH)
 88 | `ifndef SYNTHESIS
 89 |     ,
 90 |     .BYPASS_VCR_ASSERT( 1'b1  ),
 91 |     .BYPASS_VDR_ASSERT( 1'b1  )
 92 | `endif
 93 |   ) streamin   (.clk(clk_i));
 94 | 
 95 |   hwpe_stream_intf_stream #(
 96 |     .DATA_WIDTH(NE16_MEM_BANDWIDTH)
 97 | `ifndef SYNTHESIS
 98 |     ,
 99 |     .BYPASS_VCR_ASSERT( 1'b1  ),
100 |     .BYPASS_VDR_ASSERT( 1'b1  )
101 | `endif
102 |   ) conv   (.clk(clk_i));
103 |   // engine: connected to the four load streams (feat, weight, norm, streamin) and to the conv store stream
104 |   ne16_engine i_engine (
105 |     .clk_i         ( clk_i        ),
106 |     .rst_ni        ( rst_ni       ),
107 |     .test_mode_i   ( test_mode_i  ),
108 |     .enable_i      ( enable       ),
109 |     .clear_i       ( clear        ),
110 |     .load_in       ( feat         ),
111 |     .load_weight   ( weight       ),
112 |     .load_norm     ( norm         ),
113 |     .load_streamin ( streamin     ),
114 |     .store_out     ( conv         ),
115 |     .ctrl_i        ( engine_ctrl  ),
116 |     .flags_o       ( engine_flags )
117 |   );
118 |   // streamer: sources the load streams and sinks the conv stream through the tcdm port
119 |   ne16_streamer #(
120 |     .BW ( NE16_MEM_BANDWIDTH_EXT )
121 |   ) i_streamer (
122 |     .clk_i       ( clk_i          ),
123 |     .rst_ni      ( rst_ni         ),
124 |     .test_mode_i ( test_mode_i    ),
125 |     .enable_i    ( enable         ),
126 |     .clear_i     ( clear          ),
127 |     .feat_o      ( feat           ),
128 |     .weight_o    ( weight         ),
129 |     .norm_o      ( norm           ),
130 |     .streamin_o  ( streamin       ),
131 |     .conv_i      ( conv           ),
132 |     .tcdm        ( tcdm           ),
133 |     .ctrl_i      ( streamer_ctrl  ),
134 |     .flags_o     ( streamer_flags )
135 |   );
136 |   // controller: owns the periph slave port; drives busy_o, evt_o, clear and both control records
137 |   ne16_ctrl #(
138 |     .ID      ( ID      ),
139 |     .N_CORES ( N_CORES )
140 |   ) i_ctrl (
141 |     .clk_i            ( clk_i          ),
142 |     .rst_ni           ( rst_ni         ),
143 |     .test_mode_i      ( test_mode_i    ),
144 |     .busy_o           ( busy_o         ),
145 |     .evt_o            ( evt_o          ),
146 |     .clear_o          ( clear          ),
147 |     .ctrl_streamer_o  ( streamer_ctrl  ),
148 |     .flags_streamer_i ( streamer_flags ),
149 |     .ctrl_engine_o    ( engine_ctrl    ),
150 |     .flags_engine_i   ( engine_flags   ),
151 |     .periph           ( periph         )
152 |   );
153 |   // engine and streamer are enabled whenever the controller reports busy
154 |   assign enable = busy_o;
155 | 
156 | endmodule // ne16_top
157 | 


--------------------------------------------------------------------------------
/rtl/ne16_top_wrap.sv:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ne16_top_wrap.sv
  3 |  *
  4 |  * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
  5 |  *
  6 |  * Copyright and related rights are licensed under the Solderpad Hardware
  7 |  * License, Version 0.51 (the "License"); you may not use this file except in
  8 |  * compliance with the License.  You may obtain a copy of the License at
  9 |  * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 10 |  * or agreed to in writing, software, hardware and materials distributed under
 11 |  * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 |  * specific language governing permissions and limitations under the License.
 14 |  */
 15 | 
 16 | /*
 17 |  * Authors (RBE):  Gianna Paulin <pauling@iis.ee.ethz.ch>
 18 |  *                 Francesco Conti <f.conti@unibo.it>
 19 |  * Authors (NE16): Francesco Conti <francesco.conti@greenwaves-technologies.com>
 20 |  */
 21 | 
 22 | import ne16_package::*;
 23 | import hwpe_ctrl_package::*;
 24 | import hci_package::*;
 25 | 
 26 | module ne16_top_wrap #( // wrapper exposing ne16_top's tcdm and periph interfaces as flat signal ports
 27 |   parameter int unsigned TP_IN     = NE16_TP_IN, // number of input elements processed per cycle
 28 |   parameter int unsigned TP_OUT    = NE16_TP_OUT, // number of output elements processed per cycle
 29 |   parameter int unsigned CNT       = VLEN_CNT_SIZE,                // counter size
 30 |   parameter int unsigned BW        = NE16_MEM_BANDWIDTH_EXT,          // NE16_MEM_BANDWIDTH
 31 |   parameter int unsigned MP        = BW/32,                        // number of memory ports (each a 32bit data)
 32 |   parameter int unsigned ID        = ID_WIDTH,
 33 |   parameter int unsigned N_CORES   = NR_CORES,
 34 |   parameter int unsigned N_CONTEXT = NR_CONTEXT
 35 | ) (
 36 |   // global signals
 37 |   input  logic                                  clk_i,
 38 |   input  logic                                  rst_ni,
 39 |   input  logic                                  test_mode_i,
 40 |   // events
 41 |   output logic [N_CORES-1:0][REGFILE_N_EVT-1:0] evt_o,
 42 |   output logic                                  busy_o,
 43 |   // tcdm master ports
 44 |   output logic [     MP-1:0]                    tcdm_req,
 45 |   input  logic [     MP-1:0]                    tcdm_gnt,
 46 |   output logic [     MP-1:0][             31:0] tcdm_add,
 47 |   output logic [     MP-1:0]                    tcdm_wen,
 48 |   output logic [     MP-1:0][              3:0] tcdm_be,
 49 |   output logic [     MP-1:0][             31:0] tcdm_data,
 50 |   input  logic [     MP-1:0][             31:0] tcdm_r_data,
 51 |   input  logic [     MP-1:0]                    tcdm_r_valid,
 52 |   // periph slave port
 53 |   input  logic                                  periph_req,
 54 |   output logic                                  periph_gnt,
 55 |   input  logic [       31:0]                    periph_add,
 56 |   input  logic                                  periph_wen,
 57 |   input  logic [        3:0]                    periph_be,
 58 |   input  logic [       31:0]                    periph_data,
 59 |   input  logic [     ID-1:0]                    periph_id,
 60 |   output logic [       31:0]                    periph_r_data,
 61 |   output logic                                  periph_r_valid,
 62 |   output logic [     ID-1:0]                    periph_r_id
 63 | );
 64 |   // BW-bit TCDM interface and peripheral interface handed to ne16_top
 65 |   hci_core_intf #(
 66 |     .DW ( BW )
 67 |   ) tcdm (
 68 |     .clk ( clk_i )
 69 |   );
 70 | 
 71 |   hwpe_ctrl_intf_periph #(.ID_WIDTH(ID)) periph (.clk(clk_i));
 72 | 
 73 |   // bindings: the BW-bit tcdm interface is split into MP 32-bit ports
 74 |   generate
 75 |     for(genvar ii=0; ii<MP; ii++) begin: tcdm_binding
 76 |       assign tcdm_req  [ii] = tcdm.req; // the same request is presented on every port
 77 |       assign tcdm_add  [ii] = tcdm.add + ii*4; // port ii addresses the ii-th consecutive 32-bit word
 78 |       assign tcdm_wen  [ii] = tcdm.wen;
 79 |       assign tcdm_be   [ii] = tcdm.be[(ii+1)*4-1:ii*4]; // 4 byte enables per port
 80 |       assign tcdm_data [ii] = tcdm.data[(ii+1)*32-1:ii*32]; // 32 data bits per port
 81 |     end
 82 |     assign tcdm.gnt     = &(tcdm_gnt); // granted only when all ports grant
 83 |     assign tcdm.r_valid = &(tcdm_r_valid); // response valid only when all ports respond
 84 |     assign tcdm.r_data  = { >> {tcdm_r_data} } ; // left-to-right streaming: packs the MP words in their existing order
 85 |   endgenerate
 86 |   // flat periph ports <-> periph interface
 87 |   always_comb
 88 |     begin
 89 |       periph.req     = periph_req;
 90 |       periph.add     = periph_add;
 91 |       periph.wen     = periph_wen;
 92 |       periph.be      = periph_be;
 93 |       periph.data    = periph_data;
 94 |       periph.id      = periph_id;
 95 |       periph_gnt     = periph.gnt;
 96 |       periph_r_data  = periph.r_data;
 97 |       periph_r_valid = periph.r_valid;
 98 |       periph_r_id    = periph.r_id;
 99 |     end
100 |   // wrapped top level; ne16_top's DW parameter is not overridden here and keeps its default
101 |   ne16_top #(
102 |     .TP_IN    (TP_IN    ),
103 |     .TP_OUT   (TP_OUT   ),
104 |     .CNT      (CNT      ),
105 |     .BW       (BW       ),
106 |     .ID       (ID       ),
107 |     .N_CORES  (N_CORES  ),
108 |     .N_CONTEXT(N_CONTEXT)
109 |   ) i_ne16_top (
110 |     .clk_i       ( clk_i        ),
111 |     .rst_ni      ( rst_ni       ),
112 |     .test_mode_i ( test_mode_i  ),
113 |     .evt_o       ( evt_o        ),
114 |     .busy_o      ( busy_o       ),
115 |     .tcdm        ( tcdm.master  ),
116 |     .periph      ( periph.slave )
117 |   );
118 | 
119 | endmodule // ne16_top_wrap
120 | 


--------------------------------------------------------------------------------
/src_files.yml:
--------------------------------------------------------------------------------
 1 | ne16:
 2 |   vlog_opts: [
 3 |     +nowarnSVCHK,
 4 |     -suppress 2275,
 5 |     -L hwpe_stream_lib,
 6 |     -L hwpe_ctrl_lib,
 7 |     -L hci_lib,
 8 |   ]
 9 |   incdirs: [
10 |     .,
11 |     ../hwpe-stream/rtl,
12 |     ../hwpe-ctrl/rtl,
13 |   ]
14 |   files: [
15 |     rtl/ne16_package.sv,
16 |     rtl/accumulator/ne16_accumulator_scm_test_wrap.sv,
17 |     rtl/input_buffer/ne16_input_buffer_scm_test_wrap.sv,
18 |     rtl/accumulator/ne16_accumulator_scm.sv,
19 |     rtl/accumulator/ne16_accumulator_normquant.sv,
20 |     rtl/accumulator/ne16_normquant.sv,
21 |     rtl/accumulator/ne16_normquant_shifter.sv,
22 |     rtl/accumulator/ne16_normquant_bias.sv,
23 |     rtl/accumulator/ne16_normquant_multiplier.sv,
24 |     rtl/input_buffer/ne16_input_buffer_scm.sv,
25 |     rtl/input_buffer/ne16_input_buffer.sv,
26 |     rtl/array/ne16_scale.sv,
27 |     rtl/array/ne16_binconv_block.sv,
28 |     rtl/array/ne16_binconv_column.sv,
29 |     rtl/array/ne16_binconv_array.sv,
30 |     rtl/ctrl/ne16_ctrl_fsm.sv,
31 |     rtl/ctrl/ne16_ctrl.sv,
32 |     rtl/ne16_engine.sv,
33 |     rtl/ne16_streamer.sv,
34 |     rtl/ne16_top.sv,
35 |     rtl/ne16_top_wrap.sv,
36 |   ]
37 | 


--------------------------------------------------------------------------------
/ucode/code.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # code.yml
 3 | #
 4 | # Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | # Author: Francesco Conti <f.conti@unibo.it>
19 | 
20 | # mnemonics to simplify microcode writing; values 0-3 equal NE16_ULOOP_BASE_ADDR_W/X/Y/S and values 4-21 equal NE16_ULOOP_RO_* + 4 in the NE16 package
21 | mnemonics:
22 |     base_addr_W:            0
23 |     base_addr_x:            1
24 |     base_addr_y:            2
25 |     base_addr_s:            3
26 |     weights_kom_iter:       4
27 |     weights_kim_iter:       5
28 |     weights_kom_reset_iter: 6
29 |     weights_kim_reset_iter: 7
30 |     infeat_kim_iter:        8
31 |     infeat_wom_iter:        9
32 |     infeat_hom_iter:        10
33 |     infeat_kim_reset_iter:  11
34 |     infeat_wom_reset_iter:  12
35 |     infeat_hom_reset_iter:  13
36 |     outfeat_wom_iter:       14
37 |     outfeat_hom_iter:       15
38 |     outfeat_kom_iter:       16
39 |     outfeat_wom_reset_iter: 17
40 |     outfeat_hom_reset_iter: 18
41 |     outfeat_kom_reset_iter: 19
42 |     scale_kom_iter:         20
43 |     zero:                   21
44 | 
45 | # NE16 code: one list of "add" instructions per loop level; each names a base address (a) and an iteration or reset increment (b) -- presumably a += b, confirm in ucode/uloop_common.py
46 | code:
47 |   k_in_major:
48 |     - { op : add,  a: base_addr_W, b: weights_kim_iter  }
49 |     - { op : add,  a: base_addr_x, b: infeat_kim_iter   }
50 |   j_major:
51 |     - { op : add,  a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter
52 |     - { op : add,  a: base_addr_x, b: infeat_kim_reset_iter  } # infeat_kim_reset_iter  = - subtile_nb_ki * infeat_kim_iter
53 |     - { op : add,  a: base_addr_x, b: infeat_wom_iter   }
54 |     - { op : add,  a: base_addr_y, b: outfeat_wom_iter  }
55 |   i_major:
56 |     - { op : add,  a: base_addr_x, b: infeat_wom_reset_iter  } # infeat_wom_reset_iter  = - subtile_nb_wo * infeat_wom_iter
57 |     - { op : add,  a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter
58 |     - { op : add,  a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter
59 |     - { op : add,  a: base_addr_x, b: infeat_kim_reset_iter  } # infeat_kim_reset_iter  = - subtile_nb_ki * infeat_kim_iter
60 |     - { op : add,  a: base_addr_x, b: infeat_hom_iter   }
61 |     - { op : add,  a: base_addr_y, b: outfeat_hom_iter  }
62 |   k_out_major:
63 |     - { op : add,  a: base_addr_x, b: infeat_hom_reset_iter  } # infeat_hom_reset_iter  = - subtile_nb_ho * infeat_hom_iter
64 |     - { op : add,  a: base_addr_y, b: outfeat_hom_reset_iter } # outfeat_hom_reset_iter = - subtile_nb_ho * outfeat_hom_iter
65 |     - { op : add,  a: base_addr_x, b: infeat_wom_reset_iter  } # infeat_wom_reset_iter  = - subtile_nb_wo * infeat_wom_iter
66 |     - { op : add,  a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter
67 |     - { op : add,  a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter
68 |     - { op : add,  a: base_addr_x, b: infeat_kim_reset_iter  } # infeat_kim_reset_iter  = - subtile_nb_ki * infeat_kim_iter
69 |     - { op : add,  a: base_addr_W, b: weights_kom_iter  }
70 |     - { op : add,  a: base_addr_y, b: outfeat_kom_iter  }
71 |     - { op : add,  a: base_addr_s, b: scale_kom_iter    }
72 | 


--------------------------------------------------------------------------------
/ucode/code_dw.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # code_dw.yml
 3 | #
 4 | # Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | # Author: Francesco Conti <f.conti@unibo.it>
19 | 
20 | # mnemonics to simplify microcode writing: each name maps to the register index used for an op's a/b operands
21 | mnemonics:
22 |     base_addr_W:            0 # weights base address
23 |     base_addr_x:            1 # input-feature base address
24 |     base_addr_y:            2 # output-feature base address
25 |     base_addr_s:            3 # scale base address
26 |     weights_km_iter:        4
27 |     null5:                  5 # not referenced by the code section below
28 |     weights_km_reset_iter:  6
29 |     null7:                  7 # not referenced by the code section below
30 |     infeat_km_iter:         8
31 |     infeat_wom_iter:        9
32 |     infeat_hom_iter:        10
33 |     infeat_km_reset_iter:   11
34 |     infeat_wom_reset_iter:  12
35 |     infeat_hom_reset_iter:  13
36 |     outfeat_wom_iter:       14
37 |     outfeat_hom_iter:       15
38 |     outfeat_km_iter:        16
39 |     outfeat_wom_reset_iter: 17
40 |     outfeat_hom_reset_iter: 18
41 |     outfeat_km_reset_iter:  19
42 |     scale_km_iter:          20
43 |     zero:                   21 # holds 0 in the register list built by uloop_check_dw.py
44 | 
45 | # NE16 code
46 | code:
47 |   j_major: # loop 0 (innermost); range subtile_nb_wo in uloop_check_dw.py
48 |     - { op : add,  a: base_addr_x, b: infeat_wom_iter   }
49 |     - { op : add,  a: base_addr_y, b: outfeat_wom_iter  }
50 |   i_major: # loop 1; range subtile_nb_ho in uloop_check_dw.py
51 |     - { op : add,  a: base_addr_x, b: infeat_wom_reset_iter  } # infeat_wom_reset_iter  = - (subtile_nb_wo-1) * infeat_wom_iter
52 |     - { op : add,  a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter
53 |     - { op : add,  a: base_addr_x, b: infeat_hom_iter   }
54 |     - { op : add,  a: base_addr_y, b: outfeat_hom_iter  }
55 |   k_out_major: # loop 2; range subtile_nb_k in uloop_check_dw.py
56 |     - { op : add,  a: base_addr_x, b: infeat_hom_reset_iter  } # infeat_hom_reset_iter  = - (subtile_nb_ho-1) * infeat_hom_iter
57 |     - { op : add,  a: base_addr_y, b: outfeat_hom_reset_iter } # outfeat_hom_reset_iter = - (subtile_nb_ho-1) * outfeat_hom_iter
58 |     - { op : add,  a: base_addr_x, b: infeat_wom_reset_iter  } # infeat_wom_reset_iter  = - (subtile_nb_wo-1) * infeat_wom_iter
59 |     - { op : add,  a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter
60 |     - { op : add,  a: base_addr_W, b: weights_km_iter  }
61 |     - { op : add,  a: base_addr_y, b: outfeat_km_iter  }
62 |     - { op : add,  a: base_addr_x, b: infeat_km_iter   }
63 |     - { op : add,  a: base_addr_s, b: scale_km_iter    }
64 |   fake_loop: # loop 3; given range 1 in uloop_check_dw.py, so its ops are never committed there
65 |     - { op : mv,   a: base_addr_x, b: base_addr_x } # moves a register onto itself (no effect)
66 |     - { op : mv,   a: base_addr_x, b: base_addr_x } # moves a register onto itself (no effect)


--------------------------------------------------------------------------------
/ucode/uloop_check.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | #
  3 | # uloop_check.py
  4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
  5 | #
  6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # See LICENSE.sw.txt for details.
 10 | # You may obtain a copy of the License at
 11 | #
 12 | #     http://www.apache.org/licenses/LICENSE-2.0
 13 | #
 14 | # Unless required by applicable law or agreed to in writing, software
 15 | # distributed under the License is distributed on an "AS IS" BASIS,
 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | # See the License for the specific language governing permissions and
 18 | # limitations under the License.
 19 | #
 20 | 
 21 | from __future__ import print_function
 22 | from uloop_common import *
 23 | import math
 24 | 
 25 | # high-level loop
 26 | def iterate_hl_loop(subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter):
 27 |     # Reference model: yields (base_addr_W, base_addr_x, base_addr_y, base_addr_s) for each iteration of the loop nest below.
 28 |     for k_out_major in range(subtile_nb_ko):
 29 |         for i_major in range(subtile_nb_ho):
 30 |             for j_major in range(subtile_nb_wo):
 31 |                 for k_in_major in range(subtile_nb_ki):
 32 |                     # every address is a sum of loop indices multiplied by the matching *_iter stride
 33 |                     # auto base_addr_x = i_major*h_size_out*this->w_in_int*this->k_in + j_major*w_size_out*this->k_in + k_in_major*this->TP_IN;
 34 |                     base_addr_x = i_major*infeat_hom_iter + j_major*infeat_wom_iter + k_in_major*infeat_kim_iter
 35 | 
 36 |                     # auto base_addr_W_3x3 = (k_out_major*this->TP_OUT*this->subtile_nb_ki*this->qw + k_in_major*this->qw) * this->FILTER_SIZE*this->FILTER_SIZE * 2;
 37 |                     # auto base_addr_W_1x1 = (k_out_major*this->TP_OUT*this->subtile_nb_ki + k_in_major) * this->qw * 2;
 38 |                     base_addr_W = k_out_major*weights_kom_iter + k_in_major*weights_kim_iter
 39 | 
 40 |                     # auto base_addr_y = i_major*h_size_out*this->w_out_int*this->k_out + j_major*w_size_out*this->k_out + k_out_major*this->TP_OUT;
 41 |                     base_addr_y = i_major*outfeat_hom_iter + j_major*outfeat_wom_iter + k_out_major*outfeat_kom_iter
 42 |                     # the scale address depends on k_out_major only
 43 |                     base_addr_s = k_out_major*scale_kom_iter
 44 |                     # (h_size_out and w_size_out are accepted but not used in this function)
 45 |                     yield base_addr_W, base_addr_x, base_addr_y, base_addr_s
 46 | 
 47 | VERBOSE = True
 48 | 
 49 | 
 50 | def uloop_check(
 51 |     subtile_nb_ko,
 52 |     subtile_nb_ho,
 53 |     subtile_nb_wo,
 54 |     subtile_nb_ki,
 55 |     h_size_out,
 56 |     w_size_out,
 57 |     k_in,
 58 |     w_in_int,
 59 |     k_out,
 60 |     w_out_int,
 61 |     qw,
 62 |     fs,
 63 |     FILTER_SIZE=3,
 64 |     TP_IN=16,
 65 |     TP_OUT=32,
 66 |     # Runs the microcode of code.yml through uloop_execute/uloop_state_machine and compares the
 67 |     # four base addresses it produces with iterate_hl_loop() each time busy comes back False.
 68 |     # Returns the number of mismatches found (0 means the microcode agrees with the reference).
 69 |     # The strides are derived from the arguments above instead of being passed in:
 70 |     #   infeat_hom_iter, infeat_wom_iter, infeat_kim_iter,
 71 |     #   weights_kom_iter, weights_kim_iter,
 72 |     #   outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter
 73 |     # fs==3 uses weight strides that include FILTER_SIZE*FILTER_SIZE; any other value omits that factor.
 74 |     # verbose turns on the state-machine trace, the register dump and the mismatch details.
 75 |     verbose=VERBOSE
 76 | ):
 77 |     # input-feature strides per iteration of the h, w and k_in loops
 78 |     infeat_hom_iter = h_size_out * w_in_int * k_in
 79 |     infeat_wom_iter = w_size_out * k_in
 80 |     infeat_kim_iter = TP_IN
 81 |     # weight strides per iteration of the k_out and k_in loops
 82 |     if fs==3:
 83 |         weights_kom_iter = TP_OUT*subtile_nb_ki*qw * FILTER_SIZE*FILTER_SIZE * 2
 84 |         weights_kim_iter = qw * FILTER_SIZE*FILTER_SIZE * 2
 85 |     else:
 86 |         weights_kom_iter = TP_OUT*subtile_nb_ki*qw * 2
 87 |         weights_kim_iter = qw * 2
 88 |     # output-feature strides per iteration of the h, w and k_out loops
 89 |     outfeat_hom_iter = h_size_out * w_out_int * k_out
 90 |     outfeat_wom_iter = w_size_out * k_out
 91 |     outfeat_kom_iter = TP_OUT
 92 | 
 93 |     scale_kom_iter = TP_OUT>>2
 94 | 
 95 |     print("> Base iter\n\tsubtile_nb_ko=%d\n\tsubtile_nb_ho=%d\n\tsubtile_nb_wo=%d\n\tsubtile_nb_ki=%d\n\th_size_out=%d\n\tw_size_out=%d\n\tinfeat_hom_iter=%x\n\tinfeat_wom_iter=%x\n\tinfeat_kim_iter=%x\n\tweights_kom_iter=%x\n\tweights_kim_iter=%x\n\toutfeat_hom_iter=%x\n\toutfeat_wom_iter=%x\n\toutfeat_kom_iter=%x\n\tscale_kom_iter=%x" % (subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter))
 96 |     weights_kom_reset_iter = - (subtile_nb_ko-1) * weights_kom_iter # each reset stride undoes the (range-1) increments of its loop
 97 |     weights_kim_reset_iter = - (subtile_nb_ki-1) * weights_kim_iter
 98 |     infeat_kim_reset_iter  = - (subtile_nb_ki-1) * infeat_kim_iter
 99 |     infeat_wom_reset_iter  = - (subtile_nb_wo-1) * infeat_wom_iter
100 |     outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter
101 |     infeat_hom_reset_iter  = - (subtile_nb_ho-1) * infeat_hom_iter
102 |     outfeat_hom_reset_iter = - (subtile_nb_ho-1) * outfeat_hom_iter
103 |     outfeat_kom_reset_iter = - (subtile_nb_ko-1) * outfeat_kom_iter
104 |     print("> Reset iter\n\tweights_kom_reset_iter=%x\n\tweights_kim_reset_iter=%x\n\tinfeat_kim_reset_iter=%x\n\tinfeat_wom_reset_iter=%x\n\toutfeat_wom_reset_iter=%x\n\tinfeat_hom_reset_iter=%x\n\toutfeat_hom_reset_iter=%x\n\toutfeat_kom_reset_iter=%x" % (weights_kom_reset_iter, weights_kim_reset_iter, infeat_kim_reset_iter, infeat_wom_reset_iter, outfeat_wom_reset_iter, infeat_hom_reset_iter, outfeat_hom_reset_iter, outfeat_kom_reset_iter))
105 |     # register file, listed in the index order of the mnemonics table in code.yml
106 |     registers = [
107 |         0, # base_addr_W
108 |         0, # base_addr_x
109 |         0, # base_addr_y
110 |         0, # base_addr_s
111 |         weights_kom_iter,
112 |         weights_kim_iter,
113 |         weights_kom_reset_iter,
114 |         weights_kim_reset_iter,
115 |         infeat_kim_iter,
116 |         infeat_wom_iter,
117 |         infeat_hom_iter,
118 |         infeat_kim_reset_iter,
119 |         infeat_wom_reset_iter,
120 |         infeat_hom_reset_iter,
121 |         outfeat_wom_iter,
122 |         outfeat_hom_iter,
123 |         outfeat_kom_iter,
124 |         outfeat_wom_reset_iter,
125 |         outfeat_hom_reset_iter,
126 |         outfeat_kom_reset_iter,
127 |         scale_kom_iter,
128 |         0
129 |     ]
130 |     # loop ranges are given in the order the loops appear in code.yml (k_in, j, i, k_out)
131 |     loops_ops,code,mnem = uloop_load("code.yml")
132 |     loops = uloop_get_loops(loops_ops, (subtile_nb_ki, subtile_nb_wo, subtile_nb_ho, subtile_nb_ko))
133 | 
134 |     err = 0
135 |     idx  = []
136 |     nb_loops = 4
137 |     for j in range(nb_loops):
138 |         idx.append(0)
139 |     state = (0,0,0,idx) # (code address, current loop, op index, per-loop iteration counters)
140 |     busy = False
141 |     execute = True
142 |     # hl_loop yields the expected addresses; its first value is consumed before the loop and never compared
143 |     hidx = 0, 0, 0, 0 # NOTE(review): hidx is never read in this function
144 |     hl_loop = iterate_hl_loop(subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter)
145 |     hW, hX, hY, hS = next(hl_loop)
146 |     for i in range(0,1000000): # upper bound on simulated steps; the loop normally exits through `end`
147 |         new_registers = uloop_execute(state, code, registers)
148 |         execute,end,busy,state = uloop_state_machine(loops, state, verbose=verbose, nb_loops=nb_loops)
149 |         if execute: # commit the op's result only when the state machine marks it as executed
150 |             registers = new_registers
151 |         if not busy: # advance the reference model and compare the addresses
152 |             try:
153 |                 hW, hX, hY, hS = next(hl_loop)
154 |             except StopIteration: # reference exhausted: keep comparing against its last value
155 |                 pass
156 |             if verbose:
157 |                 uloop_print_idx(state, registers, register_names=('weights', 'infeat', 'outfeat', 'scale'))
158 |             uW, uX, uY, uS = registers[0:4]
159 |             if (hW != uW or hX != uX or hY != uY or hS != uS):
160 |                 if verbose:
161 |                     print("  ERROR!!!")
162 |                     print("  High-level: weights:%x infeat:%x outfeat:%x scale:%x" % (hW, hX, hY, hS))
163 |                     print("  uLoop:      weights:%x infeat:%x outfeat:%x scale:%x" % (uW, uX, uY, uS))
164 |                 err += 1
165 |         if end:
166 |             break
167 | 
168 |     print(err, " errors", "!!!" if err > 0 else "")
169 |     return err
170 | 
171 | uloop_check( # sample configuration; this call runs whenever the module is executed or imported
172 |     2, # subtile_nb_ko,
173 |     1, # subtile_nb_ho,
174 |     1, # subtile_nb_wo,
175 |     1, # subtile_nb_ki,
176 |     3, # h_size_out,
177 |     3, # w_size_out,
178 |     16, # k_in,
179 |     5, # w_in_int,
180 |     64, # k_out,
181 |     3, # w_out_int,
182 |     8, # qw,
183 |     3, # fs,
184 |     verbose = True
185 | )
186 | 


--------------------------------------------------------------------------------
/ucode/uloop_check_dw.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | #
  3 | # uloop_check_dw.py
  4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
  5 | #
  6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # See LICENSE.sw.txt for details.
 10 | # You may obtain a copy of the License at
 11 | #
 12 | #     http://www.apache.org/licenses/LICENSE-2.0
 13 | #
 14 | # Unless required by applicable law or agreed to in writing, software
 15 | # distributed under the License is distributed on an "AS IS" BASIS,
 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | # See the License for the specific language governing permissions and
 18 | # limitations under the License.
 19 | #
 20 | 
 21 | from __future__ import print_function
 22 | from uloop_common import *
 23 | import math
 24 | 
 25 | # high-level loop
 26 | def iterate_hl_loop(subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter):
 27 |     # Reference model: yields (base_addr_W, base_addr_x, base_addr_y, base_addr_s) for each iteration of the loop nest below.
 28 |     for k_major in range(subtile_nb_k):
 29 |         for i_major in range(subtile_nb_ho):
 30 |             for j_major in range(subtile_nb_wo):
 31 |                 # the code uses the single k_major index where the formulas quoted below say k_in_major / k_out_major
 32 |                 # auto base_addr_x = i_major*h_size_out*this->w_in_int*this->k_in + j_major*w_size_out*this->k_in + k_in_major*this->TP_IN;
 33 |                 base_addr_x = i_major*infeat_hom_iter + j_major*infeat_wom_iter + k_major*infeat_km_iter
 34 | 
 35 |                 # auto base_addr_W_3x3 = (k_out_major*this->TP_OUT*this->subtile_nb_ki*this->qw + k_in_major*this->qw) * this->FILTER_SIZE*this->FILTER_SIZE * 2;
 36 |                 # auto base_addr_W_1x1 = (k_out_major*this->TP_OUT*this->subtile_nb_ki + k_in_major) * this->qw * 2;
 37 |                 base_addr_W = k_major*weights_km_iter
 38 | 
 39 |                 # auto base_addr_y = i_major*h_size_out*this->w_out_int*this->k_out + j_major*w_size_out*this->k_out + k_out_major*this->TP_OUT;
 40 |                 base_addr_y = i_major*outfeat_hom_iter + j_major*outfeat_wom_iter + k_major*outfeat_km_iter
 41 |                 # the scale address depends on k_major only
 42 |                 base_addr_s = k_major*scale_km_iter
 43 |                 # (h_size_out and w_size_out are accepted but not used in this function)
 44 |                 yield base_addr_W, base_addr_x, base_addr_y, base_addr_s
 45 | 
 46 | VERBOSE = True
 47 | 
 48 | 
 49 | def uloop_check( # run code_dw.yml through the uloop model and count mismatches against iterate_hl_loop()
 50 |     subtile_nb_k,
 51 |     subtile_nb_ho,
 52 |     subtile_nb_wo,
 53 |     h_size_out,
 54 |     w_size_out,
 55 |     k,
 56 |     w_in_int,
 57 |     w_out_int,
 58 |     qw,
 59 |     fs=3, # accepted but not used in this function
 60 |     FILTER_SIZE=3,
 61 |     TP_IN=16,
 62 |     TP_OUT=16, # in depthwise mode, effective TP_OUT=16
 63 |     verbose=VERBOSE # turns on the state-machine trace, the register dump and the mismatch details
 64 | ):
 65 |     # input-feature strides per iteration of the h, w and k loops
 66 |     infeat_hom_iter = h_size_out * w_in_int * k
 67 |     infeat_wom_iter = w_size_out * k
 68 |     infeat_km_iter = TP_IN
 69 |     # weight stride per iteration of the k loop
 70 |     weights_km_iter = qw * FILTER_SIZE*FILTER_SIZE * 2
 71 |     # output-feature strides per iteration of the h, w and k loops
 72 |     outfeat_hom_iter = h_size_out * w_out_int * k
 73 |     outfeat_wom_iter = w_size_out * k
 74 |     outfeat_km_iter = TP_OUT
 75 | 
 76 |     scale_km_iter = TP_OUT>>2
 77 | 
 78 |     print("> Base iter\n\tsubtile_nb_k=%d\n\tsubtile_nb_ho=%d\n\tsubtile_nb_wo=%d\n\th_size_out=%d\n\tw_size_out=%d\n\tinfeat_hom_iter=%x\n\tinfeat_wom_iter=%x\n\tinfeat_km_iter=%x\n\tweights_km_iter=%x\n\toutfeat_hom_iter=%x\n\toutfeat_wom_iter=%x\n\toutfeat_km_iter=%x\n\tscale_km_iter=%x" % (subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter))
 79 |     weights_km_reset_iter  = - (subtile_nb_k-1)  * weights_km_iter # each reset stride undoes the (range-1) increments of its loop
 80 |     infeat_km_reset_iter   = - (subtile_nb_k-1)  * infeat_km_iter
 81 |     infeat_wom_reset_iter  = - (subtile_nb_wo-1) * infeat_wom_iter
 82 |     outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter
 83 |     infeat_hom_reset_iter  = - (subtile_nb_ho-1) * infeat_hom_iter
 84 |     outfeat_hom_reset_iter = - (subtile_nb_ho-1) * outfeat_hom_iter
 85 |     outfeat_km_reset_iter  = - (subtile_nb_k-1)  * outfeat_km_iter
 86 |     print("> Reset iter\n\tweights_km_reset_iter=%x\n\tinfeat_km_reset_iter=%x\n\tinfeat_wom_reset_iter=%x\n\toutfeat_wom_reset_iter=%x\n\tinfeat_hom_reset_iter=%x\n\toutfeat_hom_reset_iter=%x\n\toutfeat_km_reset_iter=%x" % (weights_km_reset_iter, infeat_km_reset_iter, infeat_wom_reset_iter, outfeat_wom_reset_iter, infeat_hom_reset_iter, outfeat_hom_reset_iter, outfeat_km_reset_iter))
 87 |     # register file, listed in the index order of the mnemonics table in code_dw.yml
 88 |     registers = [
 89 |         0, # base_addr_W
 90 |         0, # base_addr_x
 91 |         0, # base_addr_y
 92 |         0, # base_addr_s
 93 |         weights_km_iter, # weights_kom_iter,
 94 |         weights_km_iter, # weights_kim_iter,
 95 |         weights_km_reset_iter, # weights_kom_reset_iter,
 96 |         weights_km_reset_iter, # weights_kim_reset_iter,
 97 |         infeat_km_iter, #infeat_kim_iter,
 98 |         infeat_wom_iter,
 99 |         infeat_hom_iter,
100 |         infeat_km_reset_iter, # infeat_kim_reset_iter,
101 |         infeat_wom_reset_iter,
102 |         infeat_hom_reset_iter,
103 |         outfeat_wom_iter,
104 |         outfeat_hom_iter,
105 |         outfeat_km_iter, # outfeat_kom_iter,
106 |         outfeat_wom_reset_iter,
107 |         outfeat_hom_reset_iter,
108 |         outfeat_km_reset_iter, # outfeat_kom_reset_iter,
109 |         scale_km_iter, # scale_kom_iter,
110 |         0
111 |     ]
112 |     # loop ranges are given in the order the loops appear in code_dw.yml; the fourth (fake_loop) gets range 1
113 |     loops_ops,code,mnem = uloop_load("code_dw.yml")
114 |     loops = uloop_get_loops(loops_ops, (subtile_nb_wo, subtile_nb_ho, subtile_nb_k, 1))
115 | 
116 |     err = 0
117 |     idx  = []
118 |     nb_loops=4
119 |     for j in range(nb_loops):
120 |         idx.append(0)
121 |     state = (0,0,0,idx) # (code address, current loop, op index, per-loop iteration counters)
122 |     busy = False
123 |     execute = True
124 |     # hl_loop yields the expected addresses; its first value is consumed before the loop and never compared
125 |     hidx = 0, 0, 0, 0 # NOTE(review): hidx is never read in this function
126 |     hl_loop = iterate_hl_loop(subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter)
127 |     hW, hX, hY, hS = next(hl_loop)
128 |     for i in range(0,1000000): # upper bound on simulated steps; the loop normally exits through `end`
129 |         new_registers = uloop_execute(state, code, registers)
130 |         execute,end,busy,state = uloop_state_machine(loops, state, verbose=verbose, nb_loops=nb_loops)
131 |         if execute: # commit the op's result only when the state machine marks it as executed
132 |             registers = new_registers
133 |         if not busy: # advance the reference model and compare the addresses
134 |             try:
135 |                 hW, hX, hY, hS = next(hl_loop)
136 |             except StopIteration: # reference exhausted: keep comparing against its last value
137 |                 pass
138 |             if verbose:
139 |                 uloop_print_idx(state, registers, register_names=('weights', 'infeat', 'outfeat', 'scale'))
140 |             uW, uX, uY, uS = registers[0:4]
141 |             if (hW != uW or hX != uX or hY != uY or hS != uS):
142 |                 if verbose:
143 |                     print("  ERROR!!!")
144 |                     print("  High-level: weights:%x infeat:%x outfeat:%x scale:%x" % (hW, hX, hY, hS))
145 |                     print("  uLoop:      weights:%x infeat:%x outfeat:%x scale:%x" % (uW, uX, uY, uS))
146 |                 err += 1
147 |         if end:
148 |             break
149 | 
150 |     print(err, " errors", "!!!" if err > 0 else "")
151 |     return err
152 | 
153 | uloop_check( # sample configuration; this call runs whenever the module is executed or imported
154 |     2, # subtile_nb_k,
155 |     1, # subtile_nb_ho,
156 |     1, # subtile_nb_wo,
157 |     3, # h_size_out,
158 |     3, # w_size_out,
159 |     32, # k,
160 |     5, # w_in_int,
161 |     3, # w_out_int,
162 |     8, # qw,
163 |     verbose = True
164 | )
165 | 


--------------------------------------------------------------------------------
/ucode/uloop_common.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # uloop_common.py
  4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
  5 | #
  6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # See LICENSE.sw.txt for details.
 10 | # You may obtain a copy of the License at
 11 | #
 12 | #     http://www.apache.org/licenses/LICENSE-2.0
 13 | #
 14 | # Unless required by applicable law or agreed to in writing, software
 15 | # distributed under the License is distributed on an "AS IS" BASIS,
 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | # See the License for the specific language governing permissions and
 18 | # limitations under the License.
 19 | #
 20 | 
 21 | from __future__ import print_function
 22 | from bitstring import *
 23 | import yaml
 24 | 
 25 | try:
 26 |     from collections import OrderedDict
 27 | except ImportError:
 28 |     from ordereddict import OrderedDict
 29 | 
 30 | DEFAULT_NB_LOOPS  = 4
 31 | ULOOP_LEN = 352 # was 176
 32 | 
 33 | def yaml_ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict): # load YAML with mappings built by object_pairs_hook
 34 |     class OrderedLoader(Loader): # local subclass, so the constructor below is registered on it rather than on Loader
 35 |         pass
 36 |     def construct_mapping(loader, node):
 37 |         loader.flatten_mapping(node)
 38 |         return object_pairs_hook(loader.construct_pairs(node)) # mapping is built from the node's (key, value) pairs
 39 |     OrderedLoader.add_constructor(
 40 |         yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
 41 |         construct_mapping)
 42 |     return yaml.load(stream, OrderedLoader) # NOTE(review): the default Loader=yaml.Loader is unsafe for untrusted input; uloop_load passes yaml.SafeLoader
 43 | 
 44 | def uloop_state_machine(loops, curr_state, verbose=False, nb_loops=DEFAULT_NB_LOOPS): # one step of the loop sequencer; returns (execute, end, busy, next_state)
 45 |     curr_addr, curr_loop, curr_op, curr_idx = curr_state # (code address, current loop, op index in the loop, per-loop iteration counters)
 46 |     next_addr = curr_addr
 47 |     next_loop = curr_loop
 48 |     next_op   = curr_op
 49 |     next_idx  = curr_idx # NOTE(review): alias, not a copy - the counter list inside curr_state is updated in place below
 50 |     end = False
 51 |     busy = False
 52 |     execute = False
 53 |     # current loop is not on its last iteration and has more ops left: step to its next op
 54 |     if curr_idx[curr_loop] < loops[curr_loop]['range'] - 1 and curr_op < loops[curr_loop]['nb_ops'] - 1:
 55 |         if verbose:
 56 |             print ("@%d %s UPDATE CURRENT LOOP %d                   " % (curr_addr, str(curr_state[3][::-1]), curr_loop))
 57 |         next_addr = curr_addr + 1
 58 |         next_op   = curr_op + 1
 59 |         busy = True
 60 |         execute = True
 61 |     # last op of a loop other than loop 0: count one iteration, zero the counters of the loops below it and restart from loop 0
 62 |     elif curr_idx[curr_loop] < loops[curr_loop]['range'] - 1 and curr_loop > 0:
 63 |         if verbose:
 64 |             print ("@%d %s ITERATE CURRENT LOOP %d & GOTO LOOP 0" % (curr_addr, str(curr_state[3][::-1]), curr_loop))
 65 |         next_loop = 0
 66 |         for j in range(0,curr_loop):
 67 |             next_idx[j] = 0
 68 |         next_idx[curr_loop] = curr_idx[curr_loop] + 1
 69 |         next_addr = loops[0]['uloop_addr']
 70 |         next_op   = 0
 71 |         busy = False
 72 |         execute = True
 73 |     # last op of loop 0 (the only loop that can reach this branch): count one iteration and go back to its start address
 74 |     elif curr_idx[curr_loop] < loops[curr_loop]['range'] - 1:
 75 |         if verbose:
 76 |             print ("@%d %s ITERATE CURRENT LOOP %d                  " % (curr_addr, str(curr_state[3][::-1]), curr_loop))
 77 |         next_addr = loops[curr_loop]['uloop_addr']
 78 |         next_op   = 0
 79 |         next_idx[curr_loop] = curr_idx[curr_loop] + 1
 80 |         busy = False
 81 |         execute = True
 82 |     # current loop is on its last iteration: hand over to the next loop without executing an op
 83 |     elif curr_loop < nb_loops-1:
 84 |         if verbose:
 85 |             print ("@%d %s GOTO NEXT LOOP %d                        " % (curr_addr, str(curr_state[3][::-1]), curr_loop+1))
 86 |         next_loop = curr_loop + 1
 87 |         next_addr = loops[curr_loop+1]['uloop_addr']
 88 |         next_op   = 0
 89 |         busy = True
 90 |         execute = False
 91 |     else: # the last loop has finished its last iteration: report end and return a fresh all-zero state
 92 |         if verbose:
 93 |             print ("@%d %s TERMINATION                              " % (curr_addr, str(curr_state[3][::-1])))
 94 |         end = True
 95 |         next_loop = 0
 96 |         next_addr = 0
 97 |         next_op   = 0
 98 |         next_idx  = []
 99 |         for j in range(nb_loops):
100 |             next_idx.append(0)
101 |         busy = False
102 |         execute = False
103 |     next_state = next_addr, next_loop, next_op, next_idx
104 |     return execute,end,busy,next_state
105 | 
106 | def uloop_execute(state, code, registers):
107 |     addr, loop, op, idx = state
108 |     new_registers = registers[:]
109 |     try:
110 |         if code[addr]['op_sel']:
111 |             new_registers[code[addr]['a']] = registers[code[addr]['a']] + registers[code[addr]['b']]
112 |         else:
113 |             new_registers[code[addr]['a']] = registers[code[addr]['b']]
114 |     except TypeError:
115 |         import pdb; pdb.set_trace()
116 |     return new_registers
117 | 
118 | def uloop_print_idx(state, registers, compact=False, register_names=None): # print registers 0..3; state is accepted but not used
119 |     if not compact and register_names is None: # hex values labelled r0..r3
120 |         print ("r0:%x r1:%x r2:%x r3:%x" % (registers[0], registers[1], registers[2], registers[3]))
121 |     elif not compact: # hex values labelled with the first four entries of register_names
122 |         print ("%s:%x %s:%x %s:%x %s:%x" % (register_names[0], registers[0], register_names[1], registers[1], register_names[2], registers[2], register_names[3], registers[3]))
123 |     else: # compact: decimal, comma-separated; register_names is ignored
124 |         print ("%d,%d,%d,%d" % (registers[0], registers[1], registers[2], registers[3]))
125 | 
126 | def uloop_bytecode(code, loops_ops):
127 |     bytecode = {}
128 |     bytecode['code'] = BitArray()
129 |     for c in code[::-1]:
130 |         if c['op_sel'] == 1:
131 |             b = BitArray(uint=1, length=1)
132 |         else:
133 |             b = BitArray(uint=0, length=1)
134 |         a_b = BitArray(uint=c['a'], length=5)
135 |         b_b = BitArray(uint=c['b'], length=5)
136 |         b.append(a_b)
137 |         b.append(b_b)
138 |         bytecode['code'].append(b)
139 |     if bytecode['code'].length < ULOOP_LEN:
140 |         bytecode['code'].prepend(BitArray(uint=0, length=ULOOP_LEN-bytecode['code'].length))
141 |     else:
142 |         print("Error!!! ULOOP_LEN=%d is too small for bytecode of %d bits" % (ULOOP_LEN, bytecode['code'].length))
143 |         return None
144 |     bytecode['loops'] = BitArray()
145 |     a = 0
146 |     loops_addr = []
147 |     for o in loops_ops:
148 |         loops_addr.append(a)
149 |         a += o
150 |     for o,a in zip(loops_ops[::-1], loops_addr[::-1]):
151 |         a_b = BitArray(uint=a, length=5)
152 |         o_b = BitArray(uint=o, length=4)
153 |         bytecode['loops'].append(a_b)
154 |         bytecode['loops'].append(o_b)
155 |     return bytecode
156 | 
157 | def uloop_load(name): # read a microcode YAML file; returns (ops per loop, flat list of encoded ops, mnemonics table)
158 |     with open(name) as f:
159 |         code_p = yaml_ordered_load(f, yaml.SafeLoader)
160 |     mnem_p = code_p['mnemonics']
161 |     code_p = code_p['code']
162 |     # code_p is an ordered mapping: loop name -> list of ops
163 |     code_l = []
164 |     loops_ops = []
165 |     for l in code_p: # loops are concatenated in file order; loops_ops records how many ops each one has
166 |         code_l.extend(code_p[l])
167 |         loops_ops.append(len(code_p[l]))
168 |     code = []
169 |     for c in code_l:
170 |         cn = {}
171 |         if c['op'] == 'add': # 'add' -> op_sel 1; any other op name (e.g. 'mv') -> op_sel 0
172 |             cn['op_sel'] = 1
173 |         else:
174 |             cn['op_sel'] = 0
175 |         try:
176 |             cn['a'] = mnem_p[c['a']]
177 |         except KeyError: # not a mnemonic: keep the operand exactly as written in the file
178 |             cn['a'] = c['a']
179 |         try:
180 |             cn['b'] = mnem_p[c['b']]
181 |         except KeyError: # not a mnemonic: keep the operand exactly as written in the file
182 |             cn['b'] = c['b']
183 |         code.append(cn)
184 |     return loops_ops,code,mnem_p
185 | 
186 | def uloop_get_loops(loops_ops, loops_range): # build one descriptor dict per loop: op count, range and start address
187 |     loops = []
188 |     a = 0 # running start address: each loop begins where the previous one's ops end
189 |     for o,r in zip(loops_ops, loops_range): # zip stops at the shorter of the two sequences
190 |         l = {}
191 |         l['nb_ops']     = o
192 |         l['range']      = r
193 |         l['uloop_addr'] = a
194 |         a += o
195 |         loops.append(l)
196 |     return loops
197 | 
198 | 


--------------------------------------------------------------------------------
/ucode/uloop_compile.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # uloop_compile.py
 4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
 5 | #
 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # See LICENSE.sw.txt for details.
10 | # You may obtain a copy of the License at
11 | #
12 | #     http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | 
21 | from __future__ import print_function
22 | from uloop_common import *
23 | 
24 | loops_ops,code,mnem = uloop_load("code.yml") # the file name is opened relative to the current working directory
25 | # NOTE: uloop_bytecode returns None when the code does not fit in ULOOP_LEN bits; the prints below assume it fits
26 | bytecode = uloop_bytecode(code, loops_ops)
27 | print (bytecode['code'].length)
28 | print ("uloop bytecode: %d'h%s" % (bytecode['code'].length, str(bytecode['code'].hex)))
29 | print ("uloop loops:    %d'b%s" % (bytecode['loops'].length, str(bytecode['loops'].bin)))
30 | 


--------------------------------------------------------------------------------
/ucode/uloop_compile_dw.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # uloop_compile_dw.py
 4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
 5 | #
 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # See LICENSE.sw.txt for details.
10 | # You may obtain a copy of the License at
11 | #
12 | #     http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | 
21 | from __future__ import print_function
22 | from uloop_common import *
23 | 
24 | loops_ops,code,mnem = uloop_load("code_dw.yml")
25 | 
26 | bytecode = uloop_bytecode(code, loops_ops)
27 | print (bytecode['code'].length)
28 | print ("uloop bytecode: %d'h%s" % (bytecode['code'].length, str(bytecode['code'].hex)))
29 | print ("uloop loops:    %d'b%s" % (bytecode['loops'].length, str(bytecode['loops'].bin)))
30 | 


--------------------------------------------------------------------------------
/ucode/uloop_run.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # uloop_run.py
  4 | # Francesco Conti <fconti@iis.ee.ethz.ch>
  5 | #
  6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # See LICENSE.sw.txt for details.
 10 | # You may obtain a copy of the License at
 11 | #
 12 | #     http://www.apache.org/licenses/LICENSE-2.0
 13 | #
 14 | # Unless required by applicable law or agreed to in writing, software
 15 | # distributed under the License is distributed on an "AS IS" BASIS,
 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | # See the License for the specific language governing permissions and
 18 | # limitations under the License.
 19 | #
 20 | 
 21 | from __future__ import print_function
 22 | from uloop_common import *
 23 | import math
 24 | 
 25 | VERBOSE = True
 26 | FB = 5 # filter buffer size (FB*FB)
 27 | BS = 4 # block size
 28 | TP = 32
 29 | 
 30 | fs = 3
 31 | oh = 1
 32 | ow = 1
 33 | ih = (oh - 1) + fs
 34 | iw = (ow - 1) + fs
 35 | nof = 32
 36 | nif = 32
 37 | qa = 4
 38 | qw = 4
 39 | 
 40 | qa_max     = 4 #min(4,qa)
 41 | 
 42 | n_tiles_qa = 1
 43 | n_tiles_kin = nif/TP
 44 | n_tiles_kout = nof/TP
 45 | 
 46 | 
 47 | n_tiles_K_in = int(math.ceil(nif/TP))
 48 | n_tiles_K_out = int(math.ceil(nof/TP))
 49 | n_tiles_Hout = int(math.ceil(ih/FB))
 50 | n_tiles_Wout = int(math.ceil(iw/FB))
 51 | n_tiles_qa   = int(math.ceil(qa/BS))
 52 | n_xpatches = n_tiles_Hout * n_tiles_Wout # * n_tiles_qa
 53 | 
 54 | print("n_xpatches: ", n_xpatches)
 55 | 
 56 | loops_range = [
 57 |     n_tiles_qa,
 58 |     n_tiles_K_in,
 59 |     n_tiles_K_out,
 60 |     n_xpatches
 61 | ]
 62 | 
 63 | if fs==3:
 64 |     stream_size_fs = TP*fs*qw
 65 | 
 66 | else:
 67 |     stream_size_fs = TP*fs*fs*qw
 68 | 
 69 | registers = [
 70 |     0,
 71 |     0,
 72 |     0,
 73 |     0,
 74 |     0,
 75 |     0,
 76 |     nif,
 77 |     nof,
 78 |     TP*FB*FB*4,
 79 |     TP*9,
 80 |     stream_size_fs, #TP*fs*qw, # or TP*fs*fs*qw
 81 |     TP*fs*fs*qw+2,
 82 |     32*(32+16),
 83 |     0
 84 | ]
 85 | 
 86 | loops_ops,code,mnem = uloop_load("code.yml")
 87 | loops = uloop_get_loops(loops_ops, loops_range)
 88 | 
 89 | idx  = []
 90 | for j in range(NB_LOOPS):
 91 |     idx.append(0)
 92 | state = (0,0,0,idx)
 93 | busy = False
 94 | execute = True
 95 | uloop_print_idx(state, registers, compact=True)
 96 | nb_iter = 0
 97 | for i in range(0,1000000):
 98 |     new_registers = uloop_execute(state, code, registers)
 99 |     execute,end,busy,state = uloop_state_machine(loops, state, verbose=VERBOSE)
100 |     if execute:
101 |         registers = new_registers
102 |     if not busy:
103 |         nb_iter += 1
104 |         uloop_print_idx(state, registers, compact=True)
105 |     if end:
106 |         break
107 | print("nb_iter=%d" % (nb_iter+1))
108 | 


--------------------------------------------------------------------------------