├── .editorconfig ├── Bender.yml ├── LICENSE.hw ├── LICENSE.sw ├── README.md ├── rtl ├── accumulator │ ├── ne16_accumulator_normquant.sv │ ├── ne16_accumulator_scm.sv │ ├── ne16_accumulator_scm_test_wrap.sv │ ├── ne16_normquant.sv │ ├── ne16_normquant_bias.sv │ ├── ne16_normquant_multiplier.sv │ └── ne16_normquant_shifter.sv ├── array │ ├── ne16_binconv_array.sv │ ├── ne16_binconv_block.sv │ ├── ne16_binconv_column.sv │ └── ne16_scale.sv ├── ctrl │ ├── ne16_ctrl.sv │ └── ne16_ctrl_fsm.sv ├── input_buffer │ ├── ne16_input_buffer.sv │ ├── ne16_input_buffer_scm.sv │ └── ne16_input_buffer_scm_test_wrap.sv ├── ne16_engine.sv ├── ne16_package.sv ├── ne16_streamer.sv ├── ne16_top.sv └── ne16_top_wrap.sv ├── src_files.yml └── ucode ├── code.yml ├── code_dw.yml ├── uloop_check.py ├── uloop_check_dw.py ├── uloop_common.py ├── uloop_compile.py ├── uloop_compile_dw.py └── uloop_run.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Unix-style newlines with a newline ending every file 5 | [*] 6 | end_of_line = lf 7 | insert_final_newline = true 8 | trim_trailing_whitespace = true 9 | max_line_length = 100 10 | # 2 space indentation 11 | [*.{sv, svh, v, vhd}] 12 | indent_style = space 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /Bender.yml: -------------------------------------------------------------------------------- 1 | package: 2 | name: ne16 3 | authors: 4 | - "Francesco Conti " 5 | 6 | dependencies: 7 | hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.6 } 8 | hci: { git: "https://github.com/pulp-platform/hci.git", version: 1.0.6 } 9 | hwpe-ctrl: { git: "https://github.com/pulp-platform/hwpe-ctrl.git", version: 1.6 } 10 | 11 | sources: 12 | - rtl/ne16_package.sv 13 | - rtl/accumulator/ne16_accumulator_scm_test_wrap.sv 14 | - 
rtl/input_buffer/ne16_input_buffer_scm_test_wrap.sv 15 | - rtl/accumulator/ne16_accumulator_scm.sv 16 | - rtl/accumulator/ne16_accumulator_normquant.sv 17 | - rtl/accumulator/ne16_normquant.sv 18 | - rtl/accumulator/ne16_normquant_shifter.sv 19 | - rtl/accumulator/ne16_normquant_bias.sv 20 | - rtl/accumulator/ne16_normquant_multiplier.sv 21 | - rtl/input_buffer/ne16_input_buffer_scm.sv 22 | - rtl/input_buffer/ne16_input_buffer.sv 23 | - rtl/array/ne16_scale.sv 24 | - rtl/array/ne16_binconv_block.sv 25 | - rtl/array/ne16_binconv_column.sv 26 | - rtl/array/ne16_binconv_array.sv 27 | - rtl/ctrl/ne16_ctrl_fsm.sv 28 | - rtl/ctrl/ne16_ctrl.sv 29 | - rtl/ne16_engine.sv 30 | - rtl/ne16_streamer.sv 31 | - rtl/ne16_top.sv 32 | - rtl/ne16_top_wrap.sv -------------------------------------------------------------------------------- /LICENSE.hw: -------------------------------------------------------------------------------- 1 | SOLDERPAD HARDWARE LICENSE version 0.51 2 | 3 | This license is based closely on the Apache License Version 2.0, but is not 4 | approved or endorsed by the Apache Foundation. A copy of the non-modified 5 | Apache License 2.0 can be found at http://www.apache.org/licenses/LICENSE-2.0. 6 | 7 | As this license is not currently OSI or FSF approved, the Licensor permits any 8 | Work licensed under this License, at the option of the Licensee, to be treated 9 | as licensed under the Apache License Version 2.0 (which is so approved). 10 | 11 | This License is licensed under the terms of this License and in particular 12 | clause 7 below (Disclaimer of Warranties) applies in relation to its use. 13 | 14 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 15 | 16 | 1. Definitions. 17 | 18 | “License” shall mean the terms and conditions for use, reproduction, and 19 | distribution as defined by Sections 1 through 9 of this document. 
20 | 21 | “Licensor” shall mean the Rights owner or entity authorized by the Rights owner 22 | that is granting the License. 23 | 24 | “Legal Entity” shall mean the union of the acting entity and all other entities 25 | that control, are controlled by, or are under common control with that entity. 26 | For the purposes of this definition, “control” means (i) the power, direct or 27 | indirect, to cause the direction or management of such entity, whether by 28 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 29 | outstanding shares, or (iii) beneficial ownership of such entity. 30 | 31 | “You” (or “Your”) shall mean an individual or Legal Entity exercising 32 | permissions granted by this License. 33 | 34 | “Rights” means copyright and any similar right including design right (whether 35 | registered or unregistered), semiconductor topography (mask) rights and 36 | database rights (but excluding Patents and Trademarks). 37 | 38 | “Source” form shall mean the preferred form for making modifications, including 39 | but not limited to source code, net lists, board layouts, CAD files, 40 | documentation source, and configuration files. 41 | 42 | “Object” form shall mean any form resulting from mechanical transformation or 43 | translation of a Source form, including but not limited to compiled object 44 | code, generated documentation, the instantiation of a hardware design and 45 | conversions to other media types, including intermediate forms such as 46 | bytecodes, FPGA bitstreams, artwork and semiconductor topographies (mask 47 | works). 48 | 49 | “Work” shall mean the work of authorship, whether in Source form or other 50 | Object form, made available under the License, as indicated by a Rights notice 51 | that is included in or attached to the work (an example is provided in the 52 | Appendix below). 
53 | 54 | “Derivative Works” shall mean any work, whether in Source or Object form, that 55 | is based on (or derived from) the Work and for which the editorial revisions, 56 | annotations, elaborations, or other modifications represent, as a whole, an 57 | original work of authorship. For the purposes of this License, Derivative Works 58 | shall not include works that remain separable from, or merely link (or bind by 59 | name) or physically connect to or interoperate with the interfaces of, the Work 60 | and Derivative Works thereof. 61 | 62 | “Contribution” shall mean any design or work of authorship, including the 63 | original version of the Work and any modifications or additions to that Work or 64 | Derivative Works thereof, that is intentionally submitted to Licensor for 65 | inclusion in the Work by the Rights owner or by an individual or Legal Entity 66 | authorized to submit on behalf of the Rights owner. For the purposes of this 67 | definition, “submitted” means any form of electronic, verbal, or written 68 | communication sent to the Licensor or its representatives, including but not 69 | limited to communication on electronic mailing lists, source code control 70 | systems, and issue tracking systems that are managed by, or on behalf of, the 71 | Licensor for the purpose of discussing and improving the Work, but excluding 72 | communication that is conspicuously marked or otherwise designated in writing 73 | by the Rights owner as “Not a Contribution.” 74 | 75 | “Contributor” shall mean Licensor and any individual or Legal Entity on behalf 76 | of whom a Contribution has been received by Licensor and subsequently 77 | incorporated within the Work. 78 | 79 | 2. Grant of License. 
Subject to the terms and conditions of this License, each 80 | Contributor hereby grants to You a perpetual, worldwide, non-exclusive, 81 | no-charge, royalty-free, irrevocable license under the Rights to reproduce, 82 | prepare Derivative Works of, publicly display, publicly perform, sublicense, 83 | and distribute the Work and such Derivative Works in Source or Object form and 84 | do anything in relation to the Work as if the Rights did not exist. 85 | 86 | 3. Grant of Patent License. Subject to the terms and conditions of this 87 | License, each Contributor hereby grants to You a perpetual, worldwide, 88 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this 89 | section) patent license to make, have made, use, offer to sell, sell, import, 90 | and otherwise transfer the Work, where such license applies only to those 91 | patent claims licensable by such Contributor that are necessarily infringed by 92 | their Contribution(s) alone or by combination of their Contribution(s) with the 93 | Work to which such Contribution(s) was submitted. If You institute patent 94 | litigation against any entity (including a cross-claim or counterclaim in a 95 | lawsuit) alleging that the Work or a Contribution incorporated within the Work 96 | constitutes direct or contributory patent infringement, then any patent 97 | licenses granted to You under this License for that Work shall terminate as of 98 | the date such litigation is filed. 99 | 100 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or 101 | Derivative Works thereof in any medium, with or without modifications, and in 102 | Source or Object form, provided that You meet the following conditions: 103 | 104 | You must give any other recipients of the Work or Derivative Works a copy 105 | of this License; and 106 | 107 | You must cause any modified files to carry prominent notices stating that 108 | You changed the files; and 109 | 110 | You must retain, in the Source form of any Derivative Works that You 111 | distribute, all copyright, patent, trademark, and attribution notices from 112 | the Source form of the Work, excluding those notices that do not pertain to 113 | any part of the Derivative Works; and 114 | 115 | If the Work includes a “NOTICE” text file as part of its distribution, then 116 | any Derivative Works that You distribute must include a readable copy of 117 | the attribution notices contained within such NOTICE file, excluding those 118 | notices that do not pertain to any part of the Derivative Works, in at 119 | least one of the following places: within a NOTICE text file distributed as 120 | part of the Derivative Works; within the Source form or documentation, if 121 | provided along with the Derivative Works; or, within a display generated by 122 | the Derivative Works, if and wherever such third-party notices normally 123 | appear. The contents of the NOTICE file are for informational purposes only 124 | and do not modify the License. You may add Your own attribution notices 125 | within Derivative Works that You distribute, alongside or as an addendum to 126 | the NOTICE text from the Work, provided that such additional attribution 127 | notices cannot be construed as modifying the License. 
You may add Your own 128 | copyright statement to Your modifications and may provide additional or 129 | different license terms and conditions for use, reproduction, or 130 | distribution of Your modifications, or for any such Derivative Works as a 131 | whole, provided Your use, reproduction, and distribution of the Work 132 | otherwise complies with the conditions stated in this License. 133 | 134 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 135 | Contribution intentionally submitted for inclusion in the Work by You to the 136 | Licensor shall be under the terms and conditions of this License, without any 137 | additional terms or conditions. Notwithstanding the above, nothing herein shall 138 | supersede or modify the terms of any separate license agreement you may have 139 | executed with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade names, 142 | trademarks, service marks, or product names of the Licensor, except as required 143 | for reasonable and customary use in describing the origin of the Work and 144 | reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 147 | writing, Licensor provides the Work (and each Contributor provides its 148 | Contributions) on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 149 | KIND, either express or implied, including, without limitation, any warranties 150 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any risks 153 | associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, whether in 156 | tort (including negligence), contract, or otherwise, unless required by 157 | applicable law (such as deliberate and grossly negligent acts) or agreed to in 158 | writing, shall any Contributor be liable to You for damages, including any 159 | direct, indirect, special, incidental, or consequential damages of any 160 | character arising as a result of this License or out of the use or inability to 161 | use the Work (including but not limited to damages for loss of goodwill, work 162 | stoppage, computer failure or malfunction, or any and all other commercial 163 | damages or losses), even if such Contributor has been advised of the 164 | possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or 167 | Derivative Works thereof, You may choose to offer, and charge a fee for, 168 | acceptance of support, warranty, indemnity, or other liability obligations 169 | and/or rights consistent with this License. However, in accepting such 170 | obligations, You may act only on Your own behalf and on Your sole 171 | responsibility, not on behalf of any other Contributor, and only if You agree 172 | to indemnify, defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason of your 174 | accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /LICENSE.sw: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Engine 16-channels 2 | The Neural Engine 16-channels (NE16) is a Deep Neural Network accelerator which uses Hardware Processing Engine (HWPE) concepts [1] and is designed to be integrated in a PULPOpen cluster configuration in combination with the Heterogeneous Cluster Interconnect (HCI). It makes use of the open-source IPs 'hci', 'hwpe-ctrl', and 'hwpe-stream'. 3 | 4 | The NE16 has built-in hardware support for the following features: 5 | 6 | - Filters: 1x1, 3x3, depthwise, linear 7 | - Batch normalization 8 | - ReLU 9 | - Activation input bits: 8,16 10 | - Weight bits: 2,3,4,5,6,7,8 11 | - Activation output bits: 8,16,32 12 | - Nr of input channels: arbitrary 13 | - Nr of output channels: arbitrary 14 | 15 | The NE16 is a direct derivative of the Reconfigurable Binary Engine (RBE) design https://github.com/pulp-platform/rbe by Gianna Paulin (ETH Zürich) and Francesco Conti (University of Bologna). 
16 | 17 | ## Contributors 18 | - Francesco Conti, University of Bologna and GreenWaves Technologies (*f.conti@unibo.it*) 19 | 20 | ## Acknowledgement 21 | The development of NE16 has been funded by GreenWaves Technologies, SAS. 22 | 23 | # License 24 | This repository makes use of two licenses: 25 | - for all *software*: Apache License Version 2.0 26 | - for all *hardware*: Solderpad Hardware License Version 0.51 27 | 28 | For further information have a look at the license files: `LICENSE.hw`, `LICENSE.sw` 29 | 30 | # References 31 | [1] F. Conti, P. Schiavone, and L. Benini. "XNOR neural engine: A hardware accelerator IP for 21.6-fJ/op binary neural network inference." IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 37.11 (2018): 2940-2951. 32 | -------------------------------------------------------------------------------- /rtl/accumulator/ne16_accumulator_scm.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_accumulator_scm.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 
14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | module ne16_accumulator_scm 23 | #( 24 | parameter int unsigned ADDR_WIDTH = 5, 25 | parameter int unsigned DATA_WIDTH = 32, 26 | parameter int unsigned NUM_WORDS = 2**ADDR_WIDTH, 27 | parameter int unsigned WIDTH_FACTOR = 4 28 | ) 29 | ( 30 | input logic clk_i, 31 | input logic rst_ni, 32 | input logic clear_i, 33 | input logic test_mode_i, 34 | input logic [WIDTH_FACTOR-1:0] wide_enable_i, 35 | 36 | // Read port 37 | input logic re_i, 38 | input logic [ADDR_WIDTH-1:0] raddr_i, 39 | output logic [DATA_WIDTH-1:0] rdata_o, 40 | output logic [WIDTH_FACTOR*DATA_WIDTH-1:0] rdata_wide_o, 41 | 42 | // Write port 43 | input logic we_i, 44 | input logic we_all_i, 45 | input logic [ADDR_WIDTH-1:0] waddr_i, 46 | input logic [DATA_WIDTH-1:0] wdata_i, 47 | input logic [WIDTH_FACTOR*DATA_WIDTH-1:0] wdata_wide_i, 48 | 49 | output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators_o 50 | ); 51 | 52 | // Read address register, located at the input of the address decoder 53 | logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators; 54 | logic [NUM_WORDS-1:0] waddr_onehot; 55 | logic [NUM_WORDS-1:0] clk_we; 56 | 57 | logic [WIDTH_FACTOR*DATA_WIDTH-1:0] rdata_q; 58 | logic [WIDTH_FACTOR-1:0][DATA_WIDTH-1:0] wdata_q; 59 | 60 | logic clk_gated; 61 | 62 | // ======================================================================== 63 | // CLK GATE 64 | // ======================================================================== 65 | cluster_clock_gating i_cg_we_global 66 | ( 67 | .clk_o ( clk_gated ), 68 | .en_i ( we_i | clear_i ), 69 | .test_en_i ( test_mode_i ), 70 | .clk_i ( clk_i ) 71 | ); 72 | 73 | // ======================================================================== 74 | // WDATA SAMPLING 75 | // ======================================================================== 76 | 77 | logic [WIDTH_FACTOR-1:0][DATA_WIDTH-1:0] wdata_d; 78 | generate 79 | 
80 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | /* BIST wrapper around ne16_accumulator_scm: muxes the functional read/write/clear controls with the BIST test ports before they reach the wrapped instance. */ 22 | module ne16_accumulator_scm_test_wrap 23 | #( 24 | parameter int unsigned ADDR_WIDTH = 5, 25 | parameter int unsigned DATA_WIDTH = 32, 26 | parameter int unsigned NUM_WORDS = 2**ADDR_WIDTH, 27 | parameter int unsigned WIDTH_FACTOR = 4 28 | ) 29 | ( 30 | input logic clk_i, 31 | input logic rst_ni, 32 | input logic clear_i, 33 | input logic test_mode_i, 34 | input logic [WIDTH_FACTOR-1:0] wide_enable_i, 35 | 36 | // Read port 37 | input logic re_i, 38 | input logic [ADDR_WIDTH-1:0] raddr_i, 39 | output logic [DATA_WIDTH-1:0] rdata_o, 40 | output logic [WIDTH_FACTOR*DATA_WIDTH-1:0] rdata_wide_o, 41 | 42 | // Write port 43 | input logic we_i, 44 | input logic we_all_i, 45 | input logic [ADDR_WIDTH-1:0] waddr_i, 46 | input logic [DATA_WIDTH-1:0] wdata_i, 47 | input logic [WIDTH_FACTOR*DATA_WIDTH-1:0] wdata_wide_i, 48 | 49 | output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] accumulators_o, 50 | 51 | // BIST ENABLE 52 | input logic BIST, 53 | //BIST ports 54 | input logic CSN_T, 55 | input logic WEN_T, 56 | input logic [ADDR_WIDTH-1:0] A_T, 57 | input logic [DATA_WIDTH-1:0] D_T, 58 | output logic [DATA_WIDTH-1:0] Q_T 59 | ); 60 | /* Control, address and data signals after the BIST/functional mux. */ 61 | logic clear_muxed; 62 | 63 | logic ReadEnable_muxed; 64 | logic [ADDR_WIDTH-1:0] ReadAddr_muxed; 65 | 66 | logic WriteEnable_muxed; 67 | logic WriteEnable_all_muxed; 68 | logic [ADDR_WIDTH-1:0] WriteAddr_muxed; 69 | logic [DATA_WIDTH-1:0] WriteData_muxed; 70 | /* BIST=1: CSN_T/WEN_T/A_T/D_T drive the wrapped instance (read when CSN_T=0 and WEN_T=1, write when CSN_T=0 and WEN_T=0) while clear and write-all are forced to 0. BIST=0: the functional ports pass through unchanged. */ 71 | always_comb 72 | begin 73 | if(BIST) 74 | begin 75 | ReadEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b1)); 76 | ReadAddr_muxed = A_T; 77 | clear_muxed = 1'b0; 78 | 79 | WriteEnable_all_muxed = 1'b0; 80 | WriteEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b0)); 81 | WriteAddr_muxed = A_T; 82 | WriteData_muxed = D_T; 83 | end 84 | else 85 | begin 86 | ReadEnable_muxed = re_i; 87 | ReadAddr_muxed = raddr_i; 88 | clear_muxed = clear_i; 89 | 90 | WriteEnable_all_muxed = 
we_all_i; 91 | WriteEnable_muxed = we_i; 92 | WriteAddr_muxed = waddr_i; 93 | WriteData_muxed = wdata_i; 94 | end 95 | end 96 | /* The BIST read data output Q_T is driven directly from the narrow read port rdata_o. */ 97 | assign Q_T = rdata_o; 98 | /* wide_enable_i and wdata_wide_i are not muxed by BIST: they connect straight to the wrapped instance. */ 99 | ne16_accumulator_scm 100 | #( 101 | .ADDR_WIDTH ( ADDR_WIDTH ), //= 5, 102 | .DATA_WIDTH ( DATA_WIDTH ), //= 32, 103 | .NUM_WORDS ( NUM_WORDS ), //= 2**ADDR_WIDTH, 104 | .WIDTH_FACTOR ( WIDTH_FACTOR ) //= 4 105 | ) 106 | ne16_accumulator_scm_i 107 | ( 108 | .clk_i ( clk_i ), 109 | .rst_ni ( rst_ni ), 110 | .clear_i ( clear_muxed ), 111 | .test_mode_i ( test_mode_i ), 112 | .wide_enable_i ( wide_enable_i ), 113 | 114 | // Read port 115 | .re_i ( ReadEnable_muxed ), 116 | .raddr_i ( ReadAddr_muxed ), 117 | .rdata_o ( rdata_o ), 118 | .rdata_wide_o ( rdata_wide_o ), 119 | 120 | // Write port 121 | .we_i ( WriteEnable_muxed ), 122 | .we_all_i ( WriteEnable_all_muxed ), 123 | .waddr_i ( WriteAddr_muxed ), 124 | .wdata_i ( WriteData_muxed ), 125 | .wdata_wide_i ( wdata_wide_i ), 126 | 127 | .accumulators_o ( accumulators_o ) 128 | ); 129 | 130 | endmodule // ne16_accumulator_scm_test_wrap 131 | -------------------------------------------------------------------------------- /rtl/accumulator/ne16_normquant.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_normquant.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_normquant #( 25 | parameter int unsigned NMULT = 4, 26 | parameter int unsigned NMS = ne16_package::NORM_MULT_SIZE, 27 | parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE, 28 | parameter int unsigned INT = 48, 29 | parameter int unsigned QNT = 32, 30 | parameter int unsigned PIPE = 1, 31 | parameter int unsigned OUTPUT_REGISTER = 0 32 | ) ( 33 | // global signals 34 | input logic clk_i, 35 | input logic rst_ni, 36 | input logic test_mode_i, 37 | // local clear 38 | input logic clear_i, 39 | // normalization parameters 40 | input logic unsigned [NMULT*NMS-1:0] norm_mult_i, 41 | input logic unsigned [NMULT*8-1:0] shift_i, 42 | // accumulation 43 | input logic signed [NMULT*ACC-1:0] accumulator_i, 44 | output logic signed [NMULT*ACC-1:0] accumulator_o, 45 | // control channel 46 | input ne16_package::ctrl_normquant_t ctrl_i, 47 | output ne16_package::flags_normquant_t [NMULT-1:0] flags_o 48 | ); 49 | 50 | logic signed [NMULT-1 :0][NMS+ACC-1:0] product; 51 | logic signed [NMULT-1 :0][INT-1:0] product_48b; 52 | logic signed [NMULT-1 :0][INT-1:0] product_8b; 53 | logic signed [NMULT/2-1:0][INT-1:0] product_16b; 54 | logic signed [INT-1:0] product_32b; 55 | logic signed [NMULT-1 :0][INT-1:0] product_to_shift; 56 | logic signed [NMULT-1 :0][INT-1:0] rounding; 57 | 58 | generate 59 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_normquant_bias #( 25 | parameter int unsigned NADD = 8, 26 | parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE, 27 | parameter int unsigned QNT = 32, 28 | parameter int unsigned OUTPUT_REGISTER = 0 29 | ) ( 30 | // global signals 31 | input logic 
clk_i, 32 | input logic rst_ni, 33 | input logic test_mode_i, 34 | // local clear 35 | input logic clear_i, 36 | // normalization parameters 37 | input logic unsigned [NADD*ACC-1:0] norm_bias_i, 38 | input logic unsigned [NADD*8-1:0] shift_i, 39 | // accumulation 40 | input logic signed [NADD*ACC-1:0] accumulator_i, 41 | output logic signed [NADD*ACC-1:0] accumulator_o, 42 | // control channel 43 | input ne16_package::ctrl_normquant_t ctrl_i 44 | ); 45 | 46 | generate 47 | 48 | logic [NADD-1:0][ACC-1:0] biased_data; 49 | 50 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 
// ----------------------------------------------------------------------------
// ne16_normquant_multiplier
//
// Signed multiplier: product_o = norm_mult_signed_i * accumulator_i, kept on
// NMS+ACC bits. With PIPE == 1 the product is registered (asynchronous
// active-low reset, synchronous clear_i, load gated by enable_i); with any
// other PIPE value the product is purely combinational and clk_i / rst_ni /
// clear_i / enable_i are not used. test_mode_i is not used in this module.
// ----------------------------------------------------------------------------
module ne16_normquant_multiplier #(
  parameter int unsigned NMS  = ne16_package::NORM_MULT_SIZE,
  parameter int unsigned ACC  = ne16_package::NE16_ACCUM_SIZE,
  parameter int unsigned PIPE = 0
) (
  input  logic                      clk_i,
  input  logic                      rst_ni,
  input  logic                      test_mode_i,
  input  logic                      clear_i,
  input  logic                      enable_i,
  input  logic signed [NMS:0]       norm_mult_signed_i,
  input  logic signed [ACC-1:0]     accumulator_i,
  output logic signed [NMS+ACC-1:0] product_o
);

  // Combinational product. Both operands are signed, so the multiplication
  // is signed; it is evaluated at the NMS+ACC-bit width of the target.
  logic [NMS+ACC-1:0] mult_result;
  assign mult_result = norm_mult_signed_i * accumulator_i;

  generate

    if(PIPE == 1) begin : pipe_gen

      // Optional pipeline register on the product.
      logic [NMS+ACC-1:0] mult_result_q;

      always_ff @(posedge clk_i or negedge rst_ni)
      begin
        if(!rst_ni)
          mult_result_q <= '0;
        else if(clear_i)
          mult_result_q <= '0;
        else if(enable_i)
          mult_result_q <= mult_result;
      end

      assign product_o = mult_result_q;

    end
    else begin : no_pipe_gen

      assign product_o = mult_result;

    end

  endgenerate

endmodule // ne16_normquant_multiplier
 66 | -------------------------------------------------------------------------------- /rtl/accumulator/ne16_normquant_shifter.sv: -------------------------------------------------------------------------------- 1 | /* 2 | 
* ne16_normquant_shifter.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 
// ----------------------------------------------------------------------------
// ne16_normquant_shifter
//
// Arithmetic right shift of a signed INT-bit value followed by saturation to
// 8 / 16 / 32 bits (selected by ctrl_i.quant_mode), with optional ReLU
// (ctrl_i.relu).
//
//  * ctrl_i.use_shifting == 0 : data_i is passed through unshifted and no
//    saturation is applied; the low 8 / 16 / ACC bits are simply forwarded.
//  * ctrl_i.use_shifting == 1 : data_i is shifted right arithmetically by
//    shift_i[5:0] and then saturated.
//
// Only the six LSBs of shift_i are used; the two MSBs are ignored.
// With OUTPUT_REGISTER != 0 the result is registered (loaded while
// ctrl_i.start is high, cleared synchronously by clear_i); otherwise the
// output is combinational. test_mode_i is not used in this module.
//
// NOTE(review): the saturation logic uses literal bit positions (30:0, 31),
// which presumably requires INT >= 33 and ACC >= 32 -- confirm before using
// smaller parameter values.
// ----------------------------------------------------------------------------
module ne16_normquant_shifter #(
  parameter int unsigned ACC = ne16_package::NE16_ACCUM_SIZE,
  parameter int unsigned INT = 33,
  parameter int unsigned OUTPUT_REGISTER = 0
) (
  input  logic                          clk_i,
  input  logic                          rst_ni,
  input  logic                          test_mode_i,
  input  logic                          clear_i,
  input  logic unsigned [INT-1:0]       data_i,
  input  logic unsigned [7:0]           shift_i,
  output logic signed [ACC-1:0]         accumulator_o,
  input  ne16_package::ctrl_normquant_t ctrl_i
);

  logic [INT-1:0] shifted;
  logic [ACC-1:0] accumulator_d;
  logic [ACC-1:0] accumulator_q;
  logic [5:0]     right_shift;

  // Explicit slice: the shift amount is limited to 6 bits (0..63). This is
  // the same truncation the previous implicit 8->6 bit assignment performed.
  assign right_shift = shift_i[5:0];

  // data_i is reinterpreted as signed so that >>> sign-extends.
  assign shifted = ~ctrl_i.use_shifting ? $signed(data_i) :
                                          $signed(data_i) >>> right_shift;

  // Overflow detectors over the magnitude bits (everything below the sign
  // bit shifted[INT-1]):
  //  - sat_big_or_shifted   : any 1 left after masking the bits that fit in
  //                           the output format => positive overflow.
  //  - sat_big_nand_shifted : any 0 (i.e. any 1 in the inverted value) left
  //                           after masking => negative overflow.
  logic [INT-2:0] sat_big_or_shifted;
  logic [INT-2:0] sat_big_nand_shifted;

  always_comb
  begin
    sat_big_or_shifted   = shifted[INT-2:0];
    sat_big_nand_shifted = ~shifted[INT-2:0];
    if(ctrl_i.relu) begin
      // ReLU output is unsigned: all 8 / 16 output bits carry magnitude.
      if(ctrl_i.quant_mode == NE16_MODE_8B) begin
        sat_big_or_shifted [7:0] = '0;
      end
      else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
        sat_big_or_shifted [15:0] = '0;
      end
      else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
        // every bit is masked: no positive saturation is ever flagged in
        // this mode
        sat_big_or_shifted = '0;
      end
    end
    else begin
      // Signed output: one output bit is the sign, so only 7 / 15 / 31
      // magnitude bits fit.
      if(ctrl_i.quant_mode == NE16_MODE_8B) begin
        sat_big_or_shifted  [6:0] = '0;
        sat_big_nand_shifted[6:0] = '0;
      end
      else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
        sat_big_or_shifted  [14:0] = '0;
        sat_big_nand_shifted[14:0] = '0;
      end
      else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
        sat_big_or_shifted  [30:0] = '0;
        sat_big_nand_shifted[30:0] = '0;
      end
    end
  end

  always_comb
  begin

    // Default: forward the low bits of the shifted value for the selected
    // output width, zero elsewhere.
    accumulator_d = '0;
    if(ctrl_i.quant_mode == NE16_MODE_8B) begin
      accumulator_d[7:0] = shifted[7:0];
    end
    else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
      accumulator_d[15:0] = shifted[15:0];
    end
    else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
      accumulator_d = shifted[ACC-1:0];
    end

    // Saturation is applied only when shifting is enabled.
    if(ctrl_i.use_shifting) begin
      if(ctrl_i.relu) begin
        if(shifted[INT-1])
          accumulator_d = '0; // neg or sat- with relu active
        else if(~shifted[INT-1] & (|(sat_big_or_shifted))) begin
          accumulator_d = '1; // sat+
        end
      end
      else begin
        if (shifted[INT-1] & (|(sat_big_nand_shifted))) begin
          // most negative value of the selected format: only its sign bit set
          accumulator_d = '0;
          if(ctrl_i.quant_mode == NE16_MODE_8B) begin
            accumulator_d[7] = 1'b1; // sat-
          end
          else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
            accumulator_d[15] = 1'b1; // sat-
          end
          else if(ctrl_i.quant_mode == NE16_MODE_32B) begin
            accumulator_d[31] = 1'b1; // sat-
          end
        end
        else if(~shifted[INT-1] & (|(sat_big_or_shifted))) begin
          // all ones with the sign bit of the selected format cleared
          accumulator_d = '1; // sat+
          if(ctrl_i.quant_mode == NE16_MODE_32B) begin
            accumulator_d[31] = 1'b0; // sat+
          end
          else if(ctrl_i.quant_mode == NE16_MODE_16B) begin
            accumulator_d[15] = 1'b0; // sat+
          end
          else if(ctrl_i.quant_mode == NE16_MODE_8B) begin
            accumulator_d[7] = 1'b0; // sat+
          end
        end
      end
    end

  end

  // Optional output register.
  if(OUTPUT_REGISTER) begin : output_register_gen

    always_ff @(posedge clk_i or negedge rst_ni)
    begin
      if(~rst_ni) begin
        accumulator_q <= '0;
      end
      else if(clear_i) begin
        accumulator_q <= '0;
      end
      else if(ctrl_i.start) begin
        accumulator_q <= accumulator_d;
      end
    end

  end
  else begin : no_output_register_gen
    assign accumulator_q = accumulator_d;
  end

  assign accumulator_o = accumulator_q;

endmodule // ne16_normquant_shifter
 160 | -------------------------------------------------------------------------------- /rtl/array/ne16_binconv_block.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_binconv_block.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | import ne16_package::*; 22 | 23 | module ne16_binconv_block #( 24 | parameter int unsigned BLOCK_SIZE = NE16_BLOCK_SIZE, // number of SoP's per BinConv block (default 4) 25 | parameter int unsigned TP_IN = NE16_TP_IN, // number of input elements processed per cycle 26 | parameter int unsigned PIPELINE = 1 27 | ) ( 28 | // global signals 29 | input logic clk_i, 30 | input logic rst_ni, 31 | input logic test_mode_i, 32 | // local enable & clear 33 | input logic enable_i, 34 | input logic clear_i, 35 | // input activation stream + handshake 36 | hwpe_stream_intf_stream.sink activation_i [BLOCK_SIZE-1:0], 37 | // input weight stream + handshake 38 | hwpe_stream_intf_stream.sink weight_i, 39 | // output features + handshake 40 | hwpe_stream_intf_stream.source block_pres_o, 41 | // control channel 42 | input ctrl_binconv_block_t ctrl_i, 43 | output flags_binconv_block_t flags_o 44 | ); 45 | 46 | logic clk_gated; 47 | cluster_clock_gating i_hier_block_gate ( 48 | .clk_i ( clk_i ), 49 | .en_i ( enable_i | clear_i ), 50 | .test_en_i ( test_mode_i ), 51 | .clk_o ( clk_gated ) 52 | ); 53 | 54 | /////////////////////////////////////////// 55 | // Local Params, Interfaces, and Signals // 56 | /////////////////////////////////////////// 57 | 58 | // internal weight interface 59 | hwpe_stream_intf_stream #( 60 | .DATA_WIDTH ( 1 ) 61 | `ifndef SYNTHESIS 62 | , 63 | .BYPASS_VCR_ASSERT( 1'b1 ), 64 | .BYPASS_VDR_ASSERT( 1'b1 ) 65 | `endif 66 | ) weight_int [BLOCK_SIZE-1:0] ( 67 | .clk ( clk_i ) 68 | ); 69 | 70 | // BinConv result interface 71 | hwpe_stream_intf_stream #( 72 | .DATA_WIDTH ( NE16_QA_IN ) 73 | `ifndef SYNTHESIS 74 | , 75 | .BYPASS_VCR_ASSERT( 1'b1 ), 76 | .BYPASS_VDR_ASSERT( 1'b1 ) 77 | `endif 78 | ) popcount [BLOCK_SIZE-1:0] ( 79 | 
.clk ( clk_i ) 80 | ); 81 | 82 | hwpe_stream_intf_stream #( 83 | .DATA_WIDTH ( NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT ) 84 | `ifndef SYNTHESIS 85 | , 86 | .BYPASS_VCR_ASSERT( 1'b1 ), 87 | .BYPASS_VDR_ASSERT( 1'b1 ) 88 | `endif 89 | ) pres_nonscaled ( 90 | .clk ( clk_i ) 91 | ); 92 | 93 | hwpe_stream_intf_stream #( 94 | .DATA_WIDTH ( NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT+8 ) 95 | `ifndef SYNTHESIS 96 | , 97 | .BYPASS_VCR_ASSERT( 1'b1 ), 98 | .BYPASS_VDR_ASSERT( 1'b1 ) 99 | `endif 100 | ) pres ( 101 | .clk ( clk_i ) 102 | ); 103 | 104 | logic clear_int; 105 | 106 | logic [NE16_QA_IN+$clog2(BLOCK_SIZE)+NE16_QA_16BIT-1:0] binconv_block_pres_nonscaled_d, binconv_block_pres_nonscaled_q; 107 | logic [NE16_QA_IN+$clog2(BLOCK_SIZE)-2:0] binconv_block_pres_nonscaled_hi_d; 108 | logic [NE16_QA_IN+$clog2(BLOCK_SIZE)-2:0] binconv_block_pres_nonscaled_lo_d; 109 | logic binconv_block_pres_nonscaled_valid_d, binconv_block_pres_nonscaled_valid_q; 110 | 111 | logic [NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE)-1:0] binconv_block_pres_q; 112 | logic binconv_block_pres_valid_q; 113 | 114 | ctrl_scale_t scale_ctrl; 115 | ctrl_scale_t scale_ctrl_q; 116 | 117 | logic [BLOCK_SIZE-1:0] [NE16_QA_IN-1:0] popcount_data; 118 | 119 | assign clear_int = clear_i | ctrl_i.clear; 120 | 121 | /////////////////////////////// 122 | // BinConv and Scale Modules // 123 | /////////////////////////////// 124 | // iterate over all BLOCK_SIZE BinConvs in a singe block 125 | 126 | generate 127 | 128 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_binconv_column #( 25 | parameter int unsigned COLUMN_SIZE = NE16_COLUMN_SIZE, // number of BinConv blocks per column (default 9) 26 | parameter int unsigned BLOCK_SIZE = NE16_BLOCK_SIZE, // number of SoP's per BinConv block (default 4) 27 | parameter int unsigned BC_COLBLOCK_SIZE = COLUMN_SIZE*BLOCK_SIZE, 28 | parameter int unsigned TP_IN = NE16_TP_IN // 
number of input elements processed per cycle 29 | ) ( 30 | // global signals 31 | input logic clk_i, 32 | input logic rst_ni, 33 | input logic test_mode_i, 34 | // local enable & clear 35 | input logic enable_i, 36 | input logic clear_i, 37 | // input activation stream + handshake 38 | hwpe_stream_intf_stream.sink activation_i [BC_COLBLOCK_SIZE-1:0], 39 | // input weight stream + handshake 40 | hwpe_stream_intf_stream.sink weight_i [COLUMN_SIZE-1:0], 41 | // output features + handshake 42 | hwpe_stream_intf_stream.source column_pres_o, 43 | // control channel 44 | input ctrl_binconv_column_t ctrl_i, 45 | output flags_binconv_column_t flags_o 46 | ); 47 | 48 | /////////////////////////////////////////// 49 | // Local Params, Interfaces, and Signals // 50 | /////////////////////////////////////////// 51 | 52 | localparam BLOCK_PRES_SIZE = NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE); 53 | localparam COLUMN_PRES_SIZE = BLOCK_PRES_SIZE+$clog2(COLUMN_SIZE); 54 | 55 | hwpe_stream_intf_stream #( 56 | .DATA_WIDTH ( BLOCK_PRES_SIZE ) 57 | `ifndef SYNTHESIS 58 | , 59 | .BYPASS_VCR_ASSERT( 1'b1 ), 60 | .BYPASS_VDR_ASSERT( 1'b1 ) 61 | `endif 62 | ) block_pres [COLUMN_SIZE-1:0] ( 63 | .clk ( clk_i ) 64 | ); 65 | 66 | logic signed [COLUMN_PRES_SIZE-1:0] binconv_column_pres_d, binconv_column_pres_q; 67 | logic binconv_column_pres_valid_d, binconv_column_pres_valid_q; 68 | logic [COLUMN_PRES_SIZE/8-1:0] binconv_column_pres_strb_d, binconv_column_pres_strb_q; 69 | 70 | logic signed [COLUMN_SIZE-1:0][BLOCK_PRES_SIZE-1:0] block_pres_data; 71 | 72 | /////////////////// 73 | // Block Modules // 74 | /////////////////// 75 | generate 76 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_scale #( 25 | parameter int unsigned INP_ACC = 8, // input bitwidth 26 | parameter int unsigned OUT_ACC = 16, // output bitwidth 27 | parameter int unsigned N_SHIFTS = 8 // number of mutliplexed shifts 28 | 
) ( 29 | // global signals 30 | input logic clk_i, 31 | input logic rst_ni, 32 | input logic test_mode_i, 33 | // local enable & clear 34 | // input logic enable_i, 35 | // input logic clear_i, 36 | // input data 37 | hwpe_stream_intf_stream.sink data_i, 38 | // output data 39 | hwpe_stream_intf_stream.source data_o, 40 | // control channel 41 | input ctrl_scale_t ctrl_i, 42 | output flags_scale_t flags_o 43 | ); 44 | 45 | // ======================================================================== 46 | // SIGNAL DECLARATIONS 47 | // ======================================================================== 48 | 49 | logic [OUT_ACC-1:0] shifted_data [N_SHIFTS-1:0]; 50 | logic [OUT_ACC-1:0] unshifted_data; 51 | logic [OUT_ACC-1:0] shifted_data_out; 52 | logic signed [OUT_ACC-1:0] inverted_data_out; 53 | 54 | logic [INP_ACC-1:0] data; 55 | 56 | assign data = data_i.data; 57 | 58 | assign unshifted_data[INP_ACC-1:0] = data[INP_ACC-1:0]; 59 | 60 | generate 61 | if (OUT_ACC-1 >= INP_ACC) begin 62 | assign unshifted_data[OUT_ACC-1:INP_ACC] = '0;//data_i[INP_ACC-1:0]; 63 | end 64 | endgenerate 65 | 66 | // All other shifts 67 | always_comb 68 | begin 69 | // Assign data with shift index 0 70 | // assign shifted_data[0] = unshifted_data; 71 | 72 | for(int i=0; i 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | import hwpe_ctrl_package::*; 24 | import hci_package::*; 25 | 
// ----------------------------------------------------------------------------
// ne16_ctrl_fsm
//
// Top-level control state machine. It sequences the states
//   IDLE -> LOAD -> WEIGHTOFFS -> [STREAMIN] -> MATRIXVEC
//        -> [UPDATEIDX_WAIT] -> UPDATEIDX
//        -> (LOAD | STREAMOUT | NORMQUANT_SHIFT | NORMQUANT)
//        -> ... -> STREAMOUT -> (STREAMOUT_DONE -> LOAD | DONE -> IDLE)
// based on engine / streamer flags and on an hwpe_ctrl_uloop instance that
// produces the loop indices and base-address offsets.
//
// Note that state_o and state_change_o are driven from the *next-state*
// signals (state_d / state_change_d), i.e. they are combinational outputs;
// index_o / base_addr_o likewise bypass their registers in the cycle in which
// they are sampled.
// ----------------------------------------------------------------------------
module ne16_ctrl_fsm (
  // global signals
  input logic clk_i,
  input logic rst_ni,
  input logic test_mode_i,
  input logic clear_i,
  input logic start_i,
  // ctrl & flags
  input flags_engine_t flags_engine_i,
  input flags_streamer_t flags_streamer_i,
  input config_ne16_t config_i,
  output state_ne16_t state_o,
  output logic state_change_o,
  input logic uloop_ready_i,
  output index_ne16_t index_o,
  output base_addr_ne16_t base_addr_o
);

  /* signal declarations */
  state_ne16_t state_d, state_q;
  // state_change_q is registered below but not read anywhere in this module
  logic state_change_d, state_change_q;

  ctrl_uloop_t ctrl_uloop;
  flags_uloop_t flags_uloop;
  uloop_code_t code_uloop;
  // read-only register file exposed to the uloop (18 x 32 bit)
  logic [17:0][31:0] ro_reg;

  index_ne16_t index_d, index_q;
  // index_update_q is registered below but not read anywhere in this module
  index_update_ne16_t index_update_d, index_update_q;
  base_addr_ne16_t base_addr_d, base_addr_q;
  logic streamin_en;

  /* finite state machine */
  // State register: asynchronous active-low reset, synchronous clear.
  always_ff @(posedge clk_i or negedge rst_ni)
  begin : fsm_sequential
    if(~rst_ni) begin
      state_q <= IDLE;
      state_change_q <= '0;
    end
    else if(clear_i) begin
      state_q <= IDLE;
      state_change_q <= '0;
    end
    else begin
      state_q <= state_d;
      state_change_q <= state_change_d;
    end
  end

  // Next-state logic. By default the state is held and state_change_d is 0;
  // every transition raises state_change_d for the cycle in which it is taken.
  // The case has no default branch: a state_q value not listed here holds.
  always_comb
  begin: fsm_next_state
    state_d = state_q;
    state_change_d = 1'b0;

    case(state_q)

      IDLE: begin
        if(start_i) begin
          state_d = LOAD;
          state_change_d = 1'b1;
        end
      end

      LOAD: begin
        // leave LOAD once the input buffer reports the IB_EXTRACT state
        if(flags_engine_i.flags_input_buffer.state == IB_EXTRACT) begin
          state_d = WEIGHTOFFS;
          state_change_d = 1'b1;
        end
      end

      WEIGHTOFFS: begin
        // all accumulator-based conditions in this FSM look at flags_accumulator[8] only
        if(flags_engine_i.flags_accumulator[8].state == AQ_ACCUM_DONE) begin
          if(streamin_en) begin
            state_d = STREAMIN;
            state_change_d = 1'b1;
          end
          else begin
            state_d = MATRIXVEC;
            state_change_d = 1'b1;
          end
        end
      end

      STREAMIN: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_STREAMIN_DONE) begin
          state_d = MATRIXVEC;
          state_change_d = 1'b1;
        end
      end

      MATRIXVEC: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_ACCUM_DONE) begin
          // wait in UPDATEIDX_WAIT until uloop_ready_i is asserted
          if(~uloop_ready_i) begin
            state_d = UPDATEIDX_WAIT;
            state_change_d = 1'b1;
          end
          else begin
            state_d = UPDATEIDX;
            state_change_d = 1'b1;
          end
        end
      end

      NORMQUANT_SHIFT: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT) begin
          state_d = NORMQUANT;
          state_change_d = 1'b1;
        end
      end

      NORMQUANT: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_BIAS) begin
          state_d = NORMQUANT_BIAS;
          state_change_d = 1'b1;
        end
        // without the bias option, go straight to STREAMOUT when normquant is done
        else if(~config_i.norm_option_bias & flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_DONE) begin
          state_d = STREAMOUT;
          state_change_d = 1'b1;
        end
      end

      NORMQUANT_BIAS: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_NORMQUANT_DONE) begin
          state_d = STREAMOUT;
          state_change_d = 1'b1;
        end
      end

      STREAMOUT: begin
        if(flags_engine_i.flags_accumulator[8].state == AQ_STREAMOUT_DONE) begin
          // the uloop done flag decides between finishing and another iteration
          if(flags_uloop.done) begin
            state_d = DONE;
            state_change_d = 1'b1;
          end
          else begin
            state_d = STREAMOUT_DONE;
            state_change_d = 1'b1;
          end
        end
      end

      STREAMOUT_DONE: begin
        // restart from LOAD only once the streamer reports tcdm_fifo_empty
        if(flags_streamer_i.tcdm_fifo_empty) begin
          state_d = LOAD;
          state_change_d = 1'b1;
        end
      end

      UPDATEIDX_WAIT: begin
        if(uloop_ready_i) begin
          state_d = UPDATEIDX;
          state_change_d = 1'b1;
        end
      end

      UPDATEIDX: begin
        if(flags_uloop.valid) begin
          // not depthwise, only idx_update bit 0 set, and not done: reload
          // without streaming out
          if((config_i.filter_mode != NE16_FILTER_MODE_3X3_DW) && (flags_uloop.idx_update == 4'b0001) && (~flags_uloop.done)) begin
            state_d = LOAD;
            state_change_d = 1'b1;
          end
          else if(~config_i.streamout_quant) begin
            state_d = STREAMOUT;
            state_change_d = 1'b1;
          end
          else if(config_i.norm_option_shift) begin
            state_d = NORMQUANT_SHIFT;
            state_change_d = 1'b1;
          end
          else begin
            state_d = NORMQUANT;
            state_change_d = 1'b1;
          end
        end
      end

      DONE: begin
        // DONE lasts a single cycle
        state_d = IDLE;
        state_change_d = 1'b1;
      end

    endcase
  end

  /* uloop instantiation */
  // Select microcode, loop count and loop ranges depending on whether the
  // depthwise 3x3 filter mode is configured.
  always_comb
  begin
    code_uloop = '0;
    code_uloop.code = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? ULOOP_CODE_DEPTHWISE : ULOOP_CODE_NORMAL;
    code_uloop.loops = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? ULOOP_LOOPS_DEPTHWISE : ULOOP_LOOPS_NORMAL;
    code_uloop.range[0] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_wo : config_i.subtile_nb_ki;
    code_uloop.range[1] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_ho : config_i.subtile_nb_wo;
    code_uloop.range[2] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? config_i.subtile_nb_ko : config_i.subtile_nb_ho;
    code_uloop.range[3] = config_i.filter_mode == NE16_FILTER_MODE_3X3_DW ? 1 : config_i.subtile_nb_ko;
  end

  // The uloop is enabled in UPDATEIDX until it reports valid, and is cleared
  // whenever the FSM is in IDLE. In 1x1 mode its ready input is tied high.
  assign ctrl_uloop.enable = (state_q == UPDATEIDX) & ~flags_uloop.valid;
  assign ctrl_uloop.clear = (state_q == IDLE);
  assign ctrl_uloop.ready = config_i.filter_mode == NE16_FILTER_MODE_1X1 ? 1'b1 : uloop_ready_i;

  hwpe_ctrl_uloop #(
    .LENGTH ( 32 ),
    .NB_LOOPS ( 4 ),
    .NB_RO_REG ( 18 ),
    .NB_REG ( 4 ),
    .REG_WIDTH ( 32 ),
    .CNT_WIDTH ( 16 ),
    .SHADOWED ( 1 )
`ifndef SYNTHESIS
    ,
    .DEBUG_DISPLAY ( 0 )
`endif
  ) i_uloop (
    .clk_i ( clk_i ),
    .rst_ni ( rst_ni ),
    .test_mode_i ( test_mode_i ),
    .clear_i ( clear_i | ctrl_uloop.clear ),
    .ctrl_i ( ctrl_uloop ),
    .flags_o ( flags_uloop ),
    .uloop_code_i ( code_uloop ),
    .registers_read_i ( ro_reg )
  );

  // Read-only uloop registers: iteration strides taken from the configuration,
  // plus a constant-zero register.
  assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KOM_ITER] = config_i.uloop_iter.weights_kom_iter;
  assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KIM_ITER] = config_i.uloop_iter.weights_kim_iter;
  assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KOM_RESET_ITER] = config_i.uloop_iter.weights_kom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_WEIGHTS_KIM_RESET_ITER] = config_i.uloop_iter.weights_kim_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_KIM_ITER] = config_i.uloop_iter.infeat_kim_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_WOM_ITER] = config_i.uloop_iter.infeat_wom_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_HOM_ITER] = config_i.uloop_iter.infeat_hom_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_KIM_RESET_ITER] = config_i.uloop_iter.infeat_kim_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_WOM_RESET_ITER] = config_i.uloop_iter.infeat_wom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_INFEAT_HOM_RESET_ITER] = config_i.uloop_iter.infeat_hom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_WOM_ITER] = config_i.uloop_iter.outfeat_wom_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_HOM_ITER] = config_i.uloop_iter.outfeat_hom_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_KOM_ITER] = config_i.uloop_iter.outfeat_kom_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_WOM_RESET_ITER] = config_i.uloop_iter.outfeat_wom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_HOM_RESET_ITER] = config_i.uloop_iter.outfeat_hom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_OUTFEAT_KOM_RESET_ITER] = config_i.uloop_iter.outfeat_kom_reset_iter;
  assign ro_reg[NE16_ULOOP_RO_SCALE_KOM_ITER] = config_i.uloop_iter.scale_kom_iter;
  assign ro_reg[NE16_ULOOP_RO_ZERO] = '0;

  /* index registers */
  // Indices and base addresses are sampled on a transition *into* LOAD or
  // STREAMOUT_DONE, and additionally into WEIGHTOFFS in depthwise mode.
  logic index_sample_en;
  assign index_sample_en = ((state_d == WEIGHTOFFS & config_i.filter_mode==NE16_FILTER_MODE_3X3_DW) || state_d == LOAD || state_d == STREAMOUT_DONE) & state_change_d;
  always_ff @(posedge clk_i or negedge rst_ni)
  begin
    if(~rst_ni) begin
      index_q <= '0;
      index_update_q <= '0;
      base_addr_q <= '0;
    end
    else if(clear_i) begin
      index_q <= '0;
      index_update_q <= '0;
      base_addr_q <= '0;
    end
    else if(index_sample_en) begin // commit indices when loading
      index_q <= index_d;
      index_update_q <= index_update_d;
      base_addr_q <= base_addr_d;
    end
  end

  /* FSM output binding */
  // outputs expose the next state, not the registered one
  assign state_o = state_d;
  assign state_change_o = state_change_d;

  // Map uloop loop counters onto named indices; the mapping differs between
  // depthwise and non-depthwise mode. In depthwise mode k_in_major and
  // k_out_major are both taken from idx[2].
  assign index_d.k_out_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[2] : flags_uloop.idx[3];
  assign index_d.i_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[1] : flags_uloop.idx[2];
  assign index_d.j_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[0] : flags_uloop.idx[1];
  assign index_d.k_in_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx[2] : flags_uloop.idx[0];

  // Same mapping for the per-loop update flags.
  assign index_update_d.k_out_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[2] : flags_uloop.idx_update[3];
  assign index_update_d.i_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[1] : flags_uloop.idx_update[2];
  assign index_update_d.j_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[0] : flags_uloop.idx_update[1];
  assign index_update_d.k_in_major = config_i.filter_mode==NE16_FILTER_MODE_3X3_DW ? flags_uloop.idx_update[2] : flags_uloop.idx_update[0];

  // Base addresses come from the uloop offset registers.
  assign base_addr_d.weights = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_W];
  assign base_addr_d.infeat = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_X];
  assign base_addr_d.outfeat = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_Y];
  assign base_addr_d.scale = flags_uloop.offs[NE16_ULOOP_BASE_ADDR_S];

  // Bypass the registers in the sampling cycle so that the new values are
  // visible at the outputs immediately.
  assign index_o = index_sample_en ? index_d : index_q;
  assign base_addr_o = index_sample_en ? base_addr_d : base_addr_q;

  // Stream-in is requested when enabled by configuration and either one of
  // the k_out / i / j update flags is set or all registered indices are zero.
  assign streamin_en = config_i.streamin & ((index_update_d.k_out_major | index_update_d.i_major | index_update_d.j_major) | (index_q.k_out_major=='0 & index_q.k_in_major=='0 & index_q.i_major=='0 & index_q.j_major=='0));

endmodule // ne16_ctrl_fsm
 315 | -------------------------------------------------------------------------------- /rtl/input_buffer/ne16_input_buffer.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_input_buffer.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 
14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_input_buffer #( 25 | parameter int unsigned INPUT_BUF_SIZE = 400, 26 | parameter int unsigned BLOCK_SIZE = NE16_BLOCK_SIZE, 27 | parameter int unsigned DW = NE16_QA_IN 28 | ) ( 29 | // global signals 30 | input logic clk_i, 31 | input logic rst_ni, 32 | input logic test_mode_i, 33 | 34 | // local enable and clear 35 | input logic enable_i, 36 | input logic clear_i, 37 | 38 | // control channel 39 | input ctrl_input_buffer_t ctrl_i, 40 | output flags_input_buffer_t flags_o, 41 | 42 | // input / output streams 43 | hwpe_stream_intf_stream.sink feat_i [BLOCK_SIZE-1:0], 44 | hwpe_stream_intf_stream.source feat_o [INPUT_BUF_SIZE-1:0] 45 | ); 46 | 47 | localparam NW = INPUT_BUF_SIZE/BLOCK_SIZE; 48 | localparam AW = $clog2(NW); 49 | localparam DS = DW*BLOCK_SIZE; 50 | 51 | // Standard-cell memory based feature register 52 | logic scm_re; 53 | logic [AW-1:0] scm_raddr; 54 | logic scm_we; 55 | logic scm_we_all; 56 | logic [AW-1:0] scm_waddr; 57 | logic [DS-1:0] scm_wdata; 58 | logic [NW-1:0][DS-1:0] scm_input_buffer; 59 | 60 | // Finite-state machine + counters 61 | state_input_buffer_t fsm_cs, fsm_ns; 62 | logic vlen_cnt_clr, vlen_cnt_gl_en, vlen_cnt_en; 63 | logic [AW-1:0] vlen_cnt; 64 | logic [AW-1:0] vlen_cnt_next; 65 | 66 | ne16_input_buffer_scm_test_wrap #( 67 | .ADDR_WIDTH ( AW ), 68 | .DATA_WIDTH ( DS ), 69 | .NUM_WORDS ( NW ) 70 | ) i_input_buffer_scm ( 71 | .clk_i ( clk_i ), 72 | .rst_ni ( rst_ni ), 73 | .clear_i ( clear_i ), 74 | .test_mode_i ( test_mode_i ), 75 | .re_i ( scm_re ), 76 | .raddr_i ( scm_raddr ), 77 | .rdata_o ( ), 78 | .we_i ( scm_we ), 79 | .we_all_i ( scm_we_all ), 80 | .waddr_i ( scm_waddr ), 81 | .wdata_i ( scm_wdata ), 82 | .input_buffer_o ( scm_input_buffer ), 83 | .BIST ( ), 84 | .CSN_T ( ), 85 | .WEN_T ( ), 86 | .A_T ( ), 87 | .D_T ( ), 88 | .Q_T ( ) 89 | ); 90 
| 91 | // this mask is used to load only 9 pixels instead of 25 in 1x1 mode (see ne16_ctrl for other masks) 92 | logic [24:0] mask_1x1; 93 | logic [4:0] mask_1x1_s; 94 | assign mask_1x1_s = (1 << 3) - 1; 95 | always_comb 96 | begin 97 | mask_1x1 = '1; 98 | mask_1x1 &= {5{mask_1x1_s}}; 99 | mask_1x1 &= {{5{mask_1x1_s[4]}}, {5{mask_1x1_s[3]}}, {5{mask_1x1_s[2]}}, {5{mask_1x1_s[1]}}, {5{mask_1x1_s[0]}}}; 100 | end 101 | 102 | // implicit padding --> comes from incomplete subtiles in the spatial dimensions --> always padded with 0 103 | // explicit padding --> requested through the padding register --> padded with config.padding_value 104 | // priority: implicit padding --> explicit padding --> normal feature 105 | assign scm_we = feat_i[0].valid & (feat_i[0].ready | (ctrl_i.filter_mode == NE16_FILTER_MODE_1X1 ? ~mask_1x1[vlen_cnt] : 1'b0)); 106 | assign scm_we_all = '0; 107 | assign scm_waddr = vlen_cnt; 108 | generate 109 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | module ne16_input_buffer_scm_test_wrap 23 | #( 24 | parameter int unsigned ADDR_WIDTH = 5, 25 | parameter int unsigned DATA_WIDTH = 128, 26 | parameter int unsigned NUM_WORDS = 25 27 | ) 28 | ( 29 | input logic clk_i, 30 | input logic rst_ni, 31 | input logic clear_i, 32 | input logic test_mode_i, 33 | 34 | // Read port 35 | input logic re_i, 36 | input logic [ADDR_WIDTH-1:0] raddr_i, 37 | output logic [DATA_WIDTH-1:0] rdata_o, 38 | 39 | // Write port 40 | input logic we_i, 41 | input logic we_all_i, 42 | input logic [ADDR_WIDTH-1:0] waddr_i, 43 | input logic [DATA_WIDTH-1:0] wdata_i, 44 | 45 | output logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] input_buffer_o, 46 | 47 | // BIST ENABLE 48 | input logic BIST, 49 | 50 | //BIST ports 51 | input logic CSN_T, 52 | input logic WEN_T, 53 | input logic [ADDR_WIDTH-1:0] A_T, 54 | input logic [DATA_WIDTH-1:0] D_T, 55 | output logic [DATA_WIDTH-1:0] Q_T 56 | ); 57 | 58 | 59 | logic ReadEnable_muxed; 60 | logic 
[ADDR_WIDTH-1:0] ReadAddr_muxed; 61 | 62 | logic WriteEnable_all_muxed; 63 | logic WriteEnable_muxed; 64 | logic [ADDR_WIDTH-1:0] WriteAddr_muxed; 65 | logic [DATA_WIDTH-1:0] WriteData_muxed; 66 | 67 | logic clear_muxed; 68 | 69 | always_comb 70 | begin 71 | if(BIST) 72 | begin 73 | ReadEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b1)); 74 | ReadAddr_muxed = A_T; 75 | 76 | WriteEnable_all_muxed = 1'b0; 77 | WriteEnable_muxed = (( CSN_T == 1'b0 ) && ( WEN_T == 1'b0)); 78 | WriteAddr_muxed = A_T; 79 | WriteData_muxed = D_T; 80 | 81 | clear_muxed = 1'b0; 82 | end 83 | else 84 | begin 85 | ReadEnable_muxed = re_i; 86 | ReadAddr_muxed = raddr_i; 87 | 88 | WriteEnable_muxed = we_i; 89 | WriteEnable_all_muxed = we_all_i; 90 | WriteAddr_muxed = waddr_i; 91 | WriteData_muxed = wdata_i; 92 | 93 | clear_muxed = clear_i; 94 | end 95 | end 96 | 97 | assign Q_T = rdata_o; 98 | 99 | 100 | ne16_input_buffer_scm 101 | #( 102 | .ADDR_WIDTH ( ADDR_WIDTH ), 103 | .DATA_WIDTH ( DATA_WIDTH ), 104 | .NUM_WORDS ( NUM_WORDS ) 105 | ) 106 | ne16_input_buffer_scm_i 107 | ( 108 | .clk_i ( clk_i ), 109 | .rst_ni ( rst_ni ), 110 | .clear_i ( clear_muxed ), 111 | .test_mode_i ( test_mode_i ), 112 | 113 | // Read port 114 | .re_i ( ReadEnable_muxed ), 115 | .raddr_i ( ReadAddr_muxed ), 116 | .rdata_o ( rdata_o ), 117 | 118 | // Write port 119 | .we_i ( WriteEnable_muxed ), 120 | .we_all_i ( WriteEnable_all_muxed ), 121 | .waddr_i ( WriteAddr_muxed ), 122 | .wdata_i ( WriteData_muxed ), 123 | 124 | .input_buffer_o ( input_buffer_o ) 125 | ); 126 | 127 | endmodule : ne16_input_buffer_scm_test_wrap 128 | -------------------------------------------------------------------------------- /rtl/ne16_engine.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_engine.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * 
License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | 24 | module ne16_engine #( 25 | parameter int unsigned COLUMN_SIZE = NE16_COLUMN_SIZE, // number of BinConv blocks per column (default 9) 26 | parameter int unsigned NR_COLUMN = NE16_COLUMN_SIZE, // number of BinConv columns (default 9 -- same as the size of BinConv columns!) 27 | parameter int unsigned BLOCK_SIZE = NE16_BLOCK_SIZE, // number of SoP's per BinConv block (default 16) 28 | parameter int unsigned INPUT_BUF_SIZE = 32*BLOCK_SIZE, // TODO FIXME 29 | parameter int unsigned TP_IN = NE16_TP_IN, // number of input elements processed per cycle 30 | parameter int unsigned TP_OUT = NE16_TP_OUT 31 | ) ( 32 | // global signals 33 | input logic clk_i, 34 | input logic rst_ni, 35 | input logic test_mode_i, 36 | // local enable & clear 37 | input logic enable_i, 38 | input logic clear_i, 39 | // input streams + handshake 40 | hwpe_stream_intf_stream.sink load_in, 41 | hwpe_stream_intf_stream.sink load_weight, 42 | hwpe_stream_intf_stream.sink load_norm, 43 | hwpe_stream_intf_stream.sink load_streamin, 44 | hwpe_stream_intf_stream.source store_out, 45 | input ctrl_engine_t ctrl_i, 46 | output flags_engine_t flags_o 47 | ); 48 | 49 | /* Local Params, Interfaces, and Signals */ 50 | localparam BLOCK_PRES_SIZE = NE16_QA_IN+NE16_QA_16BIT+8+$clog2(BLOCK_SIZE); 51 |
localparam COLUMN_PRES_SIZE = BLOCK_PRES_SIZE+$clog2(COLUMN_SIZE); 52 | 53 | logic all_norm_ready; 54 | logic [NE16_NR_COLUMN-1:0] all_norm_ready_tree; 55 | 56 | hwpe_stream_intf_stream #( 57 | .DATA_WIDTH ( NE16_QA_IN ) 58 | `ifndef SYNTHESIS 59 | , 60 | .BYPASS_VCR_ASSERT( 1'b1 ), 61 | .BYPASS_VDR_ASSERT( 1'b1 ) 62 | `endif 63 | ) load_in_blocks [BLOCK_SIZE-1:0] ( 64 | .clk ( clk_i ) 65 | ); 66 | 67 | hwpe_stream_intf_stream #( 68 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 69 | `ifndef SYNTHESIS 70 | , 71 | .BYPASS_VCR_ASSERT( 1'b1 ), 72 | .BYPASS_VDR_ASSERT( 1'b1 ) 73 | `endif 74 | ) load_weight_fifo ( 75 | .clk ( clk_i ) 76 | ); 77 | 78 | hwpe_stream_intf_stream #( 79 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 80 | `ifndef SYNTHESIS 81 | , 82 | .BYPASS_VCR_ASSERT( 1'b1 ), 83 | .BYPASS_VDR_ASSERT( 1'b1 ) 84 | `endif 85 | ) load_weight_fifo_demuxed [1:0] ( 86 | .clk ( clk_i ) 87 | ); 88 | 89 | hwpe_stream_intf_stream #( 90 | .DATA_WIDTH ( TP_IN ) 91 | `ifndef SYNTHESIS 92 | , 93 | .BYPASS_VCR_ASSERT( 1'b1 ), 94 | .BYPASS_VDR_ASSERT( 1'b1 ) 95 | `endif 96 | ) load_weight_rows_mode8 [15:0] ( 97 | .clk ( clk_i ) 98 | ); 99 | 100 | hwpe_stream_intf_stream #( 101 | .DATA_WIDTH ( TP_IN/2 ) 102 | `ifndef SYNTHESIS 103 | , 104 | .BYPASS_VCR_ASSERT( 1'b1 ), 105 | .BYPASS_VDR_ASSERT( 1'b1 ) 106 | `endif 107 | ) load_weight_rows_mode16_8bit [31:0] ( 108 | .clk ( clk_i ) 109 | ); 110 | 111 | hwpe_stream_intf_stream #( 112 | .DATA_WIDTH ( TP_IN ) 113 | `ifndef SYNTHESIS 114 | , 115 | .BYPASS_VCR_ASSERT( 1'b1 ), 116 | .BYPASS_VDR_ASSERT( 1'b1 ) 117 | `endif 118 | ) load_weight_rows_mode16 [31:0] ( 119 | .clk ( clk_i ) 120 | ); 121 | 122 | hwpe_stream_intf_stream #( 123 | .DATA_WIDTH ( TP_IN ) 124 | `ifndef SYNTHESIS 125 | , 126 | .BYPASS_VCR_ASSERT( 1'b1 ), 127 | .BYPASS_VDR_ASSERT( 1'b1 ) 128 | `endif 129 | ) load_weight_rows_conv [COLUMN_SIZE-1:0] ( 130 | .clk ( clk_i ) 131 | ); 132 | 133 | hwpe_stream_intf_stream #( 134 | .DATA_WIDTH ( TP_IN ) 135 | `ifndef SYNTHESIS 136 | , 137 | 
.BYPASS_VCR_ASSERT( 1'b1 ), 138 | .BYPASS_VDR_ASSERT( 1'b1 ) 139 | `endif 140 | ) load_weight_rows_linear [31:0] ( 141 | .clk ( clk_i ) 142 | ); 143 | 144 | hwpe_stream_intf_stream #( 145 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 146 | `ifndef SYNTHESIS 147 | , 148 | .BYPASS_VCR_ASSERT( 1'b1 ), 149 | .BYPASS_VDR_ASSERT( 1'b1 ) 150 | `endif 151 | ) store_out_cols [NR_COLUMN-1:0] ( 152 | .clk ( clk_i ) 153 | ); 154 | 155 | hwpe_stream_intf_stream #( 156 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 157 | `ifndef SYNTHESIS 158 | , 159 | .BYPASS_VCR_ASSERT( 1'b1 ), 160 | .BYPASS_VDR_ASSERT( 1'b1 ) 161 | `endif 162 | ) load_streamin_cols [NR_COLUMN-1:0] ( 163 | .clk ( clk_i ) 164 | ); 165 | 166 | hwpe_stream_intf_stream #( 167 | .DATA_WIDTH ( NE16_QA_IN ) 168 | `ifndef SYNTHESIS 169 | , 170 | .BYPASS_VCR_ASSERT( 1'b1 ), 171 | .BYPASS_VDR_ASSERT( 1'b1 ) 172 | `endif 173 | ) in_from_buf [INPUT_BUF_SIZE-1:0] ( 174 | .clk ( clk_i ) 175 | ); 176 | 177 | hwpe_stream_intf_stream #( 178 | .DATA_WIDTH ( COLUMN_PRES_SIZE ) 179 | `ifndef SYNTHESIS 180 | , 181 | .BYPASS_VCR_ASSERT( 1'b1 ), 182 | .BYPASS_VDR_ASSERT( 1'b1 ) 183 | `endif 184 | ) pres [NR_COLUMN-1:0] ( 185 | .clk ( clk_i ) 186 | ); 187 | 188 | hwpe_stream_intf_stream #( 189 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 190 | `ifndef SYNTHESIS 191 | , 192 | .BYPASS_VCR_ASSERT( 1'b1 ), 193 | .BYPASS_VDR_ASSERT( 1'b1 ) 194 | `endif 195 | ) norm [NR_COLUMN-1:0] ( 196 | .clk ( clk_i ) 197 | ); 198 | 199 | hwpe_stream_intf_stream #( 200 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 201 | `ifndef SYNTHESIS 202 | , 203 | .BYPASS_VCR_ASSERT( 1'b1 ), 204 | .BYPASS_VDR_ASSERT( 1'b1 ) 205 | `endif 206 | ) load_norm_fifo ( 207 | .clk ( clk_i ) 208 | ); 209 | 210 | hwpe_stream_intf_stream #( 211 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 212 | `ifndef SYNTHESIS 213 | , 214 | .BYPASS_VCR_ASSERT( 1'b1 ), 215 | .BYPASS_VDR_ASSERT( 1'b1 ) 216 | `endif 217 | ) load_streamin_fifo ( 218 | .clk ( clk_i ) 219 | ); 220 | 221 | // Infeat data from the input buffer is split in 
16 blocks of 8 bits 222 | // 223 | // load_in[128b] 224 | // || 225 | // \/ 226 | // +-----------------+ 227 | // |hwpe_stream_split| 228 | // +-----------------+ 229 | // || 230 | // \/ 231 | // load_in_blocks[15:0][8b] 232 | 233 | hwpe_stream_split #( 234 | .NB_OUT_STREAMS ( BLOCK_SIZE ), 235 | .DATA_WIDTH_IN ( NE16_QA_IN*BLOCK_SIZE ) 236 | ) i_split_load_in_blocks ( 237 | .clk_i ( clk_i ), 238 | .rst_ni ( rst_ni ), 239 | .clear_i ( clear_i ), 240 | .push_i ( load_in ), 241 | .pop_o ( load_in_blocks ) 242 | ); 243 | 244 | // The following diagram explains the way that the weight stream is split in order to 245 | // support the various CONV modes and the LINEAR mode at 16 and 8 bits. 246 | // 247 | // load_weight[256b] 248 | // || 249 | // \/ 250 | // |____| 251 | // |____| hwpe_stream_fifo 252 | // || 253 | // \/ 254 | // load_weight_fifo[256b] 255 | // || 256 | // \/ 257 | // /------------------------\ 258 | // ctrl_i.mode16 -------> /__________________________\ 259 | // || 0 1 || 260 | // \/ \/ 261 | // load_weight_fifo_demuxed[0][256b] load_weight_fifo_demuxed[1][256b] 262 | // || || 263 | // \/ \/ 264 | // +-----------------+ +-----------------+ 265 | // |hwpe_stream_split| |hwpe_stream_split| 266 | // +-----------------+ +-----------------+ 267 | // || || 268 | // || \/ 269 | // || load_weight_rows_mode16_8bit[31:0][8b] 270 | // || || 271 | // || \/ 272 | // || +-----------------+ 273 | // || | zero-extend | 274 | // || +-----------------+ 275 | // || || 276 | // \/ \/ 277 | // load_weight_rows_mode8[15:0][16b] load_weight_rows_mode16[31:0][16b] 278 | // 279 | // 280 | // Convolutional modes actually use only 144 of the 256bits of memory interface: 281 | // load_weight_rows_mode8[15:0][16b] load_weight_rows_mode16[31:0][16b] 282 | // || [8:0]] || [8:0] 283 | // \/ 0 1 \/ 284 | // \--------------------------/ 285 | // ctrl_i.mode16 ---------->\________________________/ 286 | // || 287 | // \/ 288 | // load_weight_rows_conv[8:0][16b] 289 | // 290 | // 291 |
// Linear mode uses 256 bits of bandwidth in both 8 and 16 bit modes -- with 16 bit mode using 2x the number of MACs 292 | // 293 | // load_weight_rows_mode8[15:0][16b] load_weight_rows_mode16[31:0][16b] 256b zeros load_weight_rows_mode16[31:0][16b] 294 | // || || [15:0] || || [31:16] 295 | // \/ 0 1 \/ \/ 0 1 \/ 296 | // \--------------------------/ \--------------------------/ 297 | // ctrl_i.mode16 ---------->\________________________/ ctrl_i.mode16 ---------->\________________________/ 298 | // || || 299 | // \/ \/ 300 | // load_weight_rows_linear[15:0][16b] load_weight_rows_linear[31:0][16b] 301 | 302 | hwpe_stream_fifo #( 303 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ), 304 | .FIFO_DEPTH ( 2 ) 305 | ) i_fifo_load_weight ( 306 | .clk_i ( clk_i ), 307 | .rst_ni ( rst_ni ), 308 | .clear_i ( clear_i ), 309 | .flags_o ( ), 310 | .push_i ( load_weight ), 311 | .pop_o ( load_weight_fifo ) 312 | ); 313 | 314 | hwpe_stream_demux_static #( 315 | .NB_OUT_STREAMS ( 2 ) 316 | ) i_fifo_load_weight_fifo_demux ( 317 | .clk_i ( clk_i ), 318 | .rst_ni ( rst_ni ), 319 | .clear_i ( clear_i ), 320 | .sel_i ( ctrl_i.mode_16 ), 321 | .push_i ( load_weight_fifo ), 322 | .pop_o ( load_weight_fifo_demuxed ) 323 | ); 324 | 325 | hwpe_stream_split #( 326 | .NB_OUT_STREAMS ( 16 ), 327 | .DATA_WIDTH_IN ( NE16_MEM_BANDWIDTH ) 328 | ) i_split_load_weight_rows_mode8 ( 329 | .clk_i ( clk_i ), 330 | .rst_ni ( rst_ni ), 331 | .clear_i ( clear_i ), 332 | .push_i ( load_weight_fifo_demuxed[0] ), 333 | .pop_o ( load_weight_rows_mode8 ) 334 | ); 335 | 336 | hwpe_stream_split #( 337 | .NB_OUT_STREAMS ( 32 ), 338 | .DATA_WIDTH_IN ( NE16_MEM_BANDWIDTH ) 339 | ) i_split_load_weight_rows_mode16 ( 340 | .clk_i ( clk_i ), 341 | .rst_ni ( rst_ni ), 342 | .clear_i ( clear_i ), 343 | .push_i ( load_weight_fifo_demuxed[1] ), 344 | .pop_o ( load_weight_rows_mode16_8bit ) 345 | ); 346 | 347 | generate 348 | 349 | for(genvar ii=0; ii<32; ii++) begin: load_weight_rows_mode16_adapt_gen 350 | assign 
load_weight_rows_mode16[ii].data = { 8'b0, load_weight_rows_mode16_8bit[ii].data }; 351 | assign load_weight_rows_mode16[ii].valid = load_weight_rows_mode16_8bit[0].valid; 352 | assign load_weight_rows_mode16[ii].strb = load_weight_rows_mode16_8bit[0].strb; 353 | assign load_weight_rows_mode16_8bit[ii].ready = load_weight_rows_mode16[ii].ready; 354 | end 355 | 356 | logic ready_conv, ready_linear; 357 | assign ready_conv = load_weight_rows_conv[0].ready; 358 | assign ready_linear = load_weight_rows_linear[0].ready; 359 | 360 | for(genvar ii=0; ii 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | package ne16_package; 23 | 24 | // ======================================================================== 25 | // PULP contents 26 | // ======================================================================== 27 | 28 | parameter int NR_HWPE_REG = 11; 29 | parameter int NR_HCI_REG = 1; 30 | parameter int NR_UCODE_REG = 12; 31 | 32 | // general PULP environment parameters including clusters etc 33 | // default number of cores 34 | parameter int NR_CORES = 9; 35 | 36 | // number of contexts 37 | parameter int NR_CONTEXT = 1; 38 | 39 | // default id width 40 | parameter int ID_WIDTH = 16; 41 | 42 | // number of registers 43 | parameter int NR_IO_REGS = NR_HWPE_REG + NR_UCODE_REG; // 11 + 12 = 23 44 | parameter int NR_GENERIC_REGS = NR_HCI_REG; // 1 45 | 46 | // Maximum weight exponent offset (limits MAC bitwidths) 47 | // parameter int N2_MAX = 64; 48 | 49 | // ======================================================================== 50 | // CTRL Registers 51 | // ======================================================================== 52 | 53 | // ctrl counter bit-widths 54 | parameter int SPATIAL_CNT_SIZE = 16; 55 | parameter int FILTER_CNT_SIZE = 5; 56 | parameter int FEAT_CNT_SIZE = 12; 57 | parameter int QUANT_CNT_SIZE = 8; 58 | parameter int NB_ACC_CNT_SIZE = 8; 59 | 60 | //
======================================================================== 61 | // BANDWIDTH related types 62 | parameter int NE16_MEM_BANDWIDTH_EXT = 288; // bits (9 ports x 32 bits) 63 | parameter int NE16_MEM_BANDWIDTH = 256; // bits (8 ports x 32 bits) -- this is after realignment 64 | parameter int NE16_STREAM_BANDWIDTH = 160; // bits (5 ports x 32 bits) 65 | 66 | // ======================================================================== 67 | // BINCONV related types 68 | // Throughput parameter for a single BinConv module 69 | parameter int NE16_TP_IN = 16; 70 | parameter int NE16_QA_IN = 8; 71 | parameter int NE16_QA_OUT = 8; 72 | parameter int NE16_TP_OUT = 32; 73 | parameter int NE16_QA_16BIT = 8; // overhead in 16-bit mode 74 | 75 | // number of 1x8-bit multipliers per BinConv block 76 | parameter int NE16_BLOCK_SIZE = 16; 77 | 78 | // architectural parameters of NE16 79 | parameter int NE16_INPUT_BUFFER_SIZE = 32; 80 | 81 | // number of binary BinConv blocks per BinConv column 82 | parameter int NE16_COLUMN_SIZE = 9; 83 | 84 | // number of BinConv columns per BinConv array 85 | parameter int NE16_NR_COLUMN = 9; 86 | 87 | // number of shift cycles 88 | parameter int NE16_SHIFT_CYCLES = 2; 89 | 90 | // ======================================================================== 91 | // ACCUMULATOR module related types 92 | // number of bits used in vlen_cnt 93 | parameter int NE16_ACCUM_SIZE = 32; 94 | parameter int VLEN_CNT_SIZE = 16; 95 | 96 | // (batch-)normalization parameters 97 | parameter int unsigned NORM_MULT_SIZE = 8; 98 | 99 | // ======================================================================== 100 | // FEAT_BUFFER related types 101 | // ======================================================================== 102 | typedef struct packed { 103 | logic goto_load; 104 | logic goto_extract; 105 | logic goto_idle; 106 | logic [VLEN_CNT_SIZE-1:0] load_len; 107 | logic [NE16_INPUT_BUFFER_SIZE-1:0] enable_implicit_padding; 108 | logic
[NE16_INPUT_BUFFER_SIZE-1:0] enable_explicit_padding; 109 | logic [NE16_QA_IN-1:0] explicit_padding_value_hi; 110 | logic [NE16_QA_IN-1:0] explicit_padding_value_lo; 111 | logic [1:0] filter_mode; 112 | } ctrl_input_buffer_t; 113 | 114 | typedef enum { 115 | IB_IDLE, IB_LOAD, IB_EXTRACT 116 | } state_input_buffer_t; 117 | 118 | typedef struct packed { 119 | state_input_buffer_t state; 120 | } flags_input_buffer_t; 121 | 122 | 123 | // ======================================================================== 124 | // SIGN_BUFFER related types 125 | // ======================================================================== 126 | typedef struct packed { 127 | logic goto_load; 128 | logic goto_extract; 129 | logic [VLEN_CNT_SIZE-1:0] i_vlen; // virtual buffer length 130 | logic [VLEN_CNT_SIZE-1:0] o_vlen; 131 | } ctrl_sign_buf_t; 132 | 133 | typedef enum { 134 | SR_IDLE, SR_LOAD, SR_EXTRACT 135 | } state_sign_buf_t; 136 | 137 | typedef struct packed { 138 | state_sign_buf_t state; 139 | } flags_sign_buf_t; 140 | 141 | 142 | // ======================================================================== 143 | // SOP related types 144 | // ======================================================================== 145 | typedef struct packed { 146 | logic operation_sel; // 1:xnor, 0: and 147 | logic [NE16_TP_IN-1:0] inactive_mask; 148 | logic clear; 149 | } ctrl_sop_t; 150 | 151 | 152 | // ======================================================================== 153 | // Accumulator Quantizor related types 154 | // ======================================================================== 155 | 156 | typedef struct packed { 157 | logic start; 158 | logic relu; 159 | logic [4:0] right_shift; 160 | logic [1:0] norm_mode; 161 | logic [1:0] quant_mode; 162 | logic norm_signed; 163 | logic use_rounding; 164 | logic use_shifting; 165 | } ctrl_normquant_t; 166 | 167 | typedef struct packed { 168 | logic ready; 169 | } flags_normquant_t; 170 | 171 | parameter logic[1:0] NE16_MODE_8B = 
2'b00; 172 | parameter logic[1:0] NE16_MODE_16B = 2'b01; 173 | parameter logic[1:0] NE16_MODE_32B = 2'b10; 174 | 175 | parameter logic[1:0] NE16_FILTER_MODE_LINEAR = 2'b11; 176 | parameter logic[1:0] NE16_FILTER_MODE_1X1 = 2'b10; 177 | parameter logic[1:0] NE16_FILTER_MODE_3X3_DW = 2'b01; 178 | parameter logic[1:0] NE16_FILTER_MODE_3X3 = 2'b00; 179 | 180 | typedef struct packed { 181 | logic [ VLEN_CNT_SIZE-1:0] full_accumulation_len; // nr of accumulations 182 | logic [ VLEN_CNT_SIZE-1:0] streamout_len; 183 | logic [ VLEN_CNT_SIZE-1:0] scale_len; 184 | logic [ VLEN_CNT_SIZE-1:0] bias_len; 185 | logic clear; 186 | logic clear_offset; 187 | logic goto_normquant; 188 | logic goto_accum; 189 | logic goto_streamin; 190 | logic goto_streamout; 191 | logic goto_idle; 192 | logic sample_shift; 193 | logic [1:0] quant_mode; // 00: 8 bits, 01: 16 bits (reserved for future usage), 10: 32 bits 194 | logic [1:0] norm_mode; // 00: 8 bits, 01: 16 bits, 10: 32 bits 195 | ctrl_normquant_t ctrl_normquant; 196 | logic norm_option_bias; 197 | logic norm_option_shift; 198 | logic weight_offset; 199 | logic [31:0] weight_offset_scale; 200 | logic [$clog2(QUANT_CNT_SIZE):0] qw; // weights quantization 201 | logic enable_streamout; 202 | logic depthwise; 203 | } ctrl_aq_t; 204 | 205 | typedef enum { 206 | AQ_IDLE, AQ_ACCUM, AQ_NORMQUANT_SHIFT, AQ_NORMQUANT, AQ_NORMQUANT_TOBIAS, AQ_NORMQUANT_BIAS, AQ_STREAMIN, AQ_STREAMOUT, AQ_ACCUM_DONE, AQ_NORMQUANT_DONE, AQ_STREAMIN_DONE, AQ_STREAMOUT_DONE 207 | } state_aq_t; 208 | 209 | typedef struct packed { 210 | state_aq_t state; 211 | logic addr_cnt_en_q; 212 | } flags_aq_t; 213 | 214 | // ======================================================================== 215 | // SCALE related types 216 | // ======================================================================== 217 | 218 | parameter int unsigned MAX_SHIFT = 16; 219 | typedef struct packed { 220 | logic [$clog2(MAX_SHIFT):0] shift_sel; 221 | logic invert; 222 | } ctrl_scale_t; 223 | 224 |
typedef struct packed { 225 | logic [$clog2(MAX_SHIFT):0] shift_sel; 226 | } flags_scale_t; 227 | 228 | // ======================================================================== 229 | // BINCONV_BLOCK related types 230 | // ======================================================================== 231 | 232 | typedef struct packed { 233 | logic [$clog2(QUANT_CNT_SIZE):0] qw; 234 | logic [1:0] filter_mode; // filter size 235 | logic [$clog2(8):0] scale_shift; 236 | logic weight_offset; 237 | logic clear; 238 | logic [NE16_BLOCK_SIZE-1:0] enable_mac; 239 | logic [$clog2(NE16_QA_IN):0] block_cnt; 240 | logic invalidate; 241 | logic mode_16; 242 | logic mode_linear; 243 | } ctrl_binconv_block_t; 244 | 245 | typedef struct packed { 246 | flags_scale_t [NE16_BLOCK_SIZE-1:0] flags_scale; 247 | } flags_binconv_block_t; 248 | 249 | // ======================================================================== 250 | // BINCONV_COLUMN related types 251 | // ======================================================================== 252 | 253 | typedef struct packed { 254 | ctrl_binconv_block_t ctrl_block; 255 | logic [NE16_COLUMN_SIZE-1:0] enable_block; 256 | logic [NE16_NR_COLUMN-1:0][NE16_COLUMN_SIZE-1:0] enable_block_linear; 257 | logic [31:0] padding_value; 258 | } ctrl_binconv_column_t; 259 | 260 | typedef struct packed { 261 | flags_binconv_block_t [NE16_COLUMN_SIZE-1:0] flags_block; 262 | } flags_binconv_column_t; 263 | 264 | // ======================================================================== 265 | // BINCONV_ARRAY related types 266 | // ======================================================================== 267 | 268 | typedef struct packed { 269 | ctrl_binconv_column_t ctrl_column; 270 | logic [1:0] filter_mode; 271 | logic [NE16_NR_COLUMN-1:0] enable_column; 272 | logic weight_offset; 273 | logic [$clog2(NE16_TP_IN):0] depthwise_len; 274 | logic mode_16; 275 | logic mode_linear; 276 | } ctrl_binconv_array_t; 277 | 278 | typedef struct packed { 279 | 
flags_binconv_column_t [NE16_NR_COLUMN-1:0] flags_column; 280 | } flags_binconv_array_t; 281 | 282 | // ======================================================================== 283 | // ENGINE related types 284 | // ======================================================================== 285 | 286 | typedef struct packed { 287 | ctrl_input_buffer_t ctrl_input_buffer; 288 | ctrl_binconv_array_t ctrl_binconv_array; 289 | ctrl_aq_t ctrl_accumulator; 290 | hwpe_stream_package::ctrl_serdes_t ctrl_serialize; 291 | logic [NE16_NR_COLUMN-1:0] enable_accumulator; 292 | logic clear_des; 293 | logic mode_16; 294 | logic mode_linear; 295 | } ctrl_engine_t; 296 | 297 | typedef struct packed { 298 | flags_input_buffer_t flags_input_buffer; 299 | flags_aq_t [NE16_NR_COLUMN-1:0] flags_accumulator; 300 | flags_binconv_array_t flags_binconv_array; 301 | } flags_engine_t; 302 | 303 | // ======================================================================== 304 | // URISCY CTRL related types 305 | // ======================================================================== 306 | 307 | typedef struct packed { 308 | logic start; 309 | } ctrl_ctrlmult_t; 310 | 311 | typedef struct packed { 312 | logic valid; 313 | } flags_ctrlmult_t; 314 | 315 | 316 | // ======================================================================== 317 | // STREAMER related types 318 | // ======================================================================== 319 | 320 | typedef enum { LD_FEAT_SEL, LD_WEIGHT_SEL, LD_NORM_SEL, LD_STREAMIN_SEL } ld_which_mux_sel_t; 321 | parameter logic LD_SEL = 1'b0; 322 | parameter logic ST_SEL = 1'b1; 323 | 324 | typedef struct packed { 325 | ld_which_mux_sel_t ld_which_mux_sel; 326 | logic ld_st_mux_sel; 327 | logic clear_fifo; 328 | logic clear_source; 329 | logic clear_sink; 330 | hci_package::hci_streamer_ctrl_t feat_source_ctrl; 331 | hci_package::hci_streamer_ctrl_t weight_source_ctrl; 332 | hci_package::hci_streamer_ctrl_t norm_source_ctrl; 333 | 
hci_package::hci_streamer_ctrl_t conv_sink_ctrl; 334 | hci_package::hci_streamer_ctrl_t streamin_source_ctrl; 335 | } ctrl_streamer_t; 336 | 337 | typedef struct packed { 338 | hci_package::hci_streamer_flags_t feat_source_flags; 339 | hci_package::hci_streamer_flags_t weight_source_flags; 340 | hci_package::hci_streamer_flags_t norm_source_flags; 341 | hci_package::hci_streamer_flags_t conv_sink_flags; 342 | logic tcdm_fifo_empty; 343 | } flags_streamer_t; 344 | 345 | 346 | // ======================================================================== 347 | // CTRL FSM related types 348 | // ======================================================================== 349 | 350 | typedef enum { 351 | IDLE, STREAMIN, LOAD, WEIGHTOFFS, MATRIXVEC, NORMQUANT, NORMQUANT_BIAS, NORMQUANT_SHIFT, STREAMOUT, STREAMOUT_DONE, UPDATEIDX, UPDATEIDX_WAIT, DONE 352 | } state_ne16_t; // FIXME --> move NORMQUANT to SCALE 353 | 354 | typedef struct packed { 355 | logic [31:0] weights_kom_iter; 356 | logic [31:0] weights_kim_iter; 357 | logic [31:0] weights_kom_reset_iter; 358 | logic [31:0] weights_kim_reset_iter; 359 | logic [31:0] infeat_kim_iter; 360 | logic [31:0] infeat_wom_iter; 361 | logic [31:0] infeat_hom_iter; 362 | logic [31:0] infeat_kim_reset_iter; 363 | logic [31:0] infeat_wom_reset_iter; 364 | logic [31:0] infeat_hom_reset_iter; 365 | logic [31:0] outfeat_wom_iter; 366 | logic [31:0] outfeat_hom_iter; 367 | logic [31:0] outfeat_kom_iter; 368 | logic [31:0] outfeat_wom_reset_iter; 369 | logic [31:0] outfeat_hom_reset_iter; 370 | logic [31:0] outfeat_kom_reset_iter; 371 | logic [31:0] scale_kom_iter; 372 | } uloop_iter_ne16_t; 373 | 374 | typedef struct packed { 375 | logic [31:0] weights_ptr; 376 | logic [31:0] infeat_ptr; 377 | logic [31:0] outfeat_ptr; 378 | logic [31:0] scale_ptr; 379 | logic [31:0] scale_shift_ptr; 380 | logic [31:0] scale_bias_ptr; 381 | logic [15:0] subtile_nb_ko; // register n_tiles_k_out 382 | logic [15:0] subtile_rem_ko; // register k_out_rest 383 | 
logic [15:0] subtile_nb_ki; // register n_tiles_k_in 384 | logic [15:0] subtile_rem_ki; // register k_in_rest 385 | logic [15:0] subtile_nb_ho; // register n_tiles_h_out 386 | logic [15:0] subtile_rem_ho; // register h_out_rest 387 | logic [15:0] subtile_nb_wo; // register n_tiles_w_out 388 | logic [15:0] subtile_rem_wo; // register w_out_rest 389 | logic [15:0] subtile_rem_hi; // register h_in_rest 390 | logic [15:0] subtile_rem_wi; // register w_in_rest 391 | logic [31:0] infeat_d0_stride; // register x_word_stride 392 | logic [31:0] infeat_d1_stride; // register x_line_stride 393 | logic [31:0] infeat_d2_stride; // register x_block_stride 394 | logic [31:0] weights_d0_stride; // register W_word_stride 395 | logic [31:0] weights_d1_stride; // register W_line_stride 396 | logic [31:0] weights_d2_stride; // register W_block_stride 397 | logic [31:0] outfeat_d0_stride; // register y_word_stride 398 | logic [31:0] outfeat_d1_stride; // register y_line_stride 399 | logic [31:0] outfeat_d2_stride; // register y_block_stride 400 | logic [3:0] padding_top; 401 | logic [3:0] padding_right; 402 | logic [3:0] padding_bottom; 403 | logic [3:0] padding_left; 404 | logic [15:0] padding_value; 405 | logic norm_option_bias; 406 | logic norm_option_shift; 407 | logic [31:0] weight_offset_scale; 408 | logic [7:0] filter_mask_top; 409 | logic [7:0] filter_mask_right; 410 | logic [7:0] filter_mask_bottom; 411 | logic [7:0] filter_mask_left; 412 | logic [1:0] filter_mode; 413 | logic [1:0] norm_mode; 414 | logic [1:0] quant_mode; 415 | logic relu; 416 | logic streamin; 417 | logic streamout_quant; 418 | logic mode_16; 419 | logic mode_linear; 420 | logic mode_strided; 421 | logic [3:0] weight_bits; 422 | logic use_rounding; 423 | logic [4:0] shift_reqnt; 424 | uloop_iter_ne16_t uloop_iter; 425 | } config_ne16_t; 426 | 427 | typedef struct packed { 428 | logic [15:0] k_out_major; 429 | logic [15:0] i_major; 430 | logic [15:0] j_major; 431 | logic [15:0] k_in_major; 432 | } 
index_ne16_t; 433 | 434 | typedef struct packed { 435 | logic k_out_major; 436 | logic i_major; 437 | logic j_major; 438 | logic k_in_major; 439 | } index_update_ne16_t; 440 | 441 | typedef struct packed { 442 | logic [31:0] weights; 443 | logic [31:0] infeat; 444 | logic [31:0] outfeat; 445 | logic [31:0] scale; 446 | } base_addr_ne16_t; 447 | 448 | parameter int unsigned NE16_ULOOP_BASE_ADDR_W = 0; 449 | parameter int unsigned NE16_ULOOP_BASE_ADDR_X = 1; 450 | parameter int unsigned NE16_ULOOP_BASE_ADDR_Y = 2; 451 | parameter int unsigned NE16_ULOOP_BASE_ADDR_S = 3; 452 | parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KOM_ITER = 4 - 4; 453 | parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KIM_ITER = 5 - 4; 454 | parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KOM_RESET_ITER = 6 - 4; 455 | parameter int unsigned NE16_ULOOP_RO_WEIGHTS_KIM_RESET_ITER = 7 - 4; 456 | parameter int unsigned NE16_ULOOP_RO_INFEAT_KIM_ITER = 8 - 4; 457 | parameter int unsigned NE16_ULOOP_RO_INFEAT_WOM_ITER = 9 - 4; 458 | parameter int unsigned NE16_ULOOP_RO_INFEAT_HOM_ITER = 10 - 4; 459 | parameter int unsigned NE16_ULOOP_RO_INFEAT_KIM_RESET_ITER = 11 - 4; 460 | parameter int unsigned NE16_ULOOP_RO_INFEAT_WOM_RESET_ITER = 12 - 4; 461 | parameter int unsigned NE16_ULOOP_RO_INFEAT_HOM_RESET_ITER = 13 - 4; 462 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_WOM_ITER = 14 - 4; 463 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_HOM_ITER = 15 - 4; 464 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_KOM_ITER = 16 - 4; 465 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_WOM_RESET_ITER = 17 - 4; 466 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_HOM_RESET_ITER = 18 - 4; 467 | parameter int unsigned NE16_ULOOP_RO_OUTFEAT_KOM_RESET_ITER = 19 - 4; 468 | parameter int unsigned NE16_ULOOP_RO_SCALE_KOM_ITER = 20 - 4; 469 | parameter int unsigned NE16_ULOOP_RO_ZERO = 21 - 4; 470 | 471 | // implemented with dual-context hwpe regs: 472 | parameter int NE16_REG_WEIGHTS_PTR = 0; // Weights pointer: pointer to Weights 
tensor in memory (d3=Ko, d2=Fy, d1=Fx, d0=Ki). 473 | parameter int NE16_REG_INFEAT_PTR = 1; // InFeat pointer: pointer to InFeat tensor in memory (d2=Hi, d1=Wi, d0=Ki). 474 | parameter int NE16_REG_OUTFEAT_PTR = 2; // OutFeat pointer: pointer to OutFeat tensor in memory (d2=Ho, d1=Wo, d0=Ko). 475 | parameter int NE16_REG_SCALE_PTR = 3; // Scale pointer: pointer to Scale parameters in memory (d0=Ko). 476 | parameter int NE16_REG_SCALE_SHIFT_PTR = 4; // ScaleShift pointer: pointer to ScaleShift parameters in memory (d0=Ko). 477 | parameter int NE16_REG_SCALE_BIAS_PTR = 5; // ScaleBias pointer: pointer to ScaleBias parameters in memory (d0=Ko). 478 | parameter int NE16_REG_INFEAT_D0_STRIDE = 6; // InFeat d0 stride 479 | parameter int NE16_REG_INFEAT_D1_STRIDE = 7; // InFeat d1 stride 480 | parameter int NE16_REG_INFEAT_D2_STRIDE = 8; // InFeat d2 stride 481 | parameter int NE16_REG_OUTFEAT_D0_STRIDE = 9; // OutFeat d0 stride 482 | parameter int NE16_REG_OUTFEAT_D1_STRIDE = 10; // OutFeat d1 stride 483 | parameter int NE16_REG_OUTFEAT_D2_STRIDE = 11; // OutFeat d2 stride 484 | parameter int NE16_REG_WEIGHTS_D0_STRIDE = 12; // Weights d0 stride 485 | parameter int NE16_REG_WEIGHTS_D1_STRIDE = 13; // Weights d1 stride 486 | parameter int NE16_REG_WEIGHTS_D2_STRIDE = 14; // Weights d2 stride (may be removable) 487 | parameter int NE16_REG_SUBTILE_REM0 = 15; // Subtile Remainder 0: [31:16] Ko, [15:0] Ki. 488 | parameter int NE16_REG_SUBTILE_REM1 = 16; // Subtile Remainder 1: [31:16] Ho, [15:0] Wo. 489 | parameter int NE16_REG_SUBTILE_REM2 = 17; // Subtile Remainder 2: [31:16] Hi, [15:0] Wi. 490 | parameter int NE16_REG_SUBTILE_NB0 = 18; // Subtile Number 0: [31:16] Ko, [15:0] Ki. 491 | parameter int NE16_REG_SUBTILE_NB1 = 19; // Subtile Number 1: [31:16] Ho, [15:0] Wo. 
492 | parameter int NE16_REG_PADDING = 20; // Padding 493 | parameter int NE16_REG_WEIGHT_OFFSET = 21; // Weight offset factor 494 | parameter int NE16_REG_FILTER_MASK = 22; // Filter masking: [31:24] top, [23:16] right, [15:8] bottom, [7:0] left. 495 | parameter int NE16_REG_CONFIG0 = 23; // Config 0: [31:16] Reserved (striding, dilation?) [15] weight_offseting [14] streamin [13:12] normalization bits (00=8, 01=16, 10=32), [11] rounding (0=round, 1=do not round), [10:7] padding flag (top/right/bottom/left) [6:5] filter mode (11=linear, 10=1x1, 01=3x3 depthwise, 00=3x3) [4] streamout / quantization, [3] reserved (16 bits?), [2:0] weight bits minus 1. 496 | 497 | // normal uloop microcode, generated by ucode/uloop_compile.py 498 | parameter logic[351:0] ULOOP_CODE_NORMAL = 352'h04748a101215c078a30b22942d89f0aa15c078a30b22742985701e14405; 499 | parameter logic[53:0] ULOOP_LOOPS_NORMAL = 54'b011001001001100110000100100000000010; 500 | // depthwise uloop microcode, generated by ucode/uloop_compile_dw.py 501 | parameter logic[351:0] ULOOP_CODE_DEPTHWISE = 352'h0420863a4288a101228c2c8a50b627c2a8a30b227429; 502 | parameter logic[53:0] ULOOP_LOOPS_DEPTHWISE = 54'b011100010001101000000100100000000010; 503 | 504 | // mapping of weights in linear layers 505 | parameter int NE16_LINEAR_MAP[0:80] = { 506 | 0, 1, 2, 3, 4, 5, 6, 7, -1, 507 | 8, 9, 10, 11, 12, 13, 14, 15, -1, 508 | 16, 17, 18, 19, 20, 21, 22, 23, -1, 509 | 24, 25, 26, 27, 28, 29, 30, 31, -1, 510 | -1, -1, -1, -1, -1, -1, -1, -1, -1, 511 | -1, -1, -1, -1, -1, -1, -1, -1, -1, 512 | -1, -1, -1, -1, -1, -1, -1, -1, -1, 513 | -1, -1, -1, -1, -1, -1, -1, -1, -1, 514 | -1, -1, -1, -1, -1, -1, -1, -1, -1 515 | }; 516 | 517 | endpackage 518 | -------------------------------------------------------------------------------- /rtl/ne16_streamer.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_streamer.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna 
and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | import hwpe_stream_package::*; 24 | import hci_package::*; 25 | 26 | module ne16_streamer #( 27 | parameter int unsigned TCDM_FIFO_DEPTH = 2, 28 | parameter int unsigned BW = NE16_MEM_BANDWIDTH_EXT // bandwidth 29 | ) ( 30 | // global signals 31 | input logic clk_i, 32 | input logic rst_ni, 33 | input logic test_mode_i, 34 | // local enable & clear 35 | input logic enable_i, 36 | input logic clear_i, 37 | // input feat stream + handshake 38 | hwpe_stream_intf_stream.source feat_o, 39 | // input weight stream + handshake 40 | hwpe_stream_intf_stream.source weight_o, 41 | // input norm stream + handshake 42 | hwpe_stream_intf_stream.source norm_o, 43 | // input streamin stream + handshake 44 | hwpe_stream_intf_stream.source streamin_o, 45 | // output features + handshake 46 | hwpe_stream_intf_stream.sink conv_i, 47 | // TCDM ports 48 | hci_core_intf.master tcdm, 49 | // control channel 50 | input ctrl_streamer_t ctrl_i, 51 | output flags_streamer_t flags_o 52 | ); 53 | 54 | // NE16_MEM_BANDWIDTH parameter: number of bits per tile. 
55 | 56 | hci_streamer_ctrl_t all_source_ctrl; 57 | hci_streamer_flags_t all_source_flags; 58 | flags_fifo_t tcdm_fifo_flags; 59 | 60 | hwpe_stream_intf_stream #( 61 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 62 | `ifndef SYNTHESIS 63 | , 64 | .BYPASS_VCR_ASSERT( 1'b1 ), 65 | .BYPASS_VDR_ASSERT( 1'b1 ) 66 | `endif 67 | ) all_source ( 68 | .clk ( clk_i ) 69 | ); 70 | 71 | hwpe_stream_intf_stream #( 72 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH ) 73 | `ifndef SYNTHESIS 74 | , 75 | .BYPASS_VCR_ASSERT( 1'b1 ), 76 | .BYPASS_VDR_ASSERT( 1'b1 ) 77 | `endif 78 | ) virt_source[3:0] ( 79 | .clk ( clk_i ) 80 | ); 81 | 82 | hci_core_intf #( 83 | .DW ( NE16_MEM_BANDWIDTH_EXT ) 84 | ) virt_tcdm [1:0] ( 85 | .clk ( clk_i ) 86 | ); 87 | 88 | hci_core_intf #( 89 | .DW ( NE16_MEM_BANDWIDTH_EXT ) 90 | ) tcdm_prefifo ( 91 | .clk ( clk_i ) 92 | ); 93 | 94 | hci_core_intf #( 95 | .DW ( NE16_MEM_BANDWIDTH_EXT ) 96 | ) tcdm_prefilter ( 97 | .clk ( clk_i ) 98 | ); 99 | 100 | hci_core_source #( 101 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH_EXT ) 102 | ) i_all_source ( 103 | .clk_i ( clk_i ), 104 | .rst_ni ( rst_ni ), 105 | .test_mode_i ( test_mode_i ), 106 | .clear_i ( clear_i | ctrl_i.clear_source ), 107 | .enable_i ( ~ctrl_i.ld_st_mux_sel ), 108 | .tcdm ( virt_tcdm [0] ), 109 | .stream ( all_source ), 110 | .ctrl_i ( all_source_ctrl ), 111 | .flags_o ( all_source_flags ) 112 | ); 113 | 114 | hci_core_sink #( 115 | .DATA_WIDTH ( NE16_MEM_BANDWIDTH_EXT ) 116 | ) i_sink ( 117 | .clk_i ( clk_i ), 118 | .rst_ni ( rst_ni ), 119 | .test_mode_i ( test_mode_i ), 120 | .clear_i ( clear_i | ctrl_i.clear_sink ), 121 | .enable_i ( ctrl_i.ld_st_mux_sel ), 122 | .tcdm ( virt_tcdm [1] ), 123 | .stream ( conv_i ), 124 | .ctrl_i ( ctrl_i.conv_sink_ctrl ), 125 | .flags_o ( flags_o.conv_sink_flags ) 126 | ); 127 | 128 | generate 129 | if(TCDM_FIFO_DEPTH > 0) begin : use_fifo_gen 130 | hci_core_mux_static #( 131 | .NB_CHAN (2), 132 | .DW ( NE16_MEM_BANDWIDTH_EXT ) 133 | ) i_ld_st_mux_static ( 134 | .clk_i ( clk_i ), 135 | .rst_ni 
( rst_ni ), 136 | .clear_i ( clear_i ), 137 | .sel_i ( ctrl_i.ld_st_mux_sel ), 138 | .in ( virt_tcdm ), 139 | .out ( tcdm_prefifo ) 140 | ); 141 | 142 | hci_core_fifo #( 143 | .FIFO_DEPTH ( TCDM_FIFO_DEPTH ), 144 | .DW ( NE16_MEM_BANDWIDTH_EXT ), 145 | .AW ( 32 ), 146 | .OW ( 1 ) 147 | ) i_tcdm_fifo ( 148 | .clk_i ( clk_i ), 149 | .rst_ni ( rst_ni ), 150 | .clear_i ( clear_i | ctrl_i.clear_fifo ), 151 | .flags_o ( tcdm_fifo_flags ), 152 | .tcdm_slave ( tcdm_prefifo ), 153 | .tcdm_master ( tcdm_prefilter ) 154 | ); 155 | end 156 | else begin : dont_use_fifo_gen 157 | hci_core_mux_static #( 158 | .NB_CHAN (2), 159 | .DW ( NE16_MEM_BANDWIDTH_EXT ) 160 | ) i_ld_st_mux_static ( 161 | .clk_i ( clk_i ), 162 | .rst_ni ( rst_ni ), 163 | .clear_i ( clear_i ), 164 | .sel_i ( ctrl_i.ld_st_mux_sel ), 165 | .in ( virt_tcdm ), 166 | .out ( tcdm_prefilter ) 167 | ); 168 | assign tcdm_fifo_flags.empty = 1'b1; 169 | end 170 | endgenerate 171 | 172 | hci_core_r_valid_filter i_tcdm_filter ( 173 | .clk_i ( clk_i ), 174 | .rst_ni ( rst_ni ), 175 | .clear_i ( clear_i ), 176 | .enable_i ( 1'b1 ), 177 | .tcdm_slave ( tcdm_prefilter ), 178 | .tcdm_master ( tcdm ) 179 | ); 180 | 181 | always_comb 182 | begin : ld_which_ctrl_mux 183 | all_source_ctrl = '0; 184 | if(ctrl_i.ld_which_mux_sel == LD_FEAT_SEL) 185 | all_source_ctrl = ctrl_i.feat_source_ctrl; 186 | else if(ctrl_i.ld_which_mux_sel == LD_WEIGHT_SEL) 187 | all_source_ctrl = ctrl_i.weight_source_ctrl; 188 | else if(ctrl_i.ld_which_mux_sel == LD_NORM_SEL) 189 | all_source_ctrl = ctrl_i.norm_source_ctrl; 190 | else if(ctrl_i.ld_which_mux_sel == LD_STREAMIN_SEL) 191 | all_source_ctrl = ctrl_i.streamin_source_ctrl; 192 | end 193 | 194 | assign flags_o.feat_source_flags = all_source_flags; 195 | assign flags_o.norm_source_flags = all_source_flags; 196 | assign flags_o.weight_source_flags = all_source_flags; 197 | assign flags_o.tcdm_fifo_empty = tcdm_fifo_flags.empty; 198 | 199 | logic [1:0] ld_which_mux_sel; 200 | assign ld_which_mux_sel = 
(ctrl_i.ld_which_mux_sel == LD_FEAT_SEL) ? 2'b00 : 201 | (ctrl_i.ld_which_mux_sel == LD_WEIGHT_SEL) ? 2'b01 : 202 | (ctrl_i.ld_which_mux_sel == LD_NORM_SEL) ? 2'b10 : 203 | 2'b11; // LD_STREAMIN_SEL 204 | 205 | hwpe_stream_demux_static #( 206 | .NB_OUT_STREAMS ( 4 ) 207 | ) i_all_source_demux ( 208 | .clk_i ( clk_i ), 209 | .rst_ni ( rst_ni ), 210 | .clear_i ( clear_i ), 211 | .sel_i ( ld_which_mux_sel ), 212 | .push_i ( all_source ), 213 | .pop_o ( virt_source ) 214 | ); 215 | 216 | hwpe_stream_assign i_assign_feat ( .push_i (virt_source[0]), .pop_o ( feat_o ) ); 217 | hwpe_stream_assign i_assign_weight ( .push_i (virt_source[1]), .pop_o ( weight_o ) ); 218 | hwpe_stream_assign i_assign_norm ( .push_i (virt_source[2]), .pop_o ( norm_o ) ); 219 | hwpe_stream_assign i_assign_streamin ( .push_i (virt_source[3]), .pop_o ( streamin_o ) ); 220 | 221 | endmodule // ne16_streamer 222 | -------------------------------------------------------------------------------- /rtl/ne16_top.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_top.sv 3 | * 4 | * Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 
14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | import hwpe_ctrl_package::*; 24 | import hci_package::*; 25 | 26 | module ne16_top #( 27 | parameter int unsigned TP_IN = NE16_TP_IN, // number of input elements processed per cycle 28 | parameter int unsigned TP_OUT = NE16_TP_OUT, // number of output elements processed per cycle 29 | parameter int unsigned CNT = VLEN_CNT_SIZE, // counter size 30 | parameter int unsigned ID = ID_WIDTH, 31 | parameter int unsigned BW = NE16_MEM_BANDWIDTH_EXT, // NE16_MEM_BANDWIDTH 32 | parameter int unsigned DW = NE16_STREAM_BANDWIDTH, 33 | 34 | parameter int unsigned N_CORES = NR_CORES, 35 | parameter int unsigned N_CONTEXT = NR_CONTEXT 36 | ) ( 37 | // global signals 38 | input logic clk_i, 39 | input logic rst_ni, 40 | input logic test_mode_i, 41 | // events 42 | output logic [N_CORES-1:0][REGFILE_N_EVT-1:0] evt_o, 43 | output logic busy_o, 44 | // tcdm master ports 45 | hci_core_intf.master tcdm, 46 | // periph slave port 47 | hwpe_ctrl_intf_periph.slave periph 48 | ); 49 | 50 | // signals 51 | logic enable; 52 | logic clear; 53 | 54 | ctrl_streamer_t streamer_ctrl; 55 | flags_streamer_t streamer_flags; 56 | ctrl_engine_t engine_ctrl; 57 | flags_engine_t engine_flags; 58 | 59 | hwpe_stream_intf_stream #( 60 | .DATA_WIDTH(DW) 61 | `ifndef SYNTHESIS 62 | , 63 | .BYPASS_VCR_ASSERT( 1'b1 ), 64 | .BYPASS_VDR_ASSERT( 1'b1 ) 65 | `endif 66 | ) feat (.clk(clk_i)); 67 | 68 | hwpe_stream_intf_stream #( 69 | .DATA_WIDTH(NE16_MEM_BANDWIDTH) 70 | `ifndef SYNTHESIS 71 | , 72 | .BYPASS_VCR_ASSERT( 1'b1 ), 73 | .BYPASS_VDR_ASSERT( 1'b1 ) 74 | `endif 75 | ) weight (.clk(clk_i)); 76 | 77 | hwpe_stream_intf_stream #( 78 | .DATA_WIDTH(NE16_MEM_BANDWIDTH) 79 | `ifndef SYNTHESIS 80 | , 81 | .BYPASS_VCR_ASSERT( 1'b1 ), 82 | .BYPASS_VDR_ASSERT( 1'b1 ) 83 | `endif 84 | ) norm (.clk(clk_i)); 85 | 86 | hwpe_stream_intf_stream #( 87 | 
.DATA_WIDTH(NE16_MEM_BANDWIDTH) 88 | `ifndef SYNTHESIS 89 | , 90 | .BYPASS_VCR_ASSERT( 1'b1 ), 91 | .BYPASS_VDR_ASSERT( 1'b1 ) 92 | `endif 93 | ) streamin (.clk(clk_i)); 94 | 95 | hwpe_stream_intf_stream #( 96 | .DATA_WIDTH(NE16_MEM_BANDWIDTH) 97 | `ifndef SYNTHESIS 98 | , 99 | .BYPASS_VCR_ASSERT( 1'b1 ), 100 | .BYPASS_VDR_ASSERT( 1'b1 ) 101 | `endif 102 | ) conv (.clk(clk_i)); 103 | 104 | ne16_engine i_engine ( 105 | .clk_i ( clk_i ), 106 | .rst_ni ( rst_ni ), 107 | .test_mode_i ( test_mode_i ), 108 | .enable_i ( enable ), 109 | .clear_i ( clear ), 110 | .load_in ( feat ), 111 | .load_weight ( weight ), 112 | .load_norm ( norm ), 113 | .load_streamin ( streamin ), 114 | .store_out ( conv ), 115 | .ctrl_i ( engine_ctrl ), 116 | .flags_o ( engine_flags ) 117 | ); 118 | 119 | ne16_streamer #( 120 | .BW ( NE16_MEM_BANDWIDTH_EXT ) 121 | ) i_streamer ( 122 | .clk_i ( clk_i ), 123 | .rst_ni ( rst_ni ), 124 | .test_mode_i ( test_mode_i ), 125 | .enable_i ( enable ), 126 | .clear_i ( clear ), 127 | .feat_o ( feat ), 128 | .weight_o ( weight ), 129 | .norm_o ( norm ), 130 | .streamin_o ( streamin ), 131 | .conv_i ( conv ), 132 | .tcdm ( tcdm ), 133 | .ctrl_i ( streamer_ctrl ), 134 | .flags_o ( streamer_flags ) 135 | ); 136 | 137 | ne16_ctrl #( 138 | .ID ( ID ), 139 | .N_CORES ( N_CORES ) 140 | ) i_ctrl ( 141 | .clk_i ( clk_i ), 142 | .rst_ni ( rst_ni ), 143 | .test_mode_i ( test_mode_i ), 144 | .busy_o ( busy_o ), 145 | .evt_o ( evt_o ), 146 | .clear_o ( clear ), 147 | .ctrl_streamer_o ( streamer_ctrl ), 148 | .flags_streamer_i ( streamer_flags ), 149 | .ctrl_engine_o ( engine_ctrl ), 150 | .flags_engine_i ( engine_flags ), 151 | .periph ( periph ) 152 | ); 153 | 154 | assign enable = busy_o; 155 | 156 | endmodule // ne16_top 157 | -------------------------------------------------------------------------------- /rtl/ne16_top_wrap.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * ne16_top_wrap.sv 3 | * 4 | * Copyright (C) 
2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | * 6 | * Copyright and related rights are licensed under the Solderpad Hardware 7 | * License, Version 0.51 (the "License"); you may not use this file except in 8 | * compliance with the License. You may obtain a copy of the License at 9 | * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law 10 | * or agreed to in writing, software, hardware and materials distributed under 11 | * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | * specific language governing permissions and limitations under the License. 14 | */ 15 | 16 | /* 17 | * Authors (RBE): Gianna Paulin 18 | * Francesco Conti 19 | * Authors (NE16): Francesco Conti 20 | */ 21 | 22 | import ne16_package::*; 23 | import hwpe_ctrl_package::*; 24 | import hci_package::*; 25 | 26 | module ne16_top_wrap #( 27 | parameter int unsigned TP_IN = NE16_TP_IN, // number of input elements processed per cycle 28 | parameter int unsigned TP_OUT = NE16_TP_OUT, // number of output elements processed per cycle 29 | parameter int unsigned CNT = VLEN_CNT_SIZE, // counter size 30 | parameter int unsigned BW = NE16_MEM_BANDWIDTH_EXT, // NE16_MEM_BANDWIDTH 31 | parameter int unsigned MP = BW/32, // number of memory ports (each a 32bit data) 32 | parameter int unsigned ID = ID_WIDTH, 33 | parameter int unsigned N_CORES = NR_CORES, 34 | parameter int unsigned N_CONTEXT = NR_CONTEXT 35 | ) ( 36 | // global signals 37 | input logic clk_i, 38 | input logic rst_ni, 39 | input logic test_mode_i, 40 | // events 41 | output logic [N_CORES-1:0][REGFILE_N_EVT-1:0] evt_o, 42 | output logic busy_o, 43 | // tcdm master ports 44 | output logic [ MP-1:0] tcdm_req, 45 | input logic [ MP-1:0] tcdm_gnt, 46 | output logic [ MP-1:0][ 31:0] tcdm_add, 47 | output logic [ MP-1:0] tcdm_wen, 48 | output logic [ MP-1:0][ 3:0] tcdm_be, 49 | output logic [ 
MP-1:0][ 31:0] tcdm_data, 50 | input logic [ MP-1:0][ 31:0] tcdm_r_data, 51 | input logic [ MP-1:0] tcdm_r_valid, 52 | // periph slave port 53 | input logic periph_req, 54 | output logic periph_gnt, 55 | input logic [ 31:0] periph_add, 56 | input logic periph_wen, 57 | input logic [ 3:0] periph_be, 58 | input logic [ 31:0] periph_data, 59 | input logic [ ID-1:0] periph_id, 60 | output logic [ 31:0] periph_r_data, 61 | output logic periph_r_valid, 62 | output logic [ ID-1:0] periph_r_id 63 | ); 64 | 65 | hci_core_intf #( 66 | .DW ( BW ) 67 | ) tcdm ( 68 | .clk ( clk_i ) 69 | ); 70 | 71 | hwpe_ctrl_intf_periph #(.ID_WIDTH(ID)) periph (.clk(clk_i)); 72 | 73 | // bindings 74 | generate 75 | for(genvar ii=0; ii> {tcdm_r_data} } ; 85 | endgenerate 86 | 87 | always_comb 88 | begin 89 | periph.req = periph_req; 90 | periph.add = periph_add; 91 | periph.wen = periph_wen; 92 | periph.be = periph_be; 93 | periph.data = periph_data; 94 | periph.id = periph_id; 95 | periph_gnt = periph.gnt; 96 | periph_r_data = periph.r_data; 97 | periph_r_valid = periph.r_valid; 98 | periph_r_id = periph.r_id; 99 | end 100 | 101 | ne16_top #( 102 | .TP_IN (TP_IN ), 103 | .TP_OUT (TP_OUT ), 104 | .CNT (CNT ), 105 | .BW (BW ), 106 | .ID (ID ), 107 | .N_CORES (N_CORES ), 108 | .N_CONTEXT(N_CONTEXT) 109 | ) i_ne16_top ( 110 | .clk_i ( clk_i ), 111 | .rst_ni ( rst_ni ), 112 | .test_mode_i ( test_mode_i ), 113 | .evt_o ( evt_o ), 114 | .busy_o ( busy_o ), 115 | .tcdm ( tcdm.master ), 116 | .periph ( periph.slave ) 117 | ); 118 | 119 | endmodule // ne16_top_wrap 120 | -------------------------------------------------------------------------------- /src_files.yml: -------------------------------------------------------------------------------- 1 | ne16: 2 | vlog_opts: [ 3 | +nowarnSVCHK, 4 | -suppress 2275, 5 | -L hwpe_stream_lib, 6 | -L hwpe_ctrl_lib, 7 | -L hci_lib, 8 | ] 9 | incdirs: [ 10 | ., 11 | ../hwpe-stream/rtl, 12 | ../hwpe-ctrl/rtl, 13 | ] 14 | files: [ 15 | rtl/ne16_package.sv, 16 | 
rtl/accumulator/ne16_accumulator_scm_test_wrap.sv, 17 | rtl/input_buffer/ne16_input_buffer_scm_test_wrap.sv, 18 | rtl/accumulator/ne16_accumulator_scm.sv, 19 | rtl/accumulator/ne16_accumulator_normquant.sv, 20 | rtl/accumulator/ne16_normquant.sv, 21 | rtl/accumulator/ne16_normquant_shifter.sv, 22 | rtl/accumulator/ne16_normquant_bias.sv, 23 | rtl/accumulator/ne16_normquant_multiplier.sv, 24 | rtl/input_buffer/ne16_input_buffer_scm.sv, 25 | rtl/input_buffer/ne16_input_buffer.sv, 26 | rtl/array/ne16_scale.sv, 27 | rtl/array/ne16_binconv_block.sv, 28 | rtl/array/ne16_binconv_column.sv, 29 | rtl/array/ne16_binconv_array.sv, 30 | rtl/ctrl/ne16_ctrl_fsm.sv, 31 | rtl/ctrl/ne16_ctrl.sv, 32 | rtl/ne16_engine.sv, 33 | rtl/ne16_streamer.sv, 34 | rtl/ne16_top.sv, 35 | rtl/ne16_top_wrap.sv, 36 | ] 37 | -------------------------------------------------------------------------------- /ucode/code.yml: -------------------------------------------------------------------------------- 1 | # 2 | # code.yml 3 | # 4 | # Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Author: Francesco Conti 19 | 20 | # mnemonics to simplify microcode writing 21 | mnemonics: 22 | base_addr_W: 0 23 | base_addr_x: 1 24 | base_addr_y: 2 25 | base_addr_s: 3 26 | weights_kom_iter: 4 27 | weights_kim_iter: 5 28 | weights_kom_reset_iter: 6 29 | weights_kim_reset_iter: 7 30 | infeat_kim_iter: 8 31 | infeat_wom_iter: 9 32 | infeat_hom_iter: 10 33 | infeat_kim_reset_iter: 11 34 | infeat_wom_reset_iter: 12 35 | infeat_hom_reset_iter: 13 36 | outfeat_wom_iter: 14 37 | outfeat_hom_iter: 15 38 | outfeat_kom_iter: 16 39 | outfeat_wom_reset_iter: 17 40 | outfeat_hom_reset_iter: 18 41 | outfeat_kom_reset_iter: 19 42 | scale_kom_iter: 20 43 | zero: 21 44 | 45 | # NE16 code 46 | code: 47 | k_in_major: 48 | - { op : add, a: base_addr_W, b: weights_kim_iter } 49 | - { op : add, a: base_addr_x, b: infeat_kim_iter } 50 | j_major: 51 | - { op : add, a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter 52 | - { op : add, a: base_addr_x, b: infeat_kim_reset_iter } # infeat_kim_reset_iter = - subtile_nb_ki * infeat_kim_iter 53 | - { op : add, a: base_addr_x, b: infeat_wom_iter } 54 | - { op : add, a: base_addr_y, b: outfeat_wom_iter } 55 | i_major: 56 | - { op : add, a: base_addr_x, b: infeat_wom_reset_iter } # infeat_wom_reset_iter = - subtile_nb_wo * infeat_wom_iter 57 | - { op : add, a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter 58 | - { op : add, a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter 59 | - { op : add, a: base_addr_x, b: infeat_kim_reset_iter } # infeat_kim_reset_iter = - subtile_nb_ki * infeat_kim_iter 60 | - { op : add, a: base_addr_x, b: infeat_hom_iter } 61 | - { op : add, a: base_addr_y, b: outfeat_hom_iter } 62 | k_out_major: 63 | - { op : add, a: base_addr_x, b: infeat_hom_reset_iter } # infeat_hom_reset_iter = - subtile_nb_ho * infeat_hom_iter 64 | - { op : add, 
a: base_addr_y, b: outfeat_hom_reset_iter } # outfeat_hom_reset_iter = - subtile_nb_ho * outfeat_hom_iter 65 | - { op : add, a: base_addr_x, b: infeat_wom_reset_iter } # infeat_wom_reset_iter = - subtile_nb_wo * infeat_wom_iter 66 | - { op : add, a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter 67 | - { op : add, a: base_addr_W, b: weights_kim_reset_iter } # weights_kim_reset_iter = - subtile_nb_ki * weights_kim_iter 68 | - { op : add, a: base_addr_x, b: infeat_kim_reset_iter } # infeat_kim_reset_iter = - subtile_nb_ki * infeat_kim_iter 69 | - { op : add, a: base_addr_W, b: weights_kom_iter } 70 | - { op : add, a: base_addr_y, b: outfeat_kom_iter } 71 | - { op : add, a: base_addr_s, b: scale_kom_iter } 72 | -------------------------------------------------------------------------------- /ucode/code_dw.yml: -------------------------------------------------------------------------------- 1 | # 2 | # code_dw.yml 3 | # 4 | # Copyright (C) 2019-2021 ETH Zurich, University of Bologna and GreenWaves Technologies 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Author: Francesco Conti 19 | 20 | # mnemonics to simplify microcode writing 21 | mnemonics: 22 | base_addr_W: 0 23 | base_addr_x: 1 24 | base_addr_y: 2 25 | base_addr_s: 3 26 | weights_km_iter: 4 27 | null5: 5 28 | weights_km_reset_iter: 6 29 | null7: 7 30 | infeat_km_iter: 8 31 | infeat_wom_iter: 9 32 | infeat_hom_iter: 10 33 | infeat_km_reset_iter: 11 34 | infeat_wom_reset_iter: 12 35 | infeat_hom_reset_iter: 13 36 | outfeat_wom_iter: 14 37 | outfeat_hom_iter: 15 38 | outfeat_km_iter: 16 39 | outfeat_wom_reset_iter: 17 40 | outfeat_hom_reset_iter: 18 41 | outfeat_km_reset_iter: 19 42 | scale_km_iter: 20 43 | zero: 21 44 | 45 | # NE16 code 46 | code: 47 | j_major: 48 | - { op : add, a: base_addr_x, b: infeat_wom_iter } 49 | - { op : add, a: base_addr_y, b: outfeat_wom_iter } 50 | i_major: 51 | - { op : add, a: base_addr_x, b: infeat_wom_reset_iter } # infeat_wom_reset_iter = - subtile_nb_wo * infeat_wom_iter 52 | - { op : add, a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter 53 | - { op : add, a: base_addr_x, b: infeat_hom_iter } 54 | - { op : add, a: base_addr_y, b: outfeat_hom_iter } 55 | k_out_major: 56 | - { op : add, a: base_addr_x, b: infeat_hom_reset_iter } # infeat_hom_reset_iter = - subtile_nb_ho * infeat_hom_iter 57 | - { op : add, a: base_addr_y, b: outfeat_hom_reset_iter } # outfeat_hom_reset_iter = - subtile_nb_ho * outfeat_hom_iter 58 | - { op : add, a: base_addr_x, b: infeat_wom_reset_iter } # infeat_wom_reset_iter = - subtile_nb_wo * infeat_wom_iter 59 | - { op : add, a: base_addr_y, b: outfeat_wom_reset_iter } # outfeat_wom_reset_iter = - subtile_nb_wo * outfeat_wom_iter 60 | - { op : add, a: base_addr_W, b: weights_km_iter } 61 | - { op : add, a: base_addr_y, b: outfeat_km_iter } 62 | - { op : add, a: base_addr_x, b: infeat_km_iter } 63 | - { op : add, a: base_addr_s, b: scale_km_iter } 64 | fake_loop: 65 | - { op : mv, a: base_addr_x, b: base_addr_x } 66 | - { op : mv, 
a: base_addr_x, b: base_addr_x } 67 | -------------------------------------------------------------------------------- /ucode/uloop_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # uloop_check.py 4 | # Francesco Conti 5 | # 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # See LICENSE.sw.txt for details. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | # 20 | 21 | from __future__ import print_function 22 | from uloop_common import * 23 | import math 24 | 25 | # high-level loop 26 | def iterate_hl_loop(subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter): 27 | 28 | for k_out_major in range(subtile_nb_ko): 29 | for i_major in range(subtile_nb_ho): 30 | for j_major in range(subtile_nb_wo): 31 | for k_in_major in range(subtile_nb_ki): 32 | 33 | # auto base_addr_x = i_major*h_size_out*this->w_in_int*this->k_in + j_major*w_size_out*this->k_in + k_in_major*this->TP_IN; 34 | base_addr_x = i_major*infeat_hom_iter + j_major*infeat_wom_iter + k_in_major*infeat_kim_iter 35 | 36 | # auto base_addr_W_3x3 = (k_out_major*this->TP_OUT*this->subtile_nb_ki*this->qw + k_in_major*this->qw) * this->FILTER_SIZE*this->FILTER_SIZE * 2; 37 | # auto base_addr_W_1x1 = 
(k_out_major*this->TP_OUT*this->subtile_nb_ki + k_in_major) * this->qw * 2; 38 | base_addr_W = k_out_major*weights_kom_iter + k_in_major*weights_kim_iter 39 | 40 | # auto base_addr_y = i_major*h_size_out*this->w_out_int*this->k_out + j_major*w_size_out*this->k_out + k_out_major*this->TP_OUT; 41 | base_addr_y = i_major*outfeat_hom_iter + j_major*outfeat_wom_iter + k_out_major*outfeat_kom_iter 42 | 43 | base_addr_s = k_out_major*scale_kom_iter 44 | 45 | yield base_addr_W, base_addr_x, base_addr_y, base_addr_s 46 | 47 | VERBOSE = True 48 | 49 | 50 | def uloop_check( 51 | subtile_nb_ko, 52 | subtile_nb_ho, 53 | subtile_nb_wo, 54 | subtile_nb_ki, 55 | h_size_out, 56 | w_size_out, 57 | k_in, 58 | w_in_int, 59 | k_out, 60 | w_out_int, 61 | qw, 62 | fs, 63 | FILTER_SIZE=3, 64 | TP_IN=16, 65 | TP_OUT=32, 66 | 67 | # infeat_hom_iter, 68 | # infeat_wom_iter, 69 | # infeat_kim_iter, 70 | # weights_kom_iter, 71 | # weights_kim_iter, 72 | # outfeat_hom_iter, 73 | # outfeat_wom_iter, 74 | # outfeat_kom_iter, 75 | verbose=VERBOSE 76 | ): 77 | 78 | infeat_hom_iter = h_size_out * w_in_int * k_in 79 | infeat_wom_iter = w_size_out * k_in 80 | infeat_kim_iter = TP_IN 81 | 82 | if fs==3: 83 | weights_kom_iter = TP_OUT*subtile_nb_ki*qw * FILTER_SIZE*FILTER_SIZE * 2 84 | weights_kim_iter = qw * FILTER_SIZE*FILTER_SIZE * 2 85 | else: 86 | weights_kom_iter = TP_OUT*subtile_nb_ki*qw * 2 87 | weights_kim_iter = qw * 2 88 | 89 | outfeat_hom_iter = h_size_out * w_out_int * k_out 90 | outfeat_wom_iter = w_size_out * k_out 91 | outfeat_kom_iter = TP_OUT 92 | 93 | scale_kom_iter = TP_OUT>>2 94 | 95 | print("> Base iter\n\tsubtile_nb_ko=%d\n\tsubtile_nb_ho=%d\n\tsubtile_nb_wo=%d\n\tsubtile_nb_ki=%d\n\th_size_out=%d\n\tw_size_out=%d\n\tinfeat_hom_iter=%x\n\tinfeat_wom_iter=%x\n\tinfeat_kim_iter=%x\n\tweights_kom_iter=%x\n\tweights_kim_iter=%x\n\toutfeat_hom_iter=%x\n\toutfeat_wom_iter=%x\n\toutfeat_kom_iter=%x\n\tscale_kom_iter=%x" % (subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, 
h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter)) 96 | weights_kom_reset_iter = - (subtile_nb_ko-1) * weights_kom_iter 97 | weights_kim_reset_iter = - (subtile_nb_ki-1) * weights_kim_iter 98 | infeat_kim_reset_iter = - (subtile_nb_ki-1) * infeat_kim_iter 99 | infeat_wom_reset_iter = - (subtile_nb_wo-1) * infeat_wom_iter 100 | outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter 101 | infeat_hom_reset_iter = - (subtile_nb_ho-1) * infeat_hom_iter 102 | outfeat_hom_reset_iter = - (subtile_nb_ho-1) * outfeat_hom_iter 103 | outfeat_kom_reset_iter = - (subtile_nb_ko-1) * outfeat_kom_iter 104 | print("> Reset iter\n\tweights_kom_reset_iter=%x\n\tweights_kim_reset_iter=%x\n\tinfeat_kim_reset_iter=%x\n\tinfeat_wom_reset_iter=%x\n\toutfeat_wom_reset_iter=%x\n\tinfeat_hom_reset_iter=%x\n\toutfeat_hom_reset_iter=%x\n\toutfeat_kom_reset_iter=%x" % (weights_kom_reset_iter, weights_kim_reset_iter, infeat_kim_reset_iter, infeat_wom_reset_iter, outfeat_wom_reset_iter, infeat_hom_reset_iter, outfeat_hom_reset_iter, outfeat_kom_reset_iter)) 105 | 106 | registers = [ 107 | 0, # base_addr_W 108 | 0, # base_addr_x 109 | 0, # base_addr_y 110 | 0, # base_addr_s 111 | weights_kom_iter, 112 | weights_kim_iter, 113 | weights_kom_reset_iter, 114 | weights_kim_reset_iter, 115 | infeat_kim_iter, 116 | infeat_wom_iter, 117 | infeat_hom_iter, 118 | infeat_kim_reset_iter, 119 | infeat_wom_reset_iter, 120 | infeat_hom_reset_iter, 121 | outfeat_wom_iter, 122 | outfeat_hom_iter, 123 | outfeat_kom_iter, 124 | outfeat_wom_reset_iter, 125 | outfeat_hom_reset_iter, 126 | outfeat_kom_reset_iter, 127 | scale_kom_iter, 128 | 0 129 | ] 130 | 131 | loops_ops,code,mnem = uloop_load("code.yml") 132 | loops = uloop_get_loops(loops_ops, (subtile_nb_ki, subtile_nb_wo, subtile_nb_ho, subtile_nb_ko)) 133 | 134 | err = 0 135 | idx = [] 136 | nb_loops = 4 137 | for j in 
range(nb_loops): 138 | idx.append(0) 139 | state = (0,0,0,idx) 140 | busy = False 141 | execute = True 142 | # uloop_print_idx(state, registers) 143 | hidx = 0, 0, 0, 0 144 | hl_loop = iterate_hl_loop(subtile_nb_ko, subtile_nb_ho, subtile_nb_wo, subtile_nb_ki, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_kim_iter, weights_kom_iter, weights_kim_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_kom_iter, scale_kom_iter) 145 | hW, hX, hY, hS = next(hl_loop) 146 | for i in range(0,1000000): 147 | new_registers = uloop_execute(state, code, registers) 148 | execute,end,busy,state = uloop_state_machine(loops, state, verbose=verbose, nb_loops=nb_loops) 149 | if execute: 150 | registers = new_registers 151 | if not busy: 152 | try: 153 | hW, hX, hY, hS = next(hl_loop) 154 | except StopIteration: 155 | pass 156 | if verbose: 157 | uloop_print_idx(state, registers, register_names=('weights', 'infeat', 'outfeat', 'scale')) 158 | uW, uX, uY, uS = registers[0:4] 159 | if (hW != uW or hX != uX or hY != uY or hS != uS): 160 | if verbose: 161 | print(" ERROR!!!") 162 | print(" High-level: weights:%x infeat:%x outfeat:%x scale:%x" % (hW, hX, hY, hS)) 163 | print(" uLoop: weights:%x infeat:%x outfeat:%x scale:%x" % (uW, uX, uY, uS)) 164 | err += 1 165 | if end: 166 | break 167 | 168 | print(err, " errors", "!!!" 
if err > 0 else "") 169 | return err 170 | 171 | uloop_check( 172 | 2, # subtile_nb_ko, 173 | 1, # subtile_nb_ho, 174 | 1, # subtile_nb_wo, 175 | 1, # subtile_nb_ki, 176 | 3, # h_size_out, 177 | 3, # w_size_out, 178 | 16, # k_in, 179 | 5, # w_in_int, 180 | 64, # k_out, 181 | 3, # w_out_int, 182 | 8, # qw, 183 | 3, # fs, 184 | verbose = True 185 | ) 186 | -------------------------------------------------------------------------------- /ucode/uloop_check_dw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # uloop_check.sv 4 | # Francesco Conti 5 | # 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # See LICENSE.sw.txt for details. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
#

from __future__ import print_function
from uloop_common import *
import math

# high-level loop
def iterate_hl_loop(subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter):
    """Yield the reference base addresses for every depthwise subtile.

    The loop nest is k (outermost), then ho, then wo (innermost). Each step
    yields the tuple (base_addr_W, base_addr_x, base_addr_y, base_addr_s),
    where every address is a plain linear combination of the loop indices and
    the ``*_iter`` strides passed in. ``h_size_out`` and ``w_size_out`` are
    accepted but not read here (they are already folded into the strides by
    the caller).
    """

    for k_major in range(subtile_nb_k):
        for i_major in range(subtile_nb_ho):
            for j_major in range(subtile_nb_wo):

                # auto base_addr_x = i_major*h_size_out*this->w_in_int*this->k_in + j_major*w_size_out*this->k_in + k_in_major*this->TP_IN;
                base_addr_x = i_major*infeat_hom_iter + j_major*infeat_wom_iter + k_major*infeat_km_iter

                # auto base_addr_W_3x3 = (k_out_major*this->TP_OUT*this->subtile_nb_ki*this->qw + k_in_major*this->qw) * this->FILTER_SIZE*this->FILTER_SIZE * 2;
                # auto base_addr_W_1x1 = (k_out_major*this->TP_OUT*this->subtile_nb_ki + k_in_major) * this->qw * 2;
                base_addr_W = k_major*weights_km_iter

                # auto base_addr_y = i_major*h_size_out*this->w_out_int*this->k_out + j_major*w_size_out*this->k_out + k_out_major*this->TP_OUT;
                base_addr_y = i_major*outfeat_hom_iter + j_major*outfeat_wom_iter + k_major*outfeat_km_iter

                base_addr_s = k_major*scale_km_iter

                yield base_addr_W, base_addr_x, base_addr_y, base_addr_s

VERBOSE = True


def uloop_check(
    subtile_nb_k,
    subtile_nb_ho,
    subtile_nb_wo,
    h_size_out,
    w_size_out,
    k,
    w_in_int,
    w_out_int,
    qw,
    fs=3,
    FILTER_SIZE=3,
    TP_IN=16,
    TP_OUT=16, # in depthwise mode, effective TP_OUT=16
    verbose=VERBOSE
):
    """Check the depthwise microcode against the high-level address loop.

    Derives the per-loop strides and their "reset" (rewind) counterparts,
    places them in a register file, loads the microcode from ``code_dw.yml``
    and steps it with ``uloop_execute`` / ``uloop_state_machine``. Whenever the
    state machine reports ``busy == False`` the first four registers are
    compared with the next tuple produced by ``iterate_hl_loop``.

    Returns the number of mismatching comparisons (also printed at the end).
    ``fs`` is accepted but not read by this function.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been collapsed -- confirm against the repository copy.
    """

    # strides applied when the corresponding loop index advances by one
    infeat_hom_iter = h_size_out * w_in_int * k
    infeat_wom_iter = w_size_out * k
    infeat_km_iter = TP_IN

    weights_km_iter = qw * FILTER_SIZE*FILTER_SIZE * 2

    outfeat_hom_iter = h_size_out * w_out_int * k
    outfeat_wom_iter = w_size_out * k
    outfeat_km_iter = TP_OUT

    scale_km_iter = TP_OUT>>2

    print("> Base iter\n\tsubtile_nb_k=%d\n\tsubtile_nb_ho=%d\n\tsubtile_nb_wo=%d\n\th_size_out=%d\n\tw_size_out=%d\n\tinfeat_hom_iter=%x\n\tinfeat_wom_iter=%x\n\tinfeat_km_iter=%x\n\tweights_km_iter=%x\n\toutfeat_hom_iter=%x\n\toutfeat_wom_iter=%x\n\toutfeat_km_iter=%x\n\tscale_km_iter=%x" % (subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter))
    # "reset" values undo (range-1) increments of the matching stride, i.e.
    # they rewind an address to where the loop started
    weights_km_reset_iter = - (subtile_nb_k-1) * weights_km_iter
    infeat_km_reset_iter = - (subtile_nb_k-1) * infeat_km_iter
    infeat_wom_reset_iter = - (subtile_nb_wo-1) * infeat_wom_iter
    outfeat_wom_reset_iter = - (subtile_nb_wo-1) * outfeat_wom_iter
    infeat_hom_reset_iter = - (subtile_nb_ho-1) * infeat_hom_iter
    outfeat_hom_reset_iter = - (subtile_nb_ho-1) * outfeat_hom_iter
    outfeat_km_reset_iter = - (subtile_nb_k-1) * outfeat_km_iter
    print("> Reset iter\n\tweights_km_reset_iter=%x\n\tinfeat_km_reset_iter=%x\n\tinfeat_wom_reset_iter=%x\n\toutfeat_wom_reset_iter=%x\n\tinfeat_hom_reset_iter=%x\n\toutfeat_hom_reset_iter=%x\n\toutfeat_km_reset_iter=%x" % (weights_km_reset_iter, infeat_km_reset_iter, infeat_wom_reset_iter, outfeat_wom_reset_iter, infeat_hom_reset_iter, outfeat_hom_reset_iter, outfeat_km_reset_iter))

    # Register file read and written by the microcode. The trailing comments
    # name the slot each value occupies in the non-depthwise layout; the single
    # depthwise "km" value is stored in both the "kom" and "kim" slots.
    registers = [
        0, # base_addr_W
        0, # base_addr_x
        0, # base_addr_y
        0, # base_addr_s
        weights_km_iter, # weights_kom_iter,
        weights_km_iter, # weights_kim_iter,
        weights_km_reset_iter, # weights_kom_reset_iter,
        weights_km_reset_iter, # weights_kim_reset_iter,
        infeat_km_iter, #infeat_kim_iter,
        infeat_wom_iter,
        infeat_hom_iter,
        infeat_km_reset_iter, # infeat_kim_reset_iter,
        infeat_wom_reset_iter,
        infeat_hom_reset_iter,
        outfeat_wom_iter,
        outfeat_hom_iter,
        outfeat_km_iter, # outfeat_kom_iter,
        outfeat_wom_reset_iter,
        outfeat_hom_reset_iter,
        outfeat_km_reset_iter, # outfeat_kom_reset_iter,
        scale_km_iter, # scale_kom_iter,
        0
    ]

    loops_ops,code,mnem = uloop_load("code_dw.yml")
    # loop ranges are listed innermost first: wo, ho, k, plus a fourth loop
    # with a single iteration
    loops = uloop_get_loops(loops_ops, (subtile_nb_wo, subtile_nb_ho, subtile_nb_k, 1))

    err = 0
    idx = []
    nb_loops=4
    for j in range(nb_loops):
        idx.append(0)
    # state = (address, loop, op, per-loop indices)
    state = (0,0,0,idx)
    busy = False
    execute = True
    # uloop_print_idx(state, registers)
    hidx = 0, 0, 0, 0
    hl_loop = iterate_hl_loop(subtile_nb_k, subtile_nb_ho, subtile_nb_wo, h_size_out, w_size_out, infeat_hom_iter, infeat_wom_iter, infeat_km_iter, weights_km_iter, outfeat_hom_iter, outfeat_wom_iter, outfeat_km_iter, scale_km_iter)
    hW, hX, hY, hS = next(hl_loop)
    # bounded loop: stops early as soon as the state machine signals `end`
    for i in range(0,1000000):
        new_registers = uloop_execute(state, code, registers)
        execute,end,busy,state = uloop_state_machine(loops, state, verbose=verbose, nb_loops=nb_loops)
        # the register update is committed only when the state machine says so
        if execute:
            registers = new_registers
        if not busy:
            try:
                hW, hX, hY, hS = next(hl_loop)
            except StopIteration:
                # reference loop exhausted: keep comparing against the last tuple
                pass
            if verbose:
                uloop_print_idx(state, registers, register_names=('weights', 'infeat', 'outfeat', 'scale'))
            uW, uX, uY, uS = registers[0:4]
            if (hW != uW or hX != uX or hY != uY or hS != uS):
                if verbose:
                    print(" ERROR!!!")
                    print(" High-level: weights:%x infeat:%x outfeat:%x scale:%x" % (hW, hX, hY, hS))
                    print(" uLoop: weights:%x infeat:%x outfeat:%x scale:%x" % (uW, uX, uY, uS))
                err += 1
        if end:
            break

    print(err, " errors", "!!!" if err > 0 else "")
    return err

# runs the check with a fixed configuration whenever this file is executed
uloop_check(
    2, # subtile_nb_k,
    1, # subtile_nb_ho,
    1, # subtile_nb_wo,
    3, # h_size_out,
    3, # w_size_out,
    32, # k,
    5, # w_in_int,
    3, # w_out_int,
    8, # qw,
    verbose = True
)

--------------------------------------------------------------------------------
/ucode/uloop_common.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# uloop_common.sv
# Francesco Conti 
#
# Copyright (C) 2017-2019 ETH Zurich, University of Bologna
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# See LICENSE.sw.txt for details.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
19 | # 20 | 21 | from __future__ import print_function 22 | from bitstring import * 23 | import yaml 24 | 25 | try: 26 | from collections import OrderedDict 27 | except ImportError: 28 | from ordereddict import OrderedDict 29 | 30 | DEFAULT_NB_LOOPS = 4 31 | ULOOP_LEN = 352 # was 176 32 | 33 | def yaml_ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict): 34 | class OrderedLoader(Loader): 35 | pass 36 | def construct_mapping(loader, node): 37 | loader.flatten_mapping(node) 38 | return object_pairs_hook(loader.construct_pairs(node)) 39 | OrderedLoader.add_constructor( 40 | yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, 41 | construct_mapping) 42 | return yaml.load(stream, OrderedLoader) 43 | 44 | def uloop_state_machine(loops, curr_state, verbose=False, nb_loops=DEFAULT_NB_LOOPS): 45 | curr_addr, curr_loop, curr_op, curr_idx = curr_state 46 | next_addr = curr_addr 47 | next_loop = curr_loop 48 | next_op = curr_op 49 | next_idx = curr_idx 50 | end = False 51 | busy = False 52 | execute = False 53 | # if next operation is within the current loop, update address 54 | if curr_idx[curr_loop] < loops[curr_loop]['range'] - 1 and curr_op < loops[curr_loop]['nb_ops'] - 1: 55 | if verbose: 56 | print ("@%d %s UPDATE CURRENT LOOP %d " % (curr_addr, str(curr_state[3][::-1]), curr_loop)) 57 | next_addr = curr_addr + 1 58 | next_op = curr_op + 1 59 | busy = True 60 | execute = True 61 | # if there is a lower level loop, go to it 62 | elif curr_idx[curr_loop] < loops[curr_loop]['range'] - 1 and curr_loop > 0: 63 | if verbose: 64 | print ("@%d %s ITERATE CURRENT LOOP %d & GOTO LOOP 0" % (curr_addr, str(curr_state[3][::-1]), curr_loop)) 65 | next_loop = 0 66 | for j in range(0,curr_loop): 67 | next_idx[j] = 0 68 | next_idx[curr_loop] = curr_idx[curr_loop] + 1 69 | next_addr = loops[0]['uloop_addr'] 70 | next_op = 0 71 | busy = False 72 | execute = True 73 | # if we are still within the current loop range, go back to start loop address 74 | elif 
curr_idx[curr_loop] < loops[curr_loop]['range'] - 1: 75 | if verbose: 76 | print ("@%d %s ITERATE CURRENT LOOP %d " % (curr_addr, str(curr_state[3][::-1]), curr_loop)) 77 | next_addr = loops[curr_loop]['uloop_addr'] 78 | next_op = 0 79 | next_idx[curr_loop] = curr_idx[curr_loop] + 1 80 | busy = False 81 | execute = True 82 | # if not, go to next loop 83 | elif curr_loop < nb_loops-1: 84 | if verbose: 85 | print ("@%d %s GOTO NEXT LOOP %d " % (curr_addr, str(curr_state[3][::-1]), curr_loop+1)) 86 | next_loop = curr_loop + 1 87 | next_addr = loops[curr_loop+1]['uloop_addr'] 88 | next_op = 0 89 | busy = True 90 | execute = False 91 | else: 92 | if verbose: 93 | print ("@%d %s TERMINATION " % (curr_addr, str(curr_state[3][::-1]))) 94 | end = True 95 | next_loop = 0 96 | next_addr = 0 97 | next_op = 0 98 | next_idx = [] 99 | for j in range(nb_loops): 100 | next_idx.append(0) 101 | busy = False 102 | execute = False 103 | next_state = next_addr, next_loop, next_op, next_idx 104 | return execute,end,busy,next_state 105 | 106 | def uloop_execute(state, code, registers): 107 | addr, loop, op, idx = state 108 | new_registers = registers[:] 109 | try: 110 | if code[addr]['op_sel']: 111 | new_registers[code[addr]['a']] = registers[code[addr]['a']] + registers[code[addr]['b']] 112 | else: 113 | new_registers[code[addr]['a']] = registers[code[addr]['b']] 114 | except TypeError: 115 | import pdb; pdb.set_trace() 116 | return new_registers 117 | 118 | def uloop_print_idx(state, registers, compact=False, register_names=None): 119 | if not compact and register_names is None: 120 | print ("r0:%x r1:%x r2:%x r3:%x" % (registers[0], registers[1], registers[2], registers[3])) 121 | elif not compact: 122 | print ("%s:%x %s:%x %s:%x %s:%x" % (register_names[0], registers[0], register_names[1], registers[1], register_names[2], registers[2], register_names[3], registers[3])) 123 | else: 124 | print ("%d,%d,%d,%d" % (registers[0], registers[1], registers[2], registers[3])) 125 | 126 | def 
uloop_bytecode(code, loops_ops): 127 | bytecode = {} 128 | bytecode['code'] = BitArray() 129 | for c in code[::-1]: 130 | if c['op_sel'] == 1: 131 | b = BitArray(uint=1, length=1) 132 | else: 133 | b = BitArray(uint=0, length=1) 134 | a_b = BitArray(uint=c['a'], length=5) 135 | b_b = BitArray(uint=c['b'], length=5) 136 | b.append(a_b) 137 | b.append(b_b) 138 | bytecode['code'].append(b) 139 | if bytecode['code'].length < ULOOP_LEN: 140 | bytecode['code'].prepend(BitArray(uint=0, length=ULOOP_LEN-bytecode['code'].length)) 141 | else: 142 | print("Error!!! ULOOP_LEN=%d is too small for bytecode of %d bits" % (ULOOP_LEN, bytecode['code'].length)) 143 | return None 144 | bytecode['loops'] = BitArray() 145 | a = 0 146 | loops_addr = [] 147 | for o in loops_ops: 148 | loops_addr.append(a) 149 | a += o 150 | for o,a in zip(loops_ops[::-1], loops_addr[::-1]): 151 | a_b = BitArray(uint=a, length=5) 152 | o_b = BitArray(uint=o, length=4) 153 | bytecode['loops'].append(a_b) 154 | bytecode['loops'].append(o_b) 155 | return bytecode 156 | 157 | def uloop_load(name): 158 | with open(name) as f: 159 | code_p = yaml_ordered_load(f, yaml.SafeLoader) 160 | mnem_p = code_p['mnemonics'] 161 | code_p = code_p['code'] 162 | # code_p is a dictionary of loops 163 | code_l = [] 164 | loops_ops = [] 165 | for l in code_p: 166 | code_l.extend(code_p[l]) 167 | loops_ops.append(len(code_p[l])) 168 | code = [] 169 | for c in code_l: 170 | cn = {} 171 | if c['op'] == 'add': 172 | cn['op_sel'] = 1 173 | else: 174 | cn['op_sel'] = 0 175 | try: 176 | cn['a'] = mnem_p[c['a']] 177 | except KeyError: 178 | cn['a'] = c['a'] 179 | try: 180 | cn['b'] = mnem_p[c['b']] 181 | except KeyError: 182 | cn['b'] = c['b'] 183 | code.append(cn) 184 | return loops_ops,code,mnem_p 185 | 186 | def uloop_get_loops(loops_ops, loops_range): 187 | loops = [] 188 | a = 0 189 | for o,r in zip(loops_ops, loops_range): 190 | l = {} 191 | l['nb_ops'] = o 192 | l['range'] = r 193 | l['uloop_addr'] = a 194 | a += o 195 | 
loops.append(l) 196 | return loops 197 | 198 | -------------------------------------------------------------------------------- /ucode/uloop_compile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # uloop_compile.sv 4 | # Francesco Conti 5 | # 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # See LICENSE.sw.txt for details. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | # 20 | 21 | from __future__ import print_function 22 | from uloop_common import * 23 | 24 | loops_ops,code,mnem = uloop_load("code.yml") 25 | 26 | bytecode = uloop_bytecode(code, loops_ops) 27 | print (bytecode['code'].length) 28 | print ("uloop bytecode: %d'h%s" % (bytecode['code'].length, str(bytecode['code'].hex))) 29 | print ("uloop loops: %d'b%s" % (bytecode['loops'].length, str(bytecode['loops'].bin))) 30 | -------------------------------------------------------------------------------- /ucode/uloop_compile_dw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # uloop_compile.sv 4 | # Francesco Conti 5 | # 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # See LICENSE.sw.txt for details. 
10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | # 20 | 21 | from __future__ import print_function 22 | from uloop_common import * 23 | 24 | loops_ops,code,mnem = uloop_load("code_dw.yml") 25 | 26 | bytecode = uloop_bytecode(code, loops_ops) 27 | print (bytecode['code'].length) 28 | print ("uloop bytecode: %d'h%s" % (bytecode['code'].length, str(bytecode['code'].hex))) 29 | print ("uloop loops: %d'b%s" % (bytecode['loops'].length, str(bytecode['loops'].bin))) 30 | -------------------------------------------------------------------------------- /ucode/uloop_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # uloop_run.sv 4 | # Francesco Conti 5 | # 6 | # Copyright (C) 2017-2019 ETH Zurich, University of Bologna 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # See LICENSE.sw.txt for details. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | # 20 | 21 | from __future__ import print_function 22 | from uloop_common import * 23 | import math 24 | 25 | VERBOSE = True 26 | FB = 5 # filter buffer size (FB*FB) 27 | BS = 4 # block size 28 | TP = 32 29 | 30 | fs = 3 31 | oh = 1 32 | ow = 1 33 | ih = (oh - 1) + fs 34 | iw = (ow - 1) + fs 35 | nof = 32 36 | nif = 32 37 | qa = 4 38 | qw = 4 39 | 40 | qa_max = 4 #min(4,qa) 41 | 42 | n_tiles_qa = 1 43 | n_tiles_kin = nif/TP 44 | n_tiles_kout = nof/TP 45 | 46 | 47 | n_tiles_K_in = int(math.ceil(nif/TP)) 48 | n_tiles_K_out = int(math.ceil(nof/TP)) 49 | n_tiles_Hout = int(math.ceil(ih/FB)) 50 | n_tiles_Wout = int(math.ceil(iw/FB)) 51 | n_tiles_qa = int(math.ceil(qa/BS)) 52 | n_xpatches = n_tiles_Hout * n_tiles_Wout # * n_tiles_qa 53 | 54 | print("n_xpatches: ", n_xpatches) 55 | 56 | loops_range = [ 57 | n_tiles_qa, 58 | n_tiles_K_in, 59 | n_tiles_K_out, 60 | n_xpatches 61 | ] 62 | 63 | if fs==3: 64 | stream_size_fs = TP*fs*qw 65 | 66 | else: 67 | stream_size_fs = TP*fs*fs*qw 68 | 69 | registers = [ 70 | 0, 71 | 0, 72 | 0, 73 | 0, 74 | 0, 75 | 0, 76 | nif, 77 | nof, 78 | TP*FB*FB*4, 79 | TP*9, 80 | stream_size_fs, #TP*fs*qw, # or TP*fs*fs*qw 81 | TP*fs*fs*qw+2, 82 | 32*(32+16), 83 | 0 84 | ] 85 | 86 | loops_ops,code,mnem = uloop_load("code.yml") 87 | loops = uloop_get_loops(loops_ops, loops_range) 88 | 89 | idx = [] 90 | for j in range(NB_LOOPS): 91 | idx.append(0) 92 | state = (0,0,0,idx) 93 | busy = False 94 | execute = True 95 | uloop_print_idx(state, registers, compact=True) 96 | nb_iter = 0 97 | for i in range(0,1000000): 98 | new_registers = uloop_execute(state, code, registers) 99 | execute,end,busy,state = uloop_state_machine(loops, state, verbose=VERBOSE) 100 | if execute: 101 | registers = new_registers 102 | if not busy: 103 | nb_iter += 1 104 | uloop_print_idx(state, registers, compact=True) 105 | if end: 106 | break 107 | print("nb_iter=%d" % (nb_iter+1)) 108 | --------------------------------------------------------------------------------