├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── fpga ├── .gitignore ├── Makefile ├── handler_function.cpp ├── handler_function.h ├── kernel │ ├── .gitignore │ ├── dummy_kernel.cpp │ ├── gen_xo.tcl │ ├── package_kernel.tcl │ └── pr.xdc ├── main.cpp ├── ntt_cfg.h ├── u55n.cfg ├── util.cpp ├── util.h ├── xrt.ini └── xsim.tcl ├── layout.png └── rtl ├── axi_hbm_pkg.sv ├── components ├── bin_to_gray.sv ├── cdc_fifo_core.sv ├── cdc_sync.sv ├── fifo.sv ├── fifo_core.sv ├── fifo_ctrl.sv ├── gray_to_bin.sv ├── ram_1w1r_1clk.sv ├── slrx_rx_reg.sv └── slrx_tx_reg.sv ├── config_pkg.sv ├── csr.v ├── dma ├── dma.sv ├── dma_counter.sv ├── point_dma.sv ├── point_dma_r_channel.sv ├── point_dma_w_channel.sv ├── point_from_ntt.sv └── point_to_ntt.sv ├── dsp48e2 ├── butterfly.sv ├── math_pkg.sv ├── modaddsub.sv ├── mul64x64.sv ├── mulred.sv └── red128t64.sv ├── files.f ├── nantucket.v ├── nantucket_sv.sv └── ntt ├── TWIDDLE_ROM_WA0_NLEVEL12.mem ├── TWIDDLE_ROM_WA0_NLEVEL7.mem ├── TWIDDLE_ROM_WA0_NLEVEL9.mem ├── TWIDDLE_ROM_WA1_NLEVEL12.mem ├── TWIDDLE_ROM_WA1_NLEVEL7.mem ├── TWIDDLE_ROM_WA1_NLEVEL9.mem ├── ntt.sv ├── ntt_bitrev.sv ├── ntt_butterfly.sv ├── ntt_cgram.sv ├── ntt_opt_pkg.sv ├── ntt_top.sv └── ntt_twiddle.sv /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all 9 | copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. 18 | -------------------------------------------------------------------------------- /fpga/.gitignore: -------------------------------------------------------------------------------- 1 | _x/ 2 | hw_emu/ 3 | sw_emu/ 4 | hw/ 5 | .ipcache/ 6 | v++_*.log 7 | xcd.log 8 | xrc.log 9 | -------------------------------------------------------------------------------- /fpga/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright Supranational LLC 2 | # Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | # or the MIT license, see LICENSE-MIT, at your option. 4 | # SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | # Example use cases... 7 | # 8 | # Run HW emulation assuming input/expected filenames in main.cpp are valid: 9 | # make run TARGET=hw_emu 10 | # 11 | # Build and run HW target with XRT API assuming input/expected filenames in main.cpp are valid: 12 | # make run 13 | # 14 | # Build and run HW target with OpenCL API assuming input/expected filenames in main.cpp are valid: 15 | # make run_opencl 16 | # 17 | # Build and run with XRT API and specifying input/expected points on the command line: 18 | # make build 19 | # cd hw; ./app.exe <input_points_file> <expected_points_file> [xclbin_file] 20 | 21 | TARGET := hw 22 | PLATFORM := xilinx_u55n_gen3x4_xdma_2_202110_1 23 | BUILD_DIR := $(TARGET) 24 | RTL_DIR := ../rtl 25 | 26 | CXXFLAGS += -I$(XILINX_XRT)/include -I$(XILINX_VIVADO)/include -Wall -O0 -g -std=c++1y -DUSE_XRT=1 27 | LDFLAGS += -L$(XILINX_XRT)/lib -pthread -lOpenCL 28 | 29 | # Host compiler global settings 30 | CXXFLAGS += -fmessage-length=0 31 | LDFLAGS += -lrt -lstdc++ 32 | LDFLAGS += -luuid -lxrt_coreutil 33 | 34 | XRT_CFILES = main.cpp \ 35 | handler_function.cpp 36 | 37 | 
OPENCL_CFILES = $(XRT_CFILES) util.cpp 38 | 39 | CFILES = $(OPENCL_CFILES) ntt_cfg.h util.h handler_function.h 40 | 41 | VFILES = $(addprefix $(RTL_DIR)/,$(shell cat $(RTL_DIR)/files.f)) 42 | TCLARGS = $(subst $() $(),:,$(addprefix ../,$(VFILES))) 43 | 44 | run: build 45 | ifeq ($(TARGET),hw) 46 | cp xrt.ini $(BUILD_DIR) 47 | cd $(BUILD_DIR) && ./app.exe 48 | else 49 | cp xrt.ini $(BUILD_DIR) 50 | cd $(BUILD_DIR) && XCL_EMULATION_MODE=$(TARGET) ./app.exe 51 | endif 52 | 53 | run_opencl: build 54 | ifeq ($(TARGET),hw) 55 | cp xrt.ini $(BUILD_DIR) 56 | cd $(BUILD_DIR) && ./app_opencl.exe 57 | else 58 | cp xrt.ini $(BUILD_DIR) 59 | cd $(BUILD_DIR) && XCL_EMULATION_MODE=$(TARGET) ./app_opencl.exe 60 | endif 61 | 62 | build: $(BUILD_DIR)/app.exe $(BUILD_DIR)/app_opencl.exe $(BUILD_DIR)/emconfig.json $(BUILD_DIR)/nantucket.xclbin 63 | 64 | $(BUILD_DIR)/app.exe: $(XRT_CFILES) $(BUILD_DIR)/nantucket.xclbin 65 | mkdir -p $(BUILD_DIR) 66 | g++ -o $@ $(XRT_CFILES) $(CXXFLAGS) $(LDFLAGS) 67 | 68 | $(BUILD_DIR)/app_opencl.exe: $(OPENCL_CFILES) $(BUILD_DIR)/nantucket.xclbin 69 | mkdir -p $(BUILD_DIR) 70 | g++ -Wall -g -std=c++11 $(OPENCL_CFILES) -o $@ \ 71 | -I. 
-Ikernel \ 72 | -I${XILINX_XRT}/include/ \ 73 | -I/tools/Xilinx/Vitis_HLS/2022.1/include/ \ 74 | -L${XILINX_XRT}/lib/ -lOpenCL -pthread -lrt -lstdc++ 75 | 76 | kernel/nantucket.xo: kernel/gen_xo.tcl kernel/package_kernel.tcl $(CFILES) $(VFILES) 77 | cd $( read_data_from_file(const char *File_Name){ 4 | std::ifstream input(File_Name); 5 | std::string line; 6 | std::vector str_arr; 7 | while (std::getline(input, line)) 8 | { 9 | str_arr.push_back(line); 10 | } 11 | return str_arr; 12 | } 13 | -------------------------------------------------------------------------------- /fpga/handler_function.h: -------------------------------------------------------------------------------- 1 | #ifndef HANDLER_FUNCTION_H 2 | #define HANDLER_FUNCTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | // Reads the text file File_Name and returns its lines, in file order, one string per line. 10 | std::vector<std::string> read_data_from_file(const char *File_Name); 11 | #endif // HANDLER_FUNCTION_H 12 | -------------------------------------------------------------------------------- /fpga/kernel/.gitignore: -------------------------------------------------------------------------------- 1 | packaged_kernel* 2 | tmp_kernel_pack* 3 | vivado*.jou 4 | vivado*.log 5 | nantucket.xo 6 | -------------------------------------------------------------------------------- /fpga/kernel/dummy_kernel.cpp: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | //----------------------------------------------------------------------------- 7 | // kernel: nantucket 8 | // 9 | // Purpose: Since the HLS flow is not used the sw_emu make target has little 10 | // value. This code implements a dummy kernel which allows the 11 | // sw_emu flow to build but it doesn't compute any results. 
12 | //----------------------------------------------------------------------------- 13 | #include 14 | #include "hls_half.h" 15 | #include "ap_axi_sdata.h" 16 | #include "hls_stream.h" 17 | 18 | // Function declaration/Interface pragmas to match RTL Kernel 19 | extern "C" void nantucket ( 20 | unsigned char chicken_bits, 21 | int* axi00_ptr0, 22 | int* axi01_ptr0, 23 | int* axi02_ptr0, 24 | int* axi03_ptr0, 25 | int* axi04_ptr0, 26 | int* axi05_ptr0, 27 | int* axi06_ptr0, 28 | int* axi07_ptr0, 29 | int* axi08_ptr0, 30 | int* axi09_ptr0, 31 | int* axi10_ptr0, 32 | int* axi11_ptr0, 33 | int* axi12_ptr0, 34 | int* axi13_ptr0, 35 | int* axi14_ptr0, 36 | int* axi15_ptr0, 37 | int* axi16_ptr0, 38 | int* axi17_ptr0, 39 | int* axi18_ptr0, 40 | int* axi19_ptr0, 41 | int* axi20_ptr0, 42 | int* axi21_ptr0, 43 | int* axi22_ptr0, 44 | int* axi23_ptr0, 45 | int* axi24_ptr0, 46 | int* axi25_ptr0, 47 | int* axi26_ptr0, 48 | int* axi27_ptr0, 49 | int* axi28_ptr0, 50 | int* axi29_ptr0, 51 | int* axi30_ptr0, 52 | int* axi31_ptr0 53 | ) { 54 | #pragma HLS INTERFACE m_axi port=axi00_ptr0 offset=slave bundle=m00_axi 55 | #pragma HLS INTERFACE m_axi port=axi01_ptr0 offset=slave bundle=m01_axi 56 | #pragma HLS INTERFACE m_axi port=axi02_ptr0 offset=slave bundle=m02_axi 57 | #pragma HLS INTERFACE m_axi port=axi03_ptr0 offset=slave bundle=m03_axi 58 | #pragma HLS INTERFACE m_axi port=axi04_ptr0 offset=slave bundle=m04_axi 59 | #pragma HLS INTERFACE m_axi port=axi05_ptr0 offset=slave bundle=m05_axi 60 | #pragma HLS INTERFACE m_axi port=axi06_ptr0 offset=slave bundle=m06_axi 61 | #pragma HLS INTERFACE m_axi port=axi07_ptr0 offset=slave bundle=m07_axi 62 | #pragma HLS INTERFACE m_axi port=axi08_ptr0 offset=slave bundle=m08_axi 63 | #pragma HLS INTERFACE m_axi port=axi09_ptr0 offset=slave bundle=m09_axi 64 | #pragma HLS INTERFACE m_axi port=axi10_ptr0 offset=slave bundle=m10_axi 65 | #pragma HLS INTERFACE m_axi port=axi11_ptr0 offset=slave bundle=m11_axi 66 | #pragma HLS INTERFACE m_axi 
port=axi12_ptr0 offset=slave bundle=m12_axi 67 | #pragma HLS INTERFACE m_axi port=axi13_ptr0 offset=slave bundle=m13_axi 68 | #pragma HLS INTERFACE m_axi port=axi14_ptr0 offset=slave bundle=m14_axi 69 | #pragma HLS INTERFACE m_axi port=axi15_ptr0 offset=slave bundle=m15_axi 70 | #pragma HLS INTERFACE m_axi port=axi16_ptr0 offset=slave bundle=m16_axi 71 | #pragma HLS INTERFACE m_axi port=axi17_ptr0 offset=slave bundle=m17_axi 72 | #pragma HLS INTERFACE m_axi port=axi18_ptr0 offset=slave bundle=m18_axi 73 | #pragma HLS INTERFACE m_axi port=axi19_ptr0 offset=slave bundle=m19_axi 74 | #pragma HLS INTERFACE m_axi port=axi20_ptr0 offset=slave bundle=m20_axi 75 | #pragma HLS INTERFACE m_axi port=axi21_ptr0 offset=slave bundle=m21_axi 76 | #pragma HLS INTERFACE m_axi port=axi22_ptr0 offset=slave bundle=m22_axi 77 | #pragma HLS INTERFACE m_axi port=axi23_ptr0 offset=slave bundle=m23_axi 78 | #pragma HLS INTERFACE m_axi port=axi24_ptr0 offset=slave bundle=m24_axi 79 | #pragma HLS INTERFACE m_axi port=axi25_ptr0 offset=slave bundle=m25_axi 80 | #pragma HLS INTERFACE m_axi port=axi26_ptr0 offset=slave bundle=m26_axi 81 | #pragma HLS INTERFACE m_axi port=axi27_ptr0 offset=slave bundle=m27_axi 82 | #pragma HLS INTERFACE m_axi port=axi28_ptr0 offset=slave bundle=m28_axi 83 | #pragma HLS INTERFACE m_axi port=axi29_ptr0 offset=slave bundle=m29_axi 84 | #pragma HLS INTERFACE m_axi port=axi30_ptr0 offset=slave bundle=m30_axi 85 | #pragma HLS INTERFACE m_axi port=axi31_ptr0 offset=slave bundle=m31_axi 86 | #pragma HLS INTERFACE s_axilite port=chicken_bits bundle=control 87 | #pragma HLS INTERFACE s_axilite port=axi00_ptr0 bundle=control 88 | #pragma HLS INTERFACE s_axilite port=axi01_ptr0 bundle=control 89 | #pragma HLS INTERFACE s_axilite port=axi02_ptr0 bundle=control 90 | #pragma HLS INTERFACE s_axilite port=axi03_ptr0 bundle=control 91 | #pragma HLS INTERFACE s_axilite port=axi04_ptr0 bundle=control 92 | #pragma HLS INTERFACE s_axilite port=axi05_ptr0 bundle=control 93 | #pragma 
HLS INTERFACE s_axilite port=axi06_ptr0 bundle=control 94 | #pragma HLS INTERFACE s_axilite port=axi07_ptr0 bundle=control 95 | #pragma HLS INTERFACE s_axilite port=axi08_ptr0 bundle=control 96 | #pragma HLS INTERFACE s_axilite port=axi09_ptr0 bundle=control 97 | #pragma HLS INTERFACE s_axilite port=axi10_ptr0 bundle=control 98 | #pragma HLS INTERFACE s_axilite port=axi11_ptr0 bundle=control 99 | #pragma HLS INTERFACE s_axilite port=axi12_ptr0 bundle=control 100 | #pragma HLS INTERFACE s_axilite port=axi13_ptr0 bundle=control 101 | #pragma HLS INTERFACE s_axilite port=axi14_ptr0 bundle=control 102 | #pragma HLS INTERFACE s_axilite port=axi15_ptr0 bundle=control 103 | #pragma HLS INTERFACE s_axilite port=axi16_ptr0 bundle=control 104 | #pragma HLS INTERFACE s_axilite port=axi17_ptr0 bundle=control 105 | #pragma HLS INTERFACE s_axilite port=axi18_ptr0 bundle=control 106 | #pragma HLS INTERFACE s_axilite port=axi19_ptr0 bundle=control 107 | #pragma HLS INTERFACE s_axilite port=axi20_ptr0 bundle=control 108 | #pragma HLS INTERFACE s_axilite port=axi21_ptr0 bundle=control 109 | #pragma HLS INTERFACE s_axilite port=axi22_ptr0 bundle=control 110 | #pragma HLS INTERFACE s_axilite port=axi23_ptr0 bundle=control 111 | #pragma HLS INTERFACE s_axilite port=axi24_ptr0 bundle=control 112 | #pragma HLS INTERFACE s_axilite port=axi25_ptr0 bundle=control 113 | #pragma HLS INTERFACE s_axilite port=axi26_ptr0 bundle=control 114 | #pragma HLS INTERFACE s_axilite port=axi27_ptr0 bundle=control 115 | #pragma HLS INTERFACE s_axilite port=axi28_ptr0 bundle=control 116 | #pragma HLS INTERFACE s_axilite port=axi29_ptr0 bundle=control 117 | #pragma HLS INTERFACE s_axilite port=axi30_ptr0 bundle=control 118 | #pragma HLS INTERFACE s_axilite port=axi31_ptr0 bundle=control 119 | #pragma HLS INTERFACE s_axilite port=return bundle=control 120 | #pragma HLS INTERFACE ap_ctrl_hs port=return 121 | 122 | unsigned long *base [32]; 123 | 124 | base[0] = (unsigned long *) axi00_ptr0; 125 | base[1] = 
(unsigned long *) axi01_ptr0; 126 | base[2] = (unsigned long *) axi02_ptr0; 127 | base[3] = (unsigned long *) axi03_ptr0; 128 | base[4] = (unsigned long *) axi04_ptr0; 129 | base[5] = (unsigned long *) axi05_ptr0; 130 | base[6] = (unsigned long *) axi06_ptr0; 131 | base[7] = (unsigned long *) axi07_ptr0; 132 | base[8] = (unsigned long *) axi08_ptr0; 133 | base[9] = (unsigned long *) axi09_ptr0; 134 | base[10] = (unsigned long *) axi10_ptr0; 135 | base[11] = (unsigned long *) axi11_ptr0; 136 | base[12] = (unsigned long *) axi12_ptr0; 137 | base[13] = (unsigned long *) axi13_ptr0; 138 | base[14] = (unsigned long *) axi14_ptr0; 139 | base[15] = (unsigned long *) axi15_ptr0; 140 | base[16] = (unsigned long *) axi16_ptr0; 141 | base[17] = (unsigned long *) axi17_ptr0; 142 | base[18] = (unsigned long *) axi18_ptr0; 143 | base[19] = (unsigned long *) axi19_ptr0; 144 | base[20] = (unsigned long *) axi20_ptr0; 145 | base[21] = (unsigned long *) axi21_ptr0; 146 | base[22] = (unsigned long *) axi22_ptr0; 147 | base[23] = (unsigned long *) axi23_ptr0; 148 | base[24] = (unsigned long *) axi24_ptr0; 149 | base[25] = (unsigned long *) axi25_ptr0; 150 | base[26] = (unsigned long *) axi26_ptr0; 151 | base[27] = (unsigned long *) axi27_ptr0; 152 | base[28] = (unsigned long *) axi28_ptr0; 153 | base[29] = (unsigned long *) axi29_ptr0; 154 | base[30] = (unsigned long *) axi30_ptr0; 155 | base[31] = (unsigned long *) axi31_ptr0; 156 | } 157 | -------------------------------------------------------------------------------- /fpga/kernel/gen_xo.tcl: -------------------------------------------------------------------------------- 1 | set xoname nantucket.xo 2 | set krnl_name nantucket 3 | set device xilinx_u55n_gen3x4_xdma_2_202110_1 4 | 5 | set suffix "${krnl_name}_${device}" 6 | set path_to_packaged "./packaged_kernel_${suffix}" 7 | set path_to_tmp_project "./tmp_kernel_pack_${suffix}" 8 | 9 | create_project -force kernel_pack $path_to_tmp_project 10 | # Exactly one script argument is expected: the colon separated list of source files to add. 11 | if { $argc != 1 } { 12 | puts 
"The script requires a colon separated file list." 13 | puts "Please try again."; exit 1 14 | } else { 15 | set vfiles [split [lindex $argv 0] ":"] 16 | } 17 | # Past this point vfiles is always defined; a wrong argument count exited above. 18 | puts $vfiles 19 | add_files $vfiles \ 20 | ../../rtl/ntt/TWIDDLE_ROM_WA0_NLEVEL7.mem \ 21 | ../../rtl/ntt/TWIDDLE_ROM_WA1_NLEVEL7.mem \ 22 | ../../rtl/ntt/TWIDDLE_ROM_WA0_NLEVEL9.mem \ 23 | ../../rtl/ntt/TWIDDLE_ROM_WA1_NLEVEL9.mem \ 24 | ../../rtl/ntt/TWIDDLE_ROM_WA0_NLEVEL12.mem \ 25 | ../../rtl/ntt/TWIDDLE_ROM_WA1_NLEVEL12.mem 26 | 27 | 28 | update_compile_order -fileset sources_1 29 | update_compile_order -fileset sim_1 30 | 31 | ipx::package_project -root_dir $path_to_packaged -vendor supranational.net -library RTLKernel -taxonomy /KernelIP -import_files -set_current true 32 | 33 | 34 | # Kernel specific setup output from GUI flow. 35 | source package_kernel.tcl 36 | edit_core [ipx::current_core] 37 | 38 | set_property ipi_drc {ignore_freq_hz false} [ipx::current_core] 39 | set_property vitis_drc {ctrl_protocol ap_ctrl_hs} [ipx::current_core] 40 | set_property supported_families { } [ipx::current_core] 41 | set_property auto_family_support_level level_2 [ipx::current_core] 42 | 43 | # Packaging Vivado IP 44 | ipx::update_source_project_archive -component [ipx::current_core] 45 | ipx::save_core [ipx::current_core] 46 | 47 | package_xo -force -xo_path ${xoname} -kernel_name ${krnl_name} -ip_directory ${path_to_packaged} -kernel_files dummy_kernel.cpp 48 | -------------------------------------------------------------------------------- /fpga/main.cpp: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | #include "handler_function.h" 7 | /* 8 | 9 | Add your Header Files and definitions 10 | 11 | 12 | */ 13 | 14 | #include "ntt_cfg.h" 15 | #include "util.h" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #if (USE_XRT==1) 25 | #include "experimental/xrt_bo.h" 26 | #include "experimental/xrt_device.h" 27 | #include "experimental/xrt_kernel.h" 28 | #endif 29 | // Prints msg, then blocks until the user presses ENTER; the rest of that input line is discarded. 30 | void wait_for_enter(const std::string &msg) { 31 | std::cout << msg << std::endl; 32 | std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n'); 33 | } 34 | 35 | int main(int argc, char** argv) { 36 | 37 | // ---------------------------------------------------------------------- 38 | // Enhanced template to take filenames from main arguments. 39 | 40 | std::string input_filename = "./ntt2p18_random_input.txt"; 41 | std::string expect_filename = "./ntt2p18_random_output.txt"; 42 | std::string xclbin_filename = "nantucket.xclbin"; 43 | 44 | // If three arguments, assume these are the input and expected points. 45 | if (argc >= 3) { 46 | input_filename = argv[1]; 47 | expect_filename = argv[2]; 48 | } 49 | // If four arguments, assume the fourth is the xclbin filename. 
50 | if (argc == 4) { 51 | xclbin_filename = argv[3]; 52 | } 53 | 54 | // ---------------------------------------------------------------------- 55 | 56 | std::ofstream outputFile_Handler; // output file handler 57 | outputFile_Handler.open("ntt_generated_output.txt"); // open file for writing 58 | std::vector input_pts; //pointer to receive the hex data from file reading function 59 | std::vector expect_pts; //pointer to receive the hex data from file reading function 60 | // set precision for timer e.g pico seconds 61 | std::cout << std::fixed << std::setprecision(12) << std::left; 62 | //define kernel time parameter 63 | std::chrono::duration kernel_time(0); 64 | // Read data from the file Note: the data will be read from file as string convert it accordingly 65 | input_pts = read_data_from_file(input_filename.c_str()); // input test vectors 66 | expect_pts = read_data_from_file(expect_filename.c_str()); // expected output vectors for comparison 67 | 68 | /* 69 | 70 | Convert String data to int/long/unsigned long accordingly and store it in CPU memory and HBM 71 | The input data should be used as input to the NTT and the expected output data should be 72 | used for comparison and checking the correctness of the NTT core 73 | */ 74 | 75 | // Make sure the input and expected data sizes are legal. 76 | assert(input_pts.size() != 0); 77 | assert(input_pts.size() == expect_pts.size()); 78 | assert(input_pts.size() == (1<<18) || input_pts.size() == (1<<24)); 79 | 80 | const unsigned long Nmax = 1<<24; 81 | const unsigned long Npts = input_pts.size(); 82 | const unsigned int Nhop = Nmax / Npts; 83 | const unsigned char chicken_bits = 0; 84 | 85 | std::cout << "N................. " << Npts << std::endl; 86 | std::cout << "Input points...... " << input_filename << std::endl; 87 | std::cout << "Expected points... " << expect_filename << std::endl; 88 | std::cout << "xclbin............ 
" << xclbin_filename << std::endl; 89 | 90 | using namespace ntt_cfg; 91 | 92 | #if (USE_XRT==1) 93 | 94 | std::string binaryFile = xclbin_filename; 95 | int device_index = 0; 96 | std::cout << "Open the device " << device_index << std::endl; 97 | auto device = xrt::device(device_index); 98 | std::cout << "Load the xclbin " << binaryFile << std::endl; 99 | auto uuid = device.load_xclbin(binaryFile); 100 | auto krnl = xrt::kernel(device, uuid, "nantucket"); 101 | 102 | #else 103 | 104 | // Not using HLS so we need to load the FPGA programming file into the device. 105 | // Also initialize OpenCL environment. 106 | cl_int err; 107 | unsigned fileBufSize; 108 | std::vector devices = get_xilinx_devices(); 109 | devices.resize(1); // If multiple devices, choose the first. 110 | cl::Device device = devices[0]; 111 | cl::Context context(device, NULL, NULL, NULL, &err); 112 | char *fileBuf = read_binary_file(xclbin_filename, fileBufSize); 113 | cl::Program::Binaries bins{{fileBuf, fileBufSize}}; 114 | cl::Program program(context, devices, bins, NULL, &err); 115 | cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | 116 | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); 117 | cl::Kernel krnl_ntt(program, "nantucket", &err); 118 | 119 | // Tell the kernel if this is a 2**18 or 2**24 sized problem. 120 | krnl_ntt.setArg(0, chicken_bits); 121 | 122 | #endif 123 | 124 | // Create the buffers and allocate memory on the CPU side. 125 | const unsigned int point_size = sizeof(unsigned long); 126 | const unsigned int bytes_per_hbm_channel = (point_size * Nmax) / NLANE; 127 | 128 | #if (USE_XRT==1) 129 | 130 | std::cout << "Doing xrt::bo() ..." << std::endl; 131 | std::vector pt_buf; 132 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 133 | pt_buf.push_back(xrt::bo(device, bytes_per_hbm_channel, krnl.group_id(ch+1))); 134 | } 135 | 136 | std::cout << "Doing xrt::bo::map() ..." 
<< std::endl; 137 | unsigned long *p_pt_buf[2*NLANE]; 138 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 139 | p_pt_buf[ch] = pt_buf[ch].map(); 140 | } 141 | 142 | #else 143 | 144 | std::vector pt_buf; 145 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 146 | pt_buf.push_back(cl::Buffer(context, CL_MEM_READ_WRITE, bytes_per_hbm_channel, NULL, &err)); 147 | } 148 | 149 | // Tell XRT which buffers will be used by which kernels so that allocations can 150 | // be performed in the correct HBM 151 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 152 | krnl_ntt.setArg(ch+1, pt_buf[ch]); 153 | } 154 | 155 | // Schedule transfer of inputs to device memory, execution of kernel, 156 | // and transfer of outputs back to host memory 157 | cl::vector pt_buf_vec; 158 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 159 | pt_buf_vec.push_back(pt_buf[ch]); 160 | } 161 | 162 | // Map host-side buffer memory to user-space pointers 163 | unsigned long *p_pt_buf[2*NLANE]; 164 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 165 | p_pt_buf[ch] = (unsigned long *)q.enqueueMapBuffer(pt_buf[ch], CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, bytes_per_hbm_channel); 166 | } 167 | 168 | #endif 169 | 170 | // Initialize the vectors used in the test 171 | for (unsigned int p = 0; p < Nmax; p++) { 172 | 173 | const unsigned int ch = get_channel_from_point(p)*2; // *2 to map to HBM channels 174 | const unsigned int idx = get_index_from_point(p, 0); 175 | unsigned long val = 0; 176 | 177 | if (p < Npts) { 178 | // Convert from hex string to unsigned long. 179 | std::stringstream str(input_pts[p]); 180 | str >> std::hex >> val; 181 | } 182 | 183 | p_pt_buf[ch+0][idx] = val; 184 | p_pt_buf[ch+1][idx] = 0; 185 | } 186 | 187 | /* 188 | 189 | Code body 190 | Insert your Initialization(e.g copying data to HBM etc.) 
host code before executing the kernel 191 | */ 192 | 193 | #if (USE_XRT==1) 194 | 195 | // Synchronize buffer content with device side 196 | std::cout << "synchronize input buffer data to device global memory\n"; 197 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 198 | pt_buf[ch].sync(XCL_BO_SYNC_BO_TO_DEVICE); 199 | } 200 | 201 | #else 202 | 203 | // Send the input points from CPU memory to HBM memory. 204 | q.enqueueMigrateMemObjects(pt_buf_vec, 0 /* 0 means from host*/); 205 | q.finish(); 206 | 207 | #endif 208 | 209 | //Time Measurement Starts Here 210 | auto kernel_start = std::chrono::high_resolution_clock::now(); // note start time 211 | /* 212 | 213 | Execute your kernel here 214 | Note: Please also add the pre-processing(operations performed on input data) functions here if any 215 | 216 | */ 217 | 218 | //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger..."); 219 | 220 | #if (USE_XRT==1) 221 | 222 | std::cout << "Execution of the kernel\n"; 223 | auto run = xrt::run(krnl); 224 | run.set_arg(0,chicken_bits); 225 | for (unsigned int ch = 0*NLANE; ch < 2*NLANE; ch++) { 226 | run.set_arg(ch+1,pt_buf[ch]); 227 | } 228 | run.start(); 229 | run.wait(); 230 | 231 | #else 232 | 233 | q.enqueueTask(krnl_ntt); // Execute the kernel. 
234 | q.finish(); 235 | 236 | #endif 237 | 238 | auto kernel_end = std::chrono::high_resolution_clock::now(); // note end time 239 | kernel_time = std::chrono::duration(kernel_end - kernel_start); // calculate the difference 240 | std::cout << "Kernel time: "; 241 | std::cout << kernel_time.count()*1000 << " ms" << std::endl; // print the time in milliseconds 242 | 243 | /* 244 | 245 | Comparison 246 | Compare the generated results with the expected data 247 | Also please save the generated results in file 248 | 249 | outputFile_Handler << std::hex <> std::hex >> x_exp; 282 | 283 | if (x_act != x_exp) { 284 | fprintf(stdout, "Error: @(%06x), exp: %016lx, act: %016lx, ch: %2d, idx: %06x\n", 285 | p/Nhop, x_exp, x_act, ch, idx); 286 | match = false; 287 | /* 288 | } else { 289 | fprintf(stdout, "Match: @(%06x), exp: %016lx, act: %016lx, ch: %2d, idx: %06x\n", 290 | p/Nhop, x_exp, x_act, ch, idx); 291 | */ 292 | } 293 | } 294 | } 295 | 296 | /* 297 | // Check first pass intermediate results. 298 | const unsigned long M = 0xffffffff00000001; 299 | std::cout << "Checking computed results againt expected ..." << std::endl; 300 | for (unsigned int p = 0; p < Nmax; p++) { 301 | // Use Nhop to support NTT sizes smaller than 2**24. 302 | if ((p % Nhop) == 0) { 303 | const unsigned int ch = get_channel_from_point(p)*2+1; 304 | const unsigned int idx = get_index_from_point(p, 0); 305 | const unsigned long x_act = p_pt_buf[ch][idx]; 306 | outputFile_Handler << std::hex << x_act << std::endl; 307 | 308 | // Convert from hex string to unsigned long. 
309 | unsigned long x_exp; 310 | std::stringstream str(expect_pts[p/Nhop]); 311 | str >> std::hex >> x_exp; 312 | 313 | if ((x_act % M) != x_exp) { 314 | fprintf(stdout, "Error: @(%06x), exp: %016lx, act: %016lx, ch: %2d, idx: %06x\n", 315 | p/Nhop, x_exp, (x_act % M), ch, idx); 316 | match = false; 317 | } else { 318 | fprintf(stdout, "Match: @(%06x), exp: %016lx, act: %016lx, ch: %2d, idx: %06x\n", 319 | p/Nhop, x_exp, (x_act % M), ch, idx); 320 | } 321 | } 322 | } 323 | */ 324 | 325 | std::cout << "TEST " << (match ? "PASSED" : "FAILED") << std::endl; 326 | return (match ? EXIT_SUCCESS : EXIT_FAILURE); 327 | 328 | outputFile_Handler.close(); 329 | 330 | #if (USE_XRT==1) 331 | #else 332 | delete[] fileBuf; 333 | #endif 334 | } 335 | -------------------------------------------------------------------------------- /fpga/ntt_cfg.h: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
// SPDX-License-Identifier: Apache-2.0 OR MIT

#pragma once

// Host-side helpers that map an NTT point number to an HBM channel and to a
// word index inside that channel's buffer.
//
// Every function is declared `inline` because this header *defines* them:
// without `inline`, including the header from more than one translation unit
// produces multiple-definition errors at link time.
namespace ntt_cfg
{
    // Number of lanes. get_channel_from_point() returns values in [0, NLANE).
    const unsigned int NLANE = 16;

    // Returns the channel number for point `p`.
    //
    // The upper part of the result, (p[..:1] + p[..:13]) mod NLANE/2, selects
    // a channel pair; the low bit, p[23] ^ p[11], selects one of the pair.
    inline unsigned int get_channel_from_point(unsigned long p) {

        unsigned int ch;

        ch  = (p >>  1) % (NLANE/2);
        ch += (p >> 13) % (NLANE/2);
        ch %= NLANE/2;

        ch *= 2;
        ch += ((p >> 23) ^ (p >> 11)) & 1;

        return ch;
    }

    // Converts a word index into a byte address (8-byte words).
    inline unsigned int idx_to_addr(unsigned int idx) {
        return idx << 3; // log2 size of unsigned long
    }

    // Converts a byte address back into a word index (8-byte words).
    inline unsigned int addr_to_idx(unsigned int addr) {
        return addr >> 3; // log2 size of unsigned long
    }

    // Permutes a 23-bit byte address:
    //   result[22:14] = addr[22:14]
    //   result[13:11] = addr[17:15] ^ addr[13:11]
    //   result[10:0]  = addr[10:0]
    // Bits of `addr` above bit 22 are discarded.
    // NOTE(review): the name suggests this spreads accesses across HBM
    // banks; the bank geometry itself is not visible here - confirm.
    inline unsigned int bank_optimize_addr(unsigned int addr) {

        unsigned int result;

        result   = ((addr >> 14) & ((1 << 9)-1));
        result <<= 3;
        result  |= ((addr >> 15) & ((1 << 3)-1)) ^
                   ((addr >> 11) & ((1 << 3)-1));
        result <<= 11;
        result  |= ((addr >>  0) & ((1 << 11)-1));

        return result;
    }

    // Returns the word index of point `p` inside its channel buffer.
    //
    // A 20-bit index is assembled from bit fields of `p`, converted to a byte
    // address, permuted by bank_optimize_addr() and converted back.
    //
    //   last == 0: { p[17:16], p[11:4], p[22:18], p[15:12], p[0]  }
    //   last != 0: { p[17:16], p[11:4], p[22:18], p[3:0],   p[12] }
    inline unsigned int get_index_from_point(unsigned long p, unsigned int last) {

        unsigned int idx;

        // The upper 15 bits are identical for both layouts.
        idx   = ((p >> 16) & ((1 << 2)-1)); // 17 16
        idx <<= 8;
        idx  |= ((p >>  4) & ((1 << 8)-1)); // 11 10 9 8 7 6 5 4
        idx <<= 5;
        idx  |= ((p >> 18) & ((1 << 5)-1)); // 22 21 20 19 18
        idx <<= 4;

        // Only the low 5 bits depend on `last`.
        if (last) {
            idx  |= ((p >>  0) & ((1 << 4)-1)); // 3 2 1 0
            idx <<= 1;
            idx  |= ((p >> 12) & ((1 << 1)-1)); // 12
        } else {
            idx  |= ((p >> 12) & ((1 << 4)-1)); // 15 14 13 12
            idx <<= 1;
            idx  |= ((p >>  0) & ((1 << 1)-1)); // 0
        }

        return addr_to_idx(bank_optimize_addr(idx_to_addr(idx)));
    }
}

-------------------------------------------------------------------------------- /fpga/u55n.cfg: -------------------------------------------------------------------------------- 1 | debug=1 2 | save-temps=1 3 | 4 | [connectivity] 5 | nk=nantucket:1 6 | sp=nantucket_1.m00_axi:HBM[0] 7 | sp=nantucket_1.m01_axi:HBM[1] 8 | sp=nantucket_1.m02_axi:HBM[2] 9 | sp=nantucket_1.m03_axi:HBM[3] 10 | sp=nantucket_1.m04_axi:HBM[4] 11 | sp=nantucket_1.m05_axi:HBM[5] 12 | sp=nantucket_1.m06_axi:HBM[6] 13 | sp=nantucket_1.m07_axi:HBM[7] 14 | sp=nantucket_1.m08_axi:HBM[8] 15 | sp=nantucket_1.m09_axi:HBM[9] 16 | sp=nantucket_1.m10_axi:HBM[10] 17 | sp=nantucket_1.m11_axi:HBM[11] 18 | sp=nantucket_1.m12_axi:HBM[12] 19 | sp=nantucket_1.m13_axi:HBM[13] 20 | sp=nantucket_1.m14_axi:HBM[14] 21 | sp=nantucket_1.m15_axi:HBM[15] 22 | sp=nantucket_1.m16_axi:HBM[16] 23 | sp=nantucket_1.m17_axi:HBM[17] 24 | sp=nantucket_1.m18_axi:HBM[18] 25 | sp=nantucket_1.m19_axi:HBM[19] 26 | sp=nantucket_1.m20_axi:HBM[20] 27 | sp=nantucket_1.m21_axi:HBM[21] 28 | sp=nantucket_1.m22_axi:HBM[22] 29 | sp=nantucket_1.m23_axi:HBM[23] 30 | sp=nantucket_1.m24_axi:HBM[24] 31 | sp=nantucket_1.m25_axi:HBM[25] 32 | sp=nantucket_1.m26_axi:HBM[26] 33 | sp=nantucket_1.m27_axi:HBM[27] 34 | sp=nantucket_1.m28_axi:HBM[28] 35 | sp=nantucket_1.m29_axi:HBM[29] 36 | sp=nantucket_1.m30_axi:HBM[30] 37 | sp=nantucket_1.m31_axi:HBM[31] 38 | -------------------------------------------------------------------------------- /fpga/util.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 Xilinx, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Code to program the FPGA with the xclbin file. 18 | 19 | #include 20 | #include 21 | #include 22 | #include "util.h" 23 | 24 | // ------------------------------------------------------------------------------------ 25 | // Utility functions 26 | // ------------------------------------------------------------------------------------ 27 | std::vector get_xilinx_devices() 28 | { 29 | size_t i; 30 | cl_int err; 31 | std::vector platforms; 32 | err = cl::Platform::get(&platforms); 33 | cl::Platform platform; 34 | for (i = 0; i < platforms.size(); i++) 35 | { 36 | platform = platforms[i]; 37 | std::string platformName = platform.getInfo(&err); 38 | if (platformName == "Xilinx") 39 | { 40 | std::cout << "INFO: Found Xilinx Platform" << std::endl; 41 | break; 42 | } 43 | } 44 | if (i == platforms.size()) 45 | { 46 | std::cout << "ERROR: Failed to find Xilinx platform" << std::endl; 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | //Getting ACCELERATOR Devices and selecting 1st such device 51 | std::vector devices; 52 | err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices); 53 | return devices; 54 | } 55 | 56 | char *read_binary_file(const std::string &xclbin_file_name, unsigned &nb) 57 | { 58 | if (access(xclbin_file_name.c_str(), R_OK) != 0) 59 | { 60 | printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); 61 | exit(EXIT_FAILURE); 62 | } 63 | //Loading XCL Bin into char buffer 64 | std::cout << "INFO: Loading '" << xclbin_file_name << "'\n"; 65 | std::ifstream 
bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 66 | bin_file.seekg(0, bin_file.end); 67 | nb = bin_file.tellg(); 68 | bin_file.seekg(0, bin_file.beg); 69 | char *buf = new char[nb]; 70 | bin_file.read(buf, nb); 71 | return buf; 72 | } 73 | -------------------------------------------------------------------------------- /fpga/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 Xilinx, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 20 | #define CL_HPP_TARGET_OPENCL_VERSION 120 21 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 22 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 23 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 24 | 25 | #include 26 | #include 27 | 28 | std::vector get_xilinx_devices(); 29 | char *read_binary_file(const std::string &xclbin_file_name, unsigned &nb); 30 | -------------------------------------------------------------------------------- /fpga/xrt.ini: -------------------------------------------------------------------------------- 1 | [Debug] 2 | opencl_trace=true 3 | 4 | [Emulation] 5 | debug_mode=batch 6 | #user_pre_sim_script=xsim.tcl 7 | -------------------------------------------------------------------------------- /fpga/xsim.tcl: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Xilinx, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
#

log_wave -r *
run all
exit

--------------------------------------------------------------------------------
/layout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/supranational/zprize-fpga-ntt/62eee3c12adbea3bfa57ef312658eeda7adf6153/layout.png

--------------------------------------------------------------------------------
/rtl/axi_hbm_pkg.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Shared constants and types for the AXI ports that connect to HBM.
package axi_hbm_pkg;

  // Width of one AXI data beat, in bytes.
  localparam int DATA_WIDTH_IN_BYTES = 32;

  // Number of HBM pseudo channels.
  localparam int N_HBM_PC = 32;

  // AXI field types.
  typedef logic [32:0] addr_t;
  typedef logic [5:0]  wid_t;
  typedef logic [5:0]  rid_t;
  typedef logic [3:0]  len_t;

  // Data beat, and one bit per data byte.
  typedef logic [DATA_WIDTH_IN_BYTES*8-1:0] data_t;
  typedef logic [DATA_WIDTH_IN_BYTES-1:0]   strb_t;

  // Size of one pseudo channel: 2**28 bytes = 256 MB.
  localparam addr_t HBM_PC_SIZE_IN_BYTES = addr_t'(1) << 28;

  // Returns one parity bit per byte of `data`: bit b of the result is the
  // XOR reduction of data[8*b +: 8].
  function automatic strb_t parity(input data_t data);
    strb_t per_byte;
    foreach (per_byte[b]) begin
      per_byte[b] = ^data[b*8 +: 8];
    end
    return per_byte;
  endfunction

endpackage

--------------------------------------------------------------------------------
/rtl/components/bin_to_gray.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Binary to gray code encoder
//
// Purely combinational: each output bit is the XOR of the corresponding
// input bit and the next more significant input bit; the MSB passes through.
//
// Parameters:
//   WIDTH - number of bits in the code word.
// Ports:
//   i - binary value.
//   o - gray-coded value.

module bin_to_gray #(parameter int WIDTH = 2)
  (
   input  logic [WIDTH-1:0] i,
   output logic [WIDTH-1:0] o
   );

  // o = i ^ (i >> 1)
  assign o = i ^ {1'b0, i[WIDTH-1:1]};

endmodule

--------------------------------------------------------------------------------
/rtl/components/cdc_fifo_core.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Clock domain crossing FIFO core data storage
//
// Coded to infer FPGA block RAMs automatically.
//
// Note: Flops adhering to the cdc_*_q naming convention
// signal the input is from a different clock domain than
// the flop's clock.
//
// This module is storage only: a memory written on wclk_i and a registered
// read port on rclk_i. Address generation and full/empty tracking are not
// implemented here and must be supplied by the instantiating logic.
//
// Parameters:
//   DATA_t     - type of one stored entry.
//   DEPTH      - number of entries.
//   RESET      - 0: no reset on memory or read register.
//                1: memory and read register are cleared on reset.
//   SYNC_RESET - only used when RESET is 1. 1: synchronous reset,
//                0: asynchronous reset.
//   ADDR_t     - address type, defaults to $clog2(DEPTH) bits.
//
// Ports:
//   wclk_i / wrst_ni - write clock and active-low write-side reset.
//   we_i, waddr_i, wdata_i - write enable, address and data.
//   rclk_i / rrst_ni - read clock and active-low read-side reset.
//   re_i, raddr_i    - read enable and address.
//   rdata_o          - read data, registered on rclk_i; it holds its value
//                      while re_i is low.
//
// NOTE(review): the RESET=1 variants clear the whole array in one cycle,
// which probably prevents block RAM inference - confirm with the target tool.

module cdc_fifo_core
  #(
    parameter type DATA_t     = logic,
    parameter int  DEPTH      = 2,
    parameter bit  RESET      = 0,
    parameter bit  SYNC_RESET = 0,
    parameter type ADDR_t     = logic [$clog2(DEPTH)-1:0]
    )
  (
   input  logic  wclk_i,
   input  logic  wrst_ni,
   input  logic  we_i,
   input  DATA_t wdata_i,
   input  ADDR_t waddr_i,
   input  logic  rclk_i,
   input  logic  rrst_ni,
   input  logic  re_i,
   input  ADDR_t raddr_i,
   output DATA_t rdata_o
   );

  DATA_t mem_q [DEPTH];
  DATA_t cdc_rdata_q; // Special CDC flop naming cdc_*_q

  if (!RESET) begin : gen_no_reset

    // Write port (write clock domain).
    always_ff @(posedge wclk_i) begin
      if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Read port (read clock domain); samples memory written on wclk_i.
    always_ff @(posedge rclk_i) begin
      if (re_i) begin
        cdc_rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_no_reset
  else if (SYNC_RESET) begin : gen_sync_reset

    // Write port with synchronous clear of every entry.
    always_ff @(posedge wclk_i) begin
      if (!wrst_ni) begin
        mem_q <= '{default: DATA_t'('0)};
      end else if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Read port with synchronous clear of the read register.
    always_ff @(posedge rclk_i) begin
      if (!rrst_ni) begin
        cdc_rdata_q <= DATA_t'('0);
      end else if (re_i) begin
        cdc_rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_sync_reset
  else begin : gen_async_reset

    // Write port with asynchronous clear of every entry.
    always_ff @(posedge wclk_i or negedge wrst_ni) begin
      if (!wrst_ni) begin
        mem_q <= '{default: DATA_t'('0)};
      end else if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Read port with asynchronous clear of the read register.
    always_ff @(posedge rclk_i or negedge rrst_ni) begin
      if (!rrst_ni) begin
        cdc_rdata_q <= DATA_t'('0);
      end else if (re_i) begin
        cdc_rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_async_reset

  assign rdata_o = cdc_rdata_q;

endmodule

--------------------------------------------------------------------------------
/rtl/components/cdc_sync.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Copyright Supranational LLC
//
// Universal synchronizer
//
// Note: Flops adhering to the cdc_*_q naming convention
// signal the input is from a different clock domain than
// the flop's clock.
//
// Parameters:
//   WIDTH       - number of bits synchronized in parallel.
//   RESET       - 1: flops have an asynchronous, active-low reset.
//                 0: flops have no reset.
//   RANK        - number of flops in series between i and o (2 to 4).
//   RAND        - simulation only: 1 replaces input bits that have just
//                 toggled with random values before the first flop.
//   RESET_VALUE - value loaded into every stage on reset.
//
// Ports:
//   clk_i  - destination clock.
//   rst_ni - active-low reset (used only when RESET is 1).
//   i      - input from another clock domain.
//   o      - i after RANK flops clocked by clk_i.

module cdc_sync
  #(
    parameter int             WIDTH       = 1,
    parameter bit             RESET       = 1,
    parameter int             RANK        = 2,
    parameter bit             RAND        = 1,
    parameter bit [WIDTH-1:0] RESET_VALUE = '0
    )
  (
   input  logic             clk_i,
   input  logic             rst_ni,//NTH = 1'b1, // Default input might not work in some tools
   input  logic [WIDTH-1:0] i,
   output logic [WIDTH-1:0] o
   );

`ifndef SYNTHESIS
  // Elaboration-time sanity check on the synchronizer depth.
  assert #0 ((RANK >= 2) && (RANK <= 4)) else
    $fatal(1, "Rank must be between 2 and 4");
`endif

  // Give these flops a special instance name
  // using the "CDC" prefix so we can easily
  // script CDC reports in various EDA tools.
  logic [WIDTH-1:0] cdc_flop_q, cdc_flop_d;

`ifdef SYNTHESIS

  // For synthesis the first flop samples the input directly.
  assign cdc_flop_d = i;

`else

  // ----------------------------------------------------------------------
  // Model synchronizer randomness for simulation

  if (RAND) begin : gen_rand_true

    // Synchronizer randomness enabled.
    logic [WIDTH-1:0] mask, ambiguous_bits, i_prev;

    // Runs on every clock edge and on any change of rst_ni or i. The
    // blocking assignments update i_prev immediately, so `mask` is non-zero
    // only for bits that differ from the value seen at the previous event.
    // NOTE(review): $urandom_range((2**WIDTH)-1, 0) takes 32-bit arguments,
    // so WIDTH >= 32 will not randomize every bit as intended - confirm.
    always @(posedge clk_i or rst_ni or i) begin
      // Identify the ambiguous bits.
      mask = (i ^ i_prev);

      // Remember value for the next time.
      i_prev = i;

      // Randomize the ambiguous bits.
      ambiguous_bits = $urandom_range((2**WIDTH)-1, 0);

      // Combine the known bits and ambiguous bits.
      cdc_flop_d <= (i & ~mask) | (ambiguous_bits & mask);
    end

  end else begin : gen_rand_false

    // Synchronizer randomness disabled.
    assign cdc_flop_d = i;

  end

`endif

  // Synchronizer flops
  // Stage 0 of sync_q is an alias of cdc_flop_q; stages 1..RANK-1 are the
  // remaining flops of the chain.
  logic [RANK-1:0][WIDTH-1:0] sync_q;
  assign sync_q[0] = cdc_flop_q;

  if (RESET) begin : gen_reset_true

    // Reset required for flop, async-reset assumed.
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        cdc_flop_q       <= RESET_VALUE;
        sync_q[RANK-1:1] <= '{default: RESET_VALUE};
      end
      else begin
        // Shift the chain by one stage.
        cdc_flop_q       <= cdc_flop_d;
        sync_q[RANK-1:1] <= sync_q[RANK-2:0];
      end
    end

  end else begin : gen_reset_false

    // No reset required for flops.
    always_ff @(posedge clk_i) begin
      cdc_flop_q       <= cdc_flop_d;
      sync_q[RANK-1:1] <= sync_q[RANK-2:0];
    end

  end

  // The last stage drives the output.
  assign o = sync_q[RANK-1];

endmodule

--------------------------------------------------------------------------------
/rtl/components/fifo.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Single-clock FIFO with valid/ready handshakes on both sides
//
// The implementation is selected by DEPTH:
//   0       - combinational pass-through, no storage.
//   1       - a single register stage.
//   default - fifo_ctrl (pointers, handshakes) plus fifo_core (storage).
//
// Parameters:
//   DATA_t     - type of one entry.
//   DEPTH      - see above.
//   RESET      - 1: data storage is reset; 0: it is not.
//   SYNC_RESET - 1: synchronous reset, 0: asynchronous reset.
//   ADDR_t     - address type used when DEPTH >= 2.
//
// Ports:
//   clk_i, rst_ni                - clock and active-low reset.
//   wdata_i, wvalid_i, wready_o  - write side; a word is accepted when
//                                  wvalid_i and wready_o are both high.
//   rdata_o, rvalid_o, rready_i  - read side; a word is consumed when
//                                  rvalid_o and rready_i are both high.
//
// NOTE(review): with DEPTH == 1 and RESET == 0 the rvalid_o flop has no
// reset, so it starts undefined in simulation - confirm this is intended.

module fifo
  #(
    parameter type DATA_t     = logic,
    parameter int  DEPTH      = 2,
    parameter bit  RESET      = 0,
    parameter bit  SYNC_RESET = 0,
    parameter type ADDR_t     = logic [$clog2(DEPTH)-1:0]
    )
  (
   input  logic  clk_i,
   input  logic  rst_ni,
   input  DATA_t wdata_i,
   input  logic  wvalid_i,
   output logic  wready_o,
   output DATA_t rdata_o,
   output logic  rvalid_o,
   input  logic  rready_i
   );

  case (DEPTH)
    0: begin : gen_depth_0

      // Wire the write side straight through to the read side.
      always_comb begin
        rdata_o  = wdata_i;
        rvalid_o = wvalid_i;
        wready_o = rready_i;
      end

    end
    1: begin : gen_depth_1

      DATA_t rdata_d;
      logic  rvalid_d;

      always_comb begin
        // Can accept a word when empty, or when the held word is being
        // consumed in this same cycle.
        wready_o = !rvalid_o || rready_i;

        rdata_d  = rdata_o;
        rvalid_d = rvalid_o;

        if (wvalid_i && wready_o) begin
          // Load (or replace) the held word.
          rdata_d  = wdata_i;
          rvalid_d = 1'b1;
        end else if (rvalid_o && rready_i) begin
          // Word consumed and nothing written: become empty.
          rvalid_d = 1'b0;
        end
      end

      if (!RESET) begin : gen_no_reset
        always_ff @(posedge clk_i) begin
          rdata_o  <= rdata_d;
          rvalid_o <= rvalid_d;
        end
      end : gen_no_reset

      else if (SYNC_RESET) begin : gen_sync_reset
        always_ff @(posedge clk_i) begin
          if (!rst_ni) begin
            rdata_o  <= DATA_t'('0);
            rvalid_o <= 1'b0;
          end else begin
            rdata_o  <= rdata_d;
            rvalid_o <= rvalid_d;
          end
        end
      end : gen_sync_reset

      else begin : gen_async_reset
        always_ff @(posedge clk_i or negedge rst_ni) begin
          if (!rst_ni) begin
            rdata_o  <= DATA_t'('0);
            rvalid_o <= 1'b0;
          end else begin
            rdata_o  <= rdata_d;
            rvalid_o <= rvalid_d;
          end
        end
      end : gen_async_reset

    end
    default: begin : gen_depth_n

      ADDR_t waddr, raddr;
      logic  we, re;
      DATA_t rdata;

      // Pointer and handshake logic. RESET is not forwarded: fifo_ctrl has
      // no such parameter and always resets its state.
      fifo_ctrl
        #(
          .DEPTH(DEPTH),
          .SYNC_RESET(SYNC_RESET)
          ) _ctrl
          (
           .clk_i,
           .rst_ni,
           .wvalid_i,
           .wready_o,
           .waddr_o(waddr),
           .we_o(we),
           .rvalid_o,
           .rready_i,
           .raddr_o(raddr),
           .re_o(re)
           );

      // Data storage with a registered read port.
      fifo_core
        #(
          .DATA_t(DATA_t),
          .DEPTH(DEPTH),
          .RESET(RESET),
          .SYNC_RESET(SYNC_RESET)
          ) _core
          (
           .clk_i,
           .rst_ni,
           .we_i(we),
           .waddr_i(waddr),
           .wdata_i,
           .re_i(re),
           .raddr_i(raddr),
           .rdata_o(rdata)
           );

      // Drive zeros while no valid word is presented.
      assign rdata_o = rvalid_o ? rdata : '0;

    end
  endcase

endmodule

--------------------------------------------------------------------------------
/rtl/components/fifo_core.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// FIFO core data storage
//
// Coded to infer FPGA block RAMs automatically.
//
// This module is storage only: a single-clock memory with one write port
// and one registered read port. Address generation and full/empty tracking
// are not implemented here.
//
// Parameters:
//   DATA_t     - type of one stored entry.
//   DEPTH      - number of entries.
//   RESET      - 0: no reset on memory or read register.
//                1: memory and read register are cleared on reset.
//   SYNC_RESET - only used when RESET is 1. 1: synchronous reset,
//                0: asynchronous reset.
//   ADDR_t     - address type, defaults to $clog2(DEPTH) bits.
//
// Ports:
//   clk_i, rst_ni          - clock and active-low reset.
//   we_i, waddr_i, wdata_i - write enable, address and data.
//   re_i, raddr_i          - read enable and address.
//   rdata_o                - read data, registered; it holds its value
//                            while re_i is low.
//
// NOTE(review): the RESET=1 variants clear the whole array in one cycle,
// which probably prevents block RAM inference - confirm with the target tool.

module fifo_core
  #(
    parameter type DATA_t     = logic,
    parameter int  DEPTH      = 2,
    parameter bit  RESET      = 0,
    parameter bit  SYNC_RESET = 0,
    parameter type ADDR_t     = logic [$clog2(DEPTH)-1:0]
    )
  (
   input  logic  clk_i,
   input  logic  rst_ni,
   input  logic  we_i,
   input  DATA_t wdata_i,
   input  ADDR_t waddr_i,
   input  logic  re_i,
   input  ADDR_t raddr_i,
   output DATA_t rdata_o
   );

  DATA_t mem_q [DEPTH];
  DATA_t rdata_q; // Registered read data (same clock domain as the writes)

  if (!RESET) begin : gen_no_reset

    // Write port.
    always_ff @(posedge clk_i) begin
      if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Registered read port.
    always_ff @(posedge clk_i) begin
      if (re_i) begin
        rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_no_reset
  else if (SYNC_RESET) begin : gen_sync_reset

    // Write port with synchronous clear of every entry.
    always_ff @(posedge clk_i) begin
      if (!rst_ni) begin
        mem_q <= '{default: DATA_t'('0)};
      end else if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Read port with synchronous clear of the read register.
    always_ff @(posedge clk_i) begin
      if (!rst_ni) begin
        rdata_q <= DATA_t'('0);
      end else if (re_i) begin
        rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_sync_reset
  else begin : gen_async_reset

    // Write port with asynchronous clear of every entry.
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        mem_q <= '{default: DATA_t'('0)};
      end else if (we_i) begin
        mem_q[waddr_i] <= wdata_i;
      end
    end

    // Read port with asynchronous clear of the read register.
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        rdata_q <= DATA_t'('0);
      end else if (re_i) begin
        rdata_q <= mem_q[raddr_i];
      end
    end

  end : gen_async_reset

  assign rdata_o = rdata_q;

endmodule

--------------------------------------------------------------------------------
/rtl/components/fifo_ctrl.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// FIFO control logic
//
// Generates the write/read addresses and enables for a separate storage
// array, plus the valid/ready handshakes on both sides.
//
// The read side works one word ahead: raddr_o always carries the *next*
// read address, and a read is issued as soon as the storage is non-empty
// and no word is currently presented, so that rvalid_o can be held high
// with the word already fetched.
//
// Parameters:
//   DEPTH      - number of storage entries (must be >= 2).
//   ADJ_LSB    - 0: exact full/empty tracking. Otherwise the low ADJ_LSB
//                pointer bits are ignored by the empty comparison and the
//                usable depth is reduced by 2**ADJ_LSB.
//   SYNC_RESET - 1: synchronous reset, 0: asynchronous reset.
//   ADDR_t     - storage address type ($clog2(DEPTH) bits).
//   PTR_t      - pointer type, one bit wider than ADDR_t.
//
// Ports:
//   clk_i, rst_ni      - clock and active-low reset.
//   wvalid_i, wready_o - write handshake.
//   waddr_o, we_o      - write address and enable for the storage.
//   rvalid_o, rready_i - read handshake.
//   raddr_o, re_o      - read address and enable for the storage.

module fifo_ctrl
  #(
    parameter int  DEPTH      = 2,
    parameter int  ADJ_LSB    = 0,
    parameter bit  SYNC_RESET = 0,
    parameter type ADDR_t     = logic [$clog2(DEPTH)-1:0],
    parameter type PTR_t      = logic [$clog2(DEPTH):0]
    )
  (
   input  logic  clk_i,
   input  logic  rst_ni,
   input  logic  wvalid_i,
   output logic  wready_o,
   output ADDR_t waddr_o,
   output logic  we_o,
   output logic  rvalid_o,
   input  logic  rready_i,
   output ADDR_t raddr_o,
   output logic  re_o
   );

`ifndef SYNTHESIS
  // Elaboration-time sanity check on the depth.
  assert #0 (DEPTH >=2)
    else $fatal(1, "FIFO must have a depth of 2 or more");
`endif

  // Write side state: storage address (wraps at DEPTH) and a free-running
  // pointer used only for occupancy arithmetic.
  ADDR_t waddr_q, waddr_d;
  ADDR_t waddr_wrap;
  PTR_t  wptr_q, wptr_d;
  PTR_t  wptr_plus1;

  // Read side state, mirroring the write side, plus the output-valid flag.
  logic  rempty;
  logic  rvalid_q, rvalid_d;
  ADDR_t raddr_q, raddr_d;
  ADDR_t raddr_wrap;
  PTR_t  rptr_q, rptr_d;
  PTR_t  rptr_plus1;

  // Number of words written but not yet read out of the storage.
  PTR_t  wptr_minus_rptr;

  // Occupancy at which the write side reports full.
  localparam int DEPTH_ADJ  = ADJ_LSB == 0 ? 0 : 2**ADJ_LSB;
  localparam int SAFE_DEPTH = DEPTH - DEPTH_ADJ;

  // ----------------------------------------------------------------------
  // Write side

  assign wptr_plus1 = wptr_q + 1'b1;

  always_comb begin
    wptr_minus_rptr = wptr_q - rptr_q;

    // Wrap or increment to advance.
    if (waddr_q == (DEPTH-1)) begin
      waddr_wrap = '0;
    end else begin
      waddr_wrap = waddr_q + 1'b1;
    end

    // Not ready for writes if full.
    wready_o = wptr_minus_rptr != PTR_t'(SAFE_DEPTH);

    we_o    = wvalid_i && wready_o;
    waddr_o = waddr_q;

    // Push the FIFO.
    if (wvalid_i && wready_o) begin
      wptr_d  = wptr_plus1;
      waddr_d = waddr_wrap;
    end else begin
      wptr_d  = wptr_q;
      waddr_d = waddr_q;
    end
  end

  // ----------------------------------------------------------------------
  // Read side

  always_comb begin
    rvalid_o   = rvalid_q;
    // Empty when the pointers match, ignoring the low ADJ_LSB bits.
    rempty     = rptr_q[$clog2(DEPTH):ADJ_LSB] ==
                 wptr_q[$clog2(DEPTH):ADJ_LSB];
    rptr_plus1 = rptr_q + 1'b1;

    // Wrap or increment to advance.
    if (raddr_q == (DEPTH-1)) begin
      raddr_wrap = '0;
    end else begin
      raddr_wrap = raddr_q + 1'b1;
    end

    // The storage is always addressed with the next entry; raddr_q resets
    // to DEPTH-1 so that the first address presented is 0.
    raddr_o = raddr_wrap;

    if (!rvalid_q && !rempty) begin
      // Prime the read when exiting empty state.
      re_o     = 1'b1;
      raddr_d  = raddr_wrap;
      rptr_d   = rptr_plus1;
      rvalid_d = 1'b1;
    end else if (rvalid_q && rready_i && rempty) begin
      // Don't read past empty.
      re_o     = 1'b0;
      raddr_d  = raddr_q;
      rptr_d   = rptr_q;
      rvalid_d = 1'b0;
    end else if (rvalid_q && rready_i) begin
      // Normal FIFO read advance.
      re_o     = 1'b1;
      raddr_d  = raddr_wrap;
      rptr_d   = rptr_plus1;
      rvalid_d = rvalid_q;
    end else begin
      // Hold state.
      re_o     = 1'b0;
      raddr_d  = raddr_q;
      rptr_d   = rptr_q;
      rvalid_d = rvalid_q;
    end
  end

  // ----------------------------------------------------------------------
  // Flops.

  if (SYNC_RESET) begin : gen_sync_reset
    always_ff @(posedge clk_i) begin
      if (!rst_ni) begin
        rptr_q   <= '0;
        raddr_q  <= DEPTH-1;
        rvalid_q <= '0;
        wptr_q   <= '0;
        waddr_q  <= '0;
      end else begin
        rptr_q   <= rptr_d;
        raddr_q  <= raddr_d;
        rvalid_q <= rvalid_d;
        wptr_q   <= wptr_d;
        waddr_q  <= waddr_d;
      end
    end
  end : gen_sync_reset

  else begin : gen_async_reset
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        rptr_q   <= '0;
        raddr_q  <= DEPTH-1;
        rvalid_q <= '0;
        wptr_q   <= '0;
        waddr_q  <= '0;
      end else begin
        rptr_q   <= rptr_d;
        raddr_q  <= raddr_d;
        rvalid_q <= rvalid_d;
        wptr_q   <= wptr_d;
        waddr_q  <= waddr_d;
      end
    end
  end : gen_async_reset

  // ----------------------------------------------------------------------
  // Assertions.
  //
  // NOTE(review): both properties follow directly from how re_o and we_o
  // are computed above (re_o is only set in branches where rempty is 0,
  // and we_o is wvalid_i && wready_o), so they guard against future edits
  // rather than against misuse by the surrounding logic.

`ifndef SYNTHESIS
  ASSERT_no_underflow:
    assert property (@(posedge clk_i) disable iff (!rst_ni)
      re_o === 1 |-> rempty === 0
    ) else $error("FIFO underflow.");

  ASSERT_no_overflow:
    assert property (@(posedge clk_i) disable iff (!rst_ni)
      we_o === 1 |-> wready_o === 1
    ) else $fatal(1, "FIFO overflow.");
`endif

endmodule

--------------------------------------------------------------------------------
/rtl/components/gray_to_bin.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Gray code to binary decoder
//
// Output bit o[w] is the XOR reduction of input bits i[WIDTH-1:w].

module gray_to_bin #(parameter int WIDTH = 2)
  (
   input  logic [WIDTH-1:0] i,
   output logic [WIDTH-1:0] o
   );

  always_comb begin
    for (int w = 0; w < WIDTH; w++) begin
      // The right shift zero-fills, so only i[WIDTH-1:w] contribute.
      o[w] = ^(i >> w);
    end
  end

endmodule

--------------------------------------------------------------------------------
/rtl/components/ram_1w1r_1clk.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

module ram_1w1r_1clk
  #(
    parameter ADDR_WIDTH = 8,
    // NOTE(review): the text between "(1<" and "> $clog2(NBPP));" on the next
    // line appears to have been lost (the remainder of this module and the
    // start of the package holding the functions below are missing). The
    // fragment is kept exactly as found; restore it from the original files.
    parameter WORDS = (1<> $clog2(NBPP));
    return idx;
  endfunction

  // Remaps an address by XORing bits [13:11] with bits [17:15]; bits [22:14]
  // and [10:0] pass through unchanged. Applying it twice restores bits [22:0].
  // NOTE(review): the concatenation is 23 bits wide, so any addr_t bits above
  // bit 22 come back as zero -- confirm that callers expect this.
  function automatic addr_t bank_optimize_addr(input addr_t addr);
    addr = {addr[22:14],
            addr[17:15] ^ addr[13:11],
            addr[10:0]};
    return addr;
  endfunction

  // Maps a point number to a channel number.
  // The upper part is ((p >> 1) + (p >> 13)) mod NLANE/2, doubled; the LSB is
  // bit 23 of p XOR bit 11 of p.
  function automatic int get_channel_from_point;
    input int p;

    int ch;

    ch = (p >> 1) % (NLANE/2);
    ch += (p >> 13) % (NLANE/2);
    ch %= NLANE/2;

    ch *= 2;
    ch += ((p >> 23) ^ (p >> 11)) & 1;

    return ch;
  endfunction

  // Maps a point number to an index by permuting bit fields of p, then passing
  // the result through idx_to_addr, bank_optimize_addr and addr_to_idx.
  // The two branches differ only in the low five bits: {p[3:0], p[12]} when
  // `last` is set, {p[15:12], p[0]} otherwise.
  // Asserts that the returned index is below N/NLANE.
  function automatic int get_index_from_point;
    input int p;
    input bit last;

    bit_point_t p_bits;
    int idx;
    addr_t addr;

    p_bits = p;

    if (last) begin
      idx = {p_bits[17:16],
             p_bits[11:4],
             p_bits[22:18],
             p_bits[3:0],
             p_bits[12]};
    end else begin
      idx = {p_bits[17:16],
             p_bits[11:4],
             p_bits[22:18],
             p_bits[15:12],
             p_bits[0]};
    end

    addr = idx_to_addr(idx);
    addr = bank_optimize_addr(addr);
    idx = addr_to_idx(addr);

    assert(idx < N/NLANE);
    return idx;
  endfunction

  // Builds a memory index from an address by concatenating the address bits
  // above $clog2(HBM_PC_SIZE_IN_BYTES) with a (K - $clog2(NLANE))-bit field
  // starting at bit $clog2(NBPP).
  // NOTE(review): the "tb_" prefix suggests testbench use -- confirm.
  function automatic mem_index_t tb_mem_index;
    input addr_t addr;

    mem_index_t result;
    // Should clip off unneeded upper bits.
    result = { addr[$bits(addr_t)-1:$clog2(HBM_PC_SIZE_IN_BYTES)+1],//+1 for read vs. write channel
               addr[$clog2(NBPP)+:(K-$clog2(NLANE))] };

    return result;
  endfunction

  // Converts a natural-order point number p to its position in NTT order for
  // the given pass, where nhw is the NTT size used to split p into fields.
  // Only pass values 0 and 1 assign the result; for any other value zz_ijkl
  // is returned without having been assigned.
  function automatic int ntt_order_from_natural;
    input int p;
    input int nhw;
    input int pass;

    int zz_i, zz_j, zz_k, zz_l;
    int zz_ijk;
    int zz_ijkl;
    int nlevel, nlevel0;

    nlevel = $clog2(nhw);
    nlevel0 = $clog2(nhw)-7;

    zz_i = p / 2 / NLANE % (nhw/2) / (1 << nlevel0);
    zz_j = p / 2 / NLANE % (nhw/2) % (1 << nlevel0);

    zz_k = p % 2;
    zz_l = p / 2 % NLANE + p / nhw / NLANE * NLANE;

    zz_ijk = zz_k * (nhw/2) + zz_j * (1<<(nlevel-nlevel0-1)) + zz_i;

    case (pass)
      0: zz_ijkl = zz_ijk * nhw + zz_l;
      1: zz_ijkl = zz_ijk + zz_l * nhw;
    endcase

    return zz_ijkl;
  endfunction

  // ----------------------------------------------------------------------
  // Helper functions.

  // Builds the byte address for the burst that starts at beat_id.
  // The low $clog2(DMA_LEN) + $clog2(DATA_WIDTH_IN_BYTES) bits are always
  // zero. Above them sit permuted beat_id fields; the permutation used when
  // {pass_id, is_rd} == 2'b11 differs from the other three cases and also
  // folds in lane_id[0]. The result is passed through bank_optimize_addr.
  function automatic addr_t get_addr_from_beat;
    input beat_id_t beat_id;
    input lane_id_t lane_id;
    input pass_id_t pass_id;
    input logic is_rd;

    addr_t addr;

    case ({pass_id, is_rd})
      2'b01,
      2'b00,
      2'b10: begin
        addr = addr_t'({
          beat_id[$clog2(DMA_LEN*GROUP)+0 +: 2],
          beat_id[$clog2(DMA_LEN*GROUP)+2 +: 8],
          beat_id[$clog2(DMA_LEN) +: $clog2(GROUP)],
          $clog2(DMA_LEN)'(0),
          $clog2(DATA_WIDTH_IN_BYTES)'(0)
        });
      end
      2'b11: begin
        addr = addr_t'({
          beat_id[$clog2(DMA_LEN*GROUP)+2 +: 2],
          beat_id[$clog2(DMA_LEN*GROUP)+9] ^ lane_id[0],
          beat_id[$clog2(DMA_LEN) +: $clog2(GROUP)],
          beat_id[$clog2(DMA_LEN*GROUP)+0 +: 2],
          beat_id[$clog2(DMA_LEN*GROUP)+4 +: 5],
          $clog2(DMA_LEN)'(0),
          $clog2(DATA_WIDTH_IN_BYTES)'(0)
        });
      end
    endcase

    return bank_optimize_addr(addr);
  endfunction

  // Returns the shift amount for a cycle: the $bits(shift_t)-wide field of
  // cycle_id starting at bit 6 when is_rd is set, otherwise zero minus that
  // field (its two's complement within shift_t).
  function automatic shift_t get_shift_from_cycle;
    input cycle_id_t cycle_id;
    input logic is_rd;

    shift_t shift;

    if (is_rd) begin
      shift = cycle_id[6 +: $bits(shift_t)];
    end else begin
      shift = $bits(shift_t)'('0) - cycle_id[6 +: $bits(shift_t)];
    end

    return shift;
  endfunction

  // Returns bit $clog2(N_BEATS)-1 of beat_id as the swap flag.
  function automatic logic get_swap_from_beat;
    input beat_id_t beat_id;

    logic swap;

    swap = beat_id[$clog2(N_BEATS)-1];

    return swap;
  endfunction

  // Returns bit $clog2(N_CYCLES)-1 of cycle_id as the swap flag.
  function automatic logic get_swap_from_cycle;
    input cycle_id_t cycle_id;

    logic swap;

    swap = cycle_id[$clog2(N_CYCLES)-1];

    return swap;
  endfunction

  // Builds a 2-bit selector from `pair` and bit $clog2(GROUP) of cycle_id.
  // In pass 0 `pair` is bit 0 and the cycle bit is bit 1; in pass 1 the two
  // positions are exchanged.
  function automatic ppch_id_t get_ppch_from_cycle;
    input cycle_id_t cycle_id;
    input logic pair;
    input pass_id_t pass_id;

    ppch_id_t ppch_id;

    case (pass_id)
      1'b0: begin
        ppch_id[0] = pair;
        ppch_id[1] = cycle_id[$clog2(GROUP)];
      end
      1'b1: begin
        ppch_id[0] = cycle_id[$clog2(GROUP)];
        ppch_id[1] = pair;
      end
    endcase

    return ppch_id;
  endfunction

  // Builds the fine FIFO address for a cycle:
  // {upper cycle_id bits, cycle_id[0 +: $clog2(GROUP)], lsbs}.
  // In pass 1, lsbs is replaced by (lane_id >> 1) - lsbs, truncated to
  // $clog2(DMA_LEN) bits.
  function automatic fine_t get_fine_from_cycle;
    input cycle_id_t cycle_id;
    input lane_id_t lane_id;
    input pass_id_t pass_id;

    logic [$clog2(DMA_LEN)-1:0] lsbs;
    fine_t addr;

    // Grab DMA_LEN's worth of lsbs and do the transform below
    // to grab the appropriate data when reading FIFO for
    // DMA to NTT and writing for NTT to DMA. Only for
    // second pass.
    // CH 0,1: adjust factor = 7
    // Original: 07654321
    // Adjusted: 76543210
    // Flipped: 01234567
    // CH 2,3: adjust factor = 6
    // Original: 10765432
    // Adjusted: 76543210
    // Flipped: 01234567
    // ...
    // CH 14,15: adjust factor = 0
    // Original: 76543210
    // Adjusted: 76543210
    // Flipped: 01234567
    //
    // ((NCH//2)-1)-(x+((NCH//2)-(ch//2)-1))
    // ((ch//2)-x) % (NCH//2)
    lsbs = cycle_id[$clog2(GROUP*N_CYCLES/N_BEATS)+:$clog2(DMA_LEN)];

    if (pass_id == 1'b1) begin
      lsbs = $clog2(DMA_LEN)'((lane_id>>1)-lsbs);
    end

    // Fine FIFO address = {group, burst}
    addr = fine_t'({cycle_id[$bits(cycle_id_t)-1:$clog2(DMA_LEN*GROUP*N_CYCLES/N_BEATS)],
                    cycle_id[0+:$clog2(GROUP)],
                    lsbs
                    });

    return addr;
  endfunction

endpackage

--------------------------------------------------------------------------------
/rtl/dma/dma_counter.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Wrapping step counter with a toggling "done" flag.
//
// Parameters:
//   MAX     - sizes the default COUNT_t ($clog2(MAX) bits).
//   START   - value loaded on reset, on restart_i, and after the wrap point.
//   STEP    - increment applied on each step_i.
//   FLAG    - when non-zero, flag_o is high while the low $clog2(FLAG+1)
//             bits of the count equal FLAG; when zero, flag_o is tied low.
//   COUNT_t - counter type.
//
// Ports:
//   restart_i - reloads START and clears done_o (takes priority over step_i).
//   step_i    - advances the counter by STEP.
//   wrap_i    - wrap point; its low $clog2(STEP) bits are ignored.
//   done_o    - toggles each time the counter steps from the wrap point.
//   count_o   - current count.
//   flag_o    - see FLAG.
//
// Reset is synchronous and active low.
module dma_counter
  #(
    parameter int  MAX     = 2,
    parameter int  START   = 0,
    parameter int  STEP    = 1,
    parameter int  FLAG    = 0,
    parameter type COUNT_t = logic [$clog2(MAX)-1:0]
    )
  (
   input  logic   clk_i,
   input  logic   rst_ni,
   input  logic   restart_i,
   input  logic   step_i,
   output logic   done_o,
   output COUNT_t count_o,
   output logic   flag_o,
   input  COUNT_t wrap_i
   );

  // Index of the top bit compared against FLAG.
  localparam int CMP_MSP = FLAG == 0 ? 0 : $clog2(FLAG+1)-1;

  COUNT_t counter_q, counter_d;
  COUNT_t mask;
  logic   done_d;
  logic   last;

  // Wrap detection: compare against wrap_i with its low $clog2(STEP) bits
  // cleared.
  always_comb begin
    mask = '1;
    mask <<= $clog2(STEP);
    last = counter_q == (wrap_i & mask);
  end

  // Next state: hold by default, then apply restart (highest priority) or a
  // step.
  always_comb begin
    counter_d = counter_q;
    done_d    = done_o;

    if (restart_i) begin
      counter_d = START;
      done_d    = '0;
    end else if (step_i) begin
      if (last) begin
        counter_d = START;
        done_d    = !done_o;
      end else begin
        counter_d = counter_q + STEP;
      end
    end
  end

  // State registers. They only update on a restart or a step.
  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      counter_q <= START;
      done_o    <= '0;
    end else if (restart_i || step_i) begin
      counter_q <= counter_d;
      done_o    <= done_d;
    end
  end

  // Outputs.
  always_comb begin
    count_o = counter_q;
    flag_o  = (FLAG != 0) && (counter_q[CMP_MSP:0] == FLAG);
  end

endmodule


--------------------------------------------------------------------------------
/rtl/dma/point_dma_r_channel.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// AXI read channel of the point DMA.
//
// Issues fixed-length (DMA_LEN beat) read bursts on one of two AXI read
// ports -- m0 during pass 0, m1 during pass 1 -- and writes every returned
// beat to the downstream FIFO through wdata_o / we_o / waddr_o.
// Flow control is coarse: the write position is exported Gray-coded on
// wcoarse_o, the reader's position arrives Gray-coded on rcoarse_i, and
// rready is dropped while the two differ by FIFO_COARSE_DEPTH.
module point_dma_r_channel
  import config_pkg::*;
  #(
    parameter int ID = 0,
    parameter int C_M_AXI_ADDR_WIDTH = 64,
    parameter int C_M_AXI_DATA_WIDTH = 256
    )
  (
   input logic clk_i,
   input logic rst_ni,
   input logic start_i,
   output logic done_o,
   output logic hold_o,
   input logic release_i,

   output point_t [NPPCH-1:0] wdata_o,
   output logic [NPPCH-1:0] we_o,
   output fine_t [NPPCH-1:0] waddr_o,
   output coarse_t wcoarse_o,
   input coarse_t rcoarse_i,

   output beat_id_t dbg_rbeat,
   output logic dbg_rstep,

   // Tool generated ports.
   output logic m0_axi_arvalid,
   input logic m0_axi_arready,
   output logic [C_M_AXI_ADDR_WIDTH-1:0] m0_axi_araddr,
   output logic [8-1:0] m0_axi_arlen,
   input logic m0_axi_rvalid,
   output logic m0_axi_rready,
   input logic [C_M_AXI_DATA_WIDTH-1:0] m0_axi_rdata,
   input logic m0_axi_rlast,

   output logic m1_axi_arvalid,
   input logic m1_axi_arready,
   output logic [C_M_AXI_ADDR_WIDTH-1:0] m1_axi_araddr,
   output logic [8-1:0] m1_axi_arlen,
   input logic m1_axi_rvalid,
   output logic m1_axi_rready,
   input logic [C_M_AXI_DATA_WIDTH-1:0] m1_axi_rdata,
   input logic m1_axi_rlast,

   input logic [C_M_AXI_ADDR_WIDTH-1:0] ctrl_addr_offset0,
   input logic [C_M_AXI_ADDR_WIDTH-1:0] ctrl_addr_offset1
   );

  import axi_hbm_pkg::*;

  localparam int LEN = DMA_LEN;
  // Lane number derived from this channel's ID.
  localparam int LANE = ID / (N_HBM_PC / NLANE);

  typedef enum logic [1:0] {IDLE, BUSY, PASS, DONE} state_e;

  state_e state_q, state_d;
  pass_id_t pass_q, pass_d;
  beat_id_t arbeat_q;   // Beat at which the next AR burst starts.
  beat_id_t rbeat_q;    // Number of R beats received so far.
  beat_id_t wrap;

  logic restart;
  logic utilized;

  coarse_t volume;
  logic pause;
  addr_t addr;

  // ----------------------------------------------------------------------
  // Pointer synchronization and gray/bin conversion.

  coarse_t rcoarse_gray, rcoarse_d, rcoarse_q;
  coarse_t wcoarse_gray;
  coarse_t wcoarse;

  cdc_sync #(.WIDTH($bits(coarse_t)), .RESET(0)) _rcoarse_gray
    (.clk_i, .rst_ni, .i(rcoarse_i), .o(rcoarse_gray));

  gray_to_bin #(.WIDTH($bits(coarse_t))) _rcoarse_d
    (.i(rcoarse_gray), .o(rcoarse_d));

  // Register the decoded remote pointer and the outgoing Gray pointer.
  // These flops have no reset.
  always_ff @(posedge clk_i) begin
    rcoarse_q <= rcoarse_d;
    wcoarse_o <= wcoarse_gray;
  end

  bin_to_gray #(.WIDTH($bits(coarse_t))) _wcoarse_gray
    (.i(wcoarse), .o(wcoarse_gray));

  // ----------------------------------------------------------------------
  // AXI burst tracking.

  // Address channel counter: advances by one burst (LEN beats) per accepted
  // AR.
  logic ar_step, ar_done;
  dma_counter #(.MAX(N_BEATS), .STEP(LEN), .START(START_BEAT)) _counter_ar
    (
     .clk_i,
     .rst_ni,
     .restart_i(restart),
     .step_i(ar_step),
     .done_o(ar_done),
     .count_o(arbeat_q),
     .flag_o(),
     .wrap_i(wrap)
     );

  // Data channel counter: advances by one per accepted R beat.
  logic r_step, r_done;
  dma_counter #(.MAX(N_BEATS), .STEP(1), .START(START_BEAT)) _counter_r
    (
     .clk_i,
     .rst_ni,
     .restart_i(restart),
     .step_i(r_step),
     .done_o(r_done),
     .count_o(rbeat_q),
     .flag_o(),
     .wrap_i(wrap)
     );

  // ----------------------------------------------------------------------
  // AXI signal assignments.

  always_comb begin
    addr = get_addr_from_beat(arbeat_q, LANE, pass_q, POINT_READ);

    // Only the port that belongs to the current pass issues addresses.
    m0_axi_arvalid = (pass_q == 1'b0) && (state_q == BUSY) && !ar_done;
    m1_axi_arvalid = (pass_q == 1'b1) && (state_q == BUSY) && !ar_done;

    // Read data is accepted on both ports whenever the FIFO has room.
    m0_axi_rready = !pause;
    m1_axi_rready = !pause;

    m0_axi_arlen = LEN-1;
    m1_axi_arlen = LEN-1;

    // NOTE(review): the offset is ORed in rather than added, which presumably
    // relies on the offsets not overlapping the generated address bits --
    // confirm.
    m0_axi_araddr = addr | $bits(m0_axi_araddr)'(ctrl_addr_offset0);
    m1_axi_araddr = addr | $bits(m1_axi_araddr)'(ctrl_addr_offset1);

    // Removed due to suspected HBM crossbar IP bug.
    // Trade addresses with the write channel paired with this one.
    // This uses the built in crossbar in the HBM IP to ping pong
    // between read and write buffers for first and second passes
    // thus saving a lot of 2:1 muxes and routing complexity.
    //m_axi_araddr[$clog2(HBM_PC_SIZE_IN_BYTES)+0] ^= pass_q;

    // Swap addresses with the read channel paired with this one.
    // This swap could be handled in point_to_ntt but using the
    // built in crossbar in the HBM IP to perform the swap saves
    // a lot of 2:1 muxes and routing complexity.
    //m_axi_araddr[$clog2(HBM_PC_SIZE_IN_BYTES)+1] ^= get_swap_from_beat(arbeat_q);

    ar_step = m0_axi_arvalid && m0_axi_arready ||
              m1_axi_arvalid && m1_axi_arready;

    r_step = m0_axi_rvalid && m0_axi_rready ||
             m1_axi_rvalid && m1_axi_rready;

    utilized = r_step; // Just for debug.

    wrap = beat_id_t'(LAST_BEAT);
  end

  assign dbg_rbeat = rbeat_q;
  assign dbg_rstep = r_step;

  // ----------------------------------------------------------------------
  // Next state logic.

  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      pass_q <= START_PASS;
      state_q <= IDLE;
    end else begin
      pass_q <= pass_d;
      state_q <= state_d;
    end
  end

  // State machine:
  //   IDLE -> BUSY  on start_i (counters restarted).
  //   BUSY -> PASS  once both the AR and R counters report done.
  //   PASS -> DONE  if this was the last pass; otherwise waits for release_i,
  //                 then sets the pass to 1, restarts the counters and
  //                 returns to BUSY.
  //   DONE -> IDLE  on release_i (pass set back to 0).
  always_comb begin
    pass_d = pass_q;
    state_d = state_q;
    restart = 1'b0;

    done_o = state_q inside {DONE};
    hold_o = state_q inside {DONE, PASS};

    case (state_q)
      IDLE: begin
        if (start_i) begin
          restart = 1'b1;
          state_d = BUSY;
        end
      end
      BUSY: begin
        if (ar_done && r_done) begin
          state_d = PASS;
        end
      end
      PASS: begin
        if (pass_q == LAST_PASS) begin
          restart = 1'b1;
          state_d = DONE;
        end else if (release_i) begin
          pass_d = 1'b1;
          restart = 1'b1;
          state_d = BUSY;
        end
      end
      DONE: begin
        if (release_i) begin
          state_d = IDLE;
          pass_d = 1'b0;
        end
      end
    endcase

    // FIFO write side: each accepted R beat is written at the fine address
    // given by the beat counter.
    wdata_o = pass_q ? m1_axi_rdata : m0_axi_rdata;
    we_o = {NPPCH{r_step}};
    waddr_o = {NPPCH{fine_t'(rbeat_q)}};
    // Coarse pointer = beat counter with the fine-address bits dropped.
    wcoarse = coarse_t'(rbeat_q >> $clog2(FIFO_FINE_DEPTH));
    // Pause reads when the writer is FIFO_COARSE_DEPTH coarse blocks ahead
    // of the reader.
    volume = coarse_t'(wcoarse - rcoarse_q);
    pause = volume == FIFO_COARSE_DEPTH;
  end

endmodule

--------------------------------------------------------------------------------
/rtl/dma/point_dma_w_channel.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// AXI write channel of the point DMA.
//
// Reads points from the upstream FIFO (re_o / raddr_o / rdata_i), pushes
// them through a small timing FIFO, and writes them out as fixed-length
// (DMA_LEN beat) bursts on one of two AXI write ports -- m1 during pass 0,
// m0 during pass 1.
// Flow control is coarse: the read position is exported Gray-coded on
// rcoarse_o, the writer's position arrives Gray-coded on wcoarse_i, and
// reading pauses while the two are equal.
module point_dma_w_channel
  import config_pkg::*;
  #(
    parameter int ID = 0,
    parameter int C_M_AXI_ADDR_WIDTH = 64,
    parameter int C_M_AXI_DATA_WIDTH = 256
    )
  (
   input logic clk_i,
   input logic rst_ni,
   input logic start_i,
   output logic done_o,
   output logic hold_o,
   input logic release_i,

   input point_t [NPPCH-1:0] rdata_i,
   output logic [NPPCH-1:0] re_o,
   output fine_t [NPPCH-1:0] raddr_o,
   input coarse_t wcoarse_i,
   output coarse_t rcoarse_o,

   output beat_id_t dbg_wbeat,
   output logic dbg_wstep,

   // Tool generated ports.
   output logic m0_axi_awvalid,
   input logic m0_axi_awready,
   output logic [C_M_AXI_ADDR_WIDTH-1:0] m0_axi_awaddr,
   output logic [8-1:0] m0_axi_awlen,
   output logic m0_axi_wvalid,
   input logic m0_axi_wready,
   output logic [C_M_AXI_DATA_WIDTH-1:0] m0_axi_wdata,
   output logic [C_M_AXI_DATA_WIDTH/8-1:0] m0_axi_wstrb,
   output logic m0_axi_wlast,
   input logic m0_axi_bvalid,
   output logic m0_axi_bready,

   output logic m1_axi_awvalid,
   input logic m1_axi_awready,
   output logic [C_M_AXI_ADDR_WIDTH-1:0] m1_axi_awaddr,
   output logic [8-1:0] m1_axi_awlen,
   output logic m1_axi_wvalid,
   input logic m1_axi_wready,
   output logic [C_M_AXI_DATA_WIDTH-1:0] m1_axi_wdata,
   output logic [C_M_AXI_DATA_WIDTH/8-1:0] m1_axi_wstrb,
   output logic m1_axi_wlast,
   input logic m1_axi_bvalid,
   output logic m1_axi_bready,

   input logic [C_M_AXI_ADDR_WIDTH-1:0] ctrl_addr_offset0,
   input logic [C_M_AXI_ADDR_WIDTH-1:0] ctrl_addr_offset1
   );

  import axi_hbm_pkg::*;

  localparam int LEN = DMA_LEN;
  // Lane number derived from this channel's ID.
  localparam int LANE = ID / (N_HBM_PC / NLANE);

  typedef enum logic [1:0] {IDLE, BUSY, PASS, DONE} state_e;

  // Payload of the timing FIFO: one write beat plus its WLAST marker.
  typedef struct packed {
    logic last;
    point_t [NPPCH-1:0] data;
  } timing_fifo_data_s;

  timing_fifo_data_s timing_fifo_wdata, timing_fifo_rdata;
  logic timing_fifo_wvalid, timing_fifo_rvalid;
  logic timing_fifo_wready, timing_fifo_rready;

  state_e state_q, state_d;
  pass_id_t pass_q, pass_d;
  beat_id_t awbeat_q;   // Beat at which the next AW burst starts.
  beat_id_t wbeat_q;    // Number of beats pushed into the timing FIFO.
  beat_id_t bbeat_q;    // Beat count advanced per write response.
  beat_id_t wrap;

  logic restart;
  logic utilized;
  logic pause;
  logic read;
  logic wvalid_q, wvalid_d;
  logic wready;
  fine_t raddr_next;
  addr_t addr;

  // ----------------------------------------------------------------------
  // Pointer synchronization and gray/bin conversion.

  coarse_t wcoarse_gray, wcoarse_d, wcoarse_q;
  coarse_t rcoarse_gray;
  coarse_t rcoarse;

  cdc_sync #(.WIDTH($bits(coarse_t)), .RESET(0)) _wcoarse_gray
    (.clk_i, .rst_ni, .i(wcoarse_i), .o(wcoarse_gray));

  gray_to_bin #(.WIDTH($bits(coarse_t))) _wcoarse_d
    (.i(wcoarse_gray), .o(wcoarse_d));

  // Register the decoded remote pointer and the outgoing Gray pointer.
  // These flops have no reset.
  always_ff @(posedge clk_i) begin
    wcoarse_q <= wcoarse_d;
    rcoarse_o <= rcoarse_gray;
  end

  bin_to_gray #(.WIDTH($bits(coarse_t))) _rcoarse_gray
    (.i(rcoarse), .o(rcoarse_gray));

  // ----------------------------------------------------------------------
  // AXI burst tracking.

  // Address channel counter: advances by one burst (LEN beats) per accepted
  // AW.
  logic aw_step, aw_done;
  dma_counter #(.MAX(N_BEATS), .STEP(LEN), .START(START_BEAT)) _counter_aw
    (
     .clk_i,
     .rst_ni,
     .restart_i(restart),
     .step_i(aw_step),
     .done_o(aw_done),
     .count_o(awbeat_q),
     .flag_o(),
     .wrap_i(wrap)
     );

  // Data counter: advances by one per beat pushed into the timing FIFO.
  // flag_o marks the last beat of each burst (low bits == LEN-1).
  logic w_step, w_done, w_last;
  dma_counter #(.MAX(N_BEATS), .STEP(1), .FLAG(LEN-1), .START(START_BEAT)) _counter_w
    (
     .clk_i,
     .rst_ni,
     .restart_i(restart),
     .step_i(w_step),
     .done_o(w_done),
     .count_o(wbeat_q),
     .flag_o(w_last),
     .wrap_i(wrap)
     );

  // Response counter: advances by one burst (LEN beats) per write response.
  logic b_step, b_done;
  dma_counter #(.MAX(N_BEATS), .STEP(LEN), .START(START_BEAT)) _counter_b
    (
     .clk_i,
     .rst_ni,
     .restart_i(restart),
     .step_i(b_step),
     .done_o(b_done),
     .count_o(bbeat_q),
     .flag_o(),
     .wrap_i(wrap)
     );

  // ----------------------------------------------------------------------
  // AXI signal assignments.

  always_comb begin
    addr = get_addr_from_beat(awbeat_q, LANE, pass_q, POINT_WRITE);

    // Only the port that belongs to the current pass issues addresses
    // (m1 in pass 0, m0 in pass 1).
    m0_axi_awvalid = (pass_q == 1'b1) && (state_q == BUSY) && !aw_done;
    m1_axi_awvalid = (pass_q == 1'b0) && (state_q == BUSY) && !aw_done;

    timing_fifo_wvalid = wvalid_q && !w_done && !pause;
    timing_fifo_wdata = '{last: w_last, data: rdata_i};

    m0_axi_wstrb = '1;
    m1_axi_wstrb = '1;

    // Write responses are always accepted.
    m0_axi_bready = 1'b1;
    m1_axi_bready = 1'b1;

    m0_axi_awlen = LEN-1;
    m1_axi_awlen = LEN-1;

    // NOTE(review): the offset is ORed in rather than added, which presumably
    // relies on the offsets not overlapping the generated address bits --
    // confirm.
    m0_axi_awaddr = addr | $bits(m0_axi_awaddr)'(ctrl_addr_offset0);
    m1_axi_awaddr = addr | $bits(m1_axi_awaddr)'(ctrl_addr_offset1);

    // Removed due to suspected HBM crossbar IP bug.
    // Trade addresses with the write channel paired with this one.
    // This uses the built in crossbar in the HBM IP to ping pong
    // between read and write buffers for first and second passes
    // thus saving a lot of 2:1 muxes and routing complexity.
    //m_axi_awaddr[$clog2(HBM_PC_SIZE_IN_BYTES)+0] ^= pass_q;

    // Swap addresses with the read channel paired with this one.
    // This swap could be handled in point_to_ntt but using the
    // built in crossbar in the HBM IP to perform the swap saves
    // a lot of 2:1 muxes and routing complexity.
    //m_axi_awaddr[$clog2(HBM_PC_SIZE_IN_BYTES)+1] ^= get_swap_from_beat(awbeat_q);

    aw_step = m0_axi_awvalid && m0_axi_awready ||
              m1_axi_awvalid && m1_axi_awready;

    w_step = timing_fifo_wvalid && timing_fifo_wready;

    b_step = m0_axi_bvalid && m0_axi_bready ||
             m1_axi_bvalid && m1_axi_bready;

    utilized = w_step; // Just for debug.

    wrap = beat_id_t'(LAST_BEAT);
  end

  assign dbg_wbeat = wbeat_q;
  assign dbg_wstep = w_step;

  // ----------------------------------------------------------------------
  // Next state logic.

  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      pass_q <= START_PASS;
      state_q <= IDLE;
      wvalid_q <= '0;
    end else begin
      pass_q <= pass_d;
      state_q <= state_d;
      wvalid_q <= wvalid_d;
    end
  end

  // Pause while the reader's coarse pointer equals the writer's coarse
  // pointer.
  assign pause = wcoarse_q == rcoarse;

  // State machine:
  //   IDLE -> BUSY  on start_i (counters restarted).
  //   BUSY -> PASS  once the AW, W and B counters all report done.
  //   PASS -> DONE  if this was the last pass; otherwise waits for release_i,
  //                 then sets the pass to 1, restarts the counters and
  //                 returns to BUSY.
  //   DONE -> IDLE  on release_i (pass set back to 0).
  always_comb begin
    pass_d = pass_q;
    state_d = state_q;
    wvalid_d = wvalid_q;
    restart = 1'b0;

    done_o = state_q inside {DONE};
    hold_o = state_q inside {DONE, PASS};

    case (state_q)
      IDLE: begin
        if (start_i) begin
          restart = 1'b1;
          state_d = BUSY;
        end
      end
      BUSY: begin
        if (aw_done && w_done && b_done) begin
          state_d = PASS;
        end
      end
      PASS: begin
        if (pass_q == LAST_PASS) begin
          restart = 1'b1;
          state_d = DONE;
        end else if (release_i) begin
          pass_d = 1'b1;
          restart = 1'b1;
          state_d = BUSY;
        end
      end
      DONE: begin
        if (release_i) begin
          state_d = IDLE;
          pass_d = 1'b0;
        end
      end
    endcase

    // Coarse pointer = beat counter with the fine-address bits dropped.
    rcoarse = coarse_t'(wbeat_q >> $clog2(FIFO_FINE_DEPTH));
    read = (state_q == BUSY) && !w_done && !pause;

    // WREADY of the AXI port used in the current pass.
    case (pass_q)
      1'b0: wready = m1_axi_wready;
      1'b1: wready = m0_axi_wready;
    endcase

    // NOTE(review): wvalid is cleared using the AXI wready although the data
    // is pushed into the timing FIFO (w_step uses timing_fifo_wready) --
    // confirm this is intended.
    if (restart) wvalid_d = 1'b0;
    else if (!wvalid_q && read) wvalid_d = 1'b1;
    else if (wvalid_q && !read && wready) wvalid_d = 1'b0;

    // Present the address of the next beat in the same cycle a beat is
    // accepted.
    if (w_step) begin
      raddr_next = fine_t'(wbeat_q + 1'b1);
    end else begin
      raddr_next = fine_t'(wbeat_q);
    end

    raddr_o = {NPPCH{raddr_next}};
    re_o = {NPPCH{wvalid_d}};
  end

  // ----------------------------------------------------------------------
  // Break write data timing paths.
287 | 288 | fifo 289 | #( 290 | .DATA_t(timing_fifo_data_s), 291 | .DEPTH(DMA_W_FIFO_DEPTH), 292 | .SYNC_RESET(1) 293 | ) _timing_fifo 294 | ( 295 | .clk_i, 296 | .rst_ni, 297 | .wdata_i(timing_fifo_wdata), 298 | .wvalid_i(timing_fifo_wvalid), 299 | .wready_o(timing_fifo_wready), 300 | .rdata_o(timing_fifo_rdata), 301 | .rvalid_o(timing_fifo_rvalid), 302 | .rready_i(timing_fifo_rready) 303 | ); 304 | 305 | always_comb begin 306 | m0_axi_wdata = timing_fifo_rdata.data; 307 | m1_axi_wdata = timing_fifo_rdata.data; 308 | 309 | m0_axi_wlast = timing_fifo_rdata.last; 310 | m1_axi_wlast = timing_fifo_rdata.last; 311 | 312 | m0_axi_wvalid = timing_fifo_rvalid && (pass_q == 1'b1); 313 | m1_axi_wvalid = timing_fifo_rvalid && (pass_q == 1'b0); 314 | 315 | timing_fifo_rready = pass_q ? m0_axi_wready : m1_axi_wready; 316 | end 317 | 318 | endmodule 319 | -------------------------------------------------------------------------------- /rtl/dma/point_from_ntt.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module point_from_ntt 7 | import config_pkg::*; 8 | ( 9 | input logic clk_i, 10 | input logic rst_ni, 11 | 12 | input point_t [NLANE-1:0][1:0] x_i, 13 | input logic [NLANE-1:0] valid_i, 14 | 15 | output logic [NLANE-1:0][NPPCH-1:0] we_o, 16 | output point_t [NLANE-1:0][NPPCH-1:0] wdata_o, 17 | output fine_t [NLANE-1:0][NPPCH-1:0] waddr_o, 18 | output coarse_t [NLANE-1:0] wcoarse_o, 19 | input coarse_t [NLANE-1:0] rcoarse_i 20 | ); 21 | 22 | // ---------------------------------------------------------------------- 23 | // Pointer synchronization and gray/bin conversion. 
24 | 25 | coarse_t [NLANE-1:0] rcoarse_gray, rcoarse_d, rcoarse_q; 26 | coarse_t wcoarse_gray; 27 | coarse_t wcoarse; 28 | 29 | cdc_sync #(.WIDTH($bits(coarse_t)), .RESET(0)) _rcoarse_gray[NLANE-1:0] 30 | (.clk_i, .rst_ni, .i(rcoarse_i), .o(rcoarse_gray)); 31 | 32 | gray_to_bin #(.WIDTH($bits(coarse_t))) _rcoarse_d[NLANE-1:0] 33 | (.i(rcoarse_gray), .o(rcoarse_d)); 34 | 35 | always_ff @(posedge clk_i) begin 36 | rcoarse_q <= rcoarse_d; 37 | wcoarse_o <= {NLANE{wcoarse_gray}}; 38 | end 39 | 40 | bin_to_gray #(.WIDTH($bits(coarse_t))) _wcoarse_gray 41 | (.i(wcoarse), .o(wcoarse_gray)); 42 | 43 | // ---------------------------------------------------------------------- 44 | // This engine pushes points to the FIFOs in a synchronous manner 45 | // using coarse flow control to minimize control paths across a wide 46 | // datapath. 47 | 48 | localparam int CYCLE_TO_COARSE = ($clog2(FIFO_FINE_DEPTH) + 49 | $clog2(N_CYCLES/N_BEATS)); 50 | 51 | // ---------------------------------------------------------------------- 52 | // Stage 1: Flop incoming data. 53 | 54 | point_t [NLANE-1:0][1:0] stage1_x_q; 55 | logic stage1_valid_q; 56 | cycle_id_t stage1_cycle_q, stage1_cycle_d; 57 | pass_id_t stage1_pass_q, stage1_pass_d; 58 | shift_t stage1_shift_q, stage1_shift_d; 59 | 60 | always_ff @(posedge clk_i) begin 61 | stage1_x_q <= x_i; 62 | end 63 | 64 | always_ff @(posedge clk_i) begin 65 | if (!rst_ni) begin 66 | stage1_valid_q <= 1'b0; 67 | stage1_cycle_q <= START_CYCLE; 68 | stage1_pass_q <= START_PASS; 69 | stage1_shift_q <= '0; 70 | end else begin 71 | stage1_valid_q <= valid_i[0]; 72 | stage1_cycle_q <= stage1_cycle_d; 73 | stage1_pass_q <= stage1_pass_d; 74 | stage1_shift_q <= stage1_shift_d; 75 | end 76 | end 77 | 78 | always_comb begin 79 | if (!stage1_valid_q) begin 80 | // No transfer so hold state. 81 | stage1_cycle_d = stage1_cycle_q; 82 | stage1_pass_d = stage1_pass_q; 83 | end else if (stage1_cycle_q == LAST_CYCLE) begin 84 | // Wrap cycles. 
85 | stage1_cycle_d = START_CYCLE; 86 | if (stage1_pass_q == LAST_PASS) begin 87 | // Wrap passes. 88 | stage1_pass_d = '0; 89 | end else begin 90 | // Next pass. 91 | stage1_pass_d = stage1_pass_q + 1'b1; 92 | end 93 | end else begin 94 | // Increment cycles. 95 | stage1_cycle_d = stage1_cycle_q + 1'b1; 96 | stage1_pass_d = stage1_pass_q; 97 | end 98 | 99 | stage1_shift_d = get_shift_from_cycle(stage1_cycle_d, 100 | POINT_WRITE); 101 | end 102 | 103 | // ---------------------------------------------------------------------- 104 | // Stage 2: xbar/circular shift to reorient x. 105 | 106 | cycle_id_t [SHIFT_PIPE_DEPTH:0][1:0] shift_cycle; 107 | logic [SHIFT_PIPE_DEPTH:0][1:0] shift_pass; 108 | logic [SHIFT_PIPE_DEPTH:0] shift_valid; 109 | shift_t [SHIFT_PIPE_DEPTH:0] shift_amount; 110 | point_t [SHIFT_PIPE_DEPTH:0][NLANE/2-1:0][1:0][1:0] shift_x; 111 | 112 | always_comb begin 113 | shift_cycle [0] = stage1_cycle_q; 114 | shift_pass [0] = stage1_pass_q; 115 | shift_amount[0] = stage1_shift_q; 116 | shift_valid [0] = stage1_valid_q; 117 | shift_x [0] = stage1_x_q; 118 | end 119 | 120 | point_t [NLANE/2-1:0][1:0][1:0] shifted_q; 121 | cycle_id_t shifted_cycle_q; 122 | logic shifted_pass_q; 123 | logic shifted_valid_q; 124 | 125 | if (SHIFT_PIPE_DEPTH != 0) begin : _shift_pipe 126 | 127 | localparam MAX_SHIFT = (NLANE/2) / SHIFT_PIPE_DEPTH; 128 | 129 | for (genvar i = 0; i < SHIFT_PIPE_DEPTH; i++) begin : _shift 130 | 131 | always_ff @(posedge clk_i) begin 132 | if (!rst_ni) begin 133 | shift_valid[i+1] <= 1'b0; 134 | end else begin 135 | shift_valid[i+1] <= shift_valid[i]; 136 | end 137 | end 138 | 139 | shift_t shift_remaining; 140 | point_t [NLANE/2-1:0][1:0][1:0] shifted; 141 | 142 | always_comb begin 143 | shift_remaining = shift_amount[i]; 144 | shifted = shift_x [i]; 145 | 146 | for (int j = 0; j < MAX_SHIFT; j++) begin 147 | if (shift_remaining != 0) begin 148 | shift_remaining = shift_remaining - 1'b1; 149 | shifted = {shifted[0], shifted[NLANE/2-1:1]}; 150 | 
end 151 | end 152 | end 153 | 154 | always_ff @(posedge clk_i) begin 155 | shift_cycle [i+1] <= shift_cycle[i]; 156 | shift_pass [i+1] <= shift_pass[i]; 157 | shift_amount[i+1] <= shift_remaining; 158 | shift_x [i+1] <= shifted; 159 | end 160 | end 161 | 162 | always_comb begin 163 | shifted_cycle_q = shift_cycle[SHIFT_PIPE_DEPTH]; 164 | shifted_pass_q = shift_pass [SHIFT_PIPE_DEPTH]; 165 | shifted_q = shift_x [SHIFT_PIPE_DEPTH]; 166 | shifted_valid_q = shift_valid[SHIFT_PIPE_DEPTH]; 167 | end 168 | 169 | end else begin : _shift 170 | 171 | point_t [NLANE/2-1:0][1:0][1:0] shifted; 172 | 173 | always_comb begin 174 | case (shift_amount[0]) 175 | 3'h0: shifted = shift_x; 176 | 3'h1: shifted = {shift_x[0][0], shift_x[0][NLANE/2-1:1]}; 177 | 3'h2: shifted = {shift_x[0][1], shift_x[0][NLANE/2-1:2]}; 178 | 3'h3: shifted = {shift_x[0][2], shift_x[0][NLANE/2-1:3]}; 179 | 3'h4: shifted = {shift_x[0][3], shift_x[0][NLANE/2-1:4]}; 180 | 3'h5: shifted = {shift_x[0][4], shift_x[0][NLANE/2-1:5]}; 181 | 3'h6: shifted = {shift_x[0][5], shift_x[0][NLANE/2-1:6]}; 182 | 3'h7: shifted = {shift_x[0][6], shift_x[0][NLANE/2-1:7]}; 183 | endcase 184 | end 185 | 186 | always_ff @(posedge clk_i) begin 187 | shifted_cycle_q <= stage1_cycle_q; 188 | shifted_q <= shifted; 189 | end 190 | 191 | always_ff @(posedge clk_i) begin 192 | if (!rst_ni) begin 193 | shifted_valid_q <= 1'b0; 194 | shifted_pass_q <= START_PASS; 195 | end else begin 196 | shifted_valid_q <= stage1_valid_q; 197 | shifted_pass_q <= stage1_pass_q; 198 | end 199 | end 200 | end 201 | 202 | // ---------------------------------------------------------------------- 203 | // Reorg and PPCH mux. 
204 | 205 | point_t [NLANE/2-1:0][1:0][NPPCH-1:0] reorg; 206 | logic [NLANE/2-1:0][1:0][NPPCH-1:0] reorg_valid; 207 | ppch_id_t ppch_sel; 208 | logic swap; 209 | logic lane_inner_swap; 210 | 211 | always_comb begin 212 | reorg = '0; 213 | reorg_valid = '0; 214 | 215 | swap = get_swap_from_cycle(shifted_cycle_q); 216 | 217 | for (int lane_outer = 0; lane_outer < NLANE/2; lane_outer++) begin 218 | // Hardcoded "2" below is the square root of NPPCH. 219 | for (int ppch_idx = 0; ppch_idx < 2; ppch_idx++) begin 220 | ppch_sel = get_ppch_from_cycle(shifted_cycle_q, 221 | ppch_idx, 222 | shifted_pass_q); 223 | for (int lane_inner = 0; lane_inner < 2; lane_inner++) begin 224 | lane_inner_swap = logic'(lane_inner) ^ swap; 225 | 226 | // These should reduce to 3:1 muxes. 227 | // Also handle reorg in same logic. 228 | reorg_valid[lane_outer][lane_inner_swap][ppch_sel] = 1'b1; 229 | reorg [lane_outer][lane_inner_swap][ppch_sel] = 230 | shifted_q[lane_outer][ppch_idx][lane_inner]; 231 | end 232 | end 233 | end 234 | end 235 | 236 | // ---------------------------------------------------------------------- 237 | // FIFO write interface. 238 | 239 | point_t [NLANE-1:0][NPPCH-1:0] wdata_d; 240 | logic [NLANE-1:0][NPPCH-1:0] we_d; 241 | fine_t [NLANE-1:0][NPPCH-1:0] waddr_d; 242 | 243 | always_comb begin 244 | wcoarse = coarse_t'(shifted_cycle_q >> CYCLE_TO_COARSE); 245 | wdata_d = reorg; 246 | we_d = shifted_valid_q ? 
reorg_valid : '0; 247 | 248 | for (int lane = 0; lane < NLANE; lane++) begin 249 | waddr_d[lane] = {NPPCH{get_fine_from_cycle(shifted_cycle_q, 250 | lane, 251 | shifted_pass_q)}}; 252 | end 253 | end 254 | 255 | always_ff @(posedge clk_i) begin 256 | we_o <= we_d; 257 | waddr_o <= waddr_d; 258 | wdata_o <= wdata_d; 259 | end 260 | 261 | // ---------------------------------------------------------------------- 262 | 263 | `ifndef SYNTHESIS 264 | logic [NLANE-1:0] backpressure; 265 | coarse_t [NLANE-1:0] volume; 266 | 267 | always_comb begin 268 | for (int lane = 0; lane < NLANE; lane++) begin 269 | volume[lane] = coarse_t'(wcoarse - rcoarse_q[lane]); 270 | backpressure[lane] = volume[lane] == FIFO_COARSE_DEPTH; 271 | end 272 | end 273 | 274 | ASSERT_no_backpressure: 275 | assert property (@(posedge clk_i) disable iff (!rst_ni) 276 | we_o !== '0 |-> (we_o & backpressure) === '0 277 | ) else $fatal(1, "NTT-->DMA point backpressure."); 278 | `endif 279 | 280 | endmodule 281 | -------------------------------------------------------------------------------- /rtl/dma/point_to_ntt.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
// SPDX-License-Identifier: Apache-2.0 OR MIT

// point_to_ntt
//
// Reads points out of the per-lane FIFOs (re_o / raddr_o / rdata_i) and
// presents them on x_o together with valid_o and pass_o. The FIFO write
// pointers arrive gray-coded on wcoarse_i and are synchronized into this
// clock domain; the local read pointer is returned gray-coded on
// rcoarse_o.
module point_to_ntt
  import config_pkg::*;
  (
   input  logic                           clk_i,
   input  logic                           rst_ni,   // tested as !rst_ni inside clocked blocks
   output logic                           pass_o,

   output point_t  [NLANE-1:0][1:0]       x_o,
   (* dont_touch = "true" *) output logic [NLANE-1:0] valid_o,

   output logic    [NLANE-1:0][NPPCH-1:0] re_o,
   input  point_t  [NLANE-1:0][NPPCH-1:0] rdata_i,
   output fine_t   [NLANE-1:0][NPPCH-1:0] raddr_o,
   input  coarse_t [NLANE-1:0]            wcoarse_i,
   output coarse_t [NLANE-1:0]            rcoarse_o
   );

  // FIFO read request (pre-flop) and first-stage control state.
  logic                         re_d;
  fine_t [NLANE-1:0][NPPCH-1:0] raddr_d;
  logic                         stage1_valid;
  cycle_id_t                    stage1_cycle_q;
  cycle_id_t                    stage1_cycle_d;
  pass_id_t                     stage1_pass_q;
  pass_id_t                     stage1_pass_d;

  point_t [NLANE/2-1:0][1:0][NPPCH-1:0] stage1_data_q;

  // ----------------------------------------------------------------------
  // Pointer synchronization and gray/bin conversion.

  coarse_t [NLANE-1:0] wcoarse_gray;
  coarse_t [NLANE-1:0] wcoarse_d;
  coarse_t [NLANE-1:0] wcoarse_q;
  coarse_t             rcoarse_gray;
  coarse_t             rcoarse;

  // One synchronizer per lane for the incoming gray-coded write pointer.
  cdc_sync #(
    .WIDTH ($bits(coarse_t)),
    .RESET (0)
  ) _wcoarse_gray [NLANE-1:0] (
    .clk_i  (clk_i),
    .rst_ni (rst_ni),
    .i      (wcoarse_i),
    .o      (wcoarse_gray)
  );

  // Synchronized gray pointer -> binary, per lane.
  gray_to_bin #(
    .WIDTH ($bits(coarse_t))
  ) _wcoarse_d [NLANE-1:0] (
    .i (wcoarse_gray),
    .o (wcoarse_d)
  );

  // Local binary read pointer -> gray for the return path.
  bin_to_gray #(
    .WIDTH ($bits(coarse_t))
  ) _rcoarse_gray (
    .i (rcoarse),
    .o (rcoarse_gray)
  );

  // Register the converted write pointers, and fan the single gray read
  // pointer out to every lane of rcoarse_o.
  always_ff @(posedge clk_i) begin
    rcoarse_o <= {NLANE{rcoarse_gray}};
    wcoarse_q <= wcoarse_d;
  end

  // ----------------------------------------------------------------------
  // This engine pulls points from the FIFOs in a synchronous manner
  // using coarse flow control to minimize control paths across a wide
  // datapath.
56 | 57 | localparam int CYCLE_TO_COARSE = ($clog2(FIFO_FINE_DEPTH) + 58 | $clog2(N_CYCLES/N_BEATS)); 59 | 60 | // ---------------------------------------------------------------------- 61 | // Flop incoming data and generate control signals. 62 | 63 | always_ff @(posedge clk_i) begin 64 | if (!rst_ni) begin 65 | stage1_cycle_q <= START_CYCLE; 66 | stage1_pass_q <= START_PASS; 67 | end else begin 68 | stage1_cycle_q <= stage1_cycle_d; 69 | stage1_pass_q <= stage1_pass_d; 70 | end 71 | end 72 | 73 | always_ff @(posedge clk_i) begin 74 | re_o <= {NLANE*NPPCH{re_d}}; 75 | raddr_o <= raddr_d; 76 | end 77 | 78 | always_comb begin 79 | rcoarse = coarse_t'(stage1_cycle_q >> CYCLE_TO_COARSE); 80 | 81 | // Flow control from FIFO read to NTT. 82 | stage1_valid = 1'b1; 83 | for (int lane = 0; lane < NLANE; lane++) begin 84 | stage1_valid &= wcoarse_q[lane] != rcoarse; 85 | end 86 | 87 | // FIFO read interface. 88 | re_d = stage1_valid; 89 | 90 | for (int lane = 0; lane < NLANE; lane++) begin 91 | raddr_d[lane] = {NPPCH{get_fine_from_cycle(stage1_cycle_q, 92 | lane, 93 | stage1_pass_q)}}; 94 | end 95 | 96 | if (!stage1_valid) begin 97 | // No transfer so hold state. 98 | stage1_cycle_d = stage1_cycle_q; 99 | stage1_pass_d = stage1_pass_q; 100 | end else if (stage1_cycle_q == LAST_CYCLE) begin 101 | // Wrap cycles. 102 | stage1_cycle_d = START_CYCLE; 103 | if (stage1_pass_q == LAST_PASS) begin 104 | // Wrap passes. 105 | stage1_pass_d = '0; 106 | end else begin 107 | // Next pass. 108 | stage1_pass_d = stage1_pass_q + 1'b1; 109 | end 110 | end else begin 111 | // Increment cycles. 112 | stage1_cycle_d = stage1_cycle_q + 1'b1; 113 | stage1_pass_d = stage1_pass_q; 114 | end 115 | end 116 | 117 | // ---------------------------------------------------------------------- 118 | // Pipe along some control signals until FIFO read data is ready. 
119 | 120 | logic rden_valid_q; 121 | cycle_id_t rden_cycle_q; 122 | logic rden_pass_q; 123 | 124 | logic rdval_valid_q; 125 | cycle_id_t rdval_cycle_q; 126 | logic rdval_pass_q; 127 | 128 | logic rdcap_valid_q; 129 | cycle_id_t rdcap_cycle_q; 130 | logic rdcap_pass_q; 131 | point_t [NLANE/2-1:0][1:0][NPPCH-1:0] rdcap_q; 132 | 133 | // FIFO read enable stage. 134 | always_ff @(posedge clk_i) begin 135 | if (!rst_ni) begin 136 | rden_valid_q <= '0; 137 | rden_cycle_q <= START_CYCLE; 138 | rden_pass_q <= START_PASS; 139 | end else begin 140 | rden_valid_q <= stage1_valid; 141 | rden_cycle_q <= stage1_cycle_q; 142 | rden_pass_q <= stage1_pass_q; 143 | end 144 | end 145 | 146 | // FIFO read data valid stage. 147 | always_ff @(posedge clk_i) begin 148 | if (!rst_ni) begin 149 | rdval_valid_q <= '0; 150 | rdval_cycle_q <= START_CYCLE; 151 | rdval_pass_q <= START_PASS; 152 | end else begin 153 | rdval_valid_q <= rden_valid_q; 154 | rdval_cycle_q <= rden_cycle_q; 155 | rdval_pass_q <= rden_pass_q; 156 | end 157 | end 158 | 159 | // FIFO read data captured stage. 160 | always_ff @(posedge clk_i) begin 161 | if (!rst_ni) begin 162 | rdcap_valid_q <= '0; 163 | rdcap_cycle_q <= START_CYCLE; 164 | rdcap_pass_q <= START_PASS; 165 | end else begin 166 | rdcap_valid_q <= rdval_valid_q; 167 | rdcap_cycle_q <= rdval_cycle_q; 168 | rdcap_pass_q <= rdval_pass_q; 169 | end 170 | end 171 | 172 | always_ff @(posedge clk_i) begin 173 | rdcap_q <= rdata_i; 174 | end 175 | 176 | // ---------------------------------------------------------------------- 177 | // Reorg and PPCH mux. 
178 | 179 | point_t [NLANE/2-1:0][1:0][1:0] reorg_q, reorg_d; 180 | logic reorg_valid_q; 181 | logic reorg_pass_q; 182 | shift_t reorg_shift_q; 183 | ppch_id_t ppch_sel; 184 | logic swap; 185 | logic lane_inner_swap; 186 | 187 | always_comb begin 188 | swap = get_swap_from_cycle(rdcap_cycle_q); 189 | 190 | for (int lane_outer = 0; lane_outer < NLANE/2; lane_outer++) begin 191 | // Hardcoded "2" below is the square root of NPPCH. 192 | for (int ppch_idx = 0; ppch_idx < 2; ppch_idx++) begin 193 | ppch_sel = get_ppch_from_cycle(rdcap_cycle_q, 194 | ppch_idx, 195 | rdcap_pass_q); 196 | 197 | for (int lane_inner = 0; lane_inner < 2; lane_inner++) begin 198 | lane_inner_swap = logic'(lane_inner) ^ swap; 199 | 200 | // These should reduce to 3:1 muxes. 201 | // Also handle reorg in same logic. 202 | reorg_d[lane_outer][ppch_idx][lane_inner_swap] = 203 | rdcap_q[lane_outer][lane_inner][ppch_sel]; 204 | end 205 | end 206 | end 207 | end 208 | 209 | always_ff @(posedge clk_i) begin 210 | if (!rst_ni) begin 211 | reorg_valid_q <= '0; 212 | reorg_pass_q <= START_PASS; 213 | end else begin 214 | reorg_valid_q <= rdcap_valid_q; 215 | reorg_pass_q <= rdcap_pass_q; 216 | end 217 | end 218 | 219 | always_ff @(posedge clk_i) begin 220 | reorg_q = reorg_d; 221 | reorg_shift_q = get_shift_from_cycle(rdcap_cycle_q, POINT_READ); 222 | end 223 | 224 | // ---------------------------------------------------------------------- 225 | // Circular shift to finalize x. 
226 | 227 | logic [SHIFT_PIPE_DEPTH:0] shift_pass; 228 | logic [SHIFT_PIPE_DEPTH:0] shift_valid; 229 | shift_t [SHIFT_PIPE_DEPTH:0] shift_amount; 230 | point_t [SHIFT_PIPE_DEPTH:0][NLANE/2-1:0][1:0][1:0] shift_x; 231 | 232 | always_comb begin 233 | shift_amount[0] = reorg_shift_q; 234 | shift_pass [0] = reorg_pass_q; 235 | shift_valid [0] = reorg_valid_q; 236 | shift_x [0] = reorg_q; 237 | end 238 | 239 | if (SHIFT_PIPE_DEPTH != 0) begin : _shift_pipe 240 | 241 | localparam MAX_SHIFT = (NLANE/2) / SHIFT_PIPE_DEPTH; 242 | 243 | for (genvar i = 0; i < SHIFT_PIPE_DEPTH; i++) begin : _shift 244 | 245 | always_ff @(posedge clk_i) begin 246 | if (!rst_ni) begin 247 | shift_valid[i+1] <= 1'b0; 248 | shift_pass [i+1] <= START_PASS; 249 | end else begin 250 | shift_valid[i+1] <= shift_valid[i]; 251 | shift_pass [i+1] <= shift_pass [i]; 252 | end 253 | end 254 | 255 | shift_t shift_remaining; 256 | point_t [NLANE/2-1:0][1:0][1:0] shifted; 257 | 258 | always_comb begin 259 | shift_remaining = shift_amount[i]; 260 | shifted = shift_x [i]; 261 | 262 | for (int j = 0; j < MAX_SHIFT; j++) begin 263 | if (shift_remaining != 0) begin 264 | shift_remaining = shift_remaining - 1'b1; 265 | shifted = {shifted[0], shifted[NLANE/2-1:1]}; 266 | end 267 | end 268 | end 269 | 270 | always_ff @(posedge clk_i) begin 271 | shift_amount[i+1] <= shift_remaining; 272 | shift_x [i+1] <= shifted; 273 | end 274 | end 275 | 276 | always_comb begin 277 | x_o = shift_x [SHIFT_PIPE_DEPTH]; 278 | pass_o = shift_pass [SHIFT_PIPE_DEPTH]; 279 | end 280 | 281 | always_ff @(posedge clk_i) begin 282 | valid_o <= {NLANE{shift_valid[SHIFT_PIPE_DEPTH-1]}}; 283 | end 284 | 285 | end else begin : _shift 286 | 287 | point_t [NLANE/2-1:0][1:0][1:0] shifted; 288 | 289 | always_comb begin 290 | case (shift_amount[0]) 291 | 3'h0: shifted = shift_x; 292 | 3'h1: shifted = {shift_x[0][0], shift_x[0][NLANE/2-1:1]}; 293 | 3'h2: shifted = {shift_x[0][1], shift_x[0][NLANE/2-1:2]}; 294 | 3'h3: shifted = {shift_x[0][2], 
shift_x[0][NLANE/2-1:3]}; 295 | 3'h4: shifted = {shift_x[0][3], shift_x[0][NLANE/2-1:4]}; 296 | 3'h5: shifted = {shift_x[0][4], shift_x[0][NLANE/2-1:5]}; 297 | 3'h6: shifted = {shift_x[0][5], shift_x[0][NLANE/2-1:6]}; 298 | 3'h7: shifted = {shift_x[0][6], shift_x[0][NLANE/2-1:7]}; 299 | endcase 300 | end 301 | 302 | always_ff @(posedge clk_i) begin 303 | x_o <= shifted; 304 | end 305 | 306 | always_ff @(posedge clk_i) begin 307 | if (!rst_ni) begin 308 | valid_o <= {NLANE{1'b0}}; 309 | pass_o <= START_PASS; 310 | end else begin 311 | valid_o <= {NLANE{reorg_valid_q}}; 312 | pass_o <= reorg_pass_q; 313 | end 314 | end 315 | end 316 | 317 | endmodule 318 | -------------------------------------------------------------------------------- /rtl/dsp48e2/butterfly.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module butterfly 7 | import math_pkg::*; 8 | #( 9 | parameter MODE = BUTTERFLY_GENERIC, 10 | parameter BFLYDSP = 24, // 24, 16, or 12 11 | parameter CANONICAL = 0 12 | ) 13 | ( 14 | input logic rst_ni, 15 | input logic clk_i, 16 | input ce_i, 17 | input nop_i, 18 | 19 | input logic [63:0] x_i, 20 | input logic [63:0] y_i, 21 | input logic [63:0] w_i, 22 | 23 | output logic [63:0] x_o, 24 | output logic [63:0] y_o 25 | ); 26 | 27 | if (MODE == BUTTERFLY_W0) begin 28 | 29 | modaddsub 30 | #( .NO_DSP_ADD64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 31 | .NO_DSP_SUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 32 | .NO_DSP_ADDSUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 
1 : 0 ), 33 | .CANONICAL ( CANONICAL ) 34 | ) 35 | modaddsub 36 | ( .rst_ni, 37 | .clk_i, 38 | .ce_i, 39 | .nop_i, 40 | .x_i, 41 | .y_i, 42 | .x_add_y_o(x_o), 43 | .x_sub_y_o(y_o) 44 | ); 45 | 46 | end 47 | else if (MODE == BUTTERFLY_W0_W2) begin 48 | 49 | logic [63:0] x_add_y; 50 | logic [63:0] x_sub_y; 51 | modaddsub 52 | #( .NO_DSP_ADD64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 53 | .NO_DSP_SUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 54 | .NO_DSP_ADDSUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 55 | .CANONICAL ( CANONICAL ) 56 | ) 57 | modaddsub 58 | ( .rst_ni, 59 | .clk_i, 60 | .ce_i, 61 | .nop_i, 62 | .x_i, 63 | .y_i, 64 | .x_add_y_o(x_add_y), 65 | .x_sub_y_o(x_sub_y) 66 | ); 67 | 68 | logic [PIPE_DEPTH_MODADDSUB-1:0] w_q; 69 | always_ff @(posedge clk_i) begin 70 | if (ce_i) begin 71 | w_q <= {w_q, nop_i ? 1'h1 : w_i[0]}; 72 | end 73 | end 74 | 75 | logic [127:0] p; 76 | assign p = w_q[PIPE_DEPTH_MODADDSUB-1] ? x_sub_y : (x_sub_y << 48); 77 | 78 | red128t64 79 | #( .NO_DSP_ADD32 ( BFLYDSP==24 ? 1 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 80 | .NO_DSP_SUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 0 : BFLYDSP==12 ? 1 : 0 ), 81 | .NO_DSP_ADDSUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 0 : BFLYDSP==12 ? 1 : 0 ) 82 | ) 83 | red128t64 84 | ( .rst_ni, 85 | .clk_i, 86 | .ce_i, 87 | .p_i(p), 88 | .r_o(y_o) 89 | ); 90 | 91 | logic [PIPE_DEPTH_RED128T64-1:0][63:0] x_add_y_q; 92 | always_ff @(posedge clk_i) begin 93 | if (ce_i) begin 94 | x_add_y_q <= {x_add_y_q, x_add_y}; 95 | end 96 | end 97 | 98 | assign x_o = x_add_y_q[PIPE_DEPTH_RED128T64-1]; 99 | 100 | end 101 | 102 | else /* if (MODE == BUTTERFLY_GENERIC) */ begin 103 | 104 | logic [63:0] x_add_y; 105 | logic [63:0] x_sub_y; 106 | modaddsub 107 | #( .NO_DSP_ADD64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 108 | .NO_DSP_SUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 109 | .NO_DSP_ADDSUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 
1 : BFLYDSP==12 ? 1 : 0 ), 110 | .CANONICAL ( CANONICAL ) 111 | ) 112 | modaddsub 113 | ( .rst_ni, 114 | .clk_i, 115 | .ce_i, 116 | .nop_i, 117 | .x_i, 118 | .y_i, 119 | .x_add_y_o(x_add_y), 120 | .x_sub_y_o(x_sub_y) 121 | ); 122 | 123 | logic [PIPE_DEPTH_MODADDSUB-1:0][63:0] w_q; 124 | always_ff @(posedge clk_i) begin 125 | if (ce_i) begin 126 | w_q <= {w_q, nop_i ? 64'h1 : w_i}; 127 | end 128 | end 129 | 130 | mulred #(.BFLYDSP(BFLYDSP)) mulred 131 | ( .rst_ni, 132 | .clk_i, 133 | .ce_i, 134 | .a_i(x_sub_y), 135 | .b_i(w_q[PIPE_DEPTH_MODADDSUB-1]), 136 | .r_o(y_o) 137 | ); 138 | 139 | logic [PIPE_DEPTH_MULRED-1:0][63:0] x_add_y_q; 140 | always_ff @(posedge clk_i) begin 141 | if (ce_i) begin 142 | x_add_y_q <= {x_add_y_q, x_add_y}; 143 | end 144 | end 145 | 146 | assign x_o = x_add_y_q[PIPE_DEPTH_MULRED-1]; 147 | 148 | end 149 | 150 | `ifndef SYNTHESIS 151 | 152 | localparam PD 153 | = (MODE == BUTTERFLY_W0 ) ? PIPE_DEPTH_BUTTERFLY_W0 : 154 | (MODE == BUTTERFLY_W0_W2) ? PIPE_DEPTH_BUTTERFLY_W0_W2 : 155 | PIPE_DEPTH_BUTTERFLY; 156 | 157 | logic [PD-1:0][63:0] x_i_q, y_i_q, w_i_q; 158 | logic [PD-1:0] nop_i_q; 159 | logic [PD-1:0] v_q; 160 | always_ff @(posedge clk_i) begin 161 | if (!rst_ni) begin 162 | v_q <= 0; 163 | end 164 | else if (ce_i) begin 165 | v_q <= {v_q, 1'b1}; 166 | x_i_q <= {x_i_q, x_i}; 167 | y_i_q <= {y_i_q, y_i}; 168 | w_i_q <= {w_i_q, w_i}; 169 | nop_i_q <= {nop_i_q, nop_i}; 170 | end 171 | end 172 | logic [64:0] addexp, subexp; 173 | logic [127:0] pp; 174 | logic [65:0] rexp; 175 | always_ff @(posedge clk_i) begin 176 | if (ce_i) begin 177 | addexp = nop_i_q[PD-1] ? x_i_q[PD-1] : 178 | (x_i_q[PD-1] + y_i_q[PD-1]); 179 | if (addexp[64]) begin 180 | addexp -= 1 << 64; 181 | addexp += 32'hffffffff; 182 | end 183 | if (addexp[64]) begin 184 | addexp -= 1 << 64; 185 | addexp += 32'hffffffff; 186 | end 187 | subexp = nop_i_q[PD-1] ? 
y_i_q[PD-1] : 188 | (x_i_q[PD-1] - y_i_q[PD-1]); 189 | if (subexp[64]) begin 190 | subexp += 1 << 64; 191 | subexp -= 32'hffffffff; 192 | end 193 | if (subexp[64]) begin 194 | subexp += 1 << 64; 195 | subexp -= 32'hffffffff; 196 | end 197 | pp = subexp * (nop_i_q[PD-1] ? 1 : w_i_q[PD-1]); 198 | rexp = pp[63:0] - pp[96+:32] + {pp[64+:32],32'h0} - pp[64+:32]; 199 | if (rexp[65]) begin 200 | rexp += 1 << 64; 201 | rexp -= 32'hffffffff; 202 | end 203 | else if (rexp[64]) begin 204 | rexp -= 1 << 64; 205 | rexp += 32'hffffffff; 206 | end 207 | if (v_q[PD-1] && (addexp!=x_o || rexp!=y_o)) begin 208 | $display("v=%b x_i=%x y_i=%x w_i=%x nop_i=%d x_o=%x y_o=%x x_o_exp=%x y_o_exp=%x ERROR", 209 | v_q[PD-1], 210 | x_i_q[PD-1],y_i_q[PD-1], 211 | w_i_q[PD-1],nop_i_q[PD-1], 212 | x_o,y_o,addexp,rexp); 213 | $finish; 214 | end 215 | else begin 216 | `ifdef NEVER 217 | $display("v=%b x_i=%x y_i=%x w_i=%x nop_i=%d x_o=%x y_o=%x x_o_exp=%x y_o_exp=%x", 218 | v_q[PD-1], 219 | x_i_q[PD-1],y_i_q[PD-1], 220 | w_i_q[PD-1],nop_i_q[PD-1], 221 | x_o,y_o,addexp,rexp); 222 | `endif 223 | end 224 | end 225 | end 226 | `endif 227 | 228 | endmodule 229 | -------------------------------------------------------------------------------- /rtl/dsp48e2/math_pkg.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Shared constants for the butterfly arithmetic blocks: the butterfly MODE
// encodings and the pipeline depth of each building block.
package math_pkg;

  // Values accepted by the butterfly MODE parameter.
  localparam unsigned BUTTERFLY_GENERIC = 0;
  localparam unsigned BUTTERFLY_W0      = 1;
  localparam unsigned BUTTERFLY_W0_W2   = 2;

  // Pipeline depths of the individual blocks. The composite depths below
  // are sums of these.
  localparam unsigned PIPE_DEPTH_MODADDSUB = 6;
  localparam unsigned PIPE_DEPTH_MUL64X64  = 14;
  localparam unsigned PIPE_DEPTH_RED128T64 = 8;
  // Multiply followed by reduction.
  localparam unsigned PIPE_DEPTH_MULRED    = PIPE_DEPTH_MUL64X64 + PIPE_DEPTH_RED128T64;
  // Generic butterfly: add/sub, then multiply-reduce.
  localparam unsigned PIPE_DEPTH_BUTTERFLY = PIPE_DEPTH_MODADDSUB + PIPE_DEPTH_MULRED;

  // BUTTERFLY_W0: add/sub only.
  localparam unsigned PIPE_DEPTH_BUTTERFLY_W0 = PIPE_DEPTH_MODADDSUB;

  // BUTTERFLY_W0_W2: add/sub, then reduction (no multiplier).
  localparam unsigned PIPE_DEPTH_BUTTERFLY_W0_W2 = PIPE_DEPTH_MODADDSUB + PIPE_DEPTH_RED128T64;

endpackage

--------------------------------------------------------------------------------
/rtl/dsp48e2/modaddsub.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module modaddsub 7 | import math_pkg::*; 8 | #( 9 | parameter NO_DSP_ADD64 = 0, 10 | parameter NO_DSP_SUB64 = 0, 11 | parameter NO_DSP_ADDSUB64 = 0, 12 | parameter CANONICAL = 0 13 | ) 14 | ( 15 | input logic rst_ni, 16 | input logic clk_i, 17 | input ce_i, 18 | input nop_i, 19 | 20 | input logic [63:0] x_i, 21 | input logic [63:0] y_i, 22 | 23 | output logic [63:0] x_add_y_o, 24 | output logic [63:0] x_sub_y_o 25 | ); 26 | 27 | logic nop_q; 28 | always_ff @(posedge clk_i) begin 29 | if (ce_i) begin 30 | nop_q <= nop_i; 31 | end 32 | end 33 | 34 | // 35 | // add 36 | // 37 | 38 | logic [47:0] add0_p; 39 | logic add0_carry, add0_pdetect; 40 | dsp_modaddsub 41 | #( .AB_DUAL_REG (0), 42 | .C_DUAL_REG (0), 43 | .USE_CARRY (0), 44 | .PATTERN ({32'h00000000, 16'h0000}), 45 | .MASK ({32'h00000000, 16'hffff}), 46 | .NO_DSP (NO_DSP_ADD64) ) 47 | dsp_add0 48 | ( 49 | .rst_ni, 50 | .clk_i, 51 | .ce_i, 52 | .AB_i({y_i[0+:32],16'h0}), 53 | .C_i({x_i[0+:32],16'h0}), 54 | .sub_i(1'b0), 55 | .zero_ab_i(nop_i), 56 | .zero_c_i(1'b0), 57 | .CARRYCASCIN_i(), 58 | .P_o(add0_p), 59 | .CARRYCASCOUT_o(add0_carry), 60 | .PATTERNDETECT_o(add0_pdetect) 61 | ); 62 | 63 | logic [47:0] add1_p; 64 | logic add1_pdetect; 65 | dsp_modaddsub 66 | #( .AB_DUAL_REG (1), 67 | .C_DUAL_REG (1), 68 | .USE_CARRY (1), 69 | .PATTERN ({16'h0000, 32'hffffffff}), 70 | .MASK ({16'hffff, 32'h00000000}), 71 | .NO_DSP (NO_DSP_ADD64) ) 72 | dsp_add1 73 | ( 74 | .rst_ni, 75 | .clk_i, 76 | .ce_i, 77 | .AB_i({16'h0,y_i[32+:32]}), 78 | .C_i({16'h0,x_i[32+:32]}), 79 | .sub_i(1'b0), 80 | .zero_ab_i(nop_q), 81 | .zero_c_i(1'b0), 82 | .CARRYCASCIN_i(add0_carry), 83 | .P_o(add1_p), 84 | .CARRYCASCOUT_o(), 85 | .PATTERNDETECT_o(add1_pdetect) 86 | ); 87 | 88 | logic addmod_nop, addmod_nop_q; 89 | logic add0_pdetect_q, addmod_2x, addmod_2x_q; 90 | always_comb begin 91 | addmod_nop = add1_p[32]==0; 92 | addmod_2x = add1_pdetect && !add0_pdetect_q; 93 | end 94 | always_ff 
@(posedge clk_i) begin 95 | if (ce_i) begin 96 | add0_pdetect_q <= add0_pdetect; 97 | addmod_nop_q <= addmod_nop; 98 | addmod_2x_q <= addmod_2x; 99 | end 100 | end 101 | 102 | logic [47:0] radd0_p; 103 | logic radd0_carry; 104 | logic addmod0_pdetect; 105 | dsp_modaddsub 106 | #( .AB_DUAL_REG (0), 107 | .C_DUAL_REG (1), 108 | .USE_CARRY (0), 109 | .PATTERN ({32'h00000000, 16'h0000}), 110 | .MASK ({32'h00000000, 16'hffff}), 111 | .NO_DSP (NO_DSP_ADDSUB64) ) 112 | dsp_addmod0 113 | ( 114 | .rst_ni, 115 | .clk_i, 116 | .ce_i, 117 | .AB_i({addmod_2x ? 32'hfffffffe : 32'hffffffff,16'h0}), 118 | .C_i({add0_p[47:16],16'h0}), 119 | .sub_i(1'b0), 120 | .zero_ab_i(addmod_nop), 121 | .zero_c_i(1'b0), 122 | .CARRYCASCIN_i(), 123 | .P_o(radd0_p), 124 | .CARRYCASCOUT_o(radd0_carry), 125 | .PATTERNDETECT_o(addmod0_pdetect) 126 | ); 127 | 128 | logic [47:0] radd1_p; 129 | logic addmod1_pdetect; 130 | dsp_modaddsub 131 | #( .AB_DUAL_REG (0), 132 | .C_DUAL_REG (1), 133 | .USE_CARRY (1), 134 | .PATTERN ({16'h0000, 32'hffffffff}), 135 | .MASK ({16'hffff, 32'h00000000}), 136 | .NO_DSP (NO_DSP_ADDSUB64) ) 137 | dsp_addmod1 138 | ( 139 | .rst_ni, 140 | .clk_i, 141 | .ce_i, 142 | .AB_i(addmod_2x_q ? 
48'h1 : 48'h0), 143 | .C_i({16'h0,add1_p[31:0]}), 144 | .sub_i(1'b0), 145 | .zero_ab_i(addmod_nop_q), 146 | .zero_c_i(1'b0), 147 | .CARRYCASCIN_i(radd0_carry), 148 | .P_o(radd1_p), 149 | .CARRYCASCOUT_o(), 150 | .PATTERNDETECT_o(addmod1_pdetect) 151 | ); 152 | 153 | logic [47:0] radd0_q; 154 | logic [31:0] radd0_dec_q; 155 | logic addmod0_pdetect_q; 156 | always_ff @(posedge clk_i) begin 157 | if (ce_i) begin 158 | addmod0_pdetect_q <= addmod0_pdetect; 159 | radd0_q <= radd0_p; 160 | radd0_dec_q <= radd0_p[47:16] + 32'hffffffff; 161 | end 162 | end 163 | 164 | always_comb begin 165 | if (CANONICAL && addmod1_pdetect && !addmod0_pdetect_q) begin 166 | x_add_y_o = {32'h00000000,radd0_dec_q}; 167 | end 168 | else begin 169 | x_add_y_o = {radd1_p[31:0],radd0_q[47:16]}; 170 | end 171 | end 172 | 173 | 174 | // 175 | // sub 176 | // 177 | 178 | logic [47:0] sub0_p; 179 | logic sub0_carry, sub0_pdetect; 180 | dsp_modaddsub 181 | #( .AB_DUAL_REG (0), 182 | .C_DUAL_REG (0), 183 | .USE_CARRY (0), 184 | .PATTERN ({32'hffffffff, 16'h0000}), 185 | .MASK ({32'h00000000, 16'hffff}), 186 | .NO_DSP (NO_DSP_SUB64) ) 187 | dsp_sub0 188 | ( 189 | .rst_ni, 190 | .clk_i, 191 | .ce_i, 192 | .AB_i({y_i[0+:32],16'h0}), 193 | .C_i({x_i[0+:32],16'h0}), 194 | .sub_i(!nop_i), 195 | .zero_ab_i(1'b0), 196 | .zero_c_i(nop_i), 197 | .CARRYCASCIN_i(), 198 | .P_o(sub0_p), 199 | .CARRYCASCOUT_o(sub0_carry), 200 | .PATTERNDETECT_o(sub0_pdetect) 201 | ); 202 | 203 | logic [47:0] sub1_p; 204 | logic sub1_pdetect; 205 | dsp_modaddsub 206 | #( .AB_DUAL_REG (1), 207 | .C_DUAL_REG (1), 208 | .USE_CARRY (1), 209 | .PATTERN ({16'h0000, 32'h00000000}), 210 | .MASK ({16'hffff, 32'h00000000}), 211 | .NO_DSP (NO_DSP_SUB64) ) 212 | dsp_sub1 213 | ( 214 | .rst_ni, 215 | .clk_i, 216 | .ce_i, 217 | .AB_i({16'h0,y_i[32+:32]}), 218 | .C_i({16'h0,x_i[32+:32]}), 219 | .sub_i(!nop_q), 220 | .zero_ab_i(1'b0), 221 | .zero_c_i(nop_q), 222 | .CARRYCASCIN_i(sub0_carry), 223 | .P_o(sub1_p), 224 | .CARRYCASCOUT_o(), 225 | 
.PATTERNDETECT_o(sub1_pdetect) 226 | ); 227 | 228 | logic submod_nop, submod_nop_q; 229 | logic sub0_pdetect_q, submod_2x, submod_2x_q; 230 | always_comb begin 231 | submod_nop = sub1_p[32]==0; 232 | submod_2x = sub1_pdetect && !sub0_pdetect_q; 233 | end 234 | always_ff @(posedge clk_i) begin 235 | if (ce_i) begin 236 | sub0_pdetect_q <= sub0_pdetect; 237 | submod_nop_q <= submod_nop; 238 | submod_2x_q <= submod_2x; 239 | end 240 | end 241 | 242 | logic [47:0] rsub0_p; 243 | logic rsub0_carry; 244 | logic submod0_pdetect; 245 | dsp_modaddsub 246 | #( .AB_DUAL_REG (0), 247 | .C_DUAL_REG (1), 248 | .USE_CARRY (0), 249 | .PATTERN ({32'h00000000, 16'h0000}), 250 | .MASK ({32'h00000000, 16'hffff}), 251 | .NO_DSP (NO_DSP_ADDSUB64) ) 252 | dsp_submod0 253 | ( 254 | .rst_ni, 255 | .clk_i, 256 | .ce_i, 257 | .AB_i({submod_2x ? 32'hfffffffe : 32'hffffffff,16'h0}), 258 | .C_i({sub0_p[47:16],16'h0}), 259 | .sub_i(!submod_nop), 260 | .zero_ab_i(submod_nop), 261 | .zero_c_i(1'b0), 262 | .CARRYCASCIN_i(), 263 | .P_o(rsub0_p), 264 | .CARRYCASCOUT_o(rsub0_carry), 265 | .PATTERNDETECT_o(submod0_pdetect) 266 | ); 267 | 268 | logic [47:0] rsub1_p; 269 | logic submod1_pdetect; 270 | dsp_modaddsub 271 | #( .AB_DUAL_REG (0), 272 | .C_DUAL_REG (1), 273 | .USE_CARRY (1), 274 | .PATTERN ({16'h0000, 32'hffffffff}), 275 | .MASK ({16'hffff, 32'h00000000}), 276 | .NO_DSP (NO_DSP_ADDSUB64) ) 277 | dsp_submod1 278 | ( 279 | .rst_ni, 280 | .clk_i, 281 | .ce_i, 282 | .AB_i(submod_2x_q ? 
48'h1 : 48'h0), 283 | .C_i({16'h0,sub1_p[31:0]}), 284 | .sub_i(!submod_nop_q), 285 | .zero_ab_i(submod_nop_q), 286 | .zero_c_i(1'b0), 287 | .CARRYCASCIN_i(rsub0_carry), 288 | .P_o(rsub1_p), 289 | .CARRYCASCOUT_o(), 290 | .PATTERNDETECT_o(submod1_pdetect) 291 | ); 292 | 293 | logic [47:0] rsub0_q; 294 | logic [31:0] rsub0_dec_q; 295 | logic submod0_pdetect_q; 296 | always_ff @(posedge clk_i) begin 297 | if (ce_i) begin 298 | submod0_pdetect_q <= submod0_pdetect; 299 | rsub0_q <= rsub0_p; 300 | rsub0_dec_q <= rsub0_p[47:16] + 32'hffffffff; 301 | end 302 | end 303 | 304 | always_comb begin 305 | if (CANONICAL && submod1_pdetect && !submod0_pdetect_q) begin 306 | x_sub_y_o = {32'h00000000,rsub0_dec_q}; 307 | end 308 | else begin 309 | x_sub_y_o = {rsub1_p[31:0],rsub0_q[47:16]}; 310 | end 311 | end 312 | 313 | `ifndef SYNTHESIS 314 | localparam [63:0] M = 64'hffff_ffff_0000_0001; 315 | logic [PIPE_DEPTH_MODADDSUB-1:0][63:0] x_q, y_q; 316 | logic [PIPE_DEPTH_MODADDSUB-1:0] nop_i_q; 317 | logic [PIPE_DEPTH_MODADDSUB-1:0] v_q; 318 | always_ff @(posedge clk_i) begin 319 | if (!rst_ni) begin 320 | v_q <= 0; 321 | end 322 | else if (ce_i) begin 323 | v_q <= {v_q, 1'b1}; 324 | x_q <= {x_q, x_i}; 325 | y_q <= {y_q, y_i}; 326 | nop_i_q <= {nop_i_q, nop_i}; 327 | end 328 | end 329 | logic [64:0] addexp, subexp; 330 | always_ff @(posedge clk_i) begin 331 | if (ce_i) begin 332 | addexp = nop_i_q[PIPE_DEPTH_MODADDSUB-1] ? x_q[PIPE_DEPTH_MODADDSUB-1] : 333 | (x_q[PIPE_DEPTH_MODADDSUB-1] + y_q[PIPE_DEPTH_MODADDSUB-1]); 334 | if (addexp[64]) begin 335 | addexp -= 1 << 64; 336 | addexp += 32'hffffffff; 337 | end 338 | if (addexp[64]) begin 339 | addexp -= 1 << 64; 340 | addexp += 32'hffffffff; 341 | end 342 | if (CANONICAL) begin 343 | addexp %= M; 344 | end 345 | subexp = nop_i_q[PIPE_DEPTH_MODADDSUB-1] ? 
y_q[PIPE_DEPTH_MODADDSUB-1] : 346 | (x_q[PIPE_DEPTH_MODADDSUB-1] - y_q[PIPE_DEPTH_MODADDSUB-1]); 347 | if (subexp[64]) begin 348 | subexp += 1 << 64; 349 | subexp -= 32'hffffffff; 350 | end 351 | if (subexp[64]) begin 352 | subexp += 1 << 64; 353 | subexp -= 32'hffffffff; 354 | end 355 | if (CANONICAL) begin 356 | subexp %= M; 357 | end 358 | if (v_q[PIPE_DEPTH_MODADDSUB-1] && (addexp!=x_add_y_o || subexp!=x_sub_y_o)) begin 359 | $display("v=%b x=%x y=%x nop=%d addact=%x addexp=%x subact=%x subexp=%x ERROR", 360 | v_q[PIPE_DEPTH_MODADDSUB-1], 361 | x_q[PIPE_DEPTH_MODADDSUB-1],y_q[PIPE_DEPTH_MODADDSUB-1], 362 | nop_i_q[PIPE_DEPTH_MODADDSUB-1], 363 | x_add_y_o,addexp,x_sub_y_o,subexp); 364 | $finish; 365 | end 366 | else begin 367 | `ifdef NEVER 368 | $display("v=%b x=%x y=%x nop=%d addact=%x addexp=%x subact=%x subexp=%x", 369 | v_q[PIPE_DEPTH_MODADDSUB-1], 370 | x_q[PIPE_DEPTH_MODADDSUB-1],y_q[PIPE_DEPTH_MODADDSUB-1], 371 | nop_i_q[PIPE_DEPTH_MODADDSUB-1], 372 | x_add_y_o,addexp,x_sub_y_o,subexp); 373 | `endif 374 | end 375 | end 376 | end 377 | `endif 378 | 379 | endmodule 380 | 381 | 382 | module dsp_modaddsub 383 | #( 384 | parameter AB_DUAL_REG = 0, 385 | parameter C_DUAL_REG = 0, 386 | parameter USE_CARRY = 0, 387 | parameter [47:0] PATTERN = 0, 388 | parameter [47:0] MASK = 0, 389 | parameter NO_DSP = 0 390 | ) 391 | ( 392 | input logic rst_ni, 393 | input logic clk_i, 394 | input logic ce_i, 395 | input logic [47:0] AB_i, 396 | input logic [47:0] C_i, 397 | input logic sub_i, 398 | input logic zero_ab_i, 399 | input logic zero_c_i, 400 | input logic CARRYCASCIN_i, 401 | output logic [47:0] P_o, 402 | output logic CARRYCASCOUT_o, 403 | output logic PATTERNDETECT_o 404 | ); 405 | 406 | logic signed [47:0] c0_q; 407 | always_ff @(posedge clk_i) begin 408 | if (ce_i) begin 409 | c0_q <= C_i; 410 | end 411 | end 412 | 413 | if (NO_DSP) begin 414 | 415 | logic signed [47:0] ab0_q; 416 | always_ff @(posedge clk_i) begin 417 | if (ce_i) begin 418 | ab0_q <= AB_i; 
419 | end 420 | end 421 | 422 | logic signed [47:0] ab_q; 423 | logic signed [47:0] c_q; 424 | logic sub_q, zero_ab_q, zero_c_q; 425 | always_ff @(posedge clk_i) begin 426 | if (ce_i) begin 427 | ab_q <= AB_DUAL_REG ? ab0_q : AB_i; 428 | c_q <= C_DUAL_REG ? c0_q : C_i; 429 | sub_q <= sub_i; 430 | zero_ab_q <= zero_ab_i; 431 | zero_c_q <= zero_c_i; 432 | end 433 | end 434 | 435 | logic signed [48:0] p_q; 436 | always_ff @(posedge clk_i) begin 437 | if (ce_i) begin 438 | p_q <= sub_q ? ((zero_c_q ? 0 : c_q) - (zero_ab_q ? 0 : ab_q) - (USE_CARRY & CARRYCASCIN_i)) 439 | : ((zero_c_q ? 0 : c_q) + (zero_ab_q ? 0 : ab_q) + (USE_CARRY & CARRYCASCIN_i)); 440 | end 441 | end 442 | 443 | assign P_o = p_q[47:0]; 444 | assign CARRYCASCOUT_o = p_q[48]; 445 | assign PATTERNDETECT_o = (p_q[47:0] & ~MASK) == (PATTERN & ~MASK); 446 | 447 | end 448 | else begin 449 | 450 | logic [3:0] alumode; 451 | logic [4:0] inmode; 452 | logic [8:0] opmode; 453 | logic [47:0] dsp48e2_p; 454 | logic dsp48e2_carrycascout; 455 | logic dsp48e2_patterndetect; 456 | 457 | always_comb begin 458 | alumode = sub_i ? 4'b0011 : 4'b0000; // Z - (W + X + Y + CIN) : Z + (W + X + Y + CIN) 459 | inmode = AB_DUAL_REG ? 5'b00000 : 5'b10001; 460 | opmode = { 461 | 2'b00, // W <- 0 462 | zero_c_i ? 3'b000: 3'b011, // Z <- 0 : C 463 | 2'b00, // Y <- 0 464 | zero_ab_i ? 2'b00 : 2'b11 // X <- 0 : A:B 465 | }; 466 | end 467 | 468 | DSP48E2 469 | #( 470 | .ACASCREG(), 471 | .ADREG(), 472 | .ALUMODEREG(), 473 | .AMULTSEL(), 474 | .AREG(AB_DUAL_REG ? 2 : 1), 475 | .AUTORESET_PATDET(), 476 | .AUTORESET_PRIORITY(), 477 | .A_INPUT(), 478 | .BCASCREG(), 479 | .BMULTSEL(), 480 | .BREG(AB_DUAL_REG ? 
2 : 1), 481 | .B_INPUT(), 482 | .CARRYINREG(), 483 | .CARRYINSELREG(), 484 | .CREG(), 485 | .DREG(), 486 | .INMODEREG(), 487 | .IS_ALUMODE_INVERTED(), 488 | .IS_CARRYIN_INVERTED(), 489 | .IS_CLK_INVERTED(), 490 | .IS_INMODE_INVERTED(), 491 | .IS_OPMODE_INVERTED(), 492 | .IS_RSTALLCARRYIN_INVERTED(), 493 | .IS_RSTALUMODE_INVERTED(), 494 | .IS_RSTA_INVERTED(), 495 | .IS_RSTB_INVERTED(), 496 | .IS_RSTCTRL_INVERTED(), 497 | .IS_RSTC_INVERTED(), 498 | .IS_RSTD_INVERTED(), 499 | .IS_RSTINMODE_INVERTED(), 500 | .IS_RSTM_INVERTED(), 501 | .IS_RSTP_INVERTED(), 502 | .MASK(MASK), 503 | .MREG(0), 504 | .OPMODEREG(), 505 | .PATTERN(PATTERN), 506 | .PREADDINSEL(), 507 | .PREG(), 508 | .RND(), 509 | .SEL_MASK(), 510 | .SEL_PATTERN(), 511 | .USE_MULT("NONE"), 512 | .USE_PATTERN_DETECT("PATDET"), 513 | .USE_SIMD(), 514 | .USE_WIDEXOR(), 515 | .XORSIMD() 516 | ) 517 | dsp 518 | ( 519 | .ACOUT(), 520 | .BCOUT(), 521 | .CARRYCASCOUT(dsp48e2_carrycascout), 522 | .CARRYOUT(), 523 | .MULTSIGNOUT(), 524 | .OVERFLOW(), 525 | .P(dsp48e2_p), 526 | .PATTERNBDETECT(), 527 | .PATTERNDETECT(dsp48e2_patterndetect), 528 | .PCOUT(), 529 | .UNDERFLOW(), 530 | .XOROUT(), 531 | 532 | .A(AB_i[47:18]), 533 | .ACIN(30'h0), 534 | .ALUMODE(alumode[3:0]), 535 | .B(AB_i[17:0]), 536 | .BCIN(18'h0), 537 | .C(C_DUAL_REG ? c0_q : C_i), 538 | .CARRYCASCIN(CARRYCASCIN_i), 539 | .CARRYIN(1'b0), 540 | .CARRYINSEL(USE_CARRY ? 
3'b010: 3'b000), // CARRYSCANIN : CARRYIN 541 | .CEA1(ce_i), 542 | .CEA2(ce_i), 543 | .CEAD(ce_i), 544 | .CEALUMODE(ce_i), 545 | .CEB1(ce_i), 546 | .CEB2(ce_i), 547 | .CEC(ce_i), 548 | .CECARRYIN(ce_i), 549 | .CECTRL(ce_i), 550 | .CED(ce_i), 551 | .CEINMODE(ce_i), 552 | .CEM(ce_i), 553 | .CEP(ce_i), 554 | .CLK(clk_i), 555 | .D(27'h0), 556 | .INMODE(inmode[4:0]), 557 | .MULTSIGNIN(1'b0), 558 | .OPMODE(opmode[8:0]), 559 | .PCIN(), 560 | .RSTA(!rst_ni), 561 | .RSTALLCARRYIN(!rst_ni), 562 | .RSTALUMODE(!rst_ni), 563 | .RSTB(!rst_ni), 564 | .RSTC(!rst_ni), 565 | .RSTCTRL(!rst_ni), 566 | .RSTD(!rst_ni), 567 | .RSTINMODE(!rst_ni), 568 | .RSTM(!rst_ni), 569 | .RSTP(!rst_ni) 570 | ); 571 | 572 | assign P_o = dsp48e2_p; 573 | assign CARRYCASCOUT_o = dsp48e2_carrycascout; 574 | assign PATTERNDETECT_o = dsp48e2_patterndetect; 575 | 576 | end 577 | 578 | endmodule 579 | -------------------------------------------------------------------------------- /rtl/dsp48e2/mul64x64.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module mul64x64 7 | import math_pkg::*; 8 | ( 9 | input logic rst_ni, 10 | input logic clk_i, 11 | input ce_i, 12 | 13 | input logic [63:0] a_i, 14 | input logic [63:0] b_i, 15 | 16 | output logic [127:0] p_o 17 | ); 18 | 19 | // pipe along inputs ... 
many of these flops will be optimized away 20 | logic [PIPE_DEPTH_MUL64X64:0][63:0] a, b; 21 | logic [PIPE_DEPTH_MUL64X64-1:0][63:0] a_q, b_q; 22 | always_comb begin 23 | a = {a_q, a_i}; 24 | b = {b_q, b_i}; 25 | end 26 | always_ff @(posedge clk_i) begin 27 | if (ce_i) begin 28 | a_q <= a; 29 | b_q <= b; 30 | end 31 | end 32 | 33 | 34 | logic [2:0][3:0][47:0] dsp_p, dsp_p_q, dsp_pc; 35 | logic [2:0][3:0][29:0] dsp_ac; 36 | always_ff @(posedge clk_i) begin 37 | if (ce_i) begin 38 | dsp_p_q <= dsp_p; 39 | end 40 | end 41 | 42 | dsp_mul64x64 43 | #( .A_DUAL_REG (0), 44 | .B_DUAL_REG (0), 45 | .A_CASCADE (0), 46 | .Z_SEL_17BITSHIFT (0), 47 | .W_SEL_C (0) ) 48 | dsp_0_0 49 | ( 50 | .rst_ni, 51 | .clk_i, 52 | .ce_i, 53 | .A_i({4'b0,a[0][0*26 +:26]}), 54 | .ACIN_i(30'h0), 55 | .B_i({1'b0,b[0][0*17 +:17]}), 56 | .C_i(), 57 | .PCIN_i(), 58 | .ACOUT_o(dsp_ac[0][0]), 59 | .P_o(dsp_p[0][0]), 60 | .PCOUT_o(dsp_pc[0][0]) 61 | ); 62 | 63 | dsp_mul64x64 64 | #( .A_DUAL_REG (0), 65 | .B_DUAL_REG (1), 66 | .A_CASCADE (1), 67 | .Z_SEL_17BITSHIFT (1), 68 | .W_SEL_C (0) ) 69 | dsp_0_1 70 | ( 71 | .rst_ni, 72 | .clk_i, 73 | .ce_i, 74 | .A_i(30'h0), 75 | .ACIN_i(dsp_ac[0][0]), 76 | .B_i({1'b0,b[0][1*17 +:17]}), 77 | .C_i(), 78 | .PCIN_i(dsp_pc[0][0]), 79 | .ACOUT_o(dsp_ac[0][1]), 80 | .P_o(dsp_p[0][1]), 81 | .PCOUT_o(dsp_pc[0][1]) 82 | ); 83 | 84 | dsp_mul64x64 85 | #( .A_DUAL_REG (0), 86 | .B_DUAL_REG (1), 87 | .A_CASCADE (1), 88 | .Z_SEL_17BITSHIFT (1), 89 | .W_SEL_C (0) ) 90 | dsp_0_2 91 | ( 92 | .rst_ni, 93 | .clk_i, 94 | .ce_i, 95 | .A_i(30'h0), 96 | .ACIN_i(dsp_ac[0][1]), 97 | .B_i({1'b0,b[1][2*17 +:17]}), 98 | .C_i(), 99 | .PCIN_i(dsp_pc[0][1]), 100 | .ACOUT_o(dsp_ac[0][2]), 101 | .P_o(dsp_p[0][2]), 102 | .PCOUT_o(dsp_pc[0][2]) 103 | ); 104 | 105 | dsp_mul64x64 106 | #( .A_DUAL_REG (0), 107 | .B_DUAL_REG (1), 108 | .A_CASCADE (1), 109 | .Z_SEL_17BITSHIFT (1), 110 | .W_SEL_C (0) ) 111 | dsp_0_3 112 | ( 113 | .rst_ni, 114 | .clk_i, 115 | .ce_i, 116 | .A_i(30'h0), 117 | 
.ACIN_i(dsp_ac[0][2]), 118 | .B_i({1'b0,4'b0,b[2][63: 3*17]}), 119 | .C_i(), 120 | .PCIN_i(dsp_pc[0][2]), 121 | .ACOUT_o(dsp_ac[0][3]), 122 | .P_o(dsp_p[0][3]), 123 | .PCOUT_o(dsp_pc[0][3]) 124 | ); 125 | 126 | 127 | 128 | dsp_mul64x64 129 | #( .A_DUAL_REG (1), 130 | .B_DUAL_REG (1), 131 | .A_CASCADE (0), 132 | .Z_SEL_17BITSHIFT (0), 133 | .W_SEL_C (1) ) 134 | dsp_1_0 135 | ( 136 | .rst_ni, 137 | .clk_i, 138 | .ce_i, 139 | .A_i({4'b0,a[3][1*26 +:26]}), 140 | .ACIN_i(30'h0), 141 | .B_i({1'b0,b[3][0*17 +:17]}), 142 | .C_i({{(48-17){1'b0}},dsp_p[0][2][(26-17-1):0],dsp_p_q[0][1][16:(26-17)]}), 143 | .PCIN_i(), 144 | .ACOUT_o(dsp_ac[1][0]), 145 | .P_o(dsp_p[1][0]), 146 | .PCOUT_o(dsp_pc[1][0]) 147 | ); 148 | 149 | dsp_mul64x64 150 | #( .A_DUAL_REG (0), 151 | .B_DUAL_REG (1), 152 | .A_CASCADE (1), 153 | .Z_SEL_17BITSHIFT (1), 154 | .W_SEL_C (1) ) 155 | dsp_1_1 156 | ( 157 | .rst_ni, 158 | .clk_i, 159 | .ce_i, 160 | .A_i(30'h0), 161 | .ACIN_i(dsp_ac[1][0]), 162 | .B_i({1'b0,b[4][1*17 +:17]}), 163 | .C_i({{(48-17){1'b0}},dsp_p[0][3][(26-17-1):0],dsp_p_q[0][2][16:(26-17)]}), 164 | .PCIN_i(dsp_pc[1][0]), 165 | .ACOUT_o(dsp_ac[1][1]), 166 | .P_o(dsp_p[1][1]), 167 | .PCOUT_o(dsp_pc[1][1]) 168 | ); 169 | 170 | dsp_mul64x64 171 | #( .A_DUAL_REG (0), 172 | .B_DUAL_REG (1), 173 | .A_CASCADE (1), 174 | .Z_SEL_17BITSHIFT (1), 175 | .W_SEL_C (1) ) 176 | dsp_1_2 177 | ( 178 | .rst_ni, 179 | .clk_i, 180 | .ce_i, 181 | .A_i(30'h0), 182 | .ACIN_i(dsp_ac[1][1]), 183 | .B_i({1'b0,b[5][2*17 +:17]}), 184 | .C_i({{(48-39){1'b0}},dsp_p_q[0][3][47:(26-17)]}), 185 | .PCIN_i(dsp_pc[1][1]), 186 | .ACOUT_o(dsp_ac[1][2]), 187 | .P_o(dsp_p[1][2]), 188 | .PCOUT_o(dsp_pc[1][2]) 189 | ); 190 | 191 | dsp_mul64x64 192 | #( .A_DUAL_REG (0), 193 | .B_DUAL_REG (1), 194 | .A_CASCADE (1), 195 | .Z_SEL_17BITSHIFT (1), 196 | .W_SEL_C (0) ) 197 | dsp_1_3 198 | ( 199 | .rst_ni, 200 | .clk_i, 201 | .ce_i, 202 | .A_i(30'h0), 203 | .ACIN_i(dsp_ac[1][2]), 204 | .B_i({1'b0,4'b0,b[6][63: 3*17]}), 205 | .C_i(), 206 | 
.PCIN_i(dsp_pc[1][2]), 207 | .ACOUT_o(dsp_ac[1][3]), 208 | .P_o(dsp_p[1][3]), 209 | .PCOUT_o(dsp_pc[1][3]) 210 | ); 211 | 212 | 213 | 214 | dsp_mul64x64 215 | #( .A_DUAL_REG (1), 216 | .B_DUAL_REG (1), 217 | .A_CASCADE (0), 218 | .Z_SEL_17BITSHIFT (0), 219 | .W_SEL_C (1) ) 220 | dsp_2_0 221 | ( 222 | .rst_ni, 223 | .clk_i, 224 | .ce_i, 225 | .A_i({4'b0,{(30-16){1'b0}},a[7][63: 2*26]}), 226 | .ACIN_i(30'h0), 227 | .B_i({1'b0,b[7][0*17 +:17]}), 228 | .C_i({{(48-17){1'b0}},dsp_p[1][2][(26-17-1):0],dsp_p_q[1][1][16:(26-17)]}), 229 | .PCIN_i(), 230 | .ACOUT_o(dsp_ac[2][0]), 231 | .P_o(dsp_p[2][0]), 232 | .PCOUT_o(dsp_pc[2][0]) 233 | ); 234 | 235 | dsp_mul64x64 236 | #( .A_DUAL_REG (0), 237 | .B_DUAL_REG (1), 238 | .A_CASCADE (1), 239 | .Z_SEL_17BITSHIFT (1), 240 | .W_SEL_C (1) ) 241 | dsp_2_1 242 | ( 243 | .rst_ni, 244 | .clk_i, 245 | .ce_i, 246 | .A_i(30'h0), 247 | .ACIN_i(dsp_ac[2][0]), 248 | .B_i({1'b0,b[8][1*17 +:17]}), 249 | .C_i({{(48-17){1'b0}},dsp_p[1][3][(26-17-1):0],dsp_p_q[1][2][16:(26-17)]}), 250 | .PCIN_i(dsp_pc[2][0]), 251 | .ACOUT_o(dsp_ac[2][1]), 252 | .P_o(dsp_p[2][1]), 253 | .PCOUT_o(dsp_pc[2][1]) 254 | ); 255 | 256 | dsp_mul64x64 257 | #( .A_DUAL_REG (0), 258 | .B_DUAL_REG (1), 259 | .A_CASCADE (1), 260 | .Z_SEL_17BITSHIFT (1), 261 | .W_SEL_C (1) ) 262 | dsp_2_2 263 | ( 264 | .rst_ni, 265 | .clk_i, 266 | .ce_i, 267 | .A_i(30'h0), 268 | .ACIN_i(dsp_ac[2][1]), 269 | .B_i({1'b0,b[9][2*17 +:17]}), 270 | .C_i({{(48-39){1'b0}},dsp_p_q[1][3][47:(26-17)]}), 271 | .PCIN_i(dsp_pc[2][1]), 272 | .ACOUT_o(dsp_ac[2][2]), 273 | .P_o(dsp_p[2][2]), 274 | .PCOUT_o(dsp_pc[2][2]) 275 | ); 276 | 277 | dsp_mul64x64 278 | #( .A_DUAL_REG (0), 279 | .B_DUAL_REG (1), 280 | .A_CASCADE (1), 281 | .Z_SEL_17BITSHIFT (1), 282 | .W_SEL_C (0) ) 283 | dsp_2_3 284 | ( 285 | .rst_ni, 286 | .clk_i, 287 | .ce_i, 288 | .A_i(30'h0), 289 | .ACIN_i(dsp_ac[2][2]), 290 | .B_i({1'b0,4'b0,b[10][63: 3*17]}), 291 | .C_i(), 292 | .PCIN_i(dsp_pc[2][2]), 293 | .ACOUT_o(dsp_ac[2][3]), 294 | 
.P_o(dsp_p[2][3]), 295 | .PCOUT_o(dsp_pc[2][3]) 296 | ); 297 | 298 | 299 | logic [14:0][127:0] p, p_q; 300 | always_comb begin 301 | p[0] = 0; 302 | p[1] = p_q[0]; 303 | p[2] = p_q[1]; 304 | p[3] = p_q[2] | (dsp_p[0][0][16:0] << 0); 305 | p[4] = p_q[3] | (dsp_p[0][1][(26-17-1):0] << 17); 306 | p[5] = p_q[4]; 307 | p[6] = p_q[5]; 308 | p[7] = p_q[6] | (dsp_p[1][0][16:0] << 26); 309 | p[8] = p_q[7] | (dsp_p[1][1][(26-17-1):0] << 43); 310 | p[9] = p_q[8]; 311 | p[10] = p_q[9]; 312 | p[11] = p_q[10] | (dsp_p[2][0][16:0] << 52); 313 | p[12] = p_q[11] | (dsp_p[2][1][16:0] << 69); 314 | p[13] = p_q[12] | (dsp_p[2][2][16:0] << 86); 315 | p[14] = p_q[13] | (dsp_p[2][3] << 103); 316 | p_o = p[14]; 317 | end 318 | always_ff @(posedge clk_i) begin 319 | if (ce_i) begin 320 | p_q <= p; 321 | end 322 | end 323 | 324 | `ifndef SYNTHESIS 325 | logic [PIPE_DEPTH_MUL64X64-1:0] v_q; 326 | always_ff @(posedge clk_i) begin 327 | if (!rst_ni) begin 328 | v_q <= 0; 329 | end 330 | else if (glbl.GSR) begin 331 | v_q <= 0; 332 | end 333 | else if (ce_i) begin 334 | v_q <= {v_q, 1'b1}; 335 | end 336 | end 337 | logic [127:0] pexp; 338 | always_ff @(posedge clk_i) begin 339 | if (ce_i) begin 340 | pexp = a[PIPE_DEPTH_MUL64X64]*b[PIPE_DEPTH_MUL64X64]; 341 | if (v_q[PIPE_DEPTH_MUL64X64-1] && pexp!=p_o) begin 342 | $display("v=%b a=%x b=%x pact=%x pexp=%x ERROR", 343 | v_q[PIPE_DEPTH_MUL64X64-1], 344 | a[PIPE_DEPTH_MUL64X64],b[PIPE_DEPTH_MUL64X64],p_o,pexp); 345 | $stop; 346 | end 347 | else begin 348 | `ifdef NEVER 349 | $display("v=%b a=%x b=%x pact=%x pexp=%x", 350 | v_q[PIPE_DEPTH_MUL64X64-1], 351 | a[PIPE_DEPTH_MUL64X64],b[PIPE_DEPTH_MUL64X64],p_o,pexp); 352 | `endif 353 | end 354 | end 355 | end 356 | `endif 357 | 358 | endmodule 359 | 360 | 361 | //`define USE_DSP_ABSTRACT 362 | 363 | module dsp_mul64x64 364 | #( 365 | parameter A_DUAL_REG = 0, 366 | parameter B_DUAL_REG = 0, 367 | parameter A_CASCADE = 0, 368 | parameter Z_SEL_17BITSHIFT = 0, 369 | parameter W_SEL_C = 0 370 | ) 371 | ( 
372 | input logic rst_ni, 373 | input logic clk_i, 374 | input logic ce_i, 375 | input logic [29:0] A_i, 376 | input logic [29:0] ACIN_i, 377 | input logic [17:0] B_i, 378 | input logic [47:0] C_i, 379 | input logic [47:0] PCIN_i, 380 | output logic [29:0] ACOUT_o, 381 | output logic [47:0] P_o, 382 | output logic [47:0] PCOUT_o 383 | ); 384 | 385 | `ifdef USE_DSP_ABSTRACT 386 | 387 | logic signed [26:0] a0_q; 388 | logic signed [17:0] b0_q; 389 | always_ff @(posedge clk_i) begin 390 | if (ce_i) begin 391 | a0_q <= A_CASCADE ? ACIN_i : A_i; 392 | b0_q <= B_i; 393 | end 394 | end 395 | 396 | logic signed [26:0] a_q; 397 | logic signed [17:0] b_q; 398 | always_ff @(posedge clk_i) begin 399 | if (ce_i) begin 400 | a_q <= A_DUAL_REG ? a0_q : A_CASCADE ? ACIN_i : A_i; 401 | b_q <= B_DUAL_REG ? b0_q : B_i; 402 | end 403 | end 404 | 405 | logic signed [44:0] c_q, m_q; 406 | always_ff @(posedge clk_i) begin 407 | if (ce_i) begin 408 | m_q <= a_q * b_q; 409 | c_q <= C_i; 410 | end 411 | end 412 | 413 | logic signed [47:0] p_q; 414 | always_ff @(posedge clk_i) begin 415 | if (ce_i) begin 416 | p_q <= m_q + (Z_SEL_17BITSHIFT ? (PCIN_i>>17) : 0) + (W_SEL_C ? c_q : 0); 417 | end 418 | end 419 | 420 | `endif 421 | 422 | logic [3:0] alumode; 423 | logic [4:0] inmode; 424 | logic [8:0] opmode; 425 | logic [47:0] dsp48e2_p, dsp48e2_pcout; 426 | logic [29:0] dsp48e2_acout; 427 | 428 | always_comb begin 429 | alumode = 4'b0000; 430 | inmode = (A_DUAL_REG ? 5'b00000 : 5'b00001) | (B_DUAL_REG ? 5'b00000 : 5'b10000); 431 | opmode = { 432 | W_SEL_C ? 2'b11 : 2'b00, // W <- C : 0 433 | Z_SEL_17BITSHIFT ? 3'b101 : 3'b000, // Z <- PCIN>>17 : 0 434 | 4'b0101 // XY <- M 435 | }; 436 | end 437 | 438 | DSP48E2 439 | #( 440 | .ACASCREG(A_DUAL_REG ? 2 : 1), 441 | .ADREG(), 442 | .ALUMODEREG(), 443 | .AMULTSEL(), 444 | .AREG(A_DUAL_REG ? 2 : 1), 445 | .AUTORESET_PATDET(), 446 | .AUTORESET_PRIORITY(), 447 | .A_INPUT(A_CASCADE ? 
"CASCADE" : "DIRECT"), 448 | .BCASCREG(), 449 | .BMULTSEL(), 450 | .BREG(B_DUAL_REG ? 2 : 1), 451 | .B_INPUT(), 452 | .CARRYINREG(), 453 | .CARRYINSELREG(), 454 | .CREG(), 455 | .DREG(), 456 | .INMODEREG(), 457 | .IS_ALUMODE_INVERTED(), 458 | .IS_CARRYIN_INVERTED(), 459 | .IS_CLK_INVERTED(), 460 | .IS_INMODE_INVERTED(), 461 | .IS_OPMODE_INVERTED(), 462 | .IS_RSTALLCARRYIN_INVERTED(), 463 | .IS_RSTALUMODE_INVERTED(), 464 | .IS_RSTA_INVERTED(), 465 | .IS_RSTB_INVERTED(), 466 | .IS_RSTCTRL_INVERTED(), 467 | .IS_RSTC_INVERTED(), 468 | .IS_RSTD_INVERTED(), 469 | .IS_RSTINMODE_INVERTED(), 470 | .IS_RSTM_INVERTED(), 471 | .IS_RSTP_INVERTED(), 472 | .MASK(), 473 | .MREG(), 474 | .OPMODEREG(), 475 | .PATTERN(), 476 | .PREADDINSEL(), 477 | .PREG(), 478 | .RND(), 479 | .SEL_MASK(), 480 | .SEL_PATTERN(), 481 | .USE_MULT(), 482 | .USE_PATTERN_DETECT(), 483 | .USE_SIMD(), 484 | .USE_WIDEXOR(), 485 | .XORSIMD() 486 | ) 487 | dsp 488 | ( 489 | .ACOUT(dsp48e2_acout), 490 | .BCOUT(), 491 | .CARRYCASCOUT(), 492 | .CARRYOUT(), 493 | .MULTSIGNOUT(), 494 | .OVERFLOW(), 495 | .P(dsp48e2_p), 496 | .PATTERNBDETECT(), 497 | .PATTERNDETECT(), 498 | .PCOUT(dsp48e2_pcout), 499 | .UNDERFLOW(), 500 | .XOROUT(), 501 | 502 | .A(A_i), 503 | .ACIN(ACIN_i), 504 | .ALUMODE(alumode[3:0]), 505 | .B(B_i), 506 | .BCIN(18'h0), 507 | .C(C_i), 508 | .CARRYCASCIN(1'b0), 509 | .CARRYIN(1'b0), 510 | .CARRYINSEL(3'h0), 511 | .CEA1(ce_i), 512 | .CEA2(ce_i), 513 | .CEAD(ce_i), 514 | .CEALUMODE(ce_i), 515 | .CEB1(ce_i), 516 | .CEB2(ce_i), 517 | .CEC(ce_i), 518 | .CECARRYIN(ce_i), 519 | .CECTRL(ce_i), 520 | .CED(ce_i), 521 | .CEINMODE(ce_i), 522 | .CEM(ce_i), 523 | .CEP(ce_i), 524 | .CLK(clk_i), 525 | .D(27'h0), 526 | .INMODE(inmode[4:0]), 527 | .MULTSIGNIN(1'b0), 528 | .OPMODE(opmode[8:0]), 529 | .PCIN(PCIN_i), 530 | .RSTA(!rst_ni), 531 | .RSTALLCARRYIN(!rst_ni), 532 | .RSTALUMODE(!rst_ni), 533 | .RSTB(!rst_ni), 534 | .RSTC(!rst_ni), 535 | .RSTCTRL(!rst_ni), 536 | .RSTD(!rst_ni), 537 | .RSTINMODE(!rst_ni), 538 | 
.RSTM(!rst_ni), 539 | .RSTP(!rst_ni) 540 | ); 541 | 542 | `ifdef USE_DSP_ABSTRACT 543 | wire match = (dsp48e2_p == p_q) && (dsp48e2_pcout == p_q) && (dsp48e2_acout == a_q); 544 | `undef USE_DSP_ABSTRACT // undefine so the macro does not leak into later sources 545 | `endif 546 | 547 | assign ACOUT_o = dsp48e2_acout; 548 | assign P_o = dsp48e2_p; 549 | assign PCOUT_o = dsp48e2_pcout; 550 | 551 | endmodule 552 | -------------------------------------------------------------------------------- /rtl/dsp48e2/mulred.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module mulred 7 | import math_pkg::*; 8 | #( 9 | parameter BFLYDSP = 24 // 24, 16, or 12 10 | ) 11 | ( 12 | input logic rst_ni, 13 | input logic clk_i, 14 | input ce_i, 15 | 16 | input logic [63:0] a_i, 17 | input logic [63:0] b_i, 18 | 19 | output logic [63:0] r_o 20 | ); 21 | 22 | logic [127:0] p; 23 | 24 | mul64x64 mul64x64 25 | ( .rst_ni, 26 | .clk_i, 27 | .ce_i, 28 | .a_i, 29 | .b_i, 30 | .p_o(p) 31 | ); 32 | 33 | red128t64 34 | #( .NO_DSP_ADD32 ( BFLYDSP==24 ? 1 : BFLYDSP==16 ? 1 : BFLYDSP==12 ? 1 : 0 ), 35 | .NO_DSP_SUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 0 : BFLYDSP==12 ? 1 : 0 ), 36 | .NO_DSP_ADDSUB64 ( BFLYDSP==24 ? 0 : BFLYDSP==16 ? 0 : BFLYDSP==12 ? 
1 : 0 ) 37 | ) 38 | red128t64 39 | ( .rst_ni, 40 | .clk_i, 41 | .ce_i, 42 | .p_i(p), 43 | .r_o 44 | ); 45 | 46 | `ifndef SYNTHESIS 47 | logic [PIPE_DEPTH_MULRED-1:0][63:0] a_q, b_q; 48 | logic [PIPE_DEPTH_MULRED-1:0] v_q; 49 | logic [127:0] pp; 50 | always_ff @(posedge clk_i) begin 51 | if (!rst_ni) begin 52 | v_q <= 0; 53 | end 54 | else if (glbl.GSR) begin 55 | v_q <= 0; 56 | end 57 | else if (ce_i) begin 58 | v_q <= {v_q, 1'b1}; 59 | a_q <= {a_q, a_i}; 60 | b_q <= {b_q, b_i}; 61 | end 62 | end 63 | logic [65:0] rexp; 64 | always_ff @(posedge clk_i) begin 65 | if (ce_i) begin 66 | pp = a_q[PIPE_DEPTH_MULRED-1] * b_q[PIPE_DEPTH_MULRED-1]; 67 | rexp = pp[63:0] - pp[96+:32] + {pp[64+:32],32'h0} - pp[64+:32]; 68 | if (rexp[65]) begin 69 | rexp += 1 << 64; 70 | rexp -= 32'hffffffff; 71 | end 72 | else if (rexp[64]) begin 73 | rexp -= 1 << 64; 74 | rexp += 32'hffffffff; 75 | end 76 | if (v_q[PIPE_DEPTH_MULRED-1] && rexp!=r_o) begin 77 | $display("v=%b a=%x b=%x ract=%x rexp=%x ERROR", 78 | v_q[PIPE_DEPTH_MULRED-1], 79 | a_q[PIPE_DEPTH_MULRED-1],b_q[PIPE_DEPTH_MULRED-1],r_o,rexp); 80 | $finish; 81 | end 82 | else begin 83 | `ifdef NEVER 84 | $display("v=%b a=%x b=%x ract=%x rexp=%x", 85 | v_q[PIPE_DEPTH_MULRED-1], 86 | a_q[PIPE_DEPTH_MULRED-1],b_q[PIPE_DEPTH_MULRED-1],r_o,rexp); 87 | `endif 88 | end 89 | end 90 | end 91 | `endif 92 | 93 | endmodule 94 | -------------------------------------------------------------------------------- /rtl/dsp48e2/red128t64.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module red128t64 7 | import math_pkg::*; 8 | #( 9 | parameter NO_DSP_ADD32 = 1, 10 | parameter NO_DSP_SUB64 = 0, 11 | parameter NO_DSP_ADDSUB64 = 0 12 | ) 13 | ( 14 | input logic rst_ni, 15 | input logic clk_i, 16 | input ce_i, 17 | 18 | input logic [127:0] p_i, 19 | 20 | output logic [63:0] r_o 21 | ); 22 | 23 | logic [47:0] dword3_p_dword2; 24 | dsp_red128t64 25 | #( .AB_DUAL_REG (0), 26 | .C_DUAL_REG (0), 27 | .USE_CARRY (0), 28 | .NO_DSP (NO_DSP_ADD32) ) 29 | dsp_dword3_p_dword2 30 | ( 31 | .rst_ni, 32 | .clk_i, 33 | .ce_i, 34 | .AB_i({16'h0,p_i[96+:32]}), 35 | .C_i({16'h0,p_i[64+:32]}), 36 | .sub_i(1'b0), 37 | .nop_i(1'b0), 38 | .CARRYCASCIN_i(), 39 | .P_o(dword3_p_dword2), 40 | .CARRYCASCOUT_o() 41 | ); 42 | 43 | logic [47:0] dword2_p_dword1; 44 | dsp_red128t64 45 | #( .AB_DUAL_REG (0), 46 | .C_DUAL_REG (0), 47 | .USE_CARRY (0), 48 | .NO_DSP (NO_DSP_ADD32) ) 49 | dsp_dword2_p_dword1 50 | ( 51 | .rst_ni, 52 | .clk_i, 53 | .ce_i, 54 | .AB_i({16'h0,p_i[64+:32]}), 55 | .C_i({16'h0,p_i[32+:32]}), 56 | .sub_i(1'b0), 57 | .nop_i(1'b0), 58 | .CARRYCASCIN_i(), 59 | .P_o(dword2_p_dword1), 60 | .CARRYCASCOUT_o() 61 | ); 62 | 63 | logic [31:0] dword0_p1_q; 64 | logic [32:0] dword3_p_dword2_p1_q; 65 | always_ff @(posedge clk_i) begin 66 | if (ce_i) begin 67 | dword0_p1_q <= p_i[0 +: 32]; 68 | dword3_p_dword2_p1_q <= dword3_p_dword2; 69 | end 70 | end 71 | 72 | 73 | logic [47:0] r0_pre; 74 | logic r0_pre_carry; 75 | dsp_red128t64 76 | #( .AB_DUAL_REG (0), 77 | .C_DUAL_REG (1), 78 | .USE_CARRY (0), 79 | .NO_DSP (NO_DSP_SUB64) ) 80 | dsp_sub0 81 | ( 82 | .rst_ni, 83 | .clk_i, 84 | .ce_i, 85 | .AB_i({dword3_p_dword2[31:0],16'h0}), 86 | .C_i({dword0_p1_q,16'h0}), 87 | .sub_i(1'b1), 88 | .nop_i(1'b0), 89 | .CARRYCASCIN_i(), 90 | .P_o(r0_pre), 91 | .CARRYCASCOUT_o(r0_pre_carry) 92 | ); 93 | 94 | logic [47:0] r1_pre; 95 | dsp_red128t64 96 | #( .AB_DUAL_REG (0), 97 | .C_DUAL_REG (1), 98 | .USE_CARRY (1), 99 | .NO_DSP 
(NO_DSP_SUB64) ) 100 | dsp_sub1 101 | ( 102 | .rst_ni, 103 | .clk_i, 104 | .ce_i, 105 | .AB_i({47'h0,dword3_p_dword2_p1_q[32]}), 106 | .C_i({15'h0,dword2_p_dword1[32:0]}), 107 | .sub_i(1'b1), 108 | .nop_i(1'b0), 109 | .CARRYCASCIN_i(r0_pre_carry), 110 | .P_o(r1_pre), 111 | .CARRYCASCOUT_o() 112 | ); 113 | 114 | logic mod_nop, mod_nop_q; 115 | logic mod_sub, mod_sub_q; 116 | 117 | always_comb begin 118 | mod_nop = r1_pre[33:32]==0; 119 | mod_sub = r1_pre[33]; 120 | end 121 | always_ff @(posedge clk_i) begin 122 | if (ce_i) begin 123 | mod_nop_q <= mod_nop; 124 | mod_sub_q <= mod_sub; 125 | end 126 | end 127 | 128 | logic [47:0] r0; 129 | logic r0_carry; 130 | dsp_red128t64 131 | #( .AB_DUAL_REG (0), 132 | .C_DUAL_REG (1), 133 | .USE_CARRY (0), 134 | .NO_DSP (NO_DSP_ADDSUB64) ) 135 | dsp_mod0 136 | ( 137 | .rst_ni, 138 | .clk_i, 139 | .ce_i, 140 | .AB_i({32'hffffffff,16'h0}), 141 | .C_i({r0_pre[47:16],16'h0}), 142 | .sub_i(mod_sub), 143 | .nop_i(mod_nop), 144 | .CARRYCASCIN_i(), 145 | .P_o(r0), 146 | .CARRYCASCOUT_o(r0_carry) 147 | ); 148 | 149 | logic [47:0] r1; 150 | dsp_red128t64 151 | #( .AB_DUAL_REG (0), 152 | .C_DUAL_REG (1), 153 | .USE_CARRY (1), 154 | .NO_DSP (NO_DSP_ADDSUB64) ) 155 | dsp_mod1 156 | ( 157 | .rst_ni, 158 | .clk_i, 159 | .ce_i, 160 | .AB_i(48'h0), 161 | .C_i({16'h0,r1_pre[31:0]}), 162 | .sub_i(mod_sub_q), 163 | .nop_i(mod_nop_q), 164 | .CARRYCASCIN_i(r0_carry), 165 | .P_o(r1), 166 | .CARRYCASCOUT_o() 167 | ); 168 | 169 | logic [47:0] r0_q; 170 | always_ff @(posedge clk_i) begin 171 | if (ce_i) begin 172 | r0_q <= r0; 173 | end 174 | end 175 | 176 | always_comb begin 177 | r_o = {r1[31:0],r0_q[47:16]}; 178 | end 179 | 180 | `ifndef SYNTHESIS 181 | logic [PIPE_DEPTH_RED128T64-1:0][127:0] p_q; 182 | logic [PIPE_DEPTH_RED128T64-1:0] v_q; 183 | always_ff @(posedge clk_i) begin 184 | if (!rst_ni) begin 185 | v_q <= 0; 186 | end 187 | else if (ce_i) begin 188 | v_q <= {v_q, 1'b1}; 189 | p_q <= {p_q, p_i}; 190 | end 191 | end 192 | logic [65:0] rexp; 
193 | always_ff @(posedge clk_i) begin 194 | if (ce_i) begin 195 | rexp = p_q[PIPE_DEPTH_RED128T64-1][63:0] 196 | - p_q[PIPE_DEPTH_RED128T64-1][96+:32] 197 | + {p_q[PIPE_DEPTH_RED128T64-1][64+:32],32'h0} 198 | - p_q[PIPE_DEPTH_RED128T64-1][64+:32]; 199 | if (rexp[65]) begin 200 | rexp += 1 << 64; 201 | rexp -= 32'hffffffff; 202 | end 203 | else if (rexp[64]) begin 204 | rexp -= 1 << 64; 205 | rexp += 32'hffffffff; 206 | end 207 | if (v_q[PIPE_DEPTH_RED128T64-1] && rexp!=r_o) begin 208 | $display("v=%b p=%x ract=%x rexp=%x ERROR", 209 | v_q[PIPE_DEPTH_RED128T64-1], 210 | p_q[PIPE_DEPTH_RED128T64-1],r_o,rexp); 211 | $stop; 212 | end 213 | else begin 214 | `ifdef NEVER 215 | $display("v=%b p=%x ract=%x rexp=%x", 216 | v_q[PIPE_DEPTH_RED128T64-1], 217 | p_q[PIPE_DEPTH_RED128T64-1],r_o,rexp); 218 | `endif 219 | end 220 | end 221 | end 222 | `endif 223 | 224 | endmodule 225 | 226 | 227 | module dsp_red128t64 228 | #( 229 | parameter AB_DUAL_REG = 0, 230 | parameter C_DUAL_REG = 0, 231 | parameter USE_CARRY = 0, 232 | parameter NO_DSP = 0 233 | ) 234 | ( 235 | input logic rst_ni, 236 | input logic clk_i, 237 | input logic ce_i, 238 | input logic [47:0] AB_i, 239 | input logic [47:0] C_i, 240 | input logic sub_i, 241 | input logic nop_i, 242 | input logic CARRYCASCIN_i, 243 | output logic [47:0] P_o, 244 | output logic CARRYCASCOUT_o 245 | ); 246 | 247 | logic signed [47:0] c0_q; 248 | always_ff @(posedge clk_i) begin 249 | if (ce_i) begin 250 | c0_q <= C_i; 251 | end 252 | end 253 | 254 | if (NO_DSP) begin 255 | 256 | logic signed [47:0] ab0_q; 257 | always_ff @(posedge clk_i) begin 258 | if (ce_i) begin 259 | ab0_q <= AB_i; 260 | end 261 | end 262 | 263 | logic signed [47:0] ab_q; 264 | logic signed [47:0] c_q; 265 | logic sub_q, nop_q; 266 | always_ff @(posedge clk_i) begin 267 | if (ce_i) begin 268 | ab_q <= AB_DUAL_REG ? ab0_q : AB_i; 269 | c_q <= C_DUAL_REG ? 
c0_q : C_i; 270 | sub_q <= sub_i; 271 | nop_q <= nop_i; 272 | end 273 | end 274 | 275 | logic signed [48:0] p_q; 276 | always_ff @(posedge clk_i) begin 277 | if (ce_i) begin 278 | p_q <= nop_q ? c_q 279 | : sub_q ? (c_q - ab_q - (USE_CARRY & CARRYCASCIN_i)) 280 | : (c_q + ab_q + (USE_CARRY & CARRYCASCIN_i)); 281 | end 282 | end 283 | 284 | assign P_o = p_q[47:0]; 285 | assign CARRYCASCOUT_o = p_q[48]; 286 | 287 | end 288 | else begin 289 | 290 | logic [3:0] alumode; 291 | logic [4:0] inmode; 292 | logic [8:0] opmode; 293 | logic [47:0] dsp48e2_p; 294 | logic dsp48e2_carrycascout; 295 | 296 | always_comb begin 297 | alumode = sub_i ? 4'b0011 : 4'b0000; // Z - (W + X + Y + CIN) : Z + (W + X + Y + CIN) 298 | inmode = AB_DUAL_REG ? 5'b00000 : 5'b10001; 299 | opmode = { 300 | 2'b00, // W <- 0 301 | 3'b011, // Z <- C 302 | 2'b00, // Y <- 0 303 | nop_i ? 2'b00 : 2'b11 // X <- 0 : A:B 304 | }; 305 | end 306 | 307 | DSP48E2 308 | #( 309 | .ACASCREG(), 310 | .ADREG(), 311 | .ALUMODEREG(), 312 | .AMULTSEL(), 313 | .AREG(AB_DUAL_REG ? 2 : 1), 314 | .AUTORESET_PATDET(), 315 | .AUTORESET_PRIORITY(), 316 | .A_INPUT(), 317 | .BCASCREG(), 318 | .BMULTSEL(), 319 | .BREG(AB_DUAL_REG ? 
2 : 1), 320 | .B_INPUT(), 321 | .CARRYINREG(), 322 | .CARRYINSELREG(), 323 | .CREG(), 324 | .DREG(), 325 | .INMODEREG(), 326 | .IS_ALUMODE_INVERTED(), 327 | .IS_CARRYIN_INVERTED(), 328 | .IS_CLK_INVERTED(), 329 | .IS_INMODE_INVERTED(), 330 | .IS_OPMODE_INVERTED(), 331 | .IS_RSTALLCARRYIN_INVERTED(), 332 | .IS_RSTALUMODE_INVERTED(), 333 | .IS_RSTA_INVERTED(), 334 | .IS_RSTB_INVERTED(), 335 | .IS_RSTCTRL_INVERTED(), 336 | .IS_RSTC_INVERTED(), 337 | .IS_RSTD_INVERTED(), 338 | .IS_RSTINMODE_INVERTED(), 339 | .IS_RSTM_INVERTED(), 340 | .IS_RSTP_INVERTED(), 341 | .MASK(), 342 | .MREG(0), 343 | .OPMODEREG(), 344 | .PATTERN(), 345 | .PREADDINSEL(), 346 | .PREG(), 347 | .RND(), 348 | .SEL_MASK(), 349 | .SEL_PATTERN(), 350 | .USE_MULT("NONE"), 351 | .USE_PATTERN_DETECT(), 352 | .USE_SIMD(), 353 | .USE_WIDEXOR(), 354 | .XORSIMD() 355 | ) 356 | dsp 357 | ( 358 | .ACOUT(), 359 | .BCOUT(), 360 | .CARRYCASCOUT(dsp48e2_carrycascout), 361 | .CARRYOUT(), 362 | .MULTSIGNOUT(), 363 | .OVERFLOW(), 364 | .P(dsp48e2_p), 365 | .PATTERNBDETECT(), 366 | .PATTERNDETECT(), 367 | .PCOUT(), 368 | .UNDERFLOW(), 369 | .XOROUT(), 370 | 371 | .A(AB_i[47:18]), 372 | .ACIN(30'h0), 373 | .ALUMODE(alumode[3:0]), 374 | .B(AB_i[17:0]), 375 | .BCIN(18'h0), 376 | .C(C_DUAL_REG ? c0_q : C_i), 377 | .CARRYCASCIN(CARRYCASCIN_i), 378 | .CARRYIN(1'b0), 379 | .CARRYINSEL(USE_CARRY ? 
3'b010: 3'b000), // CARRYCASCIN : CARRYIN 380 | .CEA1(ce_i), 381 | .CEA2(ce_i), 382 | .CEAD(ce_i), 383 | .CEALUMODE(ce_i), 384 | .CEB1(ce_i), 385 | .CEB2(ce_i), 386 | .CEC(ce_i), 387 | .CECARRYIN(ce_i), 388 | .CECTRL(ce_i), 389 | .CED(ce_i), 390 | .CEINMODE(ce_i), 391 | .CEM(ce_i), 392 | .CEP(ce_i), 393 | .CLK(clk_i), 394 | .D(27'h0), 395 | .INMODE(inmode[4:0]), 396 | .MULTSIGNIN(1'b0), 397 | .OPMODE(opmode[8:0]), 398 | .PCIN(), 399 | .RSTA(!rst_ni), 400 | .RSTALLCARRYIN(!rst_ni), 401 | .RSTALUMODE(!rst_ni), 402 | .RSTB(!rst_ni), 403 | .RSTC(!rst_ni), 404 | .RSTCTRL(!rst_ni), 405 | .RSTD(!rst_ni), 406 | .RSTINMODE(!rst_ni), 407 | .RSTM(!rst_ni), 408 | .RSTP(!rst_ni) 409 | ); 410 | 411 | assign P_o = dsp48e2_p; 412 | assign CARRYCASCOUT_o = dsp48e2_carrycascout; 413 | 414 | end 415 | 416 | endmodule 417 | -------------------------------------------------------------------------------- /rtl/files.f: -------------------------------------------------------------------------------- 1 | axi_hbm_pkg.sv 2 | config_pkg.sv 3 | 4 | components/fifo_ctrl.sv 5 | components/fifo_core.sv 6 | components/fifo.sv 7 | components/cdc_fifo_core.sv 8 | components/bin_to_gray.sv 9 | components/gray_to_bin.sv 10 | components/cdc_sync.sv 11 | components/ram_1w1r_1clk.sv 12 | components/slrx_tx_reg.sv 13 | components/slrx_rx_reg.sv 14 | 15 | dsp48e2/math_pkg.sv 16 | dsp48e2/butterfly.sv 17 | dsp48e2/modaddsub.sv 18 | dsp48e2/mulred.sv 19 | dsp48e2/mul64x64.sv 20 | dsp48e2/red128t64.sv 21 | 22 | ntt/ntt_opt_pkg.sv 23 | ntt/ntt_top.sv 24 | ntt/ntt.sv 25 | ntt/ntt_twiddle.sv 26 | ntt/ntt_butterfly.sv 27 | ntt/ntt_cgram.sv 28 | ntt/ntt_bitrev.sv 29 | 30 | dma/dma_counter.sv 31 | dma/point_to_ntt.sv 32 | dma/point_from_ntt.sv 33 | dma/point_dma_r_channel.sv 34 | dma/point_dma_w_channel.sv 35 | dma/point_dma.sv 36 | dma/dma.sv 37 | 38 | csr.v 39 | nantucket_sv.sv 40 | nantucket.v 41 | --------------------------------------------------------------------------------
/rtl/ntt/TWIDDLE_ROM_WA0_NLEVEL7.mem: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1000000000000 5 | 1 6 | fffffffeff000001 7 | 1000000000000 8 | fffffeff00000101 9 | 1 10 | efffffff00000001 11 | fffffffeff000001 12 | ffffffff00000 13 | 1000000000000 14 | 1000 15 | fffffeff00000101 16 | ffffffef00000001 17 | 1 18 | 3fffffffc000 19 | efffffff00000001 20 | 40000000000 21 | fffffffeff000001 22 | 40 23 | ffffffff00000 24 | fffffffb00000005 25 | 1000000000000 26 | fffffffec0000001 27 | 1000 28 | 3fffffffc000000 29 | fffffeff00000101 30 | 40000000000000 31 | ffffffef00000001 32 | 40000 33 | 1 34 | 8000000000 35 | 3fffffffc000 36 | fffffffeffe00001 37 | efffffff00000001 38 | 8 39 | 40000000000 40 | 1fffffffe0000 41 | fffffffeff000001 42 | 7fffffff00000001 43 | 40 44 | 200000000000 45 | ffffffff00000 46 | fffffffef8000001 47 | fffffffb00000005 48 | 200 49 | 1000000000000 50 | 7fffffff800000 51 | fffffffec0000001 52 | ffffffdf00000021 53 | 1000 54 | 8000000000000 55 | 3fffffffc000000 56 | fffffffd00000001 57 | fffffeff00000101 58 | 8000 59 | 40000000000000 60 | 1fffffffe0000000 61 | ffffffef00000001 62 | fffff7ff00000801 63 | 40000 64 | 200000000000000 65 | -------------------------------------------------------------------------------- /rtl/ntt/TWIDDLE_ROM_WA0_NLEVEL9.mem: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1000000000000 5 | 1 6 | fffffffeff000001 7 | 1000000000000 8 | fffffeff00000101 9 | 1 10 | efffffff00000001 11 | fffffffeff000001 12 | ffffffff00000 13 | 1000000000000 14 | 1000 15 | fffffeff00000101 16 | ffffffef00000001 17 | 1 18 | 3fffffffc000 19 | efffffff00000001 20 | 40000000000 21 | fffffffeff000001 22 | 40 23 | ffffffff00000 24 | fffffffb00000005 25 | 1000000000000 26 | fffffffec0000001 27 | 1000 28 | 3fffffffc000000 29 | fffffeff00000101 30 | 40000000000000 31 | ffffffef00000001 32 | 40000 33 | 1 34 | 8000000000 35 | 3fffffffc000 36 | 
fffffffeffe00001 37 | efffffff00000001 38 | 8 39 | 40000000000 40 | 1fffffffe0000 41 | fffffffeff000001 42 | 7fffffff00000001 43 | 40 44 | 200000000000 45 | ffffffff00000 46 | fffffffef8000001 47 | fffffffb00000005 48 | 200 49 | 1000000000000 50 | 7fffffff800000 51 | fffffffec0000001 52 | ffffffdf00000021 53 | 1000 54 | 8000000000000 55 | 3fffffffc000000 56 | fffffffd00000001 57 | fffffeff00000101 58 | 8000 59 | 40000000000000 60 | 1fffffffe0000000 61 | ffffffef00000001 62 | fffff7ff00000801 63 | 40000 64 | 200000000000000 65 | 1 66 | f80007ff08000001 67 | 8000000000 68 | 40003fffc0000 69 | 3fffffffc000 70 | 1fffdfffe00 71 | fffffffeffe00001 72 | fffffffdffff0002 73 | efffffff00000001 74 | ff7fffff00000081 75 | 8 76 | c0003fff40000001 77 | 40000000000 78 | 20001fffe00000 79 | 1fffffffe0000 80 | fffeffff000 81 | fffffffeff000001 82 | fffffff6fff80009 83 | 7fffffff00000001 84 | fbffffff00000401 85 | 40 86 | 2000000000002 87 | 200000000000 88 | 10000ffff000000 89 | ffffffff00000 90 | 7fff7fff8000 91 | fffffffef8000001 92 | ffffffbeffc00041 93 | fffffffb00000005 94 | dfffffff00002001 95 | 200 96 | 10000000000010 97 | 1000000000000 98 | 80007fff8000000 99 | 7fffffff800000 100 | 3fffbfffc0000 101 | fffffffec0000001 102 | fffffdfefe000201 103 | ffffffdf00000021 104 | fffffffe00010002 105 | 1000 106 | 80000000000080 107 | 8000000000000 108 | 40003fffc0000000 109 | 3fffffffc000000 110 | 1fffdfffe00000 111 | fffffffd00000001 112 | ffffeffef0001001 113 | fffffeff00000101 114 | fffffff700080009 115 | 8000 116 | 400000000000400 117 | 40000000000000 118 | 1fffffffffffe 119 | 1fffffffe0000000 120 | fffeffff000000 121 | ffffffef00000001 122 | ffff7ffe80008001 123 | fffff7ff00000801 124 | ffffffbf00400041 125 | 40000 126 | 2000000000002000 127 | 200000000000000 128 | ffffffffffff0 129 | 1 130 | bf79143ce60ca966 131 | f80007ff08000001 132 | 3e8dfd24e8e781f 133 | 8000000000 134 | c2ded1724375e12e 135 | 40003fffc0000 136 | 3babf8a70b9016d7 137 | 3fffffffc000 138 | 2a5950219097467d 139 
| 1fffdfffe00 140 | 9e07bf052a03ac5d 141 | fffffffeffe00001 142 | 784b4f47d357ef23 143 | fffffffdffff0002 144 | 5b5b114fc207d1c 145 | efffffff00000001 146 | d19f3568da585bdb 147 | ff7fffff00000081 148 | eb17187d25277580 149 | 8 150 | fbc8a1ec30654b2b 151 | c0003fff40000001 152 | 1f46fe927473c0f8 153 | 40000000000 154 | 16f68b981baf096a 155 | 20001fffe00000 156 | dd5fc5395c80b6b7 157 | 1fffffffe0000 158 | 52ca810d84ba33e7 159 | fffeffff000 160 | f03df82d501d62e4 161 | fffffffeff000001 162 | c25a7a419abf7915 163 | fffffff6fff80009 164 | 2dad88a7e103e8e0 165 | 7fffffff00000001 166 | 8cf9ab4cd2c2ded2 167 | fbffffff00000401 168 | 58b8c3f0293babf9 169 | 40 170 | de450f68832a5951 171 | 2000000000002 172 | fa37f493a39e07c0 173 | 200000000000 174 | b7b45cc0dd784b50 175 | 10000ffff000000 176 | eafe29d0e405b5b2 177 | ffffffff00000 178 | 9654086e25d19f36 179 | 7fff7fff8000 180 | 81efc17180eb1719 181 | fffffffef8000001 182 | 12d3d212d5fbc8a2 183 | ffffffbeffc00041 184 | 6d6c4540081f46ff 185 | fffffffb00000005 186 | 67cd5a6a9616f68c 187 | dfffffff00002001 188 | c5c61f8349dd5fc6 189 | 200 190 | f2287b4a1952ca82 191 | 10000000000010 192 | d1bfa4a41cf03df9 193 | 1000000000000 194 | bda2e60bebc25a7b 195 | 80007fff8000000 196 | 57f14e8e202dad89 197 | 7fffffff800000 198 | b2a043752e8cf9ac 199 | 3fffbfffc0000 200 | f7e0b900758b8c4 201 | fffffffec0000001 202 | 969e9096afde4510 203 | fffffdfefe000201 204 | 6b622a0340fa37f5 205 | ffffffdf00000021 206 | 3e6ad357b0b7b45d 207 | fffffffe00010002 208 | 2e30fc204eeafe2a 209 | 1000 210 | 9143da57ca965409 211 | 80000000000080 212 | 8dfd2526e781efc2 213 | 8000000000000 214 | ed1730645e12d3d3 215 | 40003fffc0000000 216 | bf8a7473016d6c46 217 | 3fffffffc000000 218 | 95021bae7467cd5b 219 | 1fffdfffe00000 220 | 7bf05c803ac5c620 221 | fffffffd00000001 222 | b4f484b97ef2287c 223 | ffffeffef0001001 224 | 5b11501d07d1bfa5 225 | fffffeff00000101 226 | f3569abe85bda2e7 227 | fffffff700080009 228 | 7187e1037757f14f 229 | 8000 230 | 8a1ed2c254b2a044 231 | 
400000000000400 232 | 6fe9293b3c0f7e0c 233 | 40000000000000 234 | 68b98329f0969e91 235 | 1fffffffffffe 236 | fc53a39d0b6b622b 237 | 1fffffffe0000000 238 | a810dd77a33e6ad4 239 | fffeffff000000 240 | df82e404d62e30fd 241 | ffffffef00000001 242 | a7a425d0f79143db 243 | ffff7ffe80008001 244 | d88a80ea3e8dfd26 245 | fffff7ff00000801 246 | 9ab4d5fb2ded1731 247 | ffffffbf00400041 248 | 8c3f081ebabf8a75 249 | 40000 250 | 50f69616a595021c 251 | 2000000000002000 252 | 7f4949dce07bf05d 253 | 200000000000000 254 | 45cc195284b4f485 255 | ffffffffffff0 256 | e29d1cef5b5b1151 257 | -------------------------------------------------------------------------------- /rtl/ntt/TWIDDLE_ROM_WA1_NLEVEL7.mem: -------------------------------------------------------------------------------- 1 | 1 2 | f80007ff08000001 3 | 8000000000 4 | 40003fffc0000 5 | 3fffffffc000 6 | 1fffdfffe00 7 | fffffffeffe00001 8 | fffffffdffff0002 9 | efffffff00000001 10 | ff7fffff00000081 11 | 8 12 | c0003fff40000001 13 | 40000000000 14 | 20001fffe00000 15 | 1fffffffe0000 16 | fffeffff000 17 | fffffffeff000001 18 | fffffff6fff80009 19 | 7fffffff00000001 20 | fbffffff00000401 21 | 40 22 | 2000000000002 23 | 200000000000 24 | 10000ffff000000 25 | ffffffff00000 26 | 7fff7fff8000 27 | fffffffef8000001 28 | ffffffbeffc00041 29 | fffffffb00000005 30 | dfffffff00002001 31 | 200 32 | 10000000000010 33 | 1000000000000 34 | 80007fff8000000 35 | 7fffffff800000 36 | 3fffbfffc0000 37 | fffffffec0000001 38 | fffffdfefe000201 39 | ffffffdf00000021 40 | fffffffe00010002 41 | 1000 42 | 80000000000080 43 | 8000000000000 44 | 40003fffc0000000 45 | 3fffffffc000000 46 | 1fffdfffe00000 47 | fffffffd00000001 48 | ffffeffef0001001 49 | fffffeff00000101 50 | fffffff700080009 51 | 8000 52 | 400000000000400 53 | 40000000000000 54 | 1fffffffffffe 55 | 1fffffffe0000000 56 | fffeffff000000 57 | ffffffef00000001 58 | ffff7ffe80008001 59 | fffff7ff00000801 60 | ffffffbf00400041 61 | 40000 62 | 2000000000002000 63 | 200000000000000 64 | 
ffffffffffff0 65 | -------------------------------------------------------------------------------- /rtl/ntt/TWIDDLE_ROM_WA1_NLEVEL9.mem: -------------------------------------------------------------------------------- 1 | 1 2 | 1905d02a5c411f4e 3 | bf79143ce60ca966 4 | ba25eb5cd1970aeb 5 | f80007ff08000001 6 | c843f1629460b551 7 | 3e8dfd24e8e781f 8 | 95836de70f31cbfa 9 | 8000000000 10 | a377bc2d7d17eac6 11 | c2ded1724375e12e 12 | de7b23e7ed0a513b 13 | 40003fffc0000 14 | 525359c9de074e52 15 | 3babf8a70b9016d7 16 | 5a9cf0873e490c2e 17 | 3fffffffc000 18 | 47d379be4421e8f0 19 | 2a5950219097467d 20 | c2ba9175c26e0b9b 21 | 1fffdfffe00 22 | 2d540deed6531ae8 23 | 9e07bf052a03ac5d 24 | 72fe5a9eb187bc34 25 | fffffffeffe00001 26 | fab15721164320bb 27 | 784b4f47d357ef23 28 | 944e8860a2b744be 29 | fffffffdffff0002 30 | d3946b6a55f9087f 31 | 5b5b114fc207d1c 32 | 430b695880d2b06e 33 | efffffff00000001 34 | 7a3bee0aa7546ef8 35 | d19f3568da585bdb 36 | 82e68f50d8bbcf65 37 | ff7fffff00000081 38 | c6b9f4aa35ca4a6c 39 | eb17187d25277580 40 | ef0ce33f7a4b539f 41 | 8 42 | c82e8152e208fa70 43 | fbc8a1ec30654b2b 44 | d12f5aeb8cb85753 45 | c0003fff40000001 46 | 421f8b1aa305aa82 47 | 1f46fe927473c0f8 48 | ac1b6f3c798e5fcc 49 | 40000000000 50 | 1bbde170e8bf562b 51 | 16f68b981baf096a 52 | f3d91f45685289d2 53 | 20001fffe00000 54 | 929ace50f03a728e 55 | dd5fc5395c80b6b7 56 | d4e7843bf248616e 57 | 1fffffffe0000 58 | 3e9bcdf4210f477e 59 | 52ca810d84ba33e7 60 | 15d48bb413705cd2 61 | fffeffff000 62 | 6aa06f77b298d73f 63 | f03df82d501d62e4 64 | 97f2d4f88c3de19d 65 | fffffffeff000001 66 | d58ab90fb21905d1 67 | c25a7a419abf7915 68 | a274430915ba25ec 69 | fffffff6fff80009 70 | 9ca35b58afc843f2 71 | 2dad88a7e103e8e0 72 | 185b4ac60695836e 73 | 7fffffff00000001 74 | d1df70583aa377bd 75 | 8cf9ab4cd2c2ded2 76 | 17347a8ac5de7b24 77 | fbffffff00000401 78 | 35cfa557ae52535a 79 | 58b8c3f0293babf9 80 | 78671a02d25a9cf1 81 | 40 82 | 41740a9d1047d37a 83 | de450f68832a5951 84 | 897ad76265c2ba92 85 | 2000000000002 
86 | 10fc58d7182d540e 87 | fa37f493a39e07c0 88 | 60db79e8cc72fe5b 89 | 200000000000 90 | ddef0b8745fab158 91 | b7b45cc0dd784b50 92 | 9ec8fa3242944e89 93 | 10000ffff000000 94 | 94d6728b81d3946c 95 | eafe29d0e405b5b2 96 | a73c21e592430b6a 97 | ffffffff00000 98 | f4de6fa2087a3bef 99 | 9654086e25d19f36 100 | aea45da09b82e690 101 | 7fff7fff8000 102 | 55037bc094c6b9f5 103 | 81efc17180eb1719 104 | bf96a7c861ef0ce4 105 | fffffffef8000001 106 | ac55c88390c82e82 107 | 12d3d212d5fbc8a2 108 | 13a2184dadd12f5b 109 | ffffffbeffc00041 110 | e51adac97e421f8c 111 | 6d6c4540081f46ff 112 | c2da563034ac1b70 113 | fffffffb00000005 114 | 8efb82c7d51bbde2 115 | 67cd5a6a9616f68c 116 | b9a3d4562ef3d920 117 | dfffffff00002001 118 | ae7d2abe72929acf 119 | c5c61f8349dd5fc6 120 | c338d01992d4e785 121 | 200 122 | ba054ea823e9bce 123 | f2287b4a1952ca82 124 | 4bd6bb172e15d48c 125 | 10000000000010 126 | 87e2c6b8c16aa070 127 | d1bfa4a41cf03df9 128 | 6dbcf496397f2d5 129 | 1000000000000 130 | ef785c402fd58aba 131 | bda2e60bebc25a7b 132 | f647d19614a27444 133 | 80007fff8000000 134 | a6b394600e9ca35c 135 | 57f14e8e202dad89 136 | 39e10f3192185b4b 137 | 7fffffff800000 138 | a6f37d1743d1df71 139 | b2a043752e8cf9ac 140 | 7522ed09dc17347b 141 | 3fffbfffc0000 142 | a81bde06a635cfa6 143 | f7e0b900758b8c4 144 | fcb53e480f78671b 145 | fffffffec0000001 146 | 62ae44218641740b 147 | 969e9096afde4510 148 | 9d10c26d6e897ad8 149 | fffffdfefe000201 150 | 28d6d652f210fc59 151 | 6b622a0340fa37f5 152 | 16d2b187a560db7a 153 | ffffffdf00000021 154 | 77dc1642a8ddef0c 155 | 3e6ad357b0b7b45d 156 | cd1ea2b6779ec8fb 157 | fffffffe00010002 158 | 73e955f89494d673 159 | 2e30fc204eeafe2a 160 | 19c680d296a73c22 161 | 1000 162 | 5d02a75411f4de70 163 | 9143da57ca965409 164 | 5eb5d8bb70aea45e 165 | 80000000000080 166 | 3f1635ca0b55037c 167 | 8dfd2526e781efc2 168 | 36de7a4b1cbf96a8 169 | 8000000000000 170 | 7bc2e2087eac55c9 171 | ed1730645e12d3d3 172 | b23e8cb7a513a219 173 | 40003fffc0000000 174 | 359ca30574e51adb 175 | bf8a7473016d6c46 
176 | cf08798d90c2da57 177 | 3fffffffc000000 178 | 379be8bf1e8efb83 179 | 95021bae7467cd5b 180 | a9176851e0b9a3d5 181 | 1fffdfffe00000 182 | 40def03a31ae7d2b 183 | 7bf05c803ac5c620 184 | e5a9f2477bc338d1 185 | fffffffd00000001 186 | 1572210f320ba055 187 | b4f484b97ef2287c 188 | e886136f744bd6bc 189 | ffffeffef0001001 190 | 46b6b2989087e2c7 191 | 5b11501d07d1bfa5 192 | b6958c3d2b06dbd0 193 | fffffeff00000101 194 | bee0b21846ef785d 195 | f3569abe85bda2e7 196 | 68f515b9bcf647d2 197 | fffffff700080009 198 | 9f4aafc7a4a6b395 199 | 7187e1037757f14f 200 | ce340694b539e110 201 | 8000 202 | e8153aa28fa6f37e 203 | 8a1ed2c254b2a044 204 | f5aec5dd857522ee 205 | 400000000000400 206 | f8b1ae515aa81bdf 207 | 6fe9293b3c0f7e0c 208 | b6f3d259e5fcb53f 209 | 40000000000000 210 | de171046f562ae45 211 | 68b98329f0969e91 212 | 91f465c2289d10c3 213 | 1fffffffffffe 214 | ace5182ca728d6d7 215 | fc53a39d0b6b622b 216 | 7843cc728616d2b2 217 | 1fffffffe0000000 218 | bcdf45f9f477dc17 219 | a810dd77a33e6ad4 220 | 48bb429405cd1ea3 221 | fffeffff000000 222 | 6f781d38d73e956 223 | df82e404d62e30fd 224 | 2d4f9242de19c681 225 | ffffffef00000001 226 | ab910879905d02a8 227 | a7a425d0f79143db 228 | 44309b82a25eb5d9 229 | ffff7ffe80008001 230 | 35b594c6843f1636 231 | d88a80ea3e8dfd26 232 | b4ac61ee5836de7b 233 | fffff7ff00000801 234 | f70590c7377bc2e3 235 | 9ab4d5fb2ded1731 236 | 47a8add0e7b23e8d 237 | ffffffbf00400041 238 | fa557e4125359ca4 239 | 8c3f081ebabf8a75 240 | 71a034aba9cf087a 241 | 40000 242 | 40a9d51b7d379be9 243 | 50f69616a595021c 244 | ad762ef32ba91769 245 | 2000000000002000 246 | c58d7291d540def1 247 | 7f4949dce07bf05d 248 | b79e92d42fe5a9f3 249 | 200000000000000 250 | f0b8823dab157222 251 | 45cc195284b4f485 252 | 8fa32e1544e88614 253 | ffffffffffff0 254 | 6728c16a3946b6b3 255 | e29d1cef5b5b1151 256 | c21e639730b6958d 257 | -------------------------------------------------------------------------------- /rtl/ntt/ntt.sv: 
-------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | (* keep_hierarchy = "yes" *) module ntt 7 | import ntt_opt_pkg::*; 8 | #( 9 | parameter NLEVEL = 12, // number of butterfly levels (2^12 NTT requires 12 levels) 10 | parameter NLEVEL0 = 0, // number of mini-cgram levels per stage 11 | parameter NLANE = 1, // number of lanes to operate in parallel 12 | parameter PASS = 0, // 0 or 1 13 | parameter BFLYDSP = 24, // 24, 16, or 12 14 | parameter NOP = 0 // useful for debugging, dataflow only no b-fly calcs 15 | ) 16 | ( 17 | input logic rst_ni, 18 | input logic clk_i, 19 | 20 | input logic [NLANE-1:0][1:0][63:0] x_i, 21 | input logic [NLANE-1:0][1:0][63:0] w_i, 22 | input logic [NLANE-1:0] valid_i, 23 | 24 | output logic [NLANE-1:0][1:0][63:0] x_o, 25 | output logic [NLANE-1:0] valid_o 26 | ); 27 | 28 | localparam NLEVEL_PER_STAGE = NLEVEL0 + 1; 29 | localparam NSTAGE = NLEVEL / NLEVEL_PER_STAGE; 30 | 31 | initial assert(NSTAGE * NLEVEL_PER_STAGE == NLEVEL); 32 | 33 | wire [2*NLEVEL-1:0][NLANE-1:0][1:0][63:0] x; 34 | wire [2*NLEVEL-1:0][NLANE-1:0][1:0][63:0] w; 35 | wire [2*NLEVEL-1:0][NLANE-1:0] valid; 36 | 37 | assign x[0] = x_i; 38 | assign w[0] = (PASS==0 || OPTIMIZE_PASS1_TWIDDLES==0) ? 
w_i : {NLANE{w_i[0]}}; 39 | assign valid[0] = valid_i; 40 | 41 | for (genvar i=0; i> NLEVEL0; 106 | if (!NOP) begin 107 | for (int i=0; i<$clog2(N); i++) begin 108 | bitrev[i] = wo[$clog2(N)-1-i]; 109 | end 110 | end 111 | else begin 112 | bitrev = wo; 113 | end 114 | ra[0] = {rcnt_q[$clog2(N)], bitrev}; 115 | ra[1] = {rcnt_q[$clog2(N)], bitrev}; 116 | end 117 | 118 | always_ff @(posedge clk_i) begin 119 | if (!rst_ni) begin 120 | valid_o <= 0; 121 | end 122 | else begin 123 | valid_o <= rvalid_p2_q; 124 | end 125 | end 126 | 127 | always_ff @(posedge clk_i) begin 128 | x_o[0] <= rx[0]; 129 | x_o[1] <= rx[1]; 130 | end 131 | 132 | `ifdef NEVER 133 | `ifndef SYNTHESIS 134 | 135 | string s; 136 | int fdi, fdo, icnt, ocnt; 137 | 138 | initial begin 139 | $sformat(s,"%m_in.log"); 140 | fdi = $fopen(s,"w"); 141 | $sformat(s,"%m_out.log"); 142 | fdo = $fopen(s,"w"); 143 | icnt = 0; 144 | ocnt = 0; 145 | end 146 | 147 | always @(posedge clk_i) begin 148 | if (valid_i) begin 149 | $sformat(s,"%0d: ",icnt); 150 | $fwrite(fdi,s); 151 | for (int i=0; i<2; i++) begin 152 | $sformat(s,"[%0d] 0x%x ",i,x_i[i]); 153 | $fwrite(fdi,s); 154 | end 155 | $fwrite(fdi,"\n"); 156 | icnt=icnt+1; 157 | end 158 | if (valid_o) begin 159 | $sformat(s,"%0d: ",ocnt); 160 | $fwrite(fdo,s); 161 | for (int i=0; i<2; i++) begin 162 | $sformat(s,"[%0d] 0x%x ",i,x_o[i]); 163 | $fwrite(fdo,s); 164 | end 165 | $fwrite(fdo,"\n"); 166 | ocnt=ocnt+1; 167 | end 168 | end 169 | 170 | `endif 171 | `endif 172 | 173 | endmodule 174 | -------------------------------------------------------------------------------- /rtl/ntt/ntt_butterfly.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Single-lane butterfly stage of the NTT pipeline.
//
// The point pair x_i[1:0] is handed to the `butterfly` core together with the
// twiddle w_i[1].  The companion twiddle w_i[0] and the valid flag travel
// through PD-deep register chains and come out on w_o[0] / valid_o.
// NOTE(review): PD comes from the PIPE_DEPTH_* constants of the imported
// packages, presumably the latency of the selected core variant - confirm.
module ntt_butterfly
  import math_pkg::*;
  import ntt_opt_pkg::*;
  #(
    parameter NOP            = 0,  // non-zero: core is instantiated with nop_i tied high
    parameter LEVEL_FROM_END = 0,  // 1 or 2 select a specialised core mode (see MODE)
    parameter BFLYDSP        = 24  // 24, 16 or 12
    )
  (
   input  logic             rst_ni,   // active-low reset of the valid chain
   input  logic             clk_i,

   input  logic [1:0][63:0] x_i,      // point pair
   input  logic [1:0][63:0] w_i,      // [1] feeds the core, [0] is forwarded
   input  logic             valid_i,

   output logic [1:0][63:0] x_o,      // core results
   output logic [1:0][63:0] w_o,      // forwarded twiddle(s)
   output logic             valid_o
   );

  // Core variant: the specialised modes are only picked when
  // OPTIMIZE_FINAL_BFLY_STAGES is enabled, otherwise always the generic one.
  localparam MODE
    = !OPTIMIZE_FINAL_BFLY_STAGES ? BUTTERFLY_GENERIC :
      (LEVEL_FROM_END == 1)       ? BUTTERFLY_W0      :
      (LEVEL_FROM_END == 2)       ? BUTTERFLY_W0_W2   :
                                    BUTTERFLY_GENERIC;

  // Depth of the side-band (twiddle / valid) delay chains for that variant.
  localparam PD
    = (MODE == BUTTERFLY_W0   ) ? PIPE_DEPTH_BUTTERFLY_W0    :
      (MODE == BUTTERFLY_W0_W2) ? PIPE_DEPTH_BUTTERFLY_W0_W2 :
                                  PIPE_DEPTH_BUTTERFLY;

  logic [PD-2:0]       valid_q;   // all but the last stage of the valid chain
  logic [PD-2:0][63:0] w_dly_q;   // all but the last stage of the twiddle chain

  // Valid chain: PD flops in total (valid_q plus valid_o), asynchronously
  // cleared while rst_ni is low.
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      {valid_o, valid_q} <= '0;
    end
    else begin
      {valid_o, valid_q} <= {valid_q, valid_i};
    end
  end

  // Twiddle chain: w_i[0] shifts through w_dly_q and lands in w_o[0].
  // There is no reset on this data path.
  always_ff @(posedge clk_i) begin
    {w_o[0], w_dly_q} <= {w_dly_q, w_i[0]};
  end

  // With OPTIMIZE_TWIDDLE_PIPELINING the second twiddle slot is a constant
  // zero (helps the tools to optimize away logic, as ntt_cgram.sv will
  // replicate); otherwise it mirrors w_o[0].
  assign w_o[1] = OPTIMIZE_TWIDDLE_PIPELINING ? 64'd0 : w_o[0];

  // Arithmetic core.  Its reset is tied inactive and its clock enable is tied
  // active; only the result pair is taken from it.
  butterfly
    #(
      .MODE      ( MODE                           ),
      .BFLYDSP   ( BFLYDSP                        ),
      .CANONICAL ( (LEVEL_FROM_END==1) ? 1 : 0    )
      )
  butterfly
    (
     .rst_ni ( 1'b1              ),
     .clk_i  ( clk_i             ),
     .nop_i  ( NOP ? 1'b1 : 1'b0 ),
     .ce_i   ( 1'b1              ),
     .x_i    ( x_i[0]            ),
     .y_i    ( x_i[1]            ),
     .w_i    ( w_i[1]            ),
     .x_o    ( x_o[0]            ),
     .y_o    ( x_o[1]            )
     );

endmodule

--------------------------------------------------------------------------------
/rtl/ntt/ntt_cgram.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Double-buffered shuffle memory that sits in front of a butterfly level.
//
// Write side: every valid_i beat stores the pair x_i (and, unless NO_TWIDDLES
// is set, twiddle data from w_i) at the address given by the write counter.
// Each RAM has $clog2(N)+1 address bits, i.e. two blocks of N words; address
// bit $clog2(N) toggles every N writes and so selects the block.  During the
// second N/2 writes of a block the two inputs are swapped between the RAMs
// (wshift).
//
// Read side: reading runs whenever the write counter is in a different
// N-word block than the read counter.  The low $clog2(N) bits of the read
// counter are rotated right by NLEVEL0; bit 0 of the result picks which half
// of the block each RAM is read from (opposite halves for the two RAMs) and,
// delayed, swaps the two RAM outputs; the remaining bits are the offset.
//
// valid_o is asserted three clocks after the read address was presented.
// NOTE(review): the x/w output registers sample the RAM outputs two clocks
// after the address, so ram_1w1r_1clk presumably has a two-cycle read
// latency - confirm in rtl/components/ram_1w1r_1clk.sv.
//
// Parameters
//   N           number of pairs of points per block
//   LEVEL       butterfly level which this cgram feeds; only used to size and
//               address the reduced twiddle RAMs in the `wopt` branch.
//               NOTE(review): that branch shifts by LEVEL-1, so it looks like
//               LEVEL >= 1 is required there - confirm against the instances.
//   NLEVEL0     rotate amount applied to the read counter
//   NO_TWIDDLES non-zero: no twiddle RAMs are built and w_o is driven to 0
module ntt_cgram
  import math_pkg::*;
  import ntt_opt_pkg::*;
  #(
    parameter N           = 2**11, // number of pairs of points
    parameter LEVEL       = 0,     // butterfly level which this cgram feeds
    parameter NLEVEL0     = 0,
    parameter NO_TWIDDLES = 0
    )
  (
   input  logic             rst_ni,
   input  logic             clk_i,

   input  logic [1:0][63:0] x_i,
   input  logic [1:0][63:0] w_i,
   input  logic             valid_i,

   output logic [1:0][63:0] x_o,
   output logic [1:0][63:0] w_o,
   output logic             valid_o
   );

  logic [1:0][$clog2(N):0] wa;                       // write address per RAM
  logic [$clog2(N)+1:0]    wcnt_q;                   // write counter (one extra wrap bit)
  logic                    wshift;                   // swap inputs between the RAMs
  logic [1:0][63:0]        wx, ww;                   // write data after the swap
  logic                    rvalid, rvalid_p1_q, rvalid_p2_q;
  logic [1:0][$clog2(N):0] ra;                       // read address per RAM
  logic [$clog2(N)+1:0]    rcnt_q;                   // read counter (one extra wrap bit)
  logic [$clog2(N)-1:0]    wo;                       // rotated in-block read index
  logic                    rshift, rshift_p1_q, rshift_p2_q;
  logic [1:0][63:0]        rx, rw;                   // RAM read data

  // ---------------------------------------------------------------------------
  // Write side
  // ---------------------------------------------------------------------------

  // Count accepted input beats.
  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      wcnt_q <= 0;
    end
    else begin
      wcnt_q <= wcnt_q + valid_i;
    end
  end

  // Both RAMs are written at the same address: block bit plus in-block index.
  // The top counter bit is only used for the read/write block comparison.
  always_comb begin
    wa[0]  = wcnt_q[$clog2(N):0];
    wa[1]  = wcnt_q[$clog2(N):0];
    wshift = wcnt_q[$clog2(N)-1];   // set during the second half of a block
  end

  always_comb begin
    wx[0] = x_i[ wshift];
    wx[1] = x_i[!wshift];
    ww[0] = w_i[ wshift];
    ww[1] = w_i[!wshift];
  end

  // ---------------------------------------------------------------------------
  // Point RAMs
  // ---------------------------------------------------------------------------

  ram_1w1r_1clk
    #(
      .ADDR_WIDTH ($clog2(N)+1),
      .DATA_WIDTH (64)
      )
  ram_x0
    (
     .clk_i  ( clk_i                 ),
     .a_a_i  ( wa[0]                 ),
     .a_wd_i ( wx[0]                 ),
     .a_we_i ( valid_i               ),
     .b_a_i  ( ra[0]                 ),
     .b_re_i ( rvalid || rvalid_p1_q ),  // enable held for the address cycle and the next one
     .b_rd_o ( rx[0]                 )
     );

  ram_1w1r_1clk
    #(
      .ADDR_WIDTH ($clog2(N)+1),
      .DATA_WIDTH (64)
      )
  ram_x1
    (
     .clk_i  ( clk_i                 ),
     .a_a_i  ( wa[1]                 ),
     .a_wd_i ( wx[1]                 ),
     .a_we_i ( valid_i               ),
     .b_a_i  ( ra[1]                 ),
     .b_re_i ( rvalid || rvalid_p1_q ),
     .b_rd_o ( rx[1]                 )
     );

  // ---------------------------------------------------------------------------
  // Twiddle RAMs (omitted entirely when NO_TWIDDLES is non-zero)
  // ---------------------------------------------------------------------------

  if (NO_TWIDDLES==0) begin : twiddle
    if (OPTIMIZE_TWIDDLE_PIPELINING==0) begin : no_wopt

      // Full-size twiddle RAMs, addressed exactly like the point RAMs.
      ram_1w1r_1clk
        #(
          .ADDR_WIDTH ($clog2(N)+1),
          .DATA_WIDTH (64)
          )
      ram_w0
        (
         .clk_i  ( clk_i                 ),
         .a_a_i  ( wa[0]                 ),
         .a_wd_i ( ww[0]                 ),
         .a_we_i ( valid_i               ),
         .b_a_i  ( ra[0]                 ),
         .b_re_i ( rvalid || rvalid_p1_q ),
         .b_rd_o ( rw[0]                 )
         );

      ram_1w1r_1clk
        #(
          .ADDR_WIDTH ($clog2(N)+1),
          .DATA_WIDTH (64)
          )
      ram_w1
        (
         .clk_i  ( clk_i                 ),
         .a_a_i  ( wa[1]                 ),
         .a_wd_i ( ww[1]                 ),
         .a_we_i ( valid_i               ),
         .b_a_i  ( ra[1]                 ),
         .b_re_i ( rvalid || rvalid_p1_q ),
         .b_rd_o ( rw[1]                 )
         );

    end
    else begin : wopt

      // Reduced twiddle RAMs: LEVEL fewer address bits than the point RAMs.
      localparam W_ADDR_WIDTH = $clog2(N)+1-LEVEL;
      logic [1:0][W_ADDR_WIDTH-1:0] _wa;
      logic [1:0][W_ADDR_WIDTH-1:0] _ra;
      logic [1:0]                   _we;
      logic [1:0]                   rsel_q;
      logic [1:0][63:0]             _rw;
      always_comb begin
        // Reduced address = block bit on top of address bits
        // [$clog2(N)-2 : LEVEL-1] of the full address.
        _wa[0] = (wa[0][$clog2(N)]<<($clog2(N)-LEVEL)) | ((wa[0]>>(LEVEL-1)) & ((1<<($clog2(N)-LEVEL))-1));
        _wa[1] = (wa[1][$clog2(N)]<<($clog2(N)-LEVEL)) | ((wa[1]>>(LEVEL-1)) & ((1<<($clog2(N)-LEVEL))-1));
        _ra[0] = (ra[0][$clog2(N)]<<($clog2(N)-LEVEL)) | ((ra[0]>>(LEVEL-1)) & ((1<<($clog2(N)-LEVEL))-1));
        _ra[1] = (ra[1][$clog2(N)]<<($clog2(N)-LEVEL)) | ((ra[1]>>(LEVEL-1)) & ((1<<($clog2(N)-LEVEL))-1));
        // Only beats whose low LEVEL-1 address bits are zero are stored:
        // ram_w0 takes them from the first half of a block, ram_w1 from the
        // second half.
        _we[0] = (wa[0][$clog2(N)-1]==0) && ( (wa[0] & ((1<<(LEVEL-1))-1)) == 0 );
        _we[1] = (wa[1][$clog2(N)-1]==1) && ( (wa[1] & ((1<<(LEVEL-1))-1)) == 0 );
      end
      // Two-deep history of read-address bit $clog2(N)-1 of RAM 0, used to
      // steer the read data below.
      always_ff @(posedge clk_i) begin
        rsel_q <= {rsel_q,ra[0][$clog2(N)-1]};
      end

      ram_1w1r_1clk
        #(
          .ADDR_WIDTH (W_ADDR_WIDTH),
          .DATA_WIDTH (64)
          )
      ram_w0
        (
         .clk_i  ( clk_i                 ),
         .a_a_i  ( _wa[0]                ),
         .a_wd_i ( ww[0]                 ),
         .a_we_i ( valid_i && _we[0]     ),
         .b_a_i  ( _ra[0]                ),
         .b_re_i ( rvalid || rvalid_p1_q ),
         .b_rd_o ( _rw[0]                )
         );

      ram_1w1r_1clk
        #(
          .ADDR_WIDTH (W_ADDR_WIDTH),
          .DATA_WIDTH (64)
          )
      ram_w1
        (
         .clk_i  ( clk_i                 ),
         .a_a_i  ( _wa[1]                ),
         .a_wd_i ( ww[1]                 ),
         .a_we_i ( valid_i && _we[1]     ),
         .b_a_i  ( _ra[1]                ),
         .b_re_i ( rvalid || rvalid_p1_q ),
         .b_rd_o ( _rw[1]                )
         );

      assign rw[0] = _rw[rsel_q[1]];
      assign rw[1] = _rw[!rsel_q[1]];

    end
  end

  // ---------------------------------------------------------------------------
  // Read side
  // ---------------------------------------------------------------------------

  // Read while the write counter is in a different N-word block than the
  // read counter (comparison of the two upper counter bits).
  always_comb begin
    rvalid = wcnt_q[$clog2(N)+1:$clog2(N)] != rcnt_q[$clog2(N)+1:$clog2(N)];
  end

  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      rcnt_q      <= 0;
      rvalid_p1_q <= 0;
      rvalid_p2_q <= 0;
    end
    else begin
      rcnt_q      <= rcnt_q + rvalid;
      rvalid_p1_q <= rvalid;
      rvalid_p2_q <= rvalid_p1_q;
    end
  end

  always_comb begin
    wo     = rcnt_q[$clog2(N)-1:0];
    wo     = {wo, wo} >> NLEVEL0;      // rotate right by NLEVEL0 (result truncated to wo's width)
    rshift = wo[0];
    // Address = {block bit, half select, offset}; the two RAMs read opposite halves.
    ra[0] = ({rcnt_q[$clog2(N)], wo[0]} << $clog2(N/2)) | ((wo>>1) & (N/2-1));
    ra[1] = ({rcnt_q[$clog2(N)],!wo[0]} << $clog2(N/2)) | ((wo>>1) & (N/2-1));
  end

  // Delay the swap select so that it lines up with the RAM read data.
  always_ff @(posedge clk_i) begin
    rshift_p1_q <= rshift;
    rshift_p2_q <= rshift_p1_q;
  end

  always_ff @(posedge clk_i) begin
    if (!rst_ni) begin
      valid_o <= 0;
    end
    else begin
      valid_o <= rvalid_p2_q;
    end
  end

  always_ff @(posedge clk_i) begin
    x_o[0] <= rx[ rshift_p2_q];
    x_o[1] <= rx[!rshift_p2_q];
  end
  if (NO_TWIDDLES==0) begin
    always_ff @(posedge clk_i) begin
      w_o[0] <= rw[ rshift_p2_q];
      w_o[1] <= rw[!rshift_p2_q];
    end
  end
  else begin
    // No twiddle storage in this configuration: give the port a defined
    // constant value instead of leaving it without any driver.
    assign w_o = '0;
  end

`ifdef NEVER
`ifndef SYNTHESIS

  // Debug only: log every input and output beat to per-instance files.
  string s;
  int fdi, fdo, icnt, ocnt;

  initial begin
    $sformat(s,"%m_in.log");
    fdi = $fopen(s,"w");
    $sformat(s,"%m_out.log");
    fdo = $fopen(s,"w");
    icnt = 0;
    ocnt = 0;
  end

  always @(posedge clk_i) begin
    if (valid_i) begin
      $sformat(s,"%0d: ",icnt);
      $fwrite(fdi,s);
      for (int i=0; i<2; i++) begin
        $sformat(s,"[%0d] x 0x%x w 0x%x ",i,x_i[i],w_i[i]);
        //$sformat(s,"[%0d] 0x%x ",i,x_i[i]);
        //$sformat(s,"[%0d] 0x%x ",i,w_i[i]);
        $fwrite(fdi,s);
      end
      $fwrite(fdi,"\n");
      icnt=icnt+1;
    end
    if (valid_o) begin
      $sformat(s,"%0d: ",ocnt);
      $fwrite(fdo,s);
      for (int i=0; i<2; i++) begin
        $sformat(s,"[%0d] x 0x%x w 0x%x ",i,x_o[i],w_o[i]);
        //$sformat(s,"[%0d] 0x%x ",i,x_o[i]);
        //$sformat(s,"[%0d] 0x%x ",i,w_o[i]);
        $fwrite(fdo,s);
      end
      $fwrite(fdo,"\n");
      ocnt=ocnt+1;
    end
  end

`endif
`endif

endmodule

--------------------------------------------------------------------------------
/rtl/ntt/ntt_opt_pkg.sv:
--------------------------------------------------------------------------------
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE-APACHE
// or the MIT license, see LICENSE-MIT, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT

package ntt_opt_pkg;

  // Optimize by storing only those twiddles which are unique and replicating
  // as needed. With Gentleman-Sande, each butterfly level eliminates the need
  // for 1/2 of twiddles.
  localparam OPTIMIZE_TWIDDLE_PIPELINING = 1;


  // In a 2 pass NTT as we use for 2^18 or 2^24 the first pass (pass==0) has
  // unique twiddles at the input for each point. For the second pass (pass==1)
  // the twiddles for each lane are identical. We can use this to save on
  // RAMs.
18 | localparam OPTIMIZE_PASS1_TWIDDLES = 1; 19 | 20 | // The final butterfly stage uses a twiddle factor of W0 = 1, and the second 21 | // to last stage uses twiddle factors of W0 or W2 = (1 << 48) for the choice 22 | // of generator = 7. As such, we can optimize these stages compared to the 23 | // more generic stages which need a true 64x64 multiply. 24 | localparam OPTIMIZE_FINAL_BFLY_STAGES = 1; 25 | 26 | endpackage 27 | -------------------------------------------------------------------------------- /rtl/ntt/ntt_top.sv: -------------------------------------------------------------------------------- 1 | // Copyright Supranational LLC 2 | // Licensed under the Apache License, Version 2.0, see LICENSE-APACHE 3 | // or the MIT license, see LICENSE-MIT, at your option. 4 | // SPDX-License-Identifier: Apache-2.0 OR MIT 5 | 6 | module ntt_top 7 | #( 8 | parameter NLEVEL = 12, // number of butterfly levels (2^12 NTT requires 12 levels) 9 | parameter NLEVEL0 = 5, // number of mini-cgram levels per stage 10 | parameter NLANE = 8, // number of lanes to operate in parallel 11 | parameter PASS1 = 0, // 0 for pass 0 or 1 via pass1_i, 1 for pass 1 optimized 12 | parameter BFLYDSP = 12, // 24, 16, or 12 13 | parameter SLRX_X = 1, // SLR0->SLR1 crossing on x_i, SLR1->SLR0 crossing on x_o 14 | parameter SLRX_W = 1, // SLR0->SLR1 crossing on w 15 | parameter NOP = 0 // useful for debugging, dataflow only no b-fly calcs 16 | ) 17 | ( 18 | input logic rst_ni, 19 | input logic clk_i, 20 | 21 | input logic pass1_i, 22 | 23 | input logic [NLANE-1:0][1:0][63:0] x_i, 24 | input logic [NLANE-1:0] valid_i, 25 | 26 | output logic [NLANE-1:0][1:0][63:0] x_o, 27 | output logic [NLANE-1:0] valid_o 28 | ); 29 | 30 | logic [NLANE-1:0][1:0][63:0] w; 31 | logic [NLANE-1:0][1:0][63:0] w_slrx, w_slr1; 32 | logic [NLANE-1:0][1:0][63:0] x_slrx_i, x_slr1_i; 33 | logic [NLANE-1:0] valid_slrx_i, valid_slr1_i, ready_w; 34 | logic [NLANE-1:0][1:0][63:0] x_slrx_o, x_slr1_o; 35 | logic [NLANE-1:0] 
valid_slrx_o, valid_slr1_o; 36 | 37 | (* dont_touch = "true" *) logic twid_rst_dt_qn; 38 | (* dont_touch = "true" *) logic core_rst_dt_qn; 39 | logic twid_rst_qn; 40 | logic core_rst_qn; 41 | always_ff @(posedge clk_i) begin 42 | twid_rst_dt_qn <= rst_ni; 43 | core_rst_dt_qn <= rst_ni; 44 | twid_rst_qn <= twid_rst_dt_qn; 45 | core_rst_qn <= core_rst_dt_qn; 46 | end 47 | 48 | ntt_twiddle 49 | #( 50 | .NLEVEL(NLEVEL), 51 | .NLEVEL0(NLEVEL0), 52 | .NLANE(NLANE), 53 | .PASS1_ONLY(PASS1), 54 | .BFLYDSP(BFLYDSP) 55 | ) 56 | ntt_twiddle 57 | ( 58 | .rst_ni(twid_rst_qn), 59 | .clk_i, 60 | .pass1_i(pass1_i), 61 | .ready_i(ready_w), 62 | .w_o(w) 63 | ); 64 | 65 | if (SLRX_X) begin : slrx_xi 66 | for (genvar gv_i=0;gv_i